Skip to content

Commit

Permalink
Merge pull request #1020 from xxyzz/en
Browse files (browse the repository at this point in the history)
[en] fix Lua error and `IndexError` exception in categories.py
  • Loading branch information
kristian-clausal authored Feb 5, 2025
2 parents 7f3aeb6 + a17cde8 commit 0ee4b87
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 25 deletions.
50 changes: 26 additions & 24 deletions src/wiktextract/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@
#
# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org

-from typing import (
-Optional,
-TypedDict,
-)
+from typing import TypedDict

from wikitextprocessor.core import NamespaceDataEntry

Expand Down Expand Up @@ -41,17 +38,19 @@
print( k..": "..desc )
desc = string.gsub(desc, "\n", "\\n")
table.insert(parts, k .. "@@" .. desc)
-for kk, vv in pairs(v.parents) do
-local name
-local sort = ""
-if type(vv) == "table" then
-name = vv.name
-sort = vv.sort or ""
-else
-name = vv
-end
-if name then
-table.insert(parts, "@@" .. name .. "@@" .. sort)
+if type(v.parents) == "table" then
+for kk, vv in pairs(v.parents) do
+local name
+local sort = ""
+if type(vv) == "table" then
+name = vv.name
+sort = vv.sort or ""
+else
+name = vv
+end
+if name then
+table.insert(parts, "@@" .. name .. "@@" .. sort)
+end
end
end
table.insert(parts, "\n")
Expand Down Expand Up @@ -93,22 +92,25 @@
total=False,
)


def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
"""Extracts the category tree from Wiktionary."""
-module_ns: Optional[NamespaceDataEntry] = wxr.wtp.NAMESPACE_DATA.get(
-"Module", None)
-assert module_ns is not None
-module_ns_local_name = module_ns.get("name")
-module_ns_id = module_ns.get("id")
-wxr.wtp.add_page(f"{module_ns_local_name}:wiktextract cat tree",
-module_ns_id, LUA_CODE, model="Scribunto")
+module_ns: NamespaceDataEntry = wxr.wtp.NAMESPACE_DATA["Module"]
+wxr.wtp.add_page(
+f"{module_ns['name']}:wiktextract cat tree",
+module_ns["id"],
+LUA_CODE,
+model="Scribunto",
+)
wxr.wtp.start_page("Wiktextract category tree extraction")
rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")
ht: dict[str, CategoryEntry] = {}
for line in rawdata.split("\n"):
if not line:
continue
parts = line.split("@@")
+if len(parts) < 2:
+continue
name = parts[0]
desc = parts[1]
name = name.removeprefix("Category:")
Expand All @@ -127,7 +129,7 @@ def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
parent_name_lc = parent_name.lower()
parent_sort = parts[i + 1]
if parent_name_lc not in ht:
-p: CategoryEntry = {"name": parent_name}
+p: CategoryEntry = {"name": parent_name}
ht[parent_name_lc] = p
else:
p = ht[parent_name_lc]
Expand Down Expand Up @@ -157,7 +159,7 @@ def recurse(name: str) -> None:

notseen_set = set(x.lower() for x in ht.keys()) - seen - is_child
notseen = list(ht[x]["name"] for x in sorted(notseen_set))
-#if notseen:
+# if notseen:
# print("NOT SEEN:", "; ".join(notseen))

# Sort lists of children
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/wiktwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ def main():
extract_namespace(wxr, "Module", args.modules_file)
if args.templates_file:
extract_namespace(wxr, "Template", args.templates_file)
-if args.categories_file:
+if args.categories_file and args.dump_file_language_code == "en":
logger.info("Extracting category tree")
tree = extract_categories(wxr)
with open(args.categories_file, "w") as f:
Expand Down

0 comments on commit 0ee4b87

Please sign in to comment.