Merge pull request #1020 from xxyzz/en

[en] fix Lua error and `IndexError` exception in categories.py
tatuylonen · Feb 5, 2025 · 0ee4b87
2 parents 7f3aeb6 + a17cde8
commit 0ee4b87
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 25 deletions.
diff --git a/src/wiktextract/categories.py b/src/wiktextract/categories.py
@@ -2,10 +2,7 @@
 #
 # Copyright (c) 2021 Tatu Ylonen.  See file LICENSE and https://ylonen.org
 
-from typing import (
-    Optional,
-    TypedDict,
-)
+from typing import TypedDict
 
 from wikitextprocessor.core import NamespaceDataEntry
 
@@ -41,17 +38,19 @@
     print( k..": "..desc )
     desc = string.gsub(desc, "\n", "\\n")
     table.insert(parts, k .. "@@" .. desc)
-    for kk, vv in pairs(v.parents) do
-      local name
-      local sort = ""
-      if type(vv) == "table" then
-        name = vv.name
-        sort = vv.sort or ""
-      else
-        name = vv
-      end
-      if name then
-        table.insert(parts, "@@" .. name .. "@@" .. sort)
+    if type(v.parents) == "table" then
+      for kk, vv in pairs(v.parents) do
+        local name
+        local sort = ""
+        if type(vv) == "table" then
+          name = vv.name
+          sort = vv.sort or ""
+        else
+          name = vv
+        end
+        if name then
+          table.insert(parts, "@@" .. name .. "@@" .. sort)
+        end
       end
     end
     table.insert(parts, "\n")
@@ -93,22 +92,25 @@
     total=False,
 )
 
+
 def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
     """Extracts the category tree from Wiktionary."""
-    module_ns: Optional[NamespaceDataEntry] = wxr.wtp.NAMESPACE_DATA.get(
-                                                            "Module", None)
-    assert module_ns is not None
-    module_ns_local_name = module_ns.get("name")
-    module_ns_id = module_ns.get("id")
-    wxr.wtp.add_page(f"{module_ns_local_name}:wiktextract cat tree",
-                 module_ns_id, LUA_CODE, model="Scribunto")
+    module_ns: NamespaceDataEntry = wxr.wtp.NAMESPACE_DATA["Module"]
+    wxr.wtp.add_page(
+        f"{module_ns['name']}:wiktextract cat tree",
+        module_ns["id"],
+        LUA_CODE,
+        model="Scribunto",
+    )
     wxr.wtp.start_page("Wiktextract category tree extraction")
     rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")
     ht: dict[str, CategoryEntry] = {}
     for line in rawdata.split("\n"):
         if not line:
             continue
         parts = line.split("@@")
+        if len(parts) < 2:
+            continue
         name = parts[0]
         desc = parts[1]
         name = name.removeprefix("Category:")
@@ -127,7 +129,7 @@ def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
             parent_name_lc = parent_name.lower()
             parent_sort = parts[i + 1]
             if parent_name_lc not in ht:
-                p: CategoryEntry  = {"name": parent_name}
+                p: CategoryEntry = {"name": parent_name}
                 ht[parent_name_lc] = p
             else:
                 p = ht[parent_name_lc]
@@ -157,7 +159,7 @@ def recurse(name: str) -> None:
 
     notseen_set = set(x.lower() for x in ht.keys()) - seen - is_child
     notseen = list(ht[x]["name"] for x in sorted(notseen_set))
-    #if notseen:
+    # if notseen:
     #    print("NOT SEEN:", "; ".join(notseen))
 
     # Sort lists of children

diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py
@@ -502,7 +502,7 @@ def main():
         extract_namespace(wxr, "Module", args.modules_file)
     if args.templates_file:
         extract_namespace(wxr, "Template", args.templates_file)
-    if args.categories_file:
+    if args.categories_file and args.dump_file_language_code == "en":
         logger.info("Extracting category tree")
         tree = extract_categories(wxr)
         with open(args.categories_file, "w") as f: