From 546b6b495404f07dba99101cac9aa2b4a69a4290 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 25 Feb 2025 16:08:09 +0800 Subject: [PATCH 1/2] [id] add more section templates --- src/wiktextract/data/overrides/id.json | 112 +++++++++++++++++- .../extractor/id/analyze_template.py | 7 +- .../extractor/id/section_titles.py | 9 ++ 3 files changed, 125 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/data/overrides/id.json b/src/wiktextract/data/overrides/id.json index 783b519b..a68829a1 100644 --- a/src/wiktextract/data/overrides/id.json +++ b/src/wiktextract/data/overrides/id.json @@ -9,7 +9,7 @@ "need_pre_expand": true }, "Templat:-syn-": { - "body": "====Sinonim====\n", + "body": "====Sinonim====\n{{#ifeq:{{{1|}}}|id||{{-syn2-|{{{1|}}}|{{{2|}}}|{{{3|}}}|{{{4|}}}|{{{5|}}}|{{{6|}}}|{{{7|}}}|{{{8|}}}|{{{9|}}}|{{{10|}}}|{{{11|}}}|{{{12|}}}|{{{13|}}}|{{{14|}}}|{{{15|}}}|{{{16|}}}|{{{17|}}}|{{{18|}}}|{{{19|}}}|{{{20|}}}|{{{21|}}}|{{{22|}}}|{{{23|}}}|{{{24|}}}|{{{25|}}}|{{{26|}}}|{{{27|}}}|{{{28|}}}|{{{29|}}}|{{{30|}}}}}}}\n", "namespace_id": 10, "need_pre_expand": true }, @@ -23,11 +23,21 @@ "namespace_id": 10, "need_pre_expand": true }, + "Templat:-drv-multi-": { + "body": "====Kata turunan====\n", + "namespace_id": 10, + "need_pre_expand": true + }, "Templat:-frasa-": { "body": "====Frasa dan kata majemuk====\n", "namespace_id": 10, "need_pre_expand": true }, + "Templat:-frasa2-": { + "body": "====Frasa dan kata majemuk====\n", + "namespace_id": 10, + "need_pre_expand": true + }, "Templat:-lihat-": { "body": "====Lihat pula====\n", "namespace_id": 10, @@ -47,5 +57,105 @@ "body": "====Terjemahan====\n", "namespace_id": 10, "need_pre_expand": true + }, + "Templat:-ref-": { + "body": "====Referensi====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-akar-": { + "body": "===Akar kata===\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-akhir-": { + "body": "===Akhiran===[[Kategori:{{{1|id}}}:Akhiran]]\n", + "namespace_id": 
10, + "need_pre_expand": true + }, + "Templat:-alt-": { + "body": "====Alternatif====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-awal-": { + "body": "===Awalan===[[Kategori:{{{1|id}}}:Awalan]]\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-cog-": { + "body": "====Kognat====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-desc-": { + "body": "====Turunan====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-huruf-": { + "body": "===Huruf===\n''huruf kecil'' [[{{lc:{{PAGENAME}}}}]]; ''huruf besar'' [[{{uc:{{PAGENAME}}}}]][[Kategori:Huruf]]\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-inisialisme-": { + "body": "===Inisialisme===[[Kategori:{{{1|id}}}:Inisialisme]]\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-keluarga-": { + "body": "====Keluarga kata====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-kiasan-": { + "body": "====Kiasan====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-kontraksi-": { + "body": "===Kontraksi===[[Kategori:{{{1}}}:Kontraksi]]\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-lafal-": { + "body": "====Pelafalan====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-noun-": { + "body": "===Nomina===[[Kategori:{{{1|id}}}:{{nom}}]]\n'''{{PAGENAME}}''' (''plural'' '''[[{{PAGENAME}}-{{PAGENAME}}]]''', ''posesif orang pertama'' '''{{PAGENAME}}[[-ku|ku]]''', ''posesif orang kedua'' '''{{PAGENAME}}[[-mu|mu]]''', ''posesif orang ketiga'' '''{{PAGENAME}}[[-nya|nya]]''')\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-peribahasa-": { + "body": "====Peribahasa====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-singkatan-": { + "body": "===Singkatan===[[Kategori:{{{1}}}:Singkatan]]\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-syn-multi-": { + "body": "====Sinonim====\n", + 
"namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-ucap-": { + "body": "====Pengucapan====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:-var-": { + "body": "====Variasi====\n", + "namespace_id": 10, + "need_pre_expand": true + }, + "Templat:=drv=": { + "body": "====Frasa turunan====\n", + "namespace_id": 10, + "need_pre_expand": true } } diff --git a/src/wiktextract/extractor/id/analyze_template.py b/src/wiktextract/extractor/id/analyze_template.py index ccd8450a..43d4c9fb 100644 --- a/src/wiktextract/extractor/id/analyze_template.py +++ b/src/wiktextract/extractor/id/analyze_template.py @@ -2,6 +2,7 @@ # https://id.wiktionary.org/wiki/Wikikamus:Penjelasan_tataletak_entri # https://id.wiktionary.org/wiki/Kategori:Templat_kelas_kata +# https://id.wiktionary.org/wiki/Kategori:Templat_umum SECTION_TITLE_TEMPLATES = { "Templat:-adj-", "Templat:-adv-", @@ -33,8 +34,8 @@ "Templat:=pron=", "Templat:=pronom=", "Templat:=verb=", - "Templat:nomina -nya", "Templat:ulang", + "Templat:-sdd-", } @@ -48,6 +49,8 @@ def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]: "Templat:sisipan ", "Templat:ulang ", "Templat:verba ", + "Templat:nomina ", ) - ), + ) + or page.title.endswith(("proper noun", "-nm")), ) diff --git a/src/wiktextract/extractor/id/section_titles.py b/src/wiktextract/extractor/id/section_titles.py index 4cdd971a..9192fb7c 100644 --- a/src/wiktextract/extractor/id/section_titles.py +++ b/src/wiktextract/extractor/id/section_titles.py @@ -1,4 +1,5 @@ # https://id.wiktionary.org/wiki/Kategori:Templat_kelas_kata +# https://id.wiktionary.org/wiki/Kategori:Templat_umum POS_DATA = { "Adjektiva": {"pos": "adj"}, "Adverbia": {"pos": "adv"}, @@ -16,4 +17,12 @@ "Pronomina": {"pos": "pron"}, "Subjungsi": {"pos": "conj"}, "Verba": {"pos": "verb"}, + "Akar kata": {"pos": "root", "tags": ["morpheme"]}, + "Akhiran": {"pos": "suffix", "tags": ["morpheme"]}, + "Huruf": {"pos": "character", "tags": ["letter"]}, + "Inisialisme": {"pos": 
def extract_etymology_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the etymology text of one section into ``word_entry``.

    Each item of every list that is a direct child of ``level_node`` is
    cleaned with ``clean_node`` and, when the result is non-empty, stored
    as a separate etymology text. When the lists of *this* section yield
    nothing, the section's remaining children are cleaned as one string
    and stored instead.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``etymology_texts`` list is extended in
            place; it is also handed to ``clean_node``.
        level_node: The etymology section node to read.
    """
    # Gather the texts of this section locally so that the fallback below
    # depends on what this section produced, not on texts the entry may
    # already carry from an earlier etymology section.
    section_texts: list[str] = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            e_str = clean_node(wxr, word_entry, list_item.children)
            if e_str != "":
                section_texts.append(e_str)

    if not section_texts:
        # No usable list items: fall back to the section body.
        # NOTE(review): invert_find_child(LEVEL_KIND_FLAGS) presumably
        # yields every child that is not a nested sub-section - confirm
        # against wikitextprocessor.
        e_str = clean_node(
            wxr,
            word_entry,
            list(level_node.invert_find_child(LEVEL_KIND_FLAGS)),
        )
        if e_str != "":
            section_texts.append(e_str)

    word_entry.etymology_texts.extend(section_texts)
list[str] = [] + etymology_texts: list[str] = [] diff --git a/src/wiktextract/extractor/id/page.py b/src/wiktextract/extractor/id/page.py index 7a473ade..7a1ce4dd 100644 --- a/src/wiktextract/extractor/id/page.py +++ b/src/wiktextract/extractor/id/page.py @@ -5,6 +5,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext +from .etymology import extract_etymology_section from .models import Sense, WordEntry from .pos import extract_pos_section from .section_titles import POS_DATA @@ -20,6 +21,10 @@ def parse_section( wxr.wtp.start_subsection(title_text) if title_text in POS_DATA: extract_pos_section(wxr, page_data, base_data, level_node, title_text) + elif title_text == "Etimologi": + extract_etymology_section( + wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node + ) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level)