Skip to content

Commit

Permalink
feat: penalize subordinate clauses
Browse files Browse the repository at this point in the history
  • Loading branch information
Natalie-T-E authored and gremid committed Feb 10, 2025
1 parent 609bd74 commit 12e4208
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
pip install -U pip
pip install -U pip setuptools wheel
pip install '.[dev]'
- name: Run unit tests
run: |
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ Among the gradual criteria are
* the absence of rare characters or those normally not available on a keyboard,
* the absence of named entities,
* the absence of deictic expressions,
* an optimal length of the sentence, and
* a whitelist-based coverage test, i. e. for penalizing usage of rare lemmata.
* an optimal length of the sentence,
* a whitelist-based coverage test, i.e. a penalty for the usage of rare lemmata, and
* the absence of subordinate clauses / the headword being part of a main clause.

## Installation

Expand Down
34 changes: 34 additions & 0 deletions gdex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class SentenceScorer:
is_misparsed: Callable[[Span], bool] = lambda sent: False
is_deixis: Callable[[Token], bool] = lambda token: False
num_entities: Callable[[Span], int] = lambda sent: 0
is_hypotactic: Callable[[Span], bool] = lambda sent: False
hit_in_subordinate_clause: Callable[[Span], bool] = lambda sent: False

whitelist: Set[str] = field(default_factory=set, repr=False)
blacklist: Set[str] = field(default_factory=set, repr=False)
Expand All @@ -40,6 +42,7 @@ class SentenceScorer:
penalty_rare_char: float = 0.125
penalty_named_entity: float = 0.1667
penalty_deixis: float = 0.034
penalty_hypotaxis: float = 0.067 # doubled if hit in a subordinate clause

def score_sentence(self, sent: Span, headword=None) -> float:
score = 0.5 * self.has_no_knockout_criterion(sent)
Expand Down Expand Up @@ -76,6 +79,8 @@ def factor_gradual_criteria(self, sent, headword):
factor *= self.factor_optimal_interval(sent)
if self.penalty_deixis is not None:
factor *= self.factor_deixis(sent, headword)
if self.penalty_hypotaxis is not None:
factor *= self.factor_hypotaxis(sent)
return factor

def has_illegal_chars(self, sent: Span):
Expand Down Expand Up @@ -140,6 +145,14 @@ def factor_optimal_interval(self, sent: Span) -> float:
def factor_deixis(self, sent: Span, headword: str):
return self.factor_tokens(sent, headword, self.is_deixis, self.penalty_deixis)

def factor_hypotaxis(self, sent: Span) -> float:
    """Return the hypotaxis factor for *sent* (1.0 minus any penalty, floored at 0.0).

    A sentence containing a subordinate clause loses ``penalty_hypotaxis``;
    if the headword hit itself lies inside a subordinate clause (which
    implies hypotaxis), the deduction is doubled.
    """
    if self.hit_in_subordinate_clause(sent):  # implies hypotaxis
        deduction = 2 * self.penalty_hypotaxis
    elif self.is_hypotactic(sent):
        deduction = self.penalty_hypotaxis
    else:
        deduction = 0.0
    return max(0.0, 1.0 - deduction)


def _de_has_finite_verb_and_subject(sent: Span) -> bool:
for root in sent:
Expand Down Expand Up @@ -232,6 +245,25 @@ def _de_hdt_num_entities(sent: Span):
return sum((1 for t in sent if t.pos_ == "PROPN"))


# aim: identify sentences with VL subordinate clauses
# (clause-level dependency labels that attach a subordinate clause)
_DE_HDT_HYPO_DEPS = {"acl", "advcl", "ccomp", "csubj"}


def _de_hdt_is_hypotactic(sent: Span) -> bool:
    """Return True if *sent* contains at least one subordinate clause.

    A token whose dependency label is in ``_DE_HDT_HYPO_DEPS`` heads a
    subordinate clause.  Short-circuits on the first such token instead
    of materializing the set of all labels in the sentence first.
    """
    return any(token.dep_ in _DE_HDT_HYPO_DEPS for token in sent)


def _de_hdt_hit_in_subordinate_clause(sent: Span) -> bool:
    """Return True if a headword hit lies within a subordinate clause.

    Walks the tokens that head a subordinate clause (dependency label in
    ``_DE_HDT_HYPO_DEPS``) and reports whether any token in such a
    clause's subtree carries the ``is_hit`` extension flag.
    """
    clause_heads = (t for t in sent if t.dep_ in _DE_HDT_HYPO_DEPS)
    return any(
        descendant._.is_hit
        for head in clause_heads
        for descendant in head.subtree
    )


_QWERTZ_DE = set(
(
"^1234567890ß'qwertzuiopü+asdfghjklöä#<yxcvbnm,.-°!\"§$%&/()=?`"
Expand Down Expand Up @@ -273,6 +305,8 @@ def _de_hdt_num_entities(sent: Span):
keyboard_chars=_QWERTZ_DE,
whitelist=_de_whitelist,
blacklist=_de_vulger_blacklist,
is_hypotactic=_de_hdt_is_hypotactic,
hit_in_subordinate_clause=_de_hdt_hit_in_subordinate_clause,
)

__all__ = ["SentenceScorer", "de_core", "de_hdt"]
10 changes: 7 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ classifiers = [
"Topic :: Text Processing :: Linguistic"
]
requires-python = ">=3.7"
dependencies = ["spacy>=3.7"]
dependencies = ["spacy==3.7.5"]
dynamic = ["readme", "version"]

[project.optional-dependencies]
Expand All @@ -33,8 +33,7 @@ dev = [
"mypy",
"pre-commit",
"pytest",
"de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl#sha256=d88c737eb7eb766f730f6a2dcb99dfcdb81623e1e0d89a9c638a2182ac19c52e",
"de_hdt_lg @ https://huggingface.co/zentrum-lexikographie/de_hdt_lg/resolve/main/de_hdt_lg-any-py3-none-any.whl#sha256=44bd0b0299865341ee1756efd60670fa148dbfd2a14d0c1d5ab99c61af08236a"
"spacy-transformers",
]

[project.urls]
Expand All @@ -46,6 +45,11 @@ omit = ["tests/**/*.py"]
[tool.isort]
profile = "black"

[tool.pytest.ini_options]
filterwarnings = [
"ignore::FutureWarning",
]

[tool.setuptools.dynamic]
readme = {file = ["README.md"], content-type = "text/markdown"}
version = {attr = "gdex.version.__version__"}
Expand Down
68 changes: 65 additions & 3 deletions tests/test_gdex.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,37 @@
import subprocess

import spacy

import gdex

de_core_nlp = spacy.load("de_core_news_sm")
de_hdt_nlp = spacy.load("de_hdt_lg")
spacy.tokens.Token.set_extension("is_hit", default=False)

spacy_model_packages = {
"de_core_news_sm": (
"de-core-news-sm @ https://github.com/explosion/spacy-models/"
"releases/download/de_core_news_sm-3.7.0/"
"de_core_news_sm-3.7.0-py3-none-any.whl"
"#sha256=d88c737eb7eb766f730f6a2dcb99dfcdb81623e1e0d89a9c638a2182ac19c52e"
),
"de_hdt_dist": (
"de_hdt_dist @ https://huggingface.co/zentrum-lexikographie/de_hdt_dist/"
"resolve/main/de_hdt_dist-any-py3-none-any.whl"
"#sha256=dd54e4f75b249d401ed664c406c1a021ee6733bca7c701eb4500480d473a1a8a"
),
}


def spacy_model(model):
    """Load the spaCy pipeline *model*, installing its wheel on demand.

    Tries ``spacy.load`` first; if the model package is not installed
    (spaCy raises ``OSError``), installs the pinned wheel listed in
    ``spacy_model_packages`` and retries the load.
    """
    import sys  # local: only needed on the install fallback path

    try:
        return spacy.load(model)
    except OSError:
        # Fail loudly if there is no pinned wheel for this model name.
        assert model in spacy_model_packages, model
        # Invoke pip via the *running* interpreter so the wheel lands in
        # the same environment the tests execute in (a bare "pip" on
        # PATH may belong to a different Python).
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-qqq", spacy_model_packages[model]]
        )
        return spacy.load(model)


de_core_nlp = spacy_model("de_core_news_sm")
de_hdt_nlp = spacy_model("de_hdt_dist")


def scores(s):
Expand Down Expand Up @@ -44,6 +72,40 @@ def test_scoring():
assert sent._.gdex >= 0.0 and sent._.gdex <= 1.0


def test_hypotaxis_hdt():
    # Every sample sentence is hypotactic, so factor_hypotaxis must
    # penalize all of them.  Where a headword is given, its hit lies in
    # a subordinate clause and the penalty is doubled (exact factor);
    # otherwise we only require *some* penalty (factor < 1.0).
    samples = [
        ("Haus", "Leider verpasste sie den Anruf, weil sie später nach Hause kam."),
        (None, "Herbstspaziergänge sind besonders schön, wenn es nicht regnet."),
        (
            None,
            (
                "Der uralte Baum, der schon damals hier stand, "
                "gehörte fest in das Stadtbild."
            ),
        ),
        (
            None,
            (
                "Um besser sehen zu können, braucht man eventuell eine Brille, "
                "auch Sehhilfe genannt."
            ),
        ),
    ]
    scorer = gdex.de_hdt
    for hit_lemma, text in samples:
        doc = de_hdt_nlp(text)
        if hit_lemma:
            for token in doc:
                if token.lemma_ == hit_lemma:
                    token._.is_hit = True
        for sentence in doc.sents:
            factor = scorer.factor_hypotaxis(sentence)
            if hit_lemma:
                assert factor == 1.0 - 2 * scorer.penalty_hypotaxis
            else:
                assert factor < 1.0


def test_illegal_chars():
assert_knockout("Das ist ein Satz mit unzulässigen Zeichen [1].")
assert_knockout("Gleiches gilt für diesen Satz mit [email protected].")
Expand Down Expand Up @@ -73,7 +135,7 @@ def test_finite_verb_and_subject():

def test_rarechars():
factor_method = gdex.SentenceScorer.factor_rarechars
assert_penalty(factor_method, "1. Aufzählungen und Zahlen mögen wir nicht.")
assert_penalty(factor_method, "1. Aufzählungen und Zahlen wie 3 mögen wir nicht.")
assert_penalty(factor_method, "Worte in Klammern (Paranthese) sind schlecht.")


Expand Down

0 comments on commit 12e4208

Please sign in to comment.