Skip to content

Commit

Permalink
feat: penalize subordinate clauses
Browse files Browse the repository at this point in the history
  • Loading branch information
Natalie-T-E authored and gremid committed Feb 10, 2025
1 parent 609bd74 commit 12e4208
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
python-version: "3.10"
- name: Install dependencies
run: |
pip install -U pip
pip install -U pip setuptools wheel
pip install '.[dev]'
- name: Run unit tests
run: |
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@ Among the gradual criteria are
* the absence of rare characters or those normally not available on a keyboard,
* the absence of named entities,
* the absence of deictic expressions,
* an optimal length of the sentence, and
* a whitelist-based coverage test, i. e. for penalizing usage of rare lemmata.
* an optimal length of the sentence,
* a whitelist-based coverage test, i.e. a penalty for the usage of rare lemmata, and
* the absence of subordinate clauses / the headword being part of a main clause.

## Installation

Expand Down
34 changes: 34 additions & 0 deletions gdex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class SentenceScorer:
is_misparsed: Callable[[Span], bool] = lambda sent: False
is_deixis: Callable[[Token], bool] = lambda token: False
num_entities: Callable[[Span], int] = lambda sent: 0
is_hypotactic: Callable[[Span], bool] = lambda sent: False
hit_in_subordinate_clause: Callable[[Span], bool] = lambda sent: False

whitelist: Set[str] = field(default_factory=set, repr=False)
blacklist: Set[str] = field(default_factory=set, repr=False)
Expand All @@ -40,6 +42,7 @@ class SentenceScorer:
penalty_rare_char: float = 0.125
penalty_named_entity: float = 0.1667
penalty_deixis: float = 0.034
penalty_hypotaxis: float = 0.067 # doubled if hit in a subordinate clause

def score_sentence(self, sent: Span, headword=None) -> float:
score = 0.5 * self.has_no_knockout_criterion(sent)
Expand Down Expand Up @@ -76,6 +79,8 @@ def factor_gradual_criteria(self, sent, headword):
factor *= self.factor_optimal_interval(sent)
if self.penalty_deixis is not None:
factor *= self.factor_deixis(sent, headword)
if self.penalty_hypotaxis is not None:
factor *= self.factor_hypotaxis(sent)
return factor

def has_illegal_chars(self, sent: Span):
Expand Down Expand Up @@ -140,6 +145,14 @@ def factor_optimal_interval(self, sent: Span) -> float:
def factor_deixis(self, sent: Span, headword: str):
return self.factor_tokens(sent, headword, self.is_deixis, self.penalty_deixis)

def factor_hypotaxis(self, sent: Span) -> float:
    """Return the hypotaxis factor for *sent* (1.0 minus any penalty, floored at 0.0).

    A sentence containing a subordinate clause loses ``penalty_hypotaxis``;
    if the headword hit itself lies inside a subordinate clause (which
    implies hypotaxis), the deduction is doubled.
    """
    if self.hit_in_subordinate_clause(sent):  # implies hypotaxis
        deduction = 2 * self.penalty_hypotaxis
    elif self.is_hypotactic(sent):
        deduction = self.penalty_hypotaxis
    else:
        deduction = 0.0
    return max(0.0, 1.0 - deduction)


def _de_has_finite_verb_and_subject(sent: Span) -> bool:
for root in sent:
Expand Down Expand Up @@ -232,6 +245,25 @@ def _de_hdt_num_entities(sent: Span):
return sum((1 for t in sent if t.pos_ == "PROPN"))


# aim: identify sentences with VL subordinate clauses
# (clause-level dependency labels that attach a subordinate clause)
_DE_HDT_HYPO_DEPS = {"acl", "advcl", "ccomp", "csubj"}


def _de_hdt_is_hypotactic(sent: Span) -> bool:
    """Return True if *sent* contains at least one subordinate clause.

    A token whose dependency label is in ``_DE_HDT_HYPO_DEPS`` heads a
    subordinate clause.  Short-circuits on the first such token instead
    of materializing the set of all labels in the sentence first.
    """
    return any(token.dep_ in _DE_HDT_HYPO_DEPS for token in sent)


def _de_hdt_hit_in_subordinate_clause(sent: Span) -> bool:
    """Return True if a headword hit lies within a subordinate clause.

    Walks the tokens that head a subordinate clause (dependency label in
    ``_DE_HDT_HYPO_DEPS``) and reports whether any token in such a
    clause's subtree carries the ``is_hit`` extension flag.
    """
    clause_heads = (t for t in sent if t.dep_ in _DE_HDT_HYPO_DEPS)
    return any(
        descendant._.is_hit
        for head in clause_heads
        for descendant in head.subtree
    )


_QWERTZ_DE = set(
(
"^1234567890ß'qwertzuiopü+asdfghjklöä#<yxcvbnm,.-°!\"§$%&/()=?`"
Expand Down Expand Up @@ -273,6 +305,8 @@ def _de_hdt_num_entities(sent: Span):
keyboard_chars=_QWERTZ_DE,
whitelist=_de_whitelist,
blacklist=_de_vulger_blacklist,
is_hypotactic=_de_hdt_is_hypotactic,
hit_in_subordinate_clause=_de_hdt_hit_in_subordinate_clause,
)

__all__ = ["SentenceScorer", "de_core", "de_hdt"]
10 changes: 7 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ classifiers = [
"Topic :: Text Processing :: Linguistic"
]
requires-python = ">=3.7"
dependencies = ["spacy>=3.7"]
dependencies = ["spacy==3.7.5"]
dynamic = ["readme", "version"]

[project.optional-dependencies]
Expand All @@ -33,8 +33,7 @@ dev = [
"mypy",
"pre-commit",
"pytest",
"de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl#sha256=d88c737eb7eb766f730f6a2dcb99dfcdb81623e1e0d89a9c638a2182ac19c52e",
"de_hdt_lg @ https://huggingface.co/zentrum-lexikographie/de_hdt_lg/resolve/main/de_hdt_lg-any-py3-none-any.whl#sha256=44bd0b0299865341ee1756efd60670fa148dbfd2a14d0c1d5ab99c61af08236a"
"spacy-transformers",
]

[project.urls]
Expand All @@ -46,6 +45,11 @@ omit = ["tests/**/*.py"]
[tool.isort]
profile = "black"

[tool.pytest.ini_options]
filterwarnings = [
"ignore::FutureWarning",
]

[tool.setuptools.dynamic]
readme = {file = ["README.md"], content-type = "text/markdown"}
version = {attr = "gdex.version.__version__"}
Expand Down
68 changes: 65 additions & 3 deletions tests/test_gdex.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,37 @@
import subprocess

import spacy

import gdex

de_core_nlp = spacy.load("de_core_news_sm")
de_hdt_nlp = spacy.load("de_hdt_lg")
spacy.tokens.Token.set_extension("is_hit", default=False)

spacy_model_packages = {
"de_core_news_sm": (
"de-core-news-sm @ https://github.com/explosion/spacy-models/"
"releases/download/de_core_news_sm-3.7.0/"
"de_core_news_sm-3.7.0-py3-none-any.whl"
"#sha256=d88c737eb7eb766f730f6a2dcb99dfcdb81623e1e0d89a9c638a2182ac19c52e"
),
"de_hdt_dist": (
"de_hdt_dist @ https://huggingface.co/zentrum-lexikographie/de_hdt_dist/"
"resolve/main/de_hdt_dist-any-py3-none-any.whl"
"#sha256=dd54e4f75b249d401ed664c406c1a021ee6733bca7c701eb4500480d473a1a8a"
),
}


def spacy_model(model):
    """Load the spaCy pipeline *model*, installing its wheel on demand.

    Tries ``spacy.load`` first; if the model package is not installed
    (spaCy raises ``OSError``), installs the pinned wheel listed in
    ``spacy_model_packages`` and retries the load.
    """
    import sys  # local: only needed on the install fallback path

    try:
        return spacy.load(model)
    except OSError:
        # Fail loudly if there is no pinned wheel for this model name.
        assert model in spacy_model_packages, model
        # Invoke pip via the *running* interpreter so the wheel lands in
        # the same environment the tests execute in (a bare "pip" on
        # PATH may belong to a different Python).
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-qqq", spacy_model_packages[model]]
        )
        return spacy.load(model)


de_core_nlp = spacy_model("de_core_news_sm")
de_hdt_nlp = spacy_model("de_hdt_dist")


def scores(s):
Expand Down Expand Up @@ -44,6 +72,40 @@ def test_scoring():
assert sent._.gdex >= 0.0 and sent._.gdex <= 1.0


def test_hypotaxis_hdt():
    # Every sample sentence is hypotactic, so factor_hypotaxis must
    # penalize all of them.  Where a headword is given, its hit lies in
    # a subordinate clause and the penalty is doubled (exact factor);
    # otherwise we only require *some* penalty (factor < 1.0).
    samples = [
        ("Haus", "Leider verpasste sie den Anruf, weil sie später nach Hause kam."),
        (None, "Herbstspaziergänge sind besonders schön, wenn es nicht regnet."),
        (
            None,
            (
                "Der uralte Baum, der schon damals hier stand, "
                "gehörte fest in das Stadtbild."
            ),
        ),
        (
            None,
            (
                "Um besser sehen zu können, braucht man eventuell eine Brille, "
                "auch Sehhilfe genannt."
            ),
        ),
    ]
    scorer = gdex.de_hdt
    for hit_lemma, text in samples:
        doc = de_hdt_nlp(text)
        if hit_lemma:
            for token in doc:
                if token.lemma_ == hit_lemma:
                    token._.is_hit = True
        for sentence in doc.sents:
            factor = scorer.factor_hypotaxis(sentence)
            if hit_lemma:
                assert factor == 1.0 - 2 * scorer.penalty_hypotaxis
            else:
                assert factor < 1.0


def test_illegal_chars():
assert_knockout("Das ist ein Satz mit unzulässigen Zeichen [1].")
assert_knockout("Gleiches gilt für diesen Satz mit [email protected].")
Expand Down Expand Up @@ -73,7 +135,7 @@ def test_finite_verb_and_subject():

def test_rarechars():
factor_method = gdex.SentenceScorer.factor_rarechars
assert_penalty(factor_method, "1. Aufzählungen und Zahlen mögen wir nicht.")
assert_penalty(factor_method, "1. Aufzählungen und Zahlen wie 3 mögen wir nicht.")
assert_penalty(factor_method, "Worte in Klammern (Paranthese) sind schlecht.")


Expand Down

0 comments on commit 12e4208

Please sign in to comment.