Batch UD evaluation script (explosion#3174)
* running UD eval

* printing timing of the tokenizer in tokens per second (see the timing sketch at the end of the page)

* timing of default English model

* structured output and parameterization to compare different runs

* additional flag to allow evaluation without parsing info (see the usage sketch below, before the diff)

* printing verbose log of errors for manual inspection

* printing over- and undersegmented cases (and combos)

* add under and oversegmented numbers to Score and structured output

* print high-freq over/under segmented words and word shapes

* printing examples as part of the structured output

* print the results to file

* batch run of different models and treebanks per language

* cleaning up code

* command-line script to process all languages in spaCy & UD

* heuristic to remove blinded corpora, and an option to run only the single best per language

* pathlib instead of os for file paths
svlandeg authored and honnibal committed Jan 27, 2019
1 parent 5a4737d commit 66016ac
Showing 2 changed files with 349 additions and 20 deletions.
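
The diff below adds a check_parse flag to both load_conllu() and evaluate(), so tokenization can be scored without dependency-parse information. As a rough usage sketch, not part of this commit, the snippet shows how the two functions could be called; the file names and the import path are placeholders and may need adjusting to your checkout.

import spacy.cli.ud.conll17_ud_eval as ud_eval  # placeholder import path

# Load gold and system analyses; check_parse=False skips the single-root sanity check.
with open("gold.conllu", "r", encoding="utf-8") as gold_file:
    gold_ud = ud_eval.load_conllu(gold_file, check_parse=False)
with open("system.conllu", "r", encoding="utf-8") as system_file:
    system_ud = ud_eval.load_conllu(system_file, check_parse=False)

# With check_parse=False only Tokens, Sentences, Words, Feats and Lemmas are scored.
scores = ud_eval.evaluate(gold_ud, system_ud, check_parse=False)
for name, score in scores.items():
    print("%-10s P=%.3f R=%.3f F1=%.3f" % (name, score.precision, score.recall, score.f1))

# The Tokens score also carries the new segmentation diagnostics added in this commit.
tokens = scores["Tokens"]
print("undersegmented: %.2f%% of gold tokens" % (100 * tokens.under_perc))
print("oversegmented:  %.2f%% of gold tokens" % (100 * tokens.over_perc))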
82 changes: 62 additions & 20 deletions spacy/cli/ud/conll17_ud_eval.py
@@ -51,7 +51,8 @@
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
# - raises UDError if the concatenated tokens of gold and system file do not match
# - returns a dictionary with the metrics described above, each metrics having
# three fields: precision, recall and f1
# four fields: precision, recall, f1 and aligned_accuracy (when using aligned
# words, otherwise this is None)

# Description of token matching
# -----------------------------
@@ -97,7 +98,7 @@ class UDError(Exception):
pass

# Load given CoNLL-U file into internal representation
def load_conllu(file):
def load_conllu(file, check_parse=True):
# Internal representation classes
class UDRepresentation:
def __init__(self):
Expand Down Expand Up @@ -181,8 +182,9 @@ def process_word(word):
process_word(word)

# Check there is a single root node
if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
raise UDError("There are multiple roots in a sentence")
if check_parse:
if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
raise UDError("There are multiple roots in a sentence")

# End the sentence
ud.sentences[-1].end = index
@@ -198,7 +200,7 @@ def process_word(word):
if "." in columns[ID]:
continue

# Delete spaces from FORM so gold.characters == system.characters
# even if one of them tokenizes the space.
columns[FORM] = columns[FORM].replace(" ", "")
if not columns[FORM]:
@@ -247,13 +249,17 @@ def process_word(word):
return ud

# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, deprel_weights=None):
def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True):
class Score:
def __init__(self, gold_total, system_total, correct, aligned_total=None):
def __init__(self, gold_total, system_total, correct, aligned_total=None, undersegmented=None, oversegmented=None):
self.precision = correct / system_total if system_total else 0.0
self.recall = correct / gold_total if gold_total else 0.0
self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total
self.undersegmented = undersegmented
self.oversegmented = oversegmented
self.under_perc = len(undersegmented) / gold_total if gold_total and undersegmented else 0.0
self.over_perc = len(oversegmented) / gold_total if gold_total and oversegmented else 0.0
class AlignmentWord:
def __init__(self, gold_word, system_word):
self.gold_word = gold_word
@@ -286,17 +292,43 @@ def lower(text):

def spans_score(gold_spans, system_spans):
correct, gi, si = 0, 0, 0
undersegmented = list()
oversegmented = list()
combo = 0
previous_end_si_earlier = False
previous_end_gi_earlier = False
while gi < len(gold_spans) and si < len(system_spans):
previous_si = system_spans[si-1] if si > 0 else None
previous_gi = gold_spans[gi-1] if gi > 0 else None
if system_spans[si].start < gold_spans[gi].start:
# avoid counting the same mistake twice
if not previous_end_si_earlier:
combo += 1
oversegmented.append(str(previous_gi).strip())
si += 1
elif gold_spans[gi].start < system_spans[si].start:
# avoid counting the same mistake twice
if not previous_end_gi_earlier:
combo += 1
undersegmented.append(str(previous_si).strip())
gi += 1
else:
correct += gold_spans[gi].end == system_spans[si].end
if gold_spans[gi].end < system_spans[si].end:
undersegmented.append(str(system_spans[si]).strip())
previous_end_gi_earlier = True
previous_end_si_earlier = False
elif gold_spans[gi].end > system_spans[si].end:
oversegmented.append(str(gold_spans[gi]).strip())
previous_end_si_earlier = True
previous_end_gi_earlier = False
else:
previous_end_gi_earlier = False
previous_end_si_earlier = False
si += 1
gi += 1

return Score(len(gold_spans), len(system_spans), correct)
return Score(len(gold_spans), len(system_spans), correct, None, undersegmented, oversegmented)

def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
gold, system, aligned, correct = 0, 0, 0, 0
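
As a toy illustration (not taken from the code above), the sketch below shows what the new undersegmented/oversegmented bookkeeping in spans_score() is recording: a system token that swallows several gold tokens counts as undersegmentation, and a gold token split across several system tokens counts as oversegmentation. The spans are hypothetical (start, end) character offsets into the same text, and the helper is a simplified stand-in for the real alignment loop above, which additionally handles partial overlaps and the combo counter.

def classify_mismatches(gold_spans, system_spans, text):
    under, over = [], []
    for s_start, s_end in system_spans:
        # one system token covering several gold tokens -> undersegmented
        merged = [g for g in gold_spans if g[0] >= s_start and g[1] <= s_end]
        if len(merged) > 1:
            under.append(text[s_start:s_end])
    for g_start, g_end in gold_spans:
        # one gold token covered by several system tokens -> oversegmented
        split = [s for s in system_spans if s[0] >= g_start and s[1] <= g_end]
        if len(split) > 1:
            over.append(text[g_start:g_end])
    return under, over

text = "cannot stop in New York"
gold_spans = [(0, 3), (3, 6), (7, 11), (12, 14), (15, 23)]      # can | not | stop | in | New York
system_spans = [(0, 6), (7, 11), (12, 14), (15, 18), (19, 23)]  # cannot | stop | in | New | York
print(classify_mismatches(gold_spans, system_spans, text))
# -> (['cannot'], ['New York'])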
@@ -425,18 +457,28 @@ def align_words(gold_words, system_words):
alignment = align_words(gold_ud.words, system_ud.words)

# Compute the F1-scores
result = {
"Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
"Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
"Words": alignment_score(alignment, None),
"UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
"XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
"Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
"AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
"Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
"UAS": alignment_score(alignment, lambda w, parent: parent),
"LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
}
if check_parse:
result = {
"Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
"Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
"Words": alignment_score(alignment, None),
"UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
"XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
"Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
"AllTags": alignment_score(alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
"Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
"UAS": alignment_score(alignment, lambda w, parent: parent),
"LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
}
else:
result = {
"Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
"Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
"Words": alignment_score(alignment, None),
"Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
"Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
}


# Add WeightedLAS if weights are given
if deprel_weights is not None:
[remainder of conll17_ud_eval.py and the diff of the second changed file not shown]
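
The commit message also mentions timing the tokenizer in tokens per second; that code lives in the batch script, whose diff is not shown above. Below is a minimal sketch of such a measurement, assuming a stock spaCy pipeline; the model name and input file are placeholders.

import time
import spacy

nlp = spacy.load("en_core_web_sm")  # placeholder model name
with open("input.txt", "r", encoding="utf-8") as f:
    texts = f.read().splitlines()

start = time.time()
n_tokens = 0
for text in texts:
    doc = nlp.tokenizer(text)  # run the tokenizer only, no tagger/parser
    n_tokens += len(doc)
elapsed = time.time() - start
print("%.0f tokens per second" % (n_tokens / max(elapsed, 1e-9)))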
