Commit
Clean up of char classes, few tokenizer fixes and faster default French tokenizer (explosion#3293)

* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue explosion#3002 which now works

* partial fix for issue explosion#2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added passing unit test for issue explosion#2656

* Fix issue explosion#2822 with custom Italian exception

* Fix issue explosion#2926 by allowing numbers right before infix /

* remove duplicate

* remove xfail for Issue explosion#2179 fixed by Matt

* adjust documentation and remove reference to regex lib
svlandeg authored and ines committed Feb 20, 2019
1 parent 9696cf1 commit 9a478b6
Showing 15 changed files with 277 additions and 37 deletions.
164 changes: 146 additions & 18 deletions spacy/lang/char_classes.py

Large diffs are not rendered by default.

9 changes: 0 additions & 9 deletions spacy/lang/fr/_tokenizer_exceptions_list.py
@@ -7076,14 +7076,6 @@
"au-lof",
"au-tour",
"aube-vigne",
"audio-numérique",
"audio-numériques",
"audio-prothésiste",
"audio-prothésistes",
"audio-visuel",
"audio-visuelle",
"audio-visuelles",
"audio-visuels",
"aujourd'hui",
"aulnaie-frênaie",
"aulnaies-frênaies",
@@ -14400,7 +14392,6 @@
"attaques surprises",
"attaques-surprises",
"attrape-con",
"audio-oral",
"auriculo-cardiaque",
"auriculo-temporal",
"austro-bavarois",
6 changes: 5 additions & 1 deletion spacy/lang/fr/tokenizer_exceptions.py
@@ -3,12 +3,15 @@

import re

from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA, TAG

# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]


def upper_first_letter(text):
if len(text) == 0:
@@ -128,6 +131,7 @@ def lower_first_letter(text):
"arcs?",
"archi",
"arrières?",
"audio",
"avant",
"avion",
"auto",
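The hunk above is the heart of the "faster default French tokenizer" change: the large generated exception list is no longer loaded by default, and only the two "aujourd'hui" entries are kept. Below is a minimal sketch (not part of this commit) of how a user could restore the full list at runtime, assuming spaCy v2.x and the module layout shown above.

```python
# Hedged sketch: re-enable the full French exception list, trading tokenizer
# construction speed for coverage. FR_BASE_EXCEPTIONS is the generated list
# that this commit stops loading by default.
from spacy.lang.fr import French
from spacy.lang.fr._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from spacy.symbols import ORTH

nlp = French()
for expression in FR_BASE_EXCEPTIONS:
    # Register each expression as a special case so it stays a single token.
    nlp.tokenizer.add_special_case(expression, [{ORTH: expression}])

doc = nlp("Une aulnaie-frênaie borde la rivière.")
print([t.text for t in doc])
```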
3 changes: 2 additions & 1 deletion spacy/lang/it/__init__.py
@@ -4,6 +4,7 @@
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -20,7 +21,7 @@ class ItalianDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
tag_map = TAG_MAP
9 changes: 9 additions & 0 deletions spacy/lang/it/tokenizer_exceptions.py
@@ -0,0 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA

_exc = {
"po'": [{ORTH: "po'", LEMMA: 'poco'}]
}

TOKENIZER_EXCEPTIONS = _exc
4 changes: 2 additions & 2 deletions spacy/lang/punctuation.py
@@ -39,10 +39,10 @@
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

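The last infix rule above now allows a digit immediately before characters like `/`, which is what fixes explosion#2926. A simplified sketch of the effect, using reduced character classes instead of spaCy's full `ALPHA` definitions:

```python
# Simplified illustration (reduced character classes, not spaCy's real ones)
# of the infix change above. The old pattern required a letter on both sides
# of the slash; the new one also accepts a digit on the left, so "html5/css3"
# gets an infix match at the slash.
import re

old_infix = re.compile(r"(?<=[a-z])[:<>=/](?=[a-z])")
new_infix = re.compile(r"(?<=[a-z0-9])[:<>=/](?=[a-z])")

text = "html5/css3"
print(bool(old_infix.search(text)))  # False: the "5" blocks the old lookbehind
print(bool(new_infix.search(text)))  # True: digits are now allowed before "/"
```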
6 changes: 3 additions & 3 deletions spacy/tests/lang/fr/test_exceptions.py
@@ -22,9 +22,9 @@
u"11-septembre",
u"11-Septembre",
u"refox-trottâmes",
u"K-POP",
u"K-Pop",
u"K-pop",
# u"K-POP",
# u"K-Pop",
# u"K-pop",
u"z'yeutes",
u"black-outeront",
u"états-unienne",
1 change: 1 addition & 0 deletions spacy/tests/regression/test_issue1-1000.py
@@ -358,6 +358,7 @@ def test_issue850_basic():
assert end == 4


@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore")
@pytest.mark.parametrize(
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
)
16 changes: 16 additions & 0 deletions spacy/tests/regression/test_issue1001-1500.py
@@ -13,6 +13,22 @@
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


@pytest.mark.xfail(
reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
)
def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter"""
nlp = English()
testwords = u'e2g 2g 52g'
doc = nlp(testwords)
assert len(doc) == 5
assert doc[0].text == "e2g"
assert doc[1].text == "2"
assert doc[2].text == "g"
assert doc[3].text == "52"
assert doc[4].text == "g"


def test_issue1242():
nlp = English()
doc = nlp("")
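The xfail reason above comes down to a limitation of Python regular expressions: lookbehind assertions must be fixed-width, so the suffix rule that splits a trailing unit like "g" cannot look back over a variable-length number. A small illustration with plain `re` (not spaCy's actual suffix pattern):

```python
# Sketch of the limitation cited in the xfail reason: Python's re module only
# supports fixed-width lookbehind, so a suffix rule cannot "look back" over a
# variable-length number like "e2" or "52".
import re

try:
    re.compile(r"(?<=[a-z][0-9]+)g")  # variable-width lookbehind
except re.error as err:
    print("re rejects it:", err)  # "look-behind requires fixed-width pattern"

# A fixed-width lookbehind compiles fine, but only sees one character back.
fixed = re.compile(r"(?<=[0-9])g")
print(fixed.search("e2g"))  # matches, so "g" would be split off as a suffix
```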
12 changes: 12 additions & 0 deletions spacy/tests/regression/test_issue2001-2500.py
@@ -7,10 +7,22 @@
from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
import numpy
from spacy.lang.en import English

from ..util import add_vecs_to_vocab, get_doc


@pytest.mark.xfail(
reason="The dot is now properly split off, but the prefix/suffix rules are not applied again afterwards."
"This means that the quote will still be attached to the remaining token."
)
def test_issue2070():
"""Test that checks that a dot followed by a quote is handled appropriately."""
nlp = English()
doc = nlp('First sentence."A quoted sentence" he said ...')
assert len(doc) == 11


def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
24 changes: 24 additions & 0 deletions spacy/tests/regression/test_issue2656.py
@@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English


def test_issue2656():
""" Test that tokenizer correctly splits of punctuation after numbers with decimal points """
text = "I went for 40.3, and got home by 10.0."
nlp = English()
doc = nlp(text)

assert len(doc) == 11

assert doc[0].text == "I"
assert doc[1].text == "went"
assert doc[2].text == "for"
assert doc[3].text == "40.3"
assert doc[4].text == ","
assert doc[5].text == "and"
assert doc[6].text == "got"
assert doc[7].text == "home"
assert doc[8].text == "by"
assert doc[9].text == "10.0"
assert doc[10].text == "."
21 changes: 21 additions & 0 deletions spacy/tests/regression/test_issue2822.py
@@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.it import Italian


def test_issue2822():
""" Test that the abbreviation of poco is kept as one word """
nlp = Italian()
text = "Vuoi un po' di zucchero?"

doc = nlp(text)

assert len(doc) == 6

assert doc[0].text == "Vuoi"
assert doc[1].text == "un"
assert doc[2].text == "po'"
assert doc[2].lemma_ == "poco"
assert doc[3].text == "di"
assert doc[4].text == "zucchero"
assert doc[5].text == "?"
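The new `it/tokenizer_exceptions.py` ships "po'" as a built-in exception. Users who need a similar one-off exception don't have to edit spaCy; here is a hedged sketch of registering one at runtime, where the "di'" → "dire" entry is purely illustrative and not shipped with spaCy:

```python
# Hedged sketch: adding an extra Italian special case at runtime instead of
# editing tokenizer_exceptions.py. The entry below is illustrative only.
from spacy.lang.it import Italian
from spacy.symbols import ORTH, LEMMA

nlp = Italian()
# Keep "di'" (imperative of "dire") as one token and assign its lemma.
nlp.tokenizer.add_special_case("di'", [{ORTH: "di'", LEMMA: "dire"}])

doc = nlp("Su, di' la verità!")
print([(t.text, t.lemma_) for t in doc])
```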
21 changes: 21 additions & 0 deletions spacy/tests/regression/test_issue2926.py
@@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.fr import French


def test_issue2926():
""" Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
nlp = French()
text = "Learn html5/css3/javascript/jquery"
doc = nlp(text)

assert len(doc) == 8

assert doc[0].text == "Learn"
assert doc[1].text == "html5"
assert doc[2].text == "/"
assert doc[3].text == "css3"
assert doc[4].text == "/"
assert doc[5].text == "javascript"
assert doc[6].text == "/"
assert doc[7].text == "jquery"
11 changes: 11 additions & 0 deletions spacy/tests/regression/test_issue3002.py
@@ -0,0 +1,11 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.de import German


def test_issue3002():
"""Test that the tokenizer doesn't hang on a long list of dots"""
nlp = German()
doc = nlp('880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl')
assert len(doc) == 5
7 changes: 4 additions & 3 deletions website/docs/usage/adding-languages.md
@@ -102,11 +102,12 @@ language and training a language model.

In order for the tokenizer to split suffixes, prefixes and infixes, spaCy needs
to know the language's character set. If the language you're adding uses
-non-latin characters, you might need to add the required character classes to
+non-latin characters, you might need to define the required character classes in
the global
[`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py).
-spaCy uses the [`regex` library](https://pypi.python.org/pypi/regex/) to keep
-this simple and readable. If the language requires very specific punctuation
+For efficiency, spaCy uses hard-coded unicode ranges to define character classes,
+the definitions of which can be found on [Wikipedia](https://en.wikipedia.org/wiki/Unicode_block).
+If the language requires very specific punctuation
rules, you should consider overwriting the default regular expressions with your
own in the language's `Defaults`.
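Since the docs now point to hard-coded unicode ranges rather than the `regex` library, here is a toy sketch of the idea; the range values and names below are illustrative, not spaCy's actual `char_classes.py` definitions.

```python
# Toy sketch (assumed, simplified ranges) of building character classes from
# hard-coded unicode ranges, in the spirit of char_classes.py. The blocks are
# taken from Wikipedia's Unicode block listing.
import re

_latin_lower = r"a-z\u00df-\u00f6\u00f8-\u00ff"  # Basic Latin + Latin-1 lowercase
_cyrillic = r"\u0400-\u04ff"                     # Cyrillic block

ALPHA_LOWER_EXAMPLE = _latin_lower + _cyrillic

# Used in a tokenizer rule, e.g. splitting a period between lowercase letters:
infix = re.compile(r"(?<=[{al}])\.(?=[{al}])".format(al=ALPHA_LOWER_EXAMPLE))
print(bool(infix.search("hello.world")))  # True
```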

Expand Down
