Commit
Clean up of char classes, few tokenizer fixes and faster default French tokenizer (explosion#3293)

* splitting up latin unicode interval

* removing hyphen as infix for French

* adding failing test for issue 1235

* test for issue explosion#3002 which now works

* partial fix for issue explosion#2070

* keep the hyphen as infix for French (as it was)

* restore french expressions with hyphen as infix (as it was)

* added passing unit test for issue explosion#2656

* Fix issue explosion#2822 with custom Italian exception

* Fix issue explosion#2926 by allowing numbers right before infix /

* remove duplicate

* remove xfail for Issue explosion#2179 fixed by Matt

* adjust documentation and remove reference to regex lib
svlandeg authored and ines committed Feb 20, 2019
1 parent 9696cf1 commit 9a478b6
Showing 15 changed files with 277 additions and 37 deletions.
164 changes: 146 additions & 18 deletions spacy/lang/char_classes.py

Large diffs are not rendered by default.

9 changes: 0 additions & 9 deletions spacy/lang/fr/_tokenizer_exceptions_list.py
@@ -7076,14 +7076,6 @@
"au-lof",
"au-tour",
"aube-vigne",
"audio-numérique",
"audio-numériques",
"audio-prothésiste",
"audio-prothésistes",
"audio-visuel",
"audio-visuelle",
"audio-visuelles",
"audio-visuels",
"aujourd'hui",
"aulnaie-frênaie",
"aulnaies-frênaies",
@@ -14400,7 +14392,6 @@
"attaques surprises",
"attaques-surprises",
"attrape-con",
"audio-oral",
"auriculo-cardiaque",
"auriculo-temporal",
"austro-bavarois",
6 changes: 5 additions & 1 deletion spacy/lang/fr/tokenizer_exceptions.py
@@ -3,12 +3,15 @@

import re

from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA, TAG

# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
FR_BASE_EXCEPTIONS = ["aujourd'hui", "Aujourd'hui"]


def upper_first_letter(text):
if len(text) == 0:
@@ -128,6 +131,7 @@ def lower_first_letter(text):
"arcs?",
"archi",
"arrières?",
"audio",
"avant",
"avion",
"auto",
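The hunk above is the heart of the "faster default French tokenizer" change: the large generated exception list is no longer loaded by default, and only the two "aujourd'hui" entries are kept. Below is a minimal sketch (not part of this commit) of how a user could restore the full list at runtime, assuming spaCy v2.x and the module layout shown above.

```python
# Hedged sketch: re-enable the full French exception list, trading tokenizer
# construction speed for coverage. FR_BASE_EXCEPTIONS is the generated list
# that this commit stops loading by default.
from spacy.lang.fr import French
from spacy.lang.fr._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
from spacy.symbols import ORTH

nlp = French()
for expression in FR_BASE_EXCEPTIONS:
    # Register each expression as a special case so it stays a single token.
    nlp.tokenizer.add_special_case(expression, [{ORTH: expression}])

doc = nlp("Une aulnaie-frênaie borde la rivière.")
print([t.text for t in doc])
```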
3 changes: 2 additions & 1 deletion spacy/lang/it/__init__.py
@@ -4,6 +4,7 @@
from .stop_words import STOP_WORDS
from .lemmatizer import LOOKUP
from .tag_map import TAG_MAP
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -20,7 +21,7 @@ class ItalianDefaults(Language.Defaults):
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
lemma_lookup = LOOKUP
tag_map = TAG_MAP
9 changes: 9 additions & 0 deletions spacy/lang/it/tokenizer_exceptions.py
@@ -0,0 +1,9 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA

_exc = {
"po'": [{ORTH: "po'", LEMMA: 'poco'}]
}

TOKENIZER_EXCEPTIONS = _exc
4 changes: 2 additions & 2 deletions spacy/lang/punctuation.py
@@ -39,10 +39,10 @@
+ LIST_ICONS
+ [
r"(?<=[0-9])[+\-\*^](?=[0-9-])",
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
r"(?<=[{a}])[:<>=/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
]
)

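The last infix rule above now allows a digit immediately before characters like `/`, which is what fixes explosion#2926. A simplified sketch of the effect, using reduced character classes instead of spaCy's full `ALPHA` definitions:

```python
# Simplified illustration (reduced character classes, not spaCy's real ones)
# of the infix change above. The old pattern required a letter on both sides
# of the slash; the new one also accepts a digit on the left, so "html5/css3"
# gets an infix match at the slash.
import re

old_infix = re.compile(r"(?<=[a-z])[:<>=/](?=[a-z])")
new_infix = re.compile(r"(?<=[a-z0-9])[:<>=/](?=[a-z])")

text = "html5/css3"
print(bool(old_infix.search(text)))  # False: the "5" blocks the old lookbehind
print(bool(new_infix.search(text)))  # True: digits are now allowed before "/"
```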
6 changes: 3 additions & 3 deletions spacy/tests/lang/fr/test_exceptions.py
@@ -22,9 +22,9 @@
u"11-septembre",
u"11-Septembre",
u"refox-trottâmes",
u"K-POP",
u"K-Pop",
u"K-pop",
# u"K-POP",
# u"K-Pop",
# u"K-pop",
u"z'yeutes",
u"black-outeront",
u"états-unienne",
1 change: 1 addition & 0 deletions spacy/tests/regression/test_issue1-1000.py
@@ -358,6 +358,7 @@ def test_issue850_basic():
assert end == 4


@pytest.mark.skip(reason="French exception list is not enabled in the default tokenizer anymore")
@pytest.mark.parametrize(
"text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"]
)
16 changes: 16 additions & 0 deletions spacy/tests/regression/test_issue1001-1500.py
@@ -13,6 +13,22 @@
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


@pytest.mark.xfail(
reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
)
def test_issue1235():
"""Test that g is not split of if preceded by a number and a letter"""
nlp = English()
testwords = u'e2g 2g 52g'
doc = nlp(testwords)
assert len(doc) == 5
assert doc[0].text == "e2g"
assert doc[1].text == "2"
assert doc[2].text == "g"
assert doc[3].text == "52"
assert doc[4].text == "g"


def test_issue1242():
nlp = English()
doc = nlp("")
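The xfail reason above comes down to a limitation of Python regular expressions: lookbehind assertions must be fixed-width, so the suffix rule that splits a trailing unit like "g" cannot look back over a variable-length number. A small illustration with plain `re` (not spaCy's actual suffix pattern):

```python
# Sketch of the limitation cited in the xfail reason: Python's re module only
# supports fixed-width lookbehind, so a suffix rule cannot "look back" over a
# variable-length number like "e2" or "52".
import re

try:
    re.compile(r"(?<=[a-z][0-9]+)g")  # variable-width lookbehind
except re.error as err:
    print("re rejects it:", err)  # "look-behind requires fixed-width pattern"

# A fixed-width lookbehind compiles fine, but only sees one character back.
fixed = re.compile(r"(?<=[0-9])g")
print(fixed.search("e2g"))  # matches, so "g" would be split off as a suffix
```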
12 changes: 12 additions & 0 deletions spacy/tests/regression/test_issue2001-2500.py
@@ -7,10 +7,22 @@
from spacy.gold import iob_to_biluo
from spacy.lang.it import Italian
import numpy
from spacy.lang.en import English

from ..util import add_vecs_to_vocab, get_doc


@pytest.mark.xfail(
reason="The dot is now properly split off, but the prefix/suffix rules are not applied again afterwards."
"This means that the quote will still be attached to the remaining token."
)
def test_issue2070():
"""Test that checks that a dot followed by a quote is handled appropriately."""
nlp = English()
doc = nlp('First sentence."A quoted sentence" he said ...')
assert len(doc) == 11


def test_issue2179():
"""Test that spurious 'extra_labels' aren't created when initializing NER."""
nlp = Italian()
24 changes: 24 additions & 0 deletions spacy/tests/regression/test_issue2656.py
@@ -0,0 +1,24 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.en import English


def test_issue2656():
""" Test that tokenizer correctly splits of punctuation after numbers with decimal points """
text = "I went for 40.3, and got home by 10.0."
nlp = English()
doc = nlp(text)

assert len(doc) == 11

assert doc[0].text == "I"
assert doc[1].text == "went"
assert doc[2].text == "for"
assert doc[3].text == "40.3"
assert doc[4].text == ","
assert doc[5].text == "and"
assert doc[6].text == "got"
assert doc[7].text == "home"
assert doc[8].text == "by"
assert doc[9].text == "10.0"
assert doc[10].text == "."
21 changes: 21 additions & 0 deletions spacy/tests/regression/test_issue2822.py
@@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.it import Italian


def test_issue2822():
""" Test that the abbreviation of poco is kept as one word """
nlp = Italian()
text = "Vuoi un po' di zucchero?"

doc = nlp(text)

assert len(doc) == 6

assert doc[0].text == "Vuoi"
assert doc[1].text == "un"
assert doc[2].text == "po'"
assert doc[2].lemma_ == "poco"
assert doc[3].text == "di"
assert doc[4].text == "zucchero"
assert doc[5].text == "?"
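The new `it/tokenizer_exceptions.py` ships "po'" as a built-in exception. Users who need a similar one-off exception don't have to edit spaCy; here is a hedged sketch of registering one at runtime, where the "di'" → "dire" entry is purely illustrative and not shipped with spaCy:

```python
# Hedged sketch: adding an extra Italian special case at runtime instead of
# editing tokenizer_exceptions.py. The entry below is illustrative only.
from spacy.lang.it import Italian
from spacy.symbols import ORTH, LEMMA

nlp = Italian()
# Keep "di'" (imperative of "dire") as one token and assign its lemma.
nlp.tokenizer.add_special_case("di'", [{ORTH: "di'", LEMMA: "dire"}])

doc = nlp("Su, di' la verità!")
print([(t.text, t.lemma_) for t in doc])
```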
21 changes: 21 additions & 0 deletions spacy/tests/regression/test_issue2926.py
@@ -0,0 +1,21 @@
# coding: utf8
from __future__ import unicode_literals
from spacy.lang.fr import French


def test_issue2926():
""" Test that the tokenizer correctly splits tokens separated by a slash (/) ending in a digit """
nlp = French()
text = "Learn html5/css3/javascript/jquery"
doc = nlp(text)

assert len(doc) == 8

assert doc[0].text == "Learn"
assert doc[1].text == "html5"
assert doc[2].text == "/"
assert doc[3].text == "css3"
assert doc[4].text == "/"
assert doc[5].text == "javascript"
assert doc[6].text == "/"
assert doc[7].text == "jquery"
11 changes: 11 additions & 0 deletions spacy/tests/regression/test_issue3002.py
@@ -0,0 +1,11 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.de import German


def test_issue3002():
"""Test that the tokenizer doesn't hang on a long list of dots"""
nlp = German()
doc = nlp('880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl')
assert len(doc) == 5
7 changes: 4 additions & 3 deletions website/docs/usage/adding-languages.md
@@ -102,11 +102,12 @@ language and training a language model.

In order for the tokenizer to split suffixes, prefixes and infixes, spaCy needs
to know the language's character set. If the language you're adding uses
-non-latin characters, you might need to add the required character classes to
+non-latin characters, you might need to define the required character classes in
the global
[`char_classes.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/char_classes.py).
-spaCy uses the [`regex` library](https://pypi.python.org/pypi/regex/) to keep
-this simple and readable. If the language requires very specific punctuation
+For efficiency, spaCy uses hard-coded unicode ranges to define character classes,
+the definitions of which can be found on [Wikipedia](https://en.wikipedia.org/wiki/Unicode_block).
+If the language requires very specific punctuation
rules, you should consider overwriting the default regular expressions with your
own in the language's `Defaults`.
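Since the docs now point to hard-coded unicode ranges rather than the `regex` library, here is a toy sketch of the idea; the range values and names below are illustrative, not spaCy's actual `char_classes.py` definitions.

```python
# Toy sketch (assumed, simplified ranges) of building character classes from
# hard-coded unicode ranges, in the spirit of char_classes.py. The blocks are
# taken from Wikipedia's Unicode block listing.
import re

_latin_lower = r"a-z\u00df-\u00f6\u00f8-\u00ff"  # Basic Latin + Latin-1 lowercase
_cyrillic = r"\u0400-\u04ff"                     # Cyrillic block

ALPHA_LOWER_EXAMPLE = _latin_lower + _cyrillic

# Used in a tokenizer rule, e.g. splitting a period between lowercase letters:
infix = re.compile(r"(?<=[{al}])\.(?=[{al}])".format(al=ALPHA_LOWER_EXAMPLE))
print(bool(infix.search("hello.world")))  # True
```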

Expand Down
