Improve Italian & Urdu tokenization accuracy (explosion#3228)
## Description

1. Added the same elision infix rule used for French (`d'une`, `j'ai`) to Italian (`c'è`, `l'ha`), raising the tokenization F-score on `it_isdt-ud-train.txt` from 96% to 99%. Added a unit test to check this behaviour.

2. Added the Urdu full stop `۔` as a suffix character, raising the tokenization F-score on `ur_udtb-ud-train.txt` from 94% to 100%. Added a unit test to check this behaviour. A short usage sketch for both changes follows this list.
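
As a rough illustration (not part of the diff), a minimal sketch of the expected behaviour with a spaCy build that includes these changes; the token lists in the comments are the output the new rules are meant to produce:

```python
import spacy

# Italian: the elision infix should split "l'ha" into "l'" + "ha"
nlp_it = spacy.blank("it")
print([t.text for t in nlp_it("l'ha visto")])  # expected: ["l'", "ha", "visto"]

# Urdu: the full stop "۔" should be split off as its own token
nlp_ur = spacy.blank("ur")
print([t.text for t in nlp_ur("کیا۔")])  # expected: ["کیا", "۔"]
```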

### Types of change
Enhancement of Italian & Urdu tokenization

## Checklist
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
svlandeg authored and ines committed Feb 4, 2019
1 parent a3efa3e commit 9745b0d
Showing 10 changed files with 65 additions and 1 deletion.
3 changes: 3 additions & 0 deletions spacy/lang/it/__init__.py
@@ -11,6 +11,8 @@
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups

from .punctuation import TOKENIZER_INFIXES


class ItalianDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,6 +24,7 @@ class ItalianDefaults(Language.Defaults):
    stop_words = STOP_WORDS
    lemma_lookup = LOOKUP
    tag_map = TAG_MAP
    infixes = TOKENIZER_INFIXES


class Italian(Language):
15 changes: 15 additions & 0 deletions spacy/lang/it/punctuation.py
@@ -0,0 +1,15 @@
# coding: utf8
from __future__ import unicode_literals

from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA


ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")


_infixes = TOKENIZER_INFIXES + [
r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
]

TOKENIZER_INFIXES = _infixes
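
For orientation (not part of the commit), a minimal sketch of what the added zero-width infix pattern matches, using Python's `re` module directly; the `ALPHA` value below is a simplified stand-in for spaCy's full character class, for illustration only:

```python
import re

ALPHA = "a-zA-Zàèéìòù"  # simplified stand-in for spaCy's ALPHA class
ELISION = "'’"

# Zero-width position between letter+apostrophe and a following letter;
# the tokenizer splits the token at every infix match.
infix = r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)

print(re.search(infix, "l'ha"))   # match at span (2, 2) -> "l'" + "ha"
print(re.search(infix, "dell'"))  # None: no letter follows the apostrophe
```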
3 changes: 3 additions & 0 deletions spacy/lang/ur/__init__.py
@@ -9,6 +9,8 @@
from ...language import Language
from ...attrs import LANG

from .punctuation import TOKENIZER_SUFFIXES


class UrduDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -18,6 +20,7 @@ class UrduDefaults(Language.Defaults):
    tokenizer_exceptions = BASE_EXCEPTIONS
    tag_map = TAG_MAP
    stop_words = STOP_WORDS
    suffixes = TOKENIZER_SUFFIXES


class Urdu(Language):
10 changes: 10 additions & 0 deletions spacy/lang/ur/punctuation.py
@@ -0,0 +1,10 @@
# coding: utf8
from __future__ import unicode_literals

from ..punctuation import TOKENIZER_SUFFIXES


_suffixes = TOKENIZER_SUFFIXES + ["۔"]


TOKENIZER_SUFFIXES = _suffixes
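
Again as a rough sketch (not part of the diff), one way to check the extended suffix list once this module is importable; `compile_suffix_regex` is spaCy's helper for turning suffix entries into the regex the tokenizer uses:

```python
from spacy.util import compile_suffix_regex
from spacy.lang.ur.punctuation import TOKENIZER_SUFFIXES

suffix_re = compile_suffix_regex(TOKENIZER_SUFFIXES)
# The trailing "۔" now matches as a suffix, so the tokenizer splits it off.
print(suffix_re.search("کیا۔"))
```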
5 changes: 5 additions & 0 deletions spacy/tests/conftest.py
@@ -65,6 +65,11 @@ def id_tokenizer():
return get_lang_class("id").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def it_tokenizer():
return get_lang_class("it").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def sv_tokenizer():
return get_lang_class("sv").Defaults.create_tokenizer()
1 change: 1 addition & 0 deletions spacy/tests/lang/ca/test_prefix_suffix_infix.py
@@ -11,3 +11,4 @@ def test_contractions(ca_tokenizer, text, expected_tokens):
""" Test that the contractions are split into two tokens"""
tokens = ca_tokenizer(text)
assert len(tokens) == 2
assert [t.text for t in tokens] == expected_tokens
Empty file added spacy/tests/lang/it/__init__.py
14 changes: 14 additions & 0 deletions spacy/tests/lang/it/test_prefix_suffix_infix.py
@@ -0,0 +1,14 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
"text,expected_tokens", [("c'è", ["c'", "è"]), ("l'ha", ["l'", "ha"])]
)
def test_contractions(it_tokenizer, text, expected_tokens):
""" Test that the contractions are split into two tokens"""
tokens = it_tokenizer(text)
assert len(tokens) == 2
assert [t.text for t in tokens] == expected_tokens
13 changes: 13 additions & 0 deletions spacy/tests/lang/ur/test_prefix_suffix_infix.py
@@ -0,0 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest


@pytest.mark.parametrize(
"text", ['ہےں۔', 'کیا۔']
)
def test_contractions(ur_tokenizer, text):
"""Test specific Urdu punctuation character"""
tokens = ur_tokenizer(text)
assert len(tokens) == 2
2 changes: 1 addition & 1 deletion spacy/tests/lang/ur/test_text.py
@@ -10,7 +10,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer):
کہ ایک عدد ٹیلی ویژن ہی کیوں نہ خرید لیں ، سوچا ورلڈ کپ ہی دیکھیں گے۔اپنے پاکستان کے کھلاڑیوں کو دیکھ کر
ورلڈ کپ دیکھنے کا حوصلہ ہی نہ رہا تو اب یوں ہی ادھر اُدھر کے چینل گھمانے لگ پڑتے ہیں۔"""
    tokens = ur_tokenizer(text)
-    assert len(tokens) == 77
+    assert len(tokens) == 78


@pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)])
