
Commit

Tidy up and fix small bugs and typos
ines committed Feb 8, 2019
1 parent 9e652af commit 25602c7
Showing 47 changed files with 750 additions and 932 deletions.
16 changes: 7 additions & 9 deletions spacy/cli/pretrain.py
@@ -8,15 +8,14 @@
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.api import wrap, layerize
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu, get_array_module
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly

from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model
from .. import util

@@ -136,7 +135,7 @@ def pretrain(
random.shuffle(texts)


def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
@@ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
return docs


def get_vectors_loss(ops, docs, prediction, objective='L2'):
def get_vectors_loss(ops, docs, prediction, objective="L2"):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@@ -185,9 +184,9 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
if objective == 'L2':
if objective == "L2":
d_scores = prediction - target
loss = (d_scores**2).sum()
loss = (d_scores ** 2).sum()
else:
raise NotImplementedError(objective)
return loss, d_scores
@@ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec):
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)),
Affine(output_size, drop_factor=0.0),
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
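The `L2` branch of `get_vectors_loss` above reduces to a summed squared error whose gradient with respect to the prediction is simply `prediction - target`. A minimal standalone sketch of that computation, using made-up numpy arrays in place of the model output and the pretrained vector rows:

import numpy

# Toy stand-ins for the tok2vec prediction and the rows of the vector table.
prediction = numpy.array([[0.5, 1.0], [2.0, -1.0]], dtype="float32")
target = numpy.array([[0.0, 1.0], [2.0, 0.0]], dtype="float32")

d_scores = prediction - target    # gradient of the squared error w.r.t. the prediction
loss = (d_scores ** 2).sum()      # the value reported for logging

print(loss)      # 1.25
print(d_scores)  # [[ 0.5  0. ]  [ 0. -1. ]]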
10 changes: 2 additions & 8 deletions spacy/displacy/__init__.py
@@ -13,13 +13,7 @@


def render(
docs,
style="dep",
page=False,
minify=False,
jupyter=False,
options={},
manual=False,
docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
):
"""Render displaCy visualisation.
@@ -80,7 +74,7 @@ def serve(
"""
from wsgiref import simple_server

if IS_JUPYTER:
if is_in_jupyter():
user_warning(Warnings.W011)

render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
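For reference, `render` and `serve` reformatted here are the public displaCy entry points. A short usage sketch; the example sentence and the `en_core_web_sm` model name are assumptions for illustration, not part of this commit:

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

html = displacy.render(doc, style="dep", page=True)  # returns the markup as a string
# displacy.serve(doc, style="dep")                   # or serve it on a local port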
11 changes: 7 additions & 4 deletions spacy/lang/hu/punctuation.py
@@ -1,8 +1,9 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER


# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
@@ -29,7 +30,9 @@
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
),
r"(?<=[{al})])-e".format(al=ALPHA_LOWER),
]
)
@@ -40,7 +43,7 @@
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
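The Hungarian suffix rules in this hunk are fixed-width lookbehind regexes. A self-contained check of the temperature-unit rule with the standard `re` module (the sample strings are made up) shows that it only allows a trailing period to be split off after something like `°C`:

import re

suffix = re.compile(r"(?<=°[FfCcKk])\.")

print(bool(suffix.search("25°C.")))  # True  - the period can be split off
print(bool(suffix.search("abc.")))   # False - the rule does not apply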
62 changes: 7 additions & 55 deletions spacy/lang/ja/__init__.py
@@ -5,24 +5,24 @@
from collections import namedtuple

from .tag_map import TAG_MAP

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc, Token
from ...util import DummyTokenizer


ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

# TODO: Is this the right place for this?
Token.set_extension("mecab_tag", default=None)


def try_mecab_import():
"""Mecab is required for Japanese support, so check for it.
It it's not available blow up and explain how to fix it."""
try:
import MeCab

# XXX Is this the right place for this?
Token.set_extension("mecab_tag", default=None)
return MeCab
except ImportError:
raise ImportError(
@@ -33,14 +33,13 @@ def try_mecab_import():

def resolve_pos(token):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
in the sentence. This function adds information to the POS tag to
resolve ambiguous mappings.
"""

# NOTE: This is a first take. The rules here are crude approximations.
# TODO: This is a first take. The rules here are crude approximations.
# For many of these, full dependencies are needed to properly resolve
# PoS mappings.

@@ -56,7 +55,7 @@ def resolve_pos(token):

def detailed_tokens(tokenizer, text):
"""Format Mecab output into a nice data structure, based on Janome."""
tokenizer.parse(text)

node = tokenizer.parseToNode(text)
node = node.next # first node is beginning of sentence and empty, skip it
words = []
@@ -98,62 +97,15 @@ def __call__(self, text):
return doc


class JapaneseCharacterSegmenter(object):
def __init__(self, vocab):
self.vocab = vocab
self._presegmenter = self._make_presegmenter(self.vocab)

def _make_presegmenter(self, vocab):
rules = Japanese.Defaults.tokenizer_exceptions
token_match = Japanese.Defaults.token_match
prefix_search = (
util.compile_prefix_regex(Japanese.Defaults.prefixes).search
if Japanese.Defaults.prefixes
else None
)
suffix_search = (
util.compile_suffix_regex(Japanese.Defaults.suffixes).search
if Japanese.Defaults.suffixes
else None
)
infix_finditer = (
util.compile_infix_regex(Japanese.Defaults.infixes).finditer
if Japanese.Defaults.infixes
else None
)
return Tokenizer(
vocab,
rules=rules,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
)

def __call__(self, text):
words = []
spaces = []
doc = self._presegmenter(text)
for token in doc:
words.extend(list(token.text))
spaces.extend([False] * len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)


class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ja"

tag_map = TAG_MAP
use_janome = True

@classmethod
def create_tokenizer(cls, nlp=None):
if cls.use_janome:
return JapaneseTokenizer(cls, nlp)
else:
return JapaneseCharacterSegmenter(nlp.vocab)
return JapaneseTokenizer(cls, nlp)


class Japanese(Language):
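The deleted `JapaneseCharacterSegmenter` assembled its output with the standard `Doc(vocab, words=..., spaces=...)` constructor, pairing each token with a flag for trailing whitespace. A tiny sketch of that constructor pattern, with toy English tokens that are purely illustrative:

from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["Hello", ",", "world", "!"]
spaces = [False, True, False, False]  # whether each token is followed by a space

doc = Doc(Vocab(), words=words, spaces=spaces)
print(doc.text)  # "Hello, world!"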
4 changes: 2 additions & 2 deletions spacy/lang/pl/__init__.py
@@ -2,10 +2,10 @@
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults):
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
stop_words = STOP_WORDS
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES


class Polish(Language):
28 changes: 18 additions & 10 deletions spacy/lang/pl/punctuation.py
@@ -1,14 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_quotes = QUOTES.replace("'", '')
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])

from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

_quotes = CONCAT_QUOTES.replace("'", "")

_infixes = (
LIST_ELLIPSES
+ [CONCAT_ICONS]
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
]
)

TOKENIZER_INFIXES = _infixes
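`TOKENIZER_INFIXES` is consumed by `compile_infix_regex`, the same helper that appears in the Japanese hunk above. A small sketch of how one of these patterns behaves once compiled; the sample string "tak,nie" is an assumption for illustration:

from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA

infixes = [r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)]  # comma between two letters
infix_finditer = compile_infix_regex(infixes).finditer

print([m.group() for m in infix_finditer("tak,nie")])  # [',']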
1 change: 1 addition & 0 deletions spacy/lang/pl/tokenizer_exceptions.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals

from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ


_exc = {}
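Judging by the symbols imported here (`ORTH`, `LEMMA`, `POS`, ...), the entries added to `_exc` pin the orth, lemma and coarse part-of-speech of Polish abbreviations. A hypothetical entry in the spaCy v2 tokenizer-exception format; the "np." ("na przykład") example is illustrative and not taken from the file:

from spacy.symbols import ORTH, LEMMA, POS, ADV

_exc = {}
_exc["np."] = [{ORTH: "np.", LEMMA: "na przykład", POS: ADV}]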
5 changes: 4 additions & 1 deletion spacy/lang/sv/__init__.py
@@ -6,7 +6,9 @@
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults):
lemma_lookup = LOOKUP
morph_rules = MORPH_RULES


class Swedish(Language):
lang = "sv"
Defaults = SwedishDefaults
25 changes: 0 additions & 25 deletions spacy/lang/sv/punctuation.py

This file was deleted.
