
Commit

Tidy up and fix small bugs and typos
ines committed Feb 8, 2019
1 parent 9e652af commit 25602c7
Showing 47 changed files with 750 additions and 932 deletions.
16 changes: 7 additions & 9 deletions spacy/cli/pretrain.py
@@ -8,15 +8,14 @@
from collections import Counter
from pathlib import Path
from thinc.v2v import Affine, Maxout
from thinc.api import wrap, layerize
from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu, get_array_module
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly

from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model
from .. import util

@@ -136,7 +135,7 @@ def pretrain(
random.shuffle(texts)


def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
@@ -171,7 +170,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
return docs


def get_vectors_loss(ops, docs, prediction, objective='L2'):
def get_vectors_loss(ops, docs, prediction, objective="L2"):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@@ -185,9 +184,9 @@ def get_vectors_loss(ops, docs, prediction, objective='L2'):
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
if objective == 'L2':
if objective == "L2":
d_scores = prediction - target
loss = (d_scores**2).sum()
loss = (d_scores ** 2).sum()
else:
raise NotImplementedError(objective)
return loss, d_scores
@@ -201,8 +200,7 @@ def create_pretraining_model(nlp, tok2vec):
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)),
Affine(output_size, drop_factor=0.0),
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
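The `L2` branch of `get_vectors_loss` above reduces to a summed squared error whose gradient with respect to the prediction is simply `prediction - target`. A minimal standalone sketch of that computation, using made-up numpy arrays in place of the model output and the pretrained vector rows:

import numpy

# Toy stand-ins for the tok2vec prediction and the rows of the vector table.
prediction = numpy.array([[0.5, 1.0], [2.0, -1.0]], dtype="float32")
target = numpy.array([[0.0, 1.0], [2.0, 0.0]], dtype="float32")

d_scores = prediction - target    # gradient of the squared error w.r.t. the prediction
loss = (d_scores ** 2).sum()      # the value reported for logging

print(loss)      # 1.25
print(d_scores)  # [[ 0.5  0. ]  [ 0. -1. ]]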
10 changes: 2 additions & 8 deletions spacy/displacy/__init__.py
@@ -13,13 +13,7 @@


def render(
docs,
style="dep",
page=False,
minify=False,
jupyter=False,
options={},
manual=False,
docs, style="dep", page=False, minify=False, jupyter=False, options={}, manual=False
):
"""Render displaCy visualisation.
@@ -80,7 +74,7 @@ def serve(
"""
from wsgiref import simple_server

if IS_JUPYTER:
if is_in_jupyter():
user_warning(Warnings.W011)

render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
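For reference, `render` and `serve` reformatted here are the public displaCy entry points. A short usage sketch; the example sentence and the `en_core_web_sm` model name are assumptions for illustration, not part of this commit:

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")

html = displacy.render(doc, style="dep", page=True)  # returns the markup as a string
# displacy.serve(doc, style="dep")                   # or serve it on a local port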
11 changes: 7 additions & 4 deletions spacy/lang/hu/punctuation.py
@@ -1,8 +1,9 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES
from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER


# removing ° from the special icons to keep e.g. 99° as one token
_concat_icons = CONCAT_ICONS.replace("\u00B0", "")
@@ -29,7 +30,9 @@
r"(?<=°[FfCcKk])\.",
r"(?<=[0-9])(?:[{c}])".format(c=_currency),
r"(?<=[0-9])(?:{u})".format(u=UNITS),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency),
r"(?<=[{al}{e}{q}(?:{c})])\.".format(
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
),
r"(?<=[{al})])-e".format(al=ALPHA_LOWER),
]
)
@@ -40,7 +43,7 @@
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=_quotes),
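The Hungarian suffix rules in this hunk are fixed-width lookbehind regexes. A self-contained check of the temperature-unit rule with the standard `re` module (the sample strings are made up) shows that it only allows a trailing period to be split off after something like `°C`:

import re

suffix = re.compile(r"(?<=°[FfCcKk])\.")

print(bool(suffix.search("25°C.")))  # True  - the period can be split off
print(bool(suffix.search("abc.")))   # False - the rule does not apply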
62 changes: 7 additions & 55 deletions spacy/lang/ja/__init__.py
@@ -5,24 +5,24 @@
from collections import namedtuple

from .tag_map import TAG_MAP

from ...attrs import LANG
from ...language import Language
from ...tokens import Doc, Token
from ...util import DummyTokenizer


ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])

# TODO: Is this the right place for this?
Token.set_extension("mecab_tag", default=None)


def try_mecab_import():
"""Mecab is required for Japanese support, so check for it.
It it's not available blow up and explain how to fix it."""
try:
import MeCab

# XXX Is this the right place for this?
Token.set_extension("mecab_tag", default=None)
return MeCab
except ImportError:
raise ImportError(
@@ -33,14 +33,13 @@ def try_mecab_import():

def resolve_pos(token):
"""If necessary, add a field to the POS tag for UD mapping.
Under Universal Dependencies, sometimes the same Unidic POS tag can
be mapped differently depending on the literal token or its context
in the sentence. This function adds information to the POS tag to
resolve ambiguous mappings.
"""

# NOTE: This is a first take. The rules here are crude approximations.
# TODO: This is a first take. The rules here are crude approximations.
# For many of these, full dependencies are needed to properly resolve
# PoS mappings.

@@ -56,7 +55,7 @@ def resolve_pos(token):

def detailed_tokens(tokenizer, text):
"""Format Mecab output into a nice data structure, based on Janome."""
tokenizer.parse(text)

node = tokenizer.parseToNode(text)
node = node.next # first node is beginning of sentence and empty, skip it
words = []
@@ -98,62 +97,15 @@ def __call__(self, text):
return doc


class JapaneseCharacterSegmenter(object):
def __init__(self, vocab):
self.vocab = vocab
self._presegmenter = self._make_presegmenter(self.vocab)

def _make_presegmenter(self, vocab):
rules = Japanese.Defaults.tokenizer_exceptions
token_match = Japanese.Defaults.token_match
prefix_search = (
util.compile_prefix_regex(Japanese.Defaults.prefixes).search
if Japanese.Defaults.prefixes
else None
)
suffix_search = (
util.compile_suffix_regex(Japanese.Defaults.suffixes).search
if Japanese.Defaults.suffixes
else None
)
infix_finditer = (
util.compile_infix_regex(Japanese.Defaults.infixes).finditer
if Japanese.Defaults.infixes
else None
)
return Tokenizer(
vocab,
rules=rules,
prefix_search=prefix_search,
suffix_search=suffix_search,
infix_finditer=infix_finditer,
token_match=token_match,
)

def __call__(self, text):
words = []
spaces = []
doc = self._presegmenter(text)
for token in doc:
words.extend(list(token.text))
spaces.extend([False] * len(token.text))
spaces[-1] = bool(token.whitespace_)
return Doc(self.vocab, words=words, spaces=spaces)


class JapaneseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda _text: "ja"

tag_map = TAG_MAP
use_janome = True

@classmethod
def create_tokenizer(cls, nlp=None):
if cls.use_janome:
return JapaneseTokenizer(cls, nlp)
else:
return JapaneseCharacterSegmenter(nlp.vocab)
return JapaneseTokenizer(cls, nlp)


class Japanese(Language):
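The deleted `JapaneseCharacterSegmenter` assembled its output with the standard `Doc(vocab, words=..., spaces=...)` constructor, pairing each token with a flag for trailing whitespace. A tiny sketch of that constructor pattern, with toy English tokens that are purely illustrative:

from spacy.vocab import Vocab
from spacy.tokens import Doc

words = ["Hello", ",", "world", "!"]
spaces = [False, True, False, False]  # whether each token is followed by a space

doc = Doc(Vocab(), words=words, spaces=spaces)
print(doc.text)  # "Hello, world!"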
4 changes: 2 additions & 2 deletions spacy/lang/pl/__init__.py
@@ -2,10 +2,10 @@
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
from .lex_attrs import LEX_ATTRS
from .punctuation import TOKENIZER_INFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -22,9 +22,9 @@ class PolishDefaults(Language.Defaults):
Language.Defaults.lex_attr_getters[NORM], BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
infixes = tuple(TOKENIZER_INFIXES)
stop_words = STOP_WORDS
tag_map = TAG_MAP
infixes = TOKENIZER_INFIXES


class Polish(Language):
28 changes: 18 additions & 10 deletions spacy/lang/pl/punctuation.py
@@ -1,14 +1,22 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_quotes = QUOTES.replace("'", '')
_infixes = (LIST_ELLIPSES + LIST_ICONS +
[r'(?<=[{}])\.(?=[{}])'.format(ALPHA_LOWER, ALPHA_UPPER),
r'(?<=[{a}])[,!?](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}"])[:<>=](?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}]),(?=[{a}])'.format(a=ALPHA),
r'(?<=[{a}])([{q}\)\]\(\[])(?=[\{a}])'.format(a=ALPHA, q=_quotes),
r'(?<=[{a}])--(?=[{a}])'.format(a=ALPHA)])

from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

_quotes = CONCAT_QUOTES.replace("'", "")

_infixes = (
LIST_ELLIPSES
+ [CONCAT_ICONS]
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[\-{a}])".format(a=ALPHA, q=CONCAT_QUOTES),
]
)

TOKENIZER_INFIXES = _infixes
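`TOKENIZER_INFIXES` is consumed by `compile_infix_regex`, the same helper that appears in the Japanese hunk above. A small sketch of how one of these patterns behaves once compiled; the sample string "tak,nie" is an assumption for illustration:

from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA

infixes = [r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA)]  # comma between two letters
infix_finditer = compile_infix_regex(infixes).finditer

print([m.group() for m in infix_finditer("tak,nie")])  # [',']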
1 change: 1 addition & 0 deletions spacy/lang/pl/tokenizer_exceptions.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals

from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS
from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ


_exc = {}
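Judging by the symbols imported here (`ORTH`, `LEMMA`, `POS`, ...), the entries added to `_exc` pin the orth, lemma and coarse part-of-speech of Polish abbreviations. A hypothetical entry in the spaCy v2 tokenizer-exception format; the "np." ("na przykład") example is illustrative and not taken from the file:

from spacy.symbols import ORTH, LEMMA, POS, ADV

_exc = {}
_exc["np."] = [{ORTH: "np.", LEMMA: "na przykład", POS: ADV}]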
5 changes: 4 additions & 1 deletion spacy/lang/sv/__init__.py
@@ -6,7 +6,9 @@
from .stop_words import STOP_WORDS
from .morph_rules import MORPH_RULES
from .lemmatizer import LEMMA_RULES, LOOKUP
from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

# Punctuation stolen from Danish
from ..da.punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@@ -31,6 +33,7 @@ class SwedishDefaults(Language.Defaults):
lemma_lookup = LOOKUP
morph_rules = MORPH_RULES


class Swedish(Language):
lang = "sv"
Defaults = SwedishDefaults
25 changes: 0 additions & 25 deletions spacy/lang/sv/punctuation.py

This file was deleted.
