
Tidy up and format remaining files
ines committed Nov 30, 2018
1 parent 2a95133 commit 323fc26
Showing 35 changed files with 391 additions and 366 deletions.
1 change: 1 addition & 0 deletions .flake8
@@ -11,3 +11,4 @@ exclude =
_tokenizer_exceptions_list.py,
spacy/lang/fr/lemmatizer,
spacy/lang/nb/lemmatizer
spacy/__init__.py
3 changes: 2 additions & 1 deletion spacy/__init__.py
@@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import warnings

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

@@ -15,7 +16,7 @@


def load(name, **overrides):
depr_path = overrides.get('path')
depr_path = overrides.get("path")
if depr_path not in (True, False, None):
deprecation_warning(Warnings.W001.format(path=depr_path))
return util.load_model(name, **overrides)
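
The two filterwarnings calls added at the top of spacy/__init__.py silence numpy's "size changed" binary-compatibility warnings, which show up when compiled extensions were built against a different numpy release than the one installed. A minimal sketch of how such a filter behaves (the warning below is raised by hand purely for illustration):

import warnings

# Suppress any warning whose message starts with this pattern.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# Without the filter this would emit a UserWarning; with it, nothing is shown.
warnings.warn("numpy.dtype size changed, may indicate binary incompatibility")
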
2 changes: 1 addition & 1 deletion spacy/_ml.py
@@ -29,7 +29,7 @@
try:
import torch.nn
from thinc.extra.wrappers import PyTorchWrapperRNN
except:
except ImportError:
torch = None

VECTORS_KEY = "spacy_pretrained_vectors"
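
The _ml.py change narrows the bare except around the optional torch import to except ImportError, so only a missing dependency is silenced. A short sketch of the pattern, reusing the imports shown in the hunk above:

try:
    import torch.nn
    from thinc.extra.wrappers import PyTorchWrapperRNN
except ImportError:
    # Only a missing optional dependency is expected here; a bare "except:"
    # would also swallow KeyboardInterrupt, SystemExit and genuine bugs.
    torch = None
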
9 changes: 3 additions & 6 deletions spacy/language.py
@@ -479,14 +479,11 @@ def begin_training(self, get_gold_tuples=None, sgd=None, **cfg):
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
contexts = []
_ = self.vocab[word] # noqa: F841
if cfg.get("device", -1) >= 0:
device = util.use_gpu(cfg["device"])
util.use_gpu(cfg["device"])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
else:
device = None
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
cfg["pretrained_vectors"] = self.vocab.vectors.name
@@ -742,7 +739,7 @@ def from_bytes(self, bytes_data, disable=[]):
if not hasattr(proc, "from_bytes"):
continue
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
msg = util.from_bytes(bytes_data, deserializers, {})
util.from_bytes(bytes_data, deserializers, {})
return self


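
The language.py edits drop bindings that are never read again (contexts, device, msg), keeping the calls only for their side effects, and mark the one intentional throwaway vocab lookup with # noqa: F841 (flake8's "local variable is assigned to but never used"). A self-contained sketch of the same pattern, using a hypothetical helper name:

def warm_vocab(vocab, words):
    # Hypothetical helper mirroring the begin_training change above: each lookup
    # is done only for its side effect of adding the word to the vocab, so the
    # binding is a deliberate throwaway and the F841 warning is silenced.
    for word in words:
        _ = vocab[word]  # noqa: F841
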
10 changes: 5 additions & 5 deletions spacy/tests/conftest.py
@@ -97,13 +97,13 @@ def da_tokenizer():

@pytest.fixture(scope="session")
def ja_tokenizer():
mecab = pytest.importorskip("MeCab")
pytest.importorskip("MeCab")
return get_lang_class("ja").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def th_tokenizer():
pythainlp = pytest.importorskip("pythainlp")
pytest.importorskip("pythainlp")
return get_lang_class("th").Defaults.create_tokenizer()


@@ -112,9 +112,9 @@ def tr_tokenizer():
return get_lang_class("tr").Defaults.create_tokenizer()


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ca_tokenizer():
return get_lang_class('ca').Defaults.create_tokenizer()
return get_lang_class("ca").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
@@ -139,7 +139,7 @@ def ur_tokenizer():

@pytest.fixture(scope="session")
def ru_tokenizer():
pymorphy = pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2")
return get_lang_class("ru").Defaults.create_tokenizer()


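
pytest.importorskip imports the named module, skips the dependent tests when the package is missing, and returns the module object. The fixtures here only need the skip behaviour, so the unused bindings (mecab, pythainlp, pymorphy) are removed. A short sketch of both usages (the fixture is taken from the hunk above; the test under it is a hypothetical illustration):

import pytest
from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def ja_tokenizer():
    pytest.importorskip("MeCab")  # only the skip side effect is wanted here
    return get_lang_class("ja").Defaults.create_tokenizer()


def test_mecab_tagger_is_available():
    MeCab = pytest.importorskip("MeCab")  # keep the binding when the module is used
    assert hasattr(MeCab, "Tagger")
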
12 changes: 5 additions & 7 deletions spacy/tests/doc/test_add_entities.py
@@ -14,15 +14,13 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
ner = EntityRecognizer(en_vocab)
ner.begin_training([])
ner(doc)

assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))

doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
assert [w.ent_iob_ for w in doc] == ['', '', '', 'B']
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]

doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
assert [w.ent_iob_ for w in doc] == ['B', 'I', '', '']

def test_add_overlapping_entities(en_vocab):
text = ["Louisiana", "Office", "of", "Conservation"]
22 changes: 12 additions & 10 deletions spacy/tests/doc/test_doc_api.py
@@ -174,18 +174,20 @@ def test_doc_api_merge(en_tokenizer):
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4: 7], attrs={'tag':'NAMED', 'lemma':'LEMMA',
'ent_type':'TYPE'})
retokenizer.merge(doc[7: 9], attrs={'tag':'NAMED', 'lemma':'LEMMA',
'ent_type':'TYPE'})
retokenizer.merge(
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
retokenizer.merge(
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)

assert len(doc) == 6
assert doc[4].text == 'the beach boys'
assert doc[4].text_with_ws == 'the beach boys '
assert doc[4].tag_ == 'NAMED'
assert doc[5].text == 'all night'
assert doc[5].text_with_ws == 'all night'
assert doc[5].tag_ == 'NAMED'
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"


def test_doc_api_merge_children(en_tokenizer):
6 changes: 3 additions & 3 deletions spacy/tests/doc/test_pickle_doc.py
@@ -16,7 +16,7 @@ def test_pickle_single_doc():
def test_list_of_docs_pickles_efficiently():
nlp = Language()
for i in range(10000):
_ = nlp.vocab[unicode_(i)]
_ = nlp.vocab[unicode_(i)] # noqa: F841
one_pickled = pickle.dumps(nlp("0"), -1)
docs = list(nlp.pipe(unicode_(i) for i in range(100)))
many_pickled = pickle.dumps(docs, -1)
@@ -33,7 +33,7 @@ def test_user_data_from_disk():
doc.user_data[(0, 1)] = False
b = doc.to_bytes()
doc2 = doc.__class__(doc.vocab).from_bytes(b)
assert doc2.user_data[(0, 1)] == False
assert doc2.user_data[(0, 1)] is False


def test_user_data_unpickles():
@@ -42,7 +42,7 @@ def test_user_data_unpickles():
doc.user_data[(0, 1)] = False
b = pickle.dumps(doc)
doc2 = pickle.loads(b)
assert doc2.user_data[(0, 1)] == False
assert doc2.user_data[(0, 1)] is False


def test_hooks_unpickle():
4 changes: 2 additions & 2 deletions spacy/tests/doc/test_span_merge.py
@@ -87,15 +87,15 @@ def test_span_np_merges(en_tokenizer):
ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents]
for start, end, label, lemma in ents:
merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
assert merged != None, (start, end, label, lemma)
assert merged is not None, (start, end, label, lemma)

text = "One test with entities like New York City so the ents list is not void"
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
for span in doc.ents:
merged = doc.merge()
assert merged != None, (span.start, span.end, span.label_, span.lemma_)
assert merged is not None, (span.start, span.end, span.label_, span.lemma_)


def test_spans_entity_merge(en_tokenizer):
4 changes: 2 additions & 2 deletions spacy/tests/doc/test_underscore.py
@@ -22,9 +22,9 @@ def test_doc_underscore_getattr_setattr():
doc.user_data = {}
Underscore.doc_extensions["hello"] = (False, None, None, None)
doc._ = Underscore(Underscore.doc_extensions, doc)
assert doc._.hello == False
assert doc._.hello is False
doc._.hello = True
assert doc._.hello == True
assert doc._.hello is True


def test_create_span_underscore():
2 changes: 1 addition & 1 deletion spacy/tests/lang/ar/test_text.py
@@ -9,5 +9,5 @@ def test_ar_tokenizer_handles_long_text(ar_tokenizer):
و قد نجح في الحصول على جائزة نوبل للآداب، ليكون بذلك العربي الوحيد الذي فاز بها."""

tokens = ar_tokenizer(text)
assert tokens[3].is_stop == True
assert tokens[3].is_stop is True
assert len(tokens) == 77
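
The assertion changes in test_pickle_doc.py, test_span_merge.py, test_underscore.py and the Arabic test above all follow flake8's E711/E712 advice: None, True and False are singletons, so identity checks (is, is not) are the reliable way to test for them, while == can be satisfied by any object with a permissive __eq__. A minimal self-contained illustration with a hypothetical class (the "== False" line is exactly what E712 warns about):

class Agreeable:
    # A permissive __eq__ makes equality checks against singletons misleading.
    def __eq__(self, other):
        return True


value = Agreeable()
assert value == False        # passes even though value is not the False singleton
assert not (value is False)  # the identity check reports the truth
assert value is not None     # the same reasoning applies to the "!= None" fix above
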
15 changes: 7 additions & 8 deletions spacy/tests/lang/en/test_noun_chunks.py
@@ -9,13 +9,12 @@
from ...util import get_doc


def test_en_noun_chunks_not_nested(en_tokenizer):
text = "Peter has chronic command and control issues"
def test_en_noun_chunks_not_nested(en_vocab):
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
heads = [1, 0, 4, 3, -1, -2, -5]
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
tokens.from_array(
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
doc.from_array(
[HEAD, DEP],
numpy.asarray(
[
@@ -30,11 +29,11 @@ def test_en_noun_chunks_not_nested(en_tokenizer):
dtype="uint64",
),
)
tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
word_occurred = {}
for chunk in tokens.noun_chunks:
for chunk in doc.noun_chunks:
for word in chunk:
word_occurred.setdefault(word.text, 0)
word_occurred[word.text] += 1
for word, freq in word_occurred.items():
assert freq == 1, (word, [chunk.text for chunk in tokens.noun_chunks])
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
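
The rewritten noun-chunks test builds the Doc directly from en_vocab with the get_doc test helper and predefined heads and deps, instead of running the English tokenizer and copying its words. For context, the property under test (noun chunks never nest or overlap) can also be observed on a regular pipeline; a hypothetical usage sketch, assuming the en_core_web_sm model is installed:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the small English model is available
doc = nlp("Peter has chronic command and control issues")

# Each token should belong to at most one noun chunk.
seen = set()
for chunk in doc.noun_chunks:
    for token in chunk:
        assert token.i not in seen
        seen.add(token.i)
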
