
Tidy up and format remaining files
ines committed Nov 30, 2018
1 parent 2a95133 commit 323fc26
Showing 35 changed files with 391 additions and 366 deletions.
1 change: 1 addition & 0 deletions .flake8
@@ -11,3 +11,4 @@ exclude =
_tokenizer_exceptions_list.py,
spacy/lang/fr/lemmatizer,
spacy/lang/nb/lemmatizer
spacy/__init__.py
3 changes: 2 additions & 1 deletion spacy/__init__.py
@@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
import warnings

warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

@@ -15,7 +16,7 @@


def load(name, **overrides):
depr_path = overrides.get('path')
depr_path = overrides.get("path")
if depr_path not in (True, False, None):
deprecation_warning(Warnings.W001.format(path=depr_path))
return util.load_model(name, **overrides)
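
The two filterwarnings calls added at the top of spacy/__init__.py silence numpy's "size changed" binary-compatibility warnings, which show up when compiled extensions were built against a different numpy release than the one installed. A minimal sketch of how such a filter behaves (the warning below is raised by hand purely for illustration):

import warnings

# Suppress any warning whose message starts with this pattern.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")

# Without the filter this would emit a UserWarning; with it, nothing is shown.
warnings.warn("numpy.dtype size changed, may indicate binary incompatibility")
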
2 changes: 1 addition & 1 deletion spacy/_ml.py
@@ -29,7 +29,7 @@
try:
import torch.nn
from thinc.extra.wrappers import PyTorchWrapperRNN
except:
except ImportError:
torch = None

VECTORS_KEY = "spacy_pretrained_vectors"
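
The _ml.py change narrows the bare except around the optional torch import to except ImportError, so only a missing dependency is silenced. A short sketch of the pattern, reusing the imports shown in the hunk above:

try:
    import torch.nn
    from thinc.extra.wrappers import PyTorchWrapperRNN
except ImportError:
    # Only a missing optional dependency is expected here; a bare "except:"
    # would also swallow KeyboardInterrupt, SystemExit and genuine bugs.
    torch = None
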
9 changes: 3 additions & 6 deletions spacy/language.py
@@ -479,14 +479,11 @@ def begin_training(self, get_gold_tuples=None, sgd=None, **cfg):
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
for word in annots[1]:
_ = self.vocab[word]
contexts = []
_ = self.vocab[word] # noqa: F841
if cfg.get("device", -1) >= 0:
device = util.use_gpu(cfg["device"])
util.use_gpu(cfg["device"])
if self.vocab.vectors.data.shape[1] >= 1:
self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data)
else:
device = None
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
cfg["pretrained_vectors"] = self.vocab.vectors.name
@@ -742,7 +739,7 @@ def from_bytes(self, bytes_data, disable=[]):
if not hasattr(proc, "from_bytes"):
continue
deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False)
msg = util.from_bytes(bytes_data, deserializers, {})
util.from_bytes(bytes_data, deserializers, {})
return self


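
The language.py edits drop bindings that are never read again (contexts, device, msg), keeping the calls only for their side effects, and mark the one intentional throwaway vocab lookup with # noqa: F841 (flake8's "local variable is assigned to but never used"). A self-contained sketch of the same pattern, using a hypothetical helper name:

def warm_vocab(vocab, words):
    # Hypothetical helper mirroring the begin_training change above: each lookup
    # is done only for its side effect of adding the word to the vocab, so the
    # binding is a deliberate throwaway and the F841 warning is silenced.
    for word in words:
        _ = vocab[word]  # noqa: F841
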
10 changes: 5 additions & 5 deletions spacy/tests/conftest.py
@@ -97,13 +97,13 @@ def da_tokenizer():

@pytest.fixture(scope="session")
def ja_tokenizer():
mecab = pytest.importorskip("MeCab")
pytest.importorskip("MeCab")
return get_lang_class("ja").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def th_tokenizer():
pythainlp = pytest.importorskip("pythainlp")
pytest.importorskip("pythainlp")
return get_lang_class("th").Defaults.create_tokenizer()


@@ -112,9 +112,9 @@ def tr_tokenizer():
return get_lang_class("tr").Defaults.create_tokenizer()


@pytest.fixture(scope='session')
@pytest.fixture(scope="session")
def ca_tokenizer():
return get_lang_class('ca').Defaults.create_tokenizer()
return get_lang_class("ca").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
@@ -139,7 +139,7 @@ def ur_tokenizer():

@pytest.fixture(scope="session")
def ru_tokenizer():
pymorphy = pytest.importorskip("pymorphy2")
pytest.importorskip("pymorphy2")
return get_lang_class("ru").Defaults.create_tokenizer()


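
pytest.importorskip imports the named module, skips the dependent tests when the package is missing, and returns the module object. The fixtures here only need the skip behaviour, so the unused bindings (mecab, pythainlp, pymorphy) are removed. A short sketch of both usages (the fixture is taken from the hunk above; the test under it is a hypothetical illustration):

import pytest
from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def ja_tokenizer():
    pytest.importorskip("MeCab")  # only the skip side effect is wanted here
    return get_lang_class("ja").Defaults.create_tokenizer()


def test_mecab_tagger_is_available():
    MeCab = pytest.importorskip("MeCab")  # keep the binding when the module is used
    assert hasattr(MeCab, "Tagger")
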
12 changes: 5 additions & 7 deletions spacy/tests/doc/test_add_entities.py
@@ -14,15 +14,13 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
ner = EntityRecognizer(en_vocab)
ner.begin_training([])
ner(doc)

assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (['O'] * len(doc))

doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)]
assert [w.ent_iob_ for w in doc] == ['', '', '', 'B']
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]

doc.ents = [(doc.vocab.strings['WORD'], 0, 2)]
assert [w.ent_iob_ for w in doc] == ['B', 'I', '', '']

def test_add_overlapping_entities(en_vocab):
text = ["Louisiana", "Office", "of", "Conservation"]
22 changes: 12 additions & 10 deletions spacy/tests/doc/test_doc_api.py
@@ -174,18 +174,20 @@ def test_doc_api_merge(en_tokenizer):
doc = en_tokenizer(text)
assert len(doc) == 9
with doc.retokenize() as retokenizer:
retokenizer.merge(doc[4: 7], attrs={'tag':'NAMED', 'lemma':'LEMMA',
'ent_type':'TYPE'})
retokenizer.merge(doc[7: 9], attrs={'tag':'NAMED', 'lemma':'LEMMA',
'ent_type':'TYPE'})
retokenizer.merge(
doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)
retokenizer.merge(
doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
)

assert len(doc) == 6
assert doc[4].text == 'the beach boys'
assert doc[4].text_with_ws == 'the beach boys '
assert doc[4].tag_ == 'NAMED'
assert doc[5].text == 'all night'
assert doc[5].text_with_ws == 'all night'
assert doc[5].tag_ == 'NAMED'
assert doc[4].text == "the beach boys"
assert doc[4].text_with_ws == "the beach boys "
assert doc[4].tag_ == "NAMED"
assert doc[5].text == "all night"
assert doc[5].text_with_ws == "all night"
assert doc[5].tag_ == "NAMED"


def test_doc_api_merge_children(en_tokenizer):
6 changes: 3 additions & 3 deletions spacy/tests/doc/test_pickle_doc.py
@@ -16,7 +16,7 @@ def test_pickle_single_doc():
def test_list_of_docs_pickles_efficiently():
nlp = Language()
for i in range(10000):
_ = nlp.vocab[unicode_(i)]
_ = nlp.vocab[unicode_(i)] # noqa: F841
one_pickled = pickle.dumps(nlp("0"), -1)
docs = list(nlp.pipe(unicode_(i) for i in range(100)))
many_pickled = pickle.dumps(docs, -1)
@@ -33,7 +33,7 @@ def test_user_data_from_disk():
doc.user_data[(0, 1)] = False
b = doc.to_bytes()
doc2 = doc.__class__(doc.vocab).from_bytes(b)
assert doc2.user_data[(0, 1)] == False
assert doc2.user_data[(0, 1)] is False


def test_user_data_unpickles():
@@ -42,7 +42,7 @@ def test_user_data_unpickles():
doc.user_data[(0, 1)] = False
b = pickle.dumps(doc)
doc2 = pickle.loads(b)
assert doc2.user_data[(0, 1)] == False
assert doc2.user_data[(0, 1)] is False


def test_hooks_unpickle():
4 changes: 2 additions & 2 deletions spacy/tests/doc/test_span_merge.py
@@ -87,15 +87,15 @@ def test_span_np_merges(en_tokenizer):
ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents]
for start, end, label, lemma in ents:
merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label)
assert merged != None, (start, end, label, lemma)
assert merged is not None, (start, end, label, lemma)

text = "One test with entities like New York City so the ents list is not void"
heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads)
for span in doc.ents:
merged = doc.merge()
assert merged != None, (span.start, span.end, span.label_, span.lemma_)
assert merged is not None, (span.start, span.end, span.label_, span.lemma_)


def test_spans_entity_merge(en_tokenizer):
4 changes: 2 additions & 2 deletions spacy/tests/doc/test_underscore.py
@@ -22,9 +22,9 @@ def test_doc_underscore_getattr_setattr():
doc.user_data = {}
Underscore.doc_extensions["hello"] = (False, None, None, None)
doc._ = Underscore(Underscore.doc_extensions, doc)
assert doc._.hello == False
assert doc._.hello is False
doc._.hello = True
assert doc._.hello == True
assert doc._.hello is True


def test_create_span_underscore():
2 changes: 1 addition & 1 deletion spacy/tests/lang/ar/test_text.py
@@ -9,5 +9,5 @@ def test_ar_tokenizer_handles_long_text(ar_tokenizer):
و قد نجح في الحصول على جائزة نوبل للآداب، ليكون بذلك العربي الوحيد الذي فاز بها."""

tokens = ar_tokenizer(text)
assert tokens[3].is_stop == True
assert tokens[3].is_stop is True
assert len(tokens) == 77
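
The assertion changes in test_pickle_doc.py, test_span_merge.py, test_underscore.py and the Arabic test above all follow flake8's E711/E712 advice: None, True and False are singletons, so identity checks (is, is not) are the reliable way to test for them, while == can be satisfied by any object with a permissive __eq__. A minimal self-contained illustration with a hypothetical class (the "== False" line is exactly what E712 warns about):

class Agreeable:
    # A permissive __eq__ makes equality checks against singletons misleading.
    def __eq__(self, other):
        return True


value = Agreeable()
assert value == False        # passes even though value is not the False singleton
assert not (value is False)  # the identity check reports the truth
assert value is not None     # the same reasoning applies to the "!= None" fix above
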
15 changes: 7 additions & 8 deletions spacy/tests/lang/en/test_noun_chunks.py
@@ -9,13 +9,12 @@
from ...util import get_doc


def test_en_noun_chunks_not_nested(en_tokenizer):
text = "Peter has chronic command and control issues"
def test_en_noun_chunks_not_nested(en_vocab):
words = ["Peter", "has", "chronic", "command", "and", "control", "issues"]
heads = [1, 0, 4, 3, -1, -2, -5]
deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"]
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
tokens.from_array(
doc = get_doc(en_vocab, words=words, heads=heads, deps=deps)
doc.from_array(
[HEAD, DEP],
numpy.asarray(
[
@@ -30,11 +29,11 @@ def test_en_noun_chunks_not_nested(en_tokenizer):
dtype="uint64",
),
)
tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"]
word_occurred = {}
for chunk in tokens.noun_chunks:
for chunk in doc.noun_chunks:
for word in chunk:
word_occurred.setdefault(word.text, 0)
word_occurred[word.text] += 1
for word, freq in word_occurred.items():
assert freq == 1, (word, [chunk.text for chunk in tokens.noun_chunks])
assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks])
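
The rewritten noun-chunks test builds the Doc directly from en_vocab with the get_doc test helper and predefined heads and deps, instead of running the English tokenizer and copying its words. For context, the property under test (noun chunks never nest or overlap) can also be observed on a regular pipeline; a hypothetical usage sketch, assuming the en_core_web_sm model is installed:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the small English model is available
doc = nlp("Peter has chronic command and control issues")

# Each token should belong to at most one noun chunk.
seen = set()
for chunk in doc.noun_chunks:
    for token in chunk:
        assert token.i not in seen
        seen.add(token.i)
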
