diff --git a/.flake8 b/.flake8 index bd134702816..dfedc15df80 100644 --- a/.flake8 +++ b/.flake8 @@ -11,3 +11,4 @@ exclude = _tokenizer_exceptions_list.py, spacy/lang/fr/lemmatizer, spacy/lang/nb/lemmatizer + spacy/__init__.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 411883eaebc..3498aafb759 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals import warnings + warnings.filterwarnings("ignore", message="numpy.dtype size changed") warnings.filterwarnings("ignore", message="numpy.ufunc size changed") @@ -15,7 +16,7 @@ def load(name, **overrides): - depr_path = overrides.get('path') + depr_path = overrides.get("path") if depr_path not in (True, False, None): deprecation_warning(Warnings.W001.format(path=depr_path)) return util.load_model(name, **overrides) diff --git a/spacy/_ml.py b/spacy/_ml.py index fd3893e5f57..b6bc1792bd2 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -29,7 +29,7 @@ try: import torch.nn from thinc.extra.wrappers import PyTorchWrapperRNN -except: +except ImportError: torch = None VECTORS_KEY = "spacy_pretrained_vectors" diff --git a/spacy/language.py b/spacy/language.py index b21fededc1d..092b02a2464 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -479,14 +479,11 @@ def begin_training(self, get_gold_tuples=None, sgd=None, **cfg): for _, annots_brackets in get_gold_tuples(): for annots, _ in annots_brackets: for word in annots[1]: - _ = self.vocab[word] - contexts = [] + _ = self.vocab[word] # noqa: F841 if cfg.get("device", -1) >= 0: - device = util.use_gpu(cfg["device"]) + util.use_gpu(cfg["device"]) if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = Model.ops.asarray(self.vocab.vectors.data) - else: - device = None link_vectors_to_models(self.vocab) if self.vocab.vectors.data.shape[1]: cfg["pretrained_vectors"] = self.vocab.vectors.name @@ -742,7 +739,7 @@ def from_bytes(self, bytes_data, disable=[]): if not hasattr(proc, "from_bytes"): continue deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False) - msg = util.from_bytes(bytes_data, deserializers, {}) + util.from_bytes(bytes_data, deserializers, {}) return self diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 43eab6f6cab..d07b79efe3d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -97,13 +97,13 @@ def da_tokenizer(): @pytest.fixture(scope="session") def ja_tokenizer(): - mecab = pytest.importorskip("MeCab") + pytest.importorskip("MeCab") return get_lang_class("ja").Defaults.create_tokenizer() @pytest.fixture(scope="session") def th_tokenizer(): - pythainlp = pytest.importorskip("pythainlp") + pytest.importorskip("pythainlp") return get_lang_class("th").Defaults.create_tokenizer() @@ -112,9 +112,9 @@ def tr_tokenizer(): return get_lang_class("tr").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def ca_tokenizer(): - return get_lang_class('ca').Defaults.create_tokenizer() + return get_lang_class("ca").Defaults.create_tokenizer() @pytest.fixture(scope="session") @@ -139,7 +139,7 @@ def ur_tokenizer(): @pytest.fixture(scope="session") def ru_tokenizer(): - pymorphy = pytest.importorskip("pymorphy2") + pytest.importorskip("pymorphy2") return get_lang_class("ru").Defaults.create_tokenizer() diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index f05da4fe8d9..433541c483e 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ 
-14,15 +14,13 @@ def test_doc_add_entities_set_ents_iob(en_vocab): ner = EntityRecognizer(en_vocab) ner.begin_training([]) ner(doc) - assert len(list(doc.ents)) == 0 - assert [w.ent_iob_ for w in doc] == (['O'] * len(doc)) - - doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)] - assert [w.ent_iob_ for w in doc] == ['', '', '', 'B'] + assert [w.ent_iob_ for w in doc] == (["O"] * len(doc)) + doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] + assert [w.ent_iob_ for w in doc] == ["", "", "", "B"] + doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] + assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""] - doc.ents = [(doc.vocab.strings['WORD'], 0, 2)] - assert [w.ent_iob_ for w in doc] == ['B', 'I', '', ''] def test_add_overlapping_entities(en_vocab): text = ["Louisiana", "Office", "of", "Conservation"] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index fe48945e723..ce4083e8aa8 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -174,18 +174,20 @@ def test_doc_api_merge(en_tokenizer): doc = en_tokenizer(text) assert len(doc) == 9 with doc.retokenize() as retokenizer: - retokenizer.merge(doc[4: 7], attrs={'tag':'NAMED', 'lemma':'LEMMA', - 'ent_type':'TYPE'}) - retokenizer.merge(doc[7: 9], attrs={'tag':'NAMED', 'lemma':'LEMMA', - 'ent_type':'TYPE'}) + retokenizer.merge( + doc[4:7], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} + ) + retokenizer.merge( + doc[7:9], attrs={"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} + ) assert len(doc) == 6 - assert doc[4].text == 'the beach boys' - assert doc[4].text_with_ws == 'the beach boys ' - assert doc[4].tag_ == 'NAMED' - assert doc[5].text == 'all night' - assert doc[5].text_with_ws == 'all night' - assert doc[5].tag_ == 'NAMED' + assert doc[4].text == "the beach boys" + assert doc[4].text_with_ws == "the beach boys " + assert doc[4].tag_ == "NAMED" + assert doc[5].text == "all night" + assert doc[5].text_with_ws == "all night" + assert doc[5].tag_ == "NAMED" def test_doc_api_merge_children(en_tokenizer): diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py index f4bfb2f1e8b..2b6970a3862 100644 --- a/spacy/tests/doc/test_pickle_doc.py +++ b/spacy/tests/doc/test_pickle_doc.py @@ -16,7 +16,7 @@ def test_pickle_single_doc(): def test_list_of_docs_pickles_efficiently(): nlp = Language() for i in range(10000): - _ = nlp.vocab[unicode_(i)] + _ = nlp.vocab[unicode_(i)] # noqa: F841 one_pickled = pickle.dumps(nlp("0"), -1) docs = list(nlp.pipe(unicode_(i) for i in range(100))) many_pickled = pickle.dumps(docs, -1) @@ -33,7 +33,7 @@ def test_user_data_from_disk(): doc.user_data[(0, 1)] = False b = doc.to_bytes() doc2 = doc.__class__(doc.vocab).from_bytes(b) - assert doc2.user_data[(0, 1)] == False + assert doc2.user_data[(0, 1)] is False def test_user_data_unpickles(): @@ -42,7 +42,7 @@ def test_user_data_unpickles(): doc.user_data[(0, 1)] = False b = pickle.dumps(doc) doc2 = pickle.loads(b) - assert doc2.user_data[(0, 1)] == False + assert doc2.user_data[(0, 1)] is False def test_hooks_unpickle(): diff --git a/spacy/tests/doc/test_span_merge.py b/spacy/tests/doc/test_span_merge.py index 3247212c8c9..d95894dda61 100644 --- a/spacy/tests/doc/test_span_merge.py +++ b/spacy/tests/doc/test_span_merge.py @@ -87,7 +87,7 @@ def test_span_np_merges(en_tokenizer): ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) for e in doc.ents] for start, end, label, lemma in ents: merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label) - 
assert merged != None, (start, end, label, lemma) + assert merged is not None, (start, end, label, lemma) text = "One test with entities like New York City so the ents list is not void" heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2] @@ -95,7 +95,7 @@ def test_span_np_merges(en_tokenizer): doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) for span in doc.ents: merged = doc.merge() - assert merged != None, (span.start, span.end, span.label_, span.lemma_) + assert merged is not None, (span.start, span.end, span.label_, span.lemma_) def test_spans_entity_merge(en_tokenizer): diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 8b077525a5d..6d79c56e720 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -22,9 +22,9 @@ def test_doc_underscore_getattr_setattr(): doc.user_data = {} Underscore.doc_extensions["hello"] = (False, None, None, None) doc._ = Underscore(Underscore.doc_extensions, doc) - assert doc._.hello == False + assert doc._.hello is False doc._.hello = True - assert doc._.hello == True + assert doc._.hello is True def test_create_span_underscore(): diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py index 951c24fa6cc..109c3721a2f 100644 --- a/spacy/tests/lang/ar/test_text.py +++ b/spacy/tests/lang/ar/test_text.py @@ -9,5 +9,5 @@ def test_ar_tokenizer_handles_long_text(ar_tokenizer): و قد نجح في الحصول على جائزة نوبل للآداب، ليكون بذلك العربي الوحيد الذي فاز بها.""" tokens = ar_tokenizer(text) - assert tokens[3].is_stop == True + assert tokens[3].is_stop is True assert len(tokens) == 77 diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index b3a6696d3c5..7dc47f9cc28 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -9,13 +9,12 @@ from ...util import get_doc -def test_en_noun_chunks_not_nested(en_tokenizer): - text = "Peter has chronic command and control issues" +def test_en_noun_chunks_not_nested(en_vocab): + words = ["Peter", "has", "chronic", "command", "and", "control", "issues"] heads = [1, 0, 4, 3, -1, -2, -5] deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"] - tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) - tokens.from_array( + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc.from_array( [HEAD, DEP], numpy.asarray( [ @@ -30,11 +29,11 @@ def test_en_noun_chunks_not_nested(en_tokenizer): dtype="uint64", ), ) - tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"] + doc.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"] word_occurred = {} - for chunk in tokens.noun_chunks: + for chunk in doc.noun_chunks: for word in chunk: word_occurred.setdefault(word.text, 0) word_occurred[word.text] += 1 for word, freq in word_occurred.items(): - assert freq == 1, (word, [chunk.text for chunk in tokens.noun_chunks]) + assert freq == 1, (word, [chunk.text for chunk in doc.noun_chunks]) diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index c29366fc8ee..fa8e132c003 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -5,279 +5,309 @@ DEFAULT_TESTS = [ - ('N. 
kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), - ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.']), - ('Soha nem lesz!', ['Soha', 'nem', 'lesz', '!']), - ('Soha nem lesz?', ['Soha', 'nem', 'lesz', '?']) + ("N. kormányzósági\nszékhely.", ["N.", "kormányzósági", "székhely", "."]), + pytest.param( + "A .hu egy tld.", ["A", ".hu", "egy", "tld", "."], marks=pytest.mark.xfail() + ), + ("Az egy.ketto pelda.", ["Az", "egy.ketto", "pelda", "."]), + ("A pl. rovidites.", ["A", "pl.", "rovidites", "."]), + ("A S.M.A.R.T. szo.", ["A", "S.M.A.R.T.", "szo", "."]), + pytest.param("A .hu.", ["A", ".hu", "."], marks=pytest.mark.xfail()), + ("Az egy.ketto.", ["Az", "egy.ketto", "."]), + ("A pl.", ["A", "pl."]), + ("A S.M.A.R.T.", ["A", "S.M.A.R.T."]), + ("Egy..ket.", ["Egy", "..", "ket", "."]), + ("Valami... van.", ["Valami", "...", "van", "."]), + ("Valami ...van...", ["Valami", "...", "van", "..."]), + ("Valami...", ["Valami", "..."]), + ("Valami ...", ["Valami", "..."]), + ("Valami ... más.", ["Valami", "...", "más", "."]), + ("Soha nem lesz!", ["Soha", "nem", "lesz", "!"]), + ("Soha nem lesz?", ["Soha", "nem", "lesz", "?"]), ] HYPHEN_TESTS = [ - ('Egy -nak, -jaiért, -magyar, bel- van.', ['Egy', '-nak', ',', '-jaiért', ',', '-magyar', ',', 'bel-', 'van', '.']), - ('Szabolcs-Szatmár-Bereg megye', ['Szabolcs-Szatmár-Bereg', 'megye']), - ('Egy -nak.', ['Egy', '-nak', '.']), - ('Egy bel-.', ['Egy', 'bel-', '.']), - ('Dinnye-domb-.', ['Dinnye-domb-', '.']), - ('Ezen -e elcsatangolt.', ['Ezen', '-e', 'elcsatangolt', '.']), - ('Lakik-e', ['Lakik', '-e']), - ('A--B', ['A', '--', 'B']), - ('Lakik-e?', ['Lakik', '-e', '?']), - ('Lakik-e.', ['Lakik', '-e', '.']), - ('Lakik-e...', ['Lakik', '-e', '...']), - ('Lakik-e... 
van.', ['Lakik', '-e', '...', 'van', '.']), - ('Lakik-e van?', ['Lakik', '-e', 'van', '?']), - ('Lakik-elem van?', ['Lakik-elem', 'van', '?']), - ('Az életbiztosításáról- egy.', ['Az', 'életbiztosításáról-', 'egy', '.']), - ('Van lakik-elem.', ['Van', 'lakik-elem', '.']), - ('A 7-es busz?', ['A', '7-es', 'busz', '?']), - ('A 7-es?', ['A', '7-es', '?']), - ('A 7-es.', ['A', '7-es', '.']), - ('Ez (lakik)-e?', ['Ez', '(', 'lakik', ')', '-e', '?']), - ('A %-sal.', ['A', '%-sal', '.']), - ('A $-sal.', ['A', '$-sal', '.']), - ('A CD-ROM-okrol.', ['A', 'CD-ROM-okrol', '.']) + ( + "Egy -nak, -jaiért, -magyar, bel- van.", + ["Egy", "-nak", ",", "-jaiért", ",", "-magyar", ",", "bel-", "van", "."], + ), + ("Szabolcs-Szatmár-Bereg megye", ["Szabolcs-Szatmár-Bereg", "megye"]), + ("Egy -nak.", ["Egy", "-nak", "."]), + ("Egy bel-.", ["Egy", "bel-", "."]), + ("Dinnye-domb-.", ["Dinnye-domb-", "."]), + ("Ezen -e elcsatangolt.", ["Ezen", "-e", "elcsatangolt", "."]), + ("Lakik-e", ["Lakik", "-e"]), + ("A--B", ["A", "--", "B"]), + ("Lakik-e?", ["Lakik", "-e", "?"]), + ("Lakik-e.", ["Lakik", "-e", "."]), + ("Lakik-e...", ["Lakik", "-e", "..."]), + ("Lakik-e... van.", ["Lakik", "-e", "...", "van", "."]), + ("Lakik-e van?", ["Lakik", "-e", "van", "?"]), + ("Lakik-elem van?", ["Lakik-elem", "van", "?"]), + ("Az életbiztosításáról- egy.", ["Az", "életbiztosításáról-", "egy", "."]), + ("Van lakik-elem.", ["Van", "lakik-elem", "."]), + ("A 7-es busz?", ["A", "7-es", "busz", "?"]), + ("A 7-es?", ["A", "7-es", "?"]), + ("A 7-es.", ["A", "7-es", "."]), + ("Ez (lakik)-e?", ["Ez", "(", "lakik", ")", "-e", "?"]), + ("A %-sal.", ["A", "%-sal", "."]), + ("A $-sal.", ["A", "$-sal", "."]), + ("A CD-ROM-okrol.", ["A", "CD-ROM-okrol", "."]), ] NUMBER_TESTS = [ - ('A 2b van.', ['A', '2b', 'van', '.']), - ('A 2b-ben van.', ['A', '2b-ben', 'van', '.']), - ('A 2b.', ['A', '2b', '.']), - ('A 2b-ben.', ['A', '2b-ben', '.']), - ('A 3.b van.', ['A', '3.b', 'van', '.']), - ('A 3.b-ben van.', ['A', '3.b-ben', 'van', '.']), - ('A 3.b.', ['A', '3.b', '.']), - ('A 3.b-ben.', ['A', '3.b-ben', '.']), - ('A 1:20:36.7 van.', ['A', '1:20:36.7', 'van', '.']), - ('A 1:20:36.7-ben van.', ['A', '1:20:36.7-ben', 'van', '.']), - ('A 1:20:36.7-ben.', ['A', '1:20:36.7-ben', '.']), - ('A 1:35 van.', ['A', '1:35', 'van', '.']), - ('A 1:35-ben van.', ['A', '1:35-ben', 'van', '.']), - ('A 1:35-ben.', ['A', '1:35-ben', '.']), - ('A 1.35 van.', ['A', '1.35', 'van', '.']), - ('A 1.35-ben van.', ['A', '1.35-ben', 'van', '.']), - ('A 1.35-ben.', ['A', '1.35-ben', '.']), - ('A 4:01,95 van.', ['A', '4:01,95', 'van', '.']), - ('A 4:01,95-ben van.', ['A', '4:01,95-ben', 'van', '.']), - ('A 4:01,95-ben.', ['A', '4:01,95-ben', '.']), - ('A 10--12 van.', ['A', '10--12', 'van', '.']), - ('A 10--12-ben van.', ['A', '10--12-ben', 'van', '.']), - ('A 10--12-ben.', ['A', '10--12-ben', '.']), - ('A 10‐12 van.', ['A', '10‐12', 'van', '.']), - ('A 10‐12-ben van.', ['A', '10‐12-ben', 'van', '.']), - ('A 10‐12-ben.', ['A', '10‐12-ben', '.']), - ('A 10‑12 van.', ['A', '10‑12', 'van', '.']), - ('A 10‑12-ben van.', ['A', '10‑12-ben', 'van', '.']), - ('A 10‑12-ben.', ['A', '10‑12-ben', '.']), - ('A 10‒12 van.', ['A', '10‒12', 'van', '.']), - ('A 10‒12-ben van.', ['A', '10‒12-ben', 'van', '.']), - ('A 10‒12-ben.', ['A', '10‒12-ben', '.']), - ('A 10–12 van.', ['A', '10–12', 'van', '.']), - ('A 10–12-ben van.', ['A', '10–12-ben', 'van', '.']), - ('A 10–12-ben.', ['A', '10–12-ben', '.']), - ('A 10—12 van.', ['A', '10—12', 'van', '.']), - ('A 10—12-ben van.', ['A', '10—12-ben', 'van', 
'.']), - ('A 10—12-ben.', ['A', '10—12-ben', '.']), - ('A 10―12 van.', ['A', '10―12', 'van', '.']), - ('A 10―12-ben van.', ['A', '10―12-ben', 'van', '.']), - ('A 10―12-ben.', ['A', '10―12-ben', '.']), - ('A -23,12 van.', ['A', '-23,12', 'van', '.']), - ('A -23,12-ben van.', ['A', '-23,12-ben', 'van', '.']), - ('A -23,12-ben.', ['A', '-23,12-ben', '.']), - ('A 2+3 van.', ['A', '2+3', 'van', '.']), - ('A 2<3 van.', ['A', '2<3', 'van', '.']), - ('A 2=3 van.', ['A', '2=3', 'van', '.']), - ('A 2÷3 van.', ['A', '2÷3', 'van', '.']), - ('A 1=(2÷3)-2/5 van.', ['A', '1=(2÷3)-2/5', 'van', '.']), - ('A 2 +3 van.', ['A', '2', '+3', 'van', '.']), - ('A 2+ 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2 + 3 van.', ['A', '2', '+', '3', 'van', '.']), - ('A 2*3 van.', ['A', '2*3', 'van', '.']), - ('A 2 *3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2* 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A 2 * 3 van.', ['A', '2', '*', '3', 'van', '.']), - ('A C++ van.', ['A', 'C++', 'van', '.']), - ('A C++-ben van.', ['A', 'C++-ben', 'van', '.']), - ('A C++.', ['A', 'C++', '.']), - ('A C++-ben.', ['A', 'C++-ben', '.']), - ('A 2003. I. 06. van.', ['A', '2003.', 'I.', '06.', 'van', '.']), - ('A 2003. I. 06-ben van.', ['A', '2003.', 'I.', '06-ben', 'van', '.']), - ('A 2003. I. 06.', ['A', '2003.', 'I.', '06.']), - ('A 2003. I. 06-ben.', ['A', '2003.', 'I.', '06-ben', '.']), - ('A 2003. 01. 06. van.', ['A', '2003.', '01.', '06.', 'van', '.']), - ('A 2003. 01. 06-ben van.', ['A', '2003.', '01.', '06-ben', 'van', '.']), - ('A 2003. 01. 06.', ['A', '2003.', '01.', '06.']), - ('A 2003. 01. 06-ben.', ['A', '2003.', '01.', '06-ben', '.']), - ('A IV. 12. van.', ['A', 'IV.', '12.', 'van', '.']), - ('A IV. 12-ben van.', ['A', 'IV.', '12-ben', 'van', '.']), - ('A IV. 12.', ['A', 'IV.', '12.']), - ('A IV. 12-ben.', ['A', 'IV.', '12-ben', '.']), - ('A 2003.01.06. van.', ['A', '2003.01.06.', 'van', '.']), - ('A 2003.01.06-ben van.', ['A', '2003.01.06-ben', 'van', '.']), - ('A 2003.01.06.', ['A', '2003.01.06.']), - ('A 2003.01.06-ben.', ['A', '2003.01.06-ben', '.']), - ('A IV.12. van.', ['A', 'IV.12.', 'van', '.']), - ('A IV.12-ben van.', ['A', 'IV.12-ben', 'van', '.']), - ('A IV.12.', ['A', 'IV.12.']), - ('A IV.12-ben.', ['A', 'IV.12-ben', '.']), - ('A 1.1.2. van.', ['A', '1.1.2.', 'van', '.']), - ('A 1.1.2-ben van.', ['A', '1.1.2-ben', 'van', '.']), - ('A 1.1.2.', ['A', '1.1.2.']), - ('A 1.1.2-ben.', ['A', '1.1.2-ben', '.']), - ('A 1,5--2,5 van.', ['A', '1,5--2,5', 'van', '.']), - ('A 1,5--2,5-ben van.', ['A', '1,5--2,5-ben', 'van', '.']), - ('A 1,5--2,5-ben.', ['A', '1,5--2,5-ben', '.']), - ('A 3,14 van.', ['A', '3,14', 'van', '.']), - ('A 3,14-ben van.', ['A', '3,14-ben', 'van', '.']), - ('A 3,14-ben.', ['A', '3,14-ben', '.']), - ('A 3.14 van.', ['A', '3.14', 'van', '.']), - ('A 3.14-ben van.', ['A', '3.14-ben', 'van', '.']), - ('A 3.14-ben.', ['A', '3.14-ben', '.']), - ('A 15. van.', ['A', '15.', 'van', '.']), - ('A 15-ben van.', ['A', '15-ben', 'van', '.']), - ('A 15-ben.', ['A', '15-ben', '.']), - ('A 15.-ben van.', ['A', '15.-ben', 'van', '.']), - ('A 15.-ben.', ['A', '15.-ben', '.']), - ('A 2002--2003. 
van.', ['A', '2002--2003.', 'van', '.']), - ('A 2002--2003-ben van.', ['A', '2002--2003-ben', 'van', '.']), - ('A 2002-2003-ben.', ['A', '2002-2003-ben', '.']), - ('A +0,99% van.', ['A', '+0,99%', 'van', '.']), - ('A -0,99% van.', ['A', '-0,99%', 'van', '.']), - ('A -0,99%-ben van.', ['A', '-0,99%-ben', 'van', '.']), - ('A -0,99%.', ['A', '-0,99%', '.']), - ('A -0,99%-ben.', ['A', '-0,99%-ben', '.']), - ('A 10--20% van.', ['A', '10--20%', 'van', '.']), - ('A 10--20%-ben van.', ['A', '10--20%-ben', 'van', '.']), - ('A 10--20%.', ['A', '10--20%', '.']), - ('A 10--20%-ben.', ['A', '10--20%-ben', '.']), - ('A 99§ van.', ['A', '99§', 'van', '.']), - ('A 99§-ben van.', ['A', '99§-ben', 'van', '.']), - ('A 99§-ben.', ['A', '99§-ben', '.']), - ('A 10--20§ van.', ['A', '10--20§', 'van', '.']), - ('A 10--20§-ben van.', ['A', '10--20§-ben', 'van', '.']), - ('A 10--20§-ben.', ['A', '10--20§-ben', '.']), - ('A 99° van.', ['A', '99°', 'van', '.']), - ('A 99°-ben van.', ['A', '99°-ben', 'van', '.']), - ('A 99°-ben.', ['A', '99°-ben', '.']), - ('A 10--20° van.', ['A', '10--20°', 'van', '.']), - ('A 10--20°-ben van.', ['A', '10--20°-ben', 'van', '.']), - ('A 10--20°-ben.', ['A', '10--20°-ben', '.']), - ('A °C van.', ['A', '°C', 'van', '.']), - ('A °C-ben van.', ['A', '°C-ben', 'van', '.']), - ('A °C.', ['A', '°C', '.']), - ('A °C-ben.', ['A', '°C-ben', '.']), - ('A 100°C van.', ['A', '100°C', 'van', '.']), - ('A 100°C-ben van.', ['A', '100°C-ben', 'van', '.']), - ('A 100°C.', ['A', '100°C', '.']), - ('A 100°C-ben.', ['A', '100°C-ben', '.']), - ('A 800x600 van.', ['A', '800x600', 'van', '.']), - ('A 800x600-ben van.', ['A', '800x600-ben', 'van', '.']), - ('A 800x600-ben.', ['A', '800x600-ben', '.']), - ('A 1x2x3x4 van.', ['A', '1x2x3x4', 'van', '.']), - ('A 1x2x3x4-ben van.', ['A', '1x2x3x4-ben', 'van', '.']), - ('A 1x2x3x4-ben.', ['A', '1x2x3x4-ben', '.']), - ('A 5/J van.', ['A', '5/J', 'van', '.']), - ('A 5/J-ben van.', ['A', '5/J-ben', 'van', '.']), - ('A 5/J-ben.', ['A', '5/J-ben', '.']), - ('A 5/J. van.', ['A', '5/J.', 'van', '.']), - ('A 5/J.-ben van.', ['A', '5/J.-ben', 'van', '.']), - ('A 5/J.-ben.', ['A', '5/J.-ben', '.']), - ('A III/1 van.', ['A', 'III/1', 'van', '.']), - ('A III/1-ben van.', ['A', 'III/1-ben', 'van', '.']), - ('A III/1-ben.', ['A', 'III/1-ben', '.']), - ('A III/1. van.', ['A', 'III/1.', 'van', '.']), - ('A III/1.-ben van.', ['A', 'III/1.-ben', 'van', '.']), - ('A III/1.-ben.', ['A', 'III/1.-ben', '.']), - ('A III/c van.', ['A', 'III/c', 'van', '.']), - ('A III/c-ben van.', ['A', 'III/c-ben', 'van', '.']), - ('A III/c.', ['A', 'III/c', '.']), - ('A III/c-ben.', ['A', 'III/c-ben', '.']), - ('A TU–154 van.', ['A', 'TU–154', 'van', '.']), - ('A TU–154-ben van.', ['A', 'TU–154-ben', 'van', '.']), - ('A TU–154-ben.', ['A', 'TU–154-ben', '.']), - ('A 5cm³', ['A', '5', 'cm³']), - ('A 5 $-ban', ['A', '5', '$-ban']), - ('A 5$-ban', ['A', '5$-ban']), - ('A 5$.', ['A', '5', '$', '.']), - ('A 5$', ['A', '5', '$']), - ('A $5', ['A', '$5']), - ('A 5km/h', ['A', '5', 'km/h']), - ('A 75%+1-100%-ig', ['A', '75%+1-100%-ig']), - ('A 5km/h.', ['A', '5', 'km/h', '.']), - ('3434/1992. 
évi elszámolás', ['3434/1992.', 'évi', 'elszámolás']), + ("A 2b van.", ["A", "2b", "van", "."]), + ("A 2b-ben van.", ["A", "2b-ben", "van", "."]), + ("A 2b.", ["A", "2b", "."]), + ("A 2b-ben.", ["A", "2b-ben", "."]), + ("A 3.b van.", ["A", "3.b", "van", "."]), + ("A 3.b-ben van.", ["A", "3.b-ben", "van", "."]), + ("A 3.b.", ["A", "3.b", "."]), + ("A 3.b-ben.", ["A", "3.b-ben", "."]), + ("A 1:20:36.7 van.", ["A", "1:20:36.7", "van", "."]), + ("A 1:20:36.7-ben van.", ["A", "1:20:36.7-ben", "van", "."]), + ("A 1:20:36.7-ben.", ["A", "1:20:36.7-ben", "."]), + ("A 1:35 van.", ["A", "1:35", "van", "."]), + ("A 1:35-ben van.", ["A", "1:35-ben", "van", "."]), + ("A 1:35-ben.", ["A", "1:35-ben", "."]), + ("A 1.35 van.", ["A", "1.35", "van", "."]), + ("A 1.35-ben van.", ["A", "1.35-ben", "van", "."]), + ("A 1.35-ben.", ["A", "1.35-ben", "."]), + ("A 4:01,95 van.", ["A", "4:01,95", "van", "."]), + ("A 4:01,95-ben van.", ["A", "4:01,95-ben", "van", "."]), + ("A 4:01,95-ben.", ["A", "4:01,95-ben", "."]), + ("A 10--12 van.", ["A", "10--12", "van", "."]), + ("A 10--12-ben van.", ["A", "10--12-ben", "van", "."]), + ("A 10--12-ben.", ["A", "10--12-ben", "."]), + ("A 10‐12 van.", ["A", "10‐12", "van", "."]), + ("A 10‐12-ben van.", ["A", "10‐12-ben", "van", "."]), + ("A 10‐12-ben.", ["A", "10‐12-ben", "."]), + ("A 10‑12 van.", ["A", "10‑12", "van", "."]), + ("A 10‑12-ben van.", ["A", "10‑12-ben", "van", "."]), + ("A 10‑12-ben.", ["A", "10‑12-ben", "."]), + ("A 10‒12 van.", ["A", "10‒12", "van", "."]), + ("A 10‒12-ben van.", ["A", "10‒12-ben", "van", "."]), + ("A 10‒12-ben.", ["A", "10‒12-ben", "."]), + ("A 10–12 van.", ["A", "10–12", "van", "."]), + ("A 10–12-ben van.", ["A", "10–12-ben", "van", "."]), + ("A 10–12-ben.", ["A", "10–12-ben", "."]), + ("A 10—12 van.", ["A", "10—12", "van", "."]), + ("A 10—12-ben van.", ["A", "10—12-ben", "van", "."]), + ("A 10—12-ben.", ["A", "10—12-ben", "."]), + ("A 10―12 van.", ["A", "10―12", "van", "."]), + ("A 10―12-ben van.", ["A", "10―12-ben", "van", "."]), + ("A 10―12-ben.", ["A", "10―12-ben", "."]), + ("A -23,12 van.", ["A", "-23,12", "van", "."]), + ("A -23,12-ben van.", ["A", "-23,12-ben", "van", "."]), + ("A -23,12-ben.", ["A", "-23,12-ben", "."]), + ("A 2+3 van.", ["A", "2+3", "van", "."]), + ("A 2<3 van.", ["A", "2<3", "van", "."]), + ("A 2=3 van.", ["A", "2=3", "van", "."]), + ("A 2÷3 van.", ["A", "2÷3", "van", "."]), + ("A 1=(2÷3)-2/5 van.", ["A", "1=(2÷3)-2/5", "van", "."]), + ("A 2 +3 van.", ["A", "2", "+3", "van", "."]), + ("A 2+ 3 van.", ["A", "2", "+", "3", "van", "."]), + ("A 2 + 3 van.", ["A", "2", "+", "3", "van", "."]), + ("A 2*3 van.", ["A", "2*3", "van", "."]), + ("A 2 *3 van.", ["A", "2", "*", "3", "van", "."]), + ("A 2* 3 van.", ["A", "2", "*", "3", "van", "."]), + ("A 2 * 3 van.", ["A", "2", "*", "3", "van", "."]), + ("A C++ van.", ["A", "C++", "van", "."]), + ("A C++-ben van.", ["A", "C++-ben", "van", "."]), + ("A C++.", ["A", "C++", "."]), + ("A C++-ben.", ["A", "C++-ben", "."]), + ("A 2003. I. 06. van.", ["A", "2003.", "I.", "06.", "van", "."]), + ("A 2003. I. 06-ben van.", ["A", "2003.", "I.", "06-ben", "van", "."]), + ("A 2003. I. 06.", ["A", "2003.", "I.", "06."]), + ("A 2003. I. 06-ben.", ["A", "2003.", "I.", "06-ben", "."]), + ("A 2003. 01. 06. van.", ["A", "2003.", "01.", "06.", "van", "."]), + ("A 2003. 01. 06-ben van.", ["A", "2003.", "01.", "06-ben", "van", "."]), + ("A 2003. 01. 06.", ["A", "2003.", "01.", "06."]), + ("A 2003. 01. 06-ben.", ["A", "2003.", "01.", "06-ben", "."]), + ("A IV. 12. 
van.", ["A", "IV.", "12.", "van", "."]), + ("A IV. 12-ben van.", ["A", "IV.", "12-ben", "van", "."]), + ("A IV. 12.", ["A", "IV.", "12."]), + ("A IV. 12-ben.", ["A", "IV.", "12-ben", "."]), + ("A 2003.01.06. van.", ["A", "2003.01.06.", "van", "."]), + ("A 2003.01.06-ben van.", ["A", "2003.01.06-ben", "van", "."]), + ("A 2003.01.06.", ["A", "2003.01.06."]), + ("A 2003.01.06-ben.", ["A", "2003.01.06-ben", "."]), + ("A IV.12. van.", ["A", "IV.12.", "van", "."]), + ("A IV.12-ben van.", ["A", "IV.12-ben", "van", "."]), + ("A IV.12.", ["A", "IV.12."]), + ("A IV.12-ben.", ["A", "IV.12-ben", "."]), + ("A 1.1.2. van.", ["A", "1.1.2.", "van", "."]), + ("A 1.1.2-ben van.", ["A", "1.1.2-ben", "van", "."]), + ("A 1.1.2.", ["A", "1.1.2."]), + ("A 1.1.2-ben.", ["A", "1.1.2-ben", "."]), + ("A 1,5--2,5 van.", ["A", "1,5--2,5", "van", "."]), + ("A 1,5--2,5-ben van.", ["A", "1,5--2,5-ben", "van", "."]), + ("A 1,5--2,5-ben.", ["A", "1,5--2,5-ben", "."]), + ("A 3,14 van.", ["A", "3,14", "van", "."]), + ("A 3,14-ben van.", ["A", "3,14-ben", "van", "."]), + ("A 3,14-ben.", ["A", "3,14-ben", "."]), + ("A 3.14 van.", ["A", "3.14", "van", "."]), + ("A 3.14-ben van.", ["A", "3.14-ben", "van", "."]), + ("A 3.14-ben.", ["A", "3.14-ben", "."]), + ("A 15. van.", ["A", "15.", "van", "."]), + ("A 15-ben van.", ["A", "15-ben", "van", "."]), + ("A 15-ben.", ["A", "15-ben", "."]), + ("A 15.-ben van.", ["A", "15.-ben", "van", "."]), + ("A 15.-ben.", ["A", "15.-ben", "."]), + ("A 2002--2003. van.", ["A", "2002--2003.", "van", "."]), + ("A 2002--2003-ben van.", ["A", "2002--2003-ben", "van", "."]), + ("A 2002-2003-ben.", ["A", "2002-2003-ben", "."]), + ("A +0,99% van.", ["A", "+0,99%", "van", "."]), + ("A -0,99% van.", ["A", "-0,99%", "van", "."]), + ("A -0,99%-ben van.", ["A", "-0,99%-ben", "van", "."]), + ("A -0,99%.", ["A", "-0,99%", "."]), + ("A -0,99%-ben.", ["A", "-0,99%-ben", "."]), + ("A 10--20% van.", ["A", "10--20%", "van", "."]), + ("A 10--20%-ben van.", ["A", "10--20%-ben", "van", "."]), + ("A 10--20%.", ["A", "10--20%", "."]), + ("A 10--20%-ben.", ["A", "10--20%-ben", "."]), + ("A 99§ van.", ["A", "99§", "van", "."]), + ("A 99§-ben van.", ["A", "99§-ben", "van", "."]), + ("A 99§-ben.", ["A", "99§-ben", "."]), + ("A 10--20§ van.", ["A", "10--20§", "van", "."]), + ("A 10--20§-ben van.", ["A", "10--20§-ben", "van", "."]), + ("A 10--20§-ben.", ["A", "10--20§-ben", "."]), + ("A 99° van.", ["A", "99°", "van", "."]), + ("A 99°-ben van.", ["A", "99°-ben", "van", "."]), + ("A 99°-ben.", ["A", "99°-ben", "."]), + ("A 10--20° van.", ["A", "10--20°", "van", "."]), + ("A 10--20°-ben van.", ["A", "10--20°-ben", "van", "."]), + ("A 10--20°-ben.", ["A", "10--20°-ben", "."]), + ("A °C van.", ["A", "°C", "van", "."]), + ("A °C-ben van.", ["A", "°C-ben", "van", "."]), + ("A °C.", ["A", "°C", "."]), + ("A °C-ben.", ["A", "°C-ben", "."]), + ("A 100°C van.", ["A", "100°C", "van", "."]), + ("A 100°C-ben van.", ["A", "100°C-ben", "van", "."]), + ("A 100°C.", ["A", "100°C", "."]), + ("A 100°C-ben.", ["A", "100°C-ben", "."]), + ("A 800x600 van.", ["A", "800x600", "van", "."]), + ("A 800x600-ben van.", ["A", "800x600-ben", "van", "."]), + ("A 800x600-ben.", ["A", "800x600-ben", "."]), + ("A 1x2x3x4 van.", ["A", "1x2x3x4", "van", "."]), + ("A 1x2x3x4-ben van.", ["A", "1x2x3x4-ben", "van", "."]), + ("A 1x2x3x4-ben.", ["A", "1x2x3x4-ben", "."]), + ("A 5/J van.", ["A", "5/J", "van", "."]), + ("A 5/J-ben van.", ["A", "5/J-ben", "van", "."]), + ("A 5/J-ben.", ["A", "5/J-ben", "."]), + ("A 5/J. 
van.", ["A", "5/J.", "van", "."]), + ("A 5/J.-ben van.", ["A", "5/J.-ben", "van", "."]), + ("A 5/J.-ben.", ["A", "5/J.-ben", "."]), + ("A III/1 van.", ["A", "III/1", "van", "."]), + ("A III/1-ben van.", ["A", "III/1-ben", "van", "."]), + ("A III/1-ben.", ["A", "III/1-ben", "."]), + ("A III/1. van.", ["A", "III/1.", "van", "."]), + ("A III/1.-ben van.", ["A", "III/1.-ben", "van", "."]), + ("A III/1.-ben.", ["A", "III/1.-ben", "."]), + ("A III/c van.", ["A", "III/c", "van", "."]), + ("A III/c-ben van.", ["A", "III/c-ben", "van", "."]), + ("A III/c.", ["A", "III/c", "."]), + ("A III/c-ben.", ["A", "III/c-ben", "."]), + ("A TU–154 van.", ["A", "TU–154", "van", "."]), + ("A TU–154-ben van.", ["A", "TU–154-ben", "van", "."]), + ("A TU–154-ben.", ["A", "TU–154-ben", "."]), + ("A 5cm³", ["A", "5", "cm³"]), + ("A 5 $-ban", ["A", "5", "$-ban"]), + ("A 5$-ban", ["A", "5$-ban"]), + ("A 5$.", ["A", "5", "$", "."]), + ("A 5$", ["A", "5", "$"]), + ("A $5", ["A", "$5"]), + ("A 5km/h", ["A", "5", "km/h"]), + ("A 75%+1-100%-ig", ["A", "75%+1-100%-ig"]), + ("A 5km/h.", ["A", "5", "km/h", "."]), + ("3434/1992. évi elszámolás", ["3434/1992.", "évi", "elszámolás"]), ] QUOTE_TESTS = [ - ('Az "Ime, hat"-ban irja.', ['Az', '"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('"Ime, hat"-ban irja.', ['"', 'Ime', ',', 'hat', '"', '-ban', 'irja', '.']), - ('Az "Ime, hat".', ['Az', '"', 'Ime', ',', 'hat', '"', '.']), - ('Egy 24"-os monitor.', ['Egy', '24"-os', 'monitor', '.']), - ("A McDonald's van.", ['A', "McDonald's", 'van', '.']) + ( + 'Az "Ime, hat"-ban irja.', + ["Az", '"', "Ime", ",", "hat", '"', "-ban", "irja", "."], + ), + ('"Ime, hat"-ban irja.', ['"', "Ime", ",", "hat", '"', "-ban", "irja", "."]), + ('Az "Ime, hat".', ["Az", '"', "Ime", ",", "hat", '"', "."]), + ('Egy 24"-os monitor.', ["Egy", '24"-os', "monitor", "."]), + ("A McDonald's van.", ["A", "McDonald's", "van", "."]), ] DOT_TESTS = [ - ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()), - ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), - ('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']), - ('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']), - pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()), - ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), - ('A pl.', ['A', 'pl.']), - ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), - ('Egy..ket.', ['Egy', '..', 'ket', '.']), - ('Valami... van.', ['Valami', '...', 'van', '.']), - ('Valami ...van...', ['Valami', '...', 'van', '...']), - ('Valami...', ['Valami', '...']), - ('Valami ...', ['Valami', '...']), - ('Valami ... más.', ['Valami', '...', 'más', '.']) + ("N. kormányzósági\nszékhely.", ["N.", "kormányzósági", "székhely", "."]), + pytest.param( + "A .hu egy tld.", ["A", ".hu", "egy", "tld", "."], marks=pytest.mark.xfail() + ), + ("Az egy.ketto pelda.", ["Az", "egy.ketto", "pelda", "."]), + ("A pl. rövidítés.", ["A", "pl.", "rövidítés", "."]), + ("A S.M.A.R.T. szó.", ["A", "S.M.A.R.T.", "szó", "."]), + pytest.param("A .hu.", ["A", ".hu", "."], marks=pytest.mark.xfail()), + ("Az egy.ketto.", ["Az", "egy.ketto", "."]), + ("A pl.", ["A", "pl."]), + ("A S.M.A.R.T.", ["A", "S.M.A.R.T."]), + ("Egy..ket.", ["Egy", "..", "ket", "."]), + ("Valami... van.", ["Valami", "...", "van", "."]), + ("Valami ...van...", ["Valami", "...", "van", "..."]), + ("Valami...", ["Valami", "..."]), + ("Valami ...", ["Valami", "..."]), + ("Valami ... 
más.", ["Valami", "...", "más", "."]), ] TYPO_TESTS = [ ( - 'Ez egy mondat vége.Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']), - ('Ez egy mondat vége .Ez egy másik eleje.', - ['Ez', 'egy', 'mondat', 'vége', '.', 'Ez', 'egy', 'másik', 'eleje', '.']), + "Ez egy mondat vége.Ez egy másik eleje.", + ["Ez", "egy", "mondat", "vége", ".", "Ez", "egy", "másik", "eleje", "."], + ), + ( + "Ez egy mondat vége .Ez egy másik eleje.", + ["Ez", "egy", "mondat", "vége", ".", "Ez", "egy", "másik", "eleje", "."], + ), + ( + "Ez egy mondat vége!ez egy másik eleje.", + ["Ez", "egy", "mondat", "vége", "!", "ez", "egy", "másik", "eleje", "."], + ), + ( + "Ez egy mondat vége !ez egy másik eleje.", + ["Ez", "egy", "mondat", "vége", "!", "ez", "egy", "másik", "eleje", "."], + ), ( - 'Ez egy mondat vége!ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']), - ('Ez egy mondat vége !ez egy másik eleje.', - ['Ez', 'egy', 'mondat', 'vége', '!', 'ez', 'egy', 'másik', 'eleje', '.']), + "Ez egy mondat vége?Ez egy másik eleje.", + ["Ez", "egy", "mondat", "vége", "?", "Ez", "egy", "másik", "eleje", "."], + ), ( - 'Ez egy mondat vége?Ez egy másik eleje.', ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']), - ('Ez egy mondat vége ?Ez egy másik eleje.', - ['Ez', 'egy', 'mondat', 'vége', '?', 'Ez', 'egy', 'másik', 'eleje', '.']), - ('egy,kettő', ['egy', ',', 'kettő']), - ('egy ,kettő', ['egy', ',', 'kettő']), - ('egy :kettő', ['egy', ':', 'kettő']), + "Ez egy mondat vége ?Ez egy másik eleje.", + ["Ez", "egy", "mondat", "vége", "?", "Ez", "egy", "másik", "eleje", "."], + ), + ("egy,kettő", ["egy", ",", "kettő"]), + ("egy ,kettő", ["egy", ",", "kettő"]), + ("egy :kettő", ["egy", ":", "kettő"]), ] WIKI_TESTS = [ - ('!"', ['!', '"']), - ('lány"a', ['lány', '"', 'a']), - ('lány"a', ['lány', '"', 'a']), - ('!"-lel', ['!', '"', '-lel']), - ('""-sorozat ', ['"', '"', '-sorozat']), - ('"(Köszönöm', ['"', '(', 'Köszönöm']), - ('(törvénykönyv)-ben ', ['(', 'törvénykönyv', ')', '-ben']), - ('"(...)"–sokkal ', ['"', '(', '...', ')', '"', '–sokkal']), - ('cérium(IV)-oxid', ['cérium', '(', 'IV', ')', '-oxid']) + ('!"', ["!", '"']), + ('lány"a', ["lány", '"', "a"]), + ('lány"a', ["lány", '"', "a"]), + ('!"-lel', ["!", '"', "-lel"]), + ('""-sorozat ', ['"', '"', "-sorozat"]), + ('"(Köszönöm', ['"', "(", "Köszönöm"]), + ("(törvénykönyv)-ben ", ["(", "törvénykönyv", ")", "-ben"]), + ('"(...)"–sokkal ', ['"', "(", "...", ")", '"', "–sokkal"]), + ("cérium(IV)-oxid", ["cérium", "(", "IV", ")", "-oxid"]), ] -TESTCASES = DEFAULT_TESTS + DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS +TESTCASES = ( + DEFAULT_TESTS + + DOT_TESTS + + QUOTE_TESTS + + NUMBER_TESTS + + HYPHEN_TESTS + + WIKI_TESTS + + TYPO_TESTS +) -@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) def test_hu_tokenizer_handles_testcases(hu_tokenizer, text, expected_tokens): tokens = hu_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 690cadf5d48..cdc4b67da1a 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -9,7 +9,7 @@ @pytest.fixture def ru_lemmatizer(): - pymorphy = pytest.importorskip("pymorphy2") + pytest.importorskip("pymorphy2") return Russian.Defaults.create_lemmatizer() diff --git 
a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 4d75eb87023..443dd11e33c 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -51,7 +51,7 @@ def label_sentiment(matcher, doc, i, matches): matcher = Matcher(en_vocab) matcher.add("HAPPY", label_sentiment, *pos_patterns) - matches = matcher(doc) + matcher(doc) assert doc.sentiment != 0 assert doc[1].norm_ == "happy emoji" diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 80b81bdf343..41b7a486197 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -145,4 +145,4 @@ def test_get_oracle_actions(): heads, deps = projectivize(heads, deps) gold = GoldParse(doc, words=words, tags=tags, heads=heads, deps=deps) parser.moves.preprocess_gold(gold) - actions = parser.moves.get_oracle_sequence(doc, gold) + parser.moves.get_oracle_sequence(doc, gold) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 534460ccdac..43c00a963fb 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -57,6 +57,7 @@ def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): tsys.preprocess_gold(gold) act_classes = tsys.get_oracle_sequence(doc, gold) names = [tsys.get_class_name(act) for act in act_classes] + assert names def test_get_oracle_moves_negative_entities2(tsys, vocab): @@ -66,6 +67,7 @@ def test_get_oracle_moves_negative_entities2(tsys, vocab): tsys.preprocess_gold(gold) act_classes = tsys.get_oracle_sequence(doc, gold) names = [tsys.get_class_name(act) for act in act_classes] + assert names def test_get_oracle_moves_negative_O(tsys, vocab): @@ -75,6 +77,7 @@ def test_get_oracle_moves_negative_O(tsys, vocab): tsys.preprocess_gold(gold) act_classes = tsys.get_oracle_sequence(doc, gold) names = [tsys.get_class_name(act) for act in act_classes] + assert names def test_doc_add_entities_set_ents_iob(en_vocab): diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 442d01010b6..8bf8111c158 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -47,34 +47,34 @@ def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree): def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree): - assert contains_cycle(tree) == None + assert contains_cycle(tree) is None assert contains_cycle(cyclic_tree) == set([3, 4, 5]) - assert contains_cycle(partial_tree) == None - assert contains_cycle(multirooted_tree) == None + assert contains_cycle(partial_tree) is None + assert contains_cycle(multirooted_tree) is None def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree): - assert is_nonproj_arc(0, nonproj_tree) == False - assert is_nonproj_arc(1, nonproj_tree) == False - assert is_nonproj_arc(2, nonproj_tree) == False - assert is_nonproj_arc(3, nonproj_tree) == False - assert is_nonproj_arc(4, nonproj_tree) == False - assert is_nonproj_arc(5, nonproj_tree) == False - assert is_nonproj_arc(6, nonproj_tree) == False - assert is_nonproj_arc(7, nonproj_tree) == True - assert is_nonproj_arc(8, nonproj_tree) == False - assert is_nonproj_arc(7, partial_tree) == False - assert is_nonproj_arc(17, multirooted_tree) == False - assert is_nonproj_arc(16, multirooted_tree) == True + assert is_nonproj_arc(0, nonproj_tree) is False + assert is_nonproj_arc(1, nonproj_tree) is False + assert 
is_nonproj_arc(2, nonproj_tree) is False + assert is_nonproj_arc(3, nonproj_tree) is False + assert is_nonproj_arc(4, nonproj_tree) is False + assert is_nonproj_arc(5, nonproj_tree) is False + assert is_nonproj_arc(6, nonproj_tree) is False + assert is_nonproj_arc(7, nonproj_tree) is True + assert is_nonproj_arc(8, nonproj_tree) is False + assert is_nonproj_arc(7, partial_tree) is False + assert is_nonproj_arc(17, multirooted_tree) is False + assert is_nonproj_arc(16, multirooted_tree) is True def test_parser_is_nonproj_tree( proj_tree, nonproj_tree, partial_tree, multirooted_tree ): - assert is_nonproj_tree(proj_tree) == False - assert is_nonproj_tree(nonproj_tree) == True - assert is_nonproj_tree(partial_tree) == False - assert is_nonproj_tree(multirooted_tree) == True + assert is_nonproj_tree(proj_tree) is False + assert is_nonproj_tree(nonproj_tree) is True + assert is_nonproj_tree(partial_tree) is False + assert is_nonproj_tree(multirooted_tree) is True def test_parser_pseudoprojectivity(en_tokenizer): @@ -100,8 +100,8 @@ def deprojectivize(proj_heads, deco_labels): assert nonproj.decompose("X||Y") == ("X", "Y") assert nonproj.decompose("X") == ("X", "") - assert nonproj.is_decorated("X||Y") == True - assert nonproj.is_decorated("X") == False + assert nonproj.is_decorated("X||Y") is True + assert nonproj.is_decorated("X") is False nonproj._lift(0, tree) assert tree == [2, 2, 2] diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 2f7d8484e67..04e31d6499f 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -25,7 +25,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): ) assert len(doc) == 1 - with en_parser.step_through(doc) as _: + with en_parser.step_through(doc) as _: # noqa: F841 pass assert doc[0].dep != 0 @@ -33,7 +33,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): @pytest.mark.xfail def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." 
-    heads = [1, 0, 1, -2, -3, -1, -5]
+    # heads = [1, 0, 1, -2, -3, -1, -5]
     transition = ["L-nsubj", "S", "L-det"]
     tokens = en_tokenizer(text)
     apply_transition_sequence(en_parser, tokens, transition)
diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py
index 3c5279bece4..945173faf84 100644
--- a/spacy/tests/parser/test_space_attachment.py
+++ b/spacy/tests/parser/test_space_attachment.py
@@ -71,7 +71,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser):
 def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length):
     doc = Doc(en_parser.vocab, words=text)
     assert len(doc) == length
-    with en_parser.step_through(doc) as _:
+    with en_parser.step_through(doc) as _:  # noqa: F841
         pass
     assert doc[0].is_space
     for token in doc:
diff --git a/spacy/tests/regression/_test_issue2800.py b/spacy/tests/regression/_test_issue2800.py
index 0d3d76d8ebf..e0d54ff37b3 100644
--- a/spacy/tests/regression/_test_issue2800.py
+++ b/spacy/tests/regression/_test_issue2800.py
@@ -20,7 +20,6 @@ def test_train_with_many_entity_types():
     optimizer = nlp.begin_training()
     for i in range(20):
         losses = {}
-        index = 0
        random.shuffle(train_data)
        for statement, entities in train_data:
            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index 707ff98aece..5747538dcc9 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -152,6 +152,7 @@ def test_issue589():
     vocab = Vocab()
     vocab.strings.set_frozen(True)
     doc = Doc(vocab, words=["whata"])
+    assert doc


 def test_issue590(en_vocab):
@@ -216,7 +217,7 @@ def merge_phrases(matcher, doc, i, matches):
     doc = en_tokenizer(text)
     matcher = Matcher(doc.vocab)
     matcher.add(label, merge_phrases, pattern)
-    match = matcher(doc)
+    matcher(doc)
     entities = list(doc.ents)
     assert entities != []
     assert entities[0].label != 0
@@ -331,8 +332,7 @@ def test_issue850():
     handle the ambiguity correctly."""
     vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
     matcher = Matcher(vocab)
-    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
-    pattern = [{"LOWER": "bob"}, {"OP": "*", "IS_ANY_TOKEN": True}, {"LOWER": "frank"}]
+    pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
     matcher.add("FarAway", None, pattern)
     doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
     match = matcher(doc)
@@ -346,7 +346,6 @@ def test_issue850_basic():
     """Test Matcher matches with '*' operator and Boolean flag"""
     vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
     matcher = Matcher(vocab)
-    IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True)
     pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
     matcher.add("FarAway", None, pattern)
     doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
@@ -405,12 +404,13 @@ def test_issue912(en_vocab, text, tag, lemma):

 def test_issue957(en_tokenizer):
     """Test that spaCy doesn't hang on many periods."""
-    # skip test if pytest-timeout is not installed
-    timeout = pytest.importorskip("pytest-timeout")
+    # Skip test if pytest-timeout is not installed
+    pytest.importorskip("pytest_timeout")
     string = "0"
     for i in range(1, 100):
         string += ".%d" % i
     doc = en_tokenizer(string)
+    assert doc


 @pytest.mark.xfail
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index 71c563c41c9..a646afadcc3
100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -138,12 +138,12 @@ def test_issue1757(): """Test comparison against None doesn't cause segfault.""" doc = Doc(Vocab(), words=["a", "b", "c"]) assert not doc[0] < None - assert not doc[0] == None + assert not doc[0] is None assert doc[0] >= None assert not doc[:2] < None - assert not doc[:2] == None + assert not doc[:2] is None assert doc[:2] >= None - assert not doc.vocab["a"] == None + assert not doc.vocab["a"] is None assert not doc.vocab["a"] < None diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 32839d05019..580cd77affe 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -74,4 +74,4 @@ def test_issue2482(): nlp = Italian() nlp.add_pipe(nlp.create_pipe("ner")) b = nlp.to_bytes() - nlp2 = Italian().from_bytes(b) + Italian().from_bytes(b) diff --git a/spacy/tests/regression/test_issue2626.py b/spacy/tests/regression/test_issue2626.py index afe8bc0558c..48cee35a033 100644 --- a/spacy/tests/regression/test_issue2626.py +++ b/spacy/tests/regression/test_issue2626.py @@ -8,3 +8,4 @@ def test_issue2626(en_tokenizer): ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume """ doc = en_tokenizer(text) + assert doc diff --git a/spacy/tests/regression/test_issue2671.py b/spacy/tests/regression/test_issue2671.py index 59bb8791f9c..f2595a220db 100644 --- a/spacy/tests/regression/test_issue2671.py +++ b/spacy/tests/regression/test_issue2671.py @@ -9,14 +9,6 @@ def test_issue2671(): """Ensure the correct entity ID is returned for matches with quantifiers. 
See also #2675 """ - - def get_rule_id(nlp, matcher, doc): - matches = matcher(doc) - for match_id, start, end in matches: - rule_id = nlp.vocab.strings[match_id] - span = doc[start:end] - return rule_id - nlp = English() matcher = Matcher(nlp.vocab) pattern_id = "test_pattern" @@ -28,5 +20,9 @@ def get_rule_id(nlp, matcher, doc): matcher.add(pattern_id, None, pattern) doc1 = nlp("This is a high-adrenaline situation.") doc2 = nlp("This is a high adrenaline situation.") - assert get_rule_id(nlp, matcher, doc1) == pattern_id - assert get_rule_id(nlp, matcher, doc2) == pattern_id + matches1 = matcher(doc1) + for match_id, start, end in matches1: + assert nlp.vocab.strings[match_id] == pattern_id + matches2 = matcher(doc2) + for match_id, start, end in matches2: + assert nlp.vocab.strings[match_id] == pattern_id diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index c39d3a325ac..680df22882c 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -72,9 +72,8 @@ def test_to_from_bytes(parser, blank_parser): reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms." ) def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): - tagger1, tagger2 = taggers + tagger1 = taggers[0] tagger1_b = tagger1.to_bytes() - tagger2_b = tagger2.to_bytes() tagger1 = tagger1.from_bytes(tagger1_b) assert tagger1.to_bytes() == tagger1_b new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b) @@ -114,4 +113,4 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab): def test_serialize_textcat_empty(en_vocab): # See issue #1105 textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"]) - textcat_bytes = textcat.to_bytes() + textcat.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 4e3dafa3007..9a273980c35 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -19,7 +19,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): serialized and deserialized correctly (see #2494).""" tokenizer = Tokenizer(en_vocab, suffix_search=en_tokenizer.suffix_search) tokenizer_bytes = tokenizer.to_bytes() - new_tokenizer = Tokenizer(en_vocab).from_bytes(tokenizer_bytes) + Tokenizer(en_vocab).from_bytes(tokenizer_bytes) @pytest.mark.skip(reason="Currently unreliable across platforms") diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py index 72cfa0638d0..d6bbab04ef8 100644 --- a/spacy/tests/test_align.py +++ b/spacy/tests/test_align.py @@ -45,7 +45,7 @@ def test_align_i2j(string1, string2, i2j): ("t", "catsie", [-1, -1, 0, -1, -1, -1]), ], ) -def test_align_i2j(string1, string2, j2i): +def test_align_i2j_2(string1, string2, j2i): output_cost, output_i2j, output_j2i, matrix = align(string1, string2) assert list(output_j2i) == j2i diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index fcc2aa1b017..6aaf22fb821 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -75,7 +75,7 @@ def test_displacy_spans(en_vocab): def test_displacy_raises_for_wrong_type(en_vocab): with pytest.raises(ValueError): - html = displacy.render("hello world") + displacy.render("hello world") def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index fa32020e548..d84a56981b2 100644 --- 
a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -41,21 +41,21 @@ def test_vocab_lexeme_is_digit(en_vocab): def test_vocab_lexeme_add_flag_auto_id(en_vocab): is_len4 = en_vocab.add_flag(lambda string: len(string) == 4) - assert en_vocab["1999"].check_flag(is_len4) == True - assert en_vocab["1999"].check_flag(IS_DIGIT) == True - assert en_vocab["199"].check_flag(is_len4) == False - assert en_vocab["199"].check_flag(IS_DIGIT) == True - assert en_vocab["the"].check_flag(is_len4) == False - assert en_vocab["dogs"].check_flag(is_len4) == True + assert en_vocab["1999"].check_flag(is_len4) is True + assert en_vocab["1999"].check_flag(IS_DIGIT) is True + assert en_vocab["199"].check_flag(is_len4) is False + assert en_vocab["199"].check_flag(IS_DIGIT) is True + assert en_vocab["the"].check_flag(is_len4) is False + assert en_vocab["dogs"].check_flag(is_len4) is True def test_vocab_lexeme_add_flag_provided_id(en_vocab): is_len4 = en_vocab.add_flag(lambda string: len(string) == 4, flag_id=IS_DIGIT) - assert en_vocab["1999"].check_flag(is_len4) == True - assert en_vocab["199"].check_flag(is_len4) == False - assert en_vocab["199"].check_flag(IS_DIGIT) == False - assert en_vocab["the"].check_flag(is_len4) == False - assert en_vocab["dogs"].check_flag(is_len4) == True + assert en_vocab["1999"].check_flag(is_len4) is True + assert en_vocab["199"].check_flag(is_len4) is False + assert en_vocab["199"].check_flag(IS_DIGIT) is False + assert en_vocab["the"].check_flag(is_len4) is False + assert en_vocab["dogs"].check_flag(is_len4) is True def test_lexeme_bytes_roundtrip(en_vocab): diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py index f74f6c5f549..75b1116dd8e 100644 --- a/spacy/tests/vocab_vectors/test_stringstore.py +++ b/spacy/tests/vocab_vectors/test_stringstore.py @@ -24,7 +24,7 @@ def test_stringstore_from_api_docs(stringstore): assert stringstore[apple_hash] == "apple" assert "apple" in stringstore assert "cherry" not in stringstore - orange_hash = stringstore.add("orange") + stringstore.add("orange") all_strings = [s for s in stringstore] assert all_strings == ["apple", "orange"] banana_hash = stringstore.add("banana") @@ -63,7 +63,7 @@ def test_stringstore_retrieve_id(stringstore, text): def test_stringstore_med_string(stringstore, text1, text2): store = stringstore.add(text1) assert stringstore[store] == text1.decode("utf8") - dummy = stringstore.add(text2) + stringstore.add(text2) assert stringstore[text1] == store diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index cd72cef8e8c..a9618e0ae00 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -273,9 +273,9 @@ def test_vocab_add_vector(): def test_vocab_prune_vectors(): vocab = Vocab() - _ = vocab["cat"] - _ = vocab["dog"] - _ = vocab["kitten"] + _ = vocab["cat"] # noqa: F841 + _ = vocab["dog"] # noqa: F841 + _ = vocab["kitten"] # noqa: F841 data = numpy.ndarray((5, 3), dtype="f") data[0] = 1.0 data[1] = 2.0 diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index 61b31535062..8c826e8c309 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -42,6 +42,6 @@ def test_vocab_api_symbols(en_vocab, string, symbol): @pytest.mark.parametrize("text", "Hello") def test_vocab_api_contains(en_vocab, text): - _ = en_vocab[text] + _ = en_vocab[text] # 
noqa: F841 assert text in en_vocab assert "LKsdjvlsakdvlaksdvlkasjdvljasdlkfvm" not in en_vocab diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index b59c7c4362e..683a3974fa0 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -46,7 +46,6 @@ def add(self, doc): def get_docs(self, vocab): """Recover Doc objects from the annotations, using the given vocab.""" - attrs = self.attrs for string in self.strings: vocab[string] orth_col = self.attrs.index(ORTH)