From b9efd800e02d55e848d56ce7acfacafb2089f587 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Mon, 19 Sep 2022 10:33:03 +0000 Subject: [PATCH] synset pos parameter (#15) * modified: spacy_wordnet/wordnet_domains.py - importing Synset class and added return type hint to __find_synsets(...) method - pep8 character limit adjustment * modified: spacy_wordnet/wordnet_domains.py - added optional pos param to __find_synsets(...) method * modified: spacy_wordnet/wordnet_domains.py - argument handling for pos param * modified: spacy_wordnet/wordnet_domains.py - swapping all(map(...)) for set(...).difference(...) which gives a slight boost in speed and readability * modified: spacy_wordnet/wordnet_domains.py - added try/except to attempt to convert pos arg to list * modified: spacy_wordnet/wordnet_domains.py - filtering acceptable_pos using pos values and assigning to token_pos which will be used to determine which tokens to get synsets for * modified: spacy_wordnet/wordnet_domains.py - moved call from self.__synsets declaration into .synsets(...) method allowing user to supply pos args * modified: spacy_wordnet/wordnet_domains.py - return type hint and docstring for synsets(...) method * modified: tests/test_wordnet_annotator.py - added three assertions for pos param in test_english_annotations() method * modified: spacy_wordnet/wordnet_domains.py - fixed error type hint in synsets(...) method * modified: spacy_wordnet/wordnet_domains.py - fixed type error in __find_lemmas() method by swapping self.__synsets attribute with self.synsets(...) method - pep8 character limit fix in __find_lemmas() method * modified: spacy_wordnet/wordnet_domains.py - defined token_synsets as a separate list and filtered returned synsets in wn.synsets and extending token_synsets in __find_synsets(...) 
method * modified: tests/test_wordnet_annotator.py - changed expected_adj_synsets to set() instead of {} (a dict) in test_english_annotations() method * Update spacy_wordnet/wordnet_domains.py param type hint spacing/formatting in synsets(...) method Co-authored-by: Francisco Aranda * Update spacy_wordnet/wordnet_domains.py param type hint spacing/formatting in __find_synsets(...) method Co-authored-by: Francisco Aranda * use token.pos if the pos argument is None to mimic previous behavior. Co-authored-by: Francisco Aranda * Update wordnet_domains.py - modified docstring to reflect what happens if the pos argument is None * modified: tests/test_wordnet_annotator.py - added assert to test that list of pos args will return expected results * modified: tests/test_wordnet_annotator.py - added test for when the pos argument is None * Update spacy_wordnet/wordnet_domains.py Checking if `token.pos` is an acceptable value before appending its lemma to the `word_variants` list. This avoids unexpected results such as when `token.pos` is an adverb (`ADV`). Co-authored-by: Francisco Aranda * Update wordnet_domains.py - Updated docstring so the user knows results are limited to NOUN, VERB, and ADJ even if `pos` is None. 
Co-authored-by: Ian Thompson Co-authored-by: Francisco Aranda --- spacy_wordnet/wordnet_domains.py | 62 ++++++++++++++++++++++++-------- tests/test_wordnet_annotator.py | 46 ++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 14 deletions(-) diff --git a/spacy_wordnet/wordnet_domains.py b/spacy_wordnet/wordnet_domains.py index a9f46d6..2683bb0 100644 --- a/spacy_wordnet/wordnet_domains.py +++ b/spacy_wordnet/wordnet_domains.py @@ -1,4 +1,6 @@ +from typing import Union from nltk.corpus import wordnet as wn +from nltk.corpus.reader.wordnet import Synset from spacy.tokens.token import Token from spacy_wordnet.__utils__ import * @@ -43,16 +45,28 @@ class Wordnet(object): def __init__(self, token: Token, lang: str = "es"): self.__token = token self.__lang = fetch_wordnet_lang(lang) - self.__synsets = self.__find_synsets(token, self.__lang) + self.__synsets = self.__find_synsets self.__lemmas = self.__find_lemmas() self.__wordnet_domains = self.__find_wordnet_domains() + def synsets(self, pos: Optional[Union[str, List[str]]] = None) -> List[Synset]: + """ + Load all synsets with a given part of speech tag. + If no pos is specified and `token.pos` is a VERB, NOUN, + or ADJ, synsets with the same parts of speech as + `token.pos` will be loaded. If `token.pos` is not a + VERB, NOUN, or ADJ and no pos is specified, an empty + list will be returned. + + :param pos: filter returned synsets by part(s) of speech. + Acceptable values are "verb", "noun", and "adj". 
+ :return: list of synsets + """ + return self.__synsets(self.__token, self.__lang, pos=pos) + def lang(self): return self.__lang - def synsets(self): - return self.__synsets - def lemmas(self): return self.__lemmas @@ -68,16 +82,40 @@ def wordnet_synsets_for_domain(self, domains: List[str]): ] @staticmethod - def __find_synsets(token: Token, lang: str): + def __find_synsets(token: Token, + lang: str, + pos: Optional[Union[str, List[str]]] = None) -> List[Synset]: + if pos is None: + pos = [] + elif isinstance(pos, str): + pos = [pos] + elif not isinstance(pos, list): + try: + pos = list(pos) + except TypeError: + raise TypeError("pos argument must be None, type str, or type list.") + + acceptable_pos = {"verb": VERB, "noun": NOUN, "adj": ADJ} # We can define this as a private class constant + # check if any element in `pos` is not in `acceptable_pos` + if set(pos).difference(acceptable_pos): + raise ValueError("pos argument must be a combination of 'verb', " + "'noun', or 'adj'.") + + token_pos: List[int] = [acceptable_pos[k] for k in pos] + if not token_pos: + token_pos = [token.pos] word_variants = [token.text] - if token.pos in [VERB, NOUN, ADJ]: + if token.pos in (token_pos if pos else acceptable_pos.values()): # extend synset coverage using lemmas word_variants.append(token.lemma_) for word in word_variants: - token_synsets = wn.synsets( - word, pos=spacy2wordnet_pos(token.pos), lang=lang - ) + token_synsets: List[Synset] = [] + for p in token_pos: + token_synsets.extend(wn.synsets( + word, pos=spacy2wordnet_pos(p), lang=lang + )) + if token_synsets: return token_synsets @@ -95,8 +133,4 @@ def __find_wordnet_domains(self): ] def __find_lemmas(self): - return [ - lemma - for synset in self.synsets() - for lemma in synset.lemmas(lang=self.__lang) - ] + return [lemma for synset in self.synsets() for lemma in synset.lemmas(lang=self.__lang)] diff --git a/tests/test_wordnet_annotator.py b/tests/test_wordnet_annotator.py index 02aff84..9c1e06c 100644 --- 
a/tests/test_wordnet_annotator.py +++ b/tests/test_wordnet_annotator.py @@ -1,5 +1,6 @@ import unittest +from nltk.corpus import wordnet as wn import spacy from spacy_wordnet.wordnet_annotator import WordnetAnnotator @@ -28,6 +29,51 @@ def test_english_annotations(self): assert token._.wordnet.lemmas() assert token._.wordnet.wordnet_domains() + actual_none_synsets = set(token._.wordnet.synsets(pos=None)) + expected_none_synsets = {wn.synset("contract.n.01"), + wn.synset("contract.n.02"), + wn.synset("contract.n.03")} + assert actual_none_synsets == expected_none_synsets + + actual_verb_synsets = set(token._.wordnet.synsets(pos="verb")) + expected_verb_synsets = {wn.synset('abridge.v.01'), + wn.synset('compress.v.02'), + wn.synset('condense.v.07'), + wn.synset('contract.v.01'), + wn.synset('contract.v.04'), + wn.synset('contract.v.06'), + wn.synset('narrow.v.01'), + wn.synset('shrink.v.04'), + wn.synset('sign.v.04')} + assert actual_verb_synsets == expected_verb_synsets + + actual_noun_synsets = set(token._.wordnet.synsets(pos="noun")) + expected_noun_synsets = {wn.synset('contract.n.01'), + wn.synset('contract.n.02'), + wn.synset('contract.n.03')} + assert actual_noun_synsets == expected_noun_synsets + + actual_adj_synsets = set(token._.wordnet.synsets(pos="adj")) + expected_adj_synsets = set() + assert actual_adj_synsets == expected_adj_synsets + + actual_verb_noun_synsets = set(token._.wordnet.synsets( + pos=["verb", "noun"]) + ) + expected_verb_noun_synsets = {wn.synset('abridge.v.01'), + wn.synset('compress.v.02'), + wn.synset('condense.v.07'), + wn.synset('contract.v.01'), + wn.synset('contract.v.04'), + wn.synset('contract.v.06'), + wn.synset('narrow.v.01'), + wn.synset('shrink.v.04'), + wn.synset('sign.v.04'), + wn.synset('contract.n.01'), + wn.synset('contract.n.02'), + wn.synset('contract.n.03')} + assert actual_verb_noun_synsets == expected_verb_noun_synsets + def test_generate_variants_from_domain_list(self): economy_domains = ["finance", "banking"]