diff --git a/extra_model/_adjectives.py b/extra_model/_adjectives.py
index 0198c17e..97436a8c 100644
--- a/extra_model/_adjectives.py
+++ b/extra_model/_adjectives.py
@@ -10,9 +10,9 @@
 
 def cluster_adjectives(adjective_counts, vectorizer):  # noqa: C901
     """Cluster adjectives based on a constant radius clustering algorithm.
-
+
     Technical implementation uses a scikitlearn BallTree.
-
+
     :param adjective_counts: dictionary with adjectives and their counts
     :type adjective_counts: [(str,int)]
     :param vectorizer: provide embeddings to evaluate adjective similarity
@@ -118,10 +118,10 @@ def cluster_adjectives(adjective_counts, vectorizer):  # noqa: C901
 
 def fill_sentiment_dict(adjective_counts):
     """Given a dictionary with adjectives and their counts, will compute.
-
+
     The sentiment of each of the adjectives using the VADER sentiment analysis package
     and return a dictionary of the adjectives and their sentiments.
-
+
     :param adjective_counts: dictionary with adjectives and their counts
     :type adjective_counts: dict
     :return: dictionary, where the keys are the adjectives and the values are tuples of the
@@ -143,7 +143,7 @@ def fill_sentiment_dict(adjective_counts):
 
 def sentiments_from_adjectives(adjective_counts, sentiment_dict):
     """Build the weighted average sentiment score from a list of adjetives and their counts.
-
+
     :param adjective_counts: list of tuples with adjectives and their counts
     :type adjective_counts: [(str,int)]
     :param sentiment_dict: dictionary with adjectives and their sentiment, as tuple of compound and binary sentiment
@@ -170,11 +170,11 @@ def sentiments_from_adjectives(adjective_counts, sentiment_dict):
 
 def adjective_info(dataframe_topics, dataframe_aspects, vectorizer):
     """Add adjective related information to the dataframes.
-
+
     This has two facets:
     -> for each topic cluster similar adjectives, to get a more abstract/readable list
     -> for each topic, use the adjectives to come up with a sentiment classification
-
+
     :param dataframe_topics: the dataframe with the topics we want to enrich, needs to have a collum `rawterms`
     :type dataframe_topics: :class:`pandas.DataFrame`
     :param dataframe_aspects: the dataframe with the aspect instances and related adjectives with columsn `aspect` and `descriptor`
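# Illustrative sketch (not part of the patch): the _adjectives.py docstrings above
# describe constant-radius clustering of adjective embeddings via a scikit-learn
# BallTree.  A minimal, self-contained version of that idea follows; the toy
# embeddings, the radius value and the "most frequent adjective first" seeding are
# assumptions for illustration, not the module's actual implementation.
import numpy as np
from sklearn.neighbors import BallTree

adjectives = ["great", "good", "awful", "bad", "fine"]
counts = [10, 7, 3, 5, 2]
# stand-in embeddings; the real code would look these up through the vectorizer
vectors = np.random.default_rng(0).normal(size=(len(adjectives), 50))

tree = BallTree(vectors)
unassigned = set(range(len(adjectives)))
clusters = []
# seed each cluster with the most frequent unassigned adjective, then absorb
# every still-unassigned neighbour within a fixed radius
for idx in sorted(range(len(adjectives)), key=lambda i: -counts[i]):
    if idx not in unassigned:
        continue
    members = tree.query_radius(vectors[idx : idx + 1], r=5.0)[0]
    cluster = [i for i in members if i in unassigned]
    unassigned -= set(cluster)
    clusters.append([adjectives[i] for i in cluster])
print(clusters)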
diff --git a/extra_model/_aspects.py b/extra_model/_aspects.py
index 7927cf38..cdf9ff81 100644
--- a/extra_model/_aspects.py
+++ b/extra_model/_aspects.py
@@ -15,7 +15,7 @@
 
 def compound_noun_list(token):
     """Find compound nouns.
-
+
     :param token: token for which to generate potential compound nouns
     :type token: :class:`spacy.token`
     :return: list of potential compounds
@@ -33,7 +33,7 @@ def compound_noun_list(token):
 
 def acomp_list(tokens):
     """Find descriptions for a given token.
-
+
     :param tokens: list of tokens that are children of the head of the nount for which descriptions are searched.
     :type tokens: [:class:`spacy.token`]
     :return: list of adjectives
@@ -55,7 +55,7 @@ def acomp_list(tokens):
 
 def adjective_list(tokens):
     """Find adjectives modifying a given noun.
-
+
     :param tokens: tokens of potential adjectice candidates (children of the noun and children of the head for compounds)
     :type tokens: [:class:`spacy.token`]
     :return: list of adjectives
@@ -77,7 +77,7 @@ def adjective_list(tokens):
 
 def adjective_negations(token):
     """Find all negated adjectives in a sentence.
-
+
     :param token: negation token to handle
     :type token: :class:`spacy.token`
     :return: list of negated adjectives
@@ -108,9 +108,9 @@ def adjective_negations(token):
 
 def parse(dataframe_texts):  # noqa: C901
     """Parse the comments and extract a list of potential aspects based on grammatical relations.
-
+
     (e.g. modified by adjective)
-
+
     :param dataframe_texts: a dataframe with the raw texts. The collumn wit the texts needs to be called 'Comments'
     :type dataframe_texts: :class:`pandas.DataFrame`
     :return: a dataframe with the aspect candidates
@@ -128,7 +128,8 @@ def parse(dataframe_texts):  # noqa: C901
     # n_threads > 5 can segfault with long (>500 tokens) sentences
     # n_threads has been deprecated in spacy 3.x - https://spacy.io/usage/v2-1#incompat
     for index, document in zip(
-        dataframe_texts.index, nlp.pipe(dataframe_texts.Comments, batch_size=500),
+        dataframe_texts.index,
+        nlp.pipe(dataframe_texts.Comments, batch_size=500),
     ):  # TODO reduce for production/make configurable
         negated_adjectives = []
         for token in document:
@@ -171,7 +172,7 @@ def parse(dataframe_texts):  # noqa: C901
 
 def generate_aspects(dataframe_texts):
     """Generate the aspects that will be merged into topics from the raw texts.
-
+
     :param dataframe_texts: a dataframe with the raw texts in the column 'Comments'
     :type dataframe_texts: :class:`pandas.DataFrame`
     :return: a dataframe with the aspect candidates, their associated description, index of original text in the
diff --git a/extra_model/_disambiguate.py b/extra_model/_disambiguate.py
index e5b3f522..22b88eb2 100644
--- a/extra_model/_disambiguate.py
+++ b/extra_model/_disambiguate.py
@@ -14,7 +14,7 @@
 def vectorize_aspects(aspect_counts, vectorizer):
     """Turn the aspect map into a a vector of nouns and their vector representations,
     which also filters aspects without embedding.
-
+
     :param aspect_counts: (dict): the dictionary with aspect counts
     :param vectorizer: (Vectorizer): the provider of word-embeddings
     :return vectors with representable aspects and their vector embeddings
@@ -32,7 +32,7 @@ def vectorize_aspects(aspect_counts, vectorizer):
 def best_cluster(aspect_vectors):
     """
     Find the optimal cluster size using silhouette scores.
-
+
     :param aspect_vectors: list of embeddings vectors to be clustered
     :type aspect_vectors: [:class:`numpy.array`]
     :return: the optimal number of clusters
@@ -73,7 +73,7 @@ def best_cluster(aspect_vectors):
 
 def cluster(aspects, aspect_vectors, vectorizer):
     """Cluster aspects based on the distance of their vector representations.
-
+
     Once clusters are found, use the other aspects in a given cluster
     to generate the context for a specific aspect noun
@@ -115,7 +115,7 @@ def cluster(aspects, aspect_vectors, vectorizer):
 def match(aspect_counts, vectorizer):
     """Match a word to a specific wordnet entry, using the vector similarity of the aspects context
     and the synonym gloss.
-
+
     :param aspect_counts: Counter object of aspect->number of occurrence
     :type aspect_counts: :class:`collections.Counter`
     :param vectorizer: the provider of word-embeddings for context generation
diff --git a/extra_model/_filter.py b/extra_model/_filter.py
index 8873a277..02b7a8b8 100644
--- a/extra_model/_filter.py
+++ b/extra_model/_filter.py
@@ -8,7 +8,7 @@
 
 def filter(dataframe):
     """Filter a dataframe for language and text length.
-
+
     The following rules apply:
     1. Only comments with at least 20 characters retained.
     2. Only comments in English are retained.
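# Illustrative sketch (not part of the patch): _filter.filter keeps only comments
# that are at least 20 characters long and detected as English.  A rough pandas
# version of those two rules; langdetect is a stand-in for whatever language
# detector the package actually uses, and only the 'Comments' column name is
# taken from the docstrings above.
import pandas as pd
from langdetect import detect


def keep_row(text: str) -> bool:
    """Apply the two filter rules to a single comment."""
    if len(text) < 20:
        return False
    try:
        return detect(text) == "en"
    except Exception:  # undetectable or empty text is dropped
        return False


df = pd.DataFrame(
    {"Comments": ["Too short.", "This vacuum cleaner works great on hardwood floors."]}
)
filtered = df[df["Comments"].apply(keep_row)]
print(filtered)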
diff --git a/extra_model/_summarize.py b/extra_model/_summarize.py
index 1d7790ed..15a8ee79 100644
--- a/extra_model/_summarize.py
+++ b/extra_model/_summarize.py
@@ -6,7 +6,7 @@
 
 def qa(dataframe_texts, dataframe_aspects, dataframe_topics):
     """Print summary information.
-
+
     :param dataframe_texts: dataframe with the raw texts (for example output)
     :type dataframe_texts: :class:`pandas.DataFrame`
     :param dataframe_aspects: dataframe with the aspects
@@ -75,7 +75,7 @@ def qa(dataframe_texts, dataframe_aspects, dataframe_topics):
 
 def set_aspect(topic, dataframe_aspects):
     """For a given topic, set topic and adjective cluster fields in the aspect_dataframe.
-
+
     :param topic: the topic and it's associated information that we need to copy to the relevant entries in the aspect frame
     :type topic: :class:`pandas.DataFrame.Row`
     :param dataframe_aspects: the dataframe to be enriched with topic information
@@ -99,7 +99,7 @@ def set_aspect(topic, dataframe_aspects):
 
 def link_aspects_to_topics(dataframe_aspects, dataframe_topics):
     """Fill topic and adjective cluster information into the aspect dataframe.
-
+
     :param dataframe_aspects: the dataframe to be enriched
     :type dataframe_aspects: :class:`pandas.DataFrame`
     :param dataframe_topics: the dataframe that has the topic and adjective cluster information
@@ -122,7 +122,7 @@ def link_aspects_to_topics(dataframe_aspects, dataframe_topics):
 
 def link_aspects_to_texts(dataframe_aspects, dataframe_texts):
     """Transfer the original text identifier from the original text data table into the final aspect table.
-
+
     :param dataframe_aspects: table to be enriched
     :type dataframe_aspects: :class:`pandas.DataFrame`
     :param dataframe_texts: original table from which this information is extracted
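# Illustrative sketch (not part of the patch): link_aspects_to_topics / set_aspect
# copy topic-level information onto every matching row of the aspect table.  In
# this simplified pandas version an aspect whose noun appears in a topic's
# 'rawterms' list inherits that topic's label; the 'topic' column name and the toy
# data are assumptions, only 'aspect' and 'rawterms' come from the docstrings above.
import pandas as pd

dataframe_topics = pd.DataFrame(
    {"topic": ["battery", "screen"], "rawterms": [["battery", "charge"], ["screen", "display"]]}
)
dataframe_aspects = pd.DataFrame({"aspect": ["charge", "display", "price"]})

# invert the topic -> rawterms mapping so each raw term points at its topic
term_to_topic = {
    term: row.topic for row in dataframe_topics.itertuples() for term in row.rawterms
}
dataframe_aspects["topic"] = dataframe_aspects["aspect"].map(term_to_topic)
print(dataframe_aspects)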
diff --git a/extra_model/_topics.py b/extra_model/_topics.py
index a8deae6f..0f3f0fea 100644
--- a/extra_model/_topics.py
+++ b/extra_model/_topics.py
@@ -17,7 +17,7 @@
 
 def path_to_graph(hypernym_list, initialnoun):
     """Make a hypernym chain into a graph.
-
+
     :param hypernym_list: list of hypernyms for a word as obtained from wordnet
     :type hypernym_list: [str]
     :param initialnoun: the initial noun (we need this to mark it as leaf in the tree)
@@ -39,7 +39,7 @@ def path_to_graph(hypernym_list, initialnoun):
 
 def get_nodevec(node, vectors):
     """Get the vector representation of a gloss a wordnet node.
-
+
     Used to evaluate similarity between rungs in the hypernym chain.
     :param node: the wornet node for which to compute the embedding
     :type node: str
@@ -58,7 +58,7 @@ def get_nodevec(node, vectors):
 
 def iterate(transition_matrix, importance, original, alpha):
     """Find the stable importance vector by iterated multiplication with the distance matrix.
-
+
     This function does a simple iteration. The "jump-back" probability from the paper
     is implemented as a linear superposition of the new and original importance numbers.
     :param transition_matrix: The connectedness matrix of the graph, including similarity weights.
@@ -83,7 +83,7 @@ def iterate(transition_matrix, importance, original, alpha):
 
 def aggregate(aspects, aspect_counts, synsets_match, vectors):  # noqa: C901
     """Aggregate the aspects by building a tree from the hypernym chains.
-
+
     Using a page-rank type algorithm to assign importance to the nodes in the graph
     we only consider wordnet entries for this, not the actual aspects extracted from the texts.
     :param aspects: List of aspects to analyze
@@ -210,7 +210,7 @@ def traverse_tree(  # noqa: C901
     node_list, associated_aspects, aspect_counts, full_tree, weighted, direction
 ):
     """Find all hypernyms/hyponyms in the tree to a given node.
-
+
     Aggregate the number of associated mentions in the original texts,
     optionally weighted by term-similarity.
     :param nodelist: List of nodes from which to gather the subsidiary terms and their initial mentions
@@ -267,7 +267,7 @@ def traverse_tree(  # noqa: C901
 
 def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree):
     """Gather various bits of information into a single DataFrame.
-
+
     For each topic we store the importance, the list of associated raw text terms and their numbers.
     :param filtered_topics: List of topics remaining after filtering out low-iimportance subsidiary topics
     :type filtered_topics: [str]
@@ -350,7 +350,7 @@ def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree
 
 def has_connection(term, prior, full_tree):
     """Check if two terms are connected within the directed hyopernym graph.
-
+
     :param term: first node to test
     :type term: str
     :param prior: second node to test
@@ -369,7 +369,7 @@ def has_connection(term, prior, full_tree):
 
 def filter_aggregates(topics, tree):
     """Filter the importance-sorted list, so that each remaining topic is the sole member of its hypernym chain.
-
+
     :param topics: List of all topics in the graph
     :type topics: [str]
     :param tree: the graph which is being traversed
@@ -398,7 +398,7 @@ def filter_aggregates(topics, tree):
 
 def get_topics(dataframe_aspects, vectors):
     """Generate the semantically clustered topics from the raw aspects.
-
+
     :param dataframe_aspects: the collection of nouns to be aggregated into topics
     :type dataframe_aspects: :class:`pandas.DataFrame`
     :param vectors: provides embeddings for context clustering and wordsense disammbguation
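# Illustrative sketch (not part of the patch): iterate() in _topics.py finds a
# stable importance vector by repeated multiplication with the transition matrix,
# mixing the original importance back in with a "jump-back" weight alpha (a
# PageRank-style linear superposition).  A minimal numpy version of that fixed-point
# iteration; the tolerance, step limit and toy matrix are assumptions, not the
# package's actual values.
import numpy as np


def iterate_importance(transition_matrix, original, alpha=0.85, tol=1e-8, max_steps=1000):
    """Return the fixed point of: importance = alpha * T @ importance + (1 - alpha) * original."""
    importance = original.copy()
    for _ in range(max_steps):
        updated = alpha * transition_matrix @ importance + (1 - alpha) * original
        if np.linalg.norm(updated - importance, ord=1) < tol:
            return updated
        importance = updated
    return importance


# three-node toy graph with column-normalised transition weights
T = np.array([[0.0, 0.5, 0.5], [0.5, 0.0, 0.5], [0.5, 0.5, 0.0]])
print(iterate_importance(T, original=np.full(3, 1 / 3)))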
diff --git a/extra_model/_vectorizer.py b/extra_model/_vectorizer.py
index 7b934787..84833df4 100644
--- a/extra_model/_vectorizer.py
+++ b/extra_model/_vectorizer.py
@@ -12,7 +12,7 @@ class Vectorizer:
     def __init__(self, embedding_file):
         """
         Use the generic gensim vector embedding lookup.
-
+
         Currently using pretrained glove embeddings, but anything goes.
         :param embedding_file: pathname for the file that stores the word-embeddings in gensim keyed-vectors format
         :type str
@@ -25,7 +25,7 @@ def __init__(self, embedding_file):
     def get_vector(self, key):
         """
         Return the vector embedding for a given word.
-
+
         According to the following logic:
         - if no embedding is found for this word, check if it's a compound
         - if it's a compound try to take the average embedding of the constituent words
diff --git a/requirements-test.txt b/requirements-test.txt
index 51dd7eb8..cf10c0c9 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,5 @@
 bandit==1.7.0
-black==19.10b0
+black==20.8b1
 flake8==3.9.0
 isort==5.7.0
 mypy==0.812
diff --git a/tests/test_topics.py b/tests/test_topics.py
index 97dfe4ab..f8a9ee05 100644
--- a/tests/test_topics.py
+++ b/tests/test_topics.py
@@ -138,25 +138,31 @@ def test__aggregate(vec):
 
 
 def test__traverse_tree__down_weighted(simple_graph):
-    assert traverse_tree(
-        [("R", 1)],
-        {},
-        {"L1": 4, "L2": 1},
-        simple_graph,
-        weighted=True,
-        direction="down",
-    ) == {"L1": 2, "L2": 0.5}
+    assert (
+        traverse_tree(
+            [("R", 1)],
+            {},
+            {"L1": 4, "L2": 1},
+            simple_graph,
+            weighted=True,
+            direction="down",
+        )
+        == {"L1": 2, "L2": 0.5}
+    )
 
 
 def test__traverse_tree__down_unweighted(simple_graph):
-    assert traverse_tree(
-        [("R", 1)],
-        {},
-        {"L1": 4, "L2": 1},
-        simple_graph,
-        weighted=False,
-        direction="down",
-    ) == {"L1": 4, "L2": 1}
+    assert (
+        traverse_tree(
+            [("R", 1)],
+            {},
+            {"L1": 4, "L2": 1},
+            simple_graph,
+            weighted=False,
+            direction="down",
+        )
+        == {"L1": 4, "L2": 1}
+    )
 
 
 def test__traverse_tree__up_weighted(simple_graph):
@@ -166,14 +172,17 @@ def test__traverse_tree__up_weighted(simple_graph):
 
 
 def test__traverse_tree__up_unweighted(simple_graph):
-    assert traverse_tree(
-        [("I1", 1)],
-        {},
-        {"L1": 4, "L2": 1},
-        simple_graph,
-        weighted=False,
-        direction="up",
-    ) == {"L1": 4}
+    assert (
+        traverse_tree(
+            [("I1", 1)],
+            {},
+            {"L1": 4, "L2": 1},
+            simple_graph,
+            weighted=False,
+            direction="up",
+        )
+        == {"L1": 4}
+    )
 
 
 @pytest.mark.skip(