textrank_sentence.py
import numpy as np
import networkx as nx
from nltk.cluster.util import cosine_distance

from graph_utils import find_cliques_all
from utils import text_cleaner
from wordrank import wordrank
from config import conf

stop_words = conf.STOP_WORDS
# Helper functions for TextRank.
def read_article(file_name):
    # TODO: think about reading files approach
    with open(file_name, 'rb') as f:
        raw = f.read()
    text = raw.decode("mac_cyrillic")
    # the source files use "\r" as the sentence separator
    sentences = text.split("\r")
    return sentences
def sentence_similarity(sent1, sent2, stopwords=None):
    """Bag-of-words cosine similarity between two tokenized sentences."""
    if stopwords is None:
        stopwords = []
    # accept plain strings as well, so we never iterate over characters
    if isinstance(sent1, str):
        sent1 = sent1.split()
    if isinstance(sent2, str):
        sent2 = sent2.split()
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
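# Illustrative check with made-up tokens (not part of the pipeline):
# sentence_similarity(["the", "cat", "sat"], ["the", "cat", "ran"], ["the"])
# skips the stopword "the"; each count vector then holds two words with one
# word shared, so the score is 1 - cosine_distance = 0.5.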
def build_similarity_matrix(sentences, stop_words):
    # create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:  # skip comparing a sentence with itself
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix
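# For n sentences this yields a symmetric n x n array with a zero diagonal;
# nx.from_numpy_array in generate_summary below turns every nonzero cell
# into a weighted edge between two sentence nodes.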
def generate_summary(sentences: list, need_tag: bool) -> list:
    # clean every sentence and remember its original index
    sentences_cleaned = [(text_cleaner(s, stop_words), i) for i, s in enumerate(sentences)]
    sentences_cleaned_fix = [elem for elem in sentences_cleaned if elem[0] != '']
    # rank the words of every surviving sentence
    ranked_list = [(wordrank(sent, need_tag), idx) for sent, idx in sentences_cleaned_fix]
    ranked_list_notags = []
    if need_tag:
        # strip POS tags of the form "word_TAG" back to plain words
        for sent_tokens, idx in ranked_list:
            sent_without_tags = ' '.join(
                " ".join(word.split("_")[0] for word in s.split()) for s in sent_tokens)
            ranked_list_notags.append((sent_without_tags, idx))
    else:
        ranked_list_notags = ranked_list
    ranked_list_notags_nonums = [elem[0] for elem in ranked_list_notags]
    sentence_similarity_matrix = build_similarity_matrix(ranked_list_notags_nonums, stop_words)
    # rank sentences in the similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    cliques = find_cliques_all(sentence_similarity_graph)
    min_clique = cliques[0]
    snts = []
    for i in min_clique:
        # graph node i indexes the filtered list, so recover the original
        # sentence through the index stored next to each cleaned sentence
        orig_idx = ranked_list_notags[i][1]
        snts.append(sentences[orig_idx])
    return snts
def generate_summary_loop(need_tag: bool, n_compression: bool, text: list, n: int):
    """
    need_tag: use algorithms for tagged/untagged vector models
    n_compression: whether to apply the summarizer algorithm several times
    text: text to summarize (a list, temporarily, until the input method is settled)
    n: how many times to apply the summarizer algorithm
    """
    if n_compression:
        abstract = generate_summary(text, need_tag)
        # the first pass above counts as one application, so run n - 1 more
        for _ in range(n - 1):
            abstract = generate_summary(abstract, need_tag)
    else:
        # single application
        abstract = generate_summary(text, need_tag)
    return abstract
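# Minimal usage sketch. The file name and the parameter values below are
# placeholders, not fixed by this module; read_article expects a
# mac_cyrillic-encoded file with "\r" sentence separators.
if __name__ == "__main__":
    sentences = read_article("sample_text.txt")  # hypothetical input file
    summary = generate_summary_loop(need_tag=True, n_compression=True,
                                    text=sentences, n=2)
    print("\n".join(summary))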