import re
from collections import Counter
import math
import numpy as np
import time
import preprocess_and_create_arff as prep
import tweet_preprocessor as tp
import editdistance as ed
from nltk.corpus import wordnet as wt
from AllData import SimilarityMatrix as SMM
import Hierarchical_clustering as hc
from normalizr import Normalizr
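# Dependency note (an assumption, not verified against a requirements file):
# tweet_preprocessor, editdistance, nltk (with the WordNet corpus fetched via
# nltk.download('wordnet')), normalizr and numpy are third-party packages, while
# preprocess_and_create_arff, AllData and Hierarchical_clustering appear to be
# project-local modules.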
def noiseCleaning(summarize):
    """
    Cleans hashtags, usernames, URLs, numbers, punctuation, emojis and stop words,
    folds everything to lowercase, tokenizes the dataset and returns the tokenized set.
    :param summarize: object of SimilarityMatrix
    :return: cleaned and tokenized dataset
    """
tokenized_dataset = [] # keeps tokenized tweets
normalizr = Normalizr(language='en')
for line in summarize.get__tweets():
# clear url, hashtags and usernames and fold to lower case
line = tp.clean(line.lower())
# clear numbers
        line = re.sub(r'\d', '', line)
        # clear punctuation, other symbols and stopwords
line = normalizr.normalize(line)
# tokenize all tweets
tok = line.split()
# collect all tweets in a list, each tweet is a list
tokenized_dataset.append(tok)
return tokenized_dataset
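# Illustrative sketch only (assumes tp.clean() strips URLs, hashtags and usernames,
# and that Normalizr's default normalization removes punctuation and stop words):
#   "Fire near #California! http://t.co/abc @user 2 homes lost"
#   would roughly become "fire near homes lost" -> ['fire', 'near', 'homes', 'lost']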
def cc_UHU(summarize):
    """
    Counts matching URLs, usernames and hashtags between every pair of tweets
    and adds the counts to the 'uhu' similarity matrix.
    :param summarize: object of SimilarityMatrix
    """
# number of lines in our tweet set
tweet_count = summarize.tweet_count()
# TODO: TERM SIMILARITY:
# TODO: count similar URL , USERNAME and HASHTAGS
    # regex patterns
    url_pattern = r'http[s]*://[^\s]*'
    user_name_pattern = r'@\w+'
    hashtag_pattern = r'#\w+'
for i in range(tweet_count - 1):
for j in range(i + 1, tweet_count):
url_match_i = re.findall(url_pattern, summarize.get__tweets(i)) # URLs of tweet i
url_match_j = re.findall(url_pattern, summarize.get__tweets(j)) # URLs of tweet j
usrname_match_i = re.findall(user_name_pattern, summarize.get__tweets(i)) # username of tweet i
usrname_match_j = re.findall(user_name_pattern, summarize.get__tweets(j)) # username of tweet j
hashtag_match_i = re.findall(hashtag_pattern, summarize.get__tweets(i)) # hashtags of tweet i
hashtag_match_j = re.findall(hashtag_pattern, summarize.get__tweets(j)) # hashtags of tweet j
# intersect 2 matching object gives us a set of similar elements
# length of this intersection is similar url/username/hashtag count between 2 tweets
try:
# intersect url
summarize.increase_value('uhu', i, j, len(set(url_match_i).intersection(url_match_j)))
# intersect usernames
summarize.increase_value('uhu', i, j, len(set(usrname_match_i).intersection(usrname_match_j)))
# intersect hashtags
summarize.increase_value('uhu', i, j, len(set(hashtag_match_i).intersection(hashtag_match_j)))
except IndexError:
print("\nCurrent indexes are: {} , {}\n".format(i, j))
exit()
# old version of cleaning
# tokenized_clean_tweets = prep.noiseCleaning(tokenized_dataset) # noise cleaning
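# Worked 'uhu' example (illustrative): if tweet i and tweet j both contain
# "#CaliforniaFire" and the same "http://..." link but mention different users,
# the three intersections above contribute 1 (url) + 0 (username) + 1 (hashtag) = 2
# to the uhu matrix entry for the pair (i, j).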
# TODO: Selecting similar tweets
def select_summarize(summarize, *args):
    """
    Selects the summary tweet, for both clustered and non-clustered tweets.
    :param summarize: object of SimilarityMatrix
    :param args: leave empty for non-clustered tweets; for clustered tweets, pass the list of labels in a cluster
    :return:
    """
similarity = summarize.get__value('similarity')
t_similarity = similarity.transpose()
# summarizing with clustering
# selects most weighted tweet in the cluster
if len(args) == 1:
label_id = args[0]
count = len(label_id)
# weight of each tweet
weight = np.zeros(count, dtype=np.int64)
# calculate weight of each tweet
for i in range(count):
weight[i] = np.sum(similarity[label_id[i]]) + np.sum(t_similarity[label_id[i]])
# print("weight of {} >> {}".format(label_id[i], weight[i]))
# most weighted tweet
index = np.argmax(weight)
print("{}".format(summarize.get__tweets(label_id[index])))
# summarizing without clustering
# selects most weighted tweet in the set
else:
count = summarize.tweet_count()
weight_1 = np.zeros(count, dtype=np.int64)
for i in range(0, count):
# print(np.sum(similarity[i]))
# print(np.sum(t_similarity[i]),'\n')
weight_1[i] = np.sum(similarity[i]) + np.sum(t_similarity[i])
print("weight of {} >> {}".format(i, weight_1[i]))
for i in range(0, 5):
index = np.argmax(weight_1)
# print("biggest value is in {},{}".format(row, col))
weight_1[index] = 0
print("{}".format(summarize.get__tweets(index)))
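# Weight sketch (assuming the similarity matrices are filled only for i < j, as in
# cc_UHU and calc_ces): a tweet's weight is its row sum plus its column sum, i.e.
#   weight(k) = np.sum(similarity[k]) + np.sum(similarity.transpose()[k])
# so values stored at (k, j) for k < j and at (i, k) for i < k both count toward tweet k.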
# TODO: !!!! BIG PROBLEM !!!! TOTALLY EMPTY TWEETS AFTER PREPROCESS
def write2File(dataset):
    """
    Writes cleaned tweets to a file and creates a list of cleaned tweets.
    :param dataset: tokenized tweet list
    :return: list of cleaned tweets
    """
# writing to a file
try:
fin = open('temp_tweet_set.txt', 'w+')
except FileNotFoundError as e:
print("error: ", e)
temp_line = ''
cleaned_tweets = []
for line in dataset:
for word in line:
if word and len(word) > 2:
print(word, file=fin, end=' ') # joining words and creating tweets again
word += ' '
temp_line += word
cleaned_tweets.append(temp_line)
temp_line = ''
        print('', file=fin)
    fin.close()
    return cleaned_tweets
############################################################################################
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ END of write2File(dataset)~~~~~~~~~~~~~~~~~~~~~~~~~~~#
############################################################################################
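# Usage sketch (write2File is defined here but not called from main() below):
#   cleaned = write2File(noiseCleaning(summarize))
# This writes one cleaned tweet per line to temp_tweet_set.txt, keeping only words
# longer than 2 characters, and returns the cleaned tweets as a list of strings.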
def calc_ces(summarize, tweets):
    """
    Calculates cosine similarity, edit distance and semantic similarity between tweets.
    :param summarize: object of the SimilarityMatrix class; keeps the data
    :param tweets: tokenized tweets
    """
tweet_count = summarize.tweet_count()
    print("Calculating cosine similarity, Levenshtein distance and semantic similarity...", end='')
for i in range(0, tweet_count - 1):
for j in range(i + 1, tweet_count):
# todo: cosine similarity
vec1 = Counter(tweets[i]) # Creating a counter object ,
vec2 = Counter(tweets[j]) # Which gives us number of each word in the sentence
intersection = set(vec1.keys()) & set(vec2.keys()) # intersection of sentences
numerator = sum([vec1[x] * vec2[x] for x in intersection]) # calculating cosine similarity
sum1 = sum([vec1[x] ** 2 for x in vec1.keys()])
sum2 = sum([vec2[x] ** 2 for x in vec2.keys()])
denominator = math.sqrt(sum1) * math.sqrt(sum2)
            if not denominator:  # if one of the sentences is empty, denominator is 0
                summarize.increase_value('cosine', i, j, 0.0)  # empty sentence -> cosine similarity stays 0
else:
inc_value = float(numerator) / denominator
summarize.increase_value('cosine', i, j, inc_value * 10) # increase similarity value
            # todo: Levenshtein Distance and Semantic similarity
            # second part - Levenshtein distance and synset collection
            synset_of_t1 = []  # keeps all synsets of words in tweet1
            synset_of_t2 = []  # and tweet2
            for word1 in tweets[i]:
                # collecting synsets of word1 for semantic similarity
                synset_of_t1 = synset_of_t1 + wt.synsets(word1)
            for word2 in tweets[j]:
                # collecting synsets of word2 for semantic similarity
                synset_of_t2 += wt.synsets(word2)
            # Levenshtein-based word similarity: length of the longer word minus the edit distance
            for word1 in tweets[i]:
                for word2 in tweets[j]:
                    L_dist = max(len(word1), len(word2)) - ed.eval(word1, word2)
                    if L_dist > 0:
                        summarize.increase_value('ed', i, j, L_dist)
            synset_intersection_count = set(synset_of_t1).intersection(set(synset_of_t2))  # intersection of t1 and t2
            # len of the intersection is our similarity value
            summarize.increase_value('semantic', i, j, len(synset_intersection_count))
print("OK!")
############################################################################################
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ END of ces(summarize, tweets)~~~~~~~~~~~~~~~~~~~~~~~~~~#
############################################################################################
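# Worked cosine example (illustrative arithmetic for calc_ces above):
#   tweets[i] = ['fire', 'spreads', 'fast'], tweets[j] = ['fire', 'fast', 'wind']
#   shared terms {'fire', 'fast'} -> numerator = 1*1 + 1*1 = 2
#   denominator = sqrt(3) * sqrt(3) = 3 -> cosine = 2/3 ~ 0.67, stored as ~6.7
#   after the * 10 scaling passed to increase_value('cosine', i, j, ...).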
def main():
start_time = time.time()
fin_name = "California_test\\cal_500.txt" # result set file
print("Opening and reading tweets...", end='')
summarize = SMM()
summarize.openFile(fin_name, encodeWith='utf_8') # opening file
print('OK!')
# number of lines in our tweet set
tweet_count = summarize.tweet_count()
# initializing matrices with 0
print('Initializing matrices with value 0...', end='')
summarize.initialize()
print('OK!')
# count UHU
print("Calculating similar Username, Hashtag and Url counts...", end='')
cc_UHU(summarize)
print('OK!')
# clear redundant information and tokenize
    print("Clearing tweets from redundant information and tokenizing...", end='')
new_tweets = noiseCleaning(summarize)
print("OK!")
# TODO: calculate edit distance, cosine similarity and semantic similarity together
calc_ces(summarize, new_tweets)
# creating similarity graph = term_level + semantic_level
print("Creating similarity graph...", end='')
summarize.create_similarity_graph()
print('OK!')
# TODO : CLUSTERING
# clustering similar tweets
print("Clustering results...", end='')
cluster_labels = hc.cluster(summarize.get__value('similarity'))
print("OK!")
#
print("Selecting top tweets...\n")
print("Total Cluster number: ", len(cluster_labels))
for label in cluster_labels:
# Creating summary
if len(label) > 1: # don't use 1 sample clusters in summarize
select_summarize(summarize, label)
# print(label)
# summarizing without clustering
# select_summarize(summarize)
# summarize.printSlice(summarize.get__similarity_graph(), [125, 125, 150, 150], 2)
# summarize.printSlice(summarize.get__similarity_graph(), round_step=2)
print("\nEVERYTHING is OK, CONGRATULATIONS...")
end_time = time.time()
    print("\nWork time = ", end_time - start_time)
######################################################################################################
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ END of main() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
######################################################################################################
if __name__ == "__main__":
    main()
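# Run directly, e.g.:  python Summarize.py
# (assumes California_test\cal_500.txt is present relative to the working directory
# and that the project-local modules such as AllData and Hierarchical_clustering are importable)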