runEDCoW.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
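"""Run EDCoW (Event Detection with Clustering of Wavelet-based signals,
Weng & Lee, ICWSM 2011) over a CSV of tweets.

Usage:
    python runEDCoW.py [path/to/tweets.csv]

Pipeline: bucket tweets by time, build a per-word signal via the EDCoW
helpers, filter weak signals by autocorrelation, cluster the remaining
words by cross-correlation, and keep the clusters that look like events.
"""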
import sys
import math
import numpy as np
import pywt  # pip install PyWavelets (used by the EDCoW wavelet helpers)
import igraph  # pip install python-igraph (used by the EDCoW clustering helpers)
from EDCoW import helperFunctions as edcow
from helper import preprocessing
from helper import readTweets
# Get all tweets (dataset path can be given as the first argument)
if len(sys.argv) == 2:
    dataset = sys.argv[1]
else:
    dataset = 'data/manchester_attack.csv'
print("Running EDCoW on " + dataset)
tweetList, t1_time, t2_time = readTweets.getTweets(dataset, 1)

# bucketSize in seconds (1=seconds, 60=minutes, 3600=hours, 86400=days)
bucketSize = 60

# Group the tweets into time buckets
tweetBuckets, _ = readTweets.tweetsToBuckets(tweetList, bucketSize, t1_time, t2_time)
cur_time = len(tweetBuckets)
# Build the bag of words (after preprocessing)
words = edcow.get_words(tweetBuckets)

# Stage-2 signal for each word
signal_dict = dict()

# Sliding window size (in buckets) for the stage-2 signal
delta = 4
# MAD multiplier used for the theta_1 and theta_2 thresholds
_lambda = 11.0

num_words = len(words)
print("Calculating stage 1 and stage 2 signals and filtering them, no. of words:", num_words)
auto_corr = np.zeros(num_words)
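# For each word: build the stage-1 per-bucket signal and the stage-2 signal
# (via the EDCoW helpers), then measure its burst energy as the zero-lag
# autocorrelation.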
for i, word in enumerate(words):
sys.stdout.write('\r Calculating auto corr, words:%d out of %d' % (i, num_words))
stage1_sig = [edcow.s_w(tweetBuckets, word, t , cur_time) for t in range(cur_time)]
stage2_sig = edcow.second_stage(stage1_sig, delta)
# Zero time lag auto_corr: basically a dot product
auto_corr[i] = np.dot(stage2_sig, stage2_sig)
signal_dict[word] = stage2_sig
median_acc = np.median(auto_corr)
med_abs_dev = np.median(np.abs(auto_corr - median_acc))
theta_1 = median_acc + _lambda * med_abs_dev
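# theta_1 acts as a robust noise floor: a word is kept only if its signal
# energy is more than _lambda MADs above the median energy.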
# Filter away signals below the threshold
for i, word in enumerate(words):
    if auto_corr[i] <= theta_1:
        del signal_dict[word]
# Keep only the words that survived the first filter
words = list(signal_dict.keys())
print("Words left after filtering 1:", len(words))
print("Computing the cross correlation")
# Zero time lag cross correlation = dot product
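# The matrix is symmetric, so only the upper triangle is computed, then mirrored.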
cross_corr = np.zeros(shape=(len(words), len(words)))
for i in range(len(words)):
    for j in range(i + 1, len(words)):
        cross_corr[i, j] = np.dot(signal_dict[words[i]], signal_dict[words[j]])
        cross_corr[j, i] = cross_corr[i, j]
print("Filtering with theta2")
# Set the elements not exceeding theta_2 to zero, row by row
for i in range(len(words)):
    median_cross_corr = np.median(cross_corr[i, :])
    theta_2 = median_cross_corr + _lambda * edcow.median_abs_dev(cross_corr[i, :], median_cross_corr)
    for j in range(len(words)):
        if cross_corr[i, j] <= theta_2:
            cross_corr[i, j] = 0
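# Note: theta_2 is computed per row, so the filtered matrix is no longer
# guaranteed to be symmetric.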
print("Computing the clusters")
# Clustering
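# Vertices are word indices; the surviving cross-correlations act as
# edge weights (see edcow.get_graph).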
g = edcow.get_graph(cross_corr, range(len(words)))
print ("Making communities")
_, comm = edcow.get_communities(g) #THIS NEEDS FIXING
print("Numbers of clusters:", len(comm))
good_clusters = []
for cluster in comm:
    n = float(len(cluster))
    print("Cluster length:", n)
    # Total internal cross-correlation of the cluster
    deg_c = 0.0
    for i in cluster:
        for j in cluster:
            deg_c += cross_corr[i, j]
    if n < 50:
        # math.factorial needs an integer argument; the exact significance
        # formula should still be checked against the EDCoW paper (FIXME)
        eps = deg_c * math.exp(1.5 * n) / math.factorial(int(2 * n))
        if eps >= 0.1:
            good_clusters.append(cluster)
            print("Clustered Words:", [words[i] for i in cluster])