create_arff.py

import re

import tweet_preprocessor as tp
from AllData import SimilarityMatrix as SMM
from normalizr import Normalizr
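
# Dependency note (an assumption from the imports above): tweet_preprocessor appears to
# be the tweet-preprocessor cleaning library and Normalizr the normalizr text-normalization
# library; AllData.SimilarityMatrix is a project-local helper that loads the raw tweet files.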


def createArff():
    label_list = []
    # get the train set
    train_data = SMM.get__tweets(SMM, 'train')
    # tokenize the tweets and split off the label values
    tokenized_tweets = clean_and_tokenize(train_data, label_list)
    # create the arff file for the train set
    if label_list:
        write2file(tokenized_tweets, 'train', label_list)
    else:
        print("Label list is empty")
        exit()
    # repeat the same steps for the test set
    test_data = SMM.get__tweets(SMM, 'test')
    tokenized_tweets = clean_and_tokenize(test_data)
    write2file(tokenized_tweets, 'test')
    # TODO: add a labeled test set version


def clean_and_tokenize(data, *args):
    tokenized_dataset = []  # holds the tokenized tweets
    adLine = True  # whether the previous line was kept as a tweet
    normalizr = Normalizr(language='en')
    # label lines look like '2', '-1' or '0': optional minus signs followed by digits 0-3
    pos_pattern = re.compile(r'\A[-]*[0-3]+')
    if len(args) == 1:
        label_list = args[0]
        for line in data:
            # clear urls, hashtags and usernames and fold to lower case
            line = tp.clean(line)
            # check whether the line is a tweet or a label
            isLabel = re.match(pos_pattern, line)
            if adLine and isLabel and len(line) < 3:
                # the line is a label: add it to the label list
                temp = int(line)
                if temp > 0:
                    label_list.append(1)  # positive label
                else:
                    label_list.append(0)  # negative label
            else:
                # clear digits
                line = re.sub(r'\d', '', line)
                # clear punctuation, other symbols and stopwords
                line = normalizr.normalize(line)
                if line:
                    tokenized_dataset.append(line.split())
                    adLine = True
                else:
                    adLine = False
    # for the test set we do not need to look for labels
    else:
        for line in data:
            # clear digits
            line = re.sub(r'\d', '', line)
            # clear punctuation, other symbols and stopwords
            line = normalizr.normalize(line)
            tokenized_dataset.append(line.split())
    return tokenized_dataset


def write2file(dataset, fout, label=None):
    """
    Creates an arff file in the project directory from the tokenized dataset and label values.
    :param dataset: tokenized dataset
    :param fout: name of the output file, 'test' or 'train'
    :param label: label values; if label is None this is the test set, but weka still
                  requires a label column, so a 0 is written for every tweet
    :return:
    """
    f = open(fout + '.arff', 'w')
    # arff header sections
    arff_comments = '%This is a preprocessed tweet dataset with labels\n' \
                    '%project: identifying and summarizing important tweets after natural disasters\n' \
                    '%preprocessed by - Ilkin Huseynli\n' \
                    '%date - 25.11.16\n\n\n'
    arff_relation = '@RELATION tweets\n\n\n'
    arff_attributes = '@ATTRIBUTE tweet STRING\n' \
                      '@ATTRIBUTE label {1,0}\n\n\n'
    arff_data = '@DATA\n'
    # write the arff header to the file
    for a in (arff_comments, arff_relation, arff_attributes, arff_data):
        print(a, file=f, end='')
    i = 0  # index into the label list
    if label is not None:
        for tweet in dataset:
            # weka recognizes strings wrapped in ': open the quote at the start of the line
            print("'", end='', file=f)
            # write the words of the tweet, comma-separated
            for word in tweet:
                if word:
                    print(word, end=',', file=f)
            # close the quote at the end of the line
            print("'", end='', file=f)
            # append the label value for this tweet
            print(',' + str(label[i]), file=f, end='\n')
            i += 1
    else:
        for tweet in dataset:
            # weka recognizes strings wrapped in ': open the quote at the start of the line
            print("'", end='', file=f)
            # write the words of the tweet, comma-separated
            for word in tweet:
                if word:
                    print(word, end=',', file=f)
            # close the quote at the end of the line
            print("'", end='', file=f)
            # no real labels for the test set, so write a placeholder 0
            print(',0', file=f, end='\n')
    f.close()
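

# A minimal usage sketch, assuming the script is run from the project root so that
# AllData.SimilarityMatrix is importable; createArff() then writes train.arff and
# test.arff into the working directory.
if __name__ == '__main__':
    createArff()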