# preprocess_and_create_arff.py
""" ~~~~ Preprocess for classification: ~~~~~
# Steps : Clear all ;
# - punctuations
# - URL
# - Hashtags
# - usernames
# - rt
"""
import re

from nltk.corpus import stopwords  # requires the NLTK 'stopwords' corpus to be downloaded
# Assumed to expose a clean() function that strips URLs, hashtags, mentions and "rt";
# the PyPI "tweet-preprocessor" package provides this (there the module is named "preprocessor").
import tweet_preprocessor as twt_prep

# file names and paths for easier access
path = "data/"
fname = "etiketli_tweet.txt"
class_name = "nagano_"
processed_path = 'processed_data/'
def main():
    filepath = path + class_name + fname
    openFile(filepath)
def openFile(filename):
    """Opens the file of labeled tweets, converts every tweet to lowercase and builds a list."""
    try:
        # to skip non-printable ASCII chars, use this instead:
        # fp = open(filename, "r", encoding='utf-8-sig', errors='ignore')
        fp = open(filename, "r")
    except OSError:
        print("File error!\n")
    else:
        # a list that contains all tweets
        dataset = list()
        for line in fp:
            dataset.append(line.lower())
        fp.close()
        # preprocessing
        tokenizeTweet(dataset)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~ END of openFile function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
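# A minimal usage sketch for openFile(), assuming the default repo layout above
# (the path is the one built in main()):
#
#   openFile('data/nagano_etiketli_tweet.txt')
#   # a raw line such as "Roads BLOCKED near #Nagano RT @someuser"
#   # is stored lowercased, e.g. "roads blocked near #nagano rt @someuser"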
def tokenizeTweet(dataset, flabel='labels.txt', ftoken='token.txt', writeFile=True):
    """Tokenizes the data, splits off the label values after cleaning, and writes the files.

    :param dataset: dataset to be tokenized
    :param flabel: file name for labels
    :param ftoken: file name for tokens - every word in a sentence
    :param writeFile: default True; whether or not to write to a file
    """
    pos_pattern = re.compile(r'\A[-]*[0-3]+')
    # label_file = open(flabel, 'w', encoding='utf-8-sig', errors='ignore')  # keeps label values
    label_file = open(flabel, 'w')
    # tokenized_file = open(ftoken, 'w', encoding='utf-8-sig', errors='ignore')  # keeps tokenized tweets
    tokenized_file = open(ftoken, 'w')
    tokenized_dataset = []  # holds the tokenized tweets
    tweet = ''  # temporary holder for a single tweet
    label_list = []  # contains the label of each tweet
    for line in dataset:
        line = twt_prep.clean(line)
        # check whether the line is a tweet or a label
        isLabel = re.match(pos_pattern, line)
        if isLabel and len(line) < 3:
            # append the tokenized tweet to our dataset and reset the temporary tweet
            tokenized_dataset.append(tweet)
            tweet = ''
            # add the label to the label list
            temp = int(line)
            if temp > 0:
                label_list.append(1)  # positive label
            else:
                label_list.append(0)  # negative label
        else:
            tweet = line.split()
            # line = re.sub(neg_pattern, "", line)
    # writing to the files
    for a in label_list:
        print(a, file=label_file, end='\n')
    for b in tokenized_dataset:
        print(b, file=tokenized_file, end='\n')
    label_file.close()
    tokenized_file.close()
    # next step - removing redundant characters
    tokenized_dataset = noiseCleaning(tokenized_dataset)
    # writing preprocessed data to a file
    if writeFile:
        write2file(tokenized_dataset, label_list)
    return tokenized_dataset
# ~~~~~~~~~~~~~~~~~~~~~~~~~~ END of tokenizeTweet function ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
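# Hedged example of the label split performed above, using the input sketched at
# the top of the file and assuming twt_prep.clean() strips the URL, hashtag,
# mention and "rt" as the module docstring describes (values are illustrative):
#
#   tokenized_dataset before noiseCleaning:
#       [['roads', 'blocked', 'near'], ['no', 'damage', 'here,', 'everything', 'is', 'fine']]
#   label_list: [1, 0]    # 2 > 0 -> 1 (positive), -1 <= 0 -> 0 (negative)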
def noiseCleaning(dataset):
    """Takes the tokenized tweets and removes unwanted characters and digits."""
    # negative pattern - for removing punctuation characters and digits
    neg_pattern = re.compile(r'\W+|\d+')
    english_stopwords = set(stopwords.words('english'))  # avoid re-reading the list for every word
    tokenized_dataset = []
    # go over the tokenized tweets word by word and apply the cleaning steps
    for tweet in dataset:
        i = 0
        # print("before: ", tweet)
        for word in tweet:
            word = re.sub(neg_pattern, '', word)  # strip redundant characters and digits
            if word in english_stopwords:  # remove stopwords
                word = ''
            tweet[i] = word  # update the dataset after the cleaning steps
            i += 1
        # print("after: ", tweet)
        tokenized_dataset.append(tweet)
    # print(tokenized_dataset)
    # test print
    # for tweet in tokenized_dataset:
    #     print(tweet)
    return tokenized_dataset
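# Hedged example of noiseCleaning() on one tokenized tweet (the words are illustrative):
#
#   noiseCleaning([['the', 'quake2', 'hit!', 'nagano']])
#   # -> [['', 'quake', 'hit', 'nagano']]
#   # 'the' is an English stopword and becomes ''; digits and punctuation are stripped.
#   # The empty strings are skipped later by the `if word:` check in write2file().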
def write2file(dataset, label):
    """Writes the tokenized dataset with its label values to a comma-separated and a whitespace-separated ARFF file."""
    processed_fname = processed_path + class_name + 'processed_file.arff'
    processed_fname_comma = processed_path + class_name + 'processed_file_comma.arff'
    # processed_file_cs = open(processed_fname, 'w', encoding='utf-8-sig', errors='ignore')
    processed_file_cs = open(processed_fname, 'w')             # whitespace-separated words
    # processed_file_normal = open(processed_fname_comma, 'w', encoding='utf-8-sig', errors='ignore')
    processed_file_normal = open(processed_fname_comma, 'w')   # comma-separated words
    # special arff keywords
    arff_comments = '%This is the preprocessed tweet dataset with labels\n' \
                    '%project: Identifying and summarizing important tweets after natural disasters\n' \
                    '%preprocessed by - Ilkin Huseynli\n' \
                    '%date - 25.11.16\n\n\n'
    arff_relation = '@RELATION tweets\n\n\n'
    arff_attributes = '@ATTRIBUTE tweet STRING\n' \
                      '@ATTRIBUTE label {1,0}\n\n\n'
    arff_data = '@DATA\n'
    # writing the ARFF header to both files
    for f in (processed_file_cs, processed_file_normal):
        for a in (arff_comments, arff_relation, arff_attributes, arff_data):
            print(a, file=f, end='')
    i = 0  # index of labels
    for tweet in dataset:
        # Weka recognizes strings wrapped in '. Add it at the start of each line
        print("'", end='', file=processed_file_cs)
        print("'", end='', file=processed_file_normal)
        # writing words
        for word in tweet:
            if word:
                # write the words to the files: comma-separated and whitespace-separated
                print(word, end=',', file=processed_file_normal)
                print(word, end=' ', file=processed_file_cs)
        # add ' at the end of each line
        print("'", end='', file=processed_file_cs)
        print("'", end='', file=processed_file_normal)
        # write the label value to each file
        print(str(label[i]), file=processed_file_normal, end='\n')
        print(',' + str(label[i]), file=processed_file_cs, end='\n')
        i += 1
    processed_file_cs.close()
    processed_file_normal.close()


if __name__ == "__main__":
    main()
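# Hedged sketch of the whitespace-separated ARFF output produced by write2file()
# (the data row is illustrative, taken from the example at the top of the file):
#
#   %This is the preprocessed tweet dataset with labels
#   ...
#   @RELATION tweets
#   @ATTRIBUTE tweet STRING
#   @ATTRIBUTE label {1,0}
#   @DATA
#   'roads blocked near ',1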