train.py
# coding=utf-8
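"""Train a fake/real classifier for German news articles.

Reads labelled article texts from the `news` table, fits a bag-of-words ->
TF-IDF -> linear SGD pipeline, optionally evaluates on a held-out split, and
persists the fitted pipeline to clf.pkl.
"""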
import joblib
import numpy as np

from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier  # used by the commented-out alternative below
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from db import get_db

# Load the labelled corpus: one row per article, `fake` is a boolean flag.
db = get_db()
cursor = db.cursor()
cursor.execute("SELECT text, fake FROM news")
data = cursor.fetchall()
texts = []
labels = []
for (text, fake) in data:
    texts.append(text)
    labels.append("fake" if fake else "real")
# When validating, hold out 30% of the corpus; otherwise train on everything.
validate = False
if validate:
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=1337)
else:
    X_train = texts
    X_test = []
    y_train = labels
    y_test = []
# Left over from an earlier feature-selection experiment (unused):
# from sklearn.svm import SVR
# estimator = SVR(kernel="linear")
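# Stop words: common German function words plus corpus-specific tokens
# (outlet names such as "postillon" and "titanic", stock-photo credits such
# as "shutterstock" and "fotolia") that would otherwise let the model key on
# the source outlet rather than on the writing itself.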
stop_words = [
    "daß", "dass", "ja", "titanic", "dpo", "shutterstock", "ssi", "dan", "was", "man", "ich",
    "foto", "wenn", "doch", "gar", "mir", "sie", "nicht", "so", "sich", "er", "es", "postillon",
    "fotolia", "cc", "by", "die", "der", "und", "in", "das", "zu", "den", "mit", "ist", "ein",
    "von", "auf", "eine", "im", "dem", "auch", "als", "wie", "an", "noch", "aus", "des", "hat",
    "aber", "nach", "oder", "werden", "nur", "einen", "bei", "um", "einer", "einem", "wird", "wir",
    "war", "haben", "sind", "vor", "schon", "mehr", "sein", "dann", "am", "zum", "kann", "immer",
    "wieder", "da", "durch", "habe", "mal", "jetzt", "seine", "hatte", "bis", "zur", "nun", "weil",
    "sei", "gegen", "heute", "denn", "unter", "soll", "alle", "ihre", "will", "diese", "ihr", "keine",
    "uns", "hier", "seiner", "wurde", "ganz", "dieser", "alles", "selbst", "bereits", "mich", "wer",
    "vom", "damit", "seit", "ihm", "eines", "gibt", "wo", "ihn", "ab", "ob", "ihnen", "kein", "seinem",
    "ihren", "wurden", "seien", "sa", "fed", "com", "na", "picture", "control",
    "direktlink", "kurtchen", "alliance"
]
reals = []
fakes = []
if validate:
    for t, f in zip(texts, labels):
        if f == "fake":
            fakes.append(t)
        else:
            reals.append(t)
def most_freq(texts):
    """Return the 50 terms that occur in the most documents."""
    v = CountVectorizer(analyzer="word", token_pattern=r"(?u)\b[a-zA-Z]{2,}\b", binary=True, stop_words=stop_words)
    res = v.fit_transform(texts)
    fterms = sorted(zip(v.get_feature_names_out(), np.asarray(res.sum(axis=0)).ravel()), key=lambda idx: idx[1], reverse=True)[:50]
    return fterms
if validate:
    print("fake most freq:", most_freq(fakes))
    print("real most freq:", most_freq(reals))
vect = CountVectorizer(analyzer="word", token_pattern=r"(?u)\b[a-zA-Z]{2,}\b", stop_words=stop_words)
# Alternative classifier kept for experiments:
# xclf = ExtraTreesClassifier(n_estimators=10, random_state=1337)
xclf = SGDClassifier(loss="hinge", penalty="elasticnet", n_jobs=-1)
text_clf = Pipeline([
    ("vect", vect),
    ("tfidf", TfidfTransformer(use_idf=True)),
    ("clf", xclf),
])
print "Classification start..."
clf = text_clf.fit(X_train, y_train)
features = vect.get_feature_names()
if validate:
    predicted = clf.predict(X_test)
    print(metrics.classification_report(y_test, predicted, target_names=["fake", "real"]))
# Smoke test on a few hand-written snippets.
docs_new = [
    "Martin Schulz ist toll", "die inhalte des bums und newsportals stern de leben", "zdf titanic", "quelle", "2017"
]
predicted = clf.predict(docs_new)
for doc, category in zip(docs_new, predicted):
    print("%s => %r" % (category, doc))
if validate:
    # Spot-check the first 20 held-out documents.
    predicted = clf.predict(X_test[0:20])
    for doc, category in zip(X_test[0:20], predicted):
        print("%s => %r" % (category, doc))
joblib.dump(clf, 'clf.pkl')
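# Sanity check (a minimal sketch, not required for training): reload the
# persisted pipeline and classify a sample from docs_new to make sure the
# artifact round-trips through joblib.
loaded = joblib.load("clf.pkl")
print(loaded.predict(["Martin Schulz ist toll"]))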