-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf.py
113 lines (97 loc) · 3.41 KB
/
tfidf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import numpy as np
import math
# import tensorflow as tf
# from augmentation import aug_policy
# import word_level_augment
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize
def compute_tfidf(sents):
    """Compute the IDF score for each word, then the accumulated TF-IDF score.

    Every sentence is tokenized with ``word_tokenize`` and treated as one
    document.

    Args:
        sents: iterable of sentence strings.

    Returns:
        A tuple ``(idf, tf_idf)`` -- note the order, IDF comes first:

        * ``idf[word]`` is ``log(num_sentences / num_sentences_containing_word)``.
        * ``tf_idf[word]`` is the sum, over every occurrence of ``word`` in
          every sentence, of ``1 / len(sentence_tokens) * idf[word]``.

        Both dictionaries are empty when ``sents`` is empty.
    """
    # `examples` holds each sentence split into its word tokens.
    examples = [word_tokenize(sent) for sent in sents]
    num_docs = len(examples)

    # Document frequency: a word counts once per sentence, no matter how
    # often it repeats inside that sentence.  dict.fromkeys de-duplicates
    # while keeping first-seen order, so key order of the result is stable.
    word_doc_freq = collections.defaultdict(int)
    for tokens in examples:
        for word in dict.fromkeys(tokens):
            word_doc_freq[word] += 1

    idf = {
        word: math.log(num_docs * 1. / doc_freq)
        for word, doc_freq in word_doc_freq.items()
    }

    # Accumulate the per-occurrence term frequency weighted by IDF.  A
    # sentence that tokenizes to nothing contributes no iterations, so the
    # division by len(tokens) can never see zero.
    tf_idf = {}
    for tokens in examples:
        for word in tokens:
            tf_idf[word] = tf_idf.get(word, 0) + 1. / len(tokens) * idf[word]
    return idf, tf_idf
class TfIdfWordRep:
    """Per-token replacement probabilities driven by TF-IDF scores.

    Tokens with a low in-sentence TF-IDF score receive a high replacement
    probability and vice versa.  The class also hands out uniform random
    numbers from a pre-generated cache.
    """

    def __init__(self, token_prob, tf_idf, idf):
        """Store the statistics and fill the random-number cache.

        Args:
            token_prob: scale factor applied to the replacement probabilities.
            tf_idf: TF-IDF mapping; stored on the instance but not read by
                any method of this class.
            idf: mapping from token to IDF score, indexed by
                ``get_replace_prob``.
        """
        self.token_prob = token_prob
        self.tf_idf = tf_idf
        self.idf = idf
        self.reset_random_prob()

    def get_replace_prob(self, all_words):
        """Compute the replacement probability of every token in a sentence.

        Args:
            all_words: sequence of tokens; each one is looked up in
                ``self.idf`` (a missing token raises whatever that lookup
                raises, e.g. ``KeyError`` for a plain dict).

        Returns:
            A float NumPy array with one entry per token.  It is empty for
            an empty sentence.  When every token has the same score there is
            nothing to rank, so the array is all zeros (previously this case
            divided 0 by 0 and produced NaNs).  Otherwise the entries sum to
            ``token_prob * len(all_words)``.
        """
        num_words = len(all_words)
        if num_words == 0:
            return np.array([])

        # In-sentence TF-IDF: each occurrence adds 1/len * idf.
        cur_tf_idf = collections.defaultdict(float)
        for word in all_words:
            cur_tf_idf[word] += 1. / num_words * self.idf[word]

        scores = np.array([cur_tf_idf[word] for word in all_words])
        # Invert so the least informative tokens get the largest weight.
        inverted = np.max(scores) - scores
        total = inverted.sum()
        if total == 0:
            # All scores tie: normalizing would be 0/0.  Zeros compare the
            # same way the old NaNs did in `random < prob` (never true).
            return np.zeros_like(inverted)
        return inverted / total * self.token_prob * num_words

    def reset_random_prob(self):
        """Generate many random numbers at the same time and cache them."""
        cache_len = 100000
        self.random_prob_cache = np.random.random(size=(cache_len,))
        # The cache is consumed from the last element down to index 0.
        self.random_prob_ptr = cache_len - 1

    def get_random_prob(self):
        """Get a random number from the cache, refilling it once exhausted."""
        value = self.random_prob_cache[self.random_prob_ptr]
        self.random_prob_ptr -= 1
        if self.random_prob_ptr == -1:
            self.reset_random_prob()
        return value