-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlda.py
54 lines (44 loc) · 2.23 KB
/
lda.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from gensim import models, corpora
from kiwipiepy import Kiwi
import functions as funcs
import preprocess as prep
# Module-level setup: runs once at import time.
# Ask the project's preprocess module to build the user dictionary
# (presumably this writes 'user_dictionary.txt', loaded below — TODO confirm).
prep.make_user_dictionary()
# Shared Kiwi morphological analyzer; LDA_Model.get_lda_model tokenizes with it.
kiwi = Kiwi()
# Register the custom vocabulary so the analyzer recognises project-specific terms.
kiwi.load_user_dictionary('user_dictionary.txt')
# NOTE(review): prepare() may be deprecated or unnecessary in recent kiwipiepy
# releases — confirm against the installed version before removing.
kiwi.prepare()
class LDA_Model():
    """Build gensim LDA topic models from morpheme-analysed documents.

    The pipeline is: filter tokens by part-of-speech tag, build a
    bag-of-words corpus, re-weight it with TF-IDF, and train an LDA model.
    """

    def __init__(self):
        """Define the part-of-speech tags used when filtering tokens."""
        # Main POS tags: only tokens carrying one of these tags are kept.
        self.주요품사 = ["NNG", "NNP", "VV", "VA", "XR"]
        # Predicate POS tags: kept forms with these tags get "다" appended.
        self.용언품사 = ["VV", "VA"]

    def read_documents(self, df, col):
        """Collect the main-POS tokens of every analysed document.

        Args:
            df: DataFrame holding the morpheme-analysis results.
            col: Name of the column whose cells are iterables of tokens
                exposing ``form`` and ``tag`` attributes (or a falsy value
                such as ``None`` / an empty list for "no analysis").

        Returns:
            A list with one entry per row whose cell is truthy. Each entry
            is the list of kept token forms; forms tagged as predicates have
            "다" appended. A row whose tokens are all filtered out still
            contributes an empty list.
        """
        문서리스트 = []
        # Iterate over the column directly; the row index is never needed.
        for tokens in df[col]:
            if not tokens:
                # Skip rows without an analysis result (None or empty).
                continue
            문서리스트.append([
                token.form + "다" if token.tag in self.용언품사 else token.form
                for token in tokens
                if token.tag in self.주요품사
            ])
        return 문서리스트

    def build_doc_term_mat(self, 문서리스트):
        """Build the document-term matrix for the given documents.

        Args:
            문서리스트: List of documents, each a list of token strings.

        Returns:
            ``(corpus, dictionary)`` where ``dictionary`` is the
            ``corpora.Dictionary`` built from the documents and ``corpus``
            holds the ``doc2bow`` result of each document.
        """
        dictionary = corpora.Dictionary(문서리스트)
        corpus = [dictionary.doc2bow(문서) for 문서 in 문서리스트]
        return corpus, dictionary

    def build_corpus_tfidf(self, corpus):
        """Convert a document-term matrix into its TF-IDF weighted form.

        Args:
            corpus: Bag-of-words corpus as returned by ``build_doc_term_mat``.

        Returns:
            The corpus wrapped by a ``models.TfidfModel`` fitted on it.
        """
        tfidf = models.TfidfModel(corpus)
        return tfidf[corpus]

    def get_lda_model(self, df, company_name, year, col, num_topics, passes):
        """Train an LDA model on one company's documents for a single year.

        Args:
            df: Source DataFrame handed to ``funcs.get_comp``; the selected
                rows must provide ``col`` and a ``year`` column.
            company_name: Company selector forwarded to ``funcs.get_comp``.
            year: Value interpolated into the ``year == ...`` query.
            col: Name of the text column to preprocess and analyse.
            num_topics: Number of topics for the LDA model.
            passes: Number of training passes for the LDA model.

        Returns:
            ``(model, corpus, dictionary, doc_list)``: the trained model
            (fitted on the TF-IDF weighted corpus), the raw bag-of-words
            corpus, the dictionary, and the filtered token lists.
        """
        df_comp = funcs.get_comp(df, company_name)
        # Take an explicit copy: columns are assigned below, and writing to a
        # frame derived by selection/query triggers pandas' chained-assignment
        # (SettingWithCopy) warning and has ambiguous semantics.
        df_year = df_comp[[col, 'year']].query(f'year == {year}').copy()
        df_year[col] = df_year[col].apply(prep.preprocess_text)
        # Only strings can be tokenized; anything else is marked as missing so
        # read_documents skips it.
        df_year['morpheme'] = df_year[col].apply(
            lambda text: kiwi.tokenize(text) if isinstance(text, str) else None
        )
        doc_list = self.read_documents(df_year, "morpheme")
        corpus, dictionary = self.build_doc_term_mat(doc_list)
        corpus_tfidf = self.build_corpus_tfidf(corpus)
        model = models.ldamodel.LdaModel(
            corpus_tfidf,
            num_topics=num_topics,
            id2word=dictionary,
            alpha=1,
            passes=passes,
        )
        return model, corpus, dictionary, doc_list