markov_model.py
import re
import pickle

# word_tokenize needs the NLTK "punkt" tokenizer data: nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Load the source text (Moby Dick) and split it into whitespace-separated tokens.
with open('./MobyD.txt', encoding='utf-8') as f:
    text = f.read()
text_tokens = text.split()
def clean_txt(txt):
    """Lowercase each token, strip punctuation, and keep only alphabetic words."""
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        # Remove punctuation; the hyphen is escaped so it is not read as a character range.
        line = re.sub(r"[,.\"'!@#$%^&*(){}?/;`~:<>+=\-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt += words
    return cleaned_txt

cleaned_stories = clean_txt(text_tokens)
# print(cleaned_stories[:10])
def make_markov_model(cleaned_stories, n_gram=2):
    """Build a Markov chain whose states are n_gram-word strings.

    markov_model[curr_state][next_state] holds the probability of moving
    from curr_state to next_state.
    """
    markov_model = {}
    # Stop early enough that both curr_state and next_state fit inside the token list.
    for i in range(len(cleaned_stories) - 2 * n_gram + 1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i + j] + " "
            next_state += cleaned_stories[i + j + n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        # Count the transition curr_state -> next_state.
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    # Normalise the counts into transition probabilities.
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count / total
    return markov_model
markov_model = make_markov_model(cleaned_stories)

# Persist the model; 'wb' overwrites any previous dump instead of appending to it.
with open('MobyDick', 'wb') as dbfile:
    pickle.dump(markov_model, dbfile)
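
# --- Usage sketch (not part of the original script) ---
# A minimal example of how the model built above could be used to generate text
# by randomly walking the chain, weighting each step by transition probability.
# The start state "the whale" and the length limit are illustrative choices;
# any key present in markov_model would work as a starting point.
import random

def generate_text(markov_model, start, limit=20):
    """Walk the chain from `start`, sampling each next state by its probability."""
    curr_state = start
    story = curr_state + " "
    for _ in range(limit):
        if curr_state not in markov_model:
            break
        next_states = list(markov_model[curr_state].keys())
        probs = list(markov_model[curr_state].values())
        curr_state = random.choices(next_states, weights=probs)[0]
        story += curr_state + " "
    return story.strip()

# print(generate_text(markov_model, start="the whale", limit=20))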