#!/usr/bin/env python
# coding: utf-8
# **1. Load JSON file**<br>
# **2. Data Exploration and Visualization**<br>
# **3. Select Variables and Convert into CSV**<br>
# **4. Text Preprocessing**
# > a) Convert to lowercase<br>
# > b) Transform links (tentative?)<br>
# > c) Remove punctuation<br>
# > d) Remove stopwords<br>
# > e) Lemmatize words (to root forms)<br>
####### 1. Loading JSON file #######
import numpy as np
import pandas as pd
import os
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
get_ipython().run_line_magic('matplotlib', 'inline')
inline_rc = dict(mpl.rcParams)
from tqdm import tqdm
#True: all data (about 8 mil); False: 500,000 entries
full_data = False
#load user review data
reviews = []
with open('data/yelp_academic_dataset_review.json') as f:
    for i, line in tqdm(enumerate(f)):
        reviews.append(json.loads(line))
        if not full_data and i + 1 >= 500000:
            break
df_review = pd.DataFrame(reviews)
df_review.tail()
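#Alternative sketch (not used above): pandas can read JSON Lines directly,
#and nrows with lines=True does the same sampling in one call
#(requires pandas >= 1.1):
#df_review = pd.read_json('data/yelp_academic_dataset_review.json',
#                         lines=True, nrows=500000)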
#load business data
biz=[]
with open('data/yelp_academic_dataset_business.json') as f1:
    for i, line in tqdm(enumerate(f1)):
        biz.append(json.loads(line))
        if not full_data and i + 1 >= 500000:
            break
df_biz = pd.DataFrame(biz)
df_biz.tail()
#load user data
user=[]
with open('data/yelp_academic_dataset_user.json') as f1:
    for i, line in tqdm(enumerate(f1)):
        user.append(json.loads(line))
        if not full_data and i + 1 >= 500000:
            break
df_user = pd.DataFrame(user)
df_user.tail()
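#missingno is imported above; a quick sketch of how it could be used here to
#inspect missing fields in the sampled frames before selecting variables:
msno.bar(df_review)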
####### 2. Data Exploration and Visualization #######
x=df_review['stars'].value_counts()
x=x.sort_index()
#plot star rating distribution
plt.figure(figsize=(6,5))
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Star Rating Distribution", fontsize=16)
plt.ylabel('Number of businesses')
plt.xlabel('Star Ratings')
#join with ',' so the last category of one business doesn't merge with the
#first category of the next
biz_cat = ','.join(df_biz['categories'].astype('str'))
cats = pd.DataFrame(biz_cat.split(','), columns=['categories'])
#strip the leading spaces left over from ", "-separated category strings
cats['categories'] = cats['categories'].str.strip()
#prep for chart
x = cats.categories.value_counts()
x = x.sort_values(ascending=False)
x = x.iloc[0:20]
#chart
plt.figure(figsize=(16,4))
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("Top business categories",fontsize=25)
locs, labels = plt.xticks()
plt.setp(labels, rotation=80)
plt.ylabel('Number of businesses', fontsize=12)
plt.xlabel('Category', fontsize=12)
plt.show()
####### 3. Select Variables and Convert into CSV #######
# Issues for consideration:<br>
# Are we going to pick an industry (e.g. cafe, restaurant, hair salon) and work with that subset of businesses,
# or ignore industry altogether?
# Replace business_id with business name
# Selected three variables: business_name, stars, text
df_comb=df_review.copy()
df_comb['business_name'] = df_comb['business_id'].map(df_biz.set_index('business_id')['name'])
df_comb = df_comb[['business_name','stars','text']]
df_comb
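#Sanity check (sketch): with sampled data, some review business_ids may not
#appear in df_biz, which leaves NaN names after the map above
df_comb['business_name'].isna().sum()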
#plot 20 most reviewed businesses
x=df_comb['business_name'].value_counts()
x=x.sort_values(ascending=False)
x=x.iloc[0:20]
#plot chart
plt.figure(figsize=(16,4))
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("20 Most Reviewed Businesses",fontsize=20)
locs, labels = plt.xticks()
plt.setp(labels, rotation=80)
plt.ylabel('Number of reviews', fontsize=12)
plt.xlabel('Business', fontsize=12)
plt.show()
### Conversion into CSV ###
#Convert review, business, user datasets into CSV
#df_review.to_csv('data/yelp_reviews.csv', index=False)
#df_biz.to_csv('data/yelp_business.csv', index=False)
#df_user.to_csv('data/yelp_user.csv', index=False)
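#Sketch (optional): for frames this size, compressed output saves disk space;
#pandas infers gzip compression from the extension (hypothetical filename):
#df_review.to_csv('data/yelp_reviews.csv.gz', index=False)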
####### 4. Text Preprocessing #######
#### Preprocessing steps: ####
# For sentiment analysis:
# >a) Convert to lowercase
# >b) Remove HTML
# >c) Collapse repeated characters
# >d) Remove punctuation & tokenize
# >e) Remove stopwords
# >f) Lemmatization/Stemming
import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
from tqdm.auto import tqdm, trange
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import string
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
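#Quick comparison of the two (lemmatization defaults to noun POS, so verbs
#pass through unchanged unless pos='v' is given; stemming is more aggressive):
#  lemmatizer.lemmatize('running')          -> 'running'
#  lemmatizer.lemmatize('running', pos='v') -> 'run'
#  stemmer.stem('running')                  -> 'run'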
#True: preprocessing for sentiment analysis
#False: preprocessing for text summarization
sentiment=True
def preprocess(s):
    if sentiment:
        #1. lowercase
        s = s.lower()
        #2. remove HTML tags
        html_free = BeautifulSoup(s, 'lxml').get_text()
        #3. collapse repeated characters (note: this also shortens
        #   legitimate doubles, e.g. "food" -> "fod")
        reg = re.sub(r'([a-z])\1+', r'\1', html_free)
        #4. remove punctuation & tokenize
        no_punct = "".join([c for c in reg if c not in string.punctuation])
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(no_punct)
        #5. remove stopwords (set lookup is much faster than a list scan)
        stop_words = set(stopwords.words('english'))
        filtered_words = [w for w in tokens if w not in stop_words]
        #6. lemmatize/stem words
        final_words = [lemmatizer.lemmatize(w) for w in filtered_words]
        #final_words = [stemmer.stem(w) for w in filtered_words]
    else:
        #1. lowercase
        s = s.lower()
        #2. remove HTML tags
        html_free = BeautifulSoup(s, 'lxml').get_text()
        #3. collapse repeated characters
        reg = re.sub(r'([a-z])\1+', r'\1', html_free)
        #4. tokenize only (keep stopwords and word forms for summarization)
        tokenizer = RegexpTokenizer(r'\w+')
        final_words = tokenizer.tokenize(reg)
    return " ".join(final_words)
tqdm.pandas()
df_pre = df_comb.copy()
df_pre['text'] = df_pre['text'].progress_map(preprocess)
#show a before & after comparison of the preprocessing
pd.DataFrame({'from': df_review['text'], 'to': df_pre['text']})
#save preprocessed data into CSV
df_pre.to_csv('data/yelp_pre.csv', index=False)
csv_df = pd.read_csv('data/yelp_pre.csv')
csv_df
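#Caveat (sketch): reviews that end up empty after preprocessing are written
#as empty strings and read back as NaN; worth checking before modeling:
csv_df['text'].isna().sum()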