forked from al20139471/Bodo-examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbeer-reviews.py
83 lines (64 loc) · 2.55 KB
/
beer-reviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
Source: https://medium.com/rapids-ai/real-data-has-strings-now-so-do-gpus-994497d55f8e
Analyze beer reviews to find most common words used in positive and negative reviews.
Usage:
mpiexec -n [cores] python beer-reviews.py
This example uses a sample reviews from S3 bucket (s3://bodo-example-data/beer/reviews_sample.csv).
Fulldataset is available in S3 bucket (s3://bodo-example-data/beer/reviews.csv)
or from Kaggle (https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews)
"""
import pandas as pd
import time
import bodo
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
# Create lists of stopwords and punctuation that will be removed
with open(f"{dir_path}/nltk-stopwords.txt", "r") as fh:
STOPWORDS = list(map(str.strip, fh.readlines()))
PUNCT_LIST = ["\.", "\-", "\?", "\:", ":", "!", "&", "'", ","]
# Define regex that will be used to remove these punctuation and stopwords from the reviews.
punc_regex = "|".join([f"({p})" for p in PUNCT_LIST])
stopword_regex = "|".join([f"\\b({s})\\b" for s in STOPWORDS])
@bodo.jit
def preprocess(reviews):
# lowercase and strip
reviews = reviews.str.lower()
reviews = reviews.str.strip()
# remove punctuation and stopwords
reviews = reviews.str.replace(punc_regex, "", regex=True)
reviews = reviews.str.replace(stopword_regex, "", regex=True)
return reviews
@bodo.jit
def find_top_words(review_filename):
# Load in the data
t_start = time.time()
df = pd.read_csv(review_filename, parse_dates=[2])
print("read time", time.time() - t_start)
score = df.score
reviews = df.text
t1 = time.time()
reviews = preprocess(reviews)
print("preprocess time", time.time() - t1)
t1 = time.time()
# create low and high score series
low_threshold = 1.5
high_threshold = 4.95
high_reviews = reviews[score > high_threshold]
low_reviews = reviews[score <= low_threshold]
high_reviews = high_reviews.dropna()
low_reviews = low_reviews.dropna()
high_colsplit = high_reviews.str.split()
low_colsplit = low_reviews.str.split()
print("high/low time", time.time() - t1)
t1 = time.time()
high_words = high_colsplit.explode()
low_words = low_colsplit.explode()
top_words = high_words.value_counts().head(25)
low_words = low_words.value_counts().head(25)
print("value_counts time", time.time() - t1)
print("total time", time.time() - t_start)
print("Top words: ")
print(top_words)
print("Low words: ")
print(low_words)
find_top_words("s3://bodo-example-data/beer/reviews_sample.csv")