-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_import.R
142 lines (119 loc) · 4.8 KB
/
data_import.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
library(mlr)
library(OOBCurve)
library(tidyverse)
library(lubridate)
library(dplyr)
library(purrr)
library(ggplot2)
library(scales)
library(stringr)
library(tidytext)
library(glmnet)
library(tm)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
library(data.table)
library(ROCR)
library(leaps)
library(xgboost)
##### Data import and descriptive statistics
### Data import
# Build one archive URL per year (2009-2017), read each JSON file into a data
# frame, row-bind them, and parse the `created_at` strings into date-times.
url <- 'http://www.trumptwitterarchive.com/data/realdonaldtrump/%s.json'
all_tweets <- map(2009:2017, ~sprintf(url, .x)) %>%
  map_df(jsonlite::fromJSON, simplifyDataFrame = TRUE) %>%
  mutate(created_at = parse_date_time(created_at, "a b! d! H!:M!:S! z!* Y!")) %>%
  # as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
  as_tibble()
### Restriction to Twitter data and definition of iPhone/Android fields
# Keep only tweets whose source is the iPhone or Android client, then shorten
# the source labels to "iPhone" / "Android".
tweets <- all_tweets %>%
  filter(source %in% c("Twitter for iPhone", "Twitter for Android")) %>%
  select(id_str, source, text, created_at) %>%
  mutate(
    source = ifelse(
      source == "Twitter for iPhone",
      "iPhone",
      ifelse(source == "Twitter for Android", "Android", source)
    )
  )
### Descriptive plots at the aggregate level
# Share of each source's 2015-2016 tweets posted in each hour of the day (EST).
tweets %>%
  mutate(created_est = with_tz(created_at, "EST")) %>%
  filter(year(created_est) > 2014, year(created_est) < 2017) %>%
  count(source, hour = hour(created_est)) %>%
  # Normalise within each source. count() returns ungrouped data when its
  # input is ungrouped, so without this group_by() the percentages would be
  # shares of all tweets rather than "per source" as the title says.
  group_by(source) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(hour, percent, color = source)) +
  geom_line(lwd = 2) +
  scale_y_continuous(labels = percent_format()) +
  theme(
    title = element_text(size = 18),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    legend.text = element_text(size = 18)
  ) +
  labs(
    title = "Proportion of tweets by time of day, per source",
    x = "Hour of day (EST)",
    y = "% of tweets",
    color = ""
  ) +
  scale_color_brewer(palette = "Set1")
# Number of tweets per source, split by whether the text starts with a
# double quotation mark.
tweets %>%
  mutate(quoted = ifelse(str_detect(text, '^"'), "Quoted", "Not quoted")) %>%
  count(source, quoted) %>%
  ggplot(aes(x = source, y = n, fill = quoted)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(
    title = element_text(size = 18),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    legend.text = element_text(size = 18)
  ) +
  labs(
    title = 'Whether tweets start with a quotation mark (")',
    x = "",
    y = "Number of tweets",
    fill = ""
  ) +
  scale_fill_brewer(palette = "Dark2")
### Restriction to the period of interest
# Keep tweets strictly after 2015-06-01 and strictly before 2017-03-01.
# The bounds are explicit UTC instants: comparing a POSIXct column with a bare
# string converts the string in the session's local time zone, so the cut-offs
# would otherwise shift with the machine the script runs on.
# NOTE(review): UTC is chosen because parse_date_time() defaults to UTC when
# `created_at` is parsed above -- confirm this is the intended reference zone.
period_start <- as.POSIXct("2015-06-01", tz = "UTC")
period_end <- as.POSIXct("2017-03-01", tz = "UTC")
tweets <- tweets %>%
  filter(created_at > period_start, created_at < period_end)
### Descriptive plots at the word level
# Preparation of the dataset
# fromiPhone is an integer flag: 1L when source is "iPhone", 0L otherwise
tweets.source <- tweets %>%
  mutate(fromiPhone = as.integer(source == "iPhone")) %>%
  select(source, fromiPhone, text, created_at)
# Break down tweets into one word per row and count occurrences per
# word / source flag / timestamp / source.
# `reg` is the split pattern: any character that is not a letter, digit, #, @
# or apostrophe, or an apostrophe not followed by one of those characters.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
tweet_words <- tweets.source %>%
  # drop tweets that start with a double quote
  filter(!str_detect(text, '^"')) %>%
  mutate(
    # replace links (and "&") by the token "http"
    text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&", "http"),
    # remove apostrophes
    text = str_replace_all(text, "'", ""),
    # manual stemming
    text = str_replace_all(text, "badly", "bad")
  ) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  # discard stop words and tokens without any lowercase letter
  filter(
    !word %in% stop_words$word,
    str_detect(word, "[a-z]")
  ) %>%
  count(word, fromiPhone, created_at, source) %>%
  data.frame()
# For every word counted at least 40 times overall: its share of all retained
# iPhone word occurrences, its share of all retained Android word occurrences,
# and the log2 ratio Android/iPhone of those shares (capped at +/-10 when one
# side is zero). Rows are sorted by decreasing log ratio.
Android_iPhone_ratios <- tweet_words %>%
  group_by(word) %>%
  filter(sum(n) >= 40) %>%
  # one column of counts per source; missing combinations become 0
  spread(source, n, fill = 0) %>%
  ungroup() %>%
  mutate(
    ID.iPhone = iPhone / sum(iPhone),
    ID.iPhone = ifelse(is.na(ID.iPhone), 0, ID.iPhone),
    ID.Android = Android / sum(Android),
    ID.Android = ifelse(is.na(ID.Android), 0, ID.Android)
  ) %>%
  group_by(word) %>%
  summarise(
    ID.iPhone = sum(ID.iPhone),
    ID.Android = sum(ID.Android)
  ) %>%
  ungroup() %>%
  mutate(
    logratio = log2(ID.Android / ID.iPhone),
    # the iPhone == 0 rule is applied last so that it takes precedence
    logratio = ifelse(ID.Android == 0, -10, logratio),
    logratio = ifelse(ID.iPhone == 0, 10, logratio)
  ) %>%
  arrange(desc(logratio))
# Horizontal bar chart of the 20 words with the largest positive
# Android/iPhone log ratio.
Android_iPhone_ratios %>%
  filter(logratio > 0) %>%
  top_n(20, logratio) %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(x = word, y = logratio)) +
  geom_bar(stat = "identity", fill = "red", show.legend = FALSE) +
  coord_flip() +
  ylim(0, 10) +
  labs(x = "", y = "Android/iPhone log ratio") +
  theme(
    axis.text.x = element_text(size = 18),
    axis.text.y = element_text(size = 18)
  )
# Horizontal bar chart of the 20 words with the most negative
# Android/iPhone log ratio.
Android_iPhone_ratios %>%
  filter(logratio < 0) %>%
  top_n(20, -logratio) %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(x = word, y = logratio)) +
  geom_bar(stat = "identity", fill = "lightblue", show.legend = FALSE) +
  coord_flip() +
  ylim(-10, 0) +
  labs(x = "", y = "Android/iPhone log ratio") +
  theme(
    axis.text.x = element_text(size = 18),
    axis.text.y = element_text(size = 18)
  )
##### Preparation of structured dataset
### Defining dependent variable and restriction of data sample
# TrumpWrote is an integer flag: 0L when source is "iPhone", 1L otherwise
tweets.trump <- tweets %>%
  mutate(TrumpWrote = as.integer(source != "iPhone")) %>%
  select(TrumpWrote, text, created_at)
# Write the modelling dataset to disk
write.csv(x = tweets.trump, file = "trump_tweets.csv")