-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrossmann_visualizations.r
284 lines (207 loc) · 10.1 KB
/
rossmann_visualizations.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
library(ggplot2)
library(zoo)
library(forecast)
library(tidyverse)
library(data.table)
library(skimr)
# Load the three raw CSV files as base data.frames for a first inspection.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
test <- read.csv("C:/Users/dubey/OneDrive/Desktop/IIT COURSES/DATA-ANYLITCS/PROJECT/rossmann-store-sales/original data/test.csv", header = TRUE)
train <- read.csv("C:/Users/dubey/OneDrive/Desktop/IIT COURSES/DATA-ANYLITCS/PROJECT/rossmann-store-sales/original data/train.csv", header = TRUE)
store <- read.csv("C:/Users/dubey/OneDrive/Desktop/IIT COURSES/DATA-ANYLITCS/PROJECT/rossmann-store-sales/original data/store.csv", header = TRUE)

# Total number of missing values in each data frame
sum(is.na(test))
sum(is.na(train))
sum(is.na(store))

# First 5 rows of each data frame
head(train, 5)
head(store, 5)
head(test, 5)

# Column names and types of each data frame
str(train)
str(test)
str(store)

# Summary statistics: base summary() first, then skimr's skim()
summary(train)
summary(test)
summary(store)
skim(train)
skim(test)
skim(store)

# Number of rows and columns in each data frame
dim(train)
dim(test)
dim(store)

# Transposed preview: one line per column, with its values running across
glimpse(train)
glimpse(test)
glimpse(store)
# Read a file with fread() and convert its `Date` column with as.Date(),
# replacing the column in place.
#
# f: passed unchanged to fread().
# Returns the table produced by fread(), carrying the converted `Date` column.
data_function <- function(f) {
  loaded <- fread(f)
  loaded[, Date := as.Date(Date)]
  loaded
}
# Re-read train and test through data_function(), which loads them with
# fread() and converts the Date column; this replaces the read.csv() versions.
train <- data_function("C:/Users/dubey/OneDrive/Desktop/IIT COURSES/DATA-ANYLITCS/PROJECT/rossmann-store-sales/original data/train.csv")
test <- data_function("C:/Users/dubey/OneDrive/Desktop/IIT COURSES/DATA-ANYLITCS/PROJECT/rossmann-store-sales/original data/test.csv")

# Number of distinct values in each column
train %>% summarise_all(n_distinct)
test %>% summarise_all(n_distinct)

# Number of test stores that also appear in the train data.
# (The former `c <- unique(c(...))` assignment was never used and reused the
# name of base::c, so it has been dropped.)
sum(unique(test$Store) %in% unique(train$Store))

# Number of train stores that do not appear in the test data
sum(!(unique(train$Store) %in% unique(test$Store)))
# Fraction of the rows of `data` whose `column` matches `level`.
# The result is a share between 0 and 1, not a percentage. Matching uses
# %in%, so NA entries never count as a match.
share_of <- function(data, column, level) {
  sum(data[[column]] %in% level) / nrow(data)
}

# Frequency of each Open value in train
table(train$Open)

# Share of closed (0) and open (1) rows in train
share_of(train, "Open", 0)
share_of(train, "Open", 1)

# Share of closed (0) and open (1) rows in test
share_of(test, "Open", 0)
share_of(test, "Open", 1)

# Frequency of each Promo value in train
table(train$Promo)

# Share of rows with (1) and without (0) a promo in train
share_of(train, "Promo", 1)
share_of(train, "Promo", 0)

# Share of rows with (1) and without (0) a promo in test
share_of(test, "Promo", 1)
share_of(test, "Promo", 0)

# Frequency of each StateHoliday value in train
table(train$StateHoliday)

# Share of train rows whose StateHoliday is 0
share_of(train, "StateHoliday", 0)

# Frequency of each StateHoliday value in test
table(test$StateHoliday)

# Share of test rows whose StateHoliday is 0, then whose StateHoliday is 1
share_of(test, "StateHoliday", 0)
share_of(test, "StateHoliday", 1)

# Share of train rows without (0) and with (1) a school holiday
share_of(train, "SchoolHoliday", 0)
share_of(train, "SchoolHoliday", 1)

# Share of test rows without (0) and with (1) a school holiday
share_of(test, "SchoolHoliday", 0)
share_of(test, "SchoolHoliday", 1)
# Plot the Date column of train and of test as points
plot(train$Date, type = "p")
plot(test$Date, type = "p")
# Check that every date in the test data has exactly 856 rows (prints TRUE if so)
all(table(test$Date) == 856)
# Fill all missing values: count the NA cells, replace every NA cell in train
# (whatever its column) with 1, then count again to confirm none remain.
# NOTE(review): one fill value is used for all columns - confirm that 1 is the
# intended replacement for each of them.
sum(is.na(train))
train[is.na(train)] <- 1
sum(is.na(train))
# Histograms of sales and customers.
# hist() puts the plotted variable on the x-axis and a frequency on the
# y-axis, so the axis labels below name exactly those quantities.

# Distribution of Sales over all rows of train (zero-sales rows included)
hist(train$Sales,
  main = "Histogram for sales count in stores",
  xlab = "Sales", ylab = "Frequency",
  border = "black", col = "green", breaks = 10
)

# Rows with non-zero sales, computed once for the two per-store histograms
nonzero_sales <- train[Sales != 0]

# Distribution of each store's mean Sales over its non-zero-sales rows
hist(aggregate(nonzero_sales$Sales, by = list(nonzero_sales$Store), mean)$x,
  main = "Average sales per store when opened",
  xlab = "Average sales per store", ylab = "Number of stores",
  border = "black", col = "green", breaks = 100
)

# Distribution of Customers over all rows of train
hist(train$Customers,
  main = "Histogram for customers count in stores",
  xlab = "Customers", ylab = "Frequency",
  border = "black", col = "green", breaks = 100
)

# Distribution of each store's mean Customers over its non-zero-sales rows
hist(aggregate(nonzero_sales$Customers, by = list(nonzero_sales$Store), mean)$x,
  main = "Avg customers per store when opened",
  xlab = "Average customers per store", ylab = "Number of stores",
  border = "black", col = "green", breaks = 100
)
# Subsets shared by the plots and figures below
sold <- train[Sales != 0]
sold_with_customers <- train[Sales != 0 & Customers != 0]

# Sales with and without a school holiday (rows with non-zero sales)
ggplot(sold, aes(x = factor(SchoolHoliday), y = Sales)) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(color = "red", outlier.colour = NA, fill = NA, width = 0.4)

# Sales against number of customers, both on a log scale
ggplot(sold_with_customers, aes(x = log(Customers), y = log(Sales))) +
  geom_point(alpha = 0.2) +
  geom_smooth()

# Sales with and without a promo
ggplot(sold_with_customers, aes(x = factor(Promo), y = Sales)) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(color = "red", outlier.colour = NA, fill = NA)

# Customers with and without a promo
ggplot(sold_with_customers, aes(x = factor(Promo), y = Customers)) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(color = "red", outlier.colour = NA, fill = NA)

# Mean sales per customer without a promo, then with a promo
sold[Promo == 0, mean(Sales / Customers)]
sold[Promo == 1, mean(Sales / Customers)]

# Cross-tabulation of promo / no promo against non-zero / zero sales
table(
  ifelse(train$Promo, "Promo", "No promo"),
  ifelse(train$Sales != 0, "Sales > 0", "Sales = 0")
)
# How are Sales w.r.t. opening of a store: cross-tabulate opened / closed
# against positive / zero sales
table(ifelse(train$Open == 1, "Opened", "Closed"),
ifelse(train$Sales > 0, "Sales > 0", "Sales = 0"))
# Rows where the store was open but recorded zero sales
train[Open == 1 & Sales == 0]
# Number of zero-sales rows per store, sorted in increasing order
df1 <- sort(tapply(train$Sales, list(train$Store), function(y) sum(y == 0)))
hist(df1,100)
# Dimension of df1. df1 is the sorted tapply() result (one count per store),
# not a data frame.
# NOTE(review): dim() may print NULL here; length(df1) gives the number of
# stores - confirm which was intended.
dim(df1)
# Stores with the fewest and the most zero-sales rows
head(df1, 5); tail(df1, 5)
# How sales change from row to row for three stores that have zero sales at
# least once
for (sid in c(349, 674, 103)) {
  plot(train[Store == sid, Sales],
    ylab = "Sales Count", xlab = "Days", main = paste("Store", sid)
  )
}

# Sales over time for a single store; rows with DayOfWeek == 7 get their own
# colour and point shape.
plot_store_sales <- function(store_id) {
  ggplot(
    train[Store == store_id],
    aes(
      x = Date, y = Sales,
      color = factor(DayOfWeek == 7), shape = factor(DayOfWeek == 7)
    )
  ) +
    geom_point(size = 3) +
    ggtitle(paste0("Sales of store ", store_id, " (True if sunday)"))
}
plot_store_sales(335)
plot_store_sales(423)

# Sales by day of week (rows with non-zero sales)
ggplot(train[Sales != 0], aes(x = factor(DayOfWeek), y = Sales)) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(color = "yellow", outlier.colour = NA, fill = NA)
# Overview of the store table: summary, category counts and the
# distribution of the distance to the nearest competitor
summary(store)
table(store$StoreType)
table(store$Assortment)
table(data.frame(Assortment = store$Assortment, StoreType = store$StoreType))
hist(store$CompetitionDistance, 100)

# Year and month in which the competition opened, combined into a zoo yearmon
store$CompetitionOpenSince <- as.yearmon(paste(store$CompetitionOpenSinceYear,
  store$CompetitionOpenSinceMonth,
  sep = "-"
))
hist(as.yearmon("2015-10") - store$CompetitionOpenSince, 100,
  main = "Years since opening of nearest competition"
)

# Start of promo2, built from its year and week number plus weekday 1
store$Promo2Since <- as.POSIXct(
  paste(store$Promo2SinceYear, store$Promo2SinceWeek, 1, sep = "-"),
  format = "%Y-%U-%u"
)

# Subtracting two POSIXct values lets difftime choose its own units from the
# smallest difference, so the unit is requested explicitly to keep the
# histogram in days, as its title states.
promo2_age_days <- difftime(
  as.POSIXct("2015-10-01", format = "%Y-%m-%d"),
  store$Promo2Since,
  units = "days"
)
hist(as.numeric(promo2_age_days), 100,
  main = "Days since start of promo2", xlab = "Days"
)

table(store$PromoInterval)
# Attach the store attributes to every train row
train_store <- merge(train, store, by = "Store")

# Rows with non-zero sales, reused by most plots below
with_sales <- train_store[Sales != 0]

# Sales by promo interval
ggplot(with_sales, aes(x = factor(PromoInterval), y = Sales)) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(color = "yellow", outlier.colour = NA, fill = NA)

# Smoothed sales over time, one curve per store type
ggplot(
  with_sales,
  aes(x = as.Date(Date), y = Sales, color = factor(StoreType))
) +
  geom_smooth(size = 2)

# Smoothed customers over time, one curve per store type
# (rows with non-zero customers)
ggplot(
  train_store[Customers != 0],
  aes(x = as.Date(Date), y = Customers, color = factor(StoreType))
) +
  geom_smooth(size = 2)

# Smoothed sales over time, one curve per assortment
ggplot(
  with_sales,
  aes(x = as.Date(Date), y = Sales, color = factor(Assortment))
) +
  geom_smooth(size = 2)

# Smoothed customers over time, one curve per assortment
# (rows with non-zero sales)
ggplot(
  with_sales,
  aes(x = as.Date(Date), y = Customers, color = factor(Assortment))
) +
  geom_smooth(size = 2)

# Mean sales for each distinct competition distance, on a log-log plot
with_sales_and_dist <- with_sales[!is.na(CompetitionDistance)]
salesByDist <- aggregate(
  with_sales_and_dist$Sales,
  by = list(with_sales_and_dist$CompetitionDistance),
  mean
)
colnames(salesByDist) <- c("CompetitionDistance", "MeanSales")
ggplot(salesByDist, aes(x = log(CompetitionDistance), y = log(MeanSales))) +
  geom_point() +
  geom_smooth()

# Sales split by whether a competition opening year is recorded
ggplot(
  with_sales,
  aes(x = factor(!is.na(CompetitionOpenSinceYear)), y = Sales)
) +
  geom_jitter(alpha = 0.1) +
  geom_boxplot(color = "yellow", outlier.colour = NA, fill = NA) +
  ggtitle("Any competition?")
# Month of each row's Date as a zoo yearmon; beforeAndAfterComp() compares it
# with CompetitionOpenSince
train_store$DateYearmon <- as.yearmon(train_store$Date)
# Sort the rows by Date, so each store's rows are in date order when
# beforeAndAfterComp() works with row positions
train_store <- train_store[order(Date)]
# Number of rows beforeAndAfterComp() keeps on each side of the competition
# opening
timespan <- 100
# Extract a window of rows around the opening of a store's competition.
#
# s: a Store id. Rows are read from the global `train_store`, which the
#    script sorts by Date beforehand; the global `timespan` gives the number
#    of rows kept on each side of the opening.
#
# Returns the store's rows from `timespan` rows before to `timespan` rows
# after the last row that is not later than the competition's opening month,
# with an added `Day` column numbering those rows from 1. Returns NULL when
# no row falls after the opening month, or when the opening lies within
# `timespan` rows of either end of the store's data.
beforeAndAfterComp <- function(s) {
  x <- train_store[Store == s]
  # TRUE for rows whose month is not later than the opening month
  beforeOpening <- x$CompetitionOpenSince >= x$DateYearmon
  # na.rm = TRUE: a missing opening month makes every comparison NA, which
  # would otherwise make if () fail; such a store simply has no window
  if (!any(!beforeOpening, na.rm = TRUE)) {
    return(NULL)
  }
  # Position of the last row before the first row that follows the opening
  compOpening <- head(which(!beforeOpening), 1) - 1
  # Scalar conditions, so the short-circuit operator is the right one
  if (compOpening <= timespan || compOpening >= (nrow(x) - timespan)) {
    return(NULL)
  }
  x <- x[(compOpening - timespan):(compOpening + timespan), ]
  x$Day <- seq_len(nrow(x))
  x
}
# Run beforeAndAfterComp() for every store that has a CompetitionOpenSince
# value and stack the returned windows into one table
comp_stores <- unique(train_store[!is.na(CompetitionOpenSince), Store])
temp <- do.call(rbind, lapply(comp_stores, beforeAndAfterComp))

# Number of stores that contributed a window
length(unique(temp[["Store"]]))

# Smoothed sales across the window (rows with non-zero sales)
ggplot(temp[Sales != 0], aes(x = Day, y = Sales)) +
  geom_smooth() +
  ggtitle(paste("Competition opening around day", timespan))
# Seasonal plot of monthly sales, where each row's sales are expressed as a
# percentage of that store's mean sales before averaging per month.

# copy() gives an independent table, so the helper columns added by
# reference below can never end up in `train`.
temp <- copy(train)
temp[, `:=`(year = format(Date, "%Y"), month = format(Date, "%m"))]
temp[, StoreMean := mean(Sales), by = Store]

# keyby (rather than by) returns the groups sorted by year and month. Plain
# `by` keeps the order in which groups first appear in the data, while ts()
# below needs the values in chronological order.
temp <- temp[, .(MonthlySalesMean = mean(Sales / (StoreMean)) * 100),
  keyby = .(year, month)
]
temp <- as.data.frame(temp)

# The series starts at the first year and month present in the data
SalesTS <- ts(temp$MonthlySalesMean,
  start = c(as.integer(temp$year[1]), as.integer(temp$month[1])),
  frequency = 12
)
season_cols <- rainbow(3)
seasonplot(SalesTS, col = season_cols, year.labels.left = TRUE, pch = 19, las = 1)