knitr::opts_chunk$set(echo = TRUE)
library(readr)
data <- read_csv("happy.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("ID", "income", "happiness")
head(data)
model <- lm(happiness ~ income, data = data)
model_summary <- summary(model)
model_summary
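# Added sketch (not in the original history): a quick look at the point
# estimates and their 95% confidence intervals for the fitted coefficients.
coef(model)
confint(model, level = 0.95)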
# Check the current working directory.
getwd()
# Install the e1071 package (run install.packages() just once, if needed).
#install.packages("e1071")
library(e1071)
library(ggplot2)
# Using the built-in cars dataset (speed in mph, stopping distance in ft); show the first few rows.
head(cars)
# scatterplot
scatter.smooth(x=cars$speed, y=cars$dist, main="Dist ~ Speed")
# partition plot window into one row of two columns (one for each of the
# next two boxplots).
par(mfrow=c(1, 2))
# box plot for 'speed'
boxplot(cars$speed,
main="Speed", sub=paste("Outliers: ", boxplot.stats(cars$speed)$out))
# box plot for 'distance'
boxplot(cars$dist,
main="Distance", sub=paste("Outliers: ", boxplot.stats(cars$dist)$out))
# density plot for 'speed'
plot(density(cars$speed),
main="Density Plot: Speed",
ylab="Density", sub=paste("Skewness:", round(e1071::skewness(cars$speed), 2)))
polygon(density(cars$speed), col="red")
# density plot for 'dist'
plot(density(cars$dist),
main="Density Plot: Distance",
ylab="Density", sub=paste("Skewness:", round(e1071::skewness(cars$dist), 2)))
polygon(density(cars$dist), col="red")
# calculate correlation between speed and distance
cor(cars$speed, cars$dist)
# build linear regression model on full data
linearMod <- lm(dist ~ speed, data=cars)
print(linearMod)
summary(linearMod)
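# Side note (added, not in the original history): for simple linear regression,
# the squared correlation between the two variables equals the model's R-squared.
cor(cars$speed, cars$dist)^2
summary(linearMod)$r.squared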
# capture model summary as an object
modelSummary <- summary(linearMod)
# model coefficients
modelCoeffs <- modelSummary$coefficients
modelCoeffs
# get beta estimate for speed
beta.estimate <- modelCoeffs["speed", "Estimate"]
beta.estimate
# get std.error for speed
std.error <- modelCoeffs["speed", "Std. Error"]
std.error
# calc t statistic (compare with model summary).
t_value <- beta.estimate/std.error
t_value
# calc p Value (compare with model summary).
p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars))
p_value
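# Added sketch: a manual 95% confidence interval for the speed coefficient,
# built from the same t distribution; compare with confint(linearMod).
beta.estimate + c(-1, 1) * qt(0.975, df = nrow(cars) - 2) * std.error
confint(linearMod)["speed", ]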
# Holdout method - Create Training and Test data.
# set seed for reproducible results.
set.seed(2023)
# Get random row indices for the training data, using an 80:20 split.
trainingRowIndex <- sample(1:nrow(cars), 0.8*nrow(cars))
# Model training data
trainingData <- cars[trainingRowIndex, ]
# Test data
testData <- cars[-trainingRowIndex, ]
# Build the model on the training data.
lmMod <- lm(dist ~ speed, data=trainingData)
# model summary
summary(lmMod)
# predict distance using test data.
distPred <- predict(lmMod, testData)
distPred
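# Added sketch: two simple accuracy measures on the hold-out set -- the
# correlation between actual and predicted distances, and the mean absolute
# percentage error (MAPE).
actuals_preds <- data.frame(actuals = testData$dist, predicted = distPred)
cor(actuals_preds$actuals, actuals_preds$predicted)
mean(abs(actuals_preds$predicted - actuals_preds$actuals) / actuals_preds$actuals)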
# set default plot window
par(mfrow=c(1, 1))
# plot predicted points.
plot(testData$speed, distPred, pch=16, col="red")
# plot actual points.
points(testData$speed, testData$dist, pch=16, col="green")
# Make the same plot using ggplot; there are many ways to do this.
# Here we start by creating two separate data frames.
testData_pred <- data.frame(testData$speed, distPred = distPred, Type = "Predicted")
testData_actual <- data.frame(testData$speed, dist = testData$dist, Type = "Actual")
# Rename the columns to be the same
names(testData_pred) <- c("speed", "dist", "Type")
names(testData_actual) <- c("speed", "dist", "Type")
# Combine the data frames
allData <- rbind(testData_pred, testData_actual)
# Plotting the data
ggplot(allData, aes(x = speed, y = dist, color = Type)) +
geom_point(shape = 16) +
scale_color_manual(values = c("Predicted" = "red", "Actual" = "green")) +
labs(color = "Braking Distance") +
xlab("Speed (km/h)") +
ylab("Distance (m)") +
ggtitle("Speed vs Distance") +
theme(legend.position = c(.25, .95), legend.justification = c(1, 1))
# Your turn now - analyse the Anscombe quartet dataset by
# regressing y1 on x1, y2 on x2 and so on.
# Perform your analysis in an R-Markdown HTML or PDF report.
# Please feel free to consult with your tutors, Kathy and Jerry, or your fellow students if
# you have any questions.
anscombe
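# One possible starting point for the exercise (added sketch, not a full
# solution): fit the four regressions in a loop and compare their coefficients.
anscombe_fits <- lapply(1:4, function(i)
  lm(reformulate(paste0("x", i), response = paste0("y", i)), data = anscombe))
lapply(anscombe_fits, coef)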
knitr::opts_chunk$set(echo = TRUE)
library(readr)
data <- read_csv("happy.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("ID", "income", "happiness")
head(data)
model <- lm(happiness ~ income, data = data)
model_summary <- summary(model)
model_summary
library(ggplot2)
# Obtain the residuals from the model
residuals <- resid(model)
# Create scatter plot of residuals vs. fitted values
ggplot(data, aes(x = fitted(model), y = residuals)) +
geom_point() +
labs(x = "Fitted Values", y = "Residuals") +
ggtitle("Residuals vs. Fitted Values")
# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)
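# Added check (an assumption, not part of the original analysis): a Shapiro-Wilk
# test of the residuals as a numeric complement to the Q-Q plot.
shapiro.test(residuals)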
# Plot the observations
par(mfrow=c(1, 1))
plot <- ggplot(data, aes(x = income, y = happiness)) +
geom_point(color = "blue") +
labs(x = "Income", y = "Happiness") +
ggtitle("Scatter Plot of Income vs. Happiness")
# Add the regression line and equation
plot_with_line <- plot +
geom_smooth(method = "lm", se = FALSE, color = "red") +
annotate(
"text", x = max(data$income) - 1, y = max(data$happiness) - 1,
label = paste("Happiness ≈", round(coefficients(model)[1], 3),
"+", round(coefficients(model)[2], 3), "* Income"),
color = "black", size = 4, hjust = 2
)
# Display the plot
print(plot_with_line)
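# Added sketch: use the fitted model to predict happiness at a few hypothetical
# income values (chosen for illustration only), with 95% confidence intervals.
new_income <- data.frame(income = c(2, 4, 6))
predict(model, newdata = new_income, interval = "confidence")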
library(dplyr)
data <- read_csv("calcofi.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("temp", "salnty")
head(data)
# Remove missing values if any
data <- na.omit(data)
# set seed for random sampling
set.seed(1)
sample_size <- round(0.1 * nrow(data))
sample_data <- data %>% sample_n(sample_size)
# Conduct the linear regression
model <- lm(salnty ~ temp, data = sample_data)
# Get the model summary
summary(model)
# Create a scatter plot of the data points
plot <- ggplot(sample_data, aes(x = temp, y = salnty)) +
geom_point(color = "blue", size = 0.5) +
labs(x = "Temperature (°C)", y = "Salinity") +
ggtitle("Scatter Plot of Salinity vs.Temperature ")
# Add the regression line to the plot
plot_with_line <- plot +
geom_smooth(method = "lm", se = FALSE, color = "red", size = 1)
# Display the plot
print(plot_with_line)
new_data <- data.frame(temp = c(7.48, 18.60, 26.35))
predictions <- predict(model, newdata = new_data)
prediction_table <- data.frame(Water_Temperature = new_data$temp, Predicted_Salinity = round(predictions, 2))
knitr::kable(prediction_table, caption = "Predictions for Water Temperature")
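# Added: predict() can also return 95% prediction intervals for the same
# temperatures; these are wider than confidence intervals because they account
# for the scatter of individual observations.
predict(model, newdata = new_data, interval = "prediction")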
residuals <- resid(model)
# Create scatter plot of residuals vs. fitted values
ggplot(sample_data, aes(x = fitted(model), y = residuals)) +
geom_point(color = "blue", size = 0.5) +
labs(x = "Fitted Values", y = "Residuals") +
ggtitle("Residuals vs. Fitted Values")
# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)
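# Restore the default single-panel plot layout (as done earlier in this history).
par(mfrow=c(1, 1))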