knitr::opts_chunk$set(echo = TRUE)
library(readr)
data <- read_csv("happy.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("ID", "income", "happiness")
head(data)
model <- lm(happiness ~ income, data = data)
model_summary <- summary(model)
model_summary
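# Added sketch (not in the original history): a quick look at the point
# estimates and their 95% confidence intervals for the fitted coefficients.
coef(model)
confint(model, level = 0.95)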
# Check the current working directory.
getwd()
# Install the e1071 package (run install.packages() just once, if needed).
#install.packages("e1071")
library(e1071)
library(ggplot2)
# Using the built-in cars dataset (speed in mph, stopping distance in ft); show the first few rows.
head(cars)
# scatterplot
scatter.smooth(x=cars$speed, y=cars$dist, main="Dist ~ Speed")
# partition plot window into one row of two columns (one for each of the
# next two boxplots).
par(mfrow=c(1, 2))
# box plot for 'speed'
boxplot(cars$speed,
main="Speed", sub=paste("Outliers: ", boxplot.stats(cars$speed)$out))
# box plot for 'distance'
boxplot(cars$dist,
main="Distance", sub=paste("Outliers: ", boxplot.stats(cars$dist)$out))
# density plot for 'speed'
plot(density(cars$speed),
main="Density Plot: Speed",
ylab="Density", sub=paste("Skewness:", round(e1071::skewness(cars$speed), 2)))
polygon(density(cars$speed), col="red")
# density plot for 'dist'
plot(density(cars$dist),
main="Density Plot: Distance",
ylab="Density", sub=paste("Skewness:", round(e1071::skewness(cars$dist), 2)))
polygon(density(cars$dist), col="red")
# calculate correlation between speed and distance
cor(cars$speed, cars$dist)
# build linear regression model on full data
linearMod <- lm(dist ~ speed, data=cars)
print(linearMod)
summary(linearMod)
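# Side note (added, not in the original history): for simple linear regression,
# the squared correlation between the two variables equals the model's R-squared.
cor(cars$speed, cars$dist)^2
summary(linearMod)$r.squared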
# capture model summary as an object
modelSummary <- summary(linearMod)
# model coefficients
modelCoeffs <- modelSummary$coefficients
modelCoeffs
# get beta estimate for speed
beta.estimate <- modelCoeffs["speed", "Estimate"]
beta.estimate
# get std.error for speed
std.error <- modelCoeffs["speed", "Std. Error"]
std.error
# calc t statistic (compare with model summary).
t_value <- beta.estimate/std.error
t_value
# calc p Value (compare with model summary).
p_value <- 2*pt(-abs(t_value), df=nrow(cars)-ncol(cars))
p_value
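# Added sketch: a manual 95% confidence interval for the speed coefficient,
# built from the same t distribution; compare with confint(linearMod).
beta.estimate + c(-1, 1) * qt(0.975, df = nrow(cars) - 2) * std.error
confint(linearMod)["speed", ]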
# Holdout method - Create Training and Test data.
# set seed for reproducible results.
set.seed(2023)
# Get random row indices for the training data, using an 80:20 split.
trainingRowIndex <- sample(1:nrow(cars), 0.8*nrow(cars))
# Model training data
trainingData <- cars[trainingRowIndex, ]
# Test data
testData <- cars[-trainingRowIndex, ]
# Build the model on the training data.
lmMod <- lm(dist ~ speed, data=trainingData)
# model summary
summary(lmMod)
# predict distance using test data.
distPred <- predict(lmMod, testData)
distPred
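# Added sketch: two simple accuracy measures on the hold-out set -- the
# correlation between actual and predicted distances, and the mean absolute
# percentage error (MAPE).
actuals_preds <- data.frame(actuals = testData$dist, predicted = distPred)
cor(actuals_preds$actuals, actuals_preds$predicted)
mean(abs(actuals_preds$predicted - actuals_preds$actuals) / actuals_preds$actuals)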
# set default plot window
par(mfrow=c(1, 1))
# plot predicted points.
plot(testData$speed, distPred, pch=16, col="red")
# plot actual points.
points(testData$speed, testData$dist, pch=16, col="green")
# Make the same plot using ggplot; there are many ways to do this.
# Here we start by creating two separate data frames.
testData_pred <- data.frame(testData$speed, distPred = distPred, Type = "Predicted")
testData_actual <- data.frame(testData$speed, dist = testData$dist, Type = "Actual")
# Rename the columns to be the same
names(testData_pred) <- c("speed", "dist", "Type")
names(testData_actual) <- c("speed", "dist", "Type")
# Combine the data frames
allData <- rbind(testData_pred, testData_actual)
# Plotting the data
ggplot(allData, aes(x = speed, y = dist, color = Type)) +
geom_point(shape = 16) +
scale_color_manual(values = c("Predicted" = "red", "Actual" = "green")) +
labs(color = "Braking Distance") +
xlab("Speed (km/h)") +
ylab("Distance (m)") +
ggtitle("Speed vs Distance") +
theme(legend.position = c(.25, .95), legend.justification = c(1, 1))
# Your turn now - analyse the Anscombe quartet dataset by
# regressing y1 on x1, y2 on x2 and so on.
# Perform your analysis in an R-Markdown HTML or PDF report.
# Please feel free to consult with your tutors, Kathy and Jerry, or your fellow students if
# you have any questions.
anscombe
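# One possible starting point for the exercise (added sketch, not a full
# solution): fit the four regressions in a loop and compare their coefficients.
anscombe_fits <- lapply(1:4, function(i)
  lm(reformulate(paste0("x", i), response = paste0("y", i)), data = anscombe))
lapply(anscombe_fits, coef)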
knitr::opts_chunk$set(echo = TRUE)
library(readr)
data <- read_csv("happy.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("ID", "income", "happiness")
head(data)
model <- lm(happiness ~ income, data = data)
model_summary <- summary(model)
model_summary
library(ggplot2)
# Obtain the residuals from the model
residuals <- resid(model)
# Create scatter plot of residuals vs. fitted values
ggplot(data, aes(x = fitted(model), y = residuals)) +
geom_point() +
labs(x = "Fitted Values", y = "Residuals") +
ggtitle("Residuals vs. Fitted Values")
# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)
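# Added check (an assumption, not part of the original analysis): a Shapiro-Wilk
# test of the residuals as a numeric complement to the Q-Q plot.
shapiro.test(residuals)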
# Plot the observations
par(mfrow=c(1, 1))
plot <- ggplot(data, aes(x = income, y = happiness)) +
geom_point(color = "blue") +
labs(x = "Income", y = "Happiness") +
ggtitle("Scatter Plot of Income vs. Happiness")
# Add the regression line and equation
plot_with_line <- plot +
geom_smooth(method = "lm", se = FALSE, color = "red") +
annotate(
"text", x = max(data$income) - 1, y = max(data$happiness) - 1,
label = paste("Happiness ≈", round(coefficients(model)[1], 3),
"+", round(coefficients(model)[2], 3), "* Income"),
color = "black", size = 4, hjust = 2
)
# Display the plot
print(plot_with_line)
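# Added sketch: use the fitted model to predict happiness at a few hypothetical
# income values (chosen for illustration only), with 95% confidence intervals.
new_income <- data.frame(income = c(2, 4, 6))
predict(model, newdata = new_income, interval = "confidence")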
library(dplyr)
data <- read_csv("calcofi.csv", skip = 1, show_col_types = FALSE)
colnames(data) <- c("temp", "salnty")
head(data)
# Remove missing values if any
data <- na.omit(data)
# set seed for random sampling
set.seed(1)
sample_size <- round(0.1 * nrow(data))
sample_data <- data %>% sample_n(sample_size)
# Conduct the linear regression
model <- lm(salnty ~ temp, data = sample_data)
# Get the model summary
summary(model)
# Create a scatter plot of the data points
plot <- ggplot(sample_data, aes(x = temp, y = salnty)) +
geom_point(color = "blue", size = 0.5) +
labs(x = "Temperature (°C)", y = "Salinity") +
ggtitle("Scatter Plot of Salinity vs.Temperature ")
# Add the regression line to the plot
plot_with_line <- plot +
geom_smooth(method = "lm", se = FALSE, color = "red", size = 1)
# Display the plot
print(plot_with_line)
new_data <- data.frame(temp = c(7.48, 18.60, 26.35))
predictions <- predict(model, newdata = new_data)
prediction_table <- data.frame(Water_Temperature = new_data$temp, Predicted_Salinity = round(predictions, 2))
knitr::kable(prediction_table, caption = "Predictions for Water Temperature")
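# Added: predict() can also return 95% prediction intervals for the same
# temperatures; these are wider than confidence intervals because they account
# for the scatter of individual observations.
predict(model, newdata = new_data, interval = "prediction")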
residuals <- resid(model)
# Create scatter plot of residuals vs. fitted values
ggplot(sample_data, aes(x = fitted(model), y = residuals)) +
geom_point(color = "blue", size = 0.5) +
labs(x = "Fitted Values", y = "Residuals") +
ggtitle("Residuals vs. Fitted Values")
# Plot the residuals of the model
par(mfrow=c(2, 2))
plot(model, which = 1:4)
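# Restore the default single-panel plot layout (as done earlier in this history).
par(mfrow=c(1, 1))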