-
Notifications
You must be signed in to change notification settings - Fork 276
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Getting started vignette #915
Comments
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This comment was marked as outdated.
This would be a good project for a team of folks who really love purrr 😄 |
@sierrajohnson and I will have a crack at this and will put our ideas/outlines/brain dump here 😊 |
Some ideas from a conversation I had with Claude. It still needs a bunch of work, but I think it has some interesting ideas.

# Getting Started with purrr: A Practical Guide

## Introduction

purrr is a toolkit that helps you write cleaner, more maintainable R code by introducing functional programming concepts. But what is functional programming? At its core, it's an approach to programming that emphasizes using functions to transform data, similar to how you might use a series of tools to process raw materials into a final product. Instead of writing loops and modifying data step by step, functional programming encourages you to think about your data transformations as a series of function applications. This might sound abstract, but we'll see how it makes your code clearer and less prone to errors.

## Installation and Setup

# Install purrr if you haven't already
install.packages("purrr")
# Load the package
library(purrr)

## Understanding Lists in R

Before diving into purrr, let's understand lists, as they're fundamental to how purrr works:

# A list can contain different types of data
my_list <- list(
data_frame = data.frame(x = 1:3, y = 4:6),
model = lm(mpg ~ wt, mtcars),
custom_function = function(x) x^2
)
# Access list elements
my_list$data_frame # Using $
my_list[[1]] # Using [[]] to get the first element
my_list[1] # Using [] returns a list with one element

## Basic Map Functions: A Better Way to Loop

### map(): The Foundation

Instead of writing loops, use map() to apply a function to each element of a list:

# Process multiple texts with complex operations
documents <- list(
"The quick brown fox",
"jumps over the lazy dog",
"and then takes a nap"
)
# Complex text processing
# Build a word-frequency table for every document.
processed_docs <- map(documents, \(doc) {
  # Remove punctuation, then lower-case, so differently cased or punctuated
  # spellings of a word are counted together
  clean <- tolower(gsub("[[:punct:]]", "", doc))
  # Split on runs of whitespace (strsplit() returns a list, hence [[1]]) and
  # count each word; the split is done once and nothing unused is computed
  table(strsplit(clean, "\\s+")[[1]])
})
# Process multiple data frames with custom operations
data_frames <- list(
df1 = data.frame(x = c(1, 2, NA), y = c("a", NA, "c")),
df2 = data.frame(x = c(4, NA, 6), y = c(NA, "e", "f"))
)
summaries <- map(data_frames, \(df) {
# Calculate custom summary statistics
numeric_cols <- sapply(df, is.numeric)
list(
missing_by_col = sapply(df, \(col) sum(is.na(col))),
complete_cases = sum(complete.cases(df)),
numeric_summary = if(any(numeric_cols)) {
sapply(df[numeric_cols], \(col) {
c(mean = mean(col, na.rm = TRUE),
sd = sd(col, na.rm = TRUE))
})
}
)
}) Type-specific Map FunctionsHere are examples where type-specific mapping is useful for non-vectorized operations: # Calculate complexity metrics for each document
complexity_scores <- map_dbl(documents, \(doc) {
words <- strsplit(doc, "\\s+")[[1]]
unique_ratio <- length(unique(words)) / length(words)
avg_word_length <- mean(nchar(words))
unique_ratio * avg_word_length
})
# Generate quality reports for data frames
quality_reports <- map_chr(data_frames, \(df) {
issues <- character(0)
# Check for missing values
if(any(is.na(df))) {
issues <- c(issues,
sprintf("Missing values: %d", sum(is.na(df))))
}
# Check for duplicates
if(any(duplicated(df))) {
issues <- c(issues,
sprintf("Duplicate rows: %d", sum(duplicated(df))))
}
# Combine all issues
if(length(issues) > 0) {
paste("Issues found:", paste(issues, collapse = "; "))
} else {
"No issues found"
}
}) Function Composition with the Base PipeHere's how to chain operations that can't be vectorized: # Process multiple datasets with different steps
models <- list(
lm(mpg ~ wt, mtcars),
lm(mpg ~ wt + cyl, mtcars),
lm(mpg ~ ., mtcars)
)
model_analysis <- models |>
map(\(model) {
# Extract model diagnostics
residuals <- residuals(model)
fitted <- fitted(model)
list(
residuals = residuals,
fitted = fitted,
stats = summary(model)
)
}) |>
map(\(analysis) {
# Perform residual diagnostics on the residuals/fitted values extracted in
# the previous step; `stats` here is the summary() stored in `analysis`.
with(analysis, {
  list(
    # Shapiro-Wilk p-value for the residuals
    normality = shapiro.test(residuals)$p.value,
    # dwtest() comes from the lmtest package, which this guide never attaches,
    # so the call is namespace-qualified (lmtest must be installed)
    autocorrelation = lmtest::dwtest(residuals ~ fitted)$p.value,
    r_squared = stats$r.squared
  )
})
}) |>
map_chr(\(diagnostics) {
# Create formatted summary
sprintf(
"Normality p=%.3f, Autocorr p=%.3f, R²=%.3f",
diagnostics$normality,
diagnostics$autocorrelation,
diagnostics$r_squared
)
}) Working with Multiple Inputsmap2(): Parallel ProcessingHere are examples where you need to process pairs of inputs with non-vectorized operations: # Compare pairs of models
# Compare every model with the one listed immediately before it.
model_pairs <- map2(models[-1], models[-length(models)], \(current, previous) {
  # Compare model fits: row 1 of the table is `previous`, row 2 is `current`
  anova_result <- anova(previous, current)
  # Extract comparison metrics. For lm fits the table columns are Res.Df, RSS,
  # Df, Sum of Sq, F and Pr(>F); there is no Deviance column, and Df is NA in
  # the first row, so take the second-row values directly.
  list(
    # Number of extra parameters in `current`
    df_diff = anova_result$Df[2],
    # Change in residual sum of squares (current minus previous)
    deviance_diff = diff(anova_result$RSS),
    p_value = anova_result$`Pr(>F)`[2]
  )
})
# Process pairs of data frames with custom comparisons
df_comparisons <- map2(data_frames[-1], data_frames[-length(data_frames)],
\(current, previous) {
# Compare structure and content
list(
added_cols = setdiff(names(current), names(previous)),
removed_cols = setdiff(names(previous), names(current)),
common_cols = intersect(names(current), names(previous))
)
}
) pmap(): Multiple InputsFor operations requiring multiple parameters that can't be vectorized: # Configure and run multiple analyses
analysis_configs <- list(
data = list(mtcars, PlantGrowth, InsectSprays),
dependent = list("mpg", "weight", "count"),
predictors = list(
c("wt", "cyl"),
c("group"),
c("spray")
),
test_type = list("lm", "aov", "glm")
)
analysis_results <- pmap(analysis_configs, \(data, dependent, predictors, test_type) {
# Construct formula
formula_str <- paste(dependent, "~", paste(predictors, collapse = " + "))
formula <- as.formula(formula_str)
# Fit model based on test type
model <- switch(test_type,
"lm" = lm(formula, data = data),
"aov" = aov(formula, data = data),
"glm" = glm(formula, data = data)
)
# Return analysis results
list(
model_type = test_type,
formula = formula_str,
summary = summary(model),
diagnostics = list(
residuals = residuals(model),
fitted = fitted(model)
)
)
}) Error HandlingWhen processing real data, things often go wrong. purrr helps you handle errors gracefully: # A complex analysis function that might fail
# Fit `y ~ x` on a data frame and return a few fit diagnostics.
#
# df: a data frame that must contain numeric columns `x` and `y`.
#
# Returns a list with
#   coefficients: coef() of the fitted linear model,
#   r_squared:    R-squared taken from summary() of the model,
#   normality:    Shapiro-Wilk p-value for the model residuals.
#
# Stops with an error when `x` or `y` is missing or not numeric. Errors raised
# by lm() or shapiro.test() are not caught here and propagate to the caller.
analyze_dataset <- function(df) {
  required <- c("x", "y")
  if (!all(required %in% names(df))) {
    stop("Missing required columns x and y")
  }
  # vapply() guarantees exactly one logical per column, unlike sapply()
  if (!all(vapply(df[required], is.numeric, logical(1)))) {
    stop("x and y must be numeric")
  }
  # Perform analysis
  model <- lm(y ~ x, data = df)
  # Named so the local value does not shadow the residuals() function
  model_residuals <- residuals(model)
  list(
    coefficients = coef(model),
    r_squared = summary(model)$r.squared,
    normality = shapiro.test(model_residuals)$p.value
  )
}
# List of datasets to analyze
# Four inputs for analyze_dataset(): one well-formed, three problematic.
datasets <- list(
data.frame(x = 1:5, y = 1:5), # Valid (y equals x exactly)
data.frame(x = "a", y = 1:5), # Invalid types: x is character
data.frame(z = 1:5), # Missing columns: neither x nor y present
data.frame(x = c(1, 1, 1), y = c(2, 2, 2)) # Constant x and y (zero variance), not a perfect correlation
)
# Using safely()
safe_analysis <- safely(analyze_dataset)
results <- map(datasets, safe_analysis)
# Each element contains $result and $error
# Using possibly()
safe_analysis2 <- possibly(analyze_dataset, otherwise = list(
coefficients = NA,
r_squared = NA,
normality = NA
))
results2 <- map(datasets, safe_analysis2)
# Using quietly()
# quietly() captures printed output, warnings and messages, but it does not
# trap errors. `datasets` contains invalid entries, so safely() is wrapped
# inside it to let map() run to completion; each `$result` is then the
# safely() list with its own `$result` and `$error`.
quiet_analysis <- quietly(safely(analyze_dataset))
results3 <- map(datasets, quiet_analysis)
# Each element contains $result, $output, $warnings, and $messages

## Practical Examples

### Example 1: Advanced Data Frame Operations

# Process multiple data frames with complex operations
data_frames <- list(
sales = data.frame(
date = as.Date("2024-01-01") + 0:9,
amount = runif(10, 100, 1000),
category = sample(c("A", "B", "C"), 10, replace = TRUE)
),
inventory = data.frame(
date = as.Date("2024-01-01") + 0:9,
item = sample(letters[1:5], 10, replace = TRUE),
quantity = rpois(10, 20)
)
)
processed_data <- data_frames |>
map(\(df) {
# Add custom datetime features derived from the `date` column
df$year <- format(df$date, "%Y")
df$month <- format(df$date, "%m")
df$day <- format(df$date, "%d")
df$weekday <- format(df$date, "%A")
# Calculate rolling statistics if numeric columns exist
numeric_cols <- vapply(df, is.numeric, logical(1))
if (any(numeric_cols)) {
  first_numeric <- df[numeric_cols][[1]]
  # Trailing 3-value moving average of the first numeric column, done in base
  # R so no extra package is needed. sides = 1 uses the current and two
  # previous values, leaving the first two positions NA. The stats:: prefix
  # avoids picking up another attached filter() (e.g. dplyr's).
  df$rolling_mean <- as.numeric(
    stats::filter(first_numeric, rep(1 / 3, 3), sides = 1)
  )
}
df
}) |>
map(\(df) {
# Calculate summary statistics by group if categorical columns exist
categorical_cols <- vapply(
  df,
  \(col) is.factor(col) || is.character(col),
  logical(1)
)
# Start from NULL rather than testing exists("summaries") afterwards:
# exists() also searches enclosing environments, so a `summaries` object
# defined at top level (this guide creates one earlier) would be returned
# by mistake when no grouping column is present.
group_summaries <- NULL
if (any(categorical_cols)) {
  # Group by the first factor/character column
  group_col <- names(df)[categorical_cols][1]
  numeric_cols <- vapply(df, is.numeric, logical(1))
  group_summaries <- aggregate(
    df[numeric_cols],
    by = list(group = df[[group_col]]),
    FUN = function(x) {
      c(mean = mean(x, na.rm = TRUE), sd = sd(x, na.rm = TRUE))
    }
  )
}
list(
  processed_data = df,
  group_summaries = group_summaries
)
}) Example 2: Model Evaluation Pipeline# Create and evaluate multiple models
model_specs <- list(
list(formula = mpg ~ wt, name = "simple"),
list(formula = mpg ~ wt + cyl, name = "medium"),
list(formula = mpg ~ ., name = "complex")
)
model_results <- model_specs |>
map(\(spec) {
# Fit model
model <- lm(spec$formula, data = mtcars)
# Perform cross-validation
cv_results <- lapply(1:5, \(fold) {
# Create train/test split
test_idx <- seq_len(nrow(mtcars)) %% 5 == (fold - 1)
train <- mtcars[!test_idx, ]
test <- mtcars[test_idx, ]
# Fit on train, predict on test
fold_model <- lm(spec$formula, data = train)
predictions <- predict(fold_model, newdata = test)
# Calculate fold metrics
mse <- mean((test$mpg - predictions)^2)
mae <- mean(abs(test$mpg - predictions))
list(mse = mse, mae = mae)
})
# Combine results
list(
name = spec$name,
formula = deparse(spec$formula),
model = model,
cv_metrics = list(
mse = mean(sapply(cv_results, \(x) x$mse)),
mae = mean(sapply(cv_results, \(x) x$mae))
),
diagnostics = list(
r_squared = summary(model)$r.squared,
aic = AIC(model),
residual_normality = shapiro.test(residuals(model))$p.value
)
)
}) Best Practices
Common Mistakes to Avoid
# Don't do this
map_dbl(1:10, \(x) x * 2) # Use regular vectorization instead: 1:10 * 2
# Do this
map_dbl(data_frames, \(df) {
# Complex non-vectorized operations
model <- lm(y ~ x, data = df)
summary(model)$r.squared
})
# Don't do this
map(datasets, analyze_dataset) # Might fail on invalid data
# Do this
map(datasets, safely(analyze_dataset)) # Handles errors gracefully
# Don't do this
map(data |
Drawn from:
Needs to cover:
_vec()
The text was updated successfully, but these errors were encountered: