Skip to content

Commit

Permalink
Merge pull request #12 from e-kotov/5-add-safe-guards-and-warnings-wh…
Browse files Browse the repository at this point in the history
…en-requesting-more-then-10k-entries

5 add safe guards and warnings when requesting more then 10k entries
  • Loading branch information
e-kotov authored Feb 1, 2025
2 parents 91880b8 + 7cc949f commit 13575c8
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 19 deletions.
99 changes: 87 additions & 12 deletions R/data.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#' @param level A `character` string specifying the NUTS level ("0", "1", "2", or "3").
#' @param x_filters A `named list` where the names are the filter fields for the x variable and the values are the selected values for those fields. Default is an empty list. To find out which filters to use, use \code{\link{mi_source_filters}} with the desired `source_name`.
#' @param y_filters (Optional) A `named list` where the names are the filter fields for the y variable and the values are the selected values for those fields. Default is `NULL`. To find out which filters to use, use \code{\link{mi_source_filters}} with the desired `source_name`.
#' @param limit An `integer` specifying the maximum number of results to return. Default is 2500.
#' @param limit An `integer` specifying the maximum number of results to return. Default is 2500. This default should be enough for most uses, as it is well above the number of NUTS 3 regions in the EU. The maximum limited by the API is 10000.
#'
#' @return A `tibble` with the following columns:
#'
Expand All @@ -21,6 +21,7 @@
#' * `x`: the value of the univariate variable.
#' * `y` (optional): the value of the y variable (only included when `y_source` is provided).
#'
#' @importFrom rlang .data
#' @export
#'
#' @examples
Expand All @@ -30,7 +31,7 @@
#' x_source = "TGS00010",
#' year = 2020,
#' level = "2",
#' x_filters = list(isced11 = "TOTAL", unit = "PC", age = "Y_GE15", freq = "A")
#' x_filters = list(isced11 = "TOTAL", sex = "F")
#' )
#'
#' # Bivariate example
Expand All @@ -39,8 +40,8 @@
#' y_source = "DEMO_R_MLIFEXP",
#' year = 2020,
#' level = "2",
#' x_filters = list(isced11 = "TOTAL", unit = "PC", age = "Y_GE15", freq = "A"),
#' y_filters = list(unit = "YR", age = "Y_LT1", freq = "A")
#' x_filters = list(isced11 = "TOTAL", sex = "F"),
#' y_filters = list(age = "Y2", sex = "F")
#' )
#' }
mi_data <- function(
Expand All @@ -58,6 +59,7 @@ mi_data <- function(
checkmate::assert_list(x_filters, types = c("character", "NULL"))
checkmate::assert_integerish(year, null.ok = TRUE, max.len = 1)
checkmate::assert_list(y_filters, types = c("character", "NULL"), null.ok = TRUE)
checkmate::assert_number(limit, lower = 1, upper = 10000)
if (!is.null(y_source)) checkmate::assert_string(y_source)

# Build filter JSONs for X and Y
Expand All @@ -69,7 +71,10 @@ mi_data <- function(
source = x_source,
conditions = x_conditions
)
x_json_string <- jsonlite::toJSON(x_json, auto_unbox = TRUE)
# Minify JSON to remove extra whitespace/newlines
x_json_string <- jsonlite::minify(
jsonlite::toJSON(x_json, auto_unbox = TRUE)
)

# Check if it's bivariate (Y filters are provided)
if (!is.null(y_source) && !is.null(y_filters)) {
Expand All @@ -80,7 +85,9 @@ mi_data <- function(
source = y_source,
conditions = y_conditions
)
y_json_string <- jsonlite::toJSON(y_json, auto_unbox = TRUE)
y_json_string <- jsonlite::minify(
jsonlite::toJSON(y_json, auto_unbox = TRUE)
)
}

# Build API endpoint
Expand All @@ -104,26 +111,94 @@ mi_data <- function(
query_params$`_outcome_year` <- as.character(year)
}

# Add JSON parameters as proper strings without URL encoding issues
query_params$`X_JSON` <- I(x_json_string)
# Add JSON parameters as proper strings so that httr2 can URL encode them automatically
query_params$`X_JSON` <- x_json_string
if (!is.null(y_source) && !is.null(y_filters)) {
query_params$`Y_JSON` <- I(y_json_string)
query_params$`Y_JSON` <- y_json_string
}

# Perform API request
response <- httr2::request(url_endpoint) |>
request <- httr2::request(url_endpoint) |>
httr2::req_headers(
"Content-Type" = "application/json",
"User-Agent" = getOption("mapineqr.user_agent")
) |>
httr2::req_url_query(!!!query_params) |>
httr2::req_method("GET") |>
httr2::req_perform()
httr2::req_method("GET")

response <- request |> httr2::req_perform()

# Parse response
response_data <- httr2::resp_body_json(response, simplifyVector = TRUE) |>
tibble::as_tibble()

# Check for duplicate values within each geo for x and (if applicable) y.
duplicate_issues <- response_data |>
dplyr::group_by(.data$geo) |>
dplyr::summarise(
distinct_x = dplyr::n_distinct(.data$x),
distinct_y = if ("y" %in% names(response_data)) dplyr::n_distinct(.data$y) else NA_integer_,
.groups = "drop"
)

x_issue <- any(duplicate_issues$distinct_x > 1)
y_issue <- if ("y" %in% names(response_data)) any(duplicate_issues$distinct_y > 1) else FALSE

# Only perform additional filter checking if duplicate geos exist
if (x_issue || y_issue) {
# --- For the x variable ---
missing_x_filters <- character(0)
if (x_issue) {
available_filters <- mi_source_filters(source_name = x_source, year = year, level = level)
# Determine which filter fields have more than one option
multi_option_fields <- available_filters |>
dplyr::group_by(.data$field) |>
dplyr::summarise(n_options = dplyr::n_distinct(.data$value), .groups = "drop") |>
dplyr::filter(.data$n_options > 1) |>
dplyr::pull(.data$field)
# Only require filters for those fields with multiple options.
missing_x_filters <- setdiff(multi_option_fields, names(x_filters))
}

# --- For the y variable (if applicable) ---
missing_y_filters <- character(0)
if (y_issue) {
available_y_filters <- mi_source_filters(source_name = y_source, year = year, level = level)
multi_option_y_fields <- available_y_filters |>
dplyr::group_by(.data$field) |>
dplyr::summarise(n_options = dplyr::n_distinct(.data$value), .groups = "drop") |>
dplyr::filter(.data$n_options > 1) |>
dplyr::pull(.data$field)
missing_y_filters <- setdiff(multi_option_y_fields, names(y_filters))
}

# Only raise an error if any missing filter is found among fields with multiple options.
if (length(missing_x_filters) > 0 || length(missing_y_filters) > 0) {
msg <- "The API returned duplicate values for some geographic regions. This may indicate that not all necessary filters were specified."
if (length(missing_x_filters) > 0) {
msg <- paste0(
msg,
"\n\nFor the 'x' variable (source: '", x_source, "'):",
"\n The following filter fields (with multiple available options) were not specified: ",
paste(missing_x_filters, collapse = ", "),
"\nYou can review available filters by running:\n mi_source_filters(source_name = '", x_source, "', year = ", year, ", level = '", level, "')"
)
}
if (length(missing_y_filters) > 0) {
msg <- paste0(
msg,
"\n\nFor the 'y' variable (source: '", y_source, "'):",
"\n The following filter fields (with multiple available options) were not specified: ",
paste(missing_y_filters, collapse = ", "),
"\nYou can review available filters by running:\n mi_source_filters(source_name = '", y_source, "', year = ", year, ", level = '", level, "')"
)
}
stop(msg)
}
}



# Define expected columns based on whether y_source is specified
if (is.null(y_source)) {
expected_columns <- c("geo", "geo_name", "geo_source", "geo_year", "data_year", "x")
Expand Down
8 changes: 4 additions & 4 deletions man/mi_data.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/mi_source_coverage.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/mi_source_filters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/mi_sources.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 13575c8

Please sign in to comment.