diff --git a/NEWS.md b/NEWS.md index 5046a8002..21ec9f978 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,12 @@ ## rtables 0.6.11.9001 +### New Features + * Added `stat_string` to `as_result_df(make_ard = TRUE)` to preserve the original string representation of the statistics. + * Added `add_tbl_name_split` to `as_result_df()` to handle split levels constituted by different table names. + +### Bug Fixes + * Fixed issue with `split_cols_by_multivar()` when having more than one value. Now `as_result_df(make_ard = TRUE)` adds a predefined split name for each of the `multivar` splits. + ## rtables 0.6.11 ### New Features diff --git a/R/tt_as_df.R b/R/tt_as_df.R index c79ba4f3a..e4c14b136 100644 --- a/R/tt_as_df.R +++ b/R/tt_as_df.R @@ -17,6 +17,9 @@ #' @param simplify (`flag`)\cr when `TRUE`, the result data frame will have only visible labels and #' result columns. Consider showing also label rows with `keep_label_rows = TRUE`. This output can be #' used again to create a `TableTree` object with [df_to_tt()]. +#' @param add_tbl_name_split (`flag`)\cr when `TRUE` and when the table has more than one +#' `analyze(table_names = "")`, the table names will be present as a group split named +#' `""`. #' @param ... additional arguments passed to spec-specific result data frame function (`spec`). #' #' @return @@ -41,6 +44,7 @@ as_result_df <- function(tt, spec = NULL, make_ard = FALSE, expand_colnames = FALSE, keep_label_rows = FALSE, + add_tbl_name_split = FALSE, simplify = FALSE, ...) { data_format <- data_format[[1]] @@ -51,6 +55,7 @@ as_result_df <- function(tt, spec = NULL, checkmate::assert_flag(expand_colnames) checkmate::assert_flag(keep_label_rows) checkmate::assert_flag(simplify) + checkmate::assert_flag(add_tbl_name_split) if (nrow(tt) == 0) { return(sanitize_table_struct(tt)) @@ -70,24 +75,31 @@ as_result_df <- function(tt, spec = NULL, if (data_format %in% c("strings", "numeric")) { # we keep previous calculations to check the format of the data mf_tt <- matrix_form(tt) - mf_result_chars <- mf_strings(mf_tt)[-seq_len(mf_nlheader(mf_tt)), -1] + mf_result_chars <- mf_strings(mf_tt)[-seq_len(mf_nlheader(mf_tt)), -1, drop = FALSE] mf_result_chars <- .remove_empty_elements(mf_result_chars) - mf_result_numeric <- as.data.frame( - .make_numeric_char_mf(mf_result_chars) - ) + mf_result_numeric <- .make_numeric_char_mf(mf_result_chars) mf_result_chars <- as.data.frame(mf_result_chars) + mf_result_numeric <- as.data.frame(mf_result_numeric) if (!setequal(dim(mf_result_numeric), dim(cellvals)) || !setequal(dim(mf_result_chars), dim(cellvals))) { stop( "The extracted numeric data.frame does not have the same dimension of the", " cell values extracted with cell_values(). This is a bug. Please report it." ) # nocov } + + colnames(mf_result_chars) <- colnames(cellvals) + colnames(mf_result_numeric) <- colnames(cellvals) if (data_format == "strings") { - colnames(mf_result_chars) <- colnames(cellvals) cellvals <- mf_result_chars - } else { - colnames(mf_result_numeric) <- colnames(cellvals) - cellvals <- mf_result_numeric + if (isTRUE(make_ard)) { + stop("make_ard = TRUE is not compatible with data_format = 'strings'") + } + } else if (data_format == "numeric") { + if (isTRUE(make_ard)) { + cellvals <- .convert_to_character(mf_result_numeric) + } else { + cellvals <- mf_result_numeric + } } } @@ -112,7 +124,7 @@ as_result_df <- function(tt, spec = NULL, lapply( seq_len(NROW(df)), function(ii) { - handle_rdf_row(df[ii, ], maxlen = maxlen) + handle_rdf_row(df[ii, ], maxlen = maxlen, add_tbl_name_split = add_tbl_name_split) } ) ) @@ -219,9 +231,23 @@ as_result_df <- function(tt, spec = NULL, for (col_i in only_col_indexes) { # Making row splits into row specifications (group1 group1_level) current_col_split_level <- unlist(ret_tmp[seq_len(number_of_col_splits), col_i], use.names = FALSE) - flattened_cols_names <- .c_alternated(column_split_names[[1]][[1]], current_col_split_level) + col_split_names <- column_split_names[[1]][[1]] # cross section of the column split names (not values) + more_than_one_name_in_csn <- sapply(col_split_names, length) > 1 + + # Correction for cases where there is split_cols_by_multivar + if (any(more_than_one_name_in_csn)) { + col_split_names[more_than_one_name_in_csn] <- lapply( + seq(sum(more_than_one_name_in_csn)), + function(i) { + paste0("multivar_split", i) + } + ) + } + + # Alternated association of split names and values (along with group split) + flattened_cols_names <- .c_alternated(col_split_names, current_col_split_level) names(flattened_cols_names) <- .c_alternated( - paste0("group", seq_along(column_split_names[[1]][[1]]) + n_row_groups), + paste0("group", seq_along(col_split_names) + n_row_groups), paste0("group", seq_along(current_col_split_level) + n_row_groups, "_level") ) @@ -243,12 +269,12 @@ as_result_df <- function(tt, spec = NULL, # retrieving stat names and stats stat_name <- setNames(cell_stat_names[, col_i - min(only_col_indexes) + 1, drop = TRUE], NULL) stat <- setNames(ret_tmp[!col_label_rows, col_i, drop = TRUE], NULL) - necessary_stat_lengths <- sapply(stat, length) + necessary_stat_lengths <- lapply(stat, length) stat[sapply(stat, is.null)] <- NA # Truncating or adding NA if stat names has more or less elements than stats stat_name <- lapply(seq_along(stat_name), function(sn_i) { - stat_name[[sn_i]][seq_len(necessary_stat_lengths[sn_i])] + unlist(stat_name[[sn_i]], use.names = FALSE)[seq_len(necessary_stat_lengths[[sn_i]])] }) stat_name[!nzchar(stat_name)] <- NA @@ -269,6 +295,15 @@ as_result_df <- function(tt, spec = NULL, ret_w_cols <- rbind(ret_w_cols, tmp_ret_by_col_i) } + # If already_done is not present, we need to call the function again to keep precision + if (!"already_done" %in% names(list(...))) { + stat_string_ret <- as_result_df( + tt = tt, spec = spec, data_format = "numeric", + make_ard = TRUE, already_done = TRUE, ... + ) + ret_w_cols <- cbind(ret_w_cols, "stat_string" = stat_string_ret$stat) + } + ret <- ret_w_cols } @@ -290,22 +325,25 @@ as_result_df <- function(tt, spec = NULL, } # Helper function used to structure the raw values into a dataframe -.make_df_from_raw_data <- function(raw_vals, nr, nc) { +.make_df_from_raw_data <- function(rawvals, nr, nc) { ## if the table has one row and multiple columns, sometimes the cell values returns a list of the cell values ## rather than a list of length 1 representing the single row. This is bad but may not be changeable ## at this point. - if (nr == 1 && length(raw_vals) > 1) { - raw_vals <- list(raw_vals) + if (nr == 1 && length(rawvals) > 1) { + rawvals <- list(rawvals) } # Flatten the list of lists (rows) of cell values into a data frame - cellvals <- as.data.frame(do.call(rbind, raw_vals)) - row.names(cellvals) <- NULL + cellvals <- as.data.frame(do.call(rbind, rawvals)) if (nr == 1 && nc == 1) { - colnames(cellvals) <- names(raw_vals) + if (length(unlist(rawvals)) > 1) { # This happens only with nr = nc = 1 for raw values + cellvals <- as.data.frame(I(rawvals)) + } + colnames(cellvals) <- names(rawvals) } + row.names(cellvals) <- NULL cellvals } @@ -358,21 +396,30 @@ as_result_df <- function(tt, spec = NULL, return(char_df[nzchar(char_df, keepNA = TRUE)]) } - apply(char_df, 2, function(col_i) col_i[nzchar(col_i, keepNA = TRUE)]) + ret <- apply(char_df, 2, function(col_i) col_i[nzchar(col_i, keepNA = TRUE)], simplify = FALSE) + do.call(cbind, ret) } # Helper function to make the character matrix numeric .make_numeric_char_mf <- function(char_df) { if (is.null(dim(char_df))) { - return(as.numeric(stringi::stri_extract_all(char_df, regex = "\\d+.\\d+|\\d+"))) + ret <- lapply(char_df[[1]], function(x) { + as.numeric(stringi::stri_extract_all(x, regex = "\\d+.\\d+|\\d+")[[1]]) + }) # keeps the list (single element) for data.frame + return(I(ret)) } ret <- apply(char_df, 2, function(col_i) { - lapply( + out <- lapply( stringi::stri_extract_all(col_i, regex = "\\d+.\\d+|\\d+"), as.numeric ) - }) + if (all(dim(char_df) == c(1, 1)) && is.list(out[[1]])) { + return(unlist(out, use.names = FALSE)) + } else { + return(out) + } + }, simplify = FALSE) do.call(cbind, ret) } @@ -386,7 +433,7 @@ make_result_df_md_colnames <- function(maxlen) { ret <- c(ret, c("avar_name", "row_name", "label_name", "row_num", "is_group_summary", "node_class")) } -do_label_row <- function(rdfrow, maxlen) { +do_label_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) { pth <- rdfrow$path[[1]] # Adjusting for the fact that we have two columns for each split extra_nas_from_splits <- floor((maxlen - length(pth)) / 2) * 2 @@ -395,8 +442,13 @@ do_label_row <- function(rdfrow, maxlen) { if (length(pth) %% 2 == 1) { extra_nas_from_splits <- extra_nas_from_splits + 1 } else { - pth <- c("", pth) - extra_nas_from_splits <- extra_nas_from_splits - 1 + if (isTRUE(add_tbl_name_split)) { + pth <- c("", pth) + extra_nas_from_splits <- extra_nas_from_splits - 1 + } else { + pth <- pth[-1] + extra_nas_from_splits <- extra_nas_from_splits + 1 + } } c( @@ -431,14 +483,18 @@ do_content_row <- function(rdfrow, maxlen) { ) } -do_data_row <- function(rdfrow, maxlen) { +do_data_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) { pth <- rdfrow$path[[1]] pthlen <- length(pth) ## odd means we have a multi-analsysis step in the path, we do not want this in the result if (pthlen %% 2 == 1 && pthlen > 1) { # we remove the last element, as it is a fake split (tbl_name from analyse) # pth <- pth[-1 * (pthlen - 2)] - pth <- c("", pth) + if (isTRUE(add_tbl_name_split)) { + pth <- c("", pth) + } else { + pth <- pth[-1] + } } pthlen_new <- length(pth) if (pthlen_new == 1) { @@ -483,13 +539,13 @@ do_data_row <- function(rdfrow, maxlen) { path } -handle_rdf_row <- function(rdfrow, maxlen) { +handle_rdf_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) { nclass <- rdfrow$node_class ret <- switch(nclass, - LabelRow = do_label_row(rdfrow, maxlen), + LabelRow = do_label_row(rdfrow, maxlen, add_tbl_name_split = add_tbl_name_split), ContentRow = do_content_row(rdfrow, maxlen), - DataRow = do_data_row(rdfrow, maxlen), + DataRow = do_data_row(rdfrow, maxlen, add_tbl_name_split = add_tbl_name_split), stop("Unrecognized node type in row dataframe, unable to generate result data frame") ) setNames(ret, make_result_df_md_colnames(maxlen)) @@ -510,6 +566,23 @@ handle_rdf_row <- function(rdfrow, maxlen) { return(ret) } } + +# Function to convert all elements to character while preserving structure +.convert_to_character <- function(df) { + # Apply transformation to each column + df_converted <- lapply(df, function(col) { + if (is.list(col)) { + # For columns with vector cells, convert each vector to a character vector + I(lapply(col, as.character)) + } else { + # For regular columns, directly convert to character + as.character(col) + } + }) + # Return the transformed data frame + data.frame(df_converted, stringsAsFactors = FALSE) +} + # path_enriched_df ------------------------------------------------------------ # #' @describeIn data.frame_export Transform a `TableTree` object to a path-enriched `data.frame`. diff --git a/man/data.frame_export.Rd b/man/data.frame_export.Rd index f87dc8db0..580f951e3 100644 --- a/man/data.frame_export.Rd +++ b/man/data.frame_export.Rd @@ -13,6 +13,7 @@ as_result_df( make_ard = FALSE, expand_colnames = FALSE, keep_label_rows = FALSE, + add_tbl_name_split = FALSE, simplify = FALSE, ... ) @@ -37,6 +38,10 @@ names above the usual output. This is useful when the result data frame is used \item{keep_label_rows}{(\code{flag})\cr when \code{TRUE}, the result data frame will have all labels as they appear in the final table.} +\item{add_tbl_name_split}{(\code{flag})\cr when \code{TRUE} and when the table has more than one +\code{analyze(table_names = "")}, the table names will be present as a group split named +\code{""}.} + \item{simplify}{(\code{flag})\cr when \code{TRUE}, the result data frame will have only visible labels and result columns. Consider showing also label rows with \code{keep_label_rows = TRUE}. This output can be used again to create a \code{TableTree} object with \code{\link[=df_to_tt]{df_to_tt()}}.} diff --git a/tests/testthat/test-result_data_frame.R b/tests/testthat/test-result_data_frame.R index bbc0a641d..28fac3226 100644 --- a/tests/testthat/test-result_data_frame.R +++ b/tests/testthat/test-result_data_frame.R @@ -98,7 +98,7 @@ test_that("as_result_df works with visual output (data_format as numeric)", { build_table(DM) expect_equal(as_result_df(tbl)$`all obs`, 5.851948, tolerance = 1e-6) expect_equal( - as_result_df(tbl, data_format = "numeric")$`all obs`, + as_result_df(tbl, data_format = "numeric")$`all obs`[[1]], as.numeric(as_result_df(tbl, data_format = "strings")$`all obs`) ) expect_equal(as_result_df(tbl, expand_colnames = TRUE)$`all obs`[2], "356") @@ -171,7 +171,7 @@ test_that("as_result_df keeps label rows", { expect_identical(ncol(rd1), ncol(rd4)) expect_identical(as.character(rd1[3, ]), as.character(rd2[5, ])) - expect_identical(rd2[is.na(rd2[, ncol(rd2)]), ], rd4[is.na(rd4[, ncol(rd4)]), ]) + expect_identical(rd3[["A: Drug X.S1"]], rd4[["A: Drug X.S1"]] %>% unlist()) # More challenging labels lyt <- make_big_lyt() @@ -246,8 +246,8 @@ test_that("as_result_df works with analyze-only tables (odd num of path elements analyze("mpg") %>% build_table(mtcars) - expect_equal(as_result_df(tbl)$group1[[1]], "") - expect_equal(as_result_df(tbl, make_ard = TRUE)$group1[[1]], "") + expect_equal(as_result_df(tbl, add_tbl_name_split = TRUE)$group1[[1]], "") + expect_equal(as_result_df(tbl, make_ard = TRUE, add_tbl_name_split = TRUE)$group1[[1]], "") }) test_that("make_ard produces realistic ARD output with as_result_df", { @@ -295,7 +295,7 @@ test_that("make_ard produces realistic ARD output with as_result_df", { analyze(vars = "SEX", afun = counts_percentage_custom) tbl <- build_table(lyt, ex_adsl) - ard_out <- as_result_df(tbl, make_ard = TRUE) + ard_out <- as_result_df(tbl, make_ard = TRUE, add_tbl_name_split = TRUE) # Numeric output expect_equal( @@ -309,7 +309,8 @@ test_that("make_ard produces realistic ARD output with as_result_df", { variable_level = "Mean (SD)", variable_label = "Mean (SD)", stat_name = "SD", - stat = 6.553326 + stat = 6.553326, + stat_string = "6.6" ), tolerance = 10e-6 ) @@ -326,7 +327,8 @@ test_that("make_ard produces realistic ARD output with as_result_df", { variable_level = "F", variable_label = "F", stat_name = "Percentage", - stat = 0.5746269 + stat = 0.5746269, + stat_string = "57" ), tolerance = 10e-6 ) @@ -369,7 +371,8 @@ test_that("make_ard works with multiple row levels", { variable_level = "UNDIFFERENTIATED", variable_label = "UNDIFFERENTIATED", stat_name = "n", - stat = 0 + stat = 0, + stat_string = "0" ), tolerance = 10e-6 ) @@ -403,7 +406,8 @@ test_that("make_ard works with multiple column levels", { variable_level = "Mean", variable_label = "Mean", stat_name = "mean", - stat = 34.4 + stat = 34.4, + stat_string = "34.4" ), tolerance = 10e-6 ) @@ -441,7 +445,8 @@ test_that("make_ard works with summarize_row_groups", { variable_level = "S1", variable_label = "S1", stat_name = "n", - stat = 18 + stat = 18, + stat_string = "18" ), tolerance = 10e-6 ) @@ -474,7 +479,8 @@ test_that("make_ard works with summarize_row_groups", { variable_level = "A: Drug X", variable_label = "A: Drug X", stat_name = "n", - stat = 18 + stat = 18, + stat_string = "18" ), tolerance = 10e-6 ) @@ -498,3 +504,62 @@ test_that("make_ard works if there are no stat_names", { expect_equal(as_result_df(tbl, make_ard = TRUE)$stat_name, rep(NA_character_, 4)) }) + +test_that("make_ard works if string precision is needed", { + lyt <- basic_table() %>% + split_rows_by("ARM") %>% + summarize_row_groups() %>% + split_cols_by("ARM") %>% + split_cols_by("STRATA1") %>% + analyze(c("AGE", "SEX")) + + tbl <- build_table(lyt, ex_adsl) + + # Some edge cases + expect_equal( + as_result_df(tbl[, 1], make_ard = TRUE) %>% dim(), + c(21, 12) + ) + expect_equal( + as_result_df(tbl[1, ], make_ard = TRUE) %>% dim(), + c(18, 12) + ) + + # One result + test_out <- as_result_df(tbl[, 1][1, ], make_ard = TRUE) + expect_equal(test_out$stat_name, c("n", "p")) + expect_equal(test_out$stat, c(38, 1)) + expect_equal(test_out$stat_string, c("38", "100")) +}) + +test_that("make_ard works with split_cols_by_multivar", { + # Regression test #970 + n <- 400 + + df <- tibble( + arm = factor(sample(c("Arm A", "Arm B"), n, replace = TRUE), levels = c("Arm A", "Arm B")), + country = factor(sample(c("CAN", "USA"), n, replace = TRUE, prob = c(.55, .45)), levels = c("CAN", "USA")), + gender = factor(sample(c("Female", "Male"), n, replace = TRUE), levels = c("Female", "Male")), + handed = factor(sample(c("Left", "Right"), n, prob = c(.6, .4), replace = TRUE), levels = c("Left", "Right")), + age = rchisq(n, 30) + 10 + ) %>% mutate( + weight = 35 * rnorm(n, sd = .5) + ifelse(gender == "Female", 140, 180) + ) + + colfuns <- list( + function(x) in_rows(mean = mean(x), .formats = "xx.x"), + function(x) in_rows("# x > 5" = sum(x > .5), .formats = "xx") + ) + + lyt <- basic_table() %>% + split_cols_by("arm") %>% + split_cols_by_multivar(c("age", "weight")) %>% + split_rows_by("country") %>% + summarize_row_groups() %>% + analyze_colvars(afun = colfuns) + + tbl <- build_table(lyt, df) + + expect_silent(out <- as_result_df(tbl, make_ard = TRUE)) + expect_true(all(out$group3 == "multivar_split1")) +})