add precision column stat_string and different table names (#986)

Fix #977 #976 #970 --------- Signed-off-by: Davide Garolini <[email protected]> Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Emily de la Rua <[email protected]>
insightsengineering · Jan 27, 2025 · 088d4bb · 088d4bb
1 parent 44b7eef
commit 088d4bb
Show file tree

Hide file tree

Showing 4 changed files with 192 additions and 42 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,12 @@
 ## rtables 0.6.11.9001
 
+### New Features
+ * Added `stat_string` to `as_result_df(make_ard = TRUE)` to preserve the original string representation of the statistics.
+ * Added `add_tbl_name_split` to `as_result_df()` to handle split levels constituted by different table names.
+
+### Bug Fixes
+ * Fixed issue with `split_cols_by_multivar()` when having more than one value. Now `as_result_df(make_ard = TRUE)` adds a predefined split name for each of the `multivar` splits.
+
 ## rtables 0.6.11
 
 ### New Features

diff --git a/R/tt_as_df.R b/R/tt_as_df.R
@@ -17,6 +17,9 @@
 #' @param simplify (`flag`)\cr when `TRUE`, the result data frame will have only visible labels and
 #'   result columns. Consider showing also label rows with `keep_label_rows = TRUE`. This output can be
 #'   used again to create a `TableTree` object with [df_to_tt()].
+#' @param add_tbl_name_split (`flag`)\cr when `TRUE` and when the table has more than one
+#'   `analyze(table_names = "<diff_names>")`, the table names will be present as a group split named
+#'   `"<analysis_spl_tbl_name>"`.
 #' @param ... additional arguments passed to spec-specific result data frame function (`spec`).
 #'
 #' @return
@@ -41,6 +44,7 @@ as_result_df <- function(tt, spec = NULL,
                          make_ard = FALSE,
                          expand_colnames = FALSE,
                          keep_label_rows = FALSE,
+                         add_tbl_name_split = FALSE,
                          simplify = FALSE,
                          ...) {
   data_format <- data_format[[1]]
@@ -51,6 +55,7 @@ as_result_df <- function(tt, spec = NULL,
   checkmate::assert_flag(expand_colnames)
   checkmate::assert_flag(keep_label_rows)
   checkmate::assert_flag(simplify)
+  checkmate::assert_flag(add_tbl_name_split)
 
   if (nrow(tt) == 0) {
     return(sanitize_table_struct(tt))
@@ -70,24 +75,31 @@ as_result_df <- function(tt, spec = NULL,
     if (data_format %in% c("strings", "numeric")) {
       # we keep previous calculations to check the format of the data
       mf_tt <- matrix_form(tt)
-      mf_result_chars <- mf_strings(mf_tt)[-seq_len(mf_nlheader(mf_tt)), -1]
+      mf_result_chars <- mf_strings(mf_tt)[-seq_len(mf_nlheader(mf_tt)), -1, drop = FALSE]
       mf_result_chars <- .remove_empty_elements(mf_result_chars)
-      mf_result_numeric <- as.data.frame(
-        .make_numeric_char_mf(mf_result_chars)
-      )
+      mf_result_numeric <- .make_numeric_char_mf(mf_result_chars)
       mf_result_chars <- as.data.frame(mf_result_chars)
+      mf_result_numeric <- as.data.frame(mf_result_numeric)
       if (!setequal(dim(mf_result_numeric), dim(cellvals)) || !setequal(dim(mf_result_chars), dim(cellvals))) {
         stop(
           "The extracted numeric data.frame does not have the same dimension of the",
           " cell values extracted with cell_values(). This is a bug. Please report it."
         ) # nocov
       }
+
+      colnames(mf_result_chars) <- colnames(cellvals)
+      colnames(mf_result_numeric) <- colnames(cellvals)
       if (data_format == "strings") {
-        colnames(mf_result_chars) <- colnames(cellvals)
         cellvals <- mf_result_chars
-      } else {
-        colnames(mf_result_numeric) <- colnames(cellvals)
-        cellvals <- mf_result_numeric
+        if (isTRUE(make_ard)) {
+          stop("make_ard = TRUE is not compatible with data_format = 'strings'")
+        }
+      } else if (data_format == "numeric") {
+        if (isTRUE(make_ard)) {
+          cellvals <- .convert_to_character(mf_result_numeric)
+        } else {
+          cellvals <- mf_result_numeric
+        }
       }
     }
 
@@ -112,7 +124,7 @@ as_result_df <- function(tt, spec = NULL,
       lapply(
         seq_len(NROW(df)),
         function(ii) {
-          handle_rdf_row(df[ii, ], maxlen = maxlen)
+          handle_rdf_row(df[ii, ], maxlen = maxlen, add_tbl_name_split = add_tbl_name_split)
         }
       )
     )
@@ -219,9 +231,23 @@ as_result_df <- function(tt, spec = NULL,
       for (col_i in only_col_indexes) {
         # Making row splits into row specifications (group1 group1_level)
         current_col_split_level <- unlist(ret_tmp[seq_len(number_of_col_splits), col_i], use.names = FALSE)
-        flattened_cols_names <- .c_alternated(column_split_names[[1]][[1]], current_col_split_level)
+        col_split_names <- column_split_names[[1]][[1]] # cross section of the column split names (not values)
+        more_than_one_name_in_csn <- sapply(col_split_names, length) > 1
+
+        # Correction for cases where there is split_cols_by_multivar
+        if (any(more_than_one_name_in_csn)) {
+          col_split_names[more_than_one_name_in_csn] <- lapply(
+            seq(sum(more_than_one_name_in_csn)),
+            function(i) {
+              paste0("multivar_split", i)
+            }
+          )
+        }
+
+        # Alternated association of split names and values (along with group split)
+        flattened_cols_names <- .c_alternated(col_split_names, current_col_split_level)
         names(flattened_cols_names) <- .c_alternated(
-          paste0("group", seq_along(column_split_names[[1]][[1]]) + n_row_groups),
+          paste0("group", seq_along(col_split_names) + n_row_groups),
           paste0("group", seq_along(current_col_split_level) + n_row_groups, "_level")
         )
 
@@ -243,12 +269,12 @@ as_result_df <- function(tt, spec = NULL,
         # retrieving stat names and stats
         stat_name <- setNames(cell_stat_names[, col_i - min(only_col_indexes) + 1, drop = TRUE], NULL)
         stat <- setNames(ret_tmp[!col_label_rows, col_i, drop = TRUE], NULL)
-        necessary_stat_lengths <- sapply(stat, length)
+        necessary_stat_lengths <- lapply(stat, length)
         stat[sapply(stat, is.null)] <- NA
 
         # Truncating or adding NA if stat names has more or less elements than stats
         stat_name <- lapply(seq_along(stat_name), function(sn_i) {
-          stat_name[[sn_i]][seq_len(necessary_stat_lengths[sn_i])]
+          unlist(stat_name[[sn_i]], use.names = FALSE)[seq_len(necessary_stat_lengths[[sn_i]])]
         })
         stat_name[!nzchar(stat_name)] <- NA
 
@@ -269,6 +295,15 @@ as_result_df <- function(tt, spec = NULL,
         ret_w_cols <- rbind(ret_w_cols, tmp_ret_by_col_i)
       }
 
+      # If already_done is not present, we need to call the function again to keep precision
+      if (!"already_done" %in% names(list(...))) {
+        stat_string_ret <- as_result_df(
+          tt = tt, spec = spec, data_format = "numeric",
+          make_ard = TRUE, already_done = TRUE, ...
+        )
+        ret_w_cols <- cbind(ret_w_cols, "stat_string" = stat_string_ret$stat)
+      }
+
       ret <- ret_w_cols
     }
 
@@ -290,22 +325,25 @@ as_result_df <- function(tt, spec = NULL,
 }
 
 # Helper function used to structure the raw values into a dataframe
-.make_df_from_raw_data <- function(raw_vals, nr, nc) {
+.make_df_from_raw_data <- function(rawvals, nr, nc) {
   ## if the table has one row and multiple columns, sometimes the cell values returns a list of the cell values
   ## rather than a list of length 1 representing the single row. This is bad but may not be changeable
   ## at this point.
-  if (nr == 1 && length(raw_vals) > 1) {
-    raw_vals <- list(raw_vals)
+  if (nr == 1 && length(rawvals) > 1) {
+    rawvals <- list(rawvals)
   }
 
   # Flatten the list of lists (rows) of cell values into a data frame
-  cellvals <- as.data.frame(do.call(rbind, raw_vals))
-  row.names(cellvals) <- NULL
+  cellvals <- as.data.frame(do.call(rbind, rawvals))
 
   if (nr == 1 && nc == 1) {
-    colnames(cellvals) <- names(raw_vals)
+    if (length(unlist(rawvals)) > 1) { # This happens only with nr = nc = 1 for raw values
+      cellvals <- as.data.frame(I(rawvals))
+    }
+    colnames(cellvals) <- names(rawvals)
   }
 
+  row.names(cellvals) <- NULL
   cellvals
 }
 
@@ -358,21 +396,30 @@ as_result_df <- function(tt, spec = NULL,
     return(char_df[nzchar(char_df, keepNA = TRUE)])
   }
 
-  apply(char_df, 2, function(col_i) col_i[nzchar(col_i, keepNA = TRUE)])
+  ret <- apply(char_df, 2, function(col_i) col_i[nzchar(col_i, keepNA = TRUE)], simplify = FALSE)
+  do.call(cbind, ret)
 }
 
 # Helper function to make the character matrix numeric
 .make_numeric_char_mf <- function(char_df) {
   if (is.null(dim(char_df))) {
-    return(as.numeric(stringi::stri_extract_all(char_df, regex = "\\d+.\\d+|\\d+")))
+    ret <- lapply(char_df[[1]], function(x) {
+      as.numeric(stringi::stri_extract_all(x, regex = "\\d+.\\d+|\\d+")[[1]])
+    }) # keeps the list (single element) for data.frame
+    return(I(ret))
   }
 
   ret <- apply(char_df, 2, function(col_i) {
-    lapply(
+    out <- lapply(
       stringi::stri_extract_all(col_i, regex = "\\d+.\\d+|\\d+"),
       as.numeric
     )
-  })
+    if (all(dim(char_df) == c(1, 1)) && is.list(out[[1]])) {
+      return(unlist(out, use.names = FALSE))
+    } else {
+      return(out)
+    }
+  }, simplify = FALSE)
 
   do.call(cbind, ret)
 }
@@ -386,7 +433,7 @@ make_result_df_md_colnames <- function(maxlen) {
   ret <- c(ret, c("avar_name", "row_name", "label_name", "row_num", "is_group_summary", "node_class"))
 }
 
-do_label_row <- function(rdfrow, maxlen) {
+do_label_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) {
   pth <- rdfrow$path[[1]]
   # Adjusting for the fact that we have two columns for each split
   extra_nas_from_splits <- floor((maxlen - length(pth)) / 2) * 2
@@ -395,8 +442,13 @@ do_label_row <- function(rdfrow, maxlen) {
   if (length(pth) %% 2 == 1) {
     extra_nas_from_splits <- extra_nas_from_splits + 1
   } else {
-    pth <- c("<analysis_spl_tbl_name>", pth)
-    extra_nas_from_splits <- extra_nas_from_splits - 1
+    if (isTRUE(add_tbl_name_split)) {
+      pth <- c("<analysis_spl_tbl_name>", pth)
+      extra_nas_from_splits <- extra_nas_from_splits - 1
+    } else {
+      pth <- pth[-1]
+      extra_nas_from_splits <- extra_nas_from_splits + 1
+    }
   }
 
   c(
@@ -431,14 +483,18 @@ do_content_row <- function(rdfrow, maxlen) {
   )
 }
 
-do_data_row <- function(rdfrow, maxlen) {
+do_data_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) {
   pth <- rdfrow$path[[1]]
   pthlen <- length(pth)
   ## odd means we have a multi-analsysis step in the path, we do not want this in the result
   if (pthlen %% 2 == 1 && pthlen > 1) {
     # we remove the last element, as it is a fake split (tbl_name from analyse)
     # pth <- pth[-1 * (pthlen - 2)]
-    pth <- c("<analysis_spl_tbl_name>", pth)
+    if (isTRUE(add_tbl_name_split)) {
+      pth <- c("<analysis_spl_tbl_name>", pth)
+    } else {
+      pth <- pth[-1]
+    }
   }
   pthlen_new <- length(pth)
   if (pthlen_new == 1) {
@@ -483,13 +539,13 @@ do_data_row <- function(rdfrow, maxlen) {
   path
 }
 
-handle_rdf_row <- function(rdfrow, maxlen) {
+handle_rdf_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) {
   nclass <- rdfrow$node_class
 
   ret <- switch(nclass,
-    LabelRow = do_label_row(rdfrow, maxlen),
+    LabelRow = do_label_row(rdfrow, maxlen, add_tbl_name_split = add_tbl_name_split),
     ContentRow = do_content_row(rdfrow, maxlen),
-    DataRow = do_data_row(rdfrow, maxlen),
+    DataRow = do_data_row(rdfrow, maxlen, add_tbl_name_split = add_tbl_name_split),
     stop("Unrecognized node type in row dataframe, unable to generate result data frame")
   )
   setNames(ret, make_result_df_md_colnames(maxlen))
@@ -510,6 +566,23 @@ handle_rdf_row <- function(rdfrow, maxlen) {
     return(ret)
   }
 }
+
+# Function to convert all elements to character while preserving structure
+.convert_to_character <- function(df) {
+  # Apply transformation to each column
+  df_converted <- lapply(df, function(col) {
+    if (is.list(col)) {
+      # For columns with vector cells, convert each vector to a character vector
+      I(lapply(col, as.character))
+    } else {
+      # For regular columns, directly convert to character
+      as.character(col)
+    }
+  })
+  # Return the transformed data frame
+  data.frame(df_converted, stringsAsFactors = FALSE)
+}
+
 # path_enriched_df ------------------------------------------------------------
 #
 #' @describeIn data.frame_export Transform a `TableTree` object to a path-enriched `data.frame`.

diff --git a/man/data.frame_export.Rd b/man/data.frame_export.Rd