Skip to content

Commit

Permalink
add precision column stat_string and different table names (#986)
Browse files Browse the repository at this point in the history
Fix #977 #976 #970

---------

Signed-off-by: Davide Garolini <[email protected]>
Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Emily de la Rua <[email protected]>
  • Loading branch information
3 people authored Jan 27, 2025
1 parent 44b7eef commit 088d4bb
Show file tree
Hide file tree
Showing 4 changed files with 192 additions and 42 deletions.
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
## rtables 0.6.11.9001

### New Features
* Added `stat_string` to `as_result_df(make_ard = TRUE)` to preserve the original string representation of the statistics.
* Added `add_tbl_name_split` to `as_result_df()` to handle split levels constituted by different table names.

### Bug Fixes
* Fixed issue with `split_cols_by_multivar()` when having more than one value. Now `as_result_df(make_ard = TRUE)` adds a predefined split name for each of the `multivar` splits.

## rtables 0.6.11

### New Features
Expand Down
135 changes: 104 additions & 31 deletions R/tt_as_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
#' @param simplify (`flag`)\cr when `TRUE`, the result data frame will have only visible labels and
#' result columns. Consider showing also label rows with `keep_label_rows = TRUE`. This output can be
#' used again to create a `TableTree` object with [df_to_tt()].
#' @param add_tbl_name_split (`flag`)\cr when `TRUE` and when the table has more than one
#' `analyze(table_names = "<diff_names>")`, the table names will be present as a group split named
#' `"<analysis_spl_tbl_name>"`.
#' @param ... additional arguments passed to spec-specific result data frame function (`spec`).
#'
#' @return
Expand All @@ -41,6 +44,7 @@ as_result_df <- function(tt, spec = NULL,
make_ard = FALSE,
expand_colnames = FALSE,
keep_label_rows = FALSE,
add_tbl_name_split = FALSE,
simplify = FALSE,
...) {
data_format <- data_format[[1]]
Expand All @@ -51,6 +55,7 @@ as_result_df <- function(tt, spec = NULL,
checkmate::assert_flag(expand_colnames)
checkmate::assert_flag(keep_label_rows)
checkmate::assert_flag(simplify)
checkmate::assert_flag(add_tbl_name_split)

if (nrow(tt) == 0) {
return(sanitize_table_struct(tt))
Expand All @@ -70,24 +75,31 @@ as_result_df <- function(tt, spec = NULL,
if (data_format %in% c("strings", "numeric")) {
# we keep previous calculations to check the format of the data
mf_tt <- matrix_form(tt)
mf_result_chars <- mf_strings(mf_tt)[-seq_len(mf_nlheader(mf_tt)), -1]
mf_result_chars <- mf_strings(mf_tt)[-seq_len(mf_nlheader(mf_tt)), -1, drop = FALSE]
mf_result_chars <- .remove_empty_elements(mf_result_chars)
mf_result_numeric <- as.data.frame(
.make_numeric_char_mf(mf_result_chars)
)
mf_result_numeric <- .make_numeric_char_mf(mf_result_chars)
mf_result_chars <- as.data.frame(mf_result_chars)
mf_result_numeric <- as.data.frame(mf_result_numeric)
if (!setequal(dim(mf_result_numeric), dim(cellvals)) || !setequal(dim(mf_result_chars), dim(cellvals))) {
stop(
"The extracted numeric data.frame does not have the same dimension of the",
" cell values extracted with cell_values(). This is a bug. Please report it."
) # nocov
}

colnames(mf_result_chars) <- colnames(cellvals)
colnames(mf_result_numeric) <- colnames(cellvals)
if (data_format == "strings") {
colnames(mf_result_chars) <- colnames(cellvals)
cellvals <- mf_result_chars
} else {
colnames(mf_result_numeric) <- colnames(cellvals)
cellvals <- mf_result_numeric
if (isTRUE(make_ard)) {
stop("make_ard = TRUE is not compatible with data_format = 'strings'")
}
} else if (data_format == "numeric") {
if (isTRUE(make_ard)) {
cellvals <- .convert_to_character(mf_result_numeric)
} else {
cellvals <- mf_result_numeric
}
}
}

Expand All @@ -112,7 +124,7 @@ as_result_df <- function(tt, spec = NULL,
lapply(
seq_len(NROW(df)),
function(ii) {
handle_rdf_row(df[ii, ], maxlen = maxlen)
handle_rdf_row(df[ii, ], maxlen = maxlen, add_tbl_name_split = add_tbl_name_split)
}
)
)
Expand Down Expand Up @@ -219,9 +231,23 @@ as_result_df <- function(tt, spec = NULL,
for (col_i in only_col_indexes) {
# Making row splits into row specifications (group1 group1_level)
current_col_split_level <- unlist(ret_tmp[seq_len(number_of_col_splits), col_i], use.names = FALSE)
flattened_cols_names <- .c_alternated(column_split_names[[1]][[1]], current_col_split_level)
col_split_names <- column_split_names[[1]][[1]] # cross section of the column split names (not values)
more_than_one_name_in_csn <- sapply(col_split_names, length) > 1

# Correction for cases where there is split_cols_by_multivar
if (any(more_than_one_name_in_csn)) {
col_split_names[more_than_one_name_in_csn] <- lapply(
seq(sum(more_than_one_name_in_csn)),
function(i) {
paste0("multivar_split", i)
}
)
}

# Alternated association of split names and values (along with group split)
flattened_cols_names <- .c_alternated(col_split_names, current_col_split_level)
names(flattened_cols_names) <- .c_alternated(
paste0("group", seq_along(column_split_names[[1]][[1]]) + n_row_groups),
paste0("group", seq_along(col_split_names) + n_row_groups),
paste0("group", seq_along(current_col_split_level) + n_row_groups, "_level")
)

Expand All @@ -243,12 +269,12 @@ as_result_df <- function(tt, spec = NULL,
# retrieving stat names and stats
stat_name <- setNames(cell_stat_names[, col_i - min(only_col_indexes) + 1, drop = TRUE], NULL)
stat <- setNames(ret_tmp[!col_label_rows, col_i, drop = TRUE], NULL)
necessary_stat_lengths <- sapply(stat, length)
necessary_stat_lengths <- lapply(stat, length)
stat[sapply(stat, is.null)] <- NA

# Truncating or adding NA if stat names has more or less elements than stats
stat_name <- lapply(seq_along(stat_name), function(sn_i) {
stat_name[[sn_i]][seq_len(necessary_stat_lengths[sn_i])]
unlist(stat_name[[sn_i]], use.names = FALSE)[seq_len(necessary_stat_lengths[[sn_i]])]
})
stat_name[!nzchar(stat_name)] <- NA

Expand All @@ -269,6 +295,15 @@ as_result_df <- function(tt, spec = NULL,
ret_w_cols <- rbind(ret_w_cols, tmp_ret_by_col_i)
}

# If already_done is not present, we need to call the function again to keep precision
if (!"already_done" %in% names(list(...))) {
stat_string_ret <- as_result_df(
tt = tt, spec = spec, data_format = "numeric",
make_ard = TRUE, already_done = TRUE, ...
)
ret_w_cols <- cbind(ret_w_cols, "stat_string" = stat_string_ret$stat)
}

ret <- ret_w_cols
}

Expand All @@ -290,22 +325,25 @@ as_result_df <- function(tt, spec = NULL,
}

# Helper function used to structure the raw values into a dataframe
.make_df_from_raw_data <- function(raw_vals, nr, nc) {
.make_df_from_raw_data <- function(rawvals, nr, nc) {
## if the table has one row and multiple columns, sometimes the cell values returns a list of the cell values
## rather than a list of length 1 representing the single row. This is bad but may not be changeable
## at this point.
if (nr == 1 && length(raw_vals) > 1) {
raw_vals <- list(raw_vals)
if (nr == 1 && length(rawvals) > 1) {
rawvals <- list(rawvals)
}

# Flatten the list of lists (rows) of cell values into a data frame
cellvals <- as.data.frame(do.call(rbind, raw_vals))
row.names(cellvals) <- NULL
cellvals <- as.data.frame(do.call(rbind, rawvals))

if (nr == 1 && nc == 1) {
colnames(cellvals) <- names(raw_vals)
if (length(unlist(rawvals)) > 1) { # This happens only with nr = nc = 1 for raw values
cellvals <- as.data.frame(I(rawvals))
}
colnames(cellvals) <- names(rawvals)
}

row.names(cellvals) <- NULL
cellvals
}

Expand Down Expand Up @@ -358,21 +396,30 @@ as_result_df <- function(tt, spec = NULL,
return(char_df[nzchar(char_df, keepNA = TRUE)])
}

apply(char_df, 2, function(col_i) col_i[nzchar(col_i, keepNA = TRUE)])
ret <- apply(char_df, 2, function(col_i) col_i[nzchar(col_i, keepNA = TRUE)], simplify = FALSE)
do.call(cbind, ret)
}

# Helper function to make the character matrix numeric
.make_numeric_char_mf <- function(char_df) {
if (is.null(dim(char_df))) {
return(as.numeric(stringi::stri_extract_all(char_df, regex = "\\d+.\\d+|\\d+")))
ret <- lapply(char_df[[1]], function(x) {
as.numeric(stringi::stri_extract_all(x, regex = "\\d+.\\d+|\\d+")[[1]])
}) # keeps the list (single element) for data.frame
return(I(ret))
}

ret <- apply(char_df, 2, function(col_i) {
lapply(
out <- lapply(
stringi::stri_extract_all(col_i, regex = "\\d+.\\d+|\\d+"),
as.numeric
)
})
if (all(dim(char_df) == c(1, 1)) && is.list(out[[1]])) {
return(unlist(out, use.names = FALSE))
} else {
return(out)
}
}, simplify = FALSE)

do.call(cbind, ret)
}
Expand All @@ -386,7 +433,7 @@ make_result_df_md_colnames <- function(maxlen) {
ret <- c(ret, c("avar_name", "row_name", "label_name", "row_num", "is_group_summary", "node_class"))
}

do_label_row <- function(rdfrow, maxlen) {
do_label_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) {
pth <- rdfrow$path[[1]]
# Adjusting for the fact that we have two columns for each split
extra_nas_from_splits <- floor((maxlen - length(pth)) / 2) * 2
Expand All @@ -395,8 +442,13 @@ do_label_row <- function(rdfrow, maxlen) {
if (length(pth) %% 2 == 1) {
extra_nas_from_splits <- extra_nas_from_splits + 1
} else {
pth <- c("<analysis_spl_tbl_name>", pth)
extra_nas_from_splits <- extra_nas_from_splits - 1
if (isTRUE(add_tbl_name_split)) {
pth <- c("<analysis_spl_tbl_name>", pth)
extra_nas_from_splits <- extra_nas_from_splits - 1
} else {
pth <- pth[-1]
extra_nas_from_splits <- extra_nas_from_splits + 1
}
}

c(
Expand Down Expand Up @@ -431,14 +483,18 @@ do_content_row <- function(rdfrow, maxlen) {
)
}

do_data_row <- function(rdfrow, maxlen) {
do_data_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) {
pth <- rdfrow$path[[1]]
pthlen <- length(pth)
## odd means we have a multi-analsysis step in the path, we do not want this in the result
if (pthlen %% 2 == 1 && pthlen > 1) {
# we remove the last element, as it is a fake split (tbl_name from analyse)
# pth <- pth[-1 * (pthlen - 2)]
pth <- c("<analysis_spl_tbl_name>", pth)
if (isTRUE(add_tbl_name_split)) {
pth <- c("<analysis_spl_tbl_name>", pth)
} else {
pth <- pth[-1]
}
}
pthlen_new <- length(pth)
if (pthlen_new == 1) {
Expand Down Expand Up @@ -483,13 +539,13 @@ do_data_row <- function(rdfrow, maxlen) {
path
}

handle_rdf_row <- function(rdfrow, maxlen) {
handle_rdf_row <- function(rdfrow, maxlen, add_tbl_name_split = FALSE) {
nclass <- rdfrow$node_class

ret <- switch(nclass,
LabelRow = do_label_row(rdfrow, maxlen),
LabelRow = do_label_row(rdfrow, maxlen, add_tbl_name_split = add_tbl_name_split),
ContentRow = do_content_row(rdfrow, maxlen),
DataRow = do_data_row(rdfrow, maxlen),
DataRow = do_data_row(rdfrow, maxlen, add_tbl_name_split = add_tbl_name_split),
stop("Unrecognized node type in row dataframe, unable to generate result data frame")
)
setNames(ret, make_result_df_md_colnames(maxlen))
Expand All @@ -510,6 +566,23 @@ handle_rdf_row <- function(rdfrow, maxlen) {
return(ret)
}
}

# Function to convert all elements to character while preserving structure
.convert_to_character <- function(df) {
# Apply transformation to each column
df_converted <- lapply(df, function(col) {
if (is.list(col)) {
# For columns with vector cells, convert each vector to a character vector
I(lapply(col, as.character))
} else {
# For regular columns, directly convert to character
as.character(col)
}
})
# Return the transformed data frame
data.frame(df_converted, stringsAsFactors = FALSE)
}

# path_enriched_df ------------------------------------------------------------
#
#' @describeIn data.frame_export Transform a `TableTree` object to a path-enriched `data.frame`.
Expand Down
5 changes: 5 additions & 0 deletions man/data.frame_export.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 088d4bb

Please sign in to comment.