# ConfoundReducer.R | v2024.05.01 (HAPPY MAY DAY; SUPPORT PALESTINE)
# Very much a work in progress.
# Simply cuts out excess covariates that fMRIPrep generates
# Note: confounds_list lets the user enter the names of desired columns manually, while confounds_search uses regex patterns to identify any column whose name matches. confounds_search should be set to NA if not in use.
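# For example, with the defaults below, confounds_search would keep fMRIPrep columns such as cosine00, t_comp_cor_00 through t_comp_cor_04, and a_comp_cor_00 through a_comp_cor_04 (illustrative fMRIPrep column names; only columns actually present in your confound files will be matched).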
ConfoundReducer <- function(PIDs,
dir = "/data/Uncertainty/data/deriv/pipeline_1/fmriprep",
runs = 2,
Components = c("Test", "Control"),
motion_censor = TRUE,
motion_censor_thresh = 0.9,
check_VIF = TRUE,
confounds_list = c("trans_x", "trans_y", "trans_z", "rot_x", "rot_y", "rot_z", # Head motion and it's derivatives
'trans_x_derivative1', 'trans_y_derivative1', 'trans_z_derivative1', 'rot_x_derivative1', 'rot_y_derivative1', 'rot_z_derivative1',
"framewise_displacement"),
confounds_search = c("^cosine*", # Adjusting for scanner and physiological related drifts (especially needed in long video studies)
"^t_comp_cor_0[0-4]$", # Time-related components to capture temporal physioogical confounds
"^a_comp_cor_0[0-4]$")) # Anatomical components to capture spatial physiological confounds
{
# Check whether this file path is valid.
if (!dir.exists(dir)){
  stop(paste0("The filepath you provided for dir (", dir, ") does not appear to exist. This directory should contain the confound files generated by fMRIPrep for each participant (i.e., the level just above the individual participant directories). Please respecify your filepath and try again."))
}
# Check whether runs is entered validly.
if (!is.numeric(runs) || length(runs) != 1 || is.na(runs) || runs <= 0){
  stop("runs should contain the number of runs your task contains. It must be a single numeric value greater than 0. Please respecify the number of runs and try again.")
}
# Check whether Components was entered validly.
if (is.null(Components) || any(is.na(Components)) || !all(Components %in% c("Test", "Control"))){
  stop("Components should be 'Control' and/or 'Test'. It cannot be left undefined or misspelled. Please enter something and try again.")
}
# I still need to come back and add QA checks for the motion censor and confound arguments
# If we want to check the VIF to apply a PCA and avoid collinearity
if (check_VIF == TRUE){
  # Creating a subfunction to check collinearity
  VIF_Checker <- function(data){
    # Dependencies (install car if it is not already installed)
    if (!requireNamespace("car", quietly = TRUE)){
      install.packages("car")
    }
    # Create a simulated outcome variable
    data$outcome <- rnorm(nrow(data))
    # Creating a dummy model to test VIF
    model <- lm(outcome ~ ., data = data)
    # Capturing vif values
    vif_values <- car::vif(model)
    # Remove the dummy outcome variable so it is never written to the output
    data <- data[,-which(names(data) == "outcome")]
    # If any predictor shows problematic collinearity (VIF > 5)
    if (any(vif_values > 5)){
      # Scale all the data
      data <- scale(data)
      # Perform PCA and keep the component scores as a dataframe
      data <- as.data.frame(prcomp(data, center = TRUE, scale. = TRUE)$x)
    }
    return(data)
  }
}
# Creating a subfunction to make this all smoother
Reducer <- function(file = filename, confound_list = confounds_list, confound_search = confounds_search)
{
# Check whether this file exists, and if it doesn't, print an error and give up on this file.
if (!file.exists(file)){
  print(paste0("Error: ", PID, "'s confound file could not be located: ", file))
  return(NULL)
}
# If it does exist . . .
if (file.exists(file)){
  # Check if the file is empty, and if it is, give up on this file as well
  if (file.info(file)$size <= 0){
    print(paste0("Error: ", PID, "'s confound file does not contain data: ", file))
    return(NULL)
  }
  # And if the file isn't empty . . .
  if (file.info(file)$size > 0){
# Read in the file as a dataframe
df <- read.table(file = file,
sep = '\t',
header = T,
na.strings = c("","NA","n/a"))
# If a confound search was entered
# NOTE that this is a quick and dirty solution, which should be switched out later.
if (!is.na(confound_search[1])){
# Create an empty array
search_results <- NULL
# Iterate through all of the items in the search list
for (ITEM in confound_search){
# Save all column names matching the pattern into the array
search_results <- c(search_results,
names(df)[grep(x = names(df), pattern = ITEM)])
}
# and combine them with the list
confounds <- c(confound_list, search_results)
}
# If a confound search was not entered
if (is.na(confound_search[1])){
# define the confound list as the only confounds to keep
confounds <- confound_list
}
# Subset the desired columns
df <- subset(df, select = confounds)
# Check if censoring should occur
if (motion_censor == TRUE){
# Tracking how many variables are present
cols <- names(df)
# Iterate through each observation in the dataframe
for (OBS in 2:nrow(df)){
# If that observation isn't an NA
if (!is.na(df$framewise_displacement[OBS])){
# If that observation has a FWD value greater than the threshold ...
if (df$framewise_displacement[OBS] > motion_censor_thresh){
# Create a new column of zeroes
df[,ncol(df) + 1] <- 0
# Add a one for this specific observation in that column
df[OBS, ncol(df)] <- 1
}
}
}
# If any of the observations were greater than our threshold
if (length(which(df$framewise_displacement > motion_censor_thresh)) > 0){
# If we have more than 9 motion outliers
if (length(which(df$framewise_displacement > motion_censor_thresh)) > 9){
# Assigning variable headers
names(df) <- c(cols,
paste0("motion_outlier0", 1:9),
paste0("motion_outlier", 10:length(which(df$framewise_displacement > motion_censor_thresh))))
}
# If we have less than 10 motion outliers
if (length(which(df$framewise_displacement > motion_censor_thresh)) < 10){
# Assigning variable headers
names(df) <- c(cols,
paste0("motion_outlier0", 1:length(which(df$framewise_displacement > motion_censor_thresh))))
}
}
}
# If we want to check the VIF to apply a PCA and avoid collinearity
if (check_VIF == TRUE){
# Check for multicollinearity and perform PCA if necessary
df <- VIF_Checker(data = df)
}
}
}
return(df)
}
# Dependencies
library(assertthat)
library(dplyr)
library(stringr)
# Iterating through each of the participants
for (PID in PIDs){
# Iterate through the components
for (COMPONENT in Components){
# If we're looking at a control component
if (COMPONENT == "Control"){
  # Start with no control file identified
  filename <- NA
  control <- NA
  # If the luma file exists
  if (file.exists(paste0(dir, "/sub-", PID, "/func/sub-", PID, "_task-luma_desc-confounds_timeseries.tsv"))){
    # Create a variable to capture the file name for this run
    filename <- paste0(dir, "/sub-", PID, "/func/sub-", PID, "_task-luma_desc-confounds_timeseries.tsv")
    # Note what kind of control we have here
    control <- "luma"
  }
  # If the fish file exists
  if (file.exists(paste0(dir, "/sub-", PID, "/func/sub-", PID, "_task-fish_desc-confounds_timeseries.tsv"))){
    # Create a variable to capture the file name for this run
    filename <- paste0(dir, "/sub-", PID, "/func/sub-", PID, "_task-fish_desc-confounds_timeseries.tsv")
    # Note what kind of control we have here
    control <- "fish"
  }
  # If neither control file was found, note it and move on
  if (is.na(filename)){
    print(paste("Error:", PID, "has no luma or fish confound file in", paste0(dir, "/sub-", PID, "/func")))
  }
  # Otherwise, run the confound reducer subfunction
  if (!is.na(filename)){
    df <- Reducer(file = filename)
    # And then save that dataframe as a new tab-delimited file alongside the original confounds file
    if (!is.null(df)){
      write.table(df,
                  file = paste0(dir, "/sub-", PID, "/func/sub-", PID, "_task-", control, "_desc-confounds_timeseries_reduced.tsv"),
                  sep = "\t",
                  row.names = FALSE,
                  col.names = TRUE)
    }
  }
}
# If we're looking at a test component
if (COMPONENT == "Test"){
  # Iterate through the runs in the study
  for (Run in paste0("run-", 1:runs)){
    # Create a variable to capture the file name for this run
    filename <- paste0(dir, "/sub-", PID, "/func/sub-", PID, "_task-uncertainty_", Run, "_desc-confounds_timeseries.tsv")
    # Run the confound reducer subfunction
    df <- Reducer(file = filename)
    # And then save that dataframe as a new tab-delimited file alongside the original confounds file
    if (!is.null(df)){
      write.table(df,
                  file = paste0(dir, "/sub-", PID, "/func/sub-", PID, "_task-uncertainty_", Run, "_desc-confounds_timeseries_reduced.tsv"),
                  sep = "\t",
                  row.names = FALSE,
                  col.names = TRUE)
    }
  }
}
}
}
}
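# Example usage (a minimal, commented-out sketch; the participant IDs below are hypothetical and
# should be replaced with the IDs from your own study, as should dir and runs if they differ from
# the defaults above):
# ConfoundReducer(PIDs = c("001", "002"),
#                 dir = "/data/Uncertainty/data/deriv/pipeline_1/fmriprep",
#                 runs = 2,
#                 Components = c("Test", "Control"),
#                 motion_censor = TRUE,
#                 motion_censor_thresh = 0.9,
#                 check_VIF = TRUE)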