R/AuxFunctions.R

Defines functions growtreeMut findsplitMut growtreeExp findsplitExp getsumic50v3 getSplit getTreatment

Documented in findsplitExp findsplitMut getSplit getsumic50v3 getTreatment growtreeExp growtreeMut

#' ODT Internal Functions
#'
#' Internal functions used by `trainTree` in the various
#' steps of the algorithm. These functions are not intended for
#' direct use by end users and are primarily for internal logic
#' and calculations within the package.
#'
#' @keywords internal
#' @name InternalFunctions
#' @return A list of internal outputs generated by these functions.
#'
#' @import matrixStats
#' @import partykit
#'
NULL

#' @rdname InternalFunctions
#' @title Get Optimal Treatment
#' @name getTreatment
#' @description This internal function calculates the optimal treatment based on patient sensitivity data.
#' @param PatientSensitivity A matrix representing drug response values (e.g., IC50), where rows correspond to patients/samples and columns correspond to drugs.
#' @param weights A numeric vector indicating which samples are considered; only samples with weights equal to 1 are included in the analysis.
#' @return An integer indicating the index of the optimal treatment based on the minimum sum of sensitivity values.
#'
#' @keywords internal
getTreatment <- function(PatientSensitivity, weights) {
  # Filter PatientSensitivity based on weights, retaining only samples marked with weight 1
  PatientSensitivity <- PatientSensitivity[weights == 1, , drop = FALSE]
  
  # Calculate the sum of treatment responses across samples
  sumtreat <- colSums2(PatientSensitivity)
  
  # Identify the index of the treatment with the minimum sum of responses
  biomk <- which.min(sumtreat)
  names(biomk) <- colnames(PatientSensitivity)[biomk]  # Assign name to the optimal treatment index
  
  return(biomk)  # Return the index of the optimal treatment
}

#' @rdname InternalFunctions
#' @title Get Split Information
#' @description This internal function calculates the optimal split point based on a specified gene in the expression data.
#' @param gene The index of the gene used for splitting the data.
#' @param PatientData A matrix representing patient features, where rows correspond to patients/samples and columns correspond to genes/features.
#' @param PatientSensitivity A matrix representing drug response values (e.g., IC50), where rows correspond to patients/samples and columns correspond to drugs.
#' @return A list containing:
#'   - `sumic50`: The minimum summed IC50 values for the two groups.
#'   - `Treatment1`: The treatment associated with the first group.
#'   - `Treatment2`: The treatment associated with the second group.
#'   - `split`: The index of the optimal split point.
#'   - `expressionSplit`: The expression value at the split point adjusted by a small epsilon.
#'
#' @keywords internal
getSplit <- function(gene, PatientData, PatientSensitivity) {
  # Order samples based on the specified gene for ascending and descending order
  I <- order(PatientData[, gene])
  Id <- order(PatientData[, gene], decreasing = TRUE)
  
  # Calculate cumulative sums of PatientSensitivity for both orderings
  M1 <- rbind(0, colCumsums(PatientSensitivity[I, ]))
  M2 <- rbind(0, colCumsums(PatientSensitivity[Id, ]))
  M2 <- M2[c(nrow(M1):1), , drop = FALSE]
  
  # Determine the optimal split point by minimizing the sum of row minimums
  th <- which.min(rowMins(M1) + rowMins(M2))
  sumic50 <- min(rowMins(M1) + rowMins(M2))
  
  # Determine the treatments for the two groups based on the split point
  Treatment1 = max.col(-M1)[th]
  names(Treatment1) = colnames(PatientSensitivity)[Treatment1]
  Treatment2 = max.col(-M2)[th]
  names(Treatment2) = colnames(PatientSensitivity)[Treatment2]
  
  # Return a list with the relevant split information
  return(list(
    sumic50 = sumic50,
    Treatment1 = Treatment1,
    Treatment2 = Treatment2,
    split = th,
    expressionSplit = PatientData[I[th], gene] - 1e-6
  ))  # Adjust expression value slightly
}

#' @rdname InternalFunctions
#' @title Calculate Sum IC50 Value for Gene Patient Response
#' @description This internal function calculates the summed IC50 value based on the response of patients to a specific gene.
#' @param genePatientResponse A numeric vector representing the response of patients to a particular gene.
#' @param PatientSensitivity A matrix representing drug response values (e.g., IC50), where rows correspond to patients/samples and columns correspond to drugs.
#' @return A numeric value representing the minimum summed IC50 value derived from the patient responses.
#'
#' @keywords internal
getsumic50v3 <- function(genePatientResponse, PatientSensitivity) {
  # Order the patient responses based on the gene response
  I <- order(genePatientResponse)  # Ascending order
  Id <- I[length(I):1]  # Descending order
  
  # Calculate cumulative sums for both orderings
  M1 <- colCumsums(PatientSensitivity, rows = I)
  M2 <- colCumsums(PatientSensitivity, rows = Id)
  
  # Combine row minimums from both cumulative sums to find the summed IC50
  dummy <- c(0, rowMins(M1)) + c(rowMins(M2, rows = (nrow(M2):1)), 0)
  sumic50 <- min(dummy)  # Find the minimum summed IC50 value
  
  return(sumic50)  # Return the minimum summed IC50 value
}

#' @rdname InternalFunctions
#' @title Find Optimal Split for Expression Data
#' @description This internal function identifies the optimal split point in expression data based on patient sensitivity (IC50).
#' @param PatientSensitivity A matrix representing drug response values (e.g., IC50), where rows correspond to patients/samples and columns correspond to drugs.
#' @param PatientData A matrix representing patient features, where rows correspond to patients/samples and columns correspond to genes/features.
#' @param minimum An integer specifying the minimum number of samples required for a valid split (default is 1).
#' @param weights A numeric vector indicating which samples are under study; defaults to NULL, meaning all samples are considered.
#' @param verbose A logical value indicating whether to print additional information during execution (default is FALSE).
#' @return A `partysplit` object representing the optimal split based on the specified patient sensitivity and expression data.
#'
#' @keywords internal
findsplitExp <- function(PatientSensitivity, PatientData, minimum = 1, weights = NULL, verbose = FALSE) {
  # PatientSensitivity: IC50
  # PatientData: gene expression data
  if (verbose)
    print("Split!")
  
  # Apply weights if provided
  if (!is.null(weights)) {
    PatientSensitivity <- PatientSensitivity[weights == 1, ]
    PatientData <- PatientData[weights == 1, ]
  }
  
  # Check if PatientSensitivity is a matrix and has sufficient rows
  if (is.matrix(PatientSensitivity) == TRUE) {
    if (nrow(PatientSensitivity) <= minimum)
      return(NULL)
  } else {
    return(NULL)  # If not a matrix, return NULL
  }
  
  # Find the gene that minimizes the IC50 values
  Gene <- which.min(apply(PatientData, 2, getsumic50v3, PatientSensitivity)) # Find optimal split
  names(Gene) <- colnames(PatientData[, Gene]) # Assign gene name to the optimal gene index
  
  # Get the split information
  Output <- getSplit(Gene, PatientData, PatientSensitivity)
  
  # Return a party split object with the relevant information
  return(
    partysplit(
      varid = as.integer(Gene),
      breaks = Output$expressionSplit,
      index = c(1L, 2L),
      info = list(treatments = c(names(Output$Treatment1), names(Output$Treatment2)))
    )
  )
}

#' @rdname InternalFunctions
#' @title Grow Decision Tree for Expression Data
#' @description This internal function recursively builds a decision tree based on expression data and patient sensitivity (IC50).
#' @param id A unique identifier for the node in the decision tree (default is 1L).
#' @param PatientSensitivity A matrix representing drug response values (e.g., IC50), where rows correspond to patients/samples and columns correspond to drugs.
#' @param PatientData A matrix representing patient features, where rows correspond to patients/samples and columns correspond to genes/features.
#' @param minbucket An integer specifying the minimum number of samples required in a child node for further splitting (default is 10).
#' @param weights A numeric vector indicating which samples are under study; defaults to NULL, meaning all samples are considered.
#' @return A `partynode` object representing the current node and its children in the decision tree.
#'
#' @keywords internal
#'
growtreeExp <- function(id = 1L,
                        PatientSensitivity,
                        PatientData,
                        minbucket = 10,
                        weights = NULL) {
  # Initialize weights if not provided
  if (is.null(weights))
    weights <- rep(1L, nrow(PatientData))
  
  # Check if there are enough observations; if not, stop here and return a leaf node
  if (sum(weights) < minbucket)
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    )))
  
  # Find the best split using the findsplitExp function
  sp <- findsplitExp(PatientSensitivity, PatientData, minimum = minbucket, weights)
  
  # Convert PatientData to a data frame for easier handling
  datadf <- as.data.frame(PatientData)
  
  # No split found, stop here and return a leaf node
  if (is.null(sp)) {
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    )))
  }
  
  # Split the data based on the identified split
  kidids <- kidids_split(sp, data = datadf)
  
  # Check if all observations belong to the same child; if so, return a leaf node
  if (length(unique(kidids[weights == 1])) == 1) {
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    )))
  }
  
  # Check if the minimum number of samples in any child node is less than minbucket; if so, stop here
  if (min(table(kidids[weights == 1])) < minbucket) {
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    )))
  }
  
  # Set up all daughter nodes
  kids <- vector(mode = "list", length = max(kidids, na.rm = TRUE))
  for (kidid in 1:length(kids)) {
    # Select observations for the current node
    w <- weights
    w[kidids != kidid] <- 0
    
    # Get the next node ID
    if (kidid > 1) {
      myid <- max(nodeids(kids[[kidid - 1]]))
    } else {
      myid <- id
    }
    
    # Start recursion on this daughter node
    kids[[kidid]] <- growtreeExp(
      id = as.integer(myid + 1),
      PatientSensitivity,
      PatientData,
      weights = w,
      minbucket = minbucket
    )
  }
  
  # Return a party node object representing the current node and its children
  return(partynode(
    id = as.integer(id),
    split = sp,
    kids = kids,
    info = ""
  ))
}

#' @rdname InternalFunctions
#' @title Find Optimal Split in Mutation Data
#' @description This internal function identifies the optimal split point in a mutation matrix based on
#' patient sensitivity data (IC50) and the presence of specific mutations.
#' @param PatientSensitivity A matrix representing drug response values (e.g., IC50), where rows correspond to patients/samples and columns correspond to drugs.
#' @param X (PatientData) A matrix representing patient features, where rows correspond to patients/samples and columns correspond to genes/features.
#' @param minimum An integer specifying the minimum number of samples required for a valid split (default is 1).
#' @param weights A numeric vector indicating which samples are included in the analysis; only samples with weights equal to 1 are considered.
#' @return A `partysplit` object if a valid split is found, or `NULL` if no valid split can be determined.
#'
#' @keywords internal
#'
findsplitMut <- function(PatientSensitivity, X, minimum = 1, weights) {
  # Filter PatientSensitivity and mutation matrix X by weights
  PatientSensitivity <- PatientSensitivity[weights == 1, ]
  X <- X[weights == 1, ]
  
  # Check if PatientSensitivity is a matrix and has enough rows
  if (is.matrix(PatientSensitivity) == TRUE) {
    if (nrow(PatientSensitivity) <= minimum)
      return(NULL)  # Not enough samples for split
  } else {
    return(NULL)  # Return NULL if PatientSensitivity is not a matrix
  }
  
  # Identify the maximum value in the mutation matrix (mutation) and minimum value (wildtype)
  mut <- max(X)  # Mutation is the largest value of X
  WT <- min(X)   # Wildtype is the smallest value of X
  
  # Calculate treatment outcomes based on mutation presence
  tA <- t(PatientSensitivity) %*% (X == mut)  # Treatment with mutation
  tB <- t(PatientSensitivity) %*% (X == WT)   # Treatment for wildtype
  
  #   tA Sum of the IC50 of the mutated samples treated with each treatment
  #   tB Sum of the IC50 of the wildtype samples treated with each treatment
  
  # The biomarker mutation
  optimal <- colMins(tA) + colMins(tB) # Optimal selection of treatments for each mutation
  biomk <- which.min(optimal) # Identify the optimal mutation
  names(biomk) <- colnames(tA)[biomk] # Assign name to the optimal mutation
  
  # If both mutated and WT samples return the same treatment, don't split
  if (length(unique(X[, biomk])) == 1)
    return(NULL)
  # Ensure there are patients with and without the mutation to justify a split
  
  # Identify the treatments for the mutated and wildtype samples
  Treatment1Mut <- which.min(tA[, biomk]) # Treatment for mutated samples
  Treatment1WT <- which.min(tB[, biomk]) # Treatment for wildtype samples
  
  # Check if the same treatment is suggested for both groups; if so, don't split
  if (Treatment1Mut == Treatment1WT)
    return(NULL) # Ensure that the treatments differ
  
  # Create an index for output with names indicating mutation and wildtype
  index = c(1L, 2L)
  names(index) <- c(paste0(names(biomk), "mut"), paste0(names(biomk), "WT"))
  
  # Return a party split object with the variable ID, index, and treatment information
  return(partysplit(
    varid = as.integer(biomk),
    index = index,
    info = list(treatments = c(names(Treatment1Mut), names(Treatment1WT)))
  ))
}

#' @rdname InternalFunctions
#' @title Grow Decision Tree for Mutation Data
#' @description This internal function recursively builds a decision tree based on patient data and
#' drug sensitivity responses, specifically designed for mutation data.
#' @param id A unique identifier for the node in the decision tree (default is 1L).
#' @param PatientSensitivity A matrix representing drug response values (e.g., IC50), where rows correspond to patients/samples and columns correspond to drugs.
#' @param PatientData A matrix representing patient features, where rows correspond to patients/samples and columns correspond to genes/features.
#' @param minbucket An integer specifying the minimum number of samples required in a child node for further splitting (default is 10).
#' @param weights A numeric vector indicating which samples are under study; defaults to NULL, meaning all samples are considered.
#' @param findsplit A function used to find the best split; defaults to `findsplitMut`.
#' @return A `partynode` object representing the current node and its children in the decision tree.
#'
#' @keywords internal
#'
growtreeMut <- function(id = 1L,
                        PatientSensitivity,
                        PatientData,
                        minbucket = 10,
                        weights = NULL,
                        findsplit = findsplitMut) {
  # nolint: line_length_linter.
  # Set default weights if none are provided
  if (is.null(weights)) {
    weights <- rep(1L, nrow(PatientData))  # Each observation gets equal weight
  }
  
  # Check if the total weight of observations is less than "minbucket"
  # If so, stop recursion and return a leaf node with treatment info
  if (sum(weights) < minbucket) {
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    ))) # nolint: line_length_linter.
  }
  
  # Find the best split using the provided function
  sp <- findsplitMut(PatientSensitivity, PatientData, minimum = minbucket, weights)
  
  # If no valid split is found, stop here and return a node with the current treatment info
  if (is.null(sp)) {
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    )))
  }
  
  # Convert PatientData to a data frame for processing
  datadf <- as.data.frame(PatientData)
  
  # Get the child node identifiers based on the split
  kidids <- kidids_split(sp, data = datadf)
  
  # If there's only one child node type, return a leaf node with treatment info
  if (length(unique(kidids)) == 1) {
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    )))
  }
  
  # If any of the splits result in fewer observations than minbucket, return the current node
  if (min(table(kidids[weights == 1, drop = FALSE])) < minbucket) {
    return(partynode(id = id, info = names(
      getTreatment(PatientSensitivity, weights)
    )))
  }
  
  # Initialize a list to hold the child nodes
  kids <- vector(mode = "list", length = max(kidids, na.rm = TRUE)) # na.rm=TRUE removes any missing values
  # Iterate through each child node
  for (kidid in 1:length(kids)) {
    # Select observations for the current child node
    w <- weights
    w[kidids != kidid] <- 0 # Set weights to 0 for observations not selected in this split
    
    # Determine the next node ID
    if (kidid > 1) {
      myid <- max(nodeids(kids[[kidid - 1]])) # Get the last used ID from the previous child
    } else {
      myid <- id # For the first child, use the current node ID
    }
    
    # Recursively call growtreeMut to create the child node
    kids[[kidid]] <- growtreeMut(
      id = as.integer(myid + 1),
      PatientSensitivity,
      PatientData,
      weights = w,
      minbucket = minbucket,
      findsplit = findsplitMut
    )
  }
  
  # Return the current node with its children
  return(partynode(
    id = as.integer(id),
    split = sp,
    kids = kids,
    info = ""
  ))
}

Try the ODT package in your browser

Any scripts or data that you put into this service are public.

ODT documentation built on Oct. 18, 2024, 5:12 p.m.