R/get_siblings.R

Defines functions convert_sibling_names_to_indices split_sibling_names split_sibling_indices get_siblings get_elder elder_weighted_corr_score get_sorted_corrs_pairwise_features

Documented in convert_sibling_names_to_indices elder_weighted_corr_score get_elder get_siblings get_sorted_corrs_pairwise_features split_sibling_indices split_sibling_names

#' Correlations between each pairwise feature and y
#'
#' @param pairwise_feature_mat a matrix with scores from feature pair comparison (rows) for each sample (cols)
#' @param y vector with quantitative outcome variable
#'
#' @return a sorted list of pairwise features based on their pearson correlation with y
#' @export
#'
#' @examples
#' C <- 100  # represents samples
#' R <- 200 # represents features
#' y <- rnorm(C) # represents outcome variable
#' X <- matrix(rbeta(R*C, 2, 3), nrow = R)  # simulate data matrix
#' Is <- get_pairwise_rank_matrices(X)
#' pairwise_feature_mat <- make_feature_pair_score_matrix(Is)
#' sorted_corrs <- get_sorted_corrs_pairwise_features(pairwise_feature_mat, y)
#'
get_sorted_corrs_pairwise_features <- function(pairwise_feature_mat, y){
  # Return a sorted list of pairwise features based on their pearson correlation with y 
  sorted_corrs <- sort(apply(pairwise_feature_mat, 2, function(x){
    stats::cor(x,y)
  }), decreasing = T)
  
  return(sorted_corrs)
}


#' Calculate weighted corr score
#'
#' Calculates a weighted correlation score that over weights higher correlations. When calling elders, this allows me to prioritize features that have are a part of a smaller number of highly correlative (with y) pairwise features as opposed to features that are a part of a larger number of relative less correlative pairwise features.
#'
#' @param corr a float between 0 and 1 representing the absolute value of a correlation coefficient (r)
#' @param max_score the maximum weight given to a correlation. Given that this weighted scoring can produce exponentially increasing weighted scores, setting this to 100 (representing a correlation of ~0.99) is reasonable.
#'
#' @return a float between 0 and 100, that represents a mapping of corr to a weighted score that overweights higher correlation values
#' @export
#'
#' @examples
#' corr <- 0.05
#' weighted_score <- elder_weighted_corr_score(corr)
#' 
elder_weighted_corr_score <- function(corr, max_score = 100){
  return(min(abs(1/log(corr)), max_score))
}

#' Calculate weighted feature correlation score
#'
#' For every feature, this function calculates a feature correlation score that is dervied from the sum of all weighted correlations (see weighting function) between y and pairwise features that said feature is a component of.
#'
#' @param sorted_corrs a vector of floats between -1 and 1 representing the the correlation coeficient between a pairwise feature and the outcome variable y. Output by get_sorted_corrs_pairwise_features().
#'
#' @return a numeric vector of length one. The value is the correlation of the elder with y when all pairwise features containing elder are accounted for (after weighting). The name of this numeric vector is the index of the elder in X as a character vector (i.e. string)
#' @export
#'
#' @examples
#' C <- 100  # represents samples
#' R <- 200 # represents features
#' y <- rnorm(C) # represents outcome variable
#' X <- matrix(rbeta(R*C, 2, 3), nrow = R)  # simulate data matrix
#' Is <- get_pairwise_rank_matrices(X)
#' pairwise_feature_mat <- make_feature_pair_score_matrix(Is)
#' sorted_corrs <- get_sorted_corrs_pairwise_features(pairwise_feature_mat, y)
#' elder <- get_elder(sorted_corrs)
#' 
get_elder <- function(sorted_corrs){
  
  # refernced below repeatedly
  sorted_corr_names <- names(sorted_corrs)
  
  # init emplty vector in which each original feature (i.e. 1/2 of pairwise feature) will be initialized to a value of zero
  feature_vec <- c() 
  for (feature in unique(unlist(sapply(sorted_corr_names, strsplit, "_")))){
    feature_vec[feature] <- 0
  }
  
  # for every feature, tally a sum of all weighted correlations between y and pairwise features that said feature is a component of. Note, this correlation will be weighted first before being added to the running sum
  for (i in 1:length(sorted_corrs)){
    feature1_feature2 <- sorted_corr_names[i]
    corr <- abs(as.numeric(sorted_corrs[i]))
    feature1 <- stringr::str_split_fixed(feature1_feature2, "_", n = 2)[1]
    feature2 <- stringr::str_split_fixed(feature1_feature2, "_", n = 2)[2]
    feature_vec[feature1] <- feature_vec[feature1] + elder_weighted_corr_score(corr)
    feature_vec[feature2] <- feature_vec[feature2] + elder_weighted_corr_score(corr)
  }
  
  # sort feature_vec
  sorted_feature_vec <- sort(feature_vec, decreasing = T)
  
  return(sorted_feature_vec[1]) # top feature is the elder
}

#' Determine elder and siblings that make cluster
#' 
#' This function identifies the best elder with which to define the best cluster for regression prediction. Such a cluster is made of pairwise features, the component features of which are termed elder and sibling, where the elder has the highest weighted sum of correlations with y across all pairwise features that it is a component of. Siblings are then the features that make up these pairwise features with elder.
#'
#' @param elder character vector (i.e. string) of the elder index I am up to
#' @param cluster_corr_prop kTSCR hyperparameter that determines what proportion oan elders overall correlation weight I want to capture in the included current cluster (which is made of elder sibling pairs). Default value = 1, which represents all elder sibling pairs.
#' @param sorted_corrs a vector of floats between -1 and 1 representing the the correlation coeficient between a pairwise feature and the outcome variable y. Output by get_sorted_corrs_pairwise_features().
#' @param elder_corr a length numeric vector (single value with a name) representing the sum weighted total correlation for the elder feature.
#'
#' @return a list of elder sibling pairs that represent the sibling pairs with elder that comprise the top correlations with y.
#' @export
#'
#' @examples
#' C <- 100  # represents samples
#' R <- 200 # represents features
#' y <- stats::rnorm(C) # represents outcome variable
#' X <- matrix(rbeta(R*C, 2, 3), nrow = R)  # simulate data matrix
#' Is <- get_pairwise_rank_matrices(X)
#' pairwise_feature_mat <- make_feature_pair_score_matrix(Is)
#' sorted_corrs <- get_sorted_corrs_pairwise_features(pairwise_feature_mat, y)
#' elder_corr <- get_elder(sorted_corrs)
#' elder <- names(elder_corr)
#' siblings <- get_siblings(sorted_corrs, elder, elder_corr)
#' 
get_siblings <- function(sorted_corrs, elder, elder_corr, cluster_corr_prop = 1){
  
  # now get siblings
  elder_corr_so_far <- 0 # init at zero - will record a sum of how much of the elder corr value has been accounted for as every sibling pair is found
  siblings <- c() # stores sibling as I find them
  for (i in 1:length(sorted_corrs)){
    
    # BREAK if elder_corr_so_far / elder_corr >  cluster_corr_prop
    # this is where hyperparameter cluster_corr_prop exerts its effect - it controls the number of siblings based on proportion fo total weighted elder correlation that has been captured so far
    if (elder_corr_so_far / elder_corr >= cluster_corr_prop){ 
      break
    }
    feature1_feature2 <- names(sorted_corrs)[i]
    feature1 <- stringr::str_split_fixed(feature1_feature2, "_", n = 2)[1]
    feature2 <- stringr::str_split_fixed(feature1_feature2, "_", n = 2)[2]
    # reverse the feature1_feature2 order if the correlation is negative
    corr <- as.numeric(sorted_corrs[i])
    if (corr < 0){
      feature1_feature2 <- paste(feature2, feature1, sep = "_")
    }
      
   if (feature1 == elder | feature2 == elder){
      siblings <- append(siblings, feature1_feature2)
      # add corr value to elder_val_build_up
      elder_corr_so_far <- elder_corr_so_far + elder_weighted_corr_score(abs(corr))
    }
  }
  
  return(siblings)
  
}

#' Split pairwise features into numeric indices
#'
#' Convert character vector of pairwise features (i.e. siblings)  into numeric matrix of indices from the original feature matrix X
#'
#' @param siblings a vector of sibling names made of indices concatenated by underscore (is a string)
#'
#' @return returns siblings as a numeric vector of indices rather than the character vector version of said indices
#' @export
#'
#' @examples
#' siblings <- c("1_3", "20_5", "100_110", "101_50")
#' sibling_indices <- split_sibling_indices(siblings)
#' 
#' 
split_sibling_indices <- function(siblings){
  
  return(t(apply(do.call(rbind, strsplit(siblings, "_")), 1, as.numeric)))
}

#' Split pairwise features into character vector
#'
#' Convert character vector of concatenated pairwise features (i.e. siblings)  into a 'character vector' matrix of split names (i.e. original non-pairwise feature names)
#'
#' @param siblings a vector of sibling names (each of which is a character vector, i.e. string)
#'
#' @return returns siblings as a character vector of split names reflecting pairs of independent features from the original matrix X
#' @export
#'
#' @examples
#' siblings <- c("V1_V3", "V20_V5", "V100_V110", "V101_V50")
#' sibling_indices <- split_sibling_names(siblings)
#' 
#' 

split_sibling_names <- function(siblings){
  
  return(do.call(rbind, strsplit(siblings, "_")))
}

#' Convert sibling names to indices
#'
#' @param X input matrix
#' @param siblings character vector (as a matrix) of feature names
#'
#' @return character vector (as a matrix) of feature indices
#' @export
#'
#' @examples
convert_sibling_names_to_indices <- function(X, siblings){
  
  mapdf <- data.frame(feature_names = rownames(X), feature_indices = 1:nrow(X))
  indices <- matrix(mapdf$feature_indices[match(siblings,mapdf$feature_names)], ncol = 2, byrow = FALSE)

  return(indices)
}
mdkessler/kTSCR documentation built on Feb. 25, 2021, 10:31 p.m.