R/generate_composite_mat_and_gene_weights.R

Defines functions generate_composite_mat_and_gene_weights

Documented in generate_composite_mat_and_gene_weights

#' Run a comparison between between two cohorts (e.g. cell lines and tumors)
#' 
#' @param default_weight see run_comparison
#' @param tumor_file see run_comparison
#' @param cell_line_file see run_comparison
#' @param known_cancer_gene_weights_file see run_comparison
#' @param cancer_specific_gene_weights_file see run_comparison
#' @param gene_list a vector of HGNC gene symbols to run comparison only for the specified genes (Default: NULL)
#' @param run_mds a boolean, whether to run multidimensional scaling (MDS) on dataset (Default: TRUE)
#' @param verbose show debugging information
#' 
#' @details The composite matrix is a single matrix where the columns are samples 
#'   (i.e. tumors AND cell line IDs) and the rows are an rbind() of mutations 
#'   (with 1 or 0 outputs for each sample), copy number alterations from 
#'   GISTIC (with values -2, -1, 0, 1, 2), or gene expression values. Available similarity/distance measures include: 
#'   \itemize{
#'     \item{"weighted_correlation"}{Weighted correlation, based on weighted means and standard deviations}
#'     \item{"generalized_jaccard"}{A weighted distance based on the Jaccard coefficient}
#'  }
#' 
#' @return a list with multiple items. NOTE: The values of the dist and isomdsfit will
#'  depend on whether the input data is continuous or discrete.
#' \itemize{
#'   \item{"dist_mat"}{a matrix of pairwise distances}
#'   \item{"isomdsfit"}{a two-column (2-dimension) fitting of the distances reduced to 
#'   two dimensions via MDS - multidimensional scaling, using the isoMDS function}
#'   \item{"cor_unweighted"}{a matrix of unweighted pairwise correlations}
#'   \item{"composite_mat"}{the composite matrix (see Details)}
#'   \item{"cell_lines_ids"}{a vector of cell line IDs/names}
#'   \item{"tumors_ids"}{a vector of tumor IDs}
#' }
#'
#' @author Rileen Sinha (rileen@gmail.com), Augustin Luna (aluna@jimmy.harvard.edu)
#' 
#' @examples 
#' mut_default_weight <- 0.01
#' tumor_mut_file <- system.file("extdata", "READ_data_for_running_TC", "tumor_mut.txt", 
#'   package="tumorcomparer")
#' cell_line_mut_file <- system.file("extdata", "READ_data_for_running_TC", "cell_line_mut.txt", 
#'   package="tumorcomparer")
#' known_cancer_gene_weights_mut_file <- system.file("extdata", "READ_data_for_running_TC", 
#'   "default_weights_for_known_cancer_genes_mut.txt", package="tumorcomparer")
#' cancer_specific_gene_weights_mut_file <- system.file("extdata", "READ_data_for_running_TC", 
#'   "Genes_and_weights_mut.txt", package="tumorcomparer")
#' 
#' mut <- generate_composite_mat_and_gene_weights(
#'   default_weight=mut_default_weight,
#'   tumor_file=tumor_mut_file,
#'   cell_line_file=cell_line_mut_file,
#'   known_cancer_gene_weights_file=known_cancer_gene_weights_mut_file,
#'   cancer_specific_gene_weights_file=cancer_specific_gene_weights_mut_file)
#'
#' @concept tumorcomparer
#' @export
#' 
#' @importFrom MASS isoMDS
#' @importFrom cluster daisy
#' @importFrom utils read.table write.table
#' @importFrom stats cor
generate_composite_mat_and_gene_weights <- function(default_weight, 
                                                    tumor_file, 
                                                    cell_line_file, 
                                                    known_cancer_gene_weights_file = NULL, 
                                                    cancer_specific_gene_weights_file = NULL,
                                                    gene_list = NULL,
                                                    run_mds=TRUE,
                                                    verbose=FALSE) {

  # GET INTERSECTING GENES BETWEEN TUMORS AND CELL LINES ----
  tumor <- read.table(tumor_file, sep = "\t", header = TRUE, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
  cell_line <- read.table(cell_line_file, sep = "\t", header = TRUE, row.names = 1, check.names = FALSE, stringsAsFactors = FALSE)
  
  if(verbose) {
    cat("INFO: Tumor Row Count: ", nrow(tumor), " Cell Line Row Count: ", nrow(cell_line), "\n")
  }
  
  if(!is.null(gene_list)) {
    filtered_gene_list <- gene_list[which(gene_list %in% intersect(rownames(tumor), rownames(cell_line)))]
    
    if(length(filtered_gene_list) < length(gene_list)) {
      cat(paste0("INFO: From provided gene_list only ", length(filtered_gene_list), " genes are available in the dataset, removing ", length(gene_list) - length(filtered_gene_list), " genes from the list", "\n"))
    }
    
    tumor <- tumor[filtered_gene_list,]
    cell_line <- cell_line[filtered_gene_list,]
  }
  
  tumor_ids <- colnames(tumor)
  cell_line_ids <- colnames(cell_line) 
  
  # Genes in both tumors and cell
  genes_intersection <- intersect(rownames(tumor), rownames(cell_line))
  genes_setdiff <- setdiff(c(rownames(tumor), rownames(cell_line)), genes_intersection)

  if (length(genes_intersection) < 5) {
    stop("ERROR: At least 5 genes are required. Genes Included: ", paste(genes_intersection, collapse=", "), " Genes Excluded: ", paste(genes_setdiff, collapse=", "))
  }
  
  composite_mat <- cbind(cell_line[genes_intersection,], tumor[genes_intersection,])

  # Check that tumor and cell_line data are either both discrete or continuous 
  # and set the distance_similarity_measure to run
  if(all((composite_mat %% 1) == 0)) {
    is_discrete <- TRUE
    distance_similarity_measure <- "generalized_jaccard"      
  } else {
    is_discrete <- FALSE 
    distance_similarity_measure <- "weighted_correlation"      
  }

  # For discrete data (mut, cna) - compute alteration frequencies, remove samples which have no alterations
  if(is_discrete) { 
    # Calculation of alteration frequencies
    # Assign frequency weights as (freq. of alteration of gene)/(mean freq. of alteration across all genes) - "rewarding recurrent changes"
    overall_alt_freq <- length(which((composite_mat[]) != 0)) / (length(which((composite_mat[]) == 0)) + length(which((composite_mat[]) != 0)))
  
    freq_alt <- rep(0, nrow(composite_mat))
    freq_alt <- apply(composite_mat, 1, compute_freq_alt)
    freq_alt_tumors <- apply(composite_mat[, colnames(tumor)], 1, compute_freq_alt)
    freq_alt_cell_lines <- apply(composite_mat[, colnames(cell_line)], 1, compute_freq_alt)
    
    freq_alt_samplewise <- apply(composite_mat, 2, compute_freq_alt)
  
    #composite_mat <- composite_mat[, which(freq_alt_samplewise > 0)]
    
    names(freq_alt) <- rownames(composite_mat)
    
    freq_weights <- rep(1, nrow(composite_mat))
    freq_weights <- freq_alt / overall_alt_freq
    names(freq_weights) <- rownames(composite_mat)
  }
 
  # Initialize weights with default weights
  gene_weights <- rep(default_weight, nrow(composite_mat))
  names(gene_weights) <- rownames(composite_mat)
 
  # GET WEIGHTS ----
  # Read in user-provided weights for known cancer genes
  if(!is.null(known_cancer_gene_weights_file)) {
    known_cancer_genes_and_weights_all <-
      read.table(
        known_cancer_gene_weights_file,
        sep = "\t",
        header = TRUE,
        row.names = 1,
        stringsAsFactors = FALSE
      )
    rownames(known_cancer_genes_and_weights_all) <- trimws(rownames(known_cancer_genes_and_weights_all)) # trim whitespace, if any
    
    gene_weights[intersect(names(gene_weights), rownames(known_cancer_genes_and_weights_all))] <- known_cancer_genes_and_weights_all[intersect(names(gene_weights), rownames(known_cancer_genes_and_weights_all)),1]
  } else {
    known_cancer_genes_and_weights_all <- list()
  }

  # Read in user-provided weights for cancer-specific genes
  if(!is.null(cancer_specific_gene_weights_file)) {
    genes_and_weights_all <-
      read.table(
        cancer_specific_gene_weights_file,
        sep = "\t",
        header = TRUE,
        row.names = 1,
        stringsAsFactors = FALSE
      )
    rownames(genes_and_weights_all) <- trimws(rownames(genes_and_weights_all)) # trim whitespace, if any
    
    gene_weights[intersect(names(gene_weights), rownames(genes_and_weights_all))] <- genes_and_weights_all[intersect(names(gene_weights), rownames(genes_and_weights_all)),1]
  } else {
    genes_and_weights_all <- list()
  }
  
  gene_weights <- gene_weights / max(gene_weights + 1e-6) # map to 0-1
  
  #cor_unweighted <- cor(composite_mat)  
  
  if(distance_similarity_measure == "weighted_correlation") {
    # CALCULATE CORRELATIONS ----
    # Including low-level CNAs
    cor_weighted <- calc_weighted_corr(as.matrix(composite_mat),
                                       as.matrix(composite_mat),
                                       gene_weights)
    cor_weighted[which(is.na(cor_weighted))] <- 0
    cor_weighted[which(is.nan(cor_weighted))] <- 0
    cor_weighted <- cor_weighted - 1e-6
    # Excluding low-levels CNAs
    #cor_weighted_high_level_only <- calc_weighted_corr(as.matrix(composite_mat_high_level_only),as.matrix(composite_mat_high_level_only),gene_weights)
    
    # Convert to distance, and call multidimensional scaling via isoMDS
    dist_mat <- 1 - as.matrix(cor_weighted)
    
    # Run multi-dimensional scaling
    if(run_mds) {
      isomdsfit <-  isoMDS(dist_mat, k=2)      
    } else {
      isomdsfit <- NA
    }    

    if(verbose) { cat("INFO: Distance measure used: ", distance_similarity_measure, "\n") }
  } else if(distance_similarity_measure == "generalized_jaccard") {
    # Calculate weighted distance based on Jaccard's coefficient
    weighted_distance_excluding_zero_zero_matches <- apply(composite_mat, 2, function(x_i, weights=gene_weights) {
      sapply(1:ncol(composite_mat), function(j) pair_dist(x_i, composite_mat[,j], gene_weights)) # repeatedly apply function for weighted distance between a pair of coulmns/vectors
    })
    
    # NOTE: This is here because debugging via RStudio could not access this part of the code
    #saveRDS(weighted_distance_excluding_zero_zero_matches, file="weight_dist.rds")
    #saveRDS(gene_weights, file="weights.rds")
    #saveRDS(composite_mat, file="comp_mat.rds")
    
    # Change missing or small values
    weighted_distance_excluding_zero_zero_matches[which(is.na(weighted_distance_excluding_zero_zero_matches))] <- 0
    weighted_distance_excluding_zero_zero_matches <- weighted_distance_excluding_zero_zero_matches + 1e-6
    
    # Call multidimensional scaling via isoMDS
    dist_mat <- weighted_distance_excluding_zero_zero_matches
    rownames(dist_mat) <- colnames(composite_mat)
    colnames(dist_mat) <- colnames(composite_mat)
    
    # Run multi-dimensional scaling
    if(run_mds) {
      isomdsfit <-  isoMDS(dist_mat, k=2)      
    } else {
      isomdsfit <- NA
    }    
    
    if(verbose) { cat("INFO: Distance measure used: ", distance_similarity_measure, "\n") }
  } else {
    stop("ERROR: Unknown distance_similarity_measure: ", distance_similarity_measure)
  }
 
  results <- list(
    dist_mat = dist_mat,
    isomdsfit = isomdsfit, 
    composite_mat = composite_mat,
    gene_weights = gene_weights,
    cancer_specific_gene_weights = genes_and_weights_all,
    known_cancer_gene_weights = known_cancer_genes_and_weights_all,
    cell_line_ids = cell_line_ids,
    tumor_ids = tumor_ids,
    genes_included=genes_intersection,
    genes_excluded=genes_setdiff
  )
  
  return(results)
}
cannin/tumorcomparer documentation built on Feb. 7, 2023, 3:13 p.m.