R/fitSignatures.R

Defines functions fitSignatures

Documented in fitSignatures

#'  fitSignatures
#' @description Find nonnegative linear combination of mutation signatures to reconstruct matrix and 
#' calculate cosine similarity based on somatic SNVs.
#' 
#' @param tri_matrix A matrix or a list of matrix generated by \code{\link{triMatrix}} function.
#' @param patient.id Select the specific patients. Default NULL, all patients are included
#' @param signaturesRef Signature reference,Users can upload their own reference. 
#' Default "cosmic_v2". Option: "exome_cosmic_v3","nature2013".
#' @param associated Associated Vector of associated signatures. 
#' If given, will narrow the signatures reference to only the ones listed. Default NULL.
#' @param min.mut.count The threshold for the variants in a branch. Default 15.
#' @param signature.cutoff Discard any signature relative contributions with a weight less than this amount. Default 0.1.
#' @return A list of data frames, each one contains treeMSOutput, 
#' containing information about each set/branch's mutational signature.
#' 
#' @importFrom pracma lsqnonneg
#' @examples
#' maf.File <- system.file("extdata/", "CRC_HZ.maf", package = "MesKit")
#' clin.File <- system.file("extdata/", "CRC_HZ.clin.txt", package = "MesKit")
#' ccf.File <- system.file("extdata/", "CRC_HZ.ccf.tsv", package = "MesKit")
#' maf <- readMaf(mafFile=maf.File, clinicalFile = clin.File, ccfFile=ccf.File, refBuild="hg19")
#' 
#' ## Load a reference genome.
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' 
#' phyloTree <- getPhyloTree(maf, patient.id = 'V402')
#' tri_matrix <- triMatrix(phyloTree)
#' fitSignatures(tri_matrix)
#' @export  fitSignatures


fitSignatures <- function(tri_matrix = NULL,
                          patient.id = NULL,
                          signaturesRef = "cosmic_v2",
                          associated  = NULL,
                          min.mut.count = 15,
                          signature.cutoff = 0.1){
    
    if(!is.null(patient.id)){
         patient.setdiff <- setdiff(patient.id, names(tri_matrix))
         if(length(patient.setdiff) > 0){
            stop(paste0("Patient ", patient.setdiff, " can not be found in your data"))
         }
         tri_matrix <- tri_matrix[names(tri_matrix) %in% patient.id]
    }
  
  ## get signatrue reference
  df.etiology <- NULL
  if(is(signaturesRef,'character')){
    signaturesRef <- match.arg(signaturesRef,
                               choices = c("cosmic_v2","nature2013","exome_cosmic_v3"),
                               several.ok = FALSE)
    signatures.etiology <- readRDS(file = system.file("extdata", "signatures.etiology.rds", package = "MesKit"))
    # signatures.etiology <- readRDS(file = "D:/renlab/MesKit/inst/extdata/signatures.etiology.rds")
    if (signaturesRef == "cosmic_v2"){
      sigsRef <- readRDS(file = system.file("extdata", "signatures.cosmic.rds", package = "MesKit"))
      rownames(sigsRef) <- gsub("Signature.", "Signature ", rownames(sigsRef))
      df.etiology <- data.frame(aeti = signatures.etiology$cosmic_v2$etiology,
                                 sig = rownames(signatures.etiology$cosmic_v2))
    }else if(signaturesRef == "nature2013"){
      sigsRef <- readRDS(file = system.file("extdata", "signatures.nature2013.rds", package = "MesKit"))
      ## rename signature
      rownames(sigsRef) <- gsub("Signature.", "Signature ", rownames(sigsRef))
      df.etiology <- data.frame(aeti = signatures.etiology$nature2013$etiology,
                                 sig = rownames(signatures.etiology$nature2013))
    }else if(signaturesRef == "exome_cosmic_v3"){
      sigsRef <- readRDS(file = system.file("extdata", "signatures.exome.cosmic.v3.may2019.rds", package = "MesKit"))
      df.etiology <- data.frame(aeti = signatures.etiology$cosmic_v3$etiology,
                                 sig = rownames(signatures.etiology$cosmic_v3))
    }
  }else if(!is(signaturesRef, 'data.frame')){
    stop('Input signature reference should be a data frame.')
  }else{
    sigsRef <- signaturesRef 
  }
  
  if(!is.null(associated)){
    signature.setdiff <- setdiff(associated, rownames(sigsRef))
    if(length(signature.setdiff) > 0){
      stop(paste0(signature.setdiff, " were not found in signature reference."))
    }
    sigsRef <- sigsRef[rownames(sigsRef) %in% associated, ]
  }
  
  tri_matrix_list <- tri_matrix
  
  processFitSig <- function(i){
    if(typeof(tri_matrix_list[[i]]) == "list"){
      tri_matrix <- tri_matrix_list[[i]]$tri_matrix
      tsb.label <- tri_matrix_list[[i]]$tsb.label
    }else{
      tri_matrix <- tri_matrix_list[[i]]
    }
    patient <- names(tri_matrix_list)[i]
    ## Remove branches whose mutation number is less than min.mut.count
    branch_remove_idx <- which(rowSums(tri_matrix) <= min.mut.count)
    branch_remove <- rownames(tri_matrix)[branch_remove_idx]
    if(length(branch_remove) > 0){
      message("Warning: mutation number of ",
              paste(branch_remove, collapse = ", "),
              " of ",patient, " is not enough for signature extraction. See 'min.mut.count' parameter.")
      branch_left <- setdiff(rownames(tri_matrix),branch_remove)
      if(length(branch_left) == 0){
        return(NA)
      }
      tri_matrix <- tri_matrix[branch_left,]
      ## rebuild matrix if there is only one branch left
      if(is(tri_matrix, "numeric")){
        type_name <- names(tri_matrix)
        tri_matrix <- matrix(tri_matrix, ncol = length(tri_matrix))
        rownames(tri_matrix) <- branch_left
        colnames(tri_matrix) <- type_name
      }
    }
    
    ## convert mutation number to proportion
    # origin_matrix <- t(apply(tri_matrix,1,function(x)x/sum(x)))
    origin_matrix <- tri_matrix
    ## calculate cosine similarity
    branch_num <- nrow(origin_matrix)
    refsig_num <- nrow(sigsRef)
    
    cos_sim_matrix <- matrix(nrow = branch_num, ncol = refsig_num)
    rownames(cos_sim_matrix) <- rownames(origin_matrix)
    colnames(cos_sim_matrix) <- rownames(sigsRef)
    
    cos_sim_matrix <- apply(origin_matrix, 1, function(x){
      apply(sigsRef, 1, function(y){
        y <- as.numeric(y)
        s <- as.numeric(x %*% y / (sqrt(x %*% x) * sqrt(y %*% y)))
        return(s)
      })
    }) %>% t()
    
    ## calculate signature contribution by solving nonnegative least-squares constraints problem(weight)
    type_num <- ncol(origin_matrix)
    
    sigsRef_t <- t(as.matrix(sigsRef))
    ## contribution matrix
    con_matrix_absolute <- matrix(1, nrow = branch_num, ncol = refsig_num)
    
    ## reconstruted matrix
    recon_matrix <- matrix(1, nrow = branch_num, ncol = type_num)
    
    ## solve nonnegative least-squares constraints.
    # con_matrix_absolute <- apply(origin_matrix, 1, function(m){
    #   lsq <- pracma::lsqnonneg(sigsRef_t, m)
    #   return(lsq$x)
    # }) %>% t()
    con_matrix_absolute <- apply(origin_matrix, 1, function(m){
      lsq <- pracma::lsqnonneg(sigsRef_t, m)
      return(lsq$x)
    })
    
    if(is(con_matrix_absolute, "matrix")){
      con_matrix_absolute <- t(con_matrix_absolute)
    }else{
      con_matrix_absolute <- matrix(data = con_matrix_absolute, nrow = length(con_matrix_absolute))
    }
    
    
    recon_matrix <- apply(origin_matrix, 1, function(m){
      lsq <- pracma::lsqnonneg(sigsRef_t, m)
      l <- sigsRef_t %*% as.matrix(lsq$x)
      return(l)
    }) %>% t()
    

    rownames(con_matrix_absolute) <- rownames(origin_matrix)
    colnames(con_matrix_absolute) <- rownames(sigsRef)
    
    con_matrix_relative <- con_matrix_absolute
    con_matrix_relative <- apply(con_matrix_relative, 1, function(m){
      m <- m/sum(m)
      return(m)
    }) %>% t()
    
    
    rownames(recon_matrix) <- rownames(origin_matrix)
    colnames(recon_matrix) <- colnames(origin_matrix)
    
    # print(con_matrix_absolute)
    # print(recon_matrix)
    
    ## calculate RSS of reconstructed matrix and origin matrix
    RSS <- vapply(seq_len(branch_num), function(i){
      r <- recon_matrix[i,]
      r <- r/sum(r)
      o <- origin_matrix[i,]
      o <- o/sum(o)
      rss <- sum((r-o)^2) 
      return(rss)
    },FUN.VALUE = numeric(1))
    names(RSS) <- rownames(origin_matrix)
    
    
    ## summary mutation signatures of branches
    if(!is.null(df.etiology)){
      etiology_ref <- as.character(df.etiology$aeti)  
      names(etiology_ref) <-  as.character(df.etiology$sig)
    }
    etiology_ref <- as.character(df.etiology$aeti)  
    names(etiology_ref) <-  as.character(df.etiology$sig)
    signatures_etiology <- data.frame()
    signatures_etiology_list <- lapply(seq_len(branch_num), function(i){
      branch_name <- rownames(tri_matrix)[i]
      sig_con_absolute <- con_matrix_absolute[i,]
      sig_con_relative <- con_matrix_relative[i,]
      
      if(length(sig_con_relative) == 1){
        names(sig_con_relative) <- colnames(con_matrix_absolute)
      }
      
      idx <- which(sig_con_relative >= signature.cutoff)
      sig_name <- names(sig_con_relative[idx])
      if(length(sig_name) == 0){
        return(NA)
      }
      sig_con_relative <- as.numeric(sig_con_relative[idx])
      sig_con_absolute <- as.numeric(sig_con_absolute[idx]) 
      # mut_sum <- sum(tri_matrix[i,])
      
      sub <- data.frame(Level_ID = branch_name, 
                        Signature = sig_name,
                        # Mutation_number = mut_sum,
                        Contribution_absolute = sig_con_absolute,
                        Contribution_relative = sig_con_relative)
      if(!is.null(df.etiology)){
        aet <- etiology_ref[sig_name]
        sub$Etiology <- as.character(aet)  
      }
      return(sub)
    })
    signatures_etiology_list <- signatures_etiology_list[!is.na(signatures_etiology_list)]
    signatures_etiology <- dplyr::bind_rows(signatures_etiology_list)
    ## order data frame by contribution of each branch
    signatures_etiology <- dplyr::arrange(signatures_etiology,
                                           dplyr::desc(.data$Level_ID),
                                           dplyr::desc(.data$Contribution_relative)) 
    
    recon_df  <- as.data.frame(recon_matrix) 
    recon_df$Branch <- as.character(row.names(recon_df)) 
    rownames(recon_df) <- NULL
    recon_spectrum <- tidyr::pivot_longer(recon_df,
                                          -"Branch",
                                          names_to = "Type",
                                          values_to = "Reconstructed")
    
    ## convert origin matrix to data frame
    origin_df  <- as.data.frame(origin_matrix) 
    origin_df$Branch <- as.character(row.names(origin_df)) 
    rownames(origin_df) <- NULL
    origin_spectrum <- tidyr::pivot_longer(origin_df,
                                           -"Branch",
                                           names_to = "Type",
                                           values_to = "Original")
    
    mut_spectrum <- dplyr::left_join(recon_spectrum, origin_spectrum, by = c("Branch", "Type"))
    
    total_cosine_similarity <- mut_spectrum %>%
      dplyr::group_by(Branch) %>%
      dplyr::summarise(
        cosine_similarity = sum(Original*Reconstructed)/(sqrt(sum(Original^2))*sqrt(sum(Reconstructed^2)))
       ) %>% 
      dplyr::rename("Level_ID" = "Branch") %>% 
      as.data.frame()
    ## calculate cosine similarity between origin matrix and reconstructed matrix
    
    if(typeof(tri_matrix_list[[i]]) == "list"){
      f <- list(
        reconstructed.mat = recon_matrix,
        original.mat = origin_matrix,
        cosine.similarity = cos_sim_matrix,
        contribution.absolute = con_matrix_absolute,
        contribution.relative = con_matrix_relative,
        total.cosine.similarity = total_cosine_similarity,
        RSS = RSS, 
        signatures.etiology = signatures_etiology,
        tsb.label = tsb.label
      )
      return(f)
    }else{
      f <- list(
        reconstructed.mat = recon_matrix,
        original.mat = origin_matrix,
        cosine.similarity = cos_sim_matrix,
        total.cosine.similarity = total_cosine_similarity,
        RSS = RSS, 
        signatures.etiology = signatures_etiology
      )
      return(f)
    }
  }
  
  result <- lapply(seq_len(length(tri_matrix_list)) ,processFitSig)
  names(result) <- names(tri_matrix_list)
  result <- result[!is.na(result)]
  
  if(length(result) == 0){
    return(NA)
  }else{
    return(result)
  }
 
}

Try the MesKit package in your browser

Any scripts or data that you put into this service are public.

MesKit documentation built on March 28, 2021, 6 p.m.