#' fitSignatures
#' @description Find nonnegative linear combination of mutation signatures to reconstruct matrix and
#' calculate cosine similarity based on somatic SNVs.
#'
#' @param tri_matrix A matrix or a list of matrix generated by \code{\link{triMatrix}} function.
#' @param patient.id Select the specific patients. Default NULL, all patients are included
#' @param signaturesRef Signature reference,Users can upload their own reference.
#' Default "cosmic_v2". Option: "exome_cosmic_v3","nature2013".
#' @param associated Associated Vector of associated signatures.
#' If given, will narrow the signatures reference to only the ones listed. Default NULL.
#' @param min.mut.count The threshold for the variants in a branch. Default 15.
#' @param signature.cutoff Discard any signature relative contributions with a weight less than this amount. Default 0.1.
#' @return A list of data frames, each one contains treeMSOutput,
#' containing information about each set/branch's mutational signature.
#'
#' @importFrom pracma lsqnonneg
#' @examples
#' maf.File <- system.file("extdata/", "CRC_HZ.maf", package = "MesKit")
#' clin.File <- system.file("extdata/", "CRC_HZ.clin.txt", package = "MesKit")
#' ccf.File <- system.file("extdata/", "CRC_HZ.ccf.tsv", package = "MesKit")
#' maf <- readMaf(mafFile=maf.File, clinicalFile = clin.File, ccfFile=ccf.File, refBuild="hg19")
#'
#' ## Load a reference genome.
#' library(BSgenome.Hsapiens.UCSC.hg19)
#'
#' phyloTree <- getPhyloTree(maf, patient.id = 'V402')
#' tri_matrix <- triMatrix(phyloTree)
#' fitSignatures(tri_matrix)
#' @export fitSignatures
fitSignatures <- function(tri_matrix = NULL,
patient.id = NULL,
signaturesRef = "cosmic_v2",
associated = NULL,
min.mut.count = 15,
signature.cutoff = 0.1){
if(!is.null(patient.id)){
patient.setdiff <- setdiff(patient.id, names(tri_matrix))
if(length(patient.setdiff) > 0){
stop(paste0("Patient ", patient.setdiff, " can not be found in your data"))
}
tri_matrix <- tri_matrix[names(tri_matrix) %in% patient.id]
}
## get signatrue reference
df.etiology <- NULL
if(is(signaturesRef,'character')){
signaturesRef <- match.arg(signaturesRef,
choices = c("cosmic_v2","nature2013","exome_cosmic_v3"),
several.ok = FALSE)
signatures.etiology <- readRDS(file = system.file("extdata", "signatures.etiology.rds", package = "MesKit"))
# signatures.etiology <- readRDS(file = "D:/renlab/MesKit/inst/extdata/signatures.etiology.rds")
if (signaturesRef == "cosmic_v2"){
sigsRef <- readRDS(file = system.file("extdata", "signatures.cosmic.rds", package = "MesKit"))
rownames(sigsRef) <- gsub("Signature.", "Signature ", rownames(sigsRef))
df.etiology <- data.frame(aeti = signatures.etiology$cosmic_v2$etiology,
sig = rownames(signatures.etiology$cosmic_v2))
}else if(signaturesRef == "nature2013"){
sigsRef <- readRDS(file = system.file("extdata", "signatures.nature2013.rds", package = "MesKit"))
## rename signature
rownames(sigsRef) <- gsub("Signature.", "Signature ", rownames(sigsRef))
df.etiology <- data.frame(aeti = signatures.etiology$nature2013$etiology,
sig = rownames(signatures.etiology$nature2013))
}else if(signaturesRef == "exome_cosmic_v3"){
sigsRef <- readRDS(file = system.file("extdata", "signatures.exome.cosmic.v3.may2019.rds", package = "MesKit"))
df.etiology <- data.frame(aeti = signatures.etiology$cosmic_v3$etiology,
sig = rownames(signatures.etiology$cosmic_v3))
}
}else if(!is(signaturesRef, 'data.frame')){
stop('Input signature reference should be a data frame.')
}else{
sigsRef <- signaturesRef
}
if(!is.null(associated)){
signature.setdiff <- setdiff(associated, rownames(sigsRef))
if(length(signature.setdiff) > 0){
stop(paste0(signature.setdiff, " were not found in signature reference."))
}
sigsRef <- sigsRef[rownames(sigsRef) %in% associated, ]
}
tri_matrix_list <- tri_matrix
processFitSig <- function(i){
if(typeof(tri_matrix_list[[i]]) == "list"){
tri_matrix <- tri_matrix_list[[i]]$tri_matrix
tsb.label <- tri_matrix_list[[i]]$tsb.label
}else{
tri_matrix <- tri_matrix_list[[i]]
}
patient <- names(tri_matrix_list)[i]
## Remove branches whose mutation number is less than min.mut.count
branch_remove_idx <- which(rowSums(tri_matrix) <= min.mut.count)
branch_remove <- rownames(tri_matrix)[branch_remove_idx]
if(length(branch_remove) > 0){
message("Warning: mutation number of ",
paste(branch_remove, collapse = ", "),
" of ",patient, " is not enough for signature extraction. See 'min.mut.count' parameter.")
branch_left <- setdiff(rownames(tri_matrix),branch_remove)
if(length(branch_left) == 0){
return(NA)
}
tri_matrix <- tri_matrix[branch_left,]
## rebuild matrix if there is only one branch left
if(is(tri_matrix, "numeric")){
type_name <- names(tri_matrix)
tri_matrix <- matrix(tri_matrix, ncol = length(tri_matrix))
rownames(tri_matrix) <- branch_left
colnames(tri_matrix) <- type_name
}
}
## convert mutation number to proportion
# origin_matrix <- t(apply(tri_matrix,1,function(x)x/sum(x)))
origin_matrix <- tri_matrix
## calculate cosine similarity
branch_num <- nrow(origin_matrix)
refsig_num <- nrow(sigsRef)
cos_sim_matrix <- matrix(nrow = branch_num, ncol = refsig_num)
rownames(cos_sim_matrix) <- rownames(origin_matrix)
colnames(cos_sim_matrix) <- rownames(sigsRef)
cos_sim_matrix <- apply(origin_matrix, 1, function(x){
apply(sigsRef, 1, function(y){
y <- as.numeric(y)
s <- as.numeric(x %*% y / (sqrt(x %*% x) * sqrt(y %*% y)))
return(s)
})
}) %>% t()
## calculate signature contribution by solving nonnegative least-squares constraints problem(weight)
type_num <- ncol(origin_matrix)
sigsRef_t <- t(as.matrix(sigsRef))
## contribution matrix
con_matrix_absolute <- matrix(1, nrow = branch_num, ncol = refsig_num)
## reconstruted matrix
recon_matrix <- matrix(1, nrow = branch_num, ncol = type_num)
## solve nonnegative least-squares constraints.
# con_matrix_absolute <- apply(origin_matrix, 1, function(m){
# lsq <- pracma::lsqnonneg(sigsRef_t, m)
# return(lsq$x)
# }) %>% t()
con_matrix_absolute <- apply(origin_matrix, 1, function(m){
lsq <- pracma::lsqnonneg(sigsRef_t, m)
return(lsq$x)
})
if(is(con_matrix_absolute, "matrix")){
con_matrix_absolute <- t(con_matrix_absolute)
}else{
con_matrix_absolute <- matrix(data = con_matrix_absolute, nrow = length(con_matrix_absolute))
}
recon_matrix <- apply(origin_matrix, 1, function(m){
lsq <- pracma::lsqnonneg(sigsRef_t, m)
l <- sigsRef_t %*% as.matrix(lsq$x)
return(l)
}) %>% t()
rownames(con_matrix_absolute) <- rownames(origin_matrix)
colnames(con_matrix_absolute) <- rownames(sigsRef)
con_matrix_relative <- con_matrix_absolute
con_matrix_relative <- apply(con_matrix_relative, 1, function(m){
m <- m/sum(m)
return(m)
}) %>% t()
rownames(recon_matrix) <- rownames(origin_matrix)
colnames(recon_matrix) <- colnames(origin_matrix)
## calculate RSS of reconstructed matrix and origin matrix
RSS <- vapply(seq_len(branch_num), function(i){
r <- recon_matrix[i,]
r <- r/sum(r)
o <- origin_matrix[i,]
o <- o/sum(o)
rss <- sum((r-o)^2)
return(rss)
},FUN.VALUE = numeric(1))
names(RSS) <- rownames(origin_matrix)
## summary mutation signatures of branches
if(!is.null(df.etiology)){
etiology_ref <- as.character(df.etiology$aeti)
names(etiology_ref) <- as.character(df.etiology$sig)
}
etiology_ref <- as.character(df.etiology$aeti)
names(etiology_ref) <- as.character(df.etiology$sig)
signatures_etiology <- data.frame()
signatures_etiology_list <- lapply(seq_len(branch_num), function(i){
branch_name <- rownames(tri_matrix)[i]
sig_con_absolute <- con_matrix_absolute[i,]
sig_con_relative <- con_matrix_relative[i,]
if(length(sig_con_relative) == 1){
names(sig_con_relative) <- colnames(con_matrix_absolute)
}
idx <- which(sig_con_relative > signature.cutoff)
sig_name <- names(sig_con_relative[idx])
if(length(sig_name) == 0){
return(NA)
}
sig_con_relative <- as.numeric(sig_con_relative[idx])
sig_con_absolute <- as.numeric(sig_con_absolute[idx])
# mut_sum <- sum(tri_matrix[i,])
sub <- data.frame(Level_ID = branch_name,
Signature = sig_name,
# Mutation_number = mut_sum,
Contribution_absolute = sig_con_absolute,
Contribution_relative = sig_con_relative)
if(!is.null(df.etiology)){
aet <- etiology_ref[sig_name]
sub$Etiology <- as.character(aet)
}
return(sub)
})
signatures_etiology_list <- signatures_etiology_list[!is.na(signatures_etiology_list)]
signatures_etiology <- dplyr::bind_rows(signatures_etiology_list)
if(nrow(signatures_etiology) != 0){
## order data frame by contribution of each branch
signatures_etiology <- dplyr::arrange(signatures_etiology,
dplyr::desc(.data$Level_ID),
dplyr::desc(.data$Contribution_relative))
}else{
message("Warning: None of relative contribution of signature",
" in ",patient, " is larger than signature.cutoff")
signatures_etiology <- NULL
}
recon_df <- as.data.frame(recon_matrix)
recon_df$Branch <- as.character(row.names(recon_df))
rownames(recon_df) <- NULL
recon_spectrum <- tidyr::pivot_longer(recon_df,
-"Branch",
names_to = "Type",
values_to = "Reconstructed")
## convert origin matrix to data frame
origin_df <- as.data.frame(origin_matrix)
origin_df$Branch <- as.character(row.names(origin_df))
rownames(origin_df) <- NULL
origin_spectrum <- tidyr::pivot_longer(origin_df,
-"Branch",
names_to = "Type",
values_to = "Original")
mut_spectrum <- dplyr::left_join(recon_spectrum, origin_spectrum, by = c("Branch", "Type"))
total_cosine_similarity <- mut_spectrum %>%
dplyr::group_by(Branch) %>%
dplyr::summarise(
cosine_similarity = sum(Original*Reconstructed)/(sqrt(sum(Original^2))*sqrt(sum(Reconstructed^2)))
) %>%
dplyr::rename("Level_ID" = "Branch") %>%
as.data.frame()
## calculate cosine similarity between origin matrix and reconstructed matrix
if(typeof(tri_matrix_list[[i]]) == "list"){
f <- list(
reconstructed.mat = recon_matrix,
original.mat = origin_matrix,
cosine.similarity = cos_sim_matrix,
contribution.absolute = con_matrix_absolute,
contribution.relative = con_matrix_relative,
total.cosine.similarity = total_cosine_similarity,
RSS = RSS,
signatures.etiology = as.data.frame(signatures_etiology) ,
tsb.label = tsb.label
)
return(f)
}else{
f <- list(
reconstructed.mat = recon_matrix,
original.mat = origin_matrix,
cosine.similarity = cos_sim_matrix,
total.cosine.similarity = total_cosine_similarity,
RSS = RSS,
signatures.etiology = signatures_etiology
)
return(f)
}
}
result <- lapply(seq_len(length(tri_matrix_list)) ,processFitSig)
names(result) <- names(tri_matrix_list)
result <- result[!is.na(result)]
if(length(result) == 0){
return(NA)
}else{
return(result)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.