#' Runs SigMA: (1) calculates likelihood, cosine similarity,
#' NNLS exposures, and likelihood of the decomposition. (2) These
#' features are later used in multivariate analysis. (3) Based
#' on scores a final decision on existence of the signature.
#'
#' @param genome_file a csv file with snv spectra info can be created
#' from vcf file using @make_genome_matrix() function
#' see ?make_genome_matrix
#' @param output_file the output file name, can be NULL in which
#' case input file name is used and appended with "_output"
#' @param data determines the type of sequencing platform
#' used for dataset see list_data_options() and find_data_setting()
#' @param tumor_type see list_tumor_types() for available options
#' @param do_mva a boolean for whether multivariate analysis
#' should be run, has_model() function can be used to check whether
#' there is an available MVA model for that data and tumor_type setting
#' see SigMA/examples/example_find_settings.R
#' @param do_assign when true a cutoff is applied on MVA score
#' to convert these values into binary classification. The thresholds
#' correspond to positive rates of 0.1 and 0.01 for MVA score,
#' the columns generated for these are pass_mva and pass_mva_strict,
#' respectively
#' @param check_msi is a boolean which determines whether the user
#' wants to identify micro-sattelite instable tumors which often
#' confound the HRD classification
#' @param weight_cf determines whether the likelihood calculation
#' will take into account the number of tumors in each cluster
#' when it is F the clusters get equal weights and when it's T
#' they are weighted according to the fraction of tumors in each
#' cluster
#' @param lite_format saves the output in a lite format when set
#' to true
#' @param add_sig3 should be set to T when the likelihood of
#' Signature 3 is calculated for tumor types for which Signature 3
#' was not discovered by NMF in their WGS data
#' @param norm96 the normalization for custom classifiers to weight
#' signatures taking into account differences in trinucleotide
#' frequency in whole genome versus the the sequencing platform.
#' When built in data setting are used nomr96 is not necessary.
#' @param readjust when set TRUE the cutoffs are readjusted to
#' match the median SNV counts in the data which may differ from
#' the dataset used in tuning the MVA classifier. Set to F by
#' default in order to avoid readjusting for dataset with too
#' few samples which may bias the prediction.
#' @param return_df when set TRUE instead of saving the output
#' to file it returns the data frame
#' @param input_df the input data frame, can be used instead of
#' genome file. Provide one or the other.
#'
#' @examples
#' run(genome_file = "input_genomes.csv",
#' data = "msk",
#' tumor_type = "ovary")
#' run(genome_file = "input_genomes.csv",
#' data = "seqcap",
#' tumor_type = "bone_other")
run <- function(genome_file = NULL,
output_file = NULL,
data = "msk",
tumor_type = "breast",
catalog_name = NULL,
do_assign = T,
do_mva = T,
check_msi = F,
weight_cf = F,
lite_format = F,
add_sig3 = F,
norm96 = NULL,
custom = F,
readjust = F,
return_df = F,
input_df = NULL,
snv_cutoff = 5){
if(is.null(catalog_name))
stop('please provide a catalog_name argument options: ', paste0(names(catalogs), sep = ' '))
if(!(catalog_name %in% names(catalogs)))
stop(paste0('catalog ', catalog_name, ' does not exist provide one of the following: ', paste0(names(catalogs), sep = ' ')))
cosmic_catalog <- catalogs[[catalog_name]]
# when readjust is set to FALSE the parameters below are not used
cut_var = NULL
limits = NULL
cutoffs_recalculated = NULL
# only if readjust is set to TRUE the data setting is controlled
# against other options to pick the best option and overwritten
best_model <- NULL
below_cutoff <- NULL
if(!check_msi & readjust){
best_model <- find_data_setting(input_file = genome_file,
remove_msi_pole = F,
tumor_type = tumor_type,
input_df = input_df)
if(best_model != data){
warning(paste0('best data setting differs from the data setting
used so the setting was changed to: ', best_model))
data <- best_model
}
}
if((data == 'seqcap' | data == "seqcap_probe") & do_mva & !weight_cf){
warning('weight_cf was set to FALSE but the seqcap and seqcap_probe requires
weight_cf to be TRUE, so this setting was change to TRUE')
weight_cf = T
}else if(data %in% c('wgs', 'wgs_pancan', 'tcga_mc3')){
weight_cf = F
}
# There are trained MVA models for tumor_type "eso", "osteo", "ovary",
# "panc_ad", "panc_en", "prost", "stomach", "uterus", "breast", "bladder"
# for other tumor types do_mva should be FALSE
# Signature 3 likelihood and all other variables can still be calculated
# by setting add_sig3 = T
if(do_mva & !custom & sum(tumor_type == names(gbm_models[[data]])) == 0){
stop('No built-in MVA models for the tumor_type selected for
targetted gene panels for "medullo" or "ewing" whole
exome sequencing is available for others set do_mva to FALSE')
}
if(!add_sig3 & do_assign & ('Signature_3' %in% signames_per_tissue_per_catalog[[catalog_name]][[tumor_type]]) & sum(grepl('Signature_3_',colnames(all_catalogs[[tumor_type]]))) ==0 ){
add_sig3 <- T
}
if(is.null(genome_file) & !is.null(input_df)){
genomes <- input_df
}
else if(!is.null(genome_file) & is.null(input_df)){
genomes <- read.csv(genome_file)
}
else{
stop('genome_file or input file should point to the input dataset')
}
# remove genomes with no mutation
if(sum(is.na(rowSums(genomes[, 1:96]))) > 0){
genomes <- genomes[!is.na(rowSums(genomes[, 1:96])), ]
}
if(sum(rowSums(genomes[, 1:96]) == 0) > 0){
if(readjust)
below_cutoff <- genomes[is.na(rowSums(genomes[,1:96])),]
genomes <- genomes[which(rowSums(genomes[, 1:96]) > 0), ]
}
# lower cutoff on number mutations for SigMA
if(do_assign | do_mva){
if(!is.null(snv_cutoff)){
if(readjust){
if(is.null(below_cutoff))
below_cutoff <- genomes[rowSums(genomes[,1:96]) < snv_cutoff,]
else{
below_cutoff <- rbind(below_cutoff,
genomes[rowSums(genomes[,1:96]) < snv_cutoff,])
}
}
genomes <- genomes[rowSums(genomes[,1:96]) >= snv_cutoff,]
}
else{
if(data == "msk" | data == "op" | data == "fo"){
if(tumor_type == "prost") snv_cutoff <- 4
else if(tumor_type == "osteo" | tumor_type == "panc_en") snv_cutoff <- 3
else snv_cutoff <- 5
}else if(data %in% names(platform_names)){ # for exomes and wgs a larger lower cutoff is applied
if(tumor_type == "bone_other" | tumor_type == "medullo") snv_cutoff <- 5
else snv_cutoff <- 10
}
}
}
if(!custom){
message(paste0("You are running SigMA for ", dim(genomes)[[1]],
" ", tissue_names[[tumor_type]], "(s) sequenced by ",
platform_names[[data]]))
}else{
message(paste0("You are running SigMA for ", dim(genomes)[[1]],
" ", tissue_names[[tumor_type]]))
}
# method names to calculate different features
methods <- c("median_catalog", "cosine_simil", "decompose")
# signature catalogs to be used for each method
sig_catalogs <- c("average", "cosmic_tissue", "cosmic_tissue")
# first the samples are assumed to be mss and later msi
# calculations are done
steps <- c("mss", "mss", "mss")
# signature to be identified from mss samples
signames <- rep("Signature_3", 3)
# for breast cancer there is an additional feature that is
# the weighted likelihood
if(tumor_type == "breast"){
methods <- c(methods, "weighted_catalog")
sig_catalogs <- c(sig_catalogs, "cosmic_tissue")
steps <- c(steps, "mss")
signames <- c(signames, "Signature_3")
}
if(do_mva){
methods <- c(methods, "mva")
sig_catalogs <- c(sig_catalogs, "none")
signames <- c(signames, "Signature_3")
steps <- c(steps, "mss")
}
if(check_msi){
methods <- c(methods, "median_catalog", "decompose")
sig_catalogs <- c(sig_catalogs, "average", "cosmic_tissue")
steps <- c(steps, "msi", "msi")
signames <- c(signames, "Signature_msi")
}
for(imethod in 1:length(methods)){
sig_catalog <- sig_catalogs[[imethod]]
method <- methods[[imethod]]
step <- steps[[imethod]]
if(method == "median_catalog"){
average_catalog <- all_catalogs[[tumor_type]]
if(add_sig3) average_catalog$Signature_3_c1 <- all_catalogs$breast$Signature_3_c1
message("Calculating likelihoods for each cluster")
}else if(method == "cosine_simil"){
message("Calculating cosine similarities")
}else if(method == "decompose"){
message("Calculating exposures with NNLS")
}else if(method == "mva"){
message("Predicting MVA score")
}
if(sig_catalog == "cosmic"){
signatures <- cosmic_catalog
}
if(sig_catalog == "cosmic_tissue"){
if(step == "mss" & method != "cosine_simil"){
signatures <- cosmic_catalog[, signames_per_tissue_per_catalog[[catalog_name]][[tumor_type]]]
}else{
signatures <- cosmic_catalog[, unique(c(signames_per_tissue_per_catalog[[catalog_name]][[tumor_type]],
signames_per_tissue_per_catalog[[catalog_name]][["msi"]],
signames_per_tissue_per_catalog[[catalog_name]][["msi_extra"]],
signames_per_tissue_per_catalog[[catalog_name]][["pole"]]))]
}
if(add_sig3) signatures$Signature_3 <- cosmic_catalog$Signature_3
}
if(sig_catalog == "average"){
signatures_msi <- all_catalogs[["msi"]]
if(step == "mss") signatures <- average_catalog
else signatures <- cbind(average_catalog,
signatures_msi,
all_catalogs[["pole"]])
signatures <- unique(signatures)
}
# scale for the tri-nucleotide context
if(data != "wgs" & data != "wgs_pancan"){
if(!is.null(norm96)){
signatures <- norm96*signatures
}
else{
if(data %in% names(weight_3Nfreq)){
signatures <- unlist(weight_3Nfreq[data]) * signatures
}else{
stop('choose a built-in model or provide trinucleotide normalization')
}
}
}
signatures_norm <- t(t(signatures)/colSums(signatures))
colnames(signatures_norm) <- colnames(signatures)
signatures <- signatures_norm
rm(signatures_norm)
# set the column with total number of mutations if this column
# doesn't exist
if(sum(colnames(genomes) == "total_snvs") == 0)
genomes$total_snvs <- rowSums(genomes[, 1:96])
#calculate the likelihood/cos simil/decomposition
if(method != "mva"){
if(method == 'median_catalog'){
# When likelihoods are calculated they are scaled with the number
# of tumors in the public datasets that are in that specific cluster
# with respect to which likelihoods are calculated. For WES data
# this is the default for panel data there is the option of not
# having this weighting to make the model more robust
if(weight_cf){
cluster_fractions_this <- cluster_fractions[[tumor_type]]
if(check_msi & step == 'msi'){
msi_count <- cluster_fractions$msi_tissue_weights[[tumor_type]]
pole_count <- cluster_fractions$pole_tissue_weights[[tumor_type]]
counts <- c(cluster_fractions_this,
msi_count*cluster_fractions$msi/sum(cluster_fractions$msi),
pole_count*cluster_fractions$pole/sum(cluster_fractions$pole))
names(counts) <- c(names(cluster_fractions_this),
names(cluster_fractions$msi),
names(cluster_fractions$pole))
cluster_fractions_this <- counts
}
}else{
cluster_fractions_this <- NULL
}
output <- match_to_catalog(genomes,
signatures,
method = method,
data = data,
cluster_fractions = cluster_fractions_this)
}else{
output <- match_to_catalog(genomes,
signatures,
method = method,
data = data)
}
if(step == "msi"){
colnames(output) <- paste0(colnames(output), "_msi")
}
}
else{ # calculate the multivariate analysis score
output <- predict_mva(cbind(genomes, merged_output),
signames[[imethod]],
data,
tumor_type,
weight_cf,
custom)
}
# if check_msi is set to FALSE meaning that user has provided only the
# mismatch repair proficient samples then the adjustment is done initially
if(method == "mva" & readjust){
# if readjust is set to TRUE the cutoffs are determined based on average
# SNV cuts. 1. MSI and POLE-exo mutated sampels are removed. 2. The best model is
# determined, 3. cutoffs are recalculated if the average count of this dataset
# differs from the data used to tune the models
combined <- cbind(genomes, merged_output, output)
if(check_msi){
inds <- which((merged_output$Signature_msi_ml > 0.99 | merged_output$Signature_pole_msi > 0.99)
& merged_output$total_snvs > median(merged_output$total_snvs))
if(length(inds) > 0) df_no_msi_pole <- combined[-inds,]
else df_no_msi_pole <- combined
}
else{
df_no_msi_pole <- combined
}
if(is.null(best_model)){
best_model <- find_data_setting(input_file = NULL,
input_df = df_no_msi_pole,
remove_msi_pole = F,
tumor_type = tumor_type)
warning(paste0('Changed to the best data setting is:', best_model))
}
if(best_model != data){
if(exists('merged_output')) rm('merged_output')
if(exists('output')) rm('output')
SigMA_output <- run(genome_file = genome_file,
output_file = output_file,
data = best_model,
tumor_type = tumor_type,
do_assign = do_assign,
do_mva = do_mva,
check_msi = check_msi,
weight_cf = weight_cf,
lite_format = lite_format,
add_sig3 = add_sig3,
norm96 = norm96,
custom = custom,
readjust = readjust,
return_df = return_df,
input_df = input_df)
return(SigMA_output)
}
else{
adjusted_cutoff <- adjust_cutoff(df_no_msi_pole, data, tumor_type, below_cutoff)
if(!is.null(adjusted_cutoff)){
custom <- T
cut_var <- adjusted_cutoff$cut_var
limits <- adjusted_cutoff$limits
cutoffs_recalculated <- adjusted_cutoff$cutoffs
}
else{
simul_df <- quick_simulation(input_file = NULL,
input_df = df_no_msi_pole,
tumor_type = tumor_type,
data = best_model,
remove_msi_pole = F,
return_df = T,
run_SigMA = F,
below_cutoff = below_cutoff)
# algorithm is run on simulations
output_simul_df <- run(genome_file = NULL,
input_df =simul_df,
data = best_model,
tumor_type = tumor_type,
do_mva = T,
do_assign = T,
check_msi = F,
return_df = T)
cut_var <- 'fpr'
limits <- c(0.1, 0.01)
thresh <- get_threshold(output_simul_df,
limits, var = 'Signature_3_mva', cut_var = cut_var)
cutoffs_recalculated <- thresh$cutoff
warning('adjusted cutoff was not available because the SNV count
differs from the simulations used to tune the
classifier. Results are likely to be subobtimal try retuning see
SigMA/examples/test_tune_new.R for a tutorial')
}
}
}
# calculates the pass/fail boolean from MVA score or likelihood
if(do_assign &
(method == "median_catalog" | method == "mva")){
output_comb <- cbind(genomes, output)
assignments <- assignment(output_comb,
method = method,
signame = signames[[imethod]],
data = data,
tumor_type = tumor_type,
weight_cf = weight_cf,
custom_model = custom,
cut_var = cut_var,
limits = limits,
cutoffs_custom = cutoffs_recalculated)
output <- cbind(output, assignments)
}
if(!exists("merged_output")) merged_output <- output
else merged_output <- cbind(merged_output, output)
if(!('rat_sig3' %in% colnames(merged_output))){
if('exp_sig3' %in% colnames(merged_output)){
merged_output$rat_sig3 <- merged_output$exp_sig3/genomes$total_snvs
}
}
if(!("Signature_3_ml" %in% colnames(merged_output))){
inds <- na.omit(match(paste0('Signature_3_c', 1:10, '_ml'), colnames(merged_output)))
if(length(inds) > 1)
merged_output$Signature_3_ml <- rowSums(merged_output[, inds])
if(length(inds) == 1)
merged_output$Signature_3_ml <- merged_output$Signature_3_c1_ml
}
}
merged_output <- cbind(genomes, merged_output)
# lite format removes cosine similarities and NNLS + likelihood results for
# signatures other than Signature 3 it keeps likelihoods with respect to
# cluster averages, but combines the different clusters in different
# categories together
if(return_df) return(merged_output)
else{
if(!is.null(best_model)) data = best_model
if(data %in% names(platform_names)){
platform_name = gsub(platform_names[[data]],
pattern = " ", replace = "")
}
else{
platform_name = paste0('Custom_', data)
}
if(is.null(output_file) & !is.null(genome_file)){
output_file = gsub(genome_file,
pattern = ".csv",
replacement = paste0("_output_tumortype_",
tumor_type,
"_platform_",
platform_name,
"_cf", as.integer(weight_cf),
".csv"))
}else if(is.null(output_file) & is.null(genome_file)){
output_file = paste0("SigMA_output_tumor_type_",
tumor_type,
"_platform_",
platform_name,
"_cf", as.integer(weight_cf),
".csv")
}
if(lite_format){
lite <- lite_df(merged_output)
output_file <- gsub(output_file, pattern = '\\.csv', replace = '_lite\\.csv')
write.table(lite,
output_file,
row.names = F,
col.names = T,
quote = F,
sep = ",")
if(file.exists(output_file))
message(paste0("SigMA output is in: ", output_file))
}else{
write.table(merged_output,
output_file,
row.names = F,
col.names = T,
quote = F,
sep = ",")
if(file.exists(output_file))
message(paste0("SigMA output is in: ", output_file))
}
return(output_file)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.