#' The tune_new_gbm() function is used for tuning a new model
#' for users who would like to use the algorithm for sequencing
#' platform without built in models. See SigMA/test/test_tune_new.R
#' for an example. Additional functions for determining the
#' sensitivity, FPR, FDR and thresholds to use for the new model:
#' sen_fpr(), get_threshold(), and for adding and removing gbm
#' models to the algorithm.
#'
#' @param input_file the path to the file that contains the
#' data table that will be used in run.R function
#' @param rda_file the path to an rda file to save the tuned
#' gbm model, if it is NULL function returns the model
#' @param tumor_type see list_tumor_types() for more information
#' @param data sequencing platform see list_data_options() for more
#' information
#' @param norm96 is a 96-dimensional array with weights that adjusts
#' the trinucleotide frequency in the specific sequencing platform
#' to the whole genome frequency. You can use get_trinuc_norm() for
#' calculating the weight from a bed file. If norm96 is NULL the
#' normalization for the platform determined by data settings is used
#' @param run_SigMA boolean indicating whether the input file
#' is the output of an initial SigMA calculation (to be set to FALSE)
#' or whether it is an unprocessed file (to be set to TRUE).
tune_new_gbm <- function(input_file,
rda_file = NULL,
tumor_type = NULL,
data = NULL,
catalog_name = NULL,
norm96 = NULL,
run_SigMA = T,
snv_cutoff = 5,
feature_cutoff = 0.01,
colname_truth_tag = 'is_sig3',
use_indels = F,
gbm_parameters = list(n.trees = 5000,
shrinkage = 0.01,
bag.fraction = 0.2,
n.minobsinnode = 3,
cv.fold = 5)){
if(is.null(tumor_type) & !run_SigMA){
if(!grepl('tumor_type', input_file)) stop('tumor_type not provided')
tumor_type <- unlist(lapply(strsplit(input_file, split = 'tumortype'),
function(x){x[[2]]}))
tumor_type <- unlist(lapply(strsplit(tumor_type, split = '_'),
function(x){x[[1]]}))
}
else if(is.null(tumor_type)){
stop('tumor_type mising see get_tumor_types()')
}
if(run_SigMA){
if(is.null(catalog_name))
stop('please provide a catalog_name argument options: ', paste0(names(catalogs), sep = ' '))
if(!(catalog_name %in% names(catalogs)))
stop(paste0('catalog ', catalog_name, ' does not exist provide one of the following: ', paste0(names(catalogs), sep = ' ')))
output_file <- run(input_file,
tumor_type = tumor_type,
data = data,
catalog_name = catalog_name,
norm96 = norm96,
do_assign = F,
check_msi = F,
do_mva = F,
snv_cutoff = snv_cutoff)
}
else{
output_file <- input_file
}
gbm_model <- tune_gbm_model(output_file, snv_cutoff, feature_cutoff, gbm_parameters, colname_truth_tag, use_indels)
output_table <- predict_prob(output_file, gbm_model)
write.table(output_table,
file = gsub(input_file,
pattern = '.csv',
replace = '_predictions.csv'),
row.names = F, sep = ',', quote = F)
if(!is.null(rda_file)) save(gbm_model, file = rda_file)
else return(gbm_model)
}
tune_gbm_model <- function(file, snv_cutoff, feature_cutoff, gbm_parameters, colname_truth_tag, use_indels){
df <- read.csv(file)
df <- df[df$total_snvs >= snv_cutoff,]
df$is_true <- df[, colname_truth_tag]
if(sum(!(df$is_true %in% c(0, 1)), na.rm = T) > 0){
stop('The column with truth tag should be 0 or 1. Currently binary classification is implemented')
}
features_gbm <- unique(c(features_gbm, colnames(df)[grepl('Signature_', colnames(df)) & grepl('_ml', colnames(df))]))
if(use_indels)
features_gbm <- c(features_gbm, 'mmej', 'nhej')
features_gbm <- unique(c(features_gbm, colnames(df)[grepl('_sig3', colnames(df)) & !grepl('is_sig3', colnames(df))]))
features_gbm <- features_gbm[!is.na(match(features_gbm, colnames(df)))]
df[,colname_truth_tag] <- as.factor(df[,colname_truth_tag])
n.trees = gbm_parameters$n.trees
shrinkage = gbm_parameters$shrinkage
bag.fraction = gbm_parameters$bag.fraction
n.minobsinnode = gbm_parameters$n.minobsinnode
cv.fold = gbm_parameters$cv.fold
gbm_model <- gbm::gbm(formula = is_true ~ .,
distribution = 'bernoulli',
data = na.omit(df[, c(features_gbm, 'is_true')]),
n.trees = n.trees,
shrinkage = shrinkage,
bag.fraction = bag.fraction,
n.minobsinnode= n.minobsinnode,
cv.fold = cv.fold,
n.cores = 1)
bestTreeForPrediction = gbm::gbm.perf(gbm_model)
gbm_model <- gbm::gbm(formula = is_true ~ .,
distribution = 'bernoulli',
data = na.omit(df[, c(features_gbm, 'is_true')]),
n.trees = bestTreeForPrediction,
shrinkage = shrinkage,
bag.fraction = bag.fraction,
n.minobsinnode= n.minobsinnode,
n.cores = 1)
rel_infs <- gbm::relative.influence(gbm_model)
features <- names(rel_infs)[rel_infs/sum(rel_infs) > feature_cutoff]
gbm_model <- gbm::gbm(formula = is_true ~ .,
distribution = 'bernoulli',
data = na.omit(df[, c(features, 'is_true')]),
n.trees = bestTreeForPrediction,
shrinkage = shrinkage,
bag.fraction = bag.fraction,
n.minobsinnode= n.minobsinnode,
n.cores = 1)
return(gbm_model)
}
predict_prob <- function(file, gbm_model){
df <- read.csv(file)
prediction = predict(object = gbm_model,
newdata = df,
n.trees = gbm_model$n.trees,
type = "response")
prediction[df$total_snvs < 5] <- 0
df$prob <- prediction
return(df)
}
# adds the gbm model in the system files of the package
add_gbm_model <- function(name_model,
tumor_type,
gbm_model = NULL,
cutoff = NULL,
cutoff_strict = NULL){
file_path <- system.file("extdata/gbm_models/",
package="SigMA")
if(file.exists(paste0(file_path, '/', name_model, '.rda')))
load(paste0(file_path, '/', name_model, '.rda'))
if(!exists('gbm_models_custom')) gbm_models_custom <- list()
gbm_models_custom[[tumor_type]] <- gbm_model
if(!exists('cutoffs_custom')) cutoffs_custom <- list()
cutoffs_custom[[tumor_type]] <- cutoff
if(!exists('cutoffs_strict_custom')) cutoffs_strict_custom <- list()
cutoffs_strict_custom[[tumor_type]] <- cutoff_strict
save(gbm_models_custom, cutoffs_custom, cutoffs_strict_custom, file = paste0(file_path, '/', name_model, '.rda'))
}
# remove a specific model if the tumor_type is
# specified the rest of the models are kept and just
# the one for that tumor type is removed if is null
# all the gbm models with the given name is removed
remove_gbm_model <- function(name_model,
tumor_type = NULL){
file_path <- system.file(paste0("extdata/gbm_models/", name_model, ".rda"),
package="SigMA")
if(is.null(tumor_type)){
file.rm(file_path)
}
else{
load(file_path)
ind <- which(names(gbm_models_custom[[name_model]]) == tumor_type)
gbm_models_custom <- gbm_models_custom[[-ind]]
cutoffs_custom <- cutoffs_custom[[-ind]]
cutoffs_strict_custom <- cutoffs_strict_custom[[-ind]]
save(gbm_models_custom, cutoffs_custom, cutoffs_strict_custom, file = file_path)
}
}
# calculates sensitivity FPR and FRD based on the parameter
# provided with the signal setting
sen_fpr <- function(df, var, signal){
df <- df[!is.na(df[, var]),]
df <- df[order(df[,var]),]
df$signal <- df[, signal]
#remove
total_signal <- sum(df$signal)
total_background <- dim(df)[[1]] - sum(df$signal == 1)
sen <- rep(0, dim(df)[[1]])
fpr <- rep(0, dim(df)[[1]])
fdr <- rep(0, dim(df)[[1]])
for(ifile in 1:dim(df)[[1]]){
sen[[ifile]] <- (total_signal - sum(df$signal[1:ifile] == 1))
sen[[ifile]] <- sen[[ifile]]/total_signal
fpr[[ifile]] <- sum(df$signal[ifile:dim(df)[[1]]] != 1)
fpr[[ifile]] <- fpr[[ifile]]/total_background
fdr[[ifile]] <- sum(df$signal[ifile:dim(df)[[1]]] != 1)
denom <- sum(df$signal[ifile:dim(df)[[1]]] != 1)
denom <- denom + (total_signal - sum(df$signal[1:ifile] == 1))
fdr[[ifile]] <- fdr[[ifile]]/denom
}
return(data.frame(sen = sen, fpr = fpr, fdr = fdr, value = df[,var]))
}
# get the threshold to be used to select samples with the signature
# based on the the parameter provided by the var setting.
# the signal defines the true values and cut_var defines whether the limits
# are set on sensitivity, FPR or FDR. For sensitivity the limits are applied
# as a lower bound and for FPR and FDR as an upper bound
get_threshold <- function(df, limits, var = 'prob',
signal = 'is_sig3', cut_var = 'fpr'){
df_sen_fpr <- sen_fpr(df, var, signal = signal)
sen_cutoff <- numeric()
fpr_cutoff <- numeric()
fdr_cutoff <- numeric()
cutoff_vec <- numeric()
for(limit in limits){
if(cut_var == "fpr" | cut_var == "fdr"){
cutoff <- min(df_sen_fpr$value[df_sen_fpr[, cut_var] <= limit], na.rm = T)
if(length(cutoff) == 0)
limit <- min(df_sen_fpr[,cut_var], na.rm = T)
}
else if(cut_var == "sen"){
cutoff <- max(df_sen_fpr$value[df_sen_fpr[,cut_var] >= limit], na.rm = T)
if(length(cutoff) == 0)
limit <- max(df_sen_fpr[,cut_var], na.rm = T)
}
else{
stop('cut var can be fpr, fdr or sen')
}
cutoff_vec <- c(cutoff_vec, cutoff)
sen_cutoff <- c(sen_cutoff,
max(df_sen_fpr$sen[df_sen_fpr$value > cutoff]))
fpr_cutoff <- c(fpr_cutoff,
max(df_sen_fpr$fpr[df_sen_fpr$value > cutoff]))
fdr_cutoff <- c(fdr_cutoff,
max(df_sen_fpr$fdr[df_sen_fpr$value > cutoff]))
}
return(list(cutoff = cutoff_vec,
sen = sen_cutoff,
fpr = fpr_cutoff,
fdr = fdr_cutoff))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.