# Copyright © 2014-2016 The YAPSA package contributors
# This file is part of the YAPSA package. The YAPSA package is licenced under
# GPL-3
#' Wrapper function for the Stratification of a Mutational Catalogue
#'
#' \code{run_SMC} takes as input a big dataframe constructed from a vcf-like
#' file of a whole cohort. This wrapper function calls custom functions to
#' construct a mutational catalogue and stratify it according to categories
#' indicated by a special column in the input dataframe:
#' \itemize{
#' \item \code{\link{create_mutation_catalogue_from_df}}
#' \item \code{adjust_number_of_columns_in_list_of_catalogues}
#' }
#' This stratification
#' yields a collection of stratified mutational catalogues, these are
#' reformatted and sent to the custom function \code{SMC} and thus
#' indirectly to \code{LCD_SMC} to perform a signature analysis
#' of the stratified mutational catalogues. The result is then handed over
#' to \code{\link{plot_SMC}} for visualization.
#'
#' @param my_table
#' A big dataframe constructed from a vcf-like file of a whole cohort. The
#' first columns are those of a standard vcf file, followed by an arbitrary
#' number of custom or user defined columns. One of these must carry a PID
#' (patient or sample identifyier) and one must be the category used for
#' stratification.
#' @param this_signatures_df
#' A numeric data frame \code{W} in with \code{n} rows and \code{l} columns,
#' \code{n} being the number of features and \code{l} being the number of
#' signatures
#' @param this_signatures_ind_df
#' A data frame containing meta information about the signatures
#' @param this_subgroups_df
#' A data frame indicating which PID (patient or sample identifyier) belongs
#' to which subgroup
#' @param column_name
#' Name of the column in \code{my_table} which is going to be used for
#' stratification
#' @param refGenome
#' FaFile of the reference genome to extract the motif context of the
#' variants in \code{my_table}
#' @param cohort_method_flag
#' Either or several of \code{c("all_PIDs","cohort","norm_PIDs")},
#' representing alternative ways to average over the cohort.
#' @param in_strata_order_ind
#' Index vector defining reordering of the strata
#' @param wordLength
#' Integer number defining the length of the features or motifs, e.g. 3 for
#' tripletts or 5 for pentamers
#' @param verbose_flag
#' Verbose if \code{verbose_flag=1}
#' @param target_dir
#' Path to directory where the results of the stratification procedure are
#' going to be stored if non-NULL.
#' @param strata_dir
#' Path to directory where the mutational catalogues of the different strata
#' are going to be stored if non-NULL
#' @param output_path
#' Path to directory where the results, especially the figures produced by
#' \code{\link{plot_SMC}} are going to be stored.
#' @param in_all_exposures_df
#' Optional argument, if specified, \code{H}, i.e. the overall exposures
#' without stratification, is set to equal \code{in_all_exposures_df}. This
#' is equivalent to forcing the LCD_SMC procedure to use e.g. the exposures
#' of a previously performed NMF decomposition.
#' @param in_rownames
#' Optional parameter to specify rownames of the mutational catalogue \code{V}
#' i.e. the names of the features.
#' @param in_norms
#' If specified, vector of the correction factors for every motif due to
#' differing trinucleotide content. If null, no correction is applied.
#' @param in_label_orientation
#' Whether or not to turn the labels on the x-axis.
#' @param this_sum_ind
#' Optional set of indices for reordering the PIDs
#'
#' @return A list with entries
#' \code{exposures_list},
#' \code{catalogues_list},
#' \code{cohort} and
#' \code{name_list}.
#' \itemize{
#' \item \code{exposures_list}:
#' The list of \code{s} strata specific exposures Hi, all are numerical data
#' frames with \code{l} rows and \code{m} columns, \code{l} being the number
#' of signatures and \code{m} being the number of samples
#' \item \code{catalogues_list}:
#' A list of \code{s} strata specific cohortwide (i.e. averaged over cohort)
#' normalized exposures
#' \item \code{cohort}:
#' \code{subgroups_df} adjusted for plotting
#' \item \code{name_list}:
#' Names of the contructed strata.
#' }
#'
#' @examples
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' data(sigs)
#' data(lymphoma_test)
#' data(lymphoma_cohort_LCD_results)
#' strata_list <-
#' cut_breaks_as_intervals(lymphoma_test_df$random_norm,
#' in_outlier_cutoffs=c(-4,4),
#' in_cutoff_ranges_list=list(c(-2.5,-1.5),
#' c(0.5,1.5)),
#' in_labels=c("small","intermediate","big"))
#' lymphoma_test_df$random_cat <- strata_list$category_vector
#' choice_ind <- (names(lymphoma_Nature2013_COSMIC_cutoff_exposures_df)
#' %in% unique(lymphoma_test_df$PID))
#' lymphoma_test_exposures_df <-
#' lymphoma_Nature2013_COSMIC_cutoff_exposures_df[,choice_ind]
#' temp_subgroups_df <- make_subgroups_df(lymphoma_test_df,
#' lymphoma_test_exposures_df)
#' mut_density_list <- run_SMC(lymphoma_test_df,
#' AlexCosmicValid_sig_df,
#' AlexCosmicValid_sigInd_df,
#' temp_subgroups_df,
#' column_name="random_cat",
#' refGenome=BSgenome.Hsapiens.UCSC.hg19,
#' cohort_method_flag="norm_PIDs",
#' in_rownames = rownames(AlexCosmicValid_sig_df))
#'
#' @seealso \code{\link{create_mutation_catalogue_from_df}}
#' @seealso \code{\link{normalizeMotifs_otherRownames}}
#' @seealso \code{\link{plot_SMC}}
#'
#' @export
#'
run_SMC <-
function(my_table, this_signatures_df, this_signatures_ind_df,
this_subgroups_df, column_name, refGenome,
cohort_method_flag = "all_PIDs",
in_strata_order_ind=seq_len(length(unique(my_table[,column_name]))),
wordLength = 3,verbose_flag = 1,
target_dir = NULL, strata_dir = NULL, output_path = NULL,
in_all_exposures_df=NULL, in_rownames = c(), in_norms = NULL,
in_label_orientation="turn", this_sum_ind=NULL) {
refGenome_Seqinfo <- seqinfo(refGenome)
if (verbose_flag==1) cat("\nYAPSA:::run_SMC::calling ",
"create_mutation_catalogue_from_df...\n")
merged_results_list <-
create_mutation_catalogue_from_df(my_table, refGenome_Seqinfo,
this_seqnames.field="CHROM",
this_start.field="POS",
this_end.field="POS",
this_PID.field="PID",
this_subgroup.field="subgroup",
this_rownames = in_rownames,
refGenome,wordLength,verbose_flag)
temp_rownames <- rownames(merged_results_list$matrix)
if (verbose_flag==1) cat("YAPSA:::run_SMC::calling ",
"stratify_and_create_mutational_catalogue...\n")
all_list <-
stratify_and_create_mutational_catalogue(my_table,column_name,target_dir,
strata_dir, refGenome_Seqinfo,
our_seqnames.field="CHROM",
our_start.field="POS",
our_end.field="POS",
our_PID.field="PID",
our_subgroup.field="subgroup",
refGenome,wordLength,verbose_flag,
temp_rownames)
name_list <- all_list$name_list
df_list <-
adjust_number_of_columns_in_list_of_catalogues(all_list,
merged_results_list)
mutation_catalogue_all_df <- as.data.frame(merged_results_list$matrix)
## adjust for trinucleotide content if necessary
if(!is.null(in_norms)) {
if (verbose_flag==1) cat("\nYAPSA:::run_SMC::adapting to different kmer ",
"distribution by calling ",
"normalizeMotifs_otherRownames...\n")
df_list <- lapply(df_list,
function(l) normalizeMotifs_otherRownames(l,in_norms))
mutation_catalogue_all_df <-
normalizeMotifs_otherRownames(mutation_catalogue_all_df,in_norms)
}
number_of_strata <- length(df_list)
number_of_sigs <- dim(this_signatures_df)[2]
if (verbose_flag==1) cat("\nYAPSA:::run_SMC::calling SMC...\n")
SMC_list <- SMC(df_list, this_signatures_df, in_all_exposures_df,
number_of_strata, number_of_sigs, name_list,
this_subgroups_df, mutation_catalogue_all_df,
cohort_method_flag, in_verbose=verbose_flag)
exposures_strata_list <- SMC_list$exposures_strata_list
exposures_both_rel_df_list <- SMC_list$exposures_both_rel_df_list
this_subgroups_df <- SMC_list$this_subgroups_df
decomposition_method <- SMC_list$decomposition_method
## plot
if (verbose_flag==1) {cat("YAPSA:::run_SMC::calling plot_SMC...\n");}
my_plot_list <- plot_SMC(number_of_strata,output_path,decomposition_method,
number_of_sigs,name_list,exposures_strata_list,
this_signatures_ind_df,this_subgroups_df,
in_strata_order_ind,exposures_both_rel_df_list,
cohort_method_flag,
in_label_orientation=in_label_orientation,
this_sum_ind=this_sum_ind)
return(list(exposures_list=exposures_strata_list,catalogues_list=df_list,
cohort=exposures_both_rel_df_list,name_list=name_list))
}
#' @import reshape2
#'
SMC <- function(df_list, this_signatures_df, in_all_exposures_df,
number_of_strata,number_of_sigs, name_list, this_subgroups_df,
mutation_catalogue_all_df, cohort_method_flag, in_verbose=1){
## this is the LCD_SMC procedure
if (in_verbose==1) cat("YAPSA:::SMC::calling LCD_SMC on per-PID data...\n")
exposures_strata_list <-
LCD_SMC(df_list,this_signatures_df,in_all_exposures_df)
exposures_all_df <- exposures_strata_list$exposures_all_df
sum_over_exposures_all_per_pid <- apply(exposures_all_df,2,sum)
sum_df <- data.frame(sum = sum_over_exposures_all_per_pid,
PID = colnames(exposures_all_df))
## make cohort-wide statistics
## 1. compute exposures in cohort as sum over PIDs
mean_over_exposures_all_per_sig <- apply(exposures_all_df,1,mean)
stderrmean_over_exposures_all_per_sig <- apply(exposures_all_df,1,stderrmean)
exposures_all_PIDs_df <-
data.frame(all_abs = mean_over_exposures_all_per_sig,
all_stderrmean = stderrmean_over_exposures_all_per_sig)
temp_denominator <- sum(exposures_all_PIDs_df$all_abs)
exposures_all_PIDs_df$all_rel <-
exposures_all_PIDs_df$all_abs /temp_denominator
exposures_all_PIDs_df$all_rstderrmean <-
exposures_all_PIDs_df$all_stderrmean / temp_denominator
for (i in seq_len(number_of_strata)){
my_index <- 4*(i)+1
temp_exposures_df <- exposures_strata_list$sub_exposures_list[[i]]
exposures_all_PIDs_df[,my_index] <- apply(temp_exposures_df, 1, mean)
exposures_all_PIDs_df[,my_index+1] <-
apply(temp_exposures_df, 1, stderrmean)
temp_denominator <- sum(exposures_all_PIDs_df[,my_index])
exposures_all_PIDs_df[,my_index+2] <-
exposures_all_PIDs_df[,my_index] / temp_denominator
exposures_all_PIDs_df[,my_index+3] <-
exposures_all_PIDs_df[,my_index+1] / temp_denominator
names(exposures_all_PIDs_df)[my_index] <- paste0(name_list[[i]] ,"_abs")
names(exposures_all_PIDs_df)[my_index+1] <- paste0(name_list[[i]],
"_stderrmean")
names(exposures_all_PIDs_df)[my_index+2] <- paste0(name_list[[i]], "_rel")
names(exposures_all_PIDs_df)[my_index+3] <- paste0(name_list[[i]],
"_rstderrmean")
}
exposures_all_PIDs_df$sig <- rownames(exposures_all_PIDs_df)
exposures_all_PIDs_df$method <- "all_PIDs"
## 2. compute exposures in cohort by running decomposition on a fused vector
if (in_verbose==1) cat("YAPSA:::SMC::compute exposures in cohort by ",
"running decomposition on a fused vector...\n")
catalogue_in_cohort_all <- apply(mutation_catalogue_all_df,1,sum)
catalogue_in_cohort_df <- data.frame(all_abs=catalogue_in_cohort_all)
catalogue_in_cohort_df$all_rel <-
catalogue_in_cohort_df$all_abs / sum(catalogue_in_cohort_df$all_abs)
catalogue_in_cohort_df_list <- list()
for (i in seq_len(number_of_strata)) {
my_index <- 2*(i)+1
temp_catalogue_df <- df_list[[i]]
catalogue_in_cohort_df[,my_index] <- apply(temp_catalogue_df,1,sum)
catalogue_in_cohort_df[,my_index+1] <-
catalogue_in_cohort_df[,my_index] /
sum(catalogue_in_cohort_df[,my_index])
names(catalogue_in_cohort_df)[my_index] <- paste0(name_list[[i]], "_abs")
names(catalogue_in_cohort_df)[my_index+1] <- paste0(name_list[[i]], "_rel")
catalogue_in_cohort_df_list[[i]] <-
data.frame(all_abs = catalogue_in_cohort_df[,my_index])
}
catalogue_in_cohort_in_exposures <- NULL
decomposition_method <- "LCD"
if (!is.null(in_all_exposures_df)) {
temp_vector <- apply(in_all_exposures_df, 1, sum)
catalogue_in_cohort_in_exposures <- data.frame(all_abs = temp_vector)
decomposition_method <- "NMF"
}
if (in_verbose==1) cat("YAPSA:::SMC::calling LCD_SMC ",
"on cohort-wide data...\n")
exposures_in_cohort_strata_list <- LCD_SMC(catalogue_in_cohort_df_list,
this_signatures_df,
catalogue_in_cohort_in_exposures)
temp_dim <- dim(exposures_in_cohort_strata_list$exposures_all_df)[1]
exposures_in_cohort_df <-
data.frame(all_abs = exposures_in_cohort_strata_list$exposures_all_df,
all_stderrmean = rep(0,temp_dim))
exposures_in_cohort_df$all_rel <- exposures_in_cohort_df$all_abs /
sum(exposures_in_cohort_df$all_abs)
exposures_in_cohort_df$all_rstderrmean <-
rep(0,dim(exposures_in_cohort_strata_list$exposures_all_df)[1])
for (i in seq_len(number_of_strata)) {
my_index <- 4*(i)+1
temp_exposures_df <-
exposures_in_cohort_strata_list$sub_exposures_list[[i]]
exposures_in_cohort_df[,my_index] <- temp_exposures_df$all_abs
exposures_in_cohort_df[,my_index+1] <- 0
exposures_in_cohort_df[,my_index+2] <- exposures_in_cohort_df[,my_index] /
sum(exposures_in_cohort_df[,my_index])
exposures_in_cohort_df[,my_index+3] <- 0
names(exposures_in_cohort_df)[my_index] <- paste0(name_list[[i]], "_abs")
names(exposures_in_cohort_df)[my_index+1] <- paste0(name_list[[i]],
"_stderrmean")
names(exposures_in_cohort_df)[my_index+2] <- paste0(name_list[[i]], "_rel")
names(exposures_in_cohort_df)[my_index+3] <- paste0(name_list[[i]],
"_rstderrmean")
}
exposures_in_cohort_df$sig <- rownames(exposures_in_cohort_df)
exposures_in_cohort_df$method <- "cohort"
## 3. average over relative exposures
## 3.a) compute relative exposures
exposures_strata_list$norm_exposures_all_df <-
normalize_df_per_dim(exposures_strata_list$exposures_all_df,2)
exposures_strata_list$sub_norm_exposures_list <-
lapply(exposures_strata_list$sub_exposures_list,
function(l) normalize_df_per_dim(l,2))
## 3.b) average and build data structure
mean_over_norm_exposures_all_per_sig <-
apply(exposures_strata_list$norm_exposures_all_df, 1, mean)
stderrmean_over_norm_exposures_all_per_sig <-
apply(exposures_strata_list$norm_exposures_all_df, 1, stderrmean)
norm_exposures_all_PIDs_df <-
data.frame(all_abs = rep(0,length(mean_over_norm_exposures_all_per_sig)),
all_stderrmean = rep(0,length(
mean_over_norm_exposures_all_per_sig)),
all_rel=mean_over_norm_exposures_all_per_sig,
all_rstderrmean=stderrmean_over_norm_exposures_all_per_sig)
temp_list <- lapply(exposures_strata_list$sub_norm_exposures_list,
function(l) apply(l,1,mean))
for (i in seq_len(number_of_strata)) {
my_index <- 4*(i)+1
norm_exposures_all_PIDs_df[,my_index] <- 0
norm_exposures_all_PIDs_df[,my_index+1] <- 0
norm_exposures_all_PIDs_df[,my_index+2] <-
average_over_present(
exposures_strata_list$sub_norm_exposures_list[[i]], 1)
norm_exposures_all_PIDs_df[,my_index+3] <-
stderrmean_over_present(
exposures_strata_list$sub_norm_exposures_list[[i]], 1)
names(norm_exposures_all_PIDs_df)[my_index] <- paste0(name_list[[i]],
"_abs")
names(norm_exposures_all_PIDs_df)[my_index+1] <- paste0(name_list[[i]],
"_stderrmean")
names(norm_exposures_all_PIDs_df)[my_index+2] <- paste0(name_list[[i]],
"_rel")
names(norm_exposures_all_PIDs_df)[my_index+3] <- paste0(name_list[[i]],
"_rstderrmean")
}
norm_exposures_all_PIDs_df$sig <- rownames(exposures_all_PIDs_df)
norm_exposures_all_PIDs_df$method <- "norm_PIDs"
## 4 unite the 3 different methods into one dataframe
exposures_combMethod_df <-
rbind(exposures_all_PIDs_df,
exposures_in_cohort_df,
norm_exposures_all_PIDs_df)
abs_ind <- grep(".*_abs|^sig$|^method$", names(exposures_combMethod_df))
rel_ind <- grep(".*_rel|^sig$|^method$", names(exposures_combMethod_df))
rstderrmean_ind <- grep(".*_rstderrmean|^sig$|^method$",
names(exposures_combMethod_df))
exposures_combMethod_abs_df <- exposures_combMethod_df[,abs_ind]
exposures_combMethod_rel_df <- exposures_combMethod_df[,rel_ind]
exposures_combMethod_rstderrmean_df <-
exposures_combMethod_df[,rstderrmean_ind]
## prepare for plotting
if (in_verbose==1) {cat("YAPSA:::SMC::prepare for plotting...\n");}
exposures_combMethod_rel_melt_df <-
melt(exposures_combMethod_rel_df, id.vars=c("sig","method"),
value.name="exposure")
exposures_combMethod_rstderrmean_melt_df <-
melt(exposures_combMethod_rstderrmean_df, id.vars=c("sig","method"),
value.name="exposure_rstderrmean")
exposures_combMethod_rel_melt_df$rstderrmean <-
exposures_combMethod_rstderrmean_melt_df$exposure_rstderrmean
exposures_combMethod_rel_melt_df$exposure_min <-
exposures_combMethod_rel_melt_df$exposure -
exposures_combMethod_rstderrmean_melt_df$exposure_rstderrmean
exposures_combMethod_rel_melt_df$exposure_max <-
exposures_combMethod_rel_melt_df$exposure +
exposures_combMethod_rstderrmean_melt_df$exposure_rstderrmean
counter <- 0
exposures_combMethod_rel_df_list <- list()
exposures_combMethod_rstderrmean_df_list <- list()
method_choice_ind <-
which(exposures_combMethod_rel_melt_df$method %in% cohort_method_flag)
exposures_combMethod_rel_melt_df <-
exposures_combMethod_rel_melt_df[method_choice_ind,]
for (temp_varname in unique(exposures_combMethod_rel_melt_df$variable)) {
counter <- counter + 1
temp_ind <- which(exposures_combMethod_rel_melt_df$variable==temp_varname)
exposures_combMethod_rel_df_list[[counter]] <-
exposures_combMethod_rel_melt_df[temp_ind,]
exposures_combMethod_rel_df_list[[counter]]$sig <-
factor(exposures_combMethod_rel_df_list[[counter]]$sig,
levels=unique(exposures_combMethod_rel_df_list[[counter]]$sig))
}
## adapt subgroups data structure
if(dim(mutation_catalogue_all_df)[2]==dim(this_subgroups_df)[1]) {
this_subgroups_df$PID <- colnames(mutation_catalogue_all_df)
this_subgroups_df$sum <- sum_df$sum
max_total_count <- max(this_subgroups_df$sum)
this_subgroups_df$compl_sum <- max_total_count - this_subgroups_df$sum
subgroup_ind <-
order(this_subgroups_df$subgroup, this_subgroups_df$compl_sum)
this_subgroups_df$index <- order(subgroup_ind)
} else {
cat("YAPSA:::SMC::Warning: dimension mismatch between ",
"mutation_catalogue_df and subgroup_df.\n")
q(status=5)
}
return(list(exposures_strata_list=exposures_strata_list,
exposures_both_rel_df_list=exposures_combMethod_rel_df_list,
this_subgroups_df=this_subgroups_df,
subgroup_ind=subgroup_ind,
decomposition_method=decomposition_method))
}
#' Apply statistical tests to a stratification (SMC)
#'
#' \code{stat_test_SMC} tests for enrichment or depletion in the different
#' strata of a stratification of the mutational catalogue for every signature
#' independently by applying Kruskal Wallis tests. For those signatures where
#' the Kruskal Wallis test gives a significant p-value, pairwise posthoc tests
#' are carried out by calling
#' \code{\link[PMCMR]{posthoc.kruskal.nemenyi.test}}. Additionally all data is
#' tested for normality by Shapiro Wilk tests, so that the user may apply
#' ANOVA and pairwise posthoc t-test where allowed.
#'
#' @param in_strat_list
#' A list with entries \code{exposures_list}, \code{catalogues_list},
#' \code{cohort} and \code{name_list} as in the output of
#' \code{\link{run_SMC}}.
#' \itemize{
#' \item \code{exposures_list}:
#' The list of \code{s} strata specific exposures Hi, all are numerical
#' data frames with \code{l} rows and \code{m} columns, \code{l} being the
#' number of signatures and \code{m} being the number of samples
#' \item \code{catalogues_list}:
#' A list of \code{s} strata specific cohortwide (i.e. averaged over
#' cohort) normalized exposures
#' \item \code{cohort}:
#' \code{subgroups_df} adjusted for plotting
#' \item \code{name_list}:
#' Names of the contructed strata.
#' }
#' @param in_flag
#' If "norm", all tests are performed on normalized exposures, otherwise the
#' absolute exposures are taken.
#'
#' @return A list with entries
#' \code{kruskal_df},
#' \code{shapiro_df},
#' \code{kruskal_posthoc_list},
#' \itemize{
#' \item \code{kruskal_df}:
#' A data frame containing results (statistic and p values) of the Kruskal
#' Wallis tests (tests for enrichment or depletion in the different strata
#' for every signature independently).
#' \item \code{shapiro_df}:
#' A data frame containing results (p values) of the Shapiro Wilk tests
#' (tests for normal distribution in the different strata for every
#' signature independently).
#' \item \code{kruskal_posthoc_list}:
#' A list of results of pairwise posthoc tests carried out for those
#' signatures where the Kruskal Wallis test yielded a significant p-value
#' (carried out by \code{\link[PMCMR]{posthoc.kruskal.nemenyi.test}}).
#' }
#'
#' @examples
#' NULL
#'
#' @seealso \code{\link{run_SMC}}
#' @seealso \code{\link[PMCMR]{posthoc.kruskal.nemenyi.test}}
#' @seealso \code{\link[stats]{kruskal.test}}
#' @seealso \code{\link{shapiro_if_possible}}
#' @seealso \code{\link[stats]{shapiro.test}}
#'
#' @import PMCMR
#' @export
#'
stat_test_SMC <- function(in_strat_list,in_flag="norm"){
my_number_of_strata <-
length(in_strat_list$exposures_list$sub_norm_exposures_list)
my_signatures <-
rownames(in_strat_list$exposures_list$sub_norm_exposures_list[[1]])
my_number_of_signatures <-
dim(in_strat_list$exposures_list$sub_norm_exposures_list[[1]])[1]
my_number_of_PIDs <-
dim(in_strat_list$exposures_list$sub_norm_exposures_list[[1]])[2]
out_stat_df <- repeat_df(NA,my_number_of_signatures,3)
shapiro_df <- repeat_df(NA,my_number_of_signatures,my_number_of_strata)
rownames(out_stat_df) <- my_signatures
rownames(shapiro_df) <- my_signatures
names(out_stat_df) <- c("Kruskal_statistic","df","Kruskal_p_val")
posthoc_list <- list()
for(temp_sig in my_signatures){
sig_exposures_vector <- c()
sig_strata_vector <- c()
for(j in seq_len(my_number_of_strata)){
if(in_flag=="norm"){
temp_df <- in_strat_list$exposures_list$sub_norm_exposures_list[[j]]
} else {
temp_df <- in_strat_list$exposures_list$sub_exposures_list[[j]]
}
PID_choice_ind <- which(colSums(temp_df)>0)
temp_exposures_vector <- as.numeric(temp_df[temp_sig,PID_choice_ind])
sig_exposures_vector <- c(sig_exposures_vector,temp_exposures_vector)
sig_strata_vector <- c(sig_strata_vector,
rep(in_strat_list$name_list[[j]],length(temp_exposures_vector)))
## catch exception for temp_exposures_vector being all equal values
## (all zero)
shapiro_df[temp_sig,j] <- shapiro_if_possible(temp_exposures_vector)
}
sig_strata_vector <- factor(sig_strata_vector)
sig_kruskal <- kruskal.test(sig_exposures_vector,sig_strata_vector)
posthoc_list[[temp_sig]] <-
posthoc.kruskal.nemenyi.test(x=sig_exposures_vector,g=sig_strata_vector)
out_stat_df[temp_sig,1] <- sig_kruskal$statistic
out_stat_df[temp_sig,2] <- sig_kruskal$parameter
out_stat_df[temp_sig,3] <- sig_kruskal$p.value
}
out_stat_df$Kruskal_p_val_BH <- p.adjust(out_stat_df$Kruskal_p_val,
method="BH")
return(list(kruskal_df=out_stat_df,shapiro_df = shapiro_df,
kruskal_posthoc_list = posthoc_list))
}
#' Test for differences in average signature exposures between subgroups
#'
#' Apply Kruskal-Wallis tests to detect differences in the signature exposures
#' between different subgroups. Uses
#' \code{\link{split_exposures_by_subgroups}}.
#' Algorithm analogous to \code{\link{stat_test_SMC}}.
#'
#' @param in_exposures_df
#' Numerical data frame of the exposures (i.e. contributions of the
#' different signatures to the number of point mutations per PID)
#' @param in_subgroups_df
#' Data frame indicating which PID belongs to which subgroup
#' @param in_subgroups.field
#' Name indicating which column in \code{in_subgroups_df} contains the
#' subgroup information
#' @param in_PID.field
#' Name indicating which column in \code{in_subgroups_df} contains the
#' PID information
#'
#' @return A list with entries
#' \code{kruskal_df},
#' \code{kruskal_posthoc_list},
#' \itemize{
#' \item \code{kruskal_df}:
#' A data frame containing results (statistic and p values) of the Kruskal
#' Wallis tests (tests for enrichment or depletion in the different strata
#' for every signature independently).
#' \item \code{kruskal_posthoc_list}:
#' A list of results of pairwise posthoc tests carried out for those
#' signatures where the Kruskal Wallis test yielded a significant p-value
#' (carried out by \code{\link[PMCMR]{posthoc.kruskal.nemenyi.test}}).
#' }
#'
#' @seealso \code{\link{split_exposures_by_subgroups}}
#' @seealso \code{\link{stat_test_SMC}}
#' @seealso \code{\link[PMCMR]{posthoc.kruskal.nemenyi.test}}
#' @seealso \code{\link[stats]{kruskal.test}}
#'
#' @examples
#' NULL
#'
#' @export
#'
stat_test_subgroups <- function(in_exposures_df,in_subgroups_df,
in_subgroups.field="subgroup",
in_PID.field="PID"){
# split the data with custom function
exposures_df_list <- split_exposures_by_subgroups(
in_exposures_df=in_exposures_df,in_subgroups_df=in_subgroups_df,
in_subgroups.field=in_subgroups.field,
in_PID.field=in_PID.field)
number_of_subgroups <- length(exposures_df_list)
my_signatures <- rownames(in_exposures_df)
number_of_signatures <- length(my_signatures)
out_stat_df <- repeat_df(NA,number_of_signatures,3)
shapiro_df <- repeat_df(NA,number_of_signatures,number_of_subgroups)
rownames(out_stat_df) <- my_signatures
rownames(shapiro_df) <- my_signatures
names(out_stat_df) <- c("Kruskal_statistic","df","Kruskal_p_val")
posthoc_list <- list()
for(temp_sig in my_signatures){
sig_exposures_vector <- c()
sig_subgroups_vector <- c()
for(j in seq_len(number_of_subgroups)){
temp_exposures_vector <- as.numeric(exposures_df_list[[j]][temp_sig,])
sig_exposures_vector <- c(sig_exposures_vector,temp_exposures_vector)
sig_subgroups_vector <- c(sig_subgroups_vector,
rep(names(exposures_df_list)[j],
length(temp_exposures_vector)))
}
sig_subgroups_vector <- factor(sig_subgroups_vector)
sig_kruskal <- kruskal.test(sig_exposures_vector,sig_subgroups_vector)
posthoc_list[[temp_sig]] <-
posthoc.kruskal.nemenyi.test(x = sig_exposures_vector,
g = sig_subgroups_vector)
out_stat_df[temp_sig,1] <- sig_kruskal$statistic
out_stat_df[temp_sig,2] <- sig_kruskal$parameter
out_stat_df[temp_sig,3] <- sig_kruskal$p.value
}
out_stat_df$Kruskal_p_val_BH <- p.adjust(out_stat_df$Kruskal_p_val,
method="BH")
return(list(kruskal_df = out_stat_df,shapiro_df=shapiro_df,
kruskal_posthoc_list = posthoc_list))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.