#' Prepare data for module: xCell
#'
#' The workflow of vigilante is highly module-based. To ensure a successful and smooth run, vigilante needs to prepare input data before continuing.
#'
#' @param doGE logic, whether to prepare Gene Expression (GE) data, if FALSE, no GE data will be available for downstream v_chmXcell function; if TRUE, please make sure GE data files are properly named, see Details for more information about file naming.
#' @param doLocalAnalysis logic, whether to perform local Cell Type Enrichment Analysis (based on xCell), if FALSE, no local analysis results will be available for downstream v_chmXcell function.
#' @param colSliceOrder character vector, the order (from left to right) of groups to be shown on the output plots and files, should be set to the same value when called from different functions within the same module (e.g., v_prepareVdata_xCell and v_chmXcell). By default will use the internal character vector "grpName" and can be left unchanged. Groups not included in the "colSliceOrder" will also be excluded from the output plots and files (in most cases), and from certain analysis process (depending on the situation).
#'
#' @details
#' Oftentimes input data files generated by upstream tools came with diverse naming conventions. It might be easy for the user to recognize those files, but not for vigilante if there is no recognizable patterns.
#'
#' To make input data files clear to vigilante, it would be nice to have them named something like "studyID_sampleID_(other descriptions).file extension". Here "studyID" is the name of the study or project, and it will be used in multiple naming situations (such as on the plot, or in the output file names), so it is recommended to be concise and meaningful.
#'
#' For module xCell, currently supported input data files are listed below, please contact the author if you want to add more files to the supported list:
#' Gene Expression (GE): *cDNA_genes.sf for Salmon
#'
#' @return list, because R CMD check discourages assignments to the global environment within functions, user needs to run the function with explicitly assigning the return value to a global variable named "prepareVdata_xCell_returnList", which will be a list containing the required variables for downstream analyses.
#'
#' @export
#'
# v_prepareVdata_xCell function
v_prepareVdata_xCell = function(doGE = FALSE, doLocalAnalysis = FALSE, colSliceOrder = grpName) {
# ask user to make sure v_prepareVdata_xCell function return value is assigned to the global variable named "prepareVdata_xCell_returnList"
status_returnValue = menu(choices = c("Yes", "No"), title = "\nIs v_prepareVdata_xCell function return value assigned to the global variable named 'prepareVdata_xCell_returnList'?")
if (status_returnValue == 1) {
print("Function return value is properly assigned, now continue to the next step")
} else if (status_returnValue == 2) {
print("Please follow the instruction to assign the function return value before continue to the next step")
stop()
} else {
print("Please choose a valid answer")
stop()
}
rm(status_returnValue)
# check if globalSettings_returnList exists
if (!exists("globalSettings_returnList")) {
print("globalSettings_returnList not detected, please follow the instruction and run v_globalSettings function before continue")
stop()
} else {
print("globalSettings_returnList detected, start extracting global settings")
}
# extract global settings from return list
for (i in 1:length(globalSettings_returnList)) {
assign(x = names(globalSettings_returnList)[i], value = globalSettings_returnList[[i]])
}
rm(i)
# preset variables for temp_prepareVdata_xCell_returnList
ge.list.c = NULL
ge.xcell = NULL
ge.xcell.result = NULL
assayID_rna = NULL
plot_name_rna = NULL
# if condition for whether to reform GE data
if (doGE == TRUE) {
# check if GE data exist
ge_file_name = dir(path = folder_name, pattern = glob2rx("*cDNA_genes.sf"))
if (length(ge_file_name) == 0) {
print("GE data not detected, please double check if GE data files are properly named, or set doGE = FALSE to skip GE data processing")
stop()
} else {
print("Start processing GE data")
}
# load and extract gene expression data
ge_path = paste0("./", folder_name, "/", ge_file_name)
ge.list = lapply(ge_path, read.table, header = TRUE, stringsAsFactors = FALSE)
ge.list = lapply(ge.list, function(x) x[(names(x) %in% c("Name", "TPM"))])
ge_header = c("ENSG", "TPM")
ge.list = lapply(ge.list, setNames, ge_header)
if (exists("gdc_ge.list_FPKM")) {
names(ge.list) = assayID[1:(length(assayID) - length(gdc_ge.list_FPKM))]
} else {
names(ge.list) = assayID
}
# filter out unexpected ENST/transcripts in ge.list
ge.list = lapply(ge.list, function(x) {
temp_ENST = grepl("^ENST.+$", x$ENSG)
x = subset(x, subset = !temp_ENST)
return(x)
})
# remove ENSG version suffix in ge.list
ge.list = lapply(ge.list, function(x) {
x$ENSG = gsub("\\.[[:digit:]]+$", "", x$ENSG)
return(x)
})
# map gene expression data to GRCh38, or do homolog lifting if it is non-human
if (speciesID == "hg38") {
ge.list = lapply(ge.list, merge, x = GRCh38G, by = "ENSG")
} else if (speciesID == "hg19") {
ge.list = lapply(ge.list, merge, x = GRCh37G, by = "ENSG")
} else {
# homolog lifting from Mouse to Human
ge.list = lapply(ge.list, merge, x = GRCm2h38C, by.x = "ENSG_M", by.y = "ENSG")
ge.list = lapply(ge.list, function(x) x[, -c(1, 2)])
}
ge.list = lapply(ge.list, function(x) x[, c(2, 6)]) # for xCell
# append TCGA data if available
if (exists("gdc_ge.list.c_xcell")) {
ge.list = c(ge.list, gdc_ge.list.c_xcell)
}
# filter out sample with empty ge data, using backwards for loop
if (exists("gdc_ge.list_FPKM")) {
plot_name_rna = plot_name[1:(length(assayID) - length(gdc_ge.list_FPKM))]
assayID_rna = assayID[1:(length(assayID) - length(gdc_ge.list_FPKM))]
} else {
plot_name_rna = plot_name
assayID_rna = assayID
}
for (i in length(ge.list):1) {
if (nrow(ge.list[[i]]) == 0) {
ge.list[[i]] = NULL
plot_name_rna = plot_name_rna[-i]
assayID_rna = assayID_rna[-i]
}
}
rm(i)
# append grouped list to original list
ge.list.c = ge.list
# remove temp ge data
rm(ge.list, ge_file_name, ge_header, ge_path)
# unify gene numbers in ge.list.c in case samples have differnt number of genes
ge.unif = dplyr::bind_rows(ge.list.c)
ge.unif["Occurrence"] = 1
ge.unif = ge.unif[, -2]
ge.unif = ge.unif %>% dplyr::group_by(Gene) %>% dplyr::summarise(TotalOccurrence = sum(Occurrence))
ge.unif = subset(ge.unif, subset = TotalOccurrence == length(ge.list.c))
ge.unif = unlist(ge.unif[, 1])
ge.list.c = lapply(ge.list.c, function(x) {
x = subset(x, subset = Gene %in% ge.unif)
return(x)
})
rm(ge.unif)
# merge separate dataframes into one and rename TPM columns
ge.xcell = ge.list.c[[1]]
for (i in 2:length(ge.list.c)) {
ge.xcell = cbind(ge.xcell, ge.list.c[[i]][, 2])
}
rm(i)
colnames(ge.xcell) = c("Gene", plyr::mapvalues(names(ge.list.c), from = groupInfo$assayID, to = groupInfo$aliasID, warn_missing = FALSE))
# calculate average GE for duplicated genes
ge.xcell = ge.xcell %>% dplyr::group_by(Gene) %>% dplyr::summarise_all(.vars = colnames(ge.xcell)[-1], .fun = mean) # non-unique colnames error doesn't have an effect; also run may get stuck here, use manual stop to generate normal results
print("GE data processing completed")
}
# if condition for whether to conduct local analysis
if (doLocalAnalysis == TRUE) {
# check if GE data are already reformed
if (is.null(ge.xcell)) {
print("Reformed GE data not detected, please double check if GE data are already reformed, or set doLocalAnalysis = FALSE to skip local analysis")
stop()
} else {
print("Start local analysis")
}
# reform ge.xcell to ge.xcell.prep to meet xCell input requirement
tempRownames = unlist(ge.xcell[, "Gene"])
tempRownamesUniq = unique(tempRownames)
ge.xcell.prep = as.matrix(ge.xcell[, -1], stringAsFactor = FALSE)
rownames(ge.xcell.prep) = tempRownamesUniq
rm(tempRownames, tempRownamesUniq)
# subset ge.xcell.prep to be consistent with downstream colSliceOrder
if (length(colSliceOrder) != length(grpName)) {
print(as.character(glue::glue("Group not processed in xCell: {setdiff(grpName, colSliceOrder)}")))
colnames_temp = colnames(ge.xcell.prep)
grp.list.xcell_temp = list()
for (i in 1:length(grpName)) {
grp.list.xcell_temp[[i]] = grp.list[[i]][grp.list[[i]] %in% names(ge.list.c)]
}
rm(i)
names(grp.list.xcell_temp) = grpName
grp.list.xcell_temp = grp.list.xcell_temp[names(grp.list.xcell_temp) %in% colSliceOrder]
groupInfo_temp = subset(groupInfo, subset = Group %in% colSliceOrder)
colnames_temp = colnames_temp %in% groupInfo_temp$aliasID
ge.xcell.prep = subset(ge.xcell.prep, select = colnames_temp)
rm(colnames_temp, grp.list.xcell_temp, groupInfo_temp)
}
# local xCell analysis
ge.xcell.result = xCell::xCellAnalysis(ge.xcell.prep)
# remove xCellAnalysis byproduct variables
rm(ge.xcell.prep)
rm(progressBar, iSample, nSamples, envir = .GlobalEnv)
}
# add variables to temp_prepareVdata_xCell_returnList
temp_prepareVdata_xCell_returnList = list(
"ge.list.c" = ge.list.c,
"ge.xcell" = ge.xcell,
"ge.xcell.result" = ge.xcell.result,
"assayID_rna" = assayID_rna,
"plot_name_rna" = plot_name_rna
)
# remove variables already added to temp_prepareVdata_xCell_returnList
rm(list = names(temp_prepareVdata_xCell_returnList))
# filter out empty variables, using backwards for loop
for (i in length(temp_prepareVdata_xCell_returnList):1) {
if (is.null(temp_prepareVdata_xCell_returnList[[i]])) {
temp_prepareVdata_xCell_returnList[[i]] = NULL
}
}
rm(i)
# remove extracted global settings
rm(list = names(globalSettings_returnList))
# end of v_prepareVdata_xCell function
print("v_prepareVdata_xCell run completed, return value saved to the global environment in **prepareVdata_xCell_returnList**")
return(temp_prepareVdata_xCell_returnList)
}
#' Calculate log10 fold-change value for module: xCell
#'
#' (Internal) Helper function, used to calculate log10 fold-change value for module xCell, should be called within the main function (v_chmXcell).
#'
#' @keywords internal
# v_chmFoldChangeLog10_xCell function
v_chmFoldChangeLog10_xCell = function(outputFolderPath, log10Threshold, grpName_fc, filterNoTPM) {
# get variables from parent function
studyID = get("studyID", envir = parent.frame())
# temporarily turn ge.chm.pmg into xcell.pmg for downstream calculation
xcell.pmg = get("ge.chm.pmg", envir = parent.frame())
# group up and calculate average xCell enrichment score
xcell.pmg.fc = subset(xcell.pmg, subset = Group %in% grpName_fc)
xcell.pmg.fc = xcell.pmg.fc %>% dplyr::group_by(Cell.Type, Group) %>% dplyr::summarise_at(.vars = "Enrichment.Score", .fun = list(Enrichment.Score_avg = mean))
# turn xcell.pmg.fc into a list
xcell.foldchange = list()
for (i in 1:length(grpName_fc)) {
xcell.foldchange[[i]] = subset(xcell.pmg.fc, subset = Group == grpName_fc[i])[, c(1, 3)]
}
rm(i, xcell.pmg.fc)
names(xcell.foldchange) = grpName_fc
# replace NA and negative value with 0
xcell.foldchange = lapply(xcell.foldchange, function(x) {
x[is.na(x[, 2]), 2] = 0
x[x[, 2] < 0, 2] = 0
return(x)
})
# filter out genes with Enrichment.Score = 0 in both groups
if (filterNoTPM == TRUE) {
xcell.foldchange = lapply(xcell.foldchange, function(x) {
noES = (xcell.foldchange[[1]][, 2] == 0) & (xcell.foldchange[[2]][, 2] == 0)
x = x[!noES, ]
})
}
# add 10e-32 to fix 0 denominator issue
min0offset_fc_xcell = min(sapply(xcell.foldchange, function(x) {min(x[x[, 2] > 0, 2])})) / 10
min0offset_fc_xcell = 10 ^ (floor(log10(min0offset_fc_xcell)))
xcell.foldchange = lapply(xcell.foldchange, function(x) {
x[, 2] = x[, 2] + min0offset_fc_xcell # add 10e-32 to fix log(0) issue
return(x)
})
# calculate fold change
xcell.foldchange[[3]] = xcell.foldchange[[grpName_fc[2]]]
xcell.foldchange[[3]][, 2] = log10(xcell.foldchange[[3]][, 2] / xcell.foldchange[[grpName_fc[1]]][, 2])
xcell.foldchange = xcell.foldchange[[3]]
colnames(xcell.foldchange) = c("Cell_Type", "Log10_FC")
# classify fold change based on threshold
xcell.foldchange["Exp_Level"] = dplyr::case_when(
xcell.foldchange[, 2] >= log10Threshold[1] ~ "Up-Regulated",
xcell.foldchange[, 2] <= log10Threshold[2] ~ "Down-Regulated",
(xcell.foldchange[, 2] < log10Threshold[1]) & (xcell.foldchange[, 2] > log10Threshold[2]) ~ "Within-Threshold"
)
# write xcell.foldchange to csv file
write.csv(xcell.foldchange, file = paste0(outputFolderPath, studyID, "_xCell_foldchange_3ExprLvls.csv"), quote = FALSE, row.names = FALSE)
# end of v_chmFoldChangeLog10_xCell function
print(as.character(glue::glue("v_chmFoldChangeLog10_xCell completed, output csv file saved in {outputFolderPath}")))
return(xcell.foldchange)
}
#' Combine the main body and side annotation of heatmap for module: xCell
#'
#' (Internal) Helper function, used to combine the main body and side annotation of heatmap for module xCell, should be called within the main function (v_chmXcell).
#'
#' @import magick
#'
#' @keywords internal
# v_magick_xCell function
v_magick_xCell = function(outputFolderPath, sigType) {
# get variables from parent function
studyID = get("studyID", envir = parent.frame())
chm_suffix = get("chm_suffix", envir = parent.frame())
# load plots to be combined
chm_mainbody = image_read(path = paste0(outputFolderPath, "HeatMap4K_", studyID, "_CellTypeEnrichmentScore", chm_suffix, "_", sigType, ".png"))
chm_bpanno = image_read(path = paste0(outputFolderPath, "BoxPlotAnno4K_", studyID, "_annotation_xCell", chm_suffix, "_", sigType, ".png"))
# combine bpanno and mainbody, watch for the order, from left to right
chm_combined = image_append(image = c(chm_bpanno, chm_mainbody), stack = FALSE)
# output combined plot
image_write(chm_combined, path = paste0(outputFolderPath, "CombinedHeatMap4K_", studyID, "_CellTypeEnrichmentScore", chm_suffix, "_", sigType, ".png"))
# remove temp variables
rm(chm_mainbody, chm_bpanno, chm_combined)
}
#' Main function for module: xCell
#'
#' Extract, clean and reform NGS data prepared by v_prepareVdata_xCell function, and then perform local Cell Type Enrichment Analysis (based on xCell) in addition to a set of in-house filtering process and other statistical analysis based on user's choice.
#'
#' @param outputFolderPath string, relative or absolute path to the output folder, trailing slash "/" required, can be set to NULL (no output file will be written to the file system), default "./_VK/_xCell/" for module xCell.
#' @param ge.xcell.result main input, automatically extracted from the prepared data, can (or probably should) be omitted in function call.
#' @param filterOutlier logic, whether to filter outliers and exclude them from downstream analysis.
#' @param filterOutlier_fromGroup character vector, if "filterOutlier" is set to TRUE, filter outliers from the selected groups and ignore outliers from other groups. By default will use the internal character vector "grpName" and can be left unchanged.
#' @param filterStringency numeric, the ratio that outliers can be allowed in each selected group, from 0 to 1, default 0.1 (10\%), calculation will be rounded down to integer. For example, if there are 24 samples in a group and filterStringency is set to 0.1, the number of allowed outliers in this group will be 2. See Details for more information about how "filterOutlier" works.
#' @param filterNoTPM logic, whether to filter out cell types that have Enrichment.Score = 0 in all samples, preferably TRUE to prevent error in significance test (error: data are essentially constant).
#' @param significanceTest logic, whether to perform significance test, the type of test will be automatically adjustd based on the input data and other settings (e.g. number of groups).
#' @param significanceTest_inputForm character, choose one from c("log10", "log2", "raw"), use log10, log2 or raw value to perform significance test. By default will use "raw" for module xCell, and preferably be left unchanged.
#' @param significanceTest_fdrq logic, whether to calculate and display false discovery rate q-value, user-set value may be automatically modified (with notice) due to interplay.
#' @param shadowGroup list(logic, numeric), [[1]] whether to create shadow/virtual group for each individual sample (grpName will be modified based on aliaseID); [[2]] number of shadow groups per sample. "shadowGroup" can only be used when each original group only contains one individual sample; only works for "significanceTest", will disable "calculateFC"; user-set value may be automatically modified (with notice) due to interplay.
#' @param calculateFC logic, whether to calculate log10 fold-change value, user-set value may be automatically modified (with notice) due to interplay.
#' @param log10Threshold numeric vector, upper and lower cutoff points in log10 form for deciding fold-change levels ("Up-Regulated", "With-Threshold", "Down-Regulated"). By default will use c(0.3, -0.3), which equal to the generally accepted fold-change threshold in non-log form (two-fold and half-fold, respectively).
#' @param rowSliceOrder character vector, the order (from top to bottom) of fold-change levels (if applicable) to be shown on the output plots and files. By default will use c("Up-Regulated", "Within-Threshold", "Down-Regulated") for module xCell, and preferably be left unchanged.
#' @param colSliceOrder character vector, the order (from left to right) of groups to be shown on the output plots and files, should be set to the same value when called from different functions within the same module (e.g., v_prepareVdata_xCell and v_chmXcell). By default will use the internal character vector "grpName" and can be left unchanged. Groups not included in the "colSliceOrder" will also be excluded from the output plots and files (in most cases), and from certain analysis process (depending on the situation).
#' @param grpName_fc character vector, two groups selected for fold-change calculation. Order matters, the first group will be used as the denominator, whereas the second as the nominator.
#' @param grpName_pval character vector, groups selected for significance test if "significanceTest" is set to TRUE. Order does not matter, but number of groups may affect the type of test. If provided, should have at least two groups.
#' @param addBpAnno logic, whether to add boxplot annotation to the left of the main heatmap, user-set value may be automatically modified (with notice) due to interplay.
#' @param unsupervisedClustering logic, whether to perform unsupervised clustering (currently support Euclidean distance method), if TRUE, will override "rowSliceOrder" and "colSliceOrder", as well as disable "addBpAnno".
#' @param colorScheme list(character, numeric vector), set color scheme for heatmap mainbody, [[1]] mode, choose one from c("continuous", "discrete"); [[2]] breakpoints, should be a numeric vector containing breakpoints ranging from 0 to 1, inclusive; moreover, breakpoints are only used in "discrete" mode, and will be ignored in "continuous" mode. Recommended values are list(mode = "continuous", breakpoints = NULL) and list(mode = "discrete", breakpoints = seq(from = 0, to = 1, by = 0.2)). By default, will use the first recommended value, list(mode = "continuous", breakpoints = NULL).
#' @param resizeColSlicer logic, whether to resize each column slicer shown on the heatmap, very useful when number of samples in different groups vary greatly (e.g. 10 samples in group A, 50 in group B, and 300 in group C). By default, size of each column slicer is proportional to its sample size, and thus user can use 'resizeColSlicer' to custom and balance the layout of all column slicers.
#' @param resizeColSlicer_width integer vector, should be the same length as the number of column slicers (including additional statistical analysis results columns), and values provided here will be used in relative instead of absolute calculation. For example, there are 3 groups (A/B/C) with 10/50/300 samples in them, respectively. If there are 2 additional statistical analysis results columns, the final column slicers will be 5. Set 'resizeColSlicer_width' to rep(1, 5) will make all of the 5 column slicers the same size, but usually user may want the mainbody to be larger while the sidebar smaller, in this case, 'resizeColSlicer_width' can be set to c(rep(6, 3), rep(1, 2)) so that the 3 groups of the mainbody are in the same size while the additional statistical analysis results are only one-third of the mainbody's size.
#'
#' @details
#' Here is more information about how "filterOutlier" works. To begin with, vigilante takes the generally accepted definition of outliers: observations that lie outside 1.5 * IQR (Inter Quartile Range) of the 25th or 75th quartiles are regarded as outliers.
#'
#' Assuming there is a project containing 100 samples which are divided into 3 groups (Group 1: 24, Group 2: 36, Group 3: 40). The data contains 10,000 observations per sample, and vigilante has detected outliers of observation A and B in the following:
#' Group 1: A 3, B 2;
#' Group 2: A 0, B 3;
#' Group 3: A 0, B 4.
#'
#' Under default "filterStringency" of 0.1, observation A will be excluded from downstream analysis, because A has 3 outliers in Group 1 which exceeds the number of allowed outliers in this group (24 * 0.1 = 2.4, rounded down to 2), even if there is no outlier of A in Group 2 or 3. On the other hand, though observation B has 2/3/4 outliers in Group 1/2/3, B will still be kept for downstream analysis because these numbers do not exceed the number of allowed outliers in their repective groups.
#'
#' Moreover, if "filterStringency" is changed to 0.05, observation B will be excluded from downstream analysis as well; if "filterStringency" is changed to 0.15, observation A will no longer be excluded from downstream analysis.
#'
#' Therefore, it is up to the user to decide whether or not to filter outliers, and how much stringency should be imposed on the filtering process.
#'
#' @return NULL, when a valid outputFolderPath is provided, analysis results and output plots will be generated and saved in the provided location, otherwise function run will stop and nothing will be written into the file system.
#'
#' @import ggplot2 ggpubr qvalue
#'
#' @export
#'
# v_chmXcell function
v_chmXcell = function(outputFolderPath = "./_VK/_xCell/", ge.xcell.result = ge.xcell.result, filterOutlier = FALSE, filterOutlier_fromGroup = grpName, filterStringency = 0.1, filterNoTPM = FALSE, significanceTest = FALSE, significanceTest_inputForm = "raw", significanceTest_fdrq = FALSE, shadowGroup = list(FALSE, 5), calculateFC = FALSE, log10Threshold = c(0.3, -0.3), rowSliceOrder = c("Up-Regulated", "Within-Threshold", "Down-Regulated"), colSliceOrder = grpName, grpName_fc = grpName, grpName_pval = grpName, addBpAnno = FALSE, unsupervisedClustering = FALSE, colorScheme = list(mode = "continuous", breakpoints = seq(0, 1, 0.2)), resizeColSlicer = FALSE, resizeColSlicer_width = NULL) {
# check if outputFolderPath is set
if (is.null(outputFolderPath)) {
print("v_chmXcell run stopped: based on user's choice, output files and plots will not be generated")
stop()
} else {
print(as.character(glue::glue("Based on user's choice, output files and plots will be generated and saved in {outputFolderPath}")))
}
# check if globalSettings_returnList exists
if (!exists("globalSettings_returnList")) {
print("globalSettings_returnList not detected, please follow the instruction and run v_globalSettings function before continue")
stop()
} else {
print("globalSettings_returnList detected, start extracting global settings")
}
# extract global settings from return list
for (i in 1:length(globalSettings_returnList)) {
assign(x = names(globalSettings_returnList)[i], value = globalSettings_returnList[[i]])
}
rm(i)
# check if prepareVdata_xCell_returnList exists
if (!exists("prepareVdata_xCell_returnList")) {
print("prepareVdata_xCell_returnList not detected, please follow the instruction and run v_prepareVdata_xCell function before continue")
stop()
} else {
print("prepareVdata_xCell_returnList detected, start extracting xCell variables")
}
# extract xCell variables from return list
for (i in 1:length(prepareVdata_xCell_returnList)) {
assign(x = names(prepareVdata_xCell_returnList)[i], value = prepareVdata_xCell_returnList[[i]])
}
rm(i)
# arguments modifiers, v_chmXcell
print("Checking the values of the following arguments, modifiers may be applied due to interplay. Please check argument description for more information.")
print("----------Original values: ----------")
print(as.character(glue::glue("significanceTest_fdrq: {significanceTest_fdrq} | shadowGroup: {paste0(shadowGroup[[1]], ', ', shadowGroup[[2]])} | calculateFC: {calculateFC} | addBpAnno: {addBpAnno}")))
significanceTest_fdrq = significanceTest_fdrq & significanceTest
shadowGroup[[1]] = shadowGroup[[1]] & (length(unlist(grp.list)) == length(grp.list))
calculateFC = (calculateFC & length(grpName_fc) == 2) & !shadowGroup[[1]]
addBpAnno = addBpAnno & significanceTest & !unsupervisedClustering
print("----------Updated values: ----------")
print(as.character(glue::glue("significanceTest_fdrq: {significanceTest_fdrq} | shadowGroup: {paste0(shadowGroup[[1]], ', ', shadowGroup[[2]])} | calculateFC: {calculateFC} | addBpAnno: {addBpAnno}")))
# assign ge.xcell.result to ge.chm
ge.chm = ge.xcell.result
# extract grp.list.xcell from grp.list
grp.list.xcell = list()
for (i in 1:length(grpName)) {
grp.list.xcell[[i]] = grp.list[[i]][grp.list[[i]] %in% names(ge.list.c)]
}
rm(i)
names(grp.list.xcell) = grpName
# filter out outlier cell types
if (filterOutlier == TRUE) {
xcell.p = ge.chm
xcell.pm = reshape2::melt(xcell.p)
xcell.pmg = xcell.pm
xcell.pmg["Group"] = factor(plyr::mapvalues(xcell.pmg[, 2], from = groupInfo$aliasID, to = groupInfo$Group, warn_missing = FALSE), levels = colSliceOrder)
colnames(xcell.pmg) = c("Cell.Type", "Sample.ID", "Enrichment.Score", "Group")
ge.grp.iqr = xcell.pmg %>% dplyr::group_by(Cell.Type, Group)
ge.grp.iqr = dplyr::summarise_at(ge.grp.iqr, .vars = "Enrichment.Score", .fun = list(Outlier = ~length(unlist(boxplot.stats(Enrichment.Score)["out"]))))
rm(xcell.p, xcell.pm, xcell.pmg)
# filter outlier from selected group only
ge.grp.iqr = subset(ge.grp.iqr, Group %in% filterOutlier_fromGroup)
grp.list.xcell = subset(grp.list.xcell, names(grp.list.xcell) %in% filterOutlier_fromGroup)
# original filter process
for (i in 1:length(grp.list.xcell)) {
filterThreshold = floor(length(grp.list.xcell[[i]]) * filterStringency)
ge.grp.iqr[ge.grp.iqr$Group == names(grp.list.xcell)[i] & ge.grp.iqr$Outlier <= filterThreshold, 3] = 0
}
rm(i, filterThreshold)
ge.iqr = ge.grp.iqr %>% dplyr::group_by(Cell.Type) %>% dplyr::summarise_at(.vars = "Outlier", .fun = list(TotalOutlier = sum))
xcell_type.filtered = unlist(subset(ge.iqr, subset = TotalOutlier == 0)[, 1])
ge.chm = subset(ge.chm, subset = rownames(ge.chm) %in% xcell_type.filtered)
rm(ge.grp.iqr, ge.iqr, xcell_type.filtered)
}
# subset ge.chm and grp.list.xcell for plot purpose only, e.g. use Normal group for background subtraction but don't show this group
if (length(colSliceOrder) != length(grpName)) {
print(as.character(glue::glue("Group not shown: {setdiff(grpName, colSliceOrder)}")))
colnames_temp = colnames(ge.chm)
grp.list.xcell_temp = grp.list.xcell[names(grp.list.xcell) %in% colSliceOrder]
groupInfo_temp = subset(groupInfo, subset = Group %in% colSliceOrder)
colnames_temp = colnames_temp %in% groupInfo_temp$aliasID
ge.chm = subset(ge.chm, select = colnames_temp)
rm(colnames_temp, grp.list.xcell_temp)
} else {
groupInfo_temp = groupInfo
}
# convert ge.chm to ge.chm.p, and set ge.chm.pmg
ge.chm.p = ge.chm
ge.chm.p[is.na(ge.chm.p)] = 0 # replace NA with 0
ge.chm.p[ge.chm.p < 0] = 0 # replace negative value with 0 (for subtraction-normolized group)
if (filterNoTPM == TRUE) {
selectedCol = plyr::mapvalues(colnames(ge.chm.p), from = groupInfo$aliasID, to = groupInfo$Group, warn_missing = FALSE)
selectedCol = selectedCol %in% grpName_pval
ge.chm.p = ge.chm.p[rowSums(ge.chm.p[, selectedCol], na.rm = TRUE) != 0, ] # filter out genes that have TPM = 0 in grpName_pval group samples to prevent "data are essentially constant" error
rm(selectedCol)
}
min0offset_xcell_min = min(ge.chm.p[ge.chm.p > 0]) / 10
min0offset_xcell_min = 10 ^ (floor(log10(min0offset_xcell_min)))
min0offset_xcell_max = 10 ^ (-ceiling(log10(max(ge.chm.p))))
min0offset_xcell = min(min0offset_xcell_min, min0offset_xcell_max)
rm(min0offset_xcell_min, min0offset_xcell_max)
ge.chm.p = ge.chm.p + min0offset_xcell # add 10e-32 to fix 0 denominator issue
if (significanceTest_inputForm == "log10") {
ge.chm.pm = reshape2::melt(log10(ge.chm.p))
} else if (significanceTest_inputForm == "log2") {
ge.chm.pm = reshape2::melt(log2(ge.chm.p))
} else {
ge.chm.pm = reshape2::melt(ge.chm.p)
}
ge.chm.pmg = ge.chm.pm
ge.chm.pmg["Group"] = factor(plyr::mapvalues(ge.chm.pmg[, 2], from = groupInfo$aliasID, to = groupInfo$Group, warn_missing = FALSE), levels = colSliceOrder)
colnames(ge.chm.pmg) = c("Cell.Type", "Sample.ID", "Enrichment.Score", "Group")
rm(ge.chm.pm)
# reform ge.chm.pmg and set grp.list.xcell_shadow
if (shadowGroup[[1]] == TRUE) {
grpName_pval = groupInfo$aliasID
grp.list.xcell_shadow = c(rep(list(ge.chm.pmg), times = shadowGroup[[2]]))
grp.list.xcell_shadow = lapply(grp.list.xcell_shadow, function(x) {
x$Group = x$Sample.ID
return(x)
})
names(grp.list.xcell_shadow) = paste0("shadow_", 1:shadowGroup[[2]])
shadow_multiplier = seq(from = 0.5, to = 1.5, length.out = shadowGroup[[2]])
for (i in 1:shadowGroup[[2]]) {
grp.list.xcell_shadow[[i]]$Sample.ID = paste0(grp.list.xcell_shadow[[i]]$Sample.ID, "_", names(grp.list.xcell_shadow)[i])
grp.list.xcell_shadow[[i]]$Enrichment.Score = grp.list.xcell_shadow[[i]]$Enrichment.Score * shadow_multiplier[i]
}
rm(i)
grp.list.xcell_shadow = dplyr::bind_rows(grp.list.xcell_shadow)
ge.chm.pmg = grp.list.xcell_shadow
rm(grp.list.xcell_shadow, shadow_multiplier)
} else {
ge.chm.pmg = subset(ge.chm.pmg, subset = Group %in% grpName_pval)
}
# add t-test/anova significance test based number of groups, and prepare data for boxplot annotation
if (significanceTest == TRUE) {
tempPaired = (length(grpName_pval) == 2) & (sum(assayID_rna %in% grp.list[[grpName_pval[1]]]) == sum(assayID_rna %in% grp.list[[grpName_pval[2]]]))
if (shadowGroup[[1]] == TRUE) {
tempPaired = TRUE
}
ge.chm.p.pval = compare_means(Enrichment.Score ~ Group, data = ge.chm.pmg, method = ifelse(length(grpName_pval) > 2, "anova", "t.test"), group.by = "Cell.Type", paired = tempPaired)
tempRownames = ge.chm.p.pval$Cell.Type
ge.chm.p.pval = as.matrix(ge.chm.p.pval$p)
rownames(ge.chm.p.pval) = tempRownames
colnames(ge.chm.p.pval) = "p-value"
ge.chm.p.pval.sig = subset(ge.chm.p.pval, subset = ge.chm.p.pval[, 1] <= 0.05)
tempRownames_sig = rownames(ge.chm.p.pval.sig)
ge.chm.p = ge.chm.p[rownames(ge.chm.p) %in% tempRownames, ]
ge.chm.pmg.sig = subset(ge.chm.pmg, subset = Cell.Type %in% tempRownames_sig)
# calculate false discovery rate q-value (fdrq)
if (significanceTest_fdrq == TRUE) {
ge.chm.p.fdrq = qvalue(p = unlist(ge.chm.p.pval[, 1]), pi0 = 1)[["qvalues"]]
ge.chm.p.fdrq = as.matrix(ge.chm.p.fdrq)
colnames(ge.chm.p.fdrq) = "q-value"
ge.chm.p.fdrq.sig = subset(ge.chm.p.fdrq, subset = rownames(ge.chm.p.fdrq) %in% tempRownames_sig)
}
# conduct pair-wise t-test for detailed csv output of ANOVA test
if (length(grpName_pval) > 2 & nrow(ge.chm.pmg.sig) > 0 & shadowGroup[[1]] == FALSE) {
# shuffle the data (add slight randomness to the constant part) to prevent "data are essentially constant" error
tempMinVal = min(ge.chm.pmg.sig[, 3])
for (i in 1:nrow(ge.chm.pmg.sig)) {
if (ge.chm.pmg.sig[i, 3] == tempMinVal) {
tempShuffle = runif(1) / 1e32
ge.chm.pmg.sig[i, 3] = ge.chm.pmg.sig[i, 3] + tempShuffle
rm(tempShuffle)
}
}
rm(i, tempMinVal)
# original pair-wise t-test process
ge.chm.p.pval.sig.ttest = compare_means(Enrichment.Score ~ Group, data = ge.chm.pmg.sig, method = "t.test", group.by = "Cell.Type", paired = tempPaired)
ge.chm.p.pval.sig.ttest = ge.chm.p.pval.sig.ttest[, c(1, 3:5, 8)]
colnames(ge.chm.p.pval.sig.ttest) = c("Cell_Type", "Group_1", "Group_2", "p-value", "Significance_Level")
ge.chm.p.pval.sig.ttest["NOTES"] = ""
ge.chm.p.pval.sig.ttest[1, "NOTES"] = "Open this file with a text editor in case Excel-unsupported encoding characters appear."
print(as.character(glue::glue("Additional pair-wise t-test completed, output csv file saved in {outputFolderPath}")))
write.csv(ge.chm.p.pval.sig.ttest, file = paste0(outputFolderPath, studyID, "_xCell_PairWiseTTest_fromAnovaSigOnly.csv"), quote = FALSE, row.names = FALSE, fileEncoding = "UTF-8")
rm(ge.chm.p.pval.sig.ttest)
}
# rm temp significance test variables
rm(tempRownames, tempRownames_sig, ge.chm.p.pval.sig, tempPaired)
}
# calculate fold change log10 ratio for xCell enrichment score data
if (calculateFC == TRUE) {
xcell.foldchange = v_chmFoldChangeLog10_xCell(outputFolderPath, log10Threshold, grpName_fc, filterNoTPM)
signature_panel.fc = xcell.foldchange[, -3]
signature_panel.fc = subset(signature_panel.fc, subset = Cell_Type %in% rownames(ge.chm.p))
signature_panel.fc = dplyr::arrange(signature_panel.fc, dplyr::desc(Log10_FC))
ge.chm.p = ge.chm.p[signature_panel.fc$Cell_Type, ]
if (significanceTest == TRUE) {
ge.chm.p.pval = as.matrix(ge.chm.p.pval[signature_panel.fc$Cell_Type, ])
colnames(ge.chm.p.pval) = "p-value"
}
if (significanceTest_fdrq == TRUE) {
ge.chm.p.fdrq = as.matrix(ge.chm.p.fdrq[signature_panel.fc$Cell_Type, ])
colnames(ge.chm.p.fdrq) = "q-value"
}
tempRownames.fc = signature_panel.fc$Cell_Type
signature_panel.fc = as.matrix(signature_panel.fc[, -1])
rownames(signature_panel.fc) = tempRownames.fc
colnames(signature_panel.fc) = "Log10_FC"
rm(tempRownames.fc)
}
# output significance test and fold-change calculation results to csv file
if (any(significanceTest, calculateFC)) {
outputCSV_moduleStatus = c(signature_panel.fc = calculateFC, ge.chm.p.pval= significanceTest, ge.chm.p.fdrq = significanceTest_fdrq)
tempCSV = ""
for (i in 1:length(outputCSV_moduleStatus)) {
if (outputCSV_moduleStatus[i] == TRUE) {
if (nchar(tempCSV) == 0 ) {
tempCSV = paste0(tempCSV, names(outputCSV_moduleStatus)[i])
} else {
tempCSV = paste0(tempCSV, ", ", names(outputCSV_moduleStatus)[i])
}
}
}
rm(i)
tempCSV = glue::glue("cbind({tempCSV})")
tempCSV = eval(parse(text = tempCSV))
rm(outputCSV_moduleStatus)
tempCSV = as.data.frame(tempCSV, stringAsFactor = FALSE)
tempCSV_suffix = c("_FC" = calculateFC, "_pval" = significanceTest, "_fdrq" = significanceTest_fdrq)
tempCSV_suffix = tempCSV_suffix[tempCSV_suffix]
tempCSV_suffix = names(tempCSV_suffix)
tempCSV_suffix = paste0(tempCSV_suffix, collapse = "")
tempCSV["NOTES"] = ""
tempCSV[1, "NOTES"] = "Open this file with a text editor in case many false 0 values appear (Excel only supports maximum precision digits of 15)."
print(as.character(glue::glue("Local computation and analysis completed, output csv file saved in {outputFolderPath}")))
write.csv(tempCSV, file = paste0(outputFolderPath, studyID, "_xCell_all_types", tempCSV_suffix, ".csv"), quote = FALSE, row.names = TRUE, fileEncoding = "UTF-8")
rm(tempCSV, tempCSV_suffix)
}
# reset column order based on groupInfo$aliasID
ge.chm.p = ge.chm.p[, sort(colnames(ge.chm.p))]
# preset default chm_fdrq annotation for chm.sigpan_ge.pval
chm_fdrq = NULL
# set chm_fdrq annotation for chm.sigpan_ge.pval
if (significanceTest == TRUE & significanceTest_fdrq == TRUE) {
ge.chm.p.fdrq.nglog10 = -log10(ge.chm.p.fdrq)
SPCM.breakpoint.fdrq = ceiling(max(ge.chm.p.fdrq.nglog10))
SPCM.breakpoint.fdrq = max(SPCM.breakpoint.fdrq, 2)
if (SPCM.breakpoint.fdrq %% 2 != 0) {
SPCM.breakpoint.fdrq = SPCM.breakpoint.fdrq + 1
}
SPCM.fdrq = colorRamp2(breaks = c(0, SPCM.breakpoint.fdrq), colors = c("white", "red3"))
chm_fdrq_col = SPCM.fdrq(ge.chm.p.fdrq.nglog10[, 1])
if (nrow(ge.chm.pmg.sig) > 0) {
ge.chm.p.fdrq.nglog10.sig = -log10(ge.chm.p.fdrq.sig)
chm_fdrq_col.sig = SPCM.fdrq(ge.chm.p.fdrq.nglog10.sig[, 1])
}
chm_fdrq = rowAnnotation(NgLog10_FDR = anno_barplot(ge.chm.p.fdrq.nglog10[, 1], baseline = 0, gp = gpar(fill = chm_fdrq_col, col = chm_fdrq_col), bar_width = 0.66, axis_param = list(at = c(0, SPCM.breakpoint.fdrq * 0.5, SPCM.breakpoint.fdrq, 1.3), labels = as.character(c(0, SPCM.breakpoint.fdrq * 0.5, SPCM.breakpoint.fdrq, "p")), gp = gpar(fontsize = 6), labels_rot = 0, side = "bottom"), ylim = c(0, SPCM.breakpoint.fdrq)), annotation_name_gp = gpar(fontsize = 8, fontface = "bold"), annotation_name_side = "bottom", annotation_name_rot = 0, width = unit(2, "cm"))
}
# set parameter for splitting rows and columns
if (calculateFC == TRUE) {
split_row = factor(plyr::mapvalues(rownames(ge.chm.p), from = xcell.foldchange$Cell_Type, to = xcell.foldchange$Exp_Level, warn_missing = FALSE), levels = rowSliceOrder)
} else {
split_row = NULL
rowSliceOrder = "Undefined"
}
temp_groupInfo = groupInfo_temp
temp_groupInfo = plyr::arrange(temp_groupInfo, aliasID)
temp_groupInfo = subset(temp_groupInfo, subset = assayID %in% names(ge.list.c))
split_col = factor(unlist(temp_groupInfo$Group), levels = colSliceOrder)
rm(temp_groupInfo, groupInfo_temp)
if (unsupervisedClustering == TRUE) {
split_row = NULL
rowSliceOrder = "Undefined"
split_col = NULL
}
# set color mapping
if (colorScheme[["mode"]] == "continuous") {
SPCM.es = colorRamp2(c(0, 1), c("white", "royalblue3"))
SPCM.es_lgd = list(direction = "horizontal", col_fun = SPCM.es, title = "Enrichment\n Score", legend_width = unit(2, "cm"), at = c(0, 1))
} else if (colorScheme[["mode"]] == "discrete") {
SPCM.es_mat = cut(x = sort(as.vector(ge.xcell.result)), breaks = seq(from = 0, to = 1, by = 0.2), labels = NULL, include.lowest = FALSE, right = TRUE, dig.lab = 1, ordered_result = FALSE)
SPCM.es_lvl = levels(SPCM.es_mat)
SPCM.es_tempContinuous = colorRamp2(c(0, 1), c("white", "royalblue3"))
SPCM.es_color = SPCM.es_tempContinuous(seq(0.2, 1, 0.2))
SPCM.es_obj = ColorMapping(colors = SPCM.es_color, levels = SPCM.es_lvl, na_col = "white")
SPCM.es = map_to_colors(object = SPCM.es_obj, x = SPCM.es_mat)
SPCM.es = structure(.Data = SPCM.es, names = sort(as.vector(ge.chm.p)))
SPCM.es_lgd = NULL
rm(SPCM.es_mat, SPCM.es_lvl, SPCM.es_tempContinuous, SPCM.es_color, SPCM.es_obj)
}
SPCM.pval = colorRamp2(breaks = c(0, 0.05), colors = c("magenta3", "white"))
if (calculateFC == TRUE) {
SPCM.breakpoint.fc = min(abs(floor(min(signature_panel.fc))), ceiling(max(signature_panel.fc)))
if (length(unique(c(-SPCM.breakpoint.fc, log10Threshold[2], 0, log10Threshold[1], SPCM.breakpoint.fc))) == 5) {
SPCM.fc = colorRamp2(c(-SPCM.breakpoint.fc, log10Threshold[2], 0, log10Threshold[1], SPCM.breakpoint.fc), c("limegreen", "white", "white", "white", "orange"))
} else {
SPCM.fc = colorRamp2(sort(unique(c(-SPCM.breakpoint.fc, log10Threshold[2], 0, log10Threshold[1], SPCM.breakpoint.fc))), c("limegreen", "white", "orange"))
}
}
# set chm_row_names_gp and chm_col_names_gp (show/not show & font size)
chm_row_names_gp = dplyr::case_when(
nrow(ge.chm.p) <= 70 ~ list(TRUE, 9 - (nrow(ge.chm.p) - 10) / 10),
nrow(ge.chm.p) > 70 ~ list(FALSE, 1)
)
chm_col_names_gp = dplyr::case_when(
ncol(ge.chm.p) <= 90 ~ list(TRUE, 10 - (ncol(ge.chm.p) - 10) / 10, "black"),
ncol(ge.chm.p) > 90 ~ list(FALSE, 1, "white")
)
# render complex heatmap
if (resizeColSlicer == FALSE) {
chm.sigpan_ge.p = Heatmap(ge.chm.p, name = "Enrichment\n Score", row_split = split_row, column_split = split_col, col = SPCM.es, show_parent_dend_line = FALSE, row_gap = unit(0.25, "mm"), column_gap = unit(1, "mm"), border = TRUE,
row_title_rot = 0,
row_names_rot = 0,
row_title_gp = gpar(fontsize = 6, fontface = "bold"),
show_row_names = chm_row_names_gp[[1]][1],
row_names_gp = gpar(fontsize = chm_row_names_gp[[2]][1]),
row_names_side = "right",
cluster_rows = unsupervisedClustering,
cluster_row_slices = unsupervisedClustering,
cluster_columns = unsupervisedClustering,
cluster_column_slices = unsupervisedClustering,
column_title_rot = 0,
column_title_gp = gpar(fontsize = 12, fontface = "bold"),
column_names_rot = 45,
show_column_names = chm_col_names_gp[[1]][1],
column_names_gp = gpar(fontsize = chm_col_names_gp[[2]][1], col = chm_col_names_gp[[3]][1]),
column_names_side = "bottom",
top_annotation = NULL,
show_heatmap_legend = colorScheme[["mode"]] == "continuous",
heatmap_legend_param = SPCM.es_lgd)
chm.sigpan_ge.list = chm.sigpan_ge.p
} else {
chm.sigpan_ge.p.resize = list()
for (i in 1:length(colSliceOrder)) {
resizeColSlicer_index = plyr::mapvalues(colnames(ge.chm.p), from = groupInfo$aliasID, to = groupInfo$Group, warn_missing = FALSE) == colSliceOrder[i]
chm.sigpan_ge.p.resize[[i]] = Heatmap(ge.chm.p[, resizeColSlicer_index], name = "Enrichment\n Score", row_split = split_row, column_split = split_col[resizeColSlicer_index], col = SPCM.es, show_parent_dend_line = FALSE, row_gap = unit(0.25, "mm"), column_gap = unit(1, "mm"), border = TRUE,
row_title_rot = 0,
row_names_rot = 0,
row_title_gp = gpar(fontsize = 6, fontface = "bold"),
show_row_names = chm_row_names_gp[[1]][1],
row_names_gp = gpar(fontsize = chm_row_names_gp[[2]][1]),
row_names_side = "right",
cluster_rows = unsupervisedClustering,
cluster_row_slices = unsupervisedClustering,
cluster_columns = unsupervisedClustering,
cluster_column_slices = unsupervisedClustering,
column_title_rot = 0,
column_title_gp = gpar(fontsize = 12, fontface = "bold"),
column_names_rot = 45,
show_column_names = chm_col_names_gp[[1]][1],
column_names_gp = gpar(fontsize = chm_col_names_gp[[2]][1], col = chm_col_names_gp[[3]][1]),
column_names_side = "bottom",
top_annotation = NULL,
show_heatmap_legend = colorScheme[["mode"]] == "continuous",
heatmap_legend_param = SPCM.es_lgd,
width = resizeColSlicer_width[i])
}
rm(i, resizeColSlicer_index)
chm.sigpan_ge.list = chm.sigpan_ge.p.resize[[1]]
for (i in 2:length(colSliceOrder)) {
chm.sigpan_ge.list = chm.sigpan_ge.list + chm.sigpan_ge.p.resize[[i]]
}
rm(i, chm.sigpan_ge.p.resize)
}
if (significanceTest == TRUE) {
chm.sigpan_ge.pval = Heatmap(ge.chm.p.pval, name = ifelse(length(grpName_pval) > 2, "Anova Test\n p-value", " t-test\n p-value"), row_split = split_row, column_split = NULL, col = SPCM.pval, show_parent_dend_line = FALSE, row_gap = unit(0.25, "mm"), column_gap = unit(1, "mm"), border = TRUE,
row_title_rot = 0,
row_names_rot = 0,
row_title_gp = gpar(fontsize = 6, fontface = "bold"),
row_names_gp = gpar(fontsize = 3),
row_names_side = "right",
show_row_names = FALSE,
cluster_rows = unsupervisedClustering,
cluster_row_slices = unsupervisedClustering,
cluster_columns = FALSE,
column_title_rot = 0,
column_title_gp = gpar(fontsize = 12, fontface = "bold"),
column_names_rot = 45,
column_names_gp = gpar(fontsize = 9, fontface = "bold"),
column_names_side = "bottom",
right_annotation = chm_fdrq,
heatmap_legend_param = list(direction = "horizontal", col_fun = SPCM.pval, at = c(0, 0.05), labels = c("0", "0.05"), title = ifelse(length(grpName_pval) > 2, "Anova Test\n p-value", " t-test\n p-value"), legend_width = unit(1.75, "cm")))
if (calculateFC == FALSE) {
chm.sigpan_ge.list = chm.sigpan_ge.p + chm.sigpan_ge.pval
chm.sigpan_ge.list.sig = chm.sigpan_ge.list[ge.chm.p.pval[, "p-value"] <= 0.05, ]
}
}
if (calculateFC == TRUE) {
chm.sigpan_ge.fc = Heatmap(signature_panel.fc, name = "Fold-Change\n log-ratio", row_split = split_row, column_split = NULL, col = SPCM.fc, show_parent_dend_line = FALSE, row_gap = unit(0.25, "mm"), column_gap = unit(1, "mm"), border = TRUE,
row_title_rot = 0,
row_names_rot = 0,
row_title_gp = gpar(fontsize = 6, fontface = "bold"),
row_names_gp = gpar(fontsize = 3),
row_names_side = "right",
show_row_names = FALSE,
cluster_rows = unsupervisedClustering,
cluster_row_slices = unsupervisedClustering,
cluster_columns = FALSE,
column_title_rot = 0,
column_title_gp = gpar(fontsize = 12, fontface = "bold"),
column_names_rot = 45,
column_names_gp = gpar(fontsize = 9, fontface = "bold"),
column_names_side = "bottom",
heatmap_legend_param = list(direction = "horizontal", col_fun = SPCM.fc, at = sort(unique(c(-SPCM.breakpoint.fc, log10Threshold[2], 0, log10Threshold[1], SPCM.breakpoint.fc))), labels = sort(unique(c(-SPCM.breakpoint.fc, log10Threshold[2], 0, log10Threshold[1], SPCM.breakpoint.fc))), title = "Fold-Change\n log-ratio", legend_width = unit(2, "cm")))
if (significanceTest == TRUE) {
chm.sigpan_ge.list = chm.sigpan_ge.list + chm.sigpan_ge.fc + chm.sigpan_ge.pval
chm.sigpan_ge.list.sig = chm.sigpan_ge.list[ge.chm.p.pval[, "p-value"] <= 0.05, ]
} else {
chm.sigpan_ge.list = chm.sigpan_ge.list + chm.sigpan_ge.fc
}
}
# set file name suffix
chm_moduleStatus = c(significanceTest, significanceTest_fdrq, calculateFC, resizeColSlicer, filterOutlier, unsupervisedClustering, addBpAnno, !is.null(significanceTest_inputForm), !is.null(colorScheme[["mode"]])) # always put "filterOutlier" at last, for function complete return message
names(chm_moduleStatus) = c("_pvalue", "_fdrq", "_FoldChange", "_Resized", "_Filtered", "_Unsupervised", "_BpAnno", paste0("_", significanceTest_inputForm), paste0("_", colorScheme[["mode"]]))
chm_suffix = ""
for (i in 1:length(chm_moduleStatus)) {
if (chm_moduleStatus[i] == TRUE) {
chm_suffix = paste0(chm_suffix, names(chm_moduleStatus)[i])
}
}
rm(i)
# set standalone heatmap mainbody legend for discrete mapping
if (colorScheme[["mode"]] == "discrete") {
SPCM.es_tempContinuous = colorRamp2(c(0, 1), c("white", "royalblue3"))
SPCM.es_lgd_discrete = Legend(direction = "vertical", labels = seq(1, 0, -0.2), legend_gp = gpar(fill = SPCM.es_tempContinuous(seq(1, 0, -0.2))), title = "Enrichment\n Score", legend_height = unit((length(seq(1, 0, -0.2)) * 0.5), "cm"))
rm(SPCM.es_tempContinuous)
} else {
SPCM.es_lgd_discrete = NULL
}
# save complex heatmap file and output matrix file
png(filename = paste0(outputFolderPath, "HeatMap4K_", studyID, "_CellTypeEnrichmentScore", chm_suffix, "_all.png"), width = 3339, height = 2160, units = "px", res = 300)
draw(chm.sigpan_ge.list, auto_adjust = resizeColSlicer, heatmap_legend_side = "right", heatmap_legend_list = SPCM.es_lgd_discrete)
if (resizeColSlicer == FALSE) {
if (any(significanceTest, calculateFC)) {
output_row_order = unlist(row_order(chm.sigpan_ge.list@ht_list[[1]]))
output_col_order = unlist(column_order(chm.sigpan_ge.list@ht_list[[1]]))
output_ge_mat = chm.sigpan_ge.list@ht_list[[1]]@matrix
} else {
output_row_order = unlist(row_order(chm.sigpan_ge.list))
output_col_order = unlist(column_order(chm.sigpan_ge.list))
output_ge_mat = chm.sigpan_ge.list@matrix
}
output_ge_mat = output_ge_mat[output_row_order, output_col_order]
rm(output_row_order, output_col_order)
} else {
output_ge_mat.resize = list()
for (i in 1:(length(colSliceOrder))) {
output_row_order = unlist(row_order(chm.sigpan_ge.list@ht_list[[i]]))
output_col_order = unlist(column_order(chm.sigpan_ge.list@ht_list[[i]]))
output_ge_mat = chm.sigpan_ge.list@ht_list[[i]]@matrix
output_ge_mat = output_ge_mat[output_row_order, output_col_order]
output_ge_mat.resize[[i]] = output_ge_mat
}
rm(i, output_row_order, output_col_order, output_ge_mat)
output_ge_mat.resize = lapply(output_ge_mat.resize, function(x) {
x = as.data.frame(x)
return(x)
})
output_ge_mat = dplyr::bind_cols(output_ge_mat.resize)
rownames(output_ge_mat) = rownames(output_ge_mat.resize[[1]])
rm(output_ge_mat.resize)
}
write.csv(output_ge_mat, file = paste0(outputFolderPath, "HeatMap4K_", studyID, "_CellTypeEnrichmentScore", chm_suffix, "_all.csv"), quote = FALSE)
rm(output_ge_mat)
if (significanceTest_fdrq == TRUE) {
for (i in 1:length(rowSliceOrder)) {
decorate_annotation("NgLog10_FDR", {
grid.lines(y = c(0, 1), x = unit(c(-log10(0.05), -log10(0.05)), "native"), gp = gpar(lty = 2, col = "black", lwd = 1.5))
}, slice = i)
for (j in seq(0, SPCM.breakpoint.fdrq, length.out = 5)) {
decorate_annotation("NgLog10_FDR", {
grid.lines(y = c(0, 1), x = unit(c(j, j), "native"), gp = gpar(lty = 1, col = "#00000033", lwd = 0.5))
}, slice = i)
}
rm(j)
}
rm(i)
}
dev.off()
if (significanceTest == TRUE) {
tryCatch({
png(filename = paste0(outputFolderPath, "HeatMap4K_", studyID, "_CellTypeEnrichmentScore", chm_suffix, "_sigOnly.png"), width = 3339, height = 2160, units = "px", res = 300)
draw(chm.sigpan_ge.list.sig, auto_adjust = resizeColSlicer, heatmap_legend_side = "right", heatmap_legend_list = SPCM.es_lgd_discrete)
if (resizeColSlicer == FALSE) {
output_row_order.sig = unlist(row_order(chm.sigpan_ge.list.sig@ht_list[[1]]))
output_col_order.sig = unlist(column_order(chm.sigpan_ge.list.sig@ht_list[[1]]))
output_ge_mat.sig = chm.sigpan_ge.list.sig@ht_list[[1]]@matrix
output_ge_mat.sig = output_ge_mat.sig[output_row_order.sig, output_col_order.sig]
rm(output_row_order.sig, output_col_order.sig)
} else {
output_ge_mat.resize.sig = list()
for (i in 1:(length(colSliceOrder))) {
output_row_order.sig = unlist(row_order(chm.sigpan_ge.list.sig@ht_list[[i]]))
output_col_order.sig = unlist(column_order(chm.sigpan_ge.list.sig@ht_list[[i]]))
output_ge_mat.sig = chm.sigpan_ge.list.sig@ht_list[[i]]@matrix
output_ge_mat.sig = output_ge_mat.sig[output_row_order.sig, output_col_order.sig]
output_ge_mat.resize.sig[[i]] = output_ge_mat.sig
}
rm(i, output_row_order.sig, output_col_order.sig, output_ge_mat.sig)
output_ge_mat.resize.sig = lapply(output_ge_mat.resize.sig, function(x) {
x = as.data.frame(x)
return(x)
})
output_ge_mat.sig = dplyr::bind_cols(output_ge_mat.resize.sig)
rownames(output_ge_mat.sig) = rownames(output_ge_mat.resize.sig[[1]])
rm(output_ge_mat.resize.sig)
}
write.csv(output_ge_mat.sig, file = paste0(outputFolderPath, "HeatMap4K_", studyID, "_CellTypeEnrichmentScore", chm_suffix, "_sigOnly.csv"), quote = FALSE)
rm(output_ge_mat.sig)
if (significanceTest_fdrq == TRUE) {
if (calculateFC == TRUE) {
rowSliceOrder_sig = unique(plyr::mapvalues(rownames(ge.chm.p.fdrq.nglog10.sig), from = xcell.foldchange$Cell_Type, to = xcell.foldchange$Exp_Level, warn_missing = FALSE))
rowSliceOrder_sig = subset(rowSliceOrder, subset = rowSliceOrder %in% rowSliceOrder_sig)
} else {
rowSliceOrder_sig = rowSliceOrder
}
for (i in 1:length(rowSliceOrder_sig)) {
decorate_annotation("NgLog10_FDR", {
grid.lines(y = c(0, 1), x = unit(c(-log10(0.05), -log10(0.05)), "native"), gp = gpar(lty = 2, col = "black", lwd = 1.5))
}, slice = i)
for (j in seq(0, SPCM.breakpoint.fdrq, length.out = 5)) {
decorate_annotation("NgLog10_FDR", {
grid.lines(y = c(0, 1), x = unit(c(j, j), "native"), gp = gpar(lty = 1, col = "#00000033", lwd = 0.5))
}, slice = i)
}
rm(j)
}
rm(i)
}
dev.off()
}, error = function(x) {print("No significant cell types!")})
}
# add boxplot annotation
if (addBpAnno == TRUE) {
# set rowOrder to match heatmap rowOrder
if (calculateFC == TRUE) {
ge.chm_bpAnno_xaxis_order = as.data.frame(rownames(signature_panel.fc), stringsAsFactors = FALSE)
} else {
ge.chm_bpAnno_xaxis_order = as.data.frame(rownames(ge.chm.p.pval), stringsAsFactors = FALSE)
}
colnames(ge.chm_bpAnno_xaxis_order) = "Cell_Type"
if (calculateFC == TRUE) {
ge.chm_bpAnno_xaxis_order = cbind(ge.chm_bpAnno_xaxis_order, "Exp_Level" = factor(plyr::mapvalues(ge.chm_bpAnno_xaxis_order$Cell_Type, from = xcell.foldchange$Cell_Type, to = xcell.foldchange$Exp_Level, warn_missing = FALSE), levels = rowSliceOrder))
ge.chm_bpAnno_xaxis_order = plyr::arrange(ge.chm_bpAnno_xaxis_order, Exp_Level)
}
# set auto-adjusted bottom margin based on main heatmap colnames
bpAnno_bottomMargin_ge = max(nchar(colnames(ge.chm.p)))
bpAnno_bottomMargin_ge = (bpAnno_bottomMargin_ge * sinpi(45 / 180) * 0.4) * (chm_col_names_gp[[2]] / 9) + (9 - chm_col_names_gp[[2]]) / 10 + 0.05
bpAnno_bottomMargin_pval = ifelse(significanceTest == TRUE, nchar("p-value"), 0)
bpAnno_bottomMargin_pval = (bpAnno_bottomMargin_pval * sinpi(45 / 180) * 0.4) + 0.05
bpAnno_bottomMargin_fc = ifelse(calculateFC == TRUE, nchar("Log10_FC"), 0)
bpAnno_bottomMargin_fc = (bpAnno_bottomMargin_fc * sinpi(45 / 180) * 0.4) + 0.05
bpAnno_bottomMargin = max(bpAnno_bottomMargin_ge, bpAnno_bottomMargin_pval, bpAnno_bottomMargin_fc)
# render boxplot annotation, all
ge.chm_bpAnno = ggplot(ge.chm.pmg, aes(x = reorder(Cell.Type, dplyr::desc(factor(Cell.Type, levels = ge.chm_bpAnno_xaxis_order$Cell_Type))), y = Enrichment.Score, fill = Group)) + geom_boxplot(aes(color = Group), outlier.size = 1) + coord_flip() + scale_y_reverse() + scale_x_discrete(position = "top") + labs(x = NULL, y = ifelse(significanceTest_inputForm %in% c("log10", "log2"), "Enrichment Log Ratio", "Enrichment Score")) + theme_gray() + theme(legend.position = "left", plot.margin = unit(c(2, 0.1, bpAnno_bottomMargin, 0.1), "cm")) + guides(color = guide_legend(reverse = TRUE), fill = guide_legend(reverse = TRUE))
# save boxplot annotation file, all
png(filename = paste0(outputFolderPath, "BoxPlotAnno4K_", studyID, "_annotation_xCell", chm_suffix, "_all.png"), width = 720, height = 2160, units = "px", res = 150)
print(ge.chm_bpAnno)
dev.off()
# combine chmXcell and boxplot annotation, all
v_magick_xCell(outputFolderPath, sigType = "all")
# render and save boxplot annotation, significant only
if (significanceTest == TRUE) {
tryCatch({
ge.chm_bpAnno_sig = ggplot(ge.chm.pmg.sig, aes(x = reorder(Cell.Type, dplyr::desc(factor(Cell.Type, levels = ge.chm_bpAnno_xaxis_order$Cell_Type))), y = Enrichment.Score, fill = Group)) + geom_boxplot(aes(color = Group), outlier.size = 1) + coord_flip() + scale_y_reverse() + scale_x_discrete(position = "top") + labs(x = NULL, y = ifelse(significanceTest_inputForm %in% c("log10", "log2"), "Enrichment Log Ratio", "Enrichment Score")) + theme_gray() + theme(legend.position = "left", plot.margin = unit(c(2, 0.1, bpAnno_bottomMargin, 0.1), "cm")) + guides(color = guide_legend(reverse = TRUE), fill = guide_legend(reverse = TRUE))
png(filename = paste0(outputFolderPath, "BoxPlotAnno4K_", studyID, "_annotation_xCell", chm_suffix, "_sigOnly.png"), width = 720, height = 2160, units = "px", res = 150)
print(ge.chm_bpAnno_sig)
dev.off()
# combine chmXcell and boxplot annotation, sigOnly
v_magick_xCell(outputFolderPath, sigType = "sigOnly")
}, error = function(x) {print("No significant cell types!")})
}
}
# remove extracted global settings
rm(list = names(globalSettings_returnList))
# remove extracted prepareVdata_xCell_returnList variables
rm(list = names(prepareVdata_xCell_returnList))
# end of v_chmXcell function
chm_returnBlock = ""
for (i in 1:length(chm_moduleStatus)) {
if (chm_moduleStatus[i] == TRUE) {
chm_returnBlock = paste0(chm_returnBlock, toupper(names(chm_moduleStatus)[i]))
}
}
rm(i)
print(as.character(glue::glue("v_chmXcell run completed {chm_returnBlock}, output plots and csv files saved in {outputFolderPath}")))
return()
}
# preset globalVariables for R CMD check
utils::globalVariables(c("GRCm2h38C", "Gene", "Occurrence", "TotalOccurrence", "gdc_ge.list.c_xcell", "gdc_ge.list_FPKM", "iSample", "nSamples", "progressBar", "Cell.Type", "Exp_Level", "Cell_Type", "Enrichment.Score", "Log10_FC", "TotalOutlier", "aliasID", "assayID_rna", "prepareVdata_xCell_returnList"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.