##################################################
## R script for MicrobiomeAnalyst
## Description: Data/resource management functions
## Author: Jeff Xia, jeff.xia@mcgill.ca
###################################################
###########################
########### I/O ###########
###########################
#'Function to get gene list statistics
#'@description This function gets the gene list stats.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
GetGeneListStat <- function(mbSetObj){
mbSetObj <- .get.mbSetObj(mbSetObj);
range <- c(0, 0);
if(mbSetObj$analSet$gene.only){
val.range <- c(0, 0);
}else{
val.range <- range(mbSetObj$analSet$ko.mapped[,1])
}
return(c(nrow(mbSetObj$analSet$ko.orig), nrow(mbSetObj$analSet$ko.mapped), mbSetObj$analSet$gene.only, val.range));
}
#'Function to filter list data based on a minimum count
#'@description This function filters the inputted list data
#'based on a minimum count.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
UpdateListInput <- function(mbSetObj, minL, maxL=Inf){
mbSetObj <- .get.mbSetObj(mbSetObj);
hit.inx <- mbSetObj$analSet$ko.mapped >= minL & mbSetObj$analSet$ko.mapped <= maxL;
if(sum(hit.inx) > 0){
current.msg <<- paste("A total of unqiue", sum(hit.inx), "KO genes were selected based on the cutoff", minL);
mbSetObj$analSet$data <- mbSetObj$analSet$ko.mapped[hit.inx, , drop=FALSE];
mbSetObj$dataSet$filt.msg <- current.msg;
return(.set.mbSetObj(mbSetObj));
}else{
current.msg <<- "No genes were selected in this range!";
AddErrMsg(current.msg);
mbSetObj$dataSet$filt.msg <- current.msg;
.set.mbSetObj(mbSetObj);
return(0);
}
}
#'Main function to read in shotgun data
#'@description This function reads in shotgun data
#'in a tabular format.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
ReadShotgunTabData <- function(mbSetObj, dataName, geneidtype,is.normalized) {
mbSetObj <- .get.mbSetObj(mbSetObj);
mydata <- .readDataTable(dataName);
if(any(is.na(mydata)) || class(mydata) == "try-error"){
AddErrMsg("Failed to read in the abundance data! Please make sure the gene abundance table is in the right format and do not have empty cells or NA.");
return(0);
}
# look for #NAME, store in a list
#just to check the name of labels.
sam.nm <- substr(colnames(mydata[1]),1,5);
sam.nm <- tolower(sam.nm);
sam.inx <- grep("^#name",sam.nm);
if(length(sam.inx) > 0){
smpl_nm<-colnames(mydata[-1]);
}else{
AddErrMsg("No labels #NAME found in your data!");
return(0);
}
dat.nms <- mydata[,1];
mydata <- as.matrix(mydata[,-1]);
#print(head(mydata))
if(mode(mydata)=="character"){
AddErrMsg(paste("Errors in parsing your data as numerics - possible reason: comma as decimal separator?"));
return(0);
}
rownames(mydata) <- dat.nms;
if(mbSetObj$module.type=="mmp"){
mbSetObj$micDataType = "ko"
}else{
mbSetObj$micDataType = "na"
}
mbSetObj$dataSet$name <- basename(dataName);
mbSetObj$dataSet$data.orig <- mydata;
mbSetObj$dataSet$smpl_nm <- smpl_nm;
current.msg <<- paste("A total of ",ncol(mydata) , " samples and ", nrow(mydata), " metagenomic features are present.");
mbSetObj$dataSet$read.msg<-current.msg;
mbSetObj$dataSet$data.type<-"text";
mbSetObj$dataSet$gene.id<-geneidtype;
mbSetObj$dataSet$is.normalized <- is.normalized;
return(.set.mbSetObj(mbSetObj));
}
#'Main function to read in shotgun data
#'@description This function reads in shotgun data
#'in biom format.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import biomformat
ReadShotgunBiomData <- function(mbSetObj, dataName, geneidtype, module.type, ismetadata,is.normalized) {
mbSetObj <- .get.mbSetObj(mbSetObj);
#to support biom file produced by picrust, (read_biom:phyloseq) function doesn't work
msg <- NULL;
if(on.public.web){
load_biomformat();
}
#reading .biom file using phyloseq
mydata <- biomformat::read_biom(dataName);
#converting to matrix for further use
mydata = as(biom_data(mydata), "matrix")
otu.dat = otu_table(mydata, taxa_are_rows=TRUE)
if(length(otu.dat)==0){
AddErrMsg("Biom file does not contain abundance information");
return(0);
}
otu.dat<-as.matrix(otu.dat);
msg <- c(msg, "Abundance data present.");
mbSetObj$dataSet$name <- basename(dataName);
mbSetObj$dataSet$data <- otu.dat;
#sample file if present within
if(ismetadata=="T"){
sample_data<-sample_data(mydata,errorIfNULL = FALSE);
if(length(sample_data)==0){
AddErrMsg("Metadata file not detected in your biom file. Please upload metadeta file seperately.");
ismetafile<-"F";
return(0);
}
mbSetObj$dataSet$sample_data<-as.data.frame(sample_data,check.names=FALSE);
msg <- c(msg, "Metadata file is also detected in your biom file.");
}
msg <- c(msg, paste("A total of ",ncol(mydata) , " samples and ", nrow(mydata), " metagenomic features were found."));
current.msg <<- paste(msg, collapse="; ");
mbSetObj$dataSet$read.msg<-current.msg;
mbSetObj$dataSet$data.type<-"biom";
mbSetObj$module.type<-module.type;
mbSetObj$dataSet$data.orig <- otu.dat;
mbSetObj$dataSet$gene.id<-geneidtype;
mbSetObj$dataSet$is.normalized <- is.normalized;
return(.set.mbSetObj(mbSetObj));
}
#'Function to prepare shotgun data for PCA.
#'@description This function formats shotgun data for PCA.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import ggfortify
PreparePCA4Shotgun <- function(mbSetObj, imgName,imgName2, format="json", inx1, inx2, inx3,
variable, showlabel, format2d="png", dpi=72){
load_phyloseq();
mbSetObj <- .get.mbSetObj(mbSetObj);
load_ggfortify();
imgName2 = paste(imgName2, ".", format2d, sep="");
mbSetObj$imgSet$pca<-imgName2;
dat <- as.matrix(otu_table(mbSetObj$dataSet$norm.phyobj));
pca3d <- list();
pca <- prcomp(t(dat), center=T, scale=T);
imp.pca <- summary(pca)$importance;
fast.write(signif(pca$x,5), file="pca_score.csv");
pca3d$score$axis <- paste("PC", 1:3, " (", 100*round(imp.pca[2,][1:3], 3), "%)", sep="");
#3D
coords <- data.frame(t(signif(pca$x[,1:3], 5)),check.names=FALSE);
colnames(coords) <- NULL;
pca3d$score$type <- "factor";
pca3d$score$xyz <- coords;
pca3d$score$name <- sample_names(mbSetObj$dataSet$norm.phyobj);
sam_data <- data.frame(sample_data(mbSetObj$dataSet$norm.phyobj),check.names=FALSE);
cls <- as.character(sample_data(mbSetObj$dataSet$norm.phyobj)[[variable]]);
clsLbl <- sam_data[[variable]];
pca3d$score$facA <- cls;
variable <<- variable;
# now set color for each group
cols <- unique(as.numeric(factor(cls)))+1;
rgbcols <- col2rgb(cols);
cols <- apply(rgbcols, 2, function(x){paste("rgb(", paste(x, collapse=","), ")", sep="")});
pca3d$score$colors <- cols;
#json.obj <- rjson::toJSON(pca3d);
#sink(imgName);
#cat(json.obj);
#sink();
#2D
Cairo::Cairo(file=imgName2, width=720, height=500, type=format2d, bg="white",dpi=dpi);
label = FALSE;
if(showlabel=="samnm"){
label = TRUE;
box <- autoplot(pca,data=sam_data,colour=variable,size=4,alpha =0.8,label = label) + theme_bw()
box <- box + stat_ellipse(type="norm", linetype=2, geom = "polygon",alpha = 0.2, aes_string(fill = quo(clsLbl)), show.legend=FALSE)
box <- box + labs(x = pca3d$score$axis[1], y = pca3d$score$axis[2]);
} else if(showlabel=="none") {
box <- autoplot(pca,data=sam_data,colour=variable,size=4,alpha =0.8,label = label) + theme_bw()
box <- box + stat_ellipse(type="norm", linetype=2, geom = "polygon",alpha = 0.2, aes_string(fill = quo(clsLbl)), show.legend=FALSE)
box <- box + labs(x = pca3d$score$axis[1], y = pca3d$score$axis[2]);
} else {
grplbl <<- sam_data[ ,showlabel];
clsLbl <<- clsLbl;
box <- autoplot(pca,data=sam_data,colour=variable,size=4,alpha =0.8,label = label) + geom_text(aes(label=grplbl,colour=clsLbl))
box <- box + theme_bw()+ stat_ellipse(type="norm", linetype=2, geom = "polygon",alpha = 0.2, aes_string(fill = quo(clsLbl)), show.legend=FALSE)
box <- box + labs(x = pca3d$score$axis[1], y = pca3d$score$axis[2]);
}
print(box);
dev.off();
mbSetObj$analSet$pca <- pca3d;
if(!exists("my.json.scatter")){
.load.scripts.on.demand("utils_scatter3d.Rc");
}
.set.mbSetObj(mbSetObj)
my.json.scatter(mbSetObj, imgName, F);
return(.set.mbSetObj(mbSetObj))
}
#############################################
###########Functional Profiling #############
#############################################
#'Function to plot stacked bar chart of functional data.
#'@description This function plots stacked bar charts of functional data.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import reshape
PlotFunctionStack <-function(mbSetObj, summaryplot, functionlvl, abundcal, geneidtype, metadata,
colpalopt, format="png", dpi=72){
mbSetObj <- .get.mbSetObj(mbSetObj);
load_phyloseq();
# record parameters
mbSetObj$paramSet$stack <- list(
fun.nm = functionlvl,
abud.lvl = abundcal,
exp.fac = metadata
);
if(geneidtype == "ec"){
AddErrMsg("ECs are not supported for functional profiling!")
return(0)
}
summaryplot <- paste(summaryplot, ".", format, sep="");
load_reshape();
data <- mbSetObj$dataSet$proc.phyobj;
smpl_nm <- sample_names(data);
clsLbl <- factor(sample_data(data)[[metadata]]);
if(all(c(length(levels(clsLbl)) > 9, min(table(clsLbl)) < 3))){
AddErrMsg("Too many facets to be displayed - please select a more meaningful facet option with at least 3 samples per group.");
return(0);
}
#reorder data based on groups
ord.inx <- order(clsLbl);
clsLbl <- clsLbl[ord.inx];
colvec <- as.numeric(clsLbl) + 1;
smpl_nm <- smpl_nm[ord.inx];
query <- otu_table(data)[,ord.inx];
if(geneidtype=="cog"){
ko_higher_path <- .read.microbiomeanalyst.lib.rds("cog_functioncount.rds", "ko");
}else {
if(functionlvl=="KEGG metabolism"){
ko_higher_path<-.read.microbiomeanalyst.lib.rds("ko_higherpathway.rds", "ko");
}else if(functionlvl=="KEGG pathways"){
ko_higher_path<-.read.microbiomeanalyst.lib.rds("ko_pathwaycount.rds", "ko");
}else if(functionlvl=="KEGG modules"){
ko_higher_path<-.read.microbiomeanalyst.lib.rds("ko_modulecount.rds", "ko");
}else if(functionlvl=="COG Functional category"){
ko_higher_path<-.read.microbiomeanalyst.lib.rds("ko_cogfunction.rds", "ko");
}
}
clsLbl_new <- as.character(clsLbl);
#sample,categories name
samplenm <- colnames(query);
categnm <- colnames(ko_higher_path)<-gsub("\\.", " ",colnames(ko_higher_path));
#merging user KO query with the require KO file
result2 <- merge(query,ko_higher_path, by ="row.names");
indx2 <- match(samplenm,colnames(result2), nomatch = NA_integer_, incomparables = NULL);
indx1 <- match(categnm,colnames(result2), nomatch = NA_integer_, incomparables = NULL);
#actual weight of node/no of pathways in which it is present
if(abundcal=="weig_hit"){
result2[indx1] <- result2[indx1]/rowSums(result2[,indx1]);
#removing NA introduced
result2 <- replace(result2, is.na(result2), 0);
}
myList <- vector('list', length(indx2));
for (i in 1:length(indx2)) {
myList[[i]] <- as.data.frame(colSums(result2[indx1]*result2[,indx2[i]]),check.names=FALSE);
}
MyMerge<- function(x, y){
df <- merge(x, y, by= "row.names", all.x= F, all.y= F);
rownames(df) <- df$Row.names
df$Row.names <- NULL
return(df)
}
result <- Reduce(MyMerge, myList);
#orignal class label
colnames(result) <- samplenm;
result <- result[rowSums(result)!=0, ];
if(abundcal=="norm_hit"){
#getting the size of categories
categ_size <- as.data.frame(colSums(ko_higher_path),check.names=FALSE);
colnames(categ_size) <- "size";
result <- merge(result,categ_size, by ="row.names");
result[indx2] <- result[indx2]/result[['size']];
#removing the extra (size) column
result <- result[ ,c(1,indx2)];
rownames(result) <- result[,1];
result <- result[,-1];
#removing zero abundance KEGG pathways, metabolism and modules
result <- result[ rowSums(result)!=0, ];
}
output <- result
if(functionlvl=="KEGG pathways"){
set2nm <- qs::qread("../../lib/mmp/set2nm.qs")
nms = unname(set2nm[["pathway"]][match(rownames(output),names(set2nm[["pathway"]]))])
output = cbind(name=nms,output)
}else if(functionlvl=="KEGG modules"){
set2nm <- qs::qread("../../lib/mmp/set2nm.qs")
nms = unname(set2nm[["module"]][match(rownames(output),names(set2nm[["module"]]))])
output = cbind(name=nms,output)
}
fast.write(output, file="funcprof_abund.csv");
# now plotting
w <- 1000;
if(nrow(result)>100){
w<-w+200;
}else if(nrow(result)>150){
w<-w+300;
}
##selecting 250 samples
subsmpl <- 250;
if (ncol(result)>subsmpl) {
ss <- sample(ncol(result), subsmpl);
result <- result[,ss,drop=FALSE];
} else {
result <- result;
}
data<-t(result);
nms <- colnames(data);
data <- data %*% sapply(unique(nms),"==",nms);
data <- data.frame(data,check.names=FALSE);
data$facetOpt <- as.character(clsLbl_new);
data$step <- factor(rownames(data), levels = rownames(data));
data <- melt(data,id=c('step', 'facetOpt'));
data$step <- as.numeric(data$step);
data$variable <- gsub("\\.", " ",data$variable); # remove the dot introduced for readability
#color schema
color_var <- levels(factor(data$variable));
x <- length(color_var);
if(colpalopt=="grad"){
indx <- which(color_var=="NA");
#color schema for ggplot
x.colors <- hcl(h=seq(15,375,length=(x+1)),l=65,c=100)[1:x];
x.colors[indx] <- "#666666";
}else if (colpalopt=="cont21"){
x.colors <- rep(custom_col21,length.out=x);
}else if (colpalopt=="cont28"){
x.colors <- rep(custom_final28,length.out=x);
}else {
x.colors <- rep(custom_col42,length.out=x);
}
Cairo::Cairo(file=summaryplot,width=w, height=600, type=format, bg="white",dpi=dpi);
mbSetObj$imgSet$func.prof<-summaryplot;
box <- ggplot(data,aes(x=step,y=value)) +
facet_grid(~ facetOpt, space = "free", scales = "free") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust =1, vjust=0.5)) +
geom_area(aes(fill=variable),position='fill') +
scale_x_continuous(breaks=seq(1,length(unique(data$step)),1),labels=smpl_nm) +
#scale_fill_manual(values=c(x.colors))+
labs(y=" Relative Abundance",fill=functionlvl) +
theme(axis.text.y = element_text(size = 10)) +
theme(legend.text=element_text(size=10), strip.text = element_text(size = 12)) +
theme(axis.text.x = element_text(colour="black", size = 10), axis.title.x=element_blank());
if(colpalopt=="set3"){
cols.needed <- length(unique(data$variable))
if(cols.needed > 12){
col.func <- grDevices::colorRampPalette(RColorBrewer::brewer.pal(12, "Set3"))
box <- box + scale_fill_manual(values=col.func(cols.needed),
#guide = guide_legend(direction = "horizontal", ncol = 5)
)
} else {
box <- box + scale_fill_brewer(palette = "Set3",
#guide = guide_legend(direction = "horizontal",ncol = 5)
)
}
} else {
box <- box + scale_fill_manual(values=c(x.colors))
}
print(box);
dev.off();
mbSetObj$analSet$func.prof<-data;
mbSetObj$analSet$func.lvl<-functionlvl;
return(.set.mbSetObj(mbSetObj))
}
#'Function to perform KO mapping.
#'@description This function performs KO mapping.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
PerformKOmapping <- function(mbSetObj, geneIDs, type){
mbSetObj <- .get.mbSetObj(mbSetObj);
mbSetObj$dataSet$data.type <- "ko.list";
mbSetObj$analSet <- list();
mbSetObj$analSet$orig <- geneIDs;
current.msg <<- NULL;
lines <- unlist(strsplit(geneIDs, "\r|\n|\r\n")[1]);
if(substring(lines[1],1,1)=="#"){
lines <- lines[-1];
}
gene.lists <- strsplit(lines, "\\s+");
gene.mat <- do.call(rbind, gene.lists);
if(dim(gene.mat)[2] == 1){ # add 1
gene.only <- 1;
gene.mat <- cbind(gene.mat, rep(1, nrow(gene.mat)));
}else {
gene.only <- 0;
gene.mat <- gene.mat[,1:2];
}
rownames(gene.mat) <- gene.mat[,1];
gene.mat <- gene.mat[,-1, drop=F];
mbSetObj$analSet$ko.orig <- gene.mat;
mbSetObj$analSet$gene.only <- gene.only;
gene.mat <- RemoveDuplicates(gene.mat, "sum", quiet=F);
mbSetObj$analSet$ko.uniq <- gene.mat;
# now get input that are in the lib
kos <- doKOFiltering(rownames(gene.mat), type);
if(sum(!is.na(kos)) < 2){
msg <- "Less than two hits found in the database.";
mbSetObj$dataSet$map.msg <- msg;
AddErrMsg(msg);
.set.mbSetObj(mbSetObj);
return(0);
}else{
rownames(gene.mat) <- kos;
gd.inx <- (!is.na(kos)) & gene.mat[,1] > 0;
gene.mat <- gene.mat[gd.inx, ,drop=F];
mbSetObj$analSet$ko.mapped <- mbSetObj$analSet$data <- gene.mat; # data will be updated, ko.map will keep intact
current.msg <<- paste("A total of unique", nrow(gene.mat), "KO genes were mapped to KEGG network!");
mbSetObj$dataSet$map.msg <- paste("A total of", nrow(gene.mat), "KO genes were mapped to our database!")
return(.set.mbSetObj(mbSetObj));
}
}
#'Function to prepare query for JSON.
#'@description This function prepares the data for JSON.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
PrepareQueryJson <- function(mbSetObj){
mbSetObj <- .get.mbSetObj(mbSetObj);
#for LTS
if(!is.null(mbSetObj$paramSet$netQuery)){
netQueryNm <- mbSetObj$paramSet$netQueryFileNm;
includeInfoNm <- mbSetObj$paramSet$includeInfoFileNm;
}else{
netQueryNm <- "network_query";
includeInfoNm <- "includeInfo";
}
if(enrich.type == "hyper"){
exp.vec <- mbSetObj$analSet$data[,1]; # drop dim for json
}else{
# for global test, all KO measured should be highlighted
genemat <- as.data.frame(t(otu_table(mbSetObj$dataSet$norm.phyobj)),check.names=FALSE);
exp.vec <- rep(2, ncol(genemat));
names(exp.vec) <- colnames(genemat);
}
edge.mat <- MapKO2KEGGEdges(exp.vec);
row.names(edge.mat) <- eids <- rownames(edge.mat);
query.ko <<- edge.mat[,1];
net.orig <- edge.mat[,2];
query.res <- edge.mat[,3];# abundance
names(query.res) <- eids; # named by edge
filtKOmap(query.ko, includeInfoNm)
labels <- qs::qread("../../lib/ko/ko_lbs.qs")
labels <-labels[labels$info %in% query.ko,c(3,4)]
labels <- aggregate(labels$info,list(labels$id_edge),function(x) paste(x,collapse = ","))
json.mat <- rjson::toJSON(list(query.res=query.res,id_rxn=labels[,1],label=labels[,2]));
sink(paste0(netQueryNm, ".json"));
cat(json.mat);
sink();
return(.set.mbSetObj(mbSetObj));
}
filtKOmap <- function(include, fileName){
edges.ko = qs::qread("../../lib/mmp/ko.info.qs")
edges.ko = edges.ko[which(edges.ko$ko %in% include),]
includeInfo = list(edges=edges.ko)
includeInfo$nodes = unique(edges.ko$met)
json.mat <- rjson::toJSON(includeInfo);
sink(paste0(fileName, ".json"));
cat(json.mat);
sink();
}
#'Function to prepare KO enrichment analysis.
#'@description This function performs KO enrichment analysis
#'using the KO01100 map.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
PerformKOEnrichAnalysis_KO01100 <- function(mbSetObj, category, contain="all",file.nm){
mbSetObj <- .get.mbSetObj(mbSetObj);
if(enrich.type == "hyper"){
LoadKEGGKO_lib(category,contain);
mbSetObj<-PerformKOEnrichAnalysis_List(mbSetObj, file.nm);
}else{
.prepare.global(mbSetObj, category,contain ,file.nm);
.perform.computing();
mbSetObj <- .save.global.res();
}
.set.mbSetObj(mbSetObj)
return(1);
}
.prepare.global<-function(mbSetObj, category,contain ,file.nm){
load_phyloseq();
LoadKEGGKO_lib(category,contain);
mbSetObj <- .get.mbSetObj(mbSetObj);
print(c(category,contain,file.nm))
phenotype <- as.factor(sample_data(mbSetObj$dataSet$norm.phyobj)[[selected.meta.data]]);
genemat <- as.data.frame(t(otu_table(mbSetObj$dataSet$norm.phyobj)),check.names=FALSE);
# first, get the matched entries from current.mset
hits <- lapply(current.mset, function(x){x[x %in% colnames(genemat)]});
set.num <- unlist(lapply(current.mset, length), use.names = FALSE);
dat.in <- list(cls=phenotype, data=genemat, subsets=hits, set.num=set.num, filenm=file.nm);
my.fun <- function(){
gt.obj <- globaltest::gt(dat.in$cls, dat.in$data, subsets=dat.in$subsets);
gt.res <- globaltest::result(gt.obj);
match.num <- gt.res[,5];
if(sum(match.num>0)==0){
return(NA);
}
raw.p <- gt.res[,1];
# add adjust p values
bonf.p <- p.adjust(raw.p, "holm");
fdr.p <- p.adjust(raw.p, "fdr");
res.mat <- cbind(set.num, match.num, gt.res[,2], gt.res[,3], raw.p, bonf.p, fdr.p);
rownames(res.mat) <- names(hits);
colnames(res.mat) <- c("Size", "Hits", "Statistic Q", "Expected Q", "Pval", "Holm p", "FDR");
hit.inx <- res.mat[,2]>0;
res.mat <- res.mat[hit.inx, ];
ord.inx <- order(res.mat[,5]);
res.mat <- res.mat[ord.inx,];
return(res.mat);
}
dat.in <- list(cls=phenotype, data=genemat, subsets=hits, set.num=set.num, filenm=file.nm , my.fun=my.fun);
qs::qsave(dat.in, file="dat.in.qs");
return(1);
}
.save.global.res <- function(){
mbSetObj <- .get.mbSetObj("NA");
dat.in <- qs::qread("dat.in.qs");
hits = dat.in$subsets
file.nm = dat.in$filenm;
my.res <- dat.in$my.res;
if(all(c(length(my.res)==1, is.na(my.res)))){
AddErrMsg("No match was found to the selected metabolite set library!");
return(0);
}
# in R, sort list is by its name!, using pos order has issues!
nms <- rownames(my.res);
hits <- hits[nms];
mbSetObj <- recordEnrTable(mbSetObj, "global", my.res, "KEGG", "Global Test");
mbSetObj <- Save2KEGGJSON(mbSetObj, hits, my.res, file.nm);
print("heredone")
return(.set.mbSetObj(mbSetObj));
}
# Utility function
LoadKEGGKO_lib<-function(category,contain="all"){
if(category == "module"){
current.setlink <- "http://www.genome.jp/kegg-bin/show_module?";
if(contain=="bac"){
current.mset <- qs::qread("../../lib/ko/module_bac.qs")
}else if(contain=="hsabac"){
current.mset <- qs::qread("../../lib/ko/module_hsa_bac.qs")
}else if(contain=="hsa"){
current.mset <- qs::qread("../../lib/ko/module_hsa.qs")
}else if(contain=="all"){
kegg.anot <- .read.microbiomeanalyst.lib.rds("ko_modules.rds", "ko")
current.mset <- kegg.anot$sets$"Pathway module";
}else{
current.mset <- qs::qread("../../lib/ko/module_bac.qs") ## filter users' data based on bacterial metabolism
if(enrich.type != "hyper"){
current.mset <- lapply(current.mset, function(x) x[x %in% query.ko])
current.mset <- current.mset[unlist(lapply(current.mset,function(x) length(x)))>1]
}
}
}else{
current.setlink <- "http://www.genome.jp/kegg-bin/show_pathway?";
if(contain=="bac"){
current.mset <- qs::qread("../../lib/mmp/ko_set_bac.qs")
}else if(contain=="hsabac"){
current.mset <- qs::qread("../../lib/mmp/ko_set_hsa_bac.qs")
}else if(contain=="hsa"){
current.mset <- qs::qread("../../lib/mmp/ko_set_hsa.qs")
}else if(contain=="all"){
kegg.anot <- .read.microbiomeanalyst.lib.rds("ko_pathways.rds", "ko")
current.mset <- kegg.anot$sets$Metabolism;
}else{
current.mset <- qs::qread("../../lib/mmp/ko_set_bac.qs") ## filter users' data based on bacterial metabolism
if(enrich.type != "hyper"){
current.mset <- lapply(current.mset, function(x) x[x %in% query.ko])
current.mset <- current.mset[unlist(lapply(current.mset,function(x) length(x)))>1]
}
}
}
# now need to update the msets to contain only those in ko01100 map
if(!exists("ko.edge.map")){
if(.on.public.web){
ko.edge.path <- paste("../../lib/ko/ko_edge.csv", sep="");
}else{
ko.edge.path <- paste("https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources/lib/ko/ko_edge.csv", sep="");
}
ko.edge.map <<- .readDataTable(ko.edge.path);
}
kos.01100 <- ko.edge.map$gene[ko.edge.map$net == "ko01100"];
current.mset <- lapply(current.mset, function(x) {
as.character(unique(x[x %in% kos.01100]))});
# remove those empty ones
mset.ln <- lapply(current.mset, length);
current.mset <- current.mset[mset.ln > 0];
set.ids <- names(current.mset);
set2nm <- qs::qread("../../lib/mmp/set2nm.qs")[[category]];
names(set.ids) <- names(current.mset) <- set2nm[set.ids];
current.setlink <<- current.setlink;
current.setids <<- set.ids;
current.mset <<- current.mset;
current.universe <<- unique(unlist(current.mset));
}
###################################################
########### Metabolic Pathway projection ##########
###################################################
#'Function to perform KO projection
#'@description This function projects user-uploaded
#'KOs onto the KEGG Global Metabolic Network.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
PerformKOProjection <- function(mbSetObj){
mbSetObj <- .get.mbSetObj(mbSetObj);
datatype <- mbSetObj$analSet$datatype;
de.Num <- mbSetObj$analSet$sig.count;
resTable <- mbSetObj$analSet$resTable[1:de.Num,];
if(nrow(resTable) > 5000){
resTable <- resTable[1:5000,];
current.msg <<- paste(" Due to computational constraints, only top 5000 features will be used. ", collapse="\n");
}
mbSetObj$analSet$sig.mat <- resTable;
p.inx <- colnames(resTable) %in% c("P-value", "Pvalues");
gene.mat <- data.matrix(-log10(resTable[,p.inx]));
rownames(gene.mat) <- row.names(resTable);
gene.mat <- RemoveDuplicates(gene.mat, "sum", quiet=F);
mbSetObj$analSet$ko.uniq <- gene.mat;
id.type <- mbSetObj$analSet$id.type;
if(any(c(is.na(id.type),is.null(id.type), id.type == "NA"))){
id.type <- "ko";
}
kos <- doKOFiltering(rownames(gene.mat),id.type);
if(sum(!is.na(kos)) < 2){
AddErrMsg("Less than two hits found in the database.");
return(0);
}else{
rownames(gene.mat) <- kos;
gd.inx <- (!is.na(kos)) & gene.mat[,1] > 0;
gene.mat <- gene.mat[gd.inx, ,drop=F];
mbSetObj$analSet$ko.mapped <- mbSetObj$analSet$data <- gene.mat; # data will be updated, ko.map will keep intact
current.msg <<- paste("A total of unqiue", nrow(gene.mat), "KO genes were mapped to KEGG network!");
}
return(.set.mbSetObj(mbSetObj));
}
##############################################
############## Utility Functions #############
##############################################
# Utility function
MapKO2KEGGEdges<- function(kos, net="ko01100"){
if(!exists("ko.edge.map")){
if(.on.public.web){
ko.edge.path <- paste("../../lib/ko/ko_edge.csv", sep="");
}else{
ko.edge.path <- paste("https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources/lib/ko/ko_edge.csv", sep="");
}
ko.edge.map <<- .readDataTable(ko.edge.path);
}
all.hits <- ko.edge.map$gene %in% names(kos) & ko.edge.map$net == net;
my.map <- ko.edge.map[all.hits, ];
q.map <- data.frame(gene=names(kos), expr=as.numeric(kos),check.names=FALSE);
# first merge to get ko abundance to each edge
dat <- merge(my.map, q.map, by="gene");
# now merge duplicated edge to sum
dup.inx <- duplicated(dat[,2]);
dat <- dat[!dup.inx,];
rownames(dat) <- dat[,2];
return(dat[,-2]);
}
# Utility function
# note: only return hits in this map KO01100
PerformKOEnrichAnalysis_List <- function(mbSetObj, file.nm){
mbSetObj <- .get.mbSetObj(mbSetObj);
# prepare for the result table
set.size <- length(current.mset);
res.mat <- matrix(0, nrow=set.size, ncol=5);
rownames(res.mat) <- names(current.mset);
colnames(res.mat) <- c("Total", "Expected", "Hits", "Pval", "FDR");
# prepare query
ora.vec <- NULL;
exp.vec <- mbSetObj$analSet$data[,1]; # drop dim for json
ora.vec <- names(exp.vec);
# need to cut to the universe covered by the pathways, not all genes
hits.inx <- ora.vec %in% current.universe;
ora.vec <- ora.vec[hits.inx];
#ora.nms <- ora.nms[hits.inx];
q.size <- length(ora.vec);
# get the matched query for each pathway
hits.query <- lapply(current.mset, function(x) {
ora.vec[ora.vec%in%unlist(x)];});
names(hits.query) <- names(current.mset);
hit.num <- unlist(lapply(hits.query, function(x){length(x)}), use.names=FALSE);
# total unique gene number
uniq.count <- length(current.universe);
# unique gene count in each pathway
set.size <- unlist(lapply(current.mset, length));
res.mat[,1] <- set.size;
res.mat[,2] <- q.size*(set.size/uniq.count);
res.mat[,3] <- hit.num;
# use lower.tail = F for P(X>x)
raw.pvals <- phyper(hit.num-1, set.size, uniq.count-set.size, q.size, lower.tail=F);
res.mat[,4] <- raw.pvals;
res.mat[,5] <- p.adjust(raw.pvals, "fdr");
# now, clean up result, synchronize with hit.query
res.mat <- res.mat[hit.num>0,,drop = F];
hits.query <- hits.query[hit.num>0];
if(nrow(res.mat)> 1){
# order by p value
ord.inx <- order(res.mat[,4]);
res.mat <- signif(res.mat[ord.inx,],3);
hits.query <- hits.query[ord.inx];
imp.inx <- res.mat[,4] <= 0.05;
if(sum(imp.inx) < 10){ # too little left, give the top ones
topn <- ifelse(nrow(res.mat) > 10, 10, nrow(res.mat));
res.mat <- res.mat[1:topn,];
hits.query <- hits.query[1:topn];
}else{
res.mat <- res.mat[imp.inx,];
hits.query <- hits.query[imp.inx];
if(sum(imp.inx) > 120){
# now, clean up result, synchronize with hit.query
res.mat <- res.mat[1:120,];
hits.query <- hits.query[1:120];
}
}
}
#report related object
if(!is.null(mbSetObj$paramSet$koProj.type)){
vis.type <- mbSetObj$paramSet$koProj.type;
if(is.null(mbSetObj$imgSet$enrTables)){
mbSetObj$imgSet$enrTables <- list();
}
mbSetObj$imgSet$enrTables[[vis.type]] <- list();
mbSetObj$imgSet$enrTables[[vis.type]]$table <- res.mat;
mbSetObj$imgSet$enrTables[[vis.type]]$library <- "KEGG";
mbSetObj$imgSet$enrTables[[vis.type]]$algo <- "Overrepresentation Analysis";
}
mbSetObj <- Save2KEGGJSON(mbSetObj, hits.query, res.mat, file.nm);
return(mbSetObj);
}
# Utility function
# for KO01100
Save2KEGGJSON <- function(mbSetObj, hits.query, res.mat, file.nm){
print(file.nm)
resTable <- data.frame(Pathway=rownames(res.mat), res.mat,check.names=FALSE);
current.msg <<- "Functional enrichment analysis was completed";
if(!exists("ko.edge.map")){
if(.on.public.web){
ko.edge.path <- paste("../../lib/ko/ko_edge.csv", sep="");
}else{
ko.edge.path <- paste("https://www.microbiomeanalyst.ca/MicrobiomeAnalyst/resources/lib/ko/ko_edge.csv", sep="");
}
ko.edge.map <- .readDataTable(ko.edge.path);
ko.edge.map <- ko.edge.map[ko.edge.map$net=="ko01100",]; #only one map
ko.edge.map <<- ko.edge.map;
}
hits.edge <- lapply(hits.query, function(x) {
as.character(unique(ko.edge.map$edge[ko.edge.map$gene%in%unlist(x)]));});
# only keep hits with edges in the map
hits.inx <- unlist(lapply(hits.edge, length))>0;
hits.query <- hits.query[hits.inx];
resTable <- resTable[hits.inx, ];
# write json
fun.pval = resTable$Pval; if(length(fun.pval) ==1) { fun.pval <- matrix(fun.pval) };
hit.num = resTable$Hits; if(length(hit.num) ==1) { hit.num <- matrix(hit.num) };
fun.ids <- as.vector(current.setids[names(hits.query)]); if(length(fun.ids) ==1) { fun.ids <- matrix(fun.ids) };
json.res <- list(hits.query = hits.query,
hits.edge = hits.edge,
path.ids = fun.ids,
fun.pval = fun.pval,
hit.num = hit.num);
json.mat <- rjson::toJSON(json.res);
json.nm <- paste(file.nm, ".json", sep="");
sink(json.nm)
cat(json.mat);
sink();
# write csv
fast.write(resTable, file=paste(file.nm, ".csv", sep=""), row.names=F);
return(mbSetObj)
}
# Utility function
SetCurrentMetaData <- function(meta.data, type){
selected.meta.data <<- meta.data
enrich.type <<- type;
}
# Given a data with duplicates, dups is the one with duplicates
RemoveDuplicates <- function(data, lvlOpt, quiet=T){
all.nms <- rownames(data);
colnms <- colnames(data);
dup.inx <- duplicated(all.nms);
dim.orig <- dim(data);
data <- apply(data, 2, as.numeric); # force to be all numeric
dim(data) <- dim.orig; # keep dimension (will lost when only one item)
rownames(data) <- all.nms;
colnames(data) <- colnms;
if(sum(dup.inx) > 0){
uniq.nms <- all.nms[!dup.inx];
uniq.data <- data[!dup.inx,,drop=F];
dup.nms <- all.nms[dup.inx];
uniq.dupnms <- unique(dup.nms);
uniq.duplen <- length(uniq.dupnms);
for(i in 1:uniq.duplen){
nm <- uniq.dupnms[i];
hit.inx.all <- which(all.nms == nm);
hit.inx.uniq <- which(uniq.nms == nm);
# average the whole sub matrix
if(lvlOpt == "mean"){
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, mean, na.rm=T);
}else if(lvlOpt == "median"){
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, median, na.rm=T);
}else if(lvlOpt == "max"){
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, max, na.rm=T);
}else{ # sum
uniq.data[hit.inx.uniq, ]<- apply(data[hit.inx.all,,drop=F], 2, sum, na.rm=T);
}
}
if(!quiet){
current.msg <<- paste(current.msg, paste("A total of ", sum(dup.inx), " of duplicates were replaced by their ", lvlOpt, ".", sep=""), collapse="\n");
}
return(uniq.data);
}else{
if(!quiet){
current.msg <<- paste(current.msg, "All IDs are unique.", collapse="\n");
}
return(data);
}
}
#############################
##### Utility Functions #####
#############################
# return matched KO in the same order (NA if no match)
doKOFiltering <- function(ko.vec, type){
ko.dic <- .read.microbiomeanalyst.lib.rds("ko_dic.rds", "ko");
if(type == "ko"){
hit.inx <- match(ko.vec, ko.dic$KO);
return(ko.dic$KO[hit.inx]);
}
if(type == "ec"){
hit.inx = vector(mode='numeric', length=length(ko.vec)); # record hit index, initial 0
match.values = rep(NA, length=length(ko.vec)); # the best matched values (hit names), initial ""
if(substring(ko.vec[1], 0, 2) == "EC"){
ko.vec <- gsub("^EC", "", ko.vec)
}
# first find exact match to the common compound names
hit.inx <- match(ko.vec, ko.dic[, "EC"]);
match.values <- ko.dic$KO[hit.inx];
todo.inx <-which(is.na(hit.inx));
if(length(todo.inx) > 0){
multi.ec.inx <- grep("; ", ko.dic$EC);
m.dic <- ko.dic[multi.ec.inx,];
ec.list <- strsplit(m.dic$EC, "; ");
ecs2ko <- m.dic$KO;
for(i in 1:length(ec.list)){
hitInx <- match(ko.vec[todo.inx], ec.list[[i]]);
hitPos <- which(!is.na(hitInx));
orig.inx<-todo.inx[hitPos];
match.values[orig.inx] <- ecs2ko[i];
}
}
return(match.values);
}
if(type == "cog"){
hit.inx = vector(mode='numeric', length=length(ko.vec)); # record hit index, initial 0
match.values = rep(NA, length=length(ko.vec)); # the best matched values (hit names), initial ""
# first find exact match to the common compound names
hit.inx <- match(ko.vec, ko.dic[, "COG"]);
match.values <- ko.dic$KO[hit.inx];
todo.inx <-which(is.na(hit.inx));
if(length(todo.inx) > 0){
multi.cog.inx <- grep(";", ko.dic$COG);
m.dic <- ko.dic[multi.cog.inx,];
cog.list <- strsplit(m.dic$COG, ";");
cog2ko <- m.dic$KO;
for(i in 1:length(cog.list)){
hitInx <- match(ko.vec[todo.inx], cog.list[[i]]);
hitPos <- which(!is.na(hitInx));
orig.inx<-todo.inx[hitPos];
match.values[orig.inx] <- cog2ko[i];
}
}
return(match.values);
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.