R/general_proc.R

##################################################
## R script for MicrobiomeAnalyst
## Description: filtering and normalization functions
## Author: Jeff Xia, [email protected]
###################################################

#######################################
# Filter zeros and singletons (single-occurrence features):
# reads that occur in only one sample are treated as artifacts and removed.

#'Main function to sanity check uploaded data
#'@description This function performs a sanity check on the uploaded
#'user data. It removes features that occur in only one sample as well as
#'constant features, checks the grouping of samples, and records whether
#'a phylogenetic tree was uploaded.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param filetype Character, "biom" if the uploaded data
#'is in .biom format, "mothur" if in mothur format, or "txt" if
#'in .txt or .csv format.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
SanityCheckData <- function(mbSetObj, filetype){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  feat.sums <- apply(mbSetObj$dataSet$data.orig, 1, function(x){sum(x>0, na.rm=T)});
  gd.inx <- feat.sums > 1; # occur in at least 2 samples
  
  if(sum(gd.inx) == 0){
    AddErrMsg("All features occur in only one sample. These are considered artifacts and have been removed, leaving no data for further processing.");
    return(0);
  }
  
  data.proc <- mbSetObj$dataSet$data.orig[gd.inx, ]; 
  # filter out constant features (rows with zero variance across samples)
  varCol <- apply(data.proc, 1, var, na.rm=T);
  constCol <- varCol == 0 | is.na(varCol);
  
  # keep only the non-constant features
  data.proc <- data.proc[!constCol, ];
  
  if(nrow(data.proc) == 0){
    AddErrMsg("All features are constant and have been removed, leaving no data for further processing.");
    return(0);
  }
  
  saveRDS(data.proc, file="data.proc.orig"); # save a copy
  saveRDS(data.proc, file="data.prefilt"); # save a copy
  
  # now get stats
  taxa_no <- nrow(mbSetObj$dataSet$data.orig);
  
  # from abundance table
  sample_no <- ncol(mbSetObj$dataSet$data.orig);
  
  if(filetype=="biom"||filetype=="mothur"){
    samplemeta_no <- nrow(mbSetObj$dataSet$sample_data);
  }else{
    samplemeta_no <- sample_no;
  }
  
  if(mbSetObj$module.type!="ppd"){
    mbSetObj$dataSet$sample_data <- mbSetObj$dataSet$sample_data[sapply(mbSetObj$dataSet$sample_data, function(col) length(unique(col))) > 1];
  }
  
  if(ncol(mbSetObj$dataSet$sample_data)==0){
    AddErrMsg("No sample variable have more than one group. Please provide variables with at least two groups.");
    return(0);
  }
  
  #converting all sample variables to factor type
  character_vars <- sapply(mbSetObj$dataSet$sample_data, is.character);
  
  if(any(character_vars)){
    mbSetObj$dataSet$sample_data[, character_vars] <- sapply(mbSetObj$dataSet$sample_data[, character_vars], as.factor);
  }
  
  num_vars <- sapply(mbSetObj$dataSet$sample_data, is.numeric);
  
  if(any(num_vars)){
    mbSetObj$dataSet$sample_data[, num_vars] <- sapply(mbSetObj$dataSet$sample_data[, num_vars],as.factor);
  }
  
  if(file.exists("tree.RDS")){
    tree_exist <- 1
  } else {
    tree_exist <- 0
  }
  
  if(identical(sort(row.names(mbSetObj$dataSet$sample_data)),
               sort(colnames(data.proc)))){
    samname_same <- 1;
  } else {
    samname_same <- 0;
  }
  
  sample_no_in_outfile <- ncol(data.proc);
  samname_same_number <- sum(row.names(mbSetObj$dataSet$sample_data) %in% colnames(data.proc));
  
  # now store data.orig to RDS
  saveRDS(mbSetObj$dataSet$data.orig, file="data.orig");
  saveRDS(data.proc, file="data.proc");
  saveRDS(mbSetObj$dataSet$sample_data, file = "data.sample_data")
  mbSetObj$dataSet$data.orig <- NULL;
  mbSetObj$dataSet$tree <- tree_exist
  
  vari_no <- ncol(mbSetObj$dataSet$sample_data);
  disc_no <- sum(mbSetObj$dataSet$meta_info$disc.inx);
  cont_no <- sum(mbSetObj$dataSet$meta_info$cont.inx);
  smpl.sums <- apply(data.proc, 2, sum);
  tot_size <- sum(smpl.sums);
  smin <- min(smpl.sums)
  smean <- mean(smpl.sums)
  smax <- max(smpl.sums);
  gd_feat <- nrow(data.proc);
  
  if(.on.public.web){
    .set.mbSetObj(mbSetObj)
    return(c(1,taxa_no,sample_no,vari_no, smin,smean,smax,gd_feat,samplemeta_no,tot_size, tree_exist, samname_same, samname_same_number, sample_no_in_outfile, disc_no, cont_no));
  }else{
    print("Sanity check passed!")
    return(.set.mbSetObj(mbSetObj))
  }
}
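
# Minimal usage sketch (hypothetical). It assumes an mbSet object has already been created
# and populated by the package's upstream data-reading functions (defined in other files);
# "txt" mirrors the filetype options documented above. Wrapped in if(FALSE) so sourcing
# this file has no side effects.
if(FALSE){
  mbSet <- SanityCheckData(mbSet, filetype = "txt");
}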

#'Function to filter uploaded data
#'@description This function filters out features with low counts across a high percentage of samples.
#'Note, the abundance filter is applied first, followed by the variance filter.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param filt.opt Character, input the low count filter option. "prevalence" to
#'filter based on prevalence in samples, "mean" to filter by the mean abundance value, and
#'"median" to filter by median abundance value.
#'@param count Numeric, input the minimum count. Set to 0 to disable the low count filter.
#'@param smpl.perc Numeric, input the percentage of samples for which to filter low counts.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export

ApplyAbundanceFilter <- function(mbSetObj, filt.opt, count, smpl.perc){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  data <- readRDS("data.prefilt");
  msg <- NULL;
  
  # this data is used later for sample categorical comparisons
  rmn_feat <- nrow(data);
  
  mbSetObj$dataSet$ab.filtered <- FALSE
  
  if(count==0){# no low-count filtering
    kept.inx <- rep(TRUE, rmn_feat);
  }else{
    mbSetObj$dataSet$ab.filtered <- TRUE
    if(filt.opt == "prevalence"){
      rmn_feat <- nrow(data);
      minLen <- smpl.perc*ncol(data);
      kept.inx <- apply(data, MARGIN = 1,function(x) {sum(x >= count) >= minLen});
    }else if (filt.opt == "mean"){
      filter.val <- apply(data, 1, mean, na.rm=T);
      kept.inx <- filter.val >= count;
    }else if (filt.opt == "median"){
      filter.val <- apply(data, 1, median, na.rm=T);
      kept.inx <- filter.val >= count;
    }
  }
  
  mbSetObj$dataSet$filt.data <- data[kept.inx, ];
  saveRDS(mbSetObj$dataSet$filt.data, file="filt.data.orig"); # save a copy
  current.msg <<- paste("A total of ", sum(!kept.inx), " low abundance features were removed based on ", filt.opt, ".", sep="");
  
  return(.set.mbSetObj(mbSetObj));
}
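
# Usage sketch plus a toy illustration of the "prevalence" rule above (hypothetical values;
# mbSet is assumed to come from the upstream steps). Wrapped in if(FALSE) so sourcing this
# file has no side effects.
if(FALSE){
  # keep features with a count of at least 4 in at least 20% of samples
  mbSet <- ApplyAbundanceFilter(mbSet, filt.opt = "prevalence", count = 4, smpl.perc = 0.2);

  # the prevalence rule applied to a toy matrix: rows are features, columns are samples
  toy <- matrix(c(0, 5, 6, 0,
                  9, 8, 7, 6), nrow = 2, byrow = TRUE);
  apply(toy, MARGIN = 1, function(x){sum(x >= 4) >= 0.2 * ncol(toy)});  # TRUE for both rows
}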

#'Function to filter uploaded data
#'@description This function filters data based on low variance.
#'Note, this is applied after the abundance filter.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param filtopt Character, input the low variance filter option. "iqr" for 
#'interquartile range, "sd" for standard deviation, and "cov" for coefficient of variation.
#'@param filtPerct Numeric, input the percentage cutoff for low variance. 
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export

ApplyVarianceFilter <- function(mbSetObj, filtopt, filtPerct){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  data <- mbSetObj$dataSet$filt.data;
  msg <- NULL;
  
  rmn_feat <- nrow(data);
  
  filter.val <- nm <- NULL;
  
  mbSetObj$dataSet$var.filtered <- FALSE
  
  if(filtPerct==0){# no low-variance filtering
    remain <- rep(TRUE, rmn_feat);
  }else{
    mbSetObj$dataSet$var.filtered <- TRUE
    if (filtopt == "iqr"){
      filter.val <- apply(data, 1, IQR, na.rm=T);
      nm <- "IQR";
    }else if (filtopt == "sd"){
      filter.val <- apply(data, 1, sd, na.rm=T);
      nm <- "standard deviation";
    }else if (filtopt == "cov"){
      sds <- apply(data, 1, sd, na.rm=T);
      mns <- apply(data, 1, mean, na.rm=T);
      filter.val <- abs(sds/mns);
      nm <- "Coeffecient of variation";
    }
    # get the rank
    rk <- rank(-filter.val, ties.method='random');
    var.num <- nrow(data);
    remain <- rk < var.num*(1-filtPerct);
  }
  
  data <- data[remain,];
  mbSetObj$dataSet$filt.data <- data;
  saveRDS(mbSetObj$dataSet$filt.data, file="filt.data.orig"); # save a copy
  rm.msg1 <- paste("A total of ", sum(!remain), " low variance features were removed based on ", filtopt, ".", sep="");
  rm.msg2 <- paste("The number of features remaining after the data filtering step:", nrow(data));
  current.msg <<- paste(c(current.msg, rm.msg1, rm.msg2), collapse=" ");
  mbSetObj$dataSet$filt.msg <- current.msg;
  
  return(.set.mbSetObj(mbSetObj));
  
}
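
# Usage sketch (hypothetical values): drop the 10% least variable features, ranked by
# interquartile range, after the abundance filter above. mbSet is assumed to come from
# the preceding steps. Wrapped in if(FALSE) so sourcing this file has no side effects.
if(FALSE){
  mbSet <- ApplyVarianceFilter(mbSet, filtopt = "iqr", filtPerct = 0.1);
}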

#'Function to update samples
#'@description This function prunes the dataset to the samples selected
#'for downstream analysis.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
UpdateSampleItems <- function(mbSetObj){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  if(!exists("smpl.nm.vec")){
    AddErrMsg("Cannot find the current sample names!");
    return(0);
  }
  
  # read from saved original copy
  data.proc.orig  <- readRDS("data.proc.orig");
  proc.phyobj.orig <- readRDS("proc.phyobj.orig");
  hit.inx <- colnames(data.proc.orig) %in% smpl.nm.vec;
  
  #preserving the rownames
  # read from saved prefilt copy
  prefilt.data <- readRDS("data.prefilt");
  tax_nm <- rownames(prefilt.data);
  data.proc <- readRDS("data.proc");
  modi_data <- data.proc[,hit.inx];
  prefilt.data <- modi_data;
  
  #doesn't need to change the names object
  rownames(prefilt.data) <- tax_nm;
  #getting unmatched sample names
  unmhit.indx <- which(hit.inx==FALSE);
  allnm <- colnames(data.proc.orig);
  mbSetObj$dataSet$remsam <- allnm[unmhit.indx];
  
  mbSetObj$dataSet$proc.phyobj <- prune_samples(colnames(prefilt.data),proc.phyobj.orig);
  saveRDS(prefilt.data, file="data.prefilt")
  current.msg <<- "Successfully updated the sample items!";
  
  # need to update metadata info after removing samples
  my.meta <- sample_data(mbSetObj$dataSet$proc.phyobj)
  disc.inx <- GetDiscreteInx(my.meta);
  
  if(sum(disc.inx) == 0){
    mbSetObj$poor.replicate <- TRUE;
  }else{
    mbSetObj$poor.replicate <- FALSE;
  }
  
  mbSetObj$dataSet$sample_data <- my.meta
  
  return(.set.mbSetObj(mbSetObj));
  
}
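
# Usage sketch (hypothetical sample names). UpdateSampleItems() looks for the retained
# sample names in a variable named smpl.nm.vec (see the exists() check above), so that
# vector must be defined before the call; the names below are made up. Wrapped in
# if(FALSE) so sourcing this file has no side effects.
if(FALSE){
  smpl.nm.vec <- c("Sample1", "Sample2", "Sample3");  # samples to keep
  mbSet <- UpdateSampleItems(mbSet);
}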


#'Function to perform normalization
#'@description This function performs normalization on the uploaded
#'data.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param rare.opt Character, "rarewi" to rarefy the data (with replacement,
#'to the minimum library depth) and "none" to skip rarefaction.
#'@param scale.opt Character, input the name of the data scaling option.
#'"colsum" for total sum scaling, "CSS" for cumulative sum scaling,
#'"upperquartile" for upper-quartile normalization, and "none" for no scaling.
#'@param transform.opt Character, input the name of the data transformation
#'to be applied. "rle" for relative log expression, "TMM" for trimmed mean of
#'M-values, "clr" for centered log ratio, and "none" for no transformation.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import edgeR
#'@import metagenomeSeq
PerformNormalization <- function(mbSetObj, rare.opt, scale.opt, transform.opt){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  data <- readRDS("filt.data.orig");
  tax_nm <- rownames(data);
  msg <- NULL;
  
  if(rare.opt != "none"){
    data <- PerformRarefaction(mbSetObj, data, rare.opt);
    tax_nm <- rownames(data);
    msg <- c(msg, paste("Performed data rarefaction."));
    
    ###### note: rarefaction also updates filt.data, not just the normalized data
    mbSetObj$dataSet$filt.data <- data;
  }else{
    msg <- c(msg, paste("No data rarefaction was performed."));
  }
  
  saveRDS(mbSetObj$dataSet$proc.phyobj, file="orig.phyobj"); # save original phylo.obj
  # proc.phyobj now holds the filtered data
  mbSetObj$dataSet$proc.phyobj <- merge_phyloseq(otu_table(data,taxa_are_rows =TRUE), mbSetObj$dataSet$sample_data, mbSetObj$dataSet$taxa_table);
  
  if(scale.opt != "none"){
    if(scale.opt=="colsum"){
      data <- sweep(data, 2, colSums(data), FUN="/")
      data <- data*10000000;
      msg <- c(msg, paste("Performed total sum normalization."));
    }else if(scale.opt=="upperquartile"){
      load_edgeR();
      otuUQ <- edgeRnorm(data,method="upperquartile");
      data <- as.matrix(otuUQ$counts);
      msg <- c(msg, paste("Performed upper quartile normalization"));
    }else if(scale.opt=="CSS"){
      load_metagenomeseq();
      # biom and mothur data must also be a plain matrix (not a phyloseq otu_table)
      data1 <- as(data,"matrix");
      dataMR <- newMRexperiment(data1);
      data <- cumNorm(dataMR,p=cumNormStat(dataMR));
      data <- MRcounts(data,norm = T);
      msg <- c(msg, paste("Performed cumulative sum scaling normalization"));
    }else{
      print(paste("Unknown scaling parameter:", scale.opt));
    }
  }else{
    msg <- c(msg, paste("No data scaling was performed."));
  }
  
  if(transform.opt != "none"){
    if(transform.opt=="rle"){
      load_edgeR();            
      otuRLE <- edgeRnorm(data,method="RLE");
      data <- as.matrix(otuRLE$counts);
      msg <- c(msg, paste("Performed RLE Normalization"));
    }else if(transform.opt=="TMM"){
      load_edgeR();   
      otuTMM <- edgeRnorm(data,method="TMM");
      data <- as.matrix(otuTMM$counts);
      msg <- c(msg, paste("Performed TMM Normalization"));
    }else if(transform.opt=="clr"){
      data <- apply(data, 2, clr_transform);
      msg <- "Performed centered-log-ratio normalization.";
    }else{
      print(paste("Unknown scaling parameter:", transform.opt));
    }
  }else{
    msg <- c(msg, paste("No data transformation was performed."));
  }
  # this step has to be done after all the normalization steps
  otu.tab <- otu_table(data,taxa_are_rows =TRUE);
  taxa_names(otu.tab) <- tax_nm;
  
  # create phyloseq obj
  mbSetObj$dataSet$sample_data$sample_id <- rownames(mbSetObj$dataSet$sample_data);
  sample_table <- sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
  phy.obj <- merge_phyloseq(otu.tab, sample_table);
  
  #using this object for plotting
  mbSetObj$dataSet$norm.phyobj <- phy.obj;
  current.msg <<- paste(msg, collapse=" ");
  mbSetObj$dataSet$norm.msg <- current.msg;
  
  #make hierarchies
  if(mbSetObj$module.type=="sdp"){
    ranks <- "OTU"
  }else{
    ranks <- c(GetMetaTaxaInfo(mbSetObj), "OTU")
  } 
  
  # start with lowest
  data.list <- list()
  data.list$merged_obj <- vector(length = length(ranks), "list")
  data.list$count_tables <- vector(length = length(ranks), "list")
  names(data.list$count_tables) <- names(data.list$merged_obj) <- ranks
  
  for(i in 1:length(ranks)){
    phyloseq.obj <- UtilMakePhyloseqObjs(mbSetObj, ranks[i])
    data.list$merged_obj[[i]] <- phyloseq.obj
    count.table <- UtilMakeCountTables(phyloseq.obj, ranks[i])
    data.list$count_tables[[i]] <- count.table
  }
  saveRDS(data.list, "phyloseq_objs.RDS")
  
  return(.set.mbSetObj(mbSetObj));
}
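
# Usage sketch (hypothetical): rarefy to the minimum library depth, apply total sum
# scaling, and skip any further transformation. The option strings mirror those documented
# above; mbSet is assumed to come from the filtering steps. Wrapped in if(FALSE) so
# sourcing this file has no side effects.
if(FALSE){
  mbSet <- PerformNormalization(mbSet, rare.opt = "rarewi", scale.opt = "colsum", transform.opt = "none");
}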

#'Utility function to perform rarefaction (used by PerformNormalization)
#'@description This function performs rarefaction on the uploaded
#'data.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param data Input the data.
#'@param rare.opt Input the option for rarefying the microbiome data.
#'"rarewi" to rarefy with replacement to the minimum library depth and 
#'"rarewo" to rarefy without replacement to the minimum library depth.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
PerformRarefaction <- function(mbSetObj, data, rare.opt){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  data <- data.matrix(data);
  tax_nm<-rownames(data);
  
  # data must be count data (not contain fractions)
  data <- round(data);
  
  # create phyloseq obj
  otu.tab<-otu_table(data,taxa_are_rows =TRUE);
  taxa_names(otu.tab)<-tax_nm;
  
  mbSetObj$dataSet$sample_data$sample_id<-rownames(mbSetObj$dataSet$sample_data);
  sample_table<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
  phy.obj<-merge_phyloseq(otu.tab,sample_table);
  msg<-NULL;
  
  # first doing rarefaction, this is on integer or count data
  if(rare.opt=="rarewi"){
    phy.obj <- rarefy_even_depth(phy.obj, replace=TRUE,rngseed = T)
    msg <- c(msg, paste("Rarefy with replacement to minimum library depth."));
  }else if(rare.opt=="rarewo"){ # this is not shown on web due to computational issue
    phy.obj <- rarefy_even_depth(phy.obj, replace=FALSE,rngseed = T);
    msg <- c(msg, paste("Rarefaction without replacement to minimum library depth."));
  }
  
  cleanMem();
  otu.tab <- otu_table(phy.obj);
  return(otu.tab);
}
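
# Toy illustration of the idea behind rarefaction (a sketch: the counts are made up, and
# vegan::rrarefy is used here for brevity, whereas the function above relies on
# phyloseq::rarefy_even_depth). Every sample is subsampled down to the same depth, here
# the smallest library size. Wrapped in if(FALSE) so sourcing this file has no side effects.
if(FALSE){
  toy <- matrix(c(10, 100, 40,
                  30,  50, 60), nrow = 2, byrow = TRUE,
                dimnames = list(c("OTU1", "OTU2"), c("S1", "S2", "S3")));
  depth <- min(colSums(toy));                        # minimum library size (40)
  rar <- t(vegan::rrarefy(t(toy), sample = depth));  # rrarefy expects samples as rows
  colSums(rar);                                      # every sample now sums to depth
}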

#'Function to plot rarefaction curves
#'@description This function plots rarefaction curves from the uploaded
#'data, both sample-wise and group-wise.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param graphName Input the name of the plot.
#'@param variable Input the experimental factor.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import vegan

PlotRareCurve <- function(mbSetObj, graphName, variable){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  set.seed(13789);
  load_vegan();
  
  data <- data.matrix(mbSetObj$dataSet$filt.data);
  rarefaction_curve_data<-as.matrix(otu_table(data));
  
  Cairo::Cairo(file=graphName, width=900, height=480, type="png", bg="white");
  
  #sample-wise
  subsmpl=50;
  
  if (ncol(rarefaction_curve_data)>subsmpl) {
    ss  = sample(ncol(rarefaction_curve_data), subsmpl);
    rarefaction_curve_data = rarefaction_curve_data[,ss,drop=FALSE];
    #retaining the column names
    data<-prune_samples(colnames(rarefaction_curve_data),data);
  } else {
    rarefaction_curve_data = rarefaction_curve_data;
  }
  
  sam_data<-sample_data(data);
  raremax <- min(rowSums(t(rarefaction_curve_data)));
  
  #getting colors
  lvs <- levels(as.factor(sam_data[[variable]]));
  grp.num <- length(lvs);
  dist.cols <- 1:grp.num + 1;
  
  if(grp.num<9){
    colors <- vector(mode="character", length=length(as.factor(sam_data[[variable]])));
    
    for(i in 1:length(lvs)){
      colors[as.factor(sam_data[[variable]]) == lvs[i]] <- dist.cols[i];
    }
  }else{
    colors <- "blue";
  }
  
  rarecurve(t(rarefaction_curve_data), step = 20, sample = raremax, col = colors, cex = 0.6, xlab = "Sequencing depth", ylab = "Observed Species");
  legend("bottomright",lvs,lty = rep(1,grp.num),col = dist.cols);
  
  plot.data <- rarefaction_curve_data;
  sam_data <- sample_data(data);
  cls.lbls <- as.factor(sam_data[[variable]]);
  grp.lvls <- levels(cls.lbls);
  grp.num <- length(grp.lvls);
  grp.data <- list();
  
  for(lvl in grp.lvls){
    grp.data[[lvl]] <- rowSums(plot.data[, cls.lbls == lvl])
  }
  
  raremax <- min(unlist(lapply(grp.data, sum)));
  grp.data <- data.frame(grp.data);
  dist.cols <- 1:grp.num + 1;
  # now plot
  rarecurve(t(grp.data), step = 20, sample = raremax, col = dist.cols, lwd = 2, cex=1.5, xlab = "Sequencing depth", ylab = "Observed Species");
  legend("bottomright",grp.lvls,lty = rep(1,grp.num),col = dist.cols);
  
  dev.off();
  return(.set.mbSetObj(mbSetObj))
}
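
# Usage sketch (hypothetical file name and metadata column). Assumes mbSet has been
# filtered upstream and that "Group" is a column of the sample metadata. Wrapped in
# if(FALSE) so sourcing this file has no side effects.
if(FALSE){
  mbSet <- PlotRareCurve(mbSet, graphName = "rarefaction_curve.png", variable = "Group");
}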

#'Function to plot library size 
#'@description This function creates a plot summarizing the library
#'sizes of the samples in the microbiome dataset.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param imgName Character, input the name
#'of the plot.
#'@param format Character, input the preferred
#'format of the plot. By default it is set to "png".
#'@param dpi Numeric, input the dots per inch. By default
#'it is set to 72.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export

PlotLibSizeView <- function(mbSetObj, imgName, format="png", dpi=72){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  data.proc <- readRDS("data.proc");
  data_bef <- data.matrix(data.proc);
  smpl.sums <- colSums(data_bef);
  names(smpl.sums) <- colnames(data_bef);
  smpl.sums <- sort(smpl.sums);
  smpl.sums <- rev(smpl.sums);
  vip.nms <- names(smpl.sums);
  names(smpl.sums) <- NULL;
  vip.nms <- substr(vip.nms, 1, 16);
  
  myH <- ncol(data_bef)*25 + 50;
  imgName = paste(imgName,".", format, sep="");
  mbSetObj$imgSet$lib.size<-imgName;
  Cairo::Cairo(file=imgName, width=580, height=myH, type=format, bg="white",dpi=dpi);
  xlim.ext <- GetExtendRange(smpl.sums, 10);
  par(mar=c(4,7,2,2));
  dotchart(smpl.sums, col="forestgreen", xlim=xlim.ext, pch=19, xlab="Read Counts", main="Library Size Overview");
  mtext(side=2, at=1:length(vip.nms), vip.nms, las=2, line=1)
  text(x=smpl.sums,y=1:length(smpl.sums),labels= round(smpl.sums), col="blue", pos=4, xpd=T);
  dev.off();
  
  return(.set.mbSetObj(mbSetObj))
}
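
# Usage sketch (hypothetical image name). Note that the function appends the format
# extension itself, so only the base name is passed. Wrapped in if(FALSE) so sourcing
# this file has no side effects.
if(FALSE){
  mbSet <- PlotLibSizeView(mbSet, imgName = "library_size", format = "png", dpi = 72);
}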

################################################
###########Phyloseq object creation#############
################################################

#'Function to recreate phyloseq object
#'@description This function recreates the phyloseq object.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param type Input the format of the uploaded files. "text" for 
#'.txt or .csv files, "biom" for .biom, and "mothur" for mothur files.
#'@param taxa_type Input the taxonomy labels. "SILVA" for SILVA taxonomy,
#'"Greengenes" for Greengenes taxonomy, "QIIME" for QIIME taxonomy,
#'"GreengenesID" for Greengenes OTU IDs, and "Others/Not_specific" for others.
#'@param taxalabel Character, "T" if taxonomy labels are already included in
#'the OTU table and "F" if not.
#'@author Jeff Xia \email{[email protected]}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import phyloseq
CreatePhyloseqObj<-function(mbSetObj, type, taxa_type, taxalabel){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  load_phyloseq();
  load_splitstackshape();
  
  data.proc <- readRDS("data.proc");
  
  # do some sanity check here on sample and feature names
  smpl.nms <- colnames(data.proc);
  taxa.nms <- row.names(data.proc);
  
  # check for uniqueness of dimension names
  if(length(unique(smpl.nms))!=length(smpl.nms)){
    dup.nms <- paste(smpl.nms[duplicated(smpl.nms)], collapse="; ");
    AddErrMsg(paste(c("Duplicate sample names are not allowed:"), dup.nms, collapse=" "));
    return(0);
  }
  
  # check for uniqueness of taxon names for metagenomic data only
  if(mbSetObj$module.type == "sdp"){
    if(length(unique(taxa.nms))!=length(taxa.nms)){
      dup.tax.nms <- paste(taxa.nms[duplicated(taxa.nms)], collapse="; ");
      AddErrMsg(paste(c("Duplicate taxon names are not allowed:"), dup.tax.nms, collapse=" "));
      return(0);
    }
  }else{
    # for ASV data, replace sequences with ASV1, ASV2, ...
    # check if the average name length is > 75
    nm.ln <- sum(nchar(taxa.nms)/length(taxa.nms));
    if(nm.ln > 75){
      mbSetObj$is.ASV <-TRUE;
    }
  }
  
  # now check for special characters in the data labels
  if(sum(is.na(iconv(smpl.nms)))>0){
    na.inx <- is.na(iconv(smpl.nms));
    nms <- paste(smpl.nms[na.inx], collapse="; ");
    AddErrMsg(paste("No special letters (i.e. Latin, Greek) are allowed in sample names:", nms, collapse=" "));
    return(0);
  }
  
  var.nms <- rownames(data.proc);
  
  if(sum(is.na(iconv(var.nms)))>0){
    na.inx <- is.na(iconv(var.nms));
    nms <- paste(var.nms[na.inx], collapse="; ");
    AddErrMsg(paste("No special letters (i.e. Latin, Greek) are allowed in feature or taxa names:", nms, collapse=" "));
    return(0);
  }
  
  #standard name to be used
  classi.lvl<- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species");
  
  if(mbSetObj$module.type == "mdp" | mbSetObj$module.type == "ppd"){
    if(type=="text"){
      # prepare data for phyloseq visualization.
      # if feature names are present in a specific taxonomy format (Greengenes or SILVA).
      if(taxalabel=="T"){ #taxa labels present and need to be parsed in OTU table
        feat_nm <- rownames(data.proc);
        mbSetObj$dataSet$feat_nm <- feat_nm;
        
        if(taxa_type=="SILVA"){
          
          load_splitstackshape();
          
          feat_nm<-data.frame(mbSetObj$dataSet$feat_nm);
          names(feat_nm)<-"Rank";
          taxonomy<-splitstackshape::cSplit(feat_nm,"Rank",";");
          taxmat= data.frame(matrix(NA, ncol = 7, nrow = nrow(taxonomy)));
          colnames(taxmat) <- classi.lvl;
          taxmat[,1:ncol(taxonomy)]<-taxonomy;
          taxmat<-taxmat[colSums(!is.na(taxmat))>0];
          taxmat<-as.matrix(taxmat);
          rownames(taxmat)<-c(1:nrow(taxmat));
          #phyloseq taxonomy object
          taxa_table <- tax_table(taxmat);
          taxa_names(taxa_table)<-rownames(taxmat);
        }else if(taxa_type=="Greengenes"||taxa_type=="QIIME"){
          my.parser<-lapply(feat_nm, parse_taxonomy_qiime); # note: this function removes empty strings and replaces them with NA
          taxa_table<-build_tax_table(my.parser);
          
          if(taxa_type=="Greengenes"){
            # additional columns (Rank1, Rank2, ...) may be added and have to be removed
            indx<-grep("^Rank",colnames(taxa_table));
            # need to test whether any columns are left (users could have specified the wrong taxonomy format)
            if(ncol(taxa_table) == length(indx)){
              AddErrMsg("Error with parsing Greengenes Taxonomy Labels. Please make sure the taxonomy label was specified correctly during data upload.");
              return(0);
            }
            if(length(indx) > 0){
              taxa_table<-taxa_table[,-indx];
            }
            # if all taxonomy ranks of a taxon are NA after parse_taxonomy_qiime, convert to Not_Assigned
            ind<-which(apply(taxa_table,1, function(x)all(is.na(x)))==TRUE);
            taxa_table[ind,]<-rep("Not_Assigned",ncol(taxa_table));
          }
          taxa_names(taxa_table)<-c(1:nrow(taxa_table));
        }else if(taxa_type=="GreengenesID"||taxa_type=="Others/Not_specific"){
          
          # need to parse Taxonomy still!
          load_splitstackshape();               
          
          feat_nm<-data.frame(mbSetObj$dataSet$feat_nm);
          names(feat_nm)<-"Rank";
          taxonomy<-splitstackshape::cSplit(feat_nm,"Rank",";");
          taxmat= data.frame(matrix(NA, ncol = 7, nrow = nrow(taxonomy)));
          colnames(taxmat) <- classi.lvl;
          taxmat[,1:ncol(taxonomy)]<-taxonomy;
          taxmat<-taxmat[colSums(!is.na(taxmat))>0];
          taxmat<-as.matrix(taxmat);
          rownames(taxmat)<-c(1:nrow(taxmat));
          #phyloseq taxonomy object
          taxa_table <- tax_table(taxmat);
          taxa_names(taxa_table)<-rownames(taxmat);
        }
        
        # make a unique ID for each OTU from the lowest taxonomy level present (made unique with a numeric suffix)
        tmat <- as(tax_table(taxa_table), "matrix")
        rowLn <- nrow(tmat);
        colLn <- ncol(tmat);
        num.mat <- matrix(0, nrow=rowLn, ncol=colLn);
        
        for(i in 1:colLn){
          num.mat[,i] <- rep(i, rowLn);
        }
        
        na.inx <- is.na(tmat);
        num.mat[na.inx] <- 0;
        maxInx <- apply(num.mat, 1, max);
        gd.coords <- cbind(1:rowLn, maxInx);
        gd.nms <- tmat[gd.coords];
        mynames <- make.unique(gd.nms, sep="_");
        taxa_names(taxa_table)<-mynames;
        feat_nm<-NULL;
        mbSetObj$dataSet$taxa_table<-taxa_table;
      }else{
        # sanity check: feature names in the taxonomy table should match feature names in the OTU abundance table;
        # only features present in the filtered OTU table are kept, the rest are removed.
        taxa_table <- mbSetObj$dataSet$taxa_table;
        indx<-match(rownames(data.proc), rownames(taxa_table));
        
        if(sum(is.na(indx)) > 0){
          na.nms <- rownames(data.proc)[is.na(indx)];
          if(length(na.nms)>10){
            na.nms <- na.nms[1:10];
          }
          AddErrMsg(paste("The following names cannot be found in your taxanomy table (showing 10 max):", paste(na.nms, collapse="; ")));
          return(0);
        }
        
        # let's address ASV sequence ID here
        if(mbSetObj$is.ASV){
          # do the conversion and save 
          new.nms <- paste("ASV", 1:length(taxa.nms), sep="_");
          master.nms <- cbind("NewID"=new.nms, "OriginalID"=taxa.nms);
          write.csv(master.nms, file="ASV_ID_mapping.csv");
          
          # update the taxa and sample
          names(new.nms) <- taxa.nms;
          rownames(data.proc) <- new.nms[rownames(data.proc)];
          rownames(taxa_table) <- new.nms[rownames(taxa_table)];
          
          # update tree file if uploaded
          if(mbSetObj$tree.uploaded){
            pg_tree <- readRDS("tree.RDS");
            pg_tree$tip.label <- new.nms[pg_tree$tip.label];
            saveRDS(pg_tree, "tree.RDS");
          }
        }
        
        nm<-colnames(taxa_table);
        taxa_table<-as.matrix(taxa_table[indx,]);
        colnames(taxa_table)<-nm;
        # convert the taxonomy file (data frame) to a phyloseq taxonomy object, keeping the taxonomy names as is
        mbSetObj$dataSet$taxa_table<-tax_table(taxa_table);
        taxa_nms <- rownames(taxa_table)
        
        if(length(unique(taxa_nms))!=length(taxa_nms)){
          dup.tax.nms <- paste(taxa_nms[duplicated(taxa_nms)], collapse="; ");
          AddErrMsg(paste(c("Duplicate taxon names are not allowed:"), dup.tax.nms, collapse=" "));
          return(0);
        }
        taxa_names(mbSetObj$dataSet$taxa_table)<-rownames(taxa_table);
      }
      
      # creating phyloseq object
      data.proc<-apply(data.proc,2,as.integer);
      data.proc<-otu_table(data.proc,taxa_are_rows =TRUE);
      taxa_names(data.proc)<-taxa_names(mbSetObj$dataSet$taxa_table);
      # removing the constant Root column if present and making standard rank names
      ind<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("Root", "root")));
      
      if(length(ind)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
        #dataSet$taxa_table<-dataSet$taxa_table #newnew
      }
      
      # removing first KINGDOM OR DOMAIN column
      indx<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("k__Fungi", "Fungi", "k__Viruses", "Viruses",
                                                                                 "Bacteria", "Archaea", "k__Bacteria", "k__Archea", 
                                                                                 "D_0__Bacteria", "D_0__Archea")));
      
      if(length(indx)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
      }
      
      classi.lvl<- c("Phylum", "Class", "Order", "Family", "Genus", "Species");
      colnames(mbSetObj$dataSet$taxa_table)<-classi.lvl[1:ncol(mbSetObj$dataSet$taxa_table)]; 
      #sanity check: the number of samples in the abundance and metadata tables should match.
      indx<-match(colnames(data.proc), rownames(mbSetObj$dataSet$sample_data));
      
      if(all(is.na(indx))){
        AddErrMsg("Please make sure that sample names and their number are same in metadata and OTU abundance files.");
        return(0);
      }else{
        mbSetObj$dataSet$sample_data<-mbSetObj$dataSet$sample_data[indx, ,drop=FALSE];
      }
      
      mbSetObj$dataSet$sample_data<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
      # clean up the names: delete [ ] if present and substitute spaces, '.', '/', '-' with underscores (_)
      mbSetObj$dataSet$taxa_table<- gsub("[[:space:]./_-]", "_",mbSetObj$dataSet$taxa_table);
      mbSetObj$dataSet$taxa_table<- gsub("\\[|\\]","",mbSetObj$dataSet$taxa_table);
      
      # after removing such special characters, row names may become non-unique, so make them unique
      mynames1<- gsub("[[:space:]./_-]", "_",taxa_names(mbSetObj$dataSet$taxa_table));
      mynames1<-gsub("\\[|\\]","",mynames1);
      
      if(length(unique(mynames1))< length(taxa_names(mbSetObj$dataSet$taxa_table))){
        mynames1 <- make.unique(mynames1, sep="_");
      }
      
      #after cleaning, names might now be NA
      missing.inx <- which(is.na(mynames1))
      missing_vector <- paste0("New_OTU", seq_along(missing.inx))
      mynames1[missing.inx] <- missing_vector
      
    } else if(type == "biom"||type == "mothur"){
      # creating phyloseq object
      feat_nm<-rownames(data.proc);
      data.proc<-apply(data.proc,2,as.integer);
      data.proc<-otu_table(data.proc,taxa_are_rows =TRUE);
      taxa_names(data.proc)<-feat_nm;
      # sanity check: feature names in the taxonomy table should match feature names in the OTU abundance table; only features present in the filtered OTU table are kept, the rest are removed.
      indx<-match(taxa_names(data.proc),taxa_names(mbSetObj$dataSet$taxa_table));
      
      if(all(is.na(indx))){
        AddErrMsg("Please make sure that features name of the taxonomy table match the feature names in the OTU abundance table.");
        return(0);
      }
      
      mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[indx,];
      
      # removing the constant Root column (if present) from the taxonomy table; the kingdom/domain column is removed next
      ind<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("Root", "root")));
      
      if(length(ind)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
        #dataSet$taxa_table<-dataSet$taxa_table #newnew
      }
      
      # removing first KINGDOM OR DOMAIN column
      indx<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("k__Fungi", "Fungi", "k__Viruses", "Viruses", 
                                                                                 "Bacteria","Archaea","k__Bacteria","k__Archea", 
                                                                                 "D_0__Bacteria","D_0__Archea")));
      
      if(length(indx)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
      }
      
      classi.lvl<- c("Phylum", "Class", "Order", "Family", "Genus", "Species"); 
      colnames(mbSetObj$dataSet$taxa_table)<-classi.lvl[1:ncol(mbSetObj$dataSet$taxa_table)];
      #sanity check: the number of samples in the abundance and metadata tables should match.
      indx<-match(colnames(data.proc), rownames(mbSetObj$dataSet$sample_data));
      
      if(all(is.na(indx))){
        AddErrMsg("Please make sure that sample names and their number are the same in metadata and OTU abundance files.");
        return(0);
      }else{
        mbSetObj$dataSet$sample_data<-mbSetObj$dataSet$sample_data[indx, ,drop=FALSE];
      }
      
      # clean up the names: delete [ ] if present and substitute spaces, '.', '/', '-' with underscores (_)
      mbSetObj$dataSet$taxa_table <- gsub("[[:space:]./_-]", "_",mbSetObj$dataSet$taxa_table);
      mbSetObj$dataSet$taxa_table<- gsub("\\[|\\]","",mbSetObj$dataSet$taxa_table);
      
      # after removing such special characters, row names may become non-unique, so make them unique
      mynames1 <- gsub("[[:space:]./_-]", "_",taxa_names(mbSetObj$dataSet$taxa_table));
      mynames1<-gsub("\\[|\\]","",mynames1);
      
      if(length(unique(mynames1))< length(taxa_names(mbSetObj$dataSet$taxa_table))){
        mynames1 <- make.unique(mynames1, sep="_");
      }
      
      #after cleaning, names might now be NA
      missing.inx <- which(is.na(mynames1))
      missing_vector <- paste0("New_OTU", seq_along(missing.inx))
      mynames1[missing.inx] <- missing_vector
    }
    
    taxa_names(mbSetObj$dataSet$taxa_table)<-taxa_names(data.proc)<-mynames1;
    mbSetObj$dataSet$sample_data<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL = TRUE);
    mbSetObj$dataSet$proc.phyobj <- merge_phyloseq(data.proc, mbSetObj$dataSet$sample_data, mbSetObj$dataSet$taxa_table);
    
    if(length(rank_names(mbSetObj$dataSet$proc.phyobj)) > 7){
      tax_table(mbSetObj$dataSet$proc.phyobj) <- tax_table(mbSetObj$dataSet$proc.phyobj)[, 1:7]
    }
    
    #also using unique names for our further data
    prefilt.data <- readRDS("data.prefilt");
    rownames(prefilt.data)<-taxa_names(mbSetObj$dataSet$taxa_table);
    saveRDS(prefilt.data, file="data.prefilt")
    
  }else if(mbSetObj$module.type == "sdp"){
    #constructing phyloseq object for alpha diversity.
    data.proc<-otu_table(data.proc, taxa_are_rows = TRUE);
    taxa_names(data.proc)<-rownames(data.proc);
    
    #sanity check: sample names of both abundance and metadata should match.
    smpl_var<-colnames(mbSetObj$dataSet$sample_data);
    indx<-match(colnames(data.proc), rownames(mbSetObj$dataSet$sample_data));
    
    if(all(is.na(indx))){
      AddErrMsg("Please make sure that sample names are matched between both files.");
      return(0);
    }
    mbSetObj$dataSet$sample_data<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
    mbSetObj$dataSet$proc.phyobj <- merge_phyloseq(data.proc, mbSetObj$dataSet$sample_data);
  }
  
  saveRDS(mbSetObj$dataSet$proc.phyobj, file="proc.phyobj.orig");
  #do some data cleaning
  mbSetObj$dataSet$data.prefilt <- NULL;
  mbSetObj$dataSet$taxa_type <- taxa_type;
  
  return(.set.mbSetObj(mbSetObj));
}
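
# Usage sketch (hypothetical): plain-text input with Greengenes-style taxonomy labels
# embedded in the OTU table, mirroring the options documented above. Assumes mbSet has
# passed SanityCheckData() and the filtering steps. Wrapped in if(FALSE) so sourcing this
# file has no side effects.
if(FALSE){
  mbSet <- CreatePhyloseqObj(mbSet, type = "text", taxa_type = "Greengenes", taxalabel = "T");
}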

###########################################
## Utility function to perform edgeR norm ##
###########################################
edgeRnorm = function(x,method){
  # Enforce orientation.
  # Add a pseudocount of 1 to every observation (so there are no zeros);
  # this prevents errors inside calcNormFactors without needing to borrow
  # and modify it (and its dependent functions). Whether a smaller
  # pseudocount would also work remains an open question.
  x = x + 1;
  # Now turn into a DGEList
  y = DGEList(counts=x, remove.zeros=TRUE);
  # Perform edgeR-encoded normalization, using the specified method (...)
  z = edgeR::calcNormFactors(y, method=method);
  # A check that we didn't divide by zero inside `calcNormFactors`
  if( !all(is.finite(z$samples$norm.factors)) ){
    AddErrMsg(paste("Something wrong with edgeR::calcNormFactors on this data, non-finite $norm.factors."));
    return(0);
  }
  return(z)
}
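
# Minimal sketch of how edgeRnorm is used above (assumes the edgeR package is installed;
# the toy counts are random). calcNormFactors returns one scaling factor per sample, and
# the returned DGEList keeps the pseudocount-shifted counts. Wrapped in if(FALSE) so
# sourcing this file has no side effects.
if(FALSE){
  library(edgeR);  # provides DGEList/calcNormFactors used inside edgeRnorm
  set.seed(1);
  toy.counts <- matrix(rpois(40, lambda = 20), nrow = 10,
                       dimnames = list(paste0("OTU", 1:10), paste0("S", 1:4)));
  res <- edgeRnorm(toy.counts, method = "TMM");
  res$samples$norm.factors;  # per-sample normalization factors from calcNormFactors
  head(res$counts);          # the +1 shifted count matrix stored in the DGEList
}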

###########################################
##################Getters##################
###########################################

GetSamplNames<-function(){
  prefilt.data <- readRDS("data.prefilt");
  return(colnames(prefilt.data));
}

GetGroupNames<-function(){
  prefilt.data <- readRDS("data.prefilt");
  return(colnames(prefilt.data));
}

GetSampleNamesaftNorm<-function(mbSetObj){
  mbSetObj <- .get.mbSetObj(mbSetObj);
  return(sample_names(mbSetObj$dataSet$norm.phyobj));
}