R/general_proc.R

Defines functions GetSampleNamesaftNorm GetSamplNames SquareRootNorm LogNorm CleanData ReplaceMissingByLoD .replace.by.lod AutoNorm unitAutoScale GetExtendRange scale_mat scale_rows edgeRnorm CreateFakeFile CreatePhyloseqObj PlotDensityView PlotPCAView PlotLibSizeView PlotRareCurve PerformRarefaction GetLibscale PerformMetaboNormalization PerformNormalization UpdateSampleItems ApplyMetaboFilter ApplyVarianceFilter ApplyAbundanceFilter SanityCheckMetData SanityCheckData

Documented in ApplyAbundanceFilter ApplyMetaboFilter ApplyVarianceFilter CleanData CreatePhyloseqObj PerformMetaboNormalization PerformNormalization PerformRarefaction PlotLibSizeView PlotPCAView PlotRareCurve SanityCheckData SanityCheckMetData UpdateSampleItems

##################################################
## R script for MicrobiomeAnalyst
## Description: filtering and normalization functions
## Author: Jeff Xia, jeff.xia@mcgill.ca
###################################################

#######################################
# filter zeros and singletons this is not good enough ==> single occurance
# reads occur in only one sample should be treated as artifacts and removed

#'Main function to sanity check on uploaded microbiome data
#'@description This function performs a sanity check on the uploaded
#'user data. It checks the grouping of samples, if a phylogenetic tree
#'was uploaded.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param filetype Character, "biom" if the uploaded data
#'was .biom format, "mothur" if mothur format, or "txt" if
#'as .txt or .csv format.
#'@param disableFilter Boolean. Set to TRUE to bypass the hard-coded filter
#'to remove features occuring in <2 samples.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
SanityCheckData <- function(mbSetObj, filetype, preFilter = "sample",rmConstant){

  mbSetObj <- .get.mbSetObj(mbSetObj);
  dataName <- mbSetObj$dataSet$name;
  module.type <- mbSetObj$module.type;
  
   feat.sums <- rowSums(mbSetObj$dataSet$data.orig,na.rm = T) ###
   if(length(which(feat.sums<=1))>0){
     singleton <- TRUE
     singleton_length <- length(feat.sums<=1)
   }else{
    singleton <- FALSE
    singleton_length <- 0
  }

  if(preFilter == "sample"){ # sample singleton
    smpl.sums <- apply(mbSetObj$dataSet$data.orig, 1, function(x){sum(x>0, na.rm=T)});
    gd.inx <- smpl.sums > 1; #  occur in at least 2 samples
    if(length(which(gd.inx=="TRUE"))==0){
      AddErrMsg("Reads occur in only one sample.  All these are considered as artifacts and have been removed from data. No data left after such processing.");
      return(0);
    }
    data.proc <- mbSetObj$dataSet$data.orig[gd.inx, ]; 
    print("sample singleton performed");

  }else if(preFilter == "count"){ # global singleton
     gd.inx <- feat.sums > 1; # not singleton
    if(length(which(gd.inx=="TRUE"))==0){
      AddErrMsg("No data left after removing the singleton.");
      return(0);
    }
    data.proc <- mbSetObj$dataSet$data.orig[gd.inx, ]; 
    print("global singleton performed");

  }else{ # none performed
    data.proc <- mbSetObj$dataSet$data.orig;
    print("none filtering performed");
  }
  
 if(rmConstant=="true"){

   # filtering the constant features here
   # check for columns with all constant (var=0)
   varCol <- apply(data.proc, 1, var, na.rm=T);
   constCol <- varCol == 0 | is.na(varCol);
  # making copy of data.proc and proc.phyobj(phyloseq)
   data.proc <- data.proc[!constCol, ];
   if(length(data.proc)==0){
    AddErrMsg("All features are found to be constant and have been removed from data. No data left after such processing.");
    return(0);
  }
  }

  ##### Excluding samples that contain only zero values
  
  smp.sums <- apply(mbSetObj$dataSet$data.orig, 2, function(x){sum(x>0, na.rm=T)});

 if(!all(smp.sums)>0 ){
  kp.inx <- smp.sums > 0;  
  if(length(which(kp.inx=="TRUE"))==0){
    AddErrMsg("Every sample exclusively comprises zero values!");
    return(0);
   }
  data.proc <- data.proc[, kp.inx]; 
 }
  
  saveDataQs(data.proc, "data.proc.orig", module.type, dataName);
  saveDataQs(data.proc, "data.prefilt", module.type, dataName);
  
  if(module.type == "meta"){
    sanCheck <- .checkSampleConsistencyMeta(mbSetObj,dataName);
    if(sanCheck == 0){
      return(0);
    }
  }
  
  # now get stats
  taxa_no <- nrow(mbSetObj$dataSet$data.orig);
  
  # from abundance table
  sample_no <- ncol(mbSetObj$dataSet$data.orig);
  
  if(any(c(filetype=="biom", filetype=="mothur"))){
    samplemeta_no <- nrow(mbSetObj$dataSet$sample_data);
  }else{
    samplemeta_no <- sample_no;
  }
  
  if(!mbSetObj$module.type %in% c("ppd") ){
    mbSetObj$dataSet$sample_data <- mbSetObj$dataSet$sample_data[sapply(mbSetObj$dataSet$sample_data, function(col) length(unique(col))) > 1];
  }
  
  if(ncol(mbSetObj$dataSet$sample_data)==0){
    AddErrMsg("No sample variable have more than one group. Please provide variables with at least two groups.");
    return(0);
  }
  
  #converting all sample variables to factor type
  character_vars <- sapply(mbSetObj$dataSet$sample_data, is.character);
  
  if(any(character_vars)=="TRUE"){
    mbSetObj$dataSet$sample_data[, character_vars] <- sapply(mbSetObj$dataSet$sample_data[, character_vars], as.factor);
  }
  
  num_vars <- sapply(mbSetObj$dataSet$sample_data, is.numeric);
  
  if(any(num_vars)=="TRUE"){
    mbSetObj$dataSet$sample_data[, num_vars] <- sapply(mbSetObj$dataSet$sample_data[, num_vars],as.factor);
  }
  
  if(file.exists("tree.qs")){
    tree_exist <- 1
    tree<-qs::qread("tree.qs")  
    
    if(length(intersect(rownames(data.proc),tree$tip.label))==nrow(data.proc)){
      tree_tip <- 1
    }else{
      tree_tip <- 0
      AddErrMsg("The tip labels of the tree are not matched with your feature names in the abundance table!");
      
    }
  } else {
    tree_exist <- 0
    tree_tip <- 0
  }
  if(identical(sort(row.names(mbSetObj$dataSet$sample_data)),
               sort(colnames(data.proc)))){
    samname_same <- 1;
  } else {
    samname_same <- 0;
  }
  
  sample_no_in_outfile <- ncol(data.proc);
  samname_same_number <- sum(row.names(mbSetObj$dataSet$sample_data) %in% colnames(data.proc));
  if(samname_same_number){
    mis.num <- length(which(!row.names(mbSetObj$dataSet$sample_data) %in% colnames(data.proc)))
    if(mis.num==1){
      current.msg <<- paste0("One sample name was not included in the OTU/ASV abundance table.");
      
    }else{
      current.msg <<- paste0("A total of ",mis.num," sample names were not included in the OTU/ASV abundance table.");
      
    }
    
  }
  
  # now store data.orig to RDS
  saveDataQs(mbSetObj$dataSet$data.orig, "data.orig", module.type, dataName);
  saveDataQs(data.proc,"data.proc", module.type, dataName);
  saveDataQs(mbSetObj$dataSet$sample_data, "data.sample_data", module.type, dataName);
  
  mbSetObj$dataSet$tree <- tree_exist
  
  vari_no <- ncol(mbSetObj$dataSet$sample_data);
  disc_no <- sum(mbSetObj$dataSet$meta_info$disc.inx);
  cont_no <- sum(mbSetObj$dataSet$meta_info$cont.inx);
  vari_dup_no <- disc_no + cont_no
  smpl.sums <- apply(data.proc, 2, sum);
  tot_size <- sum(smpl.sums);
  smin <- min(smpl.sums)
  smean <- mean(smpl.sums)
  smax <- max(smpl.sums);
  gd_feat <- nrow(data.proc);

  if(exists("current.proc")){
    current.proc$mic$data.proc<<-data.proc
  }
  
  if(.on.public.web){
    .set.mbSetObj(mbSetObj)
    return(c(1,taxa_no,sample_no,vari_no, smin,smean,smax,gd_feat,samplemeta_no,tot_size, tree_exist, samname_same, samname_same_number, sample_no_in_outfile, disc_no, cont_no,vari_dup_no,tree_tip,singleton,singleton_length));
  }else{
    print("Sanity check passed!")
    
    return(.set.mbSetObj(mbSetObj))
  }
}

#'Function to sanity check on uploaded metabolomics data
#'@description This function performs a sanity check on the uploaded
#'user data. It checks the grouping of samples, if a phylogenetic tree
#'was uploaded.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param disableFilter Booleanmetabo. Set to TRUE to bypass the hard-coded filter
#'to remove features occuring in <2 samples.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export


SanityCheckMetData <- function(mbSetObj,isNormMetInput, disableFilter = FALSE){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  feat.sums <- apply(mbSetObj$dataSet$metabolomics$data.orig, 1, function(x){sum(x>0, na.rm=T)});
 
  if(mbSetObj$dataSet$metabolomics$feature.type=="peak" &  any(grepl("[a-z]|[A-Z]|@",mbSetObj$dataSet$metabolomics$comp_metnm))){
   AddErrMsg("The peak format is not correct! Please use Generic Format if the data is a peak table. Note that if retention times are included, they must
                                be formatted with two underscores between the m/z and retention time. ");
      return(0);
  }
  if(disableFilter){
    data.proc <-mbSetObj$dataSet$metabolomics$data.orig
  }else{
    gd.inx <- feat.sums > 1; # occur in at least 2 samples
    if(length(which(gd.inx=="TRUE"))==0){
      AddErrMsg("Reads occur in only one sample.  All these are considered as artifacts and have been removed from data. No data left after such processing.");
      return(0);
    }
    data.proc <- mbSetObj$dataSet$metabolomics$data.orig[gd.inx, ]; 
  }
  
  # filtering the constant features here
  # check for columns with all constant (var=0)
  varCol <- apply(data.proc, 1, var, na.rm=T);
  constCol <- varCol == 0 | is.na(varCol);
  
  # making copy of data.proc
  data.proc <- data.proc[!constCol, ];
  
  if(length(data.proc)==0){
    AddErrMsg("All features are found to be constant and have been removed from data. No data left after such processing.");
    return(0);
  }
  
  # check zero, NA values
  totalCount <- nrow(data.proc)*ncol(data.proc);
  naCount <- sum(is.na(data.proc));
  naPercent <- round(100*naCount/totalCount,1)
  mbSetObj$dataSet$metabolomics$missingCount <- naCount; 
  msg<-paste("A total of ", naCount, " (", naPercent, "%) missing values were detected.", sep="");
  
  
  #Reset to default
  mbSetObj$dataSet$metabolomics$data.filt<-  mbSetObj$dataSet$metabolomics$edit <- NULL;
  
  # replace zero and missing values using Detection Limit for each variable 
  int.mat <- t(ReplaceMissingByLoD(t(data.proc)));   ##notice that samples in column and features in row
  
  mbSetObj$dataSet$metabolomics$proc.feat.num <- ncol(int.mat);
  
  msg<-c(msg, paste("Zero or missing values were replaced by 1/5 of the min positive value for each variable."));
  
  
  mbSetObj$dataSet$metabolomics$check.msg <- msg;
  mbSetObj$dataSet$metabolomics$isNormInput <- isNormMetInput;
  samname_same_number <- sum(row.names(mbSetObj$dataSet$sample_data) %in% colnames(data.proc));
  # now store data.orig to RDS
  qs::qsave(mbSetObj$dataSet$metabolomics$data.orig, file="metabo.data.orig.qs");
  qs::qsave(int.mat, file="metabo.data.init.qs");
  current.proc$met$data.proc<<-int.mat
  
  if(.on.public.web){
    .set.mbSetObj(mbSetObj)
    return(c(1,length(feat.sums),nrow(data.proc),naPercent,samname_same_number));
  }else{
    print("Sanity check passed!")
    
    return(.set.mbSetObj(mbSetObj))
  }
}


#'Function to filter uploaded data
#'@description This function filters data based on low counts in high percentage samples.
#'Note, first is abundance, followed by variance.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param filt.opt Character, input the low count filter option. "prevalence" to
#'filter based on prevalence in samples, "mean" to filter by the mean abundance value, and
#'"median" to filter by median abundance value.
#'@param count Numeric, input the minimum count. Set to 0 to disable the low count filter.
#'@param smpl.perc Numeric, input the percentage of samples for which to filter low counts.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export

ApplyAbundanceFilter <- function(mbSetObj, filt.opt, count, smpl.perc){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  dataName <- mbSetObj$dataSet$name;
  module.type <- mbSetObj$module.type;
  data <- readDataQs("data.prefilt", module.type, dataName);
  #data <- qs::qread("data.prefilt");
  
  #this data is used for sample categorial comparision further
  rmn_feat <- nrow(data);
  
  mbSetObj$dataSet$ab.filtered <- FALSE
  
  if(count==0){# no low-count filtering
    kept.inx <- rep(TRUE, rmn_feat);
  }else{
    mbSetObj$dataSet$ab.filtered <- TRUE
    if(filt.opt == "prevalence"){
      rmn_feat <- nrow(data);
      minLen <- smpl.perc*ncol(data);
      kept.inx <- apply(data, MARGIN = 1,function(x) {sum(x >= count) >= minLen});
    }else if (filt.opt == "mean"){
      filter.val <- apply(data, 1, mean, na.rm=T);
      kept.inx <- filter.val >= count;
    }else if (filt.opt == "median"){
      filter.val <- apply(data, 1, median, na.rm=T);
      kept.inx <- filter.val >= count;
    }
  }
  
  if(sum(kept.inx)==0){
    AddErrMsg(paste0("No samples have counts less than ", count, "! If your data has already been normalized, please turn off count filter (0)."))
    return(0)
  }
  
  mbSetObj$dataSet$filt.data <- data[kept.inx, ];
  
  if(exists("current.proc")){
    
    current.proc$mic$data.proc<<-mbSetObj$dataSet$filt.data
    
  }
  
  
  saveDataQs(mbSetObj$dataSet$filt.data, "filt.data.orig", module.type, dataName);
  #qs::qsave(mbSetObj$dataSet$filt.data, file="filt.data.orig"); # save an copy
  current.msg <<- paste("A total of ", sum(!kept.inx), " low abundance features were removed based on ", filt.opt, ".", sep="");
  
  return(.set.mbSetObj(mbSetObj));
}

#'Function to filter uploaded data
#'@description This function filters data based on low abundace or variance.
#'Note, this is applied after abundance filter.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param filtopt Character, input the low variance filter option. "iqr" for 
#'inter-quantile range, "sd" for standard deviation, and "cov" for coefficient of variation.
#'@param filtPerct Numeric, input the percentage cutoff for low variance. 
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export

ApplyVarianceFilter <- function(mbSetObj, filtopt, filtPerct){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  dataName <- mbSetObj$dataSet$name;
  module.type <- mbSetObj$module.type;
  data <- mbSetObj$dataSet$filt.data;
  
  rmn_feat <- nrow(data);
  
  filter.val <- nm <- NULL;
  
  mbSetObj$dataSet$var.filtered <- FALSE
  
  if(filtPerct==0){# no low-count filtering
    remain <- rep(TRUE, rmn_feat);
  }else{
    mbSetObj$dataSet$var.filtered <- TRUE
    if (filtopt == "iqr"){
      filter.val <- apply(data, 1, IQR, na.rm=T);
      nm <- "IQR";
    }else if (filtopt == "sd"){
      filter.val <- apply(data, 1, sd, na.rm=T);
      nm <- "standard deviation";
    }else if (filtopt == "cov"){
      sds <- apply(data, 1, sd, na.rm=T);
      mns <- apply(data, 1, mean, na.rm=T);
      filter.val <- abs(sds/mns);
      nm <- "Coeffecient of variation";
    }
    # get the rank
    rk <- rank(-filter.val, ties.method='random');
    var.num <- nrow(data);
    remain <- rk < var.num*(1-filtPerct);
  }
  
  data <- data[remain,];
  
  if(nrow(data) == 0){
    AddErrMsg("No samples remaining after variance filtering!")
    return(0)
  }
  
  mbSetObj$dataSet$filt.data <- data;
  if(exists("current.proc")){ 
    current.proc$mic$data.proc<<-data
  }
  
  #qs::qsave(mbSetObj$dataSet$filt.data, file="filt.data.orig"); # save an copy
  saveDataQs(mbSetObj$dataSet$filt.data, "filt.data.orig", module.type, dataName);

  rm.msg1 <- paste0("A total of ", sum(!remain), " low variance features were removed based on ", filtopt);
  rm.msg2 <- paste0("The number of features remains after the data filtering step: ", nrow(data));
  current.msg <<- c(current.msg, rm.msg1, rm.msg2);

  rm.msg1 <- paste0("A total of ```", sum(!remain), "``` low variance features were removed based on ```", filtopt, "```.");
  rm.msg2 <- paste0("The number of features remains after the data filtering step: ```", nrow(data), "```.");
  mbSetObj$dataSet$filt.msg <- c(current.msg, rm.msg1, rm.msg2);
  return(.set.mbSetObj(mbSetObj));
  
}

#'Function to filter uploaded metabolomic data
#'@description .
#'Note, this is applied after abundance filter.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param filtopt Character, input the low variance filter option. "iqr" for 
#'inter-quantile range, "sd" for standard deviation, and "cov" for coefficient of variation.
#'@param filtPerct Numeric, input the percentage cutoff for low variance. 
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export

ApplyMetaboFilter <- function(mbSetObj=NA, filter,  rsd){
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  data <- mbSetObj$dataSet$metabolomics$data.orig;
  int.mat <- t(data) ;
  
  
  feat.num <- ncol(int.mat);
  feat.nms <- colnames(int.mat);
  nm <- NULL;
  msg <- "";
  if(all(c(filter == "none", feat.num < 5000))) { # only allow for less than 4000
    remain <- rep(TRUE, feat.num);
    msg <- paste(msg, "No filtering was applied");
  }else {
    if (filter == "rsd"){
      sds <- apply(int.mat, 2, sd, na.rm=T);
      mns <- apply(int.mat, 2, mean, na.rm=T);
      filter.val <- abs(sds/mns);
      nm <- "Relative standard deviation";
    }else if (filter == "nrsd" ){
      mads <- apply(int.mat, 2, mad, na.rm=T);
      meds <- apply(int.mat, 2, median, na.rm=T);
      filter.val <- abs(mads/meds);
      nm <- "Non-paramatric relative standard deviation";
    }else if (filter == "mean"){
      filter.val <- apply(int.mat, 2, mean, na.rm=T);
      nm <- "mean";
    }else if (filter == "sd"){
      filter.val <- apply(int.mat, 2, sd, na.rm=T);
      nm <- "standard deviation";
    }else if (filter == "mad"){
      filter.val <- apply(int.mat, 2, mad, na.rm=T);
      nm <- "Median absolute deviation";
    }else if (filter == "median"){
      filter.val <- apply(int.mat, 2, median, na.rm=T);
      nm <- "median";
    }else{ # iqr
      filter.val <- apply(int.mat, 2, IQR, na.rm=T);
      nm <- "Interquantile Range";
    }
    
    # get the rank of the filtered variables
    rk <- rank(-filter.val, ties.method='random');
    
    
    if(feat.num < 250){ # reduce 5%
      remain <- rk < feat.num*0.95;
      msg <- paste(msg, "Further feature filtering based on", nm,". A total of ",sum(!remain), "features were removed.");
    }else if(feat.num < 500){ # reduce 10%
      remain <- rk < feat.num*0.9;
      msg <- paste(msg, "Further feature filtering based on", nm,". A total of ",sum(!remain), "features were removed.");
    }else if(feat.num < 1000){ # reduce 25%
      remain <- rk < feat.num*0.75;
      msg <- paste(msg, "Further feature filtering based on", nm,". A total of ",sum(!remain), "features were removed.");
    }else{ # reduce 40%, if still over 5000, then only use top 5000
      remain <- rk < feat.num*0.6;
      msg <- paste(msg, "Further feature filtering based on", nm);
      
      if(sum(remain) > 5000){
        remain <- rk < 5000;
        msg <- paste(msg, paste("Reduced to", 5000, "features based on", nm,""));
      }
    }
    
  }
  
  # save a copy for user 
  #fast.write.csv(cbind(filter=filter.val, t(int.mat)), file=paste0("data_prefilter_", filter, ".csv"));
  
  
  filt.res=int.mat[, remain]
  mbSetObj$dataSet$metabolomics$filt.data <- t(filt.res); 
  current.proc$met$data.proc<<-t(filt.res)
  qs::qsave(mbSetObj$dataSet$metabolomics$filt.data, file="metabo.filt.data"); # save an copy
  current.msg <<- msg
  mbSetObj$dataSet$metabolomics$filt.msg <- current.msg;
  return(.set.mbSetObj(mbSetObj));
  
}


#'Function to update samples
#'@description This function prunes samples to be included 
#'for downstream analysis.
#'@param mbSetObj Input the name of the mbSetObj.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
UpdateSampleItems <- function(mbSetObj){
  
  load_phyloseq();
  mbSetObj <- .get.mbSetObj(mbSetObj);
  dataName <- mbSetObj$dataSet$name;
  module.type <- mbSetObj$module.type;
  
  if(!exists("smpl.nm.vec")){
    AddErrMsg("Cannot find the current sample names!");
    return(0);
  }
  
  # read from saved original copy
  data.proc.orig  <- qs::qread("data.proc.orig");
  proc.phyobj.orig <- qs::qread("proc.phyobj.orig");
  hit.inx <- colnames(data.proc.orig) %in% smpl.nm.vec;
  
  #preserving the rownames
  # read from saved prefilt copy
  prefilt.data <- readDataQs("data.prefilt", module.type, dataName);
  tax_nm <- rownames(prefilt.data);
  data.proc <- readDataQs("data.proc", module.type, dataName);
  modi_data <- data.proc[,hit.inx];
  prefilt.data <- modi_data;
  
  #doesn't need to change the names object
  rownames(prefilt.data) <- tax_nm;
  #getting unmatched sample names
  unmhit.indx <- which(hit.inx==FALSE);
  allnm <- colnames(data.proc.orig);
  mbSetObj$dataSet$remsam <- allnm[unmhit.indx];
  
  mbSetObj$dataSet$proc.phyobj <- prune_samples(colnames(prefilt.data),proc.phyobj.orig);
  qs::qsave(prefilt.data, file="data.prefilt")
  current.msg <<- "Successfully updated the sample items!";
  
  # need to update metadata info after removing samples
  my.meta <- sample_data(mbSetObj$dataSet$proc.phyobj)
  disc.inx <- GetDiscreteInx(my.meta);
  
  if(sum(disc.inx) == 0){
    mbSetObj$poor.replicate <- TRUE;
  }else{
    mbSetObj$poor.replicate <- FALSE;
  }
  
  mbSetObj$dataSet$sample_data <- my.meta
  
  return(.set.mbSetObj(mbSetObj));
  
}


#'Function to perform normalization
#'@description This function performs normalization on the uploaded
#'data.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param rare.opt Character, "rarewi" to rarefy the data and 
#'"none" to not.
#'@param scale.opt Character, input the name of the data scaling option.
#'"colsum" for total sum scaling, "CSS" for cumulative sum scaling,
#' "upperquartile" for upper-quartile normalization, and "none" to none.
#'@param transform.opt Character, input the name of the data transformation
#'to be applied. "rle" for relative log expression, "TMM" for trimmed mean of 
#'M-values, "clr" for centered log ratio, and "none" for none.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import edgeR
#'@import metagenomeSeq
PerformNormalization <- function(mbSetObj, rare.opt, scale.opt, transform.opt,isAutoScale=F,rareDepth){
  print(rareDepth)
  mbSetObj <- .get.mbSetObj(mbSetObj);
  dataName <- mbSetObj$dataSet$name;
  module.type <- mbSetObj$module.type;
  data <- readDataQs("filt.data.orig", module.type, dataName);
  saveDataQs(mbSetObj$dataSet$proc.phyobj, "orig.phyobj", module.type, dataName);  
  # now proc.phyobj is now filtered data
  tax_nm <- rownames(data);
  msg <- NULL;
  
  #rarefying (should?) affect filt.data in addition to norm 
  
  if(rare.opt != "none"){
    data <- PerformRarefaction(mbSetObj, data, rare.opt,rareDepth);
    if(is.null(data)){
       return(0)
     }
    tax_nm <- rownames(data);
    msg <- c(msg, paste("Performed data rarefaction."));
    
    ###### note, rarefying (should?) affect filt.data in addition to norm
    mbSetObj$dataSet$filt.data <- data@otu_table;
  }else{
    msg <- c(msg, paste("No data rarefaction was performed."));
  }
  
  # create phyloseq obj
  mbSetObj$dataSet$sample_data$sample_id <- rownames(mbSetObj$dataSet$sample_data);
  sample_table <- sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
  mbSetObj$dataSet$proc.phyobj<- merge_phyloseq(otu_table(data,taxa_are_rows =TRUE), sample_table, mbSetObj$dataSet$taxa_table);
  
  
  #make hierarchies
  if(mbSetObj$module.type=="sdp" | mbSetObj$micDataType=="ko"){
    ranks <- "OTU"
  }else{
    ranks <- c(GetMetaTaxaInfo(mbSetObj), "OTU")
    ranks <- unique(ranks)
  } 
  
  # start with lowest
  data.list <- list()
  data.list$merged_obj <- vector(length = length(ranks), "list")
  data.list$count_tables <- vector(length = length(ranks), "list")
  names(data.list$count_tables) <- names(data.list$merged_obj) <- ranks
  
  for(i in 1:length(ranks)){
    phyloseq.obj <- UtilMakePhyloseqObjs(mbSetObj, ranks[i])
    data.list$merged_obj[[i]] <- phyloseq.obj
    count.table <- UtilMakeCountTables(phyloseq.obj, ranks[i])
    data.list$count_tables[[i]] <- count.table
  }
  
  ##### The prenorm object is the filter result of the raw counts and also affected by the rarefy option 
  ### Note the counts table have different row numbers with the merged_obj as the otus without taxonomy assignment at a certain level will be merged in the count table.
  saveDataQs(data.list, "phyloseq_prenorm_objs.qs", module.type, dataName);  
  
  
  for(i in 1:length(ranks)){
    
    data <- data.list$count_tables[[i]]
    if(scale.opt != "none"){
      if(scale.opt=="colsum"){
        data <- sweep(data, 2, colSums(data), FUN="/")
        data <- data*10000000;
        if(i==1){
          msg <- c(msg, paste("Performed ```total sum scaling``` normalization."));
        }
      }else if(scale.opt=="upperquartile"){
        load_edgeR();
        otuUQ <- edgeRnorm(data,method="upperquartile");
        data <- as.matrix(otuUQ$counts);
        if(i==1){
          msg <- c(msg, paste("Performed ```upper quartile``` normalization"));
        }
      }else if(scale.opt=="CSS"){
        load_metagenomeseq();
        #biom and mothur data also has to be in class(matrix only not in phyloseq:otu_table)
        data1 <- as(data,"matrix");
        dataMR <- newMRexperiment(data1);
        data <- cumNorm(dataMR,p=cumNormStat(dataMR));
        data <- MRcounts(data,norm = T);
        if(i==1){
          msg <- c(msg, paste("Performed ```cumulative sum scaling``` normalization"));
        }
      }else{
        if(scale.opt!="auto" & i==1){
          print(paste("Unknown scaling parameter:", scale.opt));
        }
      }
    }else{
      if(i==1){
        msg <- c(msg, paste("No data scaling was performed."));
      }
    }
    
    if(transform.opt != "none"){
      if(transform.opt=="rle"){
        load_edgeR();            
        otuRLE <- edgeRnorm(data,method="RLE");
        data <- as.matrix(otuRLE$counts);
        if(i==1){        
          msg <- c(msg, paste("Performed ```RLE``` Normalization"));
        }
      }else if(transform.opt=="TMM"){
        load_edgeR();   
        otuTMM <- edgeRnorm(data,method="TMM");
        data <- as.matrix(otuTMM$counts);
        if(i==1){        
          msg <- c(msg, paste("Performed ```TMM``` Normalization"));
        }
      }else if(transform.opt=="clr"){
        data <- apply(data, 2, clr_transform);
        if(i==1){        
          msg <- "Performed ```centered-log-ratio (CLR)``` normalization."
        }
      }else{
        if(scale.opt!="auto" & i==1){
          print(paste("Unknown scaling parameter:", transform.opt));
        }
      }
    }else{
      if(i==1){        
        msg <- c(msg, paste("No data transformation was performed."));
      }
    }
    data.list$count_tables[[i]] <- data
    
    if(ranks[i]=="OTU"){
      otu.tab <- otu_table(data,taxa_are_rows =TRUE);
      data.list$merged_obj[[i]] <- merge_phyloseq(otu.tab, sample_data(data.list$merged_obj[[i]]), mbSetObj$dataSet$taxa_table);
    }else{
      otu.tab <- otu_table(data,taxa_are_rows =TRUE);
      tax.tab <- tax_table(data.list$merged_obj[[i]])
      nm <- as.character(tax.tab[,ranks[i]])
      nm[is.na(nm)|nm==""|nm==" "] <- "Not_Assigned";
      rownames(tax.tab) <- nm
      tax.tab <- tax.tab[!(duplicated(nm)),] 
      tax.tab[which(rownames(tax.tab)=="Not_Assigned"),] <- NA
      data.list$merged_obj[[i]] <- merge_phyloseq(otu.tab, sample_data(data.list$merged_obj[[i]]), tax.tab);
    }
  }
  
  #using the OTU level for plotting when first initialize
  mbSetObj$dataSet$norm.phyobj <- data.list$merged_obj[["OTU"]];
  mbSetObj$dataSet$norm.msg <- msg;
  
  ### prepare for mmp module
  if(exists("current.proc")){
    qs::qsave(data.list,"prescale.phyobj.qs")
    if(isAutoScale=="true" | scale.opt == "auto"){
      data.list$count_tables <- lapply(data.list$count_tables,function(df) t(AutoScale(t(df))))
    }  
    current.proc$mic$data.proc<<- data.list$count_tables[["OTU"]]
    
  }
  ### Normalized object for each taxonomy level
  saveDataQs(data.list, "phyloseq_objs.qs", module.type, dataName); 
  
  return(.set.mbSetObj(mbSetObj));
}


#'Function to perform normalization on metabolomics data
#'@description This function performs normalization on the uploaded metabolomics
#'data.
#@param rowNorm Select the option for row-wise normalization, "QuantileNorm" for Quantile Normalization, 
#'"SumNorm" for Normalization to constant sum, 
#'"MedianNorm" for Normalization to sample median, and 
#'@param transNorm Select option to transform the data, "LogNorm" for Log Normalization,
#'and "CrNorm" for Cubic Root Transformation. 
#'@param scaleNorm Select option for scaling the data, "MeanCenter" for Mean Centering,
#'"AutoNorm" for Autoscaling, "ParetoNorm" for Pareto Scaling, amd "RangeNorm" for Range Scaling.
#'@param ref Input the name of the reference sample or the reference feature, use " " around the name.  
#'@param ratio This option is only for biomarker analysis.
#'@param ratioNum Relevant only for biomarker analysis.  
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import edgeR
#'@import metagenomeSeq
PerformMetaboNormalization <- function(mbSetObj, rowNorm, transNorm, scaleNorm,isAutoScale){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  data <-t(mbSetObj$dataSet$metabolomics$filt.data);
  # now proc.phyobj is now filtered data
  
  msg <- "";
  
  colNames <- colnames(data);
  rowNames <- rownames(data);
  
  # row-wise normalization
  if(rowNorm=="QuantileNorm"){
    
    data<-t(preprocessCore::normalize.quantiles(t(data), copy=FALSE));
    # this can introduce constant variables if a variable is 
    # at the same rank across all samples (replaced by its average across all)
    varCol <- apply(data, 2, var, na.rm=T);
    constCol <- (varCol == 0 | is.na(varCol));
    constNum <- sum(constCol, na.rm=T);
    if(constNum > 0){
      print(paste("After quantile normalization", constNum, "features with a constant value were found and deleted."));
      data <- data[,!constCol, drop=FALSE];
      colNames <- colnames(data);
      rowNames <- rownames(data);
    }
    rownm<-"Quantile Normalization";
  }else if(rowNorm=="SumNorm"){
    data<-t(apply(data, 1, function(x){
      1000*x/sum(x, na.rm=T);
    }));
    rownm<-"Normalization to constant sum";
  }else if(rowNorm=="MedianNorm"){
    data<-t(apply(data, 1, function(x){
      x/median(x, na.rm=T);
    }));
    rownm<-"Normalization to sample median";
  }else{
    # nothing to do
    rownm<-"N/A";
  }
  
  if(rownm!="N/A"){
    msg <- paste(rownm,"performed.")
    
  }
  # use apply will lose dimension info (i.e. row names and colnames)
  rownames(data)<-rowNames;
  colnames(data)<-colNames;
  
  # record row-normed data for fold change analysis (b/c not applicable for mean-centered data)
  row.norm <- as.data.frame(CleanData(data, T, T)); #moved below ratio 
  qs::qsave(row.norm, file="metabo.row.norm.qs");
  # transformation
  # may not be able to deal with 0 or negative values
  if(transNorm=='LogNorm'){
    min.val <- min(abs(data[data!=0]))/10;
    data<-apply(data, 2, LogNorm, min.val);
    transnm<-"Log10 Normalization";
  }else if(transNorm=='SrNorm'){
    min.val <- min(abs(data[data!=0]))/10;
    data<-apply(data, 2, SquareRootNorm, min.val);
    transnm<-"Square Root Transformation";
  }else if(transNorm=='CrNorm'){
    norm.data <- abs(data)^(1/3);
    norm.data[data<0] <- - norm.data[data<0];
    data <- norm.data;
    transnm<-"Cubic Root Transformation";
  }else{
    transnm<-"N/A";
  }
  
  if(transnm!="N/A"){
    msg <- paste(msg,paste(transnm,"performed."))
    
  }
  # scaling
  if(scaleNorm=='MeanCenter'){
    data<-apply(data, 2, function(x){
      x - mean(x);
    });
    scalenm<-"Mean Centering";
  }else if(scaleNorm=='AutoNorm'| isAutoScale == "true"){
    data<-apply(data, 2, function(x){
      (x - mean(x))/sd(x, na.rm=T);
    });
    scalenm<-"Autoscaling";
  }else if(scaleNorm=='ParetoNorm'){
    data<-apply(data, 2, function(x){
      (x - mean(x))/sqrt(sd(x, na.rm=T));
    });
    scalenm<-"Pareto Scaling";
  }else if(scaleNorm=='RangeNorm'){
    data<-apply(data, 2, function(x){
      if(max(x) == min(x)){
        x;
      }else{
        (x - mean(x))/(max(x)-min(x));
      }
    });
    scalenm<-"Range Scaling";
  }else{
    scalenm<-"N/A";
  }
  
  if(scalenm!="N/A"){
    msg <- paste(msg,paste(scalenm,"performed."))
    
  }
  # note after using "apply" function, all the attribute lost, need to add back
  rownames(data)<-rowNames;
  colnames(data)<-colNames;
  
  # need to do some sanity check, for log there may be Inf values introduced
  data <- CleanData(data, T, F);
  
  
  mbSetObj$dataSet$metabolomics$norm.data <- t(as.data.frame(data)); ## feature in row and sample in column
  
  qs::qsave( mbSetObj$dataSet$metabolomics$norm.data, file="metabo.complete.norm.qs");
  
  current.proc$met$data.proc<<-t(as.data.frame(data))
  
  mbSetObj$dataSet$metabolomics$rownorm.method <- rownm;
  mbSetObj$dataSet$metabolomics$trans.method <- transnm;
  mbSetObj$dataSet$metabolomics$scale.method <- scalenm;
  
  #using this object for plotting
  if(rownm=="N/A" & transnm=="N/A" & scalenm=="N/A"){
    current.msg <<- "No normalization was performed";
  }else{
    current.msg <<-msg
  }
  
  mbSetObj$dataSet$metabolomics$norm.msg <- current.msg;
  
  return(.set.mbSetObj(mbSetObj));
}

#'Get function to obtain the library scale for rarefraction
GetLibscale <- function(mbSetObj){
  mbSetObj <- .get.mbSetObj(mbSetObj);
  dataName <- mbSetObj$dataSet$name;
  module.type <- mbSetObj$module.type;
  data <- readDataQs("filt.data.orig", module.type, dataName);
  data <- data.matrix(data);
  tax_nm<-rownames(data);
  data <- round(data);
  
  # create phyloseq obj
  otu.tab<-otu_table(data,taxa_are_rows =TRUE);
  taxa_names(otu.tab)<-tax_nm;
  
  mbSetObj$dataSet$sample_data$sample_id<-rownames(mbSetObj$dataSet$sample_data);
  sample_table<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
  phy.obj<-merge_phyloseq(otu.tab,sample_table);
 return(c(min(sample_sums(phy.obj)),quantile(sample_sums(phy.obj) ,0.75)))

}



#'Utility function to perform rarefraction (used by PerformNormalization)
#'@description This function performs rarefraction on the uploaded
#'data.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param data Input the data.
#'@param rare.opt Input the option for rarefying the microbiome data.
#'"rarewi" to rarefy with replacement to the minimum library depth and 
#'"rarewo" to rarefy without replacemet to the minimum library depth.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
PerformRarefaction <- function(mbSetObj, data, rare.opt,rareDepth){
  mbSetObj <- .get.mbSetObj(mbSetObj);

  data <- data.matrix(data);
  tax_nm<-rownames(data);

  # data must be count data (not contain fractions)
  data <- round(data);
  
  # create phyloseq obj
  otu.tab<-otu_table(data,taxa_are_rows =TRUE);
  taxa_names(otu.tab)<-tax_nm;
  
  mbSetObj$dataSet$sample_data$sample_id<-rownames(mbSetObj$dataSet$sample_data);
  sample_table<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
  phy.obj<-merge_phyloseq(otu.tab,sample_table);
  msg<-NULL;
  
  # first doing rarefaction, this is on integer or count data
  if(rare.opt=="rarewi"){
  
  phy.obj <- tryCatch({
    # Your function call
    rarefied_data <- rarefy_even_depth(phy.obj, replace=TRUE,rngseed = T)
    return(rarefied_data)
  }, error = function(e) {
    # Custom error handling
    err.vec <<- e$message
    # Return NULL or some other value indicating failure
    return(NULL)
  }
  )
  if(is.NULL(phy.obj)){
    return(NULL)
  }else{
    msg <- c(msg, paste("Rarefy with replacement to minimum library depth."));
  }
 
}else if(rare.opt=="rarewo"){ # this is not shown on web due to computational issue

  phy.obj <- tryCatch({
    # Your function call
    phy.obj <- rarefy_even_depth(phy.obj, replace=FALSE,rngseed = T);
    return(rarefied_data)
  }, error = function(e) {
    # Custom error handling
    err.vec <<- e$message
    # Return NULL or some other value indicating failure
    return(NULL)
  }
  )
  if(is.NULL(phy.obj)){
    return(NULL)
  }else{
    msg <- c(msg, paste("Rarefaction without replacement to minimum library depth."));
  }
  
}else if(rare.opt=="rareto"){
  print(max((sample_sums(phy.obj))))
  if(max((sample_sums(phy.obj)))<rareDepth){
     AddErrMsg("The specified library depth exceeds that of all samples! Please provide a suitable depth value for rarefaction!")
     return(NULL)
  }
  phy.obj <- tryCatch({
    # Your function call
    rarefied_data <- rarefy_even_depth(phy.obj, sample.size = rareDepth,replace=TRUE,rngseed = T)
    return(rarefied_data)
  }, error = function(e) {
    # Custom error handling
    err.vec <<- e$message
    # Return NULL or some other value indicating failure
    return(NULL)
  }
  )
  if(is.NULL(phy.obj)){
    return(NULL)
  }else{
    msg <- c(msg, paste("Rarefy with replacement to selected library depth."));
  }

}
  msg <- paste(msg, collapse=" ");
  #print(msg);
  cleanMem();
  otu.tab <- otu_table(phy.obj);
  return(otu.tab);
}

#'Function to plot rarefraction curves
#'@description This function plots rarefraction curves from the uploaded
#'data for both sample-wise or group-wise
#'@param mbSetObj Input the name of the mbSetObj.
#'@param graphName Input the name of the plot.
#'@param variable Input the experimental factor.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import vegan

PlotRareCurve <- function(mbSetObj, graphName, variable){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  set.seed(13789);
  load_vegan();
  
  data <- data.matrix(mbSetObj$dataSet$filt.data);
  rarefaction_curve_data<-as.matrix(otu_table(data));
  
  Cairo::Cairo(file=graphName, width=900, height=480, type="png", bg="white");
  
  #sample-wise
  subsmpl=50;
  
  if (ncol(rarefaction_curve_data)>subsmpl) {
    ss  = sample(ncol(rarefaction_curve_data), subsmpl);
    rarefaction_curve_data = rarefaction_curve_data[,ss,drop=FALSE];
    #retaining the column names
    data<-prune_samples(colnames(rarefaction_curve_data),data);
  } else {
    rarefaction_curve_data = rarefaction_curve_data;
  }
  
  sam_data<-sample_data(data);
  raremax <- min(rowSums(t(rarefaction_curve_data)));
  
  #getting colors
  grp.num <- length(levels(as.factor(sam_data[[variable]])));
  
  if(grp.num<9){
    dist.cols <- 1:grp.num + 1;
    lvs <- levels(as.factor(sam_data[[variable]]));
    colors <- vector(mode="character", length=length(as.factor(sam_data[[variable]])));
    
    for(i in 1:length(lvs)){
      colors[as.factor(sam_data[[variable]]) == lvs[i]] <- dist.cols[i];
    }
  }else{
    colors<-"blue";
  }
  
  rarecurve(t(rarefaction_curve_data), step = 20, sample = raremax, col = colors, cex = 0.6, xlab = "Sequencing depth", ylab = "Observed Species");
  legend("bottomright",lvs,lty = rep(1,grp.num),col = dist.cols);
  
  plot.data <- rarefaction_curve_data;
  sam_data <- sample_data(data);
  cls.lbls <- as.factor(sam_data[[variable]]);
  grp.lvls <- levels(cls.lbls);
  grp.num <- length(grp.lvls);
  grp.data <- list();
  
  for(lvl in grp.lvls){
    grp.data[[lvl]] <- rowSums(plot.data[, cls.lbls == lvl])
  }
  
  raremax <- min(unlist(lapply(grp.data, sum)));
  grp.data <- data.frame(grp.data,check.names=FALSE);
  dist.cols <- 1:grp.num + 1;
  # now plot
  rarecurve(t(grp.data), step = 20, sample = raremax, col = dist.cols, lwd = 2, cex=1.5, xlab = "Sequencing depth", ylab = "Observed Species");
  legend("bottomright",grp.lvls,lty = rep(1,grp.num),col = dist.cols);
  
  dev.off();
  return(.set.mbSetObj(mbSetObj))
}

#'Function to plot library size 
#'@description This function creates a plot summarizing the library
#'size of microbiome dataset.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param imgName Character, input the name
#'of the plot.
#'@param format Character, input the preferred
#'format of the plot. By default it is set to "png".
#'@param dpi Numeric, input the dots per inch. By default
#'it is set to 72.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
PlotLibSizeView <- function(mbSetObj, origImgName="",format="png", dpi=72, dataName="", interactive=F){
  load_phyloseq();
  mbSetObj <- .get.mbSetObj(mbSetObj)
  library(ggplot2)
  library(dplyr)
  library(Cairo)

  if(dataName != ""){
    ind <- TRUE
  } else {
    dataName <- mbSetObj$dataSet$name
    ind <- FALSE
  }
  
  module.type <- mbSetObj$module.type
  colLegendNm="";
  if(all(c(mbSetObj$module.type == "meta", !ind))){
    sums.list <- list()
    datanm.list <- list()
    for(i in 1:length(mbSetObj$dataSets)){
      dataName <- mbSetObj$dataSets[[i]]$name
      data.proc <- readDataQs("data.proc", module.type, dataName)
      data_bef <- data.matrix(data.proc)
      smpl.sums <- colSums(data_bef)
      names(smpl.sums) <- colnames(data_bef)
      smpl.sums <- sort(smpl.sums)
      sums.list[[i]] <- smpl.sums
      datanm.list[[i]] <- rep(dataName,length(smpl.sums)) 
    }
    smpl.sums <- unlist(sums.list);
    col.vec <- unlist(datanm.list);
    colLegendNm = "Dataset";

  } else {
    data.proc <- readDataQs("data.proc", module.type, dataName)
    data_bef <- data.matrix(data.proc)
    smpl.sums <- colSums(data_bef)
    names(smpl.sums) <- colnames(data_bef)
    smpl.sums <- sort(smpl.sums)
    col.vec <- as.vector(mbSetObj$dataSet$sample_data[match(names(smpl.sums),rownames(mbSetObj$dataSet$sample_data)),1]);
    colLegendNm = "Group";

  }
  # Create a data frame for ggplot
  if(is.data.frame(col.vec)){
    library_size_data <- data.frame(Sample = names(smpl.sums), LibrarySize = smpl.sums, group = as.character(col.vec[[colnames(col.vec)]]))
  } else {
    library_size_data <- data.frame(Sample = names(smpl.sums), LibrarySize = smpl.sums, group = col.vec)
  }

  colnames(library_size_data)[3] <- "group";
  library_size_data <- library_size_data %>% arrange(desc(LibrarySize))
  library_size_data$Sample = factor(library_size_data$Sample, levels=library_size_data[order(library_size_data$LibrarySize), "Sample"])

  # Plotting with ggplot2
  imgName <- paste0(origImgName, ".", format)
  g <- ggplot(library_size_data, aes(x = Sample, y = LibrarySize)) +
    geom_point(aes(color = library_size_data$group)) +
    geom_hline(yintercept = mean(library_size_data$LibrarySize), color = "blue", linetype = "dashed") +
    labs(x = "Sample", y = "Read Counts", color="") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1),
          legend.text = element_text(size = 14), legend.title = element_text(size = 14))
  
  if(imgName != ""){
  Cairo::Cairo(file=imgName, width=800, height=600, type=format, bg="white",dpi=dpi);
  print(g);
  dev.off();
  }
  
  mbSetObj$imgSet$lib.size <- imgName;
  
  mean_line <- mean(library_size_data$LibrarySize)
  annotation_offset <- 0.05 * (max(library_size_data$LibrarySize) - min(library_size_data$LibrarySize))
  
  
  # Split the data by group
  grouped_data <- split(library_size_data, library_size_data$group)
  
  # Map each group to a color
  unique_groups <- unique(library_size_data$group)
  colors <- RColorBrewer::brewer.pal(length(unique_groups), "Paired")
  color_mapping <- setNames(colors, unique_groups)
  x_values <- seq_along(library_size_data$Sample);
  library_size_data$x_values <- x_values 
  
  # Create a trace for each group
  traces <- lapply(names(grouped_data), function(group) {
    data_group <- library_size_data[library_size_data$group == group, ]
    tooltips <- paste("Sample:", data_group$Sample, "<br>Library Size:", data_group$LibrarySize)
    
    list(
      x = data_group$x_values,
      y = data_group$LibrarySize,
      type = 'scatter',
      mode = 'markers',
      name = group, # This will be the legend entry
      marker = list(
        size = 10,
        line = list(color = 'white', width = 0.8),
        color = color_mapping[group]
      ),
      text = tooltips,
      hoverinfo = 'text'
    )
  })
  
  # Convert data to JSON for Plotly
  plot_data <- list(
    data=traces,
    layout = list(
      title = 'Library Size Overview',
      xaxis = list(title = 'Sample'),
      yaxis = list(title = 'Read Counts'),
      shapes = list(
        list(
          type = 'line',
          x0 = 0,  # Start at the left edge of the plot area
          y0 = mean_line,  # The y-value at which the line is placed
          x1 = 1,  # End at the right edge of the plot area
          y1 = mean_line,  # Same y-value to keep the line horizontal
          line = list(
            color = 'blue',  # Line color
            width = 3  # Line width
          ),
          xref = 'paper',  # Use the 'paper' reference for the x-axis
          yref = 'y'  # Use the y-axis data reference for the y-axis
        )
      ),
      annotations = list(
        list(
          x = 1,
          y = mean_line + annotation_offset,
          xref = 'paper',
          yref = 'y',
          text = paste('Mean:', signif(mean_line, 4)),
          showarrow = FALSE,
          font = list(family = 'Arial', size = 12, color = 'black')
        )
      )
    )
  )
  
  jsonFileName <- paste0(origImgName, ".json")
  json.obj <- rjson::toJSON(plot_data );
  sink(jsonFileName);
  cat(json.obj);
  sink();
  
  if(interactive){
    library(plotly);
    tooltips <- paste("Sample: ", library_size_data$Sample, "\nSize:", library_size_data$LibrarySize)
    annotation_offset <- 50 # Adjust as needed
    
    fig <- plot_ly() %>%
      add_trace(data = plot_data$data[[1]], 
                x = ~x, 
                y = ~y, 
                type = 'scatter', 
                mode = 'markers', 
                marker = list(size = 10, line = list(color = 'white', width = 0.8)),
                text = ~text, 
                hoverinfo = 'text', 
                name = ~name)
    
    # Iteratively add the other traces
    if (length(plot_data$data) > 1) {
      for (i in 2:length(plot_data$data)) {
        fig <- fig %>%
          add_trace(data = plot_data$data[[i]], 
                    x = ~x, 
                    y = ~y, 
                    type = 'scatter', 
                    mode = 'markers', 
                    marker = list(size = 10, line = list(color = 'white', width = 0.8)),
                    text = ~text, 
                    hoverinfo = 'text', 
                    name = ~name)
      }
    }
    
    fig <- fig %>% layout(
      title = plot_data$data$layout$title,
      xaxis = plot_data$layout$xaxis,
      yaxis = plot_data$layout$yaxis,
      shapes = plot_data$layout$shapes,
      annotations = plot_data$annotations
    )
    
    return(fig);
  }else{
    return(.set.mbSetObj(mbSetObj))
  }
}

#'Plot PCA plot for multi-omics samples
#'@description 
#'@param imgNm name of the image to output
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'


PlotPCAView <- function(imgName, format="png", dpi=72,init){
  require("ggsci")
  
  #dpi<-as.numeric(dpi)
  imgName = paste(imgName,".", format, sep="");
  
  fig.list <- list()
  pca.list<- list()
  pct <- list();
  
  for(i in 1:2){
    if(i==1){
      data.proc <- current.proc$mic$data.proc;
      x <- data.matrix(data.proc);
      type="Microbiome"
    }else{
      data.proc <- current.proc$met$data.proc;
      x <- data.matrix(data.proc);
      type="Metabolomics"
    }
    pca <- prcomp(t(na.omit(x)), center=T, scale=T);
    imp.pca<-summary(pca)$importance;
    xlabel <- paste0("PC1"," (", 100*round(imp.pca[2,][1], 3), "%)")
    ylabel <- paste0("PC2"," (", 100*round(imp.pca[2,][2], 3), "%)")
    names <- colnames(x);
    pca.res <- as.data.frame(pca$x);
    pca.res <- pca.res[,c(1,2)]
    
    xlim <- GetExtendRange(pca.res$PC1);
    ylim <- GetExtendRange(pca.res$PC2);
    
    metaFile <- qs::qread("data.sample_data")
    Factor <- metaFile[,1]
    pca.rest <- pca.res
    pca.rest$Conditions <- Factor
    pca.rest$names <- rownames(pca.res)
    
    pcafig <- ggplot(pca.rest, aes(x=PC1, y=PC2,  color=Conditions)) +
      geom_point(size=3, alpha=0.7) + 
      scale_color_npg()+
      xlim(xlim) + 
      ylim(ylim) + 
      xlab(xlabel) + 
      ylab(ylabel) +
      theme_bw()
    fig.list[[i]] <- pcafig
    
    pos.xyz = data.frame(x=pca$x[,1], y=pca$x[,2], z=pca$x[,3]);
    loading.pos.xyz = data.frame(pca$rotation[,c(1:3)]);
    loadingNames = rownames(pca$rotation);
    
    pos.xyz <- as.data.frame(pos.xyz);
    pos.xyz <- unitAutoScale(pos.xyz);
    
    loadingNames <- rownames(loading.pos.xyz);
    loading.pos.xyz <- as.data.frame(loading.pos.xyz);
    loading <- unitAutoScale(loading.pos.xyz);
    rownames(loading) <- loadingNames;
    nm <- paste0("pca_", type);
    pca.list[[nm]][["score"]] <- pos.xyz * 1000;
    pca.list[[nm]][["loading"]] <- loading* 1000;    
    
    pct[[nm]] <- unname(round(imp.pca[2,],3))[c(1:3)]*100;
  }
  pca.list$pct2 <- pct;
  qs::qsave(pca.list, file="pca.scatter.qs");
  
  h<-6*round(length(fig.list)/2)
  Cairo::Cairo(file=imgName, width=14, height=h, type=format, bg="white", unit="in", dpi=dpi);
  library("ggpubr")
  p1 <- ggarrange(plotlist=fig.list, ncol = 2, nrow = round(length(fig.list)/2), labels=c("Microbiome","Metabolomics"))
  print(p1)
  dev.off();
  return(1)
  
}


PlotDensityView <- function(imgName, format="png",  init=0, autoScale=T,dpi=72){
  
  # dpi <- as.numeric(dpi)
  imgName = paste(imgName,".", format, sep="");
  fig.list <- list()
  merged.df <- data.frame
  
  df.list <- list()
  for(i in 1:2){
    
    if(i==1){
      data.proc <- current.proc$mic$data.proc;
      dat <- data.matrix(data.proc);
      if(init==0){dat= t(AutoScale(t(data.proc))) }
      st <- stack(as.data.frame(dat))
      st$type <- "Microbiome"
      merged.df <- st
    }else{
      data.proc <- current.proc$met$data.proc;
      dat <- data.matrix(data.proc);
      if(init==0){dat = t(AutoScale(t(data.proc)))}
      st <- stack(as.data.frame(dat))
      st$type <- "Metabolomics"
      merged.df <-rbind(merged.df, st)
    } 
    
  }
  
  
  
  type<-merged.df$type
  merged.df$ind <- paste0(merged.df$ind, "_", merged.df$type)
  #if(init==0){
  # merged.df$values[which(merged.df$values==0)] <- 10^-3
  # merged.df$values <- log((merged.df$values),base=2)
  
  # }
  
  Cairo::Cairo(file=imgName, width=10, height=6, type=format, bg="white", dpi=dpi, unit="in");
  g <- ggplot(merged.df, aes(x=values)) + 
    geom_line(aes(color=type, group=ind), stat="density", alpha=0.1) + 
    geom_line(aes(color=type), stat="density", alpha=0.7, linewidth=3) +
    scale_color_manual(values=c("#E64B35B2","#00A087B2"))+
    theme_bw()
  print(g)
  dev.off();
  
  
  return(1)
}



################################################
###########Phyloseq object creation#############
################################################

#'Function to recreate phyloseq object
#'@description This function recreates the phyloseq object.
#'@param mbSetObj Input the name of the mbSetObj.
#'@param type Input the format of the uploaded files. "text" for 
#'.txt or .csv files, "biom" for .biom, and "mothur" for mothur files.
#'@param taxa_type Input the taxonomy labels. "SILVA" for SILVA taxonomy,
#'"Greengenes" for Greengenes taxonomy, "QIIME" for QIIME taxonomy,
#'"GreengenesID" for Greengenes OTU IDs, and "Others/Not_specific" for others.
#'@param taxalabel Logical, T if taxonomy labels were already included in
#'the OTU table and F if not.
#'@author Jeff Xia \email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: GNU GPL (>= 2)
#'@export
#'@import phyloseq
CreatePhyloseqObj<-function(mbSetObj, type, taxa_type, taxalabel,isNormInput){
  mbSetObj <- .get.mbSetObj(mbSetObj);
  dataName <- mbSetObj$dataSet$name;
  module.type <- mbSetObj$module.type;
  load_phyloseq();
  load_splitstackshape();
  
  data.proc <- readDataQs("data.proc", module.type, dataName);
  
  # do some sanity check here on sample and feature names
  smpl.nms <- colnames(data.proc);
  taxa.nms <- row.names(data.proc);
  # check for uniqueness of dimension name
  if(length(unique(smpl.nms))!=length(smpl.nms)){
    dup.nms <- paste(smpl.nms[duplicated(smpl.nms)], collapse="; ");
    AddErrMsg(paste(c("Duplicate sample names are not allowed:"), dup.nms, collapse=" "));
    return(0);
  }
  
  # check for uniqueness of taxon names for metagenomic data only
  if(mbSetObj$module.type == "sdp" | mbSetObj$micDataType =="ko"){
    if(length(unique(taxa.nms))!=length(taxa.nms)){
      dup.tax.nms <- paste(taxa.nms[duplicated(taxa.nms)], collapse="; ");
      AddErrMsg(paste(c("Duplicate taxon names are not allowed:"), dup.tax.nms, collapse=" "));
      return(0);
    }
  }else{
    # for ASV data, replace sequences with ASV1, ASV2 ....
    # check is avegage name length is > 75
    nm.ln <- sum(nchar(taxa.nms)/length(taxa.nms));
    if(nm.ln > 75){
      mbSetObj$is.ASV <-TRUE;
    }
  }
  # now check for special characters in the data labels
  if(sum(is.na(iconv(smpl.nms)))>0){
    na.inx <- is.na(iconv(smpl.nms));
    nms <- paste(smpl.nms[na.inx], collapse="; ");
    AddErrMsg(paste("No special letters (i.e. Latin, Greek) are allowed in sample names:", nms, collapse=" "));
    return(0);
  }
  
  var.nms <- rownames(data.proc);
  
  if(sum(is.na(iconv(var.nms)))>0){
    na.inx <- is.na(iconv(var.nms));
    nms <- paste(var.nms[na.inx], collapse="; ");
    AddErrMsg(paste("No special letters (i.e. Latin, Greek) are allowed in feature or taxa names:", nms, collapse=" "));
    return(0);
  }
  #standard name to be used
  classi.lvl<- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species");
  if(any(c(mbSetObj$module.type == "mdp", mbSetObj$micDataType =="otu", mbSetObj$module.type == "ppd", mbSetObj$module.type == "meta"))){
    if(type=="text"){
      # prepare data for phyloseq visualization.
      # if features names are present in specific taxonomy format (greengenes or silva).
      if(taxalabel=="T"){ #taxa labels present and need to be parsed in OTU table
        feat_nm <- rownames(data.proc);
        mbSetObj$dataSet$feat_nm <- feat_nm;
        
        if(!(is.null(mbSetObj$dataSet$sglTax))){
          idxsgl <- match(toupper(mbSetObj$dataSet$sglTax),toupper(classi.lvl))
          feat_nm<-data.frame(mbSetObj$dataSet$feat_nm,check.names=FALSE);
          names(feat_nm)<-classi.lvl[idxsgl];
          #sanity check: no of sample of both abundance and metadata should match.
          taxmat<-as.matrix(feat_nm);
          rownames(taxmat)<-c(1:nrow(taxmat));
          
          taxa_table <- tax_table(taxmat);
          taxa_names(taxa_table)<-rownames(taxmat);
          
        }else{
          
          
          if(taxa_type=="SILVA"){
            
            load_splitstackshape();
            feat_nm<-data.frame(mbSetObj$dataSet$feat_nm,check.names=FALSE);
            names(feat_nm)<-"Rank";
            taxonomy<-splitstackshape::cSplit(feat_nm,"Rank",";");
            taxmat= data.frame(matrix(NA, ncol = 7, nrow = nrow(taxonomy)),check.names=FALSE);
            colnames(taxmat) <- classi.lvl;
            taxmat[,1:ncol(taxonomy)]<-taxonomy;
            taxmat<-taxmat[colSums(!is.na(taxmat))>0];
            taxmat<-as.matrix(taxmat);
            rownames(taxmat)<-c(1:nrow(taxmat));
            #phyloseq taxonomy object
            taxa_table <- tax_table(taxmat);
            taxa_names(taxa_table)<-rownames(taxmat);
          }else if(any(c(taxa_type=="Greengenes", taxa_type=="QIIME"))){
            my.parser<-lapply(feat_nm, parse_taxonomy_qiime); # note, this functions will remove empty strings, replace with NA!?
            taxa_table<-build_tax_table(my.parser);
            
            if(taxa_type=="Greengenes"){
              #additional columns are added(starting with Rank1 etc; has to be removed)
              indx<-grep("^Rank",colnames(taxa_table));
              # need to test if still columns left (could be that users specified wrong tax format)
              if(ncol(taxa_table) == length(indx)){
                AddErrMsg("Error parsing Greengenes Taxonomy Labels - you may try <b>Greengenes OTU ID</b> or <b>Not Specific /Other</b> in data upload.");
                return(0);
              }
              if(length(indx) > 0){
                taxa_table<-taxa_table[,-indx];
              }
              #if all taxa ranks of an taxa are NA with pase_taxonomy_qiime function #covert to Not_assigned
              ind<-which(apply(taxa_table,1, function(x)all(is.na(x)))==TRUE);
              taxa_table[ind,]<-rep("Not_Assigned",ncol(taxa_table));
            }
            taxa_names(taxa_table)<-c(1:nrow(taxa_table));
          }else if(any(c(taxa_type=="GreengenesID", taxa_type=="Others/Not_specific"))){
            
            # need to parse Taxonomy still!
            load_splitstackshape();               
            
            feat_nm <- data.frame(mbSetObj$dataSet$feat_nm,check.names=FALSE);
            feat_nm <- data.frame(apply(feat_nm, 1, function(x) gsub(";\\s;", ";", x)),check.names=FALSE) # remove empty taxa
            names(feat_nm) <- "Rank";
            taxonomy <- splitstackshape::cSplit(feat_nm,"Rank",";");
            taxmat <- data.frame(matrix(NA, ncol = 7, nrow = nrow(taxonomy)),check.names=FALSE);
            colnames(taxmat) <- classi.lvl;
            taxmat[,1:ncol(taxonomy)] <- taxonomy;
            taxmat <- taxmat[colSums(!is.na(taxmat))>0];
            taxmat <- as.matrix(taxmat);
            rownames(taxmat) <- c(1:nrow(taxmat));
            #phyloseq taxonomy object
            taxa_table <- tax_table(taxmat);
            taxa_names(taxa_table) <- rownames(taxmat);
          }
          
          
        }
        
        # making unique id for each OTU consist of lowest taxonomy level present followed by row number
        tmat <- as(tax_table(taxa_table), "matrix")
        rowLn <- nrow(tmat);
        colLn <- ncol(tmat);
        num.mat <- matrix(0, nrow=rowLn, ncol=colLn);
        
        for(i in 1:colLn){
          num.mat[,i] <- rep(i, rowLn);
        }
        
        na.inx <- is.na(tmat);
        blank.inx <- which(tmat=="")
        num.mat[na.inx] <- 0;
        num.mat[blank.inx] <- 0;
        maxInx <- apply(num.mat, 1, max);
        gd.coords <- cbind(1:rowLn, maxInx);
        gd.nms <- tmat[gd.coords];
        mynames <- make.unique(gd.nms, sep="_");
        
        if(length(mynames) != nrow(tmat)){
          AddErrMsg(paste("Errors with the taxonomy table! Please check if there are any names with all NA."));
          return(0);
        }
        taxa_names(taxa_table)<-mynames;
        feat_nm<-NULL;
        mbSetObj$dataSet$taxa_table<-taxa_table;
      }else{
        #sanity check: features name of taxonomy table should match feature names in OTU abundance table,
        # Only features in filtered OTU tables are selected additional are removed.
        taxa_table <- mbSetObj$dataSet$taxa_table;
        
        indx<-match(rownames(data.proc), rownames(taxa_table));
        if(sum(is.na(indx)) > 0){
          na.nms <- rownames(data.proc)[is.na(indx)];
          if(length(na.nms)>10){
            na.nms <- na.nms[1:10];
          }
          AddErrMsg(paste("The following names cannot be found in your taxanomy table (showing 10 max):", paste(na.nms, collapse="; ")));
          return(0);
        }
        
        # let's address ASV sequence ID here
        if(mbSetObj$is.ASV){
          # do the conversion and save 
          new.nms <- paste("ASV", 1:length(taxa.nms), sep="_");
          master.nms <- cbind("NewID"=new.nms, "OriginalID"=taxa.nms);
          fast.write(master.nms, file="ASV_ID_mapping.csv");
          mbSetObj$dataSet$master.nms = master.nms;
          # update the taxa and sample
          names(new.nms) <- taxa.nms;
          rownames(data.proc) <- new.nms[rownames(data.proc)];
          rownames(taxa_table) <- new.nms[rownames(taxa_table)];
          
          # update tree file if uploaded
          if(mbSetObj$tree.uploaded){
            pg_tree <- qs::qread("tree.qs");
            pg_tree$tip.label <- new.nms[pg_tree$tip.label];
            qs::qsave(pg_tree, "tree.qs");
          }
        }
        nm<-colnames(taxa_table);
        taxa_table<-as.matrix(taxa_table[indx,]);
        colnames(taxa_table)<-nm;
        #converting taxonomy file(data frame) to phyloseq taxonomy object;keeping the taxonomy names as it is.
        mbSetObj$dataSet$taxa_table<-tax_table(taxa_table);
        taxa_nms <- rownames(taxa_table)
        
        if(length(unique(taxa_nms))!=length(taxa_nms)){
          dup.tax.nms <- paste(taxa_nms[duplicated(taxa_nms)], collapse="; ");
          AddErrMsg(paste(c("Duplicate taxon names are not allowed:"), dup.tax.nms, collapse=" "));
          return(0);
        }
        taxa_names(mbSetObj$dataSet$taxa_table)<-rownames(taxa_table);
      }
      # creating phyloseq object
      if(isNormInput=="false"){
        data.proc<-apply(data.proc,2,as.integer);
      }   
      data.proc<-otu_table(data.proc,taxa_are_rows =TRUE);
      taxa_names(data.proc)<-taxa_names(mbSetObj$dataSet$taxa_table);
      # removing constant column and making standard names
      ind<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("Root", "root")));
      
      if(length(ind)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
        #dataSet$taxa_table<-dataSet$taxa_table #newnew
      }
      
      # removing first KINGDOM OR DOMAIN column
      indx<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("k__Fungi", "Fungi", "k__Viruses", "Viruses","k_Bacteria",
                                                                                 "Bacteria", "Archaea", "k__Bacteria", "k__Archea", 
                                                                                 "D_0__Bacteria", "D_0__Archea", "d__Bacteria")));
      
      if(length(indx)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
      }
      classi.lvl<- c("Phylum", "Class", "Order", "Family", "Genus", "Species");
      colnames(mbSetObj$dataSet$taxa_table)<-classi.lvl[1:ncol(mbSetObj$dataSet$taxa_table)]; 
      #sanity check: no of sample of both abundance and metadata should match.
      indx<-match(colnames(data.proc), rownames(mbSetObj$dataSet$sample_data));
      
      if(all(is.na(indx))){
        AddErrMsg("Please make sure that sample names and their number are same in metadata and OTU abundance files.");
        return(0);
      }else{
        mbSetObj$dataSet$sample_data<-mbSetObj$dataSet$sample_data[indx, ,drop=FALSE];
      }
      mbSetObj$dataSet$sample_data<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
      #cleaning up the names#deleting [] if present; and substituting (space,.,/ with underscore(_))
      if(!all(validUTF8(mbSetObj$dataSet$taxa_table))){
         AddErrMsg("Unexpected symbol was detected in your taxonomy table, please make sure you have upload the table in correct format!");
        return(0);
       }
      mbSetObj$dataSet$taxa_table<- gsub("[[:space:]./_-]", "_",mbSetObj$dataSet$taxa_table);
      mbSetObj$dataSet$taxa_table<- gsub("\\[|\\]","",mbSetObj$dataSet$taxa_table);
      mbSetObj$dataSet$taxa_table[which(mbSetObj$dataSet$taxa_table==''|grepl("_$",mbSetObj$dataSet$taxa_table))]=NA;
      
      if(taxalabel=="T"){# for mapping to the database in mmp module
        if("Species" %in% colnames( mbSetObj$dataSet$taxa_table)){
          sps = data.frame(mbSetObj$dataSet$taxa_table@.Data[,c('Genus','Species')])
          if(grepl("^g__",sps[1,"Genus"])){
            sps$Species = gsub("^s__","",sps$Species)
            if(all(!grepl(" ",sps$Species)) & all(!grepl("_",sps$Species))){
              sps$Genus =gsub("^g__","",sps$Genus)
              idxsp = which(!is.na(sps$Genus) & !is.na(sps$Species) & sps$Genus!="" & sps$Species!="")
              sps$Species[idxsp] = paste0("s__",sps$Genus[idxsp],"_",sps$Species[idxsp])
            }
          }else if(grepl("^g_",sps[1,"Genus"])){
            sps$Species = gsub("^s_","",sps$Species)
            if(all(!grepl(" ",sps$Species)) & all(!grepl("_",sps$Species))){
              sps$Genus =gsub("^g_","",sps$Genus)
              idxsp = which(!is.na(sps$Genus) & !is.na(sps$Species) & sps$Genus!="" & sps$Species!="")
              sps$Species[idxsp] = paste0("s__",sps$Genus[idxsp],"_",sps$Species[idxsp])
            }
          }else if(grepl("^g_0__",sps[1,"Genus"])){
            sps$Species = gsub("^s_0__","",sps$Species)
            if(all(!grepl(" ",sps$Species)) & all(!grepl("_",sps$Species))){
              sps$Genus =gsub("^g_0__","",sps$Genus)
              idxsp = which(!is.na(sps$Genus) & !is.na(sps$Species) & sps$Genus!="" & sps$Species!="")
              sps$Species[idxsp] = paste0("s_0__",sps$Genus[idxsp],"_",sps$Species[idxsp])
            }
          }else{
            if(all(!grepl(" ",sps$Species)) & all(!grepl("_",sps$Species))){
              idxsp = which(!is.na(sps$Genus) & !is.na(sps$Species) & sps$Genus!="" & sps$Species!="")
              sps$Species[idxsp] = paste0(sps$Genus[idxsp],"_",sps$Species[idxsp])
            }
          }
          mbSetObj$dataSet$taxa_table@.Data[,'Species']  <- sps$Species
        }
        
      }
      #sometimes after removal of such special characters rownames beacame non unique; so make it unique
      mynames1<- gsub("[[:space:]./_-]", "_",taxa_names(mbSetObj$dataSet$taxa_table));
      mynames1<- gsub("\\[|\\]","",mynames1);
      
      if(length(unique(mynames1))< length(taxa_names(mbSetObj$dataSet$taxa_table))){
        mynames1 <- make.unique(mynames1, sep="_");
      }
      
      #after cleaning, names might now be NA
      missing.inx <- which(is.na(mynames1))
      missing_vector <- paste0("New_OTU", seq_along(missing.inx))
      mynames1[missing.inx] <- missing_vector
      
    } else if(any(c(type == "biom", type == "mothur"))){
      # creating phyloseq object
      feat_nm<-rownames(data.proc);
      if(isNormInput=="false"){
        data.proc<-apply(data.proc,2,as.integer);
      }
      data.proc<-otu_table(data.proc,taxa_are_rows =TRUE);
      taxa_names(data.proc)<-feat_nm;
      #sanity check: features name of taxonomy table should match feature names in OTU abundance table,Only features in filtered OTU tables are selected additional are removed.
      indx<-match(taxa_names(data.proc),taxa_names(mbSetObj$dataSet$taxa_table));
      
      if(all(is.na(indx))){
        AddErrMsg("Please make sure that features name of the taxonomy table match the feature names in the OTU abundance table.");
        return(0);
      }
      
      mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[indx,];
      
      # removing constant column (root) if present otherwise just kingdom column from taxonomy table
      ind<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("Root", "root")));
      
      if(length(ind)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
        #dataSet$taxa_table<-dataSet$taxa_table #newnew
      }
      
      # removing first KINGDOM OR DOMAIN column
      indx<-apply(mbSetObj$dataSet$taxa_table[,1], 2, function(x) which(x %in% c("k__Fungi", "Fungi", "k__Viruses", "Viruses", "k_Bacteria",
                                                                                 "Bacteria","Archaea","k__Bacteria","k__Archea", 
                                                                                 "D_0__Bacteria","D_0__Archea", "d__Bacteria")));
      
      if(length(indx)>0){
        mbSetObj$dataSet$taxa_table<-mbSetObj$dataSet$taxa_table[,-1];
      }
      
      classi.lvl<- c("Phylum", "Class", "Order", "Family", "Genus", "Species"); 
      colnames(mbSetObj$dataSet$taxa_table)<-classi.lvl[1:ncol(mbSetObj$dataSet$taxa_table)];
      #sanity check: no of sample of both abundance and metadata should match.
      indx<-match(colnames(data.proc), rownames(mbSetObj$dataSet$sample_data));
      
      if(all(is.na(indx))){
        AddErrMsg("Please make sure that sample names and their number are the same in metadata and OTU abundance files.");
        return(0);
      }else{
        mbSetObj$dataSet$sample_data<-mbSetObj$dataSet$sample_data[indx, ,drop=FALSE];
      }
      
      #cleaning up the names#deleting [] if present; and substituting (space,.,/ with underscore(_))
      mbSetObj$dataSet$taxa_table <- gsub("[[:space:]./_-]", "_",mbSetObj$dataSet$taxa_table);
      mbSetObj$dataSet$taxa_table<- gsub("\\[|\\]","",mbSetObj$dataSet$taxa_table);
      mbSetObj$dataSet$taxa_table[which(mbSetObj$dataSet$taxa_table==''|grepl("_$",mbSetObj$dataSet$taxa_table))]=NA;
      #sometimes after removal of such special characters rownames beacame non unique; so make it unique
      mynames1 <- gsub("[[:space:]./_-]", "_",taxa_names(mbSetObj$dataSet$taxa_table));
      mynames1<-gsub("\\[|\\]","",mynames1);
      
      if(length(unique(mynames1))< length(taxa_names(mbSetObj$dataSet$taxa_table))){
        mynames1 <- make.unique(mynames1, sep="_");
      }
      
      #after cleaning, names might now be NA
      missing.inx <- which(is.na(mynames1))
      missing_vector <- paste0("New_OTU", seq_along(missing.inx))
      mynames1[missing.inx] <- missing_vector
    }
    taxa_names(mbSetObj$dataSet$taxa_table)<-taxa_names(data.proc)<-mynames1;
    mbSetObj$dataSet$sample_data<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL = TRUE);
    sd_names <- sample_names(mbSetObj$dataSet$sample_data)
    name.match.inx <- sd_names %in% smpl.nms
    
    if(sum(as.numeric(name.match.inx)) == 0){
      AddErrMsg("Issues with sample names in your files! Make sure names are not purely numeric (i.e. 1, 2, 3).");
      return(0);
    }
    mbSetObj$dataSet$proc.phyobj <- merge_phyloseq(data.proc, mbSetObj$dataSet$sample_data, mbSetObj$dataSet$taxa_table);
    
    if(length(rank_names(mbSetObj$dataSet$proc.phyobj)) > 7){
      tax_table(mbSetObj$dataSet$proc.phyobj) <- tax_table(mbSetObj$dataSet$proc.phyobj)[, 1:7]
    }
    #also using unique names for our further data
    prefilt.data <- readDataQs("data.prefilt", module.type, dataName);
    rownames(prefilt.data)<-taxa_names(mbSetObj$dataSet$taxa_table);
    
    saveDataQs(prefilt.data, "data.prefilt", module.type, dataName);
    
  }else if(mbSetObj$module.type == "sdp"| mbSetObj$micDataType =="ko"){
    #constructing phyloseq object for aplha diversity.
    data.proc<-otu_table(data.proc, taxa_are_rows = TRUE);
    taxa_names(data.proc)<-rownames(data.proc);
    
    #sanity check: sample names of both abundance and metadata should match.
    smpl_var<-colnames(mbSetObj$dataSet$sample_data);
    indx<-match(colnames(data.proc), rownames(mbSetObj$dataSet$sample_data));
    
    if(all(is.na(indx))){
      AddErrMsg("Please make sure that sample names are matched between both files.");
      return(0);
    }
    mbSetObj$dataSet$sample_data<-sample_data(mbSetObj$dataSet$sample_data, errorIfNULL=TRUE);
    mbSetObj$dataSet$proc.phyobj <- merge_phyloseq(data.proc, mbSetObj$dataSet$sample_data);
  }
  mbSetObj$dataSet$proc.phyobj@tax_table <- mbSetObj$dataSet$proc.phyobj@tax_table[,!is.na(colnames(mbSetObj$dataSet$proc.phyobj@tax_table))]
  saveDataQs(mbSetObj$dataSet$proc.phyobj, "proc.phyobj.orig", module.type, dataName);
  #do some data cleaning
  mbSetObj$dataSet$data.prefilt <- NULL;
  mbSetObj$dataSet$taxa_type <- taxa_type;
  
  if(module.type == "meta"){
    mbSetObj$dataSet$proc.phyobj <- mbSetObj$dataSet$proc.phyobj;
    mbSetObj$dataSet$norm.phyobj <-mbSetObj$dataSet$proc.phyobj;
  }
  
  return(.set.mbSetObj(mbSetObj));
}


###########################################################################
## Function to make the fake objects for normalized input ##
###########################################################################
### This function is used to create fake objects for
### normalized input to make sure all the function can work properly

CreateFakeFile <- function(mbSetObj,isNormalized="true",isNormalizedMet,module.type){
  
  mbSetObj <- .get.mbSetObj(mbSetObj);
  
  if(isNormalized=="false" & isNormalizedMet=="false"){
    
    AddErrMsg("Please make sure your data has been normalized properly!");
    return(0)
    
  }
  mbSetObj$dataSet$filt.data <- mbSetObj$dataSet$data.orig
  mbSetObj$dataSet$filt.msg <- "No data filtering has been performed since the input data has been transformed."
  
  mbSetObj$dataSet$norm.phyobj <- mbSetObj$dataSet$proc.phyobj
  mbSetObj$dataSet$norm.msg <- "No normalization has been performed since the input data has been transformed."
  
  #make hierarchies
  if(mbSetObj$module.type=="sdp" | mbSetObj$micDataType=="ko"){
    ranks <- "OTU"
  }else if(mbSetObj$module.type=="meta"){ #temporary
    ranks <- "OTU"
  }else{
    ranks <- c(GetMetaTaxaInfo(mbSetObj), "OTU")
    ranks <- unique(ranks)
  } 
  
  
  data.list <- list()
  data.list$merged_obj <- vector(length = length(ranks), "list")
  data.list$count_tables <- vector(length = length(ranks), "list")
  names(data.list$count_tables) <- names(data.list$merged_obj) <- ranks
  
  for(i in 1:length(ranks)){
    phyloseq.obj <- UtilMakePhyloseqObjs(mbSetObj, ranks[i])
    data.list$merged_obj[[i]] <- phyloseq.obj
    count.table <- UtilMakeCountTables(phyloseq.obj, ranks[i])
    data.list$count_tables[[i]] <- count.table
  }
  
  if(exists("current.proc")){
    qs::qsave(data.list,"prescale.phyobj.qs")
    
    current.proc$mic$data.proc<<- data.list$count_tables[["OTU"]]
    
  }
  mbSetObj$dataSet$sample_data$sample_id <- rownames(mbSetObj$dataSet$sample_data);
  
  saveDataQs(data.list, "phyloseq_prenorm_objs.qs",module.type, mbSetObj$dataSet$name);  
  saveDataQs(data.list, "phyloseq_objs.qs",module.type, mbSetObj$dataSet$name);  
  
  return(.set.mbSetObj(mbSetObj));
}


###########################################
## Utilty function to perform edgeR norm ##
###########################################
edgeRnorm = function(x,method){
  # Enforce orientation.
  # See if adding a single observation, 1,
  # everywhere (so not zeros) prevents errors
  # without needing to borrow and modify
  # calcNormFactors (and its dependent functions)
  # It did. This fixed all problems.
  # Can the 1 be reduced to something smaller and still work?
  x = x + 1;
  # Now turn into a DGEList
  y = DGEList(counts=x, remove.zeros=TRUE);
  # Perform edgeR-encoded normalization, using the specified method (...)
  z = edgeR::calcNormFactors(y, method=method);
  # A check that we didn't divide by zero inside `calcNormFactors`
  if( !all(is.finite(z$samples$norm.factors)) ){
    AddErrMsg(paste("Something wrong with edgeR::calcNormFactors on this data, non-finite $norm.factors."));
    return(0);
  }
  return(z)
}


###########################################
##################Scale##################
###########################################
scale_rows = function(x){
  m = apply(x, 1, mean, na.rm = T)
  s = apply(x, 1, sd, na.rm = T)
  return((x - m) / s)
}
scale_mat = function(mat, scale){
  if(!(scale %in% c("none", "row", "column"))){
    stop("scale argument shoud take values: 'none', 'row' or 'column'")
  }
  mat = switch(scale, none = mat, row = scale_rows(mat), column = t(scale_rows(t(mat))))
  return(mat)
}

GetExtendRange<-function(vec, unit=10){
  var.max <- max(vec, na.rm=T);
  var.min <- min(vec, na.rm=T);
  exts <- (var.max - var.min)/unit;
  c(var.min-exts, var.max+exts);
}

unitAutoScale <- function(df){
  df <- as.data.frame(df)
  row.nms <- rownames(df);
  col.nms <- colnames(df);
  df<-apply(df, 2, AutoNorm);
  rownames(df) <- row.nms;
  colnames(df) <- col.nms;
  maxVal <- max(abs(df))
  df<- df/maxVal
  return(df)
}
AutoNorm<-function(x){
  (x - mean(x))/sd(x, na.rm=T);
}

#######################################################################
###Help function for processing metabolomics data##################
#######################################################################

# Limit of detection (1/5 of min for each var)
.replace.by.lod <- function(x){
  lod <- min(x[x>0], na.rm=T)/5;
  x[x==0|is.na(x)] <- lod;
  return(x);
}

ReplaceMissingByLoD <- function(int.mat=data.proc){
  int.mat <- as.matrix(int.mat);
  
  rowNms <- rownames(int.mat);
  colNms <- colnames(int.mat);
  int.mat <- apply(int.mat, 2, .replace.by.lod);
  rownames(int.mat) <- rowNms;
  colnames(int.mat) <- colNms;
  return (int.mat);
}


#'Perform data cleaning
#'@description Cleans data and removes -Inf, Inf, NA, negative and 0s.
#'@param bdata Input data to clean
#'@param removeNA Logical, T to remove NAs, F to not. 
#'@param removeNeg Logical, T to remove negative numbers, F to not. 
#'@param removeConst Logical, T to remove samples/features with 0s, F to not. 
#'@author Jeff Xia\email{jeff.xia@mcgill.ca}
#'McGill University, Canada
#'License: MIT License
#'

CleanData <-function(bdata, removeNA=T, removeNeg=T, removeConst=T){
  
  if(sum(bdata==Inf, na.rm=TRUE)>0){
    inx <- bdata == Inf;
    bdata[inx] <- NA;
    bdata[inx] <- max(bdata, na.rm=T)*2
  }
  if(sum(bdata==-Inf, na.rm=TRUE)>0){
    inx <- bdata == -Inf;
    bdata[inx] <- NA;
    bdata[inx] <- min(bdata, na.rm=T)/2
  }
  if(removeNA){
    if(sum(is.na(bdata))>0){
      bdata[is.na(bdata)] <- min(bdata, na.rm=T)/2
    }
  }
  if(removeNeg){
    if(sum(as.numeric(bdata<=0)) > 0){
      inx <- bdata <= 0;
      bdata[inx] <- NA;
      bdata[inx] <- min(bdata, na.rm=T)/2
    }
  }
  if(removeConst){
    varCol <- apply(data.frame(bdata), 2, var, na.rm=T); # getting an error of dim(X) must have a positive length, fixed by data.frame
    constCol <- (varCol == 0 | is.na(varCol));
    constNum <- sum(constCol, na.rm=T);
    if(constNum > 0){
      bdata <- data.frame(bdata[,!constCol, drop=FALSE], check.names = F); # got an error of incorrect number of dimensions, added drop=FALSE to avoid vector conversion
    }
  }
  bdata;
}

LogNorm<-function(x, min.val){
  log10((x + sqrt(x^2 + min.val^2))/2)
}


# square root, tolerant to negative values
SquareRootNorm<-function(x, min.val){
  ((x + sqrt(x^2 + min.val^2))/2)^(1/2);
}

###########################################
##################Getters##################
###########################################

GetSamplNames<-function(dataName, module.type){
  prefilt.data <- readDataQs("data.prefilt", module.type, dataName);
  return(colnames(prefilt.data));
}

GetSampleNamesaftNorm<-function(mbSetObj, dataName){
  mbSetObj <- .get.mbSetObj(mbSetObj);
  if(mbSetObj$module.type == "meta"){
    if(dataName != ""){
      data <- readDataset(dataName);
    }else{
      data <- qs::qread("merged.data.raw.qs");
      data <- subsetPhyloseqByDataset(mbSetObj, data);
    }
    return(sample_names(data));
  }else{
    return(sample_names(mbSetObj$dataSet$norm.phyobj));
  }
}
xia-lab/MicrobiomeAnalystR documentation built on April 17, 2024, 7:45 p.m.