R/DESeq2_functions.R

Defines functions check_rank_design get_Results_DDS exploratory_plots analysis_DESeq check_terms_matrix check_reduced_LRT get_all_data_DESeq2 filter_signficant_DESEQ relevel_function get_comparison_resultsNames plot_gene_values adjust_samples plot_batch_effect DESeq2_HCGB_function plot_DESeq2_pvalues

Documented in adjust_samples analysis_DESeq check_rank_design check_reduced_LRT check_terms_matrix DESeq2_HCGB_function exploratory_plots filter_signficant_DESEQ get_all_data_DESeq2 get_comparison_resultsNames get_Results_DDS plot_batch_effect plot_DESeq2_pvalues plot_gene_values relevel_function

#' Plot DESeq2 p-values
#'
#' This functions plots pvalues original and after filtering
#' @param pdf_name File path for the pdf to save images
#' @param res Original results (dds object)
#' @param res_filtered Filtered results (dds object)
#' @export
plot_DESeq2_pvalues <- function(pdf_name, res, res_filtered) {
  pdf(pdf_name)
  p1 <- hist(res$pvalue, breaks=100, col="grey50", main = "p-value distribution (No filtered)")
  p2 <- hist(res_filtered$pvalue, breaks=100, col="grey50", main = "p-value distribution (Filtered)")
  
  ### p-value histogram
  use <- res_filtered$baseMean > metadata(res_filtered)$filterThreshold
  h1 <- hist(res_filtered$pvalue[!use], breaks=100, plot=FALSE)
  h2 <- hist(res_filtered$pvalue[use], breaks=100, plot=FALSE)
  colori <- c(`do not pass`="khaki", `pass`="powderblue")
  
  plot(p1)
  plot(p2, add=TRUE, col="powderblue")
  plot(h1, add=TRUE, col="khaki")
  legend("top", legend=c("Filtered low count", "Filtered FDR", "Passed"), fill = c("grey", "khaki", "powderblue"))
  dev.off()
}

#' DESEq2 main function
#'
#' This functions creates the main analysis given a DDS and comparison to test
#' @param dds_object DESeq2 object
#' @param coef_n Coefficient number obtain using resultsName(dds)
#' @param comp_name Name of the comparison
#' @param comp_ID ID tag of the comparison
#' @param numerator Name of the numerator comparison
#' @param denominator Name of the denominator comparison
#' @param OUTPUT_Data_dir Folder path to store results
#' @param df_treatment_Ind Dataframe containing additional information for each sample
#' @param list_of_cols Set of columns with important information in df_treatment_ind 
#' @param threads Number of CPUs to use (Default: 2).
#' @param sign_value.given Adjusted pvlaue cutoff. Default=0.05, 
#' @param LFC.given Log Fold change cutoff. Default=log2(1.2), 
#' @param forceResults Boolean to force re-run analysis if already generated in the folder provided
#' @param gene.annot.df Dataframe containing gene annotation (Default: NULL)
#' @param min_cutoff_to_plot Minimun number of genes significant to continue analysis. Default=3
#' @param max_cutoff_to_plot Number of genes significant to plot as candidates analysis. Default=50
#' @export

DESeq2_HCGB_function = function(dds_object, coef_n, comp_name, comp_ID="comp1",
                                numerator="example1", denominator="example2", 
                                OUTPUT_Data_dir, df_treatment_Ind, list_of_cols, threads=2, 
                                sign_value.given = 0.05, LFC.given = log2(1.2), 
                                min_cutoff_to_plot=3, max_cutoff_to_plot=50,
                                gene.annot.df=NULL, data_type="mRNA",
                                forceResults=FALSE, shrinkage="apeglm") {
  
  #--------------------------
  # Packages
  #--------------------------
  library(DESeq2)
  library(vsn)
  library(EnhancedVolcano)
  library(BiocParallel)
  library(ggfortify)
  library(ggrepel)
  library(pheatmap)
  library(reshape2)
  library(RColorBrewer)
  #--------------------------
  
  #--------------------------
  ## Prepare
  #--------------------------
  file_name <- paste0(comp_ID, "_", comp_name, "_", numerator, "_vs_", denominator)
  
  ## start
  print (paste0("## Starting: ", file_name))
  
  ## Set parallel threads
  print (paste0("Set Multicore: ", as.numeric(threads)))
  register(MulticoreParam(as.numeric(threads)))
  
  ## Create folder
  OUTPUT_Data_sample = file.path(OUTPUT_Data_dir, file_name)
  print (paste0("Create folder: ", OUTPUT_Data_sample))
  dir.create(OUTPUT_Data_sample)
  
  if (forceResults) {
    
  } else {
    ## check if previously done
    if (file.exists(file.path(OUTPUT_Data_sample, "data2return.RData"))) {
      print("Data already available in:")
      print(OUTPUT_Data_sample)
      return(file.path(OUTPUT_Data_sample, "data2return.RData"))
    }
  }
  #--------------------------
  
  #--------------------------
  ## Generate results according to comparison
  #--------------------------
  
  print("+ Using shrinkage estimation provided:")
  print(shrinkage)
  
  ## Compare pvalues distribution
  res <- DESeq2::lfcShrink(dds_object, coef=coef_n, type = shrinkage, parallel = TRUE)
  
  ## filter low counts
  dds_object = HCGB.IGTP.DAnalysis::discard_lowCounts(dds_object = dds_object)
  res_filtered <- DESeq2::lfcShrink(dds_object, coef=coef_n, type=shrinkage, parallel = TRUE)
  
  #--------------------------
  
  #--------------------------
  ### Plot p-values
  #--------------------------
  ## original/filtered
  Pvalues_pdf <- file.path(OUTPUT_Data_sample, paste0(comp_name,"_", numerator, "_vs_", denominator," p-value_distribution.pdf"))
  HCGB.IGTP.DAnalysis::plot_DESeq2_pvalues(Pvalues_pdf, res, res_filtered)
  #--------------------------
  
  #--------------------------
  # Write Results tables
  #--------------------------
  
  ## Merge normalized values and differential expression
  alldata <- merge(as.data.frame(counts(dds_object, normalized=TRUE)), as.data.frame(res_filtered), by="row.names")
  rownames(alldata) <- alldata$Row.names
  alldata$Row.names <- NULL
  #names(alldata)[1] <- "Gene"
  
  if (!is.null(gene.annot.df)) {
    alldata <- merge(gene.annot.df, alldata, by='row.names')
    rownames(alldata) <- alldata$Row.names
    alldata$Row.names <- NULL
  }
  
  Gene <- rownames(alldata)
  alldata <- cbind(Gene, alldata)
  print(head(alldata))
  
  ## Write results
  write.table(alldata, file=file.path(OUTPUT_Data_sample, 
                                      paste0(file_name, "-ResultsCounting_NormValues-and-DE_table.txt")), 
              sep="\t", quote=T, row.names = F)
  
  ##get significant data only with counts in all samples
  sign.data <- filter_signficant_DESEQ(alldata, sign_value = sign_value.given, LFC = LFC.given)
  row.names(sign.data) <- sign.data$Gene
  
  # Results table in the same order than counting table
  write.table(sign.data, 
              file.path(OUTPUT_Data_sample, paste0(file_name, "-ResultsCounting_table_SignificantDE.txt")), 
              sep="\t", quote=T, row.names = F)
  #--------------------------
  
  #--------------------------
  ## Samples
  #--------------------------
  comp_name <- comp_name$category
  #comp_name <- sub("\\.", "-", comp_name)
  
  numerator <- numerator$cmp1
  #numerator <- sub("\\.", "-", numerator)
  
  denominator <- denominator$cmp2
  #denominator <- sub("\\.", "-", denominator)
  
  print("Hi there!")
  alldata2 <- alldata
  
  listOfSampls <- c()
  
  if (denominator=="reference") {
    
    listOfSampls = NULL
    Samplslist <- list()
    
  } else {
    print("Comparison: ")
    print(comp_name)
    
    print("numerator: ")
    print(numerator)
    
    print("denominator: ")
    print(denominator)
    
    print("Samples: ")
    subsheet <- df_treatment_Ind[comp_name]
    colnames(subsheet)[1] <- 'comp_name'
    #print(subsheet)
    
    listOfSampls <- c(rownames(subset(subsheet, comp_name==numerator)),
                      rownames(subset(subsheet, comp_name==denominator)))
    
    print("List of samples in these comparison")
    #print(listOfSampls)
    
    Samplslist <- list(
      numerator = rownames(subset(subsheet, comp_name==numerator)),
      denominator = rownames(subset(subsheet, comp_name==denominator))
    )
    
    print("Samples in numerator")
    print(Samplslist$numerator)
    
    print("Samples in denominator")
    print(Samplslist$denominator)
    
    ## Get data for this samples
    ## 
    
    print("head(alldata)")
    print(head(alldata))
    
    ## add basemean for each category
    alldata2['baseMean_num'] <- rowMeans(alldata[,Samplslist$numerator])
    alldata2['baseMean_den'] <- rowMeans(alldata[,Samplslist$denominator])
    
    ## number of 0s
    alldata2['0counts_num'] <- apply(alldata[,Samplslist$numerator], 
                                     1, function(x) sum(x == 0))
    alldata2['0counts_den'] <- apply(alldata[,Samplslist$denominator], 
                                     1, function(x) sum(x == 0))
    
    ## percentage of counts
    alldata2['0counts.perc_num'] <- round(apply(alldata2[,Samplslist$numerator], 
                                                1, function(x) sum(x == 0))/length(Samplslist$numerator)*100, digits = 2)
    
    alldata2['0counts.perc_den'] <- round(apply(alldata2[,Samplslist$denominator], 
                                                1, function(x) sum(x == 0))/length(Samplslist$denominator)*100, digits = 2)
    
    if (!is.null(gene.annot.df)) {
      
      if (data_type=="mRNA") {
        list_cols <- c("Gene", "ensembl_gene_id", "hgnc_symbol", "description", "gene_biotype")
      
      } else if (data_type=="miRNA") {
        
        list_cols <- c("parent", "ID", "Gene", "variant", "len_isomiR")
        
      } else {
        
      }
      
      list_cols <- c(list_cols,
                     "baseMean", "baseMean_num", 
                     "baseMean_den", "0counts_num", "0counts.perc_num", 
                     "0counts_den", "0counts.perc_den", 
                     "log2FoldChange", "lfcSE", "pvalue", "padj")
      
      
    } else {
      list_cols <- c("Gene", "baseMean", "baseMean_num", 
                     "baseMean_den", "0counts_num", "0counts.perc_num", 
                     "0counts_den", "0counts.perc_den", 
                     "log2FoldChange", "lfcSE", "pvalue", "padj")
      
    }
    
    
    alldata2 <- alldata2[,c(list_cols, Samplslist$numerator, Samplslist$denominator)]
    
    print("head(alldata2)")
    print(head(alldata2))
    
    
    library(openxlsx)
    
    DE.filename <- file.path(OUTPUT_Data_sample, paste0(file_name, "-ResultsCounting.xlsx"))
    sheet_name <- paste0(numerator, "_vs_", denominator)
    title_name <- paste0("Comparison for: ", comp_name, ": ", numerator, " vs. ", denominator)
    
    wb <- openxlsx::createWorkbook()
    openxlsx::addWorksheet(wb, sheet_name)
    openxlsx::writeData(wb, sheet_name, title_name, startRow = 2, startCol=2,rowNames = FALSE, keepNA=TRUE,na.string="NA")
    openxlsx::writeData(wb, sheet_name, alldata2, startRow = 4, startCol=2, rowNames = FALSE, keepNA=TRUE,na.string="NA")
    
    sheet_name2 <- "all.data"
    openxlsx::addWorksheet(wb, sheet_name2)
    openxlsx::writeData(wb, sheet_name2, paste0(title_name, ": all other samples included"), 
                        startRow = 2, startCol=2,rowNames = FALSE, keepNA=TRUE,na.string="NA")
    openxlsx::writeData(wb, sheet_name2, alldata, startRow = 4, startCol=2, rowNames = FALSE, keepNA=TRUE,na.string="NA")
    
    ## add data with all counts
    ##
    openxlsx::saveWorkbook(wb, DE.filename, overwrite = TRUE)
    
  }
  
  
  #--------------------------
  ## check if it is worth to continue, avoid error if missing sign.data
  #--------------------------
  print(length(rownames(sign.data)))
  
  if (length(rownames(sign.data)) < min_cutoff_to_plot) {
    
    data2save<- list(
      "alldata2" = alldata2,
      "alldata" = alldata,
      "Samplslist" = Samplslist,
      #"dds_object" = dds_object,
      #"res"=res,
      "res_filtered" = res_filtered,
      "sign.df"=sign.data,
      "sign.genes"=sign.data$Gene,
      "sign.count"=length(sign.data$Gene),
      "shrinkage.LFC"=shrinkage
    )
    
    ## dump in disk RData
    save(data2save, file=file.path(OUTPUT_Data_sample, "data2return.RData"))
    
    data2return<- list(
      "alldata2" = alldata2,
      "Samplslist" = Samplslist,
      "alldata" = alldata,
      "res_filtered" = res_filtered,
      "sign.df"=sign.data,
      "sign.genes"=sign.data$Gene,
      "sign.count"=length(sign.data$Gene),
      "shrinkage.LFC"=shrinkage
    )
    
    return(data2return)
  } 
  #--------------------------  
  
  #--------------------------
  ## ma plot
  #--------------------------
  pdf(file.path(OUTPUT_Data_sample, "DiffExpression-maplot.jpeg"))
  plotMA(res_filtered)
  dev.off()
  #--------------------------
  
  #--------------------------
  ## volcano
  #--------------------------
  #jpeg(file.path(OUTPUT_Data_sample, paste0(file_name, "_DiffExpression-volcano-plot.jpeg")), 1500, 1000, pointsize=20)
  volcano_main_title <- paste0("Volcano Plot: ", numerator, " vs ", denominator)
  volcan_plot <- EnhancedVolcano::EnhancedVolcano(alldata, x="log2FoldChange", y="padj", lab="",
                                                  pCutoff=sign_value.given, FCcutoff=LFC.given, pointSize=3, labSize=6) + 
    ggplot2::scale_x_continuous() + ggplot2::labs(title = volcano_main_title)
  
  HCGB.IGTP.DAnalysis::save_pdf(folder_path = OUTPUT_Data_sample, 
                                name_file = paste0(file_name, "_DiffExpression-volcano-plot"), plot_given = volcan_plot)
  #--------------------------
  
  #--------------------------
  ## Transform normal
  #--------------------------
  
  ## rlog transformation
  #rld <- DESeq2::rlogTransformation(dds_object)
  
  # variance stabilizing
  vsd <- DESeq2::varianceStabilizingTransformation(dds_object, blind = FALSE)
  
  ## The figure below plots the standard deviation of the transformed data, 
  ## across samples, against the mean, using the shifted logarithm transformation (ntd), 
  ## the regularized log transformation (rld) and the variance stabilizing transformation (vst).
  ## The shifted logarithm has elevated standard deviation in the lower count 
  ## range, and the regularized log to a lesser extent, while for the variance 
  ## stabilized data the standard deviation is roughly constant along the whole
  ## dynamic range.
  ## 
  ## Note that the vertical axis in such plots is the square root of the variance 
  ## over all samples, so including the variance due to the experimental conditions. 
  ## While a flat curve of the square root of variance over the mean may seem like 
  ## the goal of such transformations, this may be unreasonable in the case of 
  ## datasets with many true differences due to the experimental conditions.
  
  ## ntd <- normTransform(dds_object)
  ## pdf(file.path(OUTPUT_Data_sample, paste0(file_name, "_DiffExpression-Transformation.pdf")))
  ## Normal transformation  
  ## meanSdPlot(assay(ntd))
  ## RLE transformation
  ## meanSdPlot(assay(rld))
  ## VarianceStabilization transformation
  ## meanSdPlot(assay(vsd))
  ## dev.off()
  #--------------------------
  
  #--------------------------
  ### Pheatmap top50 DE genes
  #--------------------------
  
  print("Plotting Top significant DE genes with different normalization methods")
  if (is.null(max_cutoff_to_plot)) {
    ## use them all
    select <- rownames(sign.data)
    max_cutoff_to_plot <- length(rownames(sign.data))
  } else {
    select <- rownames(sign.data)[1:max_cutoff_to_plot]
  }
  
  # Plotting Top 50 significant DE genes with different normalization methods:
  select <- select[!is.na(select)] ## discard NA values
  data2pheatmap <- discard_0_counts(countsF = assay(vsd), cutoff = 0.75)
  
  if ( length(select) > 5 ) {
    
    ## plot rld
    #plot1 <- pheatmap(assay(rld)[select,],
    #                  main=paste0("Log Transformation Pheatmap (p.adj<", sign_value.given, " and [LFC]>", LFC.given),
    #                  cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=TRUE, show_colnames = TRUE, legend = TRUE,
    #                  annotation_col = df_treatment_Ind[,list_of_cols],
    #                  color = rev(colorRampPalette(brewer.pal(9, "RdYlBu"))(10)),
    #                  scale="row" ## centered and scale values per row
    #)
    #HCGB.IGTP.DAnalysis::save_pdf(folder_path = OUTPUT_Data_sample,
    #                              name_file = paste0(file_name, "_top50_DEgenes_Heatmap-LogTransformation_allSamples"),
    #                              plot_given = plot1)
    
    ## plot vsd
    plot2 <- try(pheatmap(data2pheatmap[select,],
                      main=paste0("Variance Stabilization Pheatmap (p.adj<", sign_value.given, " and [LFC]>", LFC.given),
                      cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=TRUE, show_colnames = TRUE, legend = TRUE,
                      annotation_col = df_treatment_Ind[,list_of_cols],
                      color = rev(colorRampPalette(brewer.pal(9, "RdYlBu"))(10)),
                      scale="row" ## centered and scale values per row7
    ))
    
    if (!is.null(plot2)) {
      HCGB.IGTP.DAnalysis::save_pdf(folder_path = OUTPUT_Data_sample,
                                  name_file = paste0(file_name, "_top50_DEgenes_Heatmap-VarianceStabiliz_allSamples"),
                                  plot_given = plot2)
    }
    
    
    if (!is.null(listOfSampls)) {
      
      ## Only samples included in comparison
      dataSubset <- try(data2pheatmap[select,listOfSampls], silent = TRUE)
      
      if (exists("dataSubset")) {
        
        print("select:")
        print(select)
        print(head(dataSubset))
        
        ## plot rld
        #plot3 <- pheatmap(dataSubset,
        #                  main=paste0("Log Transformation Pheatmap (p.adj<", sign_value.given, " and [LFC]>", LFC.given),
        #                  cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=TRUE, show_colnames = TRUE, legend = TRUE,
        #                  annotation_col = df_treatment_Ind[,list_of_cols],
        #                  color = rev(colorRampPalette(brewer.pal(9, "RdYlBu"))(10)),
        #                  scale="row" ## centered and scale values per row
        #)
        #save_pdf(folder_path = OUTPUT_Data_sample,
        #         name_file = paste0(file_name, "_top50_DEgenes_Heatmap-LogTransformation"),
        #         plot_given = plot3)
        
        ## plot vsd
        plot4 <- try(pheatmap(dataSubset,
                          main=paste0("Variance Stabilization Pheatmap (p.adj<", sign_value.given, " and [LFC]>", LFC.given),
                          cluster_rows=TRUE, cluster_cols=TRUE, show_rownames=TRUE, show_colnames = TRUE, legend = TRUE,
                          annotation_col = df_treatment_Ind[,list_of_cols],
                          color = rev(colorRampPalette(brewer.pal(9, "RdYlBu"))(10)),
                          scale="row" ## centered and scale values per row7
        ))
        
        if (!is.null(plot4)) {
          
        save_pdf(folder_path = OUTPUT_Data_sample,
                 name_file = paste0(file_name, "_top50_DEgenes_Heatmap-VarianceStabiliz"),
                 plot_given = plot4)
        }
      }
      
    }
    
  }
  #--------------------------
  
  #--------------------------
  ## Add PCA for all significant results
  #--------------------------
  ## create PCA
  ## Add PCA for all significant results
  norm.counts <- as.data.frame(counts(dds_object, normalized=TRUE))
  data2pca <- t(norm.counts[sign.data$Gene,])
  pca_res <- stats::prcomp(as.matrix(data2pca))
  
  pdf(file.path(OUTPUT_Data_sample,"PCA_multiple.pdf"))
  for (i in colnames(df_treatment_Ind[,list_of_cols])) {
    p<-autoplot(pca_res, 
                data=df_treatment_Ind, 
                colour=i) + 
      theme_classic() + 
      ggtitle(paste0("Variable: ", i )) 
    print(p)
  }
  
  p<-autoplot(pca_res,
              data=df_treatment_Ind, 
              colour=i) + 
    geom_text(label=rownames(df_treatment_Ind)) + 
    theme_classic() + ggtitle(paste0("Variable: ", i )) 
  print(p)
  
  dev.off()
  #--------------------------
  
  #--------------------------
  ## Add boxplot for each DE gene
  #--------------------------
  boxplot_DE <- file.path(OUTPUT_Data_sample, "boxplot_DE")
  dir.create(boxplot_DE)
  
  df_treatment_Ind
  
  DE_plots.df <- data.frame(row.names = rownames(df_treatment_Ind), 
                            df_treatment_Ind[,list_of_cols],
                            t(sign.data[,rownames(df_treatment_Ind)]))
  print(DE_plots.df)
  
  ## print only top50
  for (gene_given in head(rownames(sign.data), n=max_cutoff_to_plot)) {
    
    ## 
    g <- gsub("-", "\\.", gene_given)
    g <- gsub("&", "\\.", g)
    g <- gsub(":", "\\.", g)
    g <- gsub("\\+", "\\.", g)
    
    ##
    print(g)
    if (!is.null(gene.annot.df)) {
      gene_annot.df <- gene.annot.df[gene_given,]
      if (data_type=="mRNA") {
        gene_name = paste0(gene_given, "_", gene_annot.df$hgnc_symbol)
      } else if (data_type=="miRNA") {
        gene_name = paste0(gene_annot.df$parent, "_", gene_given)
      }
      print(gene_name)
    } else {
      gene_name = g
    }
      
    pdf(file.path(boxplot_DE, paste0(gene_name, ".pdf")), paper = "A4r", width = 35, height = 12)
    for (i in colnames(df_treatment_Ind[,list_of_cols])) {
      g <- gsub("-", "\\.", g)
      if (is.numeric(df_treatment_Ind[,i])) {
        p2 <- ggscatter_plotRegression(data_all_given = DE_plots.df, 
                                       x.given = g, y.given = i, 
                                       title_string = i)
      } else {
        p2 <- ggboxplot_scatter(data_all_given = DE_plots.df, colName = i, y.coord = g)   
      }
      
      print(p2)
    }
    dev.off()
    
  }
  #--------------------------
  
  ######################################################################
  print ("Finish here for: ")
  print(file_name)
  ######################################################################
  
  ######################################################################
  ## Save
  ######################################################################
  data2save <- list(
    "alldata2" = alldata2,
    "Samplslist" = Samplslist,
    "alldata" = alldata,
    "data2pca" = data2pca,
    #"rld" = rld, ## It takes too much time
    "vsd" = vsd,
    "volcan_plot" = volcan_plot,
    #"dds_object" = dds_object,
    #"res"=res,
    "res_filtered"=res_filtered,
    "sign.df"=sign.data,
    "sign.genes"=sign.data$Gene,
    "sign.count"=length(sign.data$Gene),
    "shrinkage.LFC"=shrinkage,
    "DE_plots.df" = DE_plots.df
  )
  
  ## dump in disk RData
  save(data2save, file=file.path(OUTPUT_Data_sample, "data2return.RData"))
  
  data2return <- list(
    "alldata2" = alldata2,
    "Samplslist" = Samplslist,
    "alldata" = alldata,
    "res_filtered"=res_filtered,
    "sign.df"=sign.data,
    "sign.genes"=sign.data$Gene,
    "sign.count"=length(sign.data$Gene),
    "shrinkage.LFC"=shrinkage,
    "DE_plots.df"=DE_plots.df
  )
  ######################################################################
  return(data2return)  
  
}


#' Plot batch effect
#'
#' This functions plots original PCA and batch corrected given two variables and a putative batch variable
#' @param var1 DESeq2 object
#' @param var2 DESeq2 object
#' @param dds_object DESeq2 object
#' @param dirName Folder path to store results
#' @param batch_var Putative batch variable
#' @export
plot_batch_effect <- function(var1, var2, dds_object, dirName, batch_var) {
  ## remove batch effect?
  ## http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html
  ## Why after VST are there still batches in the PCA plot?
  
  ## Mickael Love:
  ## DESeq() only takes as input the original counts, and this is on purpose. 
  ## This is the optimal statistical approach. To account for batch, you put 
  ## the variables at the beginning of the design, e.g. ~batch + condition 
  
  ## plot batch effect
  vsd_Test <- varianceStabilizingTransformation(dds_object, blind = FALSE)
  vsd_Test1 <- vsd_Test
  
  mat_Test <- assay(vsd_Test)
  mat_Test <- limma::removeBatchEffect(mat_Test, vsd_Test[[batch_var]])
  assay(vsd_Test) <- mat_Test
  
  p1 <- plotPCA(vsd_Test1, intgroup=var1)
  p2 <- plotPCA(vsd_Test, intgroup=var1)
  p3 <- plotPCA(vsd_Test1, intgroup=var2)
  p4 <- plotPCA(vsd_Test, intgroup=var2)
  
  pdf(paste0(dirName, "BatchEffect.pdf"))
  par(mfrow=c(1,2))
  print(p1)
  print(p2)
  print(p3)
  print(p4)
  dev.off()
  ##
}

#' Adjust sample names
#'
#' Adjust samples between sample sheet and files for DESeq2
#' @param counts Expression counts. Samples as columns
#' @param target Phenotypic information. Samples as rows
#' @export
adjust_samples <- function(counts, target){
  
  ## adjust samples in data and in sample sheet
  counts <- counts[, sort(colnames(counts)) ]
  counts <- counts[, colnames(counts) %in% rownames(target) ]
  
  print ("** Samples in counts:")
  print (colnames(counts))
  print ("** Samples in target:")
  print (rownames(target))
  
  print ("** Adjusting...")
  ##
  logical_vec <- rownames(target) %in% colnames(counts)
  target <- target[logical_vec,]
  counts <- counts[, rownames(target) ]
  
  print ("** Match:")
  print (match(rownames(target), colnames(counts)))
  print (rownames(target) == colnames(counts))
  
  list2return <- list("counts" = counts, 
                      "target" = target)
  
  return(list2return)
  
}

#' Plot values per genes
#' 
#' Plot DESeq2 normalization values using ggplot2
#' @param gene Gene ID
#' @param tableCounts Expression counts. Samples as columns
#' @param targetsFile Phenothypic information. Sampes as rows.
#' @param condition List of phenotypic condition to retrieve in targestFile
#' @param out_folder Out folder to create plot
#' @export
plot_gene_values <- function(gene, tableCounts, targetsFile, condition, out_folder, print2file=TRUE) {
  library(ggplot2)
  library(ggpubr)
  library(data.table)
  
  gene_table <- tableCounts[tableCounts$Gene==gene,]
  print(gene_table)
  
  ## check if DE table and contains Gene, pvalue, etc
  gene_table <- dplyr::select(gene_table, -c("Gene", "baseMean", "log2FoldChange", "lfcSE","pvalue","padj"))
  print(gene_table)
  
  
  ## melt data
  gene_table_long <- melt(gene_table,variable.name="sample", value.name="Count")
  rownames(gene_table_long) <- gene_table_long$sample
  print(gene_table_long)
  
  ## get annotation
  print(class(condition))
  print(condition)
  annot_info <- targetsFile[rownames(gene_table_long), condition]
  print(annot)
  
  test <- merge(gene_table_long, annot_info, by.x='sample', by.y="Sample_Name")
  print(test)
  
  ## check if multiple conditions
  x.var = rlang::sym(condition[1])
  y.var = rlang::sym(condition[2])
  
  p <- ggplot(test, aes(x = !! x.var, y = Count, fill= !! y.var)) + 
    geom_boxplot() + geom_point(position=position_jitterdodge(),alpha=0.3)
  
  if (print2file) {
    ## save plot
    save_pdf(folder_path = out_folder, name_file = paste0('boxplot_gene-', gene), plot_given = p)
  }
  return(p)
}

#' Get names provided in comparison
#' 
#' When running DESeq2 you usually get names from resultsNames() such as var_comp1_vs_comp2 e.g. Sex_male_vs_female. This functions returns the name of the variables and the comparison studied.
#' @param str_given A string with the comparison. E.g. Sex_male_vs_female
#' @export
get_comparison_resultsNames <- function(str_given) {
  list_produced <- unlist(strsplit(str_given, split="_")) 
  
  str2return <- list(
    "category" = NULL,
    "cmp1" = NULL,
    "cmp2" = NULL
  )
  
  if ("vs" %in% list_produced) {
    if (list_produced[3] == "vs") {
      # category comp1 vs comp2
      str2return$category = list_produced[1]
      str2return$cmp1 = list_produced[2]
      str2return$cmp2 = list_produced[4]
    } else {
      vs_index <- as.numeric(match("vs", list_produced))
      len_given <- length(list_produced[vs_index:length(list_produced)])
      if (len_given == 2) {
        str2return$category = paste0(list_produced[1], "_", list_produced[2])
        str2return$cmp1 = list_produced[3]
        str2return$cmp2 = list_produced[5]
      } else if (len_given == 3) {
        str2return$category = list_produced[1]
        str2return$cmp1 = paste0(list_produced[2], "_", list_produced[3])
        str2return$cmp2 = paste0(list_produced[5], "_", list_produced[6])
      }
    }
  } else {
    print("Interaction term selected:")
    print(str_given)
    
    str2return$category = "interaction"
    str2return$cmp1 = str_given
    str2return$cmp2 = "reference"
    
  }
  
  return(str2return)
}


#' Relevel and rung DESEQ2 analysis
#' 
#' When running DESeq2 you sometimes require to relevel some comparisons
#' @param dds_object DESeq2 object
#' @param category Name of the variable to use within dds_object metadata
#' @param reference Name of the reference class to set.
#' @param given_dir Output dir to use
#' @param dfAnnotation Dataframe containing additional information for each sample
#' @param int_threads Number of CPUs to use (Default: 2)
#' @param sign_value.given Adjusted pvlaue cutoff. Default=0.05, 
#' @param LFC.given Log Fold change cutoff. Default=log2(1.2), 
#' @param forceResults Boolean to force re-run analysis if already generated in the folder provided
#' @param localFit Use a fitType=local for mean dispersion fit in DESeq2
#' @param shrinkage.given LFC shrinkage estimator provided. Available: apeglm, ashr or normal
#' @export
relevel_function <- function(dds_object, category, reference, 
                             given_dir, dfAnnotation, list_of_cols, 
                             data_type="mRNA", gene.annot.df.given=NULL,
                             int_threads=2, sign_value.given = 0.05, LFC.given = log2(1.2), 
                             comp_ID.given="comp1", forceResults=FALSE, localFit=FALSE, shrinkage.given='apeglm'){
  ## relevel
  dds_object[[category]] <- relevel(dds_object[[category]], ref=reference)
  
  ## re-run dispersion fit
  if (localFit) {
    dds_object_releveled <- DESeq(dds_object, parallel = TRUE, fitType = "local")  
  } else {
    dds_object_releveled <- DESeq(dds_object, parallel = TRUE)
  }
  
  # check
  print("resultsNames(dds_object)")
  print(resultsNames(dds_object))
  
  print("resultsNames(dds_object_releveled)")
  print(resultsNames(dds_object_releveled))
  
  results_list <- list()
  
  for (coef_name in resultsNames(dds_object_releveled)) {
    if (coef_name=="Intercept") {} else {
      print(paste0(" + Analysis for coefficient: ", coef_name))
      listNames <- get_comparison_resultsNames(coef_name)
      
      res_dds = DESeq2_HCGB_function( 
        dds_object = dds_object_releveled, 
        coef_n = coef_name, comp_ID=paste0("relevel_", comp_ID.given), 
        comp_name = listNames[1], 
        numerator = listNames[2], denominator = listNames[3],
        OUTPUT_Data_dir = given_dir, df_treatment_Ind = dfAnnotation, list_of_cols = list_of_cols,
        sign_value.given = sign_value.given, LFC.given = LFC.given,
        threads = as.numeric(int_threads), 
        gene.annot.df = gene.annot.df.given, data_type = data_type,
        forceResults = forceResults, shrinkage = shrinkage.given)
      
      ## save to return
      #print (head(res_dds))
      results_list[[coef_name]] <- res_dds
    }
  }
  
  ## Init data to return
  data2return <- list(
    "dds_obj" = dds_object_releveled,
    "resultsNames" = resultsNames(dds_object_releveled),
    "results" = results_list
  )
  
  return(data2return)
}

#' Filter significant hits from DESEQ2 analysis
#' 
#' When running DESeq2 you usually required significant hits.
#' @param dataF Dataframe with either normalized values and DESEQ2 values or only DESEQ2.
#' @param sign_value Pvalue adjusted cutoff: Default=0.05
#' @param LFC Log Fold Change cutoff: Default: 0.26
#' @export
filter_signficant_DESEQ <- function(dataF, sign_value = 0.05, LFC=0.26) {
  
  #log2FoldChange
  #padj
  dataFilt <- subset(dataF, abs(log2FoldChange)>LFC & padj<sign_value)
  dataFilt <- dataFilt[order(dataFilt$padj),]
  return(dataFilt)
}

#' Get results from DESeq2 and normalized data
#' 
#' When running DESeq2 you usually require to get all statiscal results and normalized data
#' @param dds_obj DESeq2 object (DESeqDataSet)
#' @param coef_n Name of the coefficient obtain from DESeq2::resultsNames(dds_obj))
#' @param type By default RNA is expected and genes, either EntrezID or ENSEMBL ID is used as ID. You can specify XICRA if miRNA is provided and Gene is the combination of miRNA, variant and isomir (e.g hsa-let-7a-2-3p&iso_add3p:1&iso-23-NLJ18XQZD2). If type XICRA provided, columns is splitted into three new columns (parent, variant and UID)
#' @export
get_all_data_DESeq2 <- function(dds_obj, coef_n, type="DESeq2") {
  #res <- DESeq2::results(dds_object, name = resultsNames(dds_object)[coef_n])
  res <- DESeq2::results(dds_obj, name = coef_n)
  
  ## Merge normalized values and differential expression
  alldata <- merge(as.data.frame(counts(dds_obj, normalized=TRUE)), as.data.frame(res), by="row.names", sort=FALSE)
  names(alldata)[1] <- "Gene"
  
  if (type=="XICRA") {
    alldata <- tidyr::separate(alldata, Gene, c('parent', 'variant', 'UID'), sep = '&', remove=FALSE)  
  }
  return(alldata)
}


#' Check the effect of reducing one term from a DESEQ2 design
#' 
#' Check reducing effect using LRT method.
#' @param dds_obj.given DESeq2 object (DESeqDataSet)
#' @param formula_given Formula to substract for formula in dds_obj.given
#' @param comp.folder.given Absolute path to store results
#' @param compID.given Tag name to include for each comparison
#' @param dfAnnotation.given Dataframe with useful metadata to include
#' @param forceResults Boolean to force re-run analysis if already generated in the folder provided
#' @param shrinkage.given LFC shrinkage estimator provided. Available: apeglm, ashr or normal
#' @export
check_reduced_LRT <- function(dds_obj.given, formula_given, 
                              comp.folder.given, compID.given, 
                              dfAnnotation.given,  list_of_cols, int_threads=2, 
                              sign_value.given=0.05, LFC.given = log2(1.2),
                              gene.annot=NULL, forceResults=FALSE, shrinkage.given="apeglm") {
  
  ## LRT: Check reduction
  DEseq.red <- DESeq2::DESeq(object = dds_obj.given, test="LRT", 
                             reduced=as.formula(formula_given))
  
  print(length(resultsNames(DEseq.red)))
  term2use <- tail(resultsNames(DEseq.red), 1)
  print(term2use)
  
  Resultsnames2use <- HCGB.IGTP.DAnalysis::get_comparison_resultsNames(term2use)
  print(Resultsnames2use)
  
  ## get for all 
  DEseq.red.res <- get_Results_DDS(dds_object = DEseq.red, 
                                   OUTPUT_Data_dir_given = comp.folder.given, 
                                   comp_ID = compID.given,
                                   dfAnnotation = dfAnnotation.given, list_of_cols = list_of_cols,
                                   int_threads = int_threads, 
                                   forceResults=forceResults,   
                                   sign_value.given = sign_value.given, 
                                   LFC.given = LFC.given, 
                                   gene.annot=gene.annot, shrinkage.given=shrinkage.given)
  
  return(DEseq.red.res)
}

#' Check the effect of variables in matrix design
#' 
#' When running DESeq2 you usually add multiple terms to the matrix design. Test the effect of them
#' @param sampleSheet.given Samplesheet containing metadata information
#' @param countsGiven Dataframe/matrix of counts
#' @param list.terms List of terms from samplesheet to include in design matrix
#' @param red.formula.given Design formula to use as naive and in reduction
#' @param compID.given Tag name to include for each comparison
#' @param comp.folder.given Absolute path to store results
#' @param int_threads Number of threads to use
#' @param shrinkage.given LFC shrinkage estimator provided. Available: apeglm, ashr or normal
#' @export
check_terms_matrix <- function(sampleSheet.given, countsGiven, list.terms, red.formula.given, list_of_cols,
                               compID.given, comp.folder.given, 
                               int_threads=2, gene.annot.df=NULL, shrinkage.given="apeglm") {
  
  resulst_list <- list()
  
  ##--------------------------
  ## naive
  ##--------------------------
  print("++++++++++++++++++++++++++++")
  print("Naive")
  print("++++++++++++++++++++++++++++")
  print("Formula: ")
  print(paste0("~", red.formula.given))
  
  naive_res <- HCGB.IGTP.DAnalysis::analysis_DESeq(OUTPUT_Data_dir_given = comp.folder.given,
                                                   count_table = countsGiven, 
                                                   sample_sheet_given = sampleSheet.given, 
                                                   int_threads = int_threads,  
                                                   gene.annot=gene.annot.df,
                                                   list_of_cols=list_of_cols,
                                                   formula_given = as.formula(paste0("~", red.formula.given)), 
                                                   early_return = FALSE, comp_ID = paste0(compID.given, ".naive"), shrinkage.given=shrinkage.given)
  
  ## Save only the path
  save(naive_res, file = file.path(comp.folder.given, "naive.RData"))
  naive_res <- NULL
  resulst_list[['naive']] = file.path(comp.folder.given, "naive.RData")
  
  print("##################")
  ##--------------------------
  
  ##--------------------------
  ## add terms: addition
  ##--------------------------
  print("++++++++++++++++++++++++++++")
  print("Test addition and interaction")
  print("++++++++++++++++++++++++++++")
  
  for (term in list.terms) {
    
    print("##################")
    print("Testing the effect of:")
    print(term)
    
    comp_ID.here <- paste0(compID.given, "_", term)
    
    print("Testing the effect of addittion:")
    print("Formula: ")
    print(paste0("~", term, "+", red.formula.given))
    
    this.term.res.add <- try(HCGB.IGTP.DAnalysis::analysis_DESeq(OUTPUT_Data_dir_given = comp.folder.given,
                                                                 count_table = countsGiven, 
                                                                 sample_sheet_given = sampleSheet.given, 
                                                                 int_threads = int_threads, 
                                                                 formula_given = as.formula(paste0("~", term, "+", red.formula.given)), 
                                                                 early_return = FALSE, 
                                                                 gene.annot=gene.annot.df,
                                                                 list_of_cols=list_of_cols,
                                                                 comp_ID = paste0(comp_ID.here, ".add"), shrinkage.given=shrinkage.given))
    ## Save only the path
    save(this.term.res.add, file = file.path(comp.folder.given, paste0(term, ".add.RData")))
    resulst_list[[paste0(term, ".add")]] = file.path(comp.folder.given, paste0(term, ".add.RData"))
    
    print("##################")
    print("Testing the effect of interaction:")
    print("Formula: ")
    print(paste0("~", term, ":", red.formula.given))
    
    this.term.res.int <- try(HCGB.IGTP.DAnalysis::analysis_DESeq(OUTPUT_Data_dir_given = comp.folder.given,
                                                                 count_table = countsGiven, 
                                                                 sample_sheet_given = sampleSheet.given, 
                                                                 int_threads = int_threads, 
                                                                 formula_given = as.formula(paste0("~", term, ":", red.formula.given)), 
                                                                 early_return = FALSE, 
                                                                 gene.annot=gene.annot.df,
                                                                 list_of_cols=list_of_cols,
                                                                 comp_ID = paste0(comp_ID.here, ".int"), shrinkage.given=shrinkage.given))
    ## Save only the path
    save(this.term.res.int, file = file.path(comp.folder.given, paste0(term, ".int.RData")))
    resulst_list[[paste0(term, ".int")]] = file.path(comp.folder.given, paste0(term, ".int.RData"))
    
    print("##################")
    print("Testing the effect of reducing:")
    print(term)
    print("Formula: ")
    print(paste0("~", term, ":", red.formula.given, " vs. ~", red.formula.given))
    
    ## Check reduction
    red.res = try(HCGB.IGTP.DAnalysis::check_reduced_LRT(dds_obj.given = this.term.res.int$dds_obj, 
                                                         formula_given = as.formula(paste0("~", red.formula.given)),
                                                         comp.folder.given = comp.folder.given, 
                                                         gene.annot=gene.annot.df, 
                                                         list_of_cols=list_of_cols,
                                                         compID.given = paste0(comp_ID.here, ".red"), 
                                                         dfAnnotation.given = sampleSheet.given, shrinkage.given=shrinkage.given))
    ## save the path
    save(red.res, file = file.path(comp.folder.given, paste0(term, ".red.RData")))
    resulst_list[[paste0(term, ".red")]] = file.path(comp.folder.given, paste0(term, ".red.RData"))
    
    red.res <- NULL
    this.term.res.int <- NULL
    this.term.res.add <- NULL
  }
  ##--------------------------
  
  ##--------------------------
  ## complex
  ##--------------------------
  ##
  print("++++++++++++++++++++++++++++")
  print("Complex:")
  print("++++++++++++++++++++++++++++")
  
  print("Formula: ")
  print(paste0("~", paste(list.terms, "+ ", collapse = ""), red.formula.given))
  
  complex_res = analysis_DESeq(OUTPUT_Data_dir_given = comp.folder.given,
                               count_table = countsGiven, sample_sheet_given = sampleSheet.given, 
                               int_threads = int_threads, gene.annot=gene.annot.df,
                               list_of_cols=list_of_cols,
                               formula_given = as.formula(paste0("~", 
                                                                 paste(list.terms, "+ ", 
                                                                       collapse = ""), red.formula.given)), 
                               early_return = FALSE, 
                               comp_ID = paste0(compID.given, ".complex"), shrinkage.given=shrinkage.given)
  
  ## save the path
  save(complex_res, file = file.path(comp.folder.given, "complex.RData"))
  resulst_list[["complex"]] = file.path(comp.folder.given, "complex.RData")
  
  ##--------------------------
  
  print("##################")
  
  ##
  
  return(resulst_list)
}


#' DESEQ2 analysis pipeline
#' 
#' When running DESeq2 you usually add multiple terms to the matrix design. Test the effect of them
#' @param sample_sheet_given Samplesheet containing metadata information
#' @param count_table Dataframe/matrix of counts
#' @param OUTPUT_Data_dir_given Absolute path to store results
#' @param int_threads Number of threads to use
#' @param formula_given Design formula to use
#' @param coef_n Number of the coefficient of results to test (if desired)
#' @param early_return Whether to return exploratory results early or not
#' @param comp_ID Tag name to include for each comparison
#' @param cutoff.given add an option to include cutoff when removing Zeros
#' @param sign_value.given Adjusted pvalue cutoff. Default=0.05, 
#' @param LFC.given Log Fold change cutoff. Default=log2(1.2), 
#' @param localFit Use a fitType=local for mean dispersion fit in DESeq2
#' @param forceResults Boolean to force re-run analysis if already generated in the folder provided
#' @param gene.annot Dataframe containing gene annotation (Default: NULL)
#' @param shrinkage.given LFC shrinkage estimator provided. Available: apeglm, ashr or normal
#' @param min_cutoff_to_plot Minimun number of genes significant to continue analysis. Default=3
#' @param max_cutoff_to_plot Number of genes significant to plot as candidates analysis. Default=50
#' @export
analysis_DESeq <- function(OUTPUT_Data_dir_given, count_table, sample_sheet_given, 
                           list_of_cols, formula_given, int_threads=2,
                           sign_value.given = 0.05, LFC.given = log2(1.2),
                           coef_n=NA, early_return=FALSE, comp_ID=NULL, cutoff.given=0.9, 
                           localFit=FALSE, forceResults=FALSE, min_cutoff_to_plot=3, max_cutoff_to_plot=50,
                           gene.annot=NULL, data_type = "mRNA",
                           shrinkage.given="apeglm") {

  #--------------------------
  # Packages
  #--------------------------
  library(DESeq2)
  library(vsn)
  library(EnhancedVolcano)
  library(BiocParallel)
  library(ggfortify)
  library(ggrepel)
  library(pheatmap)
  library(reshape2)
  library(RColorBrewer)
  #--------------------------
  
  dir.create(OUTPUT_Data_dir_given, showWarnings = FALSE)
  
  #############
  ## Create list object for DESeq: remove 0 values
  #############
  data_DESeq <- list(
    "counts"=HCGB.IGTP.DAnalysis::discard_0_counts(countsF = count_table, cutoff = cutoff.given),
    "target"=sample_sheet_given
  )
  data_DESeq <- adjust_samples(data_DESeq$counts, data_DESeq$target)
  
  
  ## Set parallel threads
  print (paste0("Set Multicore: ", int_threads))
  register(MulticoreParam(int_threads))
  #############
  
  #############
  ## Design ###
  #############
  ddsFullCountTable <- DESeq2::DESeqDataSetFromMatrix(
    countData = data_DESeq$counts,
    colData = data_DESeq$target, design = as.formula(formula_given) )
  
  if (localFit) {
    print("fitType = 'local'")
    dds_object <- DESeq2::DESeq(ddsFullCountTable, parallel = TRUE, fitType = "local")
  } else {
    print("fitType = 'parametric'")
    dds_object <- DESeq2::DESeq(ddsFullCountTable, parallel = TRUE)  
  }
  
  ## check
  print("resultsNames(dds_object)")
  print(resultsNames(dds_object))
  
  #############
  ## exploratory dds_object
  #############
  print("Exploratory plots")
  exploratory_plots_dir <- file.path(OUTPUT_Data_dir_given, paste0(comp_ID, "_exploratory_plots"))
  dir.create(exploratory_plots_dir, showWarnings = FALSE)
  
  exploratory_plots_returned <- exploratory_plots(dds_object.exp = dds_object, 
                                                  OUTPUT_dir = exploratory_plots_dir, 
                                                 dfAnnotation_df = sample_sheet_given, 
                                                 list_of_cols = list_of_cols)
  print('Out Exploratory plots')
  #############
  
  if (early_return) {
    return(list("dds_object"=dds_object, 
                "exploratory_plots" = exploratory_plots_returned,
                "resultsNames" = resultsNames(dds_object)
    ))
  }
  #############
  
  ############ 
  # Norm.counts
  ############
  # Save normalized values
  # Normalized values
  dds_object1 = HCGB.IGTP.DAnalysis::discard_lowCounts(dds_object = dds_object)
  
  normValues <- counts(dds_object1, normalized=T)
  print(head(normValues))
  
  if (!is.null(gene.annot)) {
    normValues <- merge(gene.annot, normValues, by='row.names')
    print(head(normValues))
    rownames(normValues) <- normValues$Row.names
    normValues$Row.names <- NULL
    print("## Add annotation")
    print(head(normValues))
  }
  
  Gene <- rownames(normValues)
  normValues <- cbind(Gene, normValues)
  write.table(normValues, file.path(OUTPUT_Data_dir_given, "NormValues_table.txt"), sep="\t", row.names=F, col.names=T, quote=T)
  
  ###########
  # Get results
  #############
  results_list = get_Results_DDS(dds_object = dds_object, 
                                 OUTPUT_Data_dir_given = OUTPUT_Data_dir_given, 
                                 dfAnnotation = sample_sheet_given, list_of_cols, comp_ID = comp_ID, 
                                 sign_value.given = sign_value.given, LFC.given = LFC.given, 
                                 min_cutoff_to_plot=min_cutoff_to_plot, max_cutoff_to_plot=max_cutoff_to_plot,
                                 gene.annot=gene.annot, data_type = data_type,
                                 int_threads = int_threads, coef_n = coef_n, forceResults=forceResults, shrinkage.given=shrinkage.given)
  #############
  
  #############
  ## Init data to return
  #############
  data2return <- list(
    "dds_obj" = dds_object,
    "exploratory_plots" = exploratory_plots_returned,
    "resultsNames" = resultsNames(dds_object),
    "dataDESeq" = data_DESeq,
    "formula" = formula_given,
    "results" = results_list,
    "normValues" = normValues
  )
  
  return(data2return)
}


#' Exploratory plots for DESEQ2 analysis
#' 
#' When running DESeq2 you usually add multiple terms to the matrix design. Test the effect of them
#' @param dds_object.exp DESeq2 object (DESeqDataSet)
#' @param OUTPUT_dir Absolute path to store results
#' @param dfAnnotation_df Dataframe with useful metadata to include
#' @param list_of_cols List of columns of interest to subset from metadata
#' @param forceResults Boolean to force re-run analysis if already generated in the folder provided
#' @export
exploratory_plots <- function(dds_object.exp, OUTPUT_dir, dfAnnotation_df, list_of_cols, forceResults=FALSE){
  
  library(reshape2)
  library(RColorBrewer)
  
  if (forceResults) {
    
  } else {
    ## check if previously done
    if (file.exists(file.path(OUTPUT_dir, "exploratory.RData"))) {
      print("Data already available in:")
      print(OUTPUT_dir)
      return(file.path(OUTPUT_dir, "exploratory.RData"))
    }
  }
  
  ############################
  # Exploratory plots 
  ############################
  
  print('Inside exploratory plots')
  
  ## Create plot for different dispersions and return value of best fit
  
  # Dispersion plot 
  plotDisp <- plotDispEsts(dds_object.exp)
  jpeg(file.path(OUTPUT_dir, "general_dispersion_plot.jpeg"))
  print(plotDisp)
  dev.off()
  
  # Top 50 genes:
  select <- order(rowMeans(counts(dds_object.exp,normalized=TRUE)),decreasing=TRUE)[1:50]
  vsd <- varianceStabilizingTransformation(dds_object.exp, blind = FALSE)
  
  top50heatmap <- pheatmap(assay(vsd)[select,], annotation = dfAnnotation_df[,list_of_cols])
  save_pdf(folder_path = OUTPUT_dir, name_file = "top50_heatmap",
           plot_given = top50heatmap)
  
  ## ape library
  # Clustering:
  # Get sample-to-sample distances
  distsRL <- dist(t(assay(vsd)))
  hc <- hclust(distsRL, method="average")
  
  # Clustering:
  ## ape library
  #pdf(file.path(OUTPUT_dir,"clustering.pdf"))
  #phylo_plot <- plot.phylo(as.phylo(hc), 
  #                         tip.color = HCGB.IGTP.DAnalysis::create_col_palette(dfAnnotation_df$condition, 
  #                                                                             levels(dfAnnotation_df$condition), 
  #                                                                             palette_given = "Set1"),
  #                         direction = "downwards", 
  #                         srt = 180, adj = 1, 
  #                         main=paste("Correlation-based clustering"),)
  #dev.off()
  
  # Get sample-to-sample distances
  distsRL <- dist(t(assay(vsd)))
  mat <- as.matrix(distsRL)
  
  sampleDist <- pheatmap(mat, annotation = dfAnnotation_df[,list_of_cols])
  save_pdf(folder_path = OUTPUT_dir, name_file = "heatmap_samplesDistance",
           plot_given = sampleDist)
  # 
  # ## PCA
  pdf(file.path(OUTPUT_dir,"PCA_multiple.pdf"), paper = "A4r", width = 35, height = 12)
  list_pca <- list()
  for (gr in colnames(dfAnnotation_df)) {
    print(paste0("Printing PCA for ", gr))
    plt_pca <- plotPCA(vsd, intgroup=gr) + ggtitle(gr) + 
      ggrepel::geom_text_repel(label=rownames(dfAnnotation_df)) + theme_classic()
    list_pca[[gr]] <- plt_pca
    print(plt_pca)
  }
  dev.off()
  # 
  
  #PCA_data <- plotPCA(vsd, returnData=TRUE)
  
  ### cooks distance
  df.cooks <- as.data.frame(log10(assays(dds_object.exp)[["cooks"]])) %>% melt()
  
  ## return
  plots2return <- list(
    "plotDisp" = plotDisp,
    "top50heatmap" = top50heatmap,
    #"phylo_plot" = phylo_plot,
    "sampleDist" = sampleDist,
    "PCA" = list(
      "PCA_data" = vsd,
      "PCA_list" = list_pca
    ),
    "cooks.data" = df.cooks
  )
  
  save(plots2return, file = file.path(OUTPUT_dir, "exploratory.RData"))
  
  return(plots2return)
  
}


#' Generate results given a DDS object
#' 
#' When running DESeq2 you usually add multiple terms to the matrix design. Test the effect of them
#' @param dds_object DESeq2 object (DESeqDataSet)
#' @param OUTPUT_Data_dir_given Absolute path to store results
#' @param dfAnnotation Dataframe with useful metadata to include
#' @param comp_ID Tag name to include for each comparison
#' @param int_threads Number of threads to use in the analysis
#' @param coef_n Number of the coefficient of results to test (if desired)
#' @param sign_value.given Adjusted pvlaue cutoff. Default=0.05, 
#' @param LFC.given Log Fold change cutoff. Default=log2(1.2), 
#' @param forceResults Boolean to force re-run analysis if already generated in the folder provided
#' @param gene.annot Dataframe containing gene annotation (Default: NULL)
#' @param shrinkage.given LFC shrinkage estimator provided. Available: apeglm, ashr or normal
#' @param min_cutoff_to_plot Minimun number of genes significant to continue analysis. Default=3
#' @param max_cutoff_to_plot Number of genes significant to plot as candidates analysis. Default=50

#' @export
get_Results_DDS <- function(dds_object, OUTPUT_Data_dir_given, dfAnnotation, list_of_cols, comp_ID,
                            sign_value.given = 0.05, LFC.given = log2(1.2),
                            int_threads=2, coef_n=NA, forceResults=FALSE, 
                            min_cutoff_to_plot=3, max_cutoff_to_plot=50,
                            gene.annot=NULL, data_type = "mRNA", shrinkage.given='apeglm') {

  ###########
  # Get results
  #############
  
  results_list <- list()
  if (is.numeric(coef_n)) {
    listNames <- get_comparison_resultsNames(resultsNames(dds_object)[coef_n])
    print(listNames)
    print(paste0(" + Analysis for coefficient given: ", as.character(coef_n)))
    res_dds = try(DESeq2_HCGB_function(
      dds_object = dds_object, coef_n = coef_n, comp_name = listNames[1], comp_ID = comp_ID,
      numerator = listNames[2], denominator = listNames[3],
      OUTPUT_Data_dir = OUTPUT_Data_dir_given, df_treatment_Ind = dfAnnotation,  
      list_of_cols = list_of_cols,
      sign_value.given = sign_value.given, LFC.given = LFC.given,
      min_cutoff_to_plot=min_cutoff_to_plot, max_cutoff_to_plot=max_cutoff_to_plot,
      threads = as.numeric(int_threads), forceResults=forceResults, 
      gene.annot.df = gene.annot, data_type = data_type, shrinkage=shrinkage.given))

    ## save to return
    coef_name = as.character(resultsNames(dds_object)[coef_n])
    results_list[[coef_name]] = res_dds
    
  } else {
    
    for (coef_name in resultsNames(dds_object)) {
      if (coef_name=="Intercept") {} else {
        print(paste0(" + Analysis for coefficient: ", coef_name))
        listNames <- get_comparison_resultsNames(coef_name)
        
        print(listNames)
        
        res_dds = try(DESeq2_HCGB_function(
          dds_object = dds_object, coef_n = coef_name, comp_ID = comp_ID,
          comp_name = listNames[1], numerator = listNames[2], denominator = listNames[3],
          OUTPUT_Data_dir = OUTPUT_Data_dir_given, df_treatment_Ind = dfAnnotation, list_of_cols = list_of_cols,
          sign_value.given = sign_value.given, LFC.given = LFC.given,
          min_cutoff_to_plot=min_cutoff_to_plot, max_cutoff_to_plot=max_cutoff_to_plot,
          threads = as.numeric(int_threads), forceResults=forceResults, 
          gene.annot=gene.annot, data_type = data_type, shrinkage=shrinkage.given))

        ## save results
        results_list[[coef_name]] = res_dds
      }
    }
    
  }
  #############
  
  return(results_list)
}


#' Check the rank of a design matrix
#' 
#' When running DESeq2 you need a design matrix, check the rank of it first
#' @param formula2test String with formula to check
#' @param data.df Sample sheet dataframe
#' @export
check_rank_design <- function(formula2test, data.df) {
  m <- model.matrix(as.formula(formula2test), data=data.df)
  
  print("colnames(m)")
  print(colnames(m))
  
  print("Check if colSums or rowSums == 0")
  
  print("## check rows: samples")
  row.res <- apply(m, 1, function(x) all(x==0)) ## check rows: samples
  print(table(row.res))
  print("")
  print(row.res)
  print("")
  print(which(row.res))
  print("which(row.res)")
  
  print("")
  print("## check columns: categories")
  col.res <- apply(m, 2, function(x) all(x==0)) ## check columns: categories
  
  print("table(col.res)")
  print(table(col.res))
  print("")
  print(col.res)
  print("")
  print("which(col.res)")
  print(which(col.res))
}
HCGB-IGTP/HCGB.IGTP.DAnalysis documentation built on April 13, 2025, 12:03 a.m.