R/visualisation_and_clustering_functions.R

# #cran
# install.packages(c("ggplot2", "Matrix", "dbscan", "pheatmap", "fpc",
#"dynamicTreeCut", "factoextra", "digest", "RColorBrewer", "doParallel"))
# #bioconductor
# source("https://bioconductor.org/biocLite.R")
# biocLite(c("BiocParallel", "scran", "scater", "monocle",
#"SingleCellExperiment", "KEGGREST"))

suppressMessages(library(BiocParallel, warn.conflicts = F))
suppressMessages(library(scran, warn.conflicts = F))
suppressMessages(library(ggplot2, warn.conflicts = F))
suppressMessages(library(scater, warn.conflicts = F))
suppressMessages(library(Matrix, warn.conflicts = F))
suppressMessages(library(monocle, warn.conflicts = F))
suppressMessages(library(SingleCellExperiment, warn.conflicts = F))
suppressMessages(library(dbscan, warn.conflicts = F))
suppressMessages(library(pheatmap, warn.conflicts = F))
suppressMessages(library(fpc, warn.conflicts = F))
suppressMessages(library(dynamicTreeCut, warn.conflicts = F))
suppressMessages(library(factoextra, warn.conflicts = F))
suppressMessages(library(digest, warn.conflicts = F))
suppressMessages(library(KEGGREST, warn.conflicts = F))
suppressMessages(library(RColorBrewer, warn.conflicts = F))
suppressMessages(require(doParallel))
suppressMessages(require(matrixStats))
suppressMessages(library(AnnotationDbi, warn.conflicts = F))
suppressMessages(library(dplyr, warn.conflicts = F))
suppressMessages(library(biomaRt, warn.conflicts = F))
suppressMessages(library(org.Mm.eg.db, warn.conflicts = F))
suppressMessages(library(xlsx, warn.conflicts = F))
suppressMessages(library(grDevices, warn.conflicts = F))
suppressMessages(library(S4Vectors, warn.conflicts = F))
suppressMessages(library(Biobase, warn.conflicts = F))
suppressMessages(library(foreach, warn.conflicts = F))

### Internal function.
### this function calculates PCA and then tSNE with PCs and perplexities ###
### it returns a list of pSNE = PCA+tSNE results ###
### to get XY coordinates call psne_res[1,i][[1]] ###
### i = [1:length(PCs)*length(perplexities)] is a number of iteration ###

getTSNEresults <- function(expressionMatrix, cores=1,
                           PCs=c(4, 6, 8, 10, 20, 40, 50),
                           perplexities=c(30, 40), randomSeed=42){
    PCAData <- prcomp(t(expressionMatrix))$x
    myCluster <- parallel::makeCluster(cores, # number of cores to use
                             type = "PSOCK") # type of cluster
    doParallel::registerDoParallel(myCluster)
    tSNECoordinates <- foreach::foreach(PCA=rep(PCs, length(perplexities)),
                               perp=rep(perplexities, each=length(PCs)),
                               .combine='cbind') %dopar% {
                                   library(SingleCellExperiment)
                        scater::plotTSNE(SingleCellExperiment(assays=list(
                            logcounts=t(PCAData[,1:PCA]))),
                        scale_features=FALSE, perplexity=perp,
                        rand_seed=randomSeed, theme_size=13, return_SCESet=FALSE)
                        }
    parallel::stopCluster(myCluster)
    message(paste("Calculated", length(PCs)*length(perplexities),
"2D-tSNE plots."))
    return(tSNECoordinates)
}

# Do not export this function.
.testClustering <- function(sceObject, dataDirectory, experimentName,
                           dbscanEpsilon=1.4,
                           minPts=5,
                           perplexities = c(30), PCs = c(4),
                           randomSeed = 42,
                           width=7, height=7, onefile=FALSE, #pdf
                           family, title, fonts, version,
                           paper, encoding, bg, fg, pointsize,
                           pagecentre, colormodel,
                           useDingbats, useKerning,
                           fillOddEven, compress){

  initialisePath(dataDirectory)
  dir.create(file.path(dataDirectory, "test_clustering"), showWarnings = F)

  message("Generating TSNE.")
  #1. Generating 2D tSNE plots
  tSNE <- getTSNEresults(expr = Biobase::exprs(sceObject), cores=1,
                         perplexities = perplexities,
                         PCs = PCs,
                         randomSeed = randomSeed)

  message("Saving results.")
  #picture checker
  pdf(file.path(dataDirectory, "test_clustering", "test_tSNE.pdf"),
      width=width, height=height, onefile=onefile, # not changed by default
      family=family, title=title, fonts=fonts, version=version,
      paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
      pagecentre=pagecentre, colormodel=colormodel,
      useDingbats=useDingbats, useKerning=useKerning,
      fillOddEven=fillOddEven, compress=compress)
  print(tSNE)
  dev.off()


  #2. Clustering with dbscan
  # choosing for the best epsilon
  pdf(file.path(dataDirectory, "test_clustering", "distance_graph.pdf"),
      width=width, height=height, onefile=onefile, # not changed by default
      family=family, title=title, fonts=fonts, version=version,
      paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
      pagecentre=pagecentre, colormodel=colormodel,
      useDingbats=useDingbats, useKerning=useKerning,
      fillOddEven=fillOddEven, compress=compress)
  plotDistanceGraphWithEpsilon(tSNE$data, epsilon=dbscanEpsilon,
                               minNeighbours = minPts)
  dev.off()

  pdf(file.path(dataDirectory, "test_clustering", "test_clustering.pdf"),
      width=width, height=height, onefile=onefile, # not changed by default
      family=family, title=title, fonts=fonts, version=version,
      paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
      pagecentre=pagecentre, colormodel=colormodel,
      useDingbats=useDingbats, useKerning=useKerning,
      fillOddEven=fillOddEven, compress=compress)
  print(plotTestClustering(tSNE$data, epsilon=dbscanEpsilon,
                           minNeighbours = minPts))
  dev.off()

  message("Pictures of test clustering were exported.")
  return(list(tSNE,
              plotDistanceGraphWithEpsilon(tSNE$data, epsilon=dbscanEpsilon,
                                                 minNeighbours = minPts),
              plotTestClustering(tSNE$data, epsilon=dbscanEpsilon,
                                 minNeighbours = minPts)))

}

#' To check one iteration of clustering before running full workflow CONCLUS.
#' 
#' This function generates a single clustering iteration of CONCLUS to check whether
#' chosen parameters for dbscan are suitable for your data.
#'
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear in filenames (supposed to be the same for one experiment during the workflow).
#' @param dbscanEpsilon a parameter of fpc::dbscan() function.
#' @param minPts a parameter of fpc::dbscan() function.
#' @param PCs a vector of PCs for plotting.
#' @param perplexities vector of perplexities (t-SNE parameter).
#' @param randomSeed random seed for reproducibility.
#' @param width plot width.
#' @param height plot height.
#' @param ... other pdf() arguments.
#'
#' @return t-SNE results, a distance graph plot, a t-SNE plot colored by test clustering solution.
#' @export
testClustering <- function(sceObject, dataDirectory, experimentName,
                           dbscanEpsilon=1.4,
                           minPts=5,
                           perplexities = c(30), PCs = c(4),
                           randomSeed = 42,
                           width=7, height=7, ...){

  .testClustering(sceObject, dataDirectory, experimentName,
                  dbscanEpsilon=dbscanEpsilon,
                  minPts=minPts,
                  perplexities = perplexities, PCs = PCs,
                  randomSeed = randomSeed,
                  width=width, height=height, ...)
}

#' Choose palette for a plot.
#'
#' It is an internal function usually applied for choosing the palette for clusters.
#' Depending if the number of clusters is more than 12 or not, one of two built-in palettes will be applied.
#' If you give your vector of colors, the function will not change them.
#' If the number of clusters is more than 26, it will copy colors to get the needed length of the palette.
#'
#' @param colorPalette Either "default" or a vector of colors, for example c("yellow", "#CC79A7"). 
#' @param clustersNumber number of clusters in the output palette.
#'
#' @return Color palette with the number of colors equal to the clusterNumber parameter.
#' @export
choosePalette <- function(colorPalette, clustersNumber){

  colorPalette26 <- c( "yellow", "darkgoldenrod1", "coral1", "deeppink",
                       "indianred", "coral4", "darkblue", "darkmagenta",
                       "darkcyan", "mediumorchid", "plum2", "gray73",
                       "cadetblue3", "khaki",
                       "darkolivegreen1", "lightgreen", "limegreen",
                       "darkolivegreen4", "green", "#CC79A7", "violetred3",
                       "brown3", "darkslategray1", "gray51", "slateblue2",
                       "blue")

  pickDefaultPalette <- function(clustersNumber, colorPalette26){
    if(clustersNumber < 13) return(c("#A6CEE3", "#1F78B4", "#B2DF8A", "#33A02C",
                                     "#FB9A99", "#E31A1C", "#FDBF6F", "#FF7F00",
                                     "#CAB2D6", "#6A3D9A", "#FFFF99",
                                     "#B15928")[1:clustersNumber])
    return(rep(colorPalette26,
              round(clustersNumber/length(colorPalette26))+1)[1:clustersNumber])
  }

  if(length(colorPalette) == 1){
    if(colorPalette == "default"){
      return(pickDefaultPalette(clustersNumber, colorPalette26))
    }
  }

  if(clustersNumber > length(colorPalette)){
    message("The number of clusters is greater than the number of colors.
Using default CONCLUS palette instead.")
    return(pickDefaultPalette(clustersNumber, colorPalette26))
  }

  return(colorPalette[1:clustersNumber])
}

chooseStatePalette <- function(statesNumber){

  colorPalette <- c("#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072", "#80B1D3",
                      "#FDB462", "#B3DE69", "#FCCDE5", "#D9D9D9", "#BC80BD",
                      "#CCEBC5", "#FFED6F")
  return(rep(colorPalette,
             round(statesNumber/length(colorPalette)) + 1)[1:statesNumber])
}

#' Run CONCLUS in one click
#'
#' This function performs core CONCLUS workflow. It generates PCA and t-SNE coordinates, 
#' runs DBSCAN, calculates similarity matrices of cells and clusters, assigns cells to clusters,
#' searches for positive markers for each cluster. The function saves plots and tables into dataDirectory.
#'
#' @param sceObject a SingleCellExperiment object with your data.
#' @param dataDirectory CONCLUS will create this directory if it doesn't exist and store there all output files.
#' @param experimentName most of output file names of CONCLUS are hardcoded.
#' experimentName will stay at the beginning of each output file name to
#' distinguish different runs easily.
#' @param colorPalette a vector of colors for clusters.
#' @param statePalette a vector of colors for states.
#' @param clusteringMethod a clustering methods passed to hclust() function.
#' @param epsilon a parameter of fpc::dbscan() function.
#' @param minPoints a parameter of fpc::dbscan() function.
#' @param k preferred number of clusters. Alternative to deepSplit. A parameter of cutree() function.
#' @param PCs a vector of first principal components.
#' For example, to take ranges 1:5 and 1:10 write c(5, 10).
#' @param perplexities a vector of perplexity for t-SNE.
#' @param randomSeed random seed for reproducibility.
#' @param deepSplit intuitive level of clustering depth. Options are 1, 2, 3, 4.
#' @param preClustered if TRUE, it will not change the column clusters after the run.
#' However, it will anyway run DBSCAN to calculate similarity matrices.
#' @param orderClusters can be either FALSE (default) of "name".
#' If "name", clusters in the similarity matrix of cells will be ordered by name.
#' @param cores maximum number of jobs that CONCLUS can run in parallel.
#' @param plotPDFcellSim if FALSE, the similarity matrix of cells will be saved in png format.
#' FALSE is recommended for count matrices with more than 2500 cells due to large pdf file size.
#' @param deleteOutliers whether cells which were often defined as outliers by dbscan must be deleted.
#' It will require recalculating of the similarity matrix of cells. Default is FALSE.
#' Usually those cells form a separate "outlier" cluster and can be easier distinguished and deleted later
#' if necessary.
#' @param tSNEalreadyGenerated if you already ran CONCLUS ones and have t-SNE coordinated saved
#' You can set TRUE to run the function faster since it will skip the generation of t-SNE coordinates and use the stored ones. 
#' Option TRUE requires t-SNE coordinates to be located in your 'dataDirectory/tsnes' directory.
#' @param tSNEresExp experimentName of t-SNE coordinates which you want to use.
#' This argument allows copying and pasting t-SNE coordinates between different CONCLUS runs without renaming the files.
#'
#' @keywords CONCLUS
#' @export
#' @return A SingleCellExperiment object.

runCONCLUS <- function(sceObject, dataDirectory, experimentName,
                       colorPalette="default",
                       statePalette="default",
                       clusteringMethod="ward.D2",
                       epsilon=c(1.3, 1.4, 1.5), minPoints=c(3, 4), k=0,
                       PCs=c(4, 6, 8, 10, 20, 40, 50),
                       perplexities=c(30,40),
                       randomSeed = 42,
                       deepSplit=4, preClustered = F,
                       orderClusters = FALSE,
                       cores=14,
                       plotPDFcellSim = TRUE,
                       deleteOutliers = TRUE,
                       tSNEalreadyGenerated = FALSE,
                       tSNEresExp = ""){

  initialisePath(dataDirectory)

  # Generating 2D tSNE plots
  if(!tSNEalreadyGenerated){
      tSNEResults <- generateTSNECoordinates(sceObject, dataDirectory,
                                             experimentName, PCs=PCs,
                                             perplexities=perplexities,
                                             randomSeed = randomSeed)
  }else{
      tSNEResults <- readRDS(file.path(dataDirectory, "output_tables",
                                     paste0(tSNEresExp,"_tSNEResults.rds")))
  }

  if(preClustered){
    # Running dbscan
    message("Running dbscan using ", cores, " cores.")
    dbscanResults <- runDBSCAN(tSNEResults, sceObject, dataDirectory,
                               experimentName, epsilon=epsilon,
                               minPoints=minPoints,
                               cores=cores)

    # assigning cells to clusters
    message("Calculating cells similarity matrix.")
    cellsSimilarityMatrix <- clusterCellsInternal(dbscanResults, sceObject, clusterNumber=k,
                                      deepSplit=deepSplit, cores=cores,
                                      clusteringMethod=clusteringMethod)[[2]]
    sceObjectFiltered <- sceObject
  } else {
    # Running clustering
    clusteringResults <- runClustering(tSNEResults, sceObject, dataDirectory,
                                       experimentName,
                                       epsilon=epsilon, minPoints=minPoints,
                                       k=k, deepSplit=deepSplit,
                                       cores=cores,
                                       clusteringMethod=clusteringMethod,
                                       deleteOutliers = deleteOutliers,
                                       PCs=PCs,
                                       perplexities=perplexities,
                                       randomSeed = randomSeed)
    sceObjectFiltered <- clusteringResults[[1]]
    cellsSimilarityMatrix <- clusteringResults[[2]]
  }

  print(table(SummarizedExperiment::colData(sceObjectFiltered)$clusters,
              dnn=list("Cells distribution by clusters")))

  clustersNumber <- length(unique(SummarizedExperiment::colData(sceObjectFiltered)$clusters))
  colorPalette <- choosePalette(colorPalette, clustersNumber)

  # Plotting cluster stablility and 2D visualisations
  plotCellSimilarity(sceObjectFiltered, cellsSimilarityMatrix, dataDirectory,
                     experimentName, colorPalette,
                     orderClusters = orderClusters,
                     statePalette = statePalette,
                     clusteringMethod = clusteringMethod,
                     plotPDF = plotPDFcellSim)

  plotClusteredTSNE(sceObjectFiltered, dataDirectory, experimentName,
                    PCs=PCs, perplexities=perplexities, colorPalette,
                    columnName = "clusters",
                    tSNEresExp = tSNEresExp)
  plotClusteredTSNE(sceObjectFiltered, dataDirectory, experimentName,
                    PCs=PCs, perplexities=perplexities, colorPalette,
                    columnName = "noColor",
                    tSNEresExp = tSNEresExp)
  if(any(colnames(SummarizedExperiment::colData(sceObjectFiltered)) %in% "state")){
      plotClusteredTSNE(sceObjectFiltered, dataDirectory, experimentName,
                        PCs=PCs, perplexities=perplexities, statePalette,
                        columnName = "state",
                        tSNEresExp = tSNEresExp)
  }

  # Calculating cluster similarity and marker genes
  clustersSimilarityMatrix <-
      calculateClustersSimilarity(cellsSimilarityMatrix,
          sceObject = sceObjectFiltered,
          clusteringMethod = clusteringMethod)[[1]]
  
  # Exporting key matrices
  exportMatrix(cellsSimilarityMatrix, dataDirectory, experimentName,
               "cellsSimilarityMatrix")
  exportMatrix(clustersSimilarityMatrix, dataDirectory, experimentName,
               "clustersSimilarityMatrix")

  plotClustersSimilarity(clustersSimilarityMatrix,
                         sceObject = sceObjectFiltered,
                         dataDirectory = dataDirectory,
                         experimentName = experimentName,
                         colorPalette = colorPalette,
                         statePalette = statePalette,
                         clusteringMethod = clusteringMethod)

  rankGenes(sceObjectFiltered, clustersSimilarityMatrix, dataDirectory,
            experimentName)

  return(sceObjectFiltered)
}

#' Export matrix to a file.
#' 
#' The function allows you to export a matrix to a .csv file with a hard-coded filename (according to experimentName) 
#' in the "dataDirectory/output_tables" directory for further analysis.
#'
#' @param matrix your matrix (e.g., expression matrix)
#' @param dataDirectory CONCLUS output directory for a given experiment (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear at the beginning of the filenames 
#' (supposed to be the same for one experiment during the workflow).
#' @param name name of the file. Will be placed after the experimentName header.
#' @export
exportMatrix <- function(matrix, dataDirectory, experimentName, name){

  fileName <- paste0(experimentName, "_", name, ".csv")
  write.table(matrix, file=file.path(dataDirectory, "output_tables", fileName),
              sep = ",")
}

#' Create all needed directories for CONCLUS output.
#'
#' @param dataDirectory output directory for a given CONCLUS run (supposed to be the same for one experiment during the workflow).
#' @export
initialisePath <- function(dataDirectory){
  # creates directories for further writing of results.
  # names of directories are hardcoded.
  # no idea if it is good or bad.

  graphsDirectory <- "pictures"
  markerGenesDirectory <- "marker_genes"
  tSNEDirectory <- "tsnes"
  outputDataDirectory <- "output_tables"
  tSNEPicturesDirectory <- "tSNE_pictures"


  dir.create(dataDirectory, showWarnings=F)
  dir.create(file.path(dataDirectory, graphsDirectory), showWarnings=F)
  dir.create(file.path(dataDirectory, graphsDirectory, tSNEPicturesDirectory),
             showWarnings=F)
  dir.create(file.path(dataDirectory, markerGenesDirectory), showWarnings=F)
  dir.create(file.path(dataDirectory, tSNEDirectory), showWarnings=F)
  dir.create(file.path(dataDirectory, outputDataDirectory), showWarnings=F)

}

#' Generate and save t-SNE coordinates with selected parameters.
#'
#' The function generates several t-SNE coordinates based on given perplexity and ranges of PCs. 
#' Final number of t-SNE plots is length(PCs)*length(perplexities)
#' It writes coordinates in "dataDirectory/tsnes" subfolder.
#'
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory for CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear in filenames (supposed to be the same for one experiment during the workflow).
#' @param randomSeed random seed for reproducibility.
#' @param cores maximum number of jobs that CONCLUS can run in parallel.
#' @param PCs a vector of first principal components.
#' For example, to take ranges 1:5 and 1:10 write c(5, 10).
#' @param perplexities a vector of perplexity (t-SNE parameter).
#'
#' @return An object with t-SNE results (coordinates for each plot).
#' @export
generateTSNECoordinates <- function(sceObject, dataDirectory, experimentName,
                                    randomSeed=42, cores=14,
                                    PCs=c(4, 6, 8, 10, 20, 40, 50),
                                    perplexities=c(30,40)){

  tSNEDirectory <- "tsnes"
  message(paste0("Running TSNEs using ", cores, " cores."))
  TSNEres <- getTSNEresults(Biobase::exprs(sceObject), cores=cores, PCs=PCs,
                            perplexities=perplexities, randomSeed=randomSeed)

  PCA <- rep(PCs, length(perplexities))
  perp <- rep(perplexities, each=length(PCs))

  outputDir <- file.path(dataDirectory, tSNEDirectory)
  filesList <- list.files(outputDir, pattern = "_tsne_coordinates_")
  deletedFiles <- sapply(filesList, function(fileName) deleteFile(fileName,
                                                                  outputDir))
  for (i in 1:(length(PCs)*length(perplexities))){
    write.table(TSNEres[1, i][[1]],
                file=file.path(dataDirectory, tSNEDirectory,
                    paste0(experimentName,'_tsne_coordinates_', i, "_" ,
                           PCA[i], "PCs_", perp[i], "perp.tsv")),
                quote=FALSE, sep='\t')
  }
  saveRDS(TSNEres, file = file.path(dataDirectory, "output_tables",
                             paste0(experimentName,"_tSNEResults.rds")))
  rm(tSNEDirectory, PCA, perp)
  return(TSNEres)
}

checkTSNEPicture <- function(tSNEResults, sceObject){
  # gives the unclustered picture of tSNE
  # to be sure that normalization step was
  # successful

  tSNECoords <- tSNEResults[[1]]
  tSNECoords <- tSNECoords[rownames(tSNECoords) %in%
                               SummarizedExperiment::colData(sceObject)$cellName, ]

  ggplot2::ggplot(tSNECoords, aes_string(x=names(tSNECoords)[1],
                                y=names(tSNECoords)[2])) +
    geom_point(size=I(1))
}

plotDistanceGraph <- function(tSNEResults, minNeighbours=5, tSNEIndex=3){
  # plots kNN distance graph
  # for choosing right values
  # of epsilon for further DBSCAN analysis


  tSNECoords <- tSNEResults[1,tSNEIndex][[1]]
  dbscan::kNNdistplot(tSNECoords, k=minNeighbours)
}

plotDistanceGraphWithEpsilon <- function(tSNEData, minNeighbours=5,
                                         epsilon=1.2){
  # similar function as plotDistanceGraph,
  # but with already known epsilon value
  #

  dbscan::kNNdistplot(tSNEData, k=minNeighbours)
  abline(h=epsilon, lty=2)

}

plotTestClustering <- function(tSNEData, minNeighbours=5,
                               epsilon=1.2){
  # plots test DBSCAN on one of the pictures
  # for being ensured that clustering will
  # probably work successful

  dbscanResults <- fpc::dbscan(tSNEData, eps=epsilon, MinPts=minNeighbours)
  factoextra::fviz_cluster(dbscanResults, tSNEData, ellipse=TRUE, geom="point",
               legend="bottom")
}

### This function calculates dbscan for all t-SNE from TSNEtables with all
### combinations of paramenters from epsilon and minPoints
### it does not set random seed. It allows to vary this parameter automatically
### it returns a matrix where columns are iterations
### number of iterations is equal to
### ncol is (TSNEtables)*length(epsilon)*length(epsilon)

mkDbscan <- function(TSNEtables, cores = 14, epsilon = c(1.2, 1.5, 1.8),
                    minPoints = c(15, 20)){
    myCluster <- parallel::makeCluster(cores, # number of cores to use
                             type = "PSOCK") # type of cluster
    doParallel::registerDoParallel(myCluster)
    dbscanResults <- foreach::foreach(i=rep(rep(1:ncol(TSNEtables),
                                   each=length(minPoints)),
                                   length(epsilon)),
                             eps=rep(epsilon,
                                 each=ncol(TSNEtables)*length(minPoints)),
                             MinPts=rep(minPoints,
                                    ncol(TSNEtables)*length(epsilon)),
                         .combine='cbind') %dopar% {
                             fpc::dbscan(TSNEtables[1,i][[1]], eps=eps,
                                         MinPts=MinPts)$cluster
                         }
    parallel::stopCluster(myCluster)
    return(dbscanResults)
}

#' Run clustering iterations with selected parameters using DBSCAN.
#'
#' This function returns a matrix of clustering iterations of DBSCAN.
#'
#' @param tSNEResults results of conclus::generateTSNECoordinates() function.
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory for CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear in filenames 
#' (supposed to be the same for one experiment during the workflow).
#' @param cores maximum number of jobs that CONCLUS can run in parallel.
#' @param epsilon a fpc::dbscan() parameter.
#' @param minPoints a fpc::dbscan() parameter.
#'
#' @return A matrix of DBSCAN results.
#' @export
runDBSCAN <- function(tSNEResults, sceObject, dataDirectory, experimentName,
                      cores=14, epsilon=c(1.3, 1.4, 1.5), minPoints=c(3, 4)){

  outputDataDirectory <- "output_tables"
  # taking only cells from the sceObject
  for(i in 1:ncol(tSNEResults)){
      tmp <- tSNEResults[1,i][[1]]
      tSNEResults[1,i][[1]] <- tmp[colnames(sceObject),]
  }
  dbscanResults <- mkDbscan(tSNEResults, cores = cores, epsilon = epsilon,
                             minPoints = minPoints)
  dbscanResults <- t(dbscanResults)
  colnames(dbscanResults) <- SummarizedExperiment::colData(sceObject)$cellName
  write.table(dbscanResults, file.path(dataDirectory, outputDataDirectory,
              paste0(experimentName, "_dbscan_results.tsv")), quote = FALSE,
              sep = "\t")
  rm(tmp)
  return(dbscanResults)
}

### This function calculates how many time a cell were not assigned to
### any clusters by dbscan. ###
### it returns a data frame ###
mkOutlierScoreDf <- function(mat){

    outlierScoreDf <- as.data.frame(colnames(mat))
    colnames(outlierScoreDf) <- "cellName"
    outlierScoreDf <- dplyr::mutate(outlierScoreDf, outlierScore=NA)

    for(i in 1:ncol(mat)){
        vec <- mat[,i]
        outlierScoreDf$outlierScore[i] <- length(vec[vec == 0])
    }

    outlierScoreDf$outlierScorePer <- outlierScoreDf$outlierScore / nrow(mat)
    return(outlierScoreDf)
}

excludeOutliers <- function(dbscanMatrix, sceObject, threshold=0.3){
  # exclude outliers based on DBSCAN clustering
  # outliers are the cells which cannot be assigned
  # to any final cluster

  outlierInfo <- mkOutlierScoreDf(dbscanMatrix)
  colData <- SummarizedExperiment::colData(sceObject)[SummarizedExperiment::colData(sceObject)$cellName
                                %in% colnames(dbscanMatrix),]

  if(is.vector(colData)){
    colData <- S4Vectors::DataFrame(cellName=colData, row.names=colData)
  }

  # Case if coldata has the outliers scores already
  colData$outlierScorePer <- NULL
  colData$outlierScore <- NULL

  colData <- merge(colData, outlierInfo,
                 by.x="cellName", by.y="cellName",
                 all.x=TRUE, all.y=TRUE, sort=FALSE)
  rownames(colData) <- colData$cellName

  numberOfCellsBefore <- dim(colData)[1]
  sceObject <- sceObject[,colData$outlierScorePer < threshold]
  colData <- colData[colData$outlierScorePer < threshold,]
  numberOfCellsAfter <- dim(colData)[1]

  dbscanMatrix <- dbscanMatrix[,outlierInfo$outlierScorePer < threshold]
  SummarizedExperiment::colData(sceObject) <- colData

  message(paste(numberOfCellsBefore - numberOfCellsAfter,
              "outliers were excluded from the SingleCellExperiment object.\n"))

  return(list(sceObject, dbscanMatrix))
}

mkSimMat <- function(mat, cores=14){

    myCluster <- parallel::makeCluster(cores, # number of cores to use
                             type = "PSOCK") # type of cluster
    doParallel::registerDoParallel(myCluster)

    simMats <- foreach::foreach(i=1:nrow(mat)) %dopar% {
        simMat <- matrix(0, ncol=ncol(mat), nrow=ncol(mat))
        colnames(simMat) <- colnames(mat)
        rownames(simMat) <- colnames(mat)

        vec <- unique(mat[i,])
        for(j in vec[vec!=0]){
            selCol <- colnames(mat)[mat[i,] == j]
            simMat[rownames(simMat) %in% selCol,
                    colnames(simMat) %in% selCol] <-
                simMat[rownames(simMat) %in% selCol,
                        colnames(simMat) %in% selCol] + 1
        }
        rm(cl, selCol, i, j)
        return(simMat)
    }
    parallel::stopCluster(myCluster)

    simMat <- matrix(0, ncol=ncol(mat), nrow=ncol(mat))
    colnames(simMat) <- colnames(mat)
    rownames(simMat) <- colnames(mat)

    for(i in 1:nrow(mat)){
        simMat <- simMat + simMats[[i]]
    }

    rm(simMats)
    simMat <- simMat / nrow(mat)
    stopifnot(isSymmetric(simMat))

    return(simMat)
}

#' Cluster cells and get similarity matrix of cells.
#' 
#' The function returns consensus clusters by using hierarchical clustering on the similarity matrix of cells.
#' It provides two options: to specify an exact number of clusters (with clusterNumber parameter)
#' or to select the depth of splitting (deepSplit parameter).
#' 
#' @param dbscanMatrix an output matrix of conclus::runDBSCAN() function.
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param clusterNumber a parameter, specifying the exact number of cluster.
#' @param deepSplit a parameter, specifying how deep we will split the clustering tree. It takes integers from 1 to 4.
#' @param cores maximum number of jobs that CONCLUS can run in parallel.
#' @param clusteringMethod a clustering methods passed to hclust() function.
#'
#' @return A SingleCellExperiment object with modified/created "clusters" column in the colData, and cells similarity matrix.
#' @export
clusterCellsInternal <- function(dbscanMatrix, sceObject, clusterNumber=0,
                         deepSplit, cores=14,
                         clusteringMethod = "ward.D2") {
  # 

  cellsSimilarityMatrix <- mkSimMat(dbscanMatrix, cores=cores)

  distanceMatrix <- as.dist(sqrt((1-cellsSimilarityMatrix)/2))
  clusteringTree <- hclust(distanceMatrix, method=clusteringMethod)

  if(clusterNumber == 0){
    message(paste0("Assigning cells to clusters. DeepSplit = ", deepSplit))
    clusters <- unname(cutreeDynamic(clusteringTree,
                                     distM=as.matrix(distanceMatrix),
                                     verbose=0, deepSplit = deepSplit))
  } else {
    message(paste0("Assigning cells to ", clusterNumber, " clusters."))
    clusters <- cutree(clusteringTree, k=clusterNumber)
  }

  SummarizedExperiment::colData(sceObject)$clusters <- factor(clusters)

  return(list(sceObject, cellsSimilarityMatrix))
}

# Do not export this function.
.plotCellSimilarity <- function(sceObject, cellsSimilarityMatrix, dataDirectory,
                               experimentName, colorPalette="default",
                               statePalette="default", clusteringMethod="ward.D2",
                               orderClusters = FALSE,
                               plotPDF = TRUE,
                               returnPlot = FALSE,
                               width=7, height=6, onefile=FALSE, #pdf
                               family, title, fonts, version,
                               paper, encoding, bg, fg, pointsize,
                               pagecentre, colormodel,
                               useDingbats, useKerning,
                               fillOddEven, compress,
                               color = colorRampPalette(rev( #pheatmap
                                   RColorBrewer::brewer.pal(n = 7,
                                              name = "RdYlBu")))(100),
                               kmeans_k = NA, breaks = NA,
                               border_color = "grey60",
                               cellwidth = NA, cellheight = NA,
                               scale = "none",
                               cluster_rows = FALSE, #not default
                               cluster_cols = FALSE, #not default
                               clustering_distance_rows = "euclidean",
                               clustering_distance_cols = "euclidean",
                               clustering_method = "complete",
                               clustering_callback = identity2,
                               cutree_rows = NA, cutree_cols = NA,
                               treeheight_row = ifelse((
                                   class(cluster_rows) == "hclust") ||
                                       cluster_rows, 50, 0),
                               treeheight_col = ifelse((
                                   class(cluster_cols) == "hclust") ||
                                       cluster_cols, 50, 0),
                               legend = TRUE,
                               legend_breaks = NA,
                               legend_labels = NA,
                               annotation_row = NA,
                               annotation_col = NA,
                               annotation = NA,
                               annotation_colors = NA,
                               annotation_legend = TRUE,
                               annotation_names_row = TRUE,
                               annotation_names_col = TRUE,
                               drop_levels = TRUE,
                               show_rownames = FALSE, #not default
                               show_colnames = FALSE, #not default
                               fontsize = 7.5, #not default (10)
                               fontsize_row = 0.03,
                               fontsize_col = fontsize,
                               display_numbers = F,
                               number_format = "%.2f",
                               number_color = "grey30",
                               fontsize_number = 0.8 * fontsize,
                               gaps_row = NULL, gaps_col = NULL,
                               labels_row = NULL, labels_col = NULL,
                               filename = NA,
                               widthHeatmap = NA, heightHeatmap = NA, #edited
                               silent = FALSE,
                               main = paste0("Cells similarity matrix ",
                                             ncol(cellsSimilarityMatrix),
                                             " columns, ",
                                             nrow(cellsSimilarityMatrix),
                                             " rows."),
                               widthPNG = 800, heightPNG = 750 #png
                               ){
  # plots cells correlation matrix gained form
  # clusterCellsInternal() function as the result of DBSCAN

  graphsDirectory <- "pictures"
  colData <- SummarizedExperiment::colData(sceObject)
  clustersNumber <- length(unique(colData$clusters))

  if(orderClusters == "name"){
      # Ordering expressionMatrixrix
      newOrder <- unname(unlist(sapply(levels(colData$clusters),
                                   function(cluster)
                                       orderCellsInCluster(cluster,
                                           colData,
                                           expressionMatrix,
                                           clusteringMethod=clusteringMethod))))

      cellsSimilarityMatrix <- cellsSimilarityMatrix[newOrder, newOrder]
      cluster_cols <- FALSE
      cluster_rows <- FALSE
  }else if(orderClusters == FALSE){
      distanceMatrix <- as.dist(sqrt((1-cellsSimilarityMatrix)/2))
      clusteringTree <- hclust(distanceMatrix, method=clusteringMethod)
      cluster_cols <- clusteringTree
      cluster_rows <- clusteringTree
  }else{
      message("Unknown option of orderClusters. Options are 'FALSE' or 'name'.
Using the default version 'FALSE'.")
      distanceMatrix <- as.dist(sqrt((1-cellsSimilarityMatrix)/2))
      clusteringTree <- hclust(distanceMatrix, method=clusteringMethod)
      cluster_cols <- clusteringTree
      cluster_rows <- clusteringTree
  }

  annotationColors <- generateAnnotationColors(colData, colorPalette,
                                               statePalette)
  columnsToPlot <- switch(is.null(colData$state) + 1, c("clusters", "state"),
                          c("clusters"))
  
  pheatmapObject <- pheatmap::pheatmap(cellsSimilarityMatrix,
                           show_colnames=show_colnames,
                           show_rownames=show_rownames,
                           annotation_col=as.data.frame(colData[columnsToPlot]),
                           annotation_colors=annotationColors,
                           fontsize_row=fontsize_row,
                           cluster_cols=cluster_cols,
                           cluster_rows=cluster_rows,
                           fontsize=fontsize,
                           main = main,
                           color = color, kmeans_k = kmeans_k, # not changed by default
                           breaks = breaks,
                           border_color = border_color,
                           cellwidth = cellwidth, cellheight = cellheight, scale = scale,
                           clustering_distance_rows = clustering_distance_rows,
                           clustering_distance_cols = clustering_distance_cols,
                           clustering_method = clustering_method,
                           clustering_callback = clustering_callback,
                           treeheight_row = treeheight_row,
                           treeheight_col = treeheight_col, legend = legend,
                           legend_breaks = legend_breaks,
                           legend_labels = legend_labels, annotation_row = annotation_row,
                           annotation = annotation, annotation_legend = annotation_legend,
                           annotation_names_row = annotation_names_row,
                           annotation_names_col = annotation_names_col,
                           drop_levels = drop_levels,
                           fontsize_col = fontsize_col,
                           display_numbers = display_numbers,
                           number_format = number_format,
                           number_color = number_color,
                           fontsize_number = fontsize_numbere, gaps_row = gaps_row,
                           gaps_col = gaps_col, labels_row = labels_row,
                           labels_col = labels_col, filename = filename, width = widthHeatmap,
                           height = heightHeatmap, silent = silent)

  if(plotPDF){
      pdf(file=file.path(dataDirectory, graphsDirectory, paste(experimentName,
                    "cells_correlation", clustersNumber,"clusters.pdf", sep="_")),
          width=width, height=height, onefile=onefile, # not changed by default
          family=family, title=title, fonts=fonts, version=version,
          paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
          pagecentre=pagecentre, colormodel=colormodel,
          useDingbats=useDingbats, useKerning=useKerning,
          fillOddEven=fillOddEven, compress=compress)
  }else{
      message("Plot type is not pdf. Saving in png.")
      png(filename=file.path(dataDirectory, graphsDirectory,
                             paste(experimentName,
                               "cells_correlation", clustersNumber,
                               "clusters.png", sep="_")),
          width = widthPNG, height = heightPNG, type = "cairo")
      #message("Plot type is not pdf. Saving in svg.")
      #svg(file.path(dataDirectory, graphsDirectory,
      #              paste(experimentName,
      #                    "cells_correlation", clustersNumber,
      #                    "clusters.svg", sep="_")),width=8,height=8)
  }
  grid::grid.newpage()
  grid::grid.draw(pheatmapObject$gtable)
  dev.off()

  if(returnPlot){
      return(pheatmapObject)
  }
}

#' Save a cells similarity matrix.
#' 
#' This function plots similarity matrix as a heatmap, so one can see similarity between parts of different clusters.
#'
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory for CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear in filenames (supposed to be the same for one experiment during the workflow).
#' @param cellsSimilarityMatrix an output matrix from the conclus::clusterCellsInternal() function.
#' @param colorPalette "default" or a vector of colors for the column "clusters" in the colData, for example c("yellow", "#CC79A7"). 
#' @param statePalette "default" or a vector of colors for the column "state" in the colData, for example c("yellow", "#CC79A7"). 
#' @param clusteringMethod a clustering methods passed to hclust() function.
#' @param orderClusters boolean, order clusters or not.
#' @param plotPDF if TRUE export to pdf, if FALSE export to png. 
#' FALSE is recommended for datasets with more than 2500 cells due to large pdf file size.
#' @param returnPlot boolean, return plot or not. Default if FALSE.
#' @param width plot width.
#' @param height plot height.
#' @param ... other parameters of pdf(), pheatmap() and png() functions.
#'
#' @return A ggplot object or nothing (depends on the returnPlot parameter).
#' It saves the pdf in "dataDirectory/pictures" folder.
#' @export
plotCellSimilarity <- function(sceObject, cellsSimilarityMatrix, dataDirectory,
                               experimentName, colorPalette="default",
                               statePalette="default", clusteringMethod="ward.D2",
                               orderClusters = FALSE,
                               plotPDF = TRUE,
                               returnPlot = FALSE,
                               width=7, height=6, ...){

  .plotCellSimilarity(sceObject, cellsSimilarityMatrix, dataDirectory,
                      experimentName, colorPalette=colorPalette,
                               statePalette=statePalette, clusteringMethod=clusteringMethod,
                               orderClusters = orderClusters,
                               plotPDF = plotPDF,
                               returnPlot = returnPlot,
                               width=width, height=height, ...)
}

# Do not export this function.
.plotClusteredTSNE <- function(sceObject, dataDirectory, experimentName,
                              tSNEresExp = "",
                              colorPalette = "default",
                              PCs=c(4, 6, 8, 10, 20, 40, 50),
                              perplexities=c(30, 40),
                              columnName="clusters",
                              returnPlot = FALSE,
                              width=6, height=5, onefile=FALSE, #pdf
                              family, title, fonts, version,
                              paper, encoding, bg, fg, pointsize,
                              pagecentre, colormodel,
                              useDingbats, useKerning,
                              fillOddEven, compress){
  # plots picture based on t-SNE coordinates from
  # generateTSNECoordinates() and clustering results
  # from clusterCellsInternal() or runClustering()

  tSNEDirectory <- "tsnes"
  graphsDirectory <- "pictures"
  graphsTSNEDirectory <- "tSNE_pictures"

  if(tSNEresExp == ""){
      tSNEresExp <- experimentName
  }

  ### Plot all precalculated pSNEs to show your clusters ###

  if(columnName == "noColor"){
      numberElements <- NULL
  }else{
      numberElements <- length(unique(SummarizedExperiment::colData(sceObject)[,columnName]))
      colorPalette <- choosePalette(colorPalette, numberElements)
  }

  outputDir <- file.path(dataDirectory, graphsDirectory, graphsTSNEDirectory,
                        paste("tSNE", numberElements, columnName, sep="_"))
  dir.create(outputDir, showWarnings = F)

  filesList <- list.files(outputDir, pattern = "_tsne_coordinates_")
  deletedFiles <- sapply(filesList, function(fileName) deleteFile(fileName, 
                                                                  outputDir))

  PCA <- rep(PCs, length(perplexities))
  perp <- rep(perplexities, each=length(PCs))

  tSNEplots <- rep(list(NA),(length(PCs)*length(perplexities)))

  for(i in 1:(length(PCs)*length(perplexities))){

    coordinatesName <- paste0(tSNEresExp, '_tsne_coordinates_', i, "_",
                              PCA[i], "PCs_", perp[i], "perp")

    #fns <- list.files(file.path(dataDirectory, tSNEDirectory), full.names = TRUE)

    TSNEres <- read.delim(file.path(dataDirectory, tSNEDirectory,
                                    paste0(coordinatesName, ".tsv")),
                          stringsAsFactors = FALSE)
    #TSNEres <- read.delim(fns[grepl(paste0(PCA[i], "PCs_", perp[i], "perp.tsv"), fns)],
    #                            stringsAsFactors = FALSE)

    TSNEres <- TSNEres[rownames(TSNEres) %in% SummarizedExperiment::colData(sceObject)$cellName, ]

    if(columnName != "noColor"){
        TSNEres[columnName] <- factor(SummarizedExperiment::colData(sceObject)[,columnName])
    }

    pdf(file.path(dataDirectory, graphsDirectory, graphsTSNEDirectory,
                  paste("tSNE", numberElements, columnName, sep="_"),
                  paste0(coordinatesName, ".pdf")),
        width=width, height=height, onefile=onefile, # not changed by default
        family=family, title=title, fonts=fonts, version=version,
        paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
        pagecentre=pagecentre, colormodel=colormodel,
        useDingbats=useDingbats, useKerning=useKerning,
        fillOddEven=fillOddEven, compress=compress)
    if(columnName == "noColor"){
        tmp <- ggplot2::ggplot(TSNEres, aes_string(x=names(TSNEres)[1],
                                          y=names(TSNEres)[2])) +
            geom_point(size=I(1)) + theme_bw()
    }else{
        tmp <- ggplot2::ggplot(TSNEres, aes_string(x=names(TSNEres)[1],
                                          y=names(TSNEres)[2],
                                          color=columnName)) +
            geom_point(size=I(1)) +
            scale_color_manual(values=colorPalette) + theme_bw()
    }
    print(tmp)
    dev.off()
    tSNEplots[[i]] <- tmp
  }
  if(returnPlot){
      return(tSNEplots)
  }
  rm(PCA, perp)
}

#' Plot t-SNE. Addtionally, it can highlight clusters or states.
#'
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory for CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear in filenames (supposed to be the same for one experiment during the workflow).
#' @param tSNEresExp if t-SNE coordinates were generated in a different CONCLUS run, you can use them without renaming the files.
#' Please copy tsnes folder from the source run to the current one and write that experimentName in the tSNEresExp argument.
#' @param colorPalette "default" or a vector of colors for the column "clusters" in the colData, for example c("yellow", "#CC79A7").
#' @param PCs vector of PCs (will be specified in filenames).
#' @param perplexities vector of perplexities (will be specified in filenames).
#' @param columnName name of the column to plot on t-SNE dimensions.
#' @param returnPlot boolean, return plot or not.
#' @param width plot width.
#' @param height plot height.
#' @param ... other arguments of the pdf() function.
#'
#' @return A ggplot object or nothing (depends on the returnPlot parameter).
#' @export
plotClusteredTSNE <- function(sceObject, dataDirectory, experimentName,
                              tSNEresExp = "",
                              colorPalette = "default",
                              PCs=c(4, 6, 8, 10, 20, 40, 50),
                              perplexities=c(30, 40),
                              columnName="clusters",
                              returnPlot = FALSE,
                              width=6, height=5, ...){

  .plotClusteredTSNE(sceObject, dataDirectory, experimentName,
                     tSNEresExp = tSNEresExp,
                     colorPalette = colorPalette,
                     PCs=PCs,
                     perplexities=perplexities,
                     columnName=columnName,
                     returnPlot = returnPlot,
                     width=width, height=height, ...)
}

### This function returns a matrix with "protocells" representing clusters ###
### values show how much two "protocells" are similar ###
### 1 if clusters are very similar, 0 if very different ###

mkSimMed <- function(simMat, clusters){

    clusMed <- matrix(ncol=length(unique(clusters)), nrow=nrow(simMat))
    clusterNames <- levels(clusters)

    for(i in 1:ncol(clusMed)){
        clusMed[,i] <- matrixStats::rowMedians(simMat[,clusters == clusterNames[i]])
    }

    clusMed <- t(clusMed)

    simMed <- matrix(ncol=length(unique(clusters)),
                     nrow=length(unique(clusters)))

    for(i in 1:ncol(simMed)){
        simMed[,i] <- matrixStats::rowMedians(clusMed[,clusters == clusterNames[i]])
    }

    # colnames(simMed) = 1:length(unique(clusters))
    # rownames(simMed) = 1:length(unique(clusters))

    colnames(simMed) <- clusterNames
    rownames(simMed) <- clusterNames

    return(simMed)
}

#' Having cells similarity, calculate clusters similarity.
#'
#' @param cellsSimilarityMatrix a similarity matrix, one of the results of conclus::clusterCellsInternal() function.
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param clusteringMethod a clustering methods passed to hclust() function.
#'
#' @return A list contating the cluster similarity matrix and cluster names (order).
#' @export
calculateClustersSimilarity <- function(cellsSimilarityMatrix, sceObject,
                                        clusteringMethod){

    # Calculating cluster similarity for plotting picture
    # and ranking genes result is the square matrix with
    # dimension equal to number of cluster. Numbers in matrix
    # are similarity between cluster.

    clusters <- SummarizedExperiment::colData(sceObject)$clusters
    clustersNumber <- length(unique(clusters))
    clustersNames <- levels(clusters)

    # Calculating matrix
    clustersSimilarityMatrix <-
      mkSimMed(simMat=as.matrix(cellsSimilarityMatrix), clusters=clusters)

    # Plotting matrix
    distanceMatrix <- as.dist(sqrt((1-clustersSimilarityMatrix)/2))
    clusteringTree <- hclust(distanceMatrix, method=clusteringMethod)

    clustersSimOrdered <- data.frame(clusterNames = clustersNames,
                                     clusterIndexes = 1:clustersNumber)
    rownames(clustersSimOrdered) <- clustersSimOrdered$clusterIndexes
    clustersSimOrdered <- clustersSimOrdered[clusteringTree$order,]

    return(list(clustersSimilarityMatrix, clustersSimOrdered$clusterNames))

}

# Don't export this function
.plotClustersSimilarity <- function(clustersSimilarityMatrix, sceObject,
                                   dataDirectory,
                                   experimentName, colorPalette,
                                   statePalette,
                                   clusteringMethod,
                                   returnPlot = FALSE,
                                   width=7, height=5.5, onefile=FALSE, #pdf
                                   family, title, fonts, version,
                                   paper, encoding, bg, fg, pointsize,
                                   pagecentre, colormodel,
                                   useDingbats, useKerning,
                                   fillOddEven, compress,
                                   color = colorRampPalette(rev( #pheatmap
                                       RColorBrewer::brewer.pal(n = 7,
                                                  name = "RdYlBu")))(100),
                                   kmeans_k = NA, breaks = NA,
                                   border_color = "grey60",
                                   cellwidth = NA, cellheight = NA,
                                   scale = "none", cluster_rows = TRUE,
                                   cluster_cols = TRUE,
                                   clustering_distance_rows = "euclidean",
                                   clustering_distance_cols = "euclidean",
                                   clustering_method = "complete",
                                   clustering_callback = identity2,
                                   cutree_rows = NA, cutree_cols = NA,
                                   treeheight_row = ifelse((
                                       class(cluster_rows) == "hclust") ||
                                           cluster_rows, 50, 0),
                                   treeheight_col = ifelse((
                                       class(cluster_cols) == "hclust") ||
                                           cluster_cols, 50, 0),
                                   legend = TRUE,
                                   legend_breaks = NA,
                                   legend_labels = NA,
                                   annotation_row = NA,
                                   annotation_col = NA,
                                   annotation = NA,
                                   annotation_colors = NA,
                                   annotation_legend = TRUE,
                                   annotation_names_row = TRUE,
                                   annotation_names_col = TRUE,
                                   drop_levels = TRUE,
                                   show_rownames = TRUE,
                                   show_colnames = TRUE,
                                   fontsize = 7.5, #not default (10)
                                   fontsize_row = fontsize,
                                   fontsize_col = fontsize,
                                   display_numbers = F,
                                   number_format = "%.2f",
                                   number_color = "grey30",
                                   fontsize_number = 0.8 * fontsize,
                                   gaps_row = NULL, gaps_col = NULL,
                                   labels_row = NULL, labels_col = NULL,
                                   filename = NA, widthHeatmap = NA,
                                   heightHeatmap = NA, silent = FALSE,
                                   main =
                                       paste0("Clusters similarity matrix ",
                                              ncol(clustersSimilarityMatrix),
                                              " columns, ", nrow(clustersSimilarityMatrix),
                                              " rows.")){

    clusters <- SummarizedExperiment::colData(sceObject)$clusters
    clustersNumber <- length(unique(clusters))
    clustersNames <- levels(clusters)
    dataDirectory <- dataDirectory
    experimentName <- experimentName
    graphsDirectory <- "pictures"

    # Plotting matrix
    distanceMatrix <- as.dist(sqrt((1-clustersSimilarityMatrix)/2))
    clusteringTree <- hclust(distanceMatrix, method=clusteringMethod)

    colDataSimilarity <- data.frame(clusters = clustersNames)
    rownames(colDataSimilarity) <- colDataSimilarity$clusters

    annotationColors <- generateAnnotationColors(SummarizedExperiment::colData(sceObject),
                                                 colorPalette,
                                                 statePalette)
    
    pheatmapObject <- pheatmap::pheatmap(clustersSimilarityMatrix,
                       show_colnames=show_colnames,
                       show_rownames=show_rownames,
                       annotation_col=colDataSimilarity,
                       annotation_colors=annotationColors,
                       cluster_cols=clusteringTree,
                       cluster_rows=clusteringTree,
                       fontsize=fontsize,
                       main = main,
                       color = color, kmeans_k = kmeans_k, # not changed by default
                       breaks = breaks,
                       border_color = border_color,
                       cellwidth = cellwidth, cellheight = cellheight, scale = scale,
                       clustering_distance_rows = clustering_distance_rows,
                       clustering_distance_cols = clustering_distance_cols,
                       clustering_method = clustering_method,
                       clustering_callback = clustering_callback,
                       treeheight_row = treeheight_row,
                       treeheight_col = treeheight_col, legend = legend,
                       legend_breaks = legend_breaks,
                       legend_labels = legend_labels, annotation_row = annotation_row,
                       annotation = annotation, annotation_legend = annotation_legend,
                       annotation_names_row = annotation_names_row,
                       annotation_names_col = annotation_names_col,
                       drop_levels = drop_levels,
                       fontsize_row = fontsize_row, fontsize_col = fontsize_col,
                       display_numbers = display_numbers,
                       number_format = number_format,
                       number_color = number_color,
                       fontsize_number = fontsize_numbere, gaps_row = gaps_row,
                       gaps_col = gaps_col, labels_row = labels_row,
                       labels_col = labels_col, filename = filename, width = widthHeatmap,
                       height = heightHeatmap, silent = silent)
    
    message("\nSaving a heatmap with the clusters similarity matrix.")
    pdf(file.path(dataDirectory, graphsDirectory,
                  paste(experimentName,"clusters_similarity", clustersNumber,
                        "clusters.pdf", sep="_")),
        width=width, height=height, onefile=onefile, # not changed by default
        family=family, title=title, fonts=fonts, version=version,
        paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
        pagecentre=pagecentre, colormodel=colormodel,
        useDingbats=useDingbats, useKerning=useKerning,
        fillOddEven=fillOddEven, compress=compress)
    
        grid::grid.newpage()
        grid::grid.draw(pheatmapObject$gtable)
    dev.off()

    if(returnPlot){
        return(pheatmapObject)
    }
}

#' Save a similarity cluster matrix.
#'
#' @param clustersSimilarityMatrix a matrix, result of conclus::calculateClustersSimilarity() function.
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory for CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear in filenames (supposed to be the same for one experiment during the workflow).
#' @param colorPalette "default" or a vector of colors for the column "clusters" in the colData, for example c("yellow", "#CC79A7").
#' @param statePalette "default" or a vector of colors for the column "state" in the colData, for example c("yellow", "#CC79A7").
#' @param clusteringMethod a clustering methods passed to hclust() function.
#' @param returnPlot boolean, return plot or not.
#' @param width plot width.
#' @param height plot height.
#' @param ... other parameters of pdf() and pheatmap() functions.
#'
#' @return A ggplot object or nothing (depends on returnPlot parameter). It saves the pdf in "dataDirectory/pictures" folder.
#' @export
plotClustersSimilarity <- function(clustersSimilarityMatrix, sceObject,
                                   dataDirectory,
                                   experimentName, colorPalette,
                                   statePalette,
                                   clusteringMethod,
                                   returnPlot = FALSE,
                                   width=7, height=5.5, ...) {

  .plotClustersSimilarity(clustersSimilarityMatrix, sceObject,
                          dataDirectory, 
                          experimentName, colorPalette,
                          statePalette,
                          clusteringMethod,
                          returnPlot = returnPlot,
                          width=width, height=height, ...)
}

#' @export
deleteFile <- function(fileName, outputDir){
    file.remove(file.path(outputDir, fileName))
}

# rankGenesInternal() saves n files in your outputDir, n=number of groups
rankGenesInternal <- function(expr, colData, column, simMed, outputDir,
                             experimentName){

    message("Ranking marker genes for each cluster.")

    filesList <- list.files(outputDir, pattern = "_genes.tsv")
    deletedFiles <- sapply(filesList, function(fileName) deleteFile(fileName,
                                                                    outputDir))

    stopifnot(all(colnames(expr) == rownames(colData)))
    groups <- unique(colData[,column])
    simMed = simMed + 0.05
    for(i in 1:length(groups)){
        message(paste("Working on the file", i))
        tTestPval <- data.frame(Gene=rownames(expr))
        otherGroups <- groups[groups!=groups[i]]

        for(k in 1:length(otherGroups)){
            tTestPval[,paste0("vs_", otherGroups[k])] <- NA
            x <- expr[,colData[,c(column)] == groups[i]]
            y <- expr[,colData[,c(column)] == otherGroups[k]]
            t <- (rowMeans(x) - rowMeans(y)) /
                sqrt(apply(expr, 1, var)*(1/ncol(x) + 1/ncol(y)))
            df <- ncol(x) + ncol(y) - 2
            tTestPval[, paste0("vs_", otherGroups[k])] <-
                pt(t, df, lower.tail=FALSE)
        }

        tTestFDR <- data.frame(Gene=tTestPval$Gene)
        for(l in 1:length(otherGroups)){
            tTestFDR[,paste0("vs_", otherGroups[l])] <-
                p.adjust(tTestPval[,paste0("vs_", otherGroups[l])],
                         method="fdr")
        }

        submat <- as.matrix(tTestFDR[,2:(length(otherGroups)+1)])
        tTestFDR$mean_log10_fdr <- rowMeans(log10(submat+1e-300))
        tTestFDR$n_05 <- apply(submat, 1, function(x)
            length(x[!is.na(x) & x < 0.05]))

        weights <- simMed[i, otherGroups]
        tTestFDR$score <- apply(submat, 1, function(x)
            sum(-log10(x+1e-300) * weights) / ncol(submat))

        tTestFDR <- tTestFDR[order(tTestFDR$score,decreasing=TRUE),]

        write.table(tTestFDR, file.path(outputDir,
                                        paste0(experimentName, "_cluster_",
                                               groups[i], "_genes.tsv")),
                    col.names=TRUE, row.names=FALSE, quote=FALSE,
                    sep="\t")
    }

    rm(tTestFDR, tTestPval, i, k, l, x, y, t, df, otherGroups, submat,
       weights, groups)
}

#' Rank marker genes by statistical significance.
#'
#' This function searches marker genes for each cluster. It saves tables in the "dataDirectory/marker_genes" directory,
#' one table per cluster.
#' 
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param clustersSimilarityMatrix matrix, result of conclus::calculateClustersSimilarity() function.
#' @param dataDirectory output directory for CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which will appear in filenames (supposed to be the same for one experiment during the workflow).
#' @export
#' @param column name of the column with a clustering result.
rankGenes <- function(sceObject, clustersSimilarityMatrix, dataDirectory,
                      experimentName, column="clusters"){
  # 

  markerGenesDirectory <- "marker_genes"
  rankGenesInternal(Biobase::exprs(sceObject), SummarizedExperiment::colData(sceObject), column,
             clustersSimilarityMatrix,
             file.path(dataDirectory, markerGenesDirectory), experimentName)

}

#' Get top N marker genes from each cluster. 
#' 
#' This function reads results of conclus::rankGenes() from "dataDirectory/marker_genes" and selects top N markers for each cluster.
#' 
#' @param dataDirectory output directory for a run of CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param genesNumber top N number of genes to get from one cluster.
#' @param experimentName name of the experiment which appears in filenames (supposed to be the same for one experiment during the workflow).
#' @param removeDuplicates boolean, if duplicated genes must be deleted or not.
#'
#' @return A data frame where the first columns are marker genes ("geneName") and 
#' the second column is the groups ("clusters").
#' @export
getMarkerGenes <- function(dataDirectory, sceObject, genesNumber=14,
                           experimentName, removeDuplicates = TRUE){

    markerGenesDirectory <- "marker_genes"
    numberOfClusters <- length(unique(SummarizedExperiment::colData(sceObject)$clusters))
    dir = file.path(dataDirectory, markerGenesDirectory)
    nTop = genesNumber
    clusters = unique(SummarizedExperiment::colData(sceObject)$clusters)

    markersClusters <- as.data.frame(matrix(ncol = 2,
                                            nrow = nTop*numberOfClusters))
    colnames(markersClusters) = c("geneName", "clusters")

    (fns <- list.files(dir, pattern = "_genes.tsv", full.names = FALSE))
    if(length(fns) != numberOfClusters){
        message(paste("Something wrong with number of files.
It is supposed to be equal to number of clusters:", numberOfClusters))
        message(paste("Returning the marker genes from
first", clusters, "clusters."))
        runUntil = numberOfClusters
    }else{
        runUntil = length(fns)
    }
    for(i in 1:runUntil){
        tmpAll <- read.delim(file.path(dir, paste(experimentName,
                                                  "cluster", clusters[i], "genes.tsv", sep="_")),
                             stringsAsFactors = FALSE)
        markersClusters$clusters[(nTop*(i-1)+1):(nTop*i)] <- as.character(clusters[i])
        markersClusters$geneName[(nTop*(i-1)+1):(nTop*i)] <- tmpAll$Gene[1:nTop]
    }
    if(removeDuplicates){
        markersClusters <- markersClusters[!duplicated(markersClusters$geneName),]
    }
    return(markersClusters)
}

orderCellsInCluster <- function(cluster, colData, mtx,
                                clusteringMethod="ward.D2"){
  # Order cells according to clustering results
  # Uses for ordering matrix to further plot it with pheatmap()

  cells <- colData[colData$clusters == cluster, ]$cellName
  if(length(cells) > 2){
      tree <- hclust(dist(t(mtx[, cells])), method=clusteringMethod)
      return(cells[tree$order])
  }else{
      return(cells)
  }

}

orderGenesInCluster <- function(cluster, markersClusters, mtx,
                                clusteringMethod="ward.D2"){
    # Order cells according to clustering results
    # Uses for ordering matrix to further plot it with pheatmap()

    genes <- markersClusters[markersClusters$clusters == cluster, ]$geneName
    if(length(genes) > 2){
        tree <- hclust(dist(mtx[genes, ]), method=clusteringMethod)
        return(genes[tree$order])
    }else{
        return(genes)
    }


}

generateAnnotationColors <- function(colData, colorPaletteParameter,
                                     statePalette){

  clusters <- levels(colData$clusters)
  states <- unique(colData$state)
  clusterNumber <- length(unique(colData$clusters))

  colorAnnotationClusters <- choosePalette(colorPaletteParameter, clusterNumber)
  #colorAnnotationState <- chooseStatePalette(length(states))
  colorAnnotationState <- choosePalette(statePalette, length(states))
  names(colorAnnotationState) <- states
  names(colorAnnotationClusters) <- clusters

  return(list(state=colorAnnotationState, clusters=colorAnnotationClusters))
}

# Do not export this function
.plotCellHeatmap <- function(markersClusters, sceObject, dataDirectory,
                            experimentName,
                            fileName, meanCentered=TRUE, colorPalette="default",
                            statePalette="default", clusteringMethod="ward.D2",
                            orderClusters = FALSE, #FALSE, TRUE, name, similarity
                            orderGenes = FALSE, # FALSE, TRUE (will be ordered the same as clusters)
                            returnPlot = FALSE,
                            saveHeatmapTable = FALSE,
                            width=10, height=8.5, onefile=FALSE, #pdf
                            family, title, fonts, version,
                            paper, encoding, bg, fg, pointsize,
                            pagecentre, colormodel,
                            useDingbats, useKerning,
                            fillOddEven, compress,
                            color = colorRampPalette(c("#023b84","#4b97fc",
                                            "#c9d9ef","#FEE395", #not default
                                            "#F4794E", "#D73027",#pheatmap
                                            "#a31008","#7a0f09"))(100),
                            kmeans_k = NA, breaks = NA,
                            border_color = "grey60",
                            cellwidth = NA, cellheight = NA,
                            scale = "none", cluster_rows = TRUE,
                            cluster_cols = FALSE, # not original default
                            clustering_distance_rows = "euclidean",
                            clustering_distance_cols = "euclidean",
                            clustering_method = "complete",
                            clustering_callback = identity2,
                            cutree_rows = NA, cutree_cols = NA,
                            treeheight_row = ifelse((
                                class(cluster_rows) == "hclust") ||
                                    cluster_rows, 50, 0),
                            treeheight_col = ifelse((
                                class(cluster_cols) == "hclust") ||
                                    cluster_cols, 50, 0),
                            legend = TRUE,
                            legend_breaks = NA,
                            legend_labels = NA,
                            annotation_row = NA,
                            annotation_col = NA,
                            annotation = NA,
                            annotation_colors = NA,
                            annotation_legend = TRUE,
                            annotation_names_row = TRUE,
                            annotation_names_col = TRUE,
                            drop_levels = TRUE,
                            show_rownames = TRUE,
                            show_colnames = FALSE, # not original default (T)
                            main = NA,
                            fontsize = 7.5, # not original default (10)
                            fontsize_row = 8, # not original default
                            fontsize_col = fontsize,
                            display_numbers = F,
                            number_format = "%.2f",
                            number_color = "grey30",
                            fontsize_number = 0.8 * fontsize,
                            gaps_row = NULL, gaps_col = NULL,
                            labels_row = NULL, labels_col = NULL,
                            filename = NA, widthHeatmap = NA,
                            heightHeatmap = NA, silent = FALSE){
  # plots correlation between cells between clusters

  colData <- SummarizedExperiment::colData(sceObject)
  expressionMatrix <- Biobase::exprs(sceObject)[rownames(Biobase::exprs(sceObject)) %in%
                                           markersClusters$geneName,]

  if(meanCentered == TRUE){
    meanRows <- rowSums(expressionMatrix)/ncol(expressionMatrix)
    expressionMatrix <- expressionMatrix - meanRows
  }

  if(orderClusters == FALSE & orderGenes == TRUE){
      message("Genes cannot be ordered without clusters.
              Returning heatmap with orderCluster = similarity")
      orderClusters <- "similarity"
  }
  if(orderClusters == "name"){
      # Ordering expressionMatrixrix
      newOrder <- unname(unlist(sapply(levels(colData$clusters),
                                   function(cluster)
                                         orderCellsInCluster(cluster,
                                         colData,
                                         expressionMatrix,
                                         clusteringMethod=clusteringMethod))))

      expressionMatrix <- expressionMatrix[, newOrder]
      cluster_cols <- FALSE
      if(orderGenes){
          newOrder <- unname(unlist(sapply(levels(colData$clusters),
                                   function(cluster)
                                          orderGenesInCluster(cluster,
                                          markersClusters,
                                          expressionMatrix,
                                          clusteringMethod=clusteringMethod))))
          expressionMatrix <- expressionMatrix[newOrder, ]
          cluster_rows <- FALSE
      }
  }else if((orderClusters == TRUE) | (orderClusters == "similarity")){
      cellsSimilarityMatrix <- read.delim(file.path(dataDirectory,
                                              "output_tables",
                                              paste0(experimentName,
                                              "_cellsSimilarityMatrix.csv")),
                                          stringsAsFactors = FALSE,
                                          header = TRUE,
                                          sep = ",")

      clustersSimOrdered <- calculateClustersSimilarity(cellsSimilarityMatrix,
                                                        sceObject,
                                                        clusteringMethod)[[2]]

      newOrder <- unname(unlist(sapply(clustersSimOrdered,
                                       function(cluster)
                                           orderCellsInCluster(cluster,
                                           colData,
                                           expressionMatrix,
                                           clusteringMethod=clusteringMethod))))

      expressionMatrix <- expressionMatrix[, newOrder]
      cluster_cols <- FALSE
      if(orderGenes){
          newOrder <- unname(unlist(sapply(clustersSimOrdered,
                                       function(cluster)
                                          orderGenesInCluster(cluster,
                                          markersClusters,
                                          expressionMatrix,
                                          clusteringMethod=clusteringMethod))))
          expressionMatrix <- expressionMatrix[newOrder, ]
          cluster_rows <- FALSE
      }
  }else if(orderClusters == FALSE){
      distanceMatrix <- dist(t(expressionMatrix))
      cluster_cols <- hclust(distanceMatrix, method="ward.D2")
  }else{
      message("Unknown option of orderClusters. Options are 'TRUE' ('similarity'),
'FALSE', 'name'. Using the default version 'FALSE'.")
      distanceMatrix <- dist(t(expressionMatrix))
      cluster_cols <- hclust(distanceMatrix, method="ward.D2")
  }

  if(!orderGenes){
      cluster_rows <- hclust(dist(expressionMatrix), method="ward.D2")
  }

  annotationColors <- generateAnnotationColors(colData, colorPalette,
                                               statePalette)
  columnsToPlot <- switch(is.null(colData$state) + 1, c("clusters", "state"),
                          c("clusters"))

  if(is.null(colData$clusters)){
      annCol <- switch(is.null(colData$state) + 1,
                       as.data.frame(colData["state"]), NA)
      annColors <- switch(is.null(colData$state) + 1,
                          annotationColors[names(annotationColors) == "state"],
                          NA)
  }else{
      annCol <- as.data.frame(colData[columnsToPlot])
      annColors <- annotationColors
  }
  
  pheatmapObject <- pheatmap::pheatmap(expressionMatrix,
                     show_colnames=show_colnames,
                     show_rownames=show_rownames,
                     annotation_col=annCol,
                     annotation_colors=annColors,
                     fontsize_row=fontsize_row,
                     cluster_cols=cluster_cols,
                     main=main,
                     cluster_rows=cluster_rows,
                     color=color,
                     fontsize=fontsize, kmeans_k = kmeans_k, # not changed by default
                     breaks = breaks,
                     border_color = border_color,
                     cellwidth = cellwidth, cellheight = cellheight, scale = scale,
                     clustering_distance_rows = clustering_distance_rows,
                     clustering_distance_cols = clustering_distance_cols,
                     clustering_method = clustering_method,
                     clustering_callback = clustering_callback,
                     treeheight_row = treeheight_row,
                     treeheight_col = treeheight_col, legend = legend,
                     legend_breaks = legend_breaks,
                     legend_labels = legend_labels,
                     annotation = annotation,
                     annotation_row = annotation_row,
                     annotation_legend = annotation_legend,
                     annotation_names_row = annotation_names_row,
                     annotation_names_col = annotation_names_col,
                     drop_levels = drop_levels,
                     fontsize_col = fontsize_col,
                     display_numbers = display_numbers,
                     number_format = number_format,
                     number_color = number_color,
                     fontsize_number = fontsize_numbere, gaps_row = gaps_row,
                     gaps_col = gaps_col, labels_row = labels_row,
                     labels_col = labels_col, filename = filename, width = widthHeatmap,
                     height = heightHeatmap, silent = silent)

  pdf(file.path(dataDirectory, "pictures",
                paste0(experimentName, "_", fileName, ".pdf")),
      width=width, height=height, onefile=onefile, # not changed by default
      family=family, title=title, fonts=fonts, version=version,
      paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
      pagecentre=pagecentre, colormodel=colormodel,
      useDingbats=useDingbats, useKerning=useKerning,
      fillOddEven=fillOddEven, compress=compress)
  grid::grid.newpage()
  grid::grid.draw(pheatmapObject$gtable)
  dev.off()

  if(saveHeatmapTable){
      exportMatrix(expressionMatrix, dataDirectory, experimentName, fileName)
  }

  if(returnPlot){
      return(pheatmapObject)
  }
}

#' Save markers heatmap.
#' 
#' This function plots heatmap with marker genes on rows and clustered cells on columns. 
#'
#' @param markersClusters a data frame where the first column is "geneName" containing genes names from sceObject, 
#' and the second column is corresponding "clusters". All names from that column must come from the column "clusters" in the colData(sceObject).
#' The data frame can be obtained from conclus::getMarkerGenes() function or created manually.
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory of a given CONCLUS run (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which appears in filenames (supposed to be the same for one experiment during the workflow).
#' @param fileName name of the ouput file
#' @param meanCentered boolean, should mean centering be applied to the expression data or not.
#' @param colorPalette "default" or a vector of colors for the column "clusters" in the colData, for example c("yellow", "#CC79A7").
#' @param statePalette "default" or a vector of colors for the column "state" in the colData, for example c("yellow", "#CC79A7").
#' @param clusteringMethod a clustering methods passed to hclust() function.
#' @param orderClusters boolean, should the heatmap be structured by clusters.
#' @param orderGenes boolean, should the heatmap be structured by genes.
#' @param returnPlot boolean, whether to return a ggplot object with the plot or not.
#' @param saveHeatmapTable boolean, whether to save the expression matrix used for heatmap into a .csv file or not.
#' The file will be saved into 'dataDirectory/output_tables' with the same name as the .pdf plot.
#' @param width plot width.
#' @param height plot height.
#' @param ... other parameters from pdf() and pheatmap() functions.
#'
#' @return A ggplot object of the plot if needed. The function saves pdf in "dataDirectiry/pictures" folder.
#' @export
plotCellHeatmap <- function(markersClusters, sceObject, dataDirectory,
                            experimentName,
                            fileName, meanCentered=TRUE, colorPalette="default",
                            statePalette="default", clusteringMethod="ward.D2",
                            orderClusters = FALSE, #FALSE, TRUE, name, similarity
                            orderGenes = FALSE, # FALSE, TRUE (will be ordered the same as clusters)
                            returnPlot = FALSE,
                            saveHeatmapTable = FALSE,
                            width=10, height=8.5, ...){

  .plotCellHeatmap(markersClusters, sceObject, dataDirectory,
                   experimentName,
                   fileName, meanCentered=meanCentered, colorPalette=colorPalette,
                   statePalette=statePalette, clusteringMethod=clusteringMethod,
                   orderClusters = orderClusters, #FALSE, TRUE, name, similarity
                   orderGenes = orderGenes, # FALSE, TRUE (will be ordered the same as clusters)
                   returnPlot = returnPlot,
                   saveHeatmapTable = saveHeatmapTable,
                   width=width, height=height, ...)
}

exportData <- function(sceObject, dataDirectory, experimentName){
  # exports all the data from workflow, including .RData files

  outputDataDirectory <- "output_tables"

  ################ EXPORT MATRIX, COLDATA, ROWDATA, FULL WORKSPACE
  write.table(Biobase::exprs(sceObject), file=file.path(dataDirectory,
            outputDataDirectory, paste0(experimentName, "_",
                                        "expression_matrix.tsv")), sep="\t",
            row.names = TRUE, quote = FALSE, col.names = TRUE)
  write.table(SummarizedExperiment::colData(sceObject), file=file.path(dataDirectory,
            outputDataDirectory, paste0(experimentName, "_",
                                        "colData.tsv")), sep="\t",
            row.names = TRUE, quote = FALSE, col.names = TRUE)
  write.table(rowData(sceObject), file=file.path(dataDirectory,
            outputDataDirectory, paste0(experimentName, "_",
                                        "rowData.tsv")), sep="\t",
            row.names = TRUE, quote = FALSE, col.names = TRUE)
  save.image(file=file.path(dataDirectory, outputDataDirectory,
                        paste0(experimentName, "_", "full_workspace.RData")))

}

#' DBSCAN clustering on t-SNE results.
#' 
#' This function provides consensus DBSCAN clustering based on the results of t-SNE. 
#' You can tune algorithm parameters in options to get the number of clusters you want.
#'
#' @param tSNEResults the result of conclus::generateTSNECoordinates() function.
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory of a given CONCLUS run (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which appears in filenames (supposed to be the same for one experiment during the workflow).
#' @param epsilon a parameter of fpc::dbscan() function.
#' @param minPoints a parameter of fpc::dbscan() function.
#' @param k preferred number of clusters. Alternative to deepSplit.
#' @param PCs a vector of first principal components.
#' For example, to take ranges 1:5 and 1:10 write c(5, 10).
#' @param perplexities a vector of perplexity for t-SNE.
#' @param randomSeed random seed for reproducibility.
#' @param deepSplit intuitive level of clustering depth. Options are 1, 2, 3, 4.
#' @param clusteringMethod a clustering methods passed to hclust() function.
#' @param cores maximum number of jobs that CONCLUS can run in parallel.
#' @param deleteOutliers Whether cells which were often defined as outliers by dbscan must be deleted.
#' It will require recalculating of the similarity matrix of cells. Default is FALSE.
#' Usually those cells appear in an "outlier" cluster and can be easier distinguished and deleted later
#' if necessary.
#' 
#'
#' @return A list containing filtered from outliers SingleCellExperiment object and cells similarity matrix.
#' @export
runClustering <- function(tSNEResults, # for deleteOutliers = FALSE
                          sceObject, dataDirectory,
                          experimentName, epsilon=c(1.3, 1.4, 1.5),
                          minPoints=c(3, 4), k=0, deepSplit=4,
                          clusteringMethod = "ward.D2",
                          cores=14,
                          deleteOutliers = TRUE,
                          PCs=c(4, 6, 8, 10, 20, 40, 50),
                          perplexities=c(30,40), # for deleteOutliers = TRUE
                          randomSeed = 42){
  # It combines all the clustering parts. Takes tSNE coordinates and gives
  # results of final clustering: sceObject and cell correlation matrix

  # run dbscan
  message("Running dbscan using ", cores, " cores.")
  dbscanResults <- runDBSCAN(tSNEResults, sceObject, dataDirectory,
                             experimentName, epsilon=epsilon,
                             minPoints=minPoints, cores=cores)
  if(deleteOutliers){
      # filter cluster outliers
      message("Excluding clustering outliers.")
      filteredResults <- excludeOutliers(dbscanResults, sceObject)
      sceObjectFiltered <- filteredResults[[1]]
      #dbscanResultsFiltered <- filteredResults[[2]]
      message("Getting TSNE coordinates for the filtered sceObject.")
      tSNEResultsFiltered <- generateTSNECoordinates(sceObjectFiltered,
                                                     dataDirectory,
                                                     experimentName, PCs=PCs,
                                                     perplexities=perplexities,
                                                     randomSeed=randomSeed)

      dbscanResultsFiltered <- runDBSCAN(tSNEResultsFiltered, sceObjectFiltered,
                                         dataDirectory,
                                         experimentName, epsilon=epsilon,
                                         minPoints=minPoints, cores=cores)
  }else{
      sceObjectFiltered = sceObject
      dbscanResultsFiltered = dbscanResults
  }

  # assign cells to cluster
  clusteringResults <- clusterCellsInternal(dbscanResultsFiltered, sceObjectFiltered,
                                    clusterNumber=k, deepSplit=deepSplit,
                                    clusteringMethod=clusteringMethod,
                                    cores=cores)
  sceObjectFiltered <- clusteringResults[[1]]
  cellsSimilarityMatrix <- clusteringResults[[2]]

  return(list(sceObjectFiltered, cellsSimilarityMatrix))
}

# getKEGGGenes
# Extracting genes from KEGG database by pathway ID.
# Returns only the genes that found in expression matrix.

# Please do not export this function because it does not work now.
.getKEGGGenes <- function(pathwayID, sceObject, species="mmu"){
  #

  keggOutput <- keggGet(paste0(species, pathwayID))[[1]]$GENE
  keggOutput <- keggOutput[seq(2, length(keggOutput), by = 2)]
  geneList <- unique(unname(sapply(keggOutput,
                                   function(x) strsplit(x, ";")[[1]][1])))
  filteredGeneList <- geneList[geneList %in% rownames(sceObject)]

  message(paste0("Taking ", length(filteredGeneList), " genes out of ",
             length(geneList), " from KEGG pathway: ", species, pathwayID,
             " ", keggGet(paste0(species, pathwayID))[[1]]$NAME))

  return(filteredGeneList)

}

# Do not export this function.
.plotGeneExpression <- function(geneName, experimentName, dataDirectory,
                               graphsDirectory = "pictures",
                               sceObject, tSNEpicture=1,
                               commentName = "", palette = c("grey","red",
                                                             "#7a0f09",
                                                             "black"),
                               returnPlot = FALSE,
                               savePlot = TRUE,
                               alpha = 1, limits = NA,
                               pointSize = 1,
                               width=6, height=5, onefile=FALSE, #pdf
                               family, title, fonts, version,
                               paper, encoding, bg, fg, pointsize,
                               pagecentre, colormodel,
                               useDingbats, useKerning,
                               fillOddEven, compress){

  experimentName <- experimentName
  dataDirectory <- dataDirectory
  tSNEDirectory <- "tsnes"

  ### Plot all precalculated t-SNEs to show your clusters ###

  clustersNumber <- length(unique(SummarizedExperiment::colData(sceObject)$clusters))

  coordsName <- list.files(file.path(dataDirectory, tSNEDirectory),
                          pattern = paste0(experimentName,'_tsne_coordinates_',
                                           tSNEpicture, "_"))

  tSNECoords <- read.delim(file.path(dataDirectory, tSNEDirectory, coordsName),
                        stringsAsFactors=FALSE)

  tSNECoords <- tSNECoords[SummarizedExperiment::colData(sceObject)$cellName, ]

  if(!geneName %in% rownames(Biobase::exprs(sceObject))){
    print("Gene is not found in expression matrix")
  }

  stopifnot(all(rownames(tSNECoords) == colnames(sceObject)))
  tSNECoords$expression <- Biobase::exprs(sceObject)[geneName, ]

  if(length(limits) == 1){
      limits <- c(min(tSNECoords$expression),
                  max(tSNECoords$expression))
  }

  if(savePlot){
      pdf(file.path(dataDirectory, graphsDirectory, paste0(paste(experimentName,
                                                                 "tSNE", clustersNumber, "clusters" , geneName, commentName,
                                                                 "tSNEpicture", tSNEpicture, "_alpha", alpha,
                                                                 sep="_"), ".pdf")),
          width=width, height=height, onefile=onefile, # not changed by default
          family=family, title=title, fonts=fonts, version=version,
          paper=paper, encoding=encoding, bg=bg, fg=fg, pointsize=pointsize,
          pagecentre=pagecentre, colormodel=colormodel,
          useDingbats=useDingbats, useKerning=useKerning,
          fillOddEven=fillOddEven, compress=compress)
  }
  tmp <- ggplot2::ggplot(tSNECoords, aes(x=tSNECoords[,1],
                                y=tSNECoords[,2], color=expression)) +
    geom_point(size=I(pointSize), alpha = alpha) + theme_bw() +
    scale_colour_gradientn(colours=alpha(colorRampPalette(palette)(100), 0.8),
                           limits = limits) +
      ggtitle(geneName)
  #brewer.pal(9, "OrRd")[0:9]
  print(tmp)

  if(savePlot){
      dev.off()
  }

  if(returnPlot){
      return(tmp)
  }
}

#' plotGeneExpression
#'
#' The function saves a t-SNE plot colored by expression of a given gene. 
#' Warning: filename with t-SNE results is hardcoded, so please don't rename the output file.
#'
#' @param geneName name of the gene you want to plot.
#' @param dataDirectory output directory for CONCLUS (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which appears in filenames (supposed to be the same for one experiment during the workflow).
#' @param graphsDirectory name of the subdirectory where to put graphs. Default is "dataDirectory/pictures".
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param tSNEpicture number of the picture you want to use for plotting. 
#' Please check "dataDirectory/tsnes" or "dataDirectory/pictures/tSNE_pictures/clusters" to get the number, it is usually from 1 to 14.
#' @param commentName comment you want to specify in the filename.
#' @param palette color palette for the legend.
#' @param returnPlot boolean, should the function return a ggplot object or not.
#' @param savePlot boolean, should the function export the plot to pdf or not.
#' @param alpha opacity of the points of the plot.
#' @param limits range of the gene expression shown in the legend.
#' This option allows generating t-SNE plots with equal color
#' scale to compare the expression of different genes. By default, limits are the range
#' of expression of a selected gene.
#' @param pointSize size of the point.
#' @param width plot width.
#' @param height plot height.
#' @param ... other parameters of the pdf() function.
#'
#' @return A ggplot object of the plot if needed.
#' @export
plotGeneExpression <- function(geneName, experimentName, dataDirectory,
                               graphsDirectory = "pictures",
                               sceObject, tSNEpicture=1,
                               commentName = "", palette = c("grey","red",
                                                             "#7a0f09",
                                                             "black"),
                               returnPlot = FALSE,
                               savePlot = TRUE,
                               alpha = 1, limits = NA,
                               pointSize = 1,
                               width=6, height=5, ...){

  .plotGeneExpression(geneName, experimentName, dataDirectory,
                      graphsDirectory = graphsDirectory,
                      sceObject, tSNEpicture=tSNEpicture,
                      commentName = commentName, palette = palette,
                      returnPlot = returnPlot,
                      savePlot = savePlot,
                      alpha = alpha, limits = limits,
                      pointSize = pointSize,
                      width=width, height=height, ...)
}

#' exportClusteringResults
#'
#' The function saves clustering results into a table. Row names are cell names in the same order as in the sceObject.
#'
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which appears at the beginning of the file name 
#' (supposed to be the same for one experiment during the workflow).
#' @param fileName the rest of output file name.
#'
#' @export
exportClusteringResults <- function(sceObject, dataDirectory,
                                    experimentName, fileName){

  tableData <- S4Vectors::DataFrame(clusters = SummarizedExperiment::colData(sceObject)$clusters,
                         row.names = SummarizedExperiment::colData(sceObject)$cellName)
  write.table(tableData,
              file = file.path(dataDirectory, "output_tables",
                               paste0(experimentName,"_", fileName)),
              sep = "\t", quote = FALSE)
}

#' addClusteringManually
#'
#' The function replaces the content of the column "clusters" in the colData(sceObject) 
#' with the clustering provided in the user table.
#' The function will return the sceObject with cells which intersect with the cells from the input table.
#'
#' @param fileName a file with the clustering solution (for example, from previous CONCLUS runs).
#' @param sceObject a SingleCellExperiment object with your experiment.
#' @param dataDirectory output directory (supposed to be the same for one experiment during the workflow).
#' @param experimentName name of the experiment which appears in filenames (supposed to be the same for one experiment during the workflow).
#' @param columnName name of the column with the clusters.
#'
#' @return A SingleCellExperiment object with the created/renewed column "clusters" in the colData(sceObject).
#' @export
addClusteringManually <- function(fileName, sceObject, dataDirectory,
                                  experimentName, columnName = "clusters"){

  tableData <- read.table(file.path(dataDirectory, "output_tables",
                                paste0(experimentName,"_", fileName)), sep="\t")

  if(all(rownames(SummarizedExperiment::colData(sceObject)) %in% rownames(tableData))){
    if(ncol(tableData) == 1){
      SummarizedExperiment::colData(sceObject)$clusters <-
          factor(tableData[rownames(SummarizedExperiment::colData(sceObject)),])
    } else {
      SummarizedExperiment::colData(sceObject)$clusters <-
          factor(tableData[rownames(SummarizedExperiment::colData(sceObject)),][,columnName])
    }

    return(sceObject)
  } else {
    message("Rownames in colData are not equal to rownames in table.
Returning SCE object with cells intersecting with clusters_table.")
      sceObject <- sceObject[ ,colnames(sceObject) %in%
                                 intersect(colnames(sceObject),
                                           rownames(tableData))]
      tableData$randomColumn <- NA
      tableData <- tableData[rownames(tableData) %in%
                                 intersect(colnames(sceObject),
                                           rownames(tableData)), ]
      SummarizedExperiment::colData(sceObject)$clusters <-
          factor(tableData[rownames(SummarizedExperiment::colData(sceObject)),][,columnName])
    return(sceObject)
  }
}

# This function assumes that rownames(countMatrix) are either ENSEMBL IDs or
# or SYMBOLs. It will return a rowData with the same rownames as in countMatrix
# but genes which are not ENSEMBL IDs or SYMBOLs will not receive the annotation.
# Both types are possible in one matrix but not in one gene.
# Genes like Lrrc8a_ENSMUSG00000007476 will not be annotated.
# Iformation about cell surface localization will be used in the Shine application.
# It is useful to find cell surface markers. But this part of the function
# takes some time, so if you need this info, you can disable this option by
# cellSurface = FALSE to speed up the calculations.
annotateGenes <- function(countMatrix, species = "mmu",
                          genomeAnnot, ensemblPattern, rowData = NULL,
                          databaseDir = system.file("extdata", package = "conclus")){
    if(databaseDir == FALSE){
        if(missing(species) & (missing(genomeAnnot) | missing(ensemblPattern))){
            message("Species is either not selected or not equal to 'mmu' or 'human'.
    Please, select among default species or use options genomeAnnot and
    ensemblPattern. See example in the help page.")
            return(NULL)
        }else if(!missing(genomeAnnot) & !missing(ensemblPattern)){
            species = "manual"
        }else if(species == "mmu"){
            suppressMessages(library(org.Mm.eg.db, warn.conflicts = F))
            genomeAnnot <- org.Mm.eg.db
            ensemblPattern <- "ENSMUSG"
        }else if(species == "human"){
            suppressMessages(library(org.Hs.eg.db, warn.conflicts = F))
            genomeAnnot <- org.Hs.eg.db
            ensemblPattern <- "ENSG"
        }
        ensemblGenes <- rownames(countMatrix)[grep(ensemblPattern,
                                                   rownames(countMatrix))]
        symbolGenes <- rownames(countMatrix)[!grepl(ensemblPattern,
                                                    rownames(countMatrix))]
        message(paste0("Annotating ",length(ensemblGenes), " genes containing ",
                       ensemblPattern, " pattern."))
        # if none of ENSEMBL genes are in database we fill their rowData rows with NA
        if(length(intersect(AnnotationDbi::keys(genomeAnnot,
                                                keytype = "ENSEMBL"),
                            ensemblGenes)) == 0){
            rowdataEnsembl <- data.frame(ENSEMBL = ensemblGenes,
                                         SYMBOL = NA,
                                         GENENAME = NA)
        }else{
            rowdataEnsembl <- AnnotationDbi::select(genomeAnnot, keys=ensemblGenes,
                                                    keytype="ENSEMBL",
                                                    columns=c("SYMBOL",
                                                              "GENENAME"),
                                                    multiVals="first")
            rowdataEnsembl <- rowdataEnsembl[!duplicated(rowdataEnsembl$ENSEMBL),]
        }
        rowdataEnsembl$nameInCountMatrix <- ensemblGenes
        message("Annotating rest ", length(symbolGenes), " genes
    considering them as SYMBOLs.")
        rowdataSymbol <- AnnotationDbi::select(genomeAnnot, keys=symbolGenes,
                                               keytype="SYMBOL",
                                               columns=c("ENSEMBL",
                                                         "GENENAME"),
                                               multiVals="first")
        rowdataSymbol <- rowdataSymbol[!duplicated(rowdataSymbol$SYMBOL),]
        rowdataSymbol$nameInCountMatrix <- symbolGenes
        rowdata <- base::rbind(rowdataSymbol, rowdataEnsembl)
        rm(rowdataEnsembl, rowdataSymbol)

        # sometimes several ensembls give one symbol
        (mult_sym <- rowdata$SYMBOL[!is.na(rowdata$SYMBOL) &
                                        duplicated(rowdata$SYMBOL)])

        # we decided don't combine such ensembls, but leave them unique with
        #"symbol_ensembl" ###
        (rowdata$SYMBOL[rowdata$SYMBOL %in% mult_sym] <-
                paste(rowdata$SYMBOL[rowdata$SYMBOL %in% mult_sym],
                      rowdata$ENSEMBL[rowdata$SYMBOL %in% mult_sym],
                      sep = "_"))
        rm(mult_sym)

        ensembl <- useMart(biomart = "ensembl", dataset="mmusculus_gene_ensembl")
        message("Retrieving information about genes from biomaRt.
                It can take up to five minutes, depends on Internet connection.")
        res <- getBM(attributes=c("ensembl_gene_id", "go_id", "name_1006",
                                  "chromosome_name", "gene_biotype"),
                     mart=ensembl)
        tmp <- res[!duplicated(res$ensembl_gene_id),]
        rowdata <- merge(rowdata, tmp[c("ensembl_gene_id",
                                        "chromosome_name", "gene_biotype")],
                         by.x = "ENSEMBL", by.y = "ensembl_gene_id",
                         all.x = TRUE, all.y = FALSE, sort = FALSE)
        rowdata_GO <- merge(rowdata, res[c("ensembl_gene_id",
                                           "go_id", "name_1006")],
                            by.x = "ENSEMBL", by.y = "ensembl_gene_id",
                            all.x = TRUE, all.y = FALSE, sort = FALSE)
        rowdata_GO <- rowdata_GO[!is.na(rowdata_GO$name_1006) &
                                     ((rowdata_GO$name_1006 == "cell surface") |
                (rowdata_GO$name_1006=="cell surface receptor signaling pathway")),]
        rowdata_GO$name_1006[duplicated(rowdata_GO$ENSEMBL)] <-
            "cell surface receptor signaling pathway"
        rowdata_GO <- rowdata_GO[!duplicated(rowdata_GO$ENSEMBL),]
        rowdata <- merge(rowdata, rowdata_GO[c("nameInCountMatrix",
                                               "go_id", "name_1006")],
                         by.x = "nameInCountMatrix", by.y = "nameInCountMatrix",
                         all.x = TRUE, all.y = TRUE, sort = FALSE)
        rm(tmp, ensembl, res, rowdata_GO)
    }else{
        # only for mouse
        ensemblPattern <- "ENSMUSG"
        database <- read.delim(file.path(databaseDir,
                                         "Mmus_gene_database_secretedMol.tsv"),
                               stringsAsFactors = FALSE)
        database <- database[!duplicated(database$Symbol),]

        ensemblGenes <- rownames(countMatrix)[grep(ensemblPattern,
                                                   rownames(countMatrix))]
        ensemblGenesInternal <- gsub(paste0(".*_", ensemblPattern),
                             ensemblPattern, ensemblGenes)
        symbolGenes <- rownames(countMatrix)[!grepl(ensemblPattern,
                                                    rownames(countMatrix))]

        rowdataEnsembl <- data.frame(ensemblGenesInternal = ensemblGenesInternal,
                                     nameInCountMatrix = ensemblGenes)
        rowdataSymbol <- data.frame(nameInCountMatrix = symbolGenes)

        message(paste0("Annotating ",length(ensemblGenes), " genes containing ",
                       ensemblPattern, " pattern."))

        rowdataEnsembl <- merge(rowdataEnsembl, database,
                         by.x = "ensemblGenesInternal", by.y = "Ensembl",
                         all.x = TRUE, all.y = FALSE, sort = FALSE)
        rowdataEnsembl <- rowdataEnsembl[,-1]

        rowdataEnsembl$Ensembl <- rowdataEnsembl$nameInCountMatrix
        rowdataSymbol$Symbol <- rowdataSymbol$nameInCountMatrix

        message("Annotating rest ", length(symbolGenes), " genes
    considering them as SYMBOLs.")

        rowdataSymbol <- merge(rowdataSymbol, database,
                               by.x = "nameInCountMatrix", by.y = "Symbol",
                               all.x = TRUE, all.y = FALSE, sort = FALSE)

        rowdata <- base::rbind(rowdataSymbol, rowdataEnsembl)
        colnames(rowdata)[colnames(rowdata) == "Ensembl"] <- "ENSEMBL"
        colnames(rowdata)[colnames(rowdata) == "Symbol"] <- "SYMBOL"
        colnames(rowdata)[colnames(rowdata) == "Name"] <- "GENENAME"
        colnames(rowdata)[colnames(rowdata) == "Feature.Type"] <- "gene_biotype"
    }

    if(!is.null(rowData)){
        rowData$nameInCountMatrix <- rownames(rowData)
        rowdata <- merge(rowData, rowdata,
                         by.x = "nameInCountMatrix", by.y = "nameInCountMatrix",
                         all.x = TRUE, all.y = TRUE, sort = FALSE)
    }

    rownames(rowdata) <- rowdata$nameInCountMatrix
    rowdata <- rowdata[rownames(countMatrix),]
    rowdata$SYMBOL[(S4Vectors::isEmpty(rowdata$SYMBOL)) | (rowdata$SYMBOL == "")] <- NA
    stopifnot(all(rownames(rowdata) == rownames(countMatrix)))

    return(rowdata)
}

filterCells <- function(countMatrix, colData, genesSumThr = 100,
                        MoreBetter = c("genesNum", "sumCodPer", "genesSum"),
                        MoreWorse = c("sumMtPer")){
    message("Running filterCells.")
    countMatrix <- countMatrix[,colSums(countMatrix) > genesSumThr]
    colData <- colData[colnames(countMatrix),]
    mb <- MoreBetter
    mw <- MoreWorse

    reportTable <- data.frame(matrix(NA, ncol = length(mb)+length(mw),
                                     nrow = nrow(colData)))
    colnames(reportTable) <- c(mb, mw)
    reportTable <- cbind(cellName = colData$cellName, reportTable)
    rownames(reportTable) <- reportTable$cellName

    stopifnot(all(colData$cellName==reportTable$cellName))

    for(j in 1:length(mb)){
        quan <- quantile(colData[,colnames(colData) == mb[j]])
        threshold <- 2.5*quan[2] - 1.5*quan[4]
        if(threshold < 0){
            threshold <- (quan[1]+quan[2]) / 2
        }
        reportTable[colData[,colnames(colData)==mb[j]] >= as.numeric(threshold),
                    colnames(reportTable)==mb[j]] = 1
        reportTable[colData[,colnames(colData)==mb[j]] < as.numeric(threshold),
                    colnames(reportTable)==mb[j]] = 0
    }
    for(j in 1:length(mw)){
        quan <- quantile(colData[,colnames(colData) == mw[j]])
        threshold <- 2.5*quan[4] - 1.5*quan[2]
        if(threshold > quan[5]){
            threshold <- (quan[3]+quan[4]) / 2
        }
        reportTable[colData[,colnames(colData)==mw[j]] <= as.numeric(threshold),
                    colnames(reportTable)==mw[j]] = 1
        reportTable[colData[,colnames(colData)==mw[j]] > as.numeric(threshold),
                    colnames(reportTable)==mw[j]] = 0
    }

    ### add columns with filtering score and verdict ###
    reportTable <- dplyr::mutate(reportTable, score = NA)
    reportTable$score <- rowSums(reportTable[,colnames(reportTable) %in%
                                                 c(mb,mw)])
    reportTable <- dplyr::mutate(reportTable, filterPassed = NA)
    reportTable$filterPassed[reportTable$score >= length(mb)+length(mw)] <- 1
    reportTable$filterPassed[reportTable$score < length(mb)+length(mw)] <- 0

    ### add filtering verdict to colData ###
    colData <- dplyr::mutate(colData, filterPassed = NA)
    colData$filterPassed[colData$cellName %in%
                    reportTable$cellName[reportTable$filterPassed == 1]] <- 1
    colData$filterPassed[colData$cellName %in%
                    reportTable$cellName[reportTable$filterPassed == 0]] <- 0

    reportTable <- reportTable[order(reportTable$score, decreasing  =  FALSE), ]

    rm(threshold, j, mb, mw)

    rownames(colData) <- colData$cellName
    colData <- colData[colnames(countMatrix), ]
    stopifnot(all(rownames(colData) == colnames(countMatrix)))

    countMatrix <- countMatrix[,colData$filterPassed == 1]
    colData <- colData[colData$filterPassed == 1,]

    return(list(countMatrix, colData))
}

# from 2_mk_coldata_light.R
# I rewrote it with the new style
# This function creates colData or add columns mtGenes, genesNum,
# codGenes, genesSum, codSum, mtPer, codPer, sumMtPer, sumCodPer to the
# existing colData.
addCellsInfo <- function(countMatrix, rowData, colData = NULL){
    message("Adding cell info for cells filtering.")
    coldata <- data.frame(cellName = colnames(countMatrix),
                          stringsAsFactors = FALSE)

    ### add info about all genes in a cell ###
    coldata <- dplyr::mutate(coldata, genesNum = NA, genesSum = NA, oneUMI = NA)
    coldata$genesSum <- colSums(countMatrix)
    for(i in 1:ncol(countMatrix)){
        vec <- countMatrix[,i]
        coldata$genesNum[coldata$cellName == colnames(countMatrix)[i]] <-
            length(vec[vec > 0])
        coldata$oneUMI[coldata$cellName == colnames(countMatrix)[i]] <-
            length(vec[vec == 1])
    }
    rm(vec)
    coldata <- dplyr::mutate(coldata,
                            oneUMIper = 100 * coldata$oneUMI / coldata$genesNum)

    ### add info about mitochondrial and protein-coding genes ###
    coldata <- dplyr::mutate(coldata,
                             mtGenes = NA, mtSum = NA,
                             codGenes = NA, codSum = NA)
    for(i in 1:ncol(countMatrix)){
        mt <- countMatrix[rownames(countMatrix) %in%
                            rowData$nameInCountMatrix[rowData$chromosome_name ==
                                                                        "MT"],i]
        coldata$mtGenes[coldata$cellName == colnames(countMatrix)[i]] <-
            length(mt[mt > 0])
        coldata$mtSum[coldata$cellName == colnames(countMatrix)[i]]   <- sum(mt)

        cod <- countMatrix[rownames(countMatrix) %in%
                    rowData$nameInCountMatrix[rowData$gene_biotype ==
                                                            "protein_coding"],i]
        coldata$codGenes[coldata$cellName == colnames(countMatrix)[i]] <-
            length(cod[cod > 0])
        coldata$codSum[coldata$cellName == colnames(countMatrix)[i]] <- sum(cod)
    }

    coldata <- dplyr::mutate(coldata,
                    mtPer      = 100 * coldata$mtGenes  / coldata$genesNum,
                    codPer     = 100 * coldata$codGenes / coldata$genesNum,
                    sumMtPer  = 100 * coldata$mtSum    / coldata$genesSum,
                    sumCodPer = 100 * coldata$codSum   / coldata$genesSum)
    rm(mt, cod)

    if(!is.null(colData)){
        colData$cellName <- rownames(colData)
        coldata <- merge(colData, coldata,
                         by.x = "cellName", by.y = "cellName",
                         all.x = FALSE, all.y = TRUE, sort = FALSE)
    }

    rownames(coldata) <- coldata$cellName
    coldata <- coldata[colnames(countMatrix),]
    stopifnot(all(rownames(coldata) == colnames(countMatrix)))

    return(coldata)
}


filterGenes <- function(countMatrix, rowData){

    # internal function, filters genes which are more than in 10 cells and less than (all-10) cells

    selRows <- ((rowSums(countMatrix[,] >= 1)) > 10)
    countMatrix <- countMatrix[selRows,]
    rowData <- rowData[rowData$nameInCountMatrix %in% rownames(countMatrix),]

    return(list(countMatrix, rowData))
}

# Do not export this function.
.normaliseCountMatrix <- function(countMatrix,
                                 species,
                                 method="default",
                                 sizes=c(20,40,60,80,100),
                                 rowData=NULL,
                                 colData=NULL,
                                 alreadyCellFiltered = FALSE,
                                 runQuickCluster = TRUE,
                                 databaseDir = system.file("extdata", package = "conclus")){
    # Does normalisation of count matrix with.
    # There are 2 possible methods: "default" or "census"
    # The function returns SCE object with normalised count matrix
    if(method == "default"){
        rowData <- annotateGenes(countMatrix, species = species,
                                 rowData = rowData, databaseDir = databaseDir)
        colData <- addCellsInfo(countMatrix, rowData = rowData,
                                colData = colData)
        if(!alreadyCellFiltered){
            filterCellsResult <- filterCells(countMatrix, colData)
            countMatrix <- filterCellsResult[[1]]
            colData <- filterCellsResult[[2]]
            rm(filterCellsResult)
        }
        filterGenesResult <- filterGenes(countMatrix, rowData)
        countMatrix <- filterGenesResult[[1]]
        rowData <- filterGenesResult[[2]]
        rm(filterGenesResult)

        stopifnot(all(rownames(countMatrix) == rownames(rowData)))
        stopifnot(all(colnames(countMatrix) == rownames(colData)))

        sce <-
            SingleCellExperiment::SingleCellExperiment(assays = list(counts = as.matrix(countMatrix)),
                                 colData=colData, rowData=rowData)

        # normalization
        message("Running normalization. It can take a while, depends on the
number of cells.")
        if(runQuickCluster){
            cl <- tryCatch(scran::quickCluster(sce), error=function(e) NULL)
        }else{
            cl <- NULL
        }

        # compute sizeFactors which will be used for normalization
        sceNorm <- scran::computeSumFactors(sce, sizes = sizes, clusters = cl)

        message("summary(sizeFactors(sceObject)):")
        print(summary(SingleCellExperiment::sizeFactors(sceNorm)))
        if(length(SingleCellExperiment::sizeFactors(sceNorm)[SingleCellExperiment::sizeFactors(sceNorm) <= 0]) > 0){
            message("Cells with negative sizeFactors will be deleted before the
downstream analysis.")
        }
        sceNorm <- sceNorm[, SingleCellExperiment::sizeFactors(sceNorm) > 0]
        sceNorm <- scater::normalize(sceNorm)
        rm(sce)

        return(sceNorm)

    }else if(method == "census"){
         message("Method 'census' is currently unavailable. Please select 'default'.")
         message("Unmodified count matrix returned.")
         return(countMatrix)
    #    sceObject <- normalize_dataset(as.matrix(countMatrix))
    #    SummarizedExperiment::colData(sceObject)$cellName = rownames(SummarizedExperiment::colData(sceObject))
    #    return(sceObject)
    }else{
        message("Wrong method. Unmodified count matrix returned.")
        return(countMatrix)
    }
}

#' normaliseCountMatrix
#'
#' Create a SingleCellExperiment object and perform normalization. The same as conclus::normalizeCountMatrix.
#'
#' @param countMatrix a matrix with non-normalised gene expression.
#' @param species either 'mmu' or 'human'.
#' @param method a method of clustering: available option is "default" using scran and scater.
#' @param sizes a vector of size factors from scran::computeSumFactors() function.
#' @param rowData a data frame with information about genes
#' @param colData a data frame with information about cells
#' @param alreadyCellFiltered if TRUE, cells quality check and filtering will not be applied. 
#' However, the function may delete some cells if they have negative size factors after scran::computeSumFactors.
#' @param runQuickCluster if scran::quickCluster() function must be applied.
#' Usually, it allows to improve normalization for medium-size count matrices. 
#' However, it is not recommended for datasets with less than 200 cells and
#' may take too long for datasets with more than 10000 cells.
#' @param databaseDir a path to annotation database provided with CONCLUS called 
#' "Mmus_gene_database_secretedMol.tsv" (only for MusMusculus 'mmu').
#' The function will work also without the database but slower because it will retrieve genes info from biomaRt.
#'
#' @return A SingleCellExperiment object with normalized gene expression, colData, and rowData.
#' @export
normaliseCountMatrix <- function(countMatrix,
                                 species,
                                 method="default",
                                 sizes=c(20,40,60,80,100),
                                 rowData=NULL,
                                 colData=NULL,
                                 alreadyCellFiltered = FALSE,
                                 runQuickCluster = TRUE,
                                 databaseDir = system.file("extdata", package = "conclus") # FALSE for not using the database but download from biomaRt
                                 ){

    .normaliseCountMatrix(countMatrix,
                                 species,
                                 method=method,
                                 sizes=sizes,
                                 rowData=rowData,
                                 colData=colData,
                                 alreadyCellFiltered = alreadyCellFiltered,
                                 runQuickCluster = runQuickCluster,
                                 databaseDir = databaseDir)
}
# deleted from the description of the normaliseCountMatrix() function:
# #' @param method a method of clustering: "default" (using scran and scater) or "census" (using Census from Monocle).

#' @export
normalizeCountMatrix <- function(countMatrix,
                                 species,
                                 method="default",
                                 sizes=c(20,40,60,80,100),
                                 rowData=NULL,
                                 colData=NULL,
                                 alreadyCellFiltered = FALSE,
                                 runQuickCluster = TRUE,
                                 databaseDir = ""){

    .normaliseCountMatrix(countMatrix,
                                 species,
                                 method="default",
                                 sizes=c(20,40,60,80,100),
                                 rowData=NULL,
                                 colData=NULL,
                                 alreadyCellFiltered = FALSE,
                                 runQuickCluster = TRUE,
                                 databaseDir = "")
}

#' Collect genes information to one table.
#'
#' The function takes a data frame containing gene symbols and (or) ENSEMBL IDs and returns
#' a data frame with such information as gene name, feature type, chromosome,
#' gene IDs in different annotations, knockout information from MGI, a summary from NCBI 
#' and UniProt, and whether or not a gene belongs to GO terms containing proteins on the cell surface or 
#' involved in secretion.
#'
#' @param genes a data frame with the first column called "geneName" containing gene symbols and (or) ENSEMBL IDs.
#' Other columns are optional. For example, the second column could be "clusters" with the name of the cluster 
#' for which the gene is a marker.
#' @param databaseDir a path to the database provided with CONCLUS called "Mmus_gene_database_secretedMol.tsv".
#' @param groupBy a column in the input table used for grouping the genes in the output tables.
#' This option is useful if a table contains genes from different clusters.
#' @param orderGenes if "initial" then the order of genes will not be changed.
#' @param getUniprot boolean, whether to get information from UniProt or not. Default is TRUE.
#' Sometimes, the connection to the website is not reliable. 
#' If you tried a couple of times and it failed, select FALSE. 
#' @param silent whether to show messages from intermediate steps or not.
#' @param coresGenes maximum number of jobs that the function can run in parallel.
#'
#' @return Returns a data frame.
#' @export
getGenesInfo <- function(genes, databaseDir = system.file("extdata", package = "conclus"), 
                         groupBy = "clusters",
                         orderGenes = "initial",
                         getUniprot = TRUE,
                         silent = FALSE, coresGenes = 20){

    # MGI
    getMGIentry <- function(MGIid){
        library('rvest')
        library(S4Vectors)
        data <- NULL
        if(!is.na(MGIid)){
            url <- paste0("http://www.informatics.jax.org/marker/",
                          MGIid)
            webpage <- read_html(url)
            data_html <- html_nodes(webpage,'#mpMarkerClip')
            data <- html_text(data_html)
            data <- gsub("\t", "", data, perl = TRUE)
            data <- gsub("\n", "", data, perl = TRUE)
            data <- gsub(";", ",", data)

            rm(url, data_html, webpage)
        }
        if(!S4Vectors::isEmpty(data)){
            return(data)
        }else{
            return(NA)
        }
    }

    #NCBI
    getNCBIentry <- function(NCBIid){
        library('rvest')
        library(S4Vectors)
        data <- NULL
        if(!is.na(NCBIid)){
            url <- paste0("https://www.ncbi.nlm.nih.gov/gene?cmd=Retrieve&dopt=full_report&list_uids=",
                          NCBIid)
            webpage <- read_html(url)
            data_html <- html_nodes(webpage,'#summaryDl dd:nth-child(20)')
            data <- html_text(data_html)
            data <- gsub(" See more", "", data)
            data <- gsub("\n          human\n          all\n", "", data)
            data <- gsub(";", ",", data)

            rm(url, data_html, webpage)
        }
        if(!S4Vectors::isEmpty(data)){
            return(data)
        }else{
            return(NA)
        }
    }

    # Uniprot
    getUniprotEntry <- function(UniprotID){
        library('rvest')
        library(S4Vectors)
        data <- NULL
        if(!is.na(UniprotID)){
            url <- paste0("https://www.uniprot.org/uniprot/",
                          UniprotID, "#function")
            webpage <- read_html(url)
            data_html <- html_nodes(webpage,'h2+ .annotation > span')
            data <- html_text(data_html)
            data <- gsub("* Publication.*", " Publications", data)
            data <- gsub('<p><a href="/manual/evidences#ECO:0000250">More...</a></p>',
                         "", data)
            data <- gsub('\r\n .* <p>Manually curated information .* toiUniProtKB:.* (.*_.*).',
                         "", data)
            data <- gsub('\r\n .* <p>Manual validated information.*',
                         "", data)
            data <- gsub('UniRule annotation', "", data)
            data <- gsub(";", ",", data)

            rm(url, data_html, webpage)
        }
        if(!S4Vectors::isEmpty(data)){
            return(data)
        }else{
            return(NA)
        }
    }

    library(DataCombine)
    library(doParallel)
    #database <- read.delim(file.path(databaseDir, "Mmus_gene_database.tsv"),
    #                       stringsAsFactors = FALSE)
    database <- read.delim(file.path(databaseDir, "Mmus_gene_database_secretedMol.tsv"),
                           stringsAsFactors = FALSE)
    database <- database[!duplicated(database$Symbol),]

    genesEnsembl <- genes[grepl("ENSMUSG", genes$geneName),]
    genesSymbol <- genes[!grepl("ENSMUSG", genes$geneName),]

    resultEnsembl <- merge(genesEnsembl, database,
                           by.x = "geneName", by.y = "Ensembl",
                           all.x = TRUE, all.y = FALSE, sort = FALSE)
    resultEnsembl$Ensembl <- resultEnsembl$geneName

    resultSymbol <- merge(genesSymbol, database,
                          by.x = "geneName", by.y = "Symbol",
                          all.x = TRUE, all.y = FALSE, sort = FALSE)
    resultSymbol$Symbol <- resultSymbol$geneName

    colnames(resultEnsembl)
    colnamesOrder = c("geneName", "clusters", "Ensembl", "Symbol", "Name",
                      "Feature.Type", "MGI.Gene.Marker.ID", "Entrez.Gene.ID",
                      "Uniprot.ID", "chromosome_name", "go_id", "name_1006")
    resultEnsembl <- resultEnsembl[,colnamesOrder]
    resultSymbol <- resultSymbol[,colnamesOrder]

    result <- rbind(resultSymbol, resultEnsembl)
    #colnames(result)[11:12] <- c("CellSurface.GOid", "CellSurface.GOname")

    rownames(result) <- result$geneName
    result <- result[genes$geneName,]

    if(orderGenes == "alphabetical"){
        result <- result[order(result$geneName),]
    }

    myCluster <- parallel::makeCluster(coresGenes, # number of cores to use
                             type = "PSOCK") # type of cluster
    doParallel::registerDoParallel(myCluster)

    # MGI vector
    if(!silent){
        message("Collecting knockout phenotype information from MGI.")
    }

    MGI <- unname(unlist(foreach::foreach(MGIid=result$MGI.Gene.Marker.ID) %dopar% getMGIentry(MGIid)))

    #NCBI vector
    if(!silent){
        message("Retrieving info from NCBI.")
    }

    NCBI <- unname(unlist( foreach::foreach(NCBIid=result$Entrez.Gene.ID) %dopar% getNCBIentry(NCBIid) ))

    if(getUniprot){
        # Uniprot
        if(!silent){
            message("Getting summary from Uniprot.")
        }
        UniprotFunction <- unname(unlist( foreach::foreach(UniprotID=result$Uniprot.ID) %dopar% getUniprotEntry(UniprotID) ))
    }

    parallel::stopCluster(myCluster)

    #MGI <- data.frame(MGI, stringsAsFactors = FALSE)
    #NCBI <- data.frame(NCBI, stringsAsFactors = FALSE)
    #UniprotFunction <- data.frame(UniprotFunction, stringsAsFactors = FALSE)

    if(getUniprot){
        result <- cbind(result, MGI, NCBI, UniprotFunction)
    }else{
        result <- cbind(result, MGI, NCBI)
    }

    result$MGI <- as.character(result$MGI)
    result$NCBI <- as.character(result$NCBI)
    if(getUniprot){
        result$UniprotFunction <- as.character(result$UniprotFunction)
    }

    rownames(result) <- c(1:nrow(result))

    # inserting space for comments
    if(any(colnames(result) %in% groupBy) &
       (orderGenes == "initial") &
       (length(unique(result$clusters)) > 1)){
        resultFinal <- result
        groupingTable <- table(resultFinal[,groupBy])
        groupingTable <- groupingTable[unique(resultFinal$clusters)]
        resultFinal <- InsertRow(resultFinal, c("For notes:",
                                                rep("", (ncol(result))-1)),
                                 RowNum = 1)

        RowNum <- groupingTable[1] + 1
        for(i in 1:(length(groupingTable)-1)){
            resultFinal <- InsertRow(resultFinal, rep("", ncol(result)),
                                     RowNum = (RowNum + 1))
            resultFinal <- InsertRow(resultFinal, c("For notes:",
                                                    rep("", (ncol(result))-1)),
                                     RowNum = (RowNum + 2))
            RowNum <- RowNum + 2 + groupingTable[i+1]
        }
        result <- resultFinal
        rm(resultFinal)
    }

    rm(resultEnsembl, resultSymbol, database, colnamesOrder)

    #result <- result[,c("geneName", "clusters", "Name",
    #                    "Feature.Type",  "CellSurface.GOid",
    #                    "CellSurface.GOname", "MGI", "NCBI",
    #                    "UniprotFunction", "chromosome_name",
    #                    "Symbol", "Ensembl", "MGI.Gene.Marker.ID",
    #                    "Entrez.Gene.ID", "Uniprot.ID")]
    if(getUniprot){
        result <- result[,c("geneName", "clusters", "Name",
                        "Feature.Type",  "go_id",
                        "name_1006", "MGI", "NCBI",
                        "UniprotFunction", "chromosome_name",
                        "Symbol", "Ensembl", "MGI.Gene.Marker.ID",
                        "Entrez.Gene.ID", "Uniprot.ID")]
    }else{
        result <- result[,c("geneName", "clusters", "Name",
                        "Feature.Type",  "go_id",
                        "name_1006", "MGI", "NCBI",
                        "chromosome_name", # no "UniprotFunction"
                        "Symbol", "Ensembl", "MGI.Gene.Marker.ID",
                        "Entrez.Gene.ID", "Uniprot.ID")]
    }
    return(result)
}

#' Save gene information into a table or tables for multiple inputs.
#'
#' This function runs conclus::getGenesInfo() function for all tables into the inputDir 
#' and saves the result into the outputDir.
#'
#' @param dataDirectory a directory with CONCLUS output. You can specify either 
#' dataDirectory, then inputDir and outputDir will be hardcoded, or inputDir and outputDir only.
#' The first is recommended during running CONCLUS workflow when the second option
#' is comfortable when you created input tables with genes manually.
#' @param inputDir input directory containing text files. These files can be obtained by 
#' applying conclus::saveMarkersLists() function or created manually. Each file must be a 
#' data frame with the first column called "geneName" containing gene symbols and (or) ENSEMBL IDs.
#' @param pattern a pattern of file names to take.
#' @param outputDir output directory.
#' @param databaseDir a path to the database "Mmus_gene_database_secretedMol.tsv". It is provided with the conclus package.
#' @param sep a parameter of read.delim() function.
#' @param header whether or not your input files have a header.
#' @param startFromFile number of the input file to start with. The function approaches files one by one.
#' It uses web scraping method to collect publicly available info from MGI, NCBI and UniProt websites.
#' Sometimes, if the Internet connection is not reliable, the function can drop. 
#' In this case, it is comfortable to start from the failed file and not to redo the previous ones.
#' @param groupBy a column in the input table used for grouping the genes in the output tables.
#' @param orderGenes if "initial" then the order of genes will not be changed.
#' @param getUniprot boolean, whether to get information from UniProt or not. Default is TRUE.
#' Sometimes, the connection to the website is not reliable. 
#' If you tried a couple of times and it failed, select FALSE. 
#' @param silent whether to show messages from intermediate steps or not.
#' @param coresGenes maximum number of jobs that the function can run in parallel.
#'
#' @return It saves text files either in the 'dataDirectory/marker_genes/saveGenesInfo' or outputDir 
#' depending on whether you specify dataDirectory or (inpitDir and outputDir) explicitly.
#' @export
saveGenesInfo <- function(dataDirectory = "",
                          inputDir = "", 
                          outputDir = "", 
                          pattern = "", 
                          databaseDir = system.file("extdata", package = "conclus"),
                          sep = ";", header = TRUE,
                          startFromFile = 1, #outputFormat = c("csv", "xlsx"),
                          groupBy = "clusters", # getGenesInfo params
                          orderGenes = "initial",
                          getUniprot = TRUE,
                          silent = FALSE, coresGenes = 20){

    if(dataDirectory != ""){
        inputDir = file.path(dataDirectory, "/marker_genes/markers_lists")
        outputDir = file.path(dataDirectory, "/marker_genes/saveGenesInfo")
        pattern = "markers.csv"
        dir.create(outputDir, showWarnings=F)
    }

    filesList <- list.files(inputDir, pattern = pattern)
    filePrefix <- do.call(rbind, strsplit(filesList, "[.]"))[,1]

    for(file in startFromFile:length(filesList)) {
        cat(file, "\n")
        genes <- read.delim(file.path(inputDir, filesList[file]),
                            sep = sep, header = header,
                            stringsAsFactors = FALSE)

        result <- getGenesInfo(genes, databaseDir,
                               groupBy = groupBy,
                               orderGenes = orderGenes,
                               getUniprot = getUniprot,
                               silent = silent, coresGenes = coresGenes)

        message("Writing the output file number ", file, "\n")

        #if(any(outputFormat %in% "csv")){
            write.table(result, file = file.path(outputDir,
                                                 paste0(filePrefix[file],
                                                        "_genesInfo.csv")),
                        quote = FALSE, sep = ";", row.names = FALSE)
        #}else if(any(outputFormat %in% "xlsx")){
            #write.xlsx2(result, file = file.path(outputDir,
            #                                     paste0(filePrefix[file],
            #                                            "_genesInfo.xlsx")),
            #            row.names=FALSE)
        #}
    }
}

#' Save top N marker genes for each cluster into a format suitable for conclus::saveGenesInfo() function.
#' 
#' The function takes the output files of conclus::rankGenes(), extracts top N markers and saves
#' them into the first "geneName" column of the output table. The second column "clusters" contains the 
#' name of the corresponding cluster.
#'
#' @param experimentName name of the experiment which appears at the beginning of the file name 
#' (supposed to be the same for one experiment during the workflow).
#' @param dataDirectory experiment directory (supposed to be the same for one experiment during the workflow).
#' @param inputDir input directory, usually "marker_genes" created automatically after conclus::runCONCLUS().
#' @param outputDir output directory.
#' @param pattern a pattern of the input file names to take.
#' @param Ntop number of top markers to take from each cluster.
#'
#' @return It saves files into the outputDir. The number of files is equal to the number of clusters.
#' @export
saveMarkersLists <- function(experimentName, dataDirectory,
                             inputDir = file.path(dataDirectory, "marker_genes"),
                             outputDir = file.path(dataDirectory,
                                                   paste0("marker_genes/markers_lists")),
                             pattern = "genes.tsv", Ntop = 100){

    dir.create(outputDir, showWarnings=F)

    filesList <- list.files(outputDir, pattern = "_markers.csv")
    deletedFiles <- sapply(filesList, function(fileName) deleteFile(fileName,
                                                                    outputDir))
    fnames <- list.files(inputDir, pattern = pattern)
    for(i in 1:length(fnames)){
        tmp <- read.delim(file.path(inputDir, fnames[i]), stringsAsFactors = FALSE)
        markerList <- as.data.frame(tmp$Gene[1:Ntop])
        outputName <- gsub(paste0("_", pattern), "", fnames[i])
        clusterName <- gsub(paste0(experimentName, "_"), "", outputName)
        colnames(markerList) <- "geneName"
        markerList$clusters <- clusterName
        write.table(markerList, file.path(outputDir,
                                          paste0(outputName, "_markers.csv")),
                    row.names = FALSE, quote = FALSE, sep = ";")
    }
}
PolinaPavlovich/CONCLUS documentation built on May 10, 2019, 2:42 p.m.