R/veni.R

Defines functions JackD GetGeneFromNetwork SaveNetworkH5 GenerateSeuratObject SaveCountsToH5 QCPlots GeneBoxPlot DepthBoxPlot CoverageBoxPlot MitoBoxPlot DepthPlotComparison DepthPlot array_split GetCutoffs FilterMito GetMito CellBarPlot GenerateLbls BatchProblemPlot HClustBackgroundMatrix InspectBackground InspectBackgroundGenes FilterBackgroundMatrix InspectBackgroundMatrix GetBackgroundFromMatrix GetBackground GetBackgroundMatrix RegressMito Decontaminate

Documented in array_split BatchProblemPlot CellBarPlot CoverageBoxPlot Decontaminate DepthBoxPlot DepthPlot DepthPlotComparison FilterBackgroundMatrix FilterMito GeneBoxPlot GenerateLbls GenerateSeuratObject GetBackground GetBackgroundFromMatrix GetBackgroundMatrix GetCutoffs GetGeneFromNetwork GetMito HClustBackgroundMatrix InspectBackground InspectBackgroundGenes InspectBackgroundMatrix JackD MitoBoxPlot QCPlots RegressMito SaveCountsToH5 SaveNetworkH5

#' Load count matrix from a txt (or txt.gz) file
#'
#' @param txt.file Filename of the counts txt (or txt.gz) file
#' @return Count matrix with genes and barcodes
#' @export
LoadTxt <- function (txt.file)
{
  E = utils::read.delim(txt.file, stringsAsFactors=FALSE, row.names = 1)
  E = as.matrix(E)
  E = Matrix::Matrix(E, sparse = TRUE)
  return(E)
}

#' Load count matrix from a txt (or txt.gz) file
#'
#' @param txt.folder folder where each sub-directory contains the counts txt (or txt.gz) file
#' @return Count matrix with genes and barcodes
#' @export
LoadTxtFolder <- function (txt.folder)
{
  txt.folder = gsub("\\/$", "", txt.folder, perl = TRUE);
  fls = list.files(txt.folder, full.names = T)
  E = lapply(fls, function(x) utils::read.delim(x, stringsAsFactors=FALSE, row.names = 1))
  E = lapply(E, as.matrix)
  E = lapply(E, function(x) Matrix::Matrix(x, sparse = TRUE))
  return(E)
}

#' Load 10X count matrix from an h5 file
#'
#' @param filename Directory and filename of the h5 file
#' @param use.names Boolean TRUE, see ?Seurat::Read10X_h5
#' @param unique Sets the rownames (genes) of the matrix to be unique, see ?make.names
#' @return Count matrix with genes and barcodes
#' @export
Read10Xh5 <- function (filename, use.names = TRUE, unique = TRUE)
{
  if (!requireNamespace("hdf5r", quietly = TRUE)) {
    stop("Please install hdf5r to read HDF5 files")
  }
  if (!file.exists(filename)) {
    stop("File not found")
  }
  infile <- hdf5r::H5File$new(filename)
  genomes <- names(infile)
  output <- list()
  flag = !infile$attr_exists("PYTABLES_FORMAT_VERSION")
  if (flag) {
    if (use.names) {
      feature_slot <- "features/name"
    }
    else {
      feature_slot <- "features/id"
    }
  } else {
    if (use.names) {
      feature_slot <- "gene_names"
      feature_slot2 = "genes"
    }
    else {
      feature_slot = "genes"
    }
  }
  for (genome in genomes) {
    counts <- infile[[paste0(genome, "/data")]]
    indices <- infile[[paste0(genome, "/indices")]]
    indptr <- infile[[paste0(genome, "/indptr")]]
    shp <- infile[[paste0(genome, "/shape")]]
    features <- infile[[paste0(genome, "/", feature_slot)]][]
    if (!flag)
      features2 <- infile[[paste0(genome, "/", feature_slot2)]][]
    barcodes <- infile[[paste0(genome, "/barcodes")]]
    sparse.mat <- Matrix::sparseMatrix(i = indices[] + 1, p = indptr[],
                                       x = as.numeric(counts[]), dims = shp[], giveCsparse = FALSE)
    if (!flag) {
      rownames(sparse.mat) <- make.names(paste(features, features2, sep = "_._"), unique = unique)
    } else {
      rownames(sparse.mat) <- make.names(features, unique = unique)
    }
    colnames(sparse.mat) <- barcodes[]
    sparse.mat <- as(object = sparse.mat, Class = "dgCMatrix")
    output[[genome]] <- sparse.mat
  }
  infile$close_all()
  if (length(output) == 1) {
    return(output[[genome]])
  }
  else {
    return(output)
  }
}

#' Load decont count matrix from an h5 file
#'
#' @param filename Directory and filename of the h5 file
#' @param use.names Boolean TRUE, see ?Seurat::Read10X_h5
#' @return Count matrix with genes and barcodes
#' @export
ReadDeconth5 <- function (filename, use.names = TRUE)
{
  if (!requireNamespace("hdf5r", quietly = TRUE)) {
    stop("Please install hdf5r to read HDF5 files")
  }
  if (!file.exists(filename)) {
    stop("File not found")
  }
  infile <- hdf5r::H5File$new(filename)
  genomes <- names(infile)
  output <- list()
  if (!infile$attr_exists("PYTABLES_FORMAT_VERSION")) {
    if (use.names) {
      feature_slot <- "features/name"
    }
    else {
      feature_slot <- "features/id"
    }
  } else {
    if (use.names) {
      feature_slot <- "gene_names"
      feature_slot2 = "genes"
    }
    else {
      feature_slot = "genes"
    }
  }
  for (genome in genomes) {
    counts <- infile[[paste0(genome, "/data")]]
    indices <- infile[[paste0(genome, "/indices")]]
    indptr <- infile[[paste0(genome, "/indptr")]]
    shp <- infile[[paste0(genome, "/shape")]]
    features <- infile[[paste0(genome, "/", "gene_names")]][]
    features2 <- infile[[paste0(genome, "/", "genes")]][]
    barcodes <- infile[[paste0(genome, "/barcodes")]]
    sparse.mat <- Matrix::sparseMatrix(i = indices[] + 1, p = indptr[],
                                       x = as.numeric(counts[]), dims = shp[], giveCsparse = FALSE)
    rownames(sparse.mat) <- paste(features, features2, sep = "_._")
    colnames(sparse.mat) <- barcodes[]
    sparse.mat <- as(object = sparse.mat, Class = "dgCMatrix")
    output[[genome]] <- sparse.mat
  }
  infile$close_all()
  if (length(output) == 1) {
    return(output[[genome]])
  }
  else {
    return(output)
  }
}

#' Load count matrix from a tsv (or tsv.gz) file
#'
#' @param counts.file Filename of the counts tsv (or tsv.gz) file
#' @param meta.file Filename of the metadata tsv (or tsv.gz) file
#' @return Count matrix with genes and barcodes
#' @export
ReadCelseq <- function (counts.file, meta.file)
{
  E = suppressWarnings(readr::read_tsv(counts.file));
  gns <- E$gene;
  E = E[,-1]
  M = suppressWarnings(readr::read_tsv(meta.file))
  S = lapply(unique(M$sample), function(x) Matrix::Matrix(as.matrix(E[,M$sample == x]), sparse = TRUE))
  S = lapply(S, function(x) as(x, Class = "dgCMatrix"))
  names(S) <- unique(M$sample)
  #rownames(S[[1]]) <- gns
  S = lapply(S, function(x){
    rownames(x) <- gns
    x
  })
  S
}

#' Load Indrop count matrix from "matrix.txt" file
#'
#' @param filename Directory and filename of the "matrix.txt" file
#' @return Count matrix with genes and barcodes
#' @export
ReadIndrop <- function (filename)
{
  q = utils::read.delim(filename, stringsAsFactors = FALSE)
  barcodes = q$Name
  genes = colnames(q)[-c(1,2,3,4,5)]
  matrix = Matrix::Matrix(t(as.matrix(q[,-c(1,2,3,4,5)])), sparse = TRUE)
  matrix = as(matrix, "dgTMatrix")
  rownames(matrix) <- genes
  colnames(matrix) <- barcodes
  return(matrix)
}

#' Load folder of Indrop count matrices from "matrix.txt" file
#'
#' @param filedir Directory where each folder contains a "matrix.txt" file
#' @return Count matrix with genes and barcodes
#' @export
ReadIndropFolder <- function (filedir)
{
  filedir = gsub("\\/$", "", filedir, perl = TRUE);
  dirs = list.files(filedir, full.names = TRUE)
  fns = list.files(dirs, full.names = TRUE)
  q = lapply(fns, function(x) utils::read.delim(x, stringsAsFactors = FALSE))
  barcodes = lapply(q, function(x) x$Name)
  genes = lapply(q, function(x) colnames(x)[-c(1,2,3,4,5)])
  matrix = lapply(q, function(x) Matrix::Matrix(t(as.matrix(x[,-c(1,2,3,4,5)])), sparse = TRUE))
  matrix = lapply(matrix, function(x) as(x, "dgTMatrix"))
  matrix = mapply(function(x,y) {
    colnames(x) <- y
    x
  }, x = matrix, y = barcodes)
  names(matrix) <- list.files(filedir)
  return(matrix)
}

#' Load decont count matrix from an h5 file in each sub-folder
#'
#' @param filedir Directory where each folder contains filename fn
#' @param fn File name of each .h5 file. By default, fn = "raw_gene_bc_matrices_h5.h5"
#' @return Count matrix with genes and barcodes
#' @seealso Read10Xh5_v2, ReadCelseq
#' @export
ReadDeconth5Folder <- function (filedir, fn = "count_matrix.h5")
{
  filedir = gsub("\\/$", "", filedir, perl = TRUE);
  fls = list.files(filedir, full.names = TRUE)
  fls = list.files(fls, full.names = TRUE)
  fls = fls[grepl(fn, fls)]
  if (length(fls) == 0)
  {
    warning(fn, "not detected in any folder. Perhaps you can check that the file exists? \n")
    stop()
  }
  cat(" ..........  Entry in ReadDeconth5_folder: \n");
  ta = proc.time()[3];
  cat(" Number of count matrices:", length(fls), "\n");
  E = lapply(fls, function(x) ReadDeconth5(x))
  names(E) <- list.files(filedir)[grepl(fn, fls)]
  cat(" Number of genes:", sapply(E, nrow)[1], "\n");
  cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
  tb = proc.time()[3] - ta;
  cat(" ..........  Exit Read10X_h5_folder \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
  return(E)
}

#' Load 10X count matrix from an h5 file in each sub-folder
#'
#' @param filedir Directory where each folder contains filename fn
#' @param fn File name of each .h5 file. By default, fn = "raw_gene_bc_matrices_h5.h5"
#' @return Count matrix with genes and barcodes
#' @seealso Read10Xh5_v2, ReadCelseq
#' @export
Read10Xh5Folder <- function (filedir, fn = "raw_gene_bc_matrices_h5.h5")
{
  filedir = gsub("\\/$", "", filedir, perl = TRUE);
  fls = list.files(filedir, full.names = TRUE)
  fls = list.files(fls, full.names = TRUE)
  fls = fls[grepl(fn, fls)]
  if (length(fls) == 0)
  {
    warning(fn, "not detected in any folder. Perhaps you can check that the file exists? \n")
    stop()
  }
  cat(" ..........  Entry in Read10X_h5_folder: \n");
  ta = proc.time()[3];
  cat(" Number of count matrices:", length(fls), "\n");
  E = lapply(fls, function(x) Read10Xh5(x))
  names(E) <- list.files(filedir)[grepl(fn, fls)]
  cat(" Number of genes:", sapply(E, nrow)[1], "\n");
  cat(" Total number of barcodes:", sum(sapply(E, ncol)), "\n");
  tb = proc.time()[3] - ta;
  cat(" ..........  Exit Read10X_h5_folder \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
  return(E)
}

#' Filter barcodes
#'
#' @param E Expression matrix or a list of expression matrices
#' @param min_genes_before All barcodes with less than min_genes_before detected will be removed prior to background correction.
#' @param min_umi_before All barcodes with less than min_umi_before will be removed prior to background correction.
#' @param keep If TRUE, all genes are kept for total counts and genes detected calculation. If FALSE, total counts and genes detected are caculated after discarding mitochondrial and ribosomal genes.
#' @return Filtered expression matrices and barcodes
#' @export
FilterBarcodes <- function (E, min_genes_before = 0, min_umi_before = 0, keep = T)
{
  if (! class(E) %in% "list")
    E = list(E)

  cat(" ..........  Entry in FilterBarcodes: \n");
  ta = proc.time()[3];
  cat("             Total number of barcodes before filtering:", sum(sapply(E, ncol)), "\n");
  # keep only cells with > min_genes genes detected
  if (keep) {
    E = lapply(E, function(x) x[,(Matrix::colSums(x != 0) > min_genes_before) & (Matrix::colSums(x) > min_umi_before)])
  } else {
    crap = grepl("^RPS", rownames(E[[1]])) | grepl("^RPL", rownames(E[[1]])) | grepl("^RP[0-9]", rownames(E[[1]])) | grepl("^MT-", rownames(E[[1]]))
    E = lapply(E, function(x) x[,(Matrix::colSums(x[!crap,] != 0) > min_genes_before) & (Matrix::colSums(x[!crap,]) > min_umi_before)])
  }
  cat("             Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
  tb = proc.time()[3] - ta;
  cat(" ..........  Exit FilterBarcodes \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
  return(E)
}

#' Main function
#'
#' @param E Expression matrix or a list of expression matrices
#' @param B Background matrix or a list of background matrices (see ?GetBackground)
#' @param min_genes_before All barcodes with less than min_genes_before detected will be removed prior to background correction.
#' @param min_genes_after All barcodes with less than min_genes_after detected will be removed after background correction.
#' @param min_umi_before All barcodes with less than min_umi_before will be removed prior to background correction.
#' @param min_umi_after All barcodes with less than min_umi_after will be removed after background correction.
#' @param do.par If TRUE, regressions will be done in parallel
#' @param num.cores Set number of cores for parallel computation. By default, all available cores - 1 will be used.
#' @param large if TRUE, dataset is chunked prior to regression to prevent any memory errors.
#' @param num.chunks if large, the dataset is chunked into num.chunks, and then regressed in parallel.
#' @return Background corrected expression matrices
#' @export
Decontaminate <- function(E, B, min_genes_before = 0, min_genes_after = 0, min_umi_before = 0, min_umi_after = 0, do.par = FALSE, num.cores = NULL, large = FALSE, num.chunks = 20)
{
  if (! class(E) %in% "list")
    E = list(E)
  if (! class(B) %in% "list")
    B = list(B)

  cat(" ..........  Entry in Decontaminate: \n");
  ta = proc.time()[3];
  cat("             Number of count matrices:", length(E), "\n");
  cat("             Number of genes:", sapply(E, nrow)[1], "\n");
  cat("             Total number of barcodes:", sum(sapply(E, ncol)), "\n");

  # keep only cells with > min_genes genes detected
  crap = grepl("RPS[0-9]", rownames(E[[1]]), ignore.case = T) | grepl("RPL[0-9]", rownames(E[[1]]), ignore.case = T)  | grepl("^MT-", rownames(E[[1]]), ignore.case = T) | grepl("^RP[0-9]", rownames(E[[1]]), ignore.case = T)

  # keep only cells with > min_umi_before detected
  E = lapply(E, function(x) x[,(Matrix::colSums(x[!crap,]) > min_umi_before) & (Matrix::colSums(x[!crap,] != 0) > min_genes_before)])

  if (min_genes_before != 0 | min_umi_before != 0)
    cat("             Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");

  if (large)
    E = lapply(E, function(x) array_split(x, num.chunks))

  cat(" ..........  Running regressions with RcppArmadillo (this may take a minute)... \n");

  if (!do.par)
    num.cores = 1

  if (do.par & is.null(num.cores))
    num.cores = parallel::detectCores() - 1

  if (!large)
  {
    Q = parallel::mcmapply(function(x,y) {
      if(sum(y) == 0)
        return(x)
      q = Matrix::Matrix(apply(x,2, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
      }), sparse = TRUE)
      q[q < 1] <- 0
      q}
      ,x = E, y = B, SIMPLIFY = FALSE, mc.cores = num.cores)
  } else {
    Q = parallel::mclapply(E, function(X) {mapply(function(x,y) {
      if(sum(y) == 0)
        return(x)
      q = Matrix::Matrix(apply(x,2, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
      }), sparse = TRUE)
      q[q < 1] <- 0
      q
    }
    ,x = X, y = B, SIMPLIFY = FALSE)}, mc.cores = num.cores)
  }

  if (large)
    Q = list(do.call(cbind, lapply(Q, function(x) do.call(cbind, x))))

  # round Q to be integers
  Q = lapply(Q, round)

  cat(" ..........  Regressions completed successfully! \n");
  Q[which(sapply(B,sum) != 0)] = lapply(Q[which(sapply(B,sum) != 0)], function(x) {
    x[,(Matrix::colSums(x[!crap,]) > min_umi_after) & (Matrix::colSums(x[!crap,] != 0) > min_genes_after)]
  })
  cat("             Total number of decontaminated barcodes with genes detected > ", min_genes_after, "and total counts >", min_umi_after, ":", sum(sapply(Q, ncol)), "\n");

  Q = lapply(Q, function(x){rownames(x) <- rownames(E[[1]])
  x})

  tb = proc.time()[3] - ta;
  cat(" ..........  Exit Decontaminate \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
  return(Q)
}

#' Regress MT percentage
#'
#' @param E Expression matrix or a list of expression matrices
#' @param M Mitochondrial gene percentage vector or a list of these vector (see ?GetMito)
#' @param min_genes_before All barcodes with less than min_genes_before detected will be removed prior to background correction.
#' @param min_genes_after All barcodes with less than min_genes_after detected will be removed after background correction.
#' @param min_umi_before All barcodes with less than min_umi_before will be removed prior to background correction.
#' @param do.par If TRUE, regressions will be done in parallel
#' @param num.cores Set number of cores for parallel computation. By default, all available cores - 1 will be used.
#' @param large if TRUE, dataset is chunked prior to regression to prevent memory errors.
#' @param num.chunks if large, the dataset is chunked into num.chunks, and then regressed in parallel.
#' @return Background corrected expression matrices
#' @export
RegressMito <- function(E, M, min_genes_before = 0, min_genes_after = 0, min_umi_before = 0, do.par = FALSE, num.cores = NULL, large = FALSE, num.chunks = 20)
{
  if (! class(E) %in% "list")
    E = list(E)
  if (! class(M) %in% "list")
    M = list(M)

  cat(" ..........  Entry in RegressMito: \n");
  ta = proc.time()[3];
  cat("             Number of count matrices:", length(E), "\n");
  cat("             Number of genes:", sapply(E, nrow)[1], "\n");
  cat("             Total number of barcodes:", sum(sapply(E, ncol)), "\n");

  # keep only cells with > min_genes genes detected
  crap = grepl("^RPS", rownames(E[[1]])) | grepl("^RPL", rownames(E[[1]])) | grepl("^RP1", rownames(E[[1]]))  | grepl("^RP2", rownames(E[[1]])) | grepl("^RP3", rownames(E[[1]])) | grepl("^RP4", rownames(E[[1]])) | grepl("^RP5", rownames(E[[1]])) | grepl("^RP6", rownames(E[[1]])) | grepl("^RP7",rownames(E[[1]])) | grepl("^RP8", rownames(E[[1]])) | grepl("^RP9", rownames(E[[1]]))  | grepl("^MT-", rownames(E[[1]]))

  E = lapply(E, function(x) x[,Matrix::colSums(x[!crap,] != 0) > min_genes_before])
  # keep only cells with > min_umi_before detected
  E = lapply(E, function(x) x[,Matrix::colSums(x[!crap,]) > min_umi_before])

  if (min_genes_before != 0 | min_umi_before != 0)
    cat("             Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");

  if (large)
    E = lapply(E, function(x) array_split(x, num.chunks))

  cat(" ..........  Running regressions with RcppArmadillo (this may take a minute)... \n");

  if (!do.par)
    num.cores = 1

  if (do.par & is.null(num.cores))
    num.cores = parallel::detectCores() - 1

  if (!large)
  {
    Q = parallel::mcmapply(function(x,y) {
      if(sum(y) == 0)
        return(x)
      q = Matrix::Matrix(apply(x,1, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
      }), sparse = TRUE)
      q[q < 1] <- 0
      q}
      ,x = E, y = M, SIMPLIFY = FALSE, mc.cores = num.cores)
  } else {
    Q = parallel::mclapply(E, function(X) {mapply(function(x,y) {
      if(sum(y) == 0)
        return(x)
      q = Matrix::Matrix(apply(x,1, function(z) {as.matrix(z) - as.matrix(y) %*% RcppArmadillo::fastLmPure(as.matrix(y), as.matrix(z))$coefficients
      }), sparse = TRUE)
      q[q < 1] <- 0
      q
    }
    ,x = X, y = M, SIMPLIFY = FALSE)}, mc.cores = num.cores)
  }

  if (large)
    Q = list(do.call(cbind, lapply(Q, function(x) do.call(cbind, x))))

  # transpose
  Q = lapply(Q, Matrix::t)

  cat(" ..........  Regressions completed successfully! \n");
  Q[which(sapply(M,sum) != 0)] = lapply(Q[which(sapply(M,sum) != 0)], function(x) {
    x[,Matrix::colSums(x != 0) >= min_genes_after]
  })
  cat("             Total number of regressed barcodes with genes detected >= ", min_genes_after, ":", sum(sapply(Q, ncol)), "\n");

  rownames(Q[[1]]) <- rownames(E[[1]])

  Q = mapply(function(x,y){
    colnames(x) <- colnames(y)
    x
  }, x = Q, y = E)

  tb = proc.time()[3] - ta;
  cat(" ..........  Exit RegressMito \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
  return(Q)
}

#' Get background matrix
#'
#' @param E Expression matrix or a list of expression matrices
#' @param Cutoffs A dataframe as returned by ?GetCutoffs
#' @return Background expression matrices
#' @export
GetBackgroundMatrix <- function(E, Cutoffs)
{
  if (! class(E) %in% "list")
    E = list(E)

  if (NROW(Cutoffs) != 1)
  {
    B = mapply(function(x,y) {
      if (!is.na(y$min_umi)) {
        idx = Matrix::colSums(x) >= y$min_umi & Matrix::colSums(x != 0) <= y$min_genes;
        x[,idx]
      } else {
        return(Matrix::Matrix(0, nrow(E[[1]]), 1, sparse = TRUE))
      }
    }, x = E, y = Cutoffs, SIMPLIFY = F)
  } else {
    B = lapply(E, function(x) {idx = Matrix::colSums(x) >= Cutoffs[[1]]$min_umi & Matrix::colSums(x != 0) <= Cutoffs[[1]]$min_genes;
    x[,idx]})
  }

  B[sapply(B, class) == "numeric"] = lapply(B[sapply(B, class) == "numeric"], function(x) {
    Matrix::Matrix(cbind(x,x))
  })

  return(B)
}

#' Get background distribution
#'
#' @param E Expression matrix or a list of expression matrices
#' @param Cutoffs A dataframe as returned by ?GetCutoffs
#' @return Background distributions
#' @export
GetBackground <- function(E, Cutoffs = NA)
{
  if (! class(E) %in% "list")
    E = list(E)

  cat(" ..........  Entry in GetBackground: \n");
  ta = proc.time()[3];
  cat("             Number of count matrices:", length(E), "\n");
  cat("             Number of genes:", nrow(E[[1]]), "\n");
  cat("             Total number of barcodes:", sum(sapply(E, ncol)), "\n");

  if (is.na(Cutoffs[1]))
    Cutoffs = GetCutoffs(E, verbose = F)

  if (is.data.frame(Cutoffs))
    Cutoffs = list(Cutoffs)

  # Get background matrix
  B = GetBackgroundMatrix(E = E, Cutoffs = Cutoffs)

  # Define contaminated droplets
  B = lapply(B, function(x){
    if (!sum(x)) {
      return(Matrix::Matrix(0, nrow(E[[1]]), 1, sparse = TRUE))
    } else if (is.null(ncol(x)))
    {
      return(x)
    } else {
      apply(x, 1, max)
    }})

  if (0 %in% sapply(B, sum))
  {
    cat(" ..........  Friendly message from GetBackground! No background detected for samples: \n");
    cat("             ", paste(names(E)[which(sapply(B,sum) == 0)],collapse = ", "))
    cat("\n")
  }

  tb = proc.time()[3] - ta;
  cat(" ..........  Exit GetBackground \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
  return(B)
}

#' Get background distribution
#'
#' @param B Expression matrix or a list of expression matrices as returned by ?GetBackgroundMatrix
#' @return Background distributions
#' @export
GetBackgroundFromMatrix <- function(B)
{
  if (! class(B) %in% "list")
    B = list(B)

  # Define contaminated droplets
  B = lapply(B, function(x){
    if (!sum(x)) {
      return(Matrix::Matrix(0, nrow(B[[1]]), 1, sparse = TRUE))
    } else if (is.null(ncol(x)))
    {
      return(x)
    } else {
      apply(x, 1, max)
    }})
  return(B)
}

#' Inspect background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @return a correlogram of the Pearson correlation coefficients
#' @export
InspectBackgroundMatrix <- function(B)
{
  if (class(B) != "list")
    B = list(B)

  if (is.null(names(B)))
    names(B) <- paste("Sample", seq_along(1:length(B)))

  B = B[! 0 %in% sapply(B, sum)]
  B = B[! "numeric" %in% sapply(B, class)]
  # remove all zero expressed genes
  B = lapply(B, function(x){
    if (class(x) == "numeric")
      x = Matrix::Matrix(cbind(x,x))
    x[Matrix::rowSums(x) != 0,]
  })

  # generate correlation matrix
  q = lapply(B, function(x) {
    cor_mat = stats::cor(as.matrix(x))
    colnames(cor_mat) <- paste("CB", 1:ncol(cor_mat))
    rownames(cor_mat) <-paste("CB", 1:ncol(cor_mat))
    corrplot::corrplot(cor_mat,
                       addgrid.col = NA,
                       tl.col='black',
                       cl.cex=1,
                       tl.srt = 60,
                       cl.align = "l",
                       tl.cex = 1.5,
                       cl.ratio = 0.2,
                       cl.length = 5)
  })
  return(q)
}

#' Filter background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @param cutoff a value between 0 and 1 that filters based on the average Pearson correlation coefficient. Default is 0.75.
#' @return a filtered background matrix
#' @export
FilterBackgroundMatrix <- function(B, cutoff = 0.75)
{
  if (class(B) != "list")
    B = list(B)

  if (is.null(names(B)))
    names(B) <- paste("Sample", seq_along(1:length(B)))

  # remove all zero expressed genes
  D = lapply(B, function(x){
    x[Matrix::rowSums(x) != 0,]
  })

  # generate correlation matrix
  q = lapply(D, function(x) {
    cor_mat = stats::cor(as.matrix(x))
  })

  # get average correlation coefficient
  q = mapply(function(x,y) {
    kmu = colMeans(x)
    y[,kmu > cutoff]
  }, x = q, y= B, SIMPLIFY = F)

  return(q)
}

#' Inspect background genes in background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @param num.genes number of genes to display. Default is 10.
#' @return a barplot of the percentage of cells expressing genes
#' @export
InspectBackgroundGenes <- function(B, num.genes = 10)
{
  if (class(B) != "list")
    B = list(B)

  if (is.null(names(B)))
    names(B) <- paste("Sample", seq_along(1:length(B)))

  B = B[! 0 %in% sapply(B, sum)]

  # generate plots
  q = lapply(B, function(x) {
  kmu = Matrix::rowSums(x != 0)
  xx = factor(names(kmu)[order(kmu, decreasing = T)], levels = names(kmu)[order(kmu, decreasing = T)])
  df = data.frame(Genes   = xx,
                  Percent = kmu[order(kmu, decreasing = T)] / ncol(x))
  df = df[1:num.genes,]
  ggplot2::ggplot(data=df, ggplot2::aes(x=Genes, y=round(Percent*100))) +
    ggplot2::geom_bar(stat="identity", fill="steelblue")+
    ggplot2::theme_minimal() + ggplot2::xlab("Genes") + ggplot2::ylab("Percent expressed") + ggplot2::coord_flip() + ggplot2::ggtitle("Background expression profile")
  })
  return(q)
}

#' Inspect background distribution
#'
#' @param B Background expression, as returned by GetBackground
#' @return list of the gene names in the empty droplets for each sample with background.
#' @export
InspectBackground <- function(B)
{
  if (class(B) != "list")
    B = list(B)

  if (is.null(names(B)))
    names(B) <- paste("Sample", seq_along(1:length(B)))

  B = B[! 0 %in% sapply(B, sum)]

  q = lapply(B, function(x) {
    names(x)[x != 0]
  })
  return(q)
}

#' Hierarchical cluster the background matrix
#'
#' @param B Background expression matrix, as returned by GetBackgroundMatrix
#' @param k Parameter for tree cutting, see ?cutree
#' @return group membership for each background cell
#' @export
HClustBackgroundMatrix <- function(B, k = 2)
{
  if (class(B) != "list")
    B = list(B)

  if (is.null(names(B)))
    names(B) <- paste("Sample", seq_along(1:length(B)))

  B = B[! 0 %in% sapply(B, sum)]

  # remove all zero expressed genes
  B = lapply(B, function(x){
    x[Matrix::rowSums(x) != 0,]
  })

  # generate correlation matrix, perform hierarchical clustering
  q = lapply(B, function(x) {
    cor_mat = stats::cor(as.matrix(x))
    stats::hclust(stats::as.dist(1 - cor_mat))
  })

  # return cluster membership
  hc = lapply(q, function(x){
    stats::cutree(x, k = k)
  })
  return(hc)
}

#' Inspect the shannon entropy of edges
#'
#' @param G An adjacency matrix (cell by cell) for the nearest neighbors in the dataset
#' @param S A vector of labels (cells) for the samples of the dataset
#' @return list of the gene names in the empty droplets for each sample with background.
#' @export
BatchProblemPlot <- function(G, S)
{
  # set diagonal to zero
  diag(G) <- 0
  m = Matrix::Matrix(0, nrow = length(S), ncol = length(unique(S)), sparse = T)
  m[cbind(1:nrow(m), as.numeric(factor(S)))] <- 1
  # compute neighbor graph
  res = G %*% m
  res = res / Matrix::rowSums(res)
  N_unique = length(unique(S))
  # compute normalized Shannon entropy
  entropy = apply(res, 1, function(freqs) {-sum(freqs[freqs != 0] * log(freqs[freqs != 0])) / log(2) / log2(N_unique)})
  # t-test
  df = data.frame(Sample = S, entropy = entropy)
  logik = sapply(unique(S), function(x) { stats::t.test(df$entropy[df$Sample == x], df$entropy[df$Sample != x], alternative = 'less')$p.value}) < 0.01
  if (any(logik)) {
    batch_problems = paste("Batch effects:", paste(as.character(unique(df$Sample)[logik]), collapse = ", "))
  } else {
    batch_problems = 'No batch effects present in these data'
  }
  # plot results
  q = ggplot2::ggplot(df, ggplot2::aes(x=Sample, y=entropy, fill = Sample)) + ggplot2::geom_boxplot() + ggplot2::xlab('Samples') +  ggplot2::ylab('Normalized Shannon entropy') + ggplot2::theme(axis.text.x=ggplot2::element_text(angle=45, hjust=1)) + ggplot2::ggtitle(batch_problems)
  return(q)
}

#' Cells counts plot
#'
#' @param E A list of expression matrices
#' @return labels ordered by number of cells
#' @export
GenerateLbls <- function(E)
{
  # generate sample vector
  X = unlist(mapply(function(x, y) {
    rep(x, y)
  }, x = names(E), y = sapply(E, ncol)))
  NumberOfCells = unlist(sapply(E, ncol))
  names(E)[order(NumberOfCells, decreasing = F)]
}

#' Cells counts plot
#'
#' @param E A list of expression matrices
#' @param type R
#' @param lbls factor with levels of the desired order, see? GenerateLbls
#' @return barplot of the number of cells or percentage of cells
#' @export
CellBarPlot <- function(E, type = c("Count", "Percentage")[1], lbls)
{
  # generate sample vector
  X = unlist(mapply(function(x, y) {
    rep(x, y)
  }, x = names(E), y = sapply(E, ncol)))

  AvgDepth = unlist(sapply(E, function(x) mean(Matrix::colSums(x))))

  # first get data table
  d = data.frame(table(X))

  # compute fraction for each disease state
  d$Frac= d$Freq/sum(d$Freq)
  colnames(d) <- c("Sample", "Count", "Frac")

  d$Sample = factor(d$Sample, levels = lbls)

  if (type == "Percentage"){
    p = ggplot2::ggplot(d, ggplot2::aes(x=Sample, y=Frac*100, fill=Sample)) +
      ggplot2::geom_bar(stat = "identity", color = "black",
                        position = ggplot2::position_dodge()) + ggplot2::coord_flip() +
      ggplot2::xlab("Samples") + ggplot2::ylab("Percentage (%)") + ggplot2::ggtitle("Abundance") + ggplot2::theme(legend.position = "none")
  }
  if (type == "Count"){
    p = ggplot2::ggplot(d, ggplot2::aes(x=Sample, y=Count, fill=Sample)) +
      ggplot2::geom_bar(stat = "identity", color = "black",
                        position = ggplot2::position_dodge()) + ggplot2::coord_flip() +
      ggplot2::ylab("Number of cells") + ggplot2::xlab("Samples") + ggplot2::ggtitle("Abundance plot") + ggplot2::theme(legend.position = "none")
  }
  return(p)
}

#' Get Mito Fraction
#'
#' @param E Expression matrix or a list of expression matrices
#' @param choose can be "all" or "Mito," where all includes ribosomal "RP" genes. Default is all.
#' @return Mitochondrial gene percentage per cell
#' @export
GetMito <- function(E, choose = "all")
{
  if (! class(E) %in% "list")
    E = list(E)
  n = rownames(E[[1]])
  if (choose == "all")
    logik = grepl("^MT-", n) | grepl("^MT.", n) | grepl("^RPL", n) | grepl("^RPS", n) | grepl("^RP[0-9]", n)
  if (choose == "Mito")
    logik = grepl("^MT-", n) | grepl("^MT.", n)
  MitoFrac = lapply(E, function(x){
    (Matrix::colSums(x[logik,]) / Matrix::colSums(x)) * 100
  })
  return(MitoFrac)
}

#' Filter by Mito Fraction
#'
#' @param E Expression matrix or a list of expression matrices.
#' @param cutoff Threshold, only cells with less than cutoff will be kept. Default is 25.
#' @param choose Can be "all" or "Mito". "all" filters based on RP and MT genes, mito just MT genes.
#' @return Filtered expression matrix
#' @export
FilterMito <- function(E, cutoff = 25, choose)
{
  cat(" ..........  Entry in FilterMito: \n");
  ta = proc.time()[3];
  cat("             Total number of barcodes before filtering:", sum(sapply(E, ncol)), "\n");
  fracs = GetMito(E, choose = choose)
  E = mapply(function(x , y){
    x[, y < cutoff]
  }, x = E, y = fracs, SIMPLIFY = F)
  cat("             Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
  tb = proc.time()[3] - ta;
  cat(" ..........  Exit FilterMito \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
  return(E)
}

#' Get suggested filter values
#'
#' @param E Expression matrix or a list of expression matrices, see ?Read_h5
#' @param verbose If false, hides output dialog
#' @param min_counts Minimum counts for outlier detection, default is 100.
#' @param max_counts Maximum counts for outlier detection, default is 1000.
#' @importFrom aplpack compute.bagplot
#' @return List where each element is a data frame with min_umi and min_genes_detected filters
#' @export
GetCutoffs <- function(E, verbose = T, min_counts = 100, max_counts = 1000)
{
  if (! class(E) %in% "list")
    E = list(E)

  if (verbose)
  {
    cat(" ..........  Entry in GetCutoffs: \n");
    ta = proc.time()[3];
    cat("             Number of count matrices:", length(E), "\n");
    cat("             Number of genes:", nrow(E[[1]]), "\n");
    cat("             Total number of barcodes:", sum(sapply(E, ncol)), "\n");
  }
  E = lapply(E, function(x) {x[,Matrix::colSums(x) > min_counts & Matrix::colSums(x) < max_counts]})
  if (verbose)
    cat("             Total number of barcodes after filtering:", sum(sapply(E, ncol)), "\n");
  L = lapply(E, function(x){
    if (is.null(colnames(x)))
      colnames(x) <- 1:ncol(x)
    logik = Matrix::colSums(x) > 0;
    x = x[,logik]
    total_counts = Matrix::colSums(x)
    genes_detected =  Matrix::colSums(x > 0)
    p = suppressWarnings(suppressMessages(aplpack::compute.bagplot(cbind(log10(total_counts), genes_detected))))
    idx = genes_detected < p$center[2] & log10(total_counts) > p$center[1] & colnames(x) %in% rownames(p$pxy.outlier)
    if (sum(idx) > 1)
      return(data.frame(min_umi = min(total_counts[idx]), min_genes = max(genes_detected[idx])))
    if (sum(idx) == 1)
      return(data.frame(min_umi = total_counts[idx], min_genes = genes_detected[idx]))
    if (sum(idx) == 0)
      return(data.frame(min_umi = NA, min_genes = NA))
  })
  if (verbose)
  {
    cat("             Total number of samples with background detected with bivariate statistics:", sum(sapply(L, function(x) !is.na(x$min_umi))), "\n");
    tb = proc.time()[3] - ta;
    cat(" ..........  Exit GetCutoffs \n");
    cat("             Execution time = ", tb, " s.\n", sep = "");
  }
  return(L)
}

#' Split large data into smaller chunks
#'
#' @param E Count matrix (see ?Read_h5)
#' @param number_of_chunks Number of chunks to divide the data
#' @return List where each element is a chunk of the original matrix
#' @export
array_split <- function(E, number_of_chunks) {
  if (number_of_chunks > (ncol(E))/2)
    number_of_chunks = (ncol(E))/2
  colIdx <- seq_len(ncol(E))
  lapply(split(colIdx, cut(colIdx, pretty(colIdx, number_of_chunks))), function(x) E[, x ])
}

#' Plot genes detected vs. nUMI
#'
#' @param E Count matrix (see ?Read_h5)
#' @param Cutoffs A dataframe as returned by ?GetCutoffs
#' @param Barcodes A list as returned by ?FilterBackgroundMatrix or ?GetBackgroundMatrix
#' @param show.min_genes If TRUE, plot a line with the minimum gene cutoff
#' @param show.min_umi If TRUE, plot a line with the minimum nUMI cutoff
#' @param print.pdf If TRUE, returns "DepthPlots.pdf" in the working directory. Default is TRUE.
#' @param fn Filename of the .pdf file. Default is "DepthPlots.pdf".
#' @return PDF where each page is a plot of the genes detected vs. nUMI, or a list of ggplot objects.
#' @seealso DepthPlotComparison, GetCutoffs
#' @export
DepthPlot <- function(E, Cutoffs = NULL, Barcodes = NULL, show.min_genes = T, show.min_umi = T, print.pdf = T, fn = "DepthPlots.pdf")
{
  pointfill <- "#4271AE"
  if (class(E) != "list")
    E = list(E)

  if (is.null(names(E)))
    names(E) <- paste("Sample", seq_along(1:length(E)))

  q = mapply(function(z, nms){
    df = data.frame(total_counts = Matrix::colSums(E[[z]]), genes_detected = Matrix::colSums(E[[z]] != 0));

    if (!is.null(Cutoffs)){
      min_genes = Cutoffs[[z]]$min_genes
      min_umi   = Cutoffs[[z]]$min_umi
    } else {
      min_genes = NA
      min_umi = NA
    }

    if (!is.null(Barcodes))
    {
      df$cells = "Cells"
      df$cells[rownames(df) %in% colnames(Barcodes[[which(names(Barcodes) == nms)]])] = "Soup"
    }

    if (is.na(min_genes) | is.na(min_umi) & is.null(Barcodes)){
      ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point() + ggplot2::xlab("Transcripts per barcode") +  ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::ggtitle(nms) + ggplot2::scale_color_manual(values=c(pointfill, "#999999"))
    } else if (is.na(min_genes) | is.na(min_umi)) {
      ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point(alpha = 0.3, color = pointfill) + ggplot2::xlab("Transcripts per barcode") +  ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::annotate("text", x = stats::median(df$total_counts), y = max(df$genes_detected), label = "No background detected") + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
    } else if (show.min_genes & show.min_umi ) {
      df$cells = "Cells"
      df$cells[(df$total_counts > min_umi) & (df$genes_detected < min_genes)] = "Soup"
      ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected", color = "cells")) + ggplot2::geom_point() + ggplot2::xlab("Transcripts per barcode") +  ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_hline(yintercept = min_genes, color = "red", linetype = "dashed") + ggplot2::geom_vline(xintercept = min_umi, color = "blue", linetype = "dashed") + ggplot2::ggtitle(nms) + ggplot2::scale_color_manual(values=c(pointfill, "#999999"))
    } else if (show.min_genes & show.min_umi & !is.null(Barcodes)) {
      ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected", color = "cells")) + ggplot2::geom_point() + ggplot2::xlab("Transcripts per barcode") +  ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_hline(yintercept = min_genes, color = "red", linetype = "dashed") + ggplot2::geom_vline(xintercept = min_umi, color = "blue", linetype = "dashed") + ggplot2::ggtitle(nms) + ggplot2::scale_color_manual(values=c(pointfill, "#999999"))
    }
    else if (show.min_genes) {
      ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point(alpha = 0.3, color = pointfill) + ggplot2::xlab("Transcripts per barcode") +  ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_hline(yintercept = min_genes, color = "red", linetype = "dashed") + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
    } else if (show.min_umi) {
      ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected")) + ggplot2::geom_point(alpha = 0.3) + ggplot2::xlab("Transcripts per barcode") +  ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x))) + ggplot2::geom_vline(xintercept = min_umi, color = "blue", linetype = "dashed") + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
    }
  }, z = 1:length(E), nms = names(E), SIMPLIFY = F)
  names(q) <- names(E)
  if (print.pdf)
  {
    grDevices::pdf(fn)
    invisible(suppressWarnings(lapply(q, print)))
    grDevices::dev.off()
    return("Done!")
  } else {
    return(q)
  }
}

#' Compare results before and after
#'
#' @param E Expression matrix or a list of expression matrices
#' @param Z Background corrected expression matrices
#' @param print.pdf If TRUE, returns "DepthPlots.pdf" in the working directory. Default is TRUE.
#' @return Plot of genes detected vs. nUMI
#' @export
DepthPlotComparison <- function(E, Z, print.pdf = T)
{

  if (class(E) != "list")
    E = list(E)

  if (class(Z) != "list")
    Z = list(Z)

  q = mapply(function(X, Y){
    df = data.frame(state = c(rep("before",ncol(X)), rep("after", ncol(Y))),
                    total_counts = c(Matrix::colSums(X), Matrix::colSums(Y)),
                    genes_detected = c(Matrix::colSums(X != 0), Matrix::colSums(Y != 0)))
    ggplot2::ggplot(df, ggplot2::aes_string(x = "total_counts", y = "genes_detected", color = "state")) + ggplot2::geom_point(alpha = 0.2) + ggplot2::xlab("Transcripts per barcode") +  ggplot2::ylab("Genes detected") + ggplot2::scale_x_log10(breaks = scales::trans_breaks("log10", function(x) 10^x),labels = scales::trans_format("log10", scales::math_format(10^.x)))
  }, X = E, Y = Z, SIMPLIFY = F)

  if (print.pdf)
  {
    grDevices::pdf("DepthPlotComparison.pdf")
    invisible(suppressWarnings(lapply(q, print)))
    grDevices::dev.off()
    return("Done!")
  } else {
    return(q)
  }
}

#' Plot a boxplot for the fraction of mitochondrial rna
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "AvgDepth" or "Mito", default is AvgDepth.
#' @param choose See ?GetMito or ?FilterMito
#' @return ggplot objects of genes detected ordered by average depth
#' @seealso GeneBoxPlot; CoverageBoxPlot
#' @export
MitoBoxPlot <- function(E, order.by = c("average depth", "mito percentage")[1], choose = "all")
{
  Samples = unlist(mapply(function(x, y) {
    rep(x, y)}, x = names(E), y = sapply(E, ncol)))

  M = GetMito(E = E, choose = choose)

  if (order.by == "average depth")
    sortVar = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))

  if (order.by == "genes detected")
    sortVar = unlist(sapply(E, function(x) Matrix::colSums(x != 0)))

  if (order.by == "mito percentage")
    sortVar = unlist(sapply(M, function(x) mean(x)))

  Samples = factor(Samples, levels = names(sortVar)[order(sortVar, decreasing = T)])

  # wrangle data frame
  df = data.frame(Samples = Samples, Mito = unlist(M))

  # make ggplot
  ggplot2::ggplot(df, ggplot2::aes(x = Samples, y=Mito, fill = Samples)) +  ggplot2::geom_boxplot() + ggplot2::coord_flip() +  ggplot2::ylab("Percentage") + ggplot2::ggtitle(paste("Mitochondrial percentage ordered by", order.by)) + ggplot2::xlab("Samples") + ggplot2::theme(legend.position = "none")
}

#' Plot a boxplot for genes detected
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "AvgDepth" or "Mito", default is AvgDepth.
#' @return ggplot objects of genes detected ordered by average depth
#' @seealso GeneBoxPlot; CoverageBoxPlot
#' @export
CoverageBoxPlot <- function(E, order.by = c("average depth", "mito percentage")[1])
{
  Samples = unlist(mapply(function(x, y) {
    rep(x, y)}, x = names(E), y = sapply(E, ncol)))

  Coverage = unlist(sapply(E, function(x) Matrix::colSums(x != 0)))

  if (order.by == "average depth")
    AvgDepth = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))

  if (order.by == "mito percentage")
  {
    M = GetMito(E = E, choose = "all")
    AvgDepth = unlist(sapply(M, function(x) mean(x)))
  }

  Samples = factor(Samples, levels = names(AvgDepth)[order(AvgDepth, decreasing = T)])

  # wrangle data frame
  df = data.frame(Samples = Samples, Coverage = Coverage)

  # make ggplot
  ggplot2::ggplot(df, ggplot2::aes(x = Samples, y=Coverage, fill = Samples)) +  ggplot2::geom_boxplot() + ggplot2::coord_flip() +  ggplot2::ylab("Genes detected") + ggplot2::ggtitle(paste("Genes detected ordered by", order.by)) + ggplot2::xlab("Samples") + ggplot2::theme(legend.position = "none")
}

#' Plot a boxplot for sequencing depth
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "average depth" or "mito percentage", default is AvgDepth.
#' @return ggplot objects ordered by average depth
#' @seealso GeneBoxPlot; CoverageBoxPlot
#' @export
DepthBoxPlot <- function(E, order.by = c("average depth", "mito percentage")[1])
{
  Samples = unlist(mapply(function(x, y) {
    rep(x, y)}, x = names(E), y = sapply(E, ncol)))

  Depth = unlist(sapply(E, function(x) Matrix::colSums(x)))

  if (order.by == "average depth")
    AvgDepth = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))

  if (order.by == "mito percentage")
  {
    M = GetMito(E = E, choose = "all")
    AvgDepth = unlist(sapply(M, function(x) mean(x)))
  }

  Samples = factor(Samples, levels = names(AvgDepth)[order(AvgDepth, decreasing = T)])

  # wrangle data frame
  df = data.frame(Samples = Samples, Depth = log2(Depth))

  # make ggplot
  ggplot2::ggplot(df, ggplot2::aes(x = Samples, y=Depth, fill = Samples)) +  ggplot2::geom_boxplot() + ggplot2::coord_flip() +  ggplot2::ylab("Library size (log2)") + ggplot2::ggtitle(paste("Sequencing depth ordered by", order.by)) + ggplot2::xlab("Samples") + ggplot2::theme(legend.position = "none")
}

#' Plot a boxplot for a single gene
#'
#' @param E Count matrix (see ?Read_h5)
#' @param order.by Order by a quantity associated with each sample. Options are "average depth" or "mito percentage", default is AvgDepth.
#' @param gene Gene to boxplot
#' @return ggplot objects
#' @seealso DepthPlot; DepthPlotComparison; QCPlots
#' @export
GeneBoxPlot <- function(E, gene, order.by = c("average depth", "mito percentage")[1])
{
  logik = grepl(gene, rownames(E[[1]]))

  expr = log2(1 + unlist(sapply(E, function(x) x[logik,])))

  # generate sample vector
  X = unlist(mapply(function(x, y) {
    rep(x, y)
  }, x = names(E), y = sapply(E, ncol)))

  if (order.by == "average depth")
    AvgDepth = unlist(sapply(E, function(x) mean(log2(Matrix::colSums(x)))))

  if (order.by == "mito percentage")
  {
    M = GetMito(E = E, choose = "all")
    AvgDepth = unlist(sapply(M, function(x) mean(x)))
  }

  X = factor(X, levels = names(AvgDepth)[order(AvgDepth, decreasing = T)])

  df = data.frame(Sample = X, Expression = expr)

  ggplot2::ggplot(df, ggplot2::aes(x = Sample, y=Expression, fill = Sample)) +
    ggplot2::geom_boxplot() +
    ggplot2::coord_flip() +
    ggplot2::theme(legend.position = "none") +
    ggplot2::ylab("Expression UMI (log2)") + ggplot2::ggtitle(paste(sub(pattern = "_._", replacement = "", x = gene), "order by", order.by))

}

#' Plot total genes and total counts histograms
#'
#' @param E Count matrix (see ?Read_h5)
#' @param print.pdf If TRUE, returns pdf files in the working directory. Default is TRUE; if FALSE, returns list of ggplot2 objects.
#' @param QC Type of QC plot,  "Genes Detected", "Total Counts" or "Mito". Default is Genes Detected.
#' @param fn Filename of the .pdf file. Default is the input parameter QC saved as a .pdf file.
#' @param choose See ?FilterMito or ?GetMito
#' @param min_counts Keep cells with at least min_counts, default is 100.
#' @param min_genes Keep cells with at least min_genes, default is 100.
#' @param order.by Order by a quantity associated with each sample. Options are "average depth" or "mito percentage", default is AvgDepth.
#' @return PDF, histogram of total counts, or a list of ggplot objects.
#' @seealso DepthPlot; DepthPlotComparison
#' @export
QCPlots <- function(E, print.pdf = T, QC = c("Genes detected", "Total counts", "Mito")[1], order.by = c("average depth", "mito percentage")[1], fn = "default", choose = c("Mito", "all")[1], min_counts = 100, min_genes = 100)
{
  E = lapply(E, function(x) { x[,(Matrix::colSums(x) > min_counts) & (Matrix::colSums(x != 0) > min_genes)] })

  if (fn == "default")
    fn = paste0(QC, ".pdf")

  if (class(E) != "list")
    E = list(E)

  if (is.null(names(E)))
    names(E) <- paste("Sample", seq_along(1:length(E)))

  if (QC == "Mito")
    E = GetMito(E, choose = choose)

  barfill <- "#4271AE"
  barlines <- "#1F3552"

  q = mapply(function(z, nms){
    if (QC != "Mito")
    {
      df = data.frame(total_counts = Matrix::colSums(z), genes_detected = Matrix::colSums(z != 0));
      if (QC == "Total counts")
      {
        ggplot2::ggplot(df, ggplot2::aes(x=total_counts)) + ggplot2::geom_histogram(binwidth = 50,colour = barlines, fill = barfill) + ggplot2::xlab('Total counts (nUMI)') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
      } else {
        ggplot2::ggplot(df, ggplot2::aes(x=genes_detected)) + ggplot2::geom_histogram(binwidth = 50,colour = barlines, fill = barfill) + ggplot2::xlab('Genes detected') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
      }
    } else {
      df = data.frame(mito = z)
      if (choose == "Mito"){
        ggplot2::ggplot(df, ggplot2::aes(x=mito)) + ggplot2::geom_histogram(binwidth = 5, colour = barlines, fill = barfill) + ggplot2::xlab('Mitochondrial percentage') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
      } else {
        ggplot2::ggplot(df, ggplot2::aes(x=mito)) + ggplot2::geom_histogram(binwidth = 5, colour = barlines, fill = barfill) + ggplot2::xlab('Mitochondrial and ribosomal percentage') + ggplot2::ylab('Counts') + ggplot2::theme_bw() + ggplot2::ggtitle(nms)
      }
    }
  }, z = E, nms = names(E), SIMPLIFY = F)
  names(q) <- names(E)
  if (print.pdf)
  {
    grDevices::pdf(fn)
    invisible(suppressWarnings(lapply(q, print)))
    grDevices::dev.off()
    return("Done!")
  } else {
    return(q)
  }
}

#' Write results to a folder
#'
#' @param D A list of count matrices
#' @param data.dir directory (will be created if it does not exist) where results are saved
#' @param genome default is GRCh38.
#' @importFrom methods as
#' @importClassesFrom Matrix dgCMatrix
#' @return matrix.h5 file, where each is background corrected
#' @export
#' @importFrom rhdf5 h5createFile h5createGroup h5write h5write.default
SaveCountsToH5 <- function(D, data.dir, genome = "GRCh38")
{

  data.dir = gsub("\\/$", "", data.dir, perl = TRUE);

  if (!dir.exists(data.dir))
    dir.create(data.dir)

  if (!"list" %in% class(D))
    D = list(D)

  cat(" ..........  Entry in SaveCountsToH5: \n");
  ta = proc.time()[3];
  cat("             Number of cells:", sum(sapply(D, ncol)), "\n");
  cat("             Number of genes:", nrow(D[[1]]), "\n");
  cat("             Number of samples:", length(D), "\n");

  if (is.null(names(D)))
    names(D) <- paste0("Sample", seq_along(1:length(D)))

  flag = rownames(D[[1]])[1] != gsub( "_.*$", "", rownames(D[[1]])[1])

  if (flag)
  {
    genes = strsplit(x = rownames(D[[1]]), split = "_._")
    gene_names = sapply(genes, `[[`, 1)
    genes = sapply(genes, `[[`, 2)
  }

  D = lapply(D, function(x)
  {
    x@Dimnames[[1]] = rownames(D[[1]])
    x
  })

  data.dirs = paste(data.dir, names(D), sep = "/")

  q = lapply(data.dirs, function(x){
    if (!dir.exists(x))
      dir.create(x)})

  d = suppressWarnings(
    mapply(function(x,y){
      fn = "count_matrix.h5"
      fn = paste(y, fn, sep = "/")
      rhdf5::h5createFile(fn)
      rhdf5::h5createGroup(fn, genome)
      rhdf5::h5write.default(x@Dimnames[[2]], file = fn, name = paste0(genome, "/barcodes"))
      if (flag)
      {
        rhdf5::h5write.default(genes, file = fn, name=paste0(genome, "/genes"))
        rhdf5::h5write.default(gene_names, file = fn, name=paste0(genome, "/gene_names"))
      } else {
        rhdf5::h5write.default(x@Dimnames[[1]], file = fn, name=paste0(genome, "/genes"))
        rhdf5::h5write.default(x@Dimnames[[1]], file = fn, name=paste0(genome, "/gene_names"))
      }
      rhdf5::h5write.default(x@x, file = fn, name=paste0(genome, "/data"))
      rhdf5::h5write.default(dim(x), file = fn, name=paste0(genome, "/shape"))
      rhdf5::h5write.default(x@i, file = fn, name=paste0(genome, "/indices")) # already zero-indexed.
      rhdf5::h5write.default(x@p, file = fn, name=paste0(genome, "/indptr"))
    }, x = D, y = data.dirs)
  )
  rhdf5::h5closeAll()
  tb = proc.time()[3] - ta;
  cat(" ..........  Exit SaveCountsToH5 \n");
  cat("             Execution time = ", tb, " s.\n", sep = "");
}

#' Convert a list of count matrices to a Seurat object
#'
#' @param E a list of count matrices, named by sample names
#' @param filter.mito if true, populates the percent of mitochondrial RNA and removes those genes from the expression matrix. Default is TRUE.
#' @return Seurat object
#' @export
GenerateSeuratObject <- function(E, filter.mito = T)
{
  # merge data into single expression matrix
  if (class(E) != "list"){
    E = list(E)
    names(E) <- paste("Sample", 1:length(E))
  }

  # create a vector of sample identities
  nms = unlist(mapply(function(x, y) {
    rep(x, y)
  }, x = names(E), y = sapply(E, ncol)))
  E = do.call(cbind, E)

  # create seurat object
  pbmc <- Seurat::CreateSeuratObject(counts = E)
  pbmc@meta.data$Samples    = nms

  # get mito percentage
  if (filter.mito) {
    mito.pct = GetMito(E, choose = "Mito")[[1]]
    logik = grepl("^MT-", rownames(E)) | grepl("^MT.", rownames(E)) | grepl("^RPL", rownames(E)) | grepl("^RPS", rownames(E)) | grepl("^RP[0-9]", rownames(E))
    E = E[!logik,]
    E = E[Matrix::rowSums(E) != 0, ]
    rownames(E) <- gsub(pattern = "_._ENSG", replacement = "$ENSG", x = rownames(E))
    pbmc@meta.data$percent.mt = mito.pct
  }

  return(pbmc)
}

#' Save network h5 files
#'
#' @param A an adjacency matrix with row names and columns names genes (gene by gene)
#' @return a saved network file
#' @export
SaveNetworkH5 <- function(A)
{
  # convert to sparse matrix
  if (!"dgCMatrix" %in% class(A))
    A = Matrix::Matrix(A, sparse = T)

  # save HDF5 file
  cat('Saving hdf5 file for fast network loading...')
  file.h5 <- H5File$new("network.hdf5", mode="w")
  file.h5$create_group("edges")
  file.h5$create_group("nodes")
  lapply(rownames(A), function(x){
    edges_list = A[rownames(A) == x,]
    nodes_list = which(edges_list != 0)
    edges_list = edges_list[nodes_list]
    file.h5[[paste0("edges/", x)]] <- edges_list
    file.h5[[paste0("nodes/", x)]] <- nodes_list
    cars_ds <- file.h5[[paste0("edges/", x)]]
    cars_ds <- file.h5[[paste0("nodes/", x)]]
  })
  file.h5$create_group("ngene")
  file.h5[["ngene/ngenes"]] <- nrow(A)
  cars_ds <- file.h5[["ngene/ngenes"]]
  file.h5$close_all()
}

#' Load gene from network h5 file
#'
#' @param gene A gene to query from the network
#' @param filename file name of network. Default is "network.hdf5"
#' @return a saved network file
#' @export
GetGeneFromNetwork <- function(gene, filename = "network.hdf5")
{
  if (!requireNamespace("hdf5r", quietly = TRUE))
    stop("Please install hdf5r to read HDF5 files")
  if (!file.exists(filename))
      stop("File not found")
  infile = hdf5r::H5File$new(filename)
  E = Matrix::Matrix(0, ncol = 1, nrow = infile[["ngene/ngenes"]][], sparse = T)
  nodes <- infile[[paste("/nodes", gene, sep = "/")]][]
  E[nodes] = infile[[paste("/edges", gene, sep = "/")]][]
  return(E)
}

#' Jackknife Differential Expression Analysis
#'
#' @param E A gene by cell expression matrix
#' @param Samples A character vector of samples
#' @param Identities A character vector of cell type or cluster labels
#' @param Disease A character vector of disease labels
#' @param do.par if TRUE, code is run in parallel on all available cores minus one
#' @param num.cores optionally, user can hard set the number of cores to use
#' @return a DEG table Jackknifed
#' @export
JackD <- function(E, Samples, Disease, Identities, do.par = F, num.cores = 1)
{
  cat(" ..........  Entry in JackD: \n");
  ta = proc.time()[3];
  cat("             Number of cells:", ncol(E), "\n");
  cat("             Number of genes:", nrow(E), "\n");
  u = as.character(unique(Samples))
  cat("             Number of samples:", length(u), "\n");

  if (do.par)
    num.cores = parallel::detectCores() - 1

  colnames(E) <- 1:ncol(E)

  outs = parallel::mclapply(u, function(x){
    # dropout one sample
    logik = Samples != x;
    E_dummy          = E[,logik]
    Disease_dummy    = Disease[logik]
    Identities_dummy = Identities[logik]
    # create Seurat object for each cell type; run differential expression
    y = as.character(unique(Identities_dummy))
    qq = lapply(y, function(z){
      logik = Identities_dummy == z;
      E.          = E_dummy[,logik]
      Disease.    = Disease_dummy[logik]
      Identities. = Identities_dummy[logik]
      ctrl <- Seurat::CreateSeuratObject(counts = E.)
      ctrl <- Seurat::NormalizeData(object = ctrl)
      ctrl <- Seurat::AddMetaData(ctrl, metadata=Disease., col.name = "Disease")
      ctrl <- Seurat::SetIdent(ctrl, value='Disease')
      Seurat::FindAllMarkers(ctrl, only.pos = T)
    })
    names(qq) <- y
    qq
    }, mc.cores =  num.cores)
  return(outs)
}
sanofi-pi/veni documentation built on Oct. 12, 2020, 10:17 p.m.