#' Top Variable Gene Identificaiton
#'
#' This function identifies top n variable genes using method described in KeyGenes algorithm. In brief, this algorithm use cross validation based on LASSO regression. For detailed explanation, please refer to the paper listed in the note.
#' @param input.dir The path to where the dataset is saved. Dataset should be saved as a tab-delimited table with row = gene, column = cell.
#' @param output.dir The path to where the output should be saved. The output will be saved as a tab-delimited table with NO row names or column names.
#' @param top.number.count The number of top variable genes to extract. Default to 500 genes
#' @keywords top variable gene extraction
#' @note Code reference from Roost et. al., KeyGenes, a Tool to Probe Tissue Differentiation Using a Human Fetal Transcriptional Atlas, Cell Stem Cell Reports, 2015
#' @export
#' @examples
#' top.genes("~/Desktop/sample_data.txt", "~/Desktop/sample_gene_list_output.txt", top.number.count = 1000)
top.genes <- function(input.dir, output.dir, top.number.count = 500) {
dataset <- input.dir
top.dir <- output.dir
DS <- read.table(dataset, sep="\t", header=TRUE, row.names=1, check.names=FALSE)
X <- voom(as.matrix(DS), normalize.method="none")$E
vars <- apply(X, 1, var)
top <- sort.list(-vars)[1:top.number.count]
X <- X[top,]
topl <- as.data.frame(rownames(X))
colnames(topl) <- c(paste("Top", top.number.count, sep = "", collapse = ""))
write.table(topl, top.dir, sep="\t", quote=F, row.names = FALSE, col.names = TRUE)
return()
}
#' Normalization of Single-Cell RNA-Seq Data
#'
#' This function normalizes single-cell RNA-seq data using its raw counts. The normalization is performed to remove variation due to difference in read depth and coverage.
#' @param dt.st The dataset to normalize
#' @keywords normalization
#' @export
#' @examples
#' normalize.dt(single.cell.mtx)
normalize.dt <- function(dt.st) {
# Calculate column sums
csums <- Matrix::colSums(dt.st)
# Calculate averages of the sums
cavg <- mean(csums)
# Calculate normalized bulk
norm.dt.st <- dt.st
for (i in 1:length(csums)) {norm.dt.st[,i] <- (norm.dt.st[,i]/csums[i]) * cavg}
return(norm.dt.st)
}
#' Scale Ratio Calculation
#'
#' This function calculate the scale ratio between the single-cell transcriptome and reference dataset to minimize the Lagrangian multiplier. In the other word, reduce the restriction.
#' @param bulk The reference dataset
#' @param sc The single-cell dataset
#' @keywords Scale ratio calculation
#' @export
#' @examples
#' calc.scale.ratio(reference.dt, single.cell.mtx)
calc.scale.ratio <- function(bulk, sc) {
# Calculate column sums
csums.bulk <- colSums(bulk)
csums.sc <- Matrix::colSums(sc)
# Calculate averages of the sums
cavg.bulk <- mean(csums.bulk)
cavg.sc <- mean(csums.sc)
# Calculate scaling ratio for each tissue
scale.ratio.sc <- cavg.bulk/cavg.sc
return(scale.ratio.sc)
}
#' Find intersection genes
#'
#' This function identifies the intersection genes between reference and single-cell data
#' @param bulk The reference dataset
#' @param sc The single-cell dataset
#' @export
#' @examples
#' gene.intersect.sub(reference.dt, single.cell.mtx)
gene.intersect.sub <- function(bulk, sc) {
# Find genes with all zero-expression within sc/bulk
rsums.bulk <- Matrix::rowSums(bulk)
rsums.sc <- Matrix::rowSums(sc)
# Identify blank genes
bulk.blank.genes <- which(rsums.bulk == 0)
sc.blank.genes <- which(rsums.sc == 0)
# Remove those genes
norm.bulk.sub <- bulk
norm.sc.sub <- sc
if (length(bulk.blank.genes) > 0) {norm.bulk.sub <- bulk[-(bulk.blank.genes),]}
if (length(sc.blank.genes) > 0) {norm.sc.sub <- sc[-(sc.blank.genes),]}
# Find intersection genes
genes.inter <- intersect(rownames(norm.bulk.sub), rownames(norm.sc.sub))
# Find the intersected sub dataset of bulk (normalized bulk)
norm.bulk.sub.sub <- norm.bulk.sub[genes.inter, ]
norm.sc.sub.sub <- norm.sc.sub[genes.inter, ]
return(list(norm.bulk.sub.sub, norm.sc.sub.sub))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.