#matrix_path = 'data-raw/external/scirep_sequential_qc.txt'
#classinfo_path = 'scirep_classes.txt'
#' @title Read counts matrix
#'
#' @param path string. path to the counts matrix file
#' @param ... other arguments passsed on to [readr::read_tsv()].
#'
#' @return integer matrix. counts.
#'
#' @details
#' In any case, first part (separated by `|`) of row names must be
#' Ensembl transcript id
#'
#' @export
# path = 'data-raw/external/scirep_sequential_qc.txt'
read_mat <- function(path, ...) {
path %>% readr::read_tsv(T, readr::cols(.default = 'c'), ...) %>%
dplyr::mutate_at(-1, readr::parse_integer) %>%
dplyr::rename('transcript' = 1) %>% as.data.frame() %>%
tibble::column_to_rownames('transcript') %>% as.matrix()
}
#' @title filter genes with low expression values
#'
#' @param mat integer matrix. counts.
#' @param min_count, min_sample_per_gene integer scalar. For each gene, it must
#' contain at least `min_count` reads in at least `min_sample_per_gene`
#' samples. Otherwise, it would be dropped.
#'
#' @return integer matrix. counts.
#'
#' @examples
#' filter_low(sim_mat) %>% head()
#'
#' @export
filter_low <- function(mat, min_count = 2, min_sample_per_gene = 5) {
low_per_row <- rowSums(mat > min_count)
keeped_row <- low_per_row > min_sample_per_gene
mat[keeped_row, ]
}
#' @title Plot highest expressed genes
#'
#' @details Each row represents a gene. In a row, each bar represents a sample,
#' the x axis tells the percentage of counts accounted for across the whole
#' dataset. (counts of that gene in that sample / total counts of all genes in
#' all samples)
#'
#' Genes are sorted by average expression.
#'
#' @param mat numeric matrix. counts.
#' @param top_n integer scalar. How many genes to show. If greater that total
#' gene number, show all genes.
#'
#' @examples
#' plot_highest_exprs(sim_mat)
#'
#' @export
plot_highest_exprs <- function(mat, top_n = 20) {
mat %>% as_SingleCellExperiment() %>%
{suppressMessages(scater::calculateQCMetrics(.))} %>%
scater::plotHighestExprs(n = top_n)
}
# plot_group --------------
#' @title workhorse of `plot_PCA/TSNE()`
#'
#' @details `plot_PCA(sce, shape, color) -> `plot_group_impl(sce, shape, color, scater::plotPCA)`
#'
#' @keywords internal
plot_group_impl <- function(sce, shape = NULL, color = NULL, plot_fun) {
suppressWarnings(
plot_fun(
sce,
shape_by = shape, colour_by = color,
run_args = list(exprs_values = 'counts')
)
)
}
#' @title plot PCA, TSNE
#'
#' @param sce SingleCellExperiment object.
#' @param shape, color string. Specify a column in `col_data` of
#' [as_SingleCellExperiment()] to shape/color by.
#'
#' @return ggplot object.
#'
#' @name plot_group
NULL
#' @rdname plot_group
#'
#' @examples
#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
#'
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
#'
#' @export
plot_PCA <- function(sce, shape = NULL, color = NULL) {
plot_group_impl(sce, shape, color, scater::plotPCA)
}
#' @rdname plot_group
#'
#' @examples
#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
#'
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
#'
#' @export
plot_TSNE <- function(sce, shape = NULL, color = NULL) {
plot_group_impl(sce, shape, color, scater::plotTSNE)
}
# plot_variance --------------
#' get y axis range of ggplot object.
get_y_range <- function(plot) {
ggplot2::ggplot_build(plot)$layout$panel_params[[1]]$y.range
}
#' @title generate equally spaced y coordinates, not hit bottom nor top.
#'
#' @details Image `plot`'s y axis extends from 0 to 3, `x` contains 3 values,
#' then we give `c(0.5, 1.5, 2.5)`.
#'
#' @param plot ggplot object.
#' @param x numberic vector
#'
#' @return numeric vector. The same length as `x`
#'
#' @keywords internal
seq_y <- function(plot, x) {
y_range <- get_y_range(plot)
by <- diff(y_range) / length(x)
seq(y_range[1] + by, y_range[2], by) - by/2
}
#' @title plot variance of counts across samples
#'
#' @param mat integer matrix. counts
#' @param refer_gene_id character. Ensembl transcript id, add a vertical line
#' for each gene to mark the corresponding CV (on x axis). Only genes in
#' counts matrix would be shown. Usually these genes should be the reference
#' genes you want to use for normalization.
#' @param refer_gene_name character. Transcript name
#'
#' @return [ggplot2::ggplot()] object.
#'
#' @name plot_variance
NULL
#' @details
#' `plot_cv_density()` produces density plot of coefficient of variation
#'
#' @examples
#' # only one gene exist in the matrix
#' plot_cv_density(sim_mat, suggest_refer$id)
#' plot_cv_density(sim_mat, suggest_refer$id, suggest_refer$name)
#'
#' # the name should be the same length as id
#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
#' # if only part of the genes have name, you can pass the id of other genes
#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
#'
#' @export
#'
#' @rdname plot_variance
# mat = sim_mat
# refer_gene_id = suggest_refer$id
# refer_gene_name = suggest_refer$name
plot_cv_density <- function(mat, refer_gene_id = '', refer_gene_name = refer_gene_id) {
cv <- mat %>% apply(1, cv_fun) %>%
{tibble::tibble(id = names(.), value = .)} %>%
dplyr::mutate(id = stringr::str_extract(id, '[^|]+'))
plot <- ggplot2::ggplot(cv, ggplot2::aes(value)) +
ggplot2::geom_density(color = 'blue') +
ggplot2::labs(x = 'coefficient of variation')
if (length(refer_gene_id) != length(refer_gene_name)) {
warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
refer_gene_name = refer_gene_id
}
cv_refer <- tibble::tibble(id = refer_gene_id, name = refer_gene_name) %>%
dplyr::inner_join(cv, by = 'id')
if (nrow(cv_refer) == 0L) {
warning("None refer gene found in the count matrix")
return(plot)
}
plot + ggplot2::geom_vline(xintercept = cv_refer$value, color = 'green') +
ggplot2::geom_point(
ggplot2::aes(x = value, y = seq_y(plot, value)),
data = cv_refer, size = 2, shape = 1
) +
ggrepel::geom_label_repel(
ggplot2::aes(x = value, y = seq_y(plot, value), label = name),
data = cv_refer, hjust = 0.5
)
}
#' @details
#' `plot_refer_violin()` produces violin plot of given reference genes's counts (with CV annotated)
#'
#' @examples
#' # only one gene exist in the matrix
#' plot_refer_violin(sim_mat, suggest_refer$id)
#' plot_refer_violin(sim_mat, suggest_refer$id, suggest_refer$name)
#'
#' # the name should be the same length as id
#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
#' # if only part of the genes have name, you can pass the id of other genes
#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
#'
#' @export
#'
#' @rdname plot_variance
# mat = sim_mat
# refer_gene_id = rownames(mat)[1:6]
# refer_gene_name = paste0('gene_', letters[1:6])
plot_refer_violin <- function(mat, refer_gene_id, refer_gene_name = refer_gene_id) {
if (length(refer_gene_id) != length(refer_gene_name)) {
warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
refer_gene_name = refer_gene_id
}
refer_gene <- tibble::tibble(id = refer_gene_id, name = refer_gene_name)
refer_count <- mat %>% tibble::as_tibble(rownames = 'id') %>%
dplyr::mutate(id = stringr::str_extract(id, '[^|]+')) %>%
dplyr::inner_join(refer_gene, ., by = 'id') %>% dplyr::select(-id)
if (nrow(refer_count) == 0L) {
warning('None refer gene found in the count matrix')
return(ggplot2::ggplot())
}
refer_count_long <- refer_count %>% tidyr::gather('sample', 'count', -1) %>%
dplyr::mutate_at('name', as.factor)
g_violin <- refer_count_long %>%
ggplot2::ggplot(ggplot2::aes(name, log2(count + 0.001))) +
ggplot2::geom_violin() +
ggplot2::labs(x = 'reference transcripts', y = quote(log[2](count)))
# max y coordinate of each violin
y_max <- ggplot2::ggplot_build(g_violin)$data[[1]] %>% tibble::as_tibble() %>%
dplyr::group_by(x) %>% dplyr::arrange(dplyr::desc(y)) %>% dplyr::slice(1) %>%
dplyr::ungroup() %>% dplyr::arrange(x) %>% dplyr::select(x, y)
cv_df <- refer_count_long %>%
dplyr::group_by(name) %>% dplyr::summarise(cv = cv_fun(count)) %>%
dplyr::arrange(name) %>% dplyr::mutate(x = seq_along(name)) %>%
dplyr::inner_join(y_max, by = 'x') %>%
dplyr::mutate(y = y + diff(get_y_range(g_violin)) / 20) %>%
dplyr::mutate(cv = formatC(cv, digits = 3, format = 'f'))
g_violin + ggplot2::geom_text(ggplot2::aes(x, y, label = cv), cv_df, color = 'blue')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.