R/matrix-process.R

Defines functions read_mat filter_low plot_highest_exprs

Documented in filter_low plot_highest_exprs read_mat

#matrix_path = 'data-raw/external/scirep_sequential_qc.txt'
#classinfo_path = 'scirep_classes.txt'

#' @title Read counts matrix
#'
#' @param path string. path to the counts matrix file
#' @param ... other arguments passsed on to [readr::read_tsv()].
#'
#' @return integer matrix. counts.
#'
#' @details
#' In any case, first part (separated by `|`) of row names must be
#' Ensembl transcript id
#'
#' @export

# path = 'data-raw/external/scirep_sequential_qc.txt'
read_mat <- function(path, ...) {
	path %>% readr::read_tsv(T, readr::cols(.default = 'c'), ...) %>%
		dplyr::mutate_at(-1, readr::parse_integer) %>%
		dplyr::rename('transcript' = 1) %>% as.data.frame() %>%
		tibble::column_to_rownames('transcript') %>% as.matrix()
}



#' @title filter genes with low expression values
#'
#' @param mat integer matrix. counts.
#' @param min_count, min_sample_per_gene integer scalar. For each gene, it must
#'   contain at least `min_count` reads in at least `min_sample_per_gene`
#'   samples. Otherwise, it would be dropped.
#'
#' @return integer matrix. counts.
#'
#' @examples
#' filter_low(sim_mat) %>% head()
#'
#' @export
filter_low <- function(mat, min_count = 2, min_sample_per_gene = 5) {
	low_per_row <- rowSums(mat > min_count)
	keeped_row <- low_per_row > min_sample_per_gene
	mat[keeped_row, ]
}


#' @title Plot highest expressed genes
#'
#' @details Each row represents a gene. In a row, each bar represents a sample,
#' the x axis tells the percentage of counts accounted for across the whole
#' dataset. (counts of that gene in that sample / total counts of all genes in
#' all samples)
#'
#' Genes are sorted by average expression.
#'
#' @param mat numeric matrix. counts.
#' @param top_n integer scalar. How many genes to show. If greater that total
#'   gene number, show all genes.
#'
#' @examples
#' plot_highest_exprs(sim_mat)
#'
#' @export
plot_highest_exprs <- function(mat, top_n = 20) {
	mat %>% as_SingleCellExperiment() %>%
		{suppressMessages(scater::calculateQCMetrics(.))} %>%
		scater::plotHighestExprs(n = top_n)
}


# plot_group --------------

#' @title workhorse of `plot_PCA/TSNE()`
#'
#' @details `plot_PCA(sce, shape, color) -> `plot_group_impl(sce, shape, color, scater::plotPCA)`
#'
#' @keywords internal
plot_group_impl <- function(sce, shape = NULL, color = NULL, plot_fun) {
 	suppressWarnings(
        plot_fun(
            sce,
            shape_by = shape, colour_by = color,
            run_args = list(exprs_values = 'counts')
        )
    )
}

#' @title plot PCA, TSNE
#'
#' @param sce SingleCellExperiment object.
#' @param shape, color string. Specify a column in `col_data` of
#'   [as_SingleCellExperiment()] to shape/color by.
#'
#' @return ggplot object.
#'
#' @name plot_group
NULL


#' @rdname plot_group
#'
#' @examples
#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
#'
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
#'
#' @export
plot_PCA <- function(sce, shape = NULL, color = NULL) {
	plot_group_impl(sce, shape, color, scater::plotPCA)
}



#' @rdname plot_group
#'
#' @examples
#' as_SingleCellExperiment(sim_mat) %>% plot_PCA()
#'
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA()
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(color = 'label')
#' as_SingleCellExperiment(sim_mat, sim_sample_class) %>% plot_PCA(shape = 'label', color = 'label')
#'
#' @export
plot_TSNE <- function(sce, shape = NULL, color = NULL) {
	plot_group_impl(sce, shape, color, scater::plotTSNE)
}

# plot_variance --------------


#' get y axis range of ggplot object.
get_y_range <- function(plot) {
	 ggplot2::ggplot_build(plot)$layout$panel_params[[1]]$y.range
}

#' @title generate equally spaced y coordinates, not hit bottom nor top.
#'
#' @details Image `plot`'s y axis extends from 0 to 3, `x` contains 3 values,
#' then we give `c(0.5, 1.5, 2.5)`.
#'
#' @param plot ggplot object.
#' @param x numberic vector
#'
#' @return numeric vector. The same length as `x`
#'
#' @keywords internal
seq_y <- function(plot, x) {
	y_range <- get_y_range(plot)
	by <- diff(y_range) / length(x)
	seq(y_range[1] + by, y_range[2], by) - by/2
}


#' @title plot variance of counts across samples
#'
#' @param mat integer matrix. counts
#' @param refer_gene_id character. Ensembl transcript id, add a vertical line
#'   for each gene to mark the corresponding CV (on x axis). Only genes in
#'   counts matrix would be shown. Usually these genes should be the reference
#'   genes you want to use for normalization.
#' @param refer_gene_name character. Transcript name
#'
#' @return [ggplot2::ggplot()] object.
#'
#' @name plot_variance
NULL




#' @details
#' `plot_cv_density()` produces density plot of coefficient of variation
#'
#' @examples
#' # only one gene exist in the matrix
#' plot_cv_density(sim_mat, suggest_refer$id)
#' plot_cv_density(sim_mat, suggest_refer$id, suggest_refer$name)
#'
#' # the name should be the same length as id
#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
#' # if only part of the genes have name, you can pass the id of other genes
#' plot_cv_density(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
#'
#' @export
#'
#' @rdname plot_variance

# mat = sim_mat
# refer_gene_id = suggest_refer$id
# refer_gene_name = suggest_refer$name
plot_cv_density <- function(mat, refer_gene_id = '', refer_gene_name = refer_gene_id) {
	cv <- mat %>% apply(1, cv_fun) %>%
		{tibble::tibble(id = names(.), value = .)} %>%
		dplyr::mutate(id = stringr::str_extract(id, '[^|]+'))
	plot <- ggplot2::ggplot(cv, ggplot2::aes(value)) +
		ggplot2::geom_density(color = 'blue') +
		ggplot2::labs(x = 'coefficient of variation')

	if (length(refer_gene_id) != length(refer_gene_name)) {
		warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
		refer_gene_name = refer_gene_id
	}
	cv_refer <- tibble::tibble(id = refer_gene_id, name = refer_gene_name) %>%
		dplyr::inner_join(cv, by = 'id')
	if (nrow(cv_refer) == 0L) {
		warning("None refer gene found in the count matrix")
		return(plot)
	}

	plot + ggplot2::geom_vline(xintercept = cv_refer$value, color = 'green') +
		ggplot2::geom_point(
			ggplot2::aes(x = value, y = seq_y(plot, value)),
			data = cv_refer, size = 2, shape = 1
		) +
		ggrepel::geom_label_repel(
			ggplot2::aes(x = value, y = seq_y(plot, value), label = name),
			data = cv_refer, hjust = 0.5
		)
}


#' @details
#' `plot_refer_violin()` produces violin plot of given reference genes's counts (with CV annotated)
#'
#' @examples
#' # only one gene exist in the matrix
#' plot_refer_violin(sim_mat, suggest_refer$id)
#' plot_refer_violin(sim_mat, suggest_refer$id, suggest_refer$name)
#'
#' # the name should be the same length as id
#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], letters[1:3])
#' # if only part of the genes have name, you can pass the id of other genes
#' plot_refer_violin(sim_mat, rownames(sim_mat)[1:6], c(letters[1:3], rownames(sim_mat)[4:6]))
#'
#' @export
#'
#' @rdname plot_variance

# mat = sim_mat
# refer_gene_id = rownames(mat)[1:6]
# refer_gene_name = paste0('gene_', letters[1:6])
plot_refer_violin <- function(mat, refer_gene_id, refer_gene_name = refer_gene_id) {
	if (length(refer_gene_id) != length(refer_gene_name)) {
		warning("Ignoring refer_gene_name, since it isn't the same length as refer_gene_id")
		refer_gene_name = refer_gene_id
	}

	refer_gene <- tibble::tibble(id = refer_gene_id, name = refer_gene_name)
	refer_count <- mat %>% tibble::as_tibble(rownames = 'id') %>%
		dplyr::mutate(id = stringr::str_extract(id, '[^|]+')) %>%
		dplyr::inner_join(refer_gene, ., by = 'id') %>% dplyr::select(-id)
	if (nrow(refer_count) == 0L) {
		warning('None refer gene found in the count matrix')
		return(ggplot2::ggplot())
	}

	refer_count_long <- refer_count %>%	tidyr::gather('sample', 'count', -1) %>%
		dplyr::mutate_at('name', as.factor)
	g_violin <- refer_count_long %>%
		ggplot2::ggplot(ggplot2::aes(name, log2(count + 0.001))) +
			ggplot2::geom_violin() +
			ggplot2::labs(x = 'reference transcripts', y = quote(log[2](count)))

	# max y coordinate of each violin
	y_max <- ggplot2::ggplot_build(g_violin)$data[[1]] %>% tibble::as_tibble() %>%
		dplyr::group_by(x) %>% dplyr::arrange(dplyr::desc(y)) %>% dplyr::slice(1) %>%
		dplyr::ungroup() %>% dplyr::arrange(x) %>% dplyr::select(x, y)

	cv_df <- refer_count_long %>%
		dplyr::group_by(name) %>% dplyr::summarise(cv = cv_fun(count)) %>%
		dplyr::arrange(name) %>% dplyr::mutate(x = seq_along(name)) %>%
		dplyr::inner_join(y_max, by = 'x') %>%
		dplyr::mutate(y = y + diff(get_y_range(g_violin)) / 20) %>%
		dplyr::mutate(cv = formatC(cv, digits = 3, format = 'f'))

 	g_violin + ggplot2::geom_text(ggplot2::aes(x, y, label = cv), cv_df, color = 'blue')
}
dongzhuoer/rexseek documentation built on Dec. 16, 2019, 3:19 a.m.