Nothing
#' Make DESeqDataSet from counts matrix and metadata
#'
#' @param counts The genes x samples counts matrix with row names. At least one row name must be an ENSEMBL gene ID,
#' since gene annotation is done via the ENSEMBL database.
#' @param metadata data.frame of sample information. Order of rows corresponds to the order of columns in the counts matrix.
#' @param ah_record ID of AnnotationHub record used to retrieve an `EnsDb` object.
#' @param design The design formula specified in `DESeqDataSet()`
#' To view all valid record IDs, run
#' ```
#' library(AnnotationHub)
#' mcols(AnnotationHub()) %>%
#' as_tibble(rownames="ah_record") %>%
#' filter(rdataclass=="EnsDb")
#' ```
#'
#' @return A `DESeq2::DESeqDataSet` object containing the counts matrix and metadata.
#'
#' @examples
#' \donttest{
#' library("DESeq2")
#' count_mat <- counts(T47D)
#' meta <- data.frame(colData(T47D))
#' dds <- make_dds(counts = count_mat, metadata = meta, ah_record = "AH89426")
#' }
#'
#' @export
make_dds <- function(counts, metadata, ah_record, design = ~1) {
if (ncol(counts) != nrow(metadata)) {
stop("The number columns in 'counts' does not equal the number of metadata rows.")
}
# if (is.null(colnames(counts))) {
# stop("Matrix 'counts' must have column names.")
# }
if (is.null(rownames(counts))) {
stop("Matrix 'counts' must have row names.")
} else if (!any(str_detect(rownames(counts), "^ENS"))) {
stop("At least one rowname of matrix 'counts' must be an ENSEMBL gene ID. \n
Otherwise, no gene can be annotated and it is better to use DESeq2::DESeqDataSetFromMatrix() directly.")
}
# remove version numbers from ENSEMBL IDs
rownames(counts) <- ifelse(
stringr::str_detect(rownames(counts),"^ENS"),
stringr::str_replace(rownames(counts),"\\.[:digit:]*$",""),
rownames(counts)
)
# load gene information, that will be stored in rowData slot
suppressMessages({
ah <- AnnotationHub::AnnotationHub()
edb <- ah[[ah_record]]
})
ensembl_info <- ensembldb::genes(edb, filter = AnnotationFilter::GeneIdFilter(rownames(counts))) %>%
as_tibble() %>%
mutate(chromosome = as.character(seqnames)) %>%
select(gene_id, gene_name, gene_biotype, chromosome, chr_start = start)
gc_info <- ensembldb::transcripts(edb, filter = AnnotationFilter::GeneIdFilter(rownames(counts))) %>%
as_tibble() %>%
dplyr::select(tx_id, gene_id, gc_content) %>%
group_by(gene_id) %>%
summarize(gc_content = stats::median(gc_content), .groups = "drop")
ensembl_info <- left_join(ensembl_info, gc_info, by="gene_id")
not_annotated_genes <- setdiff(rownames(counts), ensembl_info$gene_id)
if (length(not_annotated_genes)>0) {
message(paste0("There are ", length(not_annotated_genes), " gene IDs that could not be annotated."))
}
row_data <- data.frame(gene_id = rownames(counts)) %>%
left_join(ensembl_info, by = "gene_id") %>%
mutate(gene_name = ifelse(is.na(gene_name), gene_id, gene_name)) %>%
column_to_rownames("gene_id")
dds <- suppressMessages(DESeq2::DESeqDataSetFromMatrix(counts, colData=metadata, rowData=row_data, design = design))
# somehow in the design, there is also an environment stored, which bloats the whole dds object
environment(DESeq2::design(dds)) <- NULL
dds
}
#' Filter genes with low counts
#'
#' @param dds A DESeqDataSet
#' @param min_count,min_rep keep genes with at least min_count counts in at least min_rep replicates
#'
#' @return A `DESeq2::DESeqDataSet` object with only those genes that meet the filter criteria.
#'
#' @examples
#' library("DESeq2")
#' dds <- makeExampleDESeqDataSet()
#' filter_genes(dds)
#'
#' @importFrom DESeq2 counts
#' @export
filter_genes <- function(dds, min_count = 5, min_rep = 3) {
dds[rowSums(counts(dds) >= min_count) >= min_rep, ]
}
#' Create a mean-sd plot
#' Make a scatterplot that shows for each gene its standard deviation versus mean.
#' @param vsd A DESeqTransform object
#'
#' @return A ggplot object of the ggplot2 package that contains the mean-sd plot.
#' @examples
#' \donttest{
#' library("DESeq2")
#' dds <- makeExampleDESeqDataSet(interceptMean=10, n=5000)
#' vsd <- vst(dds)
#' mean_sd_plot(vsd)
#' }
#' @export
mean_sd_plot <- function(vsd) {
data.frame(
gene_mean = matrixStats::rowMeans2(assay(vsd)),
gene_sd = matrixStats::rowSds(assay(vsd))
) %>%
ggplot(aes(rank(gene_mean), gene_sd)) +
ggpointdensity::geom_pointdensity() +
cowplot::theme_cowplot() +
geom_smooth(color = "red", method = "gam", formula = y ~ s(x, bs = "cs")) +
labs(x = "rank(mean)", y = "sd")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.