#' Retrieve expression data for genes/transcripts across samples
#'
#' Returns the estimated counts for the genes/transcripts enumerated in the
#' `features` table for the samples enumerated in the `samples` table.
#'
#' Note that the values returned are simply estimated counts. They are not
#' normalized by sequencing depth. For now, the only use for this function is
#' to compare how ratios of genes compare across samples.
#'
#' @export
#' @importFrom edgeR cpm
#' @importFrom reshape2 melt
#' @seealso [as.DGEList()]
#'
#' @param a4 An `Archs4Repository`
#' @param features a `tibble` of feature descriptors (as returned from
#'   [feature_info()]), or a character vector of identifiers that will be
#'   resolved via [feature_lookup()]. A feature table simply needs the
#'   following columns:
#'   * `"ensembl_id"`: the feature identifier;
#'   * `"feature_type"`: (gene or transcript);
#'   * `"source"`: (human or mouse).
#' @param samples a samples identifier: ie. a tibble with series_id and
#'   sample_id columns. When `NULL` (default) all samples are used.
#' @param feature_type "gene" or "transcript"-level data?
#' @param source "mouse" or "human" pass this in explicitly if there is no
#'   "source" column in your `features` or `samples` data.frame
#' @param feature_meta additional columns from
#'   `feature_info(a4, feature_type = feature_type)`
#' @param sample_meta metadata information to include for the samples. These
#'   values are extracted from the `meta/VALUE` files in the respective hdf5
#'   data files. See [archs4_sample_info()] for more details.
#' @param prior.count prior count added before taking logs when computing the
#'   `cpm` column (passed to [edgeR::cpm()]).
#' @param ... currently unused.
#' @return a data.frame of expression data. Each row is an observation
#'   (one gene one sample)
#' @examples
#' a4 <- Archs4Repository()
#' gnz <- feature_lookup(a4, c("CFAP65", "PECR"))
#' gexprs <- fetch_expression(a4, gnz)
fetch_expression <- function(a4, features, samples = NULL,
                             feature_type = "gene", source = NULL,
                             feature_meta = "symbol",
                             sample_meta = c("Sample_title", "Sample_source_name_ch1"),
                             prior.count = 3, ...) {
  assert_class(a4, "Archs4Repository")
  source <- .extract_source(features, samples, source)

  # Extract Features: resolve bare identifiers into a feature table first
  if (is.character(features)) {
    features <- feature_lookup(a4, features, feature_type = feature_type,
                               source = source)
  }
  assert_data_frame(features)
  assert_subset(c("ensembl_id", "feature_type"), colnames(features))

  features.all <- distinct(features, ensembl_id)
  features <- features.all %>%
    inner_join(feature_info(a4, feature_type, source), by = "ensembl_id")
  fmiss <- anti_join(features.all, features, by = "ensembl_id")
  if (nrow(fmiss) > 0L) {
    warning("Missing ", nrow(fmiss), " features from query")
  }

  # The universe of samples available for this feature_type/organism.
  # Fix: this previously referenced an undefined `type` variable; the
  # function parameter is named `feature_type`.
  all.samples <- a4 %>%
    sample_table(feature_type = feature_type) %>%
    filter(organism == source) %>%
    select(series_id, sample_id)

  samples.specified <- !is.null(samples)
  if (!samples.specified) {
    samples <- all.samples
  } else {
    assert_data_frame(samples)
    assert_subset(c("series_id", "sample_id"), colnames(samples))
    samples <- distinct(samples, series_id, sample_id, .keep_all = TRUE)
    missing.samples <- samples %>%
      anti_join(all.samples, by = c("series_id", "sample_id"))
    if (nrow(missing.samples)) {
      warning(nrow(missing.samples), " requested samples are not in archs4")
      samples <- samples %>%
        semi_join(all.samples, by = c("series_id", "sample_id"))
    }
  }

  sids <- unique(samples$sample_id)
  y <- as.DGEList(a4, sids, features = features, feature_type = feature_type,
                  check_missing_samples = FALSE)

  # Melt the counts matrix into long form: one row per (feature, sample)
  counts <- reshape2::melt(y$counts)
  counts$Var1 <- as.character(counts$Var1)
  counts$Var2 <- as.character(counts$Var2)
  colnames(counts) <- c("ensembl_id", "sample_id", "count")

  # cpm() preserves the dimensions of y$counts, so the melted log-CPM values
  # line up row-for-row with the melted counts.
  cpms <- reshape2::melt(edgeR::cpm(y, prior.count = prior.count, log = TRUE))
  counts$cpm <- cpms[[3]]

  out <- left_join(samples, counts, by = c("sample_id"))

  # Optionally decorate each row with feature-level metadata columns
  if (is.character(feature_meta)) {
    fi.meta <- feature_info(a4, feature_type = feature_type, source = source)
    fi.meta <- filter(fi.meta, !is.na(ensembl_id))
    feature_meta <- unique(c("ensembl_id", feature_meta))
    fi.meta <- fi.meta[, colnames(fi.meta) %in% feature_meta, drop = FALSE]
    if (ncol(fi.meta) > 1L) {
      out <- left_join(out, fi.meta, by = "ensembl_id")
    }
  }

  # Optionally decorate each row with sample-level metadata; metadata
  # retrieval is best-effort and only warns on failure.
  if (is.character(sample_meta)) {
    si <- tryCatch({
      sample_info(a4, sids, sample_meta, check_missing_samples = FALSE)
    }, error = function(e) NULL)
    if (is.data.frame(si)) {
      scols <- unique(c("sample_id", sample_meta))
      si <- si[, intersect(scols, colnames(si)), drop = FALSE]
      if (ncol(si) > 1L) {
        si <- distinct(si, sample_id, .keep_all = TRUE)
        out <- left_join(out, si, by = "sample_id")
      }
    } else {
      warning("There was an error fetching the sample metadata requested")
    }
  }

  out
}
# Determine the (organism) source to query.
#
# When `source` is not given explicitly, it is inferred from the `source`
# column of the `features` and/or `samples` data.frames. Errors unless
# exactly one recognized source is resolved.
#
# @param features the `features` argument passed to fetch_expression()
# @param samples the `samples` argument passed to fetch_expression()
# @param source an explicit source ("human" or "mouse"), or NULL to infer
# @return a single character: the resolved source
.extract_source <- function(features, samples, source) {
  if (is.null(source)) {
    source <- character()
    if (is.data.frame(features)) {
      source <- unique(features$source)
    }
    if (is.data.frame(samples)) {
      source <- unique(c(source, samples$source))
    }
    # Fix: `!is.null(source)` is scalar (always TRUE for a vector) so the
    # old filter was a no-op; drop NA entries instead.
    source <- source[!is.na(source)]
    if (length(source) == 0) {
      stop("Don't know where to fetch sources from")
    }
  }
  if (is.character(source)) {
    if (length(source) != 1L) {
      stop("Can't specify multiple organisms for source -- one at a time!")
    }
  } else {
    stop("source expected to be a character by now")
  }
  if (!source %in% archs4_sources()) {
    stop("Unknown source: ", source)
  }
  source
}
#' Retrieves (organism) source of gene/transcript identifiers
#'
#' Parses ensembl identifiers and determines if they are for genes, or
#' transcripts, as well as the organism they should belong to (human, mouse)
#'
#' @param x A vector of identifiers
#' @param ... reserved for future use
#' @return nothing: this is an unimplemented stub and always raises an error
feature_source <- function(x, ...) {
  # Placeholder: identifier parsing has not been written yet.
  stop("not implemented yet")
}
# library size and normalization factor estimation =============================
#
# The code here was adapted from the code in edgeR.
# We want to calculate and store library sizes and normalization factors
# for an Archs4Repository, so we can calculate things like "cpm" on the fly ...
# the super-fly.
# Default top-level TMM normalization parameters:
#
# logratioTrim = 0.3
# sumTrim = 0.05
# doWeighting = TRUE
# Acutoff = -1e10
# p = 0.75
#
# Main TMM Calculation
# f75 <- .calcFactorQuantile(data=x, lib.size=lib.size, p=0.75)
# refColumn <- which.min(abs(f75-mean(f75)))
# f <- rep(NA,ncol(x)) # holds norm factors
# for(i in 1:ncol(x))
# f[i] <- .calcFactorWeighted(obs=x[,i], ref=x[,refColumn],
# libsize.obs=lib.size[i],
# libsize.ref=lib.size[refColumn],
# logratioTrim=logratioTrim,
# sumTrim=sumTrim,
# doWeighting=doWeighting,
# Acutoff=Acutoff)
#
# .calcFactorQuantile
# y <- t(t(data)/lib.size) # divides each column by its library size
# f <- apply(y,2,function(x) quantile(x,p=p)) # find value at upper quartile
# Full set of hdf5 datasets, for reference (transcript-level matrices are
# currently disabled):
#   c("human_gene", "mouse_gene", "human_transcript", "mouse_transcript")
# Datasets we currently estimate normalization factors for:
.expr.datasets <- paste(c("human", "mouse"), "gene", sep = "_")
#' Estimate normalization factors for datasets in the Archs4Repository
#'
#' This function will serialize the results of the library size and
#' normalization factors into files inside `datadir(a4)`
#'
#' @export
#' @param a4 The `Archs4Repository`
#' @return Invisibly returns a list of tibbles, one per dataset in
#'   `.expr.datasets`, each holding the library size and normalization
#'   factors for the expression data in `a4`.
estimate_repository_norm_factors <- function(a4) {
  results <- vector("list", length(.expr.datasets))
  for (i in seq_along(.expr.datasets)) {
    results[[i]] <- estimate_norm_factors(a4, .expr.datasets[[i]])
  }
  invisible(results)
}
#' Calculates library size and norm factors for a specific dataset
#'
#' This function will serialize the results of the library size and
#' normalization factors into files inside `datadir(a4)`. The library sizes
#' are checkpointed to a `*-tmp.csv` file before the (slow) norm-factor pass
#' so a long run can be inspected or salvaged if that pass fails.
#'
#' @param a4 The `Archs4Repository`
#' @param key the dataset to process; one of `.expr.datasets`
#' @param n number of samples (columns) to load per batch from the hdf5 file
#' @param logratioTrim,sumTrim,doWeighting,Acutoff TMM parameters, with the
#'   same meaning as in edgeR's `calcNormFactors()`
#' @param p the quantile used to pick the TMM reference sample
#' @param ... currently unused
#' @return Invisibly returns a tibble of the library size and normalization
#'   factor for each sample in the dataset.
estimate_norm_factors <- function(a4, key, n = 500, logratioTrim = 0.3,
                                  sumTrim = 0.05, doWeighting = TRUE,
                                  Acutoff = -1e10, p = 0.75, ...) {
  key <- match.arg(key, .expr.datasets)
  h5.fn <- file_path(a4, key)
  fn.out <- paste0(key, "-normfactors.csv")
  fn.out <- file.path(datadir(a4), fn.out)

  # Almost could have saved a lot of time using the v4++ matrices since they
  # include a "meta/reads_aligned" vector, however to use TMM, we still need
  # to calculate the percentiles ...
  sinfo <- .crankLibSize(h5.fn, n = n, p = p)

  # Checkpoint library sizes before the norm-factor pass.
  fn.tmp <- sub("\\.csv$", "-tmp.csv", fn.out)
  write.csv(sinfo, fn.tmp, row.names = FALSE)

  # Identify the sample to use as the reference profile: the one whose upper
  # quantile is closest to the mean upper quantile (mirrors edgeR's TMM).
  refColumn <- which.min(abs(sinfo$quantile - mean(sinfo$quantile, na.rm = TRUE)))
  ref.vals <- h5read(h5.fn, "data/expression", list(NULL, refColumn))

  # Run a TMM for all samples against the reference profile
  nf <- .crankNormFactors(h5.fn, sinfo, ref.vals, n = n,
                          logratioTrim = logratioTrim, sumTrim = sumTrim,
                          doWeighting = doWeighting, Acutoff = Acutoff)
  sinfo$normfactor <- nf
  write.csv(sinfo, fn.out, row.names = FALSE)
  invisible(sinfo)
}
# before the v4 gene-level count matrices, the read counts per sample were not
# included, so we were calculating them manually.
#
# Reads the expression matrix in batches of `n` samples, computing for each
# sample its library size (column sum) and the value at the `p` quantile of
# its depth-normalized expression (used to pick the TMM reference sample).
#
# @param h5.fn path to the hdf5 expression file
# @param n batch size (number of samples loaded per h5read call)
# @param p the quantile to record per sample
# @return a tibble with series_id, sample_id, index, libsize, quantile
.crankLibSize <- function(h5.fn, n = 500, p = 0.75) {
  message("Calculating library sizes ...")
  sinfo <- tibble(
    series_id = trimws(h5read(h5.fn, "meta/Sample_series_id")),
    sample_id = trimws(h5read(h5.fn, "meta/Sample_geo_accession")))

  n.samples <- nrow(sinfo)
  # Fix: the old remainder-based batching produced a descending (and
  # incomplete) index range for the final batch whenever n.samples %% n == 0,
  # silently skipping samples. Closed-form batch bounds avoid that.
  n.batches <- ceiling(n.samples / n)
  res <- vector("list", n.batches)
  for (i in seq_len(n.batches)) {
    idx.start <- (i - 1L) * n + 1L
    idx.end <- min(i * n, n.samples)
    idxs <- idx.start:idx.end

    # lib size and percentile
    dat <- h5read(h5.fn, "data/expression", list(NULL, idxs))
    lib.size <- colSums(dat)
    dat <- t(t(dat)/lib.size)
    # Somehow some samples had NA counts, so we need to protect from that
    fq <- apply(dat, 2, function(x) quantile(x, p = p, na.rm = TRUE))
    res[[i]] <- data.frame(index = idxs, libsize = lib.size, quantile = fq)
  }
  res <- bind_rows(res)
  bind_cols(sinfo, res)
}
# Computes the TMM normalization factor of every sample against the chosen
# reference profile, re-reading the expression matrix in batches of `n`.
#
# NOTE(review): the self-referential defaults (`n = n`, etc.) would error if
# an argument were omitted; all callers pass every argument explicitly. The
# signature is kept as-is for compatibility.
#
# @param h5.fn path to the hdf5 expression file
# @param sinfo the per-sample tibble produced by .crankLibSize()
# @param ref.vals counts of the reference sample (one column of the matrix)
# @return a numeric vector of normalization factors, one per row of `sinfo`;
#   NA for samples that contain NA counts.
.crankNormFactors <- function(h5.fn, sinfo, ref.vals, n = n,
                              logratioTrim = logratioTrim, sumTrim = sumTrim,
                              doWeighting = doWeighting, Acutoff = Acutoff) {
  message("Calculating normalization factors ...")
  ref.size <- sum(ref.vals)

  # Figure out how to batch iterate over our data, we now have to reload the
  # data to calculate the normalization factors.
  # Fix: the old remainder-based batching produced a descending (and
  # incomplete) index range for the final batch whenever
  # nrow(sinfo) %% n == 0. Closed-form batch bounds avoid that.
  n.samples <- nrow(sinfo)
  n.batches <- ceiling(n.samples / n)

  # Load batches again and calc norm factors
  nf <- numeric(n.samples)
  for (i in seq_len(n.batches)) {
    idx.start <- (i - 1L) * n + 1L
    idx.end <- min(i * n, n.samples)
    idxs <- idx.start:idx.end

    dat <- h5read(h5.fn, "data/expression", list(NULL, idxs))
    for (j in seq_len(ncol(dat))) {
      idx <- idxs[j]
      vals <- dat[, j]
      if (any(is.na(vals))) {
        # A TMM against a sample with NA counts is undefined.
        nf[idx] <- NA_real_
      } else {
        nf[idx] <- .calcFactorWeighted(obs = vals, ref = ref.vals,
                                       libsize.obs = sum(vals),
                                       libsize.ref = ref.size,
                                       logratioTrim = logratioTrim,
                                       sumTrim = sumTrim,
                                       doWeighting = doWeighting,
                                       Acutoff = Acutoff)
      }
    }
  }
  nf
}
# Calculates the TMM (trimmed mean of M-values) between two libraries.
#
# This is lifted straight from edgeR_3.22.2. The only reason it is included here
# is because it is not exported (rightfully) from the package. Keep the code
# byte-identical to upstream so it can be diffed against future edgeR releases.
#
# obs, ref: raw counts of the observed and reference libraries (same length).
# libsize.obs, libsize.ref: library sizes; computed as sum(counts) when NULL.
# logratioTrim, sumTrim: fraction trimmed from each tail of the M (log-ratio)
#   and A (average abundance) distributions before averaging.
# doWeighting: if TRUE, average with inverse-variance (precision) weights.
# Acutoff: features with average abundance <= this cutoff are discarded.
# Returns a single scaling factor on the linear (not log2) scale.
.calcFactorWeighted <- function(obs, ref, libsize.obs=NULL, libsize.ref=NULL,
                                logratioTrim=.3, sumTrim=0.05, doWeighting=TRUE,
                                Acutoff=-1e10) {
  obs <- as.numeric(obs)
  ref <- as.numeric(ref)

  if( is.null(libsize.obs) ) nO <- sum(obs) else nO <- libsize.obs
  if( is.null(libsize.ref) ) nR <- sum(ref) else nR <- libsize.ref

  logR <- log2((obs/nO)/(ref/nR)) # log ratio of expression, accounting for library size
  absE <- (log2(obs/nO) + log2(ref/nR))/2 # absolute expression
  v <- (nO-obs)/nO/obs + (nR-ref)/nR/ref # estimated asymptotic variance

  # remove infinite values, cutoff based on A
  fin <- is.finite(logR) & is.finite(absE) & (absE > Acutoff)
  logR <- logR[fin]
  absE <- absE[fin]
  v <- v[fin]

  # Degenerate case: the two libraries are (after depth normalization)
  # essentially identical, so no scaling is needed.
  if(max(abs(logR)) < 1e-6) return(1)

  # taken from the original mean() function: trim the tails of both the
  # log-ratio (M) and abundance (A) distributions
  n <- length(logR)
  loL <- floor(n * logratioTrim) + 1
  hiL <- n + 1 - loL
  loS <- floor(n * sumTrim) + 1
  hiS <- n + 1 - loS

  # keep <- (rank(logR) %in% loL:hiL) & (rank(absE) %in% loS:hiS)
  # a fix from leonardo ivan almonacid cardenas, since rank() can return
  # non-integer values when there are a lot of ties
  keep <- (rank(logR)>=loL & rank(logR)<=hiL) & (rank(absE)>=loS & rank(absE)<=hiS)

  if(doWeighting)
    f <- sum(logR[keep]/v[keep], na.rm=TRUE) / sum(1/v[keep], na.rm=TRUE)
  else
    f <- mean(logR[keep], na.rm=TRUE)

  # Results will be missing if the two libraries share no features with positive counts
  # In this case, return unity (f <- 0 yields 2^0 == 1)
  if(is.na(f)) f <- 0
  2^f
}