#' Merge results from expression estimation tools
#'
#' Merge the results from expression estimation tools into one data.table that
#' is easy to analyze and plot
#'
#' @param exp_list a list of expression results, each which contain columns:
#' target_id, tpm, est_counts
#' @param exp_labels a character vector of the same length as exp_list with
#' labels for each method
#' @param oracle a data.frame with columns: target_id, counts, tpm
#' @export
merge_results <- function(exp_list, exp_labels, oracle) {
stopifnot( is(exp_list, "list") )
stopifnot( length(exp_list) == length(exp_labels) )
exp_list <- lapply(seq_along(exp_list),
function(i) {
res <- exp_list[[i]] %>%
dplyr::select(target_id, tpm, est_counts) %>%
data.table::data.table()
data.table::setnames(res, "tpm", paste0("tpm_", exp_labels[i]))
data.table::setnames(res, "est_counts", paste0("est_counts_", exp_labels[i]))
})
oracle <- oracle %>%
rename(tpm_oracle = tpm, est_counts_oracle = counts)
all_res <- Reduce(function(x, y) inner_join(x, y, by = c("target_id")), exp_list)
melt_by <- function(data, unit_by) {
m_unit <- data %>%
dplyr::select(target_id, starts_with(unit_by)) %>%
reshape2::melt(id.vars = "target_id", variable.name = "method")
ret <- data.table::data.table(oracle) %>%
dplyr::select(target_id, starts_with(unit_by)) %>%
inner_join(data.table::data.table(m_unit), by = "target_id") %>%
rename(estimate = value)
data.table::setnames(ret, paste0(unit_by, "_oracle"), "oracle")
ret
}
m_tpm <- melt_by(all_res, "tpm")
m_est_counts <- melt_by(all_res, "est_counts")
all_res <- all_res %>%
inner_join(data.table::data.table(oracle), by = "target_id")
structure(list(all_data = all_res,
m_tpm = m_tpm,
m_est_counts = m_est_counts),
class = "merged_res")
}
#' Compute correlation of a merged results with an oracle
#'
#' Compute correlation of a merged results with an oracle
#'
#' @param mres a \code{merged_res} object from \code{merged_result}
#' @return a named list with "tpm" and "est_counts"
#' @export
compute_cor_oracle <- function(mres) {
stopifnot( is(mres, "merged_res") )
both_res <- lapply(list(mres$m_tpm, mres$m_est_counts),
function(res) {
res %>%
group_by(method) %>%
summarise(
pearson = cor(oracle, estimate, method = "pearson"),
spearman = cor(oracle, estimate, method = "spearman"),
med_rel_diff = median(abs(scaled_error(estimate, oracle)),
na.rm = TRUE))
})
setNames(both_res, c("tpm", "est_counts"))
}
#' @export
filtered_no_summary <- function(mres, filter_exp) {
stopifnot( is(mres, "merged_res") )
do_filter <- if (missing(filter_exp)) {
FALSE
} else {
filter_exp <- deparse(substitute(filter_exp))
filtered_ids <- mres$all_data %>%
filter_(.dots = list(filter_exp)) %>%
select(target_id)
TRUE
}
both_res <- lapply(list(mres$m_tpm, mres$m_est_counts),
function(res) {
if (do_filter) {
res <- data.table(res) %>%
inner_join(data.table(filtered_ids), by = c("target_id"))
}
res %>%
group_by(method) %>%
mutate(
scaled_err = abs(scaled_error(estimate, oracle)),
per_err = abs(percent_error(estimate, oracle))
)
})
setNames(both_res, c("tpm", "est_counts"))
}
remove_perfect <- function(x, delta = .Machine$double.eps ^ 2) {
x[abs(x) >= delta]
}
#' @export
filtered_summary <- function(mres, filter_exp, ignore_perfect = TRUE, normalize = TRUE) {
stopifnot( is(mres, "merged_res") )
do_filter <- if (missing(filter_exp)) {
FALSE
} else {
filter_exp <- deparse(substitute(filter_exp))
filtered_ids <- mres$all_data %>%
filter_(.dots = list(filter_exp)) %>%
dplyr::select(target_id)
TRUE
}
both_res <- lapply(list(mres$m_tpm, mres$m_est_counts),
function(res) {
if (do_filter) {
message('filtering')
res <- data.table(res) %>%
inner_join(data.table(filtered_ids), by = c("target_id"))
}
metrics <- as.data.frame(res) %>%
#print(class(res))
#metrics <- res %>%
group_by(method) %>%
mutate(diff = relative_difference(estimate, oracle, na_zeroes = FALSE,
normalize_counts = TRUE))
perfect <- metrics %>%
group_by(target_id) %>%
summarize(y = all(abs(diff) <= .Machine$double.eps ^ 2)) %>%
filter(y)
message('removing: ', nrow(perfect))
metrics <- anti_join(metrics, perfect, by = 'target_id')
result <- as.data.frame(metrics) %>%
group_by(method) %>%
summarize(
mrd = median(abs(diff)),
pearson = cor(estimate, oracle, method = "pearson"),
spearman = cor(estimate, oracle, method = "spearman")
)
result
})
setNames(both_res, c("tpm", "est_counts"))
}
#' @export
relative_difference <- function(x, y, na_zeroes = TRUE, normalize_counts = TRUE) {
stopifnot(length(x) == length(y))
result <- rep(NA_real_, length(x))
non_zero <- which( x > 0 | y > 0 )
both_zero <- setdiff(seq_along(x), non_zero)
if (!na_zeroes) {
result[both_zero] <- 0.0
}
if (normalize_counts) {
x <- x / sum(x, na.rm = TRUE)
y <- y / sum(y, na.rm = TRUE)
}
result[non_zero] <- 2 * ( ( x[non_zero] - y[non_zero] ) /
abs(x[non_zero] + y[non_zero]) )
result
}
#' @export
percent_error <- function(estimate, truth) {
(estimate - truth) / truth
}
#' Compute all pairwise correlations of a unit
#'
#' Compute all pairwise correlations of a unit
#' @param mres a \code{merged_res} object from \code{merged_result}
#' @param a character vector of length 1 that is either "tpm" or "est_counts"
#' @export
pairwise_cor <- function(mres, unit) {
stopifnot(is(mres, "merged_res"))
stopifnot(is(unit, "character"))
stopifnot(length(unit) == 1)
unit_data <- mres$all_data %>%
select(starts_with(unit))
pcor <- unit_data %>%
cor(method = "pearson") %>%
reshape2::melt(varnames = c("method_a", "method_b")) %>%
filter(method_a != method_b) %>%
distinct(value) %>%
mutate(method_a = gsub(paste0(unit, "_"), "", method_a)) %>%
mutate(method_b = gsub(paste0(unit, "_"), "", method_b)) %>%
arrange(method_a, method_b)
scor <- unit_data %>%
cor(method = "spearman") %>%
reshape2::melt(varnames = c("method_a", "method_b")) %>%
filter(method_a != method_b) %>%
mutate(ah = ifelse(as.character(method_a) < as.character(method_b),
paste0(method_a, method_b), paste0(method_b, method_a))) %>%
distinct(ah) %>%
select(-ah) %>%
mutate(method_a = gsub(paste0(unit, "_"), "", method_a)) %>%
mutate(method_b = gsub(paste0(unit, "_"), "", method_b)) %>%
arrange(method_a, method_b)
list(pearson = pcor, spearman = scor)
}
#' @export
plot_pair <- function(mres, unit, method_a, method_b) {
stopifnot( is(mres, "merged_res") )
stopifnot( unit == "tpm" || unit == "est_counts" )
col_a <- paste0(unit, "_", method_a)
col_b <- paste0(unit, "_", method_b)
sub_data <- mres$all_data %>%
select(target_id,
matches(col_a),
matches(col_b))
ggplot(sub_data, aes_string(col_a, col_b)) +
geom_point(alpha = 0.08) +
geom_abline(intercept = 0, slope = 1, color = "black") +
stat_smooth(method = "lm", color = "blue", alpha= 0.8) +
theme_bw()
}
#' @export
plot_ranks <- function(mres, unit, method) {
stopifnot( is(mres, "merged_res") )
stopifnot( unit == "tpm" || unit == "est_counts" )
col_meth <- paste0(unit, "_", method)
col_oracle <- paste0(unit, "_", "oracle")
sub_data <- mres$all_data %>%
select_("target_id", col_meth, col_oracle) %>%
as.data.frame()
sub_data$meth_rank <- rank(sub_data[,col_meth])
sub_data$oracle_rank <- rank(sub_data[,col_oracle])
sub_data$d <- with(sub_data, oracle_rank - meth_rank)
# sub_data <- as.data.frame(sub_data, stringsAsFactors = FALSE) %>%
# mutate(meth_rank = lazyeval::interp(~rank(x), x = col_meth))
# ggplot(sub_data, aes_string("oracle_rank", "d")) +
# geom_point(alpha = 0.2)
# print(head(sub_data))
plt <- ggvis::ggvis(sub_data, x = ~oracle_rank, y = ~d) %>%
ggvis::layer_points(fill := "black", opacity := 0.2) %>%
ggvis::add_tooltip(function(dat){
print(dat)
print(sub_data[(sub_data$oracle_rank == dat$oracle_rank) & (sub_data$d == dat$d),])
}, "hover")
list(plt = plt, data = sub_data)
plt
}
#' @export
plot_pe <- function(mres, unit, method) {
stopifnot( is(mres, "merged_res") )
stopifnot( unit == "tpm" || unit == "est_counts" )
col_meth <- paste0(unit, "_", method)
col_oracle <- paste0(unit, "_", "oracle")
sub_data <- mres$all_data %>%
select_("target_id", col_meth, col_oracle) %>%
as.data.frame()
sub_data$pe <- pe(sub_data[,col_meth], sub_data[,col_oracle])
ggplot(sub_data, aes_string(col_oracle, "pe")) +
geom_point(alpha = 0.02)
}
#' Read eXpress data
#'
#' @param fname the file name of the express 'results.xprs' file.
#' @return a data.table
#' @export
read_xprs <- function(fname) {
xprs <- fread(fname, header = TRUE, stringsAsFactors = FALSE,
data.table = TRUE)
xprs
}
#' Read kallisto output for merge_results
#'
#' Read a kallisto data into a format that place nicely with
#' \code{merge_results}. Basically equivalent to
#' \code{read_kallisto("dir")$abundance}, but doesn't do extra stuff with
#' metadata that \code{read_kallisto_does}
#'
#' @param fname the path to "expression.txt" from kallisto
#' @return a data.table
#' @export
read_kallisto_rename <- function(fname) {
# kal <- fread(fname, header = TRUE, stringsAsFactors = FALSE,
# data.table = TRUE)
# kal
read.table(fname, stringsAsFactors=FALSE, header = TRUE)
}
#' @export
read_sailfish <- function(fname) {
# TODO: fix sailfish TPM
sf <- fread(fname, header = FALSE, skip = 5, stringsAsFactors = FALSE,
data.table = FALSE)
colnames(sf) <- c("target_id", "length", "tpm", "rpkm", "kpkm", "EstimatedNumKmers", "est_counts")
# sf %>%
# rename(tpm_sailfish = tpm, counts_sf = est_counts) %>%
# arrange(target_id)
sf %>%
select(target_id, tpm, est_counts)
}
#' @export
read_salmon <- function(fname) {
# TODO: fix salmon TPM
salmon <- fread(fname, header = FALSE, skip = 13, stringsAsFactors = FALSE,
data.table = FALSE)
colnames(salmon) <- c("target_id", "length", "tpm", "est_counts")
# colnames(salmon) <- c("target_id", "length", "tpm", "fpkm", "est_counts")
# salmon %>%
# rename(tpm_salmon = tpm, counts_salmon = est_counts) %>%
# arrange(target_id)
salmon %>%
select(target_id, tpm, est_counts)
}
#' @export
read_rsem <- function(fname) {
read.table(fname, header = TRUE, stringsAsFactors = FALSE) %>%
rename(target_id = transcript_id,
est_counts = expected_count,
tpm = TPM, eff_len = effective_length) %>%
select(-gene_id, -FPKM, -IsoPct)
}
read_kallisto_py <- function(fname) {
kal <- fread(fname, header = TRUE, stringsAsFactors = FALSE,
data.table = FALSE)
# kal %>%
# rename(target_id = name, tpm_kal_py = tpm, counts_kal_py = est_counts) %>%
# arrange(target_id)
kal %>%
rename(target_id = name)
}
#' @export
read_oracle <- function(fname, targ_to_eff_len) {
oracle <- fread(paste0("sed 's/^ *//g' ", fname), header = FALSE,
data.table = FALSE)
targ_to_eff_len <- select(targ_to_eff_len, target_id, eff_length)
oracle %>%
rename(target_id = V1, counts = V2) %>%
left_join(targ_to_eff_len, ., by = "target_id") %>%
mutate(counts = replace(counts, is.na(counts), 0.0)) %>%
mutate(tpm = counts / eff_length) %>%
mutate(tpm = replace(tpm, eff_length == 0, 0.0)) %>%
mutate(tpm = 1e6 * tpm / sum(tpm)) %>%
rename(tpm_oracle = tpm) %>%
arrange(target_id)
}
#' @export
read_cufflinks <- function(fname, mean_frag_len) {
#data <- data.table::fread(fname, header = TRUE, data.table = FALSE)
data <- read.table(fname, header = TRUE, stringsAsFactors = FALSE)
data <- data %>%
mutate(tpm = FPKM * 1e6 / sum(FPKM))
#est_counts = tpm_to_alpha(tpm, length - mean_frag_len))
data %>%
select(target_id = tracking_id, tpm)
}
tpm_to_alpha <- function(tpm, eff_len) {
stopifnot(length(tpm) == length(eff_len))
alpha <- (tpm * eff_len) / 1e6
alpha <- alpha / sum(alpha)
alpha
}
#' @export
alpha_load <- function(dir_name, oracle_targs) {
alpha <- data.table::fread(file.path(dir_name, "alpha.txt"), data.table = FALSE, header = TRUE)
alpha <- t(as.matrix(alpha))
alpha <- alpha[oracle_targs,]
ll <- data.table::fread(file.path(dir_name, "ll.txt"), header = FALSE, data.table = FALSE)
list(alpha = alpha, ll = ll[,1])
}
#' @export
alpha_to_oracle_cor <- function(alpha, oracle_counts) {
all_spearman <- lapply(1:ncol(alpha),
function(i) {
if (i %% 100 == 0) print(i)
cor(alpha[,i], oracle_counts, method = "spearman")
})
unlist(all_spearman)
}
other_comparison <- function(quant, truth) {
dplyr::inner_join(quant, truth, by = 'target_id')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.