library(sleuth) # for sleuth_prep(), sleuth_results(), basic_filter(), etc.
library(dplyr) # for "tidyverse" tibble manipulation pipes %>%
# library(ggfortify) # optional: for biplot PCA+loading plots
# Standard error of the mean
std_mean <- function(x) { sd(x) / sqrt(length(x)) }
#' Add a transcript's scaled est_count value as a numeric dosage factor on top of an existing Sleuth object's model
#'
#' Can be used to correct for transcript noise/artefacts in a model when the artefact is proportional to a biomarker transcript's expression.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param metadata The metadata that was used to populate sleuth_obj
#' @param transcript_id The name of the transcript to model quantitatively
#' @return A new Sleuth object with the new regression model
#' @keywords dosage model, error correction
#' @export
#' @examples
#' # Read in metadata with "sample", "path" and other columns that can be used as regression model factors like "treatment"
#' meta <- read.table("tab_delimited_file_with_sample_and_path_columns_etc.txt", header=TRUE)
#' meta$path <- as.character(meta$path)
#' so <- sleuth_prep(meta, ~treatment)
#' so <- sleuth_fit(so)
#' pca_plot(so)
#' # The plot shows one sample far off on the X-axis, with the biggest loading vector in the
#' # X-shift direction being NM_001402 (EEF1A1, a biomarker for cell proliferation).
#' # The following call builds a new differential expression model (~treatment + NM_001402)
#' # that accounts for this artefact.
#' so_redux <- sleuth_prep_transcript_dosage(so, meta, "NM_001402")
#'
#' # Compare the model fits (lower number is better)
#' sleuth_model_rss(so)
#' sleuth_model_rss(so_redux)
sleuth_prep_transcript_dosage <- function(sleuth_obj, metadata, transcript_id) {
# Use the raw observations (then explicitly scale them) so that the order doesn't matter if we add multiple dosages in sequence
transcript_expression_level <- sleuth_obj$obs_raw %>%
filter(target_id == transcript_id) %>%
mutate(est_counts = est_counts * sleuth_obj$est_counts_sf)
new_metadata <- metadata %>%
left_join(transcript_expression_level[, c("sample", "est_counts")], by = "sample") %>%
rename({{ transcript_id }} := est_counts)
new_model <- as.formula(paste0(deparse(sleuth_obj$full_formula), " + ", transcript_id))
sleuth_prep(new_metadata, new_model)
}
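# Internal helper: run a Mann-Whitney U (Wilcoxon rank-sum) test for a single transcript,
# binarizing the numeric covariate factor_name at value_threshold. Returns c(target_id, pval,
# NA placeholder for qval); see sleuth_u() for the user-facing wrapper.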
targetid_u_test_quantitative <- function(sleuth_obj, tid, factor_name, value_threshold) {
d <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample")
# Make a new attribute per sample which is the one to be evaluated as a binary factor
d$u_group_name <- d[[factor_name]] >= value_threshold
u <- wilcox.test(est_counts ~ u_group_name, data = d)
# NA is a placeholder for the FDR-adjusted q-value filled in by the caller
c(tid, u$p.value, NA)
}
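# Internal helper: run a Kruskal-Wallis rank sum test for a single transcript across the levels
# of one categorical covariate. Returns c(target_id, pval, NA placeholder for qval); see
# sleuth_kruskal_wallace() for the user-facing wrapper.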
targetid_single_factor_kruskal_wallace_test <- function(sleuth_obj, tid, factor_name) {
d <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
# Filter first so est_counts_sf (one scale factor per sample) lines up with one row per sample,
# mirroring sleuth_prep_transcript_dosage()
mutate(est_counts = est_counts * sleuth_obj$est_counts_sf) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample")
# Make a new attribute per sample holding the single requested factor to group by
d$kw_group_name <- d[[factor_name]]
k <- kruskal.test(est_counts ~ kw_group_name, data = d)
c(tid, k$p.value, NA)
}
# Internal helper: a one-way (Welch) ANOVA for a single transcript, without the assumption of
# equal variance between groups. Returns c(target_id, pval, NA placeholder for qval); see
# sleuth_oneway_anova() for the user-facing wrapper.
targetid_one_way_anova_test <- function(sleuth_obj, tid, factor_name) {
d <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample")
# Make a new attribute per sample holding the single requested factor to group by
d$anova_group_name <- d[[factor_name]]
k <- oneway.test(est_counts ~ anova_group_name, data = d)
c(tid, k$p.value, NA)
}
#' Turn a numeric factor in a Sleuth object regression model into a binary one using a threshold value,
#' then perform the non-parametric U test (a.k.a. Mann-Whitney or Wilcoxon rank-sum test) on the binarized factor.
#'
#' For eukaryotic transcript analysis, this tends to start yielding multiple-testing-corrected p-values (FDR) < .05 once 10 or more samples
#' fall in each category (< threshold, or >= threshold).
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param factor_name The existing numeric factor to be turned into 0 or 1 based on a threshold value
#' @param value_threshold The threshold value for binarization (values for the given factor >= threshold are designated as "1", others are "0").
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table of \code{target_id}, \code{pval}, \code{qval} sorted by ascending pval
#' @keywords U test, non-parametric test, Wilcoxon test, Mann-Whitney test
#' @export
#' @examples
#' # Read in metadata with "sample", "path" and other columns that can be used as regression model factors like "age"
#' meta <- read.table("tab_delimited_file_with_sample_and_path_columns_etc.txt", header=TRUE)
#' meta$path <- as.character(meta$path)
#' so <- sleuth_prep(meta, ~treatment*age)
#' so <- sleuth_fit(so)
#' so <- sleuth_wt(so, "age")
#' wald_result <- sleuth_results(so, "age")
#' table(wald_result$qval<.05)
#' # The regression analysis shows few to no transcripts explained by age as a numeric linear dosage factor. What about <65 years old vs older?
#' wilcox_result <- sleuth_u(so, "age", 65)
#' table(wilcox_result$qval<.05)
sleuth_u <- function(sleuth_obj, factor_name, value_threshold, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
U_results_list <- lapply(target_ids, function(target_id) { targetid_u_test_quantitative(sleuth_obj, target_id, factor_name, value_threshold) })
U_results_dataframe <- as.data.frame(do.call(rbind, U_results_list))
U_results_dataframe[, 3] <- p.adjust(U_results_dataframe[, 2], method = "fdr")
colnames(U_results_dataframe) <- c("target_id", "pval", "qval")
U_results_dataframe$pval <- as.numeric(as.character(U_results_dataframe$pval))
U_results_dataframe[order(U_results_dataframe$pval),]
}
#' List all the Sleuth object \code{target_id}s whose scaled \code{est_count} is at least \code{min_count_per_sample} in every sample
#'
#' Can be used to provide a non-default target_ids parameter value in other methods of this package, effectively reducing the
#' magnitude of multiple testing correction in statistical tests by restricting the reported tests to those transcripts with replicable
#' expression levels (i.e. not potentially zero-inflated).
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param min_count_per_sample threshold that all scaled \code{est_count}s for a transcript must meet to be included in the results
#' @return a list of target_ids meeting the given criterion
#' @export
#' @examples
#' # Check the replicate dot plots for a factor combination (typically the control condition set; one value per column of the sleuth object's design_matrix, minus the intercept)
#' replicate_dotplots(so,c(0,0))
#' # Note that many genes in the replicate samples have zero expression in one sample, but up to exp(4) in another.
#' # Exclude these low-expressed genes from a U test to reduce pval inflation caused by multiple testing correction (qval/FDR)
#' sleuth_u(so, "age", 65, target_ids=sleuth_reliable_target_ids(so, exp(4)))
# If there are mapped reads in all replicates, the transcript is considered reliably expressed
sleuth_reliable_target_ids <- function(sleuth_obj, min_count_per_sample = 1) {
sleuth_obj$obs_norm_filt %>%
group_by(target_id) %>%
mutate(est_counts = est_counts * sleuth_obj$est_counts_sf) %>%
summarise(min_est_counts = min(est_counts)) %>%
filter(min_est_counts >= min_count_per_sample) %>%
pull(target_id)
}
#' Calculate the residual sum of squares (RSS) of the Sleuth object's full regression model
#'
#' @param sleuth_obj An existing Sleuth object that has been fitted with sleuth_fit()
#' @return The total RSS summed over all fitted transcripts (lower means a better fit)
#' @export
sleuth_model_rss <- function(sleuth_obj) { sum(sleuth_obj$fits$full$summary[, 2]) }
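#' Perform a per-transcript one-way (Welch) ANOVA on a single categorical factor
#'
#' Unlike a classic one-way ANOVA, the Welch test does not assume equal variance between groups.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param factor_name The categorical covariate whose levels define the groups
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table of \code{target_id}, \code{pval}, \code{qval} sorted by ascending pval
#' @export
#' @examples
#' # A minimal sketch, assuming a Sleuth object "so" whose metadata has a categorical "treatment" column:
#' anova_result <- sleuth_oneway_anova(so, "treatment")
#' table(anova_result$qval < .05)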
sleuth_oneway_anova <- function(sleuth_obj, factor_name, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
oneway_results_list <- lapply(target_ids, function(target_id) { targetid_one_way_anova_test(sleuth_obj, target_id, factor_name) })
oneway_results_dataframe <- as.data.frame(do.call(rbind, oneway_results_list))
oneway_results_dataframe[, 3] <- p.adjust(oneway_results_dataframe[, 2], method = "fdr")
colnames(oneway_results_dataframe) <- c("target_id", "pval", "qval")
oneway_results_dataframe$pval <- as.numeric(as.character(oneway_results_dataframe$pval))
oneway_results_dataframe[order(oneway_results_dataframe$pval),]
}
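#' Perform a per-transcript Kruskal-Wallis rank sum test on a single categorical factor
#'
#' A non-parametric alternative to the one-way ANOVA when normality cannot be assumed.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param factor_names The categorical covariate whose levels define the groups (despite the plural name, currently a single factor)
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table of \code{target_id}, \code{pval}, \code{qval} sorted by ascending pval
#' @export
#' @examples
#' # A minimal sketch, assuming a Sleuth object "so" whose metadata has a categorical "treatment" column:
#' kw_result <- sleuth_kruskal_wallace(so, "treatment")
#' table(kw_result$qval < .05)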
sleuth_kruskal_wallace <- function(sleuth_obj, factor_names, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
kw_results_list <- lapply(target_ids, function(target_id) { targetid_single_factor_kruskal_wallace_test(sleuth_obj, target_id, factor_names) })
kw_results_dataframe <- as.data.frame(do.call(rbind, kw_results_list))
kw_results_dataframe[, 3] <- p.adjust(kw_results_dataframe[, 2], method = "fdr")
colnames(kw_results_dataframe) <- c("target_id", "pval", "qval")
kw_results_dataframe$pval <- as.numeric(as.character(kw_results_dataframe$pval))
kw_results_dataframe[order(kw_results_dataframe$pval),]
}
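# Internal helper: run a paired two-sided t-test on log1p(est_counts) for a single transcript,
# pairing samples by pair_name across the two levels of factor_name. Returns a vector in the
# same column order as sleuth_results() output, plus a Shapiro-Wilk normality p-value for the
# per-pair log fold changes; see sleuth_paired_t() for the user-facing wrapper.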
targetid_paired_t_test <- function(sleuth_obj, tid, pair_name, factor_name) {
row_by_factor_level <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample") %>%
group_split(.data[[factor_name]])
# A convoluted way to sort by an unordered factor in tibbles, ensuring the pairs stay aligned (dplyr's arrange() has conniptions here)
row_by_factor_level[[1]] <- row_by_factor_level[[1]][order(as.character(pull(row_by_factor_level[[1]], pair_name))),]
row_by_factor_level[[2]] <- row_by_factor_level[[2]][order(as.character(pull(row_by_factor_level[[2]], pair_name))),]
# This t-test assumes log-normally distributed counts, hence the log1p transform
t <- t.test(x = log1p(row_by_factor_level[[1]]$est_counts), y = log1p(row_by_factor_level[[2]]$est_counts), paired = TRUE, alternative = "two.sided")
logFCs <- log(row_by_factor_level[[2]]$est_counts / row_by_factor_level[[1]]$est_counts)
# Shapiro-Wilk normality check on the log fold changes; requires at least 3 distinct non-NaN values
shapiro <- ifelse(sum(!is.nan(logFCs)) < 3, NA, ifelse(length(unique(logFCs)) < 3, NA, shapiro.test(logFCs)$p.value))
log_counts <- log1p(row_by_factor_level[[1]]$est_counts)
# Same column order as the normal Wald-test sleuth_results() output
c(tid, t$p.value, NA, mean(logFCs), sd(logFCs), mean(log_counts), std_mean(log_counts), shapiro)
}
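#' Perform a per-transcript paired t-test on log1p-transformed counts
#'
#' Samples are paired by the \code{pair_name} covariate across the two levels of \code{factor_name}.
#' Each transcript also gets a Shapiro-Wilk p-value as a check on the normality of its log fold changes.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param pair_name The covariate identifying sample pairs (e.g. a patient ID)
#' @param factor_name The two-level covariate to compare within pairs (e.g. tumour vs. normal)
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table with the same leading columns as sleuth_results() (\code{target_id}, \code{pval}, \code{qval}, \code{b}, \code{se_b}, \code{mean_obs}, \code{var_obs}) plus a Shapiro-Wilk p-value column, sorted by ascending pval
#' @export
#' @examples
#' # A minimal sketch, assuming a Sleuth object "so" whose metadata has "patient" and "tissue" columns:
#' paired_result <- sleuth_paired_t(so, "patient", "tissue")
#' table(paired_result$qval < .05)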
sleuth_paired_t <- function(sleuth_obj, pair_name, factor_name, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
t_test_results_list <- lapply(target_ids, function(target_id) { targetid_paired_t_test(sleuth_obj, target_id, pair_name, factor_name) })
t_test_results_dataframe <- as.data.frame(do.call(rbind, t_test_results_list))
t_test_results_dataframe[, 3] <- p.adjust(t_test_results_dataframe[, 2], method = "fdr")
colnames(t_test_results_dataframe) <- c("target_id", "pval", "qval", "b", "se_b", "mean_obs", "var_obs", "shapiro_wilk")
# The following makes it syntactically easier for the caller to pull out rows by target_id
rownames(t_test_results_dataframe) <- t_test_results_dataframe$target_id
# Otherwise NAs cause p-value to be considered as a factor column
t_test_results_dataframe$pval <- as.numeric(as.character(t_test_results_dataframe$pval))
t_test_results_dataframe[order(t_test_results_dataframe$pval),]
}
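# Convenience accessor: return the target_ids whose q-value passes the given FDR cutoff in an
# already-computed sleuth test, e.g. fdr_ids(so, "age") after sleuth_fit() and sleuth_wt(so, "age").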
fdr_ids <- function(sleuth_obj, test_name, fdr = 0.05, test_type = "wald") {
res <- sleuth_results(sleuth_obj, test_name, test_type = test_type)
res$target_id[which(res$qval <= fdr)]
}
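# Draw an all-against-all grid of log-scale scatter plots for the replicate samples matching the
# given design-matrix factor combination (one value per non-intercept design matrix column),
# e.g. replicate_dotplots(so, c(0, 0)) for the control condition of a two-factor model.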
replicate_dotplots <- function(sleuth_obj, design_matrix_combo) {
samples <- sleuth_obj$design_matrix
for (column_index in seq_along(design_matrix_combo)) {
# +1 on the index because of the Intercept term in the design matrix
samples <- samples[which(samples[, column_index + 1] == design_matrix_combo[column_index]),]
}
num_replicates <- nrow(samples)
data <- sleuth_obj$obs_norm_filt[which(sleuth_obj$obs_norm_filt$sample %in% rownames(samples)),]
data$log_est_counts <- log1p(data$est_counts)
layout(mat = matrix(1:(num_replicates^2), nrow = num_replicates, ncol = num_replicates, byrow = TRUE))
for (sample1 in rownames(samples)) {
for (sample2 in rownames(samples)) {
plot(data$log_est_counts[which(data$sample == sample1)], data$log_est_counts[which(data$sample == sample2)], xlab = sample1, ylab = sample2, xlim = c(0, 12), ylim = c(0, 12))
}
}
}
#' Filter low expressed transcripts by taking into account the regression model.
#'
#' @param meta the same data frame that would be passed as the first argument to sleuth_prep
#' @param model the regression model that would be passed as the second argument to sleuth_prep
#' @param row the count data for a given transcript, as fed by sleuth_prep to the \code{filter_fun} method
#' @param min_reads minimum number of reads required to consider a transcript present in a sample (default value meant to match the behaviour of Sleuth's default filter, basic_filter)
#' @param min_prop minimum proportion of samples *in a given experimental factor combination* required to consider a transcript present in the experiment (default value meant to match the behaviour of Sleuth's default basic_filter, although that looks for the proportion across ALL samples)
#'
#' @return true or false, indicating whether or not the transcript in question passes the filter criteria
#' @export
#' @examples
#' # Given a sample metadata table with the columns \code{sample}, \code{path}, \code{treatment}, and \code{sex},
#' # create a model with the two factors plus their interaction. Use a design filter so that, for example, a transcript that only shows up in treated males (1/4 of the dataset) will not be filtered out
#' # (as it would be by the default filter, which requires half of ALL samples to express the transcript).
#' model <- ~treatment*sex
#' so <- sleuth_prep(sample_meta, model, filter_fun=function(x){design_filter(sample_meta, model, x)})
design_filter <- function(meta, model, row, min_reads = 5, min_prop = 0.47) {
design_matrix <- model.matrix(model, meta)
# Keep the transcript if, for some design matrix column, every sample in one of its levels
# (i.e. in some experimental factor combination) has at least min_reads, or if the transcript
# passes Sleuth's default basic_filter across all samples.
sum(apply(design_matrix, 2, function(x) {
y <- as.factor(x)
return(max(tapply(row, y, function(f) { sum(f >= min_reads) }) / tapply(row, y, length)) == 1
|| basic_filter(row, min_reads, min_prop))
})) > 0
}