library(sleuth) # for sleuth_prep(), sleuth_results(), basic_filter(), etc.
library(dplyr) # for "tidyverse" tibble manipulation pipes %>%
# library(ggfortify) # optional: for biplot PCA+loading plots
# Standard error of the mean
std_mean <- function(x) { sd(x) / sqrt(length(x)) }
#' Add a transcript's scaled est_count value as a numeric dosage factor on top of an existing Sleuth object's model
#'
#' Can be used to correct for transcript noise/artefacts in a model when the artefact is proportional to a biomarker transcript's expression.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param metadata The metadata that was used to populate sleuth_obj
#' @param transcript_id The name of the transcript to model quantitatively
#' @return A new Sleuth object with the new regression model
#' @keywords dosage model, error correction
#' @export
#' @examples
#' # Read in metadata with "sample", "path" and other columns that can be used as regression model factors like "treatment"
#' meta <- read.table("tab_delimited_file_with_sample_and_path_columns_etc.txt", header=TRUE)
#' meta$path <- as.character(meta$path)
#' so <- sleuth_prep(meta, ~treatment)
#' so <- sleuth_fit(so)
#' pca_plot(so)
#' # The plot shows one sample far off on the X-axis, with the biggest loading vector in the
#' # X-shift direction being NM_001402 (EEF1A1, a biomarker for cell proliferation).
#' # The following call builds a new differential expression model (~treatment + NM_001402)
#' # that accounts for this artefact.
#' so_redux <- sleuth_prep_transcript_dosage(so, meta, "NM_001402")
#'
#' # Compare the model fits (lower number is better)
#' sleuth_model_rss(so)
#' sleuth_model_rss(so_redux)
sleuth_prep_transcript_dosage <- function(sleuth_obj, metadata, transcript_id) {
# Use the raw observations (then explicitly scale them) so that the order doesn't matter if we add multiple dosages in sequence
transcript_expression_level <- sleuth_obj$obs_raw %>%
filter(target_id == transcript_id) %>%
mutate(est_counts = est_counts * sleuth_obj$est_counts_sf)
new_metadata <- metadata %>%
left_join(transcript_expression_level[, c("sample", "est_counts")], by = "sample") %>%
rename({{ transcript_id }} := est_counts)
new_model <- as.formula(paste0(deparse(sleuth_obj$full_formula), " + ", transcript_id))
sleuth_prep(new_metadata, new_model)
}
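# Internal helper: run a Mann-Whitney U (Wilcoxon rank-sum) test for a single transcript,
# binarizing the numeric covariate factor_name at value_threshold. Returns c(target_id, pval,
# NA placeholder for qval); see sleuth_u() for the user-facing wrapper.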
targetid_u_test_quantitative <- function(sleuth_obj, tid, factor_name, value_threshold) {
d <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample")
# Make a new attribute per sample which is the one to be evaluated as a binary factor
d$u_group_name <- d[[factor_name]] >= value_threshold
u <- wilcox.test(est_counts ~ u_group_name, data = d)
# NA is a placeholder for the FDR-adjusted q-value filled in by the caller
c(tid, u$p.value, NA)
}
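# Internal helper: run a Kruskal-Wallis rank sum test for a single transcript across the levels
# of one categorical covariate. Returns c(target_id, pval, NA placeholder for qval); see
# sleuth_kruskal_wallace() for the user-facing wrapper.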
targetid_single_factor_kruskal_wallace_test <- function(sleuth_obj, tid, factor_name) {
d <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
# Filter first so est_counts_sf (one scale factor per sample) lines up with one row per sample,
# mirroring sleuth_prep_transcript_dosage()
mutate(est_counts = est_counts * sleuth_obj$est_counts_sf) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample")
# Make a new attribute per sample holding the single requested factor to group by
d$kw_group_name <- d[[factor_name]]
k <- kruskal.test(est_counts ~ kw_group_name, data = d)
c(tid, k$p.value, NA)
}
# Internal helper: a one-way (Welch) ANOVA for a single transcript, without the assumption of
# equal variance between groups. Returns c(target_id, pval, NA placeholder for qval); see
# sleuth_oneway_anova() for the user-facing wrapper.
targetid_one_way_anova_test <- function(sleuth_obj, tid, factor_name) {
d <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample")
# Make a new attribute per sample holding the single requested factor to group by
d$anova_group_name <- d[[factor_name]]
k <- oneway.test(est_counts ~ anova_group_name, data = d)
c(tid, k$p.value, NA)
}
#' Turn a numeric factor in a Sleuth object regression model into a binary one using a threshold value,
#' then perform the non-parametric U test (a.k.a. Mann-Whitney or Wilcoxon rank-sum test) on the binarized factor.
#'
#' For eukaryotic transcript analysis, this tends to start yielding multiple-testing-corrected p-values (FDR) < .05 once 10 or more samples
#' fall in each category (< threshold, or >= threshold).
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param factor_name The existing numeric factor to be turned into 0 or 1 based on a threshold value
#' @param value_threshold The threshold value for binarization (values for the given factor >= threshold are designated as "1", others are "0").
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table of \code{target_id}, \code{pval}, \code{qval} sorted by ascending pval
#' @keywords U test, non-parametric test, Wilcoxon test, Mann-Whitney test
#' @export
#' @examples
#' # Read in metadata with "sample", "path" and other columns that can be used as regression model factors like "age"
#' meta <- read.table("tab_delimited_file_with_sample_and_path_columns_etc.txt", header=TRUE)
#' meta$path <- as.character(meta$path)
#' so <- sleuth_prep(meta, ~treatment*age)
#' so <- sleuth_fit(so)
#' so <- sleuth_wt(so, "age")
#' wald_result <- sleuth_results(so, "age")
#' table(wald_result$qval<.05)
#' # The regression analysis shows few to no transcripts explained by age as a numeric linear dosage factor. What about <65 years old vs older?
#' wilcox_result <- sleuth_u(so, "age", 65)
#' table(wilcox_result$qval<.05)
sleuth_u <- function(sleuth_obj, factor_name, value_threshold, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
U_results_list <- lapply(target_ids, function(target_id) { targetid_u_test_quantitative(sleuth_obj, target_id, factor_name, value_threshold) })
U_results_dataframe <- as.data.frame(do.call(rbind, U_results_list))
U_results_dataframe[, 3] <- p.adjust(U_results_dataframe[, 2], method = "fdr")
colnames(U_results_dataframe) <- c("target_id", "pval", "qval")
U_results_dataframe$pval <- as.numeric(as.character(U_results_dataframe$pval))
U_results_dataframe[order(U_results_dataframe$pval),]
}
#' List all the Sleuth object \code{target_id}s whose scaled \code{est_count} is at least \code{min_count_per_sample} in every sample
#'
#' Can be used to provide a non-default target_ids parameter value in other methods of this package, effectively reducing the
#' magnitude of multiple testing correction in statistical tests by restricting the reported tests to those transcripts with replicable
#' expression levels (i.e. not potentially zero-inflated).
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param min_count_per_sample threshold that all scaled \code{est_count}s for a transcript must meet to be included in the results
#' @return a list of target_ids meeting the given criterion
#' @export
#' @examples
#' # Check the replicate dot plots for a factor combination (typically the control condition set; one value per column of the sleuth object's design_matrix, minus the intercept)
#' replicate_dotplots(so,c(0,0))
#' # Note that many genes in the replicate samples have zero expression in one sample, but up to exp(4) in another.
#' # Exclude these low-expressed genes from a U test to reduce pval inflation caused by multiple testing correction (qval/FDR)
#' sleuth_u(so, "age", 65, target_ids=sleuth_reliable_target_ids(so, exp(4)))
# If there are mapped reads in all replicates, the transcript is considered reliably expressed
sleuth_reliable_target_ids <- function(sleuth_obj, min_count_per_sample = 1) {
sleuth_obj$obs_norm_filt %>%
group_by(target_id) %>%
mutate(est_counts = est_counts * sleuth_obj$est_counts_sf) %>%
summarise(min_est_counts = min(est_counts)) %>%
filter(min_est_counts >= min_count_per_sample) %>%
pull(target_id)
}
#' Calculate the residual sum of squares (RSS) of the Sleuth object's full regression model
#'
#' @param sleuth_obj An existing Sleuth object that has been fitted with sleuth_fit()
#' @return The total RSS summed over all fitted transcripts (lower means a better fit)
#' @export
sleuth_model_rss <- function(sleuth_obj) { sum(sleuth_obj$fits$full$summary[, 2]) }
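#' Perform a per-transcript one-way (Welch) ANOVA on a single categorical factor
#'
#' Unlike a classic one-way ANOVA, the Welch test does not assume equal variance between groups.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param factor_name The categorical covariate whose levels define the groups
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table of \code{target_id}, \code{pval}, \code{qval} sorted by ascending pval
#' @export
#' @examples
#' # A minimal sketch, assuming a Sleuth object "so" whose metadata has a categorical "treatment" column:
#' anova_result <- sleuth_oneway_anova(so, "treatment")
#' table(anova_result$qval < .05)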
sleuth_oneway_anova <- function(sleuth_obj, factor_name, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
oneway_results_list <- lapply(target_ids, function(target_id) { targetid_one_way_anova_test(sleuth_obj, target_id, factor_name) })
oneway_results_dataframe <- as.data.frame(do.call(rbind, oneway_results_list))
oneway_results_dataframe[, 3] <- p.adjust(oneway_results_dataframe[, 2], method = "fdr")
colnames(oneway_results_dataframe) <- c("target_id", "pval", "qval")
oneway_results_dataframe$pval <- as.numeric(as.character(oneway_results_dataframe$pval))
oneway_results_dataframe[order(oneway_results_dataframe$pval),]
}
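#' Perform a per-transcript Kruskal-Wallis rank sum test on a single categorical factor
#'
#' A non-parametric alternative to the one-way ANOVA when normality cannot be assumed.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param factor_names The categorical covariate whose levels define the groups (despite the plural name, currently a single factor)
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table of \code{target_id}, \code{pval}, \code{qval} sorted by ascending pval
#' @export
#' @examples
#' # A minimal sketch, assuming a Sleuth object "so" whose metadata has a categorical "treatment" column:
#' kw_result <- sleuth_kruskal_wallace(so, "treatment")
#' table(kw_result$qval < .05)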
sleuth_kruskal_wallace <- function(sleuth_obj, factor_names, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
kw_results_list <- lapply(target_ids, function(target_id) { targetid_single_factor_kruskal_wallace_test(sleuth_obj, target_id, factor_names) })
kw_results_dataframe <- as.data.frame(do.call(rbind, kw_results_list))
kw_results_dataframe[, 3] <- p.adjust(kw_results_dataframe[, 2], method = "fdr")
colnames(kw_results_dataframe) <- c("target_id", "pval", "qval")
kw_results_dataframe$pval <- as.numeric(as.character(kw_results_dataframe$pval))
kw_results_dataframe[order(kw_results_dataframe$pval),]
}
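# Internal helper: run a paired two-sided t-test on log1p(est_counts) for a single transcript,
# pairing samples by pair_name across the two levels of factor_name. Returns a vector in the
# same column order as sleuth_results() output, plus a Shapiro-Wilk normality p-value for the
# per-pair log fold changes; see sleuth_paired_t() for the user-facing wrapper.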
targetid_paired_t_test <- function(sleuth_obj, tid, pair_name, factor_name) {
row_by_factor_level <- sleuth_obj$obs_norm_filt %>%
filter(target_id == tid) %>%
left_join(sleuth_obj$sample_to_covariates, by = "sample") %>%
group_split(.data[[factor_name]])
# A convoluted way to sort by an unordered factor in tibbles, ensuring the pairs stay aligned (dplyr's arrange() has conniptions here)
row_by_factor_level[[1]] <- row_by_factor_level[[1]][order(as.character(pull(row_by_factor_level[[1]], pair_name))),]
row_by_factor_level[[2]] <- row_by_factor_level[[2]][order(as.character(pull(row_by_factor_level[[2]], pair_name))),]
# This t-test assumes log-normally distributed counts, hence the log1p transform
t <- t.test(x = log1p(row_by_factor_level[[1]]$est_counts), y = log1p(row_by_factor_level[[2]]$est_counts), paired = TRUE, alternative = "two.sided")
logFCs <- log(row_by_factor_level[[2]]$est_counts / row_by_factor_level[[1]]$est_counts)
# Shapiro-Wilk normality check on the log fold changes; requires at least 3 distinct non-NaN values
shapiro <- ifelse(sum(!is.nan(logFCs)) < 3, NA, ifelse(length(unique(logFCs)) < 3, NA, shapiro.test(logFCs)$p.value))
log_counts <- log1p(row_by_factor_level[[1]]$est_counts)
# Same column order as the normal Wald-test sleuth_results() output
c(tid, t$p.value, NA, mean(logFCs), sd(logFCs), mean(log_counts), std_mean(log_counts), shapiro)
}
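#' Perform a per-transcript paired t-test on log1p-transformed counts
#'
#' Samples are paired by the \code{pair_name} covariate across the two levels of \code{factor_name}.
#' Each transcript also gets a Shapiro-Wilk p-value as a check on the normality of its log fold changes.
#'
#' @param sleuth_obj An existing Sleuth object as generated by sleuth_prep()
#' @param pair_name The covariate identifying sample pairs (e.g. a patient ID)
#' @param factor_name The two-level covariate to compare within pairs (e.g. tumour vs. normal)
#' @param target_ids The list of transcripts the test should be run for (affects multiple testing correction as well)
#' @return A table with the same leading columns as sleuth_results() (\code{target_id}, \code{pval}, \code{qval}, \code{b}, \code{se_b}, \code{mean_obs}, \code{var_obs}) plus a Shapiro-Wilk p-value column, sorted by ascending pval
#' @export
#' @examples
#' # A minimal sketch, assuming a Sleuth object "so" whose metadata has "patient" and "tissue" columns:
#' paired_result <- sleuth_paired_t(so, "patient", "tissue")
#' table(paired_result$qval < .05)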
sleuth_paired_t <- function(sleuth_obj, pair_name, factor_name, target_ids = unique(sleuth_obj$obs_norm_filt$target_id)) {
t_test_results_list <- lapply(target_ids, function(target_id) { targetid_paired_t_test(sleuth_obj, target_id, pair_name, factor_name) })
t_test_results_dataframe <- as.data.frame(do.call(rbind, t_test_results_list))
t_test_results_dataframe[, 3] <- p.adjust(t_test_results_dataframe[, 2], method = "fdr")
colnames(t_test_results_dataframe) <- c("target_id", "pval", "qval", "b", "se_b", "mean_obs", "var_obs", "shapiro_wilk")
# The following makes it syntactically easier for the caller to pull out rows by target_id
rownames(t_test_results_dataframe) <- t_test_results_dataframe$target_id
# Otherwise NAs cause p-value to be considered as a factor column
t_test_results_dataframe$pval <- as.numeric(as.character(t_test_results_dataframe$pval))
t_test_results_dataframe[order(t_test_results_dataframe$pval),]
}
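# Convenience accessor: return the target_ids whose q-value passes the given FDR cutoff in an
# already-computed sleuth test, e.g. fdr_ids(so, "age") after sleuth_fit() and sleuth_wt(so, "age").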
fdr_ids <- function(sleuth_obj, test_name, fdr = 0.05, test_type = "wald") {
res <- sleuth_results(sleuth_obj, test_name, test_type = test_type)
res$target_id[which(res$qval <= fdr)]
}
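# Draw an all-against-all grid of log-scale scatter plots for the replicate samples matching the
# given design-matrix factor combination (one value per non-intercept design matrix column),
# e.g. replicate_dotplots(so, c(0, 0)) for the control condition of a two-factor model.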
replicate_dotplots <- function(sleuth_obj, design_matrix_combo) {
samples <- sleuth_obj$design_matrix
for (column_index in seq_along(design_matrix_combo)) {
# +1 on the index because of the Intercept term in the design matrix
samples <- samples[which(samples[, column_index + 1] == design_matrix_combo[column_index]),]
}
num_replicates <- nrow(samples)
data <- sleuth_obj$obs_norm_filt[which(sleuth_obj$obs_norm_filt$sample %in% rownames(samples)),]
data$log_est_counts <- log1p(data$est_counts)
layout(mat = matrix(1:(num_replicates^2), nrow = num_replicates, ncol = num_replicates, byrow = TRUE))
for (sample1 in rownames(samples)) {
for (sample2 in rownames(samples)) {
plot(data$log_est_counts[which(data$sample == sample1)], data$log_est_counts[which(data$sample == sample2)], xlab = sample1, ylab = sample2, xlim = c(0, 12), ylim = c(0, 12))
}
}
}
#' Filter low expressed transcripts by taking into account the regression model.
#'
#' @param meta the same data frame that would be passed as the first argument to sleuth_prep
#' @param model the regression model that would be passed as the second argument to sleuth_prep
#' @param row the count data for a given transcript, as fed by sleuth_prep to the \code{filter_fun} method
#' @param min_reads minimum number of reads required to consider a transcript present in a sample (default value meant to match the behaviour of Sleuth's default filter, basic_filter)
#' @param min_prop minimum proportion of samples *in a given experimental factor combination* required to consider a transcript present in the experiment (default value meant to match the behaviour of Sleuth's default basic_filter, although that looks for the proportion across ALL samples)
#'
#' @return true or false, indicating whether or not the transcript in question passes the filter criteria
#' @export
#' @examples
#' # Given a sample metadata table with the columns \code{sample}, \code{path}, \code{treatment}, and \code{sex},
#' # create a model with the two factors plus their interaction. Use a design filter so that, for example, a transcript that only shows up in treated males (1/4 of the dataset) will not be filtered out
#' # (as it would be by the default filter, which requires half of ALL samples to express the transcript).
#' model <- ~treatment*sex
#' so <- sleuth_prep(sample_meta, model, filter_fun=function(x){design_filter(sample_meta, model, x)})
design_filter <- function(meta, model, row, min_reads = 5, min_prop = 0.47) {
design_matrix <- model.matrix(model, meta)
# Keep the transcript if, for some design matrix column, every sample in one of its levels
# (i.e. in some experimental factor combination) has at least min_reads, or if the transcript
# passes Sleuth's default basic_filter across all samples.
sum(apply(design_matrix, 2, function(x) {
y <- as.factor(x)
return(max(tapply(row, y, function(f) { sum(f >= min_reads) }) / tapply(row, y, length)) == 1
|| basic_filter(row, min_reads, min_prop))
})) > 0
}