#### Statisitc related wranglers ####
#' Creates a much nicer formatted design matrix than the default model.matrix
#'
#' I made this function because the way the model.matrix function
#' creates a desing matrix, can cause issues with differential
#' gene expression tools, such as edgeR, DESeq2,
#' and limma. If the intercept column is included,
#' it first removes the parens and parens from the
#' intercept column and then removing the variable
#' name from the colnames and replaces them with
#' factor level vs reference level syntax. If the
#' design is specified without an intercept, variable
#' concatentation is simply removed.
#'
#' @param ... formula with outcome as first variable
#' @param data data.frame
#' @param show_warnings logical whether to show wanrings
#'
#' @return model.matrix
#' @export
#'
#' @importFrom limma strsplit2
#' @importFrom stats model.matrix
#'
#'
#' @examples
better_model_matrix <- function(..., data, show_warnings=TRUE){
m <- model.matrix(..., data=data)
# Getting variables (columns) from formula
mod_str <- deparse(...)
var_str <-
strsplit2(mod_str, '\\+') %>%
drop(.) %>%
trimws(.) %>%
gsub('~', '', .) #%>%
# cleaning model matrix
if(var_str[1] == "0"){
# checks if intercept is 1st term
outcome_var <- var_str[2]
} else {
outcome_var <- var_str[1]
}
if(show_warnings){
w <- paste0('\n', outcome_var, ' used as outcome variable.\nIf this is incorrect, reorder the formula so that the outcome variable comes first.' )
warning(w)
}
df_str <- deparse(substitute(data))
colnames(m) <-
gsub(outcome_var, '', colnames(m)) %>%
# gsub(df_str, '', .) %>%
gsub('\\(', '', .) %>%
gsub('\\)', '', .)
model_has_intercept <-
colnames(m) %>%
grepl('Intercept', ., ignore.case = TRUE) %>%
any(.)
if(model_has_intercept){
colnames_to_prettify <- levels(data[,outcome_var])
ref_level <- colnames_to_prettify[1]
colnames_to_prettify <- colnames_to_prettify[-1]
stopifnot(all(colnames_to_prettify %in% colnames(m)))
pretty_colnames <- sapply(colnames_to_prettify, function(x){
paste0(x, '_vs_', ref_level)
})
col_idxs <- match(colnames_to_prettify, colnames(m))
colnames(m)[col_idxs] <- pretty_colnames
}
return(m)
}
#' Selects the top n genes by variance
#'
#' @param expr_mat expression mat
#' @param n_genes number of genes to extract
#'
#' @return gene matrix
#' @export
#'
#' @importFrom matrixStats rowVars
#'
#' @examples
select_genes_by_variance <- function(expr_mat, n_genes=5000){
rv_idx <- rowVars(expr_mat) %>%
order(., decreasing = TRUE)
rv_idx <- rv_idx[1:n_genes]
filt_expr_mat <- expr_mat[rv_idx,]
return(filt_expr_mat)
}
#' Gene expression filter by quantile
#'
#' This function first removes rows in which
#' the max value equals the min value and
#' then filters the resulting genes by row
#' medians. Rows with a median greater than
#' the resulting value of the quantile are
#' retained.
#'
#' @param expr_mat an expression matrix of class matrix
#' @param quant_val quantile threshold
#'
#' @return filtered by quantile expresion matrix
#' @export
#'
#' @importFrom stats quantile
#' @importFrom Biobase rowMax rowMedians
#'
#' @examples
gene_quantile_filter <- function(expr_mat, quant_val=.05){
rm <- rowMax(expr_mat)
qthresh <- quantile(rm, quant_val)
filt_mat <- expr_mat[rm > qthresh,]
filt_mat <- filt_mat[rowMedians(filt_mat) > qthresh,]
return(filt_mat)
}
#' Creates matrix of overlap ratios
#'
#' This function creates a matrix of the ratio
#' of overlapping genesets. It takes a list of genes
#' or alternatively 2 gene lists
#'
#' @param l1 gene list
#' @param l2 optional 2nd gene list
#'
#' @return top diagonal matrix of overlap ratios
#' @export
#'
#' @examples
#' \dontrun{
#' # returns a self (l1 vs l1) overlap matrix
#' genes <- paste0('gene', seq(1, 100))
#' set.seed(42)
#' n_genes_to_sample <- rnbinom(10, 10, .5)
#' l1 <- lapply(seq_along(n_genes_to_sample), function(i){
#' n <- n_genes_to_sample[i]
#' sample(genes, n)
#' }) %>%
#' setNames(., paste0('L1_', LETTERS[1:10]))
#' omat <- get_overlap_matix(l1)
#' }
#'
get_overlap_matix <- function(l1, l2=NULL){
if(is.null(l2)){
l2 <- l1
}
l1_nms <- names(l1)
l2_nms <- names(l2)
w <- matrix(NA, nrow=length(l1_nms), ncol=length(l2_nms))
rownames(w) <- l1_nms
colnames(w) <- l2_nms
for (i in 1:nrow(w)) {
for (j in 1:ncol(w)) {
w[i,j] = overlap_ratio(l1[[i]], l2[[j]])
}
}
return(w)
}
#' Extract genes sets from a pathway results data.frame
#'
#' pathway results data.frame with a column
#' of gene sets seperated by a delimiter
#'
#' @param path_df data.frame with gene sets
#' @param delim delimeter seperating genes (default: '/')
#'
#' @return list with names as pathway ids
#' @export
#'
#' @examples
get_geneSets <- function(path_df, delim="/"){
nms <- path_df$ID
gene_sets <- strsplit(path_df$geneID, delim, fixed=TRUE)
names(gene_sets) <- nms
return(gene_sets)
}
#' get results from limma fit object
#'
#' Takes a limma fit object created using either \link[limma]{eBayes} or
#' \link[limma]{treat} and returns a long data.frame of the results.
#' If the `fit` object contains `lods`, then results are obtained using
#' \link[limma]{topTable}. Else, if the fit object contains a `treat.lfc`
#' topTreat is used. If no coefficients are passed, it tries to guess
#' which are the relevant ones checking which are made up of 1's and 0's.
#'
#' @param limma_fit fit object produced by limma
#' @param coefs coefficients to return
#' @param print_summary logical, whether print summary of the results
#'
#' @return data.frame of results in long(ish) format
#' @export
#'
#' @import tibble dplyr
#' @importFrom limma topTable
#'
#'
#' @examples
get_limma_results <- function(limma_fit, coefs=NULL, print_summary=TRUE){
#TODO add tests
#TODO think about handling levels with whitespace
#TODO maybe automagically figure type of gene annots
d <- limma_fit$design
if(is.null(coefs)){
d <- limma_fit$design
one_hot_cols <- sapply(seq(1,ncol(d)), function(i){
all(d[,i] %in% c(1,0))
})
coefs <- colnames(d)[one_hot_cols]
coefs <- coefs[!grepl('intercept', coefs, ignore.case = T)]
}
res_list <- lapply(coefs, function(x){
if('lods' %in% names(limma_fit)){
cur_res <-
topTable(limma_fit, coef = x, number = Inf, confint=TRUE)
}
if('treat.lfc' %in% names(limma_fit)){
cur_res <-
topTreat(limma_fit, coef = x, number = Inf, confint=TRUE)
}
cur_res <- cur_res %>%
tibble::rownames_to_column(., 'ensembl_gene_id') %>%
cbind(coefficient=x, .)
}) %>% do.call(rbind, .)
if(print_summary){
expression_summaries(res_list) %>%
print
}
return(res_list)
}
#' Returns DESeq2 results in long format
#'
#' @param dds_object DDS object after running DESeq
#' @param use_shrinkage logical whether to perform lfcshrinkage
#' @param gene_annots Gene annotations which if not given are assumed
#'
#' @return data.frame of results in long(ish) format
#' @export
#'
#' @importFrom SummarizedExperiment mcols
#' @importFrom DESeq2 resultsNames results
#'
#' @examples
get_deseq2_results <- function(dds_object, gene_annots=NULL, use_shrinkage=FALSE){
if(is.null(gene_annots)){
gene_annots <-
mcols(dds_object) %>%
data.frame(.) %>%
dplyr::select_if(., is.character)
warning_msg <- paste0('gene annots not given.\nUsing: ',
paste(names(gene_annots), collapse = ', '),
' for gene annotations')
warning(warning_msg)
}
# Get results by coefs which
# have `_vs_` in them
results_list <- list()
res_names <- DESeq2::resultsNames(dds_object)
res_names <- res_names[grepl('_vs_', res_names)]
for(i in res_names){
# Checks whether to get results via shrinkage
if(use_shrinkage){
temp_res <- lfcShrink(dds_object,
coef = i,
type = 'normal',
parallel=TRUE)
} else if(!use_shrinkage){
temp_res <- results(dds_object, name = i)
}
temp_res <-
temp_res %>%
data.frame(.) %>%
rownames_to_column(., var = "ensembl_gene_id") %>%
data.frame(coefficient=i, .)
results_list[[i]] <- temp_res
}
names(results_list) <- NULL
res_df <- do.call(rbind,results_list)
rn <- res_df$ensembl_gene_id
res_df$row <- NULL
sorted_annots <- gene_annots[match(rn, row.names(gene_annots)),,drop=FALSE]
out_df <- data.frame(ensembl_gene_id=rn, sorted_annots, res_df)
row.names(out_df) <- NULL
return(out_df)
}
#' Returns summaries of gene expression
#'
#'
#' This just summarizes the number of significant up & down
#' regulated genes per group
#'
#' @param long_res_table res df in long format
#' @param log2_thresh threshold to genes by lfc
#' @param logFC_col str of logFC_col
#' @param padj_col str of padj_col
#'
#' @return table summary
#' @export
#'
#' @examples
expression_summaries <- function(long_res_table, log2_thresh=0, logFC_col = NULL, padj_col=NULL){
#TODO fix hardcoding of coefficient
if(is.null(logFC_col)){
potential_cols <- c('log2FoldChange','logFC')
logFC_col <- na.omit(match(potential_cols, colnames(long_res_table)) )
colnames(long_res_table)[logFC_col] <- 'log2FoldChange'
}
if(is.null(padj_col)){
potential_cols <- c('padj','adj.P.Val')
logFC_col <- na.omit(match(potential_cols, colnames(long_res_table)) )
colnames(long_res_table)[logFC_col] <- 'padj'
}
group_sums <-
long_res_table %>%
dplyr::filter(padj < .05) %>%
dplyr::filter(.,abs(log2FoldChange) >= log2_thresh) %>%
group_by(., coefficient) %>%
summarise(.,
up=sum(log2FoldChange > 0),
down=sum(log2FoldChange < 0))
return(group_sums)
}
#### Statistical functions ####
#' Function performs lm over accross a number of genes
#'
#' @param expr_mat matrix of expression values
#' @param p_data matrix with sample data
#' @param model_formula a string of the form: y ~ x + o
#'
#' @return lm list
#' @export
#'
#' @examples
basic_linear_regression <- function(expr_mat, p_data, model_formula){
if(class(model_formula) != 'character'){
stop('model_formula needs to be a string of the form y ~ + x1 + x2...')
}
expr_mat <- data.frame(expr_mat[row.names(p_data),], check.names = F)
l <- lapply(expr_mat, function(x){
tmp <- summary(lm(as.formula(model_formula), data=p_data))
})
out_df <- lmlist_to_df(l)
# nms <- names(l)
# x_list <- lapply(l, function(x){
# x <- x$coefficients[2,]
# })
# out_df <- do.call(rbind, x_list)
# row.names(out_df) <- nms
return(out_df)
}
#' Performs logistic regression with numerous Y vals
#'
#' @param expr_mat matrix of expression values
#' @param p_data matrix with sample data
#' @param model_formula a string of the form: y ~ x + o
#'
#' @return lm list
#' @export
#'
#' @examples
basic_logistic_regression <- function(expr_mat, p_data, model_formula){
### Args:
### expr_mat: matrix of expression values
### p_data: matrix with sample data
### model_formula: needs to be y ~ x + o
### y is outcome var, x is gene, o is other covariates
if(class(model_formula) != 'character'){
stop('model_formula needs to be a string of the form y ~ + x1 + x2...')
}
expr_mat <- data.frame(expr_mat, check.names = F)
l <- lapply(expr_mat, function(x){
tmp <- summary(glm(as.formula(model_formula), family = 'binomial', data=p_data))
tmp <- coefficients(tmp)
if(nrow(tmp) == 1){
temp_row <- matrix(rep(NA, 4), nrow = 1)
colnames(temp_row) <- colnames(tmp)
x <- temp_row
} else {
x <- tmp[2,]
}
})
res_logreg <- data.frame(do.call(rbind, l), check.names = F)
colnames(res_logreg) <- c("Estimate","Std_Error","z_or_t_value", "p_value" )
row.names(res_logreg) <- names(l)
return(res_logreg)
}
#' Compare the sample contributions according to their annotation level across the components.
#'
#' Wilcoxon or Kruskal-Wallis tests are performed depending on the number of levels in the considered annotation.
#' @title Comparison of distributions of sample groups
#' @param A A matrix of dimensions 'samples x components' containing the sample contributions
#' @param annot A matrix of dimensions 'samples x variables' containing the sample annotations
#' @param colAnnot The name of the column of \code{annot} to be considered
#' @return A vector of p-values
#' @author Anne Biton
#' @seealso \code{wilcox.test}, \code{kruskal.test}
#' @keywords external
#' @import foreach
#' @import doParallel
wilcoxOrKruskalOnA <- function (A, colAnnot, annot) {
comp <- NULL
A <- A[rownames(annot),]
res_tests <- foreach::foreach(comp=as.list(A),.combine = c, .errorhandling = "stop") %dopar% {
annotComp <- data.frame(comp=comp)
annotComp[[colAnnot]] <- as.factor(annot[[colAnnot]])
if (length(levels(annotComp[[colAnnot]])) == 2)
res.test <- wilcox.test(comp~eval(as.name(colAnnot)), data = annotComp, na.action = "na.omit")
else
res.test <- kruskal.test(comp~eval(as.name(colAnnot)), data = annotComp, na.action = "na.omit")
return(res.test$p.value)
}
return(unlist(res_tests))
}
#### Distance functions ####
#' Calculates the overlap coefficient of 2 sets
#'
#'
#' @param x set vector
#' @param y set vector
#'
#' @return overlap ratio/coefficient
#' @export
#'
#'
overlap_ratio <- function(x, y, check_sets=TRUE) {
if(check_sets){
stopifnot("x contains duplicated values"=!any(unclass(table(x)) > 1))
stopifnot("y contains duplicated values"=!any(unclass(table(y)) > 1))
}
numer <- length(intersect(x, y))
denom <- min(length(x), length(y))
return(numer/denom)
}
#' Calculates the jaccard distance between 2 named sets
#'
#' @param x set vector
#' @param y set vector
#'
#' @return jaccard distance
#' @export
#'
#' @examples
jaccard <- function(x, y, check_sets=TRUE){
if(check_sets){
stopifnot("x contains duplicated values"=!any(unclass(table(x)) > 1))
stopifnot("y contains duplicated values"=!any(unclass(table(y)) > 1))
}
numer <- length(intersect(x, y))
denom <- length(union(x, y))
return(numer/denom)
}
#' Calculates the Dice coefficient
#'
#' @param x
#' @param y
#' @param check_sets
#'
#' @return
#' @export
#'
#' @examples
dice <- function(x, y, check_sets=TRUE){
if(check_sets){
stopifnot("x contains duplicated values"=!any(unclass(table(x)) > 1))
stopifnot("y contains duplicated values"=!any(unclass(table(y)) > 1))
}
numer <- 2*length(intersect(x, y))
denom <- length(x) + length(y)
return(numer/denom)
}
#' Order DNA sequences by Levenshtein edit distance
#'
#' This function takes a vector of DNA sequences, computes pairwise distances
#' between them, and orders the sequences based on their similarity using
#' hierarchical clustering.
#'
#' @param dna_strings A character vector containing DNA sequences (A, C, G, T).
#'
#' @return A character vector of DNA sequences ordered by similarity.
#'
#' @examples
#' dna_strings <- c("ACTG", "ACCG", "ATGC", "AGTC")
#' ordered_dna <- order_by_similarity(dna_strings)
#' print(ordered_dna)
#'
#' @importFrom Biostrings DNAStringSet stringDist
#' @export
order_by_similarity <- function(dna_strings) {
dna_set <- DNAStringSet(dna_strings)
dist_matrix <- stringDist(dna_set)
hc <- hclust(as.dist(dist_matrix))
ordered_indices <- hc$order
# Return the DNA strings in the new order
ordered_dna_strings <- dna_strings[ordered_indices]
return(ordered_dna_strings)
}
#### TODO Functions ####
#' Split names by delim character
#'
#' @param df a data.frame
#' @param delim character to split names by
#' @param split_item index of split string to keep
#' @param remove_space should whitespace be replaced with "_" default: TRUE
#'
#' @return vector of names
#' @keywords internal
#'
#' @importFrom limma strsplit2
#'
#' @examples
.split_fix_names <- function(df, delim=':', split_item=1, remove_space=TRUE){
nms <- strsplit2(names(df), delim)[,split_item]
if(remove_space){
nms <- gsub(' ', '_',nms)
}
return(nms)
}
#' Title
#'
#' @param n_genes
#' @param m_samples
#' @param seed
#' @param groups
#'
#' @return
#' @keywords internal
#'
#' @import DESeq2
#'
#' @examples
.create_test_dds <- function(n_genes=500, m_samples=60, seed=42, groups=NULL){
set.seed(seed)
dds <- makeExampleDESeqDataSet(n=n_genes, m = m_samples)
mcols(dds) <- NULL
mcols(dds) <- data.frame(gene_id=names(dds),
row.names=names(dds))
if(is.null(groups)){
groups <-
c(rep('A', m_samples/3),
rep('B', m_samples/3),
rep('C', m_samples/3)) %>%
factor(.)
}
colData(dds) <- DataFrame(condition=groups,
row.names = colnames(dds))
return(dds)
}
#' Turn summary object from lm into a data.frame
#'
#' @param lmlist lm summary object
#' @param coef the coefficient to extract
#'
#' @return data.frame with coefficient
#' @keywords internal
#'
#' @examples
.lmlist_to_df <- function(lmlist, coef=2){
rnms <- try(limma::strsplit2(names(lmlist), ' ')[,2], silent = T) # gets rid of extranous name
if(class(rnms) == 'try-error'){
rnms <- names(lmlist)
}
cnms <- c("Estimate","Std_Error","z_or_t_value", "p_value" )
coef_list <- lapply(lmlist, function(x){
x <- x$coefficients[coef,]
})
out_df <- data.frame(do.call(rbind, coef_list))
row.names(out_df) <- rnms
colnames(out_df) <- cnms
return(out_df)
}
#' Gets lfc from a pathway res df
#'
#' This returns the lfc from a genelist as a
#' named vector of genes by lfc
#'
#' @param gene_list list of gene vectors
#' @param res_df long res df table
#'
#' @return
#' @keywords internal
#'
#' @examples
.get_lfc <- function(gene_list, res_df){
ol <- lapply(gene_list, function(x){
lfc <- res_df$logFC[match(x, res_df$external_gene_name)] %>%
setNames(., x)
})
names(ol) <- names(gene_list)
return(ol)
}
#' Split a named vector of DEGs by fold change
#'
#' @param lfc_list DEG vector of FCs
#'
#' @return
#' @keywords internal
#'
#' @examples
.split_by_reg <- function(lfc_list){
split_degs <- list(
sort(lfc_list[lfc_list > 0, drop=F], decreasing = TRUE),
sort(lfc_list[lfc_list < 0, drop=F]))
names(split_degs) <- c('upreg', 'downreg')
return(split_degs)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.