#' Fast linear regression
#'
#' Calculated associtation gene counts matrix with the pehonotype where each of gene counts on aexposure
#'
#' @param count_matrix A matrix of gene counts (possibly transformed). rows are genes, columns are individuals
#' @param pheno A data frame of phenotype, includes the trait and covariates.
#' @param trait A character, the name of the exposure variable. The trait should be a column in pheno.
#' @param covariates_string Characters string with specifying the covariats, include "as.factor" statements. example: covariate_string = "age,as.factor(sex)"
#' @param log_transform One of the transformations log_replace_half_min, log_add_min, log_add_0.5, or NULL (default)
#' @param gene_IDs Vector of selection of geneID, NULL if all genes are tested
#' @return Linear regression results as a data frame with columns geneID, adjLogFC,se,t_stat (t-statistic),t_stat_df(degree of freedom),p_value,fdr_bh
#' @examples
#' library(dplyr)
#' data(rnaseq_count_matrix)
#' rnaseq_count_matrix<- rnaseq_count_matrix[rowSums(rnaseq_count_matrix)>0,]
#' data(phenotype)
#' trait<-"Trait.1"
#' covars<-"Age+Sex"
#' log_transform<-"log_replace_half_min"
#' lm_count_mat(count_matrix=rnaseq_count_matrix,pheno=phenotype,trait=trait,
#' covariates_string=covars, log_transform=log_transform)
#' @export
#'
lm_count_mat <-function(count_matrix, pheno, trait, covariates_string,
gene_IDs=NULL, log_transform = "log_replace_half_min"){
stopifnot(colnames(count_matrix) == rownames(pheno), is.element(trait, colnames(pheno)))
count_matrix <- as.matrix(count_matrix)
# if gene_IDs are (or is) provided, filter count_matrix to the requested genes
if(!is.null(gene_IDs)) {
count_matrix <- filter_by_genes(count_matrix, gene_IDs)
}
count_matrix <- log_transform_count(count_matrix, transform = log_transform)
count_matrix <- t(count_matrix)
#covariates_string<- as.character(covariates_string)
#covars<- unlist(strsplit(covariates_string, ","))
model_string <- paste0(covariates_string,"+",trait)
XX<-model.matrix(as.formula(paste0("~", model_string)), data=pheno)
if (nrow(XX) != nrow(count_matrix)){
message(paste( nrow(count_matrix) - nrow(XX), "samples have missing phenotypes, using", nrow(XX), "samples"))
count_matrix <- count_matrix[rownames(XX),]
}
XtXinv <- solve(t(XX) %*% as.matrix(XX))
XtXinv_se_arg <- sqrt(XtXinv[trait,trait])
numExplan <-ncol(XX)
XXproj <- XtXinv %*% t(XX)
betas_mat <- XXproj %*% count_matrix
betas <- betas_mat[trait,]
resid_Ys <- count_matrix - XX %*% XXproj %*% count_matrix
sum_squares_resids <- colSums(resid_Ys^2)
sigmas_square <- sum_squares_resids/(nrow(count_matrix)-numExplan)
se_betas <- sqrt(sigmas_square)*XtXinv_se_arg
test_stats <- betas/se_betas
t_stat_df <- nrow(count_matrix) - numExplan
t_pval <- 2*pt(abs(test_stats), lower.tail=FALSE, df = t_stat_df)
res <- data.frame(geneID = colnames(count_matrix), adjLogFC = betas, se = se_betas,
t_stat = test_stats,t_stat_df=t_stat_df, p_value = t_pval)
rownames(res)<-NULL
res<- res %>% mutate(fdr_bh= p.adjust(p_value, method = "BH"))
return(res)
}
#' Wrapper function for differential expression analysis
#'
#' The function is to calculate DEG (Differential Expression Genes) analysis using residual permuation approach to calculate empirical p-value
#'
#' @param count_matrix A matrix of gene counts (possibly transformed). rows are genes, columns are individuals
#' @param pheno A data frame of phenotype data, includes the trait and covariates
#' @param trait A character, the name of the exposure variable. The trait should be a column in pheno
#' @param covariates_string Characters string with specifying the covariats, include "as.factor" statements. example: covariate_string = "age+as.factor(sex)"
#' @param log_transform One of the transformations log_replace_half_min, log_add_min, log_add_0.5, or NULL (default)
#' @param gene_IDs A vector of selection of geneID, NULL if all genes are tested
#' @param n_permute number of computing residual permutation. Default is 100 times
#' @param seed Random seed
#' @param outcome_type continuous and binary.Default is continuous
#' @return Linear regression results as a data frame with columns geneID, adjLogFC,se,t_stat (t-statistic),t_stat_df(degree of freedom),p_value, fdr_bh,,
#' emp_pvals,bh_emp_pvals
#' @examples
#' set.seed(123)
#' library(dplyr)
#' data(phenotype)
#' data(rnaseq_count_matrix)
#' rnaseq_count_matrix<- rnaseq_count_matrix[rowSums(rnaseq_count_matrix)>0,]
#' trait<-"Trait.1"
#' covars<- "Age+Sex"
#' lm_count_mat_emp_pval(count_matrix=rnaseq_count_matrix,pheno = phenotype,trait = trait,
#' covariates_string=covars, outcome_type="continuous")
#' @export
lm_count_mat_emp_pval <-function(count_matrix, pheno, trait, covariates_string,
n_permute=100,
gene_IDs=NULL,
log_transform = "log_replace_half_min",
seed = NULL,
outcome_type="continuous"){
if (!is.null(seed)) set.seed(seed)
stopifnot(colnames(count_matrix) == rownames(pheno), is.element(trait, colnames(pheno)))
model_string <- paste0(covariates_string,"+",trait)
XX<-model.matrix(as.formula(paste0("~", model_string)), data=pheno)
if (nrow(XX) != ncol(count_matrix)){
message(paste( ncol(count_matrix) - nrow(XX), "samples have missing phenotypes, using", nrow(XX), "samples"))
count_matrix <- count_matrix[,rownames(XX)]
pheno <- pheno[rownames(XX),]
}
deg <- lm_count_mat(count_matrix=count_matrix,
pheno=pheno,
trait=trait,
covariates_string=covariates_string,
gene_IDs=gene_IDs,
log_transform = log_transform)
message("Performing residual permutation to generate permuted trait...")
permuted_trait<-sapply(seq_len(n_permute), function(x){
permute_resids_trait(pheno = pheno,
trait= trait, seed = seed,
covariates_string = covariates_string,outcome_type = outcome_type)
})
message(paste("performing differential expression analysis on", n_permute, "permuted traits"))
permute_val<-lapply(seq_len(n_permute), function(y){
pheno$perm_trait <- permuted_trait[,y]
deg_perm <- lm_count_mat(count_matrix=count_matrix,
pheno=pheno,
trait="perm_trait",
covariates_string=covariates_string,
gene_IDs=gene_IDs,
log_transform = log_transform)
null_pval<-deg_perm[,"p_value"]
})
null_pval<-unlist(permute_val)
head(null_pval)
stopifnot(length(null_pval)==n_permute*nrow(deg))
message("Computing quantile empirical p-values")
emp_pvals <- compute_quantile_empirical_pvalues(statistics = deg[,"p_value"],
null_statistics = as.numeric(null_pval))
deg$emp_pvals <- emp_pvals
deg<- deg %>%mutate(bh_emp_pvals=p.adjust(emp_pvals, method = "BH"))
return(deg)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.