#' Linear mixed effects models with kinship for RNA-seq
#'
#' Run lmerel (with kinship) and/or the corresponding lm or lme models (without kinship) of gene expression in RNA-seq data
#' @param dat EList object output by voom( ). Must contain counts (dat$E) and meta (dat$targets). Optionally also contains gene metadata (dat$genes) and weights (dat$weights)
#' @param kin Matrix with pairwise kinship values between individuals. Must be numeric with rownames.
#' @param patientID Character of variable name to match dat$targets to kinship row and column names.
#' @param libraryID Character of variable name to match dat$targets to dat$E colnames
#' @param counts Matrix of normalized expression. Rows are genes, columns are libraries.
#' @param meta Matrix or data frame of sample and individual metadata.
#' @param genes Optional matrix or data frame of gene metadata.
#' @param weights Optional matrix or data frame of gene specific weights. Usually calculated with limma::voomWithQualityWeights().
#' @param subset_var Character list of variable name(s) to filter data by.
#' @param subset_lvl Character list of variable value(s) or level(s) to filter data to. Must match order of subset_var
#' @param subset_genes Character vector of genes to include in models.
#' @param model Character vector of model starting with ~. Should include (1|patientID) if mixed effects will be run
#' @param use_weights Logical if gene specific weights should be used in model. Default is FALSE
#' @param run_lm Logical if should run lm model without kinship
#' @param run_lme Logical if should run lme model without kinship
#' @param run_lmerel Logical if should run lmerel model with kinship
#' @param run_contrast Logical if should run pairwise contrasts. If no matrix provided, all possible pairwise comparisons are completed.
#' @param contrast_var Character vector of variable in model to run contrasts of. Interaction terms must be specified as "var1:var2". If NULL (default), all contrasts for all variables in the model are run
#' @param metrics Logical if should calculate model fit metrics such as AIC, BIC, R-squared. Default is FALSE
#' @param processors Numeric. Number of processors to run in parallel. Default is 2 less than the total available
#' @param p_method Character of FDR adjustment method. Values as in p.adjust( )
#' @param genotype_name Character string. Used internally for kmFit_eQTL
#' @param run.lmekin Deprecated. Please use run_lmerel
#' @param subset.var Deprecated form of subset_var
#' @param subset.lvl Deprecated form of subset_lvl
#' @param subset.genes Deprecated form of subset_genes
#' @param use.weights Deprecated form of use_weights
#' @param run.lm Deprecated form of run_lm
#' @param run.lme Deprecated form of run_lme
#' @param run.lmerel Deprecated form of run_lmerel
#' @param run.contrast Deprecated form of run_contrast
#' @param contrast.var Deprecated form of contrast_var
#' @param p.method Deprecated form of p_method
#' @return List of data frames including
#' - lm/lme/lmerel: model estimates and significance
#' - *.contrast: model estimates and significance for pairwise contrasts with variables in the original model
#' - *.fit: model fit metrics such as sigma, AIC, BIC, R-squared (optional with metrics parameter)
#' - *.error: error messages for genes that failed model fitting
#' @importFrom foreach %dopar%
#' @export
#' @examples
#' # All samples and all genes
#' ## Not run
#' # kmFit(dat = example.voom,
#' # kin = example.kin, run_lmerel = TRUE,
#' # model = "~ virus + (1|ptID)")
#' # Subset samples and genes
#' ## Weights are not used here (use_weights = FALSE)
#' kmFit(dat = example.voom,
#' run_lm = TRUE, use_weights = FALSE,
#' subset_var = list("asthma"), subset_lvl = list(c("asthma")),
#' subset_genes = c("ENSG00000250479","ENSG00000250510","ENSG00000255823"),
#' model = "~ virus + (1|ptID)")
#' # Pairwise contrasts
#' ## Continuous interaction
#' kmFit(dat = example.voom,
#' run_lme = TRUE, run_contrast = TRUE,
#' subset_genes = c("ENSG00000250479","ENSG00000250510","ENSG00000255823"),
#' model = "~ virus + asthma * median_cv_coverage + (1|ptID)",
#' contrast_var=c("asthma:median_cv_coverage"))
#' ## Categorical interaction
#' kmFit(dat = example.voom, kin = example.kin,
#' run_lmerel = TRUE, run_contrast = TRUE, metrics=TRUE,
#' subset_genes = c("ENSG00000250479","ENSG00000250510","ENSG00000255823"),
#' model = "~ virus*asthma + (1|ptID)",
#' contrast_var=c("virus:asthma"))
#' # Model with failed genes
#' kmFit(dat = example.voom,
#' kin = example.kin, run_lmerel = TRUE, run_lm = TRUE,
#' subset_genes = c("ENSG00000250479","ENSG00000250510","ENSG00000255823"),
#' model = "~ virus*asthma + lib.size + norm.factors + median_cv_coverage + ptID + (1|ptID)")
#' # Non-dat data
#' kmFit(counts = example.voom$E, meta = example.voom$targets,
#' run_lm = TRUE, use_weights = FALSE,
#' subset_genes = c("ENSG00000250479","ENSG00000250510","ENSG00000255823"),
#' model = "~ virus + (1|ptID)")
#' # Three level variable
#' example.voom$targets$lvl <- rep(c("A","B","C"), length(example.voom$targets$libID)/3)
#' kmFit(dat = example.voom,
#' run_lme= TRUE, run_contrast = TRUE,
#' subset_genes = c("ENSG00000250479","ENSG00000250510","ENSG00000255823"),
#' model = "~ lvl + (1|ptID)")
kmFit <- function(dat=NULL, kin=NULL, patientID="ptID", libraryID="libID",
counts=NULL, meta=NULL, genes=NULL, weights=NULL,
subset_var = NULL, subset_lvl = NULL, subset_genes = NULL,
model, use_weights=FALSE,
run_lm = FALSE, run_lme = FALSE, run_lmerel = FALSE,
metrics = FALSE,
run_contrast = FALSE, contrast_var = NULL,
processors = NULL, p_method = "BH",
run.lmekin = NULL,
subset.var = NULL, subset.lvl = NULL, subset.genes = NULL,
run.lm = FALSE, run.lme = FALSE, run.lmerel = FALSE,
run.contrast = FALSE, contrast.var = NULL,
p.method = NULL){
rowname <- libID <- variable <- statistic <- df <- pval <- group <- gene <- V1 <- V2 <- combo <- term <- p.value <- estimate <- contrast <- contrast.i <- weights.gene <- FDR <- contrast_ref <- contrast_lvl <- NULL
#Back compatibility
message("WARNING: Arguments with '.' have been deprecated. Please use '_' versions.")
if(!is.null(subset.var)){subset_var <- subset.var}
if(!is.null(subset.lvl)){subset_lvl <- subset.lvl}
if(!is.null(subset.genes)){subset_genes <- subset.genes}
if(use.weights){use_weights <- use.weights}
if(run.lm){run_lm <- run.lm}
if(run.lme){run_lme <- run.lme }
if(run.lmerel){run_lmerel <- run.lmerel}
if(run.contrast){run_contrast <- run.contrast}
if(!is.null(contrast.var)){contrast_var <- contrast.var}
if(!is.null(p.method)){p_method <- p.method}
###### Parallel ######
#setup parallel processors
chk <- Sys.getenv("_R_CHECK_LIMIT_CORES_", "")
if (nzchar(chk) && chk == "TRUE") {
#Use 2 in CRAN/Travis/AppVeyor
processors.to.use <- 2
} else if (is.null(processors)){
#Use 2 less than total if not user defined
processors.to.use <- parallel::detectCores()-2
if(processors.to.use == 0){
stop("Error processors: Default resulted in 0. Please correct.")
} else {
#Use user defined number
processors.to.use <- processors
cl <- parallel::makeCluster(processors.to.use)
###### Check common input parameter errors #####
if(!is.null(dat) & !(libraryID %in% colnames(dat$targets))){
stop("libraryID column not found in dat$targets.")}
if(!is.null(meta) & !(libraryID %in% colnames(meta))){
stop("libraryID column not found in meta.")}
if(!is.null(dat) & !(patientID %in% colnames(dat$targets))){
stop("patientID column not found in dat$targets.")}
if(!is.null(meta) & !(patientID %in% colnames(meta))){
stop("patientID column not found in meta.")}
if(is.null(subset_var) & !is.null(subset_lvl)){
stop("Sample subsetting has been selected. Please also provide subset_var")}
if(!is.null(subset_var) & is.null(subset_lvl)){
stop("Sample subsetting has been selected. Please also provide subset_lvl")}
if(run_lmerel & !grepl("\\|", model)){
stop("Kinship models require a random effect in the model as in (1 | patientID)")}
if(run_lme & !grepl("\\|", model)){
stop("LME models require a random effect in the model")}
if(is.null(kin) & run_lmerel){
stop("Kinship matrix is required to run lmerel")}
if(!run_lm & !run_lme & !run_lmerel & !run_contrast){
stop("At least 1 model type must be selected. Please set one run parameter to TRUE.")}
if(!run_lm & !run_lme & !run_lmerel & run_contrast){
stop("Contrast models must be run with an accompanying linear model.")}
if(use_weights & is.null(weights) & is.null(dat$weights)){
stop("When use_weights is TRUE, must provide gene weights is dat object or separate data frame.")
if("gene_weight" %in% c(colnames(meta), colnames(dat$targets))){
stop("Variable gene_weight is present in dat$targets or meta. This name is used for model weights. Please change variable name in your data.")
stop("run.lmekin no longer supported. Please use run_lmerel")
if(grepl("\n", model)){
stop("Model cannot contain hard returns \n. Please correct.")
if(!is.null(weights) | !is.null(dat$weights)){
message("WARNING: To use weights provided in dat$weights or weights, set use_weights = TRUE\n")
if(run_lme | run_lmerel){
if(!grepl(patientID, model)){
stop("patientID value does not match variable used in model.")
###### Formulae #####
#Make formulae. as.formula does not work
if(grepl("\\|", model)){
model.temp <- strsplit(gsub(" ", "", model), split = "\\+\\(1")[[1]][1]
model_lm <- paste("expression", model.temp, sep="")
} else {
model_lm <- paste("expression", model, sep="")
model_lm <- gsub(" ","",model_lm)
model_lme <- paste("expression", gsub(" ", "", model), sep="")
#Model message
if(run_lm){ message(paste("lm model:",model_lm))}
if(run_lme | run_lmerel){ message(paste("lme/lmerel model:",model_lme))}
#If no contrast variable set, use all variables in the model
if(run_contrast & is.null(contrast_var)){
contrast.temp <- strsplit(gsub(" |~", "", model), split = "\\+\\(1")[[1]][1]
#main term
contrast.main <- strsplit(contrast.temp, split = "\\+|\\*")[[1]]
#interaction term
contrast.interact <- strsplit(contrast.temp, split = "\\+")[[1]]
contrast.interact <- contrast.interact[grepl("\\*",contrast.interact)]
#Check for triple interactions
if(any(stringr::str_count(contrast.interact, "\\*") > 1)){
stop("Contrasts of triple interactions are not supported.")}
contrast.interact <- gsub("\\*",":", contrast.interact)
} else {
contrast.interact <- NULL
contrast_var <- c(contrast.main, contrast.interact)
#Set eQTL variable
contrast_var <- gsub("genotype", genotype_name, contrast_var)
#If contrast variables given, force run contrast model
if(!is.null(contrast_var)){run_contrast <- TRUE}
#Check contrast variable is character/factor
meta.temp <- dat$targets
} else {
meta.temp <- meta
for(v in contrast_var){
#Deal with interaction terms. Only 1 need be non-numeric
if(grepl("[*]|:", v)){
v.sep <- strsplit(v, split="[*]|:")[[1]]
v.class1 <- class(meta.temp[,v.sep[1]])
v.class2 <- class(meta.temp[,v.sep[2]])
if(all(c(v.class1,v.class2) %in% c("numeric","integer","double"))){
stop(paste("Contrast variable", v, "is numeric. Please specify only character/factor contrasts."))
} else {
v.class <- class(unlist(meta.temp[,v]))
if(v.class %in% c("numeric","integer","double")){
stop(paste("Contrast variable", v, "is numeric. Please specify only character/factor contrasts."))
###### Data #####
to.model.ls <- kimma_cleaning(dat, kin, patientID, libraryID,
counts, meta, genes, weights,
subset_var, subset_lvl, subset_genes,
model_lm, genotype_name, run_lmerel)
###### Run models ######
#create blank df to hold results
fit.results <- data.frame()
#Loop through each gene
fit.results <- data.table::rbindlist(fill=TRUE,
.packages = c("dplyr","magrittr","stats","broom","lme4",
.export = c("kimma_lm","kimma_lme","kimma_lmerel",
"kmFit_contrast")) %dopar% {
#### Prepare data ####
#Filter data to gene
to.model.gene <- to.model.ls[["to.model"]] %>%
dplyr::filter(rowname == gene) %>%
#### LM model #####
#Run linear model without kinship
results.lm.ls <- NULL
#Wrap model run in error catch to allow loop to continue even if a single model fails
results.lm.ls <- tryCatch({
kimma_lm(model_lm, to.model.gene, gene, use_weights, metrics)
}, error=function(e){
results.lm.ls[["error"]] <- data.frame(model="lm",
#### LME model #####
results.lme.ls <- NULL
#Wrap model run in error catch to allow loop to continue even if a single model fails
results.lme.ls <- tryCatch({
kimma_lme(model_lme, to.model.gene, gene, use_weights, metrics)
}, error=function(e){
results.lme.ls[["error"]] <- data.frame(model="lme",
##### Kinship model ######
results.kin.ls <- NULL
#Wrap model run in error catch to allow loop to continue even if a single model fails
results.kin.ls <- tryCatch({
kimma_lmerel(model_lme, to.model.gene, gene, to.model.ls[["kin.subset"]],
use_weights, patientID, metrics)
}, error=function(e){
results.kin.ls[["error"]] <- data.frame(model="lmerel",
#### Contrasts ####
contrast.lm <- NULL
contrast.lme <- NULL
contrast.kin <- NULL
contrast.results <- NULL
contrast.lm <- tryCatch({
kmFit_contrast(results.lm.ls[["fit"]], contrast_var, to.model.gene,
genotype_name) %>%
}, error=function(e){
contrast.lm.error <- data.frame(model="lm.contrast",
contrast.lme <- tryCatch({
kmFit_contrast(results.lme.ls[["fit"]], contrast_var, to.model.gene,
genotype_name) %>%
}, error=function(e){
contrast.lme.error <- data.frame(model="lme.contrast",
contrast.kin <- tryCatch({
kmFit_contrast(results.kin.ls[["fit"]], contrast_var, to.model.gene,
genotype_name) %>%
}, error=function(e){
contrast.kin.error <- data.frame(model="lmerel.contrast",
#Combine contrast results
contrast.results <- dplyr::bind_rows(contrast.lm, contrast.lme, contrast.kin) %>%
#### Combine results #####
#All models for this gene
#If any estimate are character (seeContrasts), force for merging
results.lm.ls[["results"]]$estimate <- as.character(results.lm.ls[["results"]]$estimate)
results.lme.ls[["results"]]$estimate <- as.character(results.lme.ls[["results"]]$estimate)
results.kin.ls[["results"]]$estimate <- as.character(results.kin.ls[["results"]]$estimate)
contrast.results$estimate <- as.character(contrast.results$estimate)
results.lm.ls[["results"]]$statistic <- as.character(results.lm.ls[["results"]]$statistic)
results.lme.ls[["results"]]$statistic <- as.character(results.lme.ls[["results"]]$statistic)
results.kin.ls[["results"]]$statistic <- as.character(results.kin.ls[["results"]]$statistic)
contrast.results$statistic <- as.character(contrast.results$statistic)
# results.lm.ls[["results"]]$df <- as.character(results.lm.ls[["results"]]$df)
# results.lme.ls[["results"]]$df <- as.character(results.lme.ls[["results"]]$df)
# results.kin.ls[["results"]]$df <- as.character(results.kin.ls[["results"]]$df)
# contrast.results$df <- as.character(contrast.results$df)
fit.results <- results.lm.ls[["results"]] %>%
dplyr::bind_rows(results.lme.ls[["results"]]) %>%
dplyr::bind_rows(results.kin.ls[["results"]]) %>%
dplyr::bind_rows(contrast.results) %>%
dplyr::bind_rows(results.lm.ls[["error"]]) %>%
dplyr::bind_rows(results.lme.ls[["error"]]) %>%
dplyr::bind_rows(results.kin.ls[["error"]]) %>%
dplyr::bind_rows(results.lm.ls[["metrics"]]) %>%
dplyr::bind_rows(results.lme.ls[["metrics"]]) %>%
#### Completion messages ####
all <- length(unique(to.model.ls[["to.model"]]$rowname))
message("Complete: ", all, " genes")
if("message" %in% colnames(fit.results)){
fail <- fit.results %>%
tidyr::drop_na(message) %>%
dplyr::distinct(gene, message) %>% nrow()
message("Failed: ", fail, " genes. See results[['model_error']]")
#move message to separate df
error.results <- fit.results %>%
dplyr::filter(!is.na(message)) %>%
dplyr::distinct(model, gene, message)
fit.results <- fit.results %>%
dplyr::filter(is.na(message)) %>%
} else {
message("Failed: 0 genes")
error.results <- NULL
#### Final formatting ####
kmFit.ls <- list()
if(nrow(fit.results) > 0 ){
# Calculate FDR
kmFit.results <- fit.results %>%
#Within model and variable
dplyr::group_by(model, variable, contrast_ref, contrast_lvl) %>%
dplyr::mutate(FDR=stats::p.adjust(pval, method=p_method)) %>%
dplyr::ungroup() %>%
dplyr::select(model:statistic, contrast_ref, contrast_lvl,
estimate, pval, FDR, dplyr::everything())
kmFit.results <- fit.results %>%
#Within model and variable
dplyr::group_by(model, variable) %>%
dplyr::mutate(FDR=stats::p.adjust(pval, method=p_method)) %>%
dplyr::ungroup() %>%
dplyr::select(model:statistic, estimate, pval, FDR,
# Add gene info if available
# REMOVE because uses too much RAM with large models / contrasts
# if(!is.null(dat$genes)){
# genes <- as.data.frame(dat$genes)
# }
# if(!is.null(genes)){
# #Find matching column
# nameID <- which(apply(genes, 2, function(x)
# any(grepl(kmFit.results$gene[1], x))))
# name <- colnames(genes)[nameID]
# kmFit.results.anno <- kmFit.results %>%
# dplyr::left_join(genes, by=c("gene"=name))
# } else{
# kmFit.results.anno <- kmFit.results
# }
kmFit.results.anno <- kmFit.results
# Split into list
for(result.i in unique(kmFit.results.anno$model)){
result.temp <- dplyr::filter(kmFit.results.anno, model==result.i)
#Turn estimate numeric if needed
estimates <- unique(result.temp$estimate)
estimates <- estimates[!is.na(estimates)]
if(all(estimates != "seeContrasts") & !is.null(estimates)){
result.temp <- dplyr::filter(kmFit.results.anno, model==result.i) %>%
kmFit.ls[[result.i]] <- result.temp %>%
dplyr::select_if(function(x) any(!is.na(x)))
# Split error messages into list object
for(result.i in unique(error.results$model)){
kmFit.ls[[paste(result.i,"error",sep=".")]] <- dplyr::filter(error.results, model==result.i)
#### Save ####
