knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE )
Specify the active configuration by setting R_CONFIG_ACTIVE
.
Sys.setenv(R_CONFIG_ACTIVE = "ihab")
sageseqr
library and login to Synapse. rnaseq_plan()
calls the arguments from the config file and creates the drake
plan. Execute drake::make(plan)
to compute. Run this code from your project root.#library(devtools) #devtools::install_git('https://github.com/kelshmo/sageseqr.git') #devtools::install_github('th1vairam/CovariateAnalysis@dev') library(sageseqr) library(edgeR) library(ggplot2) library(CovariateAnalysis) #get the package from devtools::install_github('th1vairam/CovariateAnalysis@dev') library(data.table) library(plyr) library(tidyverse) library(psych) library(limma) library(edgeR) library(biomaRt) library(RColorBrewer) library(cqn) library(glmnet) library(knitr) library(doParallel) library(foreach) library(githubr) #BiocManager::install("WGCNA") # Login to Synapse. Make a Synapse account and use synapser to login: https://r-docs.synapse.org/articles/manageSynapseCredentials.html synapser::synLogin() Sys.setenv(R_CONFIG_ACTIVE = "ihab") # TO DO: how to setup cache for end user # make_cache # Run the analysis plan <- sageseqr::rnaseq_plan(metadata_id = config::get("metadata")$synID, metadata_version = config::get("metadata")$version, counts_id = config::get("counts")$synID, counts_version = config::get("counts")$version, gene_id_input = config::get("counts")$`gene id`, sample_id_input='individualID', factor_input = config::get("factors"), continuous_input = config::get("continuous"), gene_id = config::get("biomart")$`gene id`, biomart_id = config::get("biomart")$synID, biomart_version = config::get("biomart")$version, filters = config::get("biomart")$filters, host = config::get("biomart")$host, organism = config::get("biomart")$organism, conditions = config::get("conditions") ) import_metadata <- sageseqr::get_data("syn21822937", 1L) import_metadata <- import_metadata[ import_metadata$individualID != 'S155', ] import_metadata$bmi <- import_metadata$weight/(import_metadata$height^2) *703 import_metadata[ as.character(import_metadata$diagnosis) %in% 'Mild Cognitive Impairment', ]$diagnosis <- 'MCI' import_metadata$diagnosis <- as.factor(import_metadata$diagnosis) import_metadata[ as.character(import_metadata$race) %in% 'Asian', ]$race <- 'Other' import_metadata[ as.character(import_metadata$race) %in% 'Black or African American', ]$race <- 'Black_AA' import_metadata$yearsEducation <- gsub(' ', '_', import_metadata$yearsEducation) import_counts <- sageseqr::get_data("syn21969017", 1L) import_counts <- import_counts[ ,colnames(import_counts) != 'S155' ] import_counts[ grep('ENSG00000184895', import_counts$feature),] counts <- tibble::column_to_rownames(import_counts, var = "feature") import_metadata <- import_metadata[,c( "individualID", "sex", "race", "yearsEducation", "diagnosis", "bmi", "mocatotal", "AlignmentSummaryMetrics__PF_READS_ALIGNED", "AlignmentSummaryMetrics__TOTAL_READS", "RnaSeqMetrics__INTERGENIC_BASES", "RnaSeqMetrics__INTRONIC_BASES", "RnaSeqMetrics__PCT_CODING_BASES", "RnaSeqMetrics__PCT_INTERGENIC_BASES", "RnaSeqMetrics__PCT_INTRONIC_BASES")] clean_md <- sageseqr::clean_covariates(md = import_metadata, factors = c("individualID", "sex", "race", "yearsEducation", "diagnosis" ), sample_identifier= "individualID", continuous = c("bmi", "mocatotal", "AlignmentSummaryMetrics__PF_READS_ALIGNED", "AlignmentSummaryMetrics__TOTAL_READS", "RnaSeqMetrics__INTERGENIC_BASES", "RnaSeqMetrics__INTRONIC_BASES", "RnaSeqMetrics__PCT_CODING_BASES", "RnaSeqMetrics__PCT_INTERGENIC_BASES", "RnaSeqMetrics__PCT_INTRONIC_BASES" )) biomart_results <- sageseqr::get_biomart(count_df = counts, gene_id = "ensembl_gene_id", synid = "syn21983044", version = 1L, filters = "ensembl_gene_id", host = "ensembl.org", organism = "hsa") filtered_counts <- sageseqr::filter_genes( clean_metadata = clean_md, count_df = counts, conditions = c("diagnosis", "sex"), cpm_threshold=1, conditions_threshold=.5 ) cqn_counts <- sageseqr::cqn(filtered_counts, biomart_results)
PCA based clustering of samples
# Find principal components of expression to plot PC <- prcomp(cqn_counts$E.no.na, scale.=T, center = T) # Plot first 2 PCs plotdata <- data.frame(individualID=rownames(PC$rotation), PC1=PC$rotation[,1], PC2=PC$rotation[,2]) plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'individualID')) p <- ggplot(plotdata, aes(x=PC1, y=PC2)) p <- p + geom_point(aes(color=diagnosis, size=mocatotal)) p <- p + theme_bw() + theme(legend.position="right") p
Tree based clustering of samples
# Eucledian tree based analysis COVARIATES.tmp = data.matrix(METADATA[,c( "sex", "diagnosis", 'race', 'yearsEducation')]) COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0 tree = hclust(as.dist(t(cqn_counts$E.no.na))) cols = WGCNA::labels2colors(COVARIATES.tmp); WGCNA::plotDendroAndColors(tree, colors = cols, dendroLabels = FALSE, abHeight = 0.80, main = "Sample dendrogram", groupLabels = colnames(COVARIATES.tmp))
# Plot abberent distribution of logcpm counts tmp1 = cqn_counts$E %>% rownameToFirstColumn('Gene.ID') %>% tidyr::gather(individualID, logCPM, -Gene.ID) %>% left_join(METADATA %>% rownameToFirstColumn('individualID')) p = ggplot(tmp1, aes(x = logCPM, color = individualID)) + geom_density() p = p + theme(legend.position = 'NONE') + facet_grid(.~diagnosis, scale = 'free') p
Coexpression of genes
cr = cor(t(cqn_counts$E.no.na)) hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')
Correlation between pca of unadjusted mRNA expression and covariates are used to find significant covariates
# Find correlation between PC's of gene expression with covariates cqn_counts$E.no.na = cqn_counts$E cqn_counts$E.no.na[is.na(cqn_counts$E.no.na)] = 0 #clean_md METADATA <- as.data.frame( clean_md) row.names(METADATA) <- import_metadata$individualID METADATA <- METADATA[, colnames(METADATA) != 'individualID' ] Meta_HeatMap <- METADATA preAdjustedSigCovars = runPCAandPlotCorrelations(cqn_counts$E.no.na, Meta_HeatMap, 'NULL design(voom-normalized)', isKeyPlot=TRUE, MIN_PVE_PCT_PC = 1) preAdjustedSigCovars[["PC_res"]][[2]]$plotData
parallelDuplicateCorrelation.lmer <- function (object, design = NULL, ndups = 2, spacing = 1, block = NULL, trim = 0.15, weights = NULL) { y <- getEAWP(object) M <- y$exprs ngenes <- nrow(M) narrays <- ncol(M) if (is.null(design)) design <- y$design if (is.null(design)) design <- matrix(1, ncol(y$exprs), 1) else { design <- as.matrix(design) if (mode(design) != "numeric") stop("design must be a numeric matrix") } if (nrow(design) != narrays) stop("Number of rows of design matrix does not match number of arrays") ne <- nonEstimable(design) if (!is.null(ne)) cat("Coefficients not estimable:", paste(ne, collapse = " "), "\n") nbeta <- ncol(design) if (missing(ndups) && !is.null(y$printer$ndups)) ndups <- y$printer$ndups if (missing(spacing) && !is.null(y$printer$spacing)) spacing <- y$printer$spacing if (missing(weights) && !is.null(y$weights)) weights <- y$weights if (!is.null(weights)) { weights <- as.matrix(weights) if (any(dim(weights) != dim(M))) weights <- array(weights, dim(M)) M[weights < 1e-15] <- NA weights[weights < 1e-15] <- NA } if (is.null(block)) { if (ndups < 2) { warning("No duplicates: correlation between duplicates not estimable") return(list(cor = NA, cor.genes = rep(NA, nrow(M)))) } if (is.character(spacing)) { if (spacing == "columns") spacing <- 1 if (spacing == "rows") spacing <- object$printer$nspot.c if (spacing == "topbottom") spacing <- nrow(M)/2 } Array <- rep(1:narrays, rep(ndups, narrays)) } else { ndups <- 1 nspacing <- 1 Array <- block } if (is.null(block)) { M <- unwrapdups(M, ndups = ndups, spacing = spacing) ngenes <- nrow(M) if (!is.null(weights)) weights <- unwrapdups(weights, ndups = ndups, spacing = spacing) design <- design %x% rep(1, ndups) } rho <- rep(NA, ngenes) nafun <- function(e) NA rho = foreach(i = 1:ngenes, .combine = rbind, .packages = c("lme4"), .export = c("M", "Array", "nbeta", "design", "weights", "nafun"), .verbose = F) %dopar% { y <- drop(M[i, ]) o <- is.finite(y) A <- factor(Array[o]) nobs <- sum(o) nblocks <- length(levels(A)) if (nobs > (nbeta + 2) && nblocks > 1 && nblocks < nobs - 1) { data = cbind(data.frame(y = y[o]), design[o, , drop = FALSE], data.frame(A = A)) formula1 = paste0('y~',paste(setdiff(colnames(data), c('y','A')), collapse = '+'), '+(1|A)') if (!is.null(weights)) { w <- drop(weights[i, ])[o] s <- tryCatch({ mdl = lme4::lmer(formula1, data = data, weights = w); as.data.frame(VarCorr(mdl)) }, error = nafun) } else { s <- tryCatch({ mdl = lme4::lmer(formula1, data = data); as.data.frame(VarCorr(mdl)) }, error = nafun) } if (!is.na(s[1])) rho <- s$vcov[1]/sum(s$vcov) else rho <- NA } } arho <- atanh(pmax(-1, rho)) mrho <- tanh(mean(arho, trim = trim, na.rm = TRUE)) list(consensus.correlation = mrho, cor = mrho, atanh.correlations = arho) } parallelDuplicateCorrelation.mm2 <- function (object, design = NULL, ndups = 2, spacing = 1, block = NULL, trim = 0.15, weights = NULL) { y <- getEAWP(object) M <- y$exprs ngenes <- nrow(M) narrays <- ncol(M) if (is.null(design)) design <- y$design if (is.null(design)) design <- matrix(1, ncol(y$exprs), 1) else { design <- as.matrix(design) if (mode(design) != "numeric") stop("design must be a numeric matrix") } if (nrow(design) != narrays) stop("Number of rows of design matrix does not match number of arrays") ne <- nonEstimable(design) if (!is.null(ne)) cat("Coefficients not estimable:", paste(ne, collapse = " "), "\n") nbeta <- ncol(design) if (missing(ndups) && !is.null(y$printer$ndups)) ndups <- y$printer$ndups if (missing(spacing) && !is.null(y$printer$spacing)) spacing <- y$printer$spacing if (missing(weights) && !is.null(y$weights)) weights <- y$weights if (!is.null(weights)) { weights <- as.matrix(weights) if (any(dim(weights) != dim(M))) weights <- array(weights, dim(M)) M[weights < 1e-15] <- NA weights[weights < 1e-15] <- NA } if (is.null(block)) { if (ndups < 2) { warning("No duplicates: correlation between duplicates not estimable") return(list(cor = NA, cor.genes = rep(NA, nrow(M)))) } if (is.character(spacing)) { if (spacing == "columns") spacing <- 1 if (spacing == "rows") spacing <- object$printer$nspot.c if (spacing == "topbottom") spacing <- nrow(M)/2 } Array <- rep(1:narrays, rep(ndups, narrays)) } else { ndups <- 1 nspacing <- 1 Array <- block } if (is.null(block)) { M <- unwrapdups(M, ndups = ndups, spacing = spacing) ngenes <- nrow(M) if (!is.null(weights)) weights <- unwrapdups(weights, ndups = ndups, spacing = spacing) design <- design %x% rep(1, ndups) } rho <- rep(NA, ngenes) nafun <- function(e) NA rho = foreach(i = 1:ngenes, .combine = rbind, .packages = c("statmod"), .export = c("M", "Array", "nbeta", "design", "weights", "nafun"), .verbose = F) %dopar% { y <- drop(M[i, ]) o <- is.finite(y) A <- factor(Array[o]) nobs <- sum(o) nblocks <- length(levels(A)) if (nobs > (nbeta + 2) && nblocks > 1 && nblocks < nobs - 1) { y <- y[o] X <- design[o, , drop = FALSE] Z <- model.matrix(~0 + A) if (!is.null(weights)) { w <- drop(weights[i, ])[o] s <- tryCatch(mixedModel2Fit(y, X, Z, w, only.varcomp = TRUE, maxit = 20)$varcomp, error = nafun) } else { s <- tryCatch(mixedModel2Fit(y, X, Z, only.varcomp = TRUE, maxit = 20)$varcomp, error = nafun) } if (!is.na(s[1])) rho <- s[2]/sum(s) else rho <- NA } } arho <- atanh(pmax(-1, rho)) mrho <- tanh(mean(arho, trim = trim, na.rm = TRUE)) list(consensus.correlation = mrho, cor = mrho, atanh.correlations = arho) } parallelDuplicateCorrelation <- function (object, design = NULL, ndups = 2, spacing = 1, block = NULL, trim = 0.15, weights = NULL, method = 'lmer'){ if (method == 'lmer'){ parallelDuplicateCorrelation.lmer(object, design, ndups, spacing, block, trim, weights) } else if (method == 'mm2'){ parallelDuplicateCorrelation.mm2(object, design, ndups, spacing, block, trim, weights) } else { error('Unknown Method') } }
Since many covariates are correlated, re-normalising and re-adjusting COUNTS with an iterative design matrix 1. Adding Batch and Sex a priori to variable selection 2. Primary variable of interest Diagnosis is excluded from the pool of available covariates for selection
# Primary variable of interest primaryVariable <- c( "mocatotal", "diagnosis") FactorCovariates <- c( "sex", "race", "yearsEducation") ContCovariates <- c("bmi", "mocatotal", 'AlignmentSummaryMetrics__PF_READS_ALIGNED', 'AlignmentSummaryMetrics__TOTAL_READS', 'RnaSeqMetrics__INTERGENIC_BASES', 'RnaSeqMetrics__INTRONIC_BASES', 'RnaSeqMetrics__PCT_CODING_BASES', 'RnaSeqMetrics__PCT_INTERGENIC_BASES') #postAdjustCovars = c( 'sex', 'race', 'yearsEducation', 'bmi' ); postAdjustCovars = 'bmi'; # Assign residual covariates residualCovars = setdiff(preAdjustedSigCovars$significantCovars, c(postAdjustCovars, primaryVariable)) residualSigCovars = preAdjustedSigCovars covariatesEffects = preAdjustedSigCovars$Effects.significantCovars[residualCovars] postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects))) %>% unique() loopCount = 0 NEW.COUNTS <- cqn_counts$E while(length(residualSigCovars$significantCovars)!=0 && loopCount <= 20){ writeLines(paste('Using following covariates in the model:', paste(postAdjustCovars, collapse=', '), 'as fixed effects')) # Post adjusted design matrix DM1 = getDesignMatrix(METADATA[,postAdjustCovars,drop=F],Intercept = F) DM1$design = DM1$design[,linColumnFinder(DM1$design)$indepCols] # Estimate voom weights for dispersion control cnts = NEW.COUNTS cnts[is.na(cnts)] = 0 VOOM.GENE_EXPRESSION = voom(cnts, design=DM1$design, plot=F) #na.rm = T) # Fit linear model using new weights and new design VOOM.ADJUSTED.FIT = lmFit(cqn_counts$E, design = DM1$design, weights = VOOM.GENE_EXPRESSION$weights) # Residuals after normalisation RESIDUAL.GENE_EXPRESSION = residuals.MArrayLM(VOOM.ADJUSTED.FIT, cqn_counts$E) # Residual covariates to choose from residCovars <- setdiff(c(FactorCovariates,ContCovariates), c(postAdjustCovars, primaryVariable)) # Find PC of residual gene expression and significant covariates that are highly correlated with PCs expr = RESIDUAL.GENE_EXPRESSION expr[is.na(expr)] = 0 residualSigCovars = runPCAandPlotCorrelations(expr, METADATA[, residCovars, drop=F], 'adjusted design(voom-normalized)', isKeyPlot=TRUE) # Add postadjusted covariates (if any) residCovars = setdiff(residualSigCovars$significantCovars, c(postAdjustCovars, primaryVariable)) covariatesEffects = residualSigCovars$Effects.significantCovars[residCovars] postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects))) loopCount = loopCount + 1 } modelStr <- paste(paste(gsub('_','\\\\_',postAdjustCovars), collapse=', '), 'as fixed effects') tmp <- paste('Using following covariates in the final model:', modelStr)
# Find PC of residual gene expression and significant covariates that are highly correlated with PCs residualSigCovars = runPCAandPlotCorrelations(expr, METADATA, 'adjusted design(voom-normalized)', isKeyPlot=TRUE) residualSigCovars[["PC_res"]][[2]]$plotData
Coexpression of genes
cr = cor(t(expr)) hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')
PCA of residual data
# Find principal components of expression to plot PC <- prcomp(expr, scale.=T, center = T) # Plot first 4 PCs plotdata <- data.frame(individualID=rownames(PC$rotation), PC1=PC$rotation[,1], PC2=PC$rotation[,2]) plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'individualID')) p <- ggplot(plotdata, aes(x=PC1, y=PC2)) p <- p + geom_point(aes(color=diagnosis, size=mocatotal)) p <- p + theme_bw() + theme(legend.position="right") p
Tree based clustering of residual data
# Eucledian tree based analysis COVARIATES.tmp = data.matrix(METADATA[,c( 'sex', 'yearsEducation', 'race', primaryVariable)]) COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0 tree = hclust(as.dist(t(expr))) cols = WGCNA::labels2colors(COVARIATES.tmp); WGCNA::plotDendroAndColors(tree, colors = cols, dendroLabels = FALSE, abHeight = 0.80, main = "Sample dendrogram", groupLabels = colnames(COVARIATES.tmp))
Identified covariates are regressed out from the expression matrix for network analysis
# Get design matrix DESIGN.NET = getDesignMatrix(METADATA[, postAdjustCovars, drop = F], Intercept = F) DESIGN.NET = DESIGN.NET$design[,linColumnFinder(DESIGN.NET$design)$indepCols] # Estimate voom weights for dispersion control cnts = NEW.COUNTS cnts[is.na(cnts)] = 0 VOOM.NET.WEIGHTS = voom(cnts, design=DESIGN.NET, plot=F) # Fit linear model using new weights and new design VOOM.NET.FIT = lmFit(cqn_counts$E, design = DESIGN.NET, weights = VOOM.NET.WEIGHTS$weights) # Residuals after normalisation RESIDUAL.NET.GENE_EXPRESSION = residuals.MArrayLM(VOOM.NET.FIT, cqn_counts$E)
Differential expression is performed on the primary variable by controlling for covariates identified above
Interpretation of Diagnosis 1. MCI: Mildly Impaired 2. CONTROL: cognitivly non-impaired
Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are
# Get design matrix DESIGN = getDesignMatrix(METADATA[, c(primaryVariable[2], postAdjustCovars), drop = F], Intercept = F) DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] # Estimate voom weights for dispersion control VOOM.WEIGHTS = voom(cqn_counts$counts, design=DESIGN$design, plot=F) #_#VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F) # Fit linear model using new weights and new design FIT = lmFit(cqn_counts$E, design = DESIGN$design, weights = VOOM.WEIGHTS$weights) # Fit contrast contrast = makeContrasts(contrasts=c("diagnosisMCI-diagnosisControl"), levels = colnames(FIT$coefficients)) FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = c('MCI-CONTROL') DE = DE %>% rbindlist(idcol = 'Comparison') %>% left_join(biomart_results %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>% unique()) %>% dplyr::mutate(Study = 'Emory_Vascular', Direction = logFC/abs(logFC), Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')), Direction = as.character(Direction)) DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE' writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2') tmp = DE %>% dplyr::select(ensembl_gene_id, Comparison, Direction) %>% group_by(Comparison, Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) p = DE %>% dplyr::filter(Comparison == 'MCI-CONTROL') %>% ggplot(aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1)) p = p + scale_color_manual(values = c('green','grey','red')) p = p + facet_grid(.~Comparison, scales = 'fixed') p all.diff.exp = c(all.diff.exp, list(Diagnosis = DE)) all.fit = c(all.fit, list(Diagnosis = FIT))
Differential expression is performed on the primary variable by controlling for covariates identified above
Interpretation of Diagnosis 1. MCI: Mildly Impaired 2. CONTROL: cognitivly non-impaired
Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are
# Get design matrix DESIGN = getDesignMatrix(METADATA[, c('sex', postAdjustCovars), drop = F], Intercept = F) DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] # Estimate voom weights for dispersion control VOOM.WEIGHTS = voom( cqn_counts$counts, design=DESIGN$design, plot=F ) #VOOM.WEIGHTS = voom( cqn_counts$E, design=DESIGN$design, plot=F ) # Fit linear model using new weights and new design #LOG_Exp <- log2(cqn_counts$E) #LOG_Exp[is.na(LOG_Exp)] FIT = lmFit(cqn_counts$E, design = DESIGN$design, weights = VOOM.WEIGHTS$weights) # Fit contrast contrast = makeContrasts(contrasts=c("sexfemale"), levels = colnames(FIT$coefficients)) FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = c('Sex') biomart_results$ensembl_gene_id <- row.names(biomart_results) DE = DE %>% rbindlist(idcol = 'Comparison') %>% left_join(biomart_results %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>% unique()) %>% dplyr::mutate(Study = 'Emory_Vascular', Direction = logFC/abs(logFC), Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')), Direction = as.character(Direction)) DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE' writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2') tmp = DE %>% dplyr::select(ensembl_gene_id, Comparison, Direction) %>% group_by(Comparison, Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) p = DE %>% dplyr::filter(Comparison == 'sex') %>% ggplot(aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1)) p = p + scale_color_manual(values = c('green','grey','red')) p = p + facet_grid(.~Comparison, scales = 'fixed') p all.diff.exp = c(all.diff.exp, list(Diagnosis = DE)) all.fit = c(all.fit, list(Diagnosis = FIT))
Differential expression is performed on the Diagnosis x Sex variable by controlling for covariates identified above
Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are
## Modeling Diagnosis, msex and age_death conjointly METADATA$Diagnosis.Sex = paste(METADATA$diagnosis,METADATA$sex, sep = '.') %>% factor # Get design matrix DESIGN = getDesignMatrix(METADATA[, c('Diagnosis.Sex', setdiff(postAdjustCovars, 'Sex')), drop = F], Intercept = F) DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] colnames(DESIGN$design) = gsub('Diagnosis.Sex','',colnames(DESIGN$design)) # Estimate voom weights for dispersion control VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F) # Fit linear model using new weights and new design FIT = lmFit(CQN.GENE_EXPRESSION$E, design = DESIGN$design, weights = VOOM.WEIGHTS$weights) # Fit contrast contrast = makeContrasts(contrasts=c("(MCI.female+MCI.male)/2-(Control.female+Control.male)/2", "(MCI.female+Control.female)/2-(MCI.male+Control.male)/2", "(MCI.female-Control.female)/2-(MCI.male-Control.male)/2", "MCI.female-Control.female", "MCI.male-Control.male"), levels = colnames(FIT$coefficients)) FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(NEW.COUNTS)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = c('MCI-CONTROL','FEMALE-MALE','MCI-CONTROL.IN.FEMALE-MALE','MCI-CONTROL.IN.FEMALE', 'MCI-CONTROL.IN.MALE') DE = DE %>% rbindlist(idcol = 'Comparison') %>% left_join(GENE.PARAM %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gc_content, gene.length) %>% unique()) %>% dplyr::mutate(Study = 'ROSMAP', Region = 'DLPFC', Direction = logFC/abs(logFC), Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')), Direction = as.character(Direction)) DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE' writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2') tmp = DE %>% dplyr::select(ensembl_gene_id, Comparison, Direction) %>% group_by(Comparison, Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1)) p = p + scale_color_manual(values = c('green','grey','red')) p = p + facet_grid(.~Comparison, scales = 'fixed') p all.diff.exp = c(all.diff.exp, list(Diagnosis.Sex = DE)) all.fit = c(all.fit, list(Diagnosis.Sex = FIT))
# Get design matrix DESIGN = getDesignMatrix(METADATA[, c('diagnosis', primaryVariable[2], postAdjustCovars), drop = F], Intercept = F) DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] # Estimate voom weights for dispersion control VOOM.WEIGHTS = voom(CQN.GENE_EXPRESSION$E, design=DESIGN$design, plot=F) # Fit linear model using new weights and new design FIT = lmFit(CQN.GENE_EXPRESSION$E, design = DESIGN$design, weights = VOOM.WEIGHTS$weights) # Fit contrast contrast = makeContrasts(contrasts=c("sexfemale-sexmale"), levels = colnames(FIT$coefficients)) FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = c('MCI-CONTROL') DE = DE %>% rbindlist(idcol = 'Comparison') %>% left_join(biomart_results %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>% unique()) %>% dplyr::mutate(Study = 'Emory_Vascular', Direction = logFC/abs(logFC), Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')), Direction = as.character(Direction)) DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE' writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2') tmp = DE %>% dplyr::select(ensembl_gene_id, Comparison, Direction) %>% group_by(Comparison, Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) p = DE %>% dplyr::filter(Comparison == 'MCI-CONTROL') %>% ggplot(aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1)) p = p + scale_color_manual(values = c('green','grey','red')) p = p + facet_grid(.~Comparison, scales = 'fixed') p all.diff.exp = c(all.diff.exp, list(Diagnosis = DE)) all.fit = c(all.fit, list(Diagnosis = FIT))
Since many covariates are correlated, re-normalising and re-adjusting COUNTS with an iterative design matrix 1. Adding Sex, bmi, race, and years education a priori to variable selection 2. Primary variable of interest Diagnosis is excluded from the pool of available covariates for selection
library(variancePartition) #NEW.COUNTS <- cqn_counts$E.no.na NEW.COUNTS <- cqn_counts$counts METADATA$Individual_ID <- row.names(METADATA) # Primary variable of interest primaryVariable <- c("mocatotal", "diagnosis") #Leave out bmi because of missing weight variable ## postAdjustCovars = c( 'sex', 'race', 'yearsEducation' ); postAdjustCovars = c( 'race', 'yearsEducation', 'bmi'); # Assign residual covariates residualCovars = setdiff(preAdjustedSigCovars$significantCovars, c(postAdjustCovars, primaryVariable)) residualSigCovars = preAdjustedSigCovars covariatesEffects = preAdjustedSigCovars$Effects.significantCovars[residualCovars] postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects))) %>% unique() #_#Set the cov list total_cov <- c( preAdjustedSigCovars$significantCovars, postAdjustCovars, primaryVariable ) FactorCovariates <- total_cov[ total_cov %in% config::get('factors') ] ContCovariates <- total_cov[ total_cov %in% config::get('continuous')[ -grep( 'PCT_RIBOSOMAL', config::get('continuous'))] ] loopCount = 0 notEstimable<-NULL while(length(residualSigCovars$significantCovars)!=0 && loopCount <= 20){ writeLines(paste('Using following covariates in the model:', paste(postAdjustCovars, collapse=', '), 'as fixed effects')) DM1 = getDesignMatrix(METADATA[,postAdjustCovars,drop=F],Intercept = F) DM1$design = DM1$design[,linColumnFinder(DM1$design)$indepCols] VOOM.GENE_EXPRESSION = voom(NEW.COUNTS, design=DM1$design, plot=F) # Fit linear model using new weights and new design ##Replace CQN.GENE_EXPRESSION$E w/ new pipeline cqn VOOM.ADJUSTED.FIT = lmFit(cqn_counts$E, design = DM1$design, weights = VOOM.GENE_EXPRESSION$weights) # Residuals after normalisation RESIDUAL.GENE_EXPRESSION = residuals.MArrayLM(VOOM.ADJUSTED.FIT, cqn_counts$E) # Residual covariates to choose from residCovars <- setdiff(c(FactorCovariates,ContCovariates), c(postAdjustCovars, primaryVariable)) # Find PC of residual gene expression and significant covariates that are highly correlated with PCs expr = RESIDUAL.GENE_EXPRESSION expr[is.na(expr)] = 0 residualSigCovars = runPCAandPlotCorrelations(expr, METADATA[, residCovars, drop=F], 'adjusted design(voom-normalized)', isKeyPlot=TRUE) # Add postadjusted covariates (if any) residCovars = setdiff(residualSigCovars$significantCovars, c(postAdjustCovars, primaryVariable)) covariatesEffects = residualSigCovars$Effects.significantCovars[residCovars] postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects))) loopCount = loopCount + 1 } modelStr <- paste(paste(gsub('_','\\\\_',postAdjustCovars), collapse=', '), 'as fixed effects') tmp <- paste('Using following covariates in the final model:', modelStr)
r tmp
# Find PC of residual gene expression and significant covariates that are highly correlated with PCs residualSigCovars = runPCAandPlotCorrelations(expr, Meta_HeatMap, 'adjusted design(voom-normalized)', isKeyPlot=TRUE) residualSigCovars[["PC_res"]][[2]]$plotData
Coexpression of genes
cr = cor(t(expr)) hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')
PCA of residual data
# Find principal components of expression to plot PC <- prcomp(expr, scale.=T, center = T) # Plot first 4 PCs METADATA <- METADATA[ , c('Individual_ID', colnames(METADATA)[ (colnames(METADATA) %in% 'Individual_ID')==F ]) ] plotdata <- data.frame(Individual_ID=rownames(PC$rotation), PC1=PC$rotation[,1], PC2=PC$rotation[,2]) plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'Induvidual_ID')) p <- ggplot(plotdata, aes(x=PC1, y=PC2)) p <- p + geom_point(aes(color=diagnosis, size=mocatotal)) #p <- p + geom_point() p <- p + theme_bw() + theme(legend.position="right") p
Tree based clustering of residual data
# Eucledian tree based analysis COVARIATES.tmp = data.matrix(METADATA[,c( 'sex', 'race', 'yearsEducation', primaryVariable) ]) COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0 tree = hclust(as.dist(t(expr))) cols = WGCNA::labels2colors(COVARIATES.tmp); WGCNA::plotDendroAndColors(tree, colors = cols, dendroLabels = FALSE, abHeight = 0.80, main = "Sample dendrogram", groupLabels = colnames(COVARIATES.tmp))
Identified covariates are regressed out from the expression matrix for network analysis
# Get design matrix DESIGN.NET = getDesignMatrix(METADATA[, postAdjustCovars, drop = F], Intercept = F) DESIGN.NET = DESIGN.NET$design[,linColumnFinder(DESIGN.NET$design)$indepCols] # Estimate voom weights for dispersion control cnts = NEW.COUNTS cnts[is.na(cnts)] = 0 VOOM.NET.WEIGHTS = voom(cnts, design=DESIGN.NET, plot=F) # Fit linear model using new weights and new design VOOM.NET.FIT = lmFit(cqn_counts$E, design = DESIGN.NET, weights = VOOM.NET.WEIGHTS$weights) # Residuals after normalisation RESIDUAL.NET.GENE_EXPRESSION = residuals.MArrayLM(VOOM.NET.FIT, cqn_counts$E)
Differential expression is performed on the primary variable by controlling for covariates identified above
Interpretation of Diagnosis 1. Mild Cognitive Impairment: Samples from individuals defined as Mildly cognitivly impared 2. CONTROL: Samples from individuals defined as not cognitivly impared
Genes that are differentially expressed at an FDR <= 0.05 are
# Get design matrix DESIGN = getDesignMatrix(METADATA[, c(primaryVariable[2], postAdjustCovars), drop = F], Intercept = F) DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] # Re-estimate voom weights VOOM.WEIGHTS = voom(cqn_counts$E, design=DESIGN$design, plot=F) # Fit linear model using new weights and new design VOOM.WEIGHTS$E = cqn_counts$E FIT = lmFit(VOOM.WEIGHTS) # Fit contrast cntr = c() cntr <- 'diagnosisControl-diagnosisMCI' contrast = makeContrasts(contrasts=cntr, levels = colnames(FIT$coefficients)) FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(cqn_counts$E)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = colnames(contrast) biomart_results$ensembl_gene_id <- row.names(biomart_results) DE = DE %>% data.table::rbindlist(idcol = 'Comparison') %>% dplyr::mutate(Comparison = gsub('Diagnosis','',Comparison), Study = 'Emory_Vascular', Sex = 'ALL') %>% tidyr::separate(Comparison, c('from.state','to.state'), sep = '-') %>% tidyr::unite(Comparison, from.state, to.state, sep = '-') %>% dplyr::left_join(biomart_results %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>% unique()) DE$Direction = 'NONE' DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC <= -log2(1.2)] = 'DOWN' DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC >= log2(1.2)] = 'UP' tmp = DE %>% dplyr::select(ensembl_gene_id, Direction) %>% group_by(Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1)) p = p + scale_color_manual(values = c('green','grey','red')) + theme(legend.position='top') #p = p + facet_grid(Tissue+.~Comparison, scales = 'fixed') p DE$Comparison <- 'MCI-CONTROL' all.diff.exp <- list() all.fit <- list() all.diff.exp = c(all.diff.exp, list(Diagnosis = DE)) all.fit = c(all.fit, list(Diagnosis = FIT))
pl = list() pl[[1]] = ggplot(DE %>% dplyr::filter(Comparison == 'MCI-CONTROL'), aes(x = log10(gene_length), y = logFC, color = Direction)) + geom_point() pl[[1]] = pl[[1]] + geom_smooth(method = 'loess', inherit.aes = FALSE, aes(x = log10(gene_length), y = logFC)) #pl[[1]] = pl[[1]] + facet_grid(.+Comparison) + scale_color_manual(values = c('green','grey','red')) pl[[1]] = pl[[1]] + theme(legend.position = 'top') pl[[2]] = ggplot(DE %>% dplyr::filter(Comparison == 'MCI-CONTROL'), aes(x = percentage_gene_gc_content, y = logFC, color = Direction)) + geom_point() pl[[2]] = pl[[2]] + geom_smooth(method = 'loess', inherit.aes = FALSE, aes(x = percentage_gene_gc_content, y = logFC)) #pl[[2]] = pl[[2]] + facet_grid(Tissue~.+Comparison) + scale_color_manual(values = c('green','grey','red')) pl[[2]] = pl[[2]] + theme(legend.position = 'top') pl[[3]] = ggplot(DE %>% dplyr::filter(Comparison == 'MCI-CONTROL'), aes(x = AveExpr, y = logFC, color = Direction)) + geom_point() pl[[3]] = pl[[3]] + geom_smooth(method = 'loess', inherit.aes = FALSE, aes(x = AveExpr, y = logFC)) #pl[[3]] = pl[[3]] + facet_grid(Tissue~.+Comparison) + scale_color_manual(values = c('green','grey','red')) pl[[3]] = pl[[3]] + theme(legend.position = 'top') multiplot(plotlist = pl, cols = 3)
Differential expression is performed on the Diagnosis x Sex variable by controlling for covariates identified above
Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are
## Modeling Diagnosis, msex and age_death conjointly METADATA$Diagnosis.Sex = paste(METADATA$diagnosis,METADATA$sex, sep = '.') %>% factor # Get design matrix DESIGN = getDesignMatrix(METADATA[, c('Diagnosis.Sex', setdiff(postAdjustCovars, 'sex')), drop = F], Intercept = F) DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] colnames(DESIGN$design) = gsub('Diagnosis.Sex','',colnames(DESIGN$design)) # Estimate voom weights for dispersion control VOOM.WEIGHTS = voom(cqn_counts$E, design=DESIGN$design, plot=F) # Fit linear model using new weights and new design FIT = lmFit(cqn_counts$E, design = DESIGN$design, weights = VOOM.WEIGHTS$weights) # Fit contrast contrast = makeContrasts(contrasts=c("(MCI.female+MCI.male)/2-(Control.female+Control.male)/2", "(MCI.female+Control.female)/2-(MCI.male+Control.male)/2", "(MCI.female-Control.female)/2-(MCI.male-Control.male)/2", "MCI.female-Control.female", "MCI.male-Control.male"), levels = colnames(FIT$coefficients)) FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(cqn_counts$E)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = c('MCI-CONTROL','FEMALE-MALE','MCI-CONTROL.IN.FEMALE-MALE','MCI-CONTROL.IN.FEMALE', 'MCI-CONTROL.IN.MALE') DE = DE %>% rbindlist(idcol = 'Comparison') %>% left_join(biomart_results %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>% unique()) %>% dplyr::mutate(Study = 'Emory_Vascular', Direction = logFC/abs(logFC), Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')), Direction = as.character(Direction)) DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE' writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2') tmp = DE %>% dplyr::select(ensembl_gene_id, Comparison, Direction) %>% group_by(Comparison, Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1)) p = p + scale_color_manual(values = c('green','grey','red')) p = p + facet_grid(.~Comparison, scales = 'fixed') p all.diff.exp = c(all.diff.exp, list(Diagnosis.Sex = DE)) all.fit = c(all.fit, list(Diagnosis.Sex = FIT))
Differential expression is performed on the Diagnosis x MOCHATotal variable by controlling for covariates identified above
Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are
# Get design matrix DESIGN = getDesignMatrix(METADATA[, c(primaryVariable[2], setdiff(postAdjustCovars, 'mocatotal')), drop = F], Intercept = F) ind = grep('diagnosis', colnames(DESIGN$design)) DESIGN$design[,ind] = DESIGN$design[,ind] * METADATA[rownames(DESIGN$design),'mocatotal'] DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] colnames(DESIGN$design)[ind] = gsub('diagnosis', '', paste0(colnames(DESIGN$design)[ind], 'mocatotal')) # Re-estimate voom weights VOOM.WEIGHTS = voom( cqn_counts$E, design=DESIGN$design, plot=F)#, #block = METADATA$Individual_ID, #correlation = correlation$cor) # Fit linear model using new weights and new design VOOM.WEIGHTS$E = cqn_counts$E FIT = lmFit(VOOM.WEIGHTS) # Fit contrast cntr = c( "MCImocatotal-Controlmocatotal" )#, #"IFG.AD.AOD-IFG.CONTROL.AOD", #"PHG.AD.AOD-PHG.CONTROL.AOD", #"STG.AD.AOD-STG.CONTROL.AOD") contrast = makeContrasts(contrasts=cntr, levels = colnames(FIT$coefficients)) colnames(contrast) = cntr FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(cqn_counts$counts)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = colnames(contrast) DE = DE %>% data.table::rbindlist(idcol = 'Comparison') %>% dplyr::mutate( Sex = 'ALL') %>% tidyr::separate(Comparison, c('from.state','to.state'), sep = '-') %>% #tidyr::separate(from.state, c('reference','Sex'), sep = '\\.') %>% #tidyr::separate(to.state, c('Tissue1','against','Sex1'), sep = '\\.') %>% tidyr::unite(Comparison, from.state, to.state, sep = '-') %>% #dplyr::select(-Tissue1, -Sex1, -Sex) %>% dplyr::left_join(biomart_results %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>% unique()) DE$Direction = 'NONE' DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC <= -log2(1.2)] = 'DOWN' DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC >= log2(1.2)] = 'UP' tmp = DE %>% dplyr::select(ensembl_gene_id, Comparison, Direction) %>% group_by(Comparison, Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) all.diff.exp = c(all.diff.exp, list(Diagnosis.mocatotal = DE)) all.fit = c(all.fit, list(Diagnosis.AOD = FIT))
Differential expression is performed on Sex variable by controlling for covariates identified above
Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are
## Modeling Diagnosis, msex and age_death conjointly #METADATA$Diagnosis.Sex = paste(METADATA$diagnosis,METADATA$sex, sep = '.') %>% # factor # Get design matrix DESIGN = getDesignMatrix(METADATA[, c( 'sex',setdiff(postAdjustCovars, 'sex')), drop = F], Intercept = F) DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols] colnames(DESIGN$design) = gsub('sex','',colnames(DESIGN$design)) # Estimate voom weights for dispersion control VOOM.WEIGHTS = voom(cqn_counts$E.no.na, design=DESIGN$design, plot=F) # Fit linear model using new weights and new design FIT = lmFit(cqn_counts$E.no.na, design = DESIGN$design, weights = VOOM.WEIGHTS$weights) # Fit contrast contrast = makeContrasts(contrasts=c("male-female"), levels = colnames(FIT$coefficients)) FIT.CONTR = contrasts.fit(FIT, contrasts=contrast) FIT.CONTR = eBayes(FIT.CONTR) # Get differnetial expression DE = lapply(1:dim(contrast)[2], function(i, FIT){ topTable(FIT, coef=i, number = dim(cqn_counts$E)[1], confint = T) %>% rownameToFirstColumn('ensembl_gene_id') }, FIT.CONTR) names(DE) = c('FEMALE-MALE') DE = DE %>% rbindlist(idcol = 'Comparison') %>% left_join(biomart_results %>% dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>% unique()) %>% dplyr::mutate(Study = 'Emory_Vascular', Direction = logFC/abs(logFC), Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')), Direction = as.character(Direction)) DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE' writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2') tmp = DE %>% dplyr::select(ensembl_gene_id, Comparison, Direction) %>% group_by(Comparison, Direction) %>% dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>% spread(Direction, FDR_0_05_FC_1.2) kable(tmp) p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1)) p = p + scale_color_manual(values = c('green','grey','red')) p = p + facet_grid(.~Comparison, scales = 'fixed') p all.diff.exp = c(all.diff.exp, list(Diagnosis.Sex = DE)) all.fit = c(all.fit, list(Diagnosis.Sex = FIT))
COUNT$gene_id = rownames(COUNT) REPORTED.GENDER.COUNTS = GENE.GC.CONT %>% left_join(COUNT) %>% dplyr::select(-one_of("percentage_gene_gc_content")) %>% filter(chromosome_name == "X" |chromosome_name == "Y") %>% tidyr::gather(key = item, value = value, -c(gene_id, chromosome_name)) %>% dplyr::mutate(value = log(value)) %>% dplyr::rename(`counts(log)`= value) %>% dplyr::rename(SampleID = item) %>% left_join(METADATA[,c("SampleID", "Reported_Gender")]) %>% dplyr::rename(`Reported Gender` = Reported_Gender) my.theme <- theme_bw() %+replace% theme(legend.position = 'top', axis.text.x = element_text(angle = 90, hjust = 1), plot.title=element_text(hjust=0.5)) p = list() p[[1]] = ggplot(filter(REPORTED.GENDER.COUNTS, chromosome_name == "X"), aes(x = `Reported Gender`, y = `counts(log)`)) + geom_boxplot() p[[1]] = p[[1]] + ggtitle('X') + my.theme p[[2]] = ggplot(filter(REPORTED.GENDER.COUNTS, chromosome_name == "Y"), aes(x = `Reported Gender`, y = `counts(log)`)) + geom_boxplot() p[[2]] = p[[2]] + ggtitle('Y') + my.theme multiplot(plotlist = p, cols = 2) ##XIST and UTY expression #ENSG00000229807.11 and ENSG00000183878.15 FILT <- REPORTED.GENDER.COUNTS %>% filter(gene_id == "ENSG00000229807.11" | gene_id == "ENSG00000183878.15") %>% dplyr::select(-one_of("chromosome_name")) %>% tidyr::spread(key = gene_id, value = `counts(log)`) %>% mutate(XIST = as.numeric(`ENSG00000229807.11`)) %>% mutate(UTY = as.numeric(`ENSG00000183878.15`)) %>% mutate(UTY = ifelse(UTY == -Inf, 0, UTY)) %>% mutate(XIST = ifelse(XIST == -Inf, 0, XIST)) p = ggplot(FILT, aes (x= XIST, y = UTY)) p = p + geom_point(aes(color=`Reported Gender`)) + ggtitle("Sex Check HBCC Cohort") + theme(plot.title = element_text(hjust = 0.5, size = 15)) + labs(colour = "Reported Gender") p
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.