In jgockley62/UW_RNASeq: Identify differentially expressed genes counts data

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

Write your own config file.
Specify the active configuration by setting R_CONFIG_ACTIVE.

Sys.setenv(R_CONFIG_ACTIVE = "ihab")

Load the sageseqr library and login to Synapse. rnaseq_plan() calls the arguments from the config file and creates the drake plan. Execute drake::make(plan) to compute. Run this code from your project root.

#library(devtools)
#devtools::install_git('https://github.com/kelshmo/sageseqr.git')
#devtools::install_github('th1vairam/CovariateAnalysis@dev')
library(sageseqr)
library(edgeR)
library(ggplot2)
library(CovariateAnalysis) #get the package from devtools::install_github('th1vairam/CovariateAnalysis@dev')
library(data.table)
library(plyr)
library(tidyverse)
library(psych)
library(limma)
library(edgeR)
library(biomaRt)
library(RColorBrewer)
library(cqn)
library(glmnet)
library(knitr)
library(doParallel)
library(foreach)
library(githubr)
#BiocManager::install("WGCNA")

# Login to Synapse. Make a Synapse account and use synapser to login: https://r-docs.synapse.org/articles/manageSynapseCredentials.html
synapser::synLogin()

Sys.setenv(R_CONFIG_ACTIVE = "ihab")
# TO DO: how to setup cache for end user
# make_cache

# Run the analysis
plan <- sageseqr::rnaseq_plan(metadata_id = config::get("metadata")$synID,
                    metadata_version = config::get("metadata")$version,
                    counts_id = config::get("counts")$synID,
                    counts_version = config::get("counts")$version,
                    gene_id_input = config::get("counts")$`gene id`,
                    sample_id_input='individualID',
                    factor_input = config::get("factors"),
                    continuous_input = config::get("continuous"),
                    gene_id = config::get("biomart")$`gene id`,
                    biomart_id = config::get("biomart")$synID,
                    biomart_version = config::get("biomart")$version,
                    filters = config::get("biomart")$filters,
                    host = config::get("biomart")$host,
                    organism = config::get("biomart")$organism, 
                    conditions = config::get("conditions")
)
import_metadata <- sageseqr::get_data("syn21822937", 1L)
import_metadata <- import_metadata[ import_metadata$individualID != 'S155', ]

import_metadata$bmi <- import_metadata$weight/(import_metadata$height^2) *703
import_metadata[  as.character(import_metadata$diagnosis) %in% 'Mild Cognitive Impairment', ]$diagnosis <- 'MCI'
import_metadata$diagnosis <- as.factor(import_metadata$diagnosis)

import_metadata[  as.character(import_metadata$race) %in% 'Asian', ]$race <- 'Other'
import_metadata[  as.character(import_metadata$race) %in% 'Black or African American', ]$race <- 'Black_AA'

import_metadata$yearsEducation <- gsub(' ', '_', import_metadata$yearsEducation)


import_counts <- sageseqr::get_data("syn21969017", 1L)
import_counts <- import_counts[ ,colnames(import_counts) != 'S155' ]
import_counts[ grep('ENSG00000184895', import_counts$feature),]

counts <- tibble::column_to_rownames(import_counts, var = "feature")

import_metadata <- import_metadata[,c( "individualID", "sex", "race", "yearsEducation", "diagnosis", "bmi", "mocatotal",
"AlignmentSummaryMetrics__PF_READS_ALIGNED", "AlignmentSummaryMetrics__TOTAL_READS", 
"RnaSeqMetrics__INTERGENIC_BASES", "RnaSeqMetrics__INTRONIC_BASES", 
"RnaSeqMetrics__PCT_CODING_BASES", "RnaSeqMetrics__PCT_INTERGENIC_BASES", 
"RnaSeqMetrics__PCT_INTRONIC_BASES")]

clean_md <- sageseqr::clean_covariates(md = import_metadata, factors = c("individualID", "sex", "race", "yearsEducation", "diagnosis" ), sample_identifier= "individualID", continuous = c("bmi", "mocatotal",
"AlignmentSummaryMetrics__PF_READS_ALIGNED", "AlignmentSummaryMetrics__TOTAL_READS", 
"RnaSeqMetrics__INTERGENIC_BASES", "RnaSeqMetrics__INTRONIC_BASES", 
"RnaSeqMetrics__PCT_CODING_BASES", "RnaSeqMetrics__PCT_INTERGENIC_BASES", 
"RnaSeqMetrics__PCT_INTRONIC_BASES"
))

biomart_results <- sageseqr::get_biomart(count_df = counts, gene_id = "ensembl_gene_id", synid = "syn21983044", 
    version = 1L, filters = "ensembl_gene_id", host = "ensembl.org", 
    organism = "hsa")

filtered_counts <- sageseqr::filter_genes( clean_metadata = clean_md, count_df = counts, conditions = c("diagnosis", "sex"), cpm_threshold=1, conditions_threshold=.5 )

cqn_counts <- sageseqr::cqn(filtered_counts, biomart_results)

Sample clustering

PCA based clustering of samples

# Find principal components of expression to plot
PC <- prcomp(cqn_counts$E.no.na, scale.=T, center = T)
# Plot first 2 PCs
plotdata <- data.frame(individualID=rownames(PC$rotation), 
                       PC1=PC$rotation[,1], 
                       PC2=PC$rotation[,2])
plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'individualID'))
p <- ggplot(plotdata, aes(x=PC1, y=PC2))
p <- p + geom_point(aes(color=diagnosis, size=mocatotal))
p <- p + theme_bw() + theme(legend.position="right")
p

Tree based clustering of samples

# Eucledian tree based analysis
COVARIATES.tmp = data.matrix(METADATA[,c( "sex", "diagnosis", 'race', 'yearsEducation')])
COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0
tree = hclust(as.dist(t(cqn_counts$E.no.na)))
cols = WGCNA::labels2colors(COVARIATES.tmp);
WGCNA::plotDendroAndColors(tree, 
                           colors = cols, 
                           dendroLabels = FALSE, 
                           abHeight = 0.80, 
                           main = "Sample dendrogram",
                           groupLabels = colnames(COVARIATES.tmp))

Distribution of samples (log cpm)

# Plot abberent distribution of logcpm counts
tmp1 = cqn_counts$E %>%
  rownameToFirstColumn('Gene.ID') %>%
  tidyr::gather(individualID, logCPM, -Gene.ID) %>%
  left_join(METADATA %>%
              rownameToFirstColumn('individualID'))
p = ggplot(tmp1, aes(x = logCPM, color = individualID)) + geom_density() 
p = p + theme(legend.position = 'NONE') + facet_grid(.~diagnosis, scale = 'free')
p

Coexpression of genes

cr = cor(t(cqn_counts$E.no.na))
hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')

Significant Covariates

Correlation between pca of unadjusted mRNA expression and covariates are used to find significant covariates

# Find correlation between PC's of gene expression with covariates
cqn_counts$E.no.na = cqn_counts$E
cqn_counts$E.no.na[is.na(cqn_counts$E.no.na)] = 0

#clean_md
METADATA <- as.data.frame( clean_md)
row.names(METADATA) <- import_metadata$individualID 
METADATA <- METADATA[, colnames(METADATA) != 'individualID' ]

Meta_HeatMap <- METADATA
preAdjustedSigCovars = runPCAandPlotCorrelations(cqn_counts$E.no.na, 
                                                 Meta_HeatMap,
                                                 'NULL design(voom-normalized)', 
                                                 isKeyPlot=TRUE, 
                                                 MIN_PVE_PCT_PC = 1)

preAdjustedSigCovars[["PC_res"]][[2]]$plotData

parallelDuplicateCorrelation.lmer <- function (object, design = NULL, ndups = 2, spacing = 1, 
block = NULL,
                                               trim = 0.15, weights = NULL) {
  y <- getEAWP(object)
  M <- y$exprs
  ngenes <- nrow(M)
  narrays <- ncol(M)
  if (is.null(design))
    design <- y$design
  if (is.null(design))
    design <- matrix(1, ncol(y$exprs), 1)
  else {
    design <- as.matrix(design)
    if (mode(design) != "numeric")
      stop("design must be a numeric matrix")
  }
  if (nrow(design) != narrays)
    stop("Number of rows of design matrix does not match number of arrays")
  ne <- nonEstimable(design)
  if (!is.null(ne))
    cat("Coefficients not estimable:", paste(ne, collapse = " "),
        "\n")
  nbeta <- ncol(design)
  if (missing(ndups) && !is.null(y$printer$ndups))
    ndups <- y$printer$ndups
  if (missing(spacing) && !is.null(y$printer$spacing))
    spacing <- y$printer$spacing
  if (missing(weights) && !is.null(y$weights))
    weights <- y$weights
  if (!is.null(weights)) {
    weights <- as.matrix(weights)
    if (any(dim(weights) != dim(M)))
      weights <- array(weights, dim(M))
    M[weights < 1e-15] <- NA
    weights[weights < 1e-15] <- NA
  }
  if (is.null(block)) {
    if (ndups < 2) {
      warning("No duplicates: correlation between duplicates not estimable")
      return(list(cor = NA, cor.genes = rep(NA, nrow(M))))
    }
    if (is.character(spacing)) {
      if (spacing == "columns")
        spacing <- 1
      if (spacing == "rows")
        spacing <- object$printer$nspot.c
      if (spacing == "topbottom")
        spacing <- nrow(M)/2
    }
    Array <- rep(1:narrays, rep(ndups, narrays))
  }
  else {
    ndups <- 1
    nspacing <- 1
    Array <- block
  }
  if (is.null(block)) {
    M <- unwrapdups(M, ndups = ndups, spacing = spacing)
    ngenes <- nrow(M)
    if (!is.null(weights))
      weights <- unwrapdups(weights, ndups = ndups, spacing = spacing)
    design <- design %x% rep(1, ndups)
  }
  rho <- rep(NA, ngenes)
  nafun <- function(e) NA
  rho = foreach(i = 1:ngenes, .combine = rbind, .packages = c("lme4"),
                .export = c("M", "Array", "nbeta", "design", "weights",
                            "nafun"), .verbose = F) %dopar% {
                              y <- drop(M[i, ])
                              o <- is.finite(y)
                              A <- factor(Array[o])
                              nobs <- sum(o)
                              nblocks <- length(levels(A))
                              if (nobs > (nbeta + 2) && nblocks > 1 && nblocks < nobs - 1) {
                                data = cbind(data.frame(y = y[o]),
                                             design[o, , drop = FALSE],
                                             data.frame(A = A))
                                formula1 = paste0('y~',paste(setdiff(colnames(data), 
c('y','A')), collapse = '+'), '+(1|A)')
                                if (!is.null(weights)) {
                                  w <- drop(weights[i, ])[o]
                                  s <- tryCatch({
                                    mdl = lme4::lmer(formula1, data = data, weights = w);
                                    as.data.frame(VarCorr(mdl))
                                  }, error = nafun)
                                }
                                else {
                                  s <- tryCatch({
                                    mdl = lme4::lmer(formula1, data = data);
                                    as.data.frame(VarCorr(mdl))
                                  }, error = nafun)
                                }
                                if (!is.na(s[1]))
                                  rho <- s$vcov[1]/sum(s$vcov)
                                else
                                  rho <- NA
                              }
                            }
  arho <- atanh(pmax(-1, rho))
  mrho <- tanh(mean(arho, trim = trim, na.rm = TRUE))
  list(consensus.correlation = mrho, cor = mrho, atanh.correlations = arho)
}
parallelDuplicateCorrelation.mm2 <- function (object, design = NULL, ndups = 2, spacing = 1, 
block = NULL,
                                              trim = 0.15, weights = NULL) {
  y <- getEAWP(object)
  M <- y$exprs
  ngenes <- nrow(M)
  narrays <- ncol(M)
  if (is.null(design))
    design <- y$design
  if (is.null(design))
    design <- matrix(1, ncol(y$exprs), 1)
  else {
    design <- as.matrix(design)
    if (mode(design) != "numeric")
      stop("design must be a numeric matrix")
  }
  if (nrow(design) != narrays)
    stop("Number of rows of design matrix does not match number of arrays")
  ne <- nonEstimable(design)
  if (!is.null(ne))
    cat("Coefficients not estimable:", paste(ne, collapse = " "),
        "\n")
  nbeta <- ncol(design)
  if (missing(ndups) && !is.null(y$printer$ndups))
    ndups <- y$printer$ndups
  if (missing(spacing) && !is.null(y$printer$spacing))
    spacing <- y$printer$spacing
  if (missing(weights) && !is.null(y$weights))
    weights <- y$weights
  if (!is.null(weights)) {
    weights <- as.matrix(weights)
    if (any(dim(weights) != dim(M)))
      weights <- array(weights, dim(M))
    M[weights < 1e-15] <- NA
    weights[weights < 1e-15] <- NA
  }
  if (is.null(block)) {
    if (ndups < 2) {
      warning("No duplicates: correlation between duplicates not estimable")
      return(list(cor = NA, cor.genes = rep(NA, nrow(M))))
    }
    if (is.character(spacing)) {
      if (spacing == "columns")
        spacing <- 1
      if (spacing == "rows")
        spacing <- object$printer$nspot.c
      if (spacing == "topbottom")
        spacing <- nrow(M)/2
    }
    Array <- rep(1:narrays, rep(ndups, narrays))
  }
  else {
    ndups <- 1
    nspacing <- 1
    Array <- block
  }
  if (is.null(block)) {
    M <- unwrapdups(M, ndups = ndups, spacing = spacing)
    ngenes <- nrow(M)
    if (!is.null(weights))
      weights <- unwrapdups(weights, ndups = ndups, spacing = spacing)
    design <- design %x% rep(1, ndups)
  }
  rho <- rep(NA, ngenes)
  nafun <- function(e) NA
  rho = foreach(i = 1:ngenes, .combine = rbind, .packages = c("statmod"),
                .export = c("M", "Array", "nbeta", "design", "weights",
                            "nafun"), .verbose = F) %dopar% {
                              y <- drop(M[i, ])
                              o <- is.finite(y)
                              A <- factor(Array[o])
                              nobs <- sum(o)
                              nblocks <- length(levels(A))
                              if (nobs > (nbeta + 2) && nblocks > 1 && nblocks < nobs -
                                  1) {
                                y <- y[o]
                                X <- design[o, , drop = FALSE]
                                Z <- model.matrix(~0 + A)
                                if (!is.null(weights)) {
                                  w <- drop(weights[i, ])[o]
                                  s <- tryCatch(mixedModel2Fit(y, X, Z, w, only.varcomp = TRUE,
                                                               maxit = 20)$varcomp, error = 
nafun)
                                }
                                else {
                                  s <- tryCatch(mixedModel2Fit(y, X, Z, only.varcomp = TRUE,
                                                               maxit = 20)$varcomp, error = 
nafun)
                                }
                                if (!is.na(s[1]))
                                  rho <- s[2]/sum(s)
                                else rho <- NA
                              }
                            }
  arho <- atanh(pmax(-1, rho))
  mrho <- tanh(mean(arho, trim = trim, na.rm = TRUE))
  list(consensus.correlation = mrho, cor = mrho, atanh.correlations = arho)
}
parallelDuplicateCorrelation <- function (object, design = NULL, ndups = 2, spacing = 1, block 
= NULL,
                                          trim = 0.15, weights = NULL, method = 'lmer'){
  if (method == 'lmer'){
    parallelDuplicateCorrelation.lmer(object, design, ndups, spacing, block, trim, weights)
  } else if (method == 'mm2'){
    parallelDuplicateCorrelation.mm2(object, design, ndups, spacing, block, trim, weights)
  } else {
    error('Unknown Method')
  }
}

Normalisation (iterative design)

Since many covariates are correlated, re-normalising and re-adjusting COUNTS with an iterative design matrix 1. Adding Batch and Sex a priori to variable selection 2. Primary variable of interest Diagnosis is excluded from the pool of available covariates for selection

# Primary variable of interest
primaryVariable <- c( "mocatotal", "diagnosis")
FactorCovariates <- c( "sex", "race", "yearsEducation")
ContCovariates <- c("bmi", "mocatotal", 'AlignmentSummaryMetrics__PF_READS_ALIGNED', 'AlignmentSummaryMetrics__TOTAL_READS', 'RnaSeqMetrics__INTERGENIC_BASES', 'RnaSeqMetrics__INTRONIC_BASES', 'RnaSeqMetrics__PCT_CODING_BASES', 'RnaSeqMetrics__PCT_INTERGENIC_BASES')

#postAdjustCovars = c( 'sex', 'race', 'yearsEducation', 'bmi' );
postAdjustCovars = 'bmi';
# Assign residual covariates
residualCovars = setdiff(preAdjustedSigCovars$significantCovars, c(postAdjustCovars, primaryVariable))
residualSigCovars = preAdjustedSigCovars
covariatesEffects = preAdjustedSigCovars$Effects.significantCovars[residualCovars]
postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects))) %>% unique()
loopCount = 0 
NEW.COUNTS <- cqn_counts$E
while(length(residualSigCovars$significantCovars)!=0 && loopCount <= 20){
  writeLines(paste('Using following covariates in the model:',
                   paste(postAdjustCovars, collapse=', '),
                   'as fixed effects'))

  # Post adjusted design matrix
  DM1 = getDesignMatrix(METADATA[,postAdjustCovars,drop=F],Intercept = F)
  DM1$design = DM1$design[,linColumnFinder(DM1$design)$indepCols]

  # Estimate voom weights for dispersion control
  cnts = NEW.COUNTS
  cnts[is.na(cnts)] = 0
  VOOM.GENE_EXPRESSION = voom(cnts, 
                              design=DM1$design, 
                              plot=F)
                              #na.rm = T)

  # Fit linear model using new weights and new design
  VOOM.ADJUSTED.FIT = lmFit(cqn_counts$E,
                            design = DM1$design,
                            weights = VOOM.GENE_EXPRESSION$weights)

  # Residuals after normalisation
  RESIDUAL.GENE_EXPRESSION = residuals.MArrayLM(VOOM.ADJUSTED.FIT,
                                                cqn_counts$E)

  # Residual covariates to choose from
  residCovars <- setdiff(c(FactorCovariates,ContCovariates), c(postAdjustCovars, primaryVariable))

  # Find PC of residual gene expression and significant covariates that are highly correlated with PCs
  expr = RESIDUAL.GENE_EXPRESSION
  expr[is.na(expr)] = 0
  residualSigCovars = runPCAandPlotCorrelations(expr, 
                                                METADATA[, residCovars, drop=F], 
                                                'adjusted design(voom-normalized)',
                                                isKeyPlot=TRUE)

  # Add postadjusted covariates (if any)
  residCovars = setdiff(residualSigCovars$significantCovars, c(postAdjustCovars, primaryVariable))
  covariatesEffects = residualSigCovars$Effects.significantCovars[residCovars]

  postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects)))
  loopCount = loopCount + 1
}
modelStr <- paste(paste(gsub('_','\\\\_',postAdjustCovars), collapse=', '),
                  'as fixed effects')
tmp <- paste('Using following covariates in the final model:', modelStr)

Sanity check

# Find PC of residual gene expression and significant covariates that are highly correlated with PCs
residualSigCovars = runPCAandPlotCorrelations(expr, 
                                              METADATA,
                                              'adjusted design(voom-normalized)',
                                              isKeyPlot=TRUE)
residualSigCovars[["PC_res"]][[2]]$plotData

Coexpression of genes

cr = cor(t(expr))
hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')

PCA of residual data

# Find principal components of expression to plot
PC <- prcomp(expr, scale.=T, center = T)
# Plot first 4 PCs
plotdata <- data.frame(individualID=rownames(PC$rotation), 
                       PC1=PC$rotation[,1], 
                       PC2=PC$rotation[,2])
plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'individualID'))
p <- ggplot(plotdata, aes(x=PC1, y=PC2)) 
p <- p + geom_point(aes(color=diagnosis, size=mocatotal))
p <- p + theme_bw() + theme(legend.position="right")
p

Tree based clustering of residual data

# Eucledian tree based analysis
COVARIATES.tmp = data.matrix(METADATA[,c( 'sex', 'yearsEducation', 'race', primaryVariable)])
COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0
tree = hclust(as.dist(t(expr)))
cols = WGCNA::labels2colors(COVARIATES.tmp);
WGCNA::plotDendroAndColors(tree, 
                           colors = cols, 
                           dendroLabels = FALSE, 
                           abHeight = 0.80, 
                           main = "Sample dendrogram",
                           groupLabels = colnames(COVARIATES.tmp))

Adjust data with covariates for Network Analysis

Identified covariates are regressed out from the expression matrix for network analysis

# Get design matrix
DESIGN.NET = getDesignMatrix(METADATA[, postAdjustCovars, drop = F], Intercept = F)
DESIGN.NET = DESIGN.NET$design[,linColumnFinder(DESIGN.NET$design)$indepCols]
# Estimate voom weights for dispersion control
cnts = NEW.COUNTS
cnts[is.na(cnts)] = 0
VOOM.NET.WEIGHTS = voom(cnts, design=DESIGN.NET, plot=F)
# Fit linear model using new weights and new design
VOOM.NET.FIT = lmFit(cqn_counts$E,
                     design = DESIGN.NET,
                     weights = VOOM.NET.WEIGHTS$weights)
# Residuals after normalisation
RESIDUAL.NET.GENE_EXPRESSION = residuals.MArrayLM(VOOM.NET.FIT,
                                                  cqn_counts$E)

Differential expression analysis (with Diagnosis as primary variable)

Differential expression is performed on the primary variable by controlling for covariates identified above

Interpretation of Diagnosis 1. MCI: Mildly Impaired 2. CONTROL: cognitivly non-impaired

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are

# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c(primaryVariable[2], postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$counts, design=DESIGN$design, plot=F)
#_#VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            weights = VOOM.WEIGHTS$weights)
# Fit contrast
contrast = makeContrasts(contrasts=c("diagnosisMCI-diagnosisControl"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('MCI-CONTROL')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'Emory_Vascular',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = DE %>% dplyr::filter(Comparison == 'MCI-CONTROL') %>% 
  ggplot(aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(Diagnosis = DE))
all.fit = c(all.fit, list(Diagnosis = FIT))

Differential expression analysis (with sex)

Differential expression is performed on the primary variable by controlling for covariates identified above

Interpretation of Diagnosis 1. MCI: Mildly Impaired 2. CONTROL: cognitivly non-impaired

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are

# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c('sex', postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom( cqn_counts$counts, design=DESIGN$design, plot=F )
#VOOM.WEIGHTS = voom( cqn_counts$E, design=DESIGN$design, plot=F )

# Fit linear model using new weights and new design
#LOG_Exp <- log2(cqn_counts$E)
#LOG_Exp[is.na(LOG_Exp)]
FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            weights = VOOM.WEIGHTS$weights)
# Fit contrast
contrast = makeContrasts(contrasts=c("sexfemale"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('Sex')
biomart_results$ensembl_gene_id <- row.names(biomart_results)
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'Emory_Vascular',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = DE %>% dplyr::filter(Comparison == 'sex') %>% 
  ggplot(aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(Diagnosis = DE))
all.fit = c(all.fit, list(Diagnosis = FIT))

Differential expression analysis (with Diagnosis and Sex)

Differential expression is performed on the Diagnosis x Sex variable by controlling for covariates identified above

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are

## Modeling Diagnosis, msex and age_death conjointly
METADATA$Diagnosis.Sex = paste(METADATA$diagnosis,METADATA$sex, sep = '.') %>%
  factor
# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c('Diagnosis.Sex', setdiff(postAdjustCovars, 'Sex')), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('Diagnosis.Sex','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
FIT = lmFit(CQN.GENE_EXPRESSION$E,
            design = DESIGN$design,
            weights = VOOM.WEIGHTS$weights)
# Fit contrast
contrast = makeContrasts(contrasts=c("(MCI.female+MCI.male)/2-(Control.female+Control.male)/2",
                                     "(MCI.female+Control.female)/2-(MCI.male+Control.male)/2",
                                     "(MCI.female-Control.female)/2-(MCI.male-Control.male)/2",
                                     "MCI.female-Control.female",
                                     "MCI.male-Control.male"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(NEW.COUNTS)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('MCI-CONTROL','FEMALE-MALE','MCI-CONTROL.IN.FEMALE-MALE','MCI-CONTROL.IN.FEMALE', 'MCI-CONTROL.IN.MALE')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(GENE.PARAM %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gc_content, gene.length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'ROSMAP', Region = 'DLPFC',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(Diagnosis.Sex = DE))
all.fit = c(all.fit, list(Diagnosis.Sex = FIT))

# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c('diagnosis', primaryVariable[2], postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(CQN.GENE_EXPRESSION$E, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
FIT = lmFit(CQN.GENE_EXPRESSION$E,
            design = DESIGN$design,
            weights = VOOM.WEIGHTS$weights)
# Fit contrast
contrast = makeContrasts(contrasts=c("sexfemale-sexmale"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('MCI-CONTROL')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'Emory_Vascular',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = DE %>% dplyr::filter(Comparison == 'MCI-CONTROL') %>% 
  ggplot(aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(Diagnosis = DE))
all.fit = c(all.fit, list(Diagnosis = FIT))

###########AFTER

Normalisation (iterative design)

Since many covariates are correlated, re-normalising and re-adjusting COUNTS with an iterative design matrix 1. Adding Sex, bmi, race, and years education a priori to variable selection 2. Primary variable of interest Diagnosis is excluded from the pool of available covariates for selection

library(variancePartition)
#NEW.COUNTS <- cqn_counts$E.no.na
NEW.COUNTS <- cqn_counts$counts
METADATA$Individual_ID <- row.names(METADATA)
# Primary variable of interest
primaryVariable <- c("mocatotal", "diagnosis")

#Leave out bmi because of missing weight variable 
## postAdjustCovars = c( 'sex', 'race', 'yearsEducation' );
postAdjustCovars = c( 'race', 'yearsEducation', 'bmi');

# Assign residual covariates
residualCovars = setdiff(preAdjustedSigCovars$significantCovars, c(postAdjustCovars, primaryVariable))
residualSigCovars = preAdjustedSigCovars
covariatesEffects = preAdjustedSigCovars$Effects.significantCovars[residualCovars]
postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects))) %>% unique()

#_#Set the cov list
total_cov <- c( preAdjustedSigCovars$significantCovars, postAdjustCovars, primaryVariable )
FactorCovariates <- total_cov[ total_cov %in% config::get('factors') ] 
ContCovariates <- total_cov[ total_cov %in% config::get('continuous')[ -grep( 'PCT_RIBOSOMAL', config::get('continuous'))] ] 

loopCount = 0 
notEstimable<-NULL
while(length(residualSigCovars$significantCovars)!=0 && loopCount <= 20){
  writeLines(paste('Using following covariates in the model:',
                   paste(postAdjustCovars, collapse=', '),
                   'as fixed effects'))

  DM1 = getDesignMatrix(METADATA[,postAdjustCovars,drop=F],Intercept = F)
  DM1$design = DM1$design[,linColumnFinder(DM1$design)$indepCols]

  VOOM.GENE_EXPRESSION = voom(NEW.COUNTS, 
                              design=DM1$design, 
                              plot=F)

  # Fit linear model using new weights and new design
  ##Replace CQN.GENE_EXPRESSION$E w/ new pipeline cqn
  VOOM.ADJUSTED.FIT = lmFit(cqn_counts$E,
                            design = DM1$design,
                            weights = VOOM.GENE_EXPRESSION$weights)

  # Residuals after normalisation
  RESIDUAL.GENE_EXPRESSION = residuals.MArrayLM(VOOM.ADJUSTED.FIT,
                                                cqn_counts$E)

  # Residual covariates to choose from
  residCovars <- setdiff(c(FactorCovariates,ContCovariates), c(postAdjustCovars, primaryVariable))

  # Find PC of residual gene expression and significant covariates that are highly correlated with PCs
  expr = RESIDUAL.GENE_EXPRESSION
  expr[is.na(expr)] = 0
  residualSigCovars = runPCAandPlotCorrelations(expr, 
                                                METADATA[, residCovars, drop=F], 
                                                'adjusted design(voom-normalized)',
                                                isKeyPlot=TRUE)

  # Add postadjusted covariates (if any)
  residCovars = setdiff(residualSigCovars$significantCovars, c(postAdjustCovars, primaryVariable))
  covariatesEffects = residualSigCovars$Effects.significantCovars[residCovars]

  postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects)))
  loopCount = loopCount + 1
}
modelStr <- paste(paste(gsub('_','\\\\_',postAdjustCovars), collapse=', '),
                  'as fixed effects')
tmp <- paste('Using following covariates in the final model:', modelStr)

r tmp

Sanity check

# Find PC of residual gene expression and significant covariates that are highly correlated with PCs
residualSigCovars = runPCAandPlotCorrelations(expr, 
                                              Meta_HeatMap,
                                              'adjusted design(voom-normalized)',
                                              isKeyPlot=TRUE)
residualSigCovars[["PC_res"]][[2]]$plotData

Coexpression of genes

cr = cor(t(expr))
hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')

PCA of residual data

# Find principal components of expression to plot
PC <- prcomp(expr, scale.=T, center = T)
# Plot first 4 PCs
METADATA <- METADATA[ , c('Individual_ID', colnames(METADATA)[ (colnames(METADATA) %in% 'Individual_ID')==F ]) ]
plotdata <- data.frame(Individual_ID=rownames(PC$rotation), 
                       PC1=PC$rotation[,1], 
                       PC2=PC$rotation[,2])
plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'Induvidual_ID'))
p <- ggplot(plotdata, aes(x=PC1, y=PC2)) 
p <- p + geom_point(aes(color=diagnosis, size=mocatotal))
#p <- p + geom_point()
p <- p + theme_bw() + theme(legend.position="right")
p

Tree based clustering of residual data

# Eucledian tree based analysis
COVARIATES.tmp = data.matrix(METADATA[,c( 'sex', 'race', 'yearsEducation', primaryVariable) ])
COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0
tree = hclust(as.dist(t(expr)))
cols = WGCNA::labels2colors(COVARIATES.tmp);
WGCNA::plotDendroAndColors(tree, 
                           colors = cols, 
                           dendroLabels = FALSE, 
                           abHeight = 0.80, 
                           main = "Sample dendrogram",
                           groupLabels = colnames(COVARIATES.tmp))

Adjust data with covariates for Network Analysis

Identified covariates are regressed out from the expression matrix for network analysis

# Get design matrix
DESIGN.NET = getDesignMatrix(METADATA[, postAdjustCovars, drop = F], Intercept = F)
DESIGN.NET = DESIGN.NET$design[,linColumnFinder(DESIGN.NET$design)$indepCols]
# Estimate voom weights for dispersion control
cnts = NEW.COUNTS
cnts[is.na(cnts)] = 0
VOOM.NET.WEIGHTS = voom(cnts, design=DESIGN.NET, plot=F)
# Fit linear model using new weights and new design
VOOM.NET.FIT = lmFit(cqn_counts$E,
                     design = DESIGN.NET,
                     weights = VOOM.NET.WEIGHTS$weights)
# Residuals after normalisation
RESIDUAL.NET.GENE_EXPRESSION = residuals.MArrayLM(VOOM.NET.FIT,
                                                  cqn_counts$E)

Differential expression analysis (with diagnosis as primary variable)

Differential expression is performed on the primary variable by controlling for covariates identified above

Interpretation of Diagnosis 1. Mild Cognitive Impairment: Samples from individuals defined as Mildly cognitivly impared 2. CONTROL: Samples from individuals defined as not cognitivly impared

Genes that are differentially expressed at an FDR <= 0.05 are

# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c(primaryVariable[2], postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
# Re-estimate voom weights
VOOM.WEIGHTS = voom(cqn_counts$E,
                    design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
FIT = lmFit(VOOM.WEIGHTS)
# Fit contrast
cntr = c()
cntr <- 'diagnosisControl-diagnosisMCI'
contrast = makeContrasts(contrasts=cntr,
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(cqn_counts$E)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = colnames(contrast)

biomart_results$ensembl_gene_id <- row.names(biomart_results)
DE = DE %>% 
  data.table::rbindlist(idcol = 'Comparison') %>%
  dplyr::mutate(Comparison = gsub('Diagnosis','',Comparison),
                Study = 'Emory_Vascular',
                Sex = 'ALL') %>%
  tidyr::separate(Comparison, c('from.state','to.state'), sep = '-') %>%
  tidyr::unite(Comparison, from.state, to.state, sep = '-') %>%
  dplyr::left_join(biomart_results %>%
                     dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
                     unique())
DE$Direction = 'NONE'
DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC <= -log2(1.2)] = 'DOWN'
DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC >= log2(1.2)] = 'UP'

tmp = DE %>%
  dplyr::select(ensembl_gene_id, Direction) %>%
  group_by(Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red')) + theme(legend.position='top')
#p = p + facet_grid(Tissue+.~Comparison, scales = 'fixed')
p
DE$Comparison <- 'MCI-CONTROL'
all.diff.exp <- list()
all.fit <- list()
all.diff.exp = c(all.diff.exp, list(Diagnosis = DE))
all.fit = c(all.fit, list(Diagnosis = FIT))

Associate differential expression results with gc content, gene length and average expression

pl = list()
pl[[1]] = ggplot(DE %>% dplyr::filter(Comparison == 'MCI-CONTROL'), 
                 aes(x = log10(gene_length), y = logFC, color = Direction)) + geom_point() 
pl[[1]] = pl[[1]] + geom_smooth(method = 'loess', inherit.aes = FALSE, aes(x = log10(gene_length), y = logFC))
#pl[[1]] = pl[[1]] + facet_grid(.+Comparison) + scale_color_manual(values = c('green','grey','red'))
pl[[1]] = pl[[1]] + theme(legend.position = 'top')
pl[[2]] = ggplot(DE %>% dplyr::filter(Comparison == 'MCI-CONTROL'), 
                 aes(x = percentage_gene_gc_content, y = logFC, color = Direction)) + geom_point()
pl[[2]] = pl[[2]] + geom_smooth(method = 'loess', inherit.aes = FALSE, aes(x = percentage_gene_gc_content, y = logFC))
#pl[[2]] = pl[[2]] + facet_grid(Tissue~.+Comparison) + scale_color_manual(values = c('green','grey','red'))
pl[[2]] = pl[[2]] + theme(legend.position = 'top')
pl[[3]] = ggplot(DE %>% dplyr::filter(Comparison == 'MCI-CONTROL'), 
                 aes(x = AveExpr, y = logFC, color = Direction)) + geom_point()
pl[[3]] = pl[[3]] + geom_smooth(method = 'loess', inherit.aes = FALSE, aes(x = AveExpr, y = logFC))
#pl[[3]] = pl[[3]] + facet_grid(Tissue~.+Comparison) + scale_color_manual(values = c('green','grey','red'))
pl[[3]] = pl[[3]] + theme(legend.position = 'top')
multiplot(plotlist = pl, cols = 3)

Differential expression analysis (with Diagnosis and Sex)

Differential expression is performed on the Diagnosis x Sex variable by controlling for covariates identified above

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are

## Modeling Diagnosis, msex and age_death conjointly
METADATA$Diagnosis.Sex = paste(METADATA$diagnosis,METADATA$sex, sep = '.') %>%
  factor
# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c('Diagnosis.Sex', setdiff(postAdjustCovars, 'sex')), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('Diagnosis.Sex','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$E, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            weights = VOOM.WEIGHTS$weights)
# Fit contrast
contrast = makeContrasts(contrasts=c("(MCI.female+MCI.male)/2-(Control.female+Control.male)/2",
  "(MCI.female+Control.female)/2-(MCI.male+Control.male)/2",                                  "(MCI.female-Control.female)/2-(MCI.male-Control.male)/2",
  "MCI.female-Control.female",
  "MCI.male-Control.male"),
  levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(cqn_counts$E)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('MCI-CONTROL','FEMALE-MALE','MCI-CONTROL.IN.FEMALE-MALE','MCI-CONTROL.IN.FEMALE', 'MCI-CONTROL.IN.MALE')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'Emory_Vascular',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(Diagnosis.Sex = DE))
all.fit = c(all.fit, list(Diagnosis.Sex = FIT))

Differential expression analysis (with Diagnosis and MOCHATotal)

Differential expression is performed on the Diagnosis x MOCHATotal variable by controlling for covariates identified above

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are

# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c(primaryVariable[2], setdiff(postAdjustCovars, 'mocatotal')), drop = F], Intercept = F)
ind = grep('diagnosis', colnames(DESIGN$design))
DESIGN$design[,ind] = DESIGN$design[,ind] * METADATA[rownames(DESIGN$design),'mocatotal']
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design)[ind] = gsub('diagnosis', '', paste0(colnames(DESIGN$design)[ind], 'mocatotal'))
# Re-estimate voom weights
VOOM.WEIGHTS = voom( cqn_counts$E,
                    design=DESIGN$design, plot=F)#,
                    #block = METADATA$Individual_ID, 
                    #correlation = correlation$cor)
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
FIT = lmFit(VOOM.WEIGHTS)
# Fit contrast
cntr = c( "MCImocatotal-Controlmocatotal" )#,
         #"IFG.AD.AOD-IFG.CONTROL.AOD",
         #"PHG.AD.AOD-PHG.CONTROL.AOD",
         #"STG.AD.AOD-STG.CONTROL.AOD")
contrast = makeContrasts(contrasts=cntr,
                         levels = colnames(FIT$coefficients))
colnames(contrast) = cntr
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(cqn_counts$counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = colnames(contrast)
DE = DE %>% 
  data.table::rbindlist(idcol = 'Comparison') %>%
  dplyr::mutate( Sex = 'ALL') %>%
  tidyr::separate(Comparison, c('from.state','to.state'), sep = '-') %>%
  #tidyr::separate(from.state, c('reference','Sex'), sep = '\\.') %>%
  #tidyr::separate(to.state, c('Tissue1','against','Sex1'), sep = '\\.') %>%
  tidyr::unite(Comparison, from.state, to.state, sep = '-') %>%
  #dplyr::select(-Tissue1, -Sex1, -Sex) %>%
  dplyr::left_join(biomart_results %>%
                     dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
                     unique())
DE$Direction = 'NONE'
DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC <= -log2(1.2)] = 'DOWN'
DE$Direction[DE$adj.P.Val <= 0.05 & DE$logFC >= log2(1.2)] = 'UP'
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
all.diff.exp = c(all.diff.exp, list(Diagnosis.mocatotal = DE))
all.fit = c(all.fit, list(Diagnosis.AOD = FIT))

Differential expression analysis by Sex (sanity check)

Differential expression is performed on Sex variable by controlling for covariates identified above

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are

## Modeling Diagnosis, msex and age_death conjointly
#METADATA$Diagnosis.Sex = paste(METADATA$diagnosis,METADATA$sex, sep = '.') %>%
#  factor
# Get design matrix
DESIGN = getDesignMatrix(METADATA[, c( 'sex',setdiff(postAdjustCovars, 'sex')), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('sex','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$E.no.na, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
FIT = lmFit(cqn_counts$E.no.na,
            design = DESIGN$design,
            weights = VOOM.WEIGHTS$weights)
# Fit contrast
contrast = makeContrasts(contrasts=c("male-female"),
  levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(cqn_counts$E)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('FEMALE-MALE')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'Emory_Vascular',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(Diagnosis.Sex = DE))
all.fit = c(all.fit, list(Diagnosis.Sex = FIT))

Sex Chromosome-specific Gene Expression Patterns

COUNT$gene_id = rownames(COUNT) 
REPORTED.GENDER.COUNTS = GENE.GC.CONT %>% 
  left_join(COUNT) %>%
  dplyr::select(-one_of("percentage_gene_gc_content")) %>%
  filter(chromosome_name == "X" |chromosome_name == "Y") %>% 
  tidyr::gather(key = item, value = value, -c(gene_id, chromosome_name)) %>%
  dplyr::mutate(value = log(value)) %>%
  dplyr::rename(`counts(log)`= value) %>% 
  dplyr::rename(SampleID = item) %>%
  left_join(METADATA[,c("SampleID", "Reported_Gender")]) %>% 
  dplyr::rename(`Reported Gender` = Reported_Gender) 
my.theme <- theme_bw() %+replace% theme(legend.position = 'top', axis.text.x = element_text(angle = 90, hjust = 1), plot.title=element_text(hjust=0.5))
p = list()
p[[1]] = ggplot(filter(REPORTED.GENDER.COUNTS, chromosome_name == "X"), aes(x = `Reported Gender`, y = `counts(log)`)) + geom_boxplot()
p[[1]] = p[[1]] + ggtitle('X') + my.theme
p[[2]] = ggplot(filter(REPORTED.GENDER.COUNTS, chromosome_name == "Y"), aes(x = `Reported Gender`, y = `counts(log)`)) + geom_boxplot()
p[[2]] = p[[2]] + ggtitle('Y') + my.theme
multiplot(plotlist = p, cols = 2)
##XIST and UTY expression 
#ENSG00000229807.11 and ENSG00000183878.15 
FILT <- REPORTED.GENDER.COUNTS %>% 
  filter(gene_id == "ENSG00000229807.11" | gene_id == "ENSG00000183878.15") %>% 
  dplyr::select(-one_of("chromosome_name")) %>% 
  tidyr::spread(key = gene_id, value = `counts(log)`) %>% 
  mutate(XIST = as.numeric(`ENSG00000229807.11`)) %>% 
  mutate(UTY = as.numeric(`ENSG00000183878.15`)) %>% 
  mutate(UTY = ifelse(UTY == -Inf, 0, UTY)) %>% 
  mutate(XIST = ifelse(XIST == -Inf, 0, XIST))
p = ggplot(FILT, aes (x= XIST, y = UTY)) 
p = p + geom_point(aes(color=`Reported Gender`)) + 
  ggtitle("Sex Check HBCC Cohort") + 
  theme(plot.title = element_text(hjust = 0.5, size = 15)) +
  labs(colour = "Reported Gender")
p

jgockley62/UW_RNASeq documentation built on Oct. 1, 2020, 8:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jgockley62/UW_RNASeq
Identify differentially expressed genes counts data

In jgockley62/UW_RNASeq: Identify differentially expressed genes counts data

Sample clustering

Distribution of samples (log cpm)

Significant Covariates

Normalisation (iterative design)

Sanity check

Adjust data with covariates for Network Analysis

Differential expression analysis (with Diagnosis as primary variable)

Differential expression analysis (with sex)

Differential expression analysis (with Diagnosis and Sex)

###########AFTER

Normalisation (iterative design)

Sanity check

Adjust data with covariates for Network Analysis

Differential expression analysis (with diagnosis as primary variable)

Associate differential expression results with gc content, gene length and average expression

Differential expression analysis (with Diagnosis and Sex)

Differential expression analysis (with Diagnosis and MOCHATotal)

Differential expression analysis by Sex (sanity check)

Sex Chromosome-specific Gene Expression Patterns

R Package Documentation

Browse R Packages

We want your feedback!

jgockley62/UW_RNASeq Identify differentially expressed genes counts data

In jgockley62/UW_RNASeq: Identify differentially expressed genes counts data

Sample clustering

Distribution of samples (log cpm)

Significant Covariates

Normalisation (iterative design)

Sanity check

Adjust data with covariates for Network Analysis

Differential expression analysis (with Diagnosis as primary variable)

Differential expression analysis (with sex)

Differential expression analysis (with Diagnosis and Sex)

###########AFTER

Normalisation (iterative design)

Sanity check

Adjust data with covariates for Network Analysis

Differential expression analysis (with diagnosis as primary variable)

Associate differential expression results with gc content, gene length and average expression

Differential expression analysis (with Diagnosis and Sex)

Differential expression analysis (with Diagnosis and MOCHATotal)

Differential expression analysis by Sex (sanity check)

Sex Chromosome-specific Gene Expression Patterns

R Package Documentation

Browse R Packages

We want your feedback!

jgockley62/UW_RNASeq
Identify differentially expressed genes counts data