In jgockley62/UW_RNASeq: Identify differentially expressed genes counts data

  options(xtable.type="html")
knitr::opts_chunk$set(
  echo=FALSE,
  warning=FALSE,
  message=FALSE,
  error = FALSE,
  tidy = FALSE,
  cache = TRUE,
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

Write your own config file.
Specify the active configuration by setting R_CONFIG_ACTIVE.

Sys.setenv(R_CONFIG_ACTIVE = "UW")

Load the sageseqr library and login to Synapse. rnaseq_plan() calls the arguments from the config file and creates the drake plan. Execute drake::make(plan) to compute. Run this code from your project root.

#library(devtools)
#devtools::install_git('https://github.com/kelshmo/sageseqr.git')
#devtools::install_github('th1vairam/CovariateAnalysis@dev')
library(sageseqr)
library(edgeR)
library(ggplot2)
library(CovariateAnalysis) #get the package from devtools::install_github('th1vairam/CovariateAnalysis@dev')
library(data.table)
library(plyr)
library(tidyverse)
library(psych)
library(limma)
library(edgeR)
library(biomaRt)
library(RColorBrewer)
library(cqn)
library(glmnet)
library(knitr)
library(doParallel)
library(foreach)
library(githubr)
#BiocManager::install("WGCNA")

# Login to Synapse. Make a Synapse account and use synapser to login: https://r-docs.synapse.org/articles/manageSynapseCredentials.html
synapser::synLogin()

Sys.setenv(R_CONFIG_ACTIVE = "UW")
# TO DO: how to setup cache for end user
# make_cache

# Run the analysis

#plan <- sageseqr::rnaseq_plan(metadata_id = config::get("metadata")$synID,
#                    metadata_version = config::get("metadata")$version,
#                    counts_id = config::get("counts")$synID,
#                    counts_version = config::get("counts")$version,
#                    gene_id_input = config::get("counts")$`gene id`,
#                    sample_id_input='individualID',
#                    factor_input = config::get("factors"),
#                    continuous_input = config::get("continuous"),
#                    gene_id = config::get("biomart")$`gene id`,
#                    biomart_id = config::get("biomart")$synID,
#                    biomart_version = config::get("biomart")$version,
#                    filters = config::get("biomart")$filters,
#                    host = config::get("biomart")$host,
#                    organism = config::get("biomart")$organism, 
#                    conditions = config::get("conditions")
#)
import_metadata <- sageseqr::get_data("syn22254834", 8L)
import_metadata$sample <- paste0( 'S', import_metadata$specimenID)
import_metadata$sampleID <- paste0( 'S', import_metadata$specimenID)
import_metadata$individualID <- paste0( 'I', import_metadata$individualID)
import_metadata$Family_ID <- paste0( 'F', import_metadata$individualID)

import_metadata$totalID <- paste0( import_metadata$individualID, "_", import_metadata$Family_ID )

#trans<-c(1,0)
#names(trans)<-c('female','male')
#import_metadata$sex <- trans[import_metadata$sex]

import_counts <- sageseqr::get_data("syn22249653", 1L)

counts <- tibble::column_to_rownames(import_counts, var = 'V1')
colnames( counts ) <- counts[ 'feature', ]
counts <- counts[ row.names(counts) != 'feature', ]
colnames(counts) <- paste0( 'S', colnames(counts) )

import_metadata <- import_metadata[,c("sampleID", "totalID", "individualID", "Family_ID", "disease_cohort", 'sex', "ind_mut_status", "apoeGenotype", "ind_mutation", "ADAD_fam_mut", "CDR", "age", "RIN", "AlignmentSummaryMetrics__PF_READS_ALIGNED", "AlignmentSummaryMetrics__TOTAL_READS",  "RnaSeqMetrics__INTERGENIC_BASES", "RnaSeqMetrics__INTRONIC_BASES", "RnaSeqMetrics__PCT_CODING_BASES", "RnaSeqMetrics__PCT_INTERGENIC_BASES", "RnaSeqMetrics__PCT_INTRONIC_BASES", "RnaSeqMetrics__PCT_RIBOSOMAL_BASES")]

#clean_md <- sageseqr::clean_covariates(md = import_metadata, factors = c("sampleID", "individualID", "Family_ID", "disease_cohort", 'sex', "ind_mut_status", "apoeGenotype", "ind_mutation", "ADAD_fam_mut", "CDR" ), sample_identifier= "sampleID", continuous = c("age", "RIN",
#  "AlignmentSummaryMetrics__PF_READS_ALIGNED",
#  "AlignmentSummaryMetrics__TOTAL_READS",  "RnaSeqMetrics__INTERGENIC_BASES",
#  "RnaSeqMetrics__INTRONIC_BASES", "RnaSeqMetrics__PCT_CODING_BASES",
#  "RnaSeqMetrics__PCT_INTERGENIC_BASES", "RnaSeqMetrics__PCT_INTRONIC_BASES",
#  "RnaSeqMetrics__PCT_RIBOSOMAL_BASES"
#))

clean_md <- sageseqr::clean_covariates(md = import_metadata, factors = c("sampleID", "totalID", "disease_cohort", 'sex', "ind_mut_status", "apoeGenotype", "ind_mutation", "ADAD_fam_mut", "CDR" ), sample_identifier= "sampleID", continuous = c("age", "RIN",
  "AlignmentSummaryMetrics__PF_READS_ALIGNED",
  "AlignmentSummaryMetrics__TOTAL_READS",   "RnaSeqMetrics__INTERGENIC_BASES",
  "RnaSeqMetrics__INTRONIC_BASES", "RnaSeqMetrics__PCT_CODING_BASES",
  "RnaSeqMetrics__PCT_INTERGENIC_BASES", "RnaSeqMetrics__PCT_INTRONIC_BASES",
  "RnaSeqMetrics__PCT_RIBOSOMAL_BASES"
))

filtered_counts <- sageseqr::filter_genes( clean_metadata = clean_md, count_df = counts, conditions = c("ind_mut_status", "ADAD_fam_mut", "ind_mutation", "sex"), cpm_threshold=1, conditions_threshold=.5 )

biomart_results <- get_biomart(count_df = counts, gene_id = "ensembl_gene_id", synid = "syn22254975", 
    version = 2L, filters = "ensembl_gene_id", host = "ensembl.org", 
    organism = "hsa")
colnames(biomart_results)[1] <- "ensembl_gene_id"

filtered_counts<- filtered_counts[ row.names(filtered_counts) %in% row.names(biomart_results), ]
table(row.names(filtered_counts) %in% row.names(biomart_results))

cqn_counts <- sageseqr::cqn(filtered_counts, biomart_results)

Sex Chromosome-specific Gene Expression Patterns

Cohort Sex check based on the sex specific transcripts UTY and XIST. Sample 19 was not designated male or female and is infered to be female based on no UTY expression and high XIST expression

COUNT <- as.data.frame( counts )
COUNT$ensembl_gene_id <- do.call(rbind, strsplit(row.names(COUNT), '\\.'))[,1]
COUNT <- COUNT[ grepl('ENSG', row.names(COUNT)), ]
COUNT <- COUNT[ COUNT$ensembl_gene_id %in% biomart_results$ensembl_gene_id, ]

METADATA <- as.data.frame( import_metadata )
row.names(METADATA) <- METADATA$sampleID

REPORTED.GENDER.COUNTS = biomart_results %>% 
  left_join(COUNT) %>%
  dplyr::select(-one_of("percentage_gene_gc_content")) %>%
  filter(chromosome_name == "X" |chromosome_name == "Y") %>% 
  tidyr::gather(key = item, value = value, -c( ensembl_gene_id, hgnc_symbol, gene_biotype, chromosome_name, gene_length, ensembl_gene_id)) %>%
  dplyr::mutate(value = log(value)) %>%
  dplyr::rename(`counts(log)`= value) %>% 
  dplyr::rename(sampleID = item) %>%
  left_join(METADATA[,c("sampleID", "sex")]) %>% 
  dplyr::rename(`Reported Gender` = sex) 

my.theme <- theme_bw() %+replace% theme(legend.position = 'top', axis.text.x = element_text(angle = 90, hjust = 1), plot.title=element_text(hjust=0.5))
p = list()
p[[1]] = ggplot(filter(REPORTED.GENDER.COUNTS, chromosome_name == "X"), aes(x = `Reported Gender`, y = `counts(log)`)) + geom_boxplot()
p[[1]] = p[[1]] + ggtitle('X') + my.theme
p[[2]] = ggplot(filter(REPORTED.GENDER.COUNTS, chromosome_name == "Y"), aes(x = `Reported Gender`, y = `counts(log)`)) + geom_boxplot()
p[[2]] = p[[2]] + ggtitle('Y') + my.theme
multiplot(plotlist = p, cols = 2)
##XIST and UTY expression 
#ENSG00000229807.11 and ENSG00000183878.15 

#Plot initial data
FILT <- REPORTED.GENDER.COUNTS[ , c('ensembl_gene_id', 'chromosome_name', 'sampleID','counts(log)', 'Reported Gender')] %>% 
  filter( ensembl_gene_id == "ENSG00000229807" | ensembl_gene_id == "ENSG00000183878") %>% 
  dplyr::select(-one_of("chromosome_name")) %>% 
  tidyr::spread(key = ensembl_gene_id, value = `counts(log)`) %>% 
  mutate(XIST = as.numeric(`ENSG00000229807`)) %>% 
  mutate(UTY = as.numeric(`ENSG00000183878`)) %>% 
  mutate(UTY = ifelse(UTY == -Inf, 0, UTY)) %>% 
  mutate(XIST = ifelse(XIST == -Inf, 0, XIST))
p = ggplot(FILT, aes (x= XIST, y = UTY)) 
p = p + geom_point(aes(color=`Reported Gender`)) + 
  ggtitle("Sex Check Inital Sex: UW Cohort") + 
  theme(plot.title = element_text(hjust = 0.5, size = 15)) +
  labs(colour = "Reported Gender")
p

###Find Sex of missing values
if( T %in%  is.na(REPORTED.GENDER.COUNTS$`Reported Gender`) ){
  FILT <- REPORTED.GENDER.COUNTS[ is.na(REPORTED.GENDER.COUNTS$`Reported Gender`), c('ensembl_gene_id', 'chromosome_name', 'sampleID','counts(log)', 'Reported Gender')] %>% 
    filter( ensembl_gene_id == "ENSG00000229807" | ensembl_gene_id == "ENSG00000183878") %>% 
    dplyr::select(-one_of("chromosome_name")) %>% 
    tidyr::spread(key = ensembl_gene_id, value = `counts(log)`) %>% 
    mutate(XIST = as.numeric(`ENSG00000229807`)) %>% 
    mutate(UTY = as.numeric(`ENSG00000183878`)) %>% 
    mutate(UTY = ifelse(UTY == -Inf, 0, UTY)) %>% 
    mutate(XIST = ifelse(XIST == -Inf, 0, XIST))
  p = ggplot(FILT, aes (x= XIST, y = UTY)) 
  p = p + geom_point() + 
    ggtitle("Sex Check Unknown Sex: UW Cohort") + 
    theme(plot.title = element_text(hjust = 0.5, size = 15)) +
    labs(colour = "Reported Gender")
  p

  Males <- FILT[ FILT$UTY > 5,]$sampleID
  Females <- FILT[ FILT$UTY < 3 ,]$sampleID

  import_metadata$Infered.Sex <- import_metadata$sex 
  if( length(Males) > 0 ){
    writeLines( paste0( 'Inferred Male Induviduals: ', paste0(Males,collapse =', ') ))
    import_metadata[ import_metadata$sampleID %in% Males, ]$Infered.Sex <- "male" #0
  }else{ writeLines( 'Inferred Male Induviduals: NA') }
  if( length(Females) > 0 ){
    writeLines( paste0( 'Inferred Female Induviduals: ', paste0(Females,collapse =', ') ))
    import_metadata[ import_metadata$sampleID %in% Females, ]$Infered.Sex <- "female" #1
  }else{ writeLines( 'Inferred Female Induviduals: NA') }

  clean_md$Infered.Sex <- import_metadata$Infered.Sex
}else{
  import_metadata$Infered.Sex <- import_metadata$sex
  clean_md$Infered.Sex <- import_metadata$Infered.Sex

}
clean_md$Infered.Sex <- as.factor(clean_md$Infered.Sex)

Sample clustering

PCA based clustering of samples before regressing out confounding variables. Health controls are masked beause they have no APOE status

# Find principal components of expression to plot
cqn_counts$E.no.na <- cqn_counts$E
METADATA <- clean_md[ , colnames(clean_md)[colnames(clean_md) != 'sex'] ]
fill<-colnames(METADATA)
METADATA$sampleID<- row.names(METADATA)
METADATA <- METADATA[, c('sampleID',fill)]
colnames(METADATA)[ colnames(METADATA) == 'Infered.Sex'] <- 'sex'
PC <- prcomp(cqn_counts$E.no.na, scale.=T, center = T)
# Plot first 2 PCs
plotdata <- data.frame(sampleID=rownames(PC$rotation), 
                       PC1=PC$rotation[,1], 
                       PC2=PC$rotation[,2])
plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'sampleID'))
p <- ggplot(plotdata, aes(x=PC1, y=PC2))
p <- p + geom_point(aes(color=disease_cohort, shape=apoeGenotype, size=RIN))
p <- p + theme_bw() + theme(legend.position="right")
p

Clustering of Samples

Tree based clustering of samples before regressing out confounding variables

# Eucledian tree based analysis
#COVARIATES.tmp = data.matrix(METADATA[,c( 'individualID', "sex", "apoeGenotype", "disease_cohort", 'ADAD_fam_mut', "ind_mut_status", "ind_mutation")])
COVARIATES.tmp = data.matrix(METADATA[,c( 'totalID', "sex", "apoeGenotype", "disease_cohort", 'ADAD_fam_mut', "ind_mut_status", "ind_mutation")])
COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0
tree = hclust(as.dist(t(cqn_counts$E.no.na)))
cols = WGCNA::labels2colors(COVARIATES.tmp);
WGCNA::plotDendroAndColors(tree, 
                           colors = cols, 
                           dendroLabels = FALSE, 
                           abHeight = 0.80, 
                           main = "Sample dendrogram",
                           groupLabels = colnames(COVARIATES.tmp))

Distribution of samples (log cpm)

Log(Counts Per Million) density distribution of all genes accross each sample by cohort

# Plot abberent distribution of logcpm counts
tmp1 = cqn_counts$E %>%
  rownameToFirstColumn('Gene.ID') %>%
  tidyr::gather(sampleID, logCPM, -Gene.ID) %>%
  left_join(METADATA %>%
              rownameToFirstColumn('sampleID'))
p = ggplot(tmp1, aes(x = logCPM, color = sampleID)) + geom_density() 
p = p + theme(legend.position = 'NONE') + facet_grid(.~disease_cohort, scale = 'free')
p

Coexpression of genes

cr = cor(t(cqn_counts$E.no.na))
hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')

Significant Covariates

Correlation between pca of unadjusted mRNA expression and covariates are used to find significant covariates

# Find correlation between PC's of gene expression with covariates
cqn_counts$E = cqn_counts$E[,row.names(clean_md)]
cqn_counts$E.no.na = cqn_counts$E[,row.names(clean_md)]
cqn_counts$E.no.na[is.na(cqn_counts$E.no.na)] = 0

#clean_md
METADATA <- as.data.frame( clean_md)
row.names(METADATA) <- import_metadata$sampleID 
METADATA <- METADATA[, colnames(METADATA) != c('individualID','totalID') ]

Meta_HeatMap <- METADATA[, (colnames(METADATA) %in% c('individualID', 'totalID', 'Family_ID','sampleID')==F)  ]
#Meta_HeatMap <- METADATA[, (colnames(METADATA) %in% c('totalID','sampleID')==F)  ]
Meta_HeatMap$sex <- Meta_HeatMap$Infered.Sex
#Meta_HeatMap <- Meta_HeatMap[ , colnames(Meta_HeatMap)[ colnames(Meta_HeatMap) != 'Infered.Sex']]
Iters <- colnames(Meta_HeatMap)[colSums(is.na(Meta_HeatMap)) > 0]
##_## dim(Meta_HeatMap)
##_## dim(Meta_HeatMap[ complete.cases(Meta_HeatMap),])
writeLines("Total Metadata with Missing variables:")
preAdjustedSigCovars = runPCAandPlotCorrelations(cqn_counts$E.no.na, 
                                                 Meta_HeatMap,
                                                 'NULL design(voom-normalized)', 
                                                 isKeyPlot=TRUE, 
                                                 MIN_PVE_PCT_PC = 1)

preAdjustedSigCovars[["PC_res"]][[2]]$plotData

writeLines("Iterate across covariates with missingness to attempt to find association:")
for( focus in 1:length(Iters)){
  Meta_HeatMap_t <- METADATA

  Meta_HeatMap_t$individualID <- as.factor(Meta_HeatMap_t$individualID)

  Meta_HeatMap_t$sampleID <- row.names(METADATA)
  Meta_HeatMap_t <- Meta_HeatMap_t[ !is.na(Meta_HeatMap_t[,Iters[focus] ]), ]
  exp <- cqn_counts$E.no.na[,Meta_HeatMap_t$sampleID]
  Meta_HeatMap_t <- Meta_HeatMap_t[, (colnames(Meta_HeatMap_t) %in% c('sex', 'totalID','Family_ID', 'sampleID')) == F]

  ##_##'ADAD_fam_mut',
  if( Iters[focus] %in% c( 'ADAD_fam_mut', 'CDR' )){
    Meta_HeatMap_t <- Meta_HeatMap_t[ , (colnames(Meta_HeatMap_t) %in% names( which(apply(Meta_HeatMap_t, 2, var) == 0) ))==F,]
    Meta_HeatMap_t <- Meta_HeatMap_t[ , (colnames(Meta_HeatMap_t) %in% 'disease_cohort')==F,]

  }else{}

  preAdjustedSigCovars = runPCAandPlotCorrelations(exp, 
                                                 Meta_HeatMap_t,
                                                 'NULL design(voom-normalized)', 
                                                 isKeyPlot=TRUE, 
                                                 MIN_PVE_PCT_PC = 1)

  preAdjustedSigCovars[["PC_res"]][[2]]$plotData
  if( Iters[focus] %in% preAdjustedSigCovars$significantCovars ){
    writeLines( paste0( Iters[focus], " Is Significantly Associated"))
  }else{
    writeLines( paste0( Iters[focus], " Is NOT Significantly Associated"))
  }
}
## RIN, Sex, disease_cohort, RnaSeqMetrics__PCT_CODING_BASES, RnaSeqMetrics__PCT_INTERGENIC_BASES, RnaSeqMetrics__INTRONIC_BASES, RnaSeqMetrics__PCT_INTRONIC_BASES
# Ind Mut Status: Yes PC11
# APOE: Yes PC5, PC11
# Ind_Mutation: NO
# ADAD_fam_mut: NO
# CDR: NO 
# Age: NO

Normalisation (iterative design)

Since many covariates are correlated, re-normalising and re-adjusting COUNTS with an iterative design matrix 1. Adding Batch and Sex a priori to variable selection 2. Primary variable of interest Diagnosis is excluded from the pool of available covariates for selection

# Primary variable of interest
#RIN, Sex, disease_cohort, RnaSeqMetrics__PCT_CODING_BASES, RnaSeqMetrics__PCT_INTERGENIC_BASES, RnaSeqMetrics__INTRONIC_BASES, RnaSeqMetrics__PCT_INTRONIC_BASES
# Ind Mut Status: Yes PC11
# APOE

source('~/sageseqr/utility_functions/parallelDuplicateCorrelation.R')
primaryVariable <- c( "disease_cohort")
#exclude APOE for now, can condition on it later
FactorCovariates <- c( "sex" )
ContCovariates <- c("RIN", "RnaSeqMetrics__PCT_CODING_BASES", "RnaSeqMetrics__PCT_INTERGENIC_BASES", "RnaSeqMetrics__INTRONIC_BASES", "RnaSeqMetrics__PCT_INTRONIC_BASES")
#METADATA <- METADATA[ , colnames(METADATA) != 'Family_ID']

#postAdjustCovars = c( 'sex', 'race', 'yearsEducation', 'bmi' );
postAdjustCovars = 'sex';
# Assign residual covariates
residualCovars = setdiff(preAdjustedSigCovars$significantCovars, c(postAdjustCovars, primaryVariable))
residualCovars <- residualCovars[ residualCovars != 'Infered.Sex']
residualSigCovars = preAdjustedSigCovars
covariatesEffects = preAdjustedSigCovars$Effects.significantCovars[residualCovars]
covariatesEffects <- covariatesEffects[ names(covariatesEffects) != 'Infered.Sex' ]
postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects))) %>% unique()
postAdjustCovars <- postAdjustCovars[ postAdjustCovars != 'Infered.Sex']
loopCount = 0 
cqn_counts$E <- cqn_counts$E[,row.names(METADATA)]
cqn_counts$counts <- cqn_counts$counts[,row.names(METADATA)]
cqn_counts$E.no.na <- cqn_counts$E.no.na[,row.names(METADATA)]
NEW.COUNTS <- cqn_counts$E
notEstimable = c()
# Re-Assign the induvidualIDs
METADATA$individualID <- clean_md[ row.names(METADATA), ]$individualID
#METADATA$individualID <- clean_md[ row.names(METADATA), ]$totalID
METADATA$sex <- METADATA$Infered.Sex
METADATA <- METADATA[ , colnames(METADATA) != 'Infered.Sex']
while(length(residualSigCovars$significantCovars)!=0 && loopCount <= 20){
  writeLines(paste('Using following covariates in the model:',
                   paste(postAdjustCovars, collapse=', '),
                   'as fixed effects and individualID as random effect'))

  #writeLines(paste('Using following covariates in the model:',
                   #paste(postAdjustCovars, collapse=', '),
                   #'as fixed effects'))

  # Post adjusted design matrix
  DM1 = getDesignMatrix(METADATA[,postAdjustCovars,drop=F],Intercept = F)
  DM1$design = DM1$design[,linColumnFinder(DM1$design)$indepCols]

  # Estimate voom weights for dispersion control
  cnts = NEW.COUNTS
  cnts[is.na(cnts)] = 0
  VOOM.GENE_EXPRESSION = voom(cqn_counts$counts, 
                              design=DM1$design,
                              #block= METADATA$individualID,
                              plot=F)#,
                              #na.rm = T)

  VOOM.GENE_EXPRESSION$E = cqn_counts$E
  correlation = parallelDuplicateCorrelation(VOOM.GENE_EXPRESSION,
                                             block = METADATA$individualID,
                                             method = 'lmer')

  if (!is.nan(correlation$cor)){
    # Re-estimate voom weights
    VOOM.GENE_EXPRESSION = voom(cqn_counts$counts, 
                                design=DM1$design, plot=F, 
                                block = METADATA$individualID, 
                                correlation = correlation$cor)

    # Fit linear model using new weights and new design
    VOOM.ADJUSTED.FIT = lmFit(cqn_counts$E,
                              design = DM1$design,
                              weights = VOOM.GENE_EXPRESSION$weights,
                              block = METADATA$individualID, 
                              correlation = correlation$cor)

    # Residuals after normalisation
    RESIDUAL.GENE_EXPRESSION = residuals.MArrayLM(VOOM.ADJUSTED.FIT,
                                                  cqn_counts$E)
  } else {
    notEstimable = c(notEstimable, postAdjustCovars[length(postAdjustCovars)])
    postAdjustCovars = postAdjustCovars[1:(length(postAdjustCovars)-1)]
  }

  # Residual covariates to choose from
  residCovars <- setdiff(c(FactorCovariates,ContCovariates),
                         c(postAdjustCovars, primaryVariable, 'individualID', notEstimable))

  # Find PC of residual gene expression and significant covariates that are highly correlated with PCs
  expr = RESIDUAL.GENE_EXPRESSION; expr[is.na(expr)] = 0
  residualSigCovars = runPCAandPlotCorrelations(expr, 
                                                METADATA[, residCovars, drop=F], 
                                                'adjusted design(voom-normalized)',
                                                isKeyPlot=TRUE)

  # Add postadjusted covariates (if any)
  residCovars = setdiff(residualSigCovars$significantCovars, c(postAdjustCovars, primaryVariable, notEstimable))
  covariatesEffects = residualSigCovars$Effects.significantCovars[residCovars]

  postAdjustCovars = c(postAdjustCovars, names(which.max(covariatesEffects)))
  loopCount = loopCount + 1
}
modelStr <- paste(paste(gsub('_','\\\\_',postAdjustCovars), collapse=', '),
                  'as fixed effects')
tmp <- paste('Using following covariates in the final model:', modelStr)

Sanity check

Have we regressed out the confounding variable signal

# Find PC of residual gene expression and significant covariates that are highly correlated with PCs
METADATA$Family_ID <- as.factor(METADATA$Family_ID)
METADATA$individualID <- as.factor(METADATA$individualID)
residualSigCovars = runPCAandPlotCorrelations(expr, 
                                              METADATA,
                                              'adjusted design(voom-normalized)',
                                              isKeyPlot=TRUE)
residualSigCovars[["PC_res"]][[2]]$plotData

Coexpression of genes

cr = cor(t(expr))
hist(cr, main = 'Distribution of correlation between genes', xlab = 'Correlation')

PCA of residual data

PCA based clustering of samples after regressing out confounding variables. Health controls are masked beause they have no APOE status

# Find principal components of expression to plot
PC <- prcomp(expr, scale.=T, center = T)
# Plot first 4 PCs
plotdata <- data.frame(individualID=rownames(PC$rotation), 
                       PC1=PC$rotation[,1], 
                       PC2=PC$rotation[,2])
plotdata <- left_join(plotdata, rownameToFirstColumn(METADATA, 'individualID'))
p <- ggplot(plotdata, aes(x=PC1, y=PC2)) 
p <- p + geom_point(aes(color=disease_cohort, shape=apoeGenotype, size=RIN))
p <- p + theme_bw() + theme(legend.position="right")
p

Clustering of Samples

Tree based clustering of samples before regressing out confounding variables

# Eucledian tree based analysis
COVARIATES.tmp = data.matrix(METADATA[,c( 'individualID', "sex", "apoeGenotype", "disease_cohort", 'ADAD_fam_mut', "ind_mut_status", "ind_mutation")])
COVARIATES.tmp[is.na(COVARIATES.tmp)] = 0
tree = hclust(as.dist(t(expr)))
cols = WGCNA::labels2colors(COVARIATES.tmp);
WGCNA::plotDendroAndColors(tree, 
                           colors = cols, 
                           dendroLabels = FALSE, 
                           abHeight = 0.80, 
                           main = "Sample dendrogram",
                           groupLabels = colnames(COVARIATES.tmp))

Adjust data with covariates for Network Analysis

Identified covariates are regressed out from the expression matrix for network analysis

# Get design matrix
DESIGN.NET = getDesignMatrix(METADATA[, postAdjustCovars, drop = F], Intercept = F)
DESIGN.NET = DESIGN.NET$design[,linColumnFinder(DESIGN.NET$design)$indepCols]
# Estimate voom weights for dispersion control
cnts = cqn_counts$counts
cnts[is.na(cnts)] = 0
VOOM.NET.WEIGHTS = voom(cnts, design=DESIGN.NET, plot=F)
# Fit linear model using new weights and new design
VOOM.NET.FIT = lmFit(cqn_counts$E,
                     design = DESIGN.NET,
                     weights = VOOM.NET.WEIGHTS$weights)
# Residuals after normalisation
RESIDUAL.NET.GENE_EXPRESSION = residuals.MArrayLM(VOOM.NET.FIT,
                                                  cqn_counts$E)

Differential expression analysis (with Risk Cohort as primary variable)

Differential expression is performed by controlling for covariates identified above. Comparisons are between the Matched FamilialAD Controls, the FamilialAD Mutation carriers, the sporatic AD cohort and the healthy controls

Comparisons Run - 6

Interpretation of Diagnosis - Control: No AD Risk FamilialAD_Mut: Familial AD Risk With Mutation FamilialAD_NoMut: Familial AD Risk Without Mutation Sporadic: Sporadic AD Risk

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are deemed significantly differentially expressed

# Get design matrix
METADATA$individualID <- as.character( METADATA$totalID )
METADATA_Sink <- METADATA
METADATA$Comp <- as.character( METADATA$disease_cohort )
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'HasMut', ]$Comp <- 'FamilialAD_Mut'
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'NoMut', ]$Comp <- 'FamilialAD_NoMut'
METADATA$Comp <- as.factor(METADATA$Comp)

#Previous was the Full Cohorts
# Add Cohort in  here to try and  be more conservative
DESIGN = getDesignMatrix(METADATA[, c('Comp', postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
#colnames(DESIGN$design) <- gsub('disease_cohort','',colnames(DESIGN$design))
colnames(DESIGN$design) <- gsub('Comp','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$counts, design=DESIGN$design, plot=F)
#_#VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design

VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

#correlatio = duplicateCorrelation(VOOM.WEIGHTS,DESIGN$design,block=METADATA$individualID)

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts(contrasts=c("FamilialAD_NoMut-FamilialAD_Mut", 
                                     "FamilialAD_NoMut-SporadicAD",
                                     "FamilialAD_NoMut-Healthy_Control",
                                     "FamilialAD_Mut-SporadicAD", 
                                     "FamilialAD_Mut-Healthy_Control",
                                     "SporadicAD-Healthy_Control"
                                    ),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c("FamilialAD_NoMut-FamilialAD_Mut", 
              "FamilialAD_NoMut-SporadicAD",
              "FamilialAD_NoMut-Healthy_Control",
              "FamilialAD_Mut-SporadicAD", 
              "FamilialAD_Mut-Healthy_Control",
              "SporadicAD-Healthy_Control"
            )
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp=NULL
all.fit=NULL
all.diff.exp = c(all.diff.exp, list(AD_Cohort_CaseControl = DE))
all.fit = c(all.fit, list(AD_Cohort_CaseControl = FIT))

Differential expression analysis All 'controls' versus Sporatic and FamilialAD cases (with Risk Cohort as primary variable)

Differential expression is performed by controlling for covariates identified above. Comparisons are between the (FamilialAD Controls + healthy controls) VS. the FamilialAD Mutation carriers, and the sporatic AD cohort

Comparisons Run - 2

Interpretation of Diagnosis Control: No AD Risk + FamilialAD_NoMut FamilialAD_Mut: Familial AD Risk With Mutation Sporadic: Sporadic AD Risk

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are deemed significantly differentially expressed

# Get design matrix
METADATA <- METADATA_Sink
METADATA$Comp <- as.character( METADATA$disease_cohort )
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'HasMut', ]$Comp <- 'FamilialAD_Mut'
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'NoMut', ]$Comp <- 'Control'
METADATA[ METADATA$disease_cohort %in% 'Healthy_Control', ]$Comp <- 'Control'

METADATA$Comp <- as.factor(METADATA$Comp)

#Previous was the Full Cohorts
# Add Cohort in  here to try and  be more conservative
DESIGN = getDesignMatrix(METADATA[, c('Comp', postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
#colnames(DESIGN$design) <- gsub('disease_cohort','',colnames(DESIGN$design))
colnames(DESIGN$design) <- gsub('Comp','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$counts, design=DESIGN$design, plot=F)
#_#VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design

VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

#correlatio = duplicateCorrelation(VOOM.WEIGHTS,DESIGN$design,block=METADATA$individualID)

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts(contrasts=c("Control-FamilialAD_Mut", 
                                     "Control-SporadicAD"
                                    ),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c("Control-FamilialAD_Mut", 
              "Control-SporadicAD"
            )
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p

all.diff.exp = c(all.diff.exp, list(AD_Comb_Cohort = DE))
all.fit = c(all.fit, list(AD_Comb_Cohort = FIT))

Differential expression analysis (with Mutation Status as primary variable)

Differential expression is performed on the mutation status that is profiled as mutated by controlling for covariates identified above and disease cohort. Unknown (NA) mutation status is ignored as the patient's status of mutation is unknown.

Comparisons Run - 1

Interpretation of Mutation Status NoMut: No Mutation in (APP, PSEN1, PSEN2, Other) Mut: Has a Mutation in (APP, PSEN1, PSEN2, Other)

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are deemed significantly differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA_Use <- METADATA_Sink
#METADATA_Use$ind_mut_status <- addNA(METADATA_Use$ind_mut_status)
#levels(METADATA_Use$ind_mut_status) <- c(levels(METADATA_Use$ind_mut_status), 9)
levels(METADATA_Use$ind_mut_status) <- c(levels(METADATA_Use$ind_mut_status), 'UNK')
METADATA_Use$ind_mut_status[ is.na(METADATA_Use$ind_mut_status)] <- 'UNK'
METADATA_Use$ind_mut_status <- as.factor(as.character(METADATA_Use$ind_mut_status))

# Get design matrix
DESIGN = getDesignMatrix(METADATA_Use[, c("ind_mut_status", primaryVariable, postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) <- gsub('ind_mut_status','',colnames(DESIGN$design))

# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$counts, design=DESIGN$design, plot=F)
#_#VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F)

# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts(contrasts=c("NoMut-HasMut"#, 
                                     ),#"HasMut-UNK",
                                     #"NoMut-UNK"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('NoMut-HasMut')#, 'NoMut-MissingData', 'HasMut-Missing' )
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('grey','green','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(MutationStatus = DE))
all.fit = c(all.fit, list(MutationStatus = FIT))

Differential expression analysis (with Gene Mutation as primary variable)

Differential expression is performed on the specific gene that is profiled as mutated by controlling for covariates identified above and disease cohort.

Comparisons Run - 15

Interpretation of Mutation Status NoMut: No Mutation in (APP, PSEN1, PSEN2, Other) PS1: Has a Mutation in PSEN1 PS2: Has a Mutation in PSEN1 PSEN2 APP: Has a Mutation in PSEN1 APP Other: A mutation in a gene other than APP, PSEN1, or PSEN2 UNK: No Mutation Status Available (These are mostly controls)

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are deemed significantly differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA_Use <- METADATA_Sink
METADATA_Use$ind_mutation <- addNA(METADATA_Use$ind_mutation)
levels(METADATA_Use$ind_mutation) <- c(levels(METADATA_Use$ind_mutation), 'UNK')
METADATA_Use$ind_mutation[ is.na(METADATA_Use$ind_mutation)] <- 'UNK'
METADATA_Use$ind_mutation <- as.factor(as.character(METADATA_Use$ind_mutation))

# Get design matrix
DESIGN = getDesignMatrix(METADATA_Use[, c("ind_mutation", primaryVariable, postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) <- gsub('ind_mutation','',colnames(DESIGN$design))

# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$counts, design=DESIGN$design, plot=F)
#_#VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts(contrasts=c("APP-None", 
                                     "APP-Other",
                                     "APP-PS1",
                                     "APP-PS2",
                                     "APP-UNK",
                                     "None-Other",
                                     "None-PS1",
                                     "None-PS2",
                                     "None-UNK",
                                     "Other-PS1",
                                     "Other-PS2",
                                     "Other-UNK",
                                     "PS1-PS2",
                                     "PS1-UNK",
                                     "PS2-UNK"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c("APP-None", 
              "APP-Other",
              "APP-PS1",
              "APP-PS2",
              "APP-UNK",
              "None-Other",
              "None-PS1",
              "None-PS2",
              "None-UNK",
              "Other-PS1",
              "Other-PS2",
              "Other-UNK",
              "PS1-PS2",
              "PS1-UNK",
              "PS2-UNK")

DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(MutationType = DE))
all.fit = c(all.fit, list(MutationType = FIT))

Differential expression analysis (with Sex as primary variable)

Differential expression is performed on the mutation status that is profiled as mutated by controlling for covariates identified above and disease cohort

Comparisons Run - 1

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are deemed significantly differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA_Use <- METADATA_Sink

# Get design matrix
DESIGN = getDesignMatrix(METADATA_Use[, c("sex", primaryVariable, postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) <- gsub( 'sex', '', colnames(DESIGN$design) )

# Estimate voom weights for dispersion control
VOOM.WEIGHTS = voom(cqn_counts$counts, design=DESIGN$design, plot=F)
#_#VOOM.WEIGHTS = voom(cnts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts(contrasts=c("male-female"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c( 'Male-Female' )
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(Sex = DE))
all.fit = c(all.fit, list(Sex = FIT))

Differential expression analysis (with Disease Cohort and Sex)

Differential expression is performed on Comparisons are between the Matched FamilialAD Controls, the FamilialAD Mutation carriers, the sporatic AD cohort x Sex variable by controlling for covariates identified above.

Comparisons Run - 6

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are marked as differentially expressed Interpretation of Diagnosis Control: No AD Risk FamilialAD_Mut: Familial AD Risk With Mutation FamilialAD_NoMut: Familial AD Risk Without Mutation Sporadic: Sporadic AD Risk

All conditions are looked at by sex

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are deemed significantly differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA <- METADATA_Sink
METADATA$Comp <- as.character( METADATA$disease_cohort )
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'HasMut', ]$Comp <- 'FamilialAD_Mut'
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'NoMut', ]$Comp <- 'FamilialAD_NoMut'
METADATA$Comp <- as.factor(METADATA$Comp)

METADATA$disease_cohort.Sex = paste(METADATA$Comp,METADATA$sex, sep = '.') %>%
  factor
# Get design matrix
# Add Cohort in  here to try and  be more conservative
DESIGN = getDesignMatrix(METADATA[, c('disease_cohort.Sex', setdiff(postAdjustCovars, 'Sex')), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('disease_cohort.Sex','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS =  voom(cqn_counts$counts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts( 
  contrasts=c("FamilialAD_NoMut.female-FamilialAD_Mut.female",
              "FamilialAD_NoMut.male-FamilialAD_Mut.male",
              "Healthy_Control.female-SporadicAD.female",
              "Healthy_Control.male-SporadicAD.male",
              "SporadicAD.female-FamilialAD_Mut.female",
              "SporadicAD.male-FamilialAD_Mut.male"
              ),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c( "FamilialAD_NoMut.female-FamilialAD_Mut.female",
              "FamilialAD_NoMut.male-FamilialAD_Mut.male",
              "Healthy_Control.female-SporadicAD.female",
              "Healthy_Control.male-SporadicAD.male",
              "SporadicAD.female-FamilialAD_Mut.female",
              "SporadicAD.male-FamilialAD_Mut.male" 
            )
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(AD_Cohort.Sex = DE))
all.fit = c(all.fit, list(AD_Cohort.Sex = FIT))

Differential expression analysis ( All 'controls' versus Sporatic and FamilialAD cases and Sex )

Differential expression comparisons are between the (FamilialAD Controls + healthy controls) VS. the FamilialAD Mutation carriers, and the sporatic AD cohort x Sex variable by controlling for covariates identified above.

Comparisons Run - 6

Interpretation of Diagnosis Control: No AD Risk + FamilialAD_NoMut FamilialAD_Mut: Familial AD Risk With Mutation Sporadic: Sporadic AD Risk

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are deemed significantly differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA <- METADATA_Sink
METADATA$Comp <- as.character( METADATA$disease_cohort )
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'HasMut', ]$Comp <- 'FamilialAD_Mut'
METADATA[ METADATA$disease_cohort %in% 'FamilialAD' & METADATA$ind_mut_status %in% 'NoMut', ]$Comp <- 'Control'
METADATA[ METADATA$disease_cohort %in% 'Healthy_Control', ]$Comp <- 'Control'

METADATA$Comp <- as.factor(METADATA$Comp)
METADATA_Use <- METADATA
METADATA_Use$disease_cohort.Sex = paste(METADATA_Use$Comp,METADATA_Use$sex, sep = '.') %>%
  factor
# Get design matrix
# Add Cohort in  here to try and  be more conservative
DESIGN = getDesignMatrix(METADATA_Use[, c('disease_cohort.Sex', setdiff(postAdjustCovars, 'Sex')), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('disease_cohort.Sex','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS =  voom(cqn_counts$counts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts( 
  contrasts=c("Control.female-FamilialAD_Mut.female",
              "Control.male-FamilialAD_Mut.male",
              "Control.female-SporadicAD.female",
              "Control.male-SporadicAD.male",
              "SporadicAD.female-FamilialAD_Mut.female",
              "SporadicAD.male-FamilialAD_Mut.male"
              ),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c("Control.female-FamilialAD_Mut.female",
              "Control.male-FamilialAD_Mut.male",
              "Control.female-SporadicAD.female",
              "Control.male-SporadicAD.male",
              "SporadicAD.female-FamilialAD_Mut.female",
              "SporadicAD.male-FamilialAD_Mut.male"
            )
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(AD_CaseCntrl_Cohort.Sex = DE))
all.fit = c(all.fit, list(AD_CaseCntrl_Cohort.Sex = FIT))

Differential expression analysis (with Within Sporatic Cohort Between Mut No Mut)

Differential expression is performed on the Disease Cohort x Sex variable by controlling for covariates identified above

Comparisons - 1

Interpretation of Diagnosis Sporadic_HasMut: Sporadic AD Risk and A mutation Sporadic_HasMut: Sporadic AD Risk and does not have a mutation

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA_Use <- METADATA_Sink
METADATA_Use$disease_cohort <- as.character(METADATA_Use$disease_cohort)
METADATA_Use$disease_cohort[ METADATA_Use$disease_cohort == '0' ] <- 'Controls'
METADATA_Use$disease_cohort[ METADATA_Use$disease_cohort == '1' ] <- 'Familial'
METADATA_Use$disease_cohort[ METADATA_Use$disease_cohort == '2' ] <- 'Sporadic'
METADATA_Use$Cohort.Mut <- paste0( as.character(METADATA_Use$disease_cohort), '.',
                                   as.character(METADATA_Use$ind_mut_status)
)

METADATA_Use$disease_cohort <- as.factor(METADATA_Use$disease_cohort)
METADATA_Use$Cohort.Mut <- as.factor(METADATA_Use$Cohort.Mut)

# Get design matrix
DESIGN = getDesignMatrix(METADATA_Use[, c('Cohort.Mut', postAdjustCovars), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('Cohort.Mut','',colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS =  voom(cqn_counts$counts, design=DESIGN$design, plot=F)

# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts( 
  contrasts=c(
              "SporadicAD.NoMut-SporadicAD.HasMut"
            ),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c("SporadicAD.NoMut-SporadicAD.HasMut"
            )

DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('green','grey','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list(SporaticMut_NoMut = DE))
all.fit = c(all.fit, list(SporaticMut_NoMut = FIT))

Differential expression analysis ( with FAD_NoMut+HC versus Sporatic (Non-PSEN1) )

Control: FAD_NoMut or HC
Cases: Sporatic AD (Non-PSEN1)
Other Samples

Comparisons - 1 (39)

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA_Use <- METADATA_Sink
METADATA_Use$ind_mut_status <- as.character(METADATA_Use$ind_mut_status)
METADATA_Use$ind_mut_status[ is.na(METADATA_Use$ind_mut_status) ] <- 9
METADATA_Use$ind_mutation <- as.character(METADATA_Use$ind_mutation)
METADATA_Use$ind_mutation[ is.na(METADATA_Use$ind_mutation) ] <- 9

METADATA_Use$FADNoMut_HC_vs_SporaticNoPSEN1 <- 9
#HC
METADATA_Use[ as.character(METADATA_Use$disease_cohort) %in% 'Healthy_Control', ]$FADNoMut_HC_vs_SporaticNoPSEN1 <- 0
#FAD Controls
METADATA_Use[ as.character(METADATA_Use$disease_cohort) %in% 'FamilialAD' & as.character(METADATA_Use$ind_mut_status) == 'NoMut', ]$FADNoMut_HC_vs_SporaticNoPSEN1 <- 0
#Cases
METADATA_Use[ as.character(METADATA_Use$disease_cohort) == 'SporadicAD' & as.character(METADATA_Use$ind_mutation) %in% c('APP','Other','PS2'), ]$FADNoMut_HC_vs_SporaticNoPSEN1 <- 1


METADATA_Use$FADNoMut_HC_vs_SporaticNoPSEN1 <- as.factor(METADATA_Use$FADNoMut_HC_vs_SporaticNoPSEN1)
METADATA_Use$ind_mutation <- as.factor(METADATA_Use$ind_mutation)
METADATA_Use$ind_mut_status <- as.factor(METADATA_Use$ind_mut_status)

# Get design matrix
DESIGN = getDesignMatrix(METADATA_Use[, c('FADNoMut_HC_vs_SporaticNoPSEN1', postAdjustCovars ), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('FADNoMut_HC_vs_SporaticNoPSEN1',"Comp",colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS =  voom(cqn_counts$counts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts( 
  contrasts=c(
              "Comp0-Comp1"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('FADNoMut_HC-SporaticNoPSEN1')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('grey', 'green','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list( FADNoMut_HC_vs_SporaticNoPSEN1 = DE))
all.fit = c(all.fit, list( FADNoMut_HC_vs_SporaticNoPSEN1 = FIT))

Differential expression analysis (familial mutation PSEN1 Only - non-mutation )

Comparisons - 1

Control: FAD_NoMut
Cases: FAD_Mut( PSEN1 only )
Other Samples

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA_Use <- METADATA
METADATA_Use$ind_mut_status <- as.character(METADATA_Use$ind_mut_status)
METADATA_Use$ind_mut_status[ is.na(METADATA_Use$ind_mut_status) ] <- 9
METADATA_Use$ind_mutation <- as.character(METADATA_Use$ind_mutation)
METADATA_Use$ind_mutation[ is.na(METADATA_Use$ind_mutation) ] <- 9

METADATA_Use$FAD_NoMut_FAD_PSEN1 <- 9
#FAD Controls
METADATA_Use[ as.character(METADATA_Use$disease_cohort) %in% 'FamilialAD' & as.character(METADATA_Use$ind_mut_status) %in% 'NoMut', ]$FAD_NoMut_FAD_PSEN1 <- 0
#Cases
METADATA_Use[ as.character(METADATA_Use$disease_cohort) %in% 'FamilialAD' & as.character(METADATA_Use$ind_mutation) %in% 'PS1', ]$FAD_NoMut_FAD_PSEN1 <- 1


METADATA_Use$FAD_NoMut_FAD_PSEN1 <- as.factor(METADATA_Use$FAD_NoMut_FAD_PSEN1)
METADATA_Use$ind_mutation <- as.factor(METADATA_Use$ind_mutation)
METADATA_Use$ind_mut_status <- as.factor(METADATA_Use$ind_mut_status)

# Get design matrix
DESIGN = getDesignMatrix(METADATA_Use[, c('FAD_NoMut_FAD_PSEN1', postAdjustCovars ), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('FAD_NoMut_FAD_PSEN1',"Comp",colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS =  voom(cqn_counts$counts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts( 
  contrasts=c(
              "Comp0-Comp1"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('FAD_NoMut-FAD_PSEN1')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('grey', 'green','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list( FAD_NoMut_FAD_PSEN1 = DE))
all.fit = c(all.fit, list( FAD_NoMut_FAD_PSEN1 = FIT))

Differential expression analysis (familial mutation PSEN1 only - non-mutation+HC )

Comparisons - 1

Control: FAD_NoMut
Cases: FAD_Mut( PSEN1 only )
Other Samples

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are differentially expressed Comparisons - 1

Control: FAD_NoMut
Cases: FAD_Mut( PSEN1 only )
Other Samples

Genes that are differentially expressed at an FDR <= 0.05 and folde change of 1.2 are differentially expressed

#Remove NAs (The Control samples don't have mutation status)
METADATA_Use <- METADATA
METADATA_Use$ind_mut_status <- as.character(METADATA_Use$ind_mut_status)
METADATA_Use$ind_mut_status[ is.na(METADATA_Use$ind_mut_status) ] <- 9
METADATA_Use$ind_mutation <- as.character(METADATA_Use$ind_mutation)
METADATA_Use$ind_mutation[ is.na(METADATA_Use$ind_mutation) ] <- 9

METADATA_Use$FAD_NoMut_FAD_PSEN1 <- 9
#FAD Controls
METADATA_Use[ as.character(METADATA_Use$disease_cohort) %in% 'FamilialAD' & as.character(METADATA_Use$ind_mut_status) %in% 'NoMut', ]$FAD_NoMut_FAD_PSEN1 <- 0
#Healthy Controls
METADATA_Use[ as.character(METADATA_Use$disease_cohort) %in% 'Healthy_Control' , ]$FAD_NoMut_FAD_PSEN1 <- 0

#Cases
METADATA_Use[ as.character(METADATA_Use$disease_cohort) %in% 'FamilialAD' & as.character(METADATA_Use$ind_mutation) %in% 'PS1', ]$FAD_NoMut_FAD_PSEN1 <- 1


METADATA_Use$FAD_NoMut_FAD_PSEN1 <- as.factor(METADATA_Use$FAD_NoMut_FAD_PSEN1)
METADATA_Use$ind_mutation <- as.factor(METADATA_Use$ind_mutation)
METADATA_Use$ind_mut_status <- as.factor(METADATA_Use$ind_mut_status)

# Get design matrix
DESIGN = getDesignMatrix(METADATA_Use[, c('FAD_NoMut_FAD_PSEN1', postAdjustCovars ), drop = F], Intercept = F)
DESIGN$design = DESIGN$design[,linColumnFinder(DESIGN$design)$indepCols]
colnames(DESIGN$design) = gsub('FAD_NoMut_FAD_PSEN1',"Comp",colnames(DESIGN$design))
# Estimate voom weights for dispersion control
VOOM.WEIGHTS =  voom(cqn_counts$counts, design=DESIGN$design, plot=F)
# Fit linear model using new weights and new design
# Fit linear model using new weights and new design
VOOM.WEIGHTS$E = cqn_counts$E
correlation = parallelDuplicateCorrelation(VOOM.WEIGHTS,
                                             block = as.character( METADATA$individualID ),
                                             method = 'lmer')

FIT = lmFit(cqn_counts$E,
            design = DESIGN$design,
            block = METADATA$individualID,
            weights = VOOM.WEIGHTS$weights,
            correlation = correlation$cor)

# Fit contrast
contrast = makeContrasts( 
  contrasts=c(
              "Comp0-Comp1"),
                         levels = colnames(FIT$coefficients))
FIT.CONTR = contrasts.fit(FIT, contrasts=contrast)
FIT.CONTR = eBayes(FIT.CONTR)
# Get differnetial expression
DE = lapply(1:dim(contrast)[2], function(i, FIT){
  topTable(FIT, coef=i, number = dim(counts)[1], confint = T) %>%
    rownameToFirstColumn('ensembl_gene_id')
}, FIT.CONTR)
names(DE) = c('FAD_With_HC_NoMut-FAD_PSEN1')
DE = DE %>% 
  rbindlist(idcol = 'Comparison') %>%
  left_join(biomart_results %>%
              dplyr::select(ensembl_gene_id, hgnc_symbol, percentage_gene_gc_content, gene_length) %>%
              unique()) %>%
  dplyr::mutate(Study = 'UW',
                Direction = logFC/abs(logFC),
                Direction = factor(Direction, c(-1,1), c('-1' = 'DOWN', '1' = 'UP')),
                Direction = as.character(Direction))
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
DE$Direction[DE$adj.P.Val > 0.05 | abs(DE$logFC) < log2(1.2)] = 'NONE'
writeLines('Differentially expressed genes at an FDR 0.05 and logFC 1.2')
tmp = DE %>%
  dplyr::select(ensembl_gene_id, Comparison, Direction) %>%
  group_by(Comparison, Direction) %>%
  dplyr::summarise(FDR_0_05_FC_1.2 = length(unique(ensembl_gene_id))) %>%
  spread(Direction, FDR_0_05_FC_1.2) 
kable(tmp)
p = ggplot(DE, aes(y = -log10(adj.P.Val), x = logFC, color = Direction)) + geom_point() + xlim(c(-1,1))
p = p + scale_color_manual(values = c('grey', 'green','red'))
p = p + facet_grid(.~Comparison, scales = 'fixed')
p
all.diff.exp = c(all.diff.exp, list( FAD_NoMut_with_HC_FAD_PSEN1 = DE))
all.fit = c(all.fit, list( FAD_NoMut_with_HC_FAD_PSEN1 = FIT))

Store files in synapse

parentId ="syn22257765"
folderName = 'RNASeq'
CODE <- synapser::Folder(name = folderName, parentId = parentId)
CODE <- synapser::synStore(CODE)

activityName = 'Preliminary DE'
activityDescription = 'Preliminary Differential expression Analysis'

thisFileName <- 'UW_Plan.Rmd'
# Github link
thisRepo <- githubr::getRepo(repository = "jgockley62/UW_RNASeq", ref="branch", refName='master')
thisFile <- githubr::getPermlink(repository = thisRepo, repositoryPath=paste0(thisFileName))
#Set Used SynIDs For Provenance
used <- c( paste0( config::get("metadata")$synID, '.', config::get("metadata")$version),
           paste0( config::get("counts")$synID, '.', config::get("counts")$version ), 
           paste0( config::get("biomart")$synID, '.', config::get("biomart")$version)
)

# Set annotations
all.annotations = list(
  dataType = 'mRNA',
  dataSubType = 'geneExp',
  summaryLevel = 'gene',
  assay  = 'RNAseq',
  tissueTypeAbrv    = 'UW', 
  study = 'UW', 
  organism = 'HomoSapiens',
  consortium    = 'UW',
  normalizationStatus   = TRUE,
  normalizationType = 'CQN',
  rnaquantification = 'RSEM',
  genomeAssemblyID = 'GRCh38'
)
# Store differential expression results
rbindlist(all.diff.exp, use.names = T, fill = T, idcol = 'Model') %>%
  write.table(file = 'outs/UW_DiffExpression.tsv', sep = '\t', row.names=F, quote=F)
DEXP_OBJ = File('outs/UW_DiffExpression.tsv', 
                name = 'Differential Expression Results',
                parentId = CODE$properties$id )
all.annotations$annotations$dataSubType = 'diffExp'
DEXP_OBJ$annotations = all.annotations
DEXP_OBJ = synStore(DEXP_OBJ, used = used, activityName = activityName, executed = thisFile, activityDescription = activityDescription  )

# Store all differential expression models
save(all.fit, file = 'outs/UW_Models.RData')
FIT_OBJ = File('outs/UW_Models.RData', 
               name = 'All Limma Models',
               parentId = CODE$properties$id
               )

all.annotations$annotations$dataSubType = 'modelFits'
all.annotations$annotations$fileFormat = 'RData'
FIT_OBJ$annotations = all.annotations
FIT_OBJ = synStore(FIT_OBJ, used = used, activityName = activityName, executed = thisFile, activityDescription = activityDescription)

# Store filtered counts
filtered_counts %>%
  rownameToFirstColumn('ensembl_gene_id') %>%
  write.table(file = 'outs/UW_Filtered_Counts.tsv', sep = '\t', row.names=F, quote=F)
ENRICH_OBJ <-  File( path='outs/UW_Filtered_Counts.tsv', name = 'Counts (filtered raw)', parentId=CODE$properties$id ) 
  all.annotations$dataSubType = 'filteredCounts'
  synStore(ENRICH_OBJ, annotations = all.annotations, used = used, activityName = activityName, executed = thisFile, activityDescription = activityDescription)

# Store Estimated counts
NEW.COUNTS %>%
  rownameToFirstColumn('ensembl_gene_id') %>%
  write.table(file = 'outs/UW_eCounts.tsv', sep = '\t', row.names=F, quote=F)
ENRICH_OBJ <-  File( path='outs/UW_eCounts.tsv', name = 'Counts (estimated)', parentId=CODE$properties$id )
  all.annotations$dataSubType = 'estimatedCounts'
  ENRICH_OBJ$annotations = all.annotations
synStore( ENRICH_OBJ, used = used, activityName = activityName, executed = thisFile, activityDescription = activityDescription)

# Store residual gene expression for network analysis
RESIDUAL.NET.GENE_EXPRESSION %>%
  rownameToFirstColumn('ensembl_gene_id') %>%
  write.table(file = 'outs/UW_netResidualExpression.tsv', sep = '\t', row.names=F, quote=F)
ENRICH_OBJ <- File( path='outs/UW_netResidualExpression.tsv', name = 'Normalised, covariates and Diagnosis removed residual expression (for network analysis)', parentId=CODE$properties$id )
  all.annotations$dataSubType = 'residualGeneExpForNetAnlz'
  ENRICH_OBJ$annotations = all.annotations
synStore( ENRICH_OBJ, used = used, activityName = activityName, executed = thisFile, activityDescription = activityDescription)

library(synapser)
library(knit2synapse) # get the package from devtools::install_github('Sage-Bionetworks/knit2synapse')
source("~/UW_RNASeq/utility_functions/knitfile2synapseClient.R")
source("~/UW_RNASeq/utility_functions/hook_synapseMdSyntax_plot.R")
source("~/UW_RNASeq/utility_functions/knitfile2synapse.R")
setwd("~/UW_RNASeq/")
synapser::synLogin()
createAndKnitToFolderEntity(file = "UW_Plan.Rmd",  parentId ="syn22257765", folderName = 'RNASeq')

jgockley62/UW_RNASeq documentation built on Oct. 1, 2020, 8:07 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jgockley62/UW_RNASeq
Identify differentially expressed genes counts data

In jgockley62/UW_RNASeq: Identify differentially expressed genes counts data

Sex Chromosome-specific Gene Expression Patterns

Sample clustering

Clustering of Samples

Distribution of samples (log cpm)

Significant Covariates

Normalisation (iterative design)

Sanity check

PCA of residual data

Clustering of Samples

Adjust data with covariates for Network Analysis

Differential expression analysis (with Risk Cohort as primary variable)

Differential expression analysis All 'controls' versus Sporatic and FamilialAD cases (with Risk Cohort as primary variable)

Differential expression analysis (with Mutation Status as primary variable)

Differential expression analysis (with Gene Mutation as primary variable)

Differential expression analysis (with Sex as primary variable)

Differential expression analysis (with Disease Cohort and Sex)

Differential expression analysis ( All 'controls' versus Sporatic and FamilialAD cases and Sex )

Differential expression analysis (with Within Sporatic Cohort Between Mut No Mut)

Differential expression analysis ( with FAD_NoMut+HC versus Sporatic (Non-PSEN1) )

Differential expression analysis (familial mutation PSEN1 Only - non-mutation )

Differential expression analysis (familial mutation PSEN1 only - non-mutation+HC )

Store files in synapse

R Package Documentation

Browse R Packages

We want your feedback!

jgockley62/UW_RNASeq Identify differentially expressed genes counts data

In jgockley62/UW_RNASeq: Identify differentially expressed genes counts data

Sex Chromosome-specific Gene Expression Patterns

Sample clustering

Clustering of Samples

Distribution of samples (log cpm)

Significant Covariates

Normalisation (iterative design)

Sanity check

PCA of residual data

Clustering of Samples

Adjust data with covariates for Network Analysis

Differential expression analysis (with Risk Cohort as primary variable)

Differential expression analysis All 'controls' versus Sporatic and FamilialAD cases (with Risk Cohort as primary variable)

Differential expression analysis (with Mutation Status as primary variable)

Differential expression analysis (with Gene Mutation as primary variable)

Differential expression analysis (with Sex as primary variable)

Differential expression analysis (with Disease Cohort and Sex)

Differential expression analysis ( All 'controls' versus Sporatic and FamilialAD cases and Sex )

Differential expression analysis (with Within Sporatic Cohort Between Mut No Mut)

Differential expression analysis ( with FAD_NoMut+HC versus Sporatic (Non-PSEN1) )

Differential expression analysis (familial mutation PSEN1 Only - non-mutation )

Differential expression analysis (familial mutation PSEN1 only - non-mutation+HC )

Store files in synapse

R Package Documentation

Browse R Packages

We want your feedback!

jgockley62/UW_RNASeq
Identify differentially expressed genes counts data