knitr::opts_chunk$set(echo = TRUE)
library( tidyverse ) ## Define Synapse downloader(s) synapser::synLogin(email='nathan_johnson@hms.harvard.edu', apiKey='07rRy7r0BjxoRPF6YNLFpJBj10LAekFHeCaPwpg2FYNOn4YbINW7RAVkjy7Bf8JqsxED/S5VaJYg14Wi/tDgmQ==') syn <- synExtra::synDownloader("./data", ifcollision="overwrite.local") syn_csv <- function( id ) {syn(id) %>% read_csv(col_types=cols())}
## Download and wrangle DGE data DGE <- loadDGE() ## Download ISG list (positive control) ISG <- syn("syn11629935") %>% scan( what=character() ) ## TAS scores TAS <- syn_csv("syn18268627") %>% mutate_at("name", str_to_lower) ## Transcription Factor -> Target relationships fnTF <- str_c("https://raw.githubusercontent.com/bioinfonerd/", "Transcription-Factor-Databases/master/Ttrust_v2/human_TF_GSEA/all.gmt") TF <- parseTF(fnTF) %>% c( list(ISG=ISG) ) ## JAK2 dfxJAK2 <- myEdgeR( "JAK2", DGE, TAS ) tfJAK2 <- dfxJAK2 %>% myGSEA( TF )
#Artem's DGE <- loadDGE() TAS <- syn_csv("syn18268627") %>% mutate_at("name", str_to_lower) target<-"JAK2" ## Identify binders and non-binders dPos <- TAS %>% filter( symbol == target, tas %in% c(1,2,3) ) %>% pull(name) %>% unique() dNeg <- TAS %>% filter( symbol == target, tas == 10 ) %>% pull(name) %>% unique() ## Pull corresponding wells wPos <- DGE$Y %>% filter( Drug %in% dPos ) %>% select(Well) %>% mutate( Binder = "Yes" ) wNeg <- DGE$Y %>% filter( Drug %in% dNeg ) %>% select(Well) %>% mutate( Binder = "No" ) ## Compose inputs for edgeR eRY <- bind_rows(wPos, wNeg) %>% mutate_at("Binder", factor, levels=c("No","Yes")) %>% as.data.frame %>% column_to_rownames("Well") eRX <- DGE$X %>% select(HUGO, rownames(eRY)) %>% as.data.frame %>% column_to_rownames("HUGO") ## Create the design matrix and estimate dispersion dl <- edgeR::DGEList( counts = eRX, samples = eRY ) dl <- edgeR::calcNormFactors( dl ) mmx_A <- model.matrix( ~Binder, data = dl$samples ) #mmx_A_t #make orders identical to see if change result (doesn't) dl <- edgeR::estimateDisp( dl, mmx_A ) ## Compute differential gene expression gf <- edgeR::glmFit( dl, mmx_A ) %>% edgeR::glmLRT( coef = 2 ) output_A<-edgeR::topTags( gf, nrow(eRX) ) %>% as.data.frame %>% rownames_to_column( "Gene" )
#Nathan's (removing any extra print/extra statements (TAS function prints)) (change variables to not overwrite Artem's) count_file_name<-'DGE1_DGE2_counts' meta_file_name<-'DGE1_DGE2_meta' counts_N<-counts_load(count_file_name) meta<-meta_load(meta_file_name) TAS<-TAS_load() meta<-TAS_drug_binder(TAS_tibble=TAS,drug_target=target,meta_tibble=meta) meta<-as.data.frame(meta) #modify counts & meta data [TODO]: add ability for QC to filter automatically HUGO_N<-counts_N$HUGO#removes HUGO name counts_N <- counts_N[ , (names(counts_N) %in% meta$Well)] #remove unused samples #rownames(counts_N)<-HUGO_N #setting rownames on a tibble is depracated counts_N<-as.data.frame(counts_N) #to match Artem rownames(counts_N)<-HUGO_N #formula formula=as.formula(paste(c('~','Binder',collapse = ' '))) ## Initialize the DGEList object y <- DGEList( counts = counts_N, samples = meta, remove.zeros = FALSE) #count matrix by sample group (remove rows with zeros and changed when gene names are added) (Artem's) #y <- DGEList( counts = counts_N, samples = meta, genes = HUGO_N,remove.zeros = TRUE) #count matrix by sample group (remove rows with zeros) (original) y <- calcNormFactors(y) #normalizes for RNA composition by finding a set of scaling factors for the library sizes that minimize the log-fold changes between the samples for most genes mmx_N <- model.matrix(formula, data = y$samples ) y <- estimateDisp( y, mmx_N ) glf <- glmFit( y, mmx_N ) %>% glmLRT( coef=2) #consistency with Artem's data ## Perform quasi-likelihood F-test #qlf <- glmQLFit( y, mmx ) %>% glmQLFTest( coef=2:ncol(mmx) ) #possibly better for DGE data due to high 0 data # Create the differential expression directory, if needed [TODO]: make into function #output_N<-as.data.frame(topTags(glf,n=nrow(glf$table),p.value = 1)) output_N<-edgeR::topTags( gf, nrow(counts_N) ) %>% as.data.frame %>% rownames_to_column( "Gene" )
#final edgeR A vs N output cor(output_A %>% arrange(Gene) %>% select(logCPM),output_N %>% arrange(Gene) %>% select(logCPM)) cor(output_A %>% arrange(Gene) %>% select(logFC),output_N %>% arrange(Gene) %>% select(logFC)) cor(output_A %>% arrange(Gene) %>% select(FDR),output_N %>% arrange(Gene) %>% select(FDR)) print('logCPM highly correlated but not FDR or logFC')
#sample well input? identical(meta %>% filter(Binder == 'Yes') %>% arrange(Well) %>% select(Well), wPos %>% arrange(Well) %>% select(Well)) identical(meta %>% filter(Binder == 'No') %>% arrange(Well) %>% select(Well), wNeg %>% arrange(Well) %>% select(Well)) print('same well input') #sample wells in count data? identical(names(eRX) %>% sort(),names(counts_N) %>% sort()) print('Same Wells in Count Data') #same count data? #reorder by colnames tmp<-names(eRX) %>% sort() eRX_t <- eRX[,tmp] eRX_t <- as_tibble(eRX_t) tmp<-names(counts_N) %>% sort() counts_N_t <- counts_N[,tmp] identical(counts_N_t,eRX_t) print('Same input counts data, however had to reorder columns...')
#model matrix identical(mmx_A,mmx_N) tmp<-rownames(mmx_A) %>% sort() mmx_A_t <- mmx_A[tmp,]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.