library(ggplot2) library(dplyr) library(feather) library(tidyr) library(broom) gtex_marg_h5 <- "/home/nwknoblauch/Desktop/eQTL/Snake/Whole_Blood_Analysis.v6p.all_snpgene_pairs.h5" dgntf <- "~/Downloads/PredictDB_Covariance/DGN-WB/DGN-WB_0.5.db" gtexf <- "~/Downloads/PredictDB_Covariance/TW_Whole_Blood/TW_Whole_Blood_0.5.db" ogtexf <- "~/Downloads/TW_Whole_Blood_0.5.db" gwasf2 <- "/media/nwknoblauch/Data/gwas_data/iibdgc-trans-ancestry-summary-stats/EUR.IBD.gwas.assoc_2.feather" gwas_dat <- read_feather(gwasf2) gwas_dat <- rename(gwas_dat,pvalg=p.value,se_betahat=serr.g) gwas_dat <- mutate(gwas_dat,betahat=ifelse(ref_allele<eff_allele,log(1/OR),log(OR))) %>% select(-eff_allele,-ref_allele) dgndb <- src_sqlite(dgntf,create=F) gtexdb <- src_sqlite(gtexf,create=F) ogtexdb <- src_sqlite(ogtexf,create=F) gtex_marg <- read_h5_df(gtex_marg_h5,groupname = "cis_eQTL",subcols = c("fgeneid","chrom","pos","weight")) # dgn_gene <- tbl(src=dgndb,"extra") # dgn_weights <- tbl(src=dgndb,"weights") # dgn_weights <- mutate(dgn_weights,weight=ifelse(ref_allele<eff_allele,-weight,weight)) %>% select(-eff_allele,-ref_allele) # dgn_var <- group_by(dgn_weights,gene) %>% summarise(weight_var=var(weight),weight_mean=mean(weight)) # ndgn_weights <- inner_join(dgn_weights,gwas_dat,by=c("rsid"),copy = T) # ndgn_weights <- mutate(ndgn_weights,source="DGN") # ndgn_weights <- select(ndgn_weights,gene,chrom,pos,weight,beta,source,serr.g) gtex_gene <- tbl(src=gtexdb,"extra") %>% arrange(pred.perf.qval) gtex_weights <- tbl(src=gtexdb,"weights") gtex_weights <- mutate(gtex_weights,weight=ifelse(ref_allele<eff_allele,-weight,weight)) %>% select(-eff_allele,-ref_allele) ngtex_weights <- inner_join(gtex_weights,gwas_dat,by=c("rsid"),copy=T) %>% mutate(source="GTEx") ngtex_weights <- select(ngtex_weights,gene,chrom,pos,weight,betahat,source,se_betahat) ngtex_weights <- collect(ngtex_weights,n=Inf) %>% %>% filter(!is.na(fgeneid)) gtex_gene <- collect(gtex_gene,n=Inf) %>% mutate(fgeneid=as.integer(gsub("ENSG([0-9]+).+","\\1",gene))) %>% filter(!is.na(fgeneid)) gtex_gene <- group_by(ngtex_weights,fgeneid) %>% summarise(beta_var=var(beta),beta_mean=mean(beta)) %>% inner_join(gtex_gene) gtex_gene <- group_by(gtex_weights,gene) %>% summarise(weight_var=var(weight),weight_mean=mean(weight)) %>% inner_join(gtex_gene,copy = T) gtex_gene <- group_by(gtex_marg,fgeneid) %>% summarise(n.snps.total=n(),marginal_mean=mean(weight),marginal_var=var(weight)) %>% inner_join(gtex_gene,copy=T) gtex_gene <- select(gtex_gene,-fgeneid) write.table(gtex_gene,file = "~/Dropbox/eQTL/GTEx_WB_LASSO_MARGINAL.tsv",sep="\t",col.names=T,row.names=F,quote=F)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
ndgn_weights <- collect(ndgn_weights,n=Inf) ndgn_weights <- mutate(ndgn_weights,fgeneid=as.integer(gsub("ENSG([0-9]+).+","\\1",gene))) %>% filter(!is.na(fgeneid)) ndgn_weights <- semi_join(ndgn_weights,gtex_marg,by=c("fgeneid")) dgn_lambda <- group_by(ndgn_weights,fgeneid) %>% nest() %>% mutate(fit=purrr::map(data,~lm(beta~weight+0,data=.,weights=1/serr.g^2))) dgn_res <- dgn_lambda %>% unnest(fit %>% purrr::map(broom::tidy)) %>% arrange(p.value) ggplot(dgn_res)+geom_histogram(aes(x=p.value)) ngtex_weights <- collect(ngtex_weights,n=Inf) gtex_marg <- semi_join(gtex_marg,ndgn_weights,copy=T,by=c("fgeneid")) gtex_ct <- group_by(gtex_marg,fgeneid) %>% summarise(snp_total=n()) gtex_lambda <- group_by(ngtex_weights,gene) %>% nest() %>% mutate(fit=purrr::map(data,~lm(beta~weight+0,data=.,weights=1/serr.g^2))) gtex_res <- gtex_lambda %>% unnest(fit %>% purrr::map(broom::tidy)) %>% arrange(p.value) gtex_res <- select(gtex_res,gene,estimate,std.error,statistic,p.value) dgn_res <- select(dgn_res,gene,estimate,std.error,statistic,p.value) dgn_gtex_w <- inner_join(dgn_weights,gtex_weights,copy=T,by=c("rsid","gene"),suffix=c(".DGN",".GTEx")) dgn_gtex_w <- collect(dgn_gtex_w,n=Inf) dgn_gtex <- inner_join(dgn_res,gtex_res,by="gene",suffix=c(".DGN",".GTEx")) ggplot(dgn_gtex_w,aes(x=weight.DGN,y=weight.GTEx))+geom_point()+geom_abline()+ggtitle(label="Elastic Net Weights")
ggplot(dgn_gtex,aes(x=estimate.DGN,y=estimate.GTEx))+geom_point()+geom_abline()+ggtitle(label="lambda_DGN vs lambda_GTEx (p<2e-16)")
dgn_gtex_g <- inner_join(dgn_gene,gtex_gene,copy=T,by=c("gene","genename"),suffix=c(".DGN",".GTEx")) dgn_gtex <- inner_join(dgn_gtex,dgn_gtex_g,copy=T,by="gene") dgn_res <- inner_join(dgn_res,dgn_gene,copy=T,by="gene") %>% mutate(source="DGN") dgn_res <- arrange(dgn_res,p.value) gtex_res <- inner_join(gtex_res,gtex_gene,copy=T,by="gene") %>% mutate(source="GTEx") gtex_res <- arrange(gtex_res,p.value)
Let's look at the feasability of doing elastic net genome-wide (in trans)
library(RColumbo) library(glmnet) rawh5f <- "/home/nwknoblauch/Desktop/eQTL/Snake/Whole_Blood_eQTL_v6p_raw_data.h5" snpleg <- read_h5_df(rawh5f,"SNPinfo") expleg <- read_h5_df(rawh5f,"EXPinfo") cistf <- "/home/nwknoblauch/Desktop/eQTL/Snake/GTEx_cistrans.h5" cisleg <- read_h5_df(cistf,groupname = "cis") write_h5_df(df = cisleg,group = "cis",outfile = rawh5f,deflate_level = 4) cistfs <- paste0("/home/nwknoblauch/Desktop/eQTL/GTEx/GTEx_WB_cis_Chr",1:22,".h5")
library(glmnet) library(broom) library(RColumbo) library(BBmisc) nchunks <- 100 mychunk <- as.integer(commandArgs(trailingOnly = F)) rawh5 <- "/home/nwknoblauch/Desktop/eQTL/Snake/Whole_Blood_eQTL_v6p_raw_data.h5" cistf <- "/home/nwknoblauch/Desktop/eQTL/Snake/GTEx_cistrans.h5" cisleg <- read_h5_df(cistf,"cis") expleg <- distinct(cisleg,exp_ind,.keep_all = T) expdat <- read_fmat_chunk_ind(h5file = rawh5,groupname = "EXPdata","orthoexpression",expleg$exp_ind) colnames(expdat) <- as.character(expleg$fgeneid) ufgid <- expleg$fgeneid ensid <- paste0("ENSG",sprintf("%011d",ufgid)) fgidchunk <- chunk(ufgid,n.chunks = nchunks) h5chunks <- paste0("/home/nwknoblauch/Desktop/eQTL/GTEx/cis_elasticnet/chunk_",1:length(fgidchunk),".h5") mh5 <- h5chunks[mychunk] mufgid <- fgidchunk[[mychunk]] ensid <- paste0("ENSG",sprintf("%011d",mufgid)) overall_tl <- list() lambda_min_tl <- list() weights_tl <- list() bcisleg <- filter(cisleg,fgeneid %in% mufgid) rm(cisleg) gc() bsnpleg <- distinct(bcisleg,snp_ind,.keep_all = T) %>% mutate(snp_name=paste0("chr",chrom,"_",pos)) bsnpdat <- read_fmat_chunk_ind(rawh5,"SNPdata","genotype",bsnpleg$snp_ind) colnames(bsnpdat) <- paste0("chr",bsnpleg$chrom,"_",bsnpleg$pos) bcisleg <- mutate(bcisleg,snp_name=paste0("chr",chrom,"_",bcisleg$pos)) namel <- split(bcisleg$snp_name,bcisleg$fgeneid) for(i in 1:length(namel)){ cat(ensid[i],"_",i,"\n") texp <- expdat[,as.character(mufgid[i])] snp_data <- bsnpdat[,namel[[i]]] pred_fit <- cv.glmnet(x = snp_data,y = texp,alpha=0.95,parallel = F) overall_tl[[as.character(mufgid[i])]] <- tidy(pred_fit) %>% mutate(fgeneid=mufgid[i]) lambda_min_tl[[as.character(mufgid[i])]] <- glance(pred_fit) %>% mutate(fgeneid=mufgid[i]) weights_tl[[as.character(mufgid[i])]] <- tidy(pred_fit$glmnet.fit) %>% mutate(fgeneid=mufgid[i]) %>% filter(estimate!=0) } overall <- bind_rows(overall_tl) lambda_min <- bind_rows(lambda_min_tl) weights <- select(bsnpleg,chrom,pos,snp_name) %>% right_join(bind_rows(weights_tl)) weights <- mutate(weights,chrom=ifelse(is.na(chrom),0,chrom),pos=ifelse(is.na(pos),0,pos)) %>% select(-snp_name) write_h5_df(weights,group = "weights",outfile = mh5,deflate_level = 4) write_h5_df(overall,group = "overall",outfile = mh5,deflate_level = 4) write_h5_df(lambda_min,group = "lambda_min",outfile = mh5,deflate_level = 4)
snpleg <- read_h5_df(snplegf,groupname="SNPinfo") %>% mutate(snpind=1:n()) expleg <- read_h5_df(snplegf,groupname="EXPinfo") %>% mutate(geneind=1:n(),start=pmax(start-1e6,0),end=end+1e6) cistrans <- inner_join(snpleg,expleg,by="chr") %>% filter(between(pos,start,end)) subleg <- filter(expleg,fgeneid==107929) subexp <- read_fmat_chunk_ind(snplegf,"EXPdata","orthoexpression",indvec = subleg$geneind) sub_weights <- filter(ngtex_weights,fgeneid==107929) nsub_weights <- select(sub_weights,chrom,pos,weight) sub_gtex <- filter(gtex_marg,fgeneid==107929) sub_snpleg <- rename(snpleg,chrom=chr) %>% semi_join(sub_gtex,by=c("chrom","pos")) sub_geno <- read_fmat_chunk_ind(snplegf,"SNPdata","orthogenotype",indvec = sub_snpleg$snpind) set.seed(42) sub_genores <- cv.glmnet(sub_geno,c(subexp),alpha=0.95) sub_df <- tidy(sub_genores) sub_g <- glance(sub_genores) sub_b <- broom::augment(sub_genores) fit.df <- data.frame(sub_genores$cvm, sub_genores$lambda, 1:length(sub_genores$cvm)) best.lam <- fit.df[which.min(fit.df[,1]),] cvm.best <- best.lam[,1] lambda.best <- best.lam[,2] # Position of best lambda in cv.glmnet output nrow.best <- best.lam[,3] # Get the betas from the best lambda value ret <- data_frame(weight=sub_genores$glmnet.fit$beta[,nrow.best]) sub_snpleg <- bind_cols(sub_snpleg,ret) nsub_snpleg <- inner_join(sub_snpleg,nsub_weights,by=c("chrom","pos"),suffix=c(".mine",".predixcan"))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.