knitr::opts_chunk$set(echo = TRUE) set.seed(1)
## Load packages library(devtools) library(dplyr) library(ggplot2) library(cowplot) library(xtable) load_all('.') # library('gws') library(moiR) library(bigsnpr) library(bigstatsr)
#define the top SNPs gwares<-readRDS('dataint/Fitness_mli.rda') # window=100000 # 150e6 /window # gwares$roundpos<- window*round(gwares$pos/window) # # head(gwares) # # gwares%>% # group_by(roundpos) %>% # summarize(chr = which(p-) ) # subg<-gwares[which(gwares$p_lrt < sort(gwares$p_lrt)[1000]),] head(subg) subg$rs<-paste(subg$chr,subg$pos,sep='_') # dim(gwares) # dim(G)
### Real data # genomes<-readRDS("genome.rda") # data(genomes) G <- genomes$genotypes Map <- genomes$map head(Map) Map$rs<-paste(Map$chr,Map$physical.pos,sep='_') Map$order<-1:nrow(Map) matched<-merge(Map, subg, by='rs') dim(matched) head(matched) X.top=inputena.mat(G[,matched$order]) N.top=nrow(X.top) M.top=ncol(X.top) qplot(fn(X.top), xlab="allele dossage",main="") # X.random = inputena.mat(G[,sample(1:ncol(G),size = 600)]) # N=nrow(X.random) # M=ncol(X.random) # # # X.random = inputena.mat(G[,sample(1:ncol(G),size = 600)]) # N=nrow(X.random) # M=ncol(X.random) qplot(fn(X.random), xlab="allele dossage",main="")
data(dry) Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Fitness_mli")], by.y="id",all.x=T)[,7] Y=relative(Y) Y = meanvarcent(Y) qplot(Y,geom="histogram",main="Fitness Madrid drought") Y.bad=Y Y.bad[is.na(Y.bad)]<-mean(Y.bad,na.rm=TRUE) # Mean impute # # Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Survival_fruit_thi")], by.y="id",all.x=T)[,7] # # Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Flowering_time_thi")], by.y="id",all.x=T)[,7] # Y=merge(genomes$fam,by.x="sample.ID", dry[,c("id","Fitness_thi")], by.y="id",all.x=T)[,7] # Y=relative(Y) # Y = meanvarcent(Y) # qplot(Y,geom="histogram",main="Fitness Tübingen water") # Y.good=Y # Y.good[is.na(Y.good)]<-mean(Y.good,na.rm=TRUE) # Mean impute
bgwa.bad=gws::gwa(X.random,Y.bad) b.bad=gws::cgwa(X.random,Y.bad) blasso.bad=gws::lassogwa(X.random,Y.bad) bridge.bad=gws::ridgegwa(X.random,Y.bad) bgwa.bad=gws::gwa(X.top,Y.bad) b.bad=gws::cgwa(X.top,Y.bad) blasso.bad=gws::lassogwa(X.top,Y.bad) bridge.bad=gws::ridgegwa(X.top,Y.bad) X=X.top Y=Y.bad lambda=0 X.=meanvarcent.mat(X.top) # b.ridge= MASS::ginv(t(X.)%*%X. + lambda*(1-normalize(b.)) *diag(M.top)) %*% t(X.) %*% Y bgwa=gws::gwa(X.top,Y.bad) b.= MASS::ginv(t(X.)%*%X. ) %*% t(X.) %*% Y b.001= MASS::ginv(t(X.)%*%X. + 0.001 *diag(M.top)) %*% t(X.) %*% Y b.01= MASS::ginv(t(X.)%*%X. + 0.01 *diag(M.top)) %*% t(X.) %*% Y b.0.1= MASS::ginv(t(X.)%*%X. + 0.1 *diag(M.top)) %*% t(X.) %*% Y b.1= MASS::ginv(t(X.)%*%X. + 1 *diag(M.top)) %*% t(X.) %*% Y b.10= MASS::ginv(t(X.)%*%X. + 10 *diag(M.top)) %*% t(X.) %*% Y b.100= MASS::ginv(t(X.)%*%X. + 100 *diag(M.top)) %*% t(X.) %*% Y b.ridge= MASS::ginv(t(X.)%*%X. + 100*(normalize(abs(b.))) *diag(M.top)) %*% t(X.) %*% Y b.ridgeLD= MASS::ginv(t(X.)%*%X. + 100*(normalize(abs(bgwa-b.))) *diag(M.top)) %*% t(X.) %*% Y plot(1-normalize(logistic(normalize(abs(b.)),k=5)),x=normalize(abs(b.))) plot(1-normalize(logistic(normalize(abs(b.)),k=10e6,x0 = 0.01)),x=normalize(abs(b.))) b.ridgex= MASS::ginv(t(X.)%*%X. + 1000*(1-normalize(logistic(normalize(abs(b.)),k=10e6,x0 = 0.5))) *diag(M.top)) %*% t(X.) %*% Y plot(b.,b.100, ylim=c(-0.2,+0.2),xlim=c(-0.2,+0.2)) points(b.,b.10,col='blue') points(b.,b.1,col='green') points(b.,b.0.1,col='red') # points(b.,b.01,col='orange') # points(b.,b.001,col='orange') points(b.,b.ridge,col='orange') # points(b.,b.ridgeLD,col='purple') points(b.,b.ridgex,col='purple') # points(b.,b.lasso,col='brown') abline(b = 1,a = 0) plot(b.ridge,b.ridgex) # To be able to have a meaninful prediction using the cond I need leave one out # Y.cond= sapply(1:N,function(i){ # bs=cgwa(X.random[-i,], Y.bad[-i]) # Ytmp=meanvarcent.mat(X.random)[i,] %*% bs # return(Ytmp) # }) # # save(file='dataint/Y.cond.rda',Y.cond) # load('../dataint/Y.cond.rda') # load('dataint/Y.cond.rda') load('../dataint/Y.cond.rda') # trick to get if faster, do not fit one individual # bs=cgwa(X.random[-i,], Y.bad[-i]) # Y.cond2=meanvarcent.mat(X.random) %*% bs
toplot=data.frame(Y=Y.bad, Y.cond=Y.cond, # Y.cond=Y.cond2, Y.marg=(meanvarcent.mat(X.random) %*% bgwa.bad), Y.lasso=(meanvarcent.mat(X.random) %*% blasso.bad) ) ggplot(toplot)+ geom_point(aes(x=Y,y=Y.cond),color="red",alpha=0.2)+ stat_smooth(aes(x=Y,y=Y.cond),color='red')+ geom_point(aes(x=Y,y=Y.marg),color="black",alpha=0.2)+stat_smooth(aes(x=Y,y=Y.marg),color='black')+ geom_point(aes(x=Y,y=Y.lasso),color="blue",alpha=0.2)+stat_smooth(aes(x=Y,y=Y.lasso),color='blue')+ geom_abline(slope = 1,intercept = 0)+ xlab("Real phenotype")+ ylab("Predicted phenotype")
p1<-ggplot(toplot)+ geom_point(aes(x=Y,y=Y.cond),color="red",alpha=0.2)+ stat_smooth(aes(x=Y,y=Y.cond),color='red')+ geom_abline(slope = 1,intercept = 0)+ xlab("Real phenotype")+ ylab("Predicted phenotype") p2<-ggplot(toplot)+ geom_point(aes(x=Y,y=Y.marg),color="black",alpha=0.2)+stat_smooth(aes(x=Y,y=Y.marg),color='black')+ geom_abline(slope = 1,intercept = 0)+ xlab("Real phenotype")+ ylab("Predicted phenotype") p3<-ggplot(toplot)+ geom_point(aes(x=Y,y=Y.lasso),color="blue",alpha=0.2)+stat_smooth(aes(x=Y,y=Y.lasso),color='blue')+ geom_abline(slope = 1,intercept = 0)+ xlab("Real phenotype")+ ylab("Predicted phenotype") plot_grid(p1,p2,p3,ncol=3)
Note: the phenotype prediction in the conditional GWA was done using leave-one-out procedure.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.