##--------------------------------------------------------------------------------------------------------
## evaluate results
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate<- function()
{
require(RColorBrewer)
dfa <- project.PANGEA.TEST.pipeline.Aug2015.evaluate.read()
# check for updated submissions, and keep
dfa <- project.PANGEA.TEST.pipeline.Aug2015.keep.most.recent.submission(dfa, format='%d.%m.%Y')
# add objective legend
dfa <- merge(dfa, data.table(USED_GENES=c('pol','all'), USED_GENES_L=c('pol gene','pol+gag+env\ngenome') ), by='USED_GENES')
set(dfa, NULL, 'TEAM', dfa[, factor(TEAM)])
tmp <- data.table( OBJ= c('OBJ_i','OBJ_ii','OBJ_iii','OBJ_iv','OBJ_v','OBJ_vi'),
OBJ_L= c('Incidence\nTrend', '%Incidence', 'Incidence\nreduction', '%Acute Ctgr\n(baseline)', '%Acute\n(baseline)', '%Acute\n(endpoint)'))
set(tmp, NULL, 'OBJ_L2', tmp[, factor(OBJ_L, levels=OBJ_L, labels=OBJ_L)])
set(tmp, NULL, 'OBJ_L', tmp[, factor(OBJ_L, levels=rev(OBJ_L), labels=rev(OBJ_L))])
dfa <- merge(dfa, tmp, by='OBJ')
# add data legend
dfa[, DATA_T2:='NA_character_']
set(dfa, dfa[, which(DATA_T=='seq')], 'DATA_T2', 'using\nsequences')
set(dfa, dfa[, which(DATA_T=='phy')], 'DATA_T2', 'using\ntrue tree')
set(dfa, NULL, 'DATA_T2', dfa[, factor(DATA_T2, levels=rev(c('using\nsequences','using\ntrue tree')), labels=rev(c('using\nsequences','using\ntrue tree')))])
# add scenario type
set(dfa, NULL, 'DATA_T', dfa[, factor(DATA_T, levels=c('seq','phy'), labels=c('seq','phy'))])
set(dfa, NULL, 'INT_T', dfa[, factor(INT_T, levels=c('fast','slow','none'), labels=c('fast','slow','none'))])
set(dfa, NULL, 'AC_T', dfa[, factor(AC_T, levels=c('low','high'), labels=c('low','high'))])
set(dfa, NULL, 'IMPRT', dfa[, factor(IMPRT*100, levels=c(0,2,5,20), labels=paste(c(0,2,5,20),'%',sep=''))])
set(dfa, NULL, 'SMPL_C', dfa[, factor(SMPL_C*100, levels=c(8, 16, 30, 60), labels=paste(c(8, 16, 25, 50),'%',sep=''))])
set(dfa, NULL, 'SMPL_D', dfa[, factor(SMPL_D, levels=c(5,3), labels=c(5,3))])
set(dfa, dfa[, which(SMPL_M=='overs')], 'SMPL_M', 'much')
set(dfa, dfa[, which(SMPL_M=='extrs')], 'SMPL_M', 'extreme')
set(dfa, dfa[, which(is.na(SMPL_M))], 'SMPL_M', 'extreme')
set(dfa, NULL, 'SMPL_M', dfa[, factor(SMPL_M, levels=c('much','extreme'), labels=c('much','extreme'))])
# save submissions
outdir <- '~/Dropbox (SPH Imperial College)/PANGEAHIVsim_internal/documents/external/2015_08_results/results'
save(dfa, file=paste(outdir,'/submissions.R',sep=''))
load(paste(outdir,'/submissions.R',sep=''))
tmp <- unique(subset( dfa, select=c(DATAT_L, SC_RND, DATA_T, SC, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D) ))
setkey(tmp, DATAT_L, AC_T, INT_T, DATA_T, IMPRT, SMPL_C, SMPL_D, SMPL_M)
tmp[, SCENARIO_L:= paste('%AC=',AC_T,' ARTup=',INT_T,' EXT=',IMPRT,'\n',DATA_T,' ',SMPL_N,' ',SMPL_C,' ',SMPL_D,' ',SMPL_M, ' (',SC_RND,')',sep='')]
dfa <- merge(dfa, subset(tmp, select=c(SC_RND, SCENARIO_L)), by='SC_RND')
# add intervention legend
dfa[, INT_L:= dfa[, paste('ART scale up\n',as.character(INT_T),sep='')]]
setkey(dfa, INT_T)
set(dfa, NULL, 'INT_L', dfa[, factor(INT_L, levels=dfa[, unique(INT_L)], labels=dfa[, unique(INT_L)])])
# add %Acute legend
dfa[, AC_L:= dfa[, paste('%Acute\n',as.character(AC_T),sep='')]]
setkey(dfa, AC_T)
set(dfa, NULL, 'AC_L', dfa[, factor(AC_L, levels=dfa[, unique(AC_L)], labels=dfa[, unique(AC_L)])])
# get data.table of data sets ~ all primary and secondary objectives
dfd <- subset(dfa, select=c(SC_RND, DATA_T, DATAT_L, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D))
setkey(dfd, DATAT_L, INT_T, AC_T, DATA_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D)
dfd <- unique(dfd)
# Primary Objectives, on sequences
tmp <- data.table(expand.grid(ANA='Pr_Seq', SC_RND=subset(dfd, DATA_T=='seq')[, SC_RND], stringsAsFactors=FALSE))
dfr <- copy(tmp)
# Primary Objectives, on trees
tmp <- subset(dfd, DATA_T=='phy' & SMPL_D==5 & (DATAT_L=='Regional' & IMPRT=='5%' & SMPL_M=='much' & SMPL_N==1600 | DATAT_L=='Village' & SMPL_C=='25%') )
dfr <- rbind(dfr, data.table(expand.grid(ANA='Pr_Phy', SC_RND=tmp[, SC_RND], stringsAsFactors=FALSE)))
# Secondary: sequence coverage
tmp <- subset(dfd, DATA_T=='phy' & SMPL_D==5 & INT_T!='none' & (DATAT_L=='Regional' & IMPRT=='5%' & SMPL_M=='much' | DATAT_L=='Village') )
dfr <- rbind(dfr, data.table(expand.grid(ANA='Sc_SeqCoverage_Phy', SC_RND=tmp[, SC_RND], stringsAsFactors=FALSE)))
# Secondary: imports
tmp <- subset(dfd, DATA_T=='phy' & SMPL_D==5 & INT_T!='none' & AC_T=='high' & (DATAT_L=='Regional' & SMPL_M=='much' & SMPL_C=='8%') )
dfr <- rbind(dfr, data.table(expand.grid(ANA='Sc_Imports_Phy', SC_RND=tmp[, SC_RND], stringsAsFactors=FALSE)))
# Secondary: focussed sampling
tmp <- subset(dfd, DATA_T=='phy' & SMPL_D==5 & INT_T!='none' & AC_T=='low' & SMPL_C=='8%' & IMPRT=='5%' & DATAT_L=='Regional' )
dfr <- rbind(dfr, data.table(expand.grid(ANA='Sc_SmplFc_Phy', SC_RND=tmp[, SC_RND], stringsAsFactors=FALSE)))
# Secondary: sampling duration
tmp <- subset(dfd, DATA_T=='phy' & INT_T!='none' & DATAT_L=='Regional' & SMPL_M=='much' & SMPL_C=='8%' & IMPRT=='5%' & INT_T=='fast')
dfr <- rbind(dfr, data.table(expand.grid(ANA='Sc_SmplD_Phy', SC_RND=tmp[, SC_RND], stringsAsFactors=FALSE)))
# merge with dfa
dfr <- dcast.data.table(dfr, SC_RND~ANA, value.var='SC_RND')
set(dfr, NULL, 'Pr_Phy', dfr[, as.numeric(!is.na(Pr_Phy))])
set(dfr, NULL, 'Pr_Seq', dfr[, as.numeric(!is.na(Pr_Seq))])
set(dfr, NULL, 'Sc_Imports_Phy', dfr[, as.numeric(!is.na(Sc_Imports_Phy))])
set(dfr, NULL, 'Sc_SeqCoverage_Phy', dfr[, as.numeric(!is.na(Sc_SeqCoverage_Phy))])
set(dfr, NULL, 'Sc_SmplD_Phy', dfr[, as.numeric(!is.na(Sc_SmplD_Phy))])
set(dfr, NULL, 'Sc_SmplFc_Phy', dfr[, as.numeric(!is.na(Sc_SmplFc_Phy))])
dfr <- merge(unique(subset(dfa, select=c(DATAT_L,SC_RND))), dfr, by='SC_RND')
dfa <- merge(dfa, dfr, by=c('SC_RND','DATAT_L'))
#
# reset Vancouver to UBC
#
set( dfa, dfa[, which(TEAM=='Vancouver')], 'TEAM', 'British Columbia' )
#
# set team color
#
tmp <- c('Cambridge','Cambridge/Imperial','ETH Zurich','Imperial','British Columbia','True','Cambridge/Imperial (chronos)','Cambridge/Imperial (lsd)','Cambridge/Imperial (mh15)','Cambridge/Imperial (mh30)','Cambridge/Imperial (merged)')
TEAM_CL <- c( brewer.pal(5, 'Set1'), 'black', brewer.pal(5, 'Set2') )
names(TEAM_CL) <- tmp
# write info on scenario IDs to table
dfi<- subset( dfa, select=c(SC_RND, DATAT_L, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D) )
setkey(dfi, DATAT_L, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D)
dfi <- unique(dfi)
tmp <- subset(dfa, OBJ!='OBJ_iv' & TEAM!='True' & !grepl('(',TEAM,fixed=1))[, list(SUB_N=length(central)), by=c('SC_RND','TEAM')]
tmp <- dcast.data.table(tmp, SC_RND~TEAM, value.var='SUB_N', fill=0L)
dfi <- merge(dfi, tmp, by='SC_RND')
setkey(dfi, DATAT_L, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D)
file<- paste(outdir,'/SC_RND_info.csv',sep='')
write.csv(dfi, file=file, row.names=FALSE)
#
melt(dfi, measure.vars=c('Cambridge','Cambridge/Imperial','ETH Zurich','Imperial','British Columbia'))[, list(SUM=sum(value), COMPLETE=sum(value)/(5*33)), by='variable']
#
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.sortedIncidenceRatio(dfa, outdir)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.IncidenceTrends(dfa, outdir)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.IncidenceRatioCovariates(dfa, outdir)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.sortedPCIncidence(dfa, outdir)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.PCIncidenceCovariates(dfa, outdir)
#
project.PANGEA.TEST.pipeline.Aug2015.evaluate.overallnumbers(dfa, outdir)
#
# no results on pol submitted, focus on full genome only. get sample sizes per objective.
project.PANGEA.TEST.pipeline.Aug2015.evaluate.samplesize(dfr, dfa, outdir)
#
tmp <- subset(dfa, select=c(TEAM, DATA_T, DATAT_L, SIM_SCENARIO))
setkey(tmp, TEAM, DATA_T, DATAT_L, SIM_SCENARIO)
tmp <- unique(tmp)
tmp[, table(TEAM, DATA_T, DATAT_L )]
# for each primary objective
# compare results across teams
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.incidence(dfa, outdir, onSeq=1)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.incidence(dfa, outdir, onSeq=0)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.acute(dfa, outdir, onSeq=1)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.acute(dfa, outdir, onSeq=0)
# for each secondary objective
# compare results across teams
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.imports(dfa, outdir)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.focussedsampling(dfa, outdir)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.sduration(dfa, outdir)
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.seqcoverage(dfa, outdir)
# for each team
# all results
invisible(sapply(setdiff(dfa[, unique(TEAM)],'True'), function(x)
{
#x <- 'Imperial'
df <- subset(dfa, (TEAM=='True' | TEAM==x) & USED_GENES=='all')
set(df, df[, which(TEAM==x)], 'TEAM', 'estimate')
set(df, df[, which(TEAM=='True')], 'TEAM', 'true value')
set(df, NULL, 'TEAM', df[, factor(TEAM, levels=c('estimate','true value'), labels=c('estimate','true value'))])
ggplot(df, aes(y=SCENARIO_L, x=central, xmin=lower95, xmax=upper95, colour=TEAM, pch=TEAM)) +
geom_errorbarh(height=0.3) + geom_point(size=3) +
scale_colour_manual(values = c("red","black")) +
scale_shape_manual(values = c(13,18), guide = FALSE) +
labs(x='', y='', title= paste('TEAM',x,'\n'), colour='') +
facet_grid(DATAT_L~OBJ_L2, scales='free', space='free_y') +
theme_bw() + theme(legend.position='bottom')
ggsave(file=paste(outdir,'/res_obj_TEAM_',gsub(' ','_',gsub('\\/|\\(|\\)','',x)),'.pdf',sep=''), w=14, h=0.5*df[, length(unique(SCENARIO_L))])
# results using seq data
df <- subset(dfa, (TEAM=='True' | TEAM==x) & USED_GENES=='all' & DATA_T=='seq')
set(df, df[, which(TEAM==x)], 'TEAM', 'estimate')
set(df, df[, which(TEAM=='True')], 'TEAM', 'true value')
set(df, NULL, 'TEAM', df[, factor(TEAM, levels=c('estimate','true value'), labels=c('estimate','true value'))])
ggplot(df, aes(y=SCENARIO_L, x=central, xmin=lower95, xmax=upper95, colour=TEAM, pch=TEAM)) +
geom_errorbarh(height=0.3) + geom_point(size=3) +
scale_colour_manual(values = c("red","black")) +
scale_shape_manual(values = c(13,18), guide = FALSE) +
labs(x='', y='', title= paste('TEAM',x,'\n'), colour='') +
facet_grid(DATAT_L~OBJ_L2, scales='free', space='free_y') +
theme_bw() + theme(legend.position='bottom')
ggsave(file=paste(outdir,'/res_objonseq_TEAM_',gsub(' ','_',gsub('\\/|\\(|\\)','',x)),'.pdf',sep=''), w=14, h=0.7*df[, length(unique(SCENARIO_L))])
}))
}
##--------------------------------------------------------------------------------------------------------
## evaluate results
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.overallnumbers<- function(dfa, outdir)
{
# count total submissions primary vs secondary
tmp <- subset(dfa, TEAM!='True' & !grepl('(', TEAM, fixed=1))
tmp <- tmp[, list( Village=length(which(grepl('Vill',SIM_SCENARIO))), Regional=length(which(grepl('Regional',SIM_SCENARIO)))), by=c('TEAM','OBJ_L','USED_GENES_L')]
tmp <- melt(tmp, measure.vars=c('Village','Regional'))
ggplot(tmp, aes(x=OBJ_L, y=value, fill=TEAM)) + geom_bar(stat='identity') +
facet_grid(USED_GENES_L~variable) +
guides(fill=guide_legend(ncol=2)) +
scale_fill_manual(values=TEAM_CL) +
labs(x='', y='submissions\n(#)', title='Total scenarios submitted\n(using sequence data or true trees)\n') +
theme_bw()+ theme(legend.position='bottom') + coord_flip()
ggsave(file=paste(outdir,'/res_scenarios_total.pdf',sep=''), w=10, h=8)
# count all submissions for primary objectives
tmp <- subset(dfa, TEAM!='True' & !grepl('(', TEAM, fixed=1) & DATA_T=='seq')
tmp <- tmp[, list( Village=length(which(grepl('Vill',SIM_SCENARIO))), Regional=length(which(grepl('Regional',SIM_SCENARIO)))), by=c('TEAM','OBJ_L')]
tmp <- melt(tmp, measure.vars=c('Village','Regional'))
ggplot(tmp, aes(x=OBJ_L, y=value, fill=TEAM)) + geom_bar(stat='identity') +
facet_grid(~variable) +
labs(x='', y='submissions\n(#)', title='Total scenarios submitted\n(using sequence data)\n') +
scale_fill_manual(values=TEAM_CL) +
guides(fill=guide_legend(ncol=2)) +
theme_bw() + theme(legend.position='bottom') + coord_flip()
ggsave(file=paste(outdir,'/res_scenarios_total_seqonly.pdf',sep=''), w=10, h=5)
# count complete submissions for primary objectives
tmp <- subset(dfa, TEAM!='True' & !grepl('(', TEAM, fixed=1) & DATA_T=='seq')
tmp <- tmp[, list( Village=as.numeric(length(setdiff(c('01','02','03','04'),SC_RND))==0), Regional=as.numeric(length(setdiff(c('A','B','C','D'),SC))==0)), by=c('TEAM','OBJ_L','USED_GENES_L')]
tmp <- melt(tmp, measure.vars=c('Village','Regional'))
ggplot(tmp, aes(x=OBJ_L, y=value, fill=TEAM)) + geom_bar(stat='identity') +
facet_grid(USED_GENES_L~variable) +
scale_y_continuous(breaks=seq(1,10,1), minor_breaks=NULL) +
scale_fill_manual(values=TEAM_CL) +
labs(x='', y='complete set of 4 submissions\n(#)', title='Complete submissions to evalute primary objectives\n(either village or regional)') +
guides(fill=guide_legend(ncol=2)) +
theme_bw() + theme(legend.position='bottom') + coord_flip()
ggsave(file=paste(outdir,'/res_scenarios_total_seqonlycomplete.pdf',sep=''), w=10, h=7)
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.keep.most.recent.submission<- function(dfa, format='%d.%m.%Y')
{
tmp <- dfa[, list(SUB_N=length(unique(SUBMISSION_DATE)), SUB_DATE=unique(SUBMISSION_DATE)), by=c('TEAM','DATAT_L','OBJ')]
tmp <- subset(tmp, SUB_N>1)
set(tmp, NULL, 'SUB_DATE', tmp[,as.Date(SUB_DATE, format=format)])
# for each objective, determine submissions that are to be discarded
tmp <- tmp[, list(SUB_DATE=SUB_DATE[SUB_DATE!=max(SUB_DATE)]), by=c('TEAM','DATAT_L','OBJ')]
for(i in seq_len(nrow(tmp)))
set(dfa, dfa[, which(TEAM==tmp$TEAM[i] & DATAT_L==tmp$DATAT_L[i] & OBJ==tmp$OBJ[i] & SUBMISSION_DATE==tmp[, as.character(tmp$SUB_DATE[i], format=format)])], 'central', NA_real_)
subset(dfa, !is.na(central))
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.incidence<- function(dfa, outdir, onSeq=1)
{
require(gridExtra)
#,'OBJ_v','OBJ_vi'
# compare objectives with / without seq data, village + regional
if(onSeq)
{
df <- subset(dfa, Pr_Seq==1)
title <- '\nPrimary objective\nIncidence from sequence data\n'
}
if(!onSeq)
{
df <- subset(dfa, Pr_Phy==1)
title <- '\nPrimary objective\nIncidence when phylogeny known\n'
}
tmp <- subset(df, !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_ii'))
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
p1 <- ggplot(subset(tmp, TEAM!='True'), aes(y=gsub('\n',':',paste(INT_L,' ',AC_L,sep='')), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 10)) +
scale_x_continuous(breaks=seq(1,9,1)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n%Incidence', y='')
tmp <- subset(df, !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_iii'))
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
p2 <- ggplot(subset(tmp, TEAM!='True'), aes(y=gsub('\n',':',paste(INT_L,' ',AC_L,sep='')), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
geom_vline(xintercept=1, colour='grey50', size=0.8) +
coord_cartesian(xlim=c(0, 2)) +
scale_colour_manual(values=TEAM_CL) +
scale_x_continuous(breaks=seq(0.2,1.8,0.4), minor_breaks=seq(0.2,1.8,0.2)) +
facet_grid(TEAM~DATAT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\nIncidence reduction', y='')
pdf(file=paste(outdir,'/res_acrossTEAM_PrimaryIncidence_onSeq',onSeq,'.pdf',sep=''), width = 15, height = 8)
print(grid.arrange(p1, p2, nrow=1, main=title))
dev.off()
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.acute2<- function(dfa, outdir, onSeq=1)
{
require(gridExtra)
if(onSeq)
{
df <- subset(dfa, Pr_Seq==1)
title <- '\nPrimary objective\n%Acute from sequence data\n'
}
if(!onSeq)
{
df <- subset(dfa, Pr_Phy==1)
title <- '\nSecondary objective\n%Acute when phylogeny known\n'
}
tmp <- subset(df, !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_v'))
tmp3 <- subset(tmp, TEAM=='True', select=c(SC_RND, central))
setnames(tmp3, 'central', 'central_true')
tmp <- subset(merge(tmp, tmp3, by='SC_RND', all.x=TRUE), TEAM!='True')
ggplot(tmp, aes(x=central_true, y=central, colour=TEAM, pch= gsub('\n',':',paste(INT_L,' ',AC_L,sep='')))) +
geom_point() +
geom_abline(slope=1, intercept=0) +
scale_colour_manual(values=TEAM_CL) +
facet_wrap(~DATAT_L, scales='free') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3), pch=guide_legend(ncol=3)) +
labs(x= '\ntrue % Acute at baseline', y='\nestimated % Acute at baseline', pch='')
}
##--------------------------------------------------------------------------------------------------------
## olli 25.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.IncidenceTrends<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_iii')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_iii')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, central_t)), by=c('SC_RND','TEAM'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_i', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_i')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
set(tmp, tmp[, which(is.na(OBJ_i))], 'OBJ_i', 2)
cnts <- as.data.table(melt(tmp[, table(central_t<0.75, as.character(OBJ_i), as.character(TEAM)) ]))
setnames(cnts, c('Var1','Var2','Var3','value'), c('TRUE_TREND','OBJ_i','TEAM','CNT'))
set(cnts, NULL, 'OBJ_i', cnts[, factor(OBJ_i, levels=c(-1,0,1,2), labels=c('declining','stable','increasing','missing'))])
set(cnts, NULL, 'TRUE_TREND', cnts[, factor(TRUE_TREND, levels=c(FALSE,TRUE), labels=c('g75pc','l75pc'))])
cnts <- subset(cnts, TRUE_TREND=='l75pc')
tmp2 <- cnts[, list(CNT= round(100*CNT[OBJ_i=='declining']/sum(CNT),d=0), OBJ_i='TPR' ), by=c('TRUE_TREND','TEAM')]
cnts <- rbind(cnts, tmp2, use.names=TRUE)
cnts <- dcast.data.table( subset(cnts, TRUE_TREND=='l75pc'), TRUE_TREND+OBJ_i~TEAM, value.var='CNT')
ans <- copy(cnts)
cnts <- as.data.table(melt(tmp[, table(central_t>0.85, as.character(OBJ_i), as.character(TEAM)) ]))
setnames(cnts, c('Var1','Var2','Var3','value'), c('TRUE_TREND','OBJ_i','TEAM','CNT'))
set(cnts, NULL, 'OBJ_i', cnts[, factor(OBJ_i, levels=c(-1,0,1,2), labels=c('declining','stable','increasing','missing'))])
set(cnts, NULL, 'TRUE_TREND', cnts[, factor(TRUE_TREND, levels=c(FALSE,TRUE), labels=c('l85pc','g85pc'))])
cnts <- subset(cnts, TRUE_TREND=='g85pc')
tmp2 <- cnts[, list(CNT= round(100*CNT[OBJ_i=='declining']/sum(CNT),d=0), OBJ_i='FPR' ), by=c('TRUE_TREND','TEAM')]
cnts <- rbind(cnts, tmp2, use.names=TRUE)
cnts <- dcast.data.table( subset(cnts, TRUE_TREND=='g85pc'), TRUE_TREND+OBJ_i~TEAM, value.var='CNT')
ans <- rbind(ans, cnts)
write.csv(ans, file=paste(outdir, '/', 'res_acrossTEAM_Primary_IncidenceTrend.csv', sep=''), row.names=FALSE)
}
##--------------------------------------------------------------------------------------------------------
## olli 26.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.CovariatesPCIncidence<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_ii')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_ii')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, central_t)), by=c('SC_RND','TEAM'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_i', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_i')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
tmp <- subset(tmp, !is.na(central), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, central_t))
tmp[, RES:= log(central)-log(central_t)]
tmp[, OUTLIER:= central>20]
dfc <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central), COR_LOG=cor(log(central_t), log(central)),
MAE=mean( abs(central-central_t) ), MAE_L=mean( abs(log(central)-log(central_t)) ),
MSE=mean( (central-central_t)^2 ), MSE_L=mean( (log(central)-log(central_t))^2 ),
ARME=mean(central-central_t), ARME_L=mean(log(central)-log(central_t))), by='TEAM']
dfc2 <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central), COR_LOG=cor(log(central_t), log(central)),
MAE=mean( abs(central-central_t) ), MAE_L=mean( abs(log(central)-log(central_t)) ),
MSE=mean( (central-central_t)^2 ), MSE_L=mean( (log(central)-log(central_t))^2 ),
ARME=mean(central-central_t), ARME_L=mean(log(central)-log(central_t))), by=c('TEAM','DATAT_L')]
require(gamlss)
bw.AIC <- vector('list', tmp[, length(unique(TEAM))])
names(bw.AIC) <- tmp[, unique(TEAM)]
bw.BIC <- vector('list', tmp[, length(unique(TEAM))])
names(bw.BIC) <- tmp[, unique(TEAM)]
for(x in names(bw.AIC))
{
cat('\nprocess',x)
df <- subset(tmp, TEAM==x)
set(df, NULL, c('lower95','upper95'), NULL)
if(x!='Cambridge')
mnoa <- gamlss(RES~DATAT_L+DATA_T+AC_T+INT_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D-1, data=df, family=NO, trace=FALSE)
if(x=='Cambridge')
mnoa <- gamlss(RES~AC_T+INT_T+IMPRT+SMPL_N-1, data=df, family=NO, trace=FALSE)
bw.AIC[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(df)))
}
dfcoeff <- tmp[, {
z <- names(coef(bw.AIC[[as.character(TEAM)]]))
z2 <- names(coef(bw.BIC[[as.character(TEAM)]]))
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( BWAIC=z, BWBIC=z2 )
}, by='TEAM']
}
##--------------------------------------------------------------------------------------------------------
## olli 26.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.sortedPCIncidence<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_ii')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_ii')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, central_t)), by=c('SC_RND','TEAM'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_i', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_i')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
ggplot( tmp, aes(x=SC_RND, group=TEAM, colour=gsub('using\n','',DATA_T2))) +
#geom_point(aes(y=central, shape=factor(OBJ_i, levels=c(-1,0,1), labels=c('declining','stable','increasing'))), position=position_dodge(width = 0.90)) +
geom_point(aes(y=central), pch=16, position=position_dodge(width = 0.90)) +
geom_errorbar(aes(ymin=lower95, ymax=upper95), na.rm=TRUE, position=position_dodge(width = 0.90)) +
geom_point(aes(y=central_t), colour='black', pch=18) +
coord_cartesian(ylim=c(0,20)) +
scale_colour_brewer(palette='Set1') +
scale_y_continuous(expand=c(0,0)) +
#scale_y_continuous(breaks=seq(0,3,0.5), minor_breaks=seq(0,3,0.1)) +
#scale_size_manual(values = seq(1.5, by=0.5, length.out=4)) +
#scale_shape_manual(values = c(0, 1, 5)) +
facet_wrap(~TEAM, ncol=3) +
labs(x='\nPANGEA data set', y='Estimated and true % Incidence\n', colour='Estimates based on') +
theme_bw() +
theme(legend.position='bottom', axis.text.x=element_text(size=5.5), panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4))
ggsave(file=paste(outdir,'/res_acrossTEAM_Primary_PcIncidence_2.pdf',sep=''), width=11, height=7, useDingbats=FALSE)
}
##--------------------------------------------------------------------------------------------------------
## olli 25.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.sortedIncidenceRatio<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_iii')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_iii')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, central_t)), by=c('SC_RND','TEAM'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_i', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_i')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
ggplot( tmp, aes(x=SC_RND, group=TEAM, colour=gsub('using\n','',DATA_T2))) +
geom_point(aes(y=central), pch=16, position=position_dodge(width = 0.90)) +
geom_errorbar(aes(ymin=lower95, ymax=upper95), na.rm=TRUE, position=position_dodge(width = 0.90)) +
geom_point(aes(y=central_t), colour='black', pch=18) +
coord_cartesian(ylim=c(0,1.99)) +
scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,3,0.5), minor_breaks=seq(0,3,0.1), expand=c(0,0)) +
#scale_size_manual(values = seq(1.5, by=0.5, length.out=4)) +
#scale_shape_manual(values = c(0, 1, 5)) +
facet_wrap(~TEAM, ncol=3) +
labs(x='\nPANGEA data set', y='Estimated and true incidence ratio\n', colour='Estimates based on') +
theme_bw() +
theme(legend.position='bottom', axis.text.x=element_text(size=5.5), panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4))
ggsave(file=paste(outdir,'/res_acrossTEAM_Primary_IncidenceRatio_2.pdf',sep=''), width=11, height=7, useDingbats=FALSE)
ggplot( tmp, aes(x=SC_RND, group=TEAM, colour=gsub('using\n','',DATA_T2))) +
geom_point(aes(y=central, shape=factor(OBJ_i, levels=c(-1,0,1), labels=c('declining','stable','increasing'))), position=position_dodge(width = 0.90)) +
geom_errorbar(aes(ymin=lower95, ymax=upper95), na.rm=TRUE, position=position_dodge(width = 0.90)) +
geom_point(aes(y=central_t), colour='black', pch=18) +
coord_cartesian(ylim=c(1.99,8)) +
scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,8,2), minor_breaks=seq(0,8,0.5)) +
#scale_size_manual(values = seq(1.5, by=0.5, length.out=4)) +
scale_shape_manual(values = c(0, 1, 5)) +
facet_wrap(~TEAM, ncol=3) +
labs(x='\nPANGEA data set', y='Outliers in estimated incidence ratio\n', colour='Estimates based on', shape='Estimated trend in incidence during intervention') +
theme_bw() +
theme(legend.position='bottom', axis.text.x=element_text(size=5.5), panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4))
ggsave(file=paste(outdir,'/res_acrossTEAM_Primary_IncidenceRatio_Outliers.pdf',sep=''), width=10, height=7)
}
##--------------------------------------------------------------------------------------------------------
## olli 27.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.AcutePercentvsSamplingCoverage<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & !grepl('(', TEAM,fixed=1) & OBJ%in%c('OBJ_v','OBJ_vi'), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
tmp2 <- subset(tmp, TEAM=='True', c(SC_RND, OBJ, central))
setnames(tmp2, c('central'), c('central_t'))
tmp <- merge(tmp, tmp2, by=c('SC_RND','OBJ'))
tmp <- subset(tmp, TEAM!='True')
ggplot(tmp, aes(x=SC_RND, y=abs(central-central_t), group=OBJ)) +
geom_point(aes(colour=factor(OBJ, levels=c('OBJ_v','OBJ_vi'), labels=c('just before intervention','after intervention'))), position=position_dodge(width = 0.6)) +
geom_boxplot(aes(fill=factor(OBJ, levels=c('OBJ_v','OBJ_vi'), labels=c('just before intervention','after intervention'))), colour='black', outlier.shape=NA, alpha=0.1) +
scale_y_continuous(expand=c(0,0), breaks=seq(0,30,10), minor_breaks=seq(0,30,5)) +
scale_colour_brewer(palette='Set1') +
scale_fill_brewer(palette='Set1') +
facet_grid(TEAM~SMPL_C, scales='free_x', space='free_x') +
theme_bw() + theme(legend.position='bottom', panel.grid.major.y=element_line(colour='grey70', size=0.4), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x="\nPANGEA data set", y='absolute error between\nestimated and true proportion of early transmissions\n', colour='Proportion early transmissions', fill='Proportion early transmissions')
ggsave(file=paste(outdir,'/res_acrossTEAM_Primary_AcutePercent_SamplingCoverage.pdf',sep=''), width=10, height=7)
}
##--------------------------------------------------------------------------------------------------------
## olli 26.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.sortedAcutePercent<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_v')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_v')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, DATAT_L, central_t)), by=c('SC_RND','TEAM','DATAT_L'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_iv', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_iv')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
tmp[, SMPL_C_BEFORE:= NA_character_]
set(tmp, tmp[, which(DATAT_L=='Village' & !is.na(SMPL_C))], 'SMPL_C_BEFORE', '<1%')
set(tmp, tmp[, which(DATAT_L=='Regional' & SMPL_C=='8%' & SMPL_M=='much')], 'SMPL_C_BEFORE', '4%')
set(tmp, tmp[, which(DATAT_L=='Regional' & SMPL_C=='16%' & SMPL_M=='much')], 'SMPL_C_BEFORE', '8%')
set(tmp, tmp[, which(DATAT_L=='Regional' & SMPL_M=='extreme')], 'SMPL_C_BEFORE', '1%')
tmp2 <- tmp[, which(!is.na(SMPL_C))]
set(tmp, tmp2, 'SMPL_C_BEFORE', tmp[tmp2, paste(SMPL_C_BEFORE,' just before intervention\n',SMPL_C,' after intervention',sep='')])
ggplot( tmp, aes(x=SC_RND, group=TEAM, colour=gsub('using\n','',DATA_T2))) +
geom_point(aes(y=central), position=position_dodge(width = 0.90), pch=16) +
geom_errorbar(aes(ymin=lower95, ymax=upper95), na.rm=TRUE, position=position_dodge(width = 0.90)) +
geom_point(aes(y=central_t), colour='black', pch=18) +
coord_cartesian(ylim=c(0,50)) +
scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,50,10), minor_breaks=seq(0,50,5)) +
scale_size_manual(values = c(1.5, 4, 1.5, 1.5, 4)) +
#scale_shape_manual(values = c(0, 1, 5, 6, 7)) +
facet_wrap(DATAT_L~TEAM, ncol=4, scale='free') +
labs(x='\nPANGEA data set', y='Estimated and true % early transmissions\njust before the intervention\n', colour='Estimates based on', shape='Sampling coverage', size='Sampling coverage') +
theme_bw() +
theme(legend.position='bottom', axis.text.x=element_text(size=8), panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
guides(shape=guide_legend(ncol=3), size=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Primary_PCearlyjustbefore_2.pdf',sep=''), width=11, height=7, useDingbats=FALSE)
ggplot( tmp, aes(x=central_t, y=central, colour=gsub('using\n','',DATA_T2))) +
geom_point(aes(shape=SMPL_C, size=SMPL_C)) +
geom_errorbar(aes(ymin=lower95, ymax=upper95), na.rm=TRUE) +
geom_abline(intercept=0, slope=1) +
coord_cartesian(ylim=c(0,50)) +
scale_colour_brewer(palette='Set1') +
scale_size_manual(values = seq(1.5, by=1, length.out=4)) +
scale_shape_manual(values = c(0, 1, 5, 6)) +
facet_grid(DATAT_L~TEAM) +
theme_bw() +
theme(legend.position='bottom', panel.grid.major=element_line(colour='grey70', size=0.4), panel.grid.minor=element_line(colour='grey70', size=0.4)) +
labs(x='\nTrue % early transmissions\njust before the intervention', y='Estimated % early transmissions\njust before the intervention\n', colour='Estimates based on', shape='Sampling coverage', size='Sampling coverage')
ggsave(file=paste(outdir,'/res_acrossTEAM_Primary_PCearlyjustbefore2.pdf',sep=''), width=10, height=7)
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_vi')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_vi')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, DATAT_L, central_t)), by=c('SC_RND','TEAM','DATAT_L'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_iv', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_iv')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
tmp[, SMPL_C_BEFORE:= NA_character_]
set(tmp, tmp[, which(DATAT_L=='Village' & !is.na(SMPL_C))], 'SMPL_C_BEFORE', '<1%')
set(tmp, tmp[, which(DATAT_L=='Regional' & SMPL_C=='8%' & SMPL_M=='much')], 'SMPL_C_BEFORE', '4%')
set(tmp, tmp[, which(DATAT_L=='Regional' & SMPL_C=='16%' & SMPL_M=='much')], 'SMPL_C_BEFORE', '8%')
set(tmp, tmp[, which(DATAT_L=='Regional' & SMPL_M=='extreme')], 'SMPL_C_BEFORE', '1%')
tmp2 <- tmp[, which(!is.na(SMPL_C))]
set(tmp, tmp2, 'SMPL_C_BEFORE', tmp[tmp2, paste(SMPL_C_BEFORE,' just before intervention\n',SMPL_C,' after intervention',sep='')])
ggplot( tmp, aes(x=SC_RND, group=TEAM, colour=gsub('using\n','',DATA_T2))) +
#geom_point(aes(y=central, shape=factor(OBJ_iv, levels=c(-1,0,1), labels=c('<15%','15-30%','>30%'))), position=position_dodge(width = 0.90)) +
geom_point(aes(y=central), position=position_dodge(width = 0.90), pch=16) +
geom_errorbar(aes(ymin=lower95, ymax=upper95), na.rm=TRUE, position=position_dodge(width = 0.90)) +
geom_point(aes(y=central_t), colour='black', pch=18) +
coord_cartesian(ylim=c(0,50)) +
scale_colour_brewer(palette='Set1') +
scale_y_continuous(breaks=seq(0,50,10), minor_breaks=seq(0,50,5)) +
#scale_size_manual(values = c(1.5, 4, 1.5, 1.5, 4)) +
#scale_shape_manual(values = c(0, 1, 5, 6, 7)) +
facet_wrap(DATAT_L~TEAM, ncol=4, scale='free') +
labs(x='\nPANGEA data set', y='Estimated and true % early transmissions\nafter the intervention\n', colour='Estimates based on', shape='Sampling coverage', size='Sampling coverage') +
theme_bw() +
theme(legend.position='bottom', axis.text.x=element_text(size=8), panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
guides(shape=guide_legend(ncol=3), size=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Primary_PCearlyafter_2.pdf',sep=''), width=11, height=7, useDingbats=FALSE)
}
##--------------------------------------------------------------------------------------------------------
## olli 25.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.CovariatesIncidenceRatio<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_iii')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_iii')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, central_t)), by=c('SC_RND','TEAM'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_i', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_i')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
tmp <- subset(tmp, !is.na(central), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, central_t))
tmp[, RES:= central-central_t]
tmp[, OUTLIER:= central>2]
dfc <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central), COR_LOG=cor(log(central_t), log(central)),
MAE=mean( abs(central-central_t) ), MAE_L=mean( abs(log(central)-log(central_t)) ),
MSE=mean( (central-central_t)^2 ), MSE_L=mean( (log(central)-log(central_t))^2 ),
ARME= mean( central-central_t ),
HAME_R=1/mean(1/(central/central_t)), ARME_L=mean(log(central)-log(central_t))), by='TEAM']
dfc2 <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central), COR_LOG=cor(log(central_t), log(central)),
MAE=mean( abs(central-central_t) ), MAE_L=mean( abs(log(central)-log(central_t)) ),
MSE=mean( (central-central_t)^2 ), MSE_L=mean( (log(central)-log(central_t))^2 ),
ARME= mean( central-central_t ),
HAME_R=1/mean(1/(central/central_t)), ARME_L=mean(log(central)-log(central_t))), by=c('DATAT_L','TEAM')]
require(gamlss)
bw.AIC <- vector('list', tmp[, length(unique(TEAM))])
names(bw.AIC) <- tmp[, unique(TEAM)]
bw.BIC <- vector('list', tmp[, length(unique(TEAM))])
names(bw.BIC) <- tmp[, unique(TEAM)]
for(x in names(bw.AIC))
{
cat('\nprocess',x)
df <- subset(tmp, TEAM==x)
set(df, NULL, c('lower95','upper95'), NULL)
if(x!='Cambridge')
mnoa <- gamlss(RES~DATAT_L+DATA_T+AC_T+INT_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D-1, data=df, family=NO, trace=FALSE)
if(x=='Cambridge')
mnoa <- gamlss(RES~AC_T+INT_T+IMPRT+SMPL_N-1, data=df, family=NO, trace=FALSE)
bw.AIC[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(df)))
}
dfcoeff <- tmp[, {
z <- names(coef(bw.AIC[[as.character(TEAM)]]))
z2 <- names(coef(bw.BIC[[as.character(TEAM)]]))
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( BWAIC=z, BWBIC=z2 )
}, by='TEAM']
}
##--------------------------------------------------------------------------------------------------------
## olli 26.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.CovariatesAcutePercent<- function(dfa, outdir)
{
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_v')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_v')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, DATAT_L, central_t)), by=c('SC_RND','DATAT_L','TEAM'), all=1)
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_iv', c(SC_RND, TEAM, central))
setnames(tmp2, 'central', 'OBJ_iv')
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp[, levels(SC_RND)], labels=tmp[, levels(SC_RND)]))
tmp <- merge(tmp, tmp2, by=c('SC_RND','TEAM'), all=1)
tmp <- subset(tmp, !is.na(central), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, central_t))
tmp[, RES:= central-central_t]
tmp[, OUTLIER:= central>60]
dfcb <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central),
MAE=mean( abs(central-central_t) ),
MSE=mean( (central-central_t)^2 ),
ARME=mean(central-central_t)), by=c('DATAT_L','TEAM')]
dfcbo <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central),
MAE=mean( abs(central-central_t) ),
MSE=mean( (central-central_t)^2 ),
ARME=mean(central-central_t)), by=c('TEAM')]
require(gamlss)
bw.AIC <- vector('list', tmp[, length(unique(TEAM))])
names(bw.AIC) <- tmp[, unique(TEAM)]
bw.BIC <- vector('list', tmp[, length(unique(TEAM))])
names(bw.BIC) <- tmp[, unique(TEAM)]
for(x in names(bw.AIC))
{
cat('\nprocess',x)
df <- subset(tmp, TEAM==x)
set(df, NULL, c('lower95','upper95'), NULL)
#stepGAIC.VR(mnoa, direction='forward', what='mu', trace=1, scope=~DATAT_L+DATA_T+AC_T+INT_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D)
if(x!='Cambridge') # exclude IMPRT SMPL_C
mnoa <- gamlss(RES~DATAT_L+DATA_T+AC_T+INT_T+SMPL_N+SMPL_M+SMPL_D-1, data=df, family=NO, trace=FALSE)
if(x=='Cambridge')
mnoa <- gamlss(RES~AC_T+INT_T+IMPRT+SMPL_N-1, data=df, family=NO, trace=FALSE)
bw.AIC[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(df)))
}
dfcoeffb <- tmp[, {
z <- summary(bw.AIC[[as.character(TEAM)]])[names(coef(bw.AIC[[as.character(TEAM)]])),'Pr(>|t|)']
z2 <- summary(bw.BIC[[as.character(TEAM)]])[names(coef(bw.BIC[[as.character(TEAM)]])),'Pr(>|t|)']
z <- sort(z)
z2 <- sort(z2)
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( BWAIC=names(z), BWAICp=z, BWBIC=names(z2), BWBICp=z2 )
}, by='TEAM']
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_vi')
tmp2 <- subset(dfa, TEAM=='True' & OBJ=='OBJ_vi')
setkey(tmp2, central)
set(tmp, NULL, 'SC_RND', factor(tmp[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
set(tmp2, NULL, 'SC_RND', factor(tmp2[,as.character(SC_RND)], levels=tmp2[, SC_RND], labels=tmp2[, SC_RND]))
setnames(tmp2, 'central', 'central_t')
tmp2[, TEAM:=NULL]
tmp2 <- merge( data.table(expand.grid(SC_RND=tmp[, unique(SC_RND)], TEAM=tmp[, unique(TEAM)])), tmp2, by='SC_RND')
tmp <- merge(tmp, subset(tmp2, select=c(SC_RND, TEAM, DATAT_L, central_t)), by=c('SC_RND','DATAT_L','TEAM'), all=1)
tmp <- subset(tmp, !is.na(central), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T, AC_T, INT_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, central_t))
tmp[, RES:= central-central_t]
tmp[, OUTLIER:= central>60]
dfca <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central),
MAE=mean( abs(central-central_t) ),
MSE=mean( (central-central_t)^2 ),
ARME=mean(central-central_t)), by=c('DATAT_L','TEAM')]
dfcao <- subset(tmp, !OUTLIER)[, list( COR=cor(central_t, central),
MAE=mean( abs(central-central_t) ),
MSE=mean( (central-central_t)^2 ),
ARME=mean(central-central_t)), by=c('TEAM')]
bw.AICR <- vector('list', tmp[, length(unique(TEAM))])
names(bw.AICR) <- tmp[, unique(TEAM)]
bw.BICR <- vector('list', tmp[, length(unique(TEAM))])
names(bw.BICR) <- tmp[, unique(TEAM)]
bw.AICV <- vector('list', tmp[, length(unique(TEAM))])
names(bw.AICV) <- tmp[, unique(TEAM)]
bw.BICV <- vector('list', tmp[, length(unique(TEAM))])
names(bw.BICV) <- tmp[, unique(TEAM)]
for(x in names(bw.AICR))
{
cat('\nprocess',x)
df <- subset(tmp, DATAT_L=='Village' & TEAM==x)
set(df, NULL, c('lower95','upper95'), NULL)
#stepGAIC.VR(mnoa, direction='forward', what='mu', trace=1, scope=~DATAT_L+DATA_T+AC_T+INT_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D)
if(x!='Cambridge') # exclude IMPRT SMPL_C
mnoa <- gamlss(RES~DATA_T+AC_T+INT_T+SMPL_N+SMPL_D-1, data=df, family=NO, trace=FALSE)
if(x=='Cambridge')
mnoa <- gamlss(RES~AC_T+INT_T+IMPRT+SMPL_N-1, data=df, family=NO, trace=FALSE)
bw.AICV[[x]]<- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BICV[[x]]<- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(df)))
df <- subset(tmp, DATAT_L=='Regional' & TEAM==x)
set(df, NULL, c('lower95','upper95'), NULL)
#stepGAIC.VR(mnoa, direction='forward', what='mu', trace=1, scope=~DATAT_L+DATA_T+AC_T+INT_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D)
if(x!='Cambridge') # exclude IMPRT SMPL_C
mnoa <- gamlss(RES~DATA_T+AC_T+INT_T+SMPL_N+SMPL_M+SMPL_D-1, data=df, family=NO, trace=FALSE)
if(x=='Cambridge')
mnoa <- gamlss(RES~AC_T+INT_T+IMPRT+SMPL_N-1, data=df, family=NO, trace=FALSE)
bw.AICR[[x]]<- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BICR[[x]]<- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(df)))
}
dfcoeffa <- tmp[, {
cat(TEAM, DATAT_L)
z <- z2 <- numeric(0)
if(DATAT_L=='Village' & !is.null(names(coef(bw.AICV[[as.character(TEAM)]]))))
{
z <- summary(bw.AICV[[as.character(TEAM)]])[,'Pr(>|t|)']
z <- z[ intersect(names(z), names(coef(bw.AICV[[as.character(TEAM)]]))) ]
}
if(DATAT_L=='Village' & !is.null(names(coef(bw.BICV[[as.character(TEAM)]]))))
{
z2 <- summary(bw.BICV[[as.character(TEAM)]])[,'Pr(>|t|)']
z2 <- z2[ intersect(names(z2), names(coef(bw.BICV[[as.character(TEAM)]]))) ]
}
if(DATAT_L=='Regional'& !is.null(names(coef(bw.AICR[[as.character(TEAM)]]))))
{
z <- summary(bw.AICR[[as.character(TEAM)]])[,'Pr(>|t|)']
z <- z[ intersect(names(z), names(coef(bw.AICR[[as.character(TEAM)]]))) ]
}
if(DATAT_L=='Regional'& !is.null(names(coef(bw.BICR[[as.character(TEAM)]]))))
{
z2 <- summary(bw.BICR[[as.character(TEAM)]])[,'Pr(>|t|)']
z2 <- z2[ intersect(names(z2), names(coef(bw.BICR[[as.character(TEAM)]]))) ]
}
z <- sort(z)
z2 <- sort(z2)
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( BWAIC=names(z), BWAICp=z, BWBIC=names(z2), BWBICp=z2 )
}, by=c('TEAM','DATAT_L')]
dfcoeffa <- subset(dfcoeffa, BWAICp<0.05 | BWBIC<.05)
}
##--------------------------------------------------------------------------------------------------------
## olli 02.12.15
##--------------------------------------------------------------------------------------------------------
grubbs.flag <- function(x)
{
outliers <- NULL
test <- x
gr <- grubbs.test(test)
pv <- gr$p.value
# throw an error if there are too few values for the Grubb's test
if (length(test) < 3 )
stop("Grubb's test requires > 2 input values")
while(pv < 0.05)
{
if( grepl('lowest',gr$alternative) )
{
outliers <- c(outliers, min(test))
test <- test[-which.min(test)]
}
if( grepl('highest',gr$alternative) )
{
outliers <- c(outliers, max(test))
test <- test[-which.max(test)]
}
# stop if all but two values are flagged as outliers
if (length(test) < 3 )
{
warning("All but two values flagged as outliers")
break
}
gr <- grubbs.test(test)
pv <- gr$p.value
}
return(data.frame(X=x, OUTLIER=(x %in% outliers)))
}
##--------------------------------------------------------------------------------------------------------
## olli 05.12.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.TreeSeq<- function(dfa, outdir)
{
require(exactRankTests)
dfo <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, DATAT_L, central, AC_T, INT_T,DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ, Pr_Phy, Pr_Seq))
dfo <- rbind( subset(dfo, Pr_Phy==1), subset(dfo, Pr_Seq==1) )
tmp <- dcast.data.table( subset(dfo, DATAT_L=='Village'), TEAM+DATAT_L+AC_T+INT_T+SMPL_C+SMPL_M+SMPL_D+OBJ~DATA_T, value.var='SC_RND')
dfo <- dcast.data.table(subset(dfo, DATAT_L=='Regional'), TEAM+DATAT_L+AC_T+INT_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+OBJ~DATA_T, value.var='SC_RND')
dfo <- rbind(dfo, tmp, use.names=TRUE, fill=TRUE)
dfo <- subset(dfo, !is.na(seq) & !is.na(phy))
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, OBJ, central, lower95, upper95))
setnames(tmp, c('SC_RND','central','lower95','upper95'), c('seq','SEQ_central','SEQ_lower95','SEQ_upper95'))
dfo <- merge(dfo, tmp, by=c('TEAM','OBJ','seq'), all.x=1)
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, OBJ, central, lower95, upper95))
setnames(tmp, c('SC_RND','central','lower95','upper95'), c('phy','PHY_central','PHY_lower95','PHY_upper95'))
dfo <- merge(dfo, tmp, by=c('TEAM','OBJ','phy'), all.x=1)
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & TEAM=='True', c(SC_RND, OBJ, central))
setnames(tmp, c('SC_RND','central'), c('seq','SEQ_True'))
dfo <- merge(dfo, tmp, by=c('OBJ','seq'), all.x=1)
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & TEAM=='True', c(SC_RND, OBJ, central))
setnames(tmp, c('SC_RND','central'), c('phy','PHY_True'))
dfo <- merge(dfo, tmp, by=c('OBJ','phy'), all.x=1)
# set residuals for incidence and incidence reduction
tmp <- dfo[, which(OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi'))]
set(dfo, tmp, 'SEQ_central', dfo[tmp, log(SEQ_central)-log(SEQ_True)])
set(dfo, tmp, 'SEQ_lower95', dfo[tmp, log(SEQ_lower95)-log(SEQ_True)])
set(dfo, tmp, 'SEQ_upper95', dfo[tmp, log(SEQ_upper95)-log(SEQ_True)])
set(dfo, tmp, 'PHY_central', dfo[tmp, log(PHY_central)-log(PHY_True)])
set(dfo, tmp, 'PHY_lower95', dfo[tmp, log(PHY_lower95)-log(PHY_True)])
set(dfo, tmp, 'PHY_upper95', dfo[tmp, log(PHY_upper95)-log(PHY_True)])
# set residuals for
if(0)
{
tmp <- dfo[, which(OBJ%in%c('OBJ_v','OBJ_vi'))]
set(dfo, tmp, 'SEQ_central', dfo[tmp, SEQ_central-SEQ_True])
set(dfo, tmp, 'SEQ_lower95', dfo[tmp, SEQ_lower95-SEQ_True])
set(dfo, tmp, 'SEQ_upper95', dfo[tmp, SEQ_upper95-SEQ_True])
set(dfo, tmp, 'PHY_central', dfo[tmp, PHY_central-PHY_True])
set(dfo, tmp, 'PHY_lower95', dfo[tmp, PHY_lower95-PHY_True])
set(dfo, tmp, 'PHY_upper95', dfo[tmp, PHY_upper95-PHY_True])
}
set(dfo, NULL, 'OBJ', dfo[, factor(OBJ, levels=c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi'), labels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'))])
ggplot(dfo, aes(x=seq, y=SEQ_central-PHY_central, ymin=SEQ_lower95-PHY_central, ymax=SEQ_upper95-PHY_central, colour=(SEQ_lower95>PHY_central | SEQ_upper95<PHY_central) )) +
geom_point() + geom_errorbar() + geom_boxplot(aes(group=OBJ), outlier.shape=NA, fill='transparent', colour='black') +
scale_colour_brewer(palette='Dark2', guide=FALSE) +
scale_y_continuous(breaks=seq(-4,4,2), minor_breaks=seq(-4,4,0.5)) +
facet_grid(TEAM~OBJ) + labs(x='\nPANGEA data set\n(top: data set containing sequences\nbottom: paired data set containing trees)',y='pairwise difference in error of estimates\nobtained from sequences and true trees\n') +
theme_bw() +
theme( panel.grid.major.y=element_line(colour='grey70', size=0.4), panel.grid.minor.y=element_line(colour='grey70', size=0.4))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_TreeSeq.pdf',sep=''), width=10, height=10)
#
dfo[,{
z <- t.test(SEQ_central, PHY_central, paired=TRUE)
list(TYPE='TTEST',NV=length(SEQ_central), PV=z$p.value, CIL=z$conf.int[1], CIU=z$conf.int[2])
}, by=c('TEAM')]
dfo[,{
z <- perm.test(1000*SEQ_central,1000*PHY_central,paired = TRUE,exact = TRUE)
list(TYPE='EXACTRANK',PV=z$p.value)
}, by=c('TEAM')]
# estimates on trees are sig different from those on sequences *only for Imperial*
}
##--------------------------------------------------------------------------------------------------------
## olli 07.12.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.Pol<- function(dfa, outdir)
{
dfpol <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='pol' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
setnames(dfpol, c('central','lower95','upper95'), c('POL_central','POL_lower95','POL_upper95'))
tmp <- merge(subset(dfa, USED_GENES=='all'), subset(dfpol, select=c('SC_RND','TEAM','OBJ')), by=c('SC_RND','TEAM','OBJ'))
setnames(tmp, c('central','lower95','upper95'), c('ALL_central','ALL_lower95','ALL_upper95'))
dfpol <- merge(dfpol, subset(tmp, select=c('SC_RND','TEAM','OBJ','ALL_central','ALL_lower95','ALL_upper95')), by=c('SC_RND','TEAM','OBJ'))
tmp <- subset(dfa, TEAM=='True', c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
setnames(tmp, c('central'), c('TRUE_central'))
dfpol <- merge(dfpol, subset(tmp, select=c('SC_RND','OBJ','TRUE_central')), by=c('SC_RND','OBJ'))
set(dfpol, NULL, 'POL_central', dfpol[, log(POL_central)-log(TRUE_central)] )
set(dfpol, NULL, 'POL_upper95', dfpol[, log(POL_upper95)-log(TRUE_central)] )
set(dfpol, NULL, 'POL_lower95', dfpol[, log(POL_lower95)-log(TRUE_central)] )
set(dfpol, NULL, 'ALL_central', dfpol[, log(ALL_central)-log(TRUE_central)] )
set(dfpol, NULL, 'ALL_upper95', dfpol[, log(ALL_upper95)-log(TRUE_central)] )
set(dfpol, NULL, 'ALL_lower95', dfpol[, log(ALL_lower95)-log(TRUE_central)] )
}
##--------------------------------------------------------------------------------------------------------
## olli 06.12.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.Errors.v151206<- function(dfa, outdir)
{
require(plsdepot)
require(pls)
require(outliers)
dfo <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
dfo <- dcast.data.table(dfo, SC_RND+TEAM+DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D~OBJ, value.var='central')
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & TEAM=='True', c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
set(tmp, NULL, 'OBJ', tmp[, paste(OBJ,'_t',sep='')])
tmp <- dcast.data.table(tmp, SC_RND~OBJ, value.var='central')
dfo <- merge(dfo, tmp, by='SC_RND')
set(dfo, NULL, 'DATAT_L', dfo[, as.numeric(factor(DATAT_L, levels=c("Regional","Village"), labels=c("Regional","Village")))])
set(dfo, NULL, 'SMPL_M', dfo[, as.numeric(SMPL_M)])
set(dfo, NULL, 'SMPL_D', dfo[, as.numeric(SMPL_D)])
set(dfo, NULL, 'DATA_T', dfo[, as.numeric(DATA_T)])
if(1)
{
set(dfo, dfo[, which(IMPRT!='20%')], 'IMPRT', '<=5%')
set(dfo, NULL, 'IMPRT', dfo[, as.numeric(factor(as.character(IMPRT), levels=c('<=5%','20%'), labels=c('<=5%','20%')))])
set(dfo, dfo[, which(SMPL_C%in%c('8%','30%'))], 'SMPL_C', '1x')
set(dfo, dfo[, which(SMPL_C%in%c('16%','60%'))], 'SMPL_C', '2x')
set(dfo, NULL, 'SMPL_C', dfo[, as.numeric(factor(as.character(SMPL_C), levels=c('1x','2x'), labels=c('1x','2x')))])
}
if(0)
{
set(dfo, NULL, 'SMPL_C', dfo[, as.numeric(gsub('%','',as.character(SMPL_C)))/100])
set(dfo, NULL, 'IMPRT', dfo[, as.numeric(gsub('%','',as.character(IMPRT)))/100])
}
setnames(dfo, c('OBJ_ii_t','OBJ_iii_t','OBJ_v_t','OBJ_vi_t'), c('INC_t','INCR_t','ACS_t','ACE_t'))
#dfo[, R_ii_1:= OBJ_ii-INC_t]
dfo[, R_ii:= log(OBJ_ii)-log(INC_t)]
#dfo[, R_iii_1:= OBJ_iii-INCR_t]
dfo[, R_iii:= log(OBJ_iii)-log(INCR_t)]
dfo[, R_v:= OBJ_v-ACS_t]
#dfo[, R_v_2:= log(OBJ_v)-log(ACS_t)]
dfo[, R_vi:= OBJ_vi-ACE_t]
#dfo[, R_vi_2:= log(OBJ_vi)-log(ACE_t)]
dfo <- subset(dfo, select=c(SC_RND, TEAM, DATAT_L, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACS_t, ACE_t, R_ii, R_iii, R_v, R_vi))
dfo <- melt(dfo, measure.vars=c('R_ii','R_iii','R_v','R_vi'), variable.name='OBJ', value.name='RESID')
set(dfo, NULL, 'OBJ', dfo[, factor(OBJ, levels=c('R_ii','R_iii','R_v','R_vi'), labels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'))])
dfo <- subset(dfo, !is.na(RESID))
# restrict to not Cambridge
dfo <- subset(dfo, TEAM!='Cambridge')
#
# do PLS on errors
#
plsms2 <- vector('list', dfo[, length(unique(TEAM))])
plsms3 <- vector('list', dfo[, length(unique(TEAM))])
plsms5 <- vector('list', dfo[, length(unique(TEAM))])
plsms6 <- vector('list', dfo[, length(unique(TEAM))])
names(plsms2) <- names(plsms3) <- names(plsms5) <- names(plsms6) <- dfo[, unique(TEAM)]
for(x in names(plsms2))
{
df <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
plsms2[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+INC_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
plsms3[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+INCR_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
plsms5[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+ACS_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
plsms6[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+ACE_t, data=df, validation='LOO')
}
# get variance explained
dfr <- as.data.table(melt(sapply(plsms2, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
dfr[, OBJ:='Incidence\nafter intervention']
tmp <- as.data.table(melt(sapply(plsms3, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Incidence reduction\nduring intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms5, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\njust before intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms6, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\nafter intervention']
dfr <- rbind(dfr, tmp)
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, OBJ, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:5), function(x){
do.call('rbind', list( data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms2[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms2[[TEAM]]$loading.weights[, x], OBJ='Incidence\nafter intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms3[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms3[[TEAM]]$loading.weights[, x], OBJ='Incidence reduction\nduring intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms5[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms5[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\njust before intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms6[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms6[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\nafter intervention'), by='TEAM'] ))
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
dfl <- subset(dfl, !is.nan(LOAD))
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','OBJ','LATENT')], by=c('TEAM','OBJ','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','OBJ','LATENT'))
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("INC_t","INCR_t","ACS_t","ACE_t","DATAT_L","DATA_T","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('True % incidence','True incidence ratio','True % early transmissions just before intervention','True % early transmissions after intervention','Simulation model','Data provided', 'Frequency of viral introductions','Sequences (#)','Sequence coverage','Proportion of sequences from after intervention start','Sampling duration after intervention start'))])
set(dfl, NULL, 'OBJ', dfl[, factor(OBJ, levels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'), labels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'))])
setkey(dfl, TEAM, OBJ, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','OBJ','X')], by=c('TEAM','OBJ','LATENTO','X'))
#do barplot
ggplot(subset(dfl, LATENTO%in%paste('lv',1:4,sep='')), aes(x=LATENTO, y=100*LOADcm, fill=X, alpha= factor(LATENTO%in%paste('lv',1:4,sep=''),levels=c(TRUE,FALSE),labels=c(1,0)) )) +
geom_bar(stat="identity", colour='black') +
scale_fill_manual(values=c("#762A83","#9970AB","#C2A5CF","#E7D4E8", "#80B1D3", "#FDB462", "#A6DBA0","#5AAE61","#1B7837")) +
scale_alpha_manual(values=c(1,0.2)) +
scale_x_discrete(labels=c("1","1-2","1-3","1-4")) +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(TEAM~OBJ) +
theme_bw() + theme(panel.margin = unit(0.8, "lines"), legend.position='bottom',panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Variable',y='variance in error explained\n(%)\n') +
guides(fill=guide_legend(ncol=3), alpha=FALSE)
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Errors_PLSbyLatentFactors_v2.pdf',sep=''), width=12, height=12)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','OBJ','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(TEAM~OBJ) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSvip_v2.pdf',sep=''), width=12, height=12)
#
# do the same without true values
#
plsms2 <- vector('list', dfo[, length(unique(TEAM))])
plsms3 <- vector('list', dfo[, length(unique(TEAM))])
plsms5 <- vector('list', dfo[, length(unique(TEAM))])
plsms6 <- vector('list', dfo[, length(unique(TEAM))])
names(plsms2) <- names(plsms3) <- names(plsms5) <- names(plsms6) <- dfo[, unique(TEAM)]
for(x in names(plsms2))
{
df <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
plsms2[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
plsms3[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
plsms5[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
plsms6[[x]] <- plsr(RESID~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
}
# get variance explained
dfr <- as.data.table(melt(sapply(plsms2, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
dfr[, OBJ:='Incidence\nafter intervention']
tmp <- as.data.table(melt(sapply(plsms3, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Incidence reduction\nduring intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms5, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\njust before intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms6, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\nafter intervention']
dfr <- rbind(dfr, tmp)
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, OBJ, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:5), function(x){
do.call('rbind', list( data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms2[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms2[[TEAM]]$loading.weights[, x], OBJ='Incidence\nafter intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms3[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms3[[TEAM]]$loading.weights[, x], OBJ='Incidence reduction\nduring intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms5[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms5[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\njust before intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms6[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms6[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\nafter intervention'), by='TEAM'] ))
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
dfl <- subset(dfl, !is.nan(LOAD))
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','OBJ','LATENT')], by=c('TEAM','OBJ','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','OBJ','LATENT'))
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("DATAT_L","DATA_T","INC_t","INCR_t","ACS_t","ACE_t","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('Simulation model','Data provided', 'True % incidence','True incidence ratio','True % early transmissions just before intervention','True % early transmissions after intervention','Viral introductions','Sequences (#)','Sequence coverage','Sequences from after intervention start','Sampling duration after intervention start'))])
setkey(dfl, TEAM, OBJ, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','OBJ','X')], by=c('TEAM','OBJ','LATENTO','X'))
#do barplot
ggplot(subset(dfl, LATENTO%in%paste('lv',1:4,sep='')), aes(x=LATENTO, y=100*LOADcm, fill=X, alpha= factor(LATENTO%in%paste('lv',1:4,sep=''),levels=c(TRUE,FALSE),labels=c(1,0)) )) +
geom_bar(stat="identity", colour='black') +
#scale_fill_brewer(palette='Spectral') +
scale_alpha_manual(values=c(1,0.2)) +
scale_x_discrete(labels=c("1","1-2","1-3","1-4")) +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(TEAM~OBJ) +
theme_bw() + theme(panel.margin = unit(0.8, "lines"), legend.position='bottom',panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Variable',y='variance in error explained\n(%)\n') +
guides(fill=guide_legend(ncol=3), alpha=FALSE)
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Errors_PLSbyLatentFactors_v3.pdf',sep=''), width=12, height=12)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','OBJ','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(TEAM~OBJ) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSvip_v3.pdf',sep=''), width=12, height=12)
#
# Do stepwise model selection in regression
#
require(gamlss)
bw.AIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC6 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC6 <- vector('list', dfo[, length(unique(TEAM))])
names(bw.AIC6) <- names(bw.AIC5) <- names(bw.AIC3) <-names(bw.AIC2) <- dfo[, unique(TEAM)]
names(bw.BIC6) <- names(bw.BIC5) <- names(bw.BIC3) <-names(bw.BIC2) <- dfo[, unique(TEAM)]
for(x in names(bw.AIC2))
{
cat('\nprocess',x)
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
mnoa <- gamlss(RESID~DATA_T+DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D+INC_t, data=tmp, family=NO, trace=FALSE)
bw.AIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
mnoa <- gamlss(RESID~DATA_T+DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D+INCR_t, data=tmp, family=NO, trace=FALSE)
bw.AIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
mnoa <- gamlss(RESID~DATA_T+DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D+ACS_t, data=tmp, family=NO, trace=FALSE)
bw.AIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
mnoa <- gamlss(RESID~DATA_T+DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D+ACE_t, data=tmp, family=NO, trace=FALSE)
bw.AIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
}
dforc <- dfo[, {
cat(as.character(TEAM), as.character(OBJ))
zz <- zy <- NULL
if(OBJ=='Incidence\nafter intervention')
{
zz <- bw.AIC2[[as.character(TEAM)]]
zy <- bw.BIC2[[as.character(TEAM)]]
}
if(OBJ=='Incidence reduction\nduring intervention')
{
zz <- bw.AIC3[[as.character(TEAM)]]
zy <- bw.BIC3[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\njust before intervention')
{
zz <- bw.AIC5[[as.character(TEAM)]]
zy <- bw.BIC5[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\nafter intervention')
{
zz <- bw.AIC6[[as.character(TEAM)]]
zy <- bw.BIC6[[as.character(TEAM)]]
}
z <- z2 <- numeric(0)
if(!is.null(names(coef(zz))))
{
z <- summary(zz)[,'Pr(>|t|)']
z <- z[ intersect(names(z), names(coef(zz))) ]
}
if(!is.null(names(coef(zy))))
{
z2 <- summary(zy)[,'Pr(>|t|)']
z2 <- z2[ intersect(names(z2), names(coef(zy))) ]
}
z <- sort(z)
z2 <- sort(z2)
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( AICn=as.character(names(z)), AICp=z, BICn=as.character(names(z2)), BICp=z2 )
}, by=c('TEAM','OBJ')]
dforc <- subset(dforc, AICp<0.05 | BICp<0.05)
dforc <- dcast.data.table(dforc, OBJ+BICn~TEAM, value.var='BICp')
dforc <- subset(dforc, !grepl('Intercept',BICn) & nchar(BICn)>0)
setkey(dforc, OBJ, BICn)
write.csv(dforc, row.names=FALSE, file=paste(outdir,'/res_acrossTEAM_Secondary_Errors.csv',sep=''))
}
##--------------------------------------------------------------------------------------------------------
## olli 02.12.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.Errors<- function(dfa, outdir)
{
require(plsdepot)
require(pls)
require(outliers)
dfo <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
dfo <- dcast.data.table(dfo, SC_RND+TEAM+DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D~OBJ, value.var='central')
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & TEAM=='True', c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
set(tmp, NULL, 'OBJ', tmp[, paste(OBJ,'_t',sep='')])
tmp <- dcast.data.table(tmp, SC_RND~OBJ, value.var='central')
dfo <- merge(dfo, tmp, by='SC_RND')
set(dfo, NULL, 'DATAT_L', dfo[, as.numeric(factor(DATAT_L))])
set(dfo, NULL, 'SMPL_M', dfo[, as.numeric(SMPL_M)])
set(dfo, NULL, 'SMPL_D', dfo[, as.numeric(SMPL_D)])
set(dfo, NULL, 'DATA_T', dfo[, as.numeric(DATA_T)])
set(dfo, NULL, 'SMPL_C', dfo[, as.numeric(gsub('%','',as.character(SMPL_C)))/100])
set(dfo, NULL, 'IMPRT', dfo[, as.numeric(gsub('%','',as.character(IMPRT)))/100])
setnames(dfo, c('OBJ_ii_t','OBJ_iii_t','OBJ_v_t','OBJ_vi_t'), c('INC_t','INCR_t','ACS_t','ACE_t'))
#dfo[, R_ii_1:= OBJ_ii-INC_t]
dfo[, R_ii:= log(OBJ_ii)-log(INC_t)]
#dfo[, R_iii_1:= OBJ_iii-INCR_t]
dfo[, R_iii:= log(OBJ_iii)-log(INCR_t)]
dfo[, R_v:= OBJ_v-ACS_t]
#dfo[, R_v_2:= log(OBJ_v)-log(ACS_t)]
dfo[, R_vi:= OBJ_vi-ACE_t]
#dfo[, R_vi_2:= log(OBJ_vi)-log(ACE_t)]
dfo <- subset(dfo, select=c(SC_RND, TEAM, DATAT_L, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACS_t, ACE_t, R_ii, R_iii, R_v, R_vi))
dfo <- melt(dfo, measure.vars=c('R_ii','R_iii','R_v','R_vi'), variable.name='OBJ', value.name='RESID')
set(dfo, NULL, 'OBJ', dfo[, factor(OBJ, levels=c('R_ii','R_iii','R_v','R_vi'), labels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'))])
dfo <- subset(dfo, !is.na(RESID))
# restrict to not Cambridge
dfo <- subset(dfo, TEAM!='Cambridge')
#
# do PLS on errors
#
plsms2 <- vector('list', dfo[, length(unique(TEAM))])
plsms3 <- vector('list', dfo[, length(unique(TEAM))])
plsms5 <- vector('list', dfo[, length(unique(TEAM))])
plsms6 <- vector('list', dfo[, length(unique(TEAM))])
names(plsms2) <- names(plsms3) <- names(plsms5) <- names(plsms6) <- dfo[, unique(TEAM)]
for(x in names(plsms2))
{
df <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
plsms2[[x]] <- plsr(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
plsms3[[x]] <- plsr(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
plsms5[[x]] <- plsr(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
plsms6[[x]] <- plsr(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
}
# get variance explained
dfr <- as.data.table(melt(sapply(plsms2, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
dfr[, OBJ:='Incidence\nafter intervention']
tmp <- as.data.table(melt(sapply(plsms3, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Incidence reduction\nduring intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms5, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\njust before intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms6, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\nafter intervention']
dfr <- rbind(dfr, tmp)
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, OBJ, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:10), function(x){
do.call('rbind', list( data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms2[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms2[[TEAM]]$loading.weights[, x], OBJ='Incidence\nafter intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms3[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms3[[TEAM]]$loading.weights[, x], OBJ='Incidence reduction\nduring intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms5[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms5[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\njust before intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms6[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms6[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\nafter intervention'), by='TEAM'] ))
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
dfl <- subset(dfl, !is.nan(LOAD))
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','OBJ','LATENT')], by=c('TEAM','OBJ','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','OBJ','LATENT'))
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("DATAT_L","DATA_T","ACE_t","INC_t","INCR_t","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('Simulation model','Data provided','%Acute', '%Incidence','Incidence ratio','Viral introductions','Sequences (#)','Sequence coverage','Sequences from after intervention start','Sampling duration after intervention start'))])
setkey(dfl, TEAM, OBJ, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','OBJ','X')], by=c('TEAM','OBJ','LATENTO','X'))
#do barplot
ggplot(subset(dfl, LATENTO%in%paste('lv',1:4,sep='')), aes(x=LATENTO, y=100*LOADcm, fill=X, alpha= factor(LATENTO%in%paste('lv',1:4,sep=''),levels=c(TRUE,FALSE),labels=c(1,0)) )) +
geom_bar(stat="identity", colour='black') +
#scale_fill_brewer(palette='Spectral') +
scale_alpha_manual(values=c(1,0.2)) +
scale_x_discrete(labels=c("1","1-2","1-3","1-4")) +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(TEAM~OBJ) +
theme_bw() + theme(panel.margin = unit(0.8, "lines"), legend.position='bottom',panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Variable',y='variance in error explained\n(%)\n') +
guides(fill=guide_legend(ncol=3), alpha=FALSE)
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Errors_PLSbyLatentFactors.pdf',sep=''), width=12, height=12)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','OBJ','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(TEAM~OBJ) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSvip.pdf',sep=''), width=12, height=12)
#
# Do stepwise model selection in regression
#
require(gamlss)
bw.AIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC6 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC6 <- vector('list', dfo[, length(unique(TEAM))])
names(bw.AIC6) <- names(bw.AIC5) <- names(bw.AIC3) <-names(bw.AIC2) <- dfo[, unique(TEAM)]
names(bw.BIC6) <- names(bw.BIC5) <- names(bw.BIC3) <-names(bw.BIC2) <- dfo[, unique(TEAM)]
for(x in names(bw.AIC2))
{
cat('\nprocess',x)
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
mnoa <- gamlss(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=NO, trace=FALSE)
bw.AIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
mnoa <- gamlss(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=NO, trace=FALSE)
bw.AIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
mnoa <- gamlss(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=NO, trace=FALSE)
bw.AIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
mnoa <- gamlss(RESID~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=NO, trace=FALSE)
bw.AIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
}
dforc <- dfo[, {
cat(as.character(TEAM), as.character(OBJ))
zz <- zy <- NULL
if(OBJ=='Incidence\nafter intervention')
{
zz <- bw.AIC2[[as.character(TEAM)]]
zy <- bw.BIC2[[as.character(TEAM)]]
}
if(OBJ=='Incidence reduction\nduring intervention')
{
zz <- bw.AIC3[[as.character(TEAM)]]
zy <- bw.BIC3[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\njust before intervention')
{
zz <- bw.AIC5[[as.character(TEAM)]]
zy <- bw.BIC5[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\nafter intervention')
{
zz <- bw.AIC6[[as.character(TEAM)]]
zy <- bw.BIC6[[as.character(TEAM)]]
}
z <- z2 <- numeric(0)
if(!is.null(names(coef(zz))))
{
z <- summary(zz)[,'Pr(>|t|)']
z <- z[ intersect(names(z), names(coef(zz))) ]
}
if(!is.null(names(coef(zy))))
{
z2 <- summary(zy)[,'Pr(>|t|)']
z2 <- z2[ intersect(names(z2), names(coef(zy))) ]
}
z <- sort(z)
z2 <- sort(z2)
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( AICn=as.character(names(z)), AICp=z, BICn=as.character(names(z2)), BICp=z2 )
}, by=c('TEAM','OBJ')]
dforc <- subset(dforc, AICp<0.05 | BICp<0.05)
dforc <- dcast.data.table(dforc, OBJ+BICn~TEAM, value.var='BICp')
dforc <- subset(dforc, !grepl('Intercept',BICn) & nchar(BICn)>0)
setkey(dforc, OBJ, BICn)
write.csv(dforc, row.names=FALSE, file=paste(outdir,'/res_acrossTEAM_Secondary_Errors.csv',sep=''))
}
##--------------------------------------------------------------------------------------------------------
## olli 06.12.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.Outliers.v151206<- function(dfa, outdir)
{
require(plsdepot)
require(pls)
require(outliers)
dfo <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
dfo <- dcast.data.table(dfo, SC_RND+TEAM+DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D~OBJ, value.var='central')
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & TEAM=='True', c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
set(tmp, NULL, 'OBJ', tmp[, paste(OBJ,'_t',sep='')])
tmp <- dcast.data.table(tmp, SC_RND~OBJ, value.var='central')
dfo <- merge(dfo, tmp, by='SC_RND')
set(dfo, NULL, 'DATAT_L', dfo[, as.numeric(factor(DATAT_L, levels=c("Regional","Village"), labels=c("Regional","Village")))])
set(dfo, NULL, 'SMPL_M', dfo[, as.numeric(SMPL_M)])
set(dfo, NULL, 'SMPL_D', dfo[, as.numeric(SMPL_D)])
set(dfo, NULL, 'DATA_T', dfo[, as.numeric(DATA_T)])
if(1)
{
set(dfo, dfo[, which(IMPRT!='20%')], 'IMPRT', '<=5%')
set(dfo, NULL, 'IMPRT', dfo[, as.numeric(factor(as.character(IMPRT), levels=c('<=5%','20%'), labels=c('<=5%','20%')))])
set(dfo, dfo[, which(SMPL_C%in%c('8%','30%'))], 'SMPL_C', '1x')
set(dfo, dfo[, which(SMPL_C%in%c('16%','60%'))], 'SMPL_C', '2x')
set(dfo, NULL, 'SMPL_C', dfo[, as.numeric(factor(as.character(SMPL_C), levels=c('1x','2x'), labels=c('1x','2x')))])
}
if(0)
{
set(dfo, NULL, 'SMPL_C', dfo[, as.numeric(gsub('%','',as.character(SMPL_C)))/100])
set(dfo, NULL, 'IMPRT', dfo[, as.numeric(gsub('%','',as.character(IMPRT)))/100])
}
setnames(dfo, c('OBJ_ii_t','OBJ_iii_t','OBJ_v_t','OBJ_vi_t'), c('INC_t','INCR_t','ACS_t','ACE_t'))
dfo[, R_ii_1:= OBJ_ii-INC_t]
dfo[, R_ii_2:= log(OBJ_ii)-log(INC_t)]
dfo[, R_iii_1:= OBJ_iii-INCR_t]
dfo[, R_iii_2:= log(OBJ_iii)-log(INCR_t)]
dfo[, R_v_1:= OBJ_v-ACS_t]
dfo[, R_v_2:= log(OBJ_v)-log(ACS_t)]
dfo[, R_vi_1:= OBJ_vi-ACE_t]
dfo[, R_vi_2:= log(OBJ_vi)-log(ACE_t)]
#
# Calculate outliers
#
set(dfo, NULL, c('R_ii_1','R_iii_1','R_v_2','R_vi_2'), NULL)
setnames(dfo, c('R_ii_2','R_iii_2','R_v_1','R_vi_1'), c('R_ii','R_iii','R_v','R_vi'))
dfo <- subset(dfo, select=c(SC_RND, TEAM, DATAT_L, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACS_t, ACE_t, R_ii, R_iii, R_v, R_vi))
dfo <- melt(dfo, measure.vars=c('R_ii','R_iii','R_v','R_vi'), variable.name='OBJ', value.name='RESID')
dfo <- subset(dfo, !is.na(RESID))
tmp <- dfo[, {
z <- 1.5*diff(quantile(RESID, p=c(0.25,0.75)))
z <- c(quantile(RESID, p=0.25)-z, quantile(RESID, p=0.25)+z)
list(TEAM=TEAM, SC_RND=SC_RND, OU_GR= as.data.table(grubbs.flag( RESID ))[, OUTLIER], OU_TK= RESID<z[1] | RESID>z[2] )
}, by='OBJ']
dfo <- merge(dfo, tmp, by=c('TEAM','OBJ','SC_RND'))
dfo[, OUTLIER:=0]
set(dfo, dfo[, which(OU_TK)], 'OUTLIER', 1)
set(dfo, dfo[, which(OU_GR)], 'OUTLIER', 2)
set(dfo, NULL, 'OUTLIER', dfo[, factor(OUTLIER, levels=c(0,1,2), labels=c('No','Mild','Extreme'))])
set(dfo, NULL, 'OBJ', dfo[, factor(OBJ, levels=c('R_ii','R_iii','R_v','R_vi'), labels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'))])
#
# do PLS on OU_TK as in http://link.springer.com/article/10.1007/s00439-003-0921-9#/page-1 or http://bib.oxfordjournals.org/content/8/1/32.full
#
set(dfo, NULL, 'OU_TK', dfo[, as.numeric(OU_TK)])
set(dfo, NULL, 'OU_GR', dfo[, as.numeric(OU_GR)])
# restrict to not Cambridge
dfo <- subset(dfo, TEAM!='Cambridge')
#
plsms2 <- vector('list', dfo[, length(unique(TEAM))])
plsms3 <- vector('list', dfo[, length(unique(TEAM))])
plsms5 <- vector('list', dfo[, length(unique(TEAM))])
plsms6 <- vector('list', dfo[, length(unique(TEAM))])
names(plsms2) <- names(plsms3) <- names(plsms5) <- names(plsms6) <- dfo[, unique(TEAM)]
for(x in names(plsms2))
{
df <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
plsms2[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+INC_t, data=df, validation='LOO', scale=FALSE)
df <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
plsms3[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+INCR_t, data=df, validation='LOO', scale=FALSE)
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
if(x=='ETH Zurich')
plsms5[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+ACS_t, data=df, validation='LOO', scale=TRUE)
if(x!='ETH Zurich')
plsms5[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+ACS_t, data=df, validation='LOO', scale=FALSE)
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
plsms6[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+ACE_t, data=df, validation='LOO', scale=FALSE)
}
# get variance explained
dfr <- as.data.table(melt(sapply(plsms2, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
dfr[, OBJ:='Incidence\nafter intervention']
tmp <- as.data.table(melt(sapply(plsms3, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Incidence reduction\nduring intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms5, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\njust before intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms6, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\nafter intervention']
dfr <- rbind(dfr, tmp)
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, OBJ, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect coefficients of first x latent factors
tmp <- do.call('rbind',lapply(paste(1:5,'comps'), function(x){
do.call('rbind', list( data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms2[[TEAM]]$coefficients[,, '1 comps']), LATENT=x, CBETA=plsms2[[TEAM]]$coefficients[,, x], OBJ='Incidence\nafter intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms3[[TEAM]]$coefficients[,, '1 comps']), LATENT=x, CBETA=plsms3[[TEAM]]$coefficients[,, x], OBJ='Incidence reduction\nduring intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms5[[TEAM]]$coefficients[,, '1 comps']), LATENT=x, CBETA=plsms5[[TEAM]]$coefficients[,, x], OBJ='Proportion of early transmissions\njust before intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms6[[TEAM]]$coefficients[,, '1 comps']), LATENT=x, CBETA=plsms6[[TEAM]]$coefficients[,, x], OBJ='Proportion of early transmissions\nafter intervention'), by='TEAM'] ))
}))
tmp <- subset(tmp, !is.nan(CBETA))
set(tmp, NULL, 'LATENT', tmp[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(tmp, dfr, by=c('TEAM','OBJ','LATENT'))
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:5), function(x){
do.call('rbind', list( data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms2[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms2[[TEAM]]$loading.weights[, x], OBJ='Incidence\nafter intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms3[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms3[[TEAM]]$loading.weights[, x], OBJ='Incidence reduction\nduring intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms5[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms5[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\njust before intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms6[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms6[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\nafter intervention'), by='TEAM'] ))
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
dfl <- subset(dfl, !is.nan(LOAD))
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','OBJ','LATENT')], by=c('TEAM','OBJ','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','OBJ','LATENT','X'), all=1)
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("INC_t","INCR_t","ACS_t","ACE_t","DATAT_L","DATA_T","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('True % incidence','True incidence ratio','True % early transmissions just before intervention','True % early transmissions after intervention',
'Village simulation model vs. Regional model','Data provided', 'Frequency of viral introductions 20%/year vs. <=5%/year','Sequences (#)','High sequence coverage (80% for Village, 16% for Regional) vs. lower coverage (40% for Village, 8% for Regional)','Proportion of sequences from after intervention start >80% vs. 50%','Sampling duration after intervention start'))])
set(dfl, NULL, 'OBJ', dfl[, factor(OBJ, levels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'), labels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'))])
setkey(dfl, TEAM, OBJ, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','OBJ','X')], by=c('TEAM','OBJ','LATENTO','X'))
dfl[, CBS:= factor(sign(CBETA), levels=c(-1,0,1),labels=c('+','','-'))]
setkey(dfl, OBJ, TEAM, LATENTO, X)
dfl <- merge(dfl, dfl[, list(X=X, POS= cumsum(LOADcm)-0.5*LOADcm), by=c('OBJ','TEAM','LATENTO')],by=c('OBJ','TEAM','LATENTO','X'))
#do barplot
ggplot(subset(dfl, LATENTO%in%paste('lv',1:4,sep='')), aes(x=LATENTO, y=100*LOADcm, fill=X)) +
geom_bar(stat="identity", colour='black') +
scale_fill_manual(values=c("#762A83","#9970AB","#C2A5CF","#E7D4E8", "#80B1D3", "#FDB462", "#A6DBA0","#5AAE61","#1B7837")) +
#scale_fill_brewer(palette='PRGn') +
scale_x_discrete(labels=c('1','1-2','1-3','1-4')) +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(TEAM~OBJ) +
theme_bw() + theme(panel.margin = unit(0.8, "lines"), legend.position='bottom', panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Strong error\npredictor',y='variance in outlier presence explained\n(%)\n') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSbyLatentFactors_v2.pdf',sep=''), width=12, height=12)
# simpler barplot
ggplot(subset(dfl, LATENTO=='lv4'), aes(x=TEAM)) +
geom_bar(aes(y=100*LOADcm, fill=X), stat="identity", colour='black') +
scale_fill_manual(values=c("#762A83","#9970AB","#C2A5CF","#E7D4E8", "#80B1D3", "#FDB462", "#A6DBA0","#5AAE61","#1B7837")) +
geom_text(aes(y=100*POS, label=CBS)) +
#scale_fill_brewer(palette='PRGn') +
#scale_x_discrete(labels=c('1','1-2','1-3','1-4')) +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(~OBJ) +
coord_flip() +
theme_bw() + theme(panel.margin = unit(0.8, "lines"), legend.position='bottom', panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='',fill='Strong error\npredictor',y='\nvariance in outlier presence explained\n(%)') +
guides(fill=guide_legend(ncol=2))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSbyLatentFactors_v3.pdf',sep=''), width=12, height=5)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','OBJ','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(TEAM~OBJ) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSbyLatentFactors_v2.pdf',sep=''), width=12, height=5)
#
# repeat without true values
#
plsms2 <- vector('list', dfo[, length(unique(TEAM))])
plsms3 <- vector('list', dfo[, length(unique(TEAM))])
plsms5 <- vector('list', dfo[, length(unique(TEAM))])
plsms6 <- vector('list', dfo[, length(unique(TEAM))])
names(plsms2) <- names(plsms3) <- names(plsms5) <- names(plsms6) <- dfo[, unique(TEAM)]
for(x in names(plsms2))
{
df <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
plsms2[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
plsms3[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
plsms5[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
plsms6[[x]] <- plsr(OU_TK~DATAT_L+IMPRT+SMPL_C+SMPL_M+SMPL_D, data=df, validation='LOO')
}
# get variance explained
dfr <- as.data.table(melt(sapply(plsms2, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
dfr[, OBJ:='Incidence\nafter intervention']
tmp <- as.data.table(melt(sapply(plsms3, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Incidence reduction\nduring intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms5, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\njust before intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms6, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\nafter intervention']
dfr <- rbind(dfr, tmp)
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, OBJ, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:5), function(x){
do.call('rbind', list( data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms2[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms2[[TEAM]]$loading.weights[, x], OBJ='Incidence\nafter intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms3[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms3[[TEAM]]$loading.weights[, x], OBJ='Incidence reduction\nduring intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms5[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms5[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\njust before intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms6[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms6[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\nafter intervention'), by='TEAM'] ))
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
dfl <- subset(dfl, !is.nan(LOAD))
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','OBJ','LATENT')], by=c('TEAM','OBJ','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','OBJ','LATENT'))
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("DATAT_L","DATA_T","INC_t","INCR_t","ACS_t","ACE_t","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('Simulation model','Data provided', 'True % incidence','True incidence ratio','True % early transmissions just before intervention','True % early transmissions after intervention','Viral introductions','Sequences (#)','Sequence coverage','Sequences from after intervention start','Sampling duration after intervention start'))])
setkey(dfl, TEAM, OBJ, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','OBJ','X')], by=c('TEAM','OBJ','LATENTO','X'))
#do barplot
ggplot(subset(dfl, LATENTO%in%paste('lv',1:4,sep='')), aes(x=LATENTO, y=100*LOADcm, fill=X)) +
geom_bar(stat="identity", colour='black') +
#scale_fill_brewer(palette='Spectral') +
scale_x_discrete(labels=c('1','1-2','1-3','1-4')) +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(TEAM~OBJ) +
theme_bw() + theme(panel.margin = unit(0.8, "lines"), legend.position='bottom', panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Predictor',y='variance in outlier status explained\n(%)\n') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSbyLatentFactors_v3.pdf',sep=''), width=12, height=12)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','OBJ','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(TEAM~OBJ) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSvip_v2.pdf',sep=''), width=12, height=12)
#
# Do stepwise model selection in regression
#
require(gamlss)
bw.AIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC6 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC6 <- vector('list', dfo[, length(unique(TEAM))])
names(bw.AIC6) <- names(bw.AIC5) <- names(bw.AIC3) <-names(bw.AIC2) <- dfo[, unique(TEAM)]
names(bw.BIC6) <- names(bw.BIC5) <- names(bw.BIC3) <-names(bw.BIC2) <- dfo[, unique(TEAM)]
for(x in names(bw.AIC2))
{
cat('\nprocess',x)
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
}
dfoc <- dfo[, {
cat(as.character(TEAM), as.character(OBJ))
zz <- zy <- NULL
if(OBJ=='Incidence\nafter intervention')
{
zz <- bw.AIC2[[as.character(TEAM)]]
zy <- bw.BIC2[[as.character(TEAM)]]
}
if(OBJ=='Incidence reduction\nduring intervention')
{
zz <- bw.AIC3[[as.character(TEAM)]]
zy <- bw.BIC3[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\njust before intervention')
{
zz <- bw.AIC5[[as.character(TEAM)]]
zy <- bw.BIC5[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\nafter intervention')
{
zz <- bw.AIC6[[as.character(TEAM)]]
zy <- bw.BIC6[[as.character(TEAM)]]
}
z <- z2 <- numeric(0)
if(!is.null(names(coef(zz))))
{
z <- summary(zz)[,'Pr(>|t|)']
z <- z[ intersect(names(z), names(coef(zz))) ]
}
if(!is.null(names(coef(zy))))
{
z2 <- summary(zy)[,'Pr(>|t|)']
z2 <- z2[ intersect(names(z2), names(coef(zy))) ]
}
z <- sort(z)
z2 <- sort(z2)
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( AICn=as.character(names(z)), AICp=z, BICn=as.character(names(z2)), BICp=z2 )
}, by=c('TEAM','OBJ')]
subset(dfoc, AICp<0.05 | BICp<0.05)
subset(dfo, OU_TK & TEAM=='Cambridge/Imperial')
subset(dfo, OU_TK & TEAM=='Imperial')
subset(dfo, OU_TK & TEAM=='British Columbia')
dfo[, table(OU_GR,OU_TK)]
dfo[, table(OU_TK,OBJ,as.character(TEAM))]
dfo[, table(OU_TK,OBJ)]
subset(dfo, OU_TK & OBJ%in%c('R_ii','R_vi'))[, table(SC_RND,as.character(TEAM))]
subset(dfo, OU_TK & OBJ%in%c('R_ii','R_vi'))[, table(SC_RND,OBJ)]
subset(dfo, OU_TK)
subset(dfo, OU_GR)
}
##--------------------------------------------------------------------------------------------------------
## olli 02.12.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.Outliers<- function(dfa, outdir)
{
require(plsdepot)
require(pls)
require(outliers)
dfo <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1), c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
dfo <- dcast.data.table(dfo, SC_RND+TEAM+DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D~OBJ, value.var='central')
tmp <- subset(dfa, OBJ%in%c('OBJ_ii','OBJ_iii','OBJ_v','OBJ_vi') & TEAM=='True', c(SC_RND, TEAM, DATAT_L, central, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, OBJ))
set(tmp, NULL, 'OBJ', tmp[, paste(OBJ,'_t',sep='')])
tmp <- dcast.data.table(tmp, SC_RND~OBJ, value.var='central')
dfo <- merge(dfo, tmp, by='SC_RND')
set(dfo, NULL, 'DATAT_L', dfo[, as.numeric(factor(DATAT_L))])
set(dfo, NULL, 'SMPL_M', dfo[, as.numeric(SMPL_M)])
set(dfo, NULL, 'SMPL_D', dfo[, as.numeric(SMPL_D)])
set(dfo, NULL, 'DATA_T', dfo[, as.numeric(DATA_T)])
set(dfo, NULL, 'SMPL_C', dfo[, as.numeric(gsub('%','',as.character(SMPL_C)))/100])
set(dfo, NULL, 'IMPRT', dfo[, as.numeric(gsub('%','',as.character(IMPRT)))/100])
setnames(dfo, c('OBJ_ii_t','OBJ_iii_t','OBJ_v_t','OBJ_vi_t'), c('INC_t','INCR_t','ACS_t','ACE_t'))
dfo[, R_ii_1:= OBJ_ii-INC_t]
dfo[, R_ii_2:= log(OBJ_ii)-log(INC_t)]
dfo[, R_iii_1:= OBJ_iii-INCR_t]
dfo[, R_iii_2:= log(OBJ_iii)-log(INCR_t)]
dfo[, R_v_1:= OBJ_v-ACS_t]
dfo[, R_v_2:= log(OBJ_v)-log(ACS_t)]
dfo[, R_vi_1:= OBJ_vi-ACE_t]
dfo[, R_vi_2:= log(OBJ_vi)-log(ACE_t)]
tmp2 <- melt(dfo, measure.vars=c('R_ii_1','R_ii_2'))
ggplot(tmp2, aes(x=value, colour=TEAM)) + geom_density() + facet_grid(~variable) + coord_cartesian(xlim=c(-10,10))
# use R_ii_2
tmp2 <- melt(dfo, measure.vars=c('R_iii_1','R_iii_2'))
ggplot(tmp2, aes(x=value)) + geom_histogram(aes(colour=TEAM), fill='transparent') + geom_density(colour='black') + facet_grid(TEAM~variable, scales='free_x')
# use R_iii_2
tmp2 <- melt(dfo, measure.vars=c('R_v_1','R_v_2'))
ggplot(tmp2, aes(x=value)) + geom_histogram(aes(colour=TEAM), fill='transparent') + geom_density(colour='black') + facet_grid(TEAM~variable, scales='free_x')
# use R_v_1
tmp2 <- melt(dfo, measure.vars=c('R_vi_1','R_vi_2'))
ggplot(tmp2, aes(x=value)) + geom_histogram(aes(colour=TEAM), fill='transparent') + geom_density(colour='black') + facet_grid(TEAM~variable, scales='free_x')
# use R_vi_1
#
# Calculate outliers
#
set(dfo, NULL, c('R_ii_1','R_iii_1','R_v_2','R_vi_2'), NULL)
setnames(dfo, c('R_ii_2','R_iii_2','R_v_1','R_vi_1'), c('R_ii','R_iii','R_v','R_vi'))
dfo <- subset(dfo, select=c(SC_RND, TEAM, DATAT_L, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACS_t, ACE_t, R_ii, R_iii, R_v, R_vi))
dfo <- melt(dfo, measure.vars=c('R_ii','R_iii','R_v','R_vi'), variable.name='OBJ', value.name='RESID')
dfo <- subset(dfo, !is.na(RESID))
tmp <- dfo[, {
z <- 1.5*diff(quantile(RESID, p=c(0.25,0.75)))
z <- c(quantile(RESID, p=0.25)-z, quantile(RESID, p=0.25)+z)
list(TEAM=TEAM, SC_RND=SC_RND, OU_GR= as.data.table(grubbs.flag( RESID ))[, OUTLIER], OU_TK= RESID<z[1] | RESID>z[2] )
}, by='OBJ']
dfo <- merge(dfo, tmp, by=c('TEAM','OBJ','SC_RND'))
dfo[, OUTLIER:=0]
set(dfo, dfo[, which(OU_TK)], 'OUTLIER', 1)
set(dfo, dfo[, which(OU_GR)], 'OUTLIER', 1)
set(dfo, NULL, 'OUTLIER', dfo[, factor(OUTLIER, levels=c(0,1), labels=c('No','Yes'))])
set(dfo, NULL, 'OBJ', dfo[, factor(OBJ, levels=c('R_ii','R_iii','R_v','R_vi'), labels=c('Incidence\nafter intervention','Incidence reduction\nduring intervention','Proportion of early transmissions\njust before intervention','Proportion of early transmissions\nafter intervention'))])
ggplot(dfo, aes(y=TEAM, x=RESID, label=SC_RND, colour=OUTLIER, size=OUTLIER)) +
geom_text(position=position_jitter(width=0, height=0.9)) +
scale_colour_manual(values=c('grey20','red3')) +
scale_size_manual(values=c(2.2,3)) +
theme_bw() + facet_grid(~OBJ, scales='free') +
labs(x='\nerror in phylogenetic estimate',y='',size='Outlier',colour='Outlier')
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers.pdf',sep=''), width=12, height=5, useDingbats=FALSE)
ggplot(dfo, aes(y=TEAM, x=RESID, label=SC_RND, colour=OUTLIER, size=OUTLIER)) +
geom_point(data=subset(dfo, OUTLIER=='No'), position=position_jitter(width=0, height=0.9)) +
geom_text(data=subset(dfo, OUTLIER=='Yes'), position=position_jitter(width=0, height=0.9)) +
scale_colour_manual(values=c('grey40','red3')) +
scale_size_manual(values=c(0.8,3)) +
theme_bw() + facet_grid(~OBJ, scales='free') +
labs(x='\nerror in phylogenetic estimate',y='',size='Outlier',colour='Outlier')
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers2.pdf',sep=''), width=12, height=5, useDingbats=FALSE)
#
# do PLS on OU_TK as in http://link.springer.com/article/10.1007/s00439-003-0921-9#/page-1 or http://bib.oxfordjournals.org/content/8/1/32.full
#
set(dfo, NULL, 'OU_TK', dfo[, as.numeric(OU_TK)])
set(dfo, NULL, 'OU_GR', dfo[, as.numeric(OU_GR)])
# restrict to not Cambridge
dfo <- subset(dfo, TEAM!='Cambridge')
plsms2 <- vector('list', dfo[, length(unique(TEAM))])
plsms3 <- vector('list', dfo[, length(unique(TEAM))])
plsms5 <- vector('list', dfo[, length(unique(TEAM))])
plsms6 <- vector('list', dfo[, length(unique(TEAM))])
names(plsms2) <- names(plsms3) <- names(plsms5) <- names(plsms6) <- dfo[, unique(TEAM)]
for(x in names(plsms2))
{
df <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
plsms2[[x]] <- plsr(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
plsms3[[x]] <- plsr(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
plsms5[[x]] <- plsr(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
df <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
plsms6[[x]] <- plsr(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
}
# get variance explained
dfr <- as.data.table(melt(sapply(plsms2, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
dfr[, OBJ:='Incidence\nafter intervention']
tmp <- as.data.table(melt(sapply(plsms3, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Incidence reduction\nduring intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms5, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\njust before intervention']
dfr <- rbind(dfr, tmp)
tmp <- as.data.table(melt(sapply(plsms6, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
tmp[, OBJ:='Proportion of early transmissions\nafter intervention']
dfr <- rbind(dfr, tmp)
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, OBJ, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM','OBJ')], by=c('TEAM','OBJ','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:10), function(x){
do.call('rbind', list( data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms2[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms2[[TEAM]]$loading.weights[, x], OBJ='Incidence\nafter intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms3[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms3[[TEAM]]$loading.weights[, x], OBJ='Incidence reduction\nduring intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms5[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms5[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\njust before intervention'), by='TEAM'],
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms6[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms6[[TEAM]]$loading.weights[, x], OBJ='Proportion of early transmissions\nafter intervention'), by='TEAM'] ))
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
dfl <- subset(dfl, !is.nan(LOAD))
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','OBJ','LATENT')], by=c('TEAM','OBJ','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','OBJ','LATENT'))
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("DATAT_L","DATA_T","ACE_t","INC_t","INCR_t","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('Simulation model','Data provided','%Acute', '%Incidence','Incidence ratio','Viral introductions','Sequences (#)','Sequence coverage','Sequences from after intervention start','Sampling duration after intervention start'))])
setkey(dfl, TEAM, OBJ, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','OBJ','X')], by=c('TEAM','OBJ','LATENTO','X'))
#do barplot
ggplot(subset(dfl, LATENTO%in%paste('lv',1:4,sep='')), aes(x=LATENTO, y=100*LOADcm, fill=X)) +
geom_bar(stat="identity", colour='black') +
#scale_fill_brewer(palette='Spectral') +
scale_x_discrete(labels=c('1','1-2','1-3','1-4')) +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(TEAM~OBJ) +
theme_bw() + theme(panel.margin = unit(0.8, "lines"), legend.position='bottom', panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Variable',y='variance in outlier status explained\n(%)\n') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSbyLatentFactors.pdf',sep=''), width=12, height=12)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','OBJ','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(TEAM~OBJ) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_Outliers_PLSvip.pdf',sep=''), width=12, height=12)
#
# Do stepwise model selection in regression
#
require(gamlss)
bw.AIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC2 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC3 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC5 <- vector('list', dfo[, length(unique(TEAM))])
bw.AIC6 <- vector('list', dfo[, length(unique(TEAM))])
bw.BIC6 <- vector('list', dfo[, length(unique(TEAM))])
names(bw.AIC6) <- names(bw.AIC5) <- names(bw.AIC3) <-names(bw.AIC2) <- dfo[, unique(TEAM)]
names(bw.BIC6) <- names(bw.BIC5) <- names(bw.BIC3) <-names(bw.BIC2) <- dfo[, unique(TEAM)]
for(x in names(bw.AIC2))
{
cat('\nprocess',x)
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence\nafter intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC2[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Incidence reduction\nduring intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC3[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\njust before intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC5[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
tmp <- subset(dfo, TEAM==x & OBJ=='Proportion of early transmissions\nafter intervention')
mnoa <- gamlss(OU_TK~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=tmp, family=BI, trace=FALSE)
bw.AIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE)
bw.BIC6[[x]] <- stepGAIC.VR(mnoa, direction='backward', what='mu', trace=FALSE, k=log(nrow(tmp)))
}
dfoc <- dfo[, {
cat(as.character(TEAM), as.character(OBJ))
zz <- zy <- NULL
if(OBJ=='Incidence\nafter intervention')
{
zz <- bw.AIC2[[as.character(TEAM)]]
zy <- bw.BIC2[[as.character(TEAM)]]
}
if(OBJ=='Incidence reduction\nduring intervention')
{
zz <- bw.AIC3[[as.character(TEAM)]]
zy <- bw.BIC3[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\njust before intervention')
{
zz <- bw.AIC5[[as.character(TEAM)]]
zy <- bw.BIC5[[as.character(TEAM)]]
}
if(OBJ=='Proportion of early transmissions\nafter intervention')
{
zz <- bw.AIC6[[as.character(TEAM)]]
zy <- bw.BIC6[[as.character(TEAM)]]
}
z <- z2 <- numeric(0)
if(!is.null(names(coef(zz))))
{
z <- summary(zz)[,'Pr(>|t|)']
z <- z[ intersect(names(z), names(coef(zz))) ]
}
if(!is.null(names(coef(zy))))
{
z2 <- summary(zy)[,'Pr(>|t|)']
z2 <- z2[ intersect(names(z2), names(coef(zy))) ]
}
z <- sort(z)
z2 <- sort(z2)
length(z) <- max(length(z),length(z2))
length(z2) <- max(length(z),length(z2))
list( AICn=as.character(names(z)), AICp=z, BICn=as.character(names(z2)), BICp=z2 )
}, by=c('TEAM','OBJ')]
subset(dfoc, AICp<0.05 | BICp<0.05)
subset(dfo, OU_TK & TEAM=='Cambridge/Imperial')
subset(dfo, OU_TK & TEAM=='Imperial')
subset(dfo, OU_TK & TEAM=='British Columbia')
dfo[, table(OU_GR,OU_TK)]
dfo[, table(OU_TK,OBJ,as.character(TEAM))]
dfo[, table(OU_TK,OBJ)]
subset(dfo, OU_TK & OBJ%in%c('R_ii','R_vi'))[, table(SC_RND,as.character(TEAM))]
subset(dfo, OU_TK & OBJ%in%c('R_ii','R_vi'))[, table(SC_RND,OBJ)]
subset(dfo, OU_TK)
subset(dfo, OU_GR)
}
##--------------------------------------------------------------------------------------------------------
## olli 26.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.PLSIncidence<- function(dfa, outdir)
{
require(plsdepot)
require(pls)
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_ii')
tmp2 <- dcast.data.table(subset(dfa, TEAM=='True', c(SC_RND, OBJ, central)), SC_RND~OBJ, value.var='central')
setnames(tmp2, c('OBJ_ii','OBJ_iii','OBJ_vi'), c('INC_t','INCR_t','ACE_t'))
tmp <- merge(tmp, tmp2, by='SC_RND')
tmp <- subset(tmp, !is.na(central), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACE_t))
set(tmp, NULL, 'DATAT_L', tmp[, as.numeric(factor(DATAT_L))])
set(tmp, NULL, 'SMPL_M', tmp[, as.numeric(SMPL_M)])
set(tmp, NULL, 'SMPL_D', tmp[, as.numeric(SMPL_D)])
set(tmp, NULL, 'DATA_T', tmp[, as.numeric(DATA_T)])
set(tmp, NULL, 'SMPL_C', tmp[, as.numeric(gsub('%','',as.character(SMPL_C)))/100])
set(tmp, NULL, 'IMPRT', tmp[, as.numeric(gsub('%','',as.character(IMPRT)))/100])
tmp[, RES:= central-INC_t]
plsms <- vector('list', tmp[, length(unique(TEAM))])
names(plsms) <- tmp[, unique(TEAM)]
for(x in names(plsms))
{
df <- subset(tmp, TEAM==x)
plsms[[x]] <- plsr(RES~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
}
sapply(plsms, explvar)
#first component explains 99% of variance in X
tmp2 <- sapply(plsms, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) )
tmp2$Cambridge <- c(tmp2$Cambridge, {x<- rep(NA, 8); names(x)<- paste(3:10,'comps'); x})
dfr <- as.data.table(melt(t(do.call('rbind',tmp2)), varnames=c('LATENT','TEAM'), value.name='R2'))
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by='TEAM'], by=c('TEAM','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM')], by=c('TEAM','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:10), function(x){
tmp2 <- NA_real_
if(x%in%colnames(plsms[[TEAM]]$loading.weights))
tmp2 <- plsms[[TEAM]]$loading.weights[, x]
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=tmp2), by='TEAM']
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','LATENT')], by=c('TEAM','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','LATENT'))
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("DATAT_L","DATA_T","ACE_t","INC_t","INCR_t","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('Simulation model','Data provided','%Acute', '%Incidence','Incidence ratio','Viral introductions','Sequences (#)','Sequence coverage','Sequences from after intervention start','Sampling duration after intervention start'))])
setkey(dfl, TEAM, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','X')], by=c('TEAM','LATENTO','X'))
#do barplot
#does not show total variance explained :-(
ggplot(dfl, aes(x=LATENTO, y=100*LOADcm, fill=X)) +
geom_bar(stat="identity", colour='black') +
#scale_fill_brewer(palette='Spectral') +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(~TEAM) +
theme_bw() + theme(legend.position='bottom', axis.text.x=element_blank(),panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Variable',y='variance in error explained\n(%)') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_IncEnd_PLSbyLatentFactors.pdf',sep=''), width=12, height=6)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(~TEAM) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_IncEnd_PLSvip.pdf',sep=''), width=10, height=5)
# TRAIN
data("gasoline")
gasTrain <- gasoline[1:50, ]
gasTest <- gasoline[51:60, ]
gas1 <- plsr(octane ~ NIR, ncomp = 10, data = gasTrain, validation = "LOO")
plot(gas1, ncomp = 2, asp = 1, line = TRUE)
plot(RMSEP(gas1), legendpos = "topright")
plot(gas1, plottype = "scores", comps = 1:3)
plot(gas1, plottype = "coef", ncomp = 1:3, legendpos = "bottomleft", labels = "numbers", xlab = "nm")
plot(gas1, plottype = "loadings", comps = 1:5, legendpos = "topleft", labels = "numbers", xlab = "nm")
plot(gas1, plottype = "correlation", comps = 1:4)
data("yarn")
y1 <- plsr(density ~ NIR, ncomp = 10, data = yarn, validation = "LOO")
plot(y1, plottype = "scores", comps = 1:3)
df <- subset(tmp, TEAM=='Cambridge/Imperial')
pls1 <- plsr(RES~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
plot(RMSEP(pls1), legendpos = "topright") #4 components?
explvar(pls1) #first component explains 99.9% of variance in X
plot(pls1, plottype = "loadings", comps = 1:3, labels=c("Model","Data\nprovided","Viral\nintro","SampleN", "SampleC","SampleM","SampleD","Inc","INCR","pcAcute"), legendpos='topright')
# the first component has -1 loading at SampleN
plot(pls1, plottype = "coef", comps = 1, labels=c("Model","Data\nprovided","Viral\nintro","SampleN", "SampleC","SampleM","SampleD","Inc","INCR","pcAcute"), legendpos='topright')
# the first component has a dip at SampleN
plot(gas1, plottype = "correlation", comps = 1:4)
pls1$coefficients #for every C: Y~beta*var in component C
pls1$scores #obs x component
pls1$loadings #var x component
pls1$loading.weights #var x component
#https://books.google.co.uk/books?id=Qsn6yjRXOaMC&pg=PA143&lpg=PA143&dq=what+the+loadings+and+loading+weights+in+pls&source=bl&ots=cB2qb_rUZ1&sig=lIWjCXvw95pejNTLY0c5RO58dm0&hl=en&sa=X&ved=0ahUKEwj9ttSj7rrJAhWGaRQKHZ9nB4oQ6AEIKDAB#v=onepage&q=what%20the%20loadings%20and%20loading%20weights%20in%20pls&f=false
#Multivariate Data Analysis - in Practice: An Introduction to Multivariate ... By Kim H. Esbensen, Dominique Guyot, Frank Westad, Lars P. Houmolle
#P loadings are very much like PCA
#W loading weights. w1 characterizes the first PLS component direction in X space
#If P and W are similar, then the dominant structures in X happen to be those with max correlation to Y
pls2 <- plsreg1(subset(df, select=c(DATAT_L, DATA_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACE_t)), subset(df, select=RES), comps=10, crosval=TRUE)
# this seems quite different! The first component explains 63% of variance
}
##--------------------------------------------------------------------------------------------------------
## olli 26.11.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.PLSAcute<- function(dfa, outdir)
{
require(plsdepot)
require(pls)
tmp <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(', TEAM,fixed=1) & OBJ=='OBJ_vi')
tmp2 <- dcast.data.table(subset(dfa, TEAM=='True', c(SC_RND, OBJ, central)), SC_RND~OBJ, value.var='central')
setnames(tmp2, c('OBJ_ii','OBJ_iii','OBJ_vi'), c('INC_t','INCR_t','ACE_t'))
tmp <- merge(tmp, tmp2, by='SC_RND')
tmp <- subset(tmp, !is.na(central), c(SC_RND, TEAM, DATAT_L, central, lower95, upper95, DATA_T,IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACE_t))
set(tmp, NULL, 'DATAT_L', tmp[, as.numeric(factor(DATAT_L))])
set(tmp, NULL, 'SMPL_M', tmp[, as.numeric(SMPL_M)])
set(tmp, NULL, 'SMPL_D', tmp[, as.numeric(SMPL_D)])
set(tmp, NULL, 'DATA_T', tmp[, as.numeric(DATA_T)])
set(tmp, NULL, 'SMPL_C', tmp[, as.numeric(gsub('%','',as.character(SMPL_C)))/100])
set(tmp, NULL, 'IMPRT', tmp[, as.numeric(gsub('%','',as.character(IMPRT)))/100])
tmp[, RES:= central-ACE_t]
plsms <- vector('list', tmp[, length(unique(TEAM))])
names(plsms) <- tmp[, unique(TEAM)]
for(x in names(plsms))
{
df <- subset(tmp, TEAM==x)
plsms[[x]] <- plsr(RES~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
}
sapply(plsms, explvar)
#first component explains 99% of variance in X
dfr <- as.data.table(melt(sapply(plsms, function(x) drop((R2(x, intercept=FALSE, estimate='train'))$val) ), varnames=c('LATENT','TEAM'), value.name='R2'))
set(dfr, NULL, 'LATENT', dfr[, paste('LV',gsub(' comps','',LATENT),sep='')])
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, R2each= c(R2[1],diff(R2))), by='TEAM'], by=c('TEAM','LATENT'))
#oh wow! the variance explained by the various latent variables is quite different from each other!!
dfr <- dfr[order(TEAM, -R2each)]
dfr <- merge(dfr, dfr[, list(LATENT=LATENT, LATENTO= paste('lv',seq_along(LATENT),sep='')), by=c('TEAM')], by=c('TEAM','LATENT'))
set(dfr, NULL, 'LATENTO', dfr[, factor(LATENTO, levels=paste('lv',1:10,sep=''), labels=paste('lv',1:10,sep=''))])
#collect key X variables with high latent variable weights
dfl <- do.call('rbind',lapply(paste('Comp',1:10), function(x){
data.table(TEAM=tmp[, as.character(unique(TEAM))])[, list( X=names(plsms[[TEAM]]$loading.weights[, 'Comp 1']), LATENT=x, LOAD=plsms[[TEAM]]$loading.weights[, x]), by='TEAM']
}))
set(dfl, NULL, 'LATENT', dfl[, gsub('Comp ','LV',LATENT)])
# use variable influence projection (Wold et al 1993)
# https://books.google.co.uk/books?id=QhHdGt8TG80C&pg=PA2&lpg=PA2&dq=PLS+contribution+of+each+variable&source=bl&ots=vWaqNtCTYz&sig=RT9STQ3SzXk1tU2ZNYlRycgxIQ8&hl=en&sa=X&ved=0ahUKEwiH1eHmirvJAhXK7hoKHcBXC_YQ6AEILzAC#v=onepage&q=PLS%20contribution%20of%20each%20variable&f=false
dfl <- merge(dfl, dfl[, list(X=X, LOADstd=LOAD^2/sum(LOAD^2)), by=c('TEAM','LATENT')], by=c('TEAM','LATENT','X'))
dfl <- merge(dfl, dfr, by=c('TEAM','LATENT'))
set(dfl, NULL, 'X', dfl[, factor(X, levels=c("DATAT_L","DATA_T","ACE_t","INC_t","INCR_t","IMPRT","SMPL_N","SMPL_C","SMPL_M","SMPL_D"),
labels=c('Simulation model','Data provided','%Acute', '%Incidence','Incidence ratio','Viral introductions','Sequences (#)','Sequence coverage','Sequences from after intervention start','Sampling duration after intervention start'))])
setkey(dfl, TEAM, LATENTO)
dfl <- merge(dfl, dfl[, list( LATENTO=LATENTO, LOADcm=cumsum(LOADstd*R2each)), by=c('TEAM','X')], by=c('TEAM','LATENTO','X'))
#do barplot
#does not show total variance explained :-(
ggplot(dfl, aes(x=LATENTO, y=100*LOADcm, fill=X)) +
geom_bar(stat="identity", colour='black') +
#scale_fill_brewer(palette='Spectral') +
scale_y_continuous(expand=c(0,0), limit=c(0,100), breaks=seq(0,100,20), minor_breaks=seq(0,100,10)) +
facet_grid(~TEAM) +
theme_bw() + theme(legend.position='bottom', axis.text.x=element_blank(),panel.grid.major.y=element_line(colour='grey70', size=1), panel.grid.minor.y=element_line(colour='grey70', size=0.4)) +
labs(x='\nfirst n PLS latent factors',fill='Variable',y='variance in error explained\n(%)') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_AcuteEnd_PLSbyLatentFactors.pdf',sep=''), width=10, height=6)
#do influence plot
tmp2 <- dfl[, length(unique(X))]
tmp2 <- dfl[, list(VIP= sqrt(tmp2*sum(R2each*LOADstd)) ), by=c('TEAM','X')]
ggplot(tmp2, aes(x=X, y=VIP, fill=X)) + geom_bar(stat="identity", colour='black') +
scale_y_continuous(expand=c(0,0), limit=c(0,2.9), breaks=seq(0.5,5,0.5), minor_breaks=seq(0,5,0.1)) +
theme_bw() + coord_flip() + facet_grid(~TEAM) +
theme(legend.position='bottom', axis.text.y=element_blank(), axis.ticks.y=element_blank(), panel.grid.major.x=element_line(colour='grey70', size=1), panel.grid.minor.x=element_line(colour='grey70', size=0.4)) +
labs(x='', y='\nVariable influence projection', fill='Variable') +
guides(fill=guide_legend(ncol=3))
ggsave(file=paste(outdir,'/res_acrossTEAM_Secondary_AcuteEnd_PLSvip.pdf',sep=''), width=10, height=5)
# TRAIN
data("gasoline")
gasTrain <- gasoline[1:50, ]
gasTest <- gasoline[51:60, ]
gas1 <- plsr(octane ~ NIR, ncomp = 10, data = gasTrain, validation = "LOO")
plot(gas1, ncomp = 2, asp = 1, line = TRUE)
plot(RMSEP(gas1), legendpos = "topright")
plot(gas1, plottype = "scores", comps = 1:3)
plot(gas1, plottype = "coef", ncomp = 1:3, legendpos = "bottomleft", labels = "numbers", xlab = "nm")
plot(gas1, plottype = "loadings", comps = 1:5, legendpos = "topleft", labels = "numbers", xlab = "nm")
plot(gas1, plottype = "correlation", comps = 1:4)
data("yarn")
y1 <- plsr(density ~ NIR, ncomp = 10, data = yarn, validation = "LOO")
plot(y1, plottype = "scores", comps = 1:3)
df <- subset(tmp, TEAM=='Cambridge/Imperial')
pls1 <- plsr(RES~DATAT_L+DATA_T+IMPRT+SMPL_N+SMPL_C+SMPL_M+SMPL_D+INC_t+INCR_t+ACE_t, data=df, validation='LOO')
plot(RMSEP(pls1), legendpos = "topright") #4 components?
explvar(pls1) #first component explains 99.9% of variance in X
plot(pls1, plottype = "loadings", comps = 1:3, labels=c("Model","Data\nprovided","Viral\nintro","SampleN", "SampleC","SampleM","SampleD","Inc","INCR","pcAcute"), legendpos='topright')
# the first component has -1 loading at SampleN
plot(pls1, plottype = "coef", comps = 1, labels=c("Model","Data\nprovided","Viral\nintro","SampleN", "SampleC","SampleM","SampleD","Inc","INCR","pcAcute"), legendpos='topright')
# the first component has a dip at SampleN
plot(gas1, plottype = "correlation", comps = 1:4)
pls1$coefficients #for every C: Y~beta*var in component C
pls1$scores #obs x component
pls1$loadings #var x component
pls1$loading.weights #var x component
#https://books.google.co.uk/books?id=Qsn6yjRXOaMC&pg=PA143&lpg=PA143&dq=what+the+loadings+and+loading+weights+in+pls&source=bl&ots=cB2qb_rUZ1&sig=lIWjCXvw95pejNTLY0c5RO58dm0&hl=en&sa=X&ved=0ahUKEwj9ttSj7rrJAhWGaRQKHZ9nB4oQ6AEIKDAB#v=onepage&q=what%20the%20loadings%20and%20loading%20weights%20in%20pls&f=false
#Multivariate Data Analysis - in Practice: An Introduction to Multivariate ... By Kim H. Esbensen, Dominique Guyot, Frank Westad, Lars P. Houmolle
#P loadings are very much like PCA
#W loading weights. w1 characterizes the first PLS component direction in X space
#If P and W are similar, then the dominant structures in X happen to be those with max correlation to Y
pls2 <- plsreg1(subset(df, select=c(DATAT_L, DATA_T, IMPRT, SMPL_N, SMPL_C, SMPL_M, SMPL_D, INC_t, INCR_t, ACE_t)), subset(df, select=RES), comps=10, crosval=TRUE)
# this seems quite different! The first component explains 63% of variance
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.primary.acute<- function(dfa, outdir, onSeq=1)
{
require(gridExtra)
if(onSeq)
{
df <- subset(dfa, Pr_Seq==1)
title <- '\nPrimary objective\n%Acute from sequence data\n'
}
if(!onSeq)
{
df <- subset(dfa, Pr_Phy==1)
title <- '\nSecondary objective\n%Acute when phylogeny known\n'
}
tmp <- subset(df, !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_v'))
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
p1 <- ggplot(subset(tmp, TEAM!='True'), aes(y=gsub('\n',':',paste(INT_L,' ',AC_L,sep='')), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% transmissions from individuals in their first 3 months of infection\nat baseline', y='')
tmp <- subset(df, !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_vi'))
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
p2 <- ggplot(subset(tmp, TEAM!='True'), aes(y=gsub('\n',':',paste(INT_L,' ',AC_L,sep='')), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% transmissions from individuals in their first 3 months of infection\nat end of intervention', y='')
pdf(file=paste(outdir,'/res_acrossTEAM_PrimaryAcute_onSeq',onSeq,'.pdf',sep=''), width = 15, height = 8)
print(grid.arrange(p1, p2, nrow=1, main=title))
dev.off()
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.seqcoverage<- function(dfa, outdir)
{
title <- '\nSecondary objective\nImpact of sequence coverage\n'
tmp <- subset(dfa, Sc_SeqCoverage_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_ii'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp2 <- as.data.table(expand.grid(SMPL_C=tmp[, unique(SMPL_C)], central=1, DATAT_L=tmp[, unique(DATAT_L)], INT_L=tmp[, unique(INT_L)], AC_L=tmp[,unique(AC_L)]))
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p1 <- ggplot(subset(tmp, TEAM!='True'), aes(y=SMPL_C, x=central)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 8)) +
scale_x_continuous(breaks=seq(1,7,1)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% Incidence', y='sequence coverage\n')
tmp <- subset(dfa, Sc_SeqCoverage_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_iii'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp2 <- as.data.table(expand.grid(SMPL_C=tmp[, unique(SMPL_C)], central=1, DATAT_L=tmp[, unique(DATAT_L)], INT_L=tmp[, unique(INT_L)], AC_L=tmp[,unique(AC_L)]))
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p2 <- ggplot(subset(tmp, TEAM!='True'), aes(y=SMPL_C, x=central)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_vline(xintercept=1, colour='grey50', size=0.8) +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 2)) +
scale_colour_manual(values=TEAM_CL) +
scale_x_continuous(breaks=seq(0.5,1.5,0.5), minor_breaks=seq(0.25,1.75,0.25)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
facet_grid(TEAM~DATAT_L+INT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\nIncidence reduction', y='sequence coverage\n')
tmp <- subset(dfa, Sc_SeqCoverage_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_v'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p3 <- ggplot(subset(tmp, TEAM!='True'), aes(y=SMPL_C, x=central)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat baseline', y='sequence coverage\n')
tmp <- subset(dfa, Sc_SeqCoverage_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_vi'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p4 <- ggplot(subset(tmp, TEAM!='True'), aes(y=SMPL_C, x=central)) +
geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat end of intervention', y='sequence coverage\n')
pdf(file=paste(outdir,'/res_acrossTEAM_Secondary_SeqCoverage.pdf',sep=''), width = 15, height = 30)
print(grid.arrange(p1, p2, p3, p4, nrow=4, main=title))
dev.off()
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.imports<- function(dfa, outdir)
{
title <- '\nSecondary objective\nImpact of 20% / year transmissions from outside\n'
tmp <- subset(dfa, Sc_Imports_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_ii'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p1 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(IMPRT,' / year',sep=''), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 8)) +
scale_x_continuous(breaks=seq(1,7,1)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% Incidence', y='transmissions from outside\n')
tmp <- subset(dfa, Sc_Imports_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_iii'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p2 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(IMPRT,' / year',sep=''), x=central)) +
geom_vline(xintercept=1, colour='grey50', size=0.8) +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 4)) +
scale_colour_manual(values=TEAM_CL) +
scale_x_continuous(breaks=seq(0.5,3.5,0.5), minor_breaks=seq(0.25,3.75,0.25)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\nIncidence reduction', y='transmissions from outside\n')
tmp <- subset(dfa, Sc_Imports_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_v'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p3 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(IMPRT,' / year',sep=''), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat baseline', y='transmissions from outside\n')
tmp <- subset(dfa, Sc_Imports_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_vi'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, IMPRT)
p4 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(IMPRT,' / year',sep=''), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat end of intervention', y='transmissions from outside\n')
pdf(file=paste(outdir,'/res_acrossTEAM_Secondary_Imports.pdf',sep=''), width = 12, height = 12)
print(grid.arrange(p1, p2, p3, p4, nrow=2, main=title))
dev.off()
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.focussedsampling<- function(dfa, outdir)
{
title <- '\nSecondary objective\nImpact of 50% vs 85% of samples obtained after intervention start\n'
tmp <- subset(dfa, Sc_SmplFc_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_ii'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, SMPL_M)
p1 <- ggplot(subset(tmp, TEAM!='True'), aes(y=factor(as.character(SMPL_M),levels=c('much','extreme'),labels=c('50%','85%')), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 8)) +
scale_x_continuous(breaks=seq(1,7,1)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% Incidence', y='% samples obtained after\nintervention start\n')
tmp <- subset(dfa, Sc_SmplFc_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_iii'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, SMPL_M)
p2 <- ggplot(subset(tmp, TEAM!='True'), aes(y=factor(as.character(SMPL_M),levels=c('much','extreme'),labels=c('50%','85%')), x=central)) +
geom_vline(xintercept=1, colour='grey50', size=0.8) +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 4)) +
scale_colour_manual(values=TEAM_CL) +
scale_x_continuous(breaks=seq(0.5,3.5,0.5), minor_breaks=seq(0.25,3.75,0.25)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\nIncidence reduction', y='% samples obtained after\nintervention start\n')
tmp <- subset(dfa, Sc_SmplFc_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_v'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, SMPL_M)
p3 <- ggplot(subset(tmp, TEAM!='True'), aes(y=factor(as.character(SMPL_M),levels=c('much','extreme'),labels=c('50%','85%')), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat baseline', y='% samples obtained after\nintervention start\n')
tmp <- subset(dfa, Sc_SmplFc_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_vi'))
set(tmp, NULL, 'INT_L',tmp[,gsub('\n',': ',INT_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, INT_L, SMPL_M)
p4 <- ggplot(subset(tmp, TEAM!='True'), aes(y=factor(as.character(SMPL_M),levels=c('much','extreme'),labels=c('50%','85%')), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+INT_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat end of intervention', y='% samples obtained after\nintervention start\n')
pdf(file=paste(outdir,'/res_acrossTEAM_Secondary_SFocus.pdf',sep=''), width = 12, height = 12)
print(grid.arrange(p1, p2, p3, p4, nrow=2, main=title))
dev.off()
}
##--------------------------------------------------------------------------------------------------------
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.secondary.sduration<- function(dfa, outdir)
{
title <- '\nSecondary objective\nImpact of sampling duration 3 yrs vs 5 yrs\n'
tmp <- subset(dfa, Sc_SmplD_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_ii'))
set(tmp, NULL, 'AC_L',tmp[,gsub('\n',': ',AC_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, AC_L, IMPRT)
p1 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(SMPL_D,' yrs',sep=''), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 8)) +
scale_x_continuous(breaks=seq(1,7,1)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% Incidence', y='duration of intensified sampling\nafter intervention start\n')
tmp <- subset(dfa, Sc_SmplD_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_iii'))
set(tmp, NULL, 'AC_L',tmp[,gsub('\n',': ',AC_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, AC_L, IMPRT)
p2 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(SMPL_D,' yrs',sep=''), x=central)) +
geom_vline(xintercept=1, colour='grey50', size=0.8) +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
coord_cartesian(xlim=c(0, 4)) +
scale_colour_manual(values=TEAM_CL) +
scale_x_continuous(breaks=seq(0.5,3.5,0.5), minor_breaks=seq(0.25,3.75,0.25)) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
facet_grid(TEAM~DATAT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\nIncidence reduction', y='duration of intensified sampling\nafter intervention start\n')
tmp <- subset(dfa, Sc_SmplD_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_v'))
set(tmp, NULL, 'AC_L',tmp[,gsub('\n',': ',AC_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, AC_L, IMPRT)
p3 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(SMPL_D,' yrs',sep=''), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat baseline', y='duration of intensified sampling\nafter intervention start\n')
tmp <- subset(dfa, Sc_SmplD_Phy==1 & !grepl('(',TEAM,fixed=1) & USED_GENES=='all' & OBJ%in%c('OBJ_vi'))
set(tmp, NULL, 'AC_L',tmp[,gsub('\n',': ',AC_L)])
tmp3 <- subset(tmp, TEAM=='True')
setnames(tmp3, 'TEAM','team')
setkey(tmp, TEAM, AC_L, IMPRT)
p4 <- ggplot(subset(tmp, TEAM!='True'), aes(y=paste(SMPL_D,' yrs',sep=''), x=central)) +
#geom_point(data=tmp2, size=1, colour='transparent') +
geom_errorbarh(aes(xmin=lower95, xmax=upper95, colour=TEAM), height=0.3) +
geom_point(size=4, aes(colour=TEAM), pch=13) +
geom_point(data=tmp3, size=3, colour='black', pch=18) +
coord_cartesian(xlim=c(0, 50)) +
scale_x_continuous(breaks=seq(10,40,10), minor_breaks=seq(0,50,2)) +
scale_colour_manual(values=TEAM_CL) +
facet_grid(TEAM~DATAT_L+AC_L, scales='free', space='free_y') +
theme_bw() + theme(panel.margin.x= unit(0.1, "lines"), legend.position='bottom') +
guides(colour=guide_legend(ncol=3)) +
labs(x= '\n% trms from individuals in their first 3 months of infection\nat end of intervention', y='duration of intensified sampling\nafter intervention start\n')
pdf(file=paste(outdir,'/res_acrossTEAM_Secondary_SDuration.pdf',sep=''), width = 12, height = 12)
print(grid.arrange(p1, p2, p3, p4, nrow=2, main=title))
dev.off()
}
##--------------------------------------------------------------------------------------------------------
## evaluate results
## olli 08.05.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.read<- function()
{
# read truth for regional simus
indir <- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/documents/external/2015_05_results'
file <- paste(indir, '/answers_Regional_Feb2015_rFormat.csv', sep='')
df <- read.submission.Feb2015(file, verbose=0, reset.OBJiv.conservative=1)
# read truth for village simus
file <- paste(indir, '/answers_Village_Feb2015-yr43_rFormat.csv', sep='')
tmp <- read.submission.Feb2015(file, verbose=0, reset.OBJiv.conservative=1)
set(tmp, NULL, 'TEAM', 'True')
df <- rbind(df, tmp)
# read submissions from May 2015
tmp <- list.files(indir, pattern='csv$')
tmp <- tmp[!grepl('answers',tmp)]
# read Eriks multiple submissions from May 2015
tmp2 <- data.table(FILE=tmp[grepl('cambImp',tmp)])
tmp2[, RUN:= tmp2[, sapply( strsplit(FILE,'_'), function(x) rev(x)[1] )]]
set(tmp2, NULL, 'RUN', tmp2[, substr(RUN, 1, nchar(RUN)-4)])
set(tmp2, NULL, 'RUN', tmp2[, gsub('results0','',RUN)])
dfs <- do.call('rbind',lapply(seq_len(nrow(tmp2)), function(i)
{
z <- read.submission.Feb2015( paste(indir, '/', tmp2[i, FILE], sep=''), verbose=0, reset.OBJiv.conservative=1 )
set(z, NULL, 'TEAM', z[, paste(TEAM, ' (', tmp2[i, RUN], ')', sep='')])
z
}))
tmp <- tmp[!grepl('cambImp',tmp)]
tmp <- do.call('rbind',lapply(tmp, function(x) read.submission.Feb2015(paste(indir,'/',x,sep=''), verbose=0, reset.OBJiv.conservative=1)))
dfs <- rbind(dfs, tmp)
# read submissions from August 2015
indir <- '/Users/Oliver/Dropbox\ (Infectious Disease)/PANGEAHIVsim_internal/documents/external/2015_08_results'
tmp <- list.files(indir, pattern='csv$')
tmp <- tmp[!grepl('answers',tmp)]
tmp2 <- tmp[grepl('Vancouver',tmp)]
stopifnot(length(tmp2)==1)
tmp2 <- read.submission.May2015(paste(indir,'/',tmp2,sep=''), verbose=0)
dfs <- rbind(dfs, tmp2)
tmp2 <- tmp[!grepl('Vancouver',tmp)]
tmp2 <- do.call('rbind',lapply(tmp2, function(x) read.submission.Aug2015(paste(indir,'/',x,sep=''), verbose=0, reset.OBJiv.conservative=1)))
dfs <- rbind(dfs, tmp2)
# change team name
set(dfs, dfs[, which(TEAM=='Colijn')],'TEAM','Imperial')
# construct Erik's gold submission
# for regional tree, use mergedTab
tmp <- subset(dfs, grepl('merged', TEAM) & grepl('Regional',SIM_SCENARIO))
tmp[, TEAM:='Cambridge/Imperial']
#tmp <- subset(dfs, grepl('mh30', TEAM) & grepl('Regional',SIM_SCENARIO))
#tmp[, TEAM:='Cambridge/Imperial']
#tmp2 <- subset(dfs, grepl('mh15', TEAM) & grepl('Regional',SIM_SCENARIO))
#tmp2[, TEAM:='Cambridge/Imperial']
#tmp <- merge(tmp, tmp2, by=c('TEAM','SUBMISSION_DATE','SIM_SCENARIO','USED_GENES','OBJ'), all=1)
#tmp2 <- tmp[, which(is.na(central.x))]
#set(tmp, tmp2, 'central.x', tmp[tmp2, central.y])
#set(tmp, tmp2, 'lower95.x', tmp[tmp2, lower95.y])
#set(tmp, tmp2, 'upper95.x', tmp[tmp2, upper95.y])
#setnames(tmp, c('central.x', 'lower95.x', 'upper95.x'), c('central', 'lower95', 'upper95'))
#set(tmp, NULL, c('central.y', 'lower95.y', 'upper95.y'), NULL)
dfs <- rbind(dfs, tmp)
# for village tree, use mh30 where available and mh15 where mh30 not available
tmp <- subset(dfs, grepl('mh30', TEAM) & grepl('Vill',SIM_SCENARIO))
tmp[, TEAM:='Cambridge/Imperial']
tmp2 <- subset(dfs, grepl('mh15', TEAM) & grepl('Vill',SIM_SCENARIO))
tmp2[, TEAM:='Cambridge/Imperial']
tmp <- merge(tmp, tmp2, by=c('TEAM','SUBMISSION_DATE','SIM_SCENARIO','USED_GENES','OBJ'), all=1)
tmp2 <- tmp[, which(is.na(central.x))]
set(tmp, tmp2, 'central.x', tmp[tmp2, central.y])
set(tmp, tmp2, 'lower95.x', tmp[tmp2, lower95.y])
set(tmp, tmp2, 'upper95.x', tmp[tmp2, upper95.y])
setnames(tmp, c('central.x', 'lower95.x', 'upper95.x'), c('central', 'lower95', 'upper95'))
set(tmp, NULL, c('central.y', 'lower95.y', 'upper95.y'), NULL)
dfs <- rbind(dfs, tmp)
# for village seq, use LSD
tmp <- subset(dfs, grepl('lsd', TEAM) & grepl('Vill',SIM_SCENARIO))
tmp[, TEAM:='Cambridge/Imperial']
dfs <- rbind(dfs, tmp)
# define data types (seq or phylo)
dfa <- rbind(dfs, df)
dfa[, DATA_T:=NA_character_]
set(dfa, dfa[, which(grepl('Vill_0[1-4]', SIM_SCENARIO))], 'DATA_T', 'seq')
set(dfa, dfa[, which(!grepl('Vill_0[1-4]', SIM_SCENARIO))], 'DATA_T', 'phy')
set(dfa, dfa[, which(grepl('FirstObj', SIM_SCENARIO))], 'DATA_T', 'seq')
set(dfa, dfa[, which(grepl('SecondObj', SIM_SCENARIO))], 'DATA_T', 'phy')
stopifnot(!any(is.na(dfa[, DATA_T])))
# define randomized scenario IDs
dfa[, SC_RND:=NA_character_]
tmp <- dfa[, which(grepl('Regional',SIM_SCENARIO))]
set(dfa, tmp, 'SC_RND', dfa[tmp, substring(regmatches(SIM_SCENARIO,regexpr('sc[A-Z]',SIM_SCENARIO)),3)])
tmp <- dfa[, which(grepl('Vill',SIM_SCENARIO))]
set(dfa, tmp, 'SC_RND', dfa[tmp, substring(regmatches(SIM_SCENARIO,regexpr('Vill_[0-9]+',SIM_SCENARIO)),6)])
stopifnot(!any(is.na(dfa[, SC_RND])))
# describe regional simulations in terms of fast/low intervention high/low acute
set.seed(42)
dfi <- data.table(FILE=list.files('/Users/Oliver/Dropbox (SPH Imperial College)/OR_Work/2014/2014_Gates/methods_comparison_pipeline/FINAL', '.*zip$', full.names=FALSE))
dfi[, SC:= sapply(strsplit(FILE, '_'),'[[',3)]
dfi[, CONFIG:= sapply(strsplit(SC, '-'),'[[',2)]
set(dfi, NULL, 'SC', dfi[, sapply(strsplit(SC, '-'),'[[',1)])
dfi[, DATAT:= sapply(strsplit(FILE, '_'),'[[',5)]
set(dfi, NULL, 'DATAT', dfi[, gsub('.zip','',DATAT,fixed=T)])
set(dfi, NULL, 'OBJECTIVE', 'SecondObj')
set(dfi, dfi[,which(CONFIG=='sq')],'OBJECTIVE', 'FirstObj')
dfi <- merge(dfi,dfi[, list(FILE=FILE, DUMMY=sample(length(FILE),length(FILE))), by='OBJECTIVE'],by=c('OBJECTIVE','FILE'))
tmp <- dfi[, which(OBJECTIVE=='SecondObj')]
set(dfi, tmp, 'DUMMY', dfi[tmp, DUMMY] + dfi[OBJECTIVE=='FirstObj', max(DUMMY)])
setkey(dfi, DUMMY)
dfi[, SC_RND:= toupper(letters[seq_len(nrow(dfi))])]
dfi <- subset(dfi, select=c(SC, SC_RND, CONFIG))
set(dfi, NULL, 'SC', dfi[, substring(SC, 3)])
dfi <- merge( dfi, data.table(SC= c('A','B','C','D','E','F'), AC_T=c('low','high','low','high','low','high'), INT_T=c('fast','fast','slow','slow','none','none')), by='SC' )
tmp <- data.table( CONFIG= c('sq','s2x','y3','mFP85','ph','tr20'),
IMPRT= c(.05, .05, .05, .05, .05, .2),
SMPL_N= c(1600, 3200, 1280, 1600, 1600, 1600),
SMPL_C= c(0.08, 0.16, 0.08, 0.08, 0.08, 0.08),
SMPL_M= c('overs', 'overs', 'overs', 'extrs', 'overs', 'overs'),
SMPL_D= c(5, 5, 3, 5, 5, 5))
dfi <- merge( dfi, tmp, by='CONFIG')
set(dfi, NULL, c('CONFIG'), NULL)
# add info for village
tmp <- data.table( SC_RND= c('03','02','01','04','05','08','06','07','11','09','12','10','00'),
AC_T= c('low','low','high','high','low','low','high','high','low','low','high','high','low'),
INT_T= c('fast','slow','fast','slow','fast','slow','fast','slow','fast','slow','fast','slow','none'),
#SMPL_C= c(0.25, 0.25, 0.25, 0.25, 0.5, 0.5, 0.5, 0.5, 0.25, 0.25, 0.25, 0.25, 0.25),
SMPL_C= c(0.3, 0.3, 0.3, 0.3, 0.6, 0.6, 0.6, 0.6, 0.3, 0.3, 0.3, 0.3, 0.3),
SMPL_D= 5,
SMPL_N= c(777, 857, 957, 1040, 1469, 1630, 1831, 1996, 638, 686, 956, 1012, 872),
IMPRT= c(0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0, 0, 0, 0, 0.02))
dfi <- rbind(dfi, tmp, fill=TRUE,use.names=TRUE)
# merge to dfa
cat(paste('\nnumber of rows before merge with dfi, n=', nrow(dfa)))
dfa <- merge(dfa, dfi, by='SC_RND')
cat(paste('\nnumber of rows before merge with dfi, n=', nrow(dfa)))
# set answers to numerical
set(dfa, dfa[, which(OBJ%in%c('OBJ_i','OBJ_iv'))], c('lower95','upper95'), NA_character_)
set(dfa, dfa[, which(central=='decreasing')], c('central'), '-1')
set(dfa, dfa[, which(central=='stable')], c('central'), '0')
set(dfa, dfa[, which(central=='increasing')], c('central'), '1')
set(dfa, dfa[, which(central=='<15%')], c('central'), '-1')
set(dfa, dfa[, which(central=='15%-30%')], c('central'), '0')
set(dfa, dfa[, which(central=='>30%')], c('central'), '1')
set(dfa, NULL, 'central', dfa[, as.numeric(central)])
set(dfa, NULL, 'lower95', dfa[, as.numeric(lower95)])
set(dfa, NULL, 'upper95', dfa[, as.numeric(upper95)])
# add simulation type
dfa[, DATAT_L:='NA_character_']
set(dfa, dfa[, which(grepl('Vill',SIM_SCENARIO))], 'DATAT_L','Village')
set(dfa, dfa[, which(grepl('Regional',SIM_SCENARIO))], 'DATAT_L','Regional')
# transform %incidence 0.01 to 1%
tmp <- dfa[, which(OBJ=='OBJ_ii')]
set(dfa, tmp, 'central', dfa[tmp, 100*central])
set(dfa, tmp, 'lower95', dfa[tmp, 100*lower95])
set(dfa, tmp, 'upper95', dfa[tmp, 100*upper95])
# transform %Acute 0.01 to 1%
tmp <- dfa[, which(OBJ%in%c('OBJ_v','OBJ_vi'))]
set(dfa, tmp, 'central', dfa[tmp, 100*central])
set(dfa, tmp, 'lower95', dfa[tmp, 100*lower95])
set(dfa, tmp, 'upper95', dfa[tmp, 100*upper95])
# fix submission dates
set(dfa, NULL, 'SUBMISSION_DATE', dfa[, gsub('\\.15','\\.2015',SUBMISSION_DATE)])
dfa
}
##--------------------------------------------------------------------------------------------------------
## evaluate results
## olli 12.08.15
##--------------------------------------------------------------------------------------------------------
project.PANGEA.TEST.pipeline.Aug2015.evaluate.samplesize<- function(dfr, dfa, outdir)
{
tmp <- dfr[, lapply(.SD, sum ), .SDcol=c('Pr_Phy','Pr_Seq','Sc_Imports_Phy','Sc_SeqCoverage_Phy','Sc_SmplD_Phy','Sc_SmplFc_Phy'), by='DATAT_L']
tmp2 <- dfa[, unique(TEAM[TEAM!='True' & !grepl('(',TEAM,fixed=T)])]
tmp <- melt(tmp, id.vars='DATAT_L')
set(tmp, NULL, 'value', tmp[,value]*length(tmp2))
tmp <- dcast.data.table(tmp, DATAT_L~variable, value.var='value')
tmp[, N:='TOTAL']
tmp2 <- unique(subset(dfa, select=c(DATAT_L,OBJ)))
tmp2[, N:='TOTAL']
tmp <- merge(tmp, tmp2, by=c('DATAT_L','N'))
tmp2 <- subset(dfa, USED_GENES=='all' & TEAM!='True' & !grepl('(',TEAM,fixed=T))[, lapply(.SD, sum ), by=c('DATAT_L','OBJ','TEAM'), .SDcol=c('Pr_Phy','Pr_Seq','Sc_Imports_Phy','Sc_SeqCoverage_Phy','Sc_SmplD_Phy','Sc_SmplFc_Phy')]
tmp2[, N:='SUBMITTED']
tmp <- rbind(tmp, tmp2, use.names=T, fill=T)
tmp2 <- dcast.data.table(melt(tmp, id.vars=c('DATAT_L','OBJ','N','TEAM')), DATAT_L+OBJ~variable+N,fun.aggregate=sum,value.var='value')
file <- paste(outdir,'/SampleSizesByAnalysis_PolGagEnv','.csv',sep='')
cat('\nWrite to',file)
write.csv(tmp2, file=file, row.names=FALSE)
set(tmp, tmp[, which(is.na(TEAM))], 'TEAM', 'True')
z <- melt(subset(tmp, OBJ=='OBJ_i'), id.vars=c('DATAT_L','N','OBJ','TEAM'))
set(z, NULL, 'variable', z[, factor(variable, levels=c('Pr_Seq','Pr_Phy','Sc_Imports_Phy','Sc_SeqCoverage_Phy','Sc_SmplD_Phy','Sc_SmplFc_Phy'),
labels=c('Primary Objective\non sequences','Primary Objective\non true tree','Secondary\nimports','Secondary\nsequence coverage','Secondary\nsampling duration','Secondary\nfocussed sampling'))])
ggplot(z, aes(x=factor(N, levels=c('SUBMITTED','TOTAL'), labels=c('submitted','if submissions\nhad been\ncomplete')), fill=TEAM, y=value)) + geom_bar(stat='identity') + facet_grid(DATAT_L~variable) +
scale_fill_manual(values=TEAM_CL) +
theme_bw() + theme(legend.position='bottom') + labs(x='', y='total')
file <- paste(outdir,'/SampleSizesByAnalysis_PolGagEnv.pdf',sep='')
cat('\nPlot to',file)
ggsave(file=file, w=12, h=4)
NULL
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.