#commit: ##- using AGE_MIN instead of age mean-> max age is 30 in both HC and KH rm(list=ls()) # setwd("~/Traineeship/Dengue/slides_June_2018/PMA_batch2To7_July2019/pop data/models/FOI/CombinedCode_Maxine") library(rstan) util <- new.env() source('./codes/stan_utility.R', local=util) rstan_options(auto_write = TRUE) options(mc.cores = parallel::detectCores())
# Changeable model <- "cons" # cons, TV,SS, TVSS groupAge <- 5 # 1,5 # Grouping Input data by group of .... step_year <- 1 # non-TV models, input 0 site <- "HC&KH" #KH, HC,HC&KH dataInput <-"C1D1" # C1D1, C1F1, C1D3, C1F3 #for time-varying model, options are 1,2,3,4,5 # # for loop to run at once # site_vec <- c("KH","HC","KH&HC") # dataInput_vec <- c("C1D1","C1F1") # step_year_vec <- c(1:5) # # for (idx_site_vec in 1:length(site_vec)){ # for (idx_dataInput_vec in 1: length(dataInput_vec)){ # for (idx_step_year_vec in 1: length(step_year_vec)){ # # site <- site_vec[idx_site_vec] # dataInput <- dataInput_vec[idx_dataInput_vec] # step_year <- step_year_vec[idx_step_year_vec] # code function ........ # } # } # } ##Results from different IS and infecting serotype infering models: #~/Dropbox/1.SHARED_FOLDER/slides_June_2018/PMA_batch2To7_July2019/pop data/models/Infecting serotype Model # original locations if ( dataInput == "C1D1"){ data <- read.csv("./data/Infecting_Models/PredictedIS_B23_567_pred.serotype_C1D1.csv") } if ( dataInput == "C1D3"){ data <- read.csv("./data/Infecting_Models/PredictedIS_B23_567_pred.serotype_C1D3.csv") } if ( dataInput == "C1F1"){ data <- read.csv("./data/Infecting_Models/PredictedIS_B23_567_pred.serotype_HomoAssigned_C1F1.csv") } if ( dataInput == "C1F3"){ data <- read.csv("./data/Infecting_Models/PredictedIS_B23_567_pred.serotype_HomoAssigned_C1F3.csv") } data$YEAR <- as.factor(data$YEAR) data$predictedIS<- ordered(data$predictedIS,levels=c("Secondary","Primary","Neg")) data$pred.serotype<- as.factor(data$pred.serotype) # Extract population data only! pop<- data[which(!is.na(data$AGE_MIN)&is.na(data$PanbioUnit)),]#cross out acute and ELISA samples pop$Site <- substr(pop$sampleID,1,2) # Age grouping if(groupAge == 1){ ##Agegroup_ 1year pop$AgeGroup <-factor(as.factor(ceiling(pop$AGE_MIN)),labels=c("(1-2]","(2-3]","(3-4]","(4-5]","(5-6]","(6-7]","(7-8]","(8-9]","(9-10]","(10-11]","(11-12]","(12-13]","(13-14]","(14-15]","(15-16]","(16-17]","(17-18]","(18-19]","(19-20]","(20-21]","(21-22]","(22-23]","(23-24]","(24-25]","(25-26]","(26-27]","(27-28]","(28-29]","(29-30]"))# group as 1year 1 group, no individual is less than 1 year old } if(groupAge == 2){ pop$AgeGroup <-factor(as.factor(ceiling(pop$AGE_MIN/2)),labels=c("(0-2]","(2-4]","(4-6]","(6-8]","(8-10]","(10-12]","(12-14]","(14-16]","(16-18]","(18-20]","(20-22]","(22-24]","(24-26]","(26-28]","(28-30]")) } if(groupAge == 3){ pop$AgeGroup <- ceiling(pop$AGE_MIN/3) pop$AgeGroup <- factor( pop$AgeGroup, labels = c("(0-3]","(3-6]","(6-9]","(9-12]","(12-15]","(15-18]", "(18-21]","(21-24]","(24-27]","(27-30]")) } if(groupAge == 5){ pop$AgeGroup <- ceiling(pop$AGE_MIN/5) pop$AgeGroup <- factor( pop$AgeGroup, labels = c("(0-5]","(5-10]","(10-15]","(15-20]","(20-25]","(25-30]")) }
library(dplyr) library(tidyr) # spread(x,y): spread rows of col x into collums, then value of col y will fit in as rows. #rename(new_name=old_name). if (site=="HC"){ p_year=dplyr::filter(pop,Site=="HC")%>%group_by(AgeGroup,YEAR,Site)%>%dplyr::count(pred.serotype)%>%spread(pred.serotype,n)} if(site == "KH"){ p_year=dplyr::filter(pop,Site=="KH")%>%group_by(AgeGroup,YEAR,Site)%>%dplyr::count(pred.serotype)%>%spread(pred.serotype,n)} if(site == "KH&HC"|site =="HC&KH"){ p_year=dplyr::filter(pop)%>%group_by(AgeGroup,YEAR)%>%dplyr::count(pred.serotype)%>%spread(pred.serotype,n) p_year$Site <- "HC&KH" # add Site-> same no.columns-> later convenient #Rearrange by column index p_year <- p_year[,colnames(p_year)[c(1:2,9,3:8)]] } # Sorting "YEAR" in chronological order p_year<-p_year[order(p_year$YEAR),] # p_year$total <- apply(p_year[,3:8],1,sum,na.rm=T) p_year$Primary <-rowSums(p_year[,4:7],na.rm = T) p_year$total <-rowSums(p_year[,4:9],na.rm = T) #create function for substr age rather than put age in order, this is useful in case of missing data in a specific age: my_substr <- function(s){ if (nchar(s) == 5){ t <- substr(s, 4, 4) }else{ if (nchar(s) == 6){ t <- substr(s, 4, 5) }else{ t <- substr(s, 5, 6) } } return(t) } # lower range my_substr_lower <- function(s){ if (nchar(s) == 5|nchar(s) == 6){ t <- substr(s, 2, 2) }else{ t <- substr(s, 2, 3) } return(t) } # p_year$age<- apply(p_year[,1],1,my_substr) p_year$age <- as.numeric(p_year$age) p_year$age_lower<- apply(p_year[,1],1,my_substr_lower) p_year$age_lower <- as.numeric(p_year$age_lower) p_year$ageMean <- apply(p_year[,12:13],1,mean) # Assign NA as 0 because stan does not support NA value: for (idx in 4:9){ idx.na <- which(is.na(p_year[[idx]])) p_year[[idx]][idx.na] <- 0 } # setnames(p_year, old=c("AgeGroup","1","2","3","4","Neg","Secondary","total"),new=c("AgeGroup","DV1.13","DV2.13","DV3.13","DV4.13","Neg.13","Sec.13","total.13"))
b = length(p_year$age) n_y = 2017-2013+1+max(as.integer(p_year[which(p_year$YEAR==2013),]$age)) # number of years backward, time period that data is available ## KH: 36 -> 1982, HC: 35 -> 1983 # data input for constant FOI model if (model == "cons") { datapp<- list(a = b, datap=c(p_year[1:b, 8], p_year[1:b, 4], p_year[1:b, 5], p_year[1:b, 6], p_year[1:b, 7], p_year[1:b, 9]), veca = as.integer(p_year$age), veca_mean = as.integer(p_year$ageMean)) } # data input for serotype-specific FOI model if (model == "SS") { datapp<- list(a = b, datap=c(p_year[1:b, 8], p_year[1:b, 4], p_year[1:b, 5], p_year[1:b, 6], p_year[1:b, 7], p_year[1:b, 9]), veca=as.integer(p_year$age), veca_mean = as.integer(p_year$ageMean)) } # if (model == "TV"|model == "TVSS"){ l_vecj = length(c(rep(1:ceiling(n_y/step_year), each = step_year))) # ceiling: round up datapp<- list(a = b, step_year = step_year, datap=c(p_year[1:b, 8], p_year[1:b, 4], p_year[1:b, 5], p_year[1:b, 6], p_year[1:b, 7], p_year[1:b, 9]), vecj=as.integer(c(rep(1:ceiling(n_y/step_year), each = step_year))),#for index of actual lambda l_vecj = length(c(rep(1:ceiling(n_y/step_year), each = step_year))),# length of vecj for later use vecy=as.integer(as.character(p_year$YEAR)), veca=as.integer(p_year$age), veca_mean = as.integer(p_year$ageMean)) }
if (model == "cons"&groupAge==1){ FOI_Dengue <-"./codes/FoI/Stan/FOI_constant_MLE.stan" } if (model == "cons"&groupAge==5){ FOI_Dengue <-"./codes/FoI/Stan/FOI_constant_MLE_GroupAge.stan" } if (model == "SS"&groupAge==1){ FOI_Dengue <-"./codes/FoI/Stan/FOI_serotype_specific_MLE.stan" } if (model == "SS"&groupAge==5){ FOI_Dengue <-"./codes/FoI/Stan/FOI_serotype_specific_MLE_GroupAge.stan" } if (model == "TV"){ FOI_Dengue <- "./codes/FoI/Stan/FOI_time varying_MLE.stan" } if (model == "TVSS"){ FOI_Dengue <-"./codes/FoI/Stan/FOI_time varying_serotype specific_MLE.stan" }
# you then run using this: From this output we can quickly assess model convergence by looking at the Rhat values for each parameter. When these are at or near 1, the chains have converged. There are many other diagnostics, but this is an important one for Stan. set.seed(9) fit<-stan(file = FOI_Dengue, data=datapp, iter=6000, chains= 4, cores = 4, # parallel::detectCores() control = list(adapt_delta=0.8, max_treedepth=10))# default posterior <- rstan::extract(fit) # specify package rstan rather than tidyr # str(posterior) #this summarizes the parameters, gets mean,etc. of parameters and metrics of whether converged or not like Rhat and neff foisummary <- summary(fit) foisummary tab <- data.frame(foisummary$summary) # # saving fits #trace previous result here: /home/phuong/pCloudDrive/PMA analysis results/FoI/RDS/FOI if (model == "cons") { saveRDS(fit,paste("./output/FoI/RDS/FOI",model,"groupAge",groupAge,site,dataInput,".rds")) } else if (model == "SS") { saveRDS(fit, paste("./output/FoI/RDS/FOI",model,"groupAge",groupAge,site,dataInput, ".rds")) } else if (model == "TV") { saveRDS(fit, paste("./output/FoI/RDS/FOI",model,"groupAge",groupAge,"step_year",step_year,site,dataInput, ".rds")) }else { saveRDS(fit,paste("./output/FoI/RDS/FOI",model,"groupAge",groupAge,"step_year",step_year,site,dataInput,".rds")) } # # plot to see whether estimated parameter is convergence or not: traceplot # plottedRows <- 1 : nrow(posterior$lambda) # plottedRows <- which(plottedRows %% 6 == 0) # plot(posterior$lambda[plottedRows, 4],type="l")
#eval=F: prevent the codes unless doing it manually util$check_all_diagnostics(fit) library(purrr) class(fit) summary(fit)$summary %>% head() plot(fit,pars="lambda")# mean lamda plot(fit,pars=c("lambda[1,1]", "lambda[1,2]", "lambda[1,3]", "lambda[1,4]", "lambda[1,5]", "lambda[1,6]", "lambda[1,7]")) traceplot(fit,pars="lambda[1,1]") #change the names in the stanfit object you can use the names method for stanfit objects. print(names(fit)) names(fit)[1] <- "DENV1_" #Assessing and Fixing Divergences and Treedepth Problems rstan::check_divergences(fit) warmups <- 3000 total_iterations <- 6000 max_treedepth <- 10 # default n_chains <- 4 n_cores <- 8 # parallel::detectCores() adapt_delta <- 0.8 # default mack_diagnostics <- rstan::get_sampler_params(fit) %>% set_names(1:n_chains) %>% map_df(as_data_frame,.id = 'chain') %>% group_by(chain) %>% mutate(iteration = 1:length(chain)) %>% mutate(warmup = iteration <= warmups) library(ggsci) mack_diagnostics %>% group_by(warmup, chain) %>% summarise(percent_divergent = mean(divergent__ >0)) %>% ggplot() + geom_col(aes(chain, percent_divergent, fill = warmup), position = 'dodge', color = 'black') + scale_y_continuous(labels = scales::percent, name = "% Divergent Runs") + scale_fill_npg() mack_diagnostics %>% ggplot(aes(iteration, treedepth__, color = chain)) + geom_line() + geom_hline(aes(yintercept = max_treedepth), color = 'red') + scale_color_locuszoom() mack_diagnostics %>% ggplot(aes(iteration, stepsize__, color = chain)) + geom_line() + scale_color_locuszoom() bh_summary <- summary(fit)$summary %>% as.data.frame() %>% mutate(variable = rownames(.)) %>% select(variable, everything()) %>% as_data_frame() bh_summary %>% head() bh_summary %>% ggplot(aes(n_eff)) + geom_histogram() + geom_vline(aes(xintercept = 4000), color = 'red') bh_summary %>% ggplot(aes(Rhat)) + geom_histogram() + geom_vline(aes(xintercept = 1.1), color = 'red') bh_summary %>% filter(variable %in% c('lambda[1,1]','lambda[1,2]','lambda[1,3]')) %>% ggplot() + geom_linerange(aes(variable, ymin = `2.5%`,ymax = `97.5%`)) + geom_crossbar(aes(variable, mean, ymin = `25%`, ymax = `75%`), fill= 'grey') + facet_wrap(~variable, scales = 'free') library(stringr) rhat <- bh_summary %>% filter(str_detect(variable,'rhat') & !str_detect(variable,'log') & !str_detect(variable,'pp')) # sal_data <- sal_data %>% # mutate(mean_rhat = rhat$mean, # lower = rhat$`2.5%`, # upper = rhat$`97.5%`) # # sal_data %>% # ggplot() + # geom_point(aes(ssb, r)) + # geom_line(aes(ssb, mean_rhat)) + # geom_ribbon(aes(ssb, ymin = lower, ymax = upper), alpha = 0.25) bh_mcmc <- fit %>% rstan::extract() bh_pars <- bh_mcmc[c('lambda')] bh_pars %>% map_df(as_data_frame, .id = 'variable') ggplot(aes(value, fill = variable)) + geom_density() + # facet_wrap(~ variable, scales = 'free') + coord_flip() + scale_fill_locuszoom()
## constant FOI if (model == "cons") { p <- data.frame(lambda = posterior$lambda) p$year <- rep("FOI") # violin plot lamb <- ggplot(p, aes(x=year,y=lambda)) + geom_violin()+ ggtitle(paste("FoI",model,"groupAge",groupAge,site,dataInput))+ coord_cartesian(ylim=c (0.0,0.03))+ theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"), axis.title.x = element_blank(), axis.text = element_text(face = "bold", angle = 0, hjust = 1))+ # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean # geom_boxplot(width=0.1)+# add median and quartile stat_summary(fun.data=mean_sdl, geom="pointrange", color="red")#Add mean and standard deviation # # save the figure ggsave(filename=paste("./output/FoI/Lambda/FOI",model,"groupAge",groupAge,site,dataInput,".png"),plot=lamb) lamb } ## serotype-specific FOI if (model == "SS") { p <- data.frame(lambda = as.numeric(), DENV = as.character()) for(a in 1:4){ p_temp <- data.frame(lambda = posterior$lambda[ , a], DENV = paste0('DENV ', a)) p <- rbind(p, p_temp) } # violin plot lamb = ggplot(p, aes(x=DENV,y=lambda)) + geom_violin()+ ggtitle(paste("FoI",model,"groupAge",groupAge,site,dataInput))+ theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"), axis.title.x = element_blank(), axis.text = element_text(face = "bold"))+ ylim(0,0.05)+ # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean # geom_boxplot(width=0.1)+# add median and quartile stat_summary(fun.data=mean_sdl, geom="pointrange", color="red")#Add mean and standard deviation ggsave(filename = paste("./output/FoI/Lambda/FOI",model,"groupAge",groupAge,site,dataInput,".png" ),plot=lamb) lamb } ## time-varyingFOI if (model == "TV") { #create dataframe for violin plot: p <- data.frame(lambda = as.numeric(), YEAR = as.character()) for(a in 1:(l_vecj/step_year)){ p_temp <- if(step_year ==1){ data.frame( lambda = posterior$lambda[ ,a], YEAR = paste0(2018-(step_year*(a-1)+1)))# 1 year }else{ data.frame( lambda = posterior$lambda[ ,a], YEAR = paste0(2018-(step_year*(a-1)+1),"-",2018-(step_year*(a-1)+1)-(step_year-1)))# period } p <- rbind(p, p_temp) } # violin plot lamb <- ggplot(p, aes(x=YEAR,y=lambda)) + geom_violin()+ ggtitle(paste("FoI",model,step_year,"yr(s) groupAge",groupAge,site,dataInput))+ ylim(-.01,0.2)+ theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"), axis.title.x = element_blank(), axis.text = element_text(face = "bold", angle = 90, hjust = 1))+ # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean # geom_boxplot(width=0.1)+# add median and quartile stat_summary(fun.data=mean_sdl, geom="pointrange", color="red")#Add mean and standard deviation ggsave(filename=paste("./output/FoI/Lambda/FOI",model,"groupAge",groupAge,"step_year",step_year,site,dataInput,".png"), plot = lamb) lamb } ## time-varying serotype-specific FOI if (model == "TVSS"){ #create dataframe for violin plot: p <- data.frame(lambda = as.numeric(), DENV = as.character()) for(a in 1:4){ p_temp <- data.frame(lambda = posterior$lambda[,a,], DENV = paste0('DENV ', a)) p <- rbind(p, p_temp) } ## Grouping less info. yrs >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # #1. renames colnames of dataframe p as year, group less info. yrs # if ( step_year == 1) # {names(p) <- c(2017:1988,"87-83","DENV")} # if ( step_year ==2){ # names(p) <- c("2017-2016","2015-2014","2013-2012","2011-2010","2009-2008","2007-2006","2005-2004","2003-2002","2001-2000","1999-1998","1997-1996","1995-1994","1993-1992","1991-1990","1989-1988","1987-1986","1985-1983","DENV")} # if (step_year ==5){ # names(p) <- c("2017-2013","2012-2008","2007-2003","2002-1998","1997-1993","1992-1988","1987-1983","DENV") # } # Not grouping the very last years #1.renames colnames of dataframe p as year, not grouping less info.yrs if ( step_year == 1)# 35 lambda {names(p) <- c(2017:1983,"DENV")} if ( step_year ==2){# 18 lambda names(p) <- c("2017-2016","2015-2014","2013-2012","2011-2010","2009-2008","2007-2006","2005-2004","2003-2002","2001-2000","1999-1998","1997-1996","1995-1994","1993-1992","1991-1990","1989-1988","1987-1986","1985-1984","1983","DENV")} if ( step_year ==3){# 12 lambda names(p) <- c("2017-2015","2014-2012","2011-2009","2008-2006","2005-2003","2002-2000","1999-1997","1996-1994","1993-1991","1990-1988","1987-1985","1984-1983","DENV")} if ( step_year ==4){# 9 lambda names(p) <- c("2017-2014","2013-2010","2009-2006","2005-2002","2001-1998","1997-1994","1993-1990","1989-1986","1985-1983","DENV")} if (step_year ==5){ #7 lambda names(p) <- c("2017-2013","2012-2008","2007-2003","2002-1998","1997-1993","1992-1988","1987-1983","DENV") } #2. violin plot, not grouping less info. yrs DV1 <- p[1:12000,-((l_vecj/step_year)+1)] DV2 <- p[(12001:24000),-((l_vecj/step_year)+1)] DV3 <- p[24001:36000,-((l_vecj/step_year)+1)] DV4 <- p[36001:48000,-((l_vecj/step_year)+1)] # tranform data frame for ploting library(reshape2) DV1 <- reshape2::melt(DV1,id.vars=0) # transformed data frame DV2 <- reshape2::melt(DV2,id.vars=0) DV3 <- reshape2::melt(DV3,id.vars=0) DV4 <- reshape2::melt(DV4,id.vars=0) # DV1 p1 <- ggplot(DV1,aes(x=variable,y=value)) + geom_violin()+ # geom_boxplot(width=0.1)+ # ggtitle(paste("posterior lambda _ age varying_serotype specific",p_year$Site[1]))+ theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"), axis.title.x = element_blank(), # axis.text.x = element_text(angle = 90, hjust = 0.5, axis.text.x = element_blank())+ coord_cartesian(ylim=c(-0.01,0.3))+ ylab("DENV1")+ # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean # geom_boxplot(width=0.1)+# add median and quartile stat_summary(fun.data=mean_sdl, geom="pointrange", color="red")#Add mean and standard deviation #DV2 p2 <- ggplot(DV2,aes(x=variable,y=value)) + geom_violin()+ # geom_boxplot(width=0.1)+ # ggtitle(paste("posterior lambda _ age varying_serotype specific",p_year$Site[1]))+ theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"), axis.title.x = element_blank(), # axis.text.x = element_text(angle = 90, hjust = 0.5, axis.text.x = element_blank())+ coord_cartesian(ylim=c(-0.01,0.3))+ ylab("DENV2")+ # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean # geom_boxplot(width=0.1)+# add median and quartile stat_summary(fun.data=mean_sdl, geom="pointrange", color="red")#Add mean and standard deviation #DV3 p3 <- ggplot(DV3,aes(x=variable,y=value)) + geom_violin()+ # geom_boxplot(width=0.1)+ # ggtitle(paste("posterior lambda _ age varying_serotype specific",p_year$Site[1]))+ theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"), axis.title.x = element_blank(), axis.text.x = element_text(angle = 90,size=6,face = "bold"))+ # axis.text.x = element_blank())+ coord_cartesian(ylim=c(-0.01,0.3))+ ylab("DENV3")+ # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean # geom_boxplot(width=0.1)+# add median and quartile stat_summary(fun.data=mean_sdl, geom="pointrange", color="red")#Add mean and standard deviation #DV4 p4 <- ggplot(DV4,aes(x=variable,y=value)) + geom_violin()+ # geom_boxplot(width=0.1)+ # ggtitle(paste("posterior lambda _ age varying_serotype specific",p_year$Site[1]))+ theme(plot.title = element_text(hjust=0.5,face="bold",size=20,colour = "blue"), axis.title.x = element_blank(), axis.text.x = element_text(angle = 90,size=6,face = "bold"))+ coord_cartesian(ylim=c(-0.01,0.3))+ ylab("DENV4")+ # stat_summary(fun.y=mean, geom="point", size=2, color="red")+ # add mean # geom_boxplot(width=0.1)+# add median and quartile stat_summary(fun.data=mean_sdl, geom="pointrange", color="red")#Add mean and standard deviation library(gridExtra) library(grid) lamb <- grid.arrange(p1,p2,p3,p4, ncol=2,nrow=2, top =textGrob(paste("FoI",model,step_year,"yr(s) groupAge",groupAge,site,dataInput),gp=gpar(fontsize=20,col = "blue"))) # save the figure ggsave(filename=paste("./output/FoI/Lambda/FOI",model,"groupAge",groupAge,"step_year",step_year,site,dataInput,".png"), plot = lamb) }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.