R/data_setup_functions.R

Defines functions process_serology create_pop30_agg_vc30_agg create_pop_at_survey Make_aggregate_pop_vc_3d Make_aggregate_pop_vc calc_pop_moments_agg calc_pop_moments calc_t0_vac_africa calc_incidence_vac_general transform_into_vc3d get_pop_data_3d Make_pop3d_Ptot2d_Pprop3d add_1940_1950 import_pop_data_LS2017 import_pop_data_LS2014 fit_glm launch_env_dat

Documented in add_1940_1950 calc_incidence_vac_general calc_pop_moments calc_pop_moments_agg calc_t0_vac_africa create_pop30_agg_vc30_agg create_pop_at_survey fit_glm get_pop_data_3d import_pop_data_LS2014 import_pop_data_LS2017 launch_env_dat Make_aggregate_pop_vc Make_aggregate_pop_vc_3d Make_pop3d_Ptot2d_Pprop3d process_serology transform_into_vc3d

# function file to work with model_reestimation

#' load environmental data from path
#'
#' @param filepath file path to data csv
#' @param c34 list of countries
#' @param dat_full data if already imported, defaults to NA
#' @param delete_surv_AGO remove AGO from surveillance quality, defaults to TRUE
#'
#'
#' @return depi and dat, dependant variable index and data
#' @export
launch_env_dat = function(filepath, c34, dat_full = NA, delete_surv_AGO = TRUE) {

  if(!is.na(filepath)){
    dat_full = read.csv(filepath, stringsAsFactors=FALSE)
  } else if(is.na(dat_full)){
    stop("no valid filepath or data entered")
  }
  #which presence/abence outcome to consider. Alternatives : "cases" # "outbreaks" #
  depvar = "cas.or.out"
  depi = match(depvar, names(dat_full)) # column number for the chosen outcome

  # excluding LC1,3,5,15 as they only ever cover <5% -
  ex_i = match(c("LC1","LC3","LC5","LC15"), names(dat_full))
  dat_full = dat_full[,-ex_i]

  # adding a categorical "dominant land cover" variable:
  LC_i = grep("LC",names(dat_full))
  LC_dom = names(dat_full)[LC_i][ apply(dat_full[,LC_i],1, which.max )]
  dat_full = cbind(dat_full, LC_dom) # 37 potential covariates/levels

  #setting NA to 0
  dat_full$surv.qual.adm0[is.na(dat_full$surv.qual.adm0)] =
    dat_full$surv.qual.adm1[is.na(dat_full$surv.qual.adm1)] = 0

  # setting the surv.qual.adm0 for AGO to 0 (very few cases reported):
  if(delete_surv_AGO){
    dat_full$surv.qual.adm0[dat_full$adm0=="AGO"] = 0
  }

  #adding log(surv_qual_adm0)
  dat_full = dplyr::mutate(dat_full,
                           log.surv.qual.adm0 = ifelse(is.infinite(log(surv.qual.adm0)),
                                                       0, log(surv.qual.adm0)))

  # a same categorical variable for all countries within the YFSD
  dat_full = dplyr::mutate(dat_full,
                           adm05 = ifelse(surv.qual.adm0>0,
                                          "AFR", adm0))

  dat =  dplyr::filter(dat_full, adm0 %in% c34)


  v1 = apply(dat,2,var, na.rm = TRUE)
  for(i in 8:(ncol(dat))) {

    if(!is.factor(dat[,i]) & !is.character(dat[,i])) {

      dat[,i] = dat[,i]/sqrt(v1[i])
      dat_full[,i] = dat_full[,i]/sqrt(v1[i])

    }
    # Explanation:
    # If we fit the model on dat and if we want to project estimates on dat_full, variable from dat_full
    # need to be expressed on the same scale than those from dat , thus we normalize dat_full relatively to dat

  }

  dat = dat[,names(dat)!="surv_qual_adm0"]
  depi = match(depvar,names(dat))

  # I do that because pop is ordered that way and we need to match both pop and dat
  dat = dat[ order(dat$adm0_adm1), ]

  return(list(depi = depi, dat = dat))
}




#' fit the glm
#'
#' @param dat data with wenvironmental and occurrence information
#' @param depi dependant variables
#' @param models list of models- only use first one
#'
#'
#' @return coefficients of glm
#' @export
fit_glm = function(dat, depi, models) {

  fm_best = models[1] #take first model

  bm = glm(as.formula(fm_best), data=dat, family=binomial(link="cloglog"))

  # setting up the evaluation of the likelihood:
  beta = coefficients(bm)

  vl=NULL
  for(i in 1:ncol(dat)) {
    if(length(grep(names(dat)[i], names(beta)))>0) {
      vl = c(vl,i) # select the variables used in the GLM
    }
  }


  x = cbind(Intercept=1,dat[,vl])
  j_expand = !sapply(1:ncol(x), function(i) is.numeric(x[,i]) & !is.factor(x[,i]))

  x_num = NULL
  for(j in 1:ncol(x)) {   # create indicative variables for adm05

    if(j_expand[j]) {
      tab = table(1:nrow(x),x[,j])
      colnames(tab) = paste(colnames(x)[j],colnames(tab),sep="")
      x_num = cbind(x_num,tab[,-1])
    } else {
      x_num = cbind(x_num,x[,j])
      colnames(x_num)[ncol(x_num)] = colnames(x)[j]
    }
  }
  x = x_num # covariate matrix
  rm(x_num)
  y = dat[,depi]  # dependant variable vector

  beta0 = coefficients(bm)
  names(beta0)[names(beta0)=="(Intercept)"] = "Intercept"

  mm = match(names(beta0),colnames(x))
  x = x[,mm]

  beta0 = beta0[match(colnames(x),names(beta0))]

  return(list(beta0=beta0, x=x, y=y))

}






#' old popualtion data import function
#'
#' @param path path to data
#' @param c_country names of countries
#' @param adm admin level
#'
#'
#' @return population by age and year
#' @export
import_pop_data_LS2014 = function(path, c_country, adm) {

  pop_year_age = NULL

  for(adm0 in c_country){
    tmp = read.csv(paste0(path,"Population", "/from_LandScan2014/", adm, "/pop_year_age_",
                          adm, "_", adm0, "_LandScan2014.csv"),
                   header=TRUE)
    pop_year_age = rbind(pop_year_age, tmp)
  }

  pop_year_age[,adm] = paste(pop_year_age[,"adm0"], pop_year_age[,adm], sep="_")
  colnames(pop_year_age)[colnames(pop_year_age)==adm] = paste("adm0", adm, sep = "_")

  return(pop_year_age)
}





#' population data import function
#'
#' @param path path to data
#' @param c_country names of countries
#' @param adm admin level
#'
#'
#' @return population by age and year
#' @export
import_pop_data_LS2017 = function(path, c_country, adm) {

  pop_year_age = read.csv(paste0(path,"Population/", "from_Landscan2014/",
                                 "population_by_adm1_year_age_1950-2100_LandScan2014_gadm2_countries.csv"),
                          header=TRUE)

  pop_year_age = pop_year_age[pop_year_age$adm0 %in% c_country,]

  pop_year_age[,adm] = paste(pop_year_age[,"adm0"], pop_year_age[,adm], sep="_")
  colnames(pop_year_age)[colnames(pop_year_age)==adm] = paste("adm0", adm, sep = "_")

  return(pop_year_age)
}





#' internal fucntion to add early years to population data
#'
#' @param pop2d 2-dimensional population data
#'
#'
#' @return 2-dimensional population data with early years
#' @export
#'
add_1940_1950 = function(pop2d){
  if(min(pop2d$year) != 1950)  stop("pop2d should start in 1950")

  for(y in 1949:1940) {
    pop1_early = pop2d[pop2d$year==y+1,]
    pop1_early$year = y
    pop2d = rbind(pop1_early,pop2d)
  }

  return(pop2d)
}






#' function to reorganise population data
#'
#' @param pop2d 2-dimensional population data
#' @param admin level
#'
#'
#' @return pop3d, P_tot_2d, p_prop_3d ie. population as an array,
#' the total population in a matrix and the population proportion in an array
Make_pop3d_Ptot2d_Pprop3d = function(pop2d, adm){

  adm0_adm = paste0("adm0_", adm)

  dim_adm  = unique(pop2d[,adm0_adm])
  dim_year = as.numeric(unique(pop2d$year))
  dim_age  = names(pop2d)[4:length(names(pop2d))]

  length_adm=length(dim_adm);   length_year=length(dim_year);   length_age=length(dim_age)

  #pop3d =3d array- dim [adm, year, age]
  pop3d=rep(NA, nrow(pop2d)*length_age)
  dim(pop3d)=c(length_adm, length_year, length_age)

  for(ageIndex in 1:length_age) {
    for(yearIndex in min(dim_year):max(dim_year)) {
      mm = match( pop2d[,adm0_adm][pop2d$year == yearIndex], dim_adm )
      pop3d[mm, yearIndex-min(dim_year)+1, ageIndex] = pop2d[pop2d$year == yearIndex, ageIndex+3]
    }
  }

  #P_tot2d = sum(pop3d(adm,year))
  P_tot_2d = rep(NA, length_adm*length_year);   dim(P_tot_2d)=c(length_adm,length_year)

  #p_prop_3d = pop3d/P_tot_2d
  p_prop_3d=rep(NA, length_adm*length_year*length_age);   dim(p_prop_3d)=dim(pop3d)

  for(admIndex in 1:length_adm){
    for (yearIndex in 1:length_year){

      P_tot_2d[admIndex, yearIndex] = sum(pop3d[admIndex, yearIndex,], na.rm=TRUE)

      for (ageIndex in 1:length_age){
        p_prop_3d[admIndex,yearIndex,ageIndex] = pop3d[admIndex, yearIndex, ageIndex]/P_tot_2d[admIndex, yearIndex]
      }
    }
  }

  #names
  dimnames(p_prop_3d)[[1]] = dimnames(pop3d)[[1]] = rownames(P_tot_2d) = dim_adm
  dimnames(p_prop_3d)[[2]] = dimnames(pop3d)[[2]] = colnames(P_tot_2d) = dim_year
  dimnames(p_prop_3d)[[3]] = dimnames(pop3d)[[3]] = dim_age

  return(list(pop3d = pop3d, P_tot_2d = P_tot_2d, p_prop_3d = p_prop_3d))

}


#' function to import and get pop data in array
#'
#' @param path path to data
#' @param c_country list of countries
#' @param dat environmental and occurrence data
#'
#'
#' @export
#' @return pop1, pop3d, P_tot_2d, p_prop_3d
get_pop_data_3d = function(path, c_country, dat) {

  adm="adm1"
  adm0_adm = paste0("adm0_", adm)

  pop1 = import_pop_data_LS2017(path, c_country, adm)

  #restrict to lines in dat THIS IS CHANGED FOR 2017!
  pop1 = pop1[pop1[,adm0_adm] %in% dat[,adm0_adm],]

  #make sure data starts from 1940
  pop1 = add_1940_1950(pop1)

  out = Make_pop3d_Ptot2d_Pprop3d(pop1, adm)

  return(list(pop1 = pop1, pop3d = out$pop3d, P_tot_2d = out$P_tot_2d, p_prop_3d = out$p_prop_3d))
}



#' get vaccine data into an array
#'
#' @param vc2d 2-dimensional vaccination data
#' @param adm defunct
#'
#'
#' @return vc3d
#' @export
transform_into_vc3d = function(vc2d, adm = NA){

  vc_tmp = tidyr::gather(vc2d, age, coverage, -c(year, adm0_adm1))

  vc3d = array(data = vc_tmp$coverage,
               dim=c(length(unique(vc_tmp$adm0_adm1)),
                     length(unique(vc_tmp$year)),
                     length(unique(vc_tmp$age))),
               dimnames=list(unique(vc_tmp$adm0_adm1),
                             unique(vc_tmp$year),
                             unique(vc_tmp$age))
  )

  return(vc3d)
}



#' calculate the first instance of vaccine coverage
#'
#' @param vc3d vaccination data in array form
#'
#'
#'
#' @return inc_v3d
#' @export
calc_incidence_vac_general = function(vc3d){

  inc_v3d = vc3d*NA

  for (admIndex in 1:dim(vc3d)[1] ) { #

    inc_v3d[admIndex,1,]= 0 # no vaccination in 1940

    for(yearIndex in 2:(dim(vc3d)[2]-1)){

      inc_v3d[admIndex,yearIndex,1] = vc3d[admIndex, yearIndex+1,1] # for a0, incidence = coverage in the a1 age group the next year

      for(age in 2:(dim(vc3d)[3]-1)) {

        if(!is.na(vc3d[admIndex, yearIndex, age]) & vc3d[admIndex, yearIndex, age]==1){ #if vc is already =1, incidence =0

          inc_v3d[admIndex,yearIndex,age]=0

        } else {

          inc_v3d[admIndex, yearIndex, age] =  ( vc3d[admIndex, yearIndex+1, age+1] - vc3d[admIndex, yearIndex, age] ) /
            (1 - vc3d[admIndex,yearIndex,age] ) # among those susceptible at y-1
          inc_v3d[admIndex, yearIndex, age] = ifelse(inc_v3d[admIndex,yearIndex,age]<10e-15,
                                                     0, inc_v3d[admIndex,yearIndex,age])
        }# with rounding, some values become negative
      }
      inc_v3d[admIndex, yearIndex, dim(vc3d)[3]]=0 # incidence vaccination = 0 for age=100
    }

    inc_v3d[admIndex,dim(vc3d)[2],]  = 0# incidence vaccination = 0 for year=2050
    inc_v3d[admIndex,dim(vc3d)[2],1] = vc3d[admIndex,dim(vc3d)[2],1] # except among new borns
  }

  return(inc_v3d)
}



#' get time of first vaccination
#'
#' @param vc3d vaccination data in array form
#'
#'
#' @return t0_vac_africa
#' @export
calc_t0_vac_africa = function(vc3d){

  t0_vac_africa = rep(NA, dim(vc3d)[1])

  for (i in 1:dim(vc3d)[1]){

    year_i  = 1940
    sum_vac = 0

    while(sum_vac == 0 & year_i <= 2050) {

      sum_vac = sum( vc3d[i, year_i-1940+1,], na.rm=TRUE)
      year_i = year_i + 1

    }

    t0_vac_africa[i]=year_i-2

  }

  names(t0_vac_africa) = dimnames(vc3d)[[1]]

  return(t0_vac_africa)
}




#' calculate population moments
#'
#' @param p_prop proportion of population
#' @param t0_vac_ intial vaccination time in area of interest
#' @param dim_adm admins of interest
#' @param dim_year years of interest
#' @param dim_age ages of interest
#'
#' @return pop_moments_whole
#' @export
calc_pop_moments = function(p_prop,
                            t0_vac_,
                            dim_adm,
                            dim_year,
                            dim_age) {

  ageVec=c(0:100)

  minYear=1983 #min year of data
  maxYear=2019 #last year of data
  print("Remember, last year of data is hardcoded into calc_pop_moments")

  maxDuration=maxYear-minYear + 1 #duration of data for second for loop

  length_adm=length(dim_adm)

  pop_moments_whole = rep(NA, length_adm*6)
  dim(pop_moments_whole)=c(length_adm,6)

  for (admIndex in 1:length_adm){
    if (t0_vac_[admIndex]<maxYear){

      vec = which(dim_year==1940):max(which(dim_year==t0_vac_[admIndex]), 1)

      pop_moments_whole_tmp = rep(NA, length(vec)*6)
      dim(pop_moments_whole_tmp) = c(length(vec), 6)

      for(yearIndex in vec){
        for(i in 1:6){
          pop_moments_whole_tmp[yearIndex-min(vec)+1, i] = sum(p_prop[admIndex, yearIndex,]*ageVec^(i-1),
                                                               na.rm = TRUE)
        }
      }

    } else if (t0_vac_[admIndex]>=maxYear){

      pop_moments_whole_tmp = rep(NA, maxDuration*6)
      dim(pop_moments_whole_tmp) = c(maxDuration, 6)
      vec = which(dim_year==minYear):which(dim_year==maxYear)

      for(yearIndex in vec){
        for(i in 1:6){
          pop_moments_whole_tmp[yearIndex-min(vec)+1, i] = sum(p_prop[admIndex, yearIndex,]*ageVec^(i-1),
                                                               na.rm = TRUE)
        }
      }
    }
    pop_moments_whole[admIndex,] = apply(pop_moments_whole_tmp,2,mean)
  }
  return(pop_moments_whole)
}




#' calculate the population moments for aggregated data
#'
#' @param pop_agg3d population aggregated in array
#' @param t0_vac_ initial vaccination time for area of interest
#' @param dim_year years of interest
#' @param study_years years of serological studies
#'
#'
#' @return pop_moments_agg
#' @export
calc_pop_moments_agg = function(pop_agg3d,
                                t0_vac_,
                                dim_year,
                                study_years) {

  ageVec=c(0:100)# a matrix of age with the same dimension that pop_agg for 1 fixed year

  minYear=1983 #min year of data
  maxYear=2019 #last year of data
  print("Remember, last year of data is hardcoded into calc_pop_moments_agg")

  maxDuration=maxYear-minYear + 1 #duration of data for second for loop
  n_serosurveys=length(study_years)

  pop_moments_agg = rep(NA, n_serosurveys*6)
  dim(pop_moments_agg)=c(n_serosurveys,6)

  for (index_survey in 1:n_serosurveys){

    if (t0_vac_[index_survey]<study_years[[index_survey]]){

      vec = which(dim_year==1940):which(dim_year==t0_vac_[index_survey])
      pop_moments_agg_tmp = rep(NA, length(vec)*6)
      dim(pop_moments_agg_tmp) = c(length(vec), 6)

      for(year in vec){
        p_prop_agg = pop_agg3d[index_survey, year,]/sum(pop_agg3d[index_survey, year,],
                                                        na.rm = TRUE)

        for(i in 1:6){
          pop_moments_agg_tmp[year, i] = sum(p_prop_agg*ageVec^(i-1),
                                             na.rm=TRUE)
        }
      }

    } else if (t0_vac_[index_survey]>=maxYear){

      vec = which(dim_year==minYear):which(dim_year==maxYear)
      pop_moments_agg_tmp = rep(NA, length(vec)*6)
      dim(pop_moments_agg_tmp) = c(length(vec), 6)

      for(year in vec){
        p_prop_agg = pop_agg3d[index_survey, year,]/sum(pop_agg3d[index_survey, year,],
                                                        na.rm = TRUE)
        for(i in 1:6){
          pop_moments_agg_tmp[year-min(vec)+1, i] = sum(p_prop_agg*ageVec^(i-1),
                                                        na.rm = TRUE)
        }
      }
    }
    pop_moments_agg[index_survey,] = apply(pop_moments_agg_tmp,2,mean)
  }
  return(pop_moments_agg)
}





#' aggregating population and vaccination data for serological surveys
#'
#' @param pop1 population data
#' @param vc2d vaccine coverage in 2-d
#' @param sero_studies list of serological studies
#' @param adm1s list of admins for each survey
#'
#'
#' @return pop_agg, vc_agg
#' @export
Make_aggregate_pop_vc = function(pop1,
                                 vc2d,
                                 sero_studies,
                                 adm1s) {

  no_sero_surveys=length(sero_studies)

  vc_tmp = pop_tmp = pop_agg = vc_agg = NULL

  for(yearIndex in 1940:max(pop1$year)) { # if needed replace 2050 by max(unlist(study_years))
    for(surveyIndex in 1:no_sero_surveys) {

      pop_tmp = pop1 %>% filter(year == yearIndex &
                                  adm0_adm1 %in% adm1s[[surveyIndex]])

      vc_tmp = vc2d %>% filter(year == yearIndex &
                                 adm0_adm1 %in% adm1s[[surveyIndex]])

      if (nrow(pop_tmp)==1) {

        pop_tmp %<>% mutate(adm0 = sero_studies[surveyIndex])
        vc_tmp %<>% mutate(adm0 = sero_studies[surveyIndex])

      } else { # need to do some summing/averaging...

        if(anyNA(vc_tmp)){
          pop_tmp[is.na(vc_tmp)] = NA
        }



        vc_tmp2 = colSums(select(pop_tmp, -c(year, adm0_adm1)) * select(vc_tmp, -c(year, adm0_adm1)),na.rm=TRUE)/
          colSums(select(pop_tmp, -c(year, adm0_adm1)),na.rm=TRUE)


        names(vc_tmp2) = paste0("a",0:100)

        vc_tmp = data.frame(adm0 = sero_studies[surveyIndex],
                            adm0_adm1 = NA,
                            year = yearIndex,
                            t(vc_tmp2) )

        pop_tmp = colSums(select(pop_tmp, -c(year, adm0_adm1)))

        names(pop_tmp) = paste0("a",0:100)

        pop_tmp = data.frame(adm0 = sero_studies[surveyIndex],
                             adm0_adm1 = NA,
                             year = yearIndex,
                             t(pop_tmp))

      }
      pop_agg = rbind(pop_agg,pop_tmp)
      vc_agg = rbind(vc_agg,vc_tmp)

    }
  }

  pop_agg[,-(1:3)][is.na(pop_agg[,-(1:3)])] = 0
  vc_agg[,-(1:3)][is.na(vc_agg[,-(1:3)])] = 0

  return(list(pop_agg = pop_agg, vc_agg = vc_agg))
}






#' Aggregate the population and vaccination data into an array
#'
#' @param pop1 population data
#' @param vc2d vaccine coverage in 2-d
#' @param sero_studies list of serological studies
#' @param adm1s list of admins for each survey
#'
#'
#' @return pop_agg3d, vc_agg3d
#' @export
Make_aggregate_pop_vc_3d = function(pop1,
                                    vc2d,
                                    sero_studies,
                                    adm1s){

  agg=Make_aggregate_pop_vc(pop1, vc2d, sero_studies, adm1s)

  pop_agg=agg$pop_agg
  vc_agg=agg$vc_agg

  dim_survey = sero_studies
  dim_year = as.numeric(names(table(pop_agg$year)))
  dim_age = names(pop1 %>% select(-c(adm0_adm1, year)))

  length_survey=length(dim_survey);   length_year=length(dim_year);   length_age=length(dim_age)

  ## pass in 3d
  pop_agg3d=rep(NA, nrow(pop_agg)*length_age)
  dim(pop_agg3d)=c(length_survey, length_year, length_age)
  vc_agg3d=rep(NA, nrow(vc_agg)*length_age)
  dim(vc_agg3d)=c(length_survey, length_year,length_age)

  for(ageIndex in 1:length_age) { # dim_age
    for(yearIndex in min(dim_year):max(dim_year)) {

      mm = match(pop_agg$adm0[pop_agg$year == yearIndex], dim_survey)
      pop_agg3d[mm,yearIndex-min(dim_year)+1,ageIndex] = pop_agg[pop_agg$year==yearIndex, ageIndex+3]
      vc_agg3d[mm,yearIndex-min(dim_year)+1,ageIndex] = vc_agg[vc_agg$year==yearIndex, ageIndex+3]
    }
  }

  dimnames(pop_agg3d)[[1]] = dimnames(vc_agg3d)[[1]] = dim_survey
  dimnames(pop_agg3d)[[2]] = dimnames(vc_agg3d)[[2]] = dim_year
  dimnames(pop_agg3d)[[3]] = dimnames(vc_agg3d)[[3]] = dim_age

  return(list(pop_agg3d = pop_agg3d, vc_agg3d = vc_agg3d))

}





#'get the population at the survey times and places
#'
#'@param pop_agg3d aggregated population
#'@param dim_survey surveys of interest
#'@param dim_year years of interest
#'
#'
#'@return p_at_survey_3d, P_tot_survey_2d
#'@export
create_pop_at_survey = function(pop_agg3d, dim_survey, dim_year) {

  # Create p_at_survey and P_tot, population structure and number, aggregated at survey provinces
  p_at_survey_3d=rep(NA, dim(pop_agg3d)[1]*dim(pop_agg3d)[2]*dim(pop_agg3d)[3])
  dim(p_at_survey_3d)=dim(pop_agg3d)
  dimnames(p_at_survey_3d) =dimnames(pop_agg3d)

  P_tot_survey_2d=  matrix(rep(NA, dim(pop_agg3d)[1]*dim(pop_agg3d)[2]), nrow=length(dim_survey))
  rownames(P_tot_survey_2d) = dim_survey; colnames(P_tot_survey_2d) = dim_year

  for (i in 1:length(dim_survey)){
    for (Y in  1:length(dim_year) ){
      P_tot_survey_2d[i,Y] = sum(pop_agg3d[i,Y,],
                                 na.rm=TRUE)
      p_at_survey_3d[i,Y,] = pop_agg3d[i,Y,]/ P_tot_survey_2d[i,Y]
    }
  }

  return(list(p_at_survey_3d = p_at_survey_3d, P_tot_survey_2d = P_tot_survey_2d))
}




#' get aggregated population and vaccination across observation period
#'
#' @param pop1 population data
#' @param vc2d vaccine coverage data
#'
#' @export
#' @return pop30_agg, pop_vc_moments and vc30_agg, population and vaccination aggregated over observation period
create_pop30_agg_vc30_agg = function(pop1, vc2d){
  #print("Remember, final year of data is hardcoded into create_pop30_agg_vc30_agg")

  ### subset by year ###
  pop30 = dplyr::filter(pop1, year>1983 & year<2019);  pop30 = pop30[order(pop30$adm0_adm1),]
  vc30 = dplyr::filter(vc2d, year>1983 & year<2019)

  pop30[is.na(pop30) | is.na(vc30)] = 0
  vc30[is.na(vc30)] = 0

  ### aggregate first age group ###
  pop30_agg = aggregate(pop30[,"a0"],
                        by=list(adm0_adm1=pop30$adm0_adm1),
                        sum) # a0 is the 4th column,

  vc30_agg = aggregate(vc30[,"a0"]*pop30[,"a0"],
                       by=list(adm0_adm1=vc30$adm0_adm1),
                       sum) # number of vaccinated people by age, aggregated over 30y

  vc30_agg$x = vc30_agg$x/pop30_agg$x

  names(pop30_agg)[2] = names(vc30_agg)[2] = names(pop30)["a0"]

  ### then other age groups ###
  for(i in grep("a1", names(pop30))[1]:ncol(pop30)) {

    pop30_agg = cbind(pop30_agg, aggregate(pop30[,i],
                                           by=list(adm0_adm1=pop30$adm0_adm1),
                                           sum)$x)

    vc30_agg = cbind(vc30_agg,
                     aggregate(vc30[,i]*pop30[,i],
                               by=list(adm0_adm1=vc30$adm0_adm1),
                               sum)$x/
                                      aggregate(pop30[,i],
                                                by=list(adm0_adm1=pop30$adm0_adm1),
                                                sum)$x)

    names(pop30_agg)[ncol(pop30_agg)] = names(vc30_agg)[ncol(vc30_agg)] = names(pop30)[i]
  }

  vc30_agg[is.na(vc30_agg)] = 0


  amat = matrix(0:100, nrow=nrow(pop30_agg), ncol=101, byrow=TRUE)
  pop_vc_moments = data.frame(adm0_adm1 = pop30_agg$adm0_adm1,
                              m0 = rowSums(pop30_agg[,-1]*(1-vc30_agg[,-1]),
                                           na.rm=TRUE),
                              m1 = rowSums(pop30_agg[,-1]*(1-vc30_agg[,-1])*amat,
                                           na.rm=TRUE),
                              m2 = rowSums(pop30_agg[,-1]*(1-vc30_agg[,-1])*amat*amat,
                                           na.rm=TRUE),
                              m3 = rowSums(pop30_agg[,-1]*(1-vc30_agg[,-1])*amat*amat*amat,
                                           na.rm=TRUE),
                              m4 = rowSums(pop30_agg[,-1]*(1-vc30_agg[,-1])*amat*amat*amat*amat,
                                           na.rm=TRUE),
                              m5 = rowSums(pop30_agg[,-1]*(1-vc30_agg[,-1])*amat*amat*amat*amat*amat,
                                           na.rm=TRUE))

  return(list(pop30_agg=pop30_agg, vc30_agg=vc30_agg, pop_vc_moments=pop_vc_moments))
}



#' extract key information from serological data
#'
#' @param Serology dataframe of serological data
#' @param adm which gamd- defaults to "gadm2"
#'
#' @return survey_dat, adm1s, sero_studies, study_years, vc_factor, t0_vac, no_sero_studies
#' @export
process_serology = function(Serology, adm = "gadm2"){

  #extracting values as the file is read as a tibble
  sero_studies =  unique(Serology$country_zone)                                      #serology locations
  study_years = unique(Serology[,c("country_zone","year") ] )[ ,2]    #years each survey occurred
  no_sero_surveys = length(sero_studies)                                             #Number of sero surveys in database

  vc_factor = unique(Serology[,c("country_zone","vc_factor") ] )[ ,2]  #whether to account for vaccination in serology or not
  t0_vac = unique(Serology[,c("country_zone","t0_vac") ] )[ ,2]         #first incidence of vaccination, pulled from tibble
  names(t0_vac)=unique(Serology[,c("country_zone","t0_vac") ] )[ ,1]

  adm1s_tibble = unique(Serology[ , c("country_zone", adm)])[ ,2]                   #shape file admin locations in tibble form
  adm1s=list()
  for (surveyIndex in 1:no_sero_surveys) {
    adm1s[surveyIndex] = strsplit(as.character(adm1s_tibble[surveyIndex]),",")
  }

  survey_dat=list()
  for (surveyIndex in 1:no_sero_surveys) {
    survey_dat[[surveyIndex]] = filter(Serology,
                                       country_zone==sero_studies[surveyIndex])[,c("samples","positives","age_min", "age_max")]
  }

  return(list(survey_dat = survey_dat,
              adm1s = adm1s,
              sero_studies = sero_studies,
              study_years = study_years,
              vc_factor = vc_factor,
              t0_vac = t0_vac,
              no_sero_surveys = no_sero_surveys))
}
mrc-ide/YFestimation documentation built on March 13, 2021, 1:40 p.m.