R/functions_master_thesis.R

Defines functions my_setup my_EpiEstim_stan my_stan_bsts my_F my_centrality my_RPS my_DSS my_infectiousness my_next_expected_incidence my_infection_overview my_plot_r_pred my_plot_two_histograms my_evolution_delta my_evolution_R my_extract my_pred_vs_true_inc_plot my_iterative_fit my_iter_pred_vs_true my_load_conflict_data my_make_weekly

Documented in my_setup

#' setup work environment
#' 
#' not needed later on. 
#' 
#' @return nothing
#' @examples
#' \notrun{
#' inc <- my_load_data()
#' }
#' 
#' 

my_setup <- function(){
	options(max.print = 2000)
	par(family = "Serif")
	options(width=as.integer(160))
	options(max.print = 36000)
	#library(tidyverse)
	library(ggplot2)
	theme_set(theme_get() + theme(text = element_text(family = 'Serif')))

	library(rstan)
	#options(mc.cores = parallel::detectCores())
	options(mc.cores = 4)
	rstan_options(auto_write = TRUE)
	library(extraDistr)
	#library(EpiEstim)
	library(shinystan)
	library(bsts)
	library(matrixStats)
}


# ================================================= # 
# ================================================= # 

# ================================================= # 
# ================================================= # 

my_EpiEstim_stan <- function(past_incidences){	
	inc <- past_incidences
	t <- length(inc)
	l <- list(t = t, past_incidences = inc, tau = 7)

	stanfit2 <- rstan::stan(file = "../stan/estimate_R_EpiEstim_rebuild.stan",
	                        data = l,
	                        iter = 4000, warmup = 800, thin = 1, control = list(adapt_delta = 0.97))

	s1 <- summary(stanfit2)$summary %>%
		as.data.frame() %>%
		rownames_to_column("var") %>%
		filter(grepl("^R", var)) %>%
		select(-var) %>%
		mutate(estimate="stan") %>%
		select(c(1,4,8,11)) %>%
		mutate(id=1:n())

	s1 <- s1[17:nrow(s1),] %>%
		mutate(id=1:n())
	colnames(s1) <- c("mean", "low", "high", "estimate", "id")

	result <- list(R = s1, stanfit = stanfit2)	
}

# ================================================= # 
# ================================================= # 

my_stan_bsts <- function(past_r, n_pred = 10){
	t <- length(past_r)
	l <- list(t = t, past_r = past_r, n_pred = n_pred)

	stanfit2 <- rstan::stan(file = "../stan/bayesian_structural_time_series_model_r.stan" ,
	                        data = l,
	                        iter = 4000, warmup = 800, thin = 1, control = list(adapt_delta = 0.97))

	sum <- summary(stanfit2)$summary
	sum <- sum %>% as.data.frame(rownames(sum)) %>% mutate(var = rownames(sum)) 

	params <- sum %>% filter(sum$var %in% c("sigma_epsilon", "sigma_eta", "phi", "D"))
	rownames(params) <- params$var

	predicted <- sum %>% filter(grepl("^r_pred", var))
	rownames(predicted) <- predicted$var

	res <- list(params = params, predicted = predicted, stanfit = stanfit2)

	r <- res$predicted
	r <- r[,c(1,4,8)]
	colnames(r) <- c("mean", "low", "high")


	ggplot(r, aes(x = 1:n_pred, y = mean, ymin = low, ymax = high)) + geom_line() + geom_ribbon(alpha = 0.5)

	return(res)
}


my_F <- function(predictions, k){
	return(sum(predictions <= k) / length(predictions))
}



# ================================================= # 
# ================================================= # 

my_centrality <- function(u){
	sum(u > 0.25 & u < 0.75)/length(u) - 0.5
}


# ================================================= # 
# ================================================= # 
## ranked probability score

my_RPS <- function(true_values, samples){
	t <- length(true_values)
	rps <- numeric(t)	
	for (j in 1:t){
		m <- max(samples[,j])
		#i <- 1:m
		#rps[t] <- 0
		for (i in 1:m){
			rps[j] <- rps[j] + (my_F(samples[,j], i) - (i >= true_values[j]))^2 
		}
	}
	return(rps)
}

# ================================================= # 
# ================================================= # 
## David-Sebastiani-Score

my_DSS <- function(true_values, samples){
	t <- length(true_values)
	dss <- numeric(t)	
	for (j in 1:t){
		mu_sample <- mean(samples[, j])
		sd_sample <- sd(samples[, j]) 

		dss[j] <- ((true_values[j] - mu_sample) / sd_sample)^2 + 2 * log(sd_sample) 
	}
	return(dss)
}

# ================================================= # 
# ================================================= # 
## Function to get the infectiousness. Already 
## implemented in stan, but I want to double
## check the results

my_infectiousness <- function(past_incidences, n_pred = 0){
	## inputs: past_incidences
	## number of timesteps to forecast n_pred
	
	t <- length(past_incidences)

	w <- rep(0, times = t + n_pred)
	for (i in 1:(t + n_pred)){
	    if (i > 40){
	      w[i] = 0;
	    } else {
	      w[i] = pgamma(i + 0.5, 2.706556, 0.1768991) - pgamma(i  - 0.5, 2.706556, 0.1768991); 
	    }
	}

	infectiousness <- rep(0.000001, times = t)
    for (s in 2:t){
    infectiousness[s] = 0;
    for (i in 1:(s - 1)){
      infectiousness[s] = infectiousness[s] + past_incidences[i] * w[s - i];
    }
    #infectiousness[1:10] <- 1

    infectiousness_weekly <- rep(0, times = t/7)
    for (i in 1:length(infectiousness_weekly)){
    	infectiousness_weekly[i] <- sum(infectiousness[(7*(i-1)+1):(7*i)])
    }

    infectiousness_pred = 0
    for (i in 1:(t)){
      infectiousness_pred = infectiousness_pred + past_incidences[i] * w[t + 1 - i]
    }
    
  }
  l <- list(weights = w, infectiousness = infectiousness, infectiousness_one_ahead = infectiousness_pred, infectiousness_weekly = infectiousness_weekly)
  return(l)

  ## output: 
  ## weights (cut after 40 periods)
  ## calculated infectiousness
  ## one step ahead calculation of infectiousness
}


# ================================================= # 
# ================================================= # 
## get next expected incidence
## my_infectiousness(inc) = my_next_expected_incidence(inc[-length(inc)])
## now also implemented in my_infectiousness)()


my_next_expected_incidence <- function(past_incidences){
	t <- length(past_incidences) 
	# note that we have t instead of t-1 here as in the master thesis
	adjusted_affected <- 0	
	for (s in 1:t){
		## implementation for continuous numbers
		# weight <- ddgamma(t + 1 - s, 2.706556, 0.1768991) 

		## implementation for integers
		## not that here there is no cut-off value that sets every weight for periods > 40 to 0. 
		weight <- pgamma(t + 1 - s + 0.5, 2.706556, 0.1768991) - pgamma(t + 1 - s  - 0.5, 2.706556, 0.1768991)

		adjusted_affected <- adjusted_affected + past_incidences[s] * weight
	}	
	return(adjusted_affected)
}


# ================================================= # 
# ================================================= # 
my_infection_overview <- function(past_incidences){
	tmp <- my_infectiousness(past_incidences)

	data.frame(incidences = past_incidences, 
			   infectiousness = tmp$infectiousness,
			   #weights_rel = rev(tmp$weights)
			   weights = tmp$weights
			   )

	## output weights: outputs the weights for the 
	## corresponding number of periods. w[1] = 1 period
	## away. The weights do not at all correspond to the 
	## columns next to it! 

	## weights_rel: weights relative to the one after the last period
}


# ================================================= # 
# ================================================= # 
## unsure what this does. delete?
my_plot_r_pred <- function(predicted){
  mean_R <- rowMeans(predicted)
  quantiles <- rowQuantiles(predicted, probs=c(0.05, 0.95))
  days <- 1:nrow(predicted)
  q <- ggplot() +  geom_line(aes(x=days, y=mean_R)) +
    geom_ribbon(aes(x=days, ymin=quantiles[,1],
                    ymax=quantiles[,2]),alpha=0.3)

}

# ================================================= # 
# ================================================= # 
## plot 2 histograms. useful for comparing prior and posterior
my_plot_two_histograms <- function(vector1, vector2, 
								   breaks = 100, 
								   upper_limit = NULL){
	if(!is.null(upper_limit)){
		vector1 <- vector1[vector1 < upper_limit]
		vector2 <- vector2[vector2 < upper_limit]
	}

	## set breakpoints and define minimum 
	## breakpoint a and maximum breakpoint b
	a <- min(c(vector1, vector2)) 
	b <- max(c(vector1, vector2)) 

	## define axis
	ax <- pretty(a:b, n = breaks)

	while(min(ax) > a | max(ax) < b){
		if (min(ax) > a){
		a <- a - (ax[2] - ax[1])
		}
		if (max(ax) < b){
		b <- b + (ax[2] - ax[1])
		}
		ax <- pretty(a:b, n = breaks)
	}

	

	## make histogram A and B
	plot1 <- hist(vector1, breaks = ax, plot = FALSE)
	plot1$density = plot1$counts/sum(plot1$counts)
	plot2 <- hist(vector2, breaks = ax, plot = FALSE)
	plot2$density = plot2$counts/sum(plot2$counts)

	## set correct font
	par(family = "Serif")

	## define two colors
	col1 <- rgb(168,209,225,max = 255, alpha = 75)
	col2 <- rgb(248,183,193, max = 255, alpha = 75)

	## plot and add 2nd plot to first
	plot(plot1, col = col1, xlab = "vec1 is blue, vec2 is pink", xlim = c(a, b)) 
	plot(plot2, col = col2, add = TRUE) 
}


# ================================================= # 
# ================================================= # 
## diagnostic functions to visualize the evolution of
## delta and R under different circumstances

my_evolution_delta <- function(delta0 = 0, D = -0.02, phi = 0, n = 100, random = F, sigma = 0.5){
	deltas = rep(0, n)
	for (i in 2:n){
		deltas[i] <- D + phi * (deltas[i-1] - D)
		if (random){
			deltas[i] <- deltas[i] + rnorm(1, 0, sigma)
		}
	}
	return(deltas)
}


my_evolution_R <- function(n = 100, sigma_epsilon = 0.5, ...){
	R <- rep(0, n)
	R[1] <- 1
	delta <- my_evolution_delta(n = n, ...) 
	for (i in 2:n){
		mean <- R[i-1] + delta[i]
		while (R[i] <= 0){
			R[i] <- rnorm(1, mean, sigma_epsilon)	
		}
		
	}
	return(R)
}

## results: 
## the phi determines how quickly the thing will revert to a constant
## trend. See this paper: http://www.unofficialgoogledatascience.com/2017/07/fitting-bayesian-structural-time-series.html



# ================================================= # 
# ================================================= # 
## functions to extract and to plot the posterior predictive
## values against the values that were actually observed
my_extract <- function(stanfitobject, var = "Inc_post"){
	tmp <- extract(stanfitobject)
	tmp <- getElement(tmp, var)
	apply(tmp, MARGIN = 2, FUN = mean)
}


my_pred_vs_true_inc_plot <- function(y_true, 
									 y_pred, 
									 vert_lines = NULL){
	ymin <- min(c(y_true, y_pred))
	ymax <- max(c(y_true, y_pred))

	plot(y_true, type = "l", col = "grey", family = "Serif", ylim = c(ymin, ymax))
	lines(y_pred, col = "red", lwd = 3)

	if (!is.null(vert_lines) && vert_lines > 0){
		abline(v = vert_lines, col = "blue", lty = 2)
	}
}


# ================================================= # 
# ================================================= # 
## fit the stan model iteratively 

my_iterative_fit <- function(past_incidences, 
								  n_pred = 14, 
								  interval = 0,
								  start_period = 30,
								  tau = 7,
								  stanfile = "../stan/combined_EpiEstim_bsts_only_sigma_eta.stan"){
	## inputs: 
	## vector past incidences
	## number of periods to forecast into the future
	## interval between predictions
	time <- Sys.time()
	if (interval == 0) interval <- n_pred

	total_n <- (length(past_incidences))
	current_n <- start_period

	## calculate how many fits you need. 
	runs <- ceiling((total_n - start_period) / interval)

	# predictions <- numeric(0)
	res <- list()

	i <- 0
	while (current_n < total_n){
		print("run ", as.character(i), "of ", as.character(runs))
		index <- 1:current_n
		if ((length(index)) > total_n) {index <- i:total_n}

		inc <- past_incidences[index]
		T <- length(inc)
		l <- list(T = T, past_incidences = inc, tau = tau, n_pred = n_pred)
		stanfit <- rstan::stan(file = stanfile,
                        data = l,
                        iter = 2000, warmup = 1000, thin = 1, control = list(adapt_delta = 0.99))	

		i <- i + 1
		res[[i]] <- stanfit
		current_n <- start_period + i * interval
	}

	print(time - Sys.time())
	return(res)
}


# ================================================= # 
# ================================================= # 
## use iterative fits of the stan model to plot
## predicted vs. actual cases of Ebola

my_iter_pred_vs_true <- function(inc, 
								 n_pred = 14, 
								 interval = 0,
								 start_period = 30,
								 tau = 7,
								 stanfile = "../stan/combined_EpiEstim_bsts_only_sigma_eta.stan"){

	if (interval == 0) interval <- n_pred

	l <- my_iterative_fit(past_incidences = inc,
						  n_pred = n_pred, 
						  interval = interval,
						  start_period = start_period,
						  tau = tau,
						  stanfile = stanfile)

	n_total <- length(inc) 
	predictions <- lapply(l, my_extract, var = "I_pred")
	predictions <- unlist(predictions, use.names = FALSE)

	if ((n_total - start_period) > interval){
		vert_lines = seq(interval, n_total - start_period, interval)
	} else {
		vert_lines = -1
	}
	
	try(my_pred_vs_true_inc_plot(y_true = inc[(start_period + 1):n_total], 
								 y_pred = predictions, 
								 vert_lines = vert_lines))

	return(list(predictions = predictions, stanfitobjects = l))
}


# ================================================= # 
# ================================================= # 

my_load_conflict_data <- function(){
	conflicts <- read.csv("../data/2018-08-03-2020-01-19-Democratic_Republic_of_Congo.csv", stringsAsFactors = F)

	cols_to_keep <- c("event_date", 
					  "event_type", 
					  "sub_event_type", 
					  "admin1", 
					  "admin2")

	conflicts <- conflicts[, colnames(conflicts) %in% cols_to_keep]
	
	Sys.setlocale("LC_TIME", "C")
	conflicts$event_date <- as.Date(conflicts$event_date, format = "%d %B %Y")

	nkivu <- conflicts[conflicts$admin1 == "Nord-Kivu", ]

	nkivu$counts <- 1
	confl_inc <- aggregate(counts ~ event_date, data=nkivu, FUN="sum")

	return(confl_inc)
}


# ================================================= # 
# ================================================= # 
## aggregate data by week

my_make_weekly <- function(vector){
	t = length(vector)
	weekly <- rep(0, times = t/7)
    for (i in 1:length(weekly)){
    	weekly[i] <- sum(vector[(7*(i-1)+1):(7*i)])
    }
    return(weekly)
}
nikosbosse/epipredictr documentation built on April 7, 2020, 9:51 p.m.