inst/code/first-pass-ssr.R

## read in and plot data

library(lubridate)
library(ggplot2)
library(plyr)
library(dplyr)
library(reshape)
library(ssr)

sj <- San_Juan_train

## add lag columns
sj <- sj %>% mutate(total_cases_lag3 = lag(total_cases, 3),
                    total_cases_lag1 = lag(total_cases))

## add log column
sj$log_total_cases <- log(sj$total_cases + 1)

## add smooth log column
sm <- loess(log_total_cases ~ as.numeric(week_start_date), data=sj, span=1/80)
sj$smooth_log_cases <- sm$fitted

                
        
## simple time series plot
ggplot(sj, aes(x=week_start_date, y=total_cases)) + geom_line()
ggplot(sj, aes(x=week_start_date)) + geom_linerange(aes(ymin=0, ymax=total_cases))

## phase plots
(p <- ggplot(sj, aes(x=total_cases_lag1, y=total_cases, color=season)) + 
    geom_point() + geom_path() +
    scale_x_log10() + scale_y_log10() +
    geom_abline(b=1, a=0, color="grey"))

p + facet_wrap(~ season)

## smooth data
sm <- loess(total_cases ~ as.numeric(week_start_date), data=sj, span=1/80)
sj$smooth_cases <- sm$fitted
sj$smooth_cases_lag1 <- lag(sj$smooth_cases)

ggplot(sj, aes(x=week_start_date)) + 
    geom_linerange(aes(ymin=0, ymax=total_cases)) +
    geom_line(aes(y=smooth_cases), color="red", size=1)

## phase plot with smooth data
(p1 <- ggplot(sj, aes(x=smooth_cases_lag1, y=smooth_cases, color=season)) + 
    geom_point() + geom_path() +
    scale_x_log10() + scale_y_log10() +
    geom_abline(b=1, a=0, color="grey"))

p1 + facet_wrap(~ season)







# 1, 13, 26, 39, and 52 weeks ahead density predictions, for plots
options(error = recover)
target_inds <- which(sj$season == "2008/2009")
#prediction_horizon <- 1

ssr_ests <- rbind.fill(lapply(c(1, 13, 26, 39, 52), function(prediction_horizon) {
    last_obs_season_week_and_prediction_horizon_combos <- lapply(target_inds,
        function(ti) {
            list(last_obs_season = as.character(sj$season[ti - prediction_horizon]),
                last_obs_week = sj$season_week[ti - prediction_horizon],
                prediction_horizon = prediction_horizon)
        }
    )
#    params <- last_obs_season_week_and_prediction_horizon_combos[[1]]
    
    rbind.fill(lapply(last_obs_season_week_and_prediction_horizon_combos,
        function(params) {
            preds <- ssr_predict_dengue_one_week(
                last_obs_season = params$last_obs_season,
                last_obs_week = params$last_obs_week,
                lags = list(smooth_log_cases = c(0, 1)),
                theta=list(smooth_log_cases_lag0=list(bw=.1),
                    smooth_log_cases_lag1=list(bw=.1)),
                prediction_horizon = params$prediction_horizon,
                data = sj,
				season_var = "season",
				week_var = "season_week",
                X_names = c("smooth_log_cases"),
                y_name = c("total_cases"),
                time_name = "week_start_date",
                prediction_types = "density")$dist_preds
            preds$last_obs_season <- params$last_obs_season
            preds$last_obs_week <- params$last_obs_week
            last_obs_ind <- which(sj[["season"]] == params$last_obs_season &
                    sj[["season_week"]] == params$last_obs_week)
            temp <- get_prediction_season_week(last_obs_ind,
                params$prediction_horizon,
                sj,
                season_var = "season",
                week_var = "season_week")
            preds$prediction_season <- temp$season
            preds$prediction_week <- temp$week
            preds$prediction_horizon <- params$prediction_horizon
            
            return(preds)
        }
    ))
}))


sj$season_season_week <- paste(as.character(sj$season), sj$season_week, sep = "-")
ssr_ests$season_season_week <- paste(ssr_ests$prediction_season, ssr_ests$prediction_week,
    sep = "-")

ssr_ests$model <- paste0("ssr_p_", ssr_ests$prediction_horizon)

## get naive model estimates -- only handles holdout_seasons with length 1?
get_season_week_kde_ests_one_week <- function(wk, holdout_seasons, data) {
    inds <- which(!(data$season %in% holdout_seasons) & data$season_week == wk)
    
    ## do weighted kde
#    kde_log_scale <- density(data[inds, "log_total_cases"],
#        bw = "SJ")
    kde_orig_scale <- density(data[inds, "total_cases"],
        bw = "SJ")
    
    return(data.frame(#log_total_cases = kde_log_scale$x,
            total_cases = kde_orig_scale$x,
#            est_density_log_scale = kde_log_scale$y,
            est_density_orig_scale = kde_orig_scale$y,
            season = rep(holdout_seasons, each = length(kde_orig_scale$x)),
            season_week = wk
        ))
}

season_week_kde_ests <- rbind.fill(
    lapply(seq_len(52), get_season_week_kde_ests_one_week,
        holdout_seasons = "2008/2009",
        data = sj)
)

season_week_kde_ests$model <- "naive"



# combine estimates from ssr with various lags and naive model
ssr_ests$total_cases <- ssr_ests$x
ssr_ests$est_density_orig_scale <- ssr_ests$est_density
ssr_ests$season_week <- ssr_ests$prediction_week
combined_ests <- rbind.fill(ssr_ests, season_week_kde_ests)


# get observed totals
sj_inds_keep <- seq_len(nrow(sj))[sj$season_season_week %in%
    ssr_ests$season_season_week]
obs_counts <- sj[sj_inds_keep,
    c("smooth_log_cases", "log_total_cases", "total_cases", "season_season_week", "season", "season_week")]
obs_counts$exp_smooth_log_cases <- exp(obs_counts$smooth_log_cases)
obs_counts <- melt(obs_counts, id = c("season", "season_week", "season_season_week"))


## plot estimates, compared with observed cases and smoothed cases
## log scale
# p <- ggplot() +
#     geom_line(aes(x = log_total_cases, y = est_density_log_scale, colour = model), data = combined_ests) +
#     geom_vline(aes(xintercept = value, linetype = variable), data = obs_counts[obs_counts$variable %in% c("smooth_log_cases", "log_total_cases"), ], show_guide = TRUE) +
#     facet_wrap(~ season_week) +
#     ggtitle("Predictions by week for 2008/2009 season (log scale)") +
#     theme_bw()
# 
# print(p)

## original scale
p <- ggplot() +
    geom_line(aes(x = total_cases, y = est_density_orig_scale, colour = model), data = combined_ests) +
    geom_line(aes(x = total_cases, y = est_density, colour = model), data = combined_ests) +
    geom_vline(aes(xintercept = value, linetype = variable), data = obs_counts[obs_counts$variable %in% c("exp_smooth_log_cases", "total_cases"), ], show_guide = TRUE) +
    coord_cartesian(xlim = c(0, 50)) +
    facet_wrap(~ season_week) +
    ggtitle("Predictions by week for 2008/2009 season") +
    theme_bw()

print(p)




## compute point estimates and get MASE for each lag and naive model
holdout_seasons <- c("2007/2008", "2008/2009")
target_inds <- which(sj$season %in% holdout_seasons)
prediction_horizons_vec <- 1:52
ssr_pt_ests <- rbind.fill(lapply(prediction_horizons_vec, function(prediction_horizon) {
    last_obs_season_week_and_prediction_horizon_combos <- lapply(target_inds,
        function(ti) {
            list(last_obs_season = as.character(sj$season[ti - prediction_horizon]),
                last_obs_week = sj$season_week[ti - prediction_horizon],
                prediction_horizon = prediction_horizon)
        }
    )

    rbind.fill(lapply(last_obs_season_week_and_prediction_horizon_combos,
        function(params) {
            preds <- list(pt_preds = ssr_predict_dengue_one_week(
                last_obs_season = params$last_obs_season,
                last_obs_week = params$last_obs_week,
                lags = list(smooth_log_cases = c(0, 1)),
                theta=list(smooth_log_cases_lag0=list(bw=.1),
                    smooth_log_cases_lag1=list(bw=.1)),
                prediction_horizon = params$prediction_horizon,
                data = sj,
                season_var = "season",
                week_var = "season_week",
                X_names = c("smooth_log_cases"),
                y_name = c("total_cases"),
                time_name = "week_start_date",
                prediction_types = "pt")$pt_preds)
            preds$last_obs_season <- params$last_obs_season
            preds$last_obs_week <- params$last_obs_week
            last_obs_ind <- which(sj[["season"]] == params$last_obs_season &
                    sj[["season_week"]] == params$last_obs_week)
            temp <- get_prediction_season_week(last_obs_ind,
                params$prediction_horizon,
                sj,
                season_var = "season",
                week_var = "season_week")
            preds$prediction_season <- temp$season
            preds$prediction_week <- temp$week
            preds$prediction_horizon <- params$prediction_horizon
            
#            browser()
            return(as.data.frame(preds))
        }
    ))
}))


sj$season_season_week <- paste(as.character(sj$season), sj$season_week, sep = "-")
ssr_pt_ests$pt_est <- ssr_pt_ests$pt_preds
ssr_pt_ests$est_total_cases_orig_scale <- ssr_pt_ests$pt_est
ssr_pt_ests$season_week <- ssr_pt_ests$prediction_week
ssr_pt_ests$season <- ssr_pt_ests$prediction_season
ssr_pt_ests$season_season_week <- paste(ssr_pt_ests$season, ssr_pt_ests$season_week,
    sep = "-")

ssr_pt_ests$model <- paste0("ssr_p_", ssr_pt_ests$prediction_horizon)

## get naive model point estimates -- only handles holdout_seasons with length 1?
## turn this and naive density estimates function above into more real functions
get_season_week_pt_ests_one_week <- function(wk, holdout_seasons, data) {
    inds <- which(!(data$season %in% holdout_seasons) & data$season_week == wk)
    
    ## do (unweighted) mean
    pt_est_log_scale <- mean(data[inds, "log_total_cases"])
    pt_est_orig_scale <- mean(data[inds, "total_cases"])
    
    return(data.frame(est_total_cases_log_scale = pt_est_log_scale,
            est_total_cases_orig_scale = pt_est_orig_scale,
            est_total_cases_orig_scale_from_log_scale = exp(pt_est_log_scale),
            season = holdout_seasons,
            season_week = wk
        ))
}

season_week_pt_ests <- rbind.fill(
    lapply(seq_len(52), get_season_week_pt_ests_one_week,
        holdout_seasons = holdout_seasons,
        data = sj)
)

season_week_pt_ests$model <- "naive"



# combine estimates from ssr with various lags and naive model
combined_pt_ests <- rbind.fill(ssr_pt_ests, season_week_pt_ests)


sj_inds_keep <- seq_len(nrow(sj))[sj$season_season_week %in%
				ssr_pt_ests$season_season_week]
obs_counts <- sj[sj_inds_keep,
		c("smooth_log_cases", "log_total_cases", "total_cases", "season_season_week", "season", "season_week")]
obs_counts$exp_smooth_log_cases <- exp(obs_counts$smooth_log_cases)
obs_counts <- melt(obs_counts, id = c("season", "season_week", "season_season_week"))


# get mase for all models and plot
mase_by_model <- tapply(combined_pt_ests$est_total_cases_orig_scale,
    combined_pt_ests$model,
    function(preds) {
        mase(obs_counts$value[obs_counts$variable == "total_cases"], preds)
    })

mase_by_pred_step <- data.frame(
    pred_step = as.integer(sapply(strsplit(names(mase_by_model[2:53]), "_"), function(lc) lc[length(lc)])),
    mase = mase_by_model[2:53]
)

ggplot() +
    geom_point(aes(x = pred_step, y = mase), data = mase_by_pred_step) +
    geom_line(aes(x = pred_step, y = mase), data = mase_by_pred_step) +
    geom_hline(yintercept = mase_by_model[names(mase_by_model) == "naive"]) +
    coord_cartesian(ylim = c(0, 5.1)) +
    ggtitle("MASE for SSR at prediction steps = 1, ..., 52\ncompared with naive weekly model\n in 2008/2009 season") +
    theme_bw()


















### get ssr fit
library(doMC)

registerDoMC(cores=5)        ## number of cores on this machine

set.seed(998468235L,kind="L'Ecuyer")
mcopts <- list(preschedule=FALSE,set.seed=TRUE)



ssr_control <- create_ssr_control(X_names=c("smooth_log_cases", "time_ind"),
    y_names="total_cases",
    time_name=NULL,
    max_lag=list(smooth_log_cases=1,
        time_ind=0),
    prediction_horizon=1,
    kernel_fns=list(smooth_log_cases="squared_exp_kernel",
        time_ind="periodic_kernel"),
    theta_est=list(smooth_log_cases="bw",
        time_ind="bw"),
    theta_fixed=list(time_ind=list(period=2 * pi / 52)),
    theta_transform_fns=list(
        squared_exp_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        ),
        periodic_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        )
    ),
    crossval_buffer=52,
    loss_fn_name="mae_from_kernel_weights_and_centers",
    loss_fn_args=list())

sj$time_ind <- seq_len(nrow(sj))

options(error=recover)
#debug(ssr_crossval_estimate_parameter_loss)
ssr_fit <- ssr(X_names=c("smooth_log_cases", "time_ind"),
    y_names="total_cases",
    time_name=NULL,
    data=sj,
    ssr_control=ssr_control)






### get ssr fit -- prediction horizons 1:52
library(lubridate)
library(ggplot2)
library(plyr)
library(dplyr)
library(reshape)
library(ssr)

sj <- San_Juan_train

## add lag columns
sj <- sj %>% mutate(total_cases_lag3 = lag(total_cases, 3),
    total_cases_lag1 = lag(total_cases))

## add log column
sj$log_total_cases <- log(sj$total_cases + 1)

## add smooth log column
sm <- loess(log_total_cases ~ as.numeric(week_start_date), data=sj, span=1/80)
sj$smooth_log_cases <- sm$fitted

library(doMC)

registerDoMC(cores=5)        ## number of cores on this machine

set.seed(998468235L,kind="L'Ecuyer")
mcopts <- list(preschedule=FALSE,set.seed=TRUE)



# ssr_control <- create_ssr_control(X_names=c("smooth_log_cases", "time_ind"),
#     y_names="total_cases",
#     time_name=NULL,
#     max_lag=list(smooth_log_cases=1,
#         time_ind=0),
ssr_control <- create_ssr_control(X_names=c("smooth_log_cases"),
    y_names="total_cases",
    time_name=NULL,
    max_lag=list(smooth_log_cases=0),
    prediction_horizons=1:52,
    kernel_fns=list(smooth_log_cases="squared_exp_kernel",
        time_ind="periodic_kernel"),
    theta_est=list(smooth_log_cases="bw",
        time_ind="bw"),
    theta_fixed=list(time_ind=list(period=2 * pi / 52)),
    theta_transform_fns=list(
        squared_exp_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        ),
        periodic_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        )
    ),
    crossval_buffer=52,
    loss_fn_name="mae_from_kernel_weights_and_centers",
    loss_fn_args=list())

sj$time_ind <- seq_len(nrow(sj))

options(error=recover)
#debug(ssr_crossval_estimate_parameter_loss)
#ssr_fit <- ssr(X_names=c("smooth_log_cases", "time_ind"),
ssr_fit <- ssr(X_names=c("smooth_log_cases"),
    y_names="total_cases",
    time_name=NULL,
    data=sj,
    ssr_control=ssr_control)






### get ssr fit -- periodic time only
library(doMC)

registerDoMC(cores=5)        ## number of cores on this machine

set.seed(998468235L,kind="L'Ecuyer")
mcopts <- list(preschedule=FALSE,set.seed=TRUE)



ssr_control <- create_ssr_control(X_names=c("time_ind"),
    y_names="total_cases",
    time_name=NULL,
    max_lag=list(time_ind=0),
    prediction_horizon=1,
    kernel_fns=list(time_ind="periodic_kernel"),
    theta_est=list(time_ind="bw"),
    theta_fixed=list(time_ind=list(period=2 * pi / 52)),
    theta_transform_fns=list(
        periodic_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        )
    ),
    crossval_buffer=52,
    loss_fn_name="mae_from_kernel_weights_and_centers",
    loss_fn_args=list())

sj$time_ind <- seq_len(nrow(sj))

options(error=recover)
#debug(ssr_crossval_estimate_parameter_loss)
ssr_fit <- ssr(X_names=c("time_ind"),
    y_names="total_cases",
    time_name=NULL,
    data=sj,
    ssr_control=ssr_control)













ssr_control <- create_ssr_control(X_names="smooth_log_cases",
    y_names="total_cases",
    time_name=NULL,
    max_lag=list(smooth_log_cases=1),
    prediction_horizon=1,
    kernel_fns=list(smooth_log_cases="squared_exp_kernel"),
    theta_est=list(smooth_log_cases="bw"),
    theta_fixed=list(),
    theta_transform_fns=list(
        squared_exp_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        )
    ),
    crossval_buffer=52,
    loss_fn_name="mae_from_kernel_weights_and_centers",
    loss_fn_args=list())



ssr_fits_by_prediction_horizon <- lapply(seq_len(52), function(horizon) {
    list(ssr_control=ssr_control,
        X_names="smooth_log_cases",
        y_names="total_cases",
        time_name=NULL,
        lags_hat=list(smooth_log_cases = c(0, 1)),
        theta_hat=list(smooth_log_cases_lag0=list(bw=.1),
            smooth_log_cases_lag1=list(bw=.1)),
        train_data=sj)
})

make_competition_forecasts(
    ssr_fits_by_prediction_horizon=ssr_fits_by_prediction_horizon,
    n_sims=1000,
    data=sj,
    outfile_path="F:/Reich/dengue-ssr-prediction/competition-predictions",
    location="sanjuan")





ssr_control <- create_ssr_control(X_names="smooth_log_cases",
    y_names="total_cases",
    time_name=NULL,
    max_lag=list(smooth_log_cases=1),
    prediction_horizon=1,
    kernel_fns=list(smooth_log_cases="squared_exp_kernel"),
    theta_est=list(smooth_log_cases="bw"),
    theta_fixed=list(),
    theta_transform_fns=list(
        squared_exp_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        )
    ),
    crossval_buffer=52,
    loss_fn_name="mae_from_kernel_weights_and_centers",
    loss_fn_args=list())



ssr_fits_by_prediction_horizon <- lapply(seq_len(52), function(horizon) {
    list(ssr_control=ssr_control,
        X_names="smooth_log_cases",
        y_names="total_cases",
        time_name=NULL,
        lags_hat=list(smooth_log_cases = c(0, 1)),
        theta_hat=list(smooth_log_cases_lag0=list(bw=.1),
            smooth_log_cases_lag1=list(bw=.1)),
        train_data=sj)
})

make_competition_forecasts(
    ssr_fits_by_prediction_horizon=ssr_fits_by_prediction_horizon,
    n_sims=1000,
    data=sj,
    outfile_path="F:/Reich/dengue-ssr-prediction/competition-predictions",
    location="sanjuan")





















library(lubridate)
library(ggplot2)
library(plyr)
library(dplyr)
library(reshape)
library(ssr)

sj <- San_Juan_train

## add lag columns
sj <- sj %>% mutate(total_cases_lag3 = lag(total_cases, 3),
    total_cases_lag1 = lag(total_cases))

## add log column
sj$log_total_cases <- log(sj$total_cases + 1)

## add smooth log column
sm <- loess(log_total_cases ~ as.numeric(week_start_date), data=sj, span=1/80)
sj$smooth_log_cases <- sm$fitted

ssr_control <- create_ssr_control(X_names="smooth_log_cases",
    y_names="total_cases",
    time_name=NULL,
    max_lag=list(smooth_log_cases=1),
    prediction_horizon=1,
    kernel_fns=list(smooth_log_cases="squared_exp_kernel"),
    theta_est=list(smooth_log_cases="bw"),
    theta_fixed=list(),
    theta_transform_fns=list(
        squared_exp_kernel=list(
            bw=list(transform="log",
                detransform="exp")
        )
    ),
    crossval_buffer=52,
    loss_fn_name="mae_from_kernel_weights_and_centers",
    loss_fn_args=list())



ssr_fit <- list(ssr_control=ssr_control,
        X_names="smooth_log_cases",
        y_names="total_cases",
        time_name=NULL,
        lags_hat=list(smooth_log_cases = c(0, 1)),
        theta_hat=list(smooth_log_cases_lag0=list(bw=.1),
            smooth_log_cases_lag1=list(bw=.1)),
        train_data=sj)

options(error=recover)
options(warn=2)
make_competition_forecasts_by_trajectory(
    ssr_fit=ssr_fit,
    n_sims=1000,
    data=sj,
    outfile_path="F:/Reich/dengue-ssr-prediction/competition-predictions-by-traj",
    location="sanjuan")
reichlab/dengue-ssr-prediction documentation built on May 27, 2019, 4:53 a.m.