#' Run SCUL procedure on target data
#'
#' Calculate a synthetic time series using a penalized regression and a donor pool.
#'
#' @param PostPeriodLength An integer that indicates the length of the post-treatment period.
#' Default is calculated using SCUL.input data.
#' @param PrePeriodLength An integer that indicates the length of the pre-treatment period.
#' Default is calculated using SCUL.input data.
#' @param NumberInitialTimePeriods An integer that indicates the number of time periods desired in the training data for the first cross-validation run.
#' Default is the stated amount in SCUL.input data.
#' @param OutputFilePath A file path to store output. Default is taken from
#' SCUL.input$OutputFilePath
#' @param x.DonorPool.PreTreatment A data frame with the pre-treatment values of the donor pool.
#' Default is to extract this from SCUL.input$x.DonorPool.PreTreatment
#' @param y.PreTreatment A data frame with the pre-treatment values of the target variable.
#' Default is to extract this from SCUL.input$y.PreTreatment
#' @param x.DonorPool A data frame with all values of the donor pool.
#' Default is to extract this from SCUL.input$x.DonorPool
#' @param time A data frame that is a column vector (Tx1) sorted by time. Default is taken from
#' SCUL.input$time
#' @param y.actual A data frame that is a column vector (Tx1) sorted by time.
#' Default is taken from SCUL.input$y
#' @param TreatmentBeginsAt An integer indicating which row begins treatment.
#' Default is to begin this at SCUL.input$TreatmentBeginsAt.
#' @param TrainingPostPeriodLength The number of time periods post-treatment used in the training data. Defaults to all time periods after treatment begins.
#' @param cvOption Which cross-validated lambda to use: the median of the MSE-minimizing
#' lambdas across cross-validation runs (cvOption = "lambda.median"), the lambda that
#' produces the minimum MSE (cvOption = "lambda.min"), or the largest lambda that produces
#' an MSE within one standard error of the minimum MSE (cvOption = "lambda.1se").
#' Default is "lambda.median".
#' @param plotCV Logical. If TRUE, plot the cross-validated mean squared error against
#' -log(lambda) of the penalty parameter. Default is FALSE (no CV plot).
#'
#' @return OutputDataSCUL A list with the synthetic series, weights used to construct the series, and measures of fit.
#' @import glmnet
#' @import stats
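#' @examples
#' \dontrun{
#' # Illustrative sketch only: it assumes SCUL.input has already been created by the
#' # package's data-preparation step so that the default arguments below can find the
#' # target series, donor pool, and treatment timing.
#' results <- SCUL(cvOption = "lambda.median", plotCV = FALSE)
#' # Inspect the synthetic series, pre-treatment fit, and chosen penalty
#' head(cbind(results$time, results$y.actual, results$y.scul))
#' results$CohensD
#' results$CrossValidatedLambda
#' }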
#' @export
SCUL <- function(
PostPeriodLength = nrow(SCUL.input$y)-SCUL.input$TreatmentBeginsAt+1,
PrePeriodLength = SCUL.input$TreatmentBeginsAt-1,
NumberInitialTimePeriods = SCUL.input$NumberInitialTimePeriods,
OutputFilePath = SCUL.input$OutputFilePath,
x.DonorPool.PreTreatment = SCUL.input$x.DonorPool.PreTreatment,
y.PreTreatment = SCUL.input$y.PreTreatment,
x.DonorPool = SCUL.input$x.DonorPool,
time = SCUL.input$time,
y.actual = SCUL.input$y,
TreatmentBeginsAt = SCUL.input$TreatmentBeginsAt,
TrainingPostPeriodLength = SCUL.input$TrainingPostPeriodLength,
cvOption = "lambda.median",
plotCV = FALSE
){
# Determine the maximum number of rolling-origin cross-validation runs that can occur
MaxCrossValidations <- PrePeriodLength-NumberInitialTimePeriods-TrainingPostPeriodLength+1
# Determine the last position of the largest possible training dataset that only uses pre-treatment data
StoppingPoint <- NumberInitialTimePeriods+MaxCrossValidations-1
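# Worked example (illustrative numbers only): with PrePeriodLength = 20, NumberInitialTimePeriods = 5,
# and TrainingPostPeriodLength = 4, there are 20 - 5 - 4 + 1 = 12 cross-validation runs; the last
# training set ends at row 5 + 12 - 1 = 16, leaving rows 17-20 of the pre-period as its test window.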
# Number of lambdas to search over
MaxLambdasInGrid <- 100 # glmnet uses at most 100 lambdas by default (and may stop earlier), so 100 ensures the common grid covers every run
## Calculate the max-lambda across all of the cross-validation runs.
MaxLambdaList <- matrix(0, 1, MaxCrossValidations)
### First calculate the max lambda for each cross-validation run.
for (i in NumberInitialTimePeriods:StoppingPoint){
# Index of the current cross-validation run
j = i - NumberInitialTimePeriods + 1
# Specify training data: the first i pre-treatment observations
x.training <- x.DonorPool.PreTreatment[1:i,]
y.training <- y.PreTreatment[1:i]
# Standardize the training donors (mysd is a helper defined elsewhere in the package)
sx <- as.matrix(scale(x.training, scale = apply(x.training, 2, mysd)))
# Get lambda_max: See https://stackoverflow.com/questions/25257780/how-does-glmnet-compute-the-maximal-lambda-value/
MaxLambdaList[j] <- max(abs(colSums(sx*y.training)))/nrow(sx)
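# For standardized donors, this is the smallest penalty at which every lasso coefficient is zero,
# i.e., max_j |<x_j, y>| / n (see the Stack Overflow thread linked above)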
}
### Now calculate the lambda sequence
epsilon = .01 # ratio of smallest to largest lambda; glmnet's default is ifelse(nobs < nvars, 0.01, 1e-04)
lambdapath <- round(exp(seq(log(max(MaxLambdaList)), log(min(MaxLambdaList)*epsilon),
length.out = MaxLambdasInGrid)), digits = 10)
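# Illustration (made-up values): if max(MaxLambdaList) = 2 and min(MaxLambdaList) = 0.5, lambdapath is
# 100 values spaced evenly on the log scale from 2 down to 0.5 * 0.01 = 0.005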
# Initialize a matrix of zeros, (# of lambdas in grid) x (max # of CV runs), to hold each run's test MSE for every lambda in the common grid
MinimumLambdaRollingMSE <- matrix(0, MaxLambdasInGrid, MaxCrossValidations)
# Initialize a 1 x (max # of CV runs) matrix to hold each run's MSE-minimizing lambda (used for lambda.median)
MinimumLambdaRollingMSE.median <- matrix(0, 1, MaxCrossValidations)
# Perform the lasso for each cross-validation run and store the test MSE for each lambda in the grid
for (i in NumberInitialTimePeriods:StoppingPoint){
# Index of the current cross-validation run
j = i - NumberInitialTimePeriods + 1
# Specify training data: the first i pre-treatment observations
x.training <- x.DonorPool.PreTreatment[1:i,]
y.training <- y.PreTreatment[1:i]
# Run a lasso on the training data only; glmnet fits a grid of lambdas by default
fit = glmnet(x.training, y.training)
# Determine when the testing data begins for this CV run
BeginingOfTestData = i +1
# Determine when the testing data ends for this CV run
EndOfTestData = i + TrainingPostPeriodLength
# Specify testing data
x.testing <-x.DonorPool.PreTreatment[BeginingOfTestData:EndOfTestData,]
y.testing <-y.PreTreatment[BeginingOfTestData:EndOfTestData,]
# Note: supplying the non-default, 100-value lambda path can trigger a glmnet warning
# Predict y over the entire pre-treatment period for each lambda in the common grid, refitting exactly at those lambdas
prediction <- predict(fit,
newx = x.DonorPool.PreTreatment,
x = x.training,
y = y.training,
s = lambdapath,
exact = TRUE)
# Repeat the prediction using this run's own lambda path (used for the lambda.median option)
prediction.median <- predict(fit,
newx = x.DonorPool.PreTreatment,
x = x.training,
y = y.training,
s = fit$lambda,
exact = TRUE)
# Squared error
squared_error <- (prediction[BeginingOfTestData:EndOfTestData,] - y.testing)^2
squared_error.median <- (prediction.median[BeginingOfTestData:EndOfTestData,] - y.testing)^2
# Mean squared error
mse <- colMeans(squared_error)
mse.median <- colMeans(squared_error.median)
# Save this run's test MSE for every lambda in the common grid
MinimumLambdaRollingMSE[ , j ] <- mse
# Save the lambda from this run's own path that minimizes the test MSE
MinimumLambdaRollingMSE.median[ j ] <- fit$lambda[which.min(mse.median)]
}
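# The median across CV runs of each run's MSE-minimizing lambda; this is the "lambda.median" candidate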
medianLambda = median(MinimumLambdaRollingMSE.median)
# Average across CV runs to get the mean test MSE for each lambda in the grid
cvMSE <- rowMeans(MinimumLambdaRollingMSE)
# Calculate standard error of MSE for each lambda
cvSE <- apply(MinimumLambdaRollingMSE, 1, sd)
# Extract 1) the lambda that minimizes cvMSE, 2) the largest lambda that produces an MSE within one standard error of the minimum, and 3) the median MSE-minimizing lambda
lambda <- getOptcv.scul(lambdapath = lambdapath,
mse = cvMSE,
se = cvSE,
fullMSE = MinimumLambdaRollingMSE,
medianLambda = medianLambda)
# Plot CV
if (plotCV == TRUE) {
plotCVfunction(lambdapath = lambdapath,
mse = cvMSE,
se = cvSE,
minLambdas = MinimumLambdaRollingMSE.median,
lambda = lambda,
save.figure = TRUE,
OutputFilePath = OutputFilePath)
}
# Based on the cvOption argument, pick the cross-validated lambda
CrossValidatedLambda <- switch(cvOption,
lambda.1se = lambda$lambda.1se,
lambda.min = lambda$lambda.min,
lambda.median = lambda$lambda.median)
# Run the actual fit
EntirePretreatmentFit = glmnet(x = x.DonorPool.PreTreatment,
y = y.PreTreatment)
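# With exact = TRUE, predict() and coef() refit the lasso at the requested lambda (which need not lie
# on the fitted lambda path), so the original x and y must be supplied again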
# Extract the exact prediction at the cross-validated lambda
y.scul <- predict(EntirePretreatmentFit,
newx = as.matrix(x.DonorPool),
x = x.DonorPool.PreTreatment,
y = y.PreTreatment,
s = CrossValidatedLambda,
exact = TRUE)
# Extract the exact coefficients used
coef.exact = coef(EntirePretreatmentFit,
x = x.DonorPool.PreTreatment,
y = y.PreTreatment,
s = CrossValidatedLambda,
exact = TRUE)
# Calculate the standard deviation of the outcome variable in the pre-treatment period
PreTreatmentSD <- sd(y.PreTreatment[1:(TreatmentBeginsAt-1)])
# Take the absolute value of the difference between the prediction and the actual data, divided by the pre-treatment standard deviation
CohensD <- abs(y.scul[1:(TreatmentBeginsAt-1),]-y.actual[1:(TreatmentBeginsAt-1),])/PreTreatmentSD
# Calculate the mean pre-treatment Cohen's D
MeanCohensD <- mean(CohensD)
# Convert y.actual and time from list/data-frame form into numeric vectors
y.actual <- as.numeric(unlist(y.actual))
time <- as.numeric(unlist(time))
# Wrap all of the items we will need for later into a list.
OutputDataSCUL <- list(
time = time,
y.actual = y.actual,
y.scul = y.scul,
CrossValidatedLambda = CrossValidatedLambda,
TreatmentBeginsAt = TreatmentBeginsAt,
CohensD = MeanCohensD,
coef.exact = coef.exact,
lambda = lambda
)
# Return list
return(OutputDataSCUL)
# TODO (hollina): Figure out why y.actual and time are coming in as a list.
}