R/insample_sim_hh1.R

#' Hopkins Hybrid 1: Run a simulation and then perform prediction.  Based on insample_sim().  Censoring is
#' fabricated a la the SES of the Growth data.  We will make a _hh2 to do a more realistic censoring.
#'
#' This function comes after many months of running readme_sim and talks with Bryan Lau on
#' what we need to demonstrate for the method.
#'
#' @param sim_seed passed to set.seed()
#' @param sample_size (defaults to 1000) and must be a multiple of 100
#' @param sim_slope see slope in calculate_ses()
#' @param sim_intercept see intercept in calculate_ses()
#' @param sim_ses_coef see ses_coef in apply_censoring()
#' @param sim_age_coef see age_coef in apply_censoring()
#' @param hh_rds the path in single quotes to the local copy of hopkins_hybrid.RDS
#' @export
#' @return results a data frame with rmse for each approach for that simulated dataset
#' @examples
#' ---
insample_sim_hh1 <- function(sim_seed=101, sample_size=1000, sim_slope=100,
                                  sim_intercept=12, sim_ses_coef=.01, sim_age_coef=.01,
                            hh_rds='./data_local/hopkins_hybrid.RDS'){
  ## quick start with the default values as variables for testing.  Loads packages.
  #test_prep_script()
  ## get the data prepped, that is simulated and censored and calculate missing.
  #simulate_censor_summarize()

  ## just for testing; comment out before github commits
  library(devtools); library(roxygen2); install(); document(); 
  hh_rds='./data_local/hopkins_hybrid.RDS'; sim_seed=101; sample_size=1000; sim_slope=100; sim_intercept=12; sim_ses_coef=.01; sim_age_coef=.01;
  ##hh_rds='./data_local/hopkins_hybrid.RDS'; sim_seed=101; sample_size=5000; sim_slope=100; sim_intercept=12; sim_ses_coef=.01; sim_age_coef=.01;
  ##hh_rds='./data_local/hopkins_hybrid.RDS'; sim_seed=101; sample_size=5000; sim_slope=100; sim_intercept=12; sim_ses_coef=.005; sim_age_coef=.005;


  ## d will be a 39 x 32 matrix based on the males from the Berkeley Growth Study in `fda` package
  ## we oversample d based on the fpc as well extract out the ages of measurement
  d<-readRDS(hh_rds)
  age_vec <- c(as.numeric(colnames(d[,-1,with=FALSE])))
  over_samp_mat<-sample_data_fpc(as.matrix(d), sample_size, seed=sim_seed, timepoints=age_vec)

  ## we calculate ses on the oversampled dataset and turn the matrix into a long dataset.
  with_ses <- calculate_ses(over_samp_mat, slope=sim_slope, intercept=sim_intercept)
  long <-make_long(with_ses)

  ## In this chunk we apply censoring.
  censored <- apply_censoring(long, ses_coef=sim_ses_coef, age_coef=sim_age_coef, protected=1:4)

  ## Measurement error: within 1/8 inch.  Can comment out or change.
  censored$inches <- censored$inches + runif(length(censored$inches), -1/8,1/8)

  ## observed_with_stipw has the standardized inverse probability weights (stipw)
  ## wtd_trajectories has same info as observed_with_stipw but has inches_wtd_hadamard as well
  ## we calculate all_with_stipw with NAs
       all_with_stipw <- calculate_stipw(censored,"keep")
  observed_with_stipw <- calculate_stipw(censored,"omit")
  wtd_trajectories    <- calculate_wtd_trajectories(observed_with_stipw)




  ## use data.table where possible speeds up future rbinds post simulation
#setDT(long)
#setkey(long, id, age)

setDT(all_with_stipw)
setkey(all_with_stipw, newid, age, inches, ses)

setDT(wtd_trajectories)
setkey(wtd_trajectories, newid, age, inches, ses)

interim <- wtd_trajectories[all_with_stipw]

## BEGIN:  new step...DEAN:
key.interim <- unique(interim[,c("age","remaining","denom"), with=FALSE])[!is.na(remaining) & !is.na(remaining)]
setkey(key.interim, age)
setkey(interim, age)
holder<-key.interim[interim]
holder[is.na(stipw01.n), stipw01.n := remaining*(stipw2/denom) ]
## END:  new step...DEAN:


## Dean step: change below to holder (formerly interim)
all_with_stipw<-subset(holder,
                       select=c(names(all_with_stipw),
                                "inches_wtd_hadamard",
                                "remaining",
                                "stipw",
                                "stipw01",
                                "stipw01.n"))


## DEAN step: reset keys;
setkey(all_with_stipw, newid, age, inches, ses)
setkey(wtd_trajectories, newid, age, inches, ses)



all_with_stipw[            , inches_ltfu:=inches ]
all_with_stipw[is.na(stipw), inches_ltfu:=NA ]

## calculate "remean" data prep:
all_with_stipw[,
               inches_wtd_remean:= inches_ltfu -
                                   mean(inches_ltfu, na.rm=T) +
                                   mean(inches_wtd_hadamard , na.rm=T),
               by=age]


## standardize the names selected across all approaches and their resultant  data sets
selected<-c("newid","age", "ses",

            "inches", "inches_wtd_hadamard","inches_wtd_remean", "inches_ltfu", "inches_predicted",

            "remaining",
            "stipw",
            "stipw01",
            "stipw01.n",

            "approach")

  ## Note: we can calulate this post-simulation
  ## truth, all data:
  ##true_avg <- ddply(long, .(age), function(w)  mean(w$inches))
  ##true_avg$approach<- "true_avg"

## note: we're simplifying to (w)fpca and (w)lme
## compare the bias of mean trajectories of the following approaches applied to the
## censored/observed data, not the truth:
##a) naive-nonparametric: that is, just na.rm=TRUE and turn the mean() crank
# naive_non_parm_avg <- ddply(observed_with_stipw, .(age), function(w)  mean(w$inches))
# naive_non_parm_avg$approach <- "naive_non_parm_avg"
# ## combine with long for the prediction:
# setDT(naive_non_parm_avg)
# setnames(naive_non_parm_avg, "V1", "inches_predicted")
# setkey(naive_non_parm_avg, age, inches_predicted)
# setkey(long, age) ## no id in this method.
# dim(naive_non_parm_avg)
# dim(long)
# naive_non_parm_avg <- naive_non_parm_avg[long]
# setkey(naive_non_parm_avg, newid, age, inches_predicted)
# naive_non_parm_avg <- subset(naive_non_parm_avg, select=c("newid","age", "inches", "inches_predicted", "approach"))


## note: we're simplifying to (w)fpca and (w)lme
##b) weighted-nonparametric: a little more sophisticated, na.rm=TRUE and turn the weighted mean() crank
# wtd_non_parm_avg <- ddply(observed_with_stipw, .(age), function(w) sum(w$stipw*w$inches)/sum(w$stipw))
# wtd_non_parm_avg$approach <- "wtd_non_parm_avg"
# ## combine with long for the prediction:
# setDT(wtd_non_parm_avg)
# setnames(wtd_non_parm_avg, "V1", "inches_predicted")
# setkey(wtd_non_parm_avg, age, inches_predicted)
# setkey(long, age) ## no id in this method.
# dim(wtd_non_parm_avg)
# dim(long)
# wtd_non_parm_avg <- wtd_non_parm_avg[long]
# setkey(wtd_non_parm_avg, newid, age, inches_predicted)
# wtd_non_parm_avg <- subset(wtd_non_parm_avg, select=c("newid","age", "inches", "inches_predicted", "approach"))


##c)  naive-FPC,
unwtd_fncs      <- dcast(data=wtd_trajectories, formula= newid~age, value.var="inches")
naive_fpc_proc_minutes <- system.time(
  fpca_unwtd_fncs <- fpca.face(Y=as.matrix(unwtd_fncs)[,-1], argvals=age_vec, knots=7)
)[3]/60
naive_fpc       <- data.frame(age=age_vec, V1=fpca_unwtd_fncs$mu, approach="naive_fpc")
## combine with long for the prediction:
## a little different than previous examples; now we do have individual level curves
## need to extract them (Yhat) and rename them and data.table them
naive_fpc_indiv <- as.data.frame(cbind(1:nrow(fpca_unwtd_fncs$Yhat),fpca_unwtd_fncs$Yhat))
colnames(naive_fpc_indiv) <- colnames(unwtd_fncs)
setDT(naive_fpc_indiv)
naive_fpc <- melt(naive_fpc_indiv,
                  id.vars=c("newid"),
                  variable.name = "age",
                  variable.factor=FALSE,
                  value.name="inches_predicted")
naive_fpc[,approach:="naive_fpc",]
naive_fpc[,age:=as.numeric(age)]
setDT(naive_fpc)
setkey(naive_fpc, newid, age)
naive_fpc  <- naive_fpc[all_with_stipw]
setkey(naive_fpc, newid, age)
naive_fpc <- subset(naive_fpc, select=selected)
## add these on post dean:  minutes and number of principle components (npc)
naive_fpc[, minutes:=naive_fpc_proc_minutes]
naive_fpc[, number_pc:=fpca_unwtd_fncs$npc]

## visual checks
# ggplot(data=naive_fpc[newid %in% c(2)], aes(x=age, y=inches_predicted, color=factor(newid)))+geom_point()+
#   geom_point(aes(y=inches_ltfu), color='black')


## visual checks
# ggplot(data=naive_fpc[newid %in% c(1,2,5)], 
#        aes(x=age, y=inches_predicted, color=factor(newid)))+
#   geom_path()+
#   geom_point()+
#   geom_line(aes(y=inches_ltfu, id=factor(newid)), color='black' )+
#   geom_point(aes(y=inches_ltfu, id=factor(newid), shape=factor(newid)), color='black')




##e) remean - weighted-FPC.
wtd_remean_fncs <- dcast(data=subset(all_with_stipw,
                                     select=c("newid","age","inches_wtd_remean")),
                         formula= newid~age, value.var="inches_wtd_remean")
weighted_remean_fpc_proc_minutes <- system.time(
  fpca_wtd_remean_fncs <- fpca.face(Y=as.matrix(wtd_remean_fncs)[,-1], argvals=age_vec, knots=7)
)[3]/60
#weighted_fpc <- data.frame(age=age_vec, V1=fpca_wtd_fncs$mu, approach="weighted_fpc")
## combine with long for the prediction:
## a little different than previous examples; now we do have individual level curves
## need to extract them (Yhat) and rename them and data.table them
weighted_remean_fpc_indiv <- as.data.frame(cbind(1:nrow(fpca_wtd_remean_fncs$Yhat),
                                                 fpca_wtd_remean_fncs$Yhat))
colnames(weighted_remean_fpc_indiv) <- colnames(wtd_remean_fncs)
setDT(weighted_remean_fpc_indiv)
weighted_remean_fpc <- melt(weighted_remean_fpc_indiv,
                     id.vars=c("newid"),
                     variable.name = "age",
                     variable.factor=FALSE,
                     value.name="inches_predicted")
weighted_remean_fpc[,approach:="wtd_remean_fpc",]
weighted_remean_fpc[,age:=as.numeric(age)]
setDT(weighted_remean_fpc)
setkey(weighted_remean_fpc, newid, age)
weighted_remean_fpc <- weighted_remean_fpc[all_with_stipw]
setkey(weighted_remean_fpc, newid, age)
weighted_remean_fpc <- subset(weighted_remean_fpc, select=selected)
## add these on post dean:  minutes and number of principle components (npc)
weighted_remean_fpc[, minutes:=weighted_remean_fpc_proc_minutes]
weighted_remean_fpc[, number_pc:=fpca_wtd_remean_fncs$npc]
## visual checks
ggplot(data=weighted_remean_fpc[newid %in% c(1,2,5)], 
       aes(x=age, y=inches_predicted, color=factor(newid)))+
  geom_path()+
  geom_point()+
  geom_line(aes(y=inches_ltfu, id=factor(newid)), color='black' )+
  geom_point(aes(y=inches_ltfu, id=factor(newid), shape=factor(newid)), color='black')


##e) weighted-FPC.
wtd_fncs <- dcast(data=wtd_trajectories, formula= newid~age, value.var="inches_wtd_hadamard")
weighted_fpc_proc_minutes <- system.time(
  fpca_wtd_fncs <- fpca.face(Y=as.matrix(wtd_fncs)[,-1], argvals=age_vec, knots=7)
)[3]/60
#weighted_fpc <- data.frame(age=age_vec, V1=fpca_wtd_fncs$mu, approach="weighted_fpc")
## combine with long for the prediction:
## a little different than previous examples; now we do have individual level curves
## need to extract them (Yhat) and rename them and data.table them
weighted_fpc_indiv <- as.data.frame(cbind(1:nrow(fpca_wtd_fncs$Yhat),fpca_wtd_fncs$Yhat))
colnames(weighted_fpc_indiv) <- colnames(wtd_fncs)
setDT(weighted_fpc_indiv)
weighted_fpc <- melt(weighted_fpc_indiv,
                     id.vars=c("newid"),
                     variable.name = "age",
                     variable.factor=FALSE,
                     value.name="inches_predicted")
weighted_fpc[,approach:="wtd_hadamard_fpc",]
weighted_fpc[,age:=as.numeric(age)]
setDT(weighted_fpc)
setkey(weighted_fpc, newid, age)
weighted_fpc <- weighted_fpc[all_with_stipw]
## begin extra steps: (weighted inputs average out for mean, but need to be de-weighted for individual)
setDT(wtd_trajectories)
## pre-DEAN: wts <- subset(wtd_trajectories, select=c("newid","age","stipw01.n"))
## post-DEAN: subset all_with_stipw
wts <- subset(all_with_stipw, select=c("newid","age","stipw01.n"))
setkey(wts, newid, age)

## can't do curve completion with these three knuckleheads
#weighted_fpc.test<-weighted_fpc[wts]
#weighted_fpc.test[, inches_predicted_weighted:= inches_predicted]
#weighted_fpc.test[, inches_predicted_deweighted:= inches_predicted/stipw01.n]

weighted_fpc.test<-wts[weighted_fpc]
## see how many 0's
weighted_fpc.test[,table(round(stipw01.n,2))]
## change 0's to NA
weighted_fpc.test[stipw01.n==0, stipw01.n:=NA]
weighted_fpc.test[, inches_predicted_weighted:= inches_predicted]
weighted_fpc.test[!is.na(stipw01.n), inches_predicted_deweighted:= inches_predicted/stipw01.n]
## get the wtd_population_mean in there:
## skip this time:  ##weighted_fpc.test[,wtd_pop_mean:=fpca_wtd_fncs$mu,by=newid]
##for now, if don't have observed data there, we just imputed the weighted mean
## kinda lame, think on it.
## skip this time:  ##weighted_fpc.test[is.na(stipw01.n), inches_predicted_deweighted:= wtd_pop_mean, by=newid]
## end extra steps:
setkey(weighted_fpc.test, newid, age)#, inches_predicted_deweighted)
setnames(weighted_fpc.test, "inches_predicted", "inches_predicted_old")
setnames(weighted_fpc.test, "inches_predicted_deweighted", "inches_predicted")
weighted_fpc <- subset(weighted_fpc.test, select=selected)
## add these on post dean:  minutes and number of principle components (npc)
weighted_fpc[, minutes:=weighted_fpc_proc_minutes]
weighted_fpc[, number_pc:=fpca_wtd_fncs$npc]
## visual checks
ggplot(data=weighted_fpc[newid %in% c(1,2,5, 500, 999,1000)], 
       aes(x=age, y=inches_predicted, color=factor(newid)))+
  geom_path()+
  geom_point()+
  geom_line(aes(y=inches_ltfu, id=factor(newid)), color='black' )+
  geom_point(aes(y=inches_ltfu, id=factor(newid), shape=factor(newid)), color='black')


##f) lme
naive_lme<-tryCatch(
{
  #naive_lme_model<-lme(inches ~ bs(age, df=15), random=~1|newid, data=observed_with_stipw);
  #naive_lme <- data.frame(age=age_vec, V1=predict(naive_lme_model, newdata=data.frame(age=age_vec), level=0), approach="naive_lme")

  ## for 7*12 timepts and 1000 subjects, df=7 gives warning.  Stay at df=5.
  ## for 7*12 timepts and 5000 subjects, df=5 gives FATAL ERROR for ses_coef=0.05
  ## for 7*12 timepts and 5000 subjects, df=5    is OK          for ses_coef=0.01
    naive_lme_proc_minutes <- system.time(
  naive_lme_model<-lmer(inches ~ bs(age, df=5) + (bs(age, df=5)|newid), data=observed_with_stipw)
          )[3]/60
  ## re.form=~0
  #naive_lme <- data.frame(age=age_vec, V1=predict(naive_lme_model, newdata=data.frame(age=age_vec), re.form=~0), approach="naive_lme")
  ## re.form=~1
  naive_lme <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean         = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = predict(naive_lme_model,
                                                     newdata=all_with_stipw),
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="naive_lme",
                          minutes          = naive_lme_proc_minutes,
                          number_pc        = NA)
},
warning =function(cond){
  write.csv(observed_with_stipw, paste0("data_that_failed_nlme_lme_fit_",abs(rnorm(1,100,100)),".csv"), row.names=FALSE)
  ## re.form=~0
  ##naive_lme <- data.frame(age=age_vec, V1=NA, approach="naive_lme")  ;
  naive_lme <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean         = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = NA,
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="naive_lme",
                          minutes          = naive_lme_proc_minutes,
                          number_pc        = NA)
},
error =function(cond){
  write.csv(observed_with_stipw, paste0("data_that_failed_nlme_lme_fit_",abs(rnorm(1,100,100)),".csv"), row.names=FALSE)
  ## re.form=~0
  #naive_lme <- data.frame(age=age_vec, V1=NA, approach="naive_lme")  ;
  naive_lme <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean         = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = NA,
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="naive_lme",
                          minutes          = naive_lme_proc_minutes,
                          number_pc        = NA)
})
setkey(naive_lme, newid, age)
# quick checks:
# summary(naive_lme)
## visual checks
ggplot(data=naive_lme[newid %in% c(1,2,5, 500, 999,1000)], 
       aes(x=age, y=inches_predicted, color=factor(newid)))+
  geom_path()+
  geom_point()+
  geom_line(aes(y=inches_ltfu, id=factor(newid)), color='black' )+
  geom_point(aes(y=inches_ltfu, id=factor(newid), shape=factor(newid)), color='black')


##f) weighted_remean_lme
wtd_remean_lme<-tryCatch(
{
  #naive_lme_model<-lme(inches ~ bs(age, df=15), random=~1|newid, data=observed_with_stipw);
  #naive_lme <- data.frame(age=age_vec, V1=predict(naive_lme_model, newdata=data.frame(age=age_vec), level=0), approach="naive_lme")



#   wtd_remean_lme_model<-lmer(inches_wtd_remean ~ bs(age, df=15) + (1|newid), data=all_with_stipw,
#                              na.action=na.omit);
  ## for 7*12 timepts and 1000 subjects, df=7 gives warning.  Stay at df=5.
      wtd_remean_proc_minutes <- system.time(
  wtd_remean_lme_model<-lmer(inches_wtd_remean ~ bs(age, df=5) + (bs(age, df=5)|newid), data=all_with_stipw,
                             na.action=na.omit)
            )[3]/60
  ## re.form=~0
  #naive_lme <- data.frame(age=age_vec, V1=predict(naive_lme_model, newdata=data.frame(age=age_vec), re.form=~0), approach="naive_lme")
  ## re.form=~1
  wtd_remean_lme <-
               data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean         = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = predict(wtd_remean_lme_model,
                                                     newdata=all_with_stipw),
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="wtd_remean_lme",
                          minutes = wtd_remean_proc_minutes,
                          number_pc = NA)
},
warning =function(cond){
  write.csv(observed_with_stipw, paste0("data_that_failed_nlme_lme_fit_",abs(rnorm(1,100,100)),".csv"), row.names=FALSE)
  ## re.form=~0
  ##naive_lme <- data.frame(age=age_vec, V1=NA, approach="naive_lme")  ;
  wtd_remean_lme <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean         = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = NA,
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="wtd_remean_lme",
                          minutes = wtd_remean_proc_minutes,
                          number_pc = NA)
},
error =function(cond){
  write.csv(observed_with_stipw, paste0("data_that_failed_nlme_lme_fit_",abs(rnorm(1,100,100)),".csv"), row.names=FALSE)
  ## re.form=~0
  #naive_lme <- data.frame(age=age_vec, V1=NA, approach="naive_lme")  ;
  wtd_remean_lme <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean         = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = NA,
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="wtd_remean_lme",
                          minutes = wtd_remean_proc_minutes,
                          number_pc = NA)
})
setkey(wtd_remean_lme, newid, age)
## quick checks:
# summary(wtd_remean_lme)
## visual checks
ggplot(data=wtd_remean_lme[newid %in% c(1,2,5, 500, 999,1000)], 
       aes(x=age, y=inches_predicted, color=factor(newid)))+
  geom_path()+
  geom_point()+
  geom_line(aes(y=inches_ltfu, id=factor(newid)), color='black' )+
  geom_point(aes(y=inches_ltfu, id=factor(newid), shape=factor(newid)), color='black')



## I have two wtd_lme chunks -- only have one uncommented at a time!
## the one immediately preceding this comment is lme(of wtd inches);
## whereas the other one below it are lme4::lmer of inches.
# wtd_lme<-tryCatch(
# {
#   wtd_lme_model<-lme(inches_wtd_hadamard ~ bs(age, df=15), random=~1|newid, data=wtd_trajectories,
#                      na.action=na.omit);
#   ##predict(wtd_lme_model, newdata=data.frame(age=age_vec), level=0)
#   wtd_lme <- data.frame(age=age_vec, V1=predict(wtd_lme_model, newdata=data.frame(age=age_vec), level=0), approach="wtd_lme")
# },
# warning =function(cond){
#   wtd_lme <- data.frame(age=age_vec, V1=NA, approach="wtd_lme")  ;
#   wtd_lme
# },
# error =function(cond){
#   wtd_lme <- data.frame(age=age_vec, V1=NA, approach="wtd_lme")  ;
#   wtd_lme
# })
# summary(wtd_lme)
#

wtd_lme<-tryCatch(
{
  #   wtd_lme_model<-lme(inches_wtd_hadamard ~ bs(age, df=15), random=~1|newid, data=wtd_trajectories,
  #                      na.action=na.omit);

#   wtd_lme_model<-lmer(inches_wtd_hadamard ~ bs(age, df=15) + (1|newid), data=wtd_trajectories,
#                       na.action=na.omit)
  wtd_lme_proc_minutes <- system.time(
  wtd_lme_model<-lmer(inches_wtd_hadamard ~ bs(age, df=5) + (bs(age, df=5)|newid), data=wtd_trajectories,
                      na.action=na.omit)
  )[3]/60
  ##predict(wtd_lme_model, newdata=data.frame(age=age_vec), level=0)
  ##wtd_lme <- data.frame(age=age_vec, V1=predict(wtd_lme_model, newdata=data.frame(age=age_vec), level=0), approach="wtd_lme")
  ## note: below, use re.form=~0 in lme4:predict is equivalent to level=0 in nlme:lme
  ## re.form=~0
  ##wtd_lme <- data.frame(age=age_vec, V1=predict(wtd_lme_model, newdata=data.frame(age=age_vec), re.form=~0), approach="wtd_lme")
  #wtd_lme_pop_mean <- data.frame(age=age_vec, V1=predict(wtd_lme_model, newdata=data.frame(age=age_vec), re.form=~0), approach="wtd_lme")

  ## re.form=~1
  wtd_lme1  <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean       = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = predict(wtd_lme_model,
                                                     newdata=all_with_stipw),
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="wtd_hadamard_lme")

  wtd_lme.test<- wts[wtd_lme1]
  ## see how many 0's
  wtd_lme.test[,table(round(stipw01.n,2))]
  ## change 0's to NA
  wtd_lme.test[stipw01.n==0, stipw01.n:=NA]
  wtd_lme.test[, inches_predicted_weighted:= inches_predicted]
  wtd_lme.test[!is.na(stipw01.n), inches_predicted_deweighted:= inches_predicted/stipw01.n]
  ## get the wtd_population_mean in there:
  ## skip this time ## wtd_lme.test[,wtd_pop_mean:=wtd_lme_pop_mean$V1,by=newid]
  ##for now, if don't have observed data there, we just imputed the weighted mean
  ## kinda lame, think on it.
  ## skip this time ## wtd_lme.test[is.na(stipw01.n), inches_predicted_deweighted:= wtd_pop_mean, by=newid]
  ## end extra steps:
  setkey(wtd_lme.test, newid, age)#, inches_predicted_deweighted)
  setnames(wtd_lme.test, "inches_predicted", "inches_predicted_old")
  setnames(wtd_lme.test, "inches_predicted_deweighted", "inches_predicted")
  wtd_lme <- subset(wtd_lme.test, select=selected)
  wtd_lme[,minutes:=wtd_lme_proc_minutes]
  wtd_lme[,number_pc:=NA]
  },
warning =function(cond){
  write.csv(wtd_trajectories, paste0("data_that_failed_lme4_lmer_fit_",abs(rnorm(1,100,100)),".csv"), row.names=FALSE)
  #wtd_lme <- data.frame(age=age_vec, V1=cond, approach="wtd_lme")  ;
  wtd_lme   <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean       = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = NA,
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="wtd_hadamard_lme",
                          minutes=wtd_lme_proc_minutes,
                          number_pc=NA)
},
error =function(cond){
  write.csv(wtd_trajectories, paste0("data_that_failed_lme4_lmer_fit_",abs(rnorm(1,100,100)),".csv"), row.names=FALSE)
  #wtd_lme <- data.frame(age=age_vec, V1=cond, approach="wtd_lme")  ;
  wtd_lme   <- data.table(newid            = all_with_stipw$newid,
                          age              = all_with_stipw$age,
                          ses              = all_with_stipw$ses,
                          inches           = all_with_stipw$inches,
                          inches_wtd_hadamard       = all_with_stipw$inches_wtd_hadamard,
                          inches_wtd_remean       = all_with_stipw$inches_wtd_remean,
                          inches_ltfu      = all_with_stipw$inches_ltfu,
                          inches_predicted = NA,
                          remaining        = all_with_stipw$remaining,
                          stipw            = all_with_stipw$stipw,
                          stipw01          = all_with_stipw$stipw01,
                          stipw01.n        = all_with_stipw$stipw01.n,
                          approach="wtd_hadamard_lme",
                          minutes=wtd_lme_proc_minutes,
                          number_pc=NA)
})
setkey(wtd_lme, newid, age)
## quick checks:
# summary(wtd_lme)
## visual checks
ggplot(data=wtd_lme[newid %in% c(1,2,5, 500, 999,1000)], 
       aes(x=age, y=inches_predicted, color=factor(newid)))+
  geom_path()+
  geom_point()+
  geom_line(aes(y=inches_ltfu, id=factor(newid)), color='black' )+
  geom_point(aes(y=inches_ltfu, id=factor(newid), shape=factor(newid)), color='black')




#rbind it!!!!!!!!!!!!!



## plot each approach's mean
## means <- rbind(true_avg, naive_non_parm_avg, wtd_non_parm_avg, naive_fpc, naive_fpc_pabw, weighted_fpc, naive_lme, wtd_lme)
# note: we're simplifying to (w)fpca and (w)lme, so we rbind() fewer than line above (if you decided to add more approaches later,
#         make sure they have same format):
means <- rbind(naive_fpc, weighted_fpc, weighted_remean_fpc, naive_lme, wtd_lme, wtd_remean_lme)
means[,     newid := as.integer(newid)]
means[, remaining := as.integer(remaining)]
setkey(means, "newid","age")



didya<-dcast(means, newid+age ~ approach, value.var="inches_predicted")
## after rbind multiple instances, use this to melt it
##melt.didya <- melt(didya, id.vars=c("newid","age"), variable="approach", value="inches_predicted")


# ## make additions columnwise, not rbind-wise.  Save those GBs.  Can melt once in memory.
# key(naive_fpc)
# key(weighted_fpc)
# key(weighted_remean_fpc)
# key(naive_lme)
# key(wtd_lme)
# key(wtd_remean_lme)
#
base.select <-
            c("newid","age", "ses",
              "inches", "inches_wtd_hadamard","inches_wtd_remean", "inches_ltfu",
#               "inches_predicted",
               "remaining",
               "stipw",
               "stipw01",
               "stipw01.n"#,
#               "approach"
              )

base <- subset(naive_fpc, select=base.select)

base_means <- base[didya]




# from older files:
# means$approach<-factor(means$approach, levels=unique(means$approach))
# colnames(means)[colnames(means)=="V1"]<- "inches"


## next three chunks deal with missingness, then we rbind it us with means....

## this is a useful chunk to check the range of
## probality of being censored (prob.cens)
#   ddply(censored, .(age), function(w) sum(w$instudy==1) )
#   melt.prob.cens=ddply(censored, .(newid,age), function(w) w$prob.cens )
#   dcast.prob.cens=dcast(melt.prob.cens, newid~age, value.var="V1")
#   apply(dcast.prob.cens, 2, function(w) round(range(w),2))
#   head(censored,18)

## do the following to get precent missing at age 18:
## overall:
dcast.wtd.trajectories<-dcast(calculate_wtd_trajectories(calculate_stipw(censored,"keep")), newid~age, value.var="stipw")
percent.missing.at.age.18=sum(is.na(dcast.wtd.trajectories["18"]))/length(unlist(dcast.wtd.trajectories["18"]))
percent.missing = colSums(is.na(dcast.wtd.trajectories[,-1]))/length(unlist(dcast.wtd.trajectories["18"]))

## below/above median SES:
dcast.wtd.trajectories<-dcast(calculate_wtd_trajectories(calculate_stipw(censored,"keep")), newid+ses~age, value.var="stipw")
medianSES<-median(dcast.wtd.trajectories$ses, na.rm=TRUE)
subbie<-subset(dcast.wtd.trajectories, ses <= medianSES,"18")
percent.missing.at.age.18.below.median=sum(is.na(subbie))/nrow(subbie)
subbie<-subset(dcast.wtd.trajectories, ses <= medianSES, select=c(-1,-2))
percent.missing.below.median=colSums(is.na(subbie))/nrow(subbie)
subbie<-subset(dcast.wtd.trajectories, ses  > medianSES,"18")
percent.missing.at.age.18.above.median=sum(is.na(subbie))/nrow(subbie)
subbie<-subset(dcast.wtd.trajectories, ses  > medianSES, select=c(-1,-2))
percent.missing.above.median=colSums(is.na(subbie))/nrow(subbie)

## note: this cbind() works because every age is present for each dataset in `means`
##       and the only non-scalars are vectors that are same length as number of age-levels
## return:
results <- cbind(
        sim_seed                     = as.integer(sim_seed),
        sample_size                  = as.integer(sample_size),
        sim_slope                    = as.integer(sim_slope),
        sim_intercept                = as.integer(sim_intercept),
        sim_ses_coef                 = sim_ses_coef,
        sim_age_coef                 = sim_age_coef,
        perc_ltfu_18                 = percent.missing.at.age.18,
        percent_missing              = percent.missing,
        percent_missing_below_median = percent.missing.below.median,
        percent_missing_above_median = percent.missing.above.median,
        #means,
        base_means)
## note:  choose means OR base_means.  means is from rbind() above and multiplies rows by 6.



## started tinkering with Out Sample predictions...please see `outsample_sim.R`
#
# ##a) naive-nonparametric: that is, just na.rm=TRUE and turn the mean() crank
# ## the best we can do to predict for those missing is to predict the mean curve for everyone
#
#
#
#
#
# ## DO SOME PREDICTIONS
# ## put the following as function inputs... then remove!
# oos_sample_size=100
# oos_age=5
# oos_ind=as.numeric(names(d[,-1])) <= oos_age
# oos_timepoints=as.numeric(names(d[,-1]))[oos_ind]
# oos<-sample_data_fpc(d, oos_sample_size, seed=sim_seed, timepoints=as.numeric(names(d[,-1])))
# oos.early <- oos
# oos.early[,!c(TRUE, oos_ind)] <- NA
# ##oos.early <- rbind(c(999999,fpca_wtd_fncs$mu), oos.early)
#
#   oos.early <- oos[,c(TRUE, oos_ind)]
#
#
# oos.late <- oos[,c(TRUE, !oos_ind)]
#
# ##e) weighted-FPC.
# ##wtd_fncs <- dcast(data=wtd_trajectories, formula= newid~age, value.var="inches_wtd_hadamard")
# fpca_wtd_fncs_pred_oos <- fpca.face(Y=as.matrix(wtd_fncs[,-1]), Y.pred=as.matrix(oos.early[,-1]), argvals=age_vec, knots=10)
#
# weighted_fpc_pred_oos <- data.frame(age=age_vec, V1=fpca_wtd_fncs_pred_oos$mu, approach="weighted_fpc_pred_oos")
#
# fpca_wtd_fncs_pred_oos$Yhat
# fpca_wtd_fncs_pred_oos$scores
#
# oos<-sample_data_fpc(d[,c(TRUE,oos_ind)], oos_sample_size, seed=sim_seed, timepoints=oos_timepoints, knots.face=4)
#
#
#








##  currently not using; going to put some of these in RDS itself
# label=paste("sim_seed", sim_seed,
#       "sample_size", sample_size,
#       "sim_slope", sim_slope,
#       "sim_intercept", sim_intercept,
#       "sim_ses_coef", sim_ses_coef,
#       "sim_age_coef", sim_age_coef,
#       sep="_")


idnum1<-abs(round(10000000*rnorm(1)))
midletter<-sample(letters)[1]
idnum2<-abs(round(10000000*rnorm(1)))
##saveRDS(results,paste0("results_",label,".RDS" ))
saveRDS(results,paste0("results_",idnum1, midletter,idnum2, ".RDS"))



setkey(means, approach)
proc_minutes_number_pcs<-subset(unique(means),
                   select=c("approach","minutes","number_pc"))


write.csv(proc_minutes_number_pcs,
        paste0("proc_minutes_number_pcs_",idnum1, midletter,idnum2, ".csv"),
        row.names=FALSE)


}
swihart/wfpca documentation built on May 30, 2019, 10:38 p.m.