sandbox/estimate_monte_carlo_matchProb.R

#' Compute the Monte Carlo estimate
#'
#' This function computes the Monte Carlo estimate of the expected value of the
#' parameter of interest under specified intervention and under the estimated
#' mechanism.
#'
#' @param fit \code{fit} object obtained by \code{initEst}.
#' @param start Start generating Monte Carlo estimates from this point %O_i. The
#'  earliest time point is 1. For the clever covariate, this is also our s.
#' @param node Start generating Monte Carlo estimates from O_i node W, A or Y.
#' @param t Outcome time point of interest. It must be greater than the intervention node A.
#' @param Anode Intervention node.
#' @param intervention Specify %g^*, of %P(A \mid \text{past}).
#' @param lag This is an user impossed dependency lag necessary for the
#'  calculation of the clever covariate in the targeting step. It refers to
#'  %O_i, and it is of the same dimension as the actual C(i), but further in the
#'  past. Default is 0, meaning that it starts right after the node in question.
#'  Also, note that lag should be equal or smaller than start.
#' @param MC How many Monte Carlo samples should be generated.
#' @param returnMC If \code{TRUE}, returns all MC draws up until outcome time.
#' @param returnMC_full If \code{TRUE}, returns all full MC draws.
#' @param clevCov If \code{TRUE}, this MC is used for the calculation of the
#'  clever covariate. Instead of observed data, it used intervened %P^* for further MC draws.
#' @param set Set the s node to either 1 or 0. Used for the clever covariate calculation.
#' @param full \code{TRUE} if full time-series should be used.
#'
#' @return An object of class \code{tstmle01}.
#' \describe{
#' \item{estimate}{Mean of the outcome at time t under specified intervention, or no intervention.}
#' \item{outcome}{Outcome at time t for each MC interation.}
#' \item{intervention}{Intervention specified.}
#' \item{MC}{How many Monte Carlo samples should be generated.}
#' \item{Anode}{Intervention node as a function of O_i.}
#' \item{MCdata}{If \code{returnMC} is \code{TRUE}, returns a \code{data.frame} with MC time-series.}
#' \item{set}{Returns what the s node was set to.}
#' }
#'
#' @importFrom stats complete.cases rbinom plogis predict
#' @importFrom Hmisc Lag
#' @importFrom parallel makeCluster detectCores stopCluster
#' @importFrom doParallel registerDoParallel
#' @importFrom foreach foreach
#'
#' @export
#

mcEst <- function(fit, start = 1, node = "W", t, Anode, intervention = NULL, lag = 0, 
                  MC=100, returnMC = FALSE, returnMC_full = FALSE, clevCov = FALSE, set = NULL, 
                  full=FALSE) {
  
  # Checks
  if (start < lag) {
    stop("Start time has to be equal or greater than how far in the past we want to go. Pick some start>=lag.")
  }
  
  if (start <= 0) {
    stop("First point in a time series is 1.")
  }
  
  if (Anode <= 0) {
    stop("Anode must be a positive number indicating O_i in a time series.")
  }
  
  if (t <= 0) {
    stop("Outcome must be a positive number indicating O_i in a time series.")
  }
  
  if (t < Anode) {
    warning("Still intervention will not have any effect on the specified outcome.")
  }
  
  #Move this later
  matchProb <- function(fit,node,covs){
    
    if(node=="W"){
      uniq<-fit$combW
      test<-!is.na(prodlim::row.match(uniq[,-ncol(uniq)], covs))
      prob<-uniq[test,ncol(uniq)]
    }else if(node=="A"){
      uniq<-fit$combA
      test<-!is.na(prodlim::row.match(uniq[,-ncol(uniq)], covs))
      prob<-uniq[test,ncol(uniq)]
    }else if(node=="Y"){
      uniq<-fit$combY
      test<-!is.na(prodlim::row.match(uniq[,-ncol(uniq)], covs))
      prob<-uniq[test,ncol(uniq)]
    }
    
    return(prob)
  }
  
  # How many in a batch:
  step <- fit$step
  
  # If estMC is used for clever covariate calculation, use P^*.
  if (clevCov == TRUE && full==FALSE) {
    
    #Already generated O^* with intervention at Anode:
    data <- fit$p_star
    
    #Set s node to "set" (1/0), for Y, A, W.
    #Taking advantage of all MC samples, instead of just averaging over ones that have 1 or 0. 
    #Note: node is the first node that is MCed. So we go back to set the node before.
    if (node == "W") {
      #Set Y:
      #(note, start=s+1 here)
      data[(start * step), ] <- set
    } else if (node == "Y") {
      #Set A:
      #(note, start=s here)
      data[(start * step) + 2, ] <- set
    } else if (node == "A") {
      #Set W:
      #(note, start=s here)
      data[(start * step) + 1, ] <- set
    }
  }else if(clevCov == FALSE && full==TRUE){
    #When simulating p_star and ps for h-densities, use the full data.
    #Shortening should be only for clever cov calculation
    data <- fit$data_orig
  }else if(clevCov == FALSE && full==FALSE){
    data <- fit$data
  }else if(clevCov == TRUE && full==TRUE){
    warning("n for clever covariate and full n might not be the same! Make sure this is the right option.")
  }
  
  # Prepare to return all MCs (up to time t (outcome) generated draws)
  if (returnMC == TRUE & returnMC_full == TRUE) {
    warning("Can't return both a full time-series and a shorter time-series. Returning full time-series.")
  }
  
  # Impose artifical order (C(i)) for the clever covariate calculation.
  # Should default to whatever was estimated in initEst()
  
  if (lag < 0) {
    
    # If lag is negative, move the time series up w.r.t. reference
    
    # This gives further up lag first
    res <- lapply((step * lag + 1):(fit$freqW + step * lag), function(x) {
      Hmisc::Lag(data[, 1], x)
    })
    res <- data.frame(matrix(unlist(res), nrow = length(res[[1]])),
                      stringsAsFactors = FALSE)
    
    data_est_full <- cbind.data.frame(data = data, res)
    data_lag <- data.frame(data_est_full[, -1])
    
    n <- fit$n_true
    
    # Now have to take into account it does not start at 1...
    if (clevCov == FALSE & returnMC_full == FALSE) {
      data_lag <- data_lag[1:((t * step) - (lag * step)), ]
      estNames <- row.names(data_lag)
    }
    
    # Now that we have all future lags, shorten the time-series.
    if (returnMC_full == FALSE & n > t) {
      data_lag <- data_lag[1:((t + 1) * step), ]
      estNames <- row.names(data_lag)[1:((t + 1) * step)]
    }
    
    #Check the time-series is long enough for this estimation step:
    if(is.na(data_lag[nrow(data_lag),2])){
      stop("The time-series considered is too short for this estimation problem. 
           Consider having a longer time-series, or an earlier outcome point.")
    }
    
    # get actual index of Anode
    Anode <- Anode * step + 2
    
    # Get actual index for start, depended on W.
    # (right now, start is set to be the batch, not actual index)
    start <- start * step + 1
    
    }else if (lag >= 0) {
      
      # This is ok under the assumption orders are the same dimension.
      # Note that "lags" in the clever covariate are defined by the size of the "step".
      # TODO: Change this to an option where we can have varying dimensions for W,A,Y.
      
      res <- lapply((1 + step * lag):(fit$freqW + step * lag), function(x) {
        Hmisc::Lag(data[, 1], x)
      })
      res <- data.frame(matrix(unlist(res), nrow = length(res[[1]])),
                        stringsAsFactors = FALSE)
      
      data_est_full <- cbind.data.frame(data = data, res)
      
      n <- fit$n_true
      
      # Drop time 0 for estimation
      # Notice: drop step*(lag+1) each time
      cc <- stats::complete.cases(data_est_full)
      data_est <- data_est_full[cc, ]
      data_lag <- data.frame(data_est[-1, ])
      data_lag <- data.frame(data_lag[, -1])
      
      # Option for returnMC
      if (clevCov == FALSE & returnMC_full == FALSE) {
        data_lag <- data_lag[1:((t * step) - (lag * step)), ]
        estNames <- row.names(data_lag)
      }
      
      # Now that we have all future lags, shorten the time-series. 
      # Unless we are returning the full MC
      if (returnMC_full == FALSE & n > t) {
        data_lag <- data_lag[1:((t * step)-(lag*step)), ]
        estNames <- row.names(data_lag)[1:((t * step)-(lag*step))]
      }
      
      # get actual index of Anode
      Anode <- Anode * step + 2 - (lag + 1) * step
      
      # Get actual index for start, depended on W.
      # (right now, start is set to be the batch, not actual index)
      start <- start * step - (lag + 1) * step + 1
    }
  
  # TO DO: Later on will need to include more Ws
  
  # i always dependent on W iterations. Make everything offset of W
  #NOTE: last newY is at for data_lag shortened to tau just Y(\tau).
  mainMC<-function(data_lag){
    
    iter <- 1
    
    for (i in seq(start, nrow(data_lag), step)) {
      
      # Possibly need to skip the update of W and A at the first iteration,
      # depending on from which node we start. Hence the need for iter and ommitting some nodes.
      
      # Possibly need to condition on earlier time points for calculating our clever covariate.
      # This is our i in the clever covariate.
      
      if (iter == 1) {
        
        # Start at A.
        if (node == "A") {
          if (is.null(intervention)) {
            # Update for Y
            prob<-matchProb(fit,node="A",data_lag[i + 1, ])
            newA <- stats::rbinom(1, 1, prob)
            
            if((i + 2 + (lag*step))>0){
              data_lag[(i + 2 + (lag*step)), 1] <- newA
            }else{
              #Save the obtained newA in place of new.
              data_lag[i + 1, 1]<-newA
            }
          } else if (!is.null(intervention)) {
            if ((i + 1) == Anode) {
              # Update for Y
              newA <- stats::rbinom(1, 1, intervention)
              
              if((i + 2 + (lag*step))>0){
                data_lag[(i + 2 + (lag*step)), 1] <- newA
              }else{
                data_lag[i + 1, 1]<-newA
              }
            } else {
              # Update for Y
              prob<-matchProb(fit,node="A",data_lag[i + 1, ])
              newA <- stats::rbinom(1, 1, prob)
              
              if((i + 2 + (lag*step))>0){
                data_lag[(i + 2 + (lag*step)), 1] <- newA 
              }else{
                data_lag[i + 1, 1]<-newA
              }
            }
          }
          
          # Update for next W
          prob<-matchProb(fit,node="Y",data_lag[i + 2, ])
          newY <- stats::rbinom(1, 1, prob)
          
          if((i + 3 + (lag*step))>0){
            data_lag[(i + 3 + (lag*step)), 2] <- newA
            data_lag[(i + 3 + (lag*step)), 1] <- newY
          }else{
            data_lag[i + 2, 1]<-newY
          }
          
          #Start at Y. 
        }else if (node == "Y") {
          
          # Now we skip both W and A, start generating MC draws from Y.
          # Update for next W
          prob<-matchProb(fit,node="Y",data_lag[i + 2, ])
          newY <- stats::rbinom(1, 1, prob)
          
          if((i + 3 + (lag*step))>0){
            data_lag[(i + 3 + (lag*step)), 1] <- newY
          }else{
            data_lag[i + 2, 1]<-newY
          }
          
          #Start at W.   
        }else if (node == "W") {
          
          # Don't skip anything, start with W.
          # Update for A
          prob<-matchProb(fit,node="W",data_lag[i, ])
          newW <- stats::rbinom(1, 1, prob)
          
          if((i + 1 + (lag*step))>0){
            data_lag[(i + 1 + (lag*step)), 1] <- newW 
          }else{
            data_lag[i, 1]<-newW
          }
          
          if (is.null(intervention)) {
            # Update for Y
            prob<-matchProb(fit,node="A",data_lag[i + 1, ])
            newA <- stats::rbinom(1, 1, prob)
            
            if((i + 2 + (lag*step))>0){
              data_lag[(i + 2 + (lag*step)), 2] <- newW
              data_lag[(i + 2 + (lag*step)), 1] <- newA 
            }else{
              data_lag[i + 1, 1]<-newA
            }
          } else if (!is.null(intervention)) {
            if ((i + 1) == Anode) {
              # Update for Y
              newA <- rbinom(1, 1, intervention)
              
              if((i + 2 + (lag*step))>0){
                data_lag[(i + 2 + (lag*step)), 2] <- newW
                data_lag[(i + 2 + (lag*step)), 1] <- newA
              }else{
                data_lag[i + 1, 1]<-newA
              }
            } else {
              # Update for Y
              prob<-matchProb(fit,node="A",data_lag[i + 1, ])
              newA <- stats::rbinom(1, 1, prob)
              
              if((i + 2 + (lag*step))>0){
                data_lag[(i + 2 + (lag*step)), 2] <- newW
                data_lag[(i + 2 + (lag*step)), 1] <- newA
              }else{
                data_lag[i + 1, 1]<-newA
              }
            }
          }
          
          # Update for next W
          # In the last iteration this will be NA... This is ok for now, just one more row.
          prob<-matchProb(fit,node="Y",data_lag[i + 2, ])
          newY <- stats::rbinom(1, 1, prob)
          
          if((i + 3 + (lag*step))>0){
            data_lag[(i + 3 + (lag*step)), 2] <- newA
            data_lag[(i + 3 + (lag*step)), 1] <- newY 
          }else{
            data_lag[i + 2, 1]<-newY
          }
          
        }
        
        #Later on iterations.  
      }else {
        
        # For later iterations, do as always.
        # MC draws start from "start" and W node.
        
        # Update for A
        prob<-matchProb(fit,node="W",data_lag[i, ])
        newW <- stats::rbinom(1, 1, prob)
        
        if((i + 1 + (lag*step))>0){
          data_lag[(i + 1 + (lag*step)), 1] <- newW
          data_lag[(i + 1 + (lag*step)), 2] <- newY 
        }else{
          data_lag[i, 1]<-newW
        }
        
        #Intervention=NULL
        if (is.null(intervention)) {
          # Update for Y
          prob<-matchProb(fit,node="A",data_lag[i + 1, ])
          newA <- stats::rbinom(1, 1, prob)
          
          if((i + 2 + (lag*step))>0){
            data_lag[(i + 2 + (lag*step)), 2] <- newW
            data_lag[(i + 2 + (lag*step)), 1] <- newA
          }else{
            data_lag[i + 1, 1]<-newA
          }
        } else if (!is.null(intervention)) {
          
          #Intervention at the current node batch
          if ((i + 1) == Anode) {
            # Update for Y
            newA <- stats::rbinom(1, 1, intervention)
            
            if((i + 2 + (lag*step))>0){
              data_lag[(i + 2 + (lag*step)), 2] <- newW
              data_lag[(i + 2 + (lag*step)), 1] <- newA
            }else{
              data_lag[i + 1, 1]<-newA
            }
            
            #There is an intervention but not in this batch  
          } else {
            # Update for Y
            prob<-matchProb(fit,node="A",data_lag[i + 1, ])
            newA <- stats::rbinom(1, 1, prob)
            
            if((i + 2 + (lag*step))>0){
              data_lag[(i + 2 + (lag*step)), 2] <- newW
              data_lag[(i + 2 + (lag*step)), 1] <- newA
            }else{
              data_lag[i + 1, 1]<-newA
            }
          }
        }
        
        # Update for next W
        # In the last iteration this will be NA... This is ok for now, just one more row.
        prob<-matchProb(fit,node="Y",data_lag[i + 2, ])
        newY <- stats::rbinom(1, 1, prob)
        
        if((i + 3 + (lag*step))>0){
          data_lag[(i + 3 + (lag*step)), 2] <- newA
          data_lag[(i + 3 + (lag*step)), 1] <- newY 
        }else{
          data_lag[i + 2, 1]<-newY
        }
        
      }
      
      iter <- iter + 1
    }
    
    if(returnMC == TRUE || returnMC_full == TRUE){
      
      if(lag>=0){
        if(node=="A"){
          data_lag_MC <- data.frame(data=data_lag[(lag*step+3):nrow(data_lag), 1])
          row.names(data_lag_MC)<-row.names(data_lag)[2:(nrow(data_lag)-(lag*step+1))]
          
          #data_lag is truncated to go from start. Add what's missing
          missData<-data.frame(data=data[1:(step+1+(abs(lag)*step)),])
          row.names(missData)<-row.names(data)[1:(step+1+(abs(lag)*step))]
          data_lag_MC<-rbind.data.frame(missData,data_lag_MC)
        }else if(node=="W"){
          data_lag_MC <- data.frame(data=data_lag[(lag*step+2):nrow(data_lag), 1])
          row.names(data_lag_MC)<-row.names(data_lag)[1:(nrow(data_lag)-(lag*step+1))]
          
          #data_lag is truncated to go from start. Add what's missing
          missData<-data.frame(data=data[1:(step+(abs(lag)*step)),])
          row.names(missData)<-row.names(data)[1:(step+(abs(lag)*step))]
          data_lag_MC<-rbind.data.frame(missData,data_lag_MC)
        }else if(node=="Y"){
          data_lag_MC <- data.frame(data=data_lag[(lag*step+4):nrow(data_lag), 1])
          row.names(data_lag_MC)<-row.names(data_lag)[3:(nrow(data_lag)-(lag*step+1))]
          
          #data_lag is truncated to go from start. Add what's missing
          missData<-data.frame(data=data[1:(step+2+(abs(lag)*step)),])
          row.names(missData)<-row.names(data)[1:(step+2+(abs(lag)*step))]
          data_lag_MC<-rbind.data.frame(missData,data_lag_MC)
        }
        #TO DO: Investigate for starting node W,A,Y  
      }else if(lag<0){
        
        if((nrow(data_lag)+(lag*step+1))>0){
          #Goes until the end
          data_lag_MC<-data.frame(data=data_lag[1:(nrow(data_lag)+(lag*step+1)),1])
          row.names(data_lag_MC)<-row.names(data_lag)[-(lag*step):nrow(data_lag)]
          
          #data_lag is truncated to go from start. Add what's missing
          missData<-data.frame(data=data[1:-(lag*step+1),])
          row.names(missData)<-row.names(data)[1:-(lag*step+1)]
          data_lag_MC<-rbind.data.frame(missData,data_lag_MC)
        }else{
          #No updating occured: return what was calculated based on the original ts
          data_lag_MC<-data.frame(data=data_lag[start:nrow(data_lag),1])
          data_lag_MC<-rbind.data.frame(data.frame(data=data_est_full[1:(start-1),1]),data_lag_MC)
          row.names(data_lag_MC)<-row.names(data_lag)
        }
        
      }
      
      return(list(newY=newY,dataMC=data_lag_MC))
      
    }else {
      return(list(newY=newY))
    }
    
  }
  
  #Set up to run in parallel
  no_cores <- parallel::detectCores() - 1
  myCluster <- parallel::makeCluster(no_cores)
  doParallel::registerDoParallel(myCluster)
  
  #Tracks Y(\tau) and MC data
  outcome_all<-foreach::foreach(1:MC, .combine = c) %dopar% mainMC(data_lag=data_lag)
  base::on.exit(parallel::stopCluster(myCluster))
  
  if(returnMC == TRUE){
    
    a <- 1:(2*MC)
    
    #Save Y(\tau)
    outcome<-unlist(lapply(a[seq(1, length(a), 2)], function(x) outcome_all[x]$newY))
    
    #Save MC data
    dataMC<-lapply(a[seq(2, length(a), 2)], function(x) outcome_all[x]$dataMC)
    dataMC <- data.frame(matrix(unlist(dataMC), nrow=(step*(t+1)), byrow=F))
    names(dataMC)<-paste0("MC",1:MC)
    row.names(dataMC)<-row.names(data)[1:(step*(t+1))]
    
  }else if(returnMC_full == TRUE){
    
    a <- 1:(2*MC)
    
    #Save Y(\tau)
    outcome<-unlist(lapply(a[seq(1, length(a), 2)], function(x) outcome_all[x]$newY))
    
    #Save MC data
    dataMC<-lapply(a[seq(2, length(a), 2)], function(x) outcome_all[x]$dataMC)
    dataMC <- data.frame(matrix(unlist(dataMC), nrow=(step*(n+1)), byrow=F))
    names(dataMC)<-paste0("MC",1:MC)
    row.names(dataMC)<-row.names(data)
    
  }else{
    #Save Y(\tau)
    outcome<-unlist(lapply(1:MC, function(x) outcome_all[x]$newY))
  }
  
  if (returnMC == TRUE || returnMC_full == TRUE) {
    return(list(estimate = mean(outcome), outcome = outcome, intervention = intervention,
                MC = MC, t = t, Anode = Anode, set=set, MCdata = dataMC))
    
  }else {
    return(list(estimate = mean(outcome), outcome = outcome, intervention = intervention,
                MC = MC, t = t, Anode = Anode, set=set))
  }
  
}
podTockom/tstmle01 documentation built on May 14, 2019, 2:03 a.m.