R/sampling_gpu.R

#' Run a cmdstan model
#'
#' Runs a cmdstan model. All parameters with the number 2 in the name are
#' only used in case of 2 scores.
#'
#' @param df Input data frame.
#' @param mod_name character. The name of the model. For now the only possible
#' value is "glm".
#' @param SubjectIdVar Name of the column which holds the subject IDs.
#' Must be without quotation marks.
#' @param StudyIdVar Name of the column which holds the study IDs.
#' Must be without quotation marks.
#' @param TimeVar Name of the column which holds the times.
#' Must be without quotation marks.
#' @param ScoreVar Name of the column which holds the scores (responses).
#' Must be without quotation marks.
#' @param is_pbo Name of the placebo column for score 1. Must be without 
#' quotation marks.
#' @param CovariatesR formula. Formula of the form ~ A + B + ..., where
#' the letters represent the names of columns of the covariates for rate.
#' @param CovariatesB formula. Formula of the form ~ A + B + ..., where
#' the letters represent the names of columns of the covariates for baseline.
#' @param m_r binary. 0 if the covariates for rate are additive, 1 if they
#' are multiplicative.
#' @param m_b binary. 0 if the covariates for baseline are additive, 1 if they
#' are multiplicative.
#' @param ScoreVar2 Name of the column which holds the second scores
#' (responses) if applicable.
#' Must be without quotation marks.
#' @param is_pbo2 Name of the placebo column for score 2. Must be without 
#' quotation marks.
#' @param CovariatesR2 formula. Formula of the form ~ A + B + ..., where
#' the letters represent the names of columns of the covariates for rate.
#' @param CovariatesB2 formula. Formula of the form ~ A + B + ..., where
#' the letters represent the names of columns of the covariates for baseline.
#' @param m_r2 binary. 0 if the covariates for rate are additive, 1 if they
#' are multiplicative.
#' @param m_b2 binary. 0 if the covariates for baseline are additive, 1 if they
#' are multiplicative.
#' @param gpu_enabled binary. 1 to use the GPU model, 0 to use the CPU model.
#' @param init_list list. The list of initialization parameters for the model.
#' @param num_samples integer. The number of samples.
#' @param num_warmup integer. The number of samples in the warmup phase of the
#' sampling.
#' @param seed integer. Random seed for the model.
#' @param ... Other parameters for the sampler.
#'
#' @return A list with 4 elements. 
#' \item{stan_model}{An object of stanfit. Fitted model.}
#' \item{data_used}{The exact data that stan model used.}
#' \item{maps}{The maps from original subject and study IDs to the IDs used in the model.}
#' \item{pred_samples}{The predicted values for each iteration.}
#'
sampling_gpu        <- function (df,
                                 mod_name        = "glm",
                                 SubjectIdVar    = IDp,
                                 StudyIdVar      = IDs,
                                 TimeVar         = time,
                                 ScoreVar        = score,
                                 is_pbo          = placebo,
                                 CovariatesR     = NULL,
                                 CovariatesB     = NULL,
                                 m_r             = 0,
                                 m_b             = 0,
                                 ScoreVar2       = NULL,
                                 is_pbo2         = NULL,
                                 CovariatesR2    = NULL,
                                 CovariatesB2    = NULL,
                                 m_r2            = 0,
                                 m_b2            = 0,
                                 gpu_enabled     = 1,
                                 init_list       = NULL,
                                 num_samples     = 1000, # Stan parameters
                                 num_warmup      = 1000,
                                 save_warmup     = 0,
                                 thin            = 1,
                                 engaged         = 1,
                                 gamma           = 0.050000000000000003,
                                 delta           = 0.80000000000000004,
                                 kappa           = 0.75,
                                 t0              = 10,
                                 init_buffer     = 75,
                                 term_buffer     = 50,
                                 window          = 25,
                                 algorithm       = "hmc",
                                 engine          = "nuts",
                                 max_depth       = 10,
                                 metric          = "diag_e",
                                 metric_file     = "",
                                 stepsize        = 1 ,
                                 stepsize_jitter = 0,
                                 id              = 0,
                                 seed            = 1607674300,
                                 ...) {
  if ((m_r == 1) | (m_r2 == 1) | (m_b == 1) | (m_b2 == 1)) {
    stop("Multiplicative model currently not supported.")
  }
  
  df_to_list <- function (df,
                          IDp            = NULL,
                          IDs            = NULL,
                          times          = NULL,
                          score          = NULL,
                          covr_nms       = NULL,
                          covb_nms       = NULL,
                          m_r            = NULL,
                          m_b            = NULL,
                          is_pbo         = NULL,
                          score2         = NULL,
                          covr2_nms      = NULL,
                          covb2_nms      = NULL,
                          m_r2           = NULL,
                          m_b2           = NULL,
                          is_pbo2        = NULL
  ) {
    s1_ind <- df[ ,score]
    df1    <- df[!is.na(s1_ind), ]
    
    # Check for NAs and NULL
    df_vars <- df1[ ,c(IDp, IDs, times, score, covr_nms, covb_nms, is_pbo)]
    if(any(is.na(df_vars))) {
      stop("Missing values in the data frame for score 1.")
    }
    if(any(is.null(df_vars))) {
      stop("NULL values in the data frame for score 1.")
    }
    
    N1      <- nrow(df1)
    P1      <- length(unique(df1[ ,IDp]))
    M1      <- length(unique(df1[ ,IDs]))
    
    # re-index
    reindex <- function  (x) {
      new_inds <- x
      for (i in 1:length(x)) {
        new_inds[i] <- which(x[i] == unique(x))
      }
      return (new_inds)
    }
    # sort by IDs and IDp
    df1 <- df1[order(df1[ ,IDs], df1[ ,IDp]), ]
    
    # patients
    IDp1      <- df1[ ,IDp]
    original  <- IDp1
    IDp1      <- reindex(IDp1)
    reindexed <- IDp1
    IDp1_map  <- unique(data.frame(original, reindexed))
    
    # studies
    IDs1      <- df1[ ,IDs]
    original  <- IDs1
    IDs1      <- reindex(IDs1)
    reindexed <- IDs1
    IDs1_map  <- unique(data.frame(original, reindexed))
    
    map_list <- list(
      "patient1" = IDp1_map,
      "study1"   = IDs1_map
    )
    
    # other variables
    times1  <- df1[ ,times]
    S1      <- df1[ ,score]
    covsr1  <- as.matrix(df1[ ,covr_nms])
    covsb1  <- as.matrix(df1[ ,covb_nms])
    is_pbo1 <- df1[ ,is_pbo]
    
    # get placebo studies
    tmp     <- data.frame(IDs1, is_pbo1)
    
    tmp     <- unique(tmp)
    
    is_pbo1 <- tmp[order(tmp[ ,1]),2]
    
    
    out_list <- list(
      N                = N1,
      P                = P1,
      M                = M1,
      Q_r              = ncol(covsr1),
      Q_s              = ncol(covsb1),
      multiplicative_s = m_b,
      multiplicative_r = m_r,
      X_r              = covsr1,
      X_s              = covsb1,
      is_pbo           = is_pbo1,
      IDp              = IDp1,
      IDs              = IDs1,
      time             = times1,
      score1           = S1
    )
    
    if (!is.null(score2)) {
      s2_ind <- df[ ,score2]
      df2    <- df[!is.na(s2_ind), ]
      
      df_vars <- df2[ ,c(IDp, IDs, times, score2, covr2_nms, covb2_nms, is_pbo2)]
      if(any(is.na(df_vars))) {
        stop("Missing values in the data frame for score 2.")
      }
      if(any(is.null(df_vars))) {
        stop("NULL values in the data frame for score 2.")
      }
      
      N2      <- nrow(df2)
      P2      <- length(unique(df2[ ,IDp]))
      M2      <- length(unique(df2[ ,IDs]))
      
      # sort by IDs and IDp
      df2 <- df2[order(df2[ ,IDs], df2[ ,IDp]), ]
      
      # reindex
      # patients
      IDp2      <- df2[ ,IDp]
      original  <- IDp2
      IDp2      <- reindex(IDp2)
      reindexed <- IDp2
      IDp2_map  <- unique(data.frame(original, reindexed))
      
      # studies
      IDs2      <- df2[ ,IDs]
      original  <- IDs2
      IDs2      <- reindex(IDs2)
      reindexed <- IDs2
      IDs2_map  <- unique(data.frame(original, reindexed))
      
      map_list <- list(
        "patient1" = IDp1_map,
        "study1"   = IDs1_map,
        "patient2" = IDp2_map,
        "study2"   = IDs2_map
      )
      
      times2  <- df2[ ,times]
      S2      <- df2[ ,score2]
      covsr2  <- as.matrix(df2[ ,covr2_nms])
      covsb2  <- as.matrix(df2[ ,covb2_nms])
      is_pbo2 <- df2[ ,is_pbo2]
      
      # get placebo studies
      tmp     <- data.frame(IDs2, is_pbo2)
      tmp     <- unique(tmp)
      is_pbo2 <- tmp[order(tmp[ ,1]),2]
      
      # Prepare mapping of patients for stan
      have_both <- base::intersect(map_list$patient1$original,
                                   map_list$patient2$original)
      pmap_stan <- data.frame("original" = have_both)
      pmap_stan <- merge(pmap_stan, map_list$patient1, by = "original")
      pmap_stan <- merge(pmap_stan, map_list$patient2, by = "original")
      pmap_stan <- pmap_stan[ ,c("reindexed.x", "reindexed.y")]
      
      pother1 <- map_list$patient1$reindexed[!(map_list$patient1$original %in% have_both)]
      pother2 <- map_list$patient2$reindexed[!(map_list$patient2$original %in% have_both)]
      
      # prepare mapping of studies for stan
      have_both <- base::intersect(map_list$study1$original,
                                   map_list$study2$original)
      smap_stan <- data.frame("original" = have_both)
      smap_stan <- merge(smap_stan, map_list$study1, by = "original")
      smap_stan <- merge(smap_stan, map_list$study2, by = "original")
      smap_stan <- smap_stan[ ,c("reindexed.x", "reindexed.y")]
      
      sother1 <- map_list$study1$reindexed[!(map_list$study1$original %in% have_both)]
      sother2 <- map_list$study2$reindexed[!(map_list$study2$original %in% have_both)]
      
      out_list <- list(
        N                 = N1,
        P                 = P1,
        M                 = M1,
        Q_r               = ncol(covsr1),
        Q_s               = ncol(covsb1),
        multiplicative_s  = m_b,
        multiplicative_r  = m_r,
        X_r               = covsr1,
        X_s               = covsb1,
        is_pbo            = is_pbo1,
        IDp               = IDp1,
        IDs               = IDs1,
        time              = times1,
        score1            = S1,
        
        N2                = N2,
        P2                = P2,
        M2                = M2,
        Q_r2              = ncol(covsr2),
        Q_s2              = ncol(covsb2),
        multiplicative_s2 = m_b2,
        multiplicative_r2 = m_r2,
        X_r2              = covsr2,
        X_s2              = covsb2,
        is_pbo2           = is_pbo2,
        IDp2              = IDp2,
        IDs2              = IDs2,
        time2             = times2,
        score2            = S2,
        Pn              = nrow(pmap_stan),
        Sn              = nrow(smap_stan),
        patient_map     = pmap_stan,
        study_map       = smap_stan,
        other_patients1 = pother1,
        other_patients2 = pother2,
        other_studies1  = as.array(sother1),
        other_studies2  = as.array(sother2)
      )
    }
    tmp_out <- list(
      "data" = out_list,
      "maps" = map_list
    )
    return(tmp_out)
  }
  
  # assign names
  IDp        <- deparse(substitute(SubjectIdVar))
  IDs        <- deparse(substitute(StudyIdVar))
  times      <- deparse(substitute(TimeVar))
  score      <- deparse(substitute(ScoreVar))
  is_pbo     <- deparse(substitute(is_pbo))
  
  # order df by study and patient
  df     <- df[order(df[ ,IDs], df[ ,IDp]), ]
  
  
  # check if in data frame
  cnames <- colnames(df)
  if (!(IDp %in% cnames)) {
    stop(paste0("No column with name ", IDp, " in the data frame."))
  }
  if (!(IDs %in% cnames)) {
    stop(paste0("No column with name ", IDs, " in the data frame."))
  }
  if (!(times %in% cnames)) {
    stop(paste0("No column with name ", times, " in the data frame."))
  }
  if (!(score %in% cnames)) {
    stop(paste0("No column with name ", score, " in the data frame."))
  }
  if (!(is_pbo %in% cnames)) {
    stop(paste0("No column with name ", is_pbo, " in the data frame."))
  }
  
  # covariates
  covr_nms    <- all.vars(CovariatesR)
  covb_nms    <- all.vars(CovariatesB)
  
  if (any(!(covr_nms %in% cnames))) {
    stop("No column with CovariatesR in the data frame.")
  }
  if (any(!(covb_nms %in% cnames))) {
    stop("No column with CovariatesB in the data frame.")
  }
  other_names <- c(IDp, IDs, times, score, is_pbo)
  
  score2 <- deparse(substitute(ScoreVar2))
  
  if (score2 != "NULL") {
    if (!(score2 %in% cnames)) {
      stop(paste0("No column with name ", score2, " in the data frame."))
    }
    is_pbo2      <- deparse(substitute(is_pbo2))
    if (!(is_pbo2 %in% cnames)) {
      stop(paste0("No column with name ", is_pbo2, " in the data frame."))
    }
    other_names  <- c(other_names, score2, is_pbo2)
    covr2_nms    <- all.vars(CovariatesR2)
    covb2_nms    <- all.vars(CovariatesB2)
    if (any(!(covr2_nms %in% cnames))) {
      stop("No column with CovariatesR2 in the data frame.")
    }
    if (any(!(covb2_nms %in% cnames))) {
      stop("No column with CovariatesB2 in the data frame.")
    }
    if (length(intersect(covr2_nms, other_names)) > 0) {
      stop("Non-empty intersection between rate2 covariates and core attributes.")
    }
    if (length(intersect(covb2_nms, other_names)) > 0) {
      stop("Non-empty intersection between baseline2 covariates and core attributes.")
    }
  }
  if (length(intersect(covr_nms, other_names)) > 0) {
    stop("Non-empty intersection between rate covariates and core attributes.")
  }
  if (length(intersect(covb_nms, other_names)) > 0) {
    stop("Non-empty intersection between baseline covariates and core attributes.")
  }
  
  # create temporary file names for data and init
  data_file <- paste0(tempdir(),
                      "/data",
                      paste(c(sample(c("a","b","c","d", 1:9), 4, TRUE)),
                            collapse = ""),
                      ".R")
  init_file <- paste0(tempdir(),
                      "/init",
                      paste(c(sample(c("a","b","c","d", 1:9), 4, TRUE)),
                            collapse = ""),
                      ".R")
  out_file  = paste0(tempdir(),
                     "/out",
                     paste(c(sample(c("a","b","c","d", 1:9), 4, TRUE)),
                           collapse = ""),
                     ".csv")
  
  if(score2 == "NULL") {
    data_list <- df_to_list(df,
                            IDp            = IDp,
                            IDs            = IDs,
                            times          = times,
                            score          = score,
                            covr_nms       = covr_nms,
                            covb_nms       = covb_nms,
                            m_r            = m_r,
                            m_b            = m_b,
                            is_pbo         = is_pbo)
  } else {
    # if (sum(!is.na(df[ ,score]) & !is.na(df[ ,score2])) == 0) {
    #   stop("Empty intersection between score 1 and score 2, use separate one-score models instead.")
    # }
    data_list <- df_to_list(df,
                            IDp            = IDp,
                            IDs            = IDs,
                            times          = times,
                            score          = score,
                            covr_nms       = covr_nms,
                            covb_nms       = covb_nms,
                            m_r            = m_r,
                            m_b            = m_b,
                            is_pbo         = is_pbo,
                            score2         = score2,
                            covr2_nms      = covr2_nms,
                            covb2_nms      = covb2_nms,
                            m_r2           = m_r2,
                            m_b2           = m_b2,
                            is_pbo2        = is_pbo2)
  }
  maps      <- data_list$maps
  data_list <- data_list$data
  
  #data
  with(data_list, {
    rstan::stan_rdump(names(data_list), file = data_file)
  })
  
  # init
  with(init_list, {
    rstan::stan_rdump(names(init_list), file = init_file)
  })
  
  # here we have to check for OS and select the correct version into mod
  get_os <- function() {
    if (.Platform$OS.type == "windows") {
      "win"
    } else if (Sys.info()["sysname"] == "Darwin") {
      "mac"
    } else if (.Platform$OS.type == "unix") {
      "unix"
    } else {
      stop("Unknown OS")
    }
  }
  my_os <- get_os()
  
  if (my_os == "win") {
    if (score2 != "NULL") {
      if (gpu_enabled == 0) {
        mod <- system.file(paste0("bin/generalized_logistic_model/Win64/", mod_name, "_two_scores_packed_CPU.exe"), package = "GLMCPath")
      } else {
        mod <- system.file(paste0("bin/generalized_logistic_model/Win64/", mod_name, "_two_scores_packed_GPU.exe"), package = "GLMCPath")
      }
      
    } else {
      if (gpu_enabled == 0) {
        mod <- system.file(paste0("bin/generalized_logistic_model/Win64/", mod_name, "_one_score_packed_CPU.exe"), package = "GLMCPath")
      } else {
        mod <- system.file(paste0("bin/generalized_logistic_model/Win64/", mod_name, "_one_score_packed_GPU.exe"), package = "GLMCPath")
      }
      
    }
  }
  if (my_os == "unix") {
    if (score2 != "NULL") {
      if (gpu_enabled == 0) {
        mod <- system.file(paste0("bin/generalized_logistic_model/Linux/", mod_name, "_two_scores_packed_CPU"), package = "GLMCPath")
      } else {
        mod <- system.file(paste0("bin/generalized_logistic_model/Linux/", mod_name, "_two_scores_packed_GPU"), package = "GLMCPath")
      }
    } else {
      if (gpu_enabled == 0) {
        mod <- system.file(paste0("bin/generalized_logistic_model/Linux/", mod_name, "_one_score_packed_CPU"), package = "GLMCPath")
      } else {
        mod <- system.file(paste0("bin/generalized_logistic_model/Linux/", mod_name, "_one_score_packed_GPU"), package = "GLMCPath")
      }
    }
  }
  if (my_os == "mac") {
    stop("macOS not supported.")
    # + link to page?
  }
  
  # create string
  model_call <- paste0(mod,
                       " sample",
                       " num_samples=", num_samples,
                       " num_warmup=", num_warmup,
                       " save_warmup=", save_warmup,
                       " thin=", thin,
                       " adapt",
                       " engaged=", engaged,
                       " gamma=", gamma,
                       " delta=", delta,
                       " kappa=", kappa,
                       " t0=", t0,
                       " init_buffer=", init_buffer,
                       " term_buffer=", term_buffer,
                       " window=", window,
                       " algorithm=", algorithm,
                       " engine=", engine,
                       " max_depth=", max_depth,
                       " metric=", metric,
                       " metric_file=", metric_file,
                       " stepsize=", stepsize,
                       " stepsize_jitter=", stepsize_jitter,
                       " data",
                       " file=", data_file)
  if (!is.null(init_list)) {
    model_call <- paste0(model_call,
                         " init=", init_file)
  }
  
  model_call <- paste0(model_call,
                       " random",
                       " seed=", seed,
                       " output file=", out_file)
  print(model_call)
  # run model + delete temporary files in case of error
  
  tryCatch({
    system(model_call)
  }, error   = function(e) {
    file.remove(data_file, init_file, out_file)
    stop(e)
  })
  
  
  # read saved csv and return the values + delete temporary files
  tryCatch({
    print(out_file)
    stan_out <- read_stan_csv(out_file)
  }, error   = function(e) {
    file.remove(data_file, init_file, out_file)
    stop(paste(e, "No output from the model. Check model call."))
  })
  
  # reindex back
  reindex_back <- function (x, tmap) {
    for (i in 1:length(x)) {
      x[i] <- tmap$original[tmap$reindexed == x[i]]
    }
    return (x)
  }
  
  if(score2 == "NULL") {
    ext <- rstan::extract(stan_out)
    pred_df1 <- data.frame(IDp = data_list$IDp,
                           IDs = data_list$IDs,
                           time = data_list$time,
                           scoreN = 1)
    pred_df1 <- cbind(pred_df1, t(ext$score1_pred))
    pred_df  <- pred_df1
    
    mdf1 <- df[ , colnames(df) != score2]
    
    # remove NAs
    mdf1 <- mdf1[!is.na(mdf1[ , score]), ]
    
    # rename columns
    colnames(mdf1)[colnames(mdf1) == score] <- "score"
    
    # cbind score name
    mdf1 <- cbind(mdf1, "scoreN" = 1)
    
    # rbind
    mdf <- mdf1
    
    
    pred_df$IDs <- reindex_back(pred_df$IDs, maps$study1)
    pred_df$IDp <- reindex_back(pred_df$IDp, maps$patient1)
    out_data    <- dplyr::left_join(mdf, pred_df, by = c("scoreN", IDs, IDp, times))
  } else {
    ext <- rstan::extract(stan_out)
    pred_df1 <- data.frame(IDp = data_list$IDp,
                           IDs = data_list$IDs,
                           time = data_list$time,
                           scoreN = 1)
    pred_df1 <- cbind(pred_df1, t(ext$score1_pred))
    
    pred_df1$IDs <- reindex_back(pred_df1$IDs, maps$study1)
    pred_df1$IDp <- reindex_back(pred_df1$IDp, maps$patient1)
    
    pred_df2 <- data.frame(IDp = data_list$IDp2,
                           IDs = data_list$IDs2,
                           time = data_list$time2,
                           scoreN = 2)
    pred_df2 <- cbind(pred_df2, t(ext$score2_pred))
    
    pred_df2$IDs <- reindex_back(pred_df2$IDs, maps$study2)
    pred_df2$IDp <- reindex_back(pred_df2$IDp, maps$patient2)
    
    pred_df  <- rbind(pred_df1, pred_df2)
    
    mdf1 <- df[ , colnames(df) != score2]
    mdf2 <- df[ , colnames(df) != score]
    
    # remove NAs
    mdf1 <- mdf1[!is.na(mdf1[ , score]), ]
    mdf2 <- mdf2[!is.na(mdf2[ , score2]), ]
    
    # rename columns
    colnames(mdf1)[colnames(mdf1) == score] <- "score"
    colnames(mdf2)[colnames(mdf2) == score2] <- "score"
    
    # cbind score name
    mdf1 <- cbind(mdf1, "scoreN" = 1)
    mdf2 <- cbind(mdf2, "scoreN" = 2)
    
    # rbind
    mdf <- rbind(mdf1, mdf2)
    out_data    <- dplyr::left_join(mdf, pred_df, by = c("scoreN", IDs, IDp, times))
  }
  out_data <- out_data[order(out_data[ ,"scoreN"],
                             out_data[ ,IDs],
                             out_data[ ,IDp],
                             out_data[ ,times]), ]
  
  return(list("stan_model"   = stan_out,
              "data_used"    = data_list,
              "maps"         = maps,
              "pred_samples" = out_data))
}
bstatcomp/project-cpath documentation built on Feb. 26, 2020, 2:40 p.m.