R/prepareData.R


split_data = function(data, ratio=c(8,2), seed=333){
  # Splits the data into training-validation(-test) sets. Default ratio c(8,2) gives an 80/20 train-valid split (no test set).
  # Output g is a factor giving the partition, which can be used to split data and Missing (and, optionally, probMissing).
  ratio = ratio/sum(ratio) # normalize in case ratio doesn't sum to 1
  set.seed(seed)
  g = sample(cut(
    seq(nrow(data)),
    nrow(data)*cumsum(c(0,ratio)),
    # labels = c("train","valid","test")
    labels = c("train","valid")
  ))
  return(g)
}
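
# A minimal usage sketch for split_data (kept as a comment so sourcing this file has no side effects;
# the toy matrix below is illustrative, not part of the package):
#   g <- split_data(data = matrix(rnorm(100*5), ncol = 5), ratio = c(8,2), seed = 333)
#   table(g)   # counts of "train" and "valid" rows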

#' Simulate data
#'
#' @param N Number of observations
#' @param D Dimension of latent Z
#' @param P Number of features
#' @param sim_index Integer index of the simulation replicate; can be used to set the seed
#' @param seed Random seed. Either supply a custom value or leave as the default multiple of sim_index. Default: 9 * sim_index
#' @param ratio Train-valid(-test) ratio for splitting of observations
#' @param g_seed Seed for the train-valid(-test) splitting of observations
#' @param beta Coefficients for each column of X in simulating a binary class response variable
#' @param nonlinear Logical; if TRUE, X is generated via nonlinear (sigmoid and cosine) transformations of the latent Z instead of a linear map. Default: FALSE
#' @return list of objects: data (N x P matrix), classes (simulated binary class labels), params (those used for simulating data), and g (partitioning of observations into train-valid sets)
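#' @details A sketch of the default (linear) generative model: a latent \eqn{Z \sim N(0, I_D)} is drawn for each observation,
#' and \eqn{X = ZW + B} with weights W drawn from Uniform(0, 0.5) and biases B from N(0, 1); each column of X is then standardized.
#' A binary class label is then drawn from a logistic regression on X with coefficients \code{beta} and an intercept chosen
#' (via \code{uniroot}) so that the average class probability is 0.5. If \code{nonlinear=TRUE}, X is instead built from sigmoid
#' and cosine transformations of two latent draws.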
#' @examples
#' simulate_data(N = 10000, D = 2, P = 8, sim_index = 1)
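#'
#' # Inspecting the returned components (smaller N for illustration):
#' sim <- simulate_data(N = 1000, D = 2, P = 8, sim_index = 1)
#' dim(sim$data); table(sim$classes); table(sim$g)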
#'
#' @author David K. Lim, \email{deelim@live.unc.edu}
#' @references \url{https://github.com/DavidKLim/NIMIWAE}
#'
#' @importFrom qrnn sigmoid
#'
#' @export
simulate_data = function(N, D, P, sim_index, seed = 9*sim_index, ratio=c(8,2), g_seed = 333,
                         beta=c(rep(-1/4,floor(P/2)), rep(1/4, P-floor(P/2))), nonlinear=F){
  # simulate data by simulating D-dimensional Z latent variable for N observations
  # and then apply W and B (both drawn from N(0,1)) weights and biases to obtain X
  set.seed(seed)

  #### NEED TO UPDATE THESE GENERATIONS
  if(nonlinear){
    print("Nonlinear data generation")
    # H = 64
    # sd1 = 1/4; sd2 = 1/4
    # W1 = matrix(rnorm(D*H,mean=0,sd=sd1),nrow=D,ncol=H) # weights
    # # W2 = matrix(rnorm(H*H,mean=0,sd=sd2),nrow=H,ncol=H)
    # W2 = matrix(rnorm(H*P,mean=0,sd=sd1),nrow=H,ncol=P)
    # B1 = matrix(rnorm(N*H,mean=0,sd=sd2),nrow=N,ncol=H,byrow=T) # biases: same for each obs
    # # B2 = matrix(rnorm(N*H,mean=0,sd=sd2),nrow=N,ncol=H,byrow=T)
    # B2 = matrix(rnorm(N*P,mean=0,sd=sd2),nrow=N,ncol=P,byrow=T)
    #
    # # W0 = matrix(rnorm(D*P,mean=0,sd=sd2),nrow=D,ncol=P)
    # # B0 = matrix(rnorm(N*P,mean=0,sd=sd3),nrow=N,ncol=P,byrow=T)
    #
    # # library(qrnn) # import "elu" function
    # X = qrnn::elu(Z%*%W1 + B1) %*% W2 + B2  # mimicking 1 HL with elu activation functions -- > NxP matrix

    Z1 = MASS::mvrnorm(N, rep(0,D), Sigma=diag(D))
    Z2 = MASS::mvrnorm(N, rep(0,D), Sigma=diag(D))

    sd1 = 0.5

    W1 = matrix(runif(D*floor(P/2),0.5,1),nrow=D,ncol=floor(P/2)) # weights
    W2 = matrix(runif(D*(P-floor(P/2)),0.5,1),nrow=D,ncol=P-floor(P/2)) # weights
    B1 = matrix(rnorm(N*floor(P/2),0,sd1), nrow=N, ncol=floor(P/2))
    B2 = matrix(rnorm(N*(P-floor(P/2)),0,sd1),nrow=N,ncol=P-floor(P/2))
    X1 = qrnn::sigmoid(Z2%*%W1+B1)
    X2 = cos(Z1%*%W2+B2) + qrnn::sigmoid(Z2%*%W2+B2)


    X=cbind(X1,X2)

    sds=list(sd1=sd1); W=list(W1=W1,W2=W2); B=list(B1=B1,B2=B2); Z=list(Z1=Z1,Z2=Z2)  # collect for params; Z stored as a list of both latent draws
  }else{
    #############
    Z = MASS::mvrnorm(N, rep(0,D), Sigma=diag(D))
    sd1 = 0.5; sd2=1
    print("Linear data generation")
    W = matrix(runif(D*P, 0, sd1),nrow=D,ncol=P) # weights
    B = matrix(rnorm(N*P, 0, sd2),nrow=N,ncol=P)

    X = Z%*%W + B
    sds=list(W=sd1, B=sd2)
  }
  X = apply(X,2,function(x){(x-mean(x))/sd(x)}) # pre normalize


  find_int = function(p,beta) {
    # Find the intercept t such that the mean predicted probability, mean(1/(1+exp(-t - X %*% beta))), equals p (solved via uniroot below)
    f = function(t){
      sapply(t, function(y) mean(1 / (1 + exp(-y -X %*% beta))))
    }
    alpha <- uniroot(function(t) f(t) - p, c(-1e6, 1e6), tol = .Machine$double.eps^0.5)$root
    return(alpha)
  }
  inv_logit = function(x){
    return(1/(1+exp(-x)))
  }
  logit = function(x){
    if(x<=0 | x>=1){stop('x must be in (0,1)')}
    return(log(x/(1-x)))
  }
  alph <- find_int(0.5, beta)   # intercept such that the average P(class = 1) is 0.5 (balanced classes)

  beta0 = alph
  mod = beta0 + X%*%beta

  probs = inv_logit(mod)
  classes = rbinom(N,1,probs)   # simulate binary class labels from the logistic model


  params=list(N=N, D=D, P=P, Z=Z, W=W, B=B, sds=sds, beta0=beta0, beta=beta, probs=probs, seed=seed)

  # ## simulate clustered data?
  # if(dataset=="TOYZ_CLUSTER"){
  #   N=100000; D=2; P=8; seed=9*sim_index
  #   set.seed(seed)
  #   classes=sample(c(1,2,3,4),N,replace=T)
  #
  #   Z=matrix(nrow=N,ncol=D)
  #   for(d in 1:D){
  #     for(c in 1:length(unique(classes))){
  #       Z[classes==c,d]=rnorm(sum(classes==c),mean=0+c*3,sd=1)
  #     }
  #   }
  #   W = matrix(nrow=D,ncol=P) # weights
  #   B = matrix(nrow=N,ncol=P) # biases
  #   for(p in 1:P){
  #     W[,p]=rnorm(D,mean=0,sd=1)
  #     B[,p]=rnorm(N,mean=0,sd=1)
  #   }
  #   X = Z%*%W + B
  #   data = X
  #   params=list(N=N, D=D, P=P, Z=Z, W=W, B=B, seed=seed)
  # }

  g = split_data(data=X, ratio=ratio, seed=g_seed)
  return(list(data=X, classes=classes, params=params, g=g))
}




###### NEED TO ADJUST INPUTS FOR SIMULATIONS, AND INCORPORATE simulate_data() function #####
#' Read UCI or Physionet 2012 Challenge Data
#'
#' @param dataset String naming the dataset. Valid datasets: "Physionet_mean", "Physionet_all", "MINIBOONE", "HEPMASS", "POWER", "GAS", "IRIS", "RED", "WHITE", "YEAST", "BREAST", "CONCRETE", "BANKNOTE".
#' @param ratio Train-valid-test ratio for splitting of observations
#' @param g_seed Seed for train-valid-test dataset splitting of observations
#' @return list of objects: data (N x P matrix), classes (class labels or subgroups of observations, where available), params (NULL for these real datasets), and g (partitioning of observations into train-valid sets)
#' @examples
#' read_data(dataset = "BANKNOTE")
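#'
#' # Using the returned partition g to split the data (illustrative):
#' dat <- read_data(dataset = "BANKNOTE")
#' train <- dat$data[dat$g == "train", ]
#' valid <- dat$data[dat$g == "valid", ]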
#'
#' @author David K. Lim, \email{deelim@live.unc.edu}
#' @references \url{https://github.com/DavidKLim/NIMIWAE}
#'
#' @importFrom gdata read.xls
#' @importFrom reticulate import source_python
#' @importFrom lubridate hms hour minute
#'
#' @export
read_data = function(dataset=c("Physionet_mean","Physionet_all","MINIBOONE","HEPMASS","POWER","GAS","IRIS","RED","WHITE","YEAST","BREAST","CONCRETE","BANKNOTE"), ratio=c(8,2), g_seed = 333){
  if(grepl("Physionet",dataset)){
    # np <- reticulate::import("numpy")
    # npz1 <- np$load("data/PhysioNet2012/physionet.npz")
    # classes=c(npz1$f$y_train, npz1$f$y_val, npz1$f$y_test)
    # params=NULL
    # if(strsplit(dataset,"_")[[1]][2] == "mean"){
    #   # dataset=="Physionet_mean"
    #   data = rbind(apply(npz1$f$x_train_miss, c(1,3), mean),
    #                apply(npz1$f$x_val_miss, c(1,3), mean),
    #                apply(npz1$f$x_test_miss, c(1,3), mean))
    # } else if(strsplit(dataset,"_")[[1]][2] == "all"){
    #   # dataset=="Physionet_all"
    #   library(abind)
    #   X3D = aperm(abind(npz1$f$x_train_miss,
    #                     npz1$f$x_val_miss,
    #                     npz1$f$x_test_miss,
    #                     along=1),
    #               c(2,1,3))                                             # switch dims so 48 time points is first dim
    #   data = matrix(X3D, nrow=dim(X3D)[1]*dim(X3D)[2], ncol=dim(X3D)[3])   # stack time series data: 1st subject is 1st - 48th observations, 2nd subj is 49th - 96th, ...
    #   classes=rep(classes, each = 48)
    # }

    # np <- import("numpy")
    # npz1 <- np$load("data/PhysioNet2012/physionet.npz")
    if(!dir.exists("data/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0")){
      dir.create("data/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0",recursive=T)
      # if training set doesn't exist, assume it hasn't been downloaded
      setwd("data")

      print("Downloading Physionet 2012 Challenge Dataset...")
      download.file("https://physionet.org/static/published-projects/challenge-2012/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0.zip",
                    destfile="predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0.zip")

      print("Unzipping compressed directory...")
      unzip("predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0.zip")

      print("Removing zip file...")
      file.remove("predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0.zip")

      if(!file.exists("set_c_merged.h5")){
        print("Merging into one dataset for summary table (Table 1)...")
        source_python("raw_data_gather.py")
      }

      setwd("predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0")

      ###########################################################################
      # Creating Table 1 (done first time runComparisons.R is run)
      np <- reticulate::import("numpy")

      features = c('ALP','ALT','AST','Albumin','BUN','Bilirubin',
                   'Cholesterol','Creatinine','DiasABP','FiO2','GCS',
                   'Glucose','HCO3','HCT','HR','K','Lactate','MAP', 'MechVent',
                   'Mg','NIDiasABP','NIMAP','NISysABP','Na','PaCO2',
                   'PaO2','Platelets','RespRate','SaO2','SysABP','Temp',
                   'TroponinI','TroponinT','Urine','WBC','pH')

      npz_trainval <- np$load("data_train_val.npz")
      npz_test <- np$load("data_test.npz")
      d_train = npz_trainval$f$x_train; d_val = npz_trainval$f$x_val; d_test = npz_test$f$x_test
      M_train = npz_trainval$f$m_train; M_val = npz_trainval$f$m_val; M_test = npz_test$f$m_test

      library(abind)
      d3=abind(d_train, d_val, d_test, along = 1)
      M3=abind(M_train, M_val, M_test, along = 1)

      d3=aperm(d3, c(2,1,3)); M3=aperm(M3,c(2,1,3))
      d = matrix(d3, nrow=dim(d3)[1]*dim(d3)[2], ncol=dim(d3)[3])
      M = 1-matrix(M3, nrow=dim(M3)[1]*dim(M3)[2], ncol=dim(M3)[3])
      colnames(d) = features; colnames(M) = features

      nobs = rep(NA,ncol(d)); n1obs = rep(NA,ncol(d))
      for(c in 1:ncol(d)){
        nobs[c] = sum(M[,c]==1) #; n1obs[c] = any(M[,c]==1)   # n1obs: number of subjects with at least one non-missing obs for each feature
        nonmiss = rep(NA,nrow(M)/48) # number of obs with at least 1 nonmissing entry for feature c
        for(b in 1:(nrow(M)/48)){
          nonmiss[b] = any(M[(b-1)*48+(1:48) , c]==1)
        }
        n1obs[c] = sum(nonmiss)
      }

      ## for table in data section of P2 paper: feature, %missing, %(subjects with at least one nonmissing entry)
      tab1 = cbind(features,1-nobs/nrow(d),n1obs/(nrow(d)/48))
      colnames(tab1)[2:3] = c("\\% Missingness", "Patients with $\\geq 1$ measurements")
      tab1[,2] = format(as.numeric(tab1[,2]),digits=2); tab1[,3] = format(as.numeric(tab1[,3]),digits=2)

      save(tab1,file="Tab1.out")
      ############################################################################



      ## run alistair et al pre-processing --> break down into first/last/median/...
      print("Processing data...")
      source_python("../alistair_preprocessing.py")

      if(!dir.exists("set-c")){ untar("set-c.tar.gz") }
      if(!file.exists("PhysionetChallenge2012-set-a.csv")){ process_Alistair('set-a') }
      if(!file.exists("PhysionetChallenge2012-set-b.csv")){ process_Alistair('set-b') }
      if(!file.exists("PhysionetChallenge2012-set-c.csv")){ process_Alistair('set-c') }

      ## read-in pre-processed data
      ## filter to just Median or last observed value
      # d1 = read.csv("PhysionetChallenge2012-set-a.csv")
      # d2 = read.csv("PhysionetChallenge2012-set-b.csv")
      # d3 = read.csv("PhysionetChallenge2012-set-c.csv")
      # library(dplyr)
      # features = c('recordid','SAPS.I','SOFA','Length_of_stay','Survival','In.hospital_death',
      #              'Age','Gender','Height','Weight','CCU','CSRU','SICU',
      #              'DiasABP_median','GCS_median','Glucose_median','HR_median','MAP_median','NIDiasABP_median',
      #              'NIMAP_median','NISysABP_median','RespRate_median','SaO2_median','Temp_median',
      #              'ALP_last','ALT_last','AST_last','Albumin_last','BUN_last','Bilirubin_last',
      #              'Cholesterol_last','Creatinine_last','FiO2_last','HCO3_last','HCT_last','K_last',
      #              'Lactate_last','Mg_last','Na_last','PaCO2_last','PaO2_last','Platelets_last',
      #              'SysABP_last','TroponinI_last','TroponinT_last','WBC_last','Weight_last','pH_last',
      #              'MechVentStartTime','MechVentDuration','MechVentLast8Hour','UrineOutputSum')
      #
      # ## save filtered data
      # d1 %>% select(features) %>% write.csv("PhysionetChallenge2012-set-a.csv", row.names=FALSE)
      # d2 %>% select(features) %>% write.csv("PhysionetChallenge2012-set-b.csv", row.names=FALSE)
      # d3 %>% select(features) %>% write.csv("PhysionetChallenge2012-set-c.csv", row.names=FALSE)

    } else{
      setwd("data/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0")
    }

    d1 = read.csv("PhysionetChallenge2012-set-a.csv")
    d2 = read.csv("PhysionetChallenge2012-set-b.csv")
    d3 = read.csv("PhysionetChallenge2012-set-c.csv")

    classes = c(d1$"In.hospital_death", d2$"In.hospital_death", d3$"In.hospital_death")
    fit_data = list(classes=classes)

    setwd("../..")

    d1 = d1[,-c(2:6)]  # remove outcome variables (survival, mortality indicator, SAPS/SOFA/length of stay)
    d2 = d2[,-c(2:6)]
    d3 = d3[,-c(2:6)]

    data = rbind(d1,d2,d3); data = data[,-1]   # remove recordid

    # the train/valid partition g is created below via split_data(), as for all other datasets
    params=NULL
  }else if(dataset=="MINIBOONE"){  # 130K x 50
    # pre-processing steps from https://github.com/gpapamak/maf/blob/master/datasets/miniboone.py
    f = sprintf("./Results/%s/1000_train.csv.gz",dataset)
    if(!file.exists(f)){ download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt", destfile=f) }
    data <- read.table(f, skip=1)
    classes=NULL; params=NULL
    indices = (data[,1] < -100)
    data=data[!indices, ]

    data = scale(data)   # column-wise standardization (center and scale each feature)
    ## no pruning of features (not sure if it's necessary)
    # features_to_remove = apply(data,2,function(x) any(table(x)>6))
  }else if(dataset=="HEPMASS"){ # 7M x 27 --> 300K x 21 feats
    # temp <- tempfile()
    f1 = sprintf("./Results/%s/1000_train.csv.gz",dataset)
    f2 = sprintf("./Results/%s/1000_test.csv.gz",dataset)
    if(!file.exists(f1)){ download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00347/1000_train.csv.gz", destfile=f1) }
    if(!file.exists(f2)){ download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00347/1000_test.csv.gz", destfile=f2) }
    data1 <- read.csv(gzfile(f1),header=F,skip=1, nrows=700000)
    data2 <- read.csv(gzfile(f2),header=F,skip=1, nrows=350000)

    data=rbind(data1,data2)

    data = data[data[,1]==1,]    # only class 1 (class 0 is noise)

    classes = data[,1]
    data = data[,-1]; data = data[,-ncol(data)]   # first column: class. last column is bad? pre-processed out

    features_to_remove = apply(data,2,function(x) max(table(x))) > 100    # if more than 100 repeats in feature, remove
    data = data[, !features_to_remove]

    data = scale(data)   # column-wise standardization (center and scale each feature)
    params=NULL
  }else if(dataset=="POWER"){ # 2.05M --> 1M x 6
    f1 = sprintf("./Results/%s/household_power_consumption.zip",dataset)
    if(!file.exists(f1)){download.file("https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip",destfile=f1)}
    data <- read.csv(unz(f1, "household_power_consumption.txt"),header=T,sep=";")
    data=data[,-1]; data=data[,-3]; data=data[,-4]    # following https://github.com/gpapamak/maf/blob/master/datasets/power.py . They get rid of Date, Global reactive power, and Global Intensity
    time_res = lubridate::hms(data[,1]); data[,1] = 60*lubridate::hour(time_res) + lubridate::minute(time_res)

    for(j in 2:5){ data[,j] = as.numeric(data[,j]) }
    data = data[!is.na(rowSums(data)),]
    data = (data - matrix(colMeans(data),nrow=nrow(data),ncol=ncol(data),byrow=T)) / matrix(apply(data,2,sd),nrow=nrow(data),ncol=ncol(data),byrow=T)
    classes=NULL
    params=NULL
    set.seed(9)
    data=data[sample(1:nrow(data),1000000,replace=F),]
  }else if(dataset=="GAS"){ # 4.21M --> 1M x 8. Some features taken out due to high correlation
    temp <- tempfile()
    download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/00322/data.zip",temp)
    data <- read.table(unz(temp, "ethylene_CO.txt"),header=F,skip=1,sep="")
    unlink(temp)
    classes=NULL
    params=NULL

    # pre-processing step from https://github.com/gpapamak/maf/blob/master/datasets/gas.py
    ## drop first three columns (time, meth, eth concentrations)
    data=data[,-c(1,2,3)]
    #set.seed(99); data=data[sample(1:nrow(data),floor(0.25*nrow(data)),replace=F),] # sample 25% of data

    ## remove highly correlated columns
    get_corr_nums = function(data){
      C=cor(data)
      A=C>0.98
      B=colSums(A)
      return(B)
    }
    B=get_corr_nums(data)
    while(any(B>1)){
      col_to_remove = which(B > 1)[1]
      col_name = names(B)[col_to_remove]
      data=data[,-which(colnames(data)==col_name)]
      B = get_corr_nums(data)
    }

    set.seed(9)
    data=data[sample(1:nrow(data),1000000,replace=F),]     # sample 1M of the 4.209M observations (ACFlow does this too)

  }else if(dataset=="IRIS"){ # 150 x 4
    data <- read.csv(url("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"),
                     header=FALSE, col.names=c("sepal.length","sepal.width","petal.length","petal.width","species"))
    classes=data[,ncol(data)]
    data=data[,-ncol(data)]     # take out class (3)
    params=NULL
  } else if(dataset=="RED"){ # 1599 x 12
    data <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"),header=TRUE,sep=";")
    classes=data[,ncol(data)]
    data=data[,-ncol(data)]     # take out class (quality, 1-10)
    params=NULL
  } else if(dataset=="WHITE"){ # 4898 x 12
    data <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"),header=TRUE,sep=";")
    classes=data[,ncol(data)]
    data=data[,-ncol(data)]     # take out class (quality, 1-10)
    params=NULL
  } else if(dataset=="BREAST"){ # 569 x 30
    # Wisconsin BC (Diagnostic)
    data <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"),
                     header=FALSE)
    colnames(data)=c("ID","diagnosis","radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean",
                     "compactness_mean","concavity_mean","concave_points_mean","symmetry_mean","fractal_dimension_mean",
                     "radius_se","texture_se","perimeter_se","area_se","smoothness_se","compactness_se","concavity_se",
                     "concave_points_se","symmetry_se","fractal_dimension_se","radius_worst","texture_worst","perimeter_worst",
                     "area_worst","smoothness_worst","compactness_worst","concavity_worst","concave_points_worst","symmetry_worst","fractal_dimension_worst")
    data=data[,-1]       # remove ID's
    classes=data[,1]
    data=data[,-1]      # take out class (2)
    params=NULL
  } else if(dataset=="YEAST"){  # 1484 x 8
    data <- read.table(url("https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data"),header=FALSE)
    colnames(data)=c("sequence_name","mcg","gvh","alm","mit","erl","pox","vac","nuc","class")
    data=data[,-1]   # remove sequence names
    classes=data[,ncol(data)]
    data=data[,-ncol(data)] # take out class (10)
    params=NULL
  } else if(dataset=="CONCRETE"){ # 1030 x 8
    data <- gdata::read.xls("http://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls")
    classes=data[,ncol(data)]
    data=data[,-ncol(data)] # take out class: here it's continuous
    params=NULL
  } else if(dataset=="BANKNOTE"){ # 1372 x 4
    data <- read.csv(url("https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt"),header=FALSE)
    classes=data[,ncol(data)]
    data=data[,-ncol(data)] # take out class (2)
    params=NULL
  }

  g = split_data(data=data.frame(data), ratio=ratio, seed=g_seed)
  return(list(data=data, classes=classes, params=params, g=g))
}







#' Simulate different mechanisms of missingness
#'
#' @param data data frame of data (N x P)
#' @param miss_cols Columns to impose missingness on
#' @param ref_cols Column(s) to use as covariates of missingness for MAR or MNAR. If scheme="UV" and mechanism="MAR", each element of ref_cols is used as the covariate for the corresponding element of miss_cols (so the length of miss_cols must not exceed the length of ref_cols)
#' @param pi Proportion of entries that are missing (for all miss_cols)
#' @param phis Coefficients of the covariates in the missingness models, indexed by column number (so phis should have length ncol(data)). For scheme="UV", the element corresponding to each miss_cols entry is used under MNAR, and the element corresponding to each ref_cols entry under MAR
#' @param phi_z Coefficient of Z in missingness model, only used if fmodel="PM"
#' @param scheme "UV", "MV", or "NL". "UV" uses one covariate in the missingness model of each miss_cols element (for mechanism="MAR", the corresponding element of ref_cols; for mechanism="MNAR", the missing column itself). "MV" uses all ref_cols as covariates in each missingness model. "NL" uses a single log-transformed covariate (the corresponding ref column for MAR, the missing column itself for MNAR)
#' @param mechanism "MCAR", "MAR", or "MNAR" missingness. Only pertinent for fmodel="S".
#' @param sim_index Index of simulation run: varies seed based on sim_index and column index for each element of miss_cols for reproducibility.
#' @param fmodel "S" or "PM" for selection model or pattern-mixture model. If "PM", mechanism will be MNAR regardless of specification, and latent matrix Z must be specified and will be used as the only covariate of all missingness models
#' @param Z Matrix of simulated values of the latent variable
#' @return list of objects: Missing (N x P mask matrix; 1 = observed, 0 = missing), probs (N x P matrix of probabilities of each entry being observed), params (pertaining to the simulations),
#' mechanism (of missingness), scheme (univariate or multivariate logistic regression model), phi0s (intercepts for each missingness model),
#' phis (coefficients for each covariate of each missingness model), phi_z (coefficient of Z, if fmodel="PM")
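#' @details A sketch of the selection-model ("S") formulation implemented here: for each column j in \code{miss_cols}, an
#' indicator of being observed is drawn as \eqn{R_{ij} \sim Bernoulli(p_{ij})} with
#' \eqn{logit(p_{ij}) = \phi_{0j} + x_i'\phi}, where the covariates \eqn{x_i} depend on \code{mechanism} and \code{scheme}
#' (none for MCAR, reference columns for MAR, the missing column itself for MNAR under scheme "UV"), and the intercept
#' \eqn{\phi_{0j}} is found by \code{uniroot} so that the average probability matches the target proportion. For
#' \code{fmodel="PM"}, the latent matrix Z is used as the covariate instead.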
#' @examples
#' data = read_data(dataset = "BANKNOTE")$data
#' set.seed(111)
#' ref_cols=sample(c(1:ncol(data)),ceiling(ncol(data)/2),replace=F); miss_cols=(1:ncol(data))[-ref_cols]
#' simulate_missing(data, miss_cols, ref_cols, 0.5, rep(5, ncol(data)), NULL, "UV", "MNAR")
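#'
#' # Inspecting the simulated mask (1 = observed, 0 = missing), assigning the result first:
#' out <- simulate_missing(data, miss_cols, ref_cols, 0.5, rep(5, ncol(data)), NULL, "UV", "MNAR")
#' colMeans(out$Missing)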
#' @export
simulate_missing = function(data,miss_cols,ref_cols,pi,
                            phis,phi_z,
                            scheme,mechanism,sim_index=1,fmodel="S", Z=NULL){

  #sim_index=1 unless otherwise specified
  n=nrow(data)
  # function s.t. expected proportion of nonmissing (Missing=1) is p. let p=1-p_miss
  find_int = function(p,phi) {
    # Find the intercept t such that the mean probability, mean(1/(1+exp(-t - x %*% phi))), equals p (solved via uniroot below)
    f = function(t){
      sapply(t, function(y) mean(1 / (1 + exp(-y -x %*% phi))))
    }
    alpha <- uniroot(function(t) f(t) - p, c(-1e6, 1e6), tol = .Machine$double.eps^0.5)$root
    return(alpha)
  }
  inv_logit = function(x){
    return(1/(1+exp(-x)))
  }
  logit = function(x){
    if(x<=0 | x>=1){stop('x must be in (0,1)')}
    return(log(x/(1-x)))
  }

  Missing = matrix(1,nrow=nrow(data),ncol=ncol(data))
  prob_Missing = matrix(1,nrow=nrow(data),ncol=ncol(data))
  params=list()
  phi0s = rep(0,length(miss_cols))
  for(j in 1:length(miss_cols)){
    # specifying missingness model covariates
    if(mechanism=="MCAR"){
      x <- matrix(rep(0,n),ncol=1) # for MCAR: no covariate (same as having 0 for all samples)
      phi=0                       # for MCAR: no effect of covariates on missingness (x is 0 so phi doesn't matter)
    }else if(mechanism=="MAR"){
      if(scheme=="UV"){
        x <- matrix(data[,ref_cols[j]],ncol=1)             # missingness dep on just the corresponding ref column (randomly paired)
        phi=phis[ref_cols[j]]
      }else if(scheme=="MV"){
        # check if missing column in ref. col: this would be MNAR (stop computation)
        if(any(ref_cols %in% miss_cols)){stop(sprintf("missing cols in reference. is this intended? this is MNAR not MAR."))}
        x <- matrix(data[,ref_cols],ncol=length(ref_cols)) # missingness dep on all ref columns
        phi=phis[ref_cols]
      }else if(scheme=="NL"){
        ## Nonlinear
        ## if(any(is.na(data[,1:3]))){stop("Missingness in 1st 3 cols of data used for nonlinear MAR interactions")}
        ## x = cbind(data[,1]*data[,2], data[,2]*data[,3], data[,1]*data[,2]*data[,3])  # 3 nonlinear predictors (observed)
        ## beta = betas[1:3]/colMeans(x)  # scale effect of each nonlinear term by mean --> scale down effect of large predictors
        x <- matrix(log(data[,ref_cols[j]] + abs(min(data[,ref_cols[j]])) + 0.01),ncol=1) # log of the corresponding ref column (shifted to be positive)
        # x <- matrix((data[,ref_cols[j]])^2,ncol=1) # just the corresponding ref column
        phi=phis[miss_cols[j]]
        # jj = if(j>1){j-1}else{jj=length(ref_cols)}
        # x <- cbind(exp(data[,ref_cols[j]]), (data[,ref_cols[jj]])^2, data[,ref_cols[jj]]*data[,ref_cols[j]]) # just the corresponding ref column
        # beta = rep(betas[ref_cols[j]], 3)
      }
    }else if(mechanism=="MNAR"){
      if(fmodel=="S"){
        # Selection Model
        if(scheme=="UV"){
          # MISSINGNESS OF EACH MISS COL IS ITS OWN PREDICTOR
          x <- matrix(data[,miss_cols[j]],ncol=1) # just the corresponding ref column
          phi=phis[miss_cols[j]]
        }else if(scheme=="MV"){
          # check if missing column not in ref col. this might be MAR if missingness not dep on any other missing data
          if(all(!(ref_cols %in% miss_cols))){warning(sprintf("no missing cols in reference. is this intended? this might be MAR not MNAR"))}
          x <- matrix(data[,ref_cols],ncol=length(ref_cols)) # all ref columns
          phi=phis[ref_cols]         # in MNAR --> ref_cols can overlap with miss_cols (dependent on missingness)

          # address when miss_cols/ref_cols/phis are not null (i.e. want to induce missingness on col 2 & 5 based on cols 1, 3, & 4)
        }else if(scheme=="NL"){
          # ## Nonlinear
          # # x = cbind(data[,1]*data[,2], data[,2]*data[,3], data[,1]*data[,2]*data[,3], log(data[,4]), exp(data[,5]))  # 5 nonlinear predictors
          # # x = cbind(data[,1]*data[,2], data[,2]*data[,3], data[,1]*data[,2]*data[,3])  # 3 nonlinear predictors
          # x = cbind(data[,1], data[,1]*data[,2], data[,3]^2)  # 3 nonlinear predictors
          # # beta = betas[1:5]/colMeans(x)  # scale effect of each nonlinear term by mean --> scale down effect of large predictors
          # # beta = betas[1:3]/colMeans(x)  # scale effect of each nonlinear term by mean --> scale down effect of large predictors
          # beta = betas[1:3]  # scale effect of each nonlinear term by mean --> scale down effect of large predictors

          x <- matrix(log(data[,miss_cols[j]]+abs(min(data[,miss_cols[j]]))+0.01),ncol=1)   # log of the missing column itself (shifted to be positive)
          # x <- matrix((data[,miss_cols[j]])^2,ncol=1) # just the corresponding ref column
          phi=phis[miss_cols[j]]
          # jj = if(j>1){j-1}else{jj=length(miss_cols)}
          # x <- cbind(exp(data[,miss_cols[j]]), (data[,miss_cols[jj]])^2, data[,miss_cols[jj]]*data[,miss_cols[j]]) # just the corresponding ref column
          # beta = rep(betas[miss_cols[j]], 3)
        }
      } else if(fmodel=="PM"){
        x <- Z
        phi = phi_z # phi_z should have length equal to ncol(Z)
      }
    }
    alph <- sapply(pi, function(y)find_int(y,phi))
    phi0s[j] = alph
    mod = alph + x%*%phi

    # Pr(Missing_ij = 1), i.e., the probability that the entry is observed
    probs = inv_logit(mod)
    prob_Missing[,miss_cols[j]] = probs

    # set seed for each column for reproducibility, but still different across columns
    set.seed(j+(sim_index-1)*length(miss_cols))
    Missing[,miss_cols[j]] = rbinom(n,1,probs)

    params[[j]]=list(phi0=alph, phi=phi, miss=miss_cols[j], ref=ref_cols[j], scheme=scheme)
  }

  return(list(Missing=Missing,
              probs=prob_Missing,params=params, mechanism=mechanism,scheme=scheme, phi0s=phi0s, phis=phis, phi_z=phi_z))
}
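
# A minimal end-to-end sketch combining simulate_data() and simulate_missing() (kept as a comment;
# the chosen columns and coefficient values are illustrative):
#   sim  <- simulate_data(N = 1000, D = 2, P = 8, sim_index = 1)
#   miss_cols <- 5:8; ref_cols <- 1:4
#   miss <- simulate_missing(sim$data, miss_cols, ref_cols, pi = 0.5,
#                            phis = rep(5, ncol(sim$data)), phi_z = NULL,
#                            scheme = "UV", mechanism = "MNAR")
#   X_obs <- sim$data; X_obs[miss$Missing == 0] <- NA   # apply the simulated mask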