inst/validation_code_arxiv/BW_imp_CH_debug.R

### This file contains:
### fwd_bwd_CH: computes normalized forward/backward variables
### BW_CH     : estimates initial prob and transition prob
###             by Baum Welch algorithm
### impute_CH : imputes missing based on Markov chain (basic working block)
### imputeBS_CH:  imputes missing based on Markov chain and bootstrap
###               allowing strata imputation
### NM2CH     : transfer data from NM format to CH format
### get_start : get start value for BW algorithm according to NM's algorithm
### estTransMatrx : estimate transition matrix empirically
### estInitDist : estimate initial distribution empirically
### Check the examples at the bottom of the file for the
### explanation of arguments.

#######################################################################################################


## return a list with normalized forward/backward
## variables and c function
fwd_bwd_CH <- function(dat, q_func, bin, initP, tP){
  n_dat <- length(dat)
  alpha      <- matrix(NA, nrow = n_dat, ncol = 8) # not output/calculate
  alpha_star <- matrix(NA, nrow = n_dat, ncol = 8)
  beta       <- matrix(NA, nrow = n_dat, ncol = 8) # not output/calculate
  beta_star  <- matrix(NA, nrow = n_dat, ncol = 8)
  c_func     <- numeric(n_dat)
  
  alpha[1, ]         <- initP * q_func[dat[1], ]
  c_func[1]          <- sum(alpha[1, ])
  alpha_star[1, ]    <- alpha[1, ] / c_func[1]
  beta[n_dat, ]      <- 1
  beta_star[n_dat, ] <- 1
  
  for (i in 2:n_dat) {
    cart <- alpha_star[i - 1, ] %*% tP[[bin[i - 1]]] * q_func[dat[i], ]
    c_func[i] <- sum(cart)
    alpha_star[i, ] <- cart / c_func[i]
  }
  
  for (i in (n_dat - 1):1) {
    beta_star[i, ] <- tP[[bin[i]]] %*% (beta_star[i + 1, ] * q_func[dat[i + 1], ]) / c_func[i + 1]
  }
  
  return(list(alpha_star, beta_star, c_func))
}


#######################################################################################################


arraySum <- function(x) {
  result <- matrix(0, nrow = dim(x)[1], ncol = dim(x)[2])
  for (i in 1:dim(x)[3]) {
    result <- result + x[, , i]
  }
  result
}


BW_CH <- function(dataset, q_func, bin, start_initP, start_tP,
                  tol_llk = 0, message = TRUE){
  n_data <- nrow(dataset)
  len_data <- ncol(dataset)
  n_bin <- length(unique(bin))
  initP  <- start_initP
  tP     <- start_tP
  ite <- 1
  llk_last <- 1
  
  while (TRUE) {
    ## initiate each iteration
    cart_updt_initP <- matrix(NA, nrow = n_data, ncol = 8)
    
    cart_updt_tP_nomi <- vector('list', n_bin)
    for (i in 1:n_bin) cart_updt_tP_nomi[[i]] <- array(NA, dim = c(8, 8, n_data))
    
    cart_updt_tP_deno <- vector('list', n_bin)
    for (i in 1:n_bin) cart_updt_tP_deno[[i]] <- matrix(NA, ncol = 8, nrow = n_data)
    
    llk <- 0

    for (i in 1:n_data) { ## calculate i-th patient
      ## get fwd bwd c_func
      cart <- fwd_bwd_CH(dataset[i, ], q_func, bin, initP, tP)
      
      ## get llk
      llk <- llk + sum(log(cart[[3]]))
      
      ## get gamma
      gamma <- cart[[1]] * cart[[2]]
      ## get xi
      ## test : each sum of each xi(t) is 1
      xi <- array(NA, dim = c(8, 8, len_data - 1))
      for (j in 1:(len_data - 1)) {
        cartA <- t(t(cart[[1]][j, ] * tP[[bin[j]]]) * cart[[2]][j + 1, ])
        cartA <- t(t(cartA) * q_func[dataset[i, j + 1], ]) / cart[[3]][j + 1]
        xi[, , j] <- cartA
      }
      
      # store result
      cart_updt_initP[i, ] <- gamma[1, ]
      
      for (s in 1:n_bin) {
        cart_updt_tP_nomi[[s]][, , i] <- arraySum(xi[, , which(bin == s)])
      }
      
      for (s in 1:n_bin) {
        cart_updt_tP_deno[[s]][i, ] <- apply(gamma[which(bin == s), ], 2, sum)
      }
    }
    
    ## update initP and transition prob
    initP <- apply(cart_updt_initP, 2, sum) / n_data
    for (s in 1:n_bin) {
      tP[[s]] <- arraySum(cart_updt_tP_nomi[[s]]) / apply(cart_updt_tP_deno[[s]], 2, sum)
    }
    
    ## fix state 8 related quant for NIAID
    #initP[8] <- 0
    for (s in 1:n_bin) tP[[s]][8, ] <- c(rep(0, 7), 1)
    
    ## print option
    if (message) print(paste0("Iteration ", ite, ": log-likelihood is ", llk, "."))
    
    ## check whether stop
    if (abs((llk_last - llk)/llk_last) <= tol_llk & ite >= 5) break
    
    ## update llk and iteration number
    llk_last <- llk
    ite <- ite + 1
  }
  
  return(list(initial_prob = initP,
              transition_prob = tP,
              iteration = ite,
              llk = llk))
}



#######################################################################################################


impute_CH <- function(dataset, q_func, bin, initP, tP){
  n_data <- nrow(dataset)
  len_data <- ncol(dataset)
  n_bin <- length(tP)
  
  imp_data <- matrix(NA, ncol = len_data, nrow = n_data)
  
  for (i in 1:n_data) { ## calculate i-th patient
    ## get fwd bwd c_func
    cart <- fwd_bwd_CH(dataset[i, ], q_func, bin, initP, tP)
    
    ## get gamma
    gamma <- cart[[1]] * cart[[2]]
    
    ## get xi
    ## test : each sum of each xi(t) is 1
    xi <- array(NA, dim = c(8, 8, len_data - 1))
    for (j in 1:(len_data - 1)) {
      cartA <- t(t(cart[[1]][j, ] * tP[[bin[j]]]) * cart[[2]][j + 1, ])
      cartA <- t(t(cartA) * q_func[dataset[i, j + 1], ]) / cart[[3]][j + 1]
      xi[, , j] <- cartA
    }
    
    ## impuate i-th patient
    imp_cart <- numeric(len_data)
    imp_cart[1] <- sample(1:8, size = 1, replace = TRUE, prob = gamma[1, ])
    for (j in 2:len_data) {
      prob_cart <- xi[imp_cart[j - 1], , j - 1] / sum(xi[imp_cart[j - 1], , j - 1])
      imp_cart[j] <- sample(1:8, size = 1, replace = TRUE, prob = prob_cart)
      
      ### for debug purpose ######################################################
      # cat("patient", i, "date", j, "prob", prob_cart, "\n")
      ### for debug purpose ######################################################
    }
    
    imp_data[i, ] <- imp_cart
  }
  
  imp_data
}



#####################################################################################################


bootstrap_param_est_CH <- function(wide, b, days=paste0("D",1:28), bin=rep(1,length(days)-1), 
           Em=get_emission(wide, days), tol=1E-6, maxiter=200, silent=FALSE, strt) {
    
    M = wide[,days]
    M = as.matrix(do.call(cbind, lapply(M, function(x) {
      x=suppressWarnings(as.numeric(as.character(x)))
      # x[x<1 | x>8]=NA
      x
    })))
    
    #originally I implemented to stratify baseline. However, not planning to expose this functionality.
    #the following line sets up for no strata
    s=rep(1, nrow(wide))
    
    #strt=niaidMI:::.get_start(M=M,s=s,bin=bin)
    
    if(!silent) cat("Fiting Model\n")
    fit=niaidMI:::.baum_welsh(Pri=list(strt[[2]]), s=s, Tran=strt[[1]], Em=Em, bin=bin, tol=tol,  maxiter=maxiter)
    
    ret=list(fit=fit, s=s, bin=bin, days=days, Em=Em)
    
    ret
}

#####################################################################################################


## no strata
enc_imputeBS_CH <- function(data, q_func, bin, start_initP, start_tP, m,
                            tol_llk = 0, fix_const = 1E-6, dataNM) { #m: num of bootstrap
  
  #fitMC <- BW_CH(data, q_func, bin, start_initP, start_tP, tol_llk = tol_llk, message = FALSE)
  
  result <- vector("list", m)
  cart <- result
  cartNM <- result
  for (i in 1:m) {
    id <- sample(seq_len(nrow(data)), size = nrow(data), replace = TRUE)
    cart[[i]] <- data[id, ]
    cartNM[[i]] <- dataNM[id, ]
    
    ### for debug purpose ######################################################
    # cat(i, "-", id, "\n")
    ### for debug purpose ######################################################
    
  }
  
  #strt = niaidMI:::.get_start(M=M,s=s,bin=bin)
  strt = get_start(data = data, bin = bin)
  
  for (i in 1:m) {
    #fitMC <- BW_CH(cart[[i]], q_func, bin, start_initP, start_tP, tol_llk = tol_llk, message = FALSE)
    
    ### for debug purpose ######################################################
    fitMC <- bootstrap_param_est_CH(wide=cartNM[[i]], b=0, bin=bin, tol = tol_llk, silent = TRUE, maxiter = 100000000000, strt = strt)$fit
    fitMC <- list(fitMC[[2]][[1]], fitMC[[1]])
    ### for debug purpose ######################################################
    
    ## solve conflict of bootstrap and fwd/bwd calculation
    cart_initP <- fitMC[[1]] + fix_const
    cart_initP <- cart_initP / sum(cart_initP)
    
    cart_tP <- lapply(fitMC[[2]], function(x) {
      x <- x + fix_const
      x / rowSums(x)
    })
    ## this approach may be improved in future
    
    result[[i]] <- impute_CH(data, q_func, bin, cart_initP, cart_tP)
    
    ### for debug purpose ######################################################
    #print(fitMC)
    print(paste("imputation", i, "is completed"))
    ### for debug purpose ######################################################
  }
  result
}


### Imputation function with bootstrap and Markov Chain model
## The procedure contains three steps: 1. fit Markov Chain with original
## dataset; 2. get bootstrap datasets from original dataset;
## 3. impute each bootstrap dataset separately with fitted Markov Chain.
## param:
##   data:     matrix contains original data. One column is strata if by != NULL.
##   q_func:   matrix with 0-1 specify q function.
##   bin:      vector inidcates bins cross time.
##   start_initP: start value of initial probability for BW algorithm to fit Markov chain.
##   start_tP: start value of transition probability for BW algorithm to fit Markov chain.
##   m:        int, number of imputations.
##   by:       int or char, indicates which column in data is strata.
##             If NULL, impute all together.
##   tol_llk:  num, indicates the tolerance of BW algorithm.
##   fix_const: num, a small enough number to solve conflict of bootstrap and real data.
## value:
##   a list with length m. Each component is one imputation.
## details:
##   If by != NULL, data must include one column indicating strata and column
##   name/num must be indicated in by. If by == NULL, then data must NOT contain
##   strata column.

## with strata
imputeBS_CH <- function(data, q_func, bin, start_initP, start_tP, m,
                        by = NULL, tol_llk = 0, fix_const = 1E-6, dataNM) {
  if (is.null(by)) {
    result <- enc_imputeBS_CH(data, q_func, bin, start_initP, start_tP, m, tol_llk, fix_const, dataNM)
    return(result)
  }
  
  result <- vector("list", m)
  
  by <- data[, by]
  unq_by <- unique(by)
  databy <- vector("list", length(unique(by)))
  databyNM <- vector("list", length(unique(by)))
  for (i in seq_len(length(unique(by)))) {
    databy[[i]] <- data[by == unq_by[i], -1]
    databyNM[[i]] <- dataNM[data[, by] == unq_by[i], ]
    cart <- enc_imputeBS_CH(databy[[i]], q_func, bin, start_initP, start_tP, m, tol_llk, fix_const, databyNM[[i]])
    for (j in 1:m) {
      result[[j]] <- rbind(result[[j]], cbind(unq_by[i], cart[[j]]))
    }
  }
  
  result
}


#####################################################################################################


## 1-8 for 8 niaid score; 9 for NA; 10- for other
NM2CH_data <- function(data, days=paste0("D",1:28), strata = "strata") {
  ## 1-8 for 8 niaid score; 9 for NA; 10- for other
  dat <- data[, days]
  
  
  # get different states
  unq_state <- c()
  for (i in seq_len(ncol(dat))) {
    unq_state <- c(unique(as.character(dat[, i])), unq_state)
  }
  unq_state <- unique(unq_state)
  unq_state <- unq_state[!unq_state %in% as.character(1:8) & !is.na(unq_state)]
  unq_state_num <- seq_len(length(unq_state)) + 9
  
  
  # get q function
  q_func <- matrix(0, ncol = 8, nrow = length(unq_state) + 9)
  q_func[1:8, 1:8] <- diag(8)
  q_func[9, ] <- 1
  for (i in seq_len(length(unq_state))) {
    bg <- substr(unq_state[i], 2, 2)
    ed <- substr(unq_state[i], 4, 4)
    q_func[i + 9, seq(bg, ed)] <- 1
  }
  
  
  # get data
  tdata <- matrix(NA, ncol = length(days), nrow = nrow(data))
  for (i in seq_len(ncol(dat))) {
    for (j in seq_len(nrow(dat))) {
      
      cart_chr <- as.character(dat[j, i])
      if (cart_chr %in% as.character(1:8)) {
        tdata[j, i] <- as.numeric(cart_chr)
      } else {
        if (is.na(cart_chr)) {
          tdata[j, i] <- 9
        } else {
          tdata[j, i] <- unq_state_num[which(unq_state == cart_chr)]
        }
      }
    }
  }
  
  strt <- as.numeric(as.factor(data[, strata]))
  
  list(cbind(strt, tdata), q_func,
       cbind(unq_state, unq_state_num))
  
}



# ####################################################################################################

## get start value according to Nathan's code
get_start <- function(data, bin) {
  
  M = t(rbind(apply(data, 1, function(x) {
    x[x > 8] = NA
    x
  })))
  
  #originally I implemented to stratify baseline. However, not planning to expose this functionality.
  #the following line sets up for no strata
  s=rep(1, nrow(data))
  
  
  strt=niaidMI:::.get_start(M=M,s=s,bin=bin)
  
  list(strt$Tran, strt$Pri[[1]])
}

# ####################################################################################################


## the data matrix should not contains Subject ID
estTransMatrx <- function(data){ ## for niaid os only
  
  n_states <- length(unique(as.vector(data)))
  
  nomi <- matrix(0, ncol = n_states, nrow = n_states)
  deno <- matrix(0, ncol = n_states, nrow = n_states)
  
  for (i in seq_len(nrow(data))) {
    for (j in seq_len(ncol(data) - 1)) {
      if (is.na(data[i, j]) | is.na(data[i, j + 1])) break
      nomi[data[i, j], data[i, j + 1]] <- nomi[data[i, j], data[i, j + 1]] + 1
      deno[data[i, j], ] <- deno[data[i, j], ] + 1
    }
  }
  
  #print(nomi)
  #print(deno)
  nomi / deno
  
}

# estTransMatrx(imp_CH)
# estTransMatrx(imp_NM)
# 
# estTransMatrx(imp_strat_CH[which(imp_strat_CH[, 1] == 1), -1])
# estTransMatrx(imp_strat_NM[which(imp_strat_NM[, 1] == 1), -1])
# 
# estTransMatrx(imp_strat_CH[which(imp_strat_CH[, 1] == 2), -1])
# estTransMatrx(imp_strat_NM[which(imp_strat_NM[, 1] == 2), -1])




estInitDist <- function(data) { ## for niaid os only
  deno <- length(na.omit(data[, 1]))
  result <- numeric(8)
  for (i in 1:8) {
    result[i] <- sum(data[, 1] == i, na.rm = TRUE) / deno
  }
  result
}

# estInitDist(imp_CH)
# estInitDist(imp_NM)
# 
# estInitDist(imp_strat_CH[which(imp_strat_CH[, 1] == 1), -1])
# estInitDist(imp_strat_NM[which(imp_strat_NM[, 1] == 1), -1])
# 
# estInitDist(imp_strat_CH[which(imp_strat_CH[, 1] == 2), -1])
# estInitDist(imp_strat_NM[which(imp_strat_NM[, 1] == 2), -1])



# ####################################################################################################


################# EXAMPLE START HERE #################################################################
# ####################################################################################################
# 
# 
# NMdata <- niaidMI::sim_data(n=200)
# CHdata <- NM2CH_data(NMdata)
# 
# bin <- c(rep(1, 6), #(Please note data has 28 days)
#          rep(2, 7), rep(3, 7), rep(4, 7))
# 
# start_BW <- get_start(CHdata[[1]][, -1], bin)
# start_initP <- start_BW[[2]]
# start_tP <- start_BW[[1]]
# 
# BW_CH(CHdata[[1]][, -1], CHdata[[2]], bin, start_initP, start_tP)
# testimp  <- imputeBS_CH(CHdata[[1]], CHdata[[2]], bin, start_initP, start_tP, m = 3, by = 1)
# testimp2 <- imputeBS_CH(CHdata[[1]][, -1], CHdata[[2]], bin, start_initP, start_tP, m = 3)
#
#
# 
# ####################################################################################################
#
#
#
#
# ### example 1 (only 1 additional state 9 for missing)
# 
# ## q functions: a n_q_func by 8 matrix with (i, j)th
# ## element for q(h^obs=i, h=j)
# q_func <- rbind(diag(8), 1)
# 
# ## dat: observed NIAID for one patient
# dat <- sample(c(1:7, 9), size = 28, replace = TRUE)
# 
# ## bin: a vector with length(dat) - 1
# bin <- c(rep(1, 6), #(Please not data has 28 days)
#          rep(2, 7), rep(3, 7), rep(4, 7))
# 
# ## initial probability of chain
# initP <- c(rep(1, 7) / 7, 0)
# start_initP <- initP
# 
# ## transition probability: a list with length equals
# ## to the number of bins. Each element is a 8*8 matrix.
# tP <- vector('list', length(unique(bin)))
# tP[[1]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# tP[[2]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# tP[[3]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# tP[[4]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# start_tP <- tP
# 
# ## test fwd_bwd_CH
# test <- fwd_bwd_CH(dat, q_func, bin, initP, tP)
# apply(test[[1]]*test[[2]], 1, sum) ## should be 1
# 
# 
# ## dataset: observed NIAID for all patients
# ## each row is the record of one patient
# dataset <- matrix(NA, ncol = 28, nrow = 100)
# for (i in 1:100) {
#   dataset[i, ] <- sample(c(1:7, 9), size = 28, replace = TRUE)
# }
# dataset_fmt=dataset
# dataset_fmt[dataset_fmt==9]=NA
# colnames(dataset_fmt)=paste0("D",1:28)
# dataset_fmt=data.frame(dataset_fmt)
# 
# 
# ## a quick example
# result1 <- BW_CH(dataset, q_func, bin, start_initP, start_tP)
# result2 = bootstrap_param_est(wide=dataset_fmt, b=2, bin=bin, tol = 1e-20)[[1]]
# result1$initial_prob
# result2$Pri
# 
# result1[[2]]
# result2$Pri
# 
# 
# 
# ## quick example
# impData <- impute_CH(dataset, q_func, bin, result1[[1]], result1[[2]])
#
# ##########################################################################
# 
# 
# ### example 2 (2 additional states, 9 for missing, 10 for missing not die)
# 
# ## q functions: a n_q_func by 8 matrix with (i, j)th
# ## element for q(h^obs=i, h=j)
# q_func <- rbind(diag(8), 1, c(rep(1, 7), 0))
# 
# ## dat: observed NIAID for one patient
# dat <- sample(c(1:7, 9, 10), size = 28, replace = TRUE)
# 
# ## bin: a vector with length(dat) - 1
# bin <- c(rep(1, 7), rep(2, 7), rep(3, 7), rep(4, 7))
# 
# ## initial probability of chain
# initP <- c(rep(1, 7) / 7, 0)
# start_initP <- initP
# 
# ## transition probability: a list with length equals
# ## to the number of bins. Each element is a 8*8 matrix.
# tP <- vector('list', length(unique(bin)))
# tP[[1]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# tP[[2]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# tP[[3]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# tP[[4]] <- rbind(matrix(1/8, nrow = 7, ncol = 8),
#                  c(rep(0, 7), 1))
# start_tP <- tP
# 
# ## test fwd_bwd_CH
# test <- fwd_bwd_CH(dat, q_func, bin, initP, tP)
# apply(test[[1]]*test[[2]], 1, sum) ## should be 1
# 
# 
# ## dataset: observed NIAID for all patients
# ## each row is the record of one patient
# dataset <- matrix(NA, ncol = 28, nrow = 100)
# for (i in 1:100) {
#   dataset[i, ] <- sample(c(1:7, 9, 10), size = 28, replace = TRUE)
# }
# 
# 
# ## a quick example
# result <- BW_CH(dataset, q_func, bin, start_initP, start_tP)
# 
# ## quick example
# impData <- impute_CH(dataset, q_func, bin, result[[1]], result[[2]])
# 
# ## should be all true
# (dataset %in% c(9, 10)) != (impData == dataset)
# 
# ##########################################################################

Try the niaidMI package in your browser

Any scripts or data that you put into this service are public.

niaidMI documentation built on March 18, 2022, 7:26 p.m.