PETLER: Multi-modal Automated Survival Time Annotation

Documented in fpca.combine

#' @importFrom utils head
#' @importFrom graphics hist
#' @importFrom stats IQR approx bw.SJ bw.bcv bw.nrd bw.nrd0 bw.ucv density dnorm optimize
NULL

VTM <- function(vc, dm) matrix(vc, ncol = length(vc), nrow = dm, byrow = TRUE)

GetPK <- function(id, t, tseq, fu) {
  id <- as.character(id)
  PK <- rep(NA, length(fu))
  names(PK) <- names(fu)
  e <- diff(tseq)
  aa <- tapply(t, id, FUN = function(t) {
    x <- hist(t, breaks = tseq, plot = FALSE)$counts
    avg_sp <- cumsum(x) / tseq[-1]
    avg_sp2 <- c(0, head(avg_sp, -1))
    which.max((avg_sp - avg_sp2) / (avg_sp2 + e))
  })
  PK[unique(id)] <- tseq[aa + 1]
  PK
}




fpca.check <- function(time, fu_train, fu_valid){
  #-1-- minimum in "time" should be greater than or equal to 1
  if (!is.vector(time)) stop("Data Entry Issue: 'time' should be a vector")
  chk <- min(time)
  if (chk < 1) {
    stop("Data Entry Issue: The minimum of 'time' should be 1. Please do not code Month 0 for Days 1 to 30 but Month 1.")
  }
  
  #-2-- fu_train and fu_valid should be all vecotrs and one-record per subject and mutually exclusive
  if (!is.vector(fu_train)) stop("Data Entry Issue: 'fu_train' should be a vector")
  if (!is.vector(fu_valid)) stop("Data Entry Issue: 'fu_valid' should be a vector")
  chk1 <- unique(names(fu_train))
  chk2 <- unique(names(fu_valid))
  if (length(fu_train) != length(chk1)) stop("Data Entry Issue: More than one entry from one subject in 'fu_train'")
  if (length(fu_valid) != length(chk2)) stop("Data Entry Issue: More than one entry from one subject in 'fu_valid'")
  chk3 <- c(names(fu_train), names(fu_valid))
  if (length(chk3) != length(unique(chk3))) stop("Data Entry Issue: There are subjects who are in both 'fu_train' and 'fu_valid'")
  
  #-3-- subjects in time should be embedded by fu_train or fu_valid
  chk <- match(names(time), c(names(fu_train), names(fu_valid)))
  if (sum(is.na(chk) != 0)) stop("Data Entry Issue: Some subjects in 'time' do not have follow-up time information in 'fu_train' or 'fu_valid'")
  follow = data.frame("id" = c(names(fu_train),names(fu_valid)),
                      "follow" = c(fu_train,fu_valid))
  time_df = data.frame("id" = names(time),
                       "time" = time)
  follow_expand = merge(follow,time_df)
  time_df_max = tapply(follow_expand$time, follow_expand$id, max)
  time_df_max = data.frame("id" = names(time_df_max),
                       "obs_max" = time_df_max)
  follow = merge(follow, time_df_max,by="id")
  if (sum(follow$obs_max>follow$follow)>0) stop("Data Entry Issue: in longitudinal data, some subjects have encounters later than the follow up time.")
}




fpca.pre <- function(time, fu_train, fu_valid=NULL, Tend, as_int, least_c=1, least_uc=1){
  fu <- c(fu_train, fu_valid)
  uni.count <- tapply(time, names(time), function(x){length(unique(x))})
  count <- tapply(time, names(time), function(x){length(x)})
  names_least <- names(uni.count[uni.count>=least_uc & count>=least_c])
  names_fu = names(fu)
  time <- time[(names(time) %in% names_fu) 
               & (names(time) %in%  names_least)]    #only keep data with follow-up info
  names_time <- names(time)
  count <- tapply(time, names_time, length)
  time_std <- time / fu[names_time]
  if (as_int) {
    names_time <- as.integer(names_time)
  }
  count_all <- 0 * fu
  count_all[names(count)] = count
  out = list(`fu`=fu,
             `time`=time,`time_std`=time_std,`names_time`=names_time,
             `count`=count,`count_all`=count_all)
  return(out)
}






fpca.summary <- function(data, tmp, fu_train, fu_valid){
  
  PKTS <- GetPK(
    id = data$names_time, ### INT/CHAR ###
    t = data$time,
    tseq = 0:floor(max(data$fu)),
    fu = data$fu
  )
  
  ft.e <- cbind(
    matrix(data$fu, nrow = length(data$fu), ncol = 3),
    -tmp$baseline[1], log(1 + data$count_all)
  )                                             # if longitudinal data is missing - use baseline 
  rownames(ft.e) = names(data$count_all)
  
  pos <- names(data$count)
  ft.e[pos, 1] <- tapply(data$time, data$names_time, min)
  locm <- unlist(apply(tmp$densities[, 1:length(pos) + 1], 2, which.max))  # peak
  ft.e[pos, 2] <- ft.e[pos, 2] * tmp$densities[locm, 1]  #density[,1] is the x-coordinate
  ft.e[pos, 3] <- ft.e[pos, 3] * tmp$derivatives[
    sapply(1:length(pos), function(i) {
      #which.max(tmp$derivatives[1:locm[i], i + 1])  #--??
      which.max(tmp$derivatives[, i + 1])
    }), 1]
  ft.e[pos, 4] <- tmp$scores[, 2]
  colnames(ft.e) <- c("1stCode", "Pk", "ChP", "1stScore", "logN")
  TrainN = data.frame("id" = names(fu_train),
                      "pred_total" = data$count_all[names(fu_train)])
  ValidN = data.frame("id" = names(fu_valid),
                      "pred_total" = data$count_all[names(fu_valid)])
  
  out <- list(
    TrainN = TrainN,
    ValidN = ValidN,
    TrainFt = ft.e[names(fu_train),],
    ValidFt = ft.e[names(fu_valid),],
    TrainPK = PKTS[names(fu_train)],
    ValidPK = PKTS[names(fu_valid)]
  )
  class(out) <- "fpca"
  return(out)
}




#' @title Extract Features by Functional Principal Component Analysis (FPCA)
#' @description Performs FPCA to extract features from longitudinal encounter data.
#' @param longitudinal longitudinal encounter times. See an example as \code{longitudinal} data.
#' @param follow_up_time follow-up time information. See an example as \code{follow_up_time}.
#' @param K.select characters indicating which method to choose the number of principal components K.
#'  Default is K.select="PropVar", and K.select="PPIC" is also available.
#' @param Kmax an integer value. The max of the principle components K. Default is \code{5}.
#' @param n.grid an integer value for grid points used in estimating covariance function g. Default is \code{401}.
#' @param propvar a proportion of variation used to select number of FPCs. Default is \code{0.85}.
#' @export
fpca.combine <- function(longitudinal, follow_up_time, 
                         K.select = "PropVar", Kmax = 5, n.grid = 401, propvar = 0.85){
  longitudinal$code = as.numeric(longitudinal$code)
  codes = sort(unique(longitudinal$code))
  if(min(codes)!=1) stop("Codes should start from 1.")
  
  diff.index = codes[2:length(codes)] - codes[1:(length(codes)-1)]
  if(sum(diff.index != 1) > 0) stop("Codes must be incremental with no skipping number.")
 
  D = sapply(codes, function(i){
    x = longitudinal$time[longitudinal$code == i]
    names(x) = longitudinal$id[longitudinal$code == i]
    return(x)
  })
  names(D) = codes
  follow_up_train <- follow_up_time[follow_up_time$train_valid==1, ]
  follow_up_valid <- follow_up_time[follow_up_time$train_valid==2, ]
  
  fu_train <- follow_up_train$fu_time
  fu_valid <- follow_up_valid$fu_time
  names(fu_train) <- follow_up_train$id
  names(fu_valid) <- follow_up_valid$id

  TrainFt <- c()
  ValidFt <- c()
  TrainPK <- c()
  ValidPK <- c()
  TrainN <- c()
  ValidN <- c()

  for (i in 1:length(codes)) {
    time <- D[[i]]
    ans <- fpca.new(time, fu_train, fu_valid, K.select = "PropVar", Kmax = Kmax) 
    #--- create an object for fitting --
    Ft_name <- colnames(ans$TrainFt)
    Ft_name <- paste0(Ft_name, codes[i])
    colnames(ans$TrainFt) = colnames(ans$ValidFt) = Ft_name
    TrainFt <- cbind(TrainFt, ans$TrainFt)
    ValidFt <- cbind(ValidFt, ans$ValidFt)
    
    TrainPK <- cbind(TrainPK, ans$TrainPK)
    ValidPK <- cbind(ValidPK, ans$ValidPK)
    if (i == 1) {
      TrainN <- ans$TrainN
      ValidN <- ans$ValidN
    }else{
      TrainN <- cbind(TrainN, ans$TrainN[, 2])
      ValidN <- cbind(ValidN, ans$ValidN[, 2])
    }
  }
  
  colnames(TrainPK) <- colnames(ValidPK) <- paste0("pred", codes)
  colnames(TrainN) <- colnames(ValidN) <- c("id", paste0("pred", codes, "_total"))
  
  Z <- list()
  Z$TrainFt <- TrainFt
  Z$ValidFt <- ValidFt
  Z$TrainPK <- TrainPK
  Z$ValidPK <- ValidPK
  Z$TrainN <- TrainN
  Z$ValidN <- ValidN
  
  #-- for masta.validation() --   
  Z$data = list(longitudinal=longitudinal, follow_up_time=follow_up_time)
  Z$parm = list(K.select = K.select, Kmax = Kmax , n.grid = n.grid, propvar = propvar)
  return(Z)
}