#' @title Main function implementing the MASTA algorithm
#' @description This function fits an algorithm that identifies the occurrence of an event outcome from the trajectories of several predictors.
#' @param object The object returned by the \code{fpca.combine} function
#' @param survival The labeled data. The columns should be 1) id, 2) event indicator, 3) event time, followed by baseline predictors.
#' @param follow_up_time The follow-up data. The columns should be 1) id, 2) follow-up time, 3) a \code{train_valid} indicator (1 = training, 2 = validation).
#' @param Tend A scalar value; event times (\code{SX}) that are \code{NA} are replaced by it. The default is \code{1}.
#' @param cov_group A vector of consecutive integers describing the grouping of the baseline covariates only. When \code{NULL} is specified (default), each covariate is placed in its own group.
#' @param thresh A sparsity threshold. The default is \code{0.7}, meaning that for codes where more than 70\% of patients have no codes, only the first code time is used.
#' @param PCAthresh a threshold value for PCA. Default is \code{0.9}.
#' @param seed random seed used for the sampling. Default is \code{100}.
#' @return A list with components:
#' @return \item{bgbbest}{Details of the fitted models (coefficient estimates)}
#' @return \item{result_valid}{Performance of the derived algorithm on the validation data (C-statistics, Brier-score-based measures, etc.)}
#' @return \item{group}{A vector of consecutive integers describing the grouping of the coefficients}
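#' @examples
#' \dontrun{
#' ## A minimal hypothetical call (object names are illustrative, not shipped
#' ## with the package): `fpca_out` is the result of \code{fpca.combine()};
#' ## `surv_dat` has columns id, event indicator, event time, and baseline
#' ## predictors; `fu_dat` has columns id, follow-up time, and a train_valid
#' ## indicator (1 = training, 2 = validation).
#' fit <- masta.fit(fpca_out, survival = surv_dat, follow_up_time = fu_dat)
#' fit$result_valid # validation performance (C-statistic, Brier-based measures)
#' }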
#' @importFrom survival Surv survfit
#' @importFrom stats lm optim prcomp predict quantile runif sd uniroot
#' @export
masta.fit <- function(object, survival, follow_up_time, Tend=1, cov_group = NULL, thresh = 0.7, PCAthresh = 0.9, seed = 100) {
aa <- merge(survival, follow_up_time, by = "id")
TrainSurv <- aa[aa$train_valid == 1, -ncol(aa)]
ValidSurv <- aa[aa$train_valid == 2, -ncol(aa)]
npred <- ncol(aa) - 5
codes <- paste0("pred", 1:npred)
colnames(TrainSurv) <- colnames(ValidSurv) <- c("case", "delta", "sx", paste0("base_pred", 1:npred), "sc")
nn <- nrow(TrainSurv)
TrainSurv_pred_org <- TrainSurv[, grep("base_pred",names(TrainSurv))] #--- baseline predictors
ValidSurv_pred_org <- ValidSurv[, grep("base_pred",names(ValidSurv))] #--- baseline predictors
TrainN <- object$TrainN
ValidN <- object$ValidN
TrainFt <- object$TrainFt[row.names(object$TrainFt) %in% as.character(TrainSurv$case), ]
ValidFt <- object$ValidFt
TrainPK <- object$TrainPK[row.names(object$TrainPK) %in% as.character(TrainSurv$case), ]
ValidPK <- object$ValidPK
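#--- each code occupies 5 consecutive columns of TrainFt/ValidFt: [+1] first
#--- code time, [+2] peak, [+3] change point, plus two further features from
#--- the fpca.combine output that are left untouched below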
for (i in seq_along(codes)) {
idx0 <- 5 * (i - 1)
#--switch peak & change point if later one is bigger---
tmp <- TrainFt[, idx0 + 2] < TrainFt[, idx0 + 3]
if (sum(tmp) > 0) {
aa <- TrainFt[tmp, idx0 + 2]
TrainFt[tmp, idx0 + 2] <- TrainFt[tmp, idx0 + 3]
TrainFt[tmp, idx0 + 3] <- aa
}
tmp <- ValidFt[, idx0 + 2] < ValidFt[, idx0 + 3]
if (sum(tmp) > 0) {
aa <- ValidFt[tmp, idx0 + 2]
ValidFt[tmp, idx0 + 2] <- ValidFt[tmp, idx0 + 3]
ValidFt[tmp, idx0 + 3] <- aa
}
# set pk & chp to first code time if 0
tmp <- TrainFt[, idx0 + 2] == 0
if (sum(tmp) > 0) TrainFt[tmp, idx0 + 2] <- TrainFt[tmp, idx0 + 1]
tmp <- TrainFt[, idx0 + 3] == 0
if (sum(tmp) > 0) TrainFt[tmp, idx0 + 3] <- TrainFt[tmp, idx0 + 1]
tmp <- ValidFt[, idx0 + 2] == 0
if (sum(tmp) > 0) ValidFt[tmp, idx0 + 2] <- ValidFt[tmp, idx0 + 1]
tmp <- ValidFt[, idx0 + 3] == 0
if (sum(tmp) > 0) ValidFt[tmp, idx0 + 3] <- ValidFt[tmp, idx0 + 1]
}
##### log scale
ncodes <- length(codes)
tmp <- seq(5 * ncodes)
aa <- tmp[tmp %% 5 %in% 1:3]
TrainFt[, aa] <- log(TrainFt[, aa])
ValidFt[, aa] <- log(ValidFt[, aa])
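#--- first code times on the original (unlogged) scale for the validation set
#--- (not used further inside this function)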
FirstCode <- exp(ValidFt[, seq(1, 5 * ncodes, 5)])
### Regress out baseline ##
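#--- regress each trajectory feature on the baseline predictors and keep the
#--- residuals; the fitted training coefficients (RegCoef) are reused below to
#--- adjust the validation features in the same way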
tmp <- paste(paste0("base_pred", 1:length(TrainSurv_pred_org)), collapse = " + ")
txt <- paste0("lm(x ~ ", tmp, ", data = TrainSurv)")
func_1 <- function(x) {
tmp <- eval(parse(text = txt))
list(resid = tmp$residuals, coef = tmp$coefficients)
}
TrainFt <- data.frame(TrainFt)
TrainFt <- lapply(TrainFt, func_1)
RegCoef <- do.call(cbind, lapply(TrainFt, `[[`, 2))
TrainFt <- do.call(cbind, lapply(TrainFt, `[[`, 1))
tmp <- paste(paste0("base_pred", 1:length(ValidSurv_pred_org)), collapse = "', '")
txt <- paste0("ValidFt[, l] - as.matrix(cbind(1, ValidSurv[, c('", tmp, "')])) %*% RegCoef[, l]")
ValidFt <- sapply(1:ncol(ValidFt), function(l) eval(parse(text = txt)))
colnames(ValidFt) <- colnames(TrainFt)
### get estimating equation for beta!!!!!!!!!!!!!!
SX <- TrainSurv$sx
SC <- TrainSurv$sc
Delta <- TrainSurv$delta
tseq <- sort(unique(SX[Delta == 1]))
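#--- Kaplan-Meier estimate of the censoring survival function G (hence the
#--- reversed indicator 1 - Delta); G_SX and G_tseq are used for
#--- inverse-probability-of-censoring weighting below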
G_fit <- survfit(Surv(SX, 1 - Delta) ~ 1)
G_tseq <- sapply(tseq, function(s) {
tmp2 <- G_fit$time <= s
if (sum(tmp2) == 0) {
1
} else {
G_fit$surv[max(which(tmp2))]
}
})
G_SX <- summary(G_fit, times = SX, extend = TRUE)$surv
G_SX[sort(SX, index.return = TRUE)$ix] <- G_SX
G_SX[G_SX == 0] <- min(G_SX[G_SX > 0])
G_tseq[G_tseq == 0] <- min(G_tseq[G_tseq > 0])
#--- for output ---
G_SX_train <- G_SX
G_tseq_train <- G_tseq
#### get initial values using Cheng et al (1995,1997)
####### number of patients with no codes in the labeled set (training)+SEER
TrainZC <- sapply(seq_along(codes), function(i) sum(TrainN[, i + 1] == 0))
names(TrainZC) <- codes
####### number of patients with no codes in the labeled set (validation)
ValidZC <- sapply(seq_along(codes), function(i) sum(ValidN[, i + 1] == 0))
names(ValidZC) <- codes
# for codes where more than `thresh` (default 70%) of patients have no codes, only use the first code time
codes0 <- which((TrainZC + ValidZC) / (nrow(TrainN) + nrow(ValidN)) > thresh)
codes0 <- c(sapply(codes0, function(x) (x - 1) * 5 + 2:4))
codesK <- lapply(1:length(codes), function(x) {
tmp <- (x - 1) * 5 + 1:5
tmp <- tmp[!tmp %in% codes0]
})
# - Z <- as.matrix(cbind(TrainSurv[, paste0("base_pred", 1:length(TrainSurv_pred_org))], TrainFt))
Z <- as.matrix(cbind(TrainSurv[, grep("base_pred", names(TrainSurv))], TrainFt))
### standardize Z
meanZ <- apply(Z, 2, mean)
sdZ <- apply(Z, 2, sd)
for (m in 1:length(TrainSurv_pred_org)) {
aa <- unique(TrainSurv[, paste0("base_pred", m)])
if (length(aa) == 2) {
#--categorical -> do not standardize
meanZ[paste0("base_pred", m)] <- 0
sdZ[paste0("base_pred", m)] <- 1
}
}
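#--- center and scale each column of Z (VTM replicates a vector across
#--- nrow(Z) identical rows)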
Z <- (Z - VTM(meanZ, nrow(Z))) / VTM(sdZ, nrow(Z))
# PCA within each code to pre-select features
TrainFt_PCA <- lapply(1:length(codes), function(i) {
if (length(codesK[[i]]) > 0) {
tmp <- Z[, codesK[[i]] + length(TrainSurv_pred_org)]
tmp2 <- prcomp(tmp)
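# choose the smallest K whose cumulative share of sdev reaches PCAthresh
# (note: sdev, not variance, is accumulated here)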
PCA_K <- sum(cumsum(tmp2$sdev) / sum(tmp2$sdev) < PCAthresh) + 1
tmp3 <- tmp2$x[, 1:PCA_K]
list(PCA_K = PCA_K, PCAs = tmp3, PCAobj = tmp2)
} else {
list(PCA_K = 0, PCAs = NULL, PCAobj = NULL)
}
})
PCA_K <- do.call(c, lapply(TrainFt_PCA, `[[`, 1))
PCAnames <- paste0(rep(codes, PCA_K), unlist(lapply(PCA_K, seq_len)), "_", rep(seq_along(codes), PCA_K)) # seq_len() is safe when PCA_K[i] == 0 (1:x would give c(1, 0))
PCAobjs <- lapply(TrainFt_PCA, `[[`, 3)
TrainFt_PCA <- do.call(cbind, lapply(TrainFt_PCA, `[[`, 2))
colnames(TrainFt_PCA) <- PCAnames
Z <- cbind(Z[, 1:length(TrainSurv_pred_org)], TrainFt_PCA)
#--group: a vector of consecutive integers describing the grouping of the coefficients
# group <- do.call(c, sapply(1:length(PCA_K), function(i) rep(i, PCA_K[i]))) #<--- this does not work if the elements of PCA_K are identical
group <- rep(1:length(PCA_K), PCA_K)
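# e.g., PCA_K = c(2, 3) yields group = c(1, 1, 2, 2, 2)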
#--add grouping for covariates and update the group vector--
if (is.null(cov_group)) cov_group <- 1:length(TrainSurv_pred_org)
group <- c(cov_group, group + cov_group[length(cov_group)])
#corZExtreme(Z, 0.7)
### note: dropping ChP (change point) was considered, since Pk (peak) seems to be estimated better
#### B-spline setting (9 knots plus 2 = 11 --- based on quantiles)
degree <- 1
knots <- quantile(SX[Delta == 1], prob = seq(0.1, 0.9, 0.1))
Boundary.knots <- c(0, max(SX))
allknots <- c(Boundary.knots[1], knots, Boundary.knots[2])
q <- length(knots) + degree + 1
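# with the defaults above, q = 9 interior knots + degree (1) + 1 = 11 basis functions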
#-- for output
nparm4Bspline <- q
#### ===================
#### Initial values
#### ===================
# set.seed(seed)
# if (0) {
# b_ini <- runif(ncol(Z), -0.1, 0.1)
# b_next <- b_ini
# b_next <- estbb.data(bb = b_next, Z = Z, Delta = Delta, G_SX = G_SX, SX = SX)$est
# #### --Option I: Cheng et al (1995,1997) Not as good as NPMLE initial
# func1 <- function(bb, Z, Delta, G_SX, SX) estbb.data(bb = bb, Z = Z, Delta = Delta, G_SX = G_SX, SX = SX)$est
# func2 <- function(bb, Z, Delta, G_SX, SX) estbb.data(bb = bb, Z = Z, Delta = Delta, G_SX = G_SX, SX = SX)$Jacob
# bb <- multiroot(f = func1, start = b_ini, jacfunc = func2, Z = Z, Delta = Delta, G_SX = G_SX, SX = SX)
# bb_Cheng <- bb.init <- bb$root
# }
bb_Cheng <- rep(0, ncol(Z))
ht <- sapply(seq_along(tseq), function(i) {
est <- function(a) mean((SX >= tseq[i]) / G_tseq[i] + g_fun(a + Z %*% bb_Cheng)) - 1
if (est(-1e10) > 0) -1e10 else uniroot(est, lower = -1e10, upper = 1e10)$root
})
ht <- sort(ht)
bbt_Cheng <- cbind(tseq, ht)
### add more weights to origin to get better estimation there ???
### on original scale
# ht[1] <- -30 would give an error if ht[2] is less than -30
ht[1] <- min(ht[1], ht[2] - 10)
weit <- (2 / (tseq[-1] + tseq[-length(tseq)]))^(1 / 4)
bgi <- function(bg) {
tmpp <- h.fun(tseq[-1], knots, Boundary.knots, bg)
tmp <- (ht[-1] - log(tmpp))^2
dtmp <- -2 * (ht[-1] - log(tmpp)) * dh.fun(tseq[-1], knots, Boundary.knots, bg) / tmpp
fn <- sum((tmp[-1] + tmp[-length(tmp)]) / 2 * diff(tseq[-1]) * weit[-1])
gr <- apply((dtmp[-1, ] + dtmp[-length(tmp), ]) / 2 * diff(tseq[-1]) * weit[-1], 2, sum)
return(list(fn = fn, gr = gr))
}
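# hand-tuned starting values (plus jitter); note the fixed vector has length 11,
# so this assumes q == 11, i.e. the default knot setting above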
ini <- c(-12, 5, 6, rep(7, 7), -3.5) + runif(q, -0.5, 0.5)
bg.init.Cheng <- optim(
par = ini,
fn = function(bg) bgi(bg)$fn,
gr = function(bg) bgi(bg)$gr,
method = "BFGS", control = list(maxit = 30000)
)
bg.init.Cheng <- bg.init.Cheng$par
###### Option III: arbitrary initial values but with some ridge
## optim with initial bg_bm
bgbm.init <- c(bg.init.Cheng, bb_Cheng)
lam <- 0
bgbm.optim <- optim(
par = bgbm.init,
fn = function(x) {
SurvLoglik(
bg = x[1:q], bb = x[-(1:q)], knots, Boundary.knots, Delta, SX, Z
)$likelihood
},
gr = function(x) {
SurvLoglik(
bg = x[1:q], bb = x[-(1:q)], knots, Boundary.knots, Delta, SX, Z, likelihood = FALSE, gradient = TRUE
)$gradient
},
method = "BFGS",
control = list(maxit = 100000)
)
# inital.values = cbind(bgbm.init, c(bb_NPMLE, bg.init.NPMLE),bgbm.optim$par)
# colnames(inital.values)=c("Cheng (Option 1)","NPMLE (Option 2)","Ridge (Option 3)")
# print(inital.values)
#--- diagnostics: these bare expressions are not auto-printed inside a
#--- function; uncomment (or wrap in print()) to inspect
# bgbm.optim
#--- gamma ---
# cbind(bg.init.Cheng, bgbm.optim$par[1:q])
#--- beta ---
# cbind(bb_Cheng, bgbm.optim$par[-c(1:q)])
# ===================
# Group Lasso
# ===================
## optim with initial bgbm
lam.glasso <- sort(seq(0, 0.003, 1e-6), decreasing = TRUE)
bgbm.optim.glasso <- gglasso.Approx.BSNP(
bgbm.optim$par[1:q], bgbm.optim$par[-c(1:q)], knots, Boundary.knots, Delta, SX, Z, 0, lam.glasso, group
)
mo21 <- which.min(bgbm.optim.glasso$AIC.LSA) # AIC
mo22 <- which.min(bgbm.optim.glasso$BIC.LSA) # BIC
mo23 <- which.min(bgbm.optim.glasso$AIC.Orig) # AIC on original model
mo24 <- which.min(bgbm.optim.glasso$BIC.Orig) # BIC on original model
nzpar <- cbind(bgbm.optim.glasso$beta[, c(mo21, mo22, mo23, mo24)]) != 0 # non-zero parameters
#-- always keep the q B-spline parameters and the npred baseline covariates
nzpar <- rbind(matrix(TRUE, nrow = q + npred, ncol = 4), nzpar[-(1:npred), ])
bgbm.optim.glasso <- sapply(1:4, function(i) {
tmp <- rep(0, q + ncol(Z))
tmp[nzpar[, i]] <- optim(
par = bgbm.optim$par[nzpar[, i]],
fn = function(x) {
SurvLoglik.nz(
bg = x[1:q], bb = x[-(1:q)], nzpar[-(1:q), i], knots, Boundary.knots, Delta, SX, Z
)$likelihood
},
gr = function(x) {
SurvLoglik.nz(
bg = x[1:q], bb = x[-c(1:q)], nzpar[-c(1:q), i], knots, Boundary.knots, Delta, SX, Z,
likelihood = FALSE, gradient = TRUE
)$gradient
},
method = "BFGS",
control = list(maxit = 10000)
)$par
tmp
})
bgbm.optim.glasso <- list(
bgbb = bgbm.optim.glasso,
lam.glasso = lam.glasso[c(mo21, mo22, mo23, mo24)]
)
colnames(bgbm.optim.glasso$bgbb) <- names(bgbm.optim.glasso$lam.glasso) <- c("AIC", "BIC", "AIC.Orig", "BIC.Orig")
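#--- collect 6 candidate coefficient vectors: the initial values, the
#--- unpenalized MLE, and the four refits on the supports selected by
#--- AIC/BIC (LSA approximation and original model)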
bgbbest <- cbind(bgbm.init, bgbm.optim$par, bgbm.optim.glasso$bgbb)
tree.fit <- treefit(Delta, Z[, -(1:length(TrainSurv_pred_org))]) # exploratory; result is not used in the returned object
set.seed(seed)
train <- sample(1:nn, nn * 0.75)
logi.fit <- logifit(Delta, Z, train, colnames(Z)[1:length(TrainSurv_pred_org)])
# vars <- logi.fit$vars
# vars <- sapply(vars, function(x) substr(x, 1, nchar(x) - 3), USE.NAMES = FALSE)
# vars <- unique(vars)
vars_wk1 <- logi.fit$vars
vars_wk2 <- substring(vars_wk1, regexpr("_", vars_wk1) + 1)
vars_wk3 <- paste0("pred", vars_wk2)
vars <- unique(vars_wk3)
wei <- GetWei(TrainPK, vars, Delta, SX)
# ================
### validation
# ================
SX <- ValidSurv$sx
SC <- ValidSurv$sc
Delta <- ValidSurv$delta
## the same PCA basis as training set (use PCAobjs from the training)
tmp <- paste0("base_pred", 1:length(ValidSurv_pred_org))
Z <- as.matrix(cbind(ValidSurv[, tmp], ValidFt))
Z <- (Z - VTM(meanZ, nrow(Z))) / VTM(sdZ, nrow(Z))
ValidFt_PCA <- do.call(cbind, sapply(1:length(codes), function(i) {
if (length(codesK[[i]]) > 0) {
tmp <- matrix(Z[, codesK[[i]] + length(ValidSurv_pred_org)], ncol = length(codesK[[i]]))
colnames(tmp) <- colnames(Z)[codesK[[i]] + length(ValidSurv_pred_org)]
tmp3 <- predict(PCAobjs[[i]], tmp)[, 1:PCA_K[i]]
list(PCAs = tmp3)
} else {
list(PCAs = NULL)
}
}))
colnames(ValidFt_PCA) <- PCAnames
Z <- cbind(Z[, 1:length(ValidSurv_pred_org)], ValidFt_PCA)
endF <- quantile(SX, 0.9)
t.grid <- tseq[seq(9, length(tseq), 10)]
#---- this uses the validation data to get G_fit
G_fit <- survfit(Surv(SX, 1 - Delta) ~ 1)
G_tseq <- sapply(tseq, function(s) {
tmp2 <- G_fit$time <= s
if (sum(tmp2) == 0) {
1
} else {
G_fit$surv[max(which(tmp2))]
}
})
G_SX <- summary(G_fit, times = SX, extend = TRUE)$surv
G_SX[sort(SX, index.return = TRUE)$ix] <- G_SX
G_SX[G_SX == 0] <- min(G_SX[G_SX > 0])
G_tseq[G_tseq == 0] <- min(G_tseq[G_tseq > 0])
eval <- GetPrecAll(
bgbbest, SX, SX, SC, Delta, Z, tseq, t.grid,
knots, Boundary.knots, G_SX, G_tseq, endF, Tend
)
tmp <- eval$CB
tmp2 <- mean(BrierScore.KM2(tseq[tseq <= endF], SX, SX, SC, Delta, tseq, G_SX, G_tseq)[, 2])
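#--- convert the Brier-score row into a relative improvement over the
#--- Kaplan-Meier benchmark (1 - ratio)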
tmp[2, ] <- 1 - tmp[2, ] / tmp2
cstats <- tmp
# ================
# Output
# ================
out <- list()
#--- knots used for B-spline
out$nparm4Bspline <- nparm4Bspline
#--- regression parameter estimate (from Training) --
out$bgbbest <- bgbbest
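#-- NB: bgbbest is a matrix, so the single-bracket extracts below read it
#-- column-major: gamma_est is the top of its first column (the initial
#-- values), and beta_est collects all remaining entries across columns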
out$gamma_est <- bgbbest[1:nparm4Bspline]
out$beta_est <- bgbbest[-c(1:nparm4Bspline)]
#--- PCA (from Training) --
out$PCAobjs <- PCAobjs
#--- G_SX and G_tseq (from Training) --
out$GX_train <- G_SX_train
out$Gt_train <- G_tseq_train
#--- group used for group lasso
out$group <- group
#--- result with the validation data
out$result_valid <- cstats
#---- predicted score (validation data)
out$risk_score_valid <- eval$risk_score
out$pred_surv_valid <- eval$pred_surv
#---- for masta.validation()
out$fpca_obj <- object
out$data_survival <- survival
out$parm <- list(Tend=Tend, cov_group = cov_group, thresh = thresh, PCAthresh = PCAthresh, seed = seed)
#-- add class to the return object --
class(out) <- "masta"
return(out)
}