#########################################################################################################
#########################################################################################################
#' @name crossvalidation
#'
#' @title Loss Calculation by Cross Validation
#'
#' @description Function here are to calculate the loss by cross validation for Bayesian hierarchical model (see also \code{\link{Hier}})
#' and Bayesian model with Ising prior (see also \code{\link{Ising}}). This can be used to select the best hyperparameters and to compare
#' two models.
#'
#' @details
#' The loss is calcuated by:
#' \deqn{\sqrt{\sum_{bj} [(Y_{bj}-N_t*t_{bj})^2]}/N_t + \sqrt{\sum_{bj} [(X_{bj}-N_c*c_{bj})^2]}/N_c}
#' Here b=1,..., B and j=1, ... , k_b, Y_{bj} and X_{bj} are the number of subjects with
#' an AE with PT j under SOC b in treatment and control groups.
#' N_t and N_c are the number of subjects in treatment and control groups, respectively.
#' t_{bj} and c_{bj} are the model fitted incidence of an AE with PT j under SOC b in treatment and control groups.
#' This formular gives the loss for one interaction/sample, the final loss is the average of loss from all of the interactions/samples.\cr
#'
#' The loss is calcuated in following way: first the subjects original AE dataset (output of \code{\link{preprocess}}) is randomly evenly
#' divided k independent subparts. For each subpart, use this subpart as the testing dataset and use the rest of the whole dataset as the
#' training dataset. Model is trained with the training dataset and then loss is calculated for the testing dataset and training dataset.
#' Repeat this for each subpart and take the average of the testing loss and training loss from each subpart as the final loss. \cr
#'
#' \strong{\code{Lossfun}} takes the AE dataset and fitted incidence as parameters and calculate the loss based on the loss function above.\cr
#'
#' \strong{\code{kfdpar}} first calls function \code{preprocess} to process the data and produce a temporary dataset
#' and also calls function \code{preprocess} to process the data to get the whole AE dataset.
#' Then this temporary dataset will be randomly divided into k equal subparts. For each subpart,
#' use this subpart as the testing dataset and use the rest of the whole dataset as the
#' training dataset.This function will generate a list with k elements with each element is a also a list
#' a list contains two elements, named traindf and testdf.
#' "traindf" is used to train the model and testdf is usesd to calcualte the loss.
#' The output is going to be used for further crossvalidation to calculate loss. \cr
#'
#' \strong{\code{CVhier}} calculates the loss for Bayesian Hierarchical model.\cr
#'
#' \strong{\code{CVising}} calculates the los for Bayesian model with Ising prior. \cr
#'
#' @inheritParams preprocess
#' @param aedata output from function \code{\link{preprocess}}
#' @param PI output from function \code{\link{Hiergetpi}} or \code{\link{Isinggetpi}}
#' @param k interger, the number of folds used to split the dataset for cross validation
#' @inheritParams Ising
#' @inheritParams Hier
#'
#' @return
#' \strong{\code{Lossfun}} returns the loss for dataset \code{aedata} based on the fitted incidence \code{PI}.\cr
#' \strong{\code{kfdpar}} returns a list with k elements with each element is a also a list,
#' that contains two elements, named traindf and testdf.\cr
#' \strong{\code{CVhier}} returns the final training and testing loss for Bayesian hierarchical model. \cr
#' \strong{\code{CVIsing}} returns the final training and testing loss for Bayesian model with Ising model. \cr
#'
#' @examples
#' \dontrun{
#' data(ADAE)
#' data(ADSL)
#' AEdata<-preprocess(adsl=ADSL, adae=ADAE)
#' AELIST<-kfdpar(ADSL, ADAE, k=5)
#'
#' # Bayesian Hierarchical Model
#' HIERRAW<-Hier_history(aedata=AEdata, n_burn=1000, n_iter=1000, thin=20, n_adapt=1000, n_chain=2)
#' HIERPI<-Hiergetpi(aedata=AEdata, hierraw=HIERRAW)
#' loss_1<-Lossfun(aedata=AEdata, PI=HIERPI)
#' LOSSHIER<-CVhier(AElist=AELIST, n_burn=1000, n_iter=1000, thin=20, n_adapt=1000, n_chain=2)
#' LOSSHIER$trainloss # train loss
#' LOSSHIER$testloss # test loss
#'
#' # Bayesian model with Ising prior
#' ISINGRAW<-Ising_history(aedata = AEdata, n_burn=1000, n_iter=5000, thin=20, alpha_=0.5, beta_=0.75, alpha.t=0.5, beta.t=0.75,
#' alpha.c=0.25, beta.c=0.75, rho=1, theta=0.02)
#' ISINGPI<-Isinggetpi(aedata = AEdata, isingraw=ISINGRAW)
#' loss_2<-Lossfun(aedata=AEdata, PI=ISINGPI)
#'
#' LOSSISING<-CVising(AElist=AELIST, n_burn=100, n_iter=500, thin=20, alpha_=0.5, beta_=0.75, alpha.t=0.5, beta.t=0.75,
#' alpha.c=0.25, beta.c=0.75, rho=1, theta=0.02)
#' LOSSISING$trainloss # train loss
#' LOSSISING$testloss # test loss
#' }
#'
#' @seealso
#' \code{\link{preprocess}}, \code{\link{Hier}}, \code{\link{Ising}}, \code{\link{Isinggetpi}},
#' \code{\link{Hiergetpi}}
#'
#' @export
Lossfun<-function (aedata, PI){
# aedata is the output from function preprocess
# PI is the output from either function Hiergetpi or Isiinggetpi
# PI is a list containg two items, pit and pic
# this function will calculate the loss for dataset aedata,
# based on the predicated incidence rate for treatment group pit
# and the predicated incidence rate for control group pic
# aedata is the output from function preprocess
# pit dataset must contain the following items (and only contain these items)
# SoC, PT, incidience rate for treatment group from each iteraction (SoC and PT must be the first two columns)
# pic dataset must contain the following items
# SoC, PT, incidience rate for control group from each iteraction (SoC and PT must be the first two columns)
# the loss is defined as: $\sqrt{\sum_{bj} [(Y_{bj}-N_t*t_{bj})^2]}/N_t + \sqrt{\sum_{bj} [(X_{bj}-N_c*c_{bj})^2]}/N_c$
# get pit and pic from PI
pit<-PI$pit
pic<-PI$pic
# sort the dataset so that AE in these dataset are in the same order
aedata<-aedata[order(aedata$AEBODSYS, aedata$AEDECOD), ]
pit<-pit[order(pit$SoC, pit$PT),]
pic<-pic[order(pic$SoC, pic$PT),]
# it is possible that the number in pit/pic is stored as factor
# convert it to numeric
for (i in 3:(dim(pit)[2])){
pit[,i]<-as.numeric(as.character(pit[,i]))
pic[,i]<-as.numeric(as.character(pic[,i]))
}
# remove SOC and PT, left only predicted incidence rate
pit2<-pit[, 3:(dim(pit)[2])]
pic2<-pic[, 3:(dim(pic)[2])]
# calculate the loss contributed by treatment group
loss_t<-sqrt(mean(colSums((aedata$AEt - aedata$Nt*pit2)^2)))/aedata$Nt[1]
# calculate the loss contributed by control group
loss_c<-sqrt(mean(colSums((aedata$AEc - aedata$Nc*pic2)^2)))/aedata$Nc[1]
# get the total loss
loss<-loss_t+loss_c
return (loss)
}
#########################################################################################################
#########################################################################################################
#' @rdname crossvalidation
#'
#' @export
kfdpar<-function (adsl, adae, k){
# the first two parameters of this function are the same as in function preprocess
# k is integer which determine how many folds will the dataset be divided
# this function will randomly divide the subjects into k equal parts
# for each part i, this subjects in this part is assigned to testing dataset
# the rest (k-1) parts are assigned to training dataset
# this function will generate a list with k elements with each element is a also a list
# a list contains two elements, named traindf and testdf
# traindf is used to train the model and testdf is usesd to calcualte the loss
# this result is going to be used for further crossvalidation to calculate loss
# get the AEdata for whole dataset
tempaedata<-preprocess(adsl=adsl, adae=adae)
### only extract columns "USUBJID" and "TRTCTR" from adsl
### USUBJID: Unique Subject Identifier
### TRTCTR: indicator for treatment (TRTCTR=1) or control (TRTCTR=0) group
SL<-adsl[!is.na(adsl$TRTCTR), ]
SL<-subset(adsl, select=c("USUBJID","TRTCTR"))
SL<-unique(SL)
### extract columns "USUBJID", "AEBODSYS", and "AEDECOD"
### USUBJID: Unique Subject Identifier
### AEBODSYS: Body System or Organ Class (SoC)
### AEDECOD: Dictionary-Derived Term (Perferred Term, PT)
aevars<-c("USUBJID","AEBODSYS","AEDECOD")
AE<-subset(adae, select=aevars)
AE<-unique(AE)
### order AE by AEBODSYS and AEDECOD
AE<-AE[with(AE, order(AEBODSYS, AEDECOD)), ]
### merge SL and AE by usubjid
### we will discard the entries in AE that with usubjid not in SL
IS<-intersect(unique(AE$USUBJID), SL$USUBJID)
AE<-AE[AE$USUBJID %in% IS, ]
### add column "TRTCTR" for AE
AE$TRTCTR<-0
for (i in 1:dim(AE)[1]){
ID<-as.character(AE[i, "USUBJID"])
trtcode<-SL[SL$USUBJID %in% ID, "TRTCTR"]
AE[i, "TRTCTR"]<-trtcode
}
# get the the set of unique usubjid and shuffle it
# IDvect<-unique(tempdf$USUBJID)
IDvect<-unique(SL$USUBJID)
IDvect<-sample(IDvect)
# divide IDdf into k parts, if the number of ID is not exactly divisible, put the residual into the last part
IDnum<-length(IDvect)
eachnum<-floor(IDnum/k)
IDlist<-list()
for (i in 1:(k-1)){
Index<-(eachnum*(i-1)+1):(eachnum*i)
IDlist[[i]]<-IDvect[Index]
}
IDlist[[k]]<-IDvect[(eachnum*(k-1)+1):IDnum]
# get the AE data with ID inside each element of IDlist
# and convert into the dataset that can be used for further analysis
AElist<-list()
for (i in 1:k){
# first get the tempAE and tempSL with USUBJID in IDlist[[i]]
tempAE<-AE[AE$USUBJID %in% IDlist[[i]], ]
tempSL<-SL[SL$USUBJID %in% IDlist[[i]], ]
# sum up the number of entries for each combination of AEBODSYS, AEDECOD, and TRTCTR
library(dplyr)
tempTdat <- count(tempAE,AEBODSYS,AEDECOD,TRTCTR)
library(tidyr)
# get the number of entries for each combination of AEBODSYS and AEDECOD for TRTCTR=1 and TRTCTR=0, respectively
tempTdat2 <- spread(tempTdat, TRTCTR, n)
# change the column names for tempTdat2 with TRTCTR=1 be AEc, which means # of AEs in control groups
# and TRTCTR=0 be AEt, which means # of AEs in treatment group
colnames(tempTdat2)[colnames(tempTdat2) == '1'] <- 'AEt'
colnames(tempTdat2)[colnames(tempTdat2) == '0'] <- 'AEc'
# replave NA by zero
tempTdat2[is.na(tempTdat2)] <- 0
# get the total number of subjects in control and treatment group
# regardless of having AE or not
# record the information as Nc and Nt respectively as two columns in dataset
tempTdat2$Nc<-nrow(tempSL[tempSL$TRTCTR==0,])
tempTdat2$Nt<-nrow(tempSL[tempSL$TRTCTR==1,])
# A protential problem is that the number of AE in differnt parts may not be the same,
# and we cannot calculate loss in this case
# thus we need to make sure all AE shown up in Tdat2, and let AEc and AEt be 0
# for AE not shown
# first delete the columns of AEc, AEt, Nc, Nt in tempaedata
parttempaedata<-tempaedata[, c("AEBODSYS", "AEDECOD", "b", "i", "j")]
# merge tempTdat2 and parttempaedata, and keep all the rows in parttempaedata
# this is to make sure that all AEs with shown up the final dataset
Tdat2_tempaedata<-merge(tempTdat2, parttempaedata, all.y = TRUE)
# replace NA in cloumn Nc and Nt with the correct Nc and Nt
Tdat2_tempaedata$Nc<-tempTdat2$Nc[1]
Tdat2_tempaedata$Nt<-tempTdat2$Nt[1]
# replace all NA in AEc and AEt with 0
Tdat2_tempaedata[is.na(Tdat2_tempaedata)]<-0
# Tdat2_tempaedata is the testing dataset
testset<-Tdat2_tempaedata
# sort testset by b and j
testset<-testset[order(testset$b, testset$j),]
# get training dataset by substract AEc, AEt, Nc, Nt of testset from tempaedata
train<-tempaedata
#train[,c("AEc", "AEt", "Nc", "Nt")]<-tempaedata[, c("AEc", "AEt", "Nc", "Nt")]-testset[, c("AEc", "AEt", "Nc", "Nt")]
train[,"AEc"]<-tempaedata[, "AEc"]-testset[, "AEc"]
train[,"AEt"]<-tempaedata[, "AEt"]-testset[, "AEt"]
train[,"Nc"]<-tempaedata[, "Nc"]-testset[, "Nc"]
train[,"Nt"]<-tempaedata[, "Nt"]-testset[, "Nt"]
# put train and testset toghether in a list
train_test<-list(train=train, test=testset)
# add train_test into AElist
AElist[[i]]<-train_test
}
return(AElist)
}
#########################################################################################################
#########################################################################################################
#' @rdname crossvalidation
#'
#' @export
CVhier<-function(AElist, n_burn, n_iter, thin, n_adapt, n_chain,
alpha.gamma=3, beta.gamma=1,
alpha.theta=3, beta.theta=1, mu.gamma.0.0=0, tau.gamma.0.0=0.1, alpha.gamma.0.0=3,
beta.gamma.0.0=1, lambda.alpha=0.1, lambda.beta=0.1, mu.theta.0.0=0, tau.theta.0.0=0.1,
alpha.theta.0.0=3, beta.theta.0.0=1){
# parameter AElist is the output from function kfoldpartition
# the rest parameters are the same as paramters in function Hier_history with the same name
# this function will calculate the train loss and test loss for each partition of the dataset
# and return the sum as the final train and test loss
trainloss<-0
testloss<-0
library(foreach)
library(doParallel)
cores=detectCores()
cl<-makeCluster(cores[1]-1)
registerDoParallel(cl)
LOSSlist<-foreach (i = 1:length(AElist), .combine=rbind) %dopar% {
train<-AElist[[i]]$train
test<-AElist[[i]]$test
library(FlagAE)
# train the model
train_hier<-Hier_history(aedata=train, n_burn=n_burn, n_iter=n_iter, thin=thin, n_adapt=n_adapt, n_chain=n_chain, alpha.gamma=alpha.gamma, beta.gamma=beta.gamma,
alpha.theta=alpha.theta, beta.theta=beta.theta, mu.gamma.0.0=mu.gamma.0.0, tau.gamma.0.0=tau.gamma.0.0,
alpha.gamma.0.0=alpha.gamma.0.0, beta.gamma.0.0=beta.gamma.0.0, lambda.alpha=lambda.alpha,
lambda.beta=lambda.beta, mu.theta.0.0=mu.theta.0.0, tau.theta.0.0=tau.theta.0.0,
alpha.theta.0.0=alpha.theta.0.0, beta.theta.0.0=beta.theta.0.0)
# get pi
train_hiergetpi<-Hiergetpi(aedata=train, hierraw = train_hier)
# train loss and test loss
temp_trainloss<-Lossfun(aedata=train, PI=train_hiergetpi)
temp_testloss<-Lossfun(aedata=test, PI=train_hiergetpi)
c(temp_trainloss, temp_testloss)
}
stopCluster(cl)
return(list(trainloss=mean(LOSSlist[,1]), testloss=mean(LOSSlist[,2])))
}
#########################################################################################################
#########################################################################################################
#' @rdname crossvalidation
#'
#' @export
CVising<-function(AElist, n_burn, n_iter, thin, alpha_=0.25, beta_=0.75, alpha.t=0.25, beta.t=0.75,
alpha.c=0.25, beta.c=0.75, rho, theta){
# parameter AElist is the output from function kfoldpartition
# the rest parameters are the same as paramters in function Ising_history with the same name
# this function will calculate the train loss and test loss for each partition of the dataset
# and return the sum as the final train and test loss
trainloss<-0
testloss<-0
for (i in 1:length(AElist)){
train<-AElist[[i]]$train
test<-AElist[[i]]$test
# train the model
train_ising<-Ising_history(aedata =train, n_burn=n_burn, n_iter=n_iter, thin=thin,
alpha_=alpha_, beta_=beta_, alpha.t=alpha.t, beta.t=beta.t,
alpha.c=alpha.c, beta.c=beta.c, rho=rho, theta=theta)
# get pi
train_isinggetpi<-Isinggetpi(aedata=train, isingraw = train_ising)
# train loss and test loss
temp_trainloss<-Lossfun(aedata=train, PI=train_isinggetpi)
temp_testloss<-Lossfun(aedata=test, PI=train_isinggetpi)
trainloss<-trainloss+temp_trainloss
testloss<-testloss+temp_testloss
}
return(list(trainloss=trainloss/length(AElist), testloss=testloss/length(AElist)))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.