#' validate
#'
#' This function let's user validate an OncoCast model generated in a new cohort which has it's own
#' survival data.
#'
#' @param OC_object Output of the OncoCast function.
#' @param Results A getResults_OC object that you wish to validate
#' @param in.data New data set containing the information of the incoming patients. Should be a dataframe
#' with patients as rows and features as columns.
#' @param formula the formula to be used for validation
#' @param out.ties phcpe argument to calculate the concordance index. If out.ties is set to FALSE,
#' pairs of observations tied on covariates will be used to calculate the CPE. Otherwise, they will not be used.
#' @param limit Optional numerical argument to set a time limit on the KM plot
#' @param plot.cuts Boolean specifying if lines should be added to risk histogram to indicate where the cuts were performed
#' @param ... futher arguments applicable in ggsurvplot
#' @return data.out : The data frame inputted in the function with an additional column giving the predicted risk
#' score of the incoming patients.
#' @return RiskHist : A histogram of the distribution of the risk scores of patients in the given dataset.
#' @export
#' @examples library(OncoCast)
#' test <- OncoCast(data=survData,formula = Surv(time,status)~.,
#' method = "LASSO",runs = 30,
#' save = FALSE,nonPenCol = NULL,cores =1)
#' results <- getResults_OC(OC_object=test$LASSO,data=survData,
#' cuts=c(0.2,0.4,0.6,0.8),
#' geneList=NULL,mut.data=TRUE)
#' new.data <- as.data.frame(matrix(rbinom(5*20,1,0.5),nrow=20,ncol = 5))
#' colnames(new.data) <- c("ImpCov1","ImpCov2","ImpCov3","ImpCov4","Cov7")
#' rownames(new.data) <- paste0("Incoming",1:20)
#'
#' # add time component
#' coefs <- c(1.2,1.3,-1.5,-1,-1,rep(0,ncol(new.data)-5))
#' mu <- as.vector(coefs %*% t(new.data))
#' n = nrow(row(new.data))
#' time <- rexp(n,exp(mu))*12
#' c <- runif(n,0,as.numeric(quantile(time,0.8)))
#' status <- ifelse(time < c , 1,0)
#' time <-pmin(time,c)
#' new.data$time <- time
#' new.data$status <- status
#'
#' validation <- validate(OC_object=test$LASSO,Results=results,
#' in.data=new.data, formula=Surv(time,status)~.)
#' @import
#' magrittr
#' dtplyr
#' ggplot2
#' survminer
#' reshape2
#' scales
#' pheatmap
#' @importFrom plotly plot_ly layout toRGB add_ribbons
#' @importFrom dplyr select filter mutate group_by rename summarise arrange
validate <- function(OC_object,Results,in.data,formula, out.ties=FALSE,limit = NULL,plot.cuts = TRUE,...){
args <- list(...)
surv.median.line <- ifelse(is.null(args[['surv.median.line']]),"hv",args[['surv.median.line']])
risk.table <- ifelse(is.null(args[['risk.table']]),T,args[['risk.table']])
x.start <- ifelse(is.null(args[['x.start']]),0,args[['x.start']])
break.time <- ifelse(is.null(args[['break.time']]),6,args[['break.time']])
palette.print <- args[['palette.print']]
OC_object <- Filter(Negate(is.null), OC_object)
means.train <- sapply(OC_object,"[[","means")
qts = Results$rawCuts
ori.risk <- as.numeric(Results$risk.raw)
if(OC_object[[1]]$method %in% c("LASSO","RIDGE","ENET")){
LassoFits <- as.matrix(Results$Fits)
features <- colnames(LassoFits)
dums <- apply(in.data,2,function(x){anyNA(as.numeric(as.character(x)))})
if(sum(dums) > 0){
tmp <- in.data %>%
select(which(dums)) %>%
fastDummies::dummy_cols(remove_first_dummy = T) %>%
select(-one_of(names(which(dums))))
in.data <- as.data.frame(cbind(
in.data %>% select(-one_of(names(which(dums)))),
tmp
) %>% mutate_all(as.character) %>%
mutate_all(as.numeric)
)
warning("Character variables were transformed to dummy numeric variables. If you didn't have any character variables make sure all columns in your input data are numeric. The transformed data will be saved as part of the output.")
}
if(!all(is.na(match(colnames(in.data),features)))){
matched.genes <- c(na.omit(match(colnames(in.data),features)))
new.dat <- in.data[,which(!is.na(match(colnames(in.data),features)))]
## ADD ALL MISSING GENES TO BE ALL zero ##
missing <- features[which(is.na(match(features,colnames(new.dat))))]
to.add <- as.data.frame(matrix(0L,nrow=nrow(new.dat),ncol=length(missing)))
colnames(to.add) <- missing
rownames(to.add) <- rownames(new.dat)
new.dat <- as.data.frame(cbind(new.dat,to.add))
new.dat <- new.dat[,match(features,colnames(new.dat))]
#############################################
all.pred <- lapply(1:nrow(LassoFits),function(x){
### Subset to the coefs of that cv ###
coefs <- LassoFits[x,LassoFits[x,] != 0]
new.temp <- select(new.dat,names(coefs))
## substract mean mutation rate of TRAINING SET !!!###
new.x <- new.temp - rep(means.train[[x]][match(names(coefs),names(means.train[[x]]))], each = nrow(new.temp))
cal.risk.test <- drop(as.matrix(new.x) %*% coefs)
return(cal.risk.test)
})
}
else{
stop("No gene in your dataset overlapped with the original data. Please rename genes or check your dataset.")
}
}
if(OC_object[[1]]$method %in% c("GBM","RF","SVM","NN")){
if(OC_object[[1]]$method == "GBM") features <- OC_object[[1]]$GBM$var.names
if(OC_object[[1]]$method == "RF") features <- OC_object[[1]]$RF$forest$independent.variable.names
if(OC_object[[1]]$method == "SVM") features <- names(OC_object[[1]]$Vars)
if(OC_object[[1]]$method == "NN") features <- names(OC_object[[1]]$Vars)
dums <- apply(in.data,2,function(x){anyNA(as.numeric(as.character(x)))})
if(sum(dums) > 0){
tmp <- in.data %>%
select(which(dums)) %>%
fastDummies::dummy_cols(remove_first_dummy = T) %>%
select(-one_of(names(which(dums))))
in.data <- as.data.frame(cbind(
in.data %>% select(-one_of(names(which(dums)))),
tmp
) %>% mutate_all(as.character) %>%
mutate_all(as.numeric)
)
warning("Character variables were transformed to dummy numeric variables. If you didn't have any character variables make sure all columns in your input data are numeric. The transformed data will be saved as part of the output.")
}
if(!all(is.na(match(colnames(in.data),features)))){
matched.genes <- c(na.omit(match(colnames(in.data),features)))
new.dat <- in.data[,which(!is.na(match(colnames(in.data),features)))]
## ADD ALL MISSING GENES TO BE ALL zero ##
missing <- features[which(is.na(match(features,colnames(new.dat))))]
to.add <- as.data.frame(matrix(0L,nrow=nrow(new.dat),ncol=length(missing)))
colnames(to.add) <- missing
rownames(to.add) <- rownames(new.dat)
new.dat <- as.data.frame(cbind(new.dat,to.add))
new.dat <- new.dat[,match(features,colnames(new.dat))]
if(OC_object[[1]]$method == "GBM") {
all.pred <- lapply(OC_object,function(x){
predict(x$GBM,newdata=new.dat,
n.trees = x$bestTreeForPrediction,
type="response")
})}
if(OC_object[[1]]$method == "RF") {
all.pred <- lapply(OC_object,function(x){
predict(x$RF,new.dat)$predictions
})}
if(OC_object[[1]]$method == "SVM") {
all.pred <- lapply(OC_object,function(x){
predict(x$SVM,new.dat)
})}
if(OC_object[[1]]$method == "NN") {
all.pred <- lapply(OC_object,function(x){
predict(x$NN,new.dat)
})}
}
else{
stop("No gene in your dataset overlapped with the original data. Please rename genes or check your dataset.")
}
}
# Get CPE of validation #
CPE <- unlist(lapply(all.pred, function(pred_risk){
temp_formula <- formula(paste0(formula[[2]][1],"(", paste0(formula[[2]][-1],collapse = ","),")","~ pred_risk" ))
paste0(formula[[2]], "~ pred_risk")
CPE <- as.numeric(phcpe(coxph(temp_formula, data = in.data),out.ties=out.ties ))
}))
### PUTTING THINGS TOGETHER ####
all.pred <- do.call("cbind",all.pred)
Risk <- apply(all.pred,1,mean)
names(Risk) <- rownames(new.dat)
# Risk.all <- as.matrix(coefs) %*% as.matrix(t(new.dat))
# Risk <- apply(Risk.all,2,mean)
#in.data$Risk <- Risk
##########################################
ori.risk.range <- range(ori.risk)
in.data$OncoCastRiskScore <- rescale(Risk, to = c(0, 10), from = ori.risk.range) #WithOriginal
#in.data$rescaledRisk <- rescale(in.data$Risk, to = c(0, 10), from = range(in.data$Risk, na.rm = TRUE, finite = TRUE))
RiskHistogram.new <- ggplot(in.data, aes(x = OncoCastRiskScore, y = ..density..)) +
geom_histogram(show.legend = FALSE, aes(fill=..x..),
breaks=seq(min(in.data$OncoCastRiskScore,na.rm = T), max(in.data$OncoCastRiskScore,na.rm = T), by=20/nrow(in.data))) +
geom_density(show.legend = FALSE) +
theme_minimal() +
labs(x = "Average risk score", y = "Density") +
scale_fill_gradient(high = "red", low = "green")
#return(list("RiskHistogram.new"=RiskHistogram.new,"out.data"=in.data))
#### NEED TO REMAKE CUTS BASED ON THE ORIGINAL SCORE ####
#### deal with surv formula #####
survFormula <- as.formula(formula)
survResponse <- survFormula[[2]]
### reprocess data
if(length(as.list(survResponse)) == 3){
colnames(in.data)[match(as.list(survResponse)[2:3],colnames(in.data))] <- c("time","status")
LT = FALSE
}
if(length(as.list(survResponse)) == 4){
colnames(in.data)[match(as.list(survResponse)[2:4],colnames(in.data))] <- c("time1","time2","status")
LT = TRUE
}
riskGroup <- c()
numGroups <- length(qts) + 1
for(i in 1:length(Risk)){
if(numGroups == 2){
if(Risk[i] > qts) riskGroup[i] <- 2
else riskGroup[i] <- 1
}
if(numGroups >2){
temp <- Risk[i] - qts
ind1 <- which(temp>0)
ind2 <- which(temp<0)
if(length(ind1)>0 && length(ind2)>0) {riskGroup[i] <- ind2[1]}
else if (length(ind1)==0){riskGroup[i] <- 1}
else if (length(ind2)==0){riskGroup[i] <- numGroups}
}
}
# if(numGroups == 2){riskGroup[is.na(riskGroup)] <- 1}
in.data$RiskGroup <- riskGroup
in.data$RiskGroup <- factor(in.data$RiskGroup, levels = c(1:length(unique(in.data$RiskGroup))) )
numGroups <- length(levels(in.data$RiskGroup))
if(LT == TRUE) {
fit0 <- coxph(Surv(time1,time2,status) ~ RiskGroup,data=in.data,
na.action=na.exclude)
if(max(in.data$time2) > 1000){
timeType = "Days"
intercept = 5*365}
else{
timeType = "Months"
intercept = 5*12}
}
if(LT == FALSE) {
fit0 <- coxph(Surv(time,status) ~ RiskGroup,data=in.data,
na.action=na.exclude)
if(max(in.data$time) > 1000){
timeType = "Days"
intercept = 5*365}
else{
timeType = "Months"
intercept = 5*12}
}
log.test.pval <- as.vector(summary(fit0)[10][[1]])[3]
CI <- as.numeric(as.vector(summary(fit0)[14])[[1]][1])
if(is.null(limit)) {
if(LT) limit <- as.numeric(max(in.data$time2))
if(!LT) limit <- as.numeric(max(in.data$time))
}
# if(LT) {KM <- ggsurvplot(survfit(Surv(time1,time2,status) ~ RiskGroup,data=in.data, conf.type = "log-log"),conf.int = TRUE,surv.median.line = surv.median.line,
# data = in.data,break.time.by = 6,xlim=c(x.start,limit),risk.table = risk.table) + xlab("Time") +
# labs(title = paste("Kaplan Meier Plot (p-value : " ,round(log.test.pval,digits =5),")",sep=""))} #,xlim=c(0,limit)
# if(!LT){KM <- ggsurvplot(survfit(Surv(time,status) ~ RiskGroup,data=in.data, conf.type = "log-log"),conf.int = TRUE,surv.median.line = surv.median.line,
# data = in.data,break.time.by = 6,xlim=c(x.start,limit),risk.table = risk.table) + xlab("Time") +
# labs(title = paste("Kaplan Meier Plot (p-value : " ,round(log.test.pval,digits =5),")",sep=""))}
if(LT) {
if(is.null(palette.print))
KM <- ggsurvplot(survfit(Surv(time1,time2,status) ~ RiskGroup,data=in.data, conf.type = "log-log"),conf.int = TRUE,
surv.median.line = surv.median.line, risk.table = risk.table,
data = in.data,xlim=c(x.start,limit),break.time.by = break.time) + xlab(paste0("Time (",timeType,")")) +
labs(title = paste("Kaplan Meier Plot (p-value : " ,round(log.test.pval,digits =5),")",sep=""))
else
KM <- ggsurvplot(survfit(Surv(time1,time2,status) ~ RiskGroup,data=in.data, conf.type = "log-log"),conf.int = TRUE,
surv.median.line = surv.median.line, risk.table = risk.table,
data = in.data,xlim=c(x.start,limit),break.time.by = break.time, palette = palette.print) + xlab(paste0("Time (",timeType,")")) +
labs(title = paste("Kaplan Meier Plot (p-value : " ,round(log.test.pval,digits =5),")",sep=""))
}
if(!LT){
if(is.null(palette.print))
KM <- ggsurvplot(survfit(Surv(time,status) ~ RiskGroup,data=in.data, conf.type = "log-log"),conf.int = TRUE,
surv.median.line = surv.median.line, risk.table = risk.table,
data = in.data,xlim=c(x.start,limit),break.time.by = break.time) + xlab(paste0("Time (",timeType,")")) +
labs(title = paste("Kaplan Meier Plot (p-value : " ,round(log.test.pval,digits =5),")",sep=""))
else
KM <- ggsurvplot(survfit(Surv(time,status) ~ RiskGroup,data=in.data, conf.type = "log-log"),conf.int = TRUE,
surv.median.line = surv.median.line, risk.table = risk.table,
data = in.data,xlim=c(x.start,limit),break.time.by = break.time, palette = palette.print) + xlab(paste0("Time (",timeType,")")) +
labs(title = paste("Kaplan Meier Plot (p-value : " ,round(log.test.pval,digits =5),")",sep=""))
}
survivalGroup <- as.data.frame(matrix(nrow=numGroups,ncol=4))
rownames(survivalGroup) <- paste0("riskGroup ",1:numGroups)
colnames(survivalGroup) <- c("MedianOS","95%CI","1Ysurvival","3Ysurvival")
# for each group find closest value to median
if(timeType == "Months"){YR1 <- 1*12;YR3 <- 3*12}
if(timeType == "Days"){YR1 <- 1*365;YR3 <- 3*365}
for(i in 1:numGroups){
if(LT == TRUE){NewObject <- with(in.data[in.data$RiskGroup == i,],Surv(time1,time2,status))}
if(LT == FALSE){NewObject <- with(in.data[in.data$RiskGroup == i,],Surv(time,status))}
Fit <- survfit(NewObject ~ 1,data=in.data[in.data$RiskGroup == i,], conf.type = "log-log")
# med.index <- which.min(abs(Fit$surv-0.5))
YR1.index <- which.min(abs(Fit$time-YR1))
YR3.index <- which.min(abs(Fit$time-YR3))
survivalGroup[i,] <- c(as.numeric(round(summary(Fit)$table[7],digits=2)),
paste0("(",as.numeric(round(summary(Fit)$table[8],digits=2)),",",
as.numeric(round(summary(Fit)$table[9],digits=2)),")"),
paste0(round(Fit$surv[YR1.index],digits=2)," (",
round(Fit$lower[YR1.index],digits=2),",",
round(Fit$upper[YR1.index],digits=2),")"),
paste0(round(Fit$surv[YR3.index],digits=2)," (",
round(Fit$lower[YR3.index],digits=2),",",
round(Fit$upper[YR3.index],digits=2),")"))
}
if(LT) fit <- coxph(Surv(time1,time2, status)~ as.factor(RiskGroup), data = in.data)
else fit <- coxph(Surv(time, status)~ as.factor(RiskGroup), data = in.data)
survivalGroup$HazardRatio <- ""
survivalGroup$HazardRatio[2:nrow(survivalGroup)] <- round(summary(fit)$coefficients[,2],digits = 3)
if(plot.cuts){
RiskHistogram.new <- RiskHistogram.new +
geom_vline(xintercept = as.numeric(quantile(Results$scaled.risk, Results$cuts)),
color = "blue", linetype = "dashed")
}
return(list("CPE"=CPE,"RiskHistogram.new"=RiskHistogram.new,
"out.data"=in.data,"KM"=KM,"survTable"=survivalGroup))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.