#' Internal function called by \code{\link{runDVboostwrapper}}
#'
#' After model training and testing, \code{metricView} generates summary statistics and figures for users to examine the training performance
#' @param DV.fit.res output generated by \code{\link{fitDVboostmodel}}
#' @param input.sample.ID same sample ID used for annotation procedure
#' @param plot.filename full path to the figure file regarding training performance
#' @param DV.sampleQC.filename full path to the text file that contains metrics for model training performance
#' @param subset a vector of integers indicating a subset of the SVs that all metrics will be evaluated on
#' @return \describe{
#' \item{A figure file}{The figure is saved at \code{plot.filename} and contains 6 subpanels, including \enumerate{\item distribution of known(1) versus novel(0) variants; \item feature importance for model training; \item distribution of Q scores for known and novel variants; \item precision, recell characteristics and best F score; \item ROC curve; \item FDR versus Q score}}
#' \item{QC metrics file}{The .txt file is saved at \code{DV.sampleQC.filename} and contains 6 fields, including \itemize{\item sample.ID: input sample ID \item Fscore_PR: F score for precision&recall \item AUC.PR.known.var: Area under the curve for precision and recall \item AUC.FP.TP.known.var:AUROC \item npos:number of known variants used for training \item nneg:number of novel variants used for training }}
#' }
#'
#' @export
#' @import ggplot2
#' @import ROCR
#' @import gridExtra
#'
metricView <- function(DV.fit.res,input.sample.ID, plot.filename, DV.sampleQC.filename, subset) {
t1 <- as.data.frame(table(DV.fit.res$is.known.variant[subset]))
gp1 <- ggplot(data = t1) + geom_col(aes(x=Var1, y=Freq)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x='1 = known variants; 0 = others', y='Frequency', title=input.sample.ID)
sum.res <- summary.gbm(DV.fit.res, plotit = FALSE, las = 2,
main = "Relative influence of DVboost attributes")
sum.res$var <- factor(sum.res$var,levels = sum.res$var[order(sum.res$rel.inf, decreasing = F)])
gp2 <- ggplot(data = sum.res) + geom_col(aes(x=var, y=rel.inf , fill=rel.inf)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x=NULL, y='Relative influence', title='Relative influence of DVboost attributes') +
coord_flip() + guides(fill=FALSE)
df1 <- data.frame(DVboost.Q.score=DV.fit.res$DVboost.Q.score[subset],
is.known.variant=DV.fit.res$is.known.variant[subset])
gp3 <- ggplot(data=df1) + geom_boxplot(aes(x=as.factor(is.known.variant), y=DVboost.Q.score)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="1 = known variants; 0 = others", y='DVboost q-score', title = paste("boxplots of DVboost Q-scores"))
pred2 <- prediction( (DV.fit.res$DVboost.Q.score[subset]) , DV.fit.res$is.known.variant[subset]==1)
perf3 <- performance(pred2, "prec", "rec")
# perf3@x.name
# [1] "Recall"
# perf3@y.name
# [1] "Precision"
recall.vec <- unlist(perf3@x.values)
precision.vec <- unlist(perf3@y.values)
est.AUC.val <- sum(diff(recall.vec)*precision.vec[2:length(precision.vec)])
df1 <- data.frame(x=recall.vec,
y=precision.vec)
fscore <- 2*recall.vec*precision.vec/(recall.vec + precision.vec)
idx1 <- which.max(fscore)
gp4 <- ggplot(data = df1) + geom_line(aes(x=x, y=y)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x='Recall', y='Precision', title = paste("Max F score =", round(fscore[idx1],3),' at Q value ',round(perf3@alpha.values[[1]][idx1],2))) +
geom_point(aes(x=recall.vec[idx1], y=precision.vec[idx1]), shape=4, size=6,color='black')
perf.FP <- performance(pred2, "fpr", "tpr")
# perf.FP@x.name
# [1] "True positive rate"
# perf.FP@y.name
# [1] "False positive rate"
TP.vec <- unlist(perf.FP@x.values)
FP.vec <- unlist(perf.FP@y.values)
FP10.index<-which.min(abs(FP.vec-0.1))
TP.vec.F10<-TP.vec[FP10.index]
est.AUC.FPTP.val <- sum(diff(FP.vec)*TP.vec[2:length(TP.vec)])
df1 <- data.frame(x=FP.vec,
y=TP.vec)
gp5 <- ggplot(data = df1) + geom_line(aes(x=x, y=y)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x='False positive rate', y='True positive rate', title = paste("AUC =",round(est.AUC.FPTP.val,3)))
perf.FP1 <- performance(pred2, "prec", "rec")
FDR.vec <- 1-unlist(perf.FP1@y.values)
alpha.vec <- unlist(perf.FP1@alpha.values)
df1 <- data.frame(x=alpha.vec,
y=FDR.vec)
gp6 <- ggplot(data = df1) + geom_line(aes(x=x, y=y)) +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x='Q values', y='False discovery rate', title = "Q value v. FDR")
ggsave(plot.filename, marrangeGrob(list(gp1,gp2,gp3,gp4,gp5,gp6), ncol=3, nrow=2, layout_matrix = matrix(seq_len(6), nrow = 2, ncol = 3, byrow = T), top = NULL), height = 7, width = 14)
npos <- length(which(DV.fit.res$is.known.variant[subset]==1))
nneg <- length(subset) - npos
DVboost.sample.res <-
cbind(sample.ID = input.sample.ID,Fscore_PR = fscore[idx1],
AUC.PR.known.var = est.AUC.val, AUC.FP.TP.known.var = est.AUC.FPTP.val, npos=npos, nneg=nneg)
write.table(x = DVboost.sample.res, file = DV.sampleQC.filename,
row.names = FALSE, sep = "\t", quote = FALSE)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.