#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' To be called at the beginning of a pipeline run.
#'
#' @param note A string describing the pipeline run. This will be saved in the
#' pipeline run log.
#' @param randomSeed A number to be used as the random seed in pipeline function
#' calls.
#' @param log TRUE iff the pipeline log should be saved to file.
#' @param logFolderPath if log is TRUE, the log file will be saved in the folder
#' specified by logFolderPath.
#' @return A list containing pipeline run information. Many of the pipeline
#' functions require this list to be passed in as an argument.
#' @export
beginPipelineRun <- function(note, randomSeed = 42, log = TRUE, logFolderPath) {
.Deprecated()
pipelineRunInformation <- list()
pipelineRunInformation[["note"]] <- note
pipelineRunInformation[["log"]] <- log
pipelineRunInformation[["log folder path"]] <- logFolderPath
pipelineRunInformation[["random seed"]] <- randomSeed
pipelineRunInformation[["start timestamp"]] <- format(Sys.time(),
"%Y_%m_%d_%H_%M_%S")
if (log) {
write(paste0(pipelineRunInformation[["start timestamp"]], ": ", note),
paste0(logFolderPath, "run_notes.txt"), append = TRUE)
tic(paste0("Finished pipeline run. Log files have been saved in ",
logFolderPath))
} else {
tic("Finished pipeline run. Log files have not been saved")
}
print("Starting pipeline run.")
return(pipelineRunInformation)
}
#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' To be called at the end of a pipeline run. Saves the pipelineRunInformation
#' to a file. Saves the pipeline timing logs to a file.
#'
#' @param pipelineRunInformation The list of pipeline run information entries
#' (originally output by \code{beginPipelineRun}).
#' @export
endPipelineRun <- function(pipelineRunInformation) {
.Deprecated()
if (pipelineRunInformation[["log"]]) {
# Export pipeline run information list
saveRDS(pipelineRunInformation,
paste0(pipelineRunInformation[["log folder path"]],
"pipeline_run_information_",
pipelineRunInformation[["start timestamp"]],
".rds"))
}
# toc for whole pipeline run tictoc
toc(log = TRUE)
# Write timing logs to file
if (pipelineRunInformation[["log"]]) {
lapply(tic.log(), write, paste0(pipelineRunInformation[["log folder path"]],
"pipeline_tictoc_",
pipelineRunInformation[["start timestamp"]],
".txt"),
append = TRUE,
ncolumns = 1000)
}
}
#' Load data from a file. Currently supports .rds and .csv files
#'
#' @param dataFilepath The filepath of the data to be loaded. Currently supports
#' '.rds' and '.csv' endings
#' @return The loaded data object
#' @export
loadData <- function(dataFilepath) {
tic(paste0("Load data from ", dataFilepath))
if (endsWith(tolower(dataFilepath), ".rds")) {
loadedData <- readRDS(dataFilepath)
} else if (endsWith(tolower(dataFilepath), ".csv")) {
loadedData <- read.csv(dataFilepath)
} else {
print("File format suffix not recognised.")
loadedData <- NULL
}
toc(log = TRUE)
return(loadedData)
}
#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' Fits a penalised logistic regression model with cross-validation.
#'
#' @param xs A matrix/data.frame corresponding to X variables. Columns should
#' correspond to features.
#' @param y A matrix/data.frame with a column named 'Event' corresponding to a
#' binary label.
#' @param penalty \code{'lasso'}/\code{'elnet'}/\code{'ridge'} A string
#' specifying the penalty.
#' @param weights A vector of weights corresponding to the training data rows to
#' be used in model fitting.
#' @param nFoldsCV The number of folds to be used in cross-validation (default
#' is 3)
#' @param foldID A vector of numbers specifying a cross-validation fold. Each
#' entry should correspond to a row in \code{xs}.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}
#' @param modelLabel A string that will act as a label for this function call.
#' The label will be used as a prefix in the 'penalty' and 'number of folds'
#' entries in pipeline.run.information. It will also be used as a prefix for
#' the saved object filenames in this function call.
#' @param prescaled A boolean specifying if the columns in \code{xs} have been
#' scaled before the call to fitLogisticModel e.g. using \code{scale()}. If
#' set to TRUE, this sets \code{standardize = FALSE} in the internal call to
#' \code{cv.glmnet}
#' @param typeMeasure A string specifying the argument to type.measure in
#' \code{cv.glmnet}. Defaults to \code{'default'}.
#' @param penaltyFactor A vector of numbers indicating variable-specific weights
#' for the lambda penalty.
#' @return The glmnet logistic model object.
#' @export
fitLogisticModel <- function(xs,
y,
penalty,
weights,
nFoldsCV = 3,
foldID = NULL,
pipelineRunInformation,
modelLabel,
prescaled = FALSE,
typeMeasure = "default",
penaltyFactor = NULL) {
.Deprecated()
tic("Fit logistic regression model")
checkMatrixOrDF(xs)
pipelineRunInformation[[paste0(modelLabel, " penalty")]] <- penalty
pipelineRunInformation[[paste0(modelLabel, " number of folds")]] <- nFoldsCV
model <- fitModelCVFunctions[["logistic"]](xs,
y[, "Event"],
penalty = penalty,
weights = weights,
nFoldsCV,
pipelineRunInformation[["random seed"]],
foldID,
prescaled = prescaled,
typeMeasure = typeMeasure,
penaltyFactor = penaltyFactor)
toc(log = TRUE)
if (pipelineRunInformation[["log"]]) {
tic(paste0("Save logistic ",
penalty,
" regression model. Label: ",
modelLabel))
saveRDS(model, paste0(pipelineRunInformation[["log folder path"]],
modelLabel, "_logistic_model_fit_",
pipelineRunInformation[["start timestamp"]],
".rds"))
toc(log = TRUE)
}
return(model)
}
#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' Outputs predictions given a logistic model, data and labels.
#'
#' @param model A glmnet logistic regression model.
#' @param xs A matrix/data.frame corresponding to X variables. Columns should
#' correspond to features.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}
#' @param modelLabel A string that will act as a label for this function call.
#' The label will be used as a prefix for the saved object file names in this
#' function call.
#' @param predictType A string specifying the prediction type (\code{'class'} or
#' \code{'response'}).
#' @return A vector of model predictions.
#' @export
predictLogisticModel <- function(model,
xs,
pipelineRunInformation,
modelLabel,
predictType = "class") {
.Deprecated()
tic("Predict using fit logistic regression model")
checkMatrixOrDF(xs)
modelPredictions <- predictModelCVFunctions[["logistic"]](model,
xs,
predictType)
toc(log = TRUE)
if (pipelineRunInformation[["log"]]) {
tic("Save model predictions")
saveRDS(modelPredictions,
paste0(pipelineRunInformation[["log folder path"]],
modelLabel, "_logistic_model_predictions_",
pipelineRunInformation[["start timestamp"]],
".rds"))
toc(log = TRUE)
}
return(modelPredictions)
}
#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' Given model predictions and labels, calculates binary classification
#' performance metrics
#'
#' @param modelPredictions A vector of model predictions.
#' @param y A matrix/data.frame with a column named 'Event' corresponding to a
#' binary label.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}.
#' @param modelLabel A string that will act as a label for this function call.
#' The label will be used as a prefix for the saved object filenames in this
#' function call.
#' @return Object containing model metrics.
#' @export
calculateMetricsBinaryClassifier <- function(modelPredictions,
y,
pipelineRunInformation,
modelLabel) {
.Deprecated()
tic("Calculate logistic regression model metrics")
modelMetrics <- as.matrix(confusionMatrix(as.factor(modelPredictions),
as.factor(y[, "Event"]),
positive="1"),
what = "classes")
pipelineRunInformation[[paste0(modelLabel, " model metrics")]] <- modelMetrics
toc(log = TRUE)
if (pipelineRunInformation[["log"]]) {
tic("Save logistic regression model metrics")
write.csv(modelMetrics,
paste0(pipelineRunInformation[["log folder path"]],
modelLabel,
"_logistic_model_metrics_",
pipelineRunInformation[["start timestamp"]],
".csv"))
toc(log = TRUE)
}
return(modelMetrics)
}
#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' Fit glmnet penalised cox model on training data.
#'
#' @param xs A matrix/data.frame corresponding to X variables. Columns should
#' correspond to features.
#' @param target A matrix/data.frame with columns named 'Event' and
#' 'time_to_event'. 'Event' should correspond to a binary label.
#' 'time_to_event' should correspond to a positive numerical value
#' representing the survival time (when Event == 1) or the time-to-censoring
#' (when Event == 0)
#' @param penalty \code{'lasso'}/\code{'elnet'}/\code{'ridge'} A string
#' specifying the penalty.
#' @param weights A vector of weights corresponding to the training data rows to
#' be used in model fitting.
#' @param nFoldsCV The number of folds to be used in cross-validation (default
#' is 3)
#' @param foldID A vector of numbers specifying a cross-validation fold. Each
#' entry should correspond to a row in \code{xs}.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}.
#' @param modelLabel A string that will act as a label for this function call.
#' The label will be used as a prefix for the saved object filenames in this
#' function call.
#' @param prescaled A boolean specifying if the columns in \code{xs} have been
#' scaled before the call to \code{\link{fitLogisticModel}} e.g. using
#' \code{scale()}. If set to TRUE, this sets \code{standardize = FALSE} in the
#' internal call to \code{cv.glmnet}
#' @param penaltyFactor A vector of numbers indicating variable-specific weights
#' for the lambda penalty.
#' @return The fit penalised cox model glmnet object.
#' @export
fitCoxModel <- function(xs,
target,
penalty,
weights,
nFoldsCV = 3,
foldID = FALSE,
pipelineRunInformation,
modelLabel,
prescaled = FALSE,
penaltyFactor = NULL) {
.Deprecated()
tic(paste0("Fit Cox ", penalty, " regression model. Label: ", modelLabel))
checkMatrixOrDF(xs)
pipelineRunInformation[[paste0(modelLabel, " penalty")]] <- penalty
pipelineRunInformation[[paste0(modelLabel, " number of folds")]] <- nFoldsCV
model <- fitModelCVFunctions[["cox"]](xs,
Surv(target[, "time_to_event"],
target[, "Event"]),
penalty = penalty,
weights = weights,
nFoldsCV,
pipelineRunInformation[["random seed"]],
foldID,
prescaled = prescaled,
penaltyFactor = penaltyFactor)
toc(log = TRUE)
if (pipelineRunInformation[["log"]]) {
tic(paste0("Save Cox regression model. Label: ", modelLabel))
saveRDS(model,
paste0(pipelineRunInformation[["log folder path"]],
modelLabel,
"_cox_model_fit_",
pipelineRunInformation[["start timestamp"]],
".rds"))
toc(log = TRUE)
}
return(model)
}
#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' Calculate linear model predictions using glmnet Cox model.
#'
#' @param model A Cox glmnet model object.
#' @param xs A matrix/data.frame corresponding to X (variables). Columns should
#' correspond to features.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}.
#' @param modelLabel A string that will act as a label for this function call.
#' The label will be used as a prefix for the saved object file names in this
#' function call.
#' @return A vector of model predictions.
#' @export
predictCoxModel <- function(model, xs, pipelineRunInformation, modelLabel) {
.Deprecated()
tic(paste0("Predict using Cox regression model. Label: ", modelLabel))
checkMatrixOrDF(xs)
modelPredictions <- predictModelCVFunctions[["cox"]](model, xs)
toc(log = TRUE)
if (pipelineRunInformation[["log"]]) {
tic(paste0("Save model predictions. Label: ", modelLabel))
saveRDS(modelPredictions,
paste0(pipelineRunInformation[["log folder path"]],
modelLabel, "_cox_model_predictions_",
pipelineRunInformation[["start timestamp"]],
".rds"))
toc(log = TRUE)
}
return(modelPredictions)
}
#' This function is for an older version of MethylPipeR and is now deprecated.
#' It is currently still included to allow old scripts to run correctly.
#' Retrieve non-zero coefficients from a glmnet model
#'
#' @param model A glmnet model object.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}.
#' @param modelLabel A string that will act as a label for this function call.
#' The label will be used as a prefix for the saved object filenames in this
#' function call.
#' @return A data.frame containing the non-zero model coefficients.
#' @export
getNonZeroCoefficients <- function(model, pipelineRunInformation, modelLabel) {
.Deprecated()
tic("Calculate non-zero coefficients")
modelCoefficients <- coef(model, s = "lambda.min")
nonZeroModelCoefficients <- as.data.frame(modelCoefficients[which(modelCoefficients[, 1] != 0), ])
toc(log = TRUE)
if (pipelineRunInformation[["log"]]) {
tic(paste0("Save non-zero coefficients. Label: ", modelLabel))
write.csv(nonZeroModelCoefficients,
paste0(pipelineRunInformation[["log folder path"]],
modelLabel,
"_model_non_zero_model_coefficients_",
pipelineRunInformation[["start timestamp"]],
".csv"))
toc(log = TRUE)
}
return(nonZeroModelCoefficients)
}
#' Fit coxph model with a predictor variable and a set of covariates.
#'
#' @param modelPredictions A vector of model predictions.
#' @param targetTable A matrix/data.frame with columns named 'Event' and
#' 'time_to_event'. 'Event' should correspond to a binary label.
#' 'time_to_event' should correspond to a positive numerical value
#' representing the survival time (when Event == 1) or the time-to-censoring
#' (when Event == 0)
#' @param covariatesTable A table that includes columns corresponding to
#' covariates to be included in the model.
#' @param covariateColnames A named list with names corresponding to the
#' covariate column names to be used in the model. The values in the list
#' should correspond to the actual column name in \code{covariatesTable}
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}.
#' @param modelLabel A string that will act as a label for this function call.
#' The label will be used as a prefix for the saved object filenames in this
#' function call.
#' @return A named list consisting of the following elements:
#' \code{'coxPHModel'} is the fit coxph model object (from the \code{survival}
#' library). \code{'coxPHModelSummary'} is the result from calling
#' \code{summary()} on \code{coxPHModel}. \code{'pseudoR2'} is the pseudo
#' r-squared value for the model. \code{'hazardRatio'} is the hazard ratio for
#' the predictor variable.
fitCoxPH <- function(modelPredictions,
targetTable,
covariatesTable,
covariateColnames,
pipelineRunInformation,
modelLabel) {
tic(paste0("Fit coxph. Model label: ", modelLabel))
coxPHData <- cbind.data.frame(time_to_event = targetTable[, "time_to_event"],
Event = targetTable[, "Event"],
modelPredictions = modelPredictions)
for (coxPHDataColname in names(covariateColnames)) {
coxPHData[, coxPHDataColname] <- covariatesTable[, covariateColnames[[coxPHDataColname]]]
}
coxPHModel <- coxph(as.formula(paste0("Surv(time_to_event, Event) ~ scale(modelPredictions)+",
paste(names(covariateColnames),
collapse = "+"))),
coxPHData)
coxPHModelSummary <- summary(coxPHModel)
pseudoR2 <- coxPHModelSummary$rsq[[1]]
hazardRatio <- coxPHModelSummary$conf.int[, -2] %>%
cbind(pval = coxPHModelSummary$coefficients[, 5]) %>%
head(1)
toc(log = TRUE)
if (pipelineRunInformation[["log"]]) {
tic("Save coxph objects")
saveRDS(coxPHModelSummary,
paste0(pipelineRunInformation[["log folder path"]],
modelLabel, "_coxph_model_summary_",
pipelineRunInformation[["start timestamp"]],
".rds"))
toc(log = TRUE)
}
return(list(coxPHModel = coxPHModel,
coxPHModelSummary = coxPHModelSummary,
pseudoR2 = pseudoR2,
hazardRatio = hazardRatio))
}
#' Assigns each row in the input data a fold number randomly while ensuring the
#' same number of cases/controls are assigned to each fold.
#'
#' @param data A matrix/data.frame with a column named <class.colname>
#' corresponding to a binary indicator of case/control.
#' @param classColname The name of the column corresponding to a binary
#' indicator of case/control.
#' @param nFolds The number of folds to be assigned.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}.
#' @return A vector with elements corresponding to rows in data. Each element is
#' a number between 1-\code{nFolds} and represents which cross-validation fold
#' the row is assigned to.
#' @export
assignTrainingFolds <- function(data,
classColname,
nFolds,
pipelineRunInformation) {
tic("Assigning balanced training folds")
# For cases and controls: shuffle indexes and assign an integer 1:nFolds on a
# round robin basis
set.seed(pipelineRunInformation[["random seed"]])
# Save original row names to reorder foldIDs at the end.
originalRowNames <- row.names(data)
caseRowNames <- row.names(data[data[, classColname] == 1, ])
caseRowNames <- sample(caseRowNames)
caseFoldIDs <- 1:length(caseRowNames) %% nFolds + 1
names(caseFoldIDs) <- caseRowNames
controlRowNames <- row.names(data[data[, classColname] == 0, ])
controlRowNames <- sample(controlRowNames)
controlFoldIDs <- 1:length(controlRowNames) %% nFolds + 1
names(controlFoldIDs) <- controlRowNames
foldIDs <- c(caseFoldIDs, controlFoldIDs)
# Reorder according to original row names
foldIDs <- foldIDs[originalRowNames]
# foldids <- sapply(1:length(foldIDs), function(num) {foldIDs[[toString(num)]]})
toc(log = TRUE)
return(foldIDs)
}
#' removeDeathsFromControls
#'
#' @param targetToFilter A matrix or data.frame with a binary 1/0 column named
#' 'dead' and a binary column named 'Event'
#' @param objectsToFilter A list of matrix or data.frame objects each with the
#' same number of rows as targetToFilter. These objects will also have the
#' same rows removed as targetToFilter.
#'
#' @return A list with elements \code{targetFiltered} (a matrix/data.frame) and
#' \code{objectsToFilter} (a list of matrix/data.frames) corresponding to
#' \code{targetToFilter} and \code{objectsToFilter} with rows corresponding to
#' deaths in controls (dead == 1, Event == 0) removed
#' @export
removeDeathsFromControls <- function(targetToFilter, objectsToFilter) {
tic(paste0("Remove deaths from controls"))
deathControlIndexes <- (targetToFilter[, "dead"] == 1) & (targetToFilter[, "Event"] == 0)
rowsToKeep <- !deathControlIndexes
targetToFilter <- targetToFilter[rowsToKeep, ]
objectsToFilter <- lapply(objectsToFilter, function(objectToFilter) {
if (is.data.frame(objectToFilter) || is.matrix(objectToFilter)) {
objectToFilter[rowsToKeep, ]
} else if (is.vector(objectToFilter)) {
objectToFilter[rowsToKeep]
} else {
stop("objectsToFilter not a data.frame, matrix or vector\n")
}
})
toc(log = TRUE)
gc()
return(list(targetFiltered = targetToFilter,
objectsFiltered = objectsToFilter))
}
removeByThreshold <- function(targetToFilter,
objectsToFilter,
variableName,
threshold,
lessThan = TRUE) {
comparisonString <- ifelse(lessThan, "less than", "greater than or equal to")
tic(paste0("Remove rows with ",
variableName,
" ",
comparisonString,
" ",
threshold))
rowsToKeep <- targetToFilter[, variableName] < threshold
if (lessThan) {
# If less.than is TRUE, keep the inverse selection
rowsToKeep <- !rowsToKeep
}
targetToFilter <- targetToFilter[rowsToKeep, ]
objectsToFilter <- lapply(objectsToFilter, function(objectToFilter) {
if (is.data.frame(objectToFilter) || is.matrix(objectToFilter)) {
objectToFilter[rowsToKeep, ]
} else if (is.vector(objectToFilter)) {
objectToFilter[rowsToKeep]
} else {
stop("objectToFilter not data.frame, matrix or vector\n")
}
})
toc(log = TRUE)
gc()
return(list(targetFiltered = targetToFilter, objectsFiltered = objectsToFilter))
}
#' thresholdTTE
#'
#' @param targetToFilter A matrix or data.frame including a numerical column
#' named 'time_to_event' and a binary 1/0 column named 'Event'. eventColname
#' and/or tteColname must be specified if either or both of these columns do
#' not have the default names.
#' @param objectsToFilter A list of matrix/data.frame objects each with the same
#' number of rows as targetToFilter. These will have the same rows (according
#' to index) removed as targetToFilter
#' @param threshold The time-to-event threshold which will be used.
#' @param eventColname The name for the event column. Must be specified if not
#' 'Event'.
#' @param tteColname The name for the time-to-event column. Must be specified if
#' not 'time_to_event'
#'
#' @return A list with elements \code{targetFiltered} and \code{objectsFiltered}
#' corresponding to \code{targetToFilter} and \code{objectsToFilter} after
#' thresholding. Thresholding involves the following: Rows with Event == 1
#' (cases) and time_to_event > threshold will be converted to Event == 0
#' (controls). Rows with Event == 0 and time_to_event <= threshold will be
#' removed.
#' @export
thresholdTTE <- function(targetToFilter,
objectsToFilter,
threshold,
eventColname = "Event",
tteColname = "time_to_event") {
casesIndex <- targetToFilter[, eventColname] == 1
controlsIndex <- targetToFilter[, eventColname] == 0
lessThanOrEqualToThresholdIndex <- targetToFilter[, tteColname] <= threshold
greaterThanThresholdIndex <- targetToFilter[, tteColname] > threshold
casesKeptIndex <- casesIndex & lessThanOrEqualToThresholdIndex
casesToControlsIndex <- casesIndex & greaterThanThresholdIndex
controlsKeptIndex <- controlsIndex & greaterThanThresholdIndex
controlsRemovedIndex <- controlsIndex & lessThanOrEqualToThresholdIndex
# Convert cases to controls
targetToFilter[casesToControlsIndex, eventColname] <- 0
# Filter rows
rowsKeptIndex <- casesKeptIndex | casesToControlsIndex | controlsKeptIndex
targetToFilter <- targetToFilter[rowsKeptIndex, ]
objectsToFilter <- lapply(objectsToFilter, function(objectToFilter) {
if (is.data.frame(objectToFilter) || is.matrix(objectToFilter)) {
objectToFilter[rowsKeptIndex, ]
} else if (is.vector(objectToFilter)) {
objectToFilter[rowsKeptIndex]
} else {
stop("objectToFilter not data.frame, matrix or vector\n")
}
})
list(
targetFiltered = targetToFilter,
objectsFiltered = objectsToFilter,
counts = list(
casesKepty = sum(casesKeptIndex),
casesToControls = sum(casesToControlsIndex),
controlsKept = sum(controlsKeptIndex),
controlsRemoved = sum(controlsRemovedIndex)
)
)
}
# Exploratory analysis functions
createTTEDensityPlot <- function(target,
pipelineRunInformation,
title = "Time-to-event by Case/Control",
xlabel = "TTE",
ylabel = "Density") {
tic("Create time-to-event by Event density plot")
densityPlot <- ggplot(target, aes(x = time_to_event, fill = Event)) +
geom_density(alpha = 0.5, position = "identity") +
ggtitle(title) +
xlab(xlabel) +
ylab(ylabel)
if (pipelineRunInformation[["log"]]) {
}
toc(log = TRUE)
return(densityPlot)
}
#' incrementalTest
#' Fits incremental logistic models (a null model and a full model corresponding
#' to covariates only and covariates+score respectively).
#'
#' @param X A matrix/data.frame with columns corresponding to the labels, all
#' covariates and the score to be evaluated.
#' @param yColname The name of the column in X corresponding to the labels.
#' @param covColnames A list/vector of the column names corresponding to
#' covariates.
#' @param scoreColname The name of the column in \code{X} corresponding to the
#' score to be evaluated.
#' @param family The \code{\link[stats]{glm}} model family. Default is
#' \code{'binomial'} for logistic regression.
#' @param pipelineRunInformation The object originally created by
#' \code{\link{beginPipelineRun}}.
#' @return A list containing results for the null and full models. For both
#' model types, the formula string, glm model object, response, AUC and PRAUC
#' are returned.
#' @export
incrementalTest <- function(X,
yColname,
covColnames,
scoreColname,
family = "binomial",
pipelineRunInformation) {
tic("Fit incremental logistic model")
nullFormulaString <- paste(yColname,
paste(covColnames, collapse = " + "), sep = " ~ ")
nullModel <- glm(as.formula(nullFormulaString), family = family, data = X)
fullFormulaString <- paste(nullFormulaString, scoreColname, sep = " + ")
fullModel <- glm(as.formula(fullFormulaString), family = family, data = X)
nullModelResponse <- predict(nullModel, type = "response")
fullModelResponse <- predict(fullModel, type = "response")
nullAUC <- pROC::roc(X[, yColname], nullModelResponse)$auc
fullAUC <- pROC::roc(X[, yColname], fullModelResponse)$auc
nullPRAUC <- MLmetrics::PRAUC(nullModelResponse, X[, yColname])
fullPRAUC <- MLmetrics::PRAUC(fullModelResponse, X[, yColname])
if (pipelineRunInformation[["log"]]) {
saveRDS(nullModel,
paste0(pipelineRunInformation[["log folder path"]],
"incremental_logistic_null_model_",
pipelineRunInformation[["start timestamp"]],
".rds"))
saveRDS(fullModel,
paste0(pipelineRunInformation[["log folder path"]],
"incremental_logistic_full_model_",
pipelineRunInformation[["start timestamp"]],
".rds"))
saveRDS(nullModelResponse,
paste0(pipelineRunInformation[["log folder path"]],
"incremental_logistic_null_model_response_",
pipelineRunInformation[["start timestamp"]],
".rds"))
saveRDS(fullModelResponse,
paste0(pipelineRunInformation[["log folder path"]],
"incremental_logistic_full_model_response_",
pipelineRunInformation[["start timestamp"]],
".rds"))
}
toc(log = TRUE)
list(
null = list(
formula = nullFormulaString,
model = nullModel,
response = nullModelResponse,
auc = nullAUC,
prauc = nullPRAUC
),
full = list(
formula = fullFormulaString,
model = fullModel,
response = fullModelResponse,
auc = fullAUC,
prauc = fullPRAUC
)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.