#' random_word
#'
#' Generate a random combination of English alphabets of length \code{n}.
#'
#' @usage random_word(n)
#' @param n A natural number. This is the number of characters in a generated
#' word.
#' @return A character having \code{n} characters.
#' @author Junkyu Park
#' @seealso
#' \code{\link{binarize_binom}},
#' \code{\link{binarize_pois}}
#' @examples
#' set.seed(1024)
#' random_word(5)
#' random_word(10)
#' @export
random_word <- function(n) {
all_alphabets <- sort(
c(funpark::alphabets$upper, funpark::alphabets$lower)
)
dalphabet <- function(x){
dpmf(
x,
rep(1, length(all_alphabets)) / length(all_alphabets),
all_alphabets
)
}
paste0(rpmf(n, dalphabet, all_alphabets), collapse = '')
}
#' binarize_binom
#'
#' Transform a data with binomial responses into a data with binary response.
#'
#' @usage binarize_binom(dat, responses, variable.name = NULL)
#' @param dat A data whose features are categorical, and has two integer
#' columns that correspond to binomial responses
#' @param responses A character vector of length 2 where each element is the
#' names of columns that store the counts of positive and negative
#' responses, in this order
#' @param variable.name NULL by default; a character that will be used for
#' the name of new binary response column after binarization; if NULL,
#' then a randomly generated character will be used
#' @return A data.table that has the same categorical features as \code{dat},
#' and has \code{variable.name} column as its binary response
#' @author Junkyu Park
#' @seealso
#' \code{\link{binarize_pois}},
#' \code{\link{change_form}},
#' \href{https://joon3216.github.io/research_materials/2018/binarize}{Binarizing data using data.table}
#' @examples
#' # data(rich_binom)
#' binarize_binom(rich_binom, c('rich', 'not_rich'))
#' binarize_binom(rich_binom, c('rich', 'not_rich'), 'is_rich')
#' @export
#' @import data.table
binarize_binom <- function(dat, responses, variable.name = NULL) {
# Generate random names to avoid the same names as in features
separator <- random_word(10)
united <- random_word(10)
# Setup
if (!('data.table' %in% class(dat))) {data.table::setDT(dat)}
col_names <- colnames(dat)
id_vars <- col_names[!(col_names %in% responses)]
if (is.null(variable.name)) {
variable_name <- random_word(10)
message('Randomly generated variable.name used: ', variable_name)
} else {
variable_name <- variable.name
}
value_name <- random_word(10)
# Transform into the form that is used in poisson regression
dat <- data.table::melt(
dat,
id.vars = id_vars,
variable.name = variable_name,
value.name = value_name
)
id_vars <- c(id_vars, variable_name)
# Binarize
dat <- eval(parse(text = paste0(
'dat[', value_name, ' != 0, ',
'.(', united,
' = do.call(paste, c(.SD, sep = \"', separator,
'\")), ', value_name, '), .SDcols = id_vars]'
)))
dat <- dat[
,
list(
result = rep(
eval(parse(text = paste0('dat$\"', united, '\"'))),
eval(parse(text = paste0('dat$\"', value_name, '\"')))
)
)
][
, # requires data.table ver >= 1.9.6 because of tstrsplit
c(id_vars) := tstrsplit(result, separator, fixed = T)
][
,
c(id_vars),
with = F
]
dat
}
#' binarize_pois
#'
#' Transform a data with a count response into a data with binary response.
#'
#' @usage binarize_pois(dat, response)
#' @param dat A data whose features are categorical, and has a column of
#' natural numbers that correspond to the count response
#' @param response A character; the name of column that stores the count
#' response
#' @return A data.table that has all of categorical features in \code{dat},
#' where each given combination of features (rows) are populated by integers
#' based on \code{response}. The function will not keep those combinations
#' with a zero count.
#' @author Junkyu Park
#' @seealso
#' \code{\link{binarize_binom}},
#' \code{\link{change_form}},
#' \href{https://joon3216.github.io/research_materials/2018/binarize}{Binarizing data using data.table}
#' @examples
#' # data(rich_pois)
#' binarize_pois(rich_pois, 'count')
#' @export
#' @import data.table
binarize_pois <- function(dat, response) {
# Generate random names to avoid the same names as in features
separator <- random_word(10)
united <- random_word(10)
# Setup
if (!('data.table' %in% class(dat))) {setDT(dat)}
col_names <- colnames(dat)
id_vars <- col_names[!(col_names %in% response)]
variable_name <- random_word(10)
value_name <- random_word(10)
# Binarize
dat <- eval(parse(text = paste0(
'dat[', response, ' != 0, ',
'.(', united, ' = do.call(paste, c(.SD, sep = \"', separator,
'\")), ', response, '), .SDcols = id_vars]'
)))
dat <- dat[
,
list(
result = rep(
eval(parse(text = paste0('dat$\"', united, '\"'))),
eval(parse(text = paste0('dat$\"', response, '\"')))
)
)
][
, # requires data.table ver >= 1.9.6 because of tstrsplit
c(id_vars) := tstrsplit(result, separator, fixed = T)
][
,
c(id_vars),
with = F
]
dat
}
#' change_form
#'
#' Transform a data from one form into another.
#'
#' @usage change_form(dat, from, to,
#' old_response, category, new_response)
#' @param dat A data whose features are categorical and response(s) is/are
#' nonnegative integer vector(s).
#' @param from A character; either 'binary', 'binomial', or 'poisson'
#' @param to A character != from; either 'binary', 'binomial', or 'poisson'
#' @param old_response (always specified) a character vector of:
#' \itemize{
#' \item length 1 if from = 'binary' or 'poisson'; the name of column
#' in \code{dat} that stores a response/count.
#' \item lenght 2 if from = 'binomial'; the names of columns in \code{dat}
#' that store positive and negative case counts, in this order.
#' }
#' @param category (specified if either to = 'poisson' or
#' (from, to) = ('poisson', 'binomial')) a character vector of:
#' \itemize{
#' \item length 1 if from = 'binomial' OR (from, to) =
#' ('poisson', 'binomial'); the new name of column that
#' will store two names in old_response as positive and
#' negative cases, in this order; OR, the column in \code{dat} that is
#' used as a binary response in case of logistic regression if
#' (from, to) = ('poisson', 'binomial').
#' \item length 2 if from = 'binary'; the new names for positive and
#' negative cases in the binary response column, in this order.
#' }
#' @param new_response (specified only if from != 'poisson') a character
#' vector of:
#' \itemize{
#' \item length 1 if to = 'binary' or 'poisson'; the name of the new
#' column in new data that will store either a binary or count
#' response.
#' \item length 2 if to = 'binomial'; the names of two columns in
#' new data that will store positive and negative case counts,
#' in this order.
#' }
#' @return A data.table that has transformed from the form specified in
#' \code{from} to the form specified in \code{to}.
#' @author Junkyu Park
#' @seealso
#' \code{\link{binarize_binom}},
#' \code{\link{binarize_pois}},
#' \href{https://joon3216.github.io/research_materials/2018/binarize}{Binarizing data using data.table}
#' @examples
#' change_form(
#' rich,
#' from = 'binary', to = 'binomial',
#' old_response = 'is_rich',
#' new_response = c('rich', 'not_rich')
#' )
#' change_form(
#' rich,
#' from = 'binary', to = 'poisson',
#' old_response = 'is_rich',
#' category = c('rich', 'not_rich'),
#' new_response = 'count'
#' )
#' change_form(
#' rich_binom,
#' from = 'binomial', to = 'binary',
#' old_response = c('rich', 'not_rich'),
#' new_response = 'is_rich'
#' )
#' change_form(
#' rich_binom,
#' from = 'binomial', to = 'poisson',
#' old_response = c('rich', 'not_rich'),
#' category = 'is_rich',
#' new_response = 'count'
#' )
#' change_form(
#' rich_pois,
#' from = 'poisson', to = 'binary',
#' old_response = 'count'
#' )
#' change_form(
#' rich_pois,
#' from = 'poisson', to = 'binomial',
#' old_response = 'count',
#' category = 'is_rich'
#' )
#' @export
#' @import data.table
change_form <- function(dat, from, to,
old_response, category, new_response) {
if (!('data.table' %in% class(dat))) {data.table::setDT(dat)}
col_names <- colnames(dat)
id_vars <- col_names[!(col_names %in% old_response)]
id_vars_collapsed <- paste0(id_vars, collapse =',')
if (from == 'binary') {
if (to == 'binomial') {
return(eval(parse(text = paste0(
'dat[, ',
'.(', new_response[1] , ' = sum(', old_response, '), ',
new_response[2], ' = sum(!', old_response, ')), ',
'by = \"', id_vars_collapsed, '\"]'
))))
} else if (to == 'poisson') {
dat <- eval(parse(text = paste0(
'dat[, ',
'.(', category[1] , ' = sum(', old_response, '), ',
category[2], ' = sum(!', old_response, ')), ',
'by = \"', id_vars_collapsed, '\"]'
)))
return(suppressWarnings(data.table::melt(
dat,
id.vars = id_vars,
variable.name = old_response,
value.name = new_response
)))
} else {
stop(
'\"to\" must be either \"binomial\" or \"poisson\"',
' if \"from\" = \"binary\"'
)
}
} else if (from == 'binomial') {
if (to == 'binary') {
return(binarize_binom(dat, old_response, new_response))
} else if (to == 'poisson') {
data.table::melt(
dat,
id.vars = id_vars,
variable.name = category,
value.name = new_response
)
} else {
stop(
'\"to\" must be either \"binary\" or \"poisson\"',
' if \"from\" = \"binomial\"'
)
}
} else if (from == 'poisson') {
if (to == 'binary') {
return(binarize_pois(dat, old_response))
} else if (to == 'binomial') {
id_vars <- id_vars[!(id_vars %in% category)]
id_vars_fmlr <- paste0(id_vars, collapse = ' + ')
fmlr <- as.formula(paste0(id_vars_fmlr, ' ~ ', category))
return(data.table::dcast(dat, fmlr, value.var = old_response))
} else {
stop(
'\"to\" must be either \"binary\" or \"binomial\"',
' if \"from\" = \"poisson\"'
)
}
} else {
stop(paste0(
'\"from\" must be either',
' \"binary\", \"binomial\", or \"poisson\"'
))
}
}
#' CI_auc
#'
#' Calculate a confidence interval of AUC by bootstrapping.
#'
#' @usage CI_auc(dat, fmlr, R = 500, type = 'norm', ...)
#' @param dat A data whose response is binary (0 and 1).
#' @param fmlr A formula for the logistic regression with logit link.
#' @param R The same as \code{R} in boot::boot.
#' @param type The same as \code{type} in boot::boot.ci
#' @param \dots Additional arguments of boot::boot.ci
#' @return A \code{bootci} object.
#' @author Junkyu Park
#' @seealso
#' \code{\link{binarize_binom}},
#' \code{\link{binarize_pois}},
#' \code{\link{change_form}},
#' \code{\link{plot_roc}},
#' \href{https://joon3216.github.io/research_materials/2018/binarize}{Binarizing data using data.table}
#' @examples
#' # library(data.table)
#' data(femsmoke, package = 'faraway')
#' femsmoke_binary <- binarize_pois(femsmoke, 'y')
#' femsmoke_binary[, dead := ifelse(dead == 'yes', 1, 0)]
#' CI_auc(femsmoke_binary, dead ~ smoker + age)
#' @export
CI_auc <- function(dat, fmlr, R = 500, type = 'norm', ...) {
AUC_boot <- function(dat, i) {
y <- dat[i, ]
mod <- glm(fmlr, family = binomial, data = y)
ests <- predict(mod, type = 'response')
response <- as.character(fmlr[2])
actual <- eval(parse(text = paste0('y$\"', response, '\"')))
suppressMessages(as.numeric(pROC::auc(pROC::roc(actual, ests))))
}
boot_output <- boot::boot(dat, statistic = AUC_boot, R = R)
boot::boot.ci(boot_output, type = type, ...)
}
#' plot_roc
#'
#' Plot the ROC curve of the logistic regression with logit link.
#'
#' @usage plot_roc(dat, fmlr)
#' @param dat A data whose response is binary (0 and 1).
#' @param fmlr A formula for the logistic regression with logit link.
#' @return A \code{ggplot} object.
#' @author Junkyu Park
#' @seealso
#' \code{\link{binarize_binom}},
#' \code{\link{binarize_pois}},
#' \code{\link{change_form}},
#' \code{\link{CI_auc}},
#' \href{https://joon3216.github.io/research_materials/2018/binarize}{Binarizing data using data.table}
#' @examples
#' # data(nodal, package = 'SMPracticals')
#' plot_roc(nodal, r ~ stage + xray + acid)
#' @export
#' @import ggplot2
plot_roc <- function(dat, fmlr) {
mod <- glm(formula = fmlr, family = binomial, data = dat)
ests <- predict(mod, type = 'response')
response <- as.character(fmlr[2])
actual <- eval(parse(text = paste0('dat$\"', response, '\"')))
roc_result <- pROC::roc(actual, ests)
roc_table <- data.table::data.table(
TPR = roc_result$sensitivities,
FPR = 1 - roc_result$specificities,
thresholds = roc_result$thresholds
)[
order(TPR)
]
ggplot(roc_table, aes(FPR, TPR, label = round(thresholds, 4))) +
geom_point() +
ggrepel::geom_label_repel(
box.padding = 0.3,
point.padding = 0.3,
segment.color = "grey50"
) +
geom_line() +
geom_segment(
aes(x = 0, y = 0, xend = 1, yend = 1),
col = "red", linetype = "dashed"
) +
annotate(
"text", x = 1, y = .05, hjust = 1,
label = paste0(
"AUC : ", round(as.numeric(pROC::auc(roc_result)), 4)
)
) +
labs(
x = "False positive rate",
y = "True positive rate",
title = "ROC curve",
subtitle = paste0(response, " ~ ", as.character(fmlr[3]))
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.