# same as round() function, but rounds .5 up to higher number, rather than rounding even
round2 <- function( vec , digits=0 ){
vec0 <- vec
eps <- 10^(-10)
vec <- abs(vec)
vec <- vec*10^digits
vec2 <- vec-floor(vec)
# vec <- floor( vec ) + ifelse( vec2 < .5 , 0 , 1 )
vec <- floor( vec ) + ifelse( ( vec2 - .5 ) < - eps, 0 , 1 )
vec.round <- sign(vec0) * vec / 10^digits
return(vec.round)
}
# 2020-04-06 rounds p values below 0.0001 to '<0.0001'; converts to character
# 2021-04-06 updated: to use original method, use deprecated = T
pRound <- function( vec , digits=4 ,deprecated = F){
if(deprecated %in% T){
if(is.na(vec)){
return(NA)
} else{
vec0 <- vec
eps <- 10^(-10)
vec <- abs(vec)
vec <- vec*10^digits
vec2 <- vec-floor(vec)
# vec <- floor( vec ) + ifelse( vec2 < .5 , 0 , 1 )
vec <- floor( vec ) + ifelse( ( vec2 - .5 ) < - eps, 0 , 1 )
vec.round <- sign(vec0) * vec / 10^digits
if(vec.round<0.0001){return('<0.0001')} else{return(as.character(vec.round))}
}
} else if(deprecated %in% F){
pRoundFxn <- function(vec,digits=4){
if (is.na(vec)) {
return(NA)
}
else {
vec0 <- vec
eps <- 10^(-10)
vec <- abs(vec)
vec <- vec * 10^digits
vec2 <- vec - floor(vec)
vec <- floor(vec) + ifelse((vec2 - 0.5) < -eps, 0, 1)
vec.round <- sign(vec0) * vec/10^digits
if (vec.round < 1e-04) {
return("<0.0001")
}
else {
return(as.character(vec.round))
}
}
}
sapply(vec,pRoundFxn)
}
}
# 2022-07-06 new approach to normalizing paths by OS
CRnormalizePath <- function(myPath){
if(Sys.info()['sysname'] %in% "Windows"){
gsub(x = myPath,pattern = "/",replacement = "\\\\")
} else {
gsub(x = myPath,pattern = "\\\\",replacement = "/")
}
}
# convert number to percent as character variable
# input:
# numbertobeconvertedtocharacter = numeric value
# numberofdecimals = number of decimal places in percentage (default: 0)
# output: character with that number written as a percent
pct <- function(numbertobeconvertedtocharacter, numberofdecimals = 0){
return(paste0(round2(numbertobeconvertedtocharacter*100,numberofdecimals),"%"))
}
# capitalizes first letters of strings
# 2021-04-06 updated: to use original method, use deprecated = T
simpleCap <- function(x,deprecated = F) {
if(deprecated %in% T){
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
} else if (deprecated %in% F){
simpleCapFxn <- function (x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2), sep = "",
collapse = " ")
}
sapply(x,simpleCapFxn)
}
}
# creates subfolder directories and basic files for project
# works in conjunction with custom shortcuts and custom pipe (not included in this R code)
CRprojectBuilder <- function(){
# identify mac vs windows
cr.pathSetup <- if(Sys.info()['sysname'] %in% "Windows"){
"\\"
} else {
"/"
}
if(!dir.exists(CRnormalizePath("data"))){dir.create(CRnormalizePath("data"))}
if(!dir.exists(CRnormalizePath("code"))){dir.create(CRnormalizePath("code"))}
if(!dir.exists(CRnormalizePath("figures"))){dir.create(CRnormalizePath("figures"))}
if(!dir.exists(CRnormalizePath("output"))){dir.create(CRnormalizePath("output"))}
if(!dir.exists(CRnormalizePath("docs"))){dir.create(CRnormalizePath("docs"))}
if(!file.exists(CRnormalizePath("code\\_import.R"))){
file.create(CRnormalizePath("code\\_import.R"))
fileConn.import <- file(CRnormalizePath("code\\_import.R"))
writeLines(c("library(tidyverse)","library(CRtools)","library(cowplot)","library(ggthemes)","library(readxl)","library(scales)","library(skimr)","library(car)","library(ggfortify)","library(lubridate)","library(data.table)","library(kableExtra)","library(tidytable)","",
"# import ----",
"",
"",
"",
"",
"# data cleaning ----",
"",
"",
"",
"",
"# data filtering ----",
"",
"",
"",
"",
"# final datasets"
),fileConn.import)
close(fileConn.import)
}
if(!file.exists(CRnormalizePath("code\\_libs.R"))){
file.create(CRnormalizePath("code\\_libs.R"))
fileConn.libs <- file(CRnormalizePath("code\\_libs.R"))
writeLines(c("library(tidyverse)","library(CRtools)","library(cowplot)","library(ggthemes)","library(readxl)","library(scales)","library(skimr)","library(car)","library(ggfortify)","library(lubridate)","library(data.table)","library(kableExtra)","library(tidytable)"),fileConn.libs)
close(fileConn.libs)
}
if(!file.exists(CRnormalizePath("code\\_documentation.R"))){
file.create(CRnormalizePath("code\\_documentation.R"))
fileConn.documentation <- file(CRnormalizePath("code\\_documentation.R"))
writeLines(c("dat.doc <- vector('list')","dat.doc[['Notes']] <- 'Enter Notes Here'"),fileConn.documentation)
close(fileConn.documentation)
}
# removing this file since library(renv) can replace it
# if(!file.exists("code\\_sessionInfo.R")){
# file.create("code\\_sessionInfo.R")
# fileConn.sessionInfo <- file("code\\_sessionInfo.R")
# writeLines(c("# write date, then paste results of sessionInfo()","# assuming packrat and renv are still not working correctly with shortcuts","sessionInfo()"),fileConn.sessionInfo)
# close(fileConn.sessionInfo)
# }
if(!file.exists(CRnormalizePath("code\\_Analysis Markdown.rmd"))){
file.create(CRnormalizePath("code\\_Analysis Markdown.rmd"))
fileConn.analysisMarkdown <- file(CRnormalizePath("code\\_Analysis Markdown.rmd"))
writeLines(c(
"---",
"title: \"Analysis\"",
"output:",
" rmdformats::material:",
" highlight: kate",
" self_contained: true",
" thumbnails: false",
" gallery: true",
" df_print: kable",
"pkgdown:",
" as_is: true",
"knit: (function(inputFile, encoding) { rmarkdown::render(inputFile, encoding = encoding, output_file = file.path(dirname(gsub(x = inputFile,pattern = \"/code\",replacement = \"\")), paste0(\"output/Analysis_\",format(Sys.time(), \"%Y-%m-%d-%H-%M-%S\"),\".html\"))) })",
"---",
"",
"```{r setup, echo=F,include=F}",
"cr.pathSetup <- if(Sys.info()['sysname'] %in% \"Windows\"){",
" \"\\\\\"",
"} else {",
" \"/\"",
"}",
"setwd(gsub(x = getwd(),pattern = \"/code\",replacement = \"\"))",
"source(CRnormalizePath(\"code/_functions.R\"))",
"source(CRnormalizePath(\"code/_import.R\"))",
"source(CRnormalizePath(\"code/_analysis.R\"))",
"source(CRnormalizePath(\"code/_figures.R\"))",
"```",
"",
"# Descriptive Statistics",
"",
"## Table 1",
"",
"```{r table 1, echo = F,fig.width=10}",
"func.categoryCount(categoryVariableInQuotes = \"cyl\",columnGroupingInQuotes = \"gear\",mydata = mtcars %>% mutate(gear = as.character(gear)),categoryVariableLabel = \"Gear\",formattedOutput = T,rowOrColumnPercent = \"row\") %>%",
" kableExtra::kbl(",
" booktabs = T,",
" align = \"llcccc\"",
" ) %>% ",
" kable_classic(full_width = T, html_font = \"Arial\") %>% ",
" kable_styling(latex_options = c(\"striped\")) %>%",
" column_spec(2,italic = T)",
"```"
),fileConn.analysisMarkdown)
close(fileConn.analysisMarkdown)
}
if(!file.exists(CRnormalizePath("code\\_analysis.R"))){
file.create(CRnormalizePath("code\\_analysis.R"))
}
if(!file.exists(CRnormalizePath("code\\_functions.R"))){
file.create(CRnormalizePath("code\\_functions.R"))
}
if(!file.exists(CRnormalizePath("code\\_diagnostics.R"))){
file.create(CRnormalizePath("code\\_diagnostics.R"))
fileConn.import <- file(CRnormalizePath("code\\_diagnostics.R"))
writeLines(c(
"# Variable check",
"skim(mtcars) %>% View()",
"# Checking if different pairs of variables have similar missing values",
"CRtools::CRnullCorr(mydataset)",
"# Graphing null values among two variables",
"CRtools::CRnullGraph(mydata = mydataset,x = \"var1inquotes\",y=\"var2inquotes\")",
"# which variables predict nulls in an explanatory variable",
"CRtools::CRnullModel({mydataset %>% select(var1,var2,var3)},\"var1inquotes\")",
"# table of two variable missingness",
"CRtools::CRnullTable(mydataset,\"var1inquotes\",\"var2inquotes\")",
"",
"# outlier check",
"CRoutlierCheck(mtcars)",
"# quick analyses",
"library(GGally)",
"iris %>% ggpairs(mapping = aes(color=factor(Species)))",
"",
"# quick table one",
"library(gtsummary)"
),fileConn.import)
close(fileConn.import)
}
if(!file.exists(CRnormalizePath("code\\_old code.R"))){
file.create(CRnormalizePath("code\\_old code.R"))
}
if(!file.exists(CRnormalizePath("code\\_figures.R"))){
file.create(CRnormalizePath("code\\_figures.R"))
}
file.edit(CRnormalizePath("code\\_import.R"))
# file.edit(CRnormalizePath("code\\_libs.R"))
}
# summarise mathematical models made using things like glm() or glmer() for easier understanding
# requires tidyverse (dplyr and tibble specifically)
CRmodelSummary.logistic <- function(myModel,rounding = 4,simpleResultOutput = T){
myModelSummary <- myModel %>%
summary %>%
coefficients %>%
as.data.frame %>%
rownames_to_column() %>%
rename("p value" = `Pr(>|z|)`,
Coefficient = Estimate) %>%
mutate_at(vars(Coefficient,`Std. Error`,`z value`),~round(.,rounding))
myModelOddsRatios <- exp(myModel %>%
summary %>%
coefficients %>%
.[,1]) %>%
as.data.frame %>%
rownames_to_column() %>%
rename("Odds Ratio" = ".") %>%
mutate("Odds Ratio" = round2(`Odds Ratio`,rounding))
myModelOddsRatioConfidenceInterval <- exp(confint(myModel)) %>%
as.data.frame %>%
rownames_to_column() %>%
rename("Lower OR Confidence Limit" = "2.5 %") %>%
rename("Upper OR Confidence Limit" = "97.5 %") %>%
mutate_at(vars(contains("Confidence Limit")),~round2(.,rounding))
myModelResults <- left_join(myModelSummary,myModelOddsRatios,by = "rowname") %>%
left_join(myModelOddsRatioConfidenceInterval, by = "rowname") %>%
rename(variableName = "rowname") %>%
rowwise() %>%
mutate(`p value.numeric` = `p value`) %>%
mutate(`p value` = ifelse(`p value`<0.0001,"<0.0001",as.character(round2(`p value`,rounding)))) %>%
ungroup()
if(simpleResultOutput %in% T){
myModelResults <- myModelResults %>%
select(variableName,`p value`,`Odds Ratio`,`Lower OR Confidence Limit`,`Upper OR Confidence Limit`)
}
return(myModelResults)
}
# 2020-02-21 epic color palette
epic_color_palette <- c(
rgb(0,133,242,maxColorValue=255),
rgb(221,41,157,maxColorValue=255),
rgb(105,195,0,maxColorValue=255),
rgb(180,41,204,maxColorValue=255),
rgb(0,191,212,maxColorValue=255),
rgb(255,146,0,maxColorValue=255),
rgb(106,76,224,maxColorValue=255),
rgb(24,194,149,maxColorValue=255),
rgb(217,78,111,maxColorValue=255),
rgb(36,164,238,maxColorValue=255)
)
# CRpaletteLightBackground <- c(
# "#1192e8",
# "#6929c4",
# "#005d5d",
# "#9f1853",
# "#fa4d56",
# "#570408",
# "#198038",
# "#002d9c",
# "#ee538b",
# "#b28600",
# "#009d9a",
# "#012749",
# "#8a3800",
# "#a56eff"
# )
CRcolorPalette <- c(
"#33b1ff",
"#8a3ffc",
"#007d79",
"#ff7eb6",
"#fa4d56",
"#fff1f1",
"#6fdc8c",
"#4589ff",
"#d12771",
"#d2a106",
"#08bdba",
"#bae6ff",
"#ba4e00",
"#d4bbff"
)
CRcolorPalette2 <- c(
rgb(239,45,47,maxColorValue = 255),
rgb(75,139,191,maxColorValue = 255),
rgb(95,183,92,maxColorValue = 255),
rgb(161,94,171,maxColorValue = 255),
rgb(255,140,25,maxColorValue = 255),
rgb(254,254,71,maxColorValue = 255),
rgb(175,103,61,maxColorValue = 255),
rgb(247,150,196,maxColorValue = 255),
rgb(163,163,163,maxColorValue = 255)
)
# 2020-06-09 adding cut2 from Hmisc so I don't have to load a package that conflicts with dplyr
cut2 <- function (x, cuts, m = 150, g, levels.mean = FALSE, digits, minmax = TRUE,
oneval = TRUE, onlycuts = FALSE, formatfun = format, ...)
{
if (inherits(formatfun, "formula")) {
if (!requireNamespace("rlang"))
stop("Package 'rlang' must be installed to use formula notation")
formatfun <- getFromNamespace("as_function", "rlang")(formatfun)
}
method <- 1
x.unique <- sort(unique(c(x[!is.na(x)], if (!missing(cuts)) cuts)))
min.dif <- min(diff(x.unique))/2
min.dif.factor <- 1
if (missing(digits))
digits <- if (levels.mean)
5
else 3
format.args <- if (any(c("...", "digits") %in%
names(formals(args(formatfun))))) {
c(digits = digits, list(...))
}
else {
list(...)
}
oldopt <- options("digits")
options(digits = digits)
on.exit(options(oldopt))
xlab <- attr(x, "label")
if (missing(cuts)) {
nnm <- sum(!is.na(x))
if (missing(g))
g <- max(1, floor(nnm/m))
if (g < 1)
stop("g must be >=1, m must be positive")
options(digits = 15)
n <- table(x)
xx <- as.double(names(n))
options(digits = digits)
cum <- cumsum(n)
m <- length(xx)
y <- as.integer(ifelse(is.na(x), NA, 1))
labs <- character(g)
cuts <- approx(cum, xx, xout = (1:g) * nnm/g, method = "constant",
rule = 2, f = 1)$y
cuts[length(cuts)] <- max(xx)
lower <- xx[1]
upper <- 1e+45
up <- low <- double(g)
i <- 0
for (j in 1:g) {
cj <- if (method == 1 || j == 1)
cuts[j]
else {
if (i == 0)
stop("program logic error")
s <- if (is.na(lower))
FALSE
else xx >= lower
cum.used <- if (all(s))
0
else max(cum[!s])
if (j == m)
max(xx)
else if (sum(s) < 2)
max(xx)
else approx(cum[s] - cum.used, xx[s], xout = (nnm -
cum.used)/(g - j + 1), method = "constant",
rule = 2, f = 1)$y
}
if (cj == upper)
next
i <- i + 1
upper <- cj
y[x >= (lower - min.dif.factor * min.dif)] <- i
low[i] <- lower
lower <- if (j == g)
upper
else min(xx[xx > upper])
if (is.na(lower))
lower <- upper
up[i] <- lower
}
low <- low[1:i]
up <- up[1:i]
variation <- logical(i)
for (ii in 1:i) {
r <- range(x[y == ii], na.rm = TRUE)
variation[ii] <- diff(r) > 0
}
if (onlycuts)
return(unique(c(low, max(xx))))
flow <- do.call(formatfun, c(list(low), format.args))
fup <- do.call(formatfun, c(list(up), format.args))
bb <- c(rep(")", i - 1), "]")
labs <- ifelse(low == up | (oneval & !variation), flow,
paste("[", flow, ",", fup, bb, sep = ""))
ss <- y == 0 & !is.na(y)
if (any(ss))
stop(paste("categorization error in cut2. Values of x not appearing in any interval:\n",
paste(format(x[ss], digits = 12), collapse = " "),
"\nLower endpoints:", paste(format(low,
digits = 12), collapse = " "), "\nUpper endpoints:",
paste(format(up, digits = 12), collapse = " ")))
y <- structure(y, class = "factor", levels = labs)
}
else {
if (minmax) {
r <- range(x, na.rm = TRUE)
if (r[1] < cuts[1])
cuts <- c(r[1], cuts)
if (r[2] > max(cuts))
cuts <- c(cuts, r[2])
}
l <- length(cuts)
k2 <- cuts - min.dif
k2[l] <- cuts[l]
y <- cut(x, k2)
if (!levels.mean) {
brack <- rep(")", l - 1)
brack[l - 1] <- "]"
fmt <- do.call(formatfun, c(list(cuts), format.args))
labs <- paste("[", fmt[1:(l - 1)], ",",
fmt[2:l], brack, sep = "")
if (oneval) {
nu <- table(cut(x.unique, k2))
if (length(nu) != length(levels(y)))
stop("program logic error")
levels(y) <- ifelse(nu == 1, c(fmt[1:(l - 2)],
fmt[l]), labs)
}
else levels(y) <- labs
}
}
if (levels.mean) {
means <- tapply(x, y, function(w) mean(w, na.rm = TRUE))
levels(y) <- do.call(formatfun, c(list(means), format.args))
}
attr(y, "class") <- "factor"
if (length(xlab))
label(y) <- xlab
y
}
# new function: overwrite libs
CRlibUpdate <- function(){
# identify mac vs windows
cr.pathSetup <- if(Sys.info()['sysname'] %in% "Windows"){
"\\"
} else {
"/"
}
if(!file.exists(CRnormalizePath("code\\_libsBackup.R"))){
file.create(CRnormalizePath("code\\_libsBackup.R"))
}
fileConn.import <- file(CRnormalizePath("code\\_import.R"))
fileConn.libs <- file(CRnormalizePath("code\\_libs.R"))
fileConn.libsBackup <- file(CRnormalizePath("code\\_libsBackup.R"))
fileLines.import <- fileConn.import %>% readLines
fileLines.libs <- fileConn.libs %>% readLines
fileLines.libsBackup <- fileConn.libsBackup %>% readLines
listOfLibs <- fileLines.import %>%
as.data.frame() %>%
filter(grepl(x = .data[["."]],pattern = "^library(.*)")|grepl(x = .data[["."]],pattern = "^options(.*)")) %>%
pull(.data[["."]])
contentToWriteToLibsBackup <- c(
as.character(fileLines.libsBackup),
"",
as.character(paste0("# ",Sys.time())),
as.character(listOfLibs)
)
writeLines(
contentToWriteToLibsBackup,
fileConn.libsBackup
)
writeLines(
as.character(listOfLibs),
fileConn.libs
)
close(fileConn.import)
close(fileConn.libs)
close(fileConn.libsBackup)
}
# backups of original CRtools methods
# round2 <- function( vec , digits=0 ){
# vec0 <- vec
# eps <- 10^(-10)
# vec <- abs(vec)
# vec <- vec*10^digits
# vec2 <- vec-floor(vec)
# # vec <- floor( vec ) + ifelse( vec2 < .5 , 0 , 1 )
# vec <- floor( vec ) + ifelse( ( vec2 - .5 ) < - eps, 0 , 1 )
# vec.round <- sign(vec0) * vec / 10^digits
# return(vec.round)
# }
# # 2020-04-06 rounds p values below 0.0001 to '<0.0001'; converts to character
# pRound <- function( vec , digits=4 ){
# if(is.na(vec)){
# return(NA)
# } else{
# vec0 <- vec
# eps <- 10^(-10)
# vec <- abs(vec)
# vec <- vec*10^digits
# vec2 <- vec-floor(vec)
# # vec <- floor( vec ) + ifelse( vec2 < .5 , 0 , 1 )
# vec <- floor( vec ) + ifelse( ( vec2 - .5 ) < - eps, 0 , 1 )
# vec.round <- sign(vec0) * vec / 10^digits
# if(vec.round<0.0001){return('<0.0001')} else{return(as.character(vec.round))}
# }
# }
# installing packages to use shortcuts
CRshortcutSetupFile <- function(){
# identify mac vs windows
cr.pathSetup <- if(Sys.info()['sysname'] %in% "Windows"){
"\\"
} else {
"/"
}
if(!file.exists(CRnormalizePath("code\\shortcutSetup.R"))){
file.create(CRnormalizePath("code\\shortcutSetup.R"))
fileConn.shortcut <- file(CRnormalizePath("code\\shortcutSetup.R"))
path_to_open_directory <- ifelse(
Sys.info()['sysname'] %in% "Windows",
"options(\"addinexamplesWV.usrFn3\" = function() { rstudioapi::insertText(\"shell.exec(getwd())\") })",
"options(\"addinexamplesWV.usrFn3\" = function() { rstudioapi::insertText(\"system2(\\\"open\\\",getwd())\") })"
)
writeLines(c(
"install.packages(\"devtools\")",
"library(devtools)",
"library(magrittr)",
"",
"install_github(\"WinVector/addinexamplesWV\",force=T)",
"install_github(\"rstudio/addinexamples\",force=T) # necessary to get the %in% shortcut added",
"",
"options(\"addinexamplesWV.usrFn1\" = function() { rstudioapi::insertText(\" %xTempVar% \") })",
"options(\"addinexamplesWV.usrFn2\" = function() { rstudioapi::insertText(\"libs()\") })",
# "options(\"addinexamplesWV.usrFn3\" = function() { rstudioapi::insertText(\"shell.exec(getwd())\") })",
path_to_open_directory,
"",
"`%xTempVar%` <- function(lhs, rhs) {",
"xTempVar <<- deparse(substitute(lhs))",
"eval(substitute(`%>%`(lhs, rhs)))",
"}",
"",
"# Shortcut setup:",
"# 1) go to Tools > Modify keyboard shortcuts",
"# 2) search usrFn",
"# 3) for usrFn1, modify it to ctrl+alt+shift+M (for the new pipe)",
"# 4) for usrFn2, modify it to ctrl+alt+shift+l (for libraries)",
"# 5) for usrFn3, modify it to ctrl+alt+shift+; (open directory in explorer)"
),fileConn.shortcut)
close(fileConn.shortcut)
file.edit("code\\shortcutSetup.R")
}
}
CRshortcuts <- function(){
# identify mac vs windows
cr.pathSetup <- if(Sys.info()['sysname'] %in% "Windows"){
"\\"
} else {
"/"
}
install.packages("devtools")
library(devtools)
library(magrittr)
path_to_open_directory <- ifelse(
Sys.info()['sysname'] %in% "Windows",
"options(\"addinexamplesWV.usrFn3\" = function() { rstudioapi::insertText(\"shell.exec(getwd())\") })",
"options(\"addinexamplesWV.usrFn3\" = function() { rstudioapi::insertText(\"system2(\"open\",getwd())\") })"
)
# 2022-03-02 moved this function to CRshortcuts.xTempVarSetup
# # creating modified pipe
# `%xTempVar%` <- function(lhs, rhs) {
# xTempVar <<- deparse(substitute(lhs))
# eval(substitute(`%>%`(lhs, rhs)))
# }
install_github("WinVector/addinexamplesWV",force=T)
install_github("rstudio/addinexamples",force=T) # necessary to get the %in% shortcut added
options("addinexamplesWV.usrFn1" = function() { rstudioapi::insertText(" %xTempVar% ") })
options("addinexamplesWV.usrFn2" = function() { rstudioapi::insertText("libs()") })
if(Sys.info()['sysname'] %in% "Windows"){
options("addinexamplesWV.usrFn3" = function() { rstudioapi::insertText("shell.exec(getwd())") })
} else{
options("addinexamplesWV.usrFn3" = function() { rstudioapi::insertText("system2(\"open\",getwd())") })
}
}
# to back up details on loaded packages and versions if not using renv, use sessionInfo()
# 2022-03-23 do multiple variables have missing values? if so, are missings correlated?
CRnullCorr <- function(mydata,corrShape = "pie"){
print("use skim() function to show detailed info on each variable")
library(corrplot)
mydata %>%
mutate_all(~is.na(.)) %>%
cor %>%
corrplot(method = corrShape,type = "lower")
}
# 2021-05-04 experimental package to check missing values on variables in a data frame
# use to identify significant associations, the check individual variables with geom_miss_point()
CRnullModel <- function(mydata,variableWithNulls){
# column index of variable with nulls
columnIndex <- mydata %>% names %>% as.data.frame %>% rownames_to_column() %>% filter(`.` %in% variableWithNulls) %>% pull(rowname) %>% as.numeric
columnListWithoutColumnOfInterest <- data.frame(columnIndices = 1:ncol(mydata)) %>% filter(!columnIndices %in% columnIndex)
myDataForModel <- mydata %>%
mutate(missingValues = case_when(
is.na(.data[[variableWithNulls]]) ~ 1,
T ~ 0
))
modelResults <- data.frame(
term = as.character()#,
# estimate = as.numeric(),
# `std.error` = as.numeric(),
# statistic = as.numeric(),
# p.value = as.numeric()
)
for (i in columnListWithoutColumnOfInterest$columnIndices){
myModel <-
glm(
myDataForModel$missingValues ~
{myDataForModel %>% select(all_of(i)) %>% pull},
family = "binomial"
) %>%
broom::tidy() %>%
mutate(term = gsub(
x = term,
pattern = "myDataForModel\\[, i\\]",
replacement = names(myDataForModel)[i]
)) %>%
filter(!term %in% "(Intercept)")
modelResults <- bind_rows(modelResults,myModel)
}
modelResults <- modelResults %>%
mutate(
significant = case_when(
p.value<0.05 ~ "*",
T ~ ""
),
`Odds Ratio (Odds of Missingness, reference category = non-missing)` = round2(exp(estimate),2),
p.value = pRound(p.value)
) %>%
select(term,`Odds Ratio (Odds of Missingness, reference category = non-missing)`,p.value,significant)
print(modelResults)
}
CRnullGraph <- function(mydata,x,y){
library(naniar)
mydata %>%
ggplot(aes_string(x = x,y = y))+
geom_miss_point(size=3)+
theme_bw()+
scale_color_viridis_d()
}
CRnullTable <- function(mydata,var1,var2){
mydata <- mydata %>%
select(.data[[var1]],.data[[var2]]) %>%
rename(
var1 = .data[[var1]],
var2 = .data[[var2]]
) %>%
mutate(
var1 =
case_when(
is.na(var1) ~ "Missing",
T ~ "Non-Missing"
),
var2 =
case_when(
is.na(var2) ~ "Missing",
T ~ "Non-Missing"
)
) %>%
rename(
!!var1 := var1,
!!var2 := var2
)
print(paste0("Missings in variable: ",var1))
print(mydata %>% select(.data[[var1]]) %>% table(useNA = 'ifany') %>% as_tibble() %>% pivot_wider(names_from = ".",values_from = "n") %>% mutate(missing_percent = pct(`Missing` / (`Missing` + `Non-Missing`),1)))
print("--------------------------------")
print(paste0("Missings in variable: ",var2))
print(mydata %>% select(.data[[var2]]) %>% table(useNA = 'ifany') %>% as_tibble() %>% pivot_wider(names_from = ".",values_from = "n") %>% mutate(missing_percent = pct(`Missing` / (`Missing` + `Non-Missing`),1)))
print("--------------------------------")
print(
mydata %>%
table(useNA = 'ifany') %>%
as_tibble() %>%
mutate(pct = pct(n/sum(n),1))
)
print("--------------------------------")
print(paste0(
"Overall agreement on missing vs non missing values: ",
mydata %>% mutate(agree = as.numeric(.data[[var1]] == .data[[var2]])) %>% summarise(agreement = pct(sum(agree)/nrow(mydata),1)) %>% pull(agreement)
))
}
# example of checking data with missing values: the following data replaces some of the vs variable with missings significantly assoc w/ the am variable
# mtcars2 <- mtcars
# mtcars2$vs <-
# c(0,0,1,NA,NA,NA,0,NA,NA,1,NA,0,0,0,NA,NA,NA,1,1,1,1,0,NA,NA,NA,1,0,1,0,0,0,1)
# CRnullHack(mtcars2)
# doublechecking potential problem variables
# library(naniar)
# mtcars2 %>%
# ggplot(aes(x = vs,y = wt))+
# geom_miss_point()
# reverse the use of split to combine data frames after using split() and map() functions
# reduce(full_join, by = names(.[[1]]))
# same as round() function, but rounds .5 up to higher number, rather than rounding even
# round2 <- function( vec , digits=0 ){
# vec0 <- vec
# eps <- 10^(-10)
# vec <- abs(vec)
# vec <- vec*10^digits
# vec2 <- vec-floor(vec)
# # vec <- floor( vec ) + ifelse( vec2 < .5 , 0 , 1 )
# vec <- floor( vec ) + ifelse( ( vec2 - .5 ) < - eps, 0 , 1 )
# vec.round <- sign(vec0) * vec / 10^digits
# return(vec.round)
# }
# 2020-04-06 rounds p values below 0.0001 to '<0.0001'; converts to character
# 2021-04-06 updated: to use original method, use deprecated = T
# pRound <- function( vec , digits=4 ,deprecated = F){
#
# if(deprecated %in% T){
# if(is.na(vec)){
# return(NA)
# } else{
# vec0 <- vec
# eps <- 10^(-10)
# vec <- abs(vec)
# vec <- vec*10^digits
# vec2 <- vec-floor(vec)
# # vec <- floor( vec ) + ifelse( vec2 < .5 , 0 , 1 )
# vec <- floor( vec ) + ifelse( ( vec2 - .5 ) < - eps, 0 , 1 )
# vec.round <- sign(vec0) * vec / 10^digits
# if(vec.round<0.0001){return('<0.0001')} else{return(as.character(vec.round))}
# }
# } else if(deprecated %in% F){
# pRoundFxn <- function(vec,digits=4){
# if (is.na(vec)) {
# return(NA)
# }
# else {
# vec0 <- vec
# eps <- 10^(-10)
# vec <- abs(vec)
# vec <- vec * 10^digits
# vec2 <- vec - floor(vec)
# vec <- floor(vec) + ifelse((vec2 - 0.5) < -eps, 0, 1)
# vec.round <- sign(vec0) * vec/10^digits
# if (vec.round < 1e-04) {
# return("<0.0001")
# }
# else {
# return(as.character(vec.round))
# }
# }
# }
# sapply(vec,pRoundFxn)
# }
#
# }
# convert number to percent as character variable
# input:
# numbertobeconvertedtocharacter = numeric value
# numberofdecimals = number of decimal places in percentage (default: 0)
# output: character with that number written as a percent
pct <- function(numbertobeconvertedtocharacter, numberofdecimals = 0){
return(paste0(round2(numbertobeconvertedtocharacter*100,numberofdecimals),"%"))
}
# capitalizes first letters of strings
# 2021-04-06 updated: to use original method, use deprecated = T
simpleCap <- function(x,deprecated = F) {
if(deprecated %in% T){
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1,1)), substring(s, 2),
sep="", collapse=" ")
} else if (deprecated %in% F){
simpleCapFxn <- function (x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2), sep = "",
collapse = " ")
}
sapply(x,simpleCapFxn)
}
}
# summarise mathematical models made using things like glm() or glmer() for easier understanding
# may be better to just use broom::tidy(.,exponentiate = T,conf.int = T)
# requires tidyverse (dplyr and tibble specifically)
CRmodelSummary.logistic <- function(myModel,rounding = 4,simpleResultOutput = T){
myModelSummary <- myModel %>%
summary %>%
coefficients %>%
as.data.frame %>%
rownames_to_column() %>%
rename("p value" = `Pr(>|z|)`,
Coefficient = Estimate) %>%
mutate_at(vars(Coefficient,`Std. Error`,`z value`),~round(.,rounding))
myModelOddsRatios <- exp(myModel %>%
summary %>%
coefficients %>%
.[,1]) %>%
as.data.frame %>%
rownames_to_column() %>%
rename("Odds Ratio" = ".") %>%
mutate("Odds Ratio" = round2(`Odds Ratio`,rounding))
myModelOddsRatioConfidenceInterval <- exp(confint(myModel)) %>%
as.data.frame %>%
rownames_to_column() %>%
rename("Lower OR Confidence Limit" = "2.5 %") %>%
rename("Upper OR Confidence Limit" = "97.5 %") %>%
mutate_at(vars(contains("Confidence Limit")),~round2(.,rounding))
myModelResults <- left_join(myModelSummary,myModelOddsRatios,by = "rowname") %>%
left_join(myModelOddsRatioConfidenceInterval, by = "rowname") %>%
rename(variableName = "rowname") %>%
rowwise() %>%
mutate(`p value.numeric` = `p value`) %>%
mutate(`p value` = ifelse(`p value`<0.0001,"<0.0001",as.character(round2(`p value`,rounding)))) %>%
ungroup()
if(simpleResultOutput %in% T){
myModelResults <- myModelResults %>%
select(variableName,`p value`,`Odds Ratio`,`Lower OR Confidence Limit`,`Upper OR Confidence Limit`)
}
return(myModelResults)
}
# 2020-02-21 epic color palette
epic_color_palette <- c(
rgb(0,133,242,maxColorValue=255),
rgb(221,41,157,maxColorValue=255),
rgb(105,195,0,maxColorValue=255),
rgb(180,41,204,maxColorValue=255),
rgb(0,191,212,maxColorValue=255),
rgb(255,146,0,maxColorValue=255),
rgb(106,76,224,maxColorValue=255),
rgb(24,194,149,maxColorValue=255),
rgb(217,78,111,maxColorValue=255),
rgb(36,164,238,maxColorValue=255)
)
# 2020-06-09 adding cut2 from Hmisc so I don't have to load a package that conflicts with dplyr
cut2 <- function (x, cuts, m = 150, g, levels.mean = FALSE, digits, minmax = TRUE,
oneval = TRUE, onlycuts = FALSE, formatfun = format, ...)
{
if (inherits(formatfun, "formula")) {
if (!requireNamespace("rlang"))
stop("Package 'rlang' must be installed to use formula notation")
formatfun <- getFromNamespace("as_function", "rlang")(formatfun)
}
method <- 1
x.unique <- sort(unique(c(x[!is.na(x)], if (!missing(cuts)) cuts)))
min.dif <- min(diff(x.unique))/2
min.dif.factor <- 1
if (missing(digits))
digits <- if (levels.mean)
5
else 3
format.args <- if (any(c("...", "digits") %in%
names(formals(args(formatfun))))) {
c(digits = digits, list(...))
}
else {
list(...)
}
oldopt <- options("digits")
options(digits = digits)
on.exit(options(oldopt))
xlab <- attr(x, "label")
if (missing(cuts)) {
nnm <- sum(!is.na(x))
if (missing(g))
g <- max(1, floor(nnm/m))
if (g < 1)
stop("g must be >=1, m must be positive")
options(digits = 15)
n <- table(x)
xx <- as.double(names(n))
options(digits = digits)
cum <- cumsum(n)
m <- length(xx)
y <- as.integer(ifelse(is.na(x), NA, 1))
labs <- character(g)
cuts <- approx(cum, xx, xout = (1:g) * nnm/g, method = "constant",
rule = 2, f = 1)$y
cuts[length(cuts)] <- max(xx)
lower <- xx[1]
upper <- 1e+45
up <- low <- double(g)
i <- 0
for (j in 1:g) {
cj <- if (method == 1 || j == 1)
cuts[j]
else {
if (i == 0)
stop("program logic error")
s <- if (is.na(lower))
FALSE
else xx >= lower
cum.used <- if (all(s))
0
else max(cum[!s])
if (j == m)
max(xx)
else if (sum(s) < 2)
max(xx)
else approx(cum[s] - cum.used, xx[s], xout = (nnm -
cum.used)/(g - j + 1), method = "constant",
rule = 2, f = 1)$y
}
if (cj == upper)
next
i <- i + 1
upper <- cj
y[x >= (lower - min.dif.factor * min.dif)] <- i
low[i] <- lower
lower <- if (j == g)
upper
else min(xx[xx > upper])
if (is.na(lower))
lower <- upper
up[i] <- lower
}
low <- low[1:i]
up <- up[1:i]
variation <- logical(i)
for (ii in 1:i) {
r <- range(x[y == ii], na.rm = TRUE)
variation[ii] <- diff(r) > 0
}
if (onlycuts)
return(unique(c(low, max(xx))))
flow <- do.call(formatfun, c(list(low), format.args))
fup <- do.call(formatfun, c(list(up), format.args))
bb <- c(rep(")", i - 1), "]")
labs <- ifelse(low == up | (oneval & !variation), flow,
paste("[", flow, ",", fup, bb, sep = ""))
ss <- y == 0 & !is.na(y)
if (any(ss))
stop(paste("categorization error in cut2. Values of x not appearing in any interval:\n",
paste(format(x[ss], digits = 12), collapse = " "),
"\nLower endpoints:", paste(format(low,
digits = 12), collapse = " "), "\nUpper endpoints:",
paste(format(up, digits = 12), collapse = " ")))
y <- structure(y, class = "factor", levels = labs)
}
else {
if (minmax) {
r <- range(x, na.rm = TRUE)
if (r[1] < cuts[1])
cuts <- c(r[1], cuts)
if (r[2] > max(cuts))
cuts <- c(cuts, r[2])
}
l <- length(cuts)
k2 <- cuts - min.dif
k2[l] <- cuts[l]
y <- cut(x, k2)
if (!levels.mean) {
brack <- rep(")", l - 1)
brack[l - 1] <- "]"
fmt <- do.call(formatfun, c(list(cuts), format.args))
labs <- paste("[", fmt[1:(l - 1)], ",",
fmt[2:l], brack, sep = "")
if (oneval) {
nu <- table(cut(x.unique, k2))
if (length(nu) != length(levels(y)))
stop("program logic error")
levels(y) <- ifelse(nu == 1, c(fmt[1:(l - 2)],
fmt[l]), labs)
}
else levels(y) <- labs
}
}
if (levels.mean) {
means <- tapply(x, y, function(w) mean(w, na.rm = TRUE))
levels(y) <- do.call(formatfun, c(list(means), format.args))
}
attr(y, "class") <- "factor"
if (length(xlab))
label(y) <- xlab
y
}
# new function: overwrite libs (possibly duplicated fxn)
CRlibUpdate <- function(){
# identify mac vs windows
cr.pathSetup <- if(Sys.info()['sysname'] %in% "Windows"){
"\\"
} else {
"/"
}
if(!file.exists(CRnormalizePath("code\\_libsBackup.R"))){
file.create(CRnormalizePath("code\\_libsBackup.R"))
}
fileConn.import <- file(CRnormalizePath("code\\_import.R"))
fileConn.libs <- file(CRnormalizePath("code\\_libs.R"))
fileConn.libsBackup <- file(CRnormalizePath("code\\_libsBackup.R"))
fileLines.import <- fileConn.import %>% readLines
fileLines.libs <- fileConn.libs %>% readLines
fileLines.libsBackup <- fileConn.libsBackup %>% readLines
listOfLibs <- fileLines.import %>%
as.data.frame() %>%
filter(grepl(x = .data[["."]],pattern = "^library(.*)")) %>%
pull(.data[["."]])
contentToWriteToLibsBackup <- c(
as.character(fileLines.libsBackup),
"",
as.character(paste0("# ",Sys.time())),
as.character(listOfLibs)
)
writeLines(
contentToWriteToLibsBackup,
fileConn.libsBackup
)
writeLines(
as.character(listOfLibs),
fileConn.libs
)
close(fileConn.import)
close(fileConn.libs)
close(fileConn.libsBackup)
}
# 2021-04-21 glimpse for data.table
glimpse. <- function(data_table){
dplyr::glimpse(tibble::as_tibble(data_table))
}
# backing up projects
# 2022-07-06 this is likely deprecated; commenting out - if unneeded then delete
# 2021-05-03 this is to temporarily bypass issue where external libraries can't be added to .renviron for renv
# CRrenv <- function(){
# install.packages("devtools")
# library(devtools)
# install_github("WinVector/addinexamplesWV",force=T)
# install_github("rstudio/addinexamples",force=T) # necessary to get the %in% shortcut added
#
# options("addinexamplesWV.usrFn1" = function() { rstudioapi::insertText(" %xTempVar% ") })
# options("addinexamplesWV.usrFn2" = function() { rstudioapi::insertText("libs()") })
# options("addinexamplesWV.usrFn3" = function() { rstudioapi::insertText("shell.exec(getwd())") })
#
# }
# to back up details on loaded packages and versions if not using renv, use sessionInfo()
# 2021-05-04 experimental package to check missing values on variables in a data frame
# use to identify significant associations, the check individual variables with geom_miss_point()
# 2022-03-23 removed this function, replaced with others
# example of checking data with missing values: the following data replaces some of the vs variable with missings significantly assoc w/ the am variable
# mtcars2 <- mtcars
# mtcars2$vs <-
# c(0,0,1,NA,NA,NA,0,NA,NA,1,NA,0,0,0,NA,NA,NA,1,1,1,1,0,NA,NA,NA,1,0,1,0,0,0,1)
# doublechecking potential problem variables
# library(naniar)
# mtcars2 %>%
# ggplot(aes(x = vs,y = wt))+
# geom_miss_point()
# from rprofile
# 2022-01-28 copied from old RProfile
cx <- function(x,rowNames = F){
if(.Platform$OS.type == "windows"){
write.table(x = x,"clipboard",sep = "\t",row.names = rowNames)
}
else{
library(clipr)
clipr::write_clip(x)
}
} # this copies data for easy pasting to tables
libs <- function(){source(CRnormalizePath("code\\_libs.R"))}
# figures
gx <- function(myggplot, myDpi=300,figType = 'png', figHeight = 6, figWidth = 8){
if(figType %in% "tiff"){
if(.Platform$OS.type == "windows"){
ggsave(plot = myggplot,filename = paste0("figures\\",gsub(xTempVar,pattern = "\\.",replacement = "-"),"_",gsub(gsub(substr(Sys.time(),start=1,stop=16),pattern = " |-",replacement = "_"),pattern = ":",replacement = ""),".",figType),height = figHeight,width=figWidth,units="in",dpi = myDpi,compression = "lzw")
} else{
ggsave(plot = myggplot,filename = paste0("figures/",gsub(xTempVar,pattern = "\\.",replacement = "-"),"_",gsub(gsub(substr(Sys.time(),start=1,stop=16),pattern = " |-",replacement = "_"),pattern = ":",replacement = ""),".",figType),height = figHeight,width=figWidth,units="in",dpi = myDpi,compression = "lzw")
}
}else{
if(.Platform$OS.type == "windows"){
ggsave(plot = myggplot,filename = paste0("figures\\",gsub(xTempVar,pattern = "\\.",replacement = "-"),"_",gsub(gsub(substr(Sys.time(),start=1,stop=16),pattern = " |-",replacement = "_"),pattern = ":",replacement = ""),".",figType),height = figHeight,width=figWidth,units="in",dpi = myDpi)
} else{
ggsave(plot = myggplot,filename = paste0("figures/",gsub(xTempVar,pattern = "\\.",replacement = "-"),"_",gsub(gsub(substr(Sys.time(),start=1,stop=16),pattern = " |-",replacement = "_"),pattern = ":",replacement = ""),".",figType),height = figHeight,width=figWidth,units="in",dpi = myDpi)
}
}
}
# tables (write_csv)
dx <- function(myTable){
if(.Platform$OS.type == "windows"){
write_csv(x = myTable,file = paste0("output\\",gsub(xTempVar,pattern = "\\.",replacement = "-"),"_",gsub(gsub(substr(Sys.time(),start=1,stop=16),pattern = " |-",replacement = "_"),pattern = ":",replacement = ""),".csv"))
} else{
write_csv(x = myTable,file = paste0("output/",gsub(xTempVar,pattern = "\\.",replacement = "-"),"_",gsub(gsub(substr(Sys.time(),start=1,stop=16),pattern = " |-",replacement = "_"),pattern = ":",replacement = ""),".csv"))
}
}
# %PIPES % #############################
# this is experimental code to allow use of variable names create as text in pipes.
# this creates a GLOBAL variable called xTempVar using the input variable. <<- does this.
# using addinexamplsWV involved github package 'WinVector/addinexamplesWV'
# for new RStudio installs, you may need to load library, then:
# 1) go to Tools > Addins > Browse Addins
# 2) under the 'Name' column, select usrFn1 then click Execute
# -do the same for usrFn2 to let libs work
# also, to get shortcuts working:
# 1) go to Tools > Modify keyboard shortcuts
# 2) search usrFn
# 3) for usrFn1, modify it to ctrl+alt+shift+M (for the new pipe)
# 4) for usrFn2, modify it to ctrl+alt+shift+l (for libraries)
# 5) for usrFn3, modify it to ctrl+alt+shift+; (open directory in explorer)
# 2022-01-28 commented out for version to be used on cluster
# options("addinexamplesWV.usrFn1" = function() { rstudioapi::insertText(" %xTempVar% ") })
# options("addinexamplesWV.usrFn2" = function() { rstudioapi::insertText("libs()") })
# options("addinexamplesWV.usrFn3" = function() { rstudioapi::insertText("shell.exec(getwd())") })
v <- function(mydata){View(mydata)}
vx <- function(mydata){View(mydata,title = xTempVar)}
# 2022-03-24 testing more efficient approach for descriptive table results
func.nPctWithinDplyr <-
function(variable,value){
paste0(sum(variable %in% value)," (",pct(sum(variable %in% value)/n(),1),")")
}
func.meanSD <- function(myVariableNotInQuotes,round=2){
paste0(
round2(mean(myVariableNotInQuotes,na.rm=TRUE),round),
" (",
round2(sd(myVariableNotInQuotes,na.rm=TRUE),round),
")"
)
}
func.medianIqr <- function(myVariableNotInQuotes,round=2){
paste0(
round2(median(myVariableNotInQuotes,na.rm=TRUE),round),
" (",
round2(quantile(myVariableNotInQuotes,0.25,na.rm=TRUE),round),
", ",
round2(quantile(myVariableNotInQuotes,0.75,na.rm=TRUE),round),
")"
)
}
# 2022-09-19 alternative versions of meanSD and medianIQR to be used in conjunction with func.categoryCount_gt
func.meanSD_gt <- function(variableOfInterestInQuotes,variableLabel,columnGroupingInQuotes,myData,tTest = F){
if(is.null(variableLabel)){
variableLabel <- variableOfInterestInQuotes
}
myOutput <- myData %>%
transmute(
columnGroupingVariable = as.character(.data[[columnGroupingInQuotes]]),
measure = .data[[variableOfInterestInQuotes]]
) %>%
bind_rows(
myData %>%
transmute(columnGroupingVariable = "Overall",
measure = .data[[variableOfInterestInQuotes]])
) %>%
# select(columnGroupingVariable,measure)
group_by(columnGroupingVariable) %>%
summarise(
Metric = func.meanSD(measure)
) %>%
pivot_wider(names_from = columnGroupingVariable,values_from = Metric) %>%
mutate(Metric = variableLabel) %>%
relocate(Metric,.before = everything())
if(tTest %in% T){
myOutput <-
myOutput %>%
mutate(
"P-Value" = {t.test(
variableOfInterestInQuotes ~ columnGroupingInQuotes,
data = {
myData %>%
rename(columnGroupingInQuotes = .data[[columnGroupingInQuotes]]) %>%
rename(variableOfInterestInQuotes = .data[[variableOfInterestInQuotes]])
}) %>%
.$p.value %>%
pRound(.)}
)
}
return(myOutput)
}
# func.meanSD_gt(
# myData = mtcars,
# variableOfInterest = "wt",
# variableLabel = "Weight, mean (SD)",
# columnGroupingInQuotes = "vs"
# )
func.medianIqr_gt <- function(variableOfInterestInQuotes,variableLabel,columnGroupingInQuotes,myData){
if(is.null(variableLabel)){
variableLabel <- variableOfInterestInQuotes
}
myData %>%
transmute(
columnGroupingVariable = as.character(.data[[columnGroupingInQuotes]]),
measure = .data[[variableOfInterestInQuotes]]
) %>%
bind_rows(
myData %>%
transmute(columnGroupingVariable = "Overall",
measure = .data[[variableOfInterestInQuotes]])
) %>%
# select(columnGroupingVariable,measure)
group_by(columnGroupingVariable) %>%
summarise(
Metric = func.medianIqr(measure)
) %>%
pivot_wider(names_from = columnGroupingVariable,values_from = Metric) %>%
mutate(Metric = variableLabel) %>%
relocate(Metric,.before = everything())
}
# func.medianIqr_gt(
# myData = mtcars,
# variableOfInterest = "wt",
# variableLabel = "Weight, median (IQR)",
# columnGroupingInQuotes = "vs"
# )
# 2022-09-21 simplified chi square test
func.chiSquareTest <- function(mydata,variableOfInterestInQuotes,interventionVariableInQuotes){
mydata %>%
rename(myVar = .data[[variableOfInterestInQuotes]]) %>%
rename(intervention = .data[[interventionVariableInQuotes]]) %>%
select(myVar,intervention) %>%
table %>%
prop.test %>%
.$p.value %>%
pRound()
}
# category count function optimized for gt package
func.categoryCount_gt <- function(
mydata,
categoryVariableInQuotes,
columnGroupingInQuotes = NULL,
categoryVariableLabel = NULL,
formattedOutput = T,
rowOrColumnPercent = "row",
chiSquareTest = F
){
# creating label for Metric column
if(is.null(categoryVariableLabel)){
categoryLabel <- categoryVariableInQuotes
} else {
categoryLabel <- categoryVariableLabel
}
# fixing factor issue
if(is.factor(mydata %>% pull(.data[[categoryVariableInQuotes]]))){
factorLevels <- levels(mydata %>% pull(.data[[categoryVariableInQuotes]]))
mydata <- mydata %>%
mutate(!!categoryVariableInQuotes := as.character(.data[[categoryVariableInQuotes]]))
}
if(is.null(columnGroupingInQuotes)){
myOutput <-
mydata %>%
# mutate(!!categoryVariableInQuotes := as.character(.data[[categoryVariableInQuotes]])) %>%
group_by(.data[[categoryVariableInQuotes]]) %>%
summarise(Overall=n()) %>%
mutate(Percent = pct(Overall/sum(Overall),2)) %>%
add_column(Metric = categoryLabel,.before = 1) %>%
rename("Category" = .data[[categoryVariableInQuotes]])
} else {
myOutput <-
bind_rows(
mydata,
mydata %>% mutate(!!columnGroupingInQuotes := "Overall")
) %>%
group_by(.data[[categoryVariableInQuotes]],.data[[columnGroupingInQuotes]]) %>%
summarise(n=n())
if(formattedOutput == T){
if(tolower(rowOrColumnPercent) %in% "row"){
myOutput <- myOutput %>%
ungroup() %>%
group_by(.data[[categoryVariableInQuotes]]) %>%
mutate(n2 = case_when(
!.data[[columnGroupingInQuotes]] %in% "Overall" ~ paste0(n," (",pct(n/sum(n[!.data[[columnGroupingInQuotes]] %in% "Overall"]),2),")"),
T ~ as.character(n)
)) %>%
select(-n) %>%
rename(n = n2) %>%
ungroup()
}else if(tolower(rowOrColumnPercent) %in% "column"){
myOutput <- myOutput %>%
ungroup() %>%
group_by(.data[[columnGroupingInQuotes]]) %>%
mutate(n = paste0(n," (",pct(n/sum(n),2),")")) %>%
ungroup()
}else {
print("error: must use \"row\" or \"column\"")
}
myOutput <- myOutput %>%
group_by(.data[[categoryVariableInQuotes]],.data[[columnGroupingInQuotes]]) %>%
pivot_wider(names_from = .data[[columnGroupingInQuotes]],values_from = n) %>%
ungroup() %>%
add_column(Metric = categoryLabel,.before = 1) %>%
mutate_at(vars(-Metric,-.data[[categoryVariableInQuotes]]),~ifelse(is.na(.) & !is.na(.data[[categoryVariableInQuotes]]),0,.)) %>%
mutate_all(~ifelse(is.na(.),"",.)) %>%
rename("Category" = .data[[categoryVariableInQuotes]])
if(exists("factorLevels")){
myOutput <- myOutput %>%
mutate(Category = factor(
Category,
levels = c("",factorLevels),
ordered = T
)) %>%
arrange(Category) %>%
mutate(Category = factor(Category,ordered = F))
}
if(chiSquareTest %in% T){
p.value <- func.chiSquareTest(
mydata = mydata,
variableOfInterestInQuotes = categoryVariableInQuotes,
interventionVariableInQuotes = columnGroupingInQuotes
)
myOutput <-
myOutput %>%
mutate("P-Value" = c(p.value,rep("",nrow(myOutput)-1)))
}
return(myOutput)
}else{
return(myOutput)
}
}
}
func.categoryCount <- function(
mydata,
categoryVariableInQuotes,
columnGroupingInQuotes = NULL,
categoryVariableLabel = NULL,
formattedOutput = T,
rowOrColumnPercent = "row"
){
# creating label for Metric column
if(is.null(categoryVariableLabel)){
categoryLabel <- categoryVariableInQuotes
} else {
categoryLabel <- categoryVariableLabel
}
# fixing factor issue
if(is.factor(mydata %>% pull(.data[[categoryVariableInQuotes]]))){
factorLevels <- levels(mydata %>% pull(.data[[categoryVariableInQuotes]]))
mydata <- mydata %>%
mutate(!!categoryVariableInQuotes := as.character(.data[[categoryVariableInQuotes]]))
}
if(is.null(columnGroupingInQuotes)){
myOutput <-
mydata %>%
# mutate(!!categoryVariableInQuotes := as.character(.data[[categoryVariableInQuotes]])) %>%
group_by(.data[[categoryVariableInQuotes]]) %>%
summarise(Overall=n()) %>%
mutate(Percent = pct(Overall/sum(Overall),2)) %>%
add_column(Metric = NA,.before = 1) %>%
add_row(Metric = categoryVariableLabel,.before = 1) %>%
mutate_at(vars(Metric,.data[[categoryVariableInQuotes]]),~ifelse(is.na(.),"",.)) %>%
rename("Category" = .data[[categoryVariableInQuotes]])
} else {
myOutput <-
bind_rows(
mydata,
mydata %>% mutate(!!columnGroupingInQuotes := "Overall")
) %>%
group_by(.data[[categoryVariableInQuotes]],.data[[columnGroupingInQuotes]]) %>%
summarise(n=n())
if(formattedOutput == T){
if(tolower(rowOrColumnPercent) %in% "row"){
myOutput <- myOutput %>%
ungroup() %>%
group_by(.data[[categoryVariableInQuotes]]) %>%
mutate(n2 = case_when(
!.data[[columnGroupingInQuotes]] %in% "Overall" ~ paste0(n," (",pct(n/sum(n[!.data[[columnGroupingInQuotes]] %in% "Overall"]),2),")"),
T ~ as.character(n)
)) %>%
select(-n) %>%
rename(n = n2) %>%
ungroup()
}else if(tolower(rowOrColumnPercent) %in% "column"){
myOutput <- myOutput %>%
ungroup() %>%
group_by(.data[[columnGroupingInQuotes]]) %>%
mutate(n = paste0(n," (",pct(n/sum(n),2),")")) %>%
ungroup()
}else {
print("error: must use \"row\" or \"column\"")
}
myOutput <- myOutput %>%
group_by(.data[[categoryVariableInQuotes]],.data[[columnGroupingInQuotes]]) %>%
pivot_wider(names_from = .data[[columnGroupingInQuotes]],values_from = n) %>%
ungroup() %>%
add_column(Metric = NA,.before = 1) %>%
add_row(Metric = categoryLabel,.before = 1) %>%
mutate_at(vars(-Metric,-.data[[categoryVariableInQuotes]]),~ifelse(is.na(.) & !is.na(.data[[categoryVariableInQuotes]]),0,.)) %>%
mutate_all(~ifelse(is.na(.),"",.)) %>%
rename("Category" = .data[[categoryVariableInQuotes]])
if(exists("factorLevels")){
myOutput <- myOutput %>%
mutate(Category = factor(
Category,
levels = c("",factorLevels),
ordered = T
)) %>%
arrange(Category) %>%
mutate(Category = factor(Category,ordered = F))
}
return(myOutput)
}else{
return(myOutput)
}
}
}
# example for func.categoryCount():
# func.categoryCount(categoryVariableInQuotes = "cyl",columnGroupingInQuotes = "gear",mydata = mtcars %>% mutate(gear = as.character(gear)),categoryVariableLabel = "Cylinders",formattedOutput = T,rowOrColumnPercent = "row")
# 2022-05-06 NHAMCS estimator
func.nhamcsEstimator <- function(
mydata = dat.nh,
mysurveydata = dat.nh.survey,
groupingVariablesVectorInQuotes = c("YEAR","SEX","MSA")
){
library(srvyr)
options(survey.lonely.psu="adjust")
for(i in 1:length(groupingVariablesVectorInQuotes)){
mydata <- mydata %>%
group_by(.data[[groupingVariablesVectorInQuotes[i]]],.add = T)
mysurveydata <- mysurveydata %>%
group_by(.data[[groupingVariablesVectorInQuotes[i]]],.add = T)
}
unreliableUnweightedRows <- mydata %>%
summarise(
n = n()
) %>%
mutate(UNWEIGHTED = ifelse(n<30,"UNRELIABLE",""))
unreliableWeightedRows <- mysurveydata %>%
summarise(
n_weighted = survey_total(vartype = c("se","ci"))
) %>%
mutate(se_prop = n_weighted_se/n_weighted) %>%
mutate(WEIGHTED = ifelse(se_prop>=0.30,"UNRELIABLE",""))
allUnreliableRows <- full_join(
unreliableWeightedRows,
unreliableUnweightedRows,
by = groupingVariablesVectorInQuotes
) %>%
mutate(UNRELIABLE = ifelse(WEIGHTED %in% "UNRELIABLE" | UNWEIGHTED %in% "UNRELIABLE","UNRELIABLE","")) %>%
relocate(WEIGHTED,.after = "UNWEIGHTED") %>%
rename(n_weighted_lowerCI = n_weighted_low,
n_weighted_upperCI = n_weighted_upp)
# print("Unreliable Estimates Reported With Asterisks")
return(allUnreliableRows)
}
func.nhamcsEstimator_experimental <- function(
mydata = dat.nh,
mysurveydata = dat.nh.survey,
# note: always make sure 'year' or 'yearbin' is first
groupingVariablesVectorInQuotes = c("YEAR","SEX","MSA"),
wide_or_long = "long",
# note: measureLabel only works for one secondary variable (besides year)
measureLabel = NA,
convertToPercent = F,
yearbins = F
){
# if(nrow(mydata)!=nrow(mysurveydata)){
# print("ERROR: MISMATCH BETWEEN REGULAR DATASET AND SURVEY DATASET SIZES")
# } else{
library(srvyr)
options(survey.lonely.psu="adjust")
for(i in 1:length(groupingVariablesVectorInQuotes)){
mydata <- mydata %>%
group_by(.data[[groupingVariablesVectorInQuotes[i]]],.add = T)
mysurveydata <- mysurveydata %>%
group_by(.data[[groupingVariablesVectorInQuotes[i]]],.add = T)
}
unreliableUnweightedRows <- mydata %>%
summarise(
n = n()
) %>%
mutate(UNWEIGHTED = ifelse(n<30,"UNRELIABLE",""))
unreliableWeightedRows <- mysurveydata %>%
summarise(
n_weighted = survey_total(vartype = c("se","ci"))
) %>%
mutate(se_prop = n_weighted_se/n_weighted) %>%
mutate(WEIGHTED = ifelse(se_prop>=0.30,"UNRELIABLE",""))
allUnreliableRows <- full_join(
unreliableWeightedRows,
unreliableUnweightedRows,
by = groupingVariablesVectorInQuotes
) %>%
mutate(UNRELIABLE = ifelse(WEIGHTED %in% "UNRELIABLE" | UNWEIGHTED %in% "UNRELIABLE","UNRELIABLE","")) %>%
relocate(WEIGHTED,.after = "UNWEIGHTED") %>%
rename(n_weighted_lowerCI = n_weighted_low,
n_weighted_upperCI = n_weighted_upp)
# conversion to percent
if(convertToPercent %in% T){
allUnreliableRows <-
allUnreliableRows %>%
group_by(.data[[names(allUnreliableRows[1])]]) %>%
mutate(n_weighted = pct(n_weighted/sum(n_weighted,na.rm=T),2)) %>%
ungroup()
}
# print("Unreliable Estimates Reported With Asterisks")
if(wide_or_long %in% "long"){
return(allUnreliableRows)
}
else if(wide_or_long %in% "wide"){
nhamcsEstimate <- allUnreliableRows
# function(nhamcsEstimate,measureLabel = NA,pct = F,yearbins = F){
nhamcsEstimate <-
nhamcsEstimate %>%
ungroup()
if(convertToPercent %in% F){
nhamcsEstimate <-
nhamcsEstimate %>%
mutate(across(c(n_weighted,n_weighted_se,n_weighted_lowerCI,n_weighted_upperCI,se_prop),function(x){round2(x,0)}))
} else if(convertToPercent %in% T){
nhamcsEstimate <-
nhamcsEstimate %>%
mutate(across(c(n_weighted_se,n_weighted_lowerCI,n_weighted_upperCI,se_prop),function(x){round2(x,0)}))
}
if(yearbins == F){
nhamcsEstimate <-
nhamcsEstimate %>%
mutate(across(c(n_weighted:n),function(x){ifelse(!UNRELIABLE %in% "",paste0(x," (unreliable)"),as.character(x))})) %>%
select(YEAR:n_weighted) %>%
pivot_wider(names_from = YEAR,values_from = n_weighted)
} else if(yearbins == T){
nhamcsEstimate <-
nhamcsEstimate %>%
mutate(across(c(n_weighted:n),function(x){ifelse(!UNRELIABLE %in% "",paste0(x," (unreliable)"),as.character(x))})) %>%
select(cr.yearBins:n_weighted) %>%
pivot_wider(names_from = cr.yearBins,values_from = n_weighted)
}
minimumYearGroup <- nhamcsEstimate %>% names %>% sort %>% .[1]
if(names(nhamcsEstimate)[1] == minimumYearGroup){
return(nhamcsEstimate)
} else{
namesList <- nhamcsEstimate %>% select(1:all_of(minimumYearGroup))
namesList <- namesList %>% select(-length(namesList)) %>% names
if(length(namesList) >1){
for(i in 1:length(namesList)){
nhamcsEstimate <-
nhamcsEstimate %>%
mutate(across(i,function(x){paste0(namesList[i],": ",x)})) %>%
rowwise() %>%
mutate("Measure Categories" = paste0(across(namesList),collapse = ", "))
}
nhamcsEstimate <-
nhamcsEstimate %>%
ungroup() %>%
select(-namesList) %>%
relocate("Measure Categories",.before = everything())
} else {
if (is.na(namesList[1])){
measureLabel <- namesList[1]
}
nhamcsEstimate <-
bind_rows(
data.frame("Measure Name" = as.character(measureLabel),check.names = F),
nhamcsEstimate
) %>%
rename("Measure Categories" = namesList[1]) %>%
relocate(`Measure Categories`, .after = `Measure Name`)
}
return(nhamcsEstimate)
# }
}
}
}
# }
# 2022-06-13 outlier detection
CRoutlierCheck <- function(mydata,printWholeRow = F){
for(i in 1:ncol(mydata)){
if(!length(which(mydata[,i] %in% boxplot.stats(mydata[,i])$out)) == 0){
# cat(paste0("Variable: ",paste0(names(mydata %>% select(i)),collapse = ", ","\n")))
if(printWholeRow %in% T){
print(
mydata %>%
mutate(row_number = 1:n()) %>%
relocate(row_number,.before = everything()) %>%
slice(which(mydata[,i] %in% boxplot.stats(mydata[,i])$out))
)
cat('\n')
} else{
# print(
# mydata %>%
# mutate(row_number = 1:n()) %>%
# relocate(row_number,.before = everything()) %>%
# select(row_number,all_of(i+1)) %>%
# slice(which(mydata[,i] %in% boxplot.stats(mydata[,i])$out))
# )
cat('\n')
}
}
}
}
# reference code:
# not in a function for now, but code can be copied -
CRdesplit <- function(){print(
"reduce(full_join, by = names(.[[1]]))"
)}
# reverse the use of split to combine data frames after using split() and map() functions
# reduce(full_join, by = names(.[[1]]))
CRgt_code <- function(){
mtcars %>%
rownames_to_column() %>%
arrange(factor(cyl)) %>%
# adding a currency column
mutate(currency = 10000:(10000+n()-1)) %>%
group_by(cyl) %>%
gt(rowname_col = "rowname") %>%
# title/ subtitle
tab_header(
title = md("Mtcars Heading Here"),
subtitle = "Mtcars Subheading Here"
) %>%
# heading over specific column/columns
tab_spanner(
label = "MPG, Disp, and HP",
columns = c(mpg,disp,hp)
) %>%
# align rowname
tab_style(
style = list(
cell_text(align = "left",indent = pct(2.5))
),
locations = cells_stub(rows = TRUE)
) %>%
# align columns
cols_align(
align = "center",
columns = c(mpg:carb)
) %>%
# add footnote (footnote style 1)
tab_source_note(
source_note = md(
"Source: Base R"
)
) %>%
# add footnotes (NUMBERED footnotes linking to column or cell)
tab_footnote(
footnote = md("this is the first numbered footnote, pointing to a cell"),
# cells_body indicates a cell rather than column name
locations = cells_body(
columns = am,
rows = "Datsun 710"
)
) %>%
tab_footnote(
footnote = md("second numbered footnote, pointing to a column"),
# cells_column_labels indicates a column name rather than cell
locations = cells_column_labels(
columns = vs
)
) %>%
# bold headers
tab_style(
style = list(
cell_text(weight = "bold")
),
locations = cells_column_labels(columns = c(mpg:currency))
) %>%
tab_style(
style = list(
cell_text(weight = "bold")
),
locations = cells_column_labels(columns = c(mpg:currency))
) %>%
# color cell column
data_color(
columns = mpg,
colors = scales::col_numeric(
palette = c("red", "orange", "green", "blue"),
domain = c(min(mpg),max(mpg))
)
) %>%
# color row
tab_style(
style = list(
cell_fill(color = "#D9654B")
),
locations = cells_body(
# columns = c(mpg:currency), # not needed if coloring all columns
rows = 3)
) %>%
# converting a column to currency
fmt_currency(
columns = currency,
currency = "USD",
decimals = 0
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.