R/trahelyk.R

Defines functions an.id fix.mixed.dates fix.Date year.length univ.coxph.mids univ.coxph tidy.univ.coxph tidy.survest survest univ.lm compare.model fmt.pct fmt.pval rnd grepcol cnt_distinct apply_label apply_labels apply_label_vector laglead fact0rny factorny cols.except inline_ci.lme inline_ci.default inline_ci present_or ci_fmt sem cnt tocase look is_unique clear_labels batman

Documented in apply_label apply_labels apply_label_vector ci_fmt clear_labels cnt fact0rny is_unique look sem tocase

options(stringsAsFactors = FALSE)
default.stringsAsFactors()


# Generate n NA values in a vector
batman <- function(n) {
  return(rep(NA, n))
}

#' Remove all Hmisc labels from a data frame
#'
#' @param x Data frame to clean
#'
#' @return Data frame
#' @export
clear_labels <- function(x) {
  if(is.list(x)) {
    for(i in seq_along(x)) {
      class(x[[i]]) <- setdiff(class(x[[i]]), 'labelled') 
      attr(x[[i]],"label") <- NULL
    } 
  } else {
    class(x) <- setdiff(class(x), "labelled")
    attr(x, "label") <- NULL
  }
  return(x)
}

#' Determine whether a column in a data frame is unique across all rows
#'
#' @param x A data frame
#' @param guid Name of the column to evaluate for uniqueness
#' @param quiet Logical. If FALSE, the function echoes the number of unique values of `guid`; if TRUE, it only returns a logical.
#'
#' @return Logical indicating whether the guid is unique across rows
#' @export
#'
#' @examples
#' foo <- tibble(A = c(1, 2, 3, 4, 5),
#'               B = c("A", "B", "B", "C", "D"))
#' is_unique(foo, "A")
#' is_unique(foo, "B")
is_unique <- function(x, guid, quiet=FALSE) {
  n_guids <- cnt_distinct(x[[guid]])
  n_rows <- nrow(x)
  if(!quiet) {
    print(paste0("Unique guids: ", n_guids))
    print(paste0("Rows in data frame: ", n_rows))
  }
  return(n_guids == n_rows)
}

#' Display a summary and SD of a numeric vector
#'
#' @param x A numeric vector
#'
#' @return Summmary and SD
#' @export
#'
#' @examples
#' foo <- rnorm(n=20)
#' look(foo)
look <- function(x) {
  print(summary(x))
  print(paste0("SD = ", sd(x)))
}

#' Change case of a character vector
#'
#' @param x a character vector
#' @param case one of "upper", "lower", "title", or "sentence". 
#'
#' @return A character vector of the same length, with the first letter of each word, separated by a space, capitalized. 
#' @export
#'
#' @examples
#' x <- "the quick BROWN fox jumps over the LaZy Dog."
#' totitle(x)
tocase <- function(x, case) {
  if(case=="lower") {
    return(tolower(x))
  } else if(case=="upper") {
    return(toupper(x))
  } else if(case %in% c("title", "sentence")) {
    if(case=="title") {
      return(sapply(strsplit(x, " "), 
                    FUN = function(x) { if(is.na(x[1])) {
                      return(NA)
                      } else return(paste(toupper(substring(x, 1, 1)), 
                                             tolower(substring(x, 2)),
                                             sep = "", collapse = " ") %>%
                                      stringr::str_replace_all(pattern = " [Oo][Ff] ", " of "))
                    }, 
                    USE.NAMES = !is.null(names(x))))
    } else if(case=="sentence") {
      return(na_if(paste0(toupper(substring(x, 1, 1)), tolower(substring(x, 2))), 
                   "NANA"))
    }
  }
}

# Counter
#' Count observations and number of unique identifiers 
#'
#' @param DT A data.table, tibble, or data.frame
#' @param current_count Previous list created by cnt(), containing the last count
#' @param guid Name of the globally unique identifier to count
#'
#' @return List object containing the number of observations in element one, and the number of unique guids in element two
#' @export
#'
#' @examples
cnt <- function(DT=0, current_count=NULL, guid=c(getOption("guid"))) {
  N <- nrow(DT)
  guids_cnt <- map_int(guid, function(y) {uniqueN(DT[[y]])})
  
  cat(paste0("## Count ", deparse(substitute(DT)), ": \n"))
  cat(paste0("##  ", N, " observations \n",
             paste("## ", guids_cnt, "unique", guid, "numbers\n", collapse="")))
  if(length(current_count)>0) {
    cat(paste0("\n## \n",
               "## Change: \n",
               "##   ", N - current_count$N, " observations \n",
               "##   ", guids_cnt - current_count$guids_cnt, " unique ", guid, " numbers\n"))
  }
  return(invisible(list(N = N,
                        guids_cnt = guids_cnt)))
}


#' Calculate a standard error
#' 
#' @param x A numeric vector
#' @param s A standard deviation
#' @param n N corresponding to s
#' @
#' @return Standard error of the mean (numeric)

sem <- function(x=NA, s=NA, n=NA) {
  if(length(x) > 1) {
    sem <- sd(x)/sqrt(length(x))
  } else {
    if(is.na(s) | is.na(n)) stop("Must provide a numeric vector or s and n.")
    sem <- s/sqrt(n)
  }
  return(sem)
}

#' Calculate a CI and return it as a formatted string
#' 
#' @param x A numeric or integer vector
#' @param d An integer specifying the number of significant digits to be presented in the point estimate and interval.
#' @param dist A character string specifying the distribution from which the interval should be calculated. Currently supports "normal" or "t".
#' @
#' @return A character string with the point estimate of the mean, followed by the interval in parentheses.

ci_fmt <- function(x, d=2, dist="normal") {
  if(dist=="normal") {
    dist_cut <- c(qnorm(0.975))
  } else if(dist=="t") {
    dist_cut <- c(qt(0.975, df=n-1))
  }
  
  n <- sum(!is.na(x))
  se <- sd(x, na.rm=TRUE)/sqrt(n)
  xbar <- mean(x, na.rm=TRUE) 
  return(paste0(rnd(xbar, d), " (", 
                rnd(xbar - dist_cut*se, d), ", ",
                rnd(xbar + dist_cut*se, d), ")"))
}


# Present a specified odds ratio from a glm object as a character string (for use in narrative)
present_or <- function(mdl, varname, d=2, fmt="noparens") {
  est <- rnd(exp(coef(mdl)[varname]), d)
  ci <- as.tibble(t(exp(confint.default(mdl))))[[varname]]
  lb <- rnd(ci[1], d)
  ub <- rnd(ci[2], d)
  if(fmt=="parens") {
    x <- paste0(est, " (95\\% CI: ", lb, ", ", ub, ")")
  } else {
    x <- paste0(est, ", 95\\% CI: ", lb, ", ", ub)
  }
  return(x)
}

# -------------------------------------------------------------------------------- 
# Reports a 95% CI for model coefficients. Ideal for in-line R code in a knitr document.
# Allows the user to specify confidence level and whether results should be exponentiated.
# --------------------------------------------------------------------------------
# Define a generic function
inline_ci <- function(mdl, parm, sep="paren", digits=2, exp=FALSE, level=0.95) UseMethod("inline_ci")

inline_ci.default <- function(mdl, parm, sep="paren", digits=2, exp=FALSE, level=0.95) {
  if(sep=="paren") {
    seps <- c(" (", ")")
  } else {
    seps <- c(", ", "")
  }
  if(exp)
  {
    return(paste0(as.character(rnd(exp(mdl$coefficients[[parm]]),digits)), 
                  seps[1], "95\\% CI: ",
                  as.character(rnd(exp(confint(mdl, parm=parm, level=level))[1],digits)),
                  ", ",
                  as.character(rnd(exp(confint(mdl, parm=parm, level=level))[2],digits)),
                  seps[2]))
  }
  else
  {
    return(paste0(as.character(rnd(mdl$coefficients[[parm]],digits)), 
                  seps[1], "95\\% CI: ",
                  as.character(rnd(confint(mdl, parm=parm, level=level)[1],digits)),
                  ", ",
                  as.character(rnd(confint(mdl, parm=parm, level=level)[2],digits)),
                  seps[2]))
  }
}

inline_ci.lme <- function(mdl, parm, sep="paren", digits=2, exp=FALSE, level=0.95) {
  if(sep=="paren") {
    seps <- c(", (", ")")
  } else {
    seps <- c(", ", "")
  }
  coefs <- intervals(mdl, which="fixed")$fixed
  coefs <- bind_cols(tibble(var = rownames(coefs)), as.tibble(coefs))
  coefs %<>%
    gather(key = est, value = value, 2:ncol(coefs)) %>% 
    spread_(key = names(coefs)[1],value = 'value')
  
  if(exp)
  {
    return(paste0(as.character(rnd(exp(coefs[[parm]][1]),digits)), 
                  seps[1], "95\\% CI: ",
                  as.character(rnd(exp(coefs[[parm]][2]),digits)),
                  ", ",
                  as.character(rnd(exp(coefs[[parm]][3]),digits)),
                  seps[2]))
  }
  else
  {
    return(paste0(as.character(rnd(coefs[[parm]][1],digits)), 
                  seps[1], "95\\% CI: ",
                  as.character(rnd(coefs[[parm]][2],digits)),
                  ", ",
                  as.character(rnd(coefs[[parm]][3],digits)),
                  seps[2]))
  }
}

# Return all the column names of a data frame except for those specified
cols.except <- function(df, except) {
  colnames(df)[!(colnames(df) %in% except)]
}

# Return a 2-level factor with labels "No" and "Yes"
factorny <- function(x) {
  return(factor(x, labels = c("No", "Yes")))
}

#' Return a 2-level factor with labels "No" and "Yes"
#'
#' @param x A binary (0,1) numeric vector
#'
#' @return A factor with levels and lables c("No", "Yes")

fact0rny <- function(x) {
  return(factor(x, levels=c(0,1), labels = c("No", "Yes")))
}

# Define laglead function
laglead<-function(x,shift_by){
  stopifnot(is.numeric(shift_by))
  #stopifnot(is.numeric(x))
  
  if (length(shift_by)>1)
    return(sapply(shift_by,shift, x=x))
  
  out<-NULL
  abs_shift_by=abs(shift_by)
  if (shift_by > 0 )
    out<-c(tail(x,-abs_shift_by),rep(NA,abs_shift_by))
  else if (shift_by < 0 )
    out<-c(rep(NA,abs_shift_by), head(x,-abs_shift_by))
  else
    out<-x
  out
}

#' Apply a character vector as labels for a data frame
#'
#' @param df 
#' @param labels 
#'
#' @return
#' @export
#'
#' @examples
#' foo <- tibble(a = c(1,2,3),
#'               b = c(4,5,6)) %>%
#'   apply_label_vector(c("A", "B))
#' label(foo)
apply_label_vector <- function(df, labels) {
  for(i in 1:length(df)) {
    Hmisc::label(df[[i]]) <- labels[i]
  }
  return(df)
}

#' Apply Hmisc-brand labels to one or more variables in a dataframe or tibble
#' 
#' @param df A data frame or tibble.
#' @param ... Name-value pairs of column names and labels
#' @return A data frame or tibble.
#' @examples
#' foo <- tibble(a = c(1,2,3),
#'               b = c(4, 5,6)) %>%
#'   apply_labels(a = "A", 
#'                b = "B"))
#' label(foo)
apply_labels <- function(x, ...) {
  var_list <- names(list(...))
  var_lbls <- list(...) %>% unlist() %>% as.vector()
  
  for(i in seq_along(var_list)) {
    Hmisc::label(x[[var_list[i]]]) <- var_lbls[i]
  }
  
  return(x)
}

#' Apply an Hmisc-brand label to a single variable in a dataframe or tibble
#' 
#' @param df A data frame or tibble.
#' @param x Unquoted name of the column to be labeled. 
#' @param lbl Quoted label.
#' @return A data frame or tibble.
#' @examples
#' foo <- tibble(a = c(1,2,3),
#'               b = c(4, 5,6)) %>%
#'   apply_label(a, "A") %>%
#'   apply_label(b, "B")
#'   
#' label(foo)
apply_label <- function(df, x, lbl) {
  Hmisc::label(df[[deparse(substitute(x))]]) <- lbl
  return(df)
}


# Count distinct observations in a vector
cnt_distinct <- function(x) {
  return(length(unique(x)))
}

# Search column names of a data frame for specified string
grepcol <- function(phrase, df) {
  return(colnames(df)[grep(phrase, colnames(df), ignore.case=TRUE)])
}

# A simple rounding function that returns exactly the number of digits requested. 
# Defaults to a character string.
# This function is really just a wrapper for format and round. 
rnd <- function(x,d=3,as.numeric=FALSE) {
  if(as.numeric)
    return(as.numeric(format(round(x,d), nsmall=d, scientific=FALSE)))
  else
    return(format(round(x,d), nsmall=d, scientific=FALSE))
}

# Returns a character-formatted version of a p-value, including markup
# to indicate when p is less than the minimum value in the specified number
# of decimal-place digits.
fmt.pval <- function(pval, digits=3, include.p=TRUE, latex=FALSE, md=FALSE) {
  p.df <- as.data.frame(cbind(1:length(pval), pval))
  colnames(p.df) <- c("order", "p")
  if(latex) {
    lt <- "\\textless"
  } else {
    lt <- "<"
  }
  if(md) {
    spc <- "\ "
  } else {
    spc <- " "
  }
  if(include.p) {
    prefix.1 <- paste0("p", spc, lt, spc)
    prefix.2 <- paste0("p", spc, "=", spc)
  }
  else{
    prefix.1 <- lt
    prefix.2 <- ""
  }
  p.df[p.df$p*(10^(digits)) < 1 & !is.na(p.df$p),c("p.fmt")] <- paste0(prefix.1, format(1/(10^digits), scientific=FALSE))
  p.df[p.df$p*(10^(digits)) >= 1 & !is.na(p.df$p),c("p.fmt")] <- paste0(prefix.2, 
                                                                       as.character(rnd(p.df$p[p.df$p*(10^(digits)) >= 1 & !is.na(p.df$p)],digits)))
  p.df[is.na(p.df$p),c("p.fmt")] <- ""
  
  return(p.df$p.fmt)
}

# Convert proportion (0 < p < 1) to a percent (0 < pct < 100)
fmt.pct <- function(x, d=0, as.numeric=FALSE, latex=FALSE) {
  if(latex) pct.char <- "\\%"
  else pct.char <- "%"
  if(as.numeric)
    return(as.numeric(format(round(x*100,d), nsmall=d)))
  else
    return(paste0(format(round(x*100,d), nsmall=d), pct.char))
}

# Function for comparing models to check for confounding.
### SHOULD MODIFY THIS FUNCTION TO INCLUDE P-VALUES
compare.model <- function(mdl.0, mdl.1, cnfd.level=0.2) {
  comp <- merge(cbind(row=rownames(as.data.frame(mdl.0$coef)), coef=as.data.frame(mdl.0$coef), stringsAsFactors = FALSE), 
                cbind(row=rownames(as.data.frame(mdl.1$coef)), coef=as.data.frame(mdl.1$coef), stringsAsFactors = FALSE), 
                by="row", all=TRUE)
  comp$reldiff <- comp[,2] / comp[,3]
  comp$confound <- ""
  comp$confound[(1-comp$reldiff > cnfd.level) | (comp$reldiff-1 > cnfd.level)] <- "*"
  comp <- rbind(comp, c("AIC", AIC(mdl.0), AIC(mdl.1), AIC(mdl.0) - AIC(mdl.1), ""))
  return(comp)
}

# Produces a series of univariable linear regression models and wraps them in LaTeX.
univ.lm <- function(y, x, df, rownames=NULL, caption="", digits=2, label="", asymp.nml=TRUE) {  
  univ.models <- data.frame(matrix(vector(), 0, 6))
  for(pred in x) {
    mdl <- lm(formula(paste0(y, " ~ ", pred)), data=df)
    if(asymp.nml) {
      ci <- confint.default(mdl)
    } else {
      ci <- confint(mdl)
    }
    univ.models <- rbind(univ.models, cbind(summary(mdl)$coef, ci))
  }
  univ.models <- univ.models[!grepl("(Intercept)", row.names(univ.models)),]
  colnames(univ.models) <- c("coef", "SE", "t", "p", "LB", "UB")
  univ.models <- univ.models[,c("coef", "SE", "LB", "UB", "p")]
  
  for (i in 1:length(univ.models$p)) {
    univ.models$p[i] <- fmt.pval(as.numeric(univ.models$p[i]), digits=digits, include.p=FALSE, latex=FALSE)
  }
  
  if(is.null(rownames)) rownames <- rownames(univ.models)
  
  xt.um <- xtable(univ.models, caption=caption, digits=digits, label=label)
  align(xt.um) <- "lrrrrr"
  rownames(xt.um) <- rownames
  print(xt.um,
        include.rownames=TRUE,
        caption.placement="top",
        floating=TRUE,
        table.placement="H")
}


##################################################################################################
# Functions for survival analysis
##################################################################################################

# Formats estimates of survival probability at specified intervals.
survest <- function(sf, times, caption="-year survival probability", label="survtime") {
  for(i in 1:length(times)) {
    foo <- summary(sf, time=times[i])
    latex(data.frame(strata = foo$strata[],
                     time = foo$time,
                     n.risk = foo$n.risk,
                     n.event = foo$n.event,
                     survival.probability = foo$surv,
                     lowerCI = foo$lower,
                     upperCI = foo$upper), 
          digits=3, caption=paste0(times[i], caption), 
          label=paste0(label, times[i]),
          file="", 
          rowname=NULL, size="footnotesize", where="!htbp")  
  }
}

tidy.survest <- function(sf, times, caption="-year survival probability", label="survtime") {
  outt <- data.frame()
  for(i in 1:length(times)) {
    x <- summary(sf, time=times[i], extend=TRUE)
    outt <- rbind(outt, data.frame(strata = x$strata[],
                                   time = x$time,
                                   n.risk = x$n.risk,
                                   n.event = x$n.event,
                                   survival.probability = x$surv,
                                   lowerCI = x$lower,
                                   upperCI = x$upper))
  }
  return(outt)
}

# Produces a series of univariate Cox PH regression models and returns a tidy dataset.
tidy.univ.coxph <- function(timevar, death.indicator, preds, df, rownames=NULL, digits=2) {  
  univ.models <- data.frame(matrix(vector(), 0, 8))
  for(pred in preds) {
    surv.obj <- Surv(df[[timevar]], df[[death.indicator]])
    mdl <- coxph(formula(paste0("surv.obj ~ ", pred)), data=df)
    univ.models <- rbind(univ.models, cbind(n=mdl$n, summary(mdl)$coef, exp(confint(mdl))))
  }
  univ.models <- univ.models[!grepl("(Intercept)", row.names(univ.models)),]
  colnames(univ.models) <- c("n", "coef", "HR", "SE", "z", "p", "LB", "UB")
  univ.models <- univ.models[,c("n", "coef", "SE", "HR", "LB", "UB", "p")]
  
  for (i in 1:length(univ.models$p)) {
    univ.models$p[i] <- fmt.pval(as.numeric(univ.models$p[i]), digits=digits, include.p=FALSE, latex=FALSE)
  }
  
  if(is.null(rownames)) rownames <- rownames(univ.models)
  
  rownames(univ.models) <- rownames
  
  return(univ.models)
}

# Produces a series of univariate Cox PH regression models and wraps them in LaTeX.
univ.coxph <- function(timevar, death.indicator, preds, df, rownames=NULL, caption="", digits=2, label="", size="footnotesize") {  
  univ.models <- tidy.univ.coxph(timevar = timevar,
                                 death.indicator = death.indicator,
                                 preds = preds,
                                 df = df,
                                 rownames = rownames,
                                 digits = digits)
  
  xt.um <- xtable(univ.models, caption=caption, digits=c(0, 0, rep(digits, 6)), label=label)
  align(xt.um) <- "lrrrrrrr"
  rownames(xt.um) <- rownames
  print(xt.um,
        include.rownames=TRUE,
        caption.placement="top",
        floating=TRUE,
        table.placement="H",
        size=size)
}

# Produces a series of univariate Cox PH regression models on a multiply imputed dataset and wraps them in LaTeX.
univ.coxph.mids <- function(timevar, death.indicator, preds, mids, rownames=NULL, caption="", digits=2, label="", size="footnotesize", hl.col="red") {  
  univ.models <- data.frame(matrix(vector(), 0, 8))
  for(i in 1:length(preds)) {
    eval(parse(text=paste0("mdl <- pool(with(mids, coxph(Surv(mids$data[['", timevar, "']], mids$data[['", death.indicator, "']]) ~ ", preds[i], ")))")))
    mdl.summary <- as.data.frame(summary(mdl))
    univ.models <- rbind(univ.models, cbind(var=rownames(mdl.summary),
                                            Missing=mdl.summary$nmis, 
                                            ubar=rnd(mdl.summary$est, d=digits), 
                                            SE=rnd(mdl.summary$se, d=digits),
                                            HR=rnd(exp(mdl.summary$est), d=digits),
                                            LB=rnd(exp(mdl.summary$`lo 95`), d=digits),
                                            UB=rnd(exp(mdl.summary$`hi 95`), d=digits),
                                            p=fmt.pval(mdl.summary$`Pr(>|t|)`, digits=digits, include.p=FALSE, latex=TRUE),
                                            p.num=as.numeric(mdl.summary$`Pr(>|t|)`)))
  }
  colnames(univ.models) <- c("Variable", "Missing", "$\\bar{u}$", "SE", "HR", "LB", "UB", "p", "p.num")
  
  if(!is.null(rownames)) univ.models$Variable <- rownames
  
  for(colname in colnames(univ.models)) {
    univ.models[[colname]][as.numeric(univ.models$p.num)<0.05] <- 
      paste0("\\textcolor{", hl.col, "}{\\bf ", univ.models[[colname]][as.numeric(univ.models$p.num)<0.05], "}")
  }
  
  univ.models$p.num <- NULL
  univ.models$Missing[univ.models$Missing==paste0("\\textcolor{", hl.col, "}{\\bf NA}")] <- NA
  
  # if(is.null(rownames)) rownames <- univ.models$Variable

  xt.um <- xtable(univ.models[2:8], caption=caption, digits=c(0, 0, rep(digits, 6)), label=label)
  align(xt.um) <- "lrrrrrrr"
  rownames(xt.um) <- univ.models$Variable
  print(xt.um,
        include.rownames=TRUE,
        caption.placement="top",
        floating=TRUE,
        table.placement="H",
        size=size,
        sanitize.text.function = identity)
}

##################################################################################################
# Date manipulation functions
##################################################################################################
year.length <- function(year) {
  return(nchar(substr(ca$DOB, as.numeric(regexpr("/[^/]*$", ca$DOB))+1, nchar(ca$DOB))))
}

fix.Date <- function(x, fmt=c("%m/%d/%Y"), origin="1970-01-01") {
  
  if(is.character(x)) {
    fixed.date <- as.Date(parse_date_time(x, fmt))
  }
  else if(is.numeric(x)) {
    fixed.date <- as.Date(x, origin=origin)
  }
  else {
    fixed.date <- as.Date(NA)
  }
  class(fixed.date) <- "Date"
  return(as.Date(fixed.date))
}

fix.mixed.dates <- function(dates) {
  short <- as.Date(dates[year.length(dates)==4], format="%m/%d/%Y", origin="1970-01-01")
  long <- as.Date(dates[year.length(dates)==2], format="%m/%d/%y", origin="1970-01-01")
  return(c(short,long))
}

##################################################################################################
# Generate a random alphanumeric string of length n. These are not necessarily unique.
##################################################################################################
an.id <- function(n) {
  return(paste(replicate(n, c(letters, round(runif(26,0,9)))[round(runif(1,1,52))]), collapse=""))
}
trahelyk/trahelyk_pkg documentation built on June 14, 2021, 9:25 p.m.