statistical computing in r

20 May 2015

source

(html best viewed in chrome)

**Notes** - candy - usage

background

plos title
**1** write programs for people, not computers **2** let the computer do the work **3** make incremental changes **4** don't repeat yourself (or others) **5** plan for mistakes **6** optimize only after it works correctly **7** document design/purpose, not mechanics **8** collaborate

motivation

motivation

Our studies support the claim that knowledge of programming plans and rules of programming discourse can have a _significant impact on program comprehension_.

It is not merely a matter of aesthetics that programs should be written in a particular style.

Rather there is a _psychological basis for writing programs in a conventional manner_: programmers have strong expectations that other programmers will follow these discourse rules.

If the rules are violated, then the utility afforded by the expectations that programmers have built up over time is effectively nullified.
[Soloway and Ehrlich, 1984](http://www.ics.uci.edu/~redmiles/inf233-FQ07/oldpapers/SollowayEhrlich.pdf)

un-motivation

1 write code for (other) people, not computers

fortunes::fortune('dog')
Firstly, don't call your matrix 'matrix'. Would you call your dog 'dog'? Anyway, it might clash with the function 'matrix'.
Barry Rowlingson, R-help (October 2004)

(1b) styling and formatting

(1c) styling and formatting - example

mylmfit=lm(mpg~wt+disp,mtcars)
summary=summary(mylmfit)
coefficients=summary$coefficients
coefficients=round(coefficients,digits=2)
estimates=coefficients[,1]
pvalues=coefficients[,4];pvalues
pvalues[1]="<0.01"
matrix=cbind(estimates,pvalues)
colnames(matrix)=c("Estimates","p-values")
as.data.frame(matrix)

(1d) styling and formatting - example

fit <- lm(mpg ~ wt + disp, data = mtcars)
summ <- summary(fit)$coefficients
summ <- round(summ, digits = 2)
cbind.data.frame(Estimate = summ[, 1],
                 `p-value` = format.pval(summ[, 4], eps = .01))

2 let the computer do the work

f <- function(x, start = 5) {
  l <- seq(start, 1, length.out = length(x))
  paste0(sprintf('<font size=%spt>%s</font>', l, x), collapse = ', ')
}

x <- c('thinking about problem', 'writing code', 'errors', 'manipulating', 
       'road-blocks', 're-writing', 'higher priorities', 'tables and figures',
       'tweaking', 'writing words', 'adding analyses', 'removing observations',
       'etc')



Divide each difficulty into many parts as is feasible and necessary to resolve it.
René Descartes

(2b) modularize code - example

f1 <- function(...) {
  fit <- lm(mpg ~ wt + disp, data = mtcars)
  summ <- round(summary(fit)$coefficients, digits = 2)
  cbind.data.frame(Estimate = summ[, 1],
                   `p-value` = format.pval(summ[, 4], eps = .01))
}

f1()
f1(mpg ~ wt + disp + hp, digits = 3)

(2c) modularize code - example

f2 <- function(form, dat = mtcars, digits = 2) {
  fit <- lm(form, data = dat)
  summ <- round(summary(fit)$coefficients, digits = digits)
  cbind.data.frame(Estimate = summ[, 1],
                   `p-value` = format.pval(summ[, 4], eps = 10 ** -digits))
}

f2(mpg ~ wt + hp + factor(cyl), digits = 3)

3 make incremental changes

without version control:

with version control (git, github):

4 don't repeat yourself

(4b) don't repeat yourself - example

print_counts <- function(x) {
  ## x a vector of character strings (or factors)
  ## usage: print_counts(letters[1:5])

  ## helper function: ipr
  ipr <- function(x) {
    ## tests
    ## ipr(1); ipr(1:2); ipr(1:5)
    if (length(x) == 1) x else
    if (length(x) == 2) paste(x, collapse = ' and ') else
      sprintf('%s, and %s', paste(x[-length(x)], collapse = ', '), tail(x, 1))
  }
  tt <- if (!is.table(x)) sort(table(x), decreasing = TRUE) else x
  ipr(sprintf('%s (n = %s, %s%%)', names(tt), tt, round(tt / sum(tt) * 100)))
}

table(tx <- rep(c('RCHOP','R-CVP','RCHOEP'), c(10, 6, 4)))
print_counts(tx)

(4c) don't repeat yourself - example

(x <- c(rnorm(4), NA))
sum(x)

sum2 <- function(...) sum(..., na.rm = TRUE)
sum2(x)

(4d) don't repeat yourself - example

my_plot <- function(...)
  plot(..., las = 1, bty = 'l', tcl = .2, xlab = 'Patient No.', ylab = '',
       pch = 19, col = 'dodgerblue2', cex = 1.5, family = 'HersheySerif')

par(mfrow = c(1, 2), mar = c(5, 3, .5, 1), bg = 'transparent')
my_plot(1:5, x)
my_plot(6:10, -x)

5 plan for mistakes

6 optimization (a deviation)

7 document (everything)

dmy <- function(d, m, y, origin = c(1, 1, 1900)) {
  ## parse day/month/year column data into standard date format
  ## examples:
    # dmy(NA, 5, 2005)
    # dmy(NA, 5, 13, origin = c(1, 1, 2000))
  ## arguments:
    # d, m, y: day, month, year as single integers, NAs, or vectors
    # origin: a vector of length three giving the origins for d, m, and y
  f <- function(a, b) {
    suppressWarnings(a <- as.numeric(a))
    ifelse(is.na(a), b, a)
  }
  y <- ifelse(nchar(y) <= 2, f(y, 0) + origin[3], f(y, 0))
  as.Date(sprintf('%04s-%02s-%02s', y, f(m, origin[2]), f(d, origin[1])))
}

dmy(NA, 5, 11:15, origin = c(15, 1, 2000))

(7b) document (everything)

#' Date parse
#' 
#' Parse day/month/year column data into standard date format
#' 
#' For two-digit years, the origin year should be specified; otherwise, the
#' default of 1900 will be used. For NA year, month, or day, origin is used
#' for defaults, i.e., origin = c(15, 6, 2000) will convert missing days to
#' day 15, missing months to June, and missing years to 2000.
#' 
#' @param d,m,y day, month, year as single integers or vectors
#' @param origin vector of length 3 with origins for d, m, and y, respectively;
#' see details
#' 
#' @return A vector of \code{\link{Date}}-formatted strings.
#' 
#' @examples
#' dmy(25, 7, 87)
#' dmy(NA, NA, 2000:2005)
#' @export

dmy <- function(d, m, y, origin) {
  ## do stuff
}

(7c) document (everything)

#' Patient demographic data for protocol 15-000
#' 
#' Baseline demographic and lab results data for xx subjects enrolled on study.
#' 
#' @seealso \code{\link{trainr}}, \code{\link{tox}}, \code{\link{surv}}
#'
#' @format An object of class \code{data.frame} containing xx observations and
#' yy variables:
#' 
#' \tabular{rll}{
#' \tab \code{id} \tab patient unique id \cr
#' \tab \code{site} \tab enrollment site \cr
#' \tab \code{\dots} \tab ... \cr
#' \tab \code{wt} \tab weight (kg) \cr
#' \tab \code{ht} \tab height (cm) \cr
#' \tab \code{hgb} \tab hemoglobin (g/dL) \cr
#' \tab \code{\dots} \tab ... \cr
#' }
"demo"

?demo
summary(demo)

8 collaborate

at the end of the day...

wtf

resources

resources

demo

putting these suggestions to practical use...

specific tips

## divide each column of a data frame by a different value of zz
zz <- 2:4
(out1 <- out2 <- out3 <- out4 <- data.frame(x = rep(4, 2), y = 9, z = 16))

## using a for loop is clumsy and litters the workspace with
## ii, the index, and any other variables created in the loop
for (ii in 1:3)
  out1[, ii] <- out1[, ii] / zz[ii]
out1

(b) functionals

## better, cleaner but still clumsy
out2[] <- lapply(1:3, function(ii) out2[, ii] / zz[ii])
out2

## even better
out3[] <- Map(`/`, out3, zz)
out3

## many other ways of getting the same result
as.data.frame(as.matrix(out4) / (col(out4) + 1))

growing objects

n <- 1e4

system.time({
  set.seed(1)
  out1 <- NULL
  for (ii in seq_len(n))
    out1 <- cbind(out1, rnorm(100))
})

system.time({
  set.seed(1)
  out2 <- matrix(NA, nrow = 100, ncol = n)
  for (ii in seq_len(n))
    out2[, ii] <- rnorm(100)
})

for loops, functionals & vectorization

system.time({
  set.seed(1)
  out3 <- sapply(seq_len(n), function(x) rnorm(100))
})

system.time({
  set.seed(1)
  out4 <- matrix(rnorm(100 * n), nrow = 100, ncol = n)
})

l <- list(out1, out2, out3, out4)
all(sapply(seq_along(l)[-1], function(x) identical(l[x - 1], l[x])))

Never use ...

(b) subset()

dd <- data.frame(a = 1, x = 1:5)
f1 <- function(...) {
  y <- 3
  ## do stuff
  subset(dd, ...)
}
f1(x > y)

## calculate y and use it to subset
y <- 1
f1(x > y)

(c) subset()

wzxhzdk:17 wzxhzdk:18

(d) attach()

x <- 0
d1 <- data.frame(x = 1 , y = 2)
d2 <- data.frame(x = 2, y = 1)
attach(d1)
attach(d2)
search()
x
y
detach(d2)
y
x
rm(x)
x
rm(x)
detach(d1)

(e) setwd()

... just remove these words from your vocabulary

Do use ...

(T <- rnorm(5))
TRUE <- 1
(dat <- within(mtcars, {
  gear <- factor(gear, levels = 3:5, labels = as.roman(3:5))
  wt <- wt * 1000
  disp <- drat <- qsec <- hp <- cyl <- NULL
}))[1:5, ]

(b) with()

par(bg = 'transparent', mar = c(3,4,2,2), las = 1)
with(dat[dat$mpg > 20, ], 
     barplot(tapply(wt, list(am, gear), mean), beside = TRUE,
             legend.text = c('auto','manual')))

(c) data=, str()

(fit <- lm(mpg ~ gear, data = dat))
str(fit, list.len = 3, give.attr = FALSE)


raredd/trainr documentation built on May 27, 2019, 2:03 a.m.