Gotchas when moving code from a script to an R package {-}

Packaging

Setup: R, RStudio, Git, GitHub

These steps will direct you to relevant chapters from "Happy Git with R" by Jenny Bryan et al.

Setup: devtools and testthat

Make the devtools and testthat packages available in every R session. Edit your .Rprofile file to include this code (you may use usethis::edit_r_profile()):

# Attach devtools and testthat in interactive sessions only, so that
# non-interactive runs (e.g. Rscript) are not affected by this profile.
if (interactive()) {
  # require() returns FALSE with a warning, rather than throwing an error,
  # when the package is not installed.
  suppressMessages(require(devtools))
  suppressMessages(require(testthat))
}

(Your .Rprofile should NOT include data analysis packages such as dplyr or ggplot2.)

Ensure you always start each session with a blank slate:

Save, close and restart R.

use_data_raw(), then use_data()

Good.

# > Console
use_data_raw()

# data-raw/dataset-name.R
dataset_name <- readxl::read_excel("data-raw/dataset-name.xlsx")
use_data(dataset_name)

# R/dataset_name.R
#' A dataset
#' 
"dataset_name"

# R/any-file.R
f <- function() {
  dataset_name
}

Bad.

# R/any-file.R
dataset_name <- readxl::read_excel("data/dataset-name.xlsx")

f <- function() {
  dataset_name
}

Bad.

f <- function() {
  load("data/dataset_name.rda")
}

http://r-pkgs.had.co.nz/data.html

Consider using internal data

Good.

# data-raw/my_internal_data.R
use_data(my_internal_data, internal = TRUE)

# R/any.R
f <- function(data) {
  dplyr::left_join(data, my_internal_data)
}

Bad.

# R/any.R
my_internal_data <- mtcars %>% dplyr::select(cyl)

f <- function(data) {
  dplyr::left_join(data, my_internal_data)
}

http://r-pkgs.had.co.nz/data.html#data-sysdata

use_package("dplyr") not library(dplyr)

Good.

use_package("dplyr")

Bad.

library(dplyr)

https://r-pkgs.org/whole-game.html

namespace::function_from_other_package()

Good.

f <- function(data) {
  utils::head(dplyr::select(data, dplyr::last_col()))
}

Good.

#' @importFrom magrittr %>%
#' @importFrom utils head
#' @importFrom dplyr select last_col
f <- function(data) {
  data %>% 
    select(last_col()) %>% 
    head()
}

Bad.

f <- function(data) {
  head(select(data, last_col()))
}

Bad.

f <- function(data) {
  data %>% 
    select(last_col()) %>% 
    head()
}

The tidyverse is for EDA, not packages

Good.

use_package("dplyr")
use_package("tidyr")

Bad.

use_package("tidyverse")

https://www.tidyverse.org/blog/2018/06/tidyverse-not-for-packages/

Use the .data pronoun

Good.

# `column_name` is a string; `.data[[column_name]]` refers to the column of
# `data` with that name.
# NOTE(review): recent tidyselect releases appear to deprecate `.data` inside
# selection verbs such as select() -- confirm, and consider
# dplyr::all_of(column_name) for selections.
f <- function(data, column_name) {
  dplyr::select(data, .data[[column_name]])
}

Ok.

# Select the `cyl` column of `data`, failing early if it is missing.
#
# @param data A data frame expected to have a column named `cyl`.
# @return The result of dplyr::select() with only the `cyl` column.
f <- function(data) {
  # Check the argument itself, not the global `mtcars`; otherwise the guard
  # always passes and never validates the input.
  stopifnot(utils::hasName(data, "cyl"))

  dplyr::select(data, .data$cyl)
}

Bad.

f <- function(data) {
  dplyr::select(data, cyl)
}

Function interface

Avoid relying on the global environment

Good.

f <- function(data) {
  data
}

my_data <- tibble::tibble(x = 1)
f(my_data)

Bad.

f <- function(data = my_data2) {
  data
}

ls()
try(f())

my_data2 <- tibble::tibble(x = 1)
ls()
f()

Avoid modifying the global environment, e.g. with <<-

Setup.

readr::write_csv(mtcars, "some_data.csv")

Good.

some_data_path <- function() {
  fs::path("some_data.csv")
}

some_data_path()

read_some_data <- function(path) {
  suppressMessages(head(readr::read_csv(path)))
}

path <- some_data_path()  # Define path
read_some_data(path)

Bad.

some_data_path <- function() {
  path <<- "some_data.csv"
}

read_some_data <- function() {
  suppressMessages(head(readr::read_csv(path)))
}

some_data_path()  # Define path
read_some_data()

"[Avoid global variables because they] introduce non-obvious dependencies between functions" (Advanced R, Environments).

For valid uses of <<- see (Advanced R, Function factories).

Clean up.

fs::file_delete("some_data.csv")

Arguments that provide core data are required

Good.

f <- function(data) {
  data
}

f(mtcars)

Bad.

f <- function(data = mtcars) {
  data
}

f()

Arguments that provide core data are required (have no default); they are often called data, x, or y.

-- Adapted from https://principles.tidyverse.org/args-data-details.html

Descriptor arguments are usually required

Good.

# Keep the rows of `data` whose `year` falls in the forecast period.
#
# @param data A data frame with a numeric `year` column.
# @param start_year First year of the period. A descriptor argument, so it
#   is required (no default).
# @param time_span Number of years after `start_year` to include.
# @return The rows of `data` with `year` between `start_year` and
#   `start_year + time_span`, both inclusive.
forecast <- function(data, start_year, time_span = 5) {
  end_year <- start_year + time_span
  time_period <- data$year >= start_year & data$year <= end_year

  # Namespace-qualified so stats::filter() is never picked up by accident,
  # and written without a trailing pipe so the snippet parses.
  dplyr::filter(data, time_period)
  # ...
}

Bad.

forecast <- function(data, start_year = 2020, time_span = 5) {
  # ...
}

Descriptor arguments describe essential details of the operation, and are usually required.

-- Adapted from https://principles.tidyverse.org/args-data-details.html

Avoid reading and writing operations

Unless it is precisely the purpose of your function, avoid operations that read or write data (in general, avoid side effects).

Good.

f <- function(data) {
  dplyr::select(data, 1L)
}

Bad.

f <- function(path) {
  data <- readxl::read_excel(path)
  dplyr::select(data, 1L)
}

Code smells and feels

Simplify if() with objects named meaningfully

x <- sample(c(1:10), size = 2, replace = TRUE)
# Join the elements of `x` with ", " and append `msg` after a space.
say <- function(x, msg) {
  listed <- paste(x, collapse = ", ")
  paste(listed, msg)
}
say(1:2, "Hey!")

Good.

is_even_between_5and10 <- (x %% 2 == 0) & dplyr::between(x, 5L, 10L)
if (all(is_even_between_5and10)) {
  say(x, "Yeah!")
} else {
  say(x, "Nope!")
}

Bad.

if (all((x %% 2 == 0) & (x >= 5L) & (x <= 10L))) {
  say(x, "Yeah!")
} else {
  say(x, "Nope!")
}

https://speakerdeck.com/jennybc/code-smells-and-feels?slide=36

Program for columns with clean_names

Good.

f <- function(data) {
  clean <- r2dii.utils::clean_column_names(data)

  stopifnot(hasName(clean, "a_column"))
  result <- dplyr::select(clean, .data$a_column)

  r2dii.utils::unclean_column_names(result, data)
}

f(tibble::tibble(A.Column = 1, Another.Column = 1))

Bad.

f <- function(data) {
  dplyr::select(data, .data$A.Column)
}

f(tibble::tibble(A.Column = 1, Another.Column = 1))

?clean_column_names()

Avoid long-running temporary objects

Avoid temporary variables unless they run for only a few, consecutive lines.

Good.

tmp <- dplyr::filter(mtcars, cyl > 4)
tmp <- dplyr::select(tmp, disp)
tmp <- head(tmp)

# ... more unrelated code

Better.

mtcars %>% 
  dplyr::filter(cyl > 4) %>% 
  dplyr::select(disp) %>% 
  head()

Bad.

# `tmp` is assigned here ...
tmp <- dplyr::filter(mtcars, cyl > 4)
tmp <- dplyr::select(tmp, disp)

# ... more unrelated code (makes you forget what `tmp` holds)

# ... and reused here, far from where it was defined.
tmp <- head(tmp)

If possible, extract functions to the top level

Good.

f <- function(x) {
  g(x)
}

g <- function(x) {
  x + 1
}

Bad.

f <- function(x) {
  g <- function(x) {
    x + 1
  }

  g(x)
}

Extract commented sections into functions

Good.

f <- function(x) {
  y <- calculate_y(x)

  # ... more code
}

calculate_y <- function(x) {
  x^x * x/2L
  # ... more code specifically about calculating y
}

Bad.

f <- function(x) {
  # calculate y
  y <- x^x * x/2L
  # ... more code specifically about calculating y

  # ... more code
}

Error prone

Avoid hidden arguments: Extract functions with all arguments

Good.

f <- function(x, y, z) {
  x + g(y, z)
}

g <- function(y, z) {
  y + z
}

f(1, 1, 1)

Bad.

# Fragile.
f <- function(x, y, z) {
  g <- function(y) {
    # `z` is outside of the scope of g(). It's a hidden argument
    y + z
  }

  x + g(y)
}

f(1, 1, 1)

# f() breaks when you move g() to the top level
f <- function(x, y, z) {
  x + g(y)
}

g <- function(y) {
  y + z
}

try(f(1, 1, 1))

Separate functions, data, and scripts

A non-package project

It's easy for an analyst to maintain a project when functions, data, and scripts are separate.

Good.

# R/all-functions.R
f <- function(data) {
  # ... some code
}

# data/all-datasets.R
some_data <- readr::read_csv(here::here("data-raw", "some_data.csv"))

# script/this-script.R
library(tidyverse)

source(here::here("R", "all-functions.R"))
source(here::here("data", "all-datasets.R"))


f(data = some_data)

It is error prone to mix functions, data, and scripts. The mess hides interdependencies that can break your code unexpectedly. It also makes it hard for others to reproduce or understand your code -- the maintenance programmer can only view your code through a toilet paper tube.

Bad.

# scripts-functions-and-data.R
# Everything lives in one file: package loading, data import, a function
# definition, and the call that uses them.
library(tidyverse)

some_data <- readr::read_csv(here::here("data-raw", "some_data.csv"))

# The parameter shadows the global `some_data` defined above.
f <- function(some_data) {
  some_data %>%
    dplyr::select()
    # ... more code
}

f(some_data)

A package project

When functions, data, and scripts are separate, it's easy for a developer to transform a project into an R package. Functions go in the R/ directory, raw data in data-raw/, and data in data/. Scripts become examples, tests, and higher level documentation such as README, and the Home and articles pages of the package-website.

if() uses a single TRUE or FALSE

x <- c(1, 2)
y <- 0L

Good

# Good: identical() always returns a single TRUE or FALSE.
if (identical(x, c(1, 2))) {
  say(identical(x, c(1, 2)), "is what you gave.")
}
# Bad: `x == c(1, 2)` has length 2. Since R 4.2.0 a condition of length
# greater than one is an error (a warning before that), so wrap it in try()
# to let the rest of the script run.
try(
  if (x == c(1, 2)) {
    say(x == c(1, 2), "is what you gave.")
  }
)

Caveats: https://github.com/2DegreesInvesting/ds-incubator/issues/13

1 is equal to 1L but not identical

Careful!

1 == 1L
identical(1, 1L)

Good

this_integer <- 1L
if (!identical(this_integer, 1)) "Not the same" else "Wrong result"

Bad.

this_integer <- 1L
if (!this_integer == 1) "Not the same" else "Wrong result"

Style

Limit your code to 80 characters per line

For reference, in RStudio you can set a margin column at 80 characters (Tools > Global Options > Code > Show margin > Margin column).

Strive to limit your code to 80 characters per line. This fits comfortably on a printed page with a reasonably sized font. If you find yourself running out of room, this is a good indication that you should encapsulate some of the work in a separate function.

-- https://style.tidyverse.org/syntax.html#long-lines

If a function definition runs over multiple lines, indent the second line to where the definition starts.

-- https://style.tidyverse.org/functions.html#long-lines-1

Names should use only lowercase letters, numbers, and "_". Function names should be verbs.

Good.

add_row()
permute()

Bad.

row_adder()
permutation()

Avoid T and F as synonyms for TRUE and FALSE

Good

sum(1, 1, na.rm = TRUE)

Bad.

sum(1, 1, na.rm = T)

TRUE and FALSE are reserved words; T and F are not.

T <- "Whatever"
T

# Forbidden
try(TRUE <- "Whatever")

https://www.r-bloggers.com/r-tip-avoid-using-t-and-f-as-synonyms-for-true-and-false/

Reserve return() to return early

Only use return() for early returns. Otherwise, rely on R to return the result of the last evaluated expression.

https://style.tidyverse.org/functions.html#return

Return invisibly only when the main purpose is a side effect

Good.

# Main purpose is a side effect: To throw an error if the input is bad
check_f <- function(x) {
  stopifnot(is.numeric(x))

  invisible(x)
}

Good.

# Main purpose is not a side effect. Returning visibly
f <- function(x) {
  x + 1
}

f(1)

Bad.

# Main purpose is not a side effect. Returning invisibly
f <- function(x) {
  out <- x + 1
}

# Returns invisibly
f(1)

out <- f(1)
out


2DegreesInvesting/ds-incubator documentation built on Oct. 13, 2021, 10:09 a.m.