These steps will direct you to relevant chapters from "Happy Git with R" by Jenny Bryan et al.
Make the devtools and testthat packages available in every R session. Edit your .Rprofile file to include this code (you may use usethis::edit_r_profile()):
if (interactive()) { suppressMessages(require(devtools)) suppressMessages(require(testthat)) }
(Your .Rprofile should NOT include data analysis packages such as dplyr or ggplot2.)
Ensure you always start each session with a blank slate:
Save, close and restart R.
use_data_raw()
, then use_data()
Good.
# > Console use_data_raw() # data-raw/dataset-name.R dataset_name <- readxl::read_excel("data-raw/dataset-name.xlsx") use_data(dataset_name) # R/dataset_name.R #' A dataset #' "dataset_name" # R/any-file.R f <- function() { dataset_name }
Bad.
# R/any-file.R dataset_name <- readxl::read_excel("data/dataset-name.xlsx") f <- function() { dataset_name }
Bad.
f <- function() { load("data/dataset_name.rda") }
http://r-pkgs.had.co.nz/data.html
Good.
# data-raw/my_internal_data.R use_data(my_internal_data, internal = TRUE) # R/any.R f <- function(data) { dplyr::left_join(data, my_internal_data) }
Bad.
# R/any.R my_internal_data <- mtcars %>% dplyr::select(cyl) f <- function(data) { dplyr::left_join(data, my_internal_data) }
http://r-pkgs.had.co.nz/data.html#data-sysdata
use_package("dplyr")
not library(dplyr)
Good.
use_package("dplyr")
Bad.
library(dplyr)
https://r-pkgs.org/whole-game.html
namespace::function_from_other_package()
Good.
f <- function(data) { utils::head(dplyr::select(data, dplyr::last_col())) }
Good.
#' @importFrom magrittr %>% #' @importFrom utils head #' @importFrom dplyr select last_col f <- function(data) { data %>% select(last_col()) %>% head() }
Bad.
f <- function(data) { head(select(data, last_col())) }
Bad.
f <- function(data) { data %>% select(last_col()) %>% head() }
Good.
use_package("dplyr") use_package("tidyr")
Bad.
use_package("tidyverse")
https://www.tidyverse.org/blog/2018/06/tidyverse-not-for-packages/
.data
pronoun
Good.
f <- function(data, column_name) { dplyr::select(data, .data[[column_name]]) }
Ok.
f <- function(data) { stopifnot(hasName(mtcars, "cyl")) dplyr::select(data, .data$cyl) }
Bad.
f <- function(data) { dplyr::select(data, cyl) }
Good.
f <- function(data) { data } my_data <- tibble::tibble(x = 1) f(my_data)
Bad.
f <- function(data = my_data2) { data } ls() try(f()) my_data2 <- tibble::tibble(x = 1) ls() f()
<<-
Setup.
readr::write_csv(mtcars, "some_data.csv")
Good.
some_data_path <- function() { fs::path("some_data.csv") } some_data_path() read_some_data <- function(path) { suppressMessages(head(readr::read_csv(path))) } path <- some_data_path() # Define path read_some_data(path)
Bad.
some_data_path <- function() { path <<- "some_data.csv" } read_some_data <- function() { suppressMessages(head(readr::read_csv(path))) } some_data_path() # Define path read_some_data()
"[Avoid global variables because they] introduce non-obvious dependencies between functions" -- Advanced R, Environments).
For valid uses of <<-
see (Advanced R, Function factories).
Clean up.
fs::file_delete("some_data.csv")
Good.
f <- function(data) { data } f(mtcars)
Bad.
f <- function(data = mtcars) { data } f()
Arguments that provide core data are required (have no default); they are often called data, x, or y.
-- Adapted from https://principles.tidyverse.org/args-data-details.html
Good.
forecast <- function(data, start_year, time_span = 5) { end_year <- start_year + time_span time_period <- data$year >= start_year & data$year <= end_year data %>% filter(time_period) %>% # ... }
Bad.
forecast <- function(data, start_year = 2020, time_span = 5) { # ... }
Descriptor arguments describe essential details of the operation, and are usually required.
-- Adapted from https://principles.tidyverse.org/args-data-details.html
Unless it is precisely the purpose of your function, avoid operations that read or write data (in general, avoid side effects).
Good.
f <- function(data) { dplyr::select(data, 1L) }
Bad.
f <- function(path) { data <- readxl::read_excel(path) dplyr::select(data, 1L) }
if()
with objects named meaningfully
x <- sample(c(1:10), size = 2, replace = TRUE) say <- function(x, msg) paste(paste(x, collapse = ", "), msg) say(1:2, "Hey!")
Good.
is_even_between_5and10 <- (x %% 2 == 0) & dplyr::between(x, 5L, 10L) if (all(is_even_between_5and10)) { say(x, "Yeah!") } else { say(x, "Nope!") }
Bad.
if (all((x %% 2 == 0) & (x >= 5L) & (x <= 10L))) { say(x, "Yeah!") } else { say(x, "Nope!") }
https://speakerdeck.com/jennybc/code-smells-and-feels?slide=36
clean_names
Good.
f <- function(data) { clean <- r2dii.utils::clean_column_names(data) stopifnot(hasName(clean, "a_column")) result <- dplyr::select(clean, .data$a_column) r2dii.utils::unclean_column_names(result, data) } f(tibble::tibble(A.Column = 1, Another.Column = 1))
Bad.
f <- function(data) { dplyr::select(data, .data$A.Column) } f(tibble::tibble(A.Column = 1, Another.Column = 1))
Avoid temporary variables unless they run for only a few, consecutive lines.
Good.
tmp <- dplyr::filter(mtcars, cyl > 4) tmp <- dplyr::select(tmp, disp) tmp <- head(tmp) # ... more unrelated code
Better.
mtcars %>% dplyr::filter(cyl > 4) %>% dplyr::select(disp) %>% head()
Bad.
tmp <- dplyr::filter(mtcars, cyl > 4) tmp <- dplyr::select(tmp, disp) # ... more unrelated code (makes you forget what `tmp` holds) tmp <- head(tmp)
Good.
f <- function(x) { g(x) } g <- function(x) { x + 1 }
Bad.
f <- function(x) { g <- function(x) { x + 1 } g(x) }
Good.
f <- function(x) { y <- calculate_y(x) # ... more code } calculate_y <- function(x) { x^x * x/2L # ... more code specifically about calculating y }
Bad.
f <- function(x) { # calculate y y <- x^x * x/2L # ... more code specifically about calculating y # ... more code }
Good.
f <- function(x, y, z) { x + g(y, z) } g <- function(y, z) { y + z } f(1, 1, 1)
Bad.
# Fragile. f <- function(x, y, z) { g <- function(y) { # `z` is outside of the scope of g(). It's a hidden argument y + z } x + g(y) } f(1, 1, 1) # f() breaks when you move g() to the top level f <- function(x, y, z) { x + g(y) } g <- function(y) { y + z } try(f(1, 1, 1))
It's easy for an analyst to maintain a project when functions, data, and scripts are separate.
Good.
# R/all-functions.R f <- function(data) { # ... some code } # data/all-datasets.R some_data <- readr::read_csv(here::here("data-raw", "some_data.csv")) # script/this-script.R library(tidyverse) source(here::here("R", "all-functions.R")) source(here::here("data", "all-datasets.R")) f(data = some_data)
It is error prone to mix functions, data, and scripts. The mess hides interdependencies that can break your code unexpectedly. Also, this makes it hard for others to reproduce, or understand your code -- the maintenance programmer can only view your code through a toilet paper tube.
Bad.
# scripts-functions-and-data.R library(tidyverse) some_data <- readr::read_csv(here::here("data-raw", "some_data.csv")) f <- function(some_data) { some_data %>% dplyr::select() %>% # ... more code } f(some_data)
When functions, data, and scripts are separate, it's easy for a developer to transform a project into an R package. Functions go in the R/ directory, raw data in data-raw/, and data in data/. Scripts become examples, tests, and higher level documentation such as README, and the Home and articles pages of the package-website.
if()
uses a single TRUE
or FALSE
x <- c(1, 2) y <- 0L
Good
# Good if (identical(x, c(1, 2))) { say(identical(x, c(1, 2)), "is what you gave.") }
# Bad if (x == c(1, 2)) { say(x == c(1, 2), "is what you gave.") }
Caveats: https://github.com/2DegreesInvesting/ds-incubator/issues/13
1
is equal to 1L
but not identical
Careful!
1 == 1L identical(1, 1L)
Good
this_integer <- 1L if (!identical(this_integer, 1)) "Not the same" else "Wrong result"
Bad.
this_integer <- 1L if (!this_integer == 1) "Not the same" else "Wrong result"
For reference, in RStudio you can set a margin column at 80 characters (Tools > Global Options > Code > Show margin > Margin column).
Strive to limit your code to 80 characters per line. This fits comfortably on a printed page with a reasonably sized font. If you find yourself running out of room, this is a good indication that you should encapsulate some of the work in a separate function.
-- https://style.tidyverse.org/syntax.html#long-lines
If a function definition runs over multiple lines, indent the second line to where the definition starts.
-- https://style.tidyverse.org/functions.html#long-lines-1
Good.
add_row() permute()
Bad.
row_adder() permutation()
T
and F
as synonyms for TRUE
and FALSE
Good
sum(1, 1, na.rm = TRUE)
Bad.
sum(1, 1, na.rm = T)
TRUE
and FALSE
are reserved words; T
and F
are not.
T <- "Whatever" T # Forbidden try(TRUE <- "Whatever")
https://www.r-bloggers.com/r-tip-avoid-using-t-and-f-as-synonyms-for-true-and-false/
return()
to return early
Only use
return()
for early returns. Otherwise, rely on R to return the result of the last evaluated expression.
https://style.tidyverse.org/functions.html#return
Good.
# Main purpose is a side effect: To throw an error if the input is bad check_f <- function(x) { stopifnot(is.numeric(x)) invisible(x) }
Good.
# Main purpose is not a side effect. Returning visibly f <- function(x) { x + 1 } f(1)
Bad.
# Main purpose is not a side effect. Returning invisibly f <- function(x) { out <- x + 1 } # Returns invisibly f(1) out <- f(1) out
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.