# Chunk defaults for this document: source and printed output share one
# block, and every line of output is prefixed with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
library(Rdlazer)

a tour of R




Vector

\newline

\newline

# c() combines its arguments into a single vector;
# here, two character vectors of length one
c("one", "two")
# combining a character value with a numeric one
c("one", 1)
# a vector can only hold a single type, so the number is (silently!)
# coerced to character

# existing vectors (variables) are combined in the same way
v1 <- c("one", "two")
v2 <- c(1, 2)
v3 <- c(FALSE, TRUE)
v4 <- c(v1, v2, v3)
v4
# test whether a value occurs in a vector
"one" %in% v4

\newline

\newline


attributes

# query the names attribute
names(v2)

# assign names
names(v2) <- c("first", "second")
names(v2)

# drop the names again
names(v2) <- NULL
names(v2)

# names can also be supplied when the vector is created
v5 <- c("first" = 1, "second" = 2, "third" = 3)
names(v5)

# the names need not be quoted
v5 <- c(first = 1, second = 2, third = 3)


Every vector has a type and a length. It can also have attributes.




Vectorized Operations

n <- 1:6

# arithmetic is applied to every element
n + 1

# so is changing case
toupper(letters)

# and so is comparison
n > 3

# two vectors of equal length are combined element by element
n1 <- 1:4
n2 <- 4:1

n1
n2
n1 + n2
n1 / n2

# when the lengths differ, the shorter vector is recycled
n3 <- rep(4, times = 4)
n4 <- 1:2

n3 + n4

l1 <- c(TRUE, FALSE)
paste(n3, l1)

\newline

s1 <- names(v5)
s1

# length() counts the elements, nchar() the characters in each element
length(s1)
nchar(s1)

l1 <- toupper(letters)
l2 <- LETTERS

# identical() compares whole objects, == compares element by element
identical(l1, l2)
l1 == l2

vna <- c(1, NA, 2, NA, 3, NA)

# is.na() flags each missing value, anyNA() tells whether there is any
is.na(vna)
anyNA(vna)




Extraction/Subsetting

\newline

# extract a single element by position
v4[1]
# extract several elements with a numeric index vector
ind <- c(1, 3)
v4[ind]
# positions may be repeated
ind <- c(1, 1, 3, 4, 6, 6)
v4[ind]

# an out-of-range index yields an NA
v4[7]

# fractional indices are truncated towards zero
v4[2.3]
# this should be avoided regardless

# extract elements by name
v5["first"]
ind <- c("first", "second", "first", "first")
v5[ind]

# extract elements with a logical index
# (TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned)
ind <- c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE)
v4[ind]
ten <- 1:10
x <- letters[ten]
x

# too long an index produces NAs
ind <- rep(TRUE, 11)
x[ind]

# too short an index is recycled
ind <- c(TRUE, TRUE, FALSE)
x[ind]

ind <- TRUE
x[ind]

# negative indices drop elements
v4
v4[-2]
v4[-c(2, 5)]
v4[c(-2, -5)]




Replacement

# positions 2, 4 and 6
ind <- 2 * (1:3)

# read the elements at these positions
n[ind]

# overwrite them
n[ind] <- NA
n
# assigning past the end extends the vector
n[7] <- 15
n
# skipped positions are filled with NA
n[10] <- 25
n




Matrix

# build a 3 x 4 matrix; the values are filled in column after column
m <- matrix(data = 1:12, nrow = 3, ncol = 4)
m
# t() transposes the matrix: rows become columns
t(m)

\newline

# get dimensions
dim(m)
# the first number is the number of rows, the second the number of columns

# the attribute can be modified at will, even removed
dim(m) <- NULL
m
# note that the matrix is dismantled in the same way it is built by default:
# column after column

# dim can be set on any vector as long as the grid size matches its length
dim(m) <- c(4, 3)
m
# note that, again, the matrix is built in the default order

# the length of a matrix is its number of cells
length(m)


subsetting a matrix

# a single cell: row 1, column 3
m[1, 3]
# a sub-matrix: rows 1 to 2 of columns 2 to 3
m[1:2, 2:3]
# a whole row; check what kind of object comes back
m1 <- m[1, ]
is.matrix(m1)
is.vector(m1)
# drop = FALSE asks for the dimensions to be kept
m2 <- m[1, , drop = FALSE]
# mind the empty second dimension!

# a single index addresses the cells as one long vector
m[1]
m[7]

\newline

# append a row
rbind(m, 101:103)
# append a column
cbind(m, letters[1:4])
# note the coercion

# c() returns a plain vector, cbind() a matrix
c(m, m)
cbind(m, m)

\newline

# label the rows and the columns
rownames(m) <- letters[1:4]
colnames(m) <- LETTERS[1:3]
m
# the labels can be used for subsetting, just like positions
m["a", "B"]
m["a", c("A", "B"), drop = FALSE]
m[c("a", "b"), c("A", "B")]
# all labels at once, or one dimension at a time
dimnames(m)
rownames(m)
colnames(m)

\newline

# replace a single cell
m[2, 3] <- "hello"
m
# note the conversion to character

# an empty index addresses every cell
m[] <- "goodbye"
m




Array

# build a 2 x 2 x 3 array directly
a1 <- array(data = 1:12, dim = c(2, 2, 3))
a1

# or set the dimensions on a plain vector
a2 <- 1:12
dim(a2) <- c(2, 2, 3)
a2

# compare the two results
identical(a1, a2)

\newline

# fix the first index and take everything along the other two dimensions
a1[1, , ]

# fix the third index but keep all three dimensions
# (FALSE is spelled out: F is an ordinary variable that can be reassigned)
a1[, , 3, drop = FALSE]

\newline




List

\newline

# a list can hold items of different types and lengths
l1 <- list(1, "one", TRUE, v5)
l1
# list() wraps its arguments: l1 becomes a single item of l2
l2 <- list(l1, "surprize")
l2

# c() appends instead: the items of l1 are followed by one more item
l3 <- c(l1, "surprize")
l3
# items can be named; a name containing a space must be wrapped in backticks
l4 <- list(one = 1:3, two = 4:5, three = 7:8, `forbidden name` = 9)
l4


subsetting a list
it's a bit of a mess...

\newline

# subset with numeric index
l4[2]

ind <- 2:3
l4[ind]

# subset with names
l4["three"]

# subset with logical vector
# (TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned)
ind <- c(TRUE, FALSE, FALSE)
l4[ind]
# the logical vector has been recycled

# it's a list
x <- l4[1]
x
is.list(x)

# double brackets extract the item itself
# numeric
l4[[3]]

# names
l4[["forbidden name"]]

# it's not a list
x <- l4[[3]]
x
is.list(x)
is.numeric(x)

Note: a double bracket can also be used on a normal vector. The difference to a single bracket is that a double one discards names. It also accepts only a length 1 index. [[ is virtually never used with atomic vectors.

# $ extracts a single list item by name
l4$one

# a name containing a space must be wrapped in backticks
l4$`forbidden name`


A list is a kind of vector that can hold any item.
Subsetting a list with `[` yields a list of any length. Using `[[` or `$` yields a single list item.




R's Syntactic Flexibility

# function calls can be nested; at top level the value is printed
c(1, c(4, 7))
# the same value assigned to a variable: nothing is printed
v6 <- c(1, c(4, 7))

| the arrow operator creates a binding rather than returning a value | it actually does return the value, but invisibly | the value can be made visible by putting the expression in parentheses

# wrapping the assignment in parentheses makes its value visible
(v6 <- c(1, c(4, 7)))

| this explains why we can do something like this:

# chained assignment: b is assigned first, and that value is then given to a
a <- b <- 1:2
a
b
# calls can be nested to any depth; they are evaluated from the inside out
head(c(paste(letters[1:3], LETTERS[8:5], c(1, 2)), 1:8), 7)
string <- 'This is a rather long character string, 
           like a sentence, it even has a period at the end.'

# get the five longest words from this sentence, in uppercase.
# Splitting on runs of non-letters (rather than on single spaces) keeps
# punctuation, the embedded line break and empty strings out of the words.
s <- strsplit(string, "[^[:alpha:]]+")[[1]]
toupper(s[order(nchar(s), decreasing = TRUE)][1:5])




Data Frame

# number of rows
nrow(iris)

# number of columns
ncol(iris)

# both at once
dim(iris)
# row names
rownames(iris)

# column names
colnames(iris)
# row and column names together
dimnames(iris)

\newline

# different lengths of columns throw an error
data.frame(1:4, letters[1:4], c("some", "strings", "here"))

data.frame(1:4, letters[1:4], c("some", "strings", "go", "here"))
# the automatic names are unhelpful

d1 <- data.frame(
  first = 1:4,
  second = letters[1:4],
  "third" = c("some", "strings", "go", "here")
)
d1
# note names need not be quoted

# converting a matrix
as.data.frame(m)

dimnames(m) <- NULL
as.data.frame(m)
# these automatic names are better

# converting lists
l3
as.data.frame(l3)

l4
as.data.frame(l4)

\newline

\newline

# compare what colnames() and names() return for a data frame
colnames(iris)
names(iris)

identical(names(iris), colnames(iris))

\newline

# show the first rows
head(iris)
# show a compact summary of the structure
str(iris)

(What's this "Factor" thing here, you ask? Hold that thought.)


subsetting a data frame

# use numeric indices
iris[1:5, 3:4]

# use row/column names
iris[c("1", "4", "9"), c("Petal.Length", "Species")]

# leave columns unspecified to get them all
iris[1:5, ]

# leave rows unspecified to get them all
iris[, 4:5]

# selecting a single column will yield a vector, not a data frame
iris[, "Species"]

# unless you specify that dimensions not be dropped
# (TRUE/FALSE are spelled out throughout: T and F are ordinary variables
# that can be reassigned)
iris[, "Species", drop = FALSE]

# use logical vectors - highly unorthodox but possible
iris[c(TRUE, FALSE), c(TRUE, FALSE)]
# note the recycling

# again, selecting a single column drops dimensions
iris[, c(FALSE, FALSE, TRUE, FALSE, FALSE)]

# unless forbidden
iris[, c(FALSE, FALSE, TRUE, FALSE, FALSE), drop = FALSE]

# a single index selects columns
iris[4:5]
iris["Species"]
# always yields a data frame, consistent with list behavior
iris[[5]]
iris[["Species"]]
# always yields a vector

# NOTE(review): the calls below with `drop`, with two names and with 1:5 look
# like deliberate demonstrations of failing calls - confirm that the chunk is
# rendered with errors allowed.
# (FALSE is spelled out: F is an ordinary variable that can be reassigned)
iris[[5, drop = FALSE]]
# keeping dimensions is not applicable

iris[[c("Species", "Petal.Length")]]
# only one item

iris[[1:5]]
iris[[1:2]]
# madness
iris$Species

\newline

# keep only the rows where Species equals 'setosa'
iris[iris$Species == "setosa", ]
# 1. the iris$Species column is compared to the string 'setosa'
iris$Species == "setosa"
# this yields a logical vector EXACTLY the same length as iris$Species,
# i.e. the number of rows

# 2. this vector is then used for extracting rows
ind <- iris$Species == "setosa"
iris[ind, ]
# several conditions can be combined with &
iris[iris$Species == "setosa" & iris$Sepal.Length >= 5, ]
# rows and a column can be selected in the same call
iris[iris$Species != "virginica" & iris$Sepal.Length >= 5, "Petal.Width"]


replacement in a data frame

# suppose we want to omit some outliers from our calculations
# we can replace some data points with NA
# NOTE(review): & binds tighter than |, so the mask selects every row with
# Sepal.Length > 6 plus the versicolor rows with Sepal.Length < 5. The
# parentheses only spell out that existing grouping - confirm it is the
# intended one.
iris[
  iris$Sepal.Length > 6 |
    (iris$Sepal.Length < 5 & iris$Species == "versicolor"),
  "Petal.Width"
] <- NA
iris


operations with columns

head(mtcars)

# compute how much horse power (hp) a car has per cylinder (cyl)
mtcars$hp / mtcars$cyl
# store the result as a new column
mtcars$hp.per.cyl <- mtcars$hp / mtcars$cyl
head(mtcars)
# the same assignment written with double brackets
mtcars[["hp.per.cyl"]] <- mtcars[["hp"]] / mtcars[["cyl"]]
# rather than substituting NAs for numbers,
# let's add a logical column that will flag certain observations as outliers
# The comparison already yields TRUE/FALSE, so it is stored directly instead
# of being passed through ifelse(..., TRUE, FALSE).
# NOTE(review): & binds tighter than |, so this flags every row with
# Sepal.Length > 6 plus the versicolor rows with Sepal.Length < 5. The
# parentheses only spell out that existing grouping - confirm it is the
# intended one.
iris$outlier <-
  iris$Sepal.Length > 6 |
  (iris$Sepal.Length < 5 & iris$Species == "versicolor")
iris


A data frame is a special kind of list made up of atomic vectors.
Extracting rows and columns is robust.
Operations on columns are easy and reliable.




Factor

# look at how the Species column is reported
str(iris)
head(iris$Species)

str(iris$Species)

a typical reaction to factors

# a character vector and the factor built from it
ch <- rep(c("c", "b", "a"), times = 3)
f <- factor(ch)

ch

f

# the same labels and values in different orders
factor(1:3, labels = c("a", "b", "c"))

factor(3:1, labels = c("a", "b", "c"))

factor(1:3, labels = c("c", "b", "a"))

factor(3:1, labels = c("c", "b", "a"))

# as.factor() also converts a character vector
as.factor(ch)

# random numbers stored as character strings
nch <- as.character(rnorm(10))

nch

# converting the strings to numeric
as.numeric(nch)

# converting the strings to a factor first
nf <- as.factor(nch)
nf

# and the factor to numeric: compare with the result above
as.numeric(nf)

\newline

Wherefore the factors?

Factors store categorical data. They used to be a very useful way to encode what are called discrete variables. Today much of what factors do can be done by character vectors just as well, but many procedures rely on a factor input, so they are not going away any time soon.

It may seem superfluous now but we will get to places where factors are helpful and even necessary.

I promise.


Factors are confusing so it's all right to be confused.




olobiolo/Rdlazer documentation built on Aug. 6, 2022, 11:37 a.m.