inst/extdata/R-code/07-Basic-objects.R

#' # Basic Object Classes {#basic-classes}
#' 
## ---- include = FALSE-----------------------------------------------------------------------------------------------
source('Scripts/preamble_chapters.R')

#' 
#' **The basic classes are the most primary elemen
#' 
#' In this chapter, we will study R's basic object
#' 
#' - Numeric (`numeric`)
#' - Text (`character`)
#' - Factors (`factor`)
#' - Logical Values (`logical`)
#' - Dates and Time (`Date` and `timedate`)
#' - Missing Data (`NA`)
#' 
#' 
#' ## `Numeric` Objects
#' 
#' The objects of type `numeric` represent quantit
#' 
#' 
#' ### Creating and Manipulating `numeric` Objects
#' 
#' It is easy to create and manipulate the `numeri
#' 
#' As you can see in the next example, where we ha
#' 
## -------------------------------------------------------------------------------------------------------------------
# create numeric vectors
x <- 1:5
y <- 2:6

# print sum
print(x+y)

# print multiplication
print(x*y)

# print division
print(x/y)

# print exponentiation
print(x^y)

#' 
#' The difference between R and other programming 
#' 
## -------------------------------------------------------------------------------------------------------------------
# set x with 4 elements and y with 2
x <- 1:4
y <- 2:1

# print multiplication
print(x + y)

#' 
#' The result of `x + y` is equivalent to `1:4 + c
#' 
## ----warning=TRUE, error=TRUE---------------------------------------------------------------------------------------
# set x = 4 elements and y with 3
x <- c(1, 2, 3, 4)
y <- c(1, 2, 3)

# print sum (recycling rule)
print(x +y)

#' 
#' The first three elements of `x` were summed to 
#' 
#' Elements of a `numeric` vector can also be name
#' 
## -------------------------------------------------------------------------------------------------------------------
# create named vector
x <- c(item1 = 10, 
       item2 = 14, 
       item3 = 9, 
       item4 = 2)

# print it
print(x)

#' 
#' Empty `numeric` vectors can also be created. So
#' 
## -------------------------------------------------------------------------------------------------------------------
# create empty numeric vector of length 10
my_x <- numeric(length = 10)

# print it
print(my_x)

#' 
#' As you can see, when using `numeric(length = 10
#' 
#' 
#' ### Creating a `numeric` Sequence
#' 
#' In R, you have two ways to create a sequence of
#' 
#' However, using the operator `:` can be restrict
#' 
## -------------------------------------------------------------------------------------------------------------------
# create sequence with seq
my_seq <- seq(from = -10, 
              to = 10, 
              by = 2)

# print it
print(my_seq)

#' 
#' Another interesting feature of function `seq` i
#' 
## -------------------------------------------------------------------------------------------------------------------
# create sequence with defined number of elements
desired_len <- 20
my_seq <- seq(from = 0, 
              to = 10, 
              length.out = desired_len)

# print it
print(my_seq)

#' 
#' The final number of elements in object `my_seq`
#' 
#' 
#' ### Creating Vectors with Repeated Elements
#' 
#' Another way to create `numeric` vectors is by u
#' 
## -------------------------------------------------------------------------------------------------------------------
# repeat vector three times
my_x <- rep(x = 1, times = 10)

# print it
print(my_x)

#' 
#' It also works with vectors. For example, let's 
#' 
## -------------------------------------------------------------------------------------------------------------------
# created a vector with repeated elements
my_x <- rep(x = c(1, 2), 
            times = 3)

# print it
print(my_x)

#' 
#' 
#' ### Creating Vectors with Random Numbers
#' 
#' Some applications in finance and economics requ
#' 
#' In R, several functions create random numbers f
#' 
#' Function `rnorm` generates random numbers from 
#' 
## -------------------------------------------------------------------------------------------------------------------
# generate 10 random numbers from a Normal distribution
my_rnd_vec <- rnorm(n = 10000, 
                    mean = 0, 
                    sd = 1)

# print it
glimpse(my_rnd_vec)

#' 
#' We generated ten thousand random numbers from a
#' 
## ---- message=FALSE, echo=FALSE-------------------------------------------------------------------------------------
p <- ggplot(tibble(x = my_rnd_vec), aes(x = x)) + 
  geom_histogram() + 
  theme_bw()

print(p)

#' 
#' Yes, the shape of the frequency distribution is
#' 
#' Function `runif` generates random values unifor
#' 
## -------------------------------------------------------------------------------------------------------------------
# create a random vector with minimum and maximum
my_rnd_vec <- runif(n = 10, 
                    min = -5, 
                    max = 5)

# print it
print(my_rnd_vec)

#' 
#' Note that both functions, `rnorm` and `runif`, 
#' 
## -------------------------------------------------------------------------------------------------------------------
# create sequence
my_vec <- seq(from = 0, to = 25, by=5)

# sample sequence
my_rnd_vec <- sample(my_vec)

# print it
print(my_rnd_vec)

#' 
#' Function `sample` also allows the random select
#' 
## -------------------------------------------------------------------------------------------------------------------
# sample one element of my_vec
my_rnd_vec <- sample(my_vec, size = 1)

# print it
print(my_rnd_vec)

#' 
#' If we wanted two random elements from `my_rnd_v
#' 
## -------------------------------------------------------------------------------------------------------------------
# sample two elements of my_vec
my_rnd_vec <- sample(my_vec, size = 2)

# print it
print(my_rnd_vec)

#' 
#' Besides, you can select values from a smaller v
#' 
## -------------------------------------------------------------------------------------------------------------------
# create vector
my_vec <- c(5, 10, 15)

# sample
my_rnd_vec <- sample(x = my_vec, size = 10, replace = TRUE)
print(my_rnd_vec)

#' 
#' Another important feature of `sample` is it wor
#' 
## -------------------------------------------------------------------------------------------------------------------
# example of sample with characters
print(sample(c('elem 1','elem 2','elem 3'),
             size = 1))

# example of sample with list
print(sample(list(x = c(1,1,1),
                  y = c('a', 'b')),
             size = 1))

#' 
#' At this point, it is important to acknowledge t
#' 
#' One neat trick is that we can select the starti
#' 
## -------------------------------------------------------------------------------------------------------------------
# set seed with integer 10
set.seed(seed = 10)

# create and print "random" vectors
my_rnd_vec_1 <- runif(5)
print(my_rnd_vec_1)

my_rnd_vec_2 <- runif(5)
print(my_rnd_vec_2)

#' 
#' In the previous code, the value 10 in `set.seed
#' 
#' Function `set.seed` also works for `sample`:
#' 
## -------------------------------------------------------------------------------------------------------------------
# fix seed
set.seed(seed = 15)

# print vectors
print(sample(1:10))
print(sample(10:20))

#' 
#' Likewise, if you execute the previous code in y
#' 
#' 
#' ### Accessing the Elements of a `numeric` Vecto
#' 
#' All elements of a numerical vector can be acces
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector
x <- c(-1, 4, -9, 2)

# get first element
first_elem_x <- x[1]

# print it
print(first_elem_x)

#' 
#' The same notation is used to extract parts of a
#' 
## -------------------------------------------------------------------------------------------------------------------
# sub-vector of x
sub_x <- x[1:2]

# print it
print(sub_x)

#' 
#' To access named elements of a numeric array, si
#' 
## -------------------------------------------------------------------------------------------------------------------
# set named vector
x <- c(item1 = 10, item2 = 14, item3 = -9, item4 = -2)

# access elements by name
print(x['item2'])
print(x[c('item2','item4')])

#' 
#' We can also access the elements of a numerical 
#' 
## -------------------------------------------------------------------------------------------------------------------
# find all values of x higher than zero
print(x[x > 0])

#' 
#' The selection of elements from a vector, accord
#' 
#' 
#' ### Modifying and Removing Elements of a `numer
#' 
#' The modification of a vector is very simple. Ju
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector
my_x <- 1:4

# modify first element to 5
my_x[1] <- 5

# print result
print(my_x)

#' 
#' This modification can also be performed block-w
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector
my_x <- 0:5

# set the first three elements to 5
my_x[1:3] <- 5

# print result
print(my_x)

#' 
#' Using conditions to change values in a vector i
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector
my_x <- -5:5

# set any value lower than 2 to 0
my_x[my_x<2] <- 0

# print result
print(my_x)

#' 
#' The removal of elements of a vector is carried 
#' 
## -------------------------------------------------------------------------------------------------------------------
# create vector
my_x <- -5:5

# remove first and second element of my_x
my_x <- my_x[-(1:2)]

# show result
print(my_x)

#' 
#' Notice how using negative index simply returns 
#' 
#' 
#' ### Creating Groups
#' 
#' A common data task is to create groups based on
#' 
#' In R, the function used to create intervals fro
#' 
## -------------------------------------------------------------------------------------------------------------------
# set random vector
my_x <- rnorm(10000)

# create groups with 5 breaks
my_cut <- cut(x = my_x, breaks = 5)

# print it!
print(head(my_cut))

#' 
#' Be aware that ranges define the names in `my_cu
#' 
## -------------------------------------------------------------------------------------------------------------------
print(table(my_cut))

#' 
#' As expected, the distribution of values is bala
#' 
#' With the `cut` function, you can also define cu
#' 
## -------------------------------------------------------------------------------------------------------------------
# create random vector in tibble
my_df <- tibble(x = rnorm(10000))

# define breaks and labels manually
my_breaks <- c(min(my_x)-1, -1, 1, max(my_x)+1)
my_labels <- c('Low','Normal', 'High')

# create group from numerical vector
my_df <- my_df %>%
  mutate(cut_x = cut(x = x, 
                     breaks = my_breaks, 
                     labels = my_labels))

# glimpse it!
glimpse(my_df)

#' 
#' Notice that, in this example of creating a grou
#' 
## -------------------------------------------------------------------------------------------------------------------
print(table(my_df$cut_x))

#' 
#' 
#' ### Other Useful Functions
#' 
#' **as.numeric** - Converts an object to the `num
#' 
## -------------------------------------------------------------------------------------------------------------------
my_text <- c('1', '2', '3')
class(my_text)
my_x <- as.numeric(my_text)
print(my_x)
class(my_x)

#' 
#' 
#' **unique** - Returns all unique values of a num
#' 
## -------------------------------------------------------------------------------------------------------------------
my_x <- c(1, 1, 2, 3, 3, 5)
print(unique(my_x))

#' 
#' **sum** - Sums all elements of a `numeric` vect
#' 
## -------------------------------------------------------------------------------------------------------------------
my_x <- 1:50
my_sum <- sum(my_x)
print(my_sum)

#' 
#' **max** - Returns the maximum value of a `numer
#' 
## -------------------------------------------------------------------------------------------------------------------
x <- c(10, 14, 9, 2)
max_x <- max(x)
print(max_x)

#' 
#' **min** - Returns the minimum value of a `numer
#' 
## -------------------------------------------------------------------------------------------------------------------
x <- c(12, 15, 9, 2)
min_x <- min(x)
print(min_x)

#' 
#' **which.max** - Returns the position of the max
#' 
## -------------------------------------------------------------------------------------------------------------------
x <- c(100, 141, 9, 2)
which_max_x <- which.max(x)
cat(paste('The position of the max value of x is ', 
          which_max_x))
cat(' and its value is ', x[which_max_x])

#' 
#' **which.min** - Returns the position of the min
#' 
## -------------------------------------------------------------------------------------------------------------------
x <- c(10, 14, 9, 2)
which_min_x <- which.min(x)
cat(paste('The position of the min value of x is ',
          which_min_x, ' and its value is ', x[which_min_x]))

#' 
#' **sort** - Returns a sorted (ascending or desce
#' 
## -------------------------------------------------------------------------------------------------------------------
x <- runif(5)
print(sort(x, decreasing = FALSE))
print(sort(x, decreasing = TRUE))

#' 
#' **cumsum** - Returns the cumulative sum of the 
#' 
## -------------------------------------------------------------------------------------------------------------------
my_x <- 1:25
my_cumsum <- cumsum(my_x)
print(my_cumsum)

#' 
#' **prod** - Returns the product (multiplication)
#' 
## -------------------------------------------------------------------------------------------------------------------
my_x <- 1:10
my_prod <- prod(my_x)
print(my_prod)

#' 
#' **cumprod** - Returns the cumulative product of
#' 
## -------------------------------------------------------------------------------------------------------------------
my_x <- 1:10
my_prod <- cumprod(my_x)
print(my_prod)

#' 
#' 
#' ## `Character` Objects
#' 
#' The `character` class, or simply text class, is
#' 
#' R has several features that facilitate the crea
#' 
#' A positive aspect of `stringr` is that all func
#' 
#' 
#' ### Creating a Simple `character` Object
#' 
#' In R, every `character` object is created by en
#' 
## -------------------------------------------------------------------------------------------------------------------
tickers <- c('MMM', 'FB', 'ICE')
print(tickers)

#' 
#' We can confirm the class of the created object 
#' 
## -------------------------------------------------------------------------------------------------------------------
class(tickers)

#' 
#' 
#' ### Creating Structured `character` Objects
#' 
#' We can also use R to create a text vector with 
#' 
#' To create a text vector with the junction of te
#' 
## -------------------------------------------------------------------------------------------------------------------
library(stringr)

# create sequence and tex
my_seq <- 1:20
my_text <- 'text'

# paste objects together (without space)
my_char <- str_c(my_text, my_seq)
print(my_char)

# paste objects together (with space)
my_char <- str_c(my_text, 
                 my_seq, 
                 sep = ' ')
print(my_char)

#' 
#' We can do the same procedure with text vectors:
#' 
## -------------------------------------------------------------------------------------------------------------------
# set character value
my_x <- 'My name is'

# set character vector
my_names <- c('Marcelo', 'Ricardo', 'Tarcizio')

# paste and print
print(str_c(my_x, my_names, sep = ' '))

#' 
#' Another possibility of building structured text
#' 
## -------------------------------------------------------------------------------------------------------------------
# repeat 'abc' five times
my_char <- str_dup(string = 'abc', times = 5)

# print it
print(my_char)

#' 
#' 
#' ### `character` Constants
#' 
#' R also allows direct access to all letters of t
#' 
## -------------------------------------------------------------------------------------------------------------------
# print all letters in alphabet (no cap)
print(letters)

#' 
## -------------------------------------------------------------------------------------------------------------------
# print all letters in alphabet (WITH CAP)
print(LETTERS)

#' 
#' Note that, in both cases, `letters` and `LETTER
#' 
#' Other constant `character` objects in R are `mo
#' 
## -------------------------------------------------------------------------------------------------------------------
# print abbreviation and full names of months
print(month.abb)
print(month.name)

#' 
#' 
#' ### Selecting Pieces of a Text Object
#' 
#' A common beginner's mistake is to select charac
#' 
## -------------------------------------------------------------------------------------------------------------------
# set char object
my_char <- 'ABCDE'

# print its second element (WRONG - RESULT is NA)
print(my_char[2])

#' 
#' The `NA` value indicates the second element of 
#' 
## -------------------------------------------------------------------------------------------------------------------
print(my_char[1])

#' 
#' The result is simply the _ABCDE_ text, on the f
#' 
## -------------------------------------------------------------------------------------------------------------------
# print third and fourth characters
my_substr <- str_sub(string = my_char,
                     start = 2,
                     end = 2)
print(my_substr)

#' 
#' These functions also work for atomic vectors. L
#' 
## -------------------------------------------------------------------------------------------------------------------
# build char vec
my_char_vec <- paste0(c('ABC','VBC','ZMN'),
                      ' - other ignorable text')
print(my_char_vec)

#' 
#' Here, we want the information in the first thre
#' 
## -------------------------------------------------------------------------------------------------------------------
# get ids with stringr::str_sub
ids_vec <- str_sub(my_char_vec, 1, 3)
print(ids_vec)

#' 
## **Vector operations in character objects are very common in R**. Almost anything you can do to a single element can be expanded to vectors. This facilitates the development of research scripts as you can easily perform complicated tasks to a series of elements in a single line of code.

#' 
#' ### Finding and Replacing Characters of a Text
#' 
#' A useful operation in handling texts is to loca
#' 
#' The most common case of string search is to ver
#' 
#' The following example shows how to find the _D_
#' 
## -------------------------------------------------------------------------------------------------------------------
# set character object
my_char <- 'ABCDEF-ABCDEF-ABC'

# find position of 'D' using str_locate
pos <- str_locate(my_char, fixed('D'))
print(pos)

#' 
#' Note the `str_locate` function returns only the
#' 
## -------------------------------------------------------------------------------------------------------------------
# set object
my_char <- 'ABCDEF-ABCDEF-ABC'

# find position of ALL 'D' using str_locate_all
pos <- str_locate_all(my_char, fixed('D'))
print(pos)


#' 
#' To replace characters in a text, use functions 
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# set char object
my_char <- 'ABCDEF-ABCDEF-ABC'

# substitute the FIRST 'ABC' for 'XXX' with str_replace
my_char <- str_replace(string = my_char,
                       pattern = 'ABC',
                       replacement = 'XXX')
print(my_char)

#' 
#' And now, we globally substitute characters.
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# set char object
my_char <- 'ABCDEF-ABCDEF-ABC'

# substitute ALL 'ABC' for 'XXX' with str_replace_all
my_char <- str_replace_all(string = my_char,
                           pattern = 'ABC',
                           replacement = 'XXX')

# print result
print(my_char)

#' 
#' Again, it is worth pointing out that the operat
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# set char object
my_char <- c('ABCDEF','DBCFE','ABC')

# create an example of vector
my_char_vec <- paste(sample(my_char, 5, replace = T),
                     sample(my_char, 5, replace = T),
                     sep = ' - ')

# show it
print(my_char_vec)

# substitute all occurrences of 'ABC'
my_char_vec <- str_replace_all(string = my_char_vec,
                               pattern = 'ABC',
                               replacement = 'XXX')

# print result
print(my_char_vec)

#' 
#' 
#' ### Splitting Text
#' 
#' Eventually, you will need to break a text into 
#' 
## -------------------------------------------------------------------------------------------------------------------
# set char
my_char <- 'ABC;ABC;BCD'

# split it based on ';' and using stringr::str_split
splitted_char <- str_split(my_char, ';')

# print result
print(splitted_char)


#' 
#' The output of this function is an object of typ
#' 
## -------------------------------------------------------------------------------------------------------------------
print(splitted_char[[1]][3])

#' 
#' For an example of a split in character vectors,
#' 
## -------------------------------------------------------------------------------------------------------------------
# set char
my_char_vec <- c('ABCDEF','DBCFE','ABFC','ACD')

# split it based on 'B' and using stringr::strsplit
splitted_char <- str_split(my_char_vec, 'B')

# print result
print(splitted_char)

#' 
#' Notice how, again, an object of type `list` is 
#' 
#' 
#' ### Finding the Number of Characters in a Text
#' 
#' If we want to discover the number of characters
#' 
## -------------------------------------------------------------------------------------------------------------------
# set char
my_char <- 'abcdef'

# print number of characters using stringr::str_length
print(str_length(my_char))

#' 
#' And now an example with vectors.
#' 
## -------------------------------------------------------------------------------------------------------------------
#set char
my_char <- c('a', 'ab', 'abc')

# print number of characters using stringr::str_length
print(str_length(my_char))

#' 
#' 
#' ### Generating Combinations of Text
#' 
#' One useful trick in R is to use functions `oute
#' 
## -------------------------------------------------------------------------------------------------------------------
# set char vecs
my_vec_1 <- c('a','b')
my_vec_2 <- c('A','B')

# combine in matrix
comb_mat <- outer(my_vec_1, my_vec_2, paste, sep='-')

# print it!
print(comb_mat)

#' 
#' The output of `outer` is a `matrix` type of obj
#' 
## -------------------------------------------------------------------------------------------------------------------
print(as.character(comb_mat))

#' 
#' Another way to reach the same objective is by u
#' 
## -------------------------------------------------------------------------------------------------------------------
library(tidyverse)

# set vectors
my_vec_1 <- c('John ', 'Claire ', 'Adam ')
my_vec_2 <- c('is fishing.', 'is working.')

# create df with all combinations
my_df <- expand_grid(name = my_vec_1,
                     verb = my_vec_2)

# print df
print(my_df)

# paste columns together in tibble
my_df <- my_df %>%
  mutate(phrase = paste0(name, verb) )

# print result
print(my_df)

#' 
#' Here, we used the function `expand.grid` to cre
#' 
#' 
#' ### Encoding of `character` Objects
#' 
#' For R, a text string is just a sequence of _byt
#' 
#' Let's explore an example. Here, we will import 
#' 
## -------------------------------------------------------------------------------------------------------------------
library(readr)

# read text file
my_f <- afedR::get_data_file('FileWithLatinChar_Latin1.txt')

my_char <- read_lines(my_f)

# print it
print(my_char)

#' 
#' The original content of the file is a text in P
#' 
#' The easiest solution is to modify input `locale
#' 
## -------------------------------------------------------------------------------------------------------------------
my_char <- read_lines(my_f, 
                      locale = locale(encoding='Latin1'))

# print it
print(my_char)

#' 
#' The Latin characters are now correct because th
#' 
#' 
#' ### Other Useful Functions
#' 
#' **stringr::str_to_lower**/**base::tolower** - C
#' 
## -------------------------------------------------------------------------------------------------------------------
print(stringr::str_to_lower('ABC'))

#' 
#' **stringr::str_to_upper**/**base::toupper** - C
#' 
## -------------------------------------------------------------------------------------------------------------------
print(stringr::str_to_upper('abc'))

#' 
#' 
#' ## `Factor` Objects
#' 
#' Object class `factor` is used to represent grou
#' 
#' The `factor` class integrates nicely with stati
#' 
#' 
#' ### Creating `factors`
#' 
#' The creation of factors is accomplished with fu
#' 
## -------------------------------------------------------------------------------------------------------------------
# create factor
my_factor <- factor(c('M', 'F', 'M',
                      'M', 'F', 'F'))

# print it
print(my_factor)

#' 
#' Notice that, in the previous example, the prese
#' 
## -------------------------------------------------------------------------------------------------------------------
# create factor with 3 levels
my_factor <- factor(c('M', 'F', 'M', 
                      'M', 'F', 'F',
                      'ND'))

# print factor
print(my_factor)

#' 
#' Here, we also have the `ND` (not defined) group
#' 
#' An important point about creating factors is th
#' 
## -------------------------------------------------------------------------------------------------------------------
# set factors with 1 level
my_status <- factor(c('Single', 'Single', 'Single'))

# print it
print(my_status)

#' 
#' Accidentally, the data in `my_status` only show
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
my_status <- factor(c('Single', 'Single', 'Single'),
                    levels = c('Single', 'Married'))

print(my_status)

#' 
#' 
#' ### Modifying `factors`
#' 
#' An important point about the `factor` type of o
#' 
## ----warning=TRUE---------------------------------------------------------------------------------------------------
# set factor
my_factor <- factor(c('a', 'b', 'a', 'b'))

# change first element of a factor to 'c'
my_factor[1] <- 'c'

# print result
print(my_factor)

#' 
#' As we expected, the first element of `my_factor
#' 
## -------------------------------------------------------------------------------------------------------------------
# set factor
my_factor <- factor(c('a', 'b', 'a', 'b'))

# change factor to character
my_char <- as.character(my_factor)

# change first element
my_char[1] <- 'c'

# mutate it back to class factor
my_factor <- factor(my_char)

# show result
print(my_factor)

#' 
#' Using these steps, we have the desired result i
#' 
#' The `tidyverse` universe also has its own packa
#' 
## -------------------------------------------------------------------------------------------------------------------
library(forcats)

# set factor
my_factor <- factor(c('A', 'B', 'C', 
                      'A', 'C', 'M', 
                      'N'))

# modify factors
my_factor <- fct_recode(my_factor,
                        'D' = 'A',
                        'E' = 'B',
                        'F' = 'C')

# print result
print(my_factor)

#' 
#' Using `forcats::fct_recode` is intuitive. All w
#' 
#' 
#' ### Converting `factors` to Other Classes
#' 
#' Attention is required when converting a `factor
#' 
## -------------------------------------------------------------------------------------------------------------------
# create factor
my_char <-factor(c('a', 'b', 'c'))

# convert and print
print(as.character(my_char))

#' 
#' However, when the same procedure is performed f
#' 
## -------------------------------------------------------------------------------------------------------------------
# set factor
my_values <- factor(5:10)

# convert to numeric (WRONG)
print(as.numeric(my_values))

#' 
#' As you can see, all elements in `my_values` wer
#' 
## -------------------------------------------------------------------------------------------------------------------
# converting factors to character and then to numeric
print(as.numeric(as.character(my_values)))

#' 
## **Always be careful when transforming factors into numbers**. This bug is hard to catch and may go unnoticed until it breaks your code or jeopardizes your analysis. Remember that, internally, R converts a numerical factor to its index level number, and not the actual number.

#' 
#' 
#' ### Creating Contingency Tables
#' 
#' After creating a factor, we can find the number
#' 
## -------------------------------------------------------------------------------------------------------------------
# create factor
my_factor <- factor(sample(c('Pref', 'Ord'),
                           size = 20,
                           replace = TRUE))

# print contingency table
print(table(my_factor))

#' 
#' A more advanced usage of function `table` is to
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# set factors
my_factor_1 <- factor(sample(c('Pref', 'Ord'),
                             size = 20,
                             replace = TRUE))

my_factor_2 <- factor(sample(paste('Group', 1:3),
                             size = 20,
                             replace = TRUE))

# print contingency table with two factors
print(table(my_factor_1, 
            my_factor_2))

#' 
#' The table that we created previously demonstrat
#' 
#' 
#' ### Other Useful Functions
#' 
#' **levels** - Returns the `Levels` an object of 
#' 
## -------------------------------------------------------------------------------------------------------------------
my_factor <- factor(c('A', 'A', 'B', 'C', 'B'))
print(levels(my_factor))

#' 
#' **as.factor** - Transforms an object to the cla
#' 
## -------------------------------------------------------------------------------------------------------------------
my_y <- c('a','b', 'c', 'c', 'a')
my_factor <- as.factor(my_y)

print(my_factor)

#' 
#' 
#' **split** - Based on a grouping variable and an
#' 
## -------------------------------------------------------------------------------------------------------------------
my_factor <- factor(c('A','B','C','C','C','B'))
my_x <- 1:length(my_factor)

my_l <- split(x = my_x, f = my_factor)

print(my_l)

#' 
#' 
#' ## `Logical` Objects
#' 
#' Logical tests are at the heart of R. In one lin
#' 
#' 
#' ### Creating `logical` Objects
#' 
#' Objects of class `logical` are created based on
#' 
## -------------------------------------------------------------------------------------------------------------------
# set numerical
my_x <- 1:10

# print a logical test
print(my_x > 5)

# print position of elements from logical test
print(which(my_x > 5))

#' 
#' In the previous example, function `which` retur
#' 
#' To perform equality tests, simply use the equal
#' 
## -------------------------------------------------------------------------------------------------------------------
# create char
my_char <- rep(c('abc', 'bcd'), 
               times = 5)

# print its contents
print(my_char)

# print logical test
print(my_char == 'abc')

#' 
#' For an inequality test, use symbol **`!=`**, as
#' 
## -------------------------------------------------------------------------------------------------------------------
# print inequality test
print(my_char != 'abc')

#' 
#' It is also possible to test multiple logical co
#' 
## -------------------------------------------------------------------------------------------------------------------
my_x <- 1:10

# print logical for values higher than 4 and lower than 7
print((my_x > 4)&(my_x < 7) )

# print the actual values
idx <- which( (my_x > 4)&(my_x < 7) )
print(my_x[idx])

#' 
#' For non-simultaneous conditions, i.e., the occu
#' 
## -------------------------------------------------------------------------------------------------------------------
# location of elements higher than 7 or lower than 4
idx <- which( (my_x > 7)|(my_x < 4) )

# print elements from previous condition
print(my_x[idx])

#' 
#' Be aware that we used parentheses to encapsulat
#' 
#' Another interesting use of logical objects is t
#' 
## -------------------------------------------------------------------------------------------------------------------
library(dplyr)
# location of elements higher than 7 or lower than 4
my_contries <- c('Country 1', 'Country 2')

# set df
n_obs <- 100
df_temp <- tibble(country = str_c('Country ',
                                  sample(1:10, 
                                         size = n_obs,
                                         replace = TRUE)),
                  inflation.rate = rnorm(n_obs, sd = 0.05) ) %>%
  glimpse()

# filter rows of df with selected tickers
df_temp <- df_temp %>%
  filter(country %in% my_contries) %>%
  glimpse()


#' 
#' The resulting dataframe only has rows for `'Cou
#' 
#' 
#' ## Date and Time
#' 
#' The representation and manipulation of dates is
#' 
#' 
#' ### Creating Simple Dates
#' 
#' In R, several classes can represent dates. The 
#' 
#' The most basic class, indicating the day, month
#' 
## -------------------------------------------------------------------------------------------------------------------
library(lubridate)

# set Date object (YMD)
print(ymd('2020-06-24'))

# set Date object (DMY)
print(dmy('24-06-2020'))

# set Date object (MDY)
print(mdy('06-24-2020'))

#' 
#' Note that the functions return the exact same o
#' 
#' One benefit of using the `lubridate` package is
#' 
## -------------------------------------------------------------------------------------------------------------------
# set Date object 
print(ymd('2020/06/24'))

# set Date object 
print(ymd('2020&06&24'))

# set Date object
print(ymd('2020 june 24'))

# set Date object
print(dmy('24 of june 2020'))

#' 
#' This is a very useful property of `lubridate`, 
#' 
#' Now, using the `base` package, we can create a 
#' 
## -------------------------------------------------------------------------------------------------------------------
# set Date from dd/mm/yyyy with the definition of format
my_date <- as.Date('24/06/2021', format = '%d/%m/%Y')

# print result
print(my_date)

#' 
#' The symbols used in _input_ `format`, such as `
#' 
#' 
#' | Symbol |          Description   |Example |
#' |:------:|:----------------------:|:------:|
#' |%d      |day of month (decimal)  |0       |
#' |%m      |month (decimal)         |12      |
#' |%b      |month (abbreviation)    |Apr     |
#' |%B      |month (complete name)   |April   |
#' |%y      |year (2 digits)         |16      |
#' |%Y      |month (4 digits)        |2020    |
#' 
#' By using the previous table, you'll be able to 
#' 
#' 
#' ### Creating a Sequence of `Dates`
#' 
#' An interesting aspect of objects `Date` is they
#' 
## -------------------------------------------------------------------------------------------------------------------
# create date
my_date <- ymd('2021-06-01')

# find next day
my_date_2 <- my_date + 1

# print result
print(my_date_2)

#' 
#' This property also works with vectors, facilita
#' 
## -------------------------------------------------------------------------------------------------------------------
# create a sequence of Dates
my_date_vec <- my_date + 0:15

# print it
print(my_date_vec)

#' 
#' A more customizable way for creating `Date` seq
#' 
## -------------------------------------------------------------------------------------------------------------------
# set first and last Date
my_date_1 <- ymd('2021-03-07')
my_date_2 <- ymd('2021-03-20')

# set sequence
my_vec_date <- seq(from = my_date_1,
                   to = my_date_2,
                   by = '2 days')

# print result
print(my_vec_date)

#' 
#' Likewise, if we wanted a sequence of dates for 
#' 
## -------------------------------------------------------------------------------------------------------------------
# set first and last Date
my_date_1 <- ymd('2021-03-07')
my_date_2 <- ymd('2021-04-20')

# set sequence
my_vec_date <- seq(from = my_date_1,
                   to = my_date_2,
                   by = '2 weeks')

# print result
print(my_vec_date)

#' 
#' Another way to use function `seq` is by setting
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# set dates
my_date_1 <- as.Date('2021-06-27')
my_date_2 <- as.Date('2021-07-27')

# set sequence with 10 elements
my_vec_date <- seq(from = my_date_1,
                   to = my_date_2,
                   length.out = 10)

# print result
print(my_vec_date)

#' 
#' Once again, the interval between the dates is a
#' 
#' 
#' ### Operations with `Dates`
#' 
#' We can calculate difference of days between two
#' 
## -------------------------------------------------------------------------------------------------------------------
# set dates
my_date_1 <- ymd('2015-06-24')
my_date_2 <- ymd('2020-06-24')

# calculate difference
diff_date <- my_date_2 - my_date_1

# print result
print(diff_date)

#' 
#' The output of the subtraction operation is an o
#' 
## -------------------------------------------------------------------------------------------------------------------
# print difference of days as numerical value
print(diff_date[[1]])

#' 
#' Going further, we can also use mathematical ope
#' 
## -------------------------------------------------------------------------------------------------------------------
# set date and vector
my_date_1 <- ymd('2021-06-20')
my_date_vec <- ymd('2021-06-20') + seq(-5,5)

# test which elements of my_date_vec are older than my_date_1
my.test <- (my_date_vec > my_date_1)

# print result
print(my.test)

#' 
#' The previous operation is useful when selecting
#' 
## -------------------------------------------------------------------------------------------------------------------
library(dplyr)
library(lubridate)

# set first and last dates
first_date <- ymd('2021-06-01')
last_date <- ymd('2021-06-15')

# create dataframe and glimpse it
my_temp_df <- tibble(date.vec = ymd('2020-05-25') + seq(0,30),
                     prices=seq(1,10,
                                length.out = length(date.vec)))

# find dates that are between the first and last date
my_idx <- (my_temp_df$date.vec >= first_date) &
  (my_temp_df$date.vec <= last_date)

# use index to filter dataframe
my_temp_df_filtered <- my_temp_df %>%
  filter(my_idx) %>%
  glimpse()

#' 
#' In the previous code, the object `my_temp_df_fi
#' 
#' 
#' ### Dealing with Time
#' 
#' Using the `Date` class is sufficient when deali
#' 
#' In the `base` package, one class used for this 
#' 
#' In R, the time/date format also follows the [IS
#' 
## -------------------------------------------------------------------------------------------------------------------
# creating a POSIXct object
my_timedate <- as.POSIXct('2021-01-01 16:00:00')

# print result
print(my_timedate)

#' 
#' The `lubridate` package also offers intelligent
#' 
## -------------------------------------------------------------------------------------------------------------------
library(lubridate)

# creating a POSIXlt object
my_timedate <- ymd_hms('2020-01-01 16:00:00')

# print it
print(my_timedate)

#' 
#' You should note that this class automatically a
#' 
## -------------------------------------------------------------------------------------------------------------------
# creating a POSIXlt object with custom timezone
my_timedate_tz <- ymd_hms('2020-01-01 16:00:00',
                          tz = 'GMT')

# print it
print(my_timedate_tz)

#' 
#' An important note in the case of `POSIXlt` and 
#' 
## -------------------------------------------------------------------------------------------------------------------
# Adding values (seconds) to a POSIXlt object and printing it
print(my_timedate_tz + 30)

#' 
#' 
#' ### Customizing the Format of Dates and Times
#' 
#' The ISO format for representing dates and `date
#' 
#' In the same way as objects of class `Date`, the
#' 
#' 
#' | Symbol |           Description    | Example |
#' |:------:|:------------------------:|:-------:|
#' | %H     | Hour (decimal, 24 hours) | 23      |
#' | %I     | Hour (decimal, 12 hours) | 11      |
#' | %M     | Minutes (decimal, 0-59)  | 12      |
#' | %p     | AM/PM indicator          | AM      |
#' | %S     | Seconds (decimal, 0-59)  | 50      |
#' 
#' To format a date, use the `format` function. Us
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# create vector of dates
my_dates <- seq(from = as.Date('2020-01-01'),
                to = as.Date('2020-01-15'),
                by = '1 day')

# change format
my_dates_US_format <- format(my_dates, '%m/%d/%Y')

# print result
print(my_dates_US_format)

#' 
#' The same procedure can be used for `POSIXlt` ob
#' 
## -------------------------------------------------------------------------------------------------------------------
# create vector of date-time
my_datetime <- as.POSIXlt('2020-02-01 12:00:00') + seq(0,560,60)

# change to US format
my_dates_US_format <- format(my_datetime, '%m/%d/%Y %H:%M:%S')

# print result
print(my_dates_US_format)

#' 
#' Likewise, we can customize our dates for very s
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# set custom format
my_dates_my_format <- format(my_dates,
                             'Year=%Y | Month=%m | Day=%d')

# print result
print(my_dates_my_format)

#' 
#' 
#' ### Extracting Elements of a Date
#' 
#' We can use function `format` to extract data el
#' 
## -------------------------------------------------------------------------------------------------------------------
library(lubridate)

# create vector of date-time
my_datetime <- seq(from = ymd_hms('2020-01-01 12:00:00'),
                   to = ymd_hms('2020-01-01 18:00:00'),
                   by = '1 hour')

# get hours from POSIXlt
my_hours <- format(my_datetime, '%H')

# print result
print(my_hours)

#' 
#' Likewise, we can use symbols `%M` and `%S` to e
#' 
## -------------------------------------------------------------------------------------------------------------------
# create vector of date-time
n_dates <- 10
my_datetime <- seq(from = ymd_hms('2020-01-01 12:00:00'),
                   to = ymd_hms('2020-01-01 18:00:00'),
                   length.out = n_dates) + sample(1:59, 
                                                  size = n_dates)

# get minutes from POSIXlt
my_minutes <- format(my_datetime, '%M')

# print result
print(my_minutes)

#' 
#' Alternatively, we can use `lubridate` functions
#' 
## -------------------------------------------------------------------------------------------------------------------
# get hours with lubridate
print(hour(my_datetime))

# get minutes with lubridate
print(minute(my_datetime))

#' 
#' Functions for extracting other components of a 
#' 
#' 
#' ### Find the Current Date and Time
#' 
#' R also allows the user to find the current date
#' 
#' If you want to find the present day, use functi
#' 
## -------------------------------------------------------------------------------------------------------------------
# get today from base
print(Sys.Date())

# get today from lubridate
print(lubridate::today())

#' 
#' If you want to find the current date and time, 
#' 
## -------------------------------------------------------------------------------------------------------------------
# get time from base
print(Sys.time())

# get time from lubridate
print(lubridate::now())

#' 
#' Going further, based on these functions, we can
#' 
## -------------------------------------------------------------------------------------------------------------------
# example of log message
my_sys_info <- Sys.info()
my_str <- str_c('Log of execution\n',
                'Time of execution: ', now(), '\n',
                'User: ', my_sys_info['user'], '\n',
                'Computer: ', my_sys_info['nodename'])

# print it
cat(my_str)

#' 
#' This is the exact time when this book was compi
#' 
#' 
#' ### Other Useful Functions
#' 
#' **weekdays** - Returns the day of the week from
#' 
## -------------------------------------------------------------------------------------------------------------------
# set date vector
my_dates <- seq(from = ymd('2020-01-01'),
                to = ymd('2020-01-5'),
                by = '1 day')

# find corresponding weekdays
my_weekdays <- weekdays(my_dates)

# print it
print(my_weekdays)

#' 
#' **months** - Returns the month of one or more d
#' 
## -------------------------------------------------------------------------------------------------------------------
# create date vector
my_dates <- seq(from = ymd('2020-01-01'),
                to = ymd('2020-12-31'),
                by = '1 month')

# find months
my_months <- months(my_dates)

# print result
print(my_months)

#' 
#' **quarters** - Returns the location of one or m
#' 
## -------------------------------------------------------------------------------------------------------------------
# get quartiles of the year
my_quarters <- quarters(my_dates)

# print it
print(my_quarters)

#' 
#' **OlsonNames** - Returns an array with the time
#' 
## -------------------------------------------------------------------------------------------------------------------
# get possible timezones
possible_tz <- OlsonNames()

# print it
print(possible_tz[1:5])

#' 
#' **Sys.timezone** - Returns the current timezone
#' 
## -------------------------------------------------------------------------------------------------------------------
print(Sys.timezone())

#' 
#' **cut** - Returns a factor by grouping dates an
#' 
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# set example date vector
my_dates <- seq(from = as.Date('2020-01-01'),
                to = as.Date('2020-03-01'),
                by = '5 days')

# group vector based on monthly breaks
my_month_cut <- cut(x = my_dates,
                    breaks = 'month',
                    labels = c('Jan', 'Fev', 'Mar'))

# print result
print(my_month_cut)

#' 
## -------------------------------------------------------------------------------------------------------------------
# set example datetime vector
my_datetime <- as.POSIXlt('2020-01-01 12:00:00') + seq(0,250,15)

# set groups for each 30 seconds
my_cut <- cut(x = my_datetime, breaks = '30 secs')

# print result
print(my_cut)

#' 
#' 
#' ## Missing Data - `NA` (_Not available_)
#' 
#' One of the main innovations of R is the represe
#' 
#' 
#' ### Defining `NA` Values
#' 
#' To define omissions in the dataset, use symbol 
#' 
## -------------------------------------------------------------------------------------------------------------------
# a vector with NA
my_x <- c(1, 2, NA, 4, 5)

# print it
print(my_x)

#' 
#' An important information that you must remember
#' 
## -------------------------------------------------------------------------------------------------------------------
# a vector 
my_y <- c(2, 3, 5, 4, 1)

# example of NA interacting with other objects
print(my_x + my_y)

#' 
#' This property demands special attention if you 
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector with NA
my_x <- c(1:5, NA, 5:10)

# print cumsum (NA after sixth element)
print(cumsum(my_x))

# print cumprod (NA after sixth element)
print(cumprod(my_x))

#' 
#' Therefore, when using functions `cumsum` and `c
#' 
## Every time you use the `cumsum` and` cumprod` functions, make sure that there is no `NA` value in the input vector. Remember that every `NA` is contagious and recursive calculations will result in a vector full of missing data.

#' 
#' 
#' ### Finding and Replacing `NA`
#' 
#' To find `NA` values, use function `is.na`: \ind
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector with NA
my_x <- c(1:2, NA, 4:10)

# Test and find location of NA
print(is.na(my_x))
print(which(is.na(my_x)))

#' 
#' To replace it, use indexing with the output of 
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector
my_x <- c(1, NA, 3:4, NA)

# replace NA for 2
my_x[is.na(my_x)] <- 2

# print result
print(my_x)

#' 
#' Another way to remove `NA` values is to use the
#' 
## -------------------------------------------------------------------------------------------------------------------
# set vector
my_char <- c(letters[1:3], NA, letters[5:8])

# print it
print(my_char)

# use na.omit to remove NA
my_char <- na.omit(my_char)

# print result
print(my_char)

#' 
#' Although the class of the object has changed du
#' 
## -------------------------------------------------------------------------------------------------------------------
# trying nchar on a na.omit object
print(nchar(my_char))

#' 
#' For other objects, however, this property may n
#' 
#' 
#' ### Other Useful Functions
#' 
#' **complete.cases** - Returns a logical vector i
#' 
## -------------------------------------------------------------------------------------------------------------------
# create matrix
my_mat <- matrix(1:15, nrow = 5)

# set an NA value
my_mat[2,2] <- NA

# print index with rows without NA
print(complete.cases(my_mat))

#' 
#' ## Exercises
#' 
## ---- echo=FALSE, results='asis'------------------------------------------------------------------------------------
f_in <- list.files('../02-EOCE-Rmd/Chapter07-Basic-Classes/', 
                   full.names = TRUE)

compile_eoc_exercises(f_in, type_doc = my_engine)
msperlin/afedR documentation built on Sept. 11, 2022, 9:49 a.m.