#' # Cleaning and Structuring Data {#cleaning}
#'
## ---- include=FALSE-------------------------------------------------------------------------------------------------
source('Scripts/preamble_chapters.R')
#'
#' Now that we have learned how to get data into R, the next step is to clean and structure it for the analysis. In this chapter we cover the following topics:
#'
#' - Changing the format of a `dataframe` (long/wide);
#' - Converting a `list` of `dataframes` into a single `dataframe`;
#' - Identifying and treating extreme values (_outliers_);
#' - Price data deflation;
#' - Data aggregation based on a change of time-frequency.
#'
#'
#' ## The Format of a `dataframe`
#'
#' A proper format of a `dataframe` is necessary for an efficient analysis of the data. In practice, tabular data is usually organized in one of two ways: the wide format or the long format.
#'
#' **In the wide format**, the rows of the table are indexed by a single key, such as the date, and each variable of interest gets its own column. As an example, consider the following table of stock prices:
#'
## ---- echo=FALSE----------------------------------------------------------------------------------------------------
library(tidyverse)
set.seed(10)
N <- 4
temp_df <- tibble(ref_date = Sys.Date() + 1:N,
                  STOCK1 = 10 + cumsum(rnorm(N, sd = 1.25)),
                  STOCK2 = 3 + cumsum(rnorm(N, sd = 0.5)),
                  STOCK3 = 6 + cumsum(rnorm(N, sd = 0.5)))
knitr::kable(temp_df, digits = 2)
#'
#' The above table has three distinct pieces of information: the reference date, the identity of each stock (in the column names), and the price itself.
#'
#' **In the long format**, each row of the `dataframe` holds a single observation, with explicit columns for the date, the ticker, and the corresponding price:
#'
## ---- echo=FALSE----------------------------------------------------------------------------------------------------
long_df <- tidyr::pivot_longer(data = temp_df,
                               cols = !ref_date,
                               names_to = 'Ticker',
                               values_to = 'Price')
knitr::kable(long_df, digits = 2)
#'
#' In comparison, the wide format is more intuitive to read, while the long format is easier to manipulate with code: grouping, filtering, and plotting operations work directly on the `Ticker` and `Price` columns.
#'
#' This argument may seem trivial since the information is exactly the same in both formats, but choosing the appropriate structure early on saves a lot of re-structuring work later in the analysis.
#'
#' It is worth noting that, in finance, the wide format is common when comparing assets side by side, while the long format is generally preferred for storing data and for working with the tidyverse.
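#'
#' As a quick illustration of why the long format plays well with the tidyverse, the sketch below computes an average price per ticker on a small toy table. The table and all of its values are made up for this example only.
#'
## -------------------------------------------------------------------------------------------------------------------
library(tidyverse)
# a small long-format table, built only for this illustration
example_long <- tibble(ref_date = rep(Sys.Date() + 0:1, times = 2),
                       Ticker = rep(c('STOCK1', 'STOCK2'), each = 2),
                       Price = c(10, 10.5, 3, 3.1))
# a per-ticker statistic takes a single group_by/summarise call
example_long %>%
  group_by(Ticker) %>%
  summarise(mean_price = mean(Price))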
#'
#'
#' ### Converting a `dataframe` Structure (long and wide)
#'
#' The conversion from one format to the other is straightforward with package `tidyr`, using functions `pivot_longer` (wide to long) and `pivot_wider` (long to wide). Let's start with the wide to long case:
#'
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
library(tidyverse)
# set dates and stock vectors
ref_date <- as.Date('2015-01-01') + 0:3
STOCK1 <- c(10, 11, 10.5, 12)
STOCK2 <- c(3, 3.1, 3.2, 3.5)
STOCK3 <- c(6, 7, 7.5, 6)
# create wide dataframe
my_df_wide <- tibble(ref_date, STOCK1, STOCK2, STOCK3)
# print it
print(my_df_wide)
# convert wide to long
my_df_long <- tidyr::pivot_longer(data = my_df_wide,
                                  cols = !ref_date,
                                  names_to = 'Ticker',
                                  values_to = 'Price')
# print result
print(my_df_long)
#'
#' The way to read the call to `tidyr::pivot_longer` is: take `my_df_wide`, pivot every column except `ref_date` (`cols = !ref_date`), store the former column names in a new column named `Ticker`, and store the values in a new column named `Price`.
#'
#' To perform the reverse conversion, _long_ to _wide_, we use `tidyr::pivot_wider`:
#'
## ---- tidy=FALSE----------------------------------------------------------------------------------------------------
# convert from long to wide
my_df_wide_converted <- my_df_long %>%
  tidyr::pivot_wider(names_from = 'Ticker', values_from = 'Price')
# print result
print(my_df_wide_converted)
#'
#' With more complex conversions, where it is necessary to pivot several value columns at once, the same two functions apply; check their documentation for the extra arguments.
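#'
#' For instance, when the long table holds more than one value column, `tidyr::pivot_wider` accepts several columns in `values_from`. The sketch below uses a hypothetical table with prices and volumes; all names and values are made up for the example.
#'
## -------------------------------------------------------------------------------------------------------------------
library(tidyverse)
# hypothetical long table holding two value columns (Price and Volume)
long_pv <- tibble(ref_date = rep(as.Date('2015-01-01') + 0:1, each = 2),
                  Ticker = rep(c('STOCK1', 'STOCK2'), times = 2),
                  Price = c(10, 3, 11, 3.1),
                  Volume = c(100, 200, 150, 250))
# widen both value columns at once
wide_pv <- tidyr::pivot_wider(data = long_pv,
                              names_from = 'Ticker',
                              values_from = c('Price', 'Volume'))
print(wide_pv)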
#'
#'
#' ## Converting `lists` into `dataframes`
#'
#' Another important case of data re-structuring is the conversion of a `list` of `dataframes` into a single, consolidated `dataframe`. This is a common situation when importing many data files from disk or when a package returns its output as a `list`. We will look at one example of each.
#'
#' For the first case, let's use the `purrr` package as our iteration tool. We start by writing a function that creates a random dataset and saves it to disk as a .csv file:
#'
## ---- include=FALSE-------------------------------------------------------------------------------------------------
# clean up files
file.remove(list.files('many_datafiles_2/',
                       full.names = TRUE))
#'
## -------------------------------------------------------------------------------------------------------------------
create_rnd_data <- function(n_obs = 100,
                            folder_out) {
  # function for creating random datasets
  #
  # ARGS: n_obs - number of observations
  #       folder_out - folder where to save files
  #
  # RETURN: TRUE, if successful
  require(tidyverse)
  require(wakefield)
  # check if folder exists
  if (!dir.exists(folder_out)) dir.create(folder_out)
  # create extensive random data
  rnd_df <- r_data_frame(n = n_obs,
                         id,
                         race,
                         age,
                         sex) %>%
    r_na(prob = 0.1)
  # roughly 15% of the time, create an extra column
  if (runif(1) < 0.15) {
    rnd_df$bad_column <- 'BAD COLUMN!'
  }
  # set file name
  f_out <- tempfile(fileext = '.csv',
                    pattern = 'file_',
                    tmpdir = folder_out)
  write_csv(x = rnd_df,
            file = f_out)
  return(TRUE)
}
#'
#' Function `create_rnd_data` will create and write a random dataset to a .csv file inside `folder_out`. Note that, roughly 15% of the time, an extra column called `bad_column` is added to the data, simulating the sort of inconsistency we often find in raw data files.
#'
#' Going forward, let's use `purrr::pmap` to create 50 of these files:
#'
## -------------------------------------------------------------------------------------------------------------------
n_files <- 50
n_obs <- 100
folder_out <- 'many_datafiles_2'
# create random datasets
l_out <- pmap(.l = list(n_obs = rep(n_obs, n_files),
                        folder_out = rep(folder_out, n_files)),
              .f = create_rnd_data)
# check if files are there
print(head(list.files(folder_out)))
#'
#' The files are available, as expected. Now, let's write a function that reads a single file and use `purrr::map` to apply it to every file in the folder:
#'
## -------------------------------------------------------------------------------------------------------------------
read_single_file <- function(f_in) {
  # Function for reading a single csv file with random data
  #
  # ARGS: f_in - path of file
  #
  # RETURN: A dataframe with the data
  require(tidyverse)
  df <- read_csv(f_in, col_types = cols())
  return(df)
}
#'
## -------------------------------------------------------------------------------------------------------------------
files_to_read <- list.files('many_datafiles_2/',
                            full.names = TRUE)
l_out <- map(files_to_read, read_single_file)
#'
#' And now we bind them all together with a simple call to `dplyr::bind_rows`:
#'
## -------------------------------------------------------------------------------------------------------------------
compiled_df <- bind_rows(l_out)
glimpse(compiled_df)
#'
#' It worked, as expected. We have `r nrow(compiled_df)` rows in the consolidated `dataframe`. Notice that `bad_column` only has values for the files in which it was created; for all other files, `bind_rows` fills the column with `NA`.
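#'
#' If the extra `bad_column` is not wanted in the final table, one possibility, sketched below, is to drop it after the binding step with `dplyr::select` and the `any_of` helper. The name `compiled_df_clean` is only illustrative.
#'
## -------------------------------------------------------------------------------------------------------------------
# drop the unwanted column, if it is present
compiled_df_clean <- compiled_df %>%
  select(-any_of('bad_column'))
glimpse(compiled_df_clean)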
#'
#' For the second example, let's take a case of data imported from the internet with package `BETS`, which, when several series ids are requested, returns a `list` of `dataframes`:
#'
## ---- cache=TRUE, message=FALSE-------------------------------------------------------------------------------------
library(BETS)
my_id <- 3785:3791
# set dates
first_date <- '2010-01-01'
last_date <- as.character(Sys.Date())
# get data
l_out <- BETSget(code = my_id, data.frame = TRUE,
                 from = first_date, to = last_date)
# check data in first dataframe
glimpse(l_out[[1]])
#'
#' In this example we gather data for unemployment rates in seven different countries. The output of `BETSget` is a `list` with one `dataframe` per requested id.
#'
#' Now, if we want to structure all imported tables into a single `dataframe`, we first need to register which country each element of the list refers to:
#'
## -------------------------------------------------------------------------------------------------------------------
my_countries <- c("Germany", "Canada", "United States",
                  "France", "Italy", "Japan",
                  "United Kingdom")
#'
#' The order of elements in vector `my_countries` matches the order of the ids in `my_id`, so the first element of the list corresponds to the first country, and so on.
#'
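#' As a side note, when the elements of the list are named, `dplyr::bind_rows` can create the identifying column by itself through its `.id` argument. The sketch below shows this shortcut (object `df_unemp_alt` is only illustrative); in the text we follow the more explicit route with a cleaning function.
#'
## -------------------------------------------------------------------------------------------------------------------
# alternative: name the list elements and let bind_rows() tag each one
names(l_out) <- my_countries
df_unemp_alt <- bind_rows(l_out, .id = 'country')
glimpse(df_unemp_alt)
#'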
#' Going further, we now create a function that will add a `country` column to each `dataframe` of the list:
#'
## -------------------------------------------------------------------------------------------------------------------
clean_bets <- function(df_in, country_in) {
  # function for cleaning data from BETS
  #
  # ARGS: df_in - dataframe within a list
  #       country_in - name of country
  #
  # VALUE: a new dataframe with an added country column
  # set column
  df_in$country <- country_in
  # return df
  return(df_in)
}
#'
#' The next step is to use the previous function to format every element of the list. Since we have two iterating arguments, the `dataframe` and the country name, we use `purrr::pmap`:
#'
## -------------------------------------------------------------------------------------------------------------------
library(purrr)
# set args
l_args <- list(df_in = l_out,
               country_in = my_countries)
# format dfs
l_out_formatted <- pmap(.l = l_args,
                        .f = clean_bets)
# check first element (all are the same structure)
glimpse(l_out_formatted[[1]])
#'
#' From the output of `glimpse` we see that the `country` column was added as expected. The final step is to bind all elements of the list into a single `dataframe`:
#'
## -------------------------------------------------------------------------------------------------------------------
# bind all rows of dataframes in list
df_unemp <- bind_rows(l_out_formatted)
# check it
glimpse(df_unemp)
#'
#' Done! The result is an organized `dataframe` in the long format, with the unemployment series of all countries stacked on top of each other and identified by the `country` column.
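#'
#' As a quick sanity check of the consolidated table, the sketch below computes the average unemployment rate per country. It assumes the value column returned by `BETSget` is named `value`; check the earlier `glimpse` output if your column names differ.
#'
## -------------------------------------------------------------------------------------------------------------------
# average unemployment rate by country (column 'value' assumed from the BETS output)
df_unemp %>%
  group_by(country) %>%
  summarise(mean_unemp = mean(value, na.rm = TRUE))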
#'
#'
#' ## Removing Outliers
#'
#' A recurrent issue in data analysis is handling extreme observations, the so-called _outliers_. These are values that deviate substantially from the rest of the data and, if left untreated, can severely distort the results of statistical models.
#'
#' Now, to visualize the destructive effect of an outlier, let's simulate two versions of the same linear process, one with and one without an extreme observation, and compare the estimated regression models.
#'
#' The next example might be challenging if it is your first contact with linear regression models. If that is the case, do not worry about the estimation details; the point is simply to show how much a single extreme value can change the results.
#'
## -------------------------------------------------------------------------------------------------------------------
# set seed for reproducibility
set.seed(5)
# set options
nT <- 50
sim_x <- rnorm(nT)
my_beta <- 1
# simulate x and y
sim_y <- sim_x*my_beta + rnorm(nT)
sim_y_with_outlier <- sim_y
# simulate y with outlier
sim_y_with_outlier[10] <- 100
#'
#' Objects `sim_y` and `sim_y_with_outlier` are exactly the same, except for the tenth observation, which was artificially set to 100 in the second vector. Now, let's estimate a linear model for each and compare the output:
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(texreg)
# estimate models
model_no_outlier <- lm(formula = sim_y ~ sim_x)
model_with_outlier <- lm(formula = sim_y_with_outlier ~ sim_x)
# report them
screenreg(list(model_no_outlier,
               model_with_outlier),
          custom.model.names = c('No Outlier', 'With Outlier'))
#'
#' Notice from the estimation table that the slope coefficient of `sim_x` changes considerably once the single outlier is included, moving away from its true value of 1. This is why we should identify and treat extreme values before the modeling stage.
#'
#' One way to accomplish this is to identify potential outliers as the observations beyond a given quantile of the (absolute) data. As an example, let's compute the 95% quantile of the absolute values of `sim_y_with_outlier`:
#'
## -------------------------------------------------------------------------------------------------------------------
# find the value of vector that sets the 95% quantile
quantile95 <- quantile(x = abs(sim_y_with_outlier),
                       probs = 0.95)
print(quantile95)
#'
#' Here, the value of `r quantile95` is higher than the vast majority of observations in the vector. The next step is to locate every case above this threshold:
#'
## -------------------------------------------------------------------------------------------------------------------
# find cases higher than 95% quantile
idx <- which(sim_y_with_outlier > quantile95)
print(sim_y_with_outlier[idx])
#'
#' We find the "artificial" outlier we've set in position 10 of the vector.
#'
#' Finally, we need to treat the outliers. We can either set them to `NA` or remove them from the vector altogether:
#'
## -------------------------------------------------------------------------------------------------------------------
# copy content
sim_y_without_outlier <- sim_y_with_outlier
# set NA in outlier
sim_y_without_outlier[idx] <- NA
# or remove it
sim_y_without_outlier <- sim_y_without_outlier[-idx]
#'
#' An alternative for identifying extreme values is package `outliers`, which provides function `outlier` for finding the observation that deviates the most from the mean:
#'
## ---- eval=FALSE----------------------------------------------------------------------------------------------------
## library(outliers)
##
## # find outlier
## my_outlier <- outlier(sim_y_with_outlier)
##
## # print it
## print(my_outlier)
#'
#'
#'
#' As expected, it correctly identified the outlier, the artificial value of 100.
#'
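#' Another simple rule, sketched below, is to flag observations whose absolute z-score exceeds a chosen threshold (3 is a common, though arbitrary, cutoff). Keep in mind that the outlier itself inflates the mean and standard deviation, so robust versions based on the median are often preferred.
#'
## -------------------------------------------------------------------------------------------------------------------
# flag observations more than 3 standard deviations away from the mean
z_scores <- (sim_y_with_outlier - mean(sim_y_with_outlier)) /
  sd(sim_y_with_outlier)
which(abs(z_scores) > 3)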
#'
#' ### Treating Outliers in `dataframes` {#outliers}
#'
#' Let's go a bit deeper. In a real data analysis situation, we usually face a `dataframe` with many columns, and inspecting each one of them manually for extreme values is not practical. We need a way to apply the same outlier treatment to every column of the table.
#'
#' The first step is to define a function that accepts a numeric vector and replaces its extreme values, defined by quantiles, with `NA`:
#'
## -------------------------------------------------------------------------------------------------------------------
replace_outliers <- function(col_in, my_prob = 0.05) {
  # Replaces outliers from a vector and returns a new
  # vector
  #
  # INPUTS: col_in - the vector
  #         my_prob - probability of quantiles
  #                   (will remove quantiles at p and 1-p)
  #
  # OUTPUT: A vector without the outliers
  # return the input unchanged if its class is not numeric
  if (!(class(col_in) %in%
        c('numeric', 'integer'))) return(col_in)
  my_outliers <- stats::quantile(x = col_in,
                                 probs = c(my_prob, 1 - my_prob),
                                 na.rm = TRUE)
  idx <- (col_in <= my_outliers[1]) | (col_in >= my_outliers[2])
  col_in[idx] <- NA
  return(col_in)
}
#'
#' Let's test it:
#'
## -------------------------------------------------------------------------------------------------------------------
# set test vector
my_x <- runif(15)
# artificially set outliers
my_x[5] <- max(my_x)*5
# find and replace outliers
print(replace_outliers(my_x, my_prob = 0.05))
#'
#' As we can see, it performed correctly, replacing the extreme observations, including the artificial outlier, with `NA`. Now, let's create a random `dataframe` with several columns so we can apply the function in a more realistic setting:
#'
## -------------------------------------------------------------------------------------------------------------------
library(wakefield)
library(tidyverse)
# options
n_obs <- 100
# create extensive random data
my_df <- r_data_frame(n = n_obs,
                      race,
                      age,
                      birth,
                      height_cm,
                      sex)
# check it
glimpse(my_df)
#'
#' Now, let's use `purrr::map` to iterate over all columns of `my_df`, applying `replace_outliers` to each one:
#'
## -------------------------------------------------------------------------------------------------------------------
library(purrr)
# replace outliers in every column of my_df with NA
l_out <- map(my_df, replace_outliers)
#'
#' Next, we regroup all vectors into a single `dataframe`:
#'
## -------------------------------------------------------------------------------------------------------------------
# rebuild dataframe
my_df_no_outlier <- bind_rows(l_out)
# check it
glimpse(my_df_no_outlier)
# summary of my_df_no_outlier
summary(my_df_no_outlier)
#'
#' Note that, as expected, we find `NA` values for the numeric columns, which are the only ones where the outlier replacement was applied.
#'
#' Lastly, we remove all rows containing `NA` values using function `na.omit`:
#'
## -------------------------------------------------------------------------------------------------------------------
# remove outliers
my_df_no_outlier <- na.omit(my_df_no_outlier)
glimpse(my_df_no_outlier)
#'
#' Notice, however, that some rows were lost in the process. This is the cost of removing outliers row by row: any row with at least one `NA` is dropped, even when the other columns hold valid values, as the sketch below quantifies.
#'
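#' A simple way to measure the loss is to compare the number of rows before and after the call to `na.omit`:
#'
## -------------------------------------------------------------------------------------------------------------------
# how many rows were dropped by na.omit()?
n_lost <- nrow(my_df) - nrow(my_df_no_outlier)
cat(n_lost, "out of", nrow(my_df), "rows were removed")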
#'
#' ## Inflation and Price Data
#'
#' A common effect in economic and financial data is inflation: prices observed at different dates are not directly comparable because the purchasing power of money changes over time.
#'
#' To offset the effect of inflation on price data, we can deflate the series with a price index. As an example, let's download the inflation rate of the USA from the Quandl platform:
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(GetQuandlData)
library(tidyverse)
# set api (you need your OWN from www.quandl.com)
my_api_key <- readLines(
  '~/Dropbox/98-pass_and_bash/.quandl_api.txt'
)
# set symbol and dates
my_symbol <- 'RATEINF/INFLATION_USA'
first_date <- as.Date('2000-01-01')
last_date <- Sys.Date()
# get data!
df_inflation <- get_Quandl_series(id_in = my_symbol,
                                  api_key = my_api_key,
                                  first_date = first_date,
                                  last_date = last_date)
# sort by date
df_inflation <- df_inflation %>%
arrange(ref_date)
# check content
glimpse(df_inflation)
#'
#' Now, let's create a `dataframe` with random prices covering the same dates as the inflation data:
#'
## -------------------------------------------------------------------------------------------------------------------
n_T <- nrow(df_inflation)
# create df with prices
my_df <- tibble(Date = df_inflation$ref_date,
                x = 100 + cumsum(rnorm(n_T)),
                y = 100 + cumsum(rnorm(n_T)))
# check it
glimpse(my_df)
#'
#' The first step is to create a deflator index based on the accumulated inflation, normalized so that the last observation equals one:
#'
## -------------------------------------------------------------------------------------------------------------------
# accumulate inflation: index_t = cumprod(1 + r_t)
my_df$infl_idx <- cumprod(df_inflation$value/100 + 1)
# normalize the index so that the last observation equals 1
my_df$infl_idx <- my_df$infl_idx/my_df$infl_idx[nrow(my_df)]
#'
#' And now we create the new variables:
#'
## -------------------------------------------------------------------------------------------------------------------
my_df$x_deflated <- my_df$x*my_df$infl_idx
my_df$y_deflated <- my_df$y*my_df$infl_idx
glimpse(my_df)
#'
#' Done. We now have two new columns with deflated prices, expressed in the purchasing power of the last date of the sample.
#'
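#' To visualize the effect of the adjustment, the sketch below plots the nominal and deflated versions of `x` with `ggplot2`, reusing the long-format reshaping from earlier in the chapter:
#'
## -------------------------------------------------------------------------------------------------------------------
# reshape the nominal and deflated series of x into the long format
df_plot <- my_df %>%
  select(Date, x, x_deflated) %>%
  pivot_longer(cols = !Date,
               names_to = 'series',
               values_to = 'price')
# plot both series over time
ggplot(df_plot, aes(x = Date, y = price, color = series)) +
  geom_line()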
#'
#' ## Modifying Time Frequency and Aggregating Data
#'
#' Sometimes we receive data in a time frequency different from the one we need: daily prices, for example, when the analysis calls for annual observations. In these cases we must aggregate the data towards the lower frequency.
#'
#' Let's start with an example using the SP500 index. First, we download daily price data with package `BatchGetSymbols`:
#'
## ---- message=FALSE-------------------------------------------------------------------------------------------------
library(BatchGetSymbols)
df_SP500 <- BatchGetSymbols(tickers = '^GSPC',
                            first.date = '2010-01-01',
                            freq.data = 'daily',
                            last.date = '2021-01-01')[[2]]
#'
#' Every time-frequency operation from higher to lower frequency follows the same logic: create a column with the new time period, group the data by it, and summarise each group into a single value. With the tidyverse, this is a combination of `mutate`, `group_by` and `summarise`:
#'
## -------------------------------------------------------------------------------------------------------------------
# from daily to annual
df_SP500_annual <- df_SP500 %>%
  mutate(ref_year = lubridate::year(ref.date)) %>%
  group_by(ref_year) %>%
  summarise(last_value = last(price.adjusted))
# glimpse it
glimpse(df_SP500_annual)
#'
#' In the previous chunk of code, we created a new column, `ref_year`, with function `lubridate::year`, grouped the data by it, and kept only the last adjusted price of each year.
#'
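#' The same recipe works for other frequencies. As a sketch, the chunk below aggregates the daily prices into monthly observations, using `lubridate::floor_date` to build the grouping key:
#'
## -------------------------------------------------------------------------------------------------------------------
# from daily to monthly, keeping the last adjusted price of each month
df_SP500_monthly <- df_SP500 %>%
  mutate(ref_month = lubridate::floor_date(ref.date, unit = 'month')) %>%
  group_by(ref_month) %>%
  summarise(last_value = last(price.adjusted))
glimpse(df_SP500_monthly)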
#'
#' ## Exercises
#'
## ---- echo=FALSE, results='asis'------------------------------------------------------------------------------------
f_in <- list.files('../02-EOCE-Rmd/Chapter09-Cleaning-and-Structuring/',
                   full.names = TRUE)
compile_eoc_exercises(f_in, type_doc = my_engine)