knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.path = "README-"
)

ResearchGroupTools


ResearchGroupTools provides a collection of utility functions for rapid prototyping. These functions facilitate implementation work related to advanced analytics. As such, it specifically supports data handling, preprocessing, visualization and analytics.

Installation

Using the devtools package, you can easily install the latest development version of ResearchGroupTools with

install.packages("devtools")

# Recommended option: download and install latest version from "GitHub"
devtools::install_github("sfeuerriegel/ResearchGroupTools", dependencies = TRUE)

Notes:

Usage

This section shows the basic functionality of the package and how it accelerates data science in R. First, load the package ResearchGroupTools.

library(ResearchGroupTools)

By default, the seed for the random number generator is initialized to 0.
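
If you need reproducibility at a specific point in a script, or want to work with a different seed, you can reset the random number generator manually with base R (the value 0 simply mirrors the package default):

set.seed(0)   # restore the package default
runif(3)      # subsequent draws are now reproducible

set.seed(42)  # or choose your own seed for a particular analysis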

Changes to LaTeX {#LaTeX}

Some export routines require a few changes to your LaTeX document in order to compile. The steps are documented in the corresponding help pages, e.g. correlationMatrix(); below is a minimal working example:

\documentclass{article}
\usepackage{siunitx}
  \newcommand{\sym}[1]{\rlap{$^{#1}$}}
  \sisetup{input-symbols={()*}}
\begin{document}

\begin{tabular}{l SSS}
\toprule
\input{table_cor}
\end{tabular}
\end{document}

Above, we included siunitx, introduced a command \sym, changed the input-symbols option and used custom column alignment (S).
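
The file table_cor.tex referenced in the tabular environment above can be generated from within R; the call mirrors the export example shown later in this README (the filename argument stores the output in a LaTeX file):

correlationMatrix(USArrests, filename = "table_cor.tex") # writes table_cor.tex to the working directory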

Functionality

Library handling

library("texreg")
loadRegressionLibraries()

Strings

"a" %+% "b"
3 %+% 4
do.call(`%+%`, as.list(letters))
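
As the examples suggest, %+% concatenates its arguments as strings; below is a small sketch for building file names, assuming numeric inputs are coerced to character as in the 3 %+% 4 example above:

year <- 2016
outfile <- "results_" %+% year %+% ".csv"  # hypothetical file name, built by concatenation
outfile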

Numerical functions

ceil(3.4)
library(dplyr)

df <- data_frame(x = 1:10, y = rnorm(10))
cumsd(df$x)

df %>%
  mutate_all(funs("mean" = cummean, "sd" = cumsd))
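
cumsd() can also be used inside grouped operations; a short sketch, assuming cumsd() accepts any numeric vector as above:

df %>%
  mutate(group = rep(c("a", "b"), each = 5)) %>%  # hypothetical grouping variable
  group_by(group) %>%
  mutate(x_cumsd = cumsd(x)) %>%
  ungroup()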

Data handling

d <- data_frame(x = 1:10,
               y = rnorm(10))
d %>% pull(x)
d %>% pull("x")

v <- "x"
d %>% pull_string(v)

d %>% pull_ith(1)
ts <- data.frame(Date = seq(from = as.Date("2000-01-01"), to = as.Date("2000-03-31"), by = "1 day"))
df_monthly <- data.frame(Month = c(as.Date("2000-01-31"), as.Date("2000-02-29"), as.Date("2000-03-31")),
                         Values = 1:3)

df_daily <- completeLowResolutionData(ts$Date, df_monthly, "Month")

# example of how to bind things together
ts <- ts %>%
  left_join(df_daily, by = c("Date" = "Month"))

Time series

lags(1:5, c(1, 2, 3))
lags(ts(1:5), c(1, 2, 5))
differences(1:10)
differences(c(1, 2, 4, 8, 16, 32))
differences(c(1, 2, 4, 8, 16, 32), order = 2)
differences(c(1, 2, 4, 8, 16, 32), na_padding = FALSE)
returns(1:10)
returns(c(1, 2, 4, 8, 16, 32))
returns(c(1, 2, 4, 8, 16, 32), na_padding = FALSE) # drop the NA padding
logReturns(c(1, 2, 4, 8, 16, 32), base = 2)
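
A quick way to check what these helpers compute is to compare them against base R. The check below is only a sketch: it assumes logReturns() returns log(x_t / x_{t-1}) in the given base and pads the result with a leading NA, analogous to differences() and returns():

x <- c(1, 2, 4, 8, 16, 32)
logReturns(x, base = 2)                               # NA followed by five 1's under this assumption
all.equal(logReturns(x, base = 2)[-1], diff(log2(x)))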

Matrix functions (or data.frame)

m <- matrix(letters[c(1, 2, NA, 3, NA, 4, 5, 6, 7, 8)], ncol = 2, byrow = FALSE)
colnames(m) <- c("x", "y")
m

anyNA(m)      # use built-in routine to test for NA values

findRowsNA(m) # returns indices of rows with NA values
showRowsNA(m) # prints rows with NA values

findColsNA(m) # returns names of columns with NA values
showColsNA(m) # prints columns with NA values
last_non_NA(c(1, 2, 3, 4, NA))

values <- 1:100
values[sample(1:100, 10)] <- NA
df <- cbind(Year = c(rep(2000, 5), rep(2001, 5)),
              as.data.frame(matrix(values, nrow = 10)))

df %>%
  group_by(Year) %>%
  summarize_each(funs(last_non_NA)) %>%
  ungroup() %>%
  head()
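
summarize_each() and funs() have since been deprecated in dplyr; with current dplyr versions the same aggregation can be written with across():

df %>%
  group_by(Year) %>%
  summarize(across(everything(), last_non_NA)) %>%
  ungroup() %>%
  head()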

Descriptive statistics

d <- data.frame(x1 = rnorm(200), x2 = rnorm(200), y = rnorm(200))

d_trimmed <- removeOutlierObservations(d)
dim(d_trimmed)

d_trimmed <- removeOutlierObservations(d, variables = "y", cutoff = 2.0)
dim(d_trimmed)

d_trimmed <- removeOutlierObservations(d, variables = c("x1", "x2"), cutoff = 2.0)
dim(d_trimmed)
data(USArrests)
descriptiveStatistics(USArrests)
unlink("table_descriptives.tex")
correlationMatrix(USArrests)
correlationMatrix(USArrests, filename = "table_cor.tex") # stores output in LaTeX file
unlink("table_cor.tex")

Visualization

library(ggplot2)

df <- data.frame(x = 1:20,
                 y = 1:20,
                 z = as.factor(rep(1:4, each = 5)))

jplot(df) +
  geom_line(aes(x = x, y = y, color = z, linetype = z))
# For comparison:
# ggplot(df) +
#  geom_line(aes(x = x, y = y, color = z, linetype = z))

jplot(df) +
  geom_point(aes(x = x, y = y, color = z))
# For comparison:
# ggplot(df) +
#   geom_point(aes(x = x, y = y, color = z))
linePlot(1:10)

x <- seq(0, 4, length.out = 100)
linePlot(x, sin(x))
df <- data.frame(x = rnorm(100) / 1000, y = rnorm(100) / 1000)
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  scale_x_continuous(labels = scientificLabels) +
  scale_y_continuous(labels = scientificLabels)
ggplot(df, aes(x = x, y = y)) +
  geom_point() +
  scale_x_continuous(labels = allDigitsLabels) +
  scale_y_continuous(labels = allDigitsLabels)

Regressions

makeFormula("y", "x")
makeFormula("y", c("x1", "x2", "x3"))
makeFormula("y", c("x1", "x2", "x3"), "dummies")
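
Since makeFormula() builds formulas programmatically, the result can be handed to standard model-fitting functions. A minimal sketch, assuming makeFormula() returns an ordinary formula object (d_demo is a hypothetical toy data set):

d_demo <- data.frame(y = rnorm(20), x1 = rnorm(20), x2 = rnorm(20))
f <- makeFormula("y", c("x1", "x2"))
lm(f, data = d_demo)
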
x1 <- 1:100
x2 <- sin(1:100)
clusters <- rep(c(1, 2), 50)
dummies <- model.matrix(~ clusters)
y <- x1 + x2 + clusters + rnorm(100)
d <- data.frame(x1 = x1, x2 = x2, y = y)

m_dummies <- regression(formula("y ~ x1 + x2 + dummies"), data = d, subset = 1:90,
                        dummies = "dummies", cutoff = 0.5)
summary(m_dummies)

library(sandwich)

m_dummies <- regression(formula("y ~ x1 + x2 + dummies"), data = d, subset = 1:90,
                        dummies = "dummies", cutoff = 0.5, vcov = NeweyWest)
summary(m_dummies)
models <- regressionStepwise(formula("y ~ x1 + x2 + dummies"), data = d, subset = 1:90,
                            dummies = "dummies", cutoff = 0.5)

length(models)

library(texreg)

texreg(models, omit.coef = "dummies")

models <- regressionStepwise(formula("y ~ x1 + x2 + dummies"), data = d, subset = 1:90,
                            dummies = "dummies", cutoff = 0.5, vcov = NeweyWest)
texreg(models, omit.coef = "dummies")
showCoeftest(m_dummies, hide = "x") # leaves only the intercept
library(vars)
data(Canada)

prod <- differences(as.numeric(Canada[, 2]))
production <- data.frame(Prod = prod, Lag1 = dplyr::lag(prod), Lag2 = dplyr::lag(prod, 2))

m <- lm(Prod ~ Lag1, data = production)
standardizeCoefficients(m)

library(quantreg)
data(stackloss)

qr <- rq(stack.loss ~ stack.x, 0.25)
standardizeCoefficients(qr)
x <- 1:10
y <- 1 + x + rnorm(10)
m <- lm(y ~ x)

extractRegressionStatistics(m)
d <- data.frame(x = 1:200, y = 1:200 + rnorm(200))
m <- lm(y ~ x, d)                  # fit original model

idx_rm <- getRowsOutlierRemoval(m) # identify row indices of outliers
m <- lm(y ~ x, d[-idx_rm, ])       # refit model with outliers removed
texreg_tvalues(m_dummies)
texreg_tvalues(m_dummies, hide = "dummies")
texreg_tvalues(list(m, m_dummies))

qr25 <- rq(stack.loss ~ stack.x, 0.25)
qr50 <- rq(stack.loss ~ stack.x, 0.50)
qr75 <- rq(stack.loss ~ stack.x, 0.75)
texreg_tvalues(list(qr25, qr50, qr75))
library(car)
m <- lm(mpg ~ disp + hp + wt + drat, data = mtcars)

testDiagnostics(m)

Time series analysis

var.2c <- VAR(Canada, p = 2, type = "none")

standardizeCoefficients(var.2c$varresult$e)

std <- standardizeCoefficients(var.2c)
std$e
adf(USArrests, verbose = FALSE)
adf(USArrests, vars = c("Murder", "Rape"), type = "drift",
   filename = "adf.tex", verbose = FALSE)
unlink("adf.tex")
adf_levels <- adf(USArrests)
adf_diff1 <- adf(data.frame(Murder = diff(USArrests$Murder),
                            Assault = diff(USArrests$Assault),
                            UrbanPop = diff(USArrests$UrbanPop),
                            Rape = diff(USArrests$Rape)))
exportAdfDifferences(adf_levels, adf_diff1)
unlink("adf.tex")
cointegrationTable(USArrests, vars = c("Murder", "Rape"), K = 2, filename = "cointegration_eigen.tex")
unlink("cointegration_eigen.tex")
irf <- irf(var.2c, impulse = "e", response = "prod", boot = TRUE)
plotIrf(irf, ylab = "Production")
impulseResponsePlot(var.2c, impulse = "e", response = "prod", ylab = "Production", n.ahead = 5, filename = "irf_e_prod.pdf")
unlink("irf_e_prod.pdf")
testSpecification(var.2c)

Hooks to other packages

coeftostring(-0.000001, digits = 4) # the original function would return "-.0000"

d <- data.frame(y = 1:1000 - 0.0000001, x = 1:1000)
m <- lm(y ~ x, data = d)
texreg(m) # intercept would otherwise be "-0.00"
library(xtable)
xtable(matrix(1:4, nrow = 2) * -0.000001) # would otherwise return "-0.00"

Package development

rebuildPackage()
rebuildPackage(TRUE) # also runs README.Rmd

License

ResearchGroupTools is released under the MIT License.

Copyright (c) 2016 Stefan Feuerriegel


