R/fake_data.R

Defines functions fake_panel fake_br fake_tcod

Documented in fake_br fake_panel fake_tcod

#' Generate a panel of firm sales data.
#'
#' This function generates a panel (```tbl```) of
#' firm sales.
#'
#' @param N The number of individuals
#'
#' @param T The number of time periods
#'
#' @param I The number of industries
#'
#' @param mean_sales Cross-section mean of sales, in millions of $
#'
#' @param sd_sales Cross-section standard deviation of sales
#'
#' @param model Arima model arguments; a list of
#' order = c(AR order, degree of differencing, MA order),
#' ar = (AR parameter), ma = (MA parameter)
#'
#' @param missing A number between 0 and 1; the fraction of the sample that should be missing at
#' random
#'
#'
#' @return A tbl of panel data
#' @keywords fake panel data
#'
#' @importFrom magrittr "%>%"
#'
#' @examples
#'
#' fake_panel(N = 10000, T = 50)
#'
#' fake_panel(N = 10, T = 20, missing = 0.2)
#'
#' @export
fake_panel <- function(N, T, I = 2, mean_sales = 30, sd_sales = 0.1 * mean_sales,
                     model = list(order = c(1,0,1), ar = 0.1, ma = 0.1),
                     missing = 0) {
  # make fake data with N observations, T periods, missing data,
  # if I is missing, just drop it from the tbl?
  # use var names as names?
  # growth_rates <- arima.sim(model = model, n = T) / 100 # the 100 somehow scales it properly.
  sales_0 <- rlnorm(n = N, meanlog = log(mean_sales), sdlog = log(sd_sales)) %>% floor()

  f <- function(T) {
    arima.sim(model, n = T) / 100
  }

  static <- tibble::tibble(id = 1:N, I = sample.int(n = I, size = N, replace = TRUE), sales = sales_0)
  pan <- tidyr::crossing(id = 1:N, year = 1:T)
  pan$g <- replicate(N, f(T=T)) %>% matrix(nrow = N*T, ncol = 1, byrow = TRUE) %>% .[,1]
  pan %>%
    dplyr::left_join(static, by = 'id') %>%
    dplyr::group_by(id) %>%
    dplyr::mutate(sales = sales * cumprod(1 + g)) %>%
    dplyr::ungroup() %>%
    dplyr::sample_frac(size = 1 - missing) %>%
    dplyr::arrange(id, year)
}

# library(microbenchmark)
# library(purrr)
# h <- function(i) {
#   microbenchmark(replicate(i, f(T=T)) %>%
#                    matrix(nrow = i*T, ncol = 1, byrow = TRUE) %>%
#                    .[,1]) %>%
#     .$time %>% mean() %>% `/`(., 1e06)
# }
# tblx <- tibble(i = (1:100)*100, t = map_dbl(i, h))
# library(ggplot2)
# tblx %>% ggplot(aes(x = i, y = t)) + geom_line()

# tblx %>% mutate(t = microbenchmark(replicate(i, f(T=T)) %>%
#                                      matrix(nrow = i*T, ncol = 1, byrow = TRUE) %>%
#                                      .[,1]) %>%
#                        .$time %>% mean() %>% `/`(., 1e06))
#
# 10 %>% replicate(f(T=T)) %>% matrix(nrow = i*T, ncol = 1, byrow = TRUE) %>% .[,1]


# tblx %>% mutate(t = map(i, h))


#' Return a list of firm names and addresses.
#'
#' This function returns a list of firm names
#' and addresses.
#'
#' @return A tbl of br names and addresses
#' @keywords fake firm data
#'
#' @importFrom magrittr "%>%"
#'
#' @examples
#'
#' fake_br()
#'
#' @export
fake_br <- function() {

  br <- tibble::tibble(
    name = c("A.-B. SECURITY",
             "Armada Security Canada",
             "Halfway River Safety Limited",
             "RNN Sales & Réntals",
             "Tim Tom Construction & Concrete",
             "All Rhodes Pilot Service",
             "Canadian Quality Control Inc.",
             "Falcon Contracting Ltd.",
             "1 Nation Distribution",
             "Blanshard Group",
             "Bulkley Valley Motel Ltd."),
    address = c("Unit 212, 833 103 Ave",
                "9605 14 St",
                "801 102 Ave",
                "P.O. Box 143, Main Stn",
                "1205 116th Ave, #499",
                "7485 Sunhill Rd",
                "801 102 Ave",
                "8555 Pacific St",
                "8555 Pacific St",
                "101-4442 West Saanich Rd",
                "P.O. Box 143, Main Stn"),
    postal_code = c("V1G2G2", "V1G3Y1", "V1G2B4", "V1G4E9", "V1G4P5",
                    "V2N6E7", "V1G2B4", "V1G3Y1", "V1G3Y1", "V1G4E9", "V1G4E9"),
    city = rep_len("Dawson Creek", 11),
    province = rep_len("59", 11)
  ) %>% tibble::rownames_to_column(var = "id")
  br
}


#' Return a list of shipments.
#'
#' This function returns a list of shipments.
#'
#' @return A tbl of shipments
#' @keywords fake shipment data
#'
#' @importFrom magrittr "%>%"
#'
#' @examples
#'
#' fake_tcod()
#'
#' @export
fake_tcod <- function(br = fake_br()) {

  tcod <- expand.grid(1:dim(br)[1], 1:dim(br)[1]) %>%
    tibble::as.tibble() %>%
    dplyr::rename(id.x = Var1, id.y = Var2) %>%
    dplyr::mutate_all(as.character) %>%
    dplyr::filter(id.x != id.y)

  tcod <- tcod %>%
    dplyr::left_join(br, by = c("id.x" = "id")) %>%
    dplyr::left_join(br, by = c("id.y" = "id"))

  tcod <- tcod %>% dplyr::sample_frac(size = 0.9, replace = TRUE)
}
tweed1e/fakedata documentation built on May 29, 2019, 10:51 a.m.