R/accessdb.R

Defines functions wide2excel transpose_wide wide2long paste_clp long2wide get_data make_date expand_date series datasets datasources safe_query ugatsdb_reconnect

Documented in datasets datasources expand_date get_data long2wide make_date series transpose_wide ugatsdb_reconnect wide2excel wide2long

#' Global Identifier Macros
#'
#' @description
#' The macro \code{.IDvars} contains the string \code{c("DSID", "Series")} denoting
#' variables that uniquely identify series in the database.
#' \emph{Note} that the series code contained in "Series" alone is not sufficient to uniquely identify a series as some
#' series are recorded with the same code in multiple datasets (mostly either the same data aggregated at a different frequency, or a different collection of indicators).
#' For example goods exports with the series code "EX_G" are recorded in the datasets "BOU_MMI", "BOU_MMI_A" (annual data), and "BOU_MMI_FY" (fiscal year data).
#'
#' The macro \code{.Tvars} contains the string
#' \code{c("Date", "Year", "Quarter", "FY", "QFY", "Month", "Day")} denoting temporal identifiers generated by \code{\link{expand_date}}.
#' The "Date" variable is sufficient to uniquely identify a point in time in the database.
#'
#' @usage
#' .IDvars
#' .Tvars
#'
#' @examples
#' .IDvars
#' .Tvars
#'
#'
#' @seealso \code{\link{ugatsdb}}
#'
#' @format NULL
#' @aliases .IDvars .Tvars
#' @export .IDvars
#' @export .Tvars

.IDvars <- c("DSID", "Series")
.Tvars <- c("Date", "Year", "Quarter", "FY", "QFY", "Month", "Day")


#' Reconnect to Database
#'
#' This function terminates an existing connection to the database server and attempts to reconnect to it.
#' It is now somewhat redundant by the safe query mechanism introduced in v0.2.1 of the package, where each query is evaluated inside
#' \code{\link{tryCatch}} and the database connection is renewed if the query fails. This function can still be used to manually
#' renew the database connection.
#' @examples \donttest{
#' ugatsdb_reconnect()
#' }
#' @seealso \code{\link{ugatsdb}}
#'
#' @export ugatsdb_reconnect

ugatsdb_reconnect <- function() {
  if(length(.ugatsdb_con)) tryCatch(dbDisconnect(.ugatsdb_con), error = function(e) cat(""))
  assignInMyNamespace(".ugatsdb_con", .connect())
}

.ugatsdb_con <- NULL # Needed for initialization

safe_query <- function(query) {
  if(is.null(.ugatsdb_con)) ugatsdb_reconnect() # Initial connection
  tryCatch(dbGetQuery(.ugatsdb_con, query), error = function(e) {ugatsdb_reconnect(); dbGetQuery(.ugatsdb_con, query)})
}

#' Retrieve Data Sources Table
#'
#' This function pulls and returns a table called 'DATASOURCE' from the database.
#'
#' The 'DATASOURCE' table gives information about the various sources / providers of data in this database, including the source website, the number of datasets available from the source, a description of the source and the way data is accessed from the source.
#' @param ordered logical. \code{TRUE} orders the result in the order data was entered into the database, while \code{FALSE} returns the result in a random order, to the benefit of faster query execution.
#' @return A \code{\link[data.table]{data.table}} with information about the sources of data in the database.
#' @examples \donttest{
#' datasources()
#' }
#' @seealso \code{\link{datasets}}, \code{\link{ugatsdb}}
#'
#' @export datasources

datasources <- function(ordered = TRUE) {
  query <- "SELECT Source, Source_Url, NDatasets, Description, Access FROM DATASOURCE"
  if(ordered) query <- paste(query, "ORDER BY SRC_Order")
  # if(tryCatch(!dbIsValid(.ugatsdb_con), error = function(e) TRUE)) ugatsdb_reconnect()
  res <- safe_query(query)
  if(!fnrow(res)) stop("Query resulted in empty dataset. This means something is wrong with your internet connection, the connection to the database (try calling ugatsdb_reconnect()) or with the database itself.")
  setDT(res)
  # oldClass(res) <- c("data.table", "data.frame")
  return(res)
}

#' Retrieve Datasets Table
#'
#' This function pulls and return a table called 'DATASET' from the database.
#'
#' The 'DATASET' table gives information about the different datasets read into the database from various sources. It provides a unique id for each dataset, the frequency of data, the minimum and maximum time coverage, when the dataset was last updated, a description, the source (matching the 'Source' column in the 'DATASOURCE' table), and an (optional) url providing direct access to the raw data.
#' @param ordered logical. \code{TRUE} orders the result in the order data was entered into the database, while \code{FALSE} returns the result in a random order, to the benefit of faster query execution.
#' @return A \code{\link[data.table]{data.table}} with information about the available datasets in the database.
#' @examples \donttest{
#' datasets()
#' }
#' @seealso \code{\link{datasources}}, \code{\link{series}}, \code{\link{ugatsdb}}
#'
#' @export datasets

datasets <- function(ordered = TRUE) {
  query <- "SELECT DSID, Dataset, Frequency, DS_From, DS_To, Updated, Description, Source, DS_Url FROM DATASET"
  if(ordered) query <- paste(query, "ORDER BY DS_Order")
  # if(tryCatch(!dbIsValid(.ugatsdb_con), error = function(e) TRUE)) ugatsdb_reconnect()
  res <- safe_query(query)
  if(!fnrow(res)) stop("Query resulted in empty dataset. This means something is wrong with your internet connection, the connection to the database (try calling ugatsdb_reconnect()) or with the database itself.")
  res$DS_From <- as.Date(res$DS_From)
  res$DS_To <- as.Date(res$DS_To)
  res$Updated <- as.Date(res$Updated)
  setDT(res)
  # oldClass(res) <- c("data.table", "data.frame")
  return(res)
}

#' Retrieve Series Table
#'
#' This function pulls and returns a table called 'SERIES' from the database.
#'
#' @param dsid character. (Optional) id's of datasets matching the 'DSID' column of the 'DATASET' table (retrieved using \code{\link[=datasets]{datasets()}}) for which series information is to be returned.
#' @param dataset.info logical. \code{TRUE} returns additional information from the 'DATASET' table about the
#' datasets in which the series are recorded. \code{FALSE} only returns the raw 'SERIES' table.
#' @param ordered logical. \code{TRUE} orders the result in the order data was entered into the database, while \code{FALSE} returns the result in a random order, to the benefit of faster query execution.
#' @param return.query logical. \code{TRUE} will not query the database but instead return the constructed SQL query as a character string.
#'
#' @details The 'SERIES' table gives information about all of the time series in the database. Each series is given a code which is however not unique across datasets (see \code{\link{.IDvars}}).
#' Each series also has a label describing the series. Further information recorded are the minimum and maximum time coverage, and (optionally) a separate series source and url.
#' By default \code{dataset.info = TRUE} and the frequency of the data, the date when the dataset containing the series was last updated, the dataset and data source are added to the
#' 'SERIES' table from the 'DATASET' table.
#'
#' If \code{dataset.info = FALSE}, the 'DATASET' table is not joined to the 'SERIES' table, and \code{ordered = TRUE} only orders the series within each dataset to maintain the column order of series in the source data.
#' In that case the datasets are returned in alphabetic order of 'DSID', not the order in which they were entered into the 'DATASET' table.
#'
#' @return A \code{\link[data.table]{data.table}} with information about the available time series in the database.
#' @examples \donttest{
#' # By default returns all series with additional information
#' series()
#'
#' # Raw series table
#' series(dataset.info = FALSE)
#'
#' # Only series in the Monthly Macroeconomic Indicators of the BoU
#' series("BOU_MMI")
#' }
#' @seealso \code{\link{datasets}}, \code{\link{ugatsdb}}
#'
#' @export series

series <- function(dsid = NULL, dataset.info = TRUE, ordered = TRUE, return.query = FALSE) {
  if(length(dsid)) {
    if(dataset.info) {
      query <- "SELECT DSID, Series, Label, Frequency, S_From, S_To, Updated, Dataset, Source, S_Source, S_Url FROM SERIES NATURAL JOIN DATASET " # , DS_Url
    } else {
      query <- "SELECT DSID, Series, Label, S_From, S_To, S_Source, S_Url FROM SERIES "
    }
    query <- if(length(dsid) == 1L) paste0(query, "WHERE DSID = '", dsid, if(ordered) "' ORDER BY S_Order" else "'") else
                                    paste0(query, "WHERE DSID IN ('", paste(dsid, collapse = "', '"),
                                           if(!ordered) "')" else if(dataset.info) "') ORDER BY DS_Order, S_Order" else "') ORDER BY DSID, S_Order")
  } else {
    if(dataset.info) {
      query <- "SELECT DSID, Series, Label, Frequency, S_From, S_To, Updated, Dataset, Source, S_Source, S_Url FROM SERIES NATURAL JOIN DATASET" # , DS_Url
      if(ordered) query <- paste(query, "ORDER BY DS_Order, S_Order")
    } else {
      query <- "SELECT DSID, Series, Label, S_From, S_To, S_Source, S_Url FROM SERIES"
      if(ordered) query <- paste(query, "ORDER BY DSID, S_Order")
    }
  }
  if(return.query) return(query)
  # if(tryCatch(!dbIsValid(.ugatsdb_con), error = function(e) TRUE)) ugatsdb_reconnect()
  res <- safe_query(query)
  if(!fnrow(res)) stop("Query resulted in empty dataset. Please make sure the DSID esists by checking against the datasets() table. Alternatively check your connection to the database.")

  res$S_From <- as.Date(res$S_From)
  res$S_To <- as.Date(res$S_To)
  if(dataset.info) res$Updated <- as.Date(res$Updated)
  setDT(res)
  # oldClass(res) <- c("data.table", "data.frame")
  return(res)
}
#
# #' Retrieve Time Periods Table
# #'
# #' This function pulls a table called 'TIME' from the database, and returns it as a \emph{data.table} in R.
# #' The 'TIME' table provides temporal identifiers for daily data from 1950 to 2050. It is not a critical component
# #' of the database since all data can be queries based on the 'Date' variable in the 'DATA' table, and other
# #' temporal identifiers as present in the 'TIME' table can be generated using the \code{\link{expand_date}} function.
# #'
# #' @param as.factor logical. \code{TRUE} will generate quarters, fiscal years and months ('Quarter', 'FY', 'QFY', 'Month') as factor variables.
# #' \code{FALSE} will generate fiscal years as character and quarters and months as integer variables.
# #' @param ordered logical. \code{TRUE} ensures that the table is returned ordered by 'Date'.
# #'
# #' @examples
# #' times()
# #'
# #' @seealso \code{\link{datasets}}, \code{\link{series}}, \code{\link{ugatsdb}}
# #'
# #' @export times
#
# times <- function(as.factor = TRUE, ordered = TRUE) {
#   res <- safe_query(if(ordered) "SELECT * FROM TIME ORDER BY Date" else "SELECT * FROM TIME")
#   oldClass(res) <- c("data.table", "data.frame")
#   lab <- c('Date', 'Year', 'Quarter', 'Fiscal Year (July - June)', 'Quarter of Fiscal Year', 'Month', 'Day')
#   res$Date <- as.Date(res$Date)
#   vlabels(res) <- lab
#   if(!as.factor) return(res)
#   Q <- paste0("Q", 1:4)
#   res$Quarter = structure(res$Quarter, levels = Q, class = c("ordered", "factor")) # , "na.included"
#   res$FY = qF(res$FY, ordered = TRUE) # , na.exclude = FALSE # `oldClass<-`(`attr<-`(as.factor(res$FY), "label", "Fiscal Year (July - June)"), c("ordered", "factor", "na.included"))
#   res$QFY = structure(res$QFY, levels = Q, class = c("ordered", "factor")) # , "na.included"
#   res$Month = structure(res$Month, levels = month.name, class = c("ordered", "factor")) # , "na.included"
#   return(res)
# }

#' Generate Temporal Identifiers from a Date Column
#'
#' This function expands a date column and generates additional temporal identifiers from it (such as the year, month, quarter, fiscal year etc.).
#'
#' @param x either a vector of class 'Date', or coercible to date using \code{\link[base]{as.Date}}, or a data frame / list with a date variable in the first column.
#' @param gen character. A vector of identifiers to generate from \code{x}. The possible identifiers are found in \code{\link{.Tvars}}. The default setting is to generate all identifiers apart from "Day".
#' @param origin character / Date. Passed to \code{\link[base]{as.Date}}: for converting numeric \code{x} to date.
#' @param keep.date logical. \code{TRUE} will keep the date variable in the first column of the resulting dataset, \code{FALSE} will remove the date variable in favor of the generated identifiers.
#' @param remove.missing.date logical. \code{TRUE} will remove missing values in \code{x}. If \code{x} is a dataset, the corresponding rows will be removed.
#' @param sort logical. \code{TRUE} will sort the data by the date column.
#' @param as.factor \code{TRUE} will generate quarters, fiscal years and months ('Quarter', 'FY', 'QFY', 'Month') as factor variables. It is also possible to use \code{as.factor = "ordered"} to generate ordered factors.
#' \code{FALSE} will generate fiscal years as character and quarters and months as integer variables.
#' @param \dots not used.
#'
#' @return A \code{\link[data.table]{data.table}} containing the computed identifiers as columns. See Examples.
#'
#' @examples
#' # First a basic example
#' x <- seq.Date(as.Date("1999-01-01"), as.Date("2000-01-01"), by = "month")
#' expand_date(x)
#' expand_date(x, gen = c("Year", "Month"), keep.date = FALSE)
#' \donttest{
#' # Now using the API
#' expand_date(get_data("BOU_CPI")) # Getting Monthly CPI data from the Bank of Uganda
#'
#' # Same thing
#' get_data("BOU_CPI", expand.date = TRUE)
#' }
#'
#' @seealso \code{\link{make_date}}, \code{\link{ugatsdb}} % \code{\link{times}},
#' @export expand_date

expand_date <- function(x, gen = c("Year", "Quarter", "FY", "QFY", "Month"), origin = "1899-12-30",
                        keep.date = TRUE, remove.missing.date = TRUE, sort = TRUE,
                        as.factor = TRUE, ...) {
  lxl <- FALSE
  genopts <- c('Year', 'Quarter', 'FY', 'QFY', 'Month', 'Day')
  genlab <- c('Year', 'Quarter', 'Fiscal Year (July - June)', 'Quarter of Fiscal Year', 'Month', 'Day')
  ordered <- is.character(as.factor) && as.factor == "ordered"
  as.factor <- ordered || isTRUE(as.factor)
  if(is.numeric(gen)) {
    lab <- genlab[gen]
    gen <- genopts[gen]
  } else if(is.character(gen)) {
    lab <- genlab[ckmatch(gen, genopts, "Unknown gen option:")]
  } else stop("gen must be integer or character")

  if(is.list(x)) {
    lxl <- TRUE
    l <- .subset(x, -1L)
    x <- .subset2(x, 1L)
  }
  getFY <- function(y, m) {
    fy <- fifelse(as.integer(m) >= 7L, y, y - 1L)
    fy <- paste(fy, substr(as.character(fy + 1L), 3L, 4L), sep = "/")
    if(as.factor) qF(fy, ordered = ordered) else fy # , na.exclude = FALSE
  }
  getQ <- function(m) {
    Q <- ceiling(m / 3L)
    if(as.factor) structure(Q, levels = paste0("Q", 1:4), class = c(if(ordered) "ordered", "factor")) else Q # , "na.included"
  }
  getFQ <- function(m) {
    mod <- (m + 6L) %% 12L
    mod[mod == 0L] <- 12L
    getQ(mod)
  }
  if(!inherits(x, "Date"))
    x <- if(is.numeric(x)) as.Date(x, origin) else as.Date(x)
  if(remove.missing.date && anyNA(x)) {
    nna <- which(!is.na(x))
    x <- x[nna]
    if(lxl) l <- ss(l, nna)
  }
  if(!lxl && sort && is.unsorted(x)) x <- sort(x)
  xp <- as.POSIXlt(x)
  y <- xp$year + 1900L
  m <- xp$mon + 1L
  res <- lapply(setNames(as.vector(gen, "list"), gen), switch,
                Year = y,
                Quarter = getQ(m),
                FY = getFY(y, m),
                QFY = getFQ(m),
                Month = if(as.factor) structure(m, levels = month.name, class = c(if(ordered) "ordered", "factor")) else m, # , "na.included"
                Day = xp$mday,
                stop("Unknown gen option"))
  vlabels(res) <- lab
  if(keep.date) res <- c(list(Date = `attr<-`(x, "label", "Date")), res)
  if(lxl) {
    res <- c(res, l)
    if(sort && is.unsorted(x))
      res <- ss(res, order(x, method = "radix"))
  }
  return(qDT(res)) # need qDT here, res is a plain list...
}


#' Coerce Vectors to Dates
#'
#' This function coerces date strings i.e. \code{"YYYY-MM-DD"} or \code{"YYYY-MM"}, years e.g. \code{2015} (numeric or character),
#' year-quarters e.g. \code{"2015Q1"} or \code{"2015-Q1"}, year-months e.g. \code{"2015M01"} or \code{"2015-M01"}, fiscal years e.g. \code{"1997/98"} or numeric values representing dates (e.g. previously imported Excel date) to a regular R date.
#'
#' @param x a character date string \code{"YYYY-MM-DD"} or \code{"YYYY-MM"}, year-quarter \code{"YYYYQN"} or \code{"YYYY-QN"}, , year-month \code{"YYYYMNN"} or \code{"YYYY-MNN"}, fiscal year \code{"YYYY/YY"} or calendar year \code{YYYY} (numeric or character), or a numeric value corresponding to a date that can be passed to \code{\link[base]{as.Date.numeric}}.
#' @param end logical. \code{TRUE} replaces missing time information with a period end-date which is the last day of the period. \code{FALSE} replaces missing month and day information with \code{"-01"},
#' so the year date is the 1st of January, the fiscal year date the 1st of July, and for months / quarters the 1st day of the month / quarter.
#' @param origin a date or date-string that can be used as reference for converting numeric values to dates. The default corresponds to dates generated in Excel for Windows. See \code{\link[base]{as.Date.numeric}}.
#'
#' @return A \code{\link[base]{Date}} vector.
#' @examples
#' make_date("2011-05")
#' make_date(2011)
#' make_date("2011/12")
#' make_date("2011/12", end = TRUE)
#' make_date("2011Q1")
#' make_date("2011Q1", end = TRUE)
#'
#' @seealso \code{\link{expand_date}}, \code{\link{ugatsdb}} %, \code{\link{times}}
#' @export make_date
# Note: Before did all days ending on 28th
make_date <- function(x, end = FALSE, origin = "1899-12-30") {
  x1 <- x[1L]
  ncx <- nchar(x1)
  if(ncx == 4L) return(as.Date(paste0(x, if(end) "-12-31" else "-01-01")))
  if(is.numeric(x)) return(as.Date.numeric(x, origin))
  if(ncx >= 6L || ncx <= 8L) { # could be "1999/1"
    s5 <- substr(x1, 5L, 5L)
    if(s5 == "/") {
      x <- if(end) paste0(as.integer(substr(x, 1L, 4L)) + 1L, "-06-30") else
                   paste0(substr(x, 1L, 4L), "-07-01")
    } else if(s5 == "Q" || substr(x1, 6L, 6L) == "Q") {
      Q <- as.integer(substr(x, ncx, ncx))
      x <- if(end) paste0(substr(x, 1L, 4L), "-", Q * 3L, fifelse(as.logical(Q %% 2L), "-31", "-30")) else # fifelse requires logical argument
                   paste0(substr(x, 1L, 4L), "-", Q * 3L - 2L, "-01")
    } else {
      if(s5 == "M" || substr(x1, 6L, 6L) == "M") {
        st <- if(s5 == "M") 6L else 7L
        M <- substr(x, st, st + 1L)
        x <- paste0(substr(x, 1L, 4L), "-", M, "-01")
      } else { # Assuming now Year-Month type string
        x <- as.Date(if(ncx == 8L) x else paste0(x, "-01"))
      }
      if(end) x <- as.Date(format(x + 31L, "%Y-%m-01")) - 1L
    }
  }
  return(as.Date(x))
}


#' Retrieve Data from the Database
#'
#' This is the main function of the package to retrieve data from the database. It constructs an SQL query which is sent to the database and returns the data as a \code{\link[data.table]{data.table}} in R.
#'
#' @param dsid character. (Optional) id's of datasets matching the 'DSID' column of the 'DATASET' table (retrieved using \code{\link[=datasets]{datasets()}}). If none of the following arguments are used, all series from those datasets will be returned.
#' @param series character. (Optional) codes of series matching the 'Series' column of the 'Series' table (retrieved using \code{\link[=series]{series()}}).
#' @param from set the start time of the data retrieved by either supplying a start date, a date-string of the form \code{"YYYY-MM-DD"} or \code{"YYYY-MM"},
#' year-quarters of the form \code{"YYYYQN"} or \code{"YYYY-QN"}, a numeric year \code{YYYY} (numeric or character), or a fiscal year of the form \code{"YYYY/YY"}. These expressions are converted to a regular date by \code{\link{make_date}}.
#' @param to same as \code{from}: to set the time period until which data is retrieved. For expressions that are not full "YYYY-MM-DD" dates, the last day of the period is chosen.
#' @param labels logical. \code{TRUE} will also return labels (series descriptions) along with the series codes.
#' @param wide logical. \code{TRUE} calls \code{\link{long2wide}} on the result. \code{FALSE} returns the data in a long format without missing values (suitable for \code{ggplot2}).
#' @param expand.date logical. \code{TRUE} will call \code{\link{expand_date}} on the result.
#' @param ordered logical. \code{TRUE} orders the result by 'Date' and, if \code{labels = TRUE}, by series, maintaining the column-order of series in the dataset(s).
#' \code{FALSE} returns the result in a random order, to the benefit of faster query execution.
#' @param return.query logical. \code{TRUE} will not query the database but instead return the constructed SQL query as a character string.
#' @param \dots further arguments passed to \code{\link{long2wide}} (if \code{wide = TRUE}) or \code{\link{expand_date}} (if \code{expand.date = TRUE}), no conflicts between these two.
#'
#' @details If \code{labels = FALSE}, the 'SERIES' table is not joined to the 'DATA' table, and \code{ordered = TRUE} will order datasets and series retrieved in alphabetic order.
#' If \code{labels = TRUE} data is ordered by series and date within each dataset, preserving the order of columns in the dataset. If multiple datasets are received they are ordered alphabetically according to the 'DSID' column.
#'
#' It is possible query multiple series from multiple datasets e.g. \code{get_data(c("DSID1", "DSID2"), c("SERFROM1", "SERFROM2"))} etc., but care needs to be taken that the series queried do not occur in both datasets (see \code{\link{.IDvars}}, and check using \code{\link[=series]{series(c("DSID1", "DSID2"))}}).
#' Series from datasets at different frequencies can be queried, but, if \code{wide = TRUE}, this will result in missing values for all but the first observations per period in the lower frequency series.
#'
#' @return A \code{\link[data.table]{data.table}} with the result of the query.
#'
#' @examples \donttest{
#' # Return monthly macroeconomic indicators from the year 2000 onwards
#' get_data("BOU_MMI", from = 2000, wide = FALSE)
#'
#' # Return wide format with date expanded
#' get_data("BOU_MMI", from = 2000, expand.date = TRUE)
#'
#' # Same thing in multiple steps (with additional customization options):
#' library(magrittr) # Pipe %>% operators
#' get_data("BOU_MMI", from = 2000, wide = FALSE) %>% long2wide %>% expand_date
#'
#' # Getting a single series
#' get_data("BOU_MMI", "M2", 2000)
#'
#' # Getting High-Frequency activity indicators from BoU and Revenue & Expense from MoFPED
#' get_data(c("BOU_MMI", "MOF_TOT", "WB_WDI"), c("CIEA", "BTI", "REV_GRA", "EXP_LEN"))
#'
#' # Getting daily interest rates and plotting
#' library(xts)   # Time series class
#' get_data("BOU_I", from = 2018, wide = FALSE) %>%
#'    long2wide(names_from = "Label") %>%
#'    as.xts %>%
#'    plot(legend.loc = "topleft")
#' }
#' @seealso \code{\link{long2wide}}, \code{\link{expand_date}}, \code{\link{ugatsdb}}
#' @export get_data
#'

get_data <- function(dsid = NULL, series = NULL, from = NULL, to = NULL,
                     labels = TRUE, wide = TRUE, expand.date = FALSE,
                     ordered = TRUE, return.query = FALSE, ...) {

  if(anyDuplicated(series)) stop("duplicated series code: ", paste(series[duplicated(series)], collapse = ", "))
  if(labels) {
     data <- "DATA NATURAL JOIN SERIES"
     lab <- ", Label"
  } else {
    data <- "DATA"
    lab <- ""
  }
  ds0 <- is.null(dsid)
  if(!ds0 && length(dsid) == 1L) { # This is good -> more intelligent
     cond <- paste0("DSID = '", dsid, "'")
     vars <- paste0("Date, Series", lab, ", Value")
     if(ordered) ord <- if(labels) "S_Order, Date" else "Series, Date"
  } else {
     cond <- if(ds0) "" else paste0("DSID IN ('", paste(dsid, collapse = "', '"), "')")
     vars <- paste0("Date, DSID, Series", lab,", Value")
     if(ordered) ord <- if(labels) "DSID, S_Order, Date" else "DSID, Series, Date"
  }
  add <- if(ds0) "" else " AND "
  if(length(series)) {
    if(length(series) == 1L) {
      # vars <- sub(", Series", "", vars) # This is inconvenient -> cannot use long2wide anymore
      cond <- paste0(cond, add, "Series = '", series, "'")
    } else {
      cond <- paste0(cond, add, "Series IN ('", paste(series, collapse = "', '"), "')")
    }
  }
  if(length(from)) cond <- paste0(cond, " AND Date >= '", make_date(from) , "'")
  if(length(to))   cond <- paste0(cond, " AND Date <= '", make_date(to, end = TRUE) , "'")

  query <- if(ordered) paste("SELECT", vars, "FROM", data, "WHERE", cond, "ORDER BY", ord) else
                       paste("SELECT", vars, "FROM", data, "WHERE", cond)
  if(return.query) return(query)
  # if(tryCatch(!dbIsValid(.ugatsdb_con), error = function(e) TRUE)) ugatsdb_reconnect()
  res <- safe_query(query)
  if(!fnrow(res)) stop("Query resulted in empty dataset. Please make sure that the DSID, series-codes or the date-range supplied in your query are consistent with the available data. See datasets() and series(). Alternatively check your connection to the database.")
  res$Date <- as.Date(res$Date)
  setDT(res)
  # oldClass(res) <- c("data.table", "data.frame")
  if(wide) {
    res <- long2wide(res, ...) # Could optimize to omit ORDER BY clause if wide and length(series) > 1L
    if(length(series) > 1L) setcolorder(res, c("Date", series)) # what about multiple datasets ? -> should work, as long as series are unique..
  }
  if(expand.date) return(expand_date(res, ...)) else return(res)
}

# long2wide(get_data(series = "M2"), names_from = "DSID") %>% ftransform(BOU_MMI_A = ffirst(BOU_MMI_A, Year, "replace_fill")) %>% View

# TODO: Make these functions class-agnostic?
# rename args using "." ??

#' Reshape Long API Data to Column-Based Format
#'
#' This function automatically reshapes long (stacked) raw data from the API (\code{\link[=get_data]{get_data(..., wide = FALSE)}}) to a wide format where each variable has its own column.
#'
#' @param data raw data from the API: A long format data frame where all values are stacked in a value column.
#' @param id_cols character. Temporal identifiers of the data. By default all variables in \code{\link{.Tvars}} are selected.
#' @param names_from character. The column containing the series codes. These will become the names of the new columns in the wider data format.
#' @param values_from character. The column containing the data values.
#' @param labels_from character. The column containing the labels describing the series.
#' @param expand.date logical. \code{TRUE} will call \code{\link{expand_date}} on the data after reshaping.
#' @param \dots further arguments passed to \code{\link[data.table]{dcast}} or \code{\link{expand_date}}, no conflicts between these two.
#'
#' @return A \code{\link[data.table]{data.table}} with the reshaped data.
#'
#' @examples \donttest{
#' # Return monthly macroeconomic indicators from the year 2000 onwards
#' long2wide(get_data("BOU_MMI", from = 2000, wide = FALSE))
#' }
#' @seealso \code{\link{wide2long}}, \code{\link{wide2excel}}, \code{\link{ugatsdb}}
#'
#' @export long2wide
#'

long2wide <- function(data, id_cols = intersect(.Tvars, names(data)), # setdiff(names(data), c("Series", "Label", "Value")),
                      names_from = "Series", values_from = "Value",
                      labels_from = if(any(names(data) == "Label")) "Label" else NULL,
                      expand.date = FALSE, ...) {
  # Needed for columns to be cast in order...
  if(length(names_from) > 1L) {
    data[, names_from] <- lapply(.subset(data, names_from), function(x) factor(x, levels = funique(x, method = "hash")))
  } else {
    data[[names_from]] <- factor(.subset2(data, names_from), levels = funique(.subset2(data, names_from), method = "hash"))
  }
  if(anyDuplicated(get_vars(data, c(names_from, id_cols))))
    data <- collapv(data, c(names_from, id_cols), fmedian, ffirst, sort = FALSE, na.rm = FALSE)
  form <- as.formula(paste0(paste_clp(id_cols), " ~ ", paste_clp(names_from)))
  res <- dcast(qDT(data), form, value.var = values_from, fill = NA, ...) # , fun.aggregate = median, na.rm = TRUE
  if(length(labels_from)) {
    noid <- -seq_along(id_cols)
    namlab <- funique(.subset(data, c(names_from, labels_from)))
    nam <- if(length(names_from) == 1L) namlab[[1L]] else
      do.call(paste, c(namlab[names_from], list(sep = "_"))) # make sure sep is same as dcast...
    vlabels(res)[noid] <- namlab[[labels_from]][ckmatch(names(res)[noid], nam)]
  }
  if(expand.date) return(expand_date(res, ...)) else return(res)
}

# Helper used in long2wide
paste_clp <- function(x) if(length(x) == 1L) x else paste(x, collapse = " + ")


#' Reshape Column-Based Data to Long Format
#'
#' This function automatically reshapes wide (column-based) data into a long format akin to the format of the raw data coming from the database (\code{\link[=get_data]{get_data(..., wide = FALSE)}}).
#'
#' @param data a wide format data frame where all series have their own column.
#' @param id_cols character. Temporal identifiers of the data. By default all variables in \code{\link{.Tvars}} are selected.
#' @param to_value character. The names of all series to be stacked into the long format data frame.
#' @param variable_name character. The name of the variable to store the names of the series.
#' @param value_name character. The name of the variable to store the data values.
#' @param label_name character. The name of the variable to store the series labels.
#' @param na.rm logical. \code{TRUE} will remove all missing values from the long data frame.
#' @param \dots further arguments passed to \code{\link[data.table]{melt}}.
#'
#' @return A \code{\link[data.table]{data.table}} with the reshaped data.
#'
#' @examples \donttest{
#' # Return monthly macroeconomic indicators from the year 2000 onwards
#' data <- get_data("BOU_MMI", from = 2000)
#' wide2long(data)
#' }
#' @seealso \code{\link{long2wide}}, \code{\link{ugatsdb}}
#'
#' @export wide2long
#'

wide2long <- function(data, id_cols = intersect(.Tvars, names(data)),
                      to_value = setdiff(names(data), id_cols),
                      variable_name = "Series", value_name = "Value",
                      label_name = "Label", na.rm = TRUE, ...) {
  if(length(label_name)) labs <- vlabels(unattrib(.subset(data, to_value)))
  res <- melt(qDT(data), id_cols, to_value, variable_name, value_name,
              na.rm = na.rm, variable.factor = FALSE, ...)
  vlabels(res) <- NULL # needed
  if(length(label_name) && !allNA(labs)) {
    # melt always preserves column order (otherwise could rep with names and match series column)
    labs <- if(na.rm) rep(labs, fnobs(.subset(data, to_value))) else
                      rep(labs, each = fnrow(data))
    add_vars(res, fncol(res)) <- setNames(list(labs), label_name)
  }
  return(res)
}

# all.equal(roworderv(get_data("bou_mmi", from = 2000, to = 2005)),
#           roworderv(wide2long(long2wide(get_data("bou_mmi", from = 2000, to = 2005)))))


#' Transpose a Wide Dataset to a Row-Based Format
#'
#' This function is called by \code{\link{wide2excel}} with option \code{transpose = TRUE} to generate a row-based tabular data format from a wide data frame in R that is suitable for exporting to Excel.
#'
#' @param data a wide format data frame where each column is a variable and the first variable uniquely identifies the data.
#' @param date.format a format for date columns which is passed to \code{\link[base]{format.Date}}. When transposing wide, dates are converted to character. The default R YYYY-MM-DD format for dates is often not recognized by Excel. By default dates are transformed to DD/MM/YYYY format which Excel (UK English) recognizes. Putting \code{FALSE} here does not transform dates into another format.
#'
#' @return A transposed data frame or \code{\link[data.table]{data.table}} (the class of the input is preserved).
#'
#' @examples \donttest{
#' transpose_wide(get_data("BOU_CPI"))
#' }
#' @seealso \code{\link[data.table]{transpose}}, \code{\link{long2wide}}, \code{\link{wide2excel}}, \code{\link{ugatsdb}}
#' @export transpose_wide

transpose_wide <- function(data, date.format = "%d/%m/%Y") {
  nnum <- cat_vars(data, "logical")[-1L]
  lab <- vlabels(data)[-1L]

  if(is.character(date.format) && length(dv <- date_vars(data, "indices"))) {
    get_vars(data, dv) <-  if(length(dv) == 1L) format(.subset2(data, dv), date.format) else
                           lapply(.subset(data, dv), format, date.format)
  }

  if(any(nnum) && !all(nnum)) {
    res <- list(`numeric data` = add_vars(transpose(get_vars(data, c(TRUE, !nnum)),
                                                    make.names = 1L, keep.names = "Variable"),
                                          list(Label = lab[!nnum]), pos = 2L),
                `non-numeric data` =  add_vars(transpose(get_vars(data, c(TRUE, nnum)),
                                                         make.names = 1L, keep.names = "Variable"),
                                               list(Label = lab[nnum]), pos = 2L))
  } else {
    res <- transpose(data, make.names = 1L, keep.names = "Variable")
    add_vars(res, 2L) <- list(Label = lab)
  }

  attr(res, "transposed") <- TRUE
  if(inherits(data, "data.table")) {
    if(inherits(res, "data.table")) return(qDT(res, keep.attr = TRUE))
    for(i in 1:2) setDT(res[[i]])
  }
  return(res)
}

#' Export Wide Data to Excel
#'
#' This function exports a wide format dataset to a column- (default) or row-oriented Excel format.
#'
#' @param data a wide dataset from \code{\link{get_data}} or reshaped to a wide format with \code{\link{long2wide}}.
#' @param \dots further arguments to \code{\link[writexl]{write_xlsx}}. As a minimum a path needs to be supplied that ends with the name of the Excel file. See Examples.
#' @param transpose logical. If \code{TRUE}, the result is returned in a row-oriented Excel format. The default is column oriented (same as the dataset in R).
#' @param transpose.date.format argument passed to \code{\link{transpose_wide}}, setting the format of date columns when data is transposed.
#'
#' @examples
#' \dontrun{
#' # Getting macroeconomic indicators from Bank of Uganda in fiscal years
#' data <- get_data("BOU_MMI_FY", from = "2000/01")
#'
#' # Saving to different Excel formats
#' wide2excel(data, "BOU_MMI_FY.xlsx")
#' wide2excel(data, "BOU_MMI_FY.xlsx", transpose = TRUE)
#'
#' # Saving to alternative path
#' wide2excel(data, "C:/Users/.../BOU_MMI_FY.xlsx")
#' }
#'
#' @seealso \code{\link{transpose_wide}}, \code{\link[writexl]{write_xlsx}}, \code{\link{ugatsdb}}
#'
#' @export wide2excel
#'

wide2excel <- function(data, ..., transpose = FALSE, transpose.date.format = "%d/%m/%Y") {
  ntpl <- !isTRUE(attr(data, "transposed"))
  if(transpose && ntpl) {
    res <- transpose_wide(data, transpose.date.format)
  } else if(ntpl) {
    res <- list(data = data, labels = namlab(data))
  } else {
    res <- data
  }
  if(missing(...)) stop("Need to at least provide path to file. See help(write_xlsx).")
  return(write_xlsx(res, ...))
}

Try the ugatsdb package in your browser

Any scripts or data that you put into this service are public.

ugatsdb documentation built on Nov. 26, 2021, 1:06 a.m.