R/tcpDownload.R

Defines functions tcpDownload

Documented in tcpDownload

#' Download TCP files from Github
#' 
#' Download raw XML files from the Text Creation Partnership's Github repository.
#' 
#' @param results A dataframe, usually the search results generated using 
#'               \code{searchTCP}. Must contain a column of TCP numbers with the 
#'               name 'TCP'.
#'               
#' @section To download the entire TCP corpus:
#' If you'd like to download the entire TCP corpus, run two commands: 
#' \enumerate{
#'     \item \code{data(tcp)} (Activates the TCP data in your R environment.)
#'     \item \code{\link{tcpDownload(tcp)}} (Begins the download.)
#' }
#' 
#' Be careful: the entire TCP is several GBs, so the download will take awhile.
#' 
#' @examples
#' tcpDownload(results)
#'
#' @export
tcpDownload = function(results) {
  if (!requireNamespace("httr", quietly = TRUE)) {
    stop("tei2r uses a package called 'httr' to download XML content from the web. Please install it
         using the command > install.packages('httr'), then re-try your download.")
  }
  selection = results$TCP
  urls = paste("https://raw.githubusercontent.com/textcreationpartnership/",
               selection,
               "/master/",
               selection,
               ".xml",
               sep="")
  files = paste(selection,".xml",sep="")
  for (i in 1:length(urls)) {
    data.r = httr::GET(url = urls[i])
    data.v = httr::content(data.r)
    filename = files[i]
    print(paste("Downloading file: ", selection[i], '.xml', sep=""))
    write.table(data.v,filename,quote = F,row.names = F, col.names = F, fileEncoding="UTF-8")
  }
  print("File download complete.")
}
michaelgavin/tei2r documentation built on May 22, 2019, 9:50 p.m.