R/find_file_structure.R

Defines functions find_file_structure

Documented in find_file_structure

#' Finds the file structure of the data website
#'
#' Finds all subfolders containing data and lists the names of the files
#' within each subfolder.
#'
#' @param years numeric vector. Years for which to obtain the file structure.
#'   Folders are selected when their name starts with one of the years.
#'   Default = NULL (find structure for all data)
#'
#' @return Named list with one element per folder. Each element is a
#'   character vector of the file names found in that folder (empty if the
#'   folder lists no matching files)
#'
#' @examples
#' \dontrun{
#' # To find the structure for years 2000, ..., 2010
#' fileStr <- find_file_structure(years = 2000:2010)
#'
#' # To find the structure for all data present
#' fileStr <- find_file_structure()
#' }
#'
#' @export
find_file_structure <- function(years = NULL) {
  # url where all data is stored
  dataPath <- "https://www.ncei.noaa.gov/data/sea-surface-temperature-optimum-interpolation/v2.1/access/avhrr/"

  # read a web page and return its text as whitespace-delimited tokens
  read_tokens <- function(url) {
    pageText <- xml2::xml_text(xml2::read_html(url))
    strsplit(pageText, "\\s+")[[1]]
  }

  # folder entries are the tokens containing exactly one "/";
  # the folder name is whatever precedes that "/"
  tokens <- read_tokens(dataPath)
  entries <- tokens[stringr::str_count(tokens, "/") == 1]
  folders <- stringr::str_trim(sub("/.*$", "", entries))
  # an empty name cannot be used as a list key or a sub-path
  folders <- folders[nzchar(folders)]

  # pick out the folders based on the years required
  if (!is.null(years)) {
    years <- unique(as.character(years))
    years <- years[!is.na(years)]
    # Match on the leading characters only: searching for the year anywhere
    # in the name would, for example, select folder "202010" for year 2010.
    # Results are kept in the order of the requested years.
    folders <- unlist(
      lapply(years, function(ayear) folders[startsWith(folders, ayear)]),
      use.names = FALSE
    )
  }

  # loop over each folder and get filenames of contents
  fileStructure <- list()
  for (afol in folders) {
    message(paste0("Processing webpage for yyyymm = ", afol))
    tokens <- read_tokens(paste0(dataPath, afol))

    # A file name runs from "oisst" up to the first ".nc" that follows it.
    # Tokens without such a span are dropped, so a folder with no data files
    # yields character(0) instead of raising an error.
    fileStructure[[afol]] <- regmatches(
      tokens,
      regexpr("oisst.*?\\.nc", tokens, perl = TRUE)
    )
  }

  fileStructure
}
andybeet/oisst documentation built on Dec. 7, 2022, 6:19 p.m.