R/getZipUrls.R


##############################################################################################
#' @title Get and store the file names, S3 URLs, file size, and download status (default = 0) in a data frame

#' @author
#' Claire Lunch \email{clunch@battelleecology.org}
#' Christine Laney \email{claney@battelleecology.org}

#' @description Used to generate a data frame of available zipfile URLs.
#'
#' @param month.urls Vector of monthly data API URLs to query for available files
#' @param avg Global variable for averaging interval
#' @param package Global variable for package type (basic or expanded)
#' @param dpID Global variable for data product ID
#' @param release Data release to be downloaded
#' @param tabl Table name to get
#' @param messages Error/warning messages from previous steps
#' @param include.provisional Should provisional data be included?
#' @param token User specific API token (generated within data.neonscience.org user accounts)

#' @return A data frame containing file names, S3 URLs, file size, and download status (default = 0)
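
#' @examples
#' \dontrun{
#' # Minimal usage sketch (not run). getZipUrls() is an internal helper used
#' # during the download stream; the month URL below is an assumed data API
#' # endpoint, shown only to illustrate the expected input format.
#' month.urls <- paste0("https://data.neonscience.org/api/v0/data/",
#'                      "DP1.10098.001/HARV/2019-07")
#' zips <- getZipUrls(month.urls, avg="all", package="basic",
#'                    dpID="DP1.10098.001", release="current",
#'                    messages="", tabl="all", include.provisional=FALSE)
#' }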

#' @references
#' License: GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007

# Changelog and author contributions / copyrights
#   Claire Lunch (2018-02-19): original creation
#   Christine Laney (2018-03-05): Added functionality to get new list of URLs if the old ones expire, during the download stream.

##############################################################################################

getZipUrls <- function(month.urls, avg, package, dpID, 
                       release, messages, tabl, include.provisional,
                       token = NA_character_) {

  writeLines("Finding available files")
  pb <- utils::txtProgressBar(style=3)
  utils::setTxtProgressBar(pb, 0)
  
  # get all the file names
  tmp.files <- vector("list", length(month.urls))
  for(j in 1:length(month.urls)) {

    tmp.files[[j]] <- getAPI(month.urls[j], token=token)
    
    if(tmp.files[[j]]$status_code==500) {
      messages <- c(messages, paste("Query for url ", month.urls[j],
                                    " failed. API may be unavailable; check data portal data.neonscience.org for outage alert.",
                                    sep=""))
      next
    }
    tmp.files[[j]] <- jsonlite::fromJSON(httr::content(tmp.files[[j]], as="text", encoding='UTF-8'),
                                         simplifyDataFrame=T, flatten=T)
    utils::setTxtProgressBar(pb, j/length(month.urls))
  }
  
  utils::setTxtProgressBar(pb, 1)
  close(pb)
  
  # if a release is selected, subset to the release
  if(release!="current") {
    tmp.ind <- lapply(tmp.files, function(x) {
      x$data$release==release
    })
    tmp.files <- tmp.files[which(unlist(tmp.ind))]
    
    if(length(tmp.files)==0) {
      stop(paste("No files found for release ", release, 
                 " and query parameters. Check release name and dates.", sep=""))
    }
  }
  
  # if include.provisional==F, exclude provisional data
  if(!include.provisional) {
    tmp.ind <- lapply(tmp.files, function(x) {
      x$data$release!="PROVISIONAL"
    })
    if(length(which(unlist(tmp.ind)))!=length(tmp.files)) {
      messages <- c(messages, "Provisional data were excluded from available files list. To download provisional data, use input parameter include.provisional=TRUE.")
    }
    tmp.files <- tmp.files[which(unlist(tmp.ind))]
    
    if(length(tmp.files)==0) {
      stop("All files found were provisional. Modify download query (dates and/or sites) or, if you want to use provisional data, use input parameter include.provisional=TRUE.")
    }
  }

  # identify index of most recent publication date, and most recent publication date by site
  rdme.nm <- character(length(tmp.files))
  site.nm <- character(length(tmp.files))
  for(k in 1:length(tmp.files)) {
    if(length(tmp.files[[k]]$data$files)!=0) {
      rdme.nm[k] <- tmp.files[[k]]$data$files$name[grep("readme", tmp.files[[k]]$data$files$name)[1]]
      if(nchar(rdme.nm[k])==0) {
        next
      }
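      # readme names follow the pattern (assumed here)
      # NEON.DOM.SITE.DPL.PRNUM.REV.readme.YYYYMMDDTHHMMSSZ.txt:
      # characters 10-13 are the four-letter site code, and the 16 characters
      # before the .txt extension are the publication timestamp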
      site.nm[k] <- substring(rdme.nm[k], 10, 13)
      rdme.nm[k] <- substring(rdme.nm[k], nchar(rdme.nm[k])-19, nchar(rdme.nm[k])-4)
    }
  }
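  # max.pub = index of the most recent publication overall (used below for the
  # variables and readme files); max.pub.site = index of the most recent
  # publication at each site (used below for the sensor position files)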
  max.pub <- which(rdme.nm==max(rdme.nm, na.rm=T))[1]
  if(length(unique(site.nm))==1) {
    max.pub.site <- max.pub
  } else {
    max.pub.site <- numeric(length(unique(site.nm)))
    max.site.val <- tapply(rdme.nm, site.nm, max, na.rm=T)
    ind <- 0
    for(m in unique(site.nm)) {
      ind <- ind + 1
      max.pub.site[ind] <- which(rdme.nm==max.site.val[m] & site.nm==m)[1]
    }
  }

  # stash the URLs for just the zips in an object
  zip.urls <- c(NA, NA, NA, NA)
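  # columns are file name, download URL, file size, and release tag (named at the
  # end of the function); the initial row of NAs is a placeholder, so if no files
  # are appended zip.urls stays a plain vector and the no-files check below catches it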
  for(i in 1:length(tmp.files)) {

    # check for no files
    if(length(tmp.files[[i]]$data$files)==0) {
      messages <- c(messages, paste("No files found for site",
                                    substring(month.urls[i],
                                              nchar(month.urls[i])-11,
                                              nchar(month.urls[i])-8),
                                    "and month", substring(month.urls[i],
                                                           nchar(month.urls[i])-6,
                                                           nchar(month.urls[i])), sep=" "))
      next
    }

    # if only one averaging interval or one table is requested, filter by file names
    if(avg!="all" | tabl!="all") {

      # start with metadata
      # get url for most recent variables & readme
      if(i==max.pub) {
        which.var <- grep("variables", tmp.files[[i]]$data$files$name, fixed=T)[1]
        if(!is.na(which.var)) {
          zip.urls <- rbind(zip.urls, cbind(tmp.files[[i]]$data$files$name[which.var],
                                            tmp.files[[i]]$data$files$url[which.var],
                                            tmp.files[[i]]$data$files$size[which.var],
                                            rep(tmp.files[[i]]$data$release, 
                                                length(tmp.files[[i]]$data$files$name[which.var]))))
        }
        
        which.read <- grep("readme", tmp.files[[i]]$data$files$name, fixed=T)[1]
        if(!is.na(which.read)) {
          zip.urls <- rbind(zip.urls, cbind(tmp.files[[i]]$data$files$name[which.read],
                                            tmp.files[[i]]$data$files$url[which.read],
                                            tmp.files[[i]]$data$files$size[which.read],
                                            rep(tmp.files[[i]]$data$release, 
                                                length(tmp.files[[i]]$data$files$name[which.read]))))
        }
        
      }
      
      # add url for most recent sensor position file for each site
      if(i %in% max.pub.site) {
        
        which.sens <- grep("sensor_position", tmp.files[[i]]$data$files$name, fixed=T)[1]
        if(!is.na(which.sens)) {
          zip.urls <- rbind(zip.urls, cbind(tmp.files[[i]]$data$files$name[which.sens],
                                            tmp.files[[i]]$data$files$url[which.sens],
                                            tmp.files[[i]]$data$files$size[which.sens],
                                            rep(tmp.files[[i]]$data$release, 
                                                length(tmp.files[[i]]$data$files$name[which.sens]))))
        }
        
      }
      
      # drop duplicate files by hash
      unique.files <- tmp.files[[i]]$data$files[!base::duplicated(tmp.files[[i]]$data$files$md5),]
      
      # select files by averaging interval
      if(avg!="all") {
        all.file <- union(grep(paste(avg, "min", sep=""), unique.files$name, fixed=T),
                          grep(paste(avg, "_min", sep=""), unique.files$name, fixed=T))
        
        if(length(all.file)==0) {
          messages <- c(messages, paste("No files found for site", tmp.files[[i]]$data$siteCode,
                                        "and month", tmp.files[[i]]$data$month, sep=" "))
          next
        }
      }
      
      if(tabl!="all") {
        all.file <- grep(paste("[.]", tabl, "[.]", sep=""), unique.files$name)
      }

      # skip silently if the table has no files in this month; a message here would
      # print for every product-site-month lacking the table (e.g. when downloading
      # only a litter chemistry table)
      if(length(all.file)==0) {
        next
      }

      # if package==expanded, check that expanded package exists
      # if it doesn't, download basic package
      pk <- package
      pk.files <- grep(pk, unique.files$name, fixed=T)
      if(pk=="expanded") {
        if(length(pk.files)==0) {
          pk <- "basic"
          pk.files <- grep(pk, unique.files$name, fixed=T)
          messages <- c(messages, paste("No expanded package found for site ",
                                        tmp.files[[i]]$data$siteCode, " and month ",
                                        tmp.files[[i]]$data$month,
                                        ". Basic package downloaded instead.",
                                        sep=""))
        }
      }

      # subset to package. expanded package can contain basic files,
      # have to account for that when downloading by file and not by zip
      if(length(intersect(pk.files, all.file))==0) {
        which.file <- all.file
      } else {
        which.file <- intersect(pk.files, all.file)
      }

      # check again for no files
      if(length(which.file)==0) {
        messages <- c(messages, paste("No basic package files found for site",
                                      tmp.files[[i]]$data$siteCode,
                                      "and month", tmp.files[[i]]$data$month, sep=" "))
        next
      }

      zip.urls <- rbind(zip.urls, cbind(unique.files$name[which.file],
                                        unique.files$url[which.file],
                                        unique.files$size[which.file],
                                        rep(tmp.files[[i]]$data$release, 
                                            length(unique.files$name[which.file]))))

    # if downloading everything for product-site-month, instead of specific files, get zips
    } else {

      # check for packages section in response
      if("packages" %in% names(tmp.files[[i]]$data)) {
        
        # check for no files
        if(length(tmp.files[[i]]$data$packages)==0) {
          messages <- c(messages, paste("No files found for site", tmp.files[[i]]$data$siteCode,
                                        "and month", tmp.files[[i]]$data$month, sep=" "))
          next
        }
        
        # if package==expanded, check that expanded package exists
        # if it doesn't, download basic package
        pk <- package
        if(pk=="expanded") {
          if(!pk %in% tmp.files[[i]]$data$packages$type) {
            pk <- "basic"
            messages <- c(messages, paste("No expanded package found for site ",
                                          tmp.files[[i]]$data$siteCode, " and month ",
                                          tmp.files[[i]]$data$month,
                                          ". Basic package downloaded instead.",
                                          sep=""))
          }
        }

        # get the file name, and estimate the size
        z <- tmp.files[[i]]$data$packages$url[which(tmp.files[[i]]$data$packages$type==pk)]
        h <- getAPIHeaders(apiURL=z, token=token)
        
        # if no response, move to next silently - message will be printed by getAPIHeaders()
        if(is.null(h)) {
          next
        }
        
        flhd <- httr::headers(h)
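        # the zip file name comes from the content-disposition header, which has
        # the form: inline; filename="<zip file name>"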
        flnm <- gsub('\"', '', flhd$`content-disposition`, fixed=T)
        flnm <- gsub("inline; filename=", "", flnm, fixed=T)
        sz <- sum(tmp.files[[i]]$data$files$size[grep(pk, tmp.files[[i]]$data$files$name)], 
                  na.rm=T)
        rel <- rep(tmp.files[[i]]$data$release, length(flnm))
        
        zip.urls <- rbind(zip.urls, cbind(flnm, z, sz, rel))
        
      } else {
      
      # if no packages, look for pre-packaged zip files
      all.zip <- grep(".zip", tmp.files[[i]]$data$files$name, fixed=T)

      # check for no zips
      if(length(all.zip)==0) {
        
        messages <- c(messages, paste("No zip files found for site", tmp.files[[i]]$data$siteCode,
                                      "and month", tmp.files[[i]]$data$month, sep=" "))
        next
        
      }

      # if package==expanded, check that expanded package exists
      # if it doesn't, download basic package
      pk <- package
      if(pk=="expanded") {
        if(length(grep(pk, tmp.files[[i]]$data$files$name))==0) {
          pk <- "basic"
          messages <- c(messages, paste("No expanded package found for site ",
                                        tmp.files[[i]]$data$siteCode, " and month ",
                                        tmp.files[[i]]$data$month,
                                        ". Basic package downloaded instead.",
                                        sep=""))
        }
      }

      # subset to package
      which.zip <- intersect(grep(pk, tmp.files[[i]]$data$files$name, fixed=T),
                             grep(".zip", tmp.files[[i]]$data$files$name, fixed=T))

      # check again for no files
      if(length(which.zip)==0) {
        messages <- c(messages, paste("No basic package files found for site",
                                      tmp.files[[i]]$data$siteCode,
                                      "and month", tmp.files[[i]]$data$month, sep=" "))
        next
      }

      zip.urls <- rbind(zip.urls, cbind(tmp.files[[i]]$data$files$name[which.zip],
                                        tmp.files[[i]]$data$files$url[which.zip],
                                        tmp.files[[i]]$data$files$size[which.zip],
                                        rep(tmp.files[[i]]$data$release, 
                                            length(tmp.files[[i]]$data$files$name[which.zip]))))
    }
    }
  }

  # check for no files
  if(is.null(nrow(zip.urls))) {
    writeLines(paste0(messages[-1], collapse = "\n"))
    message(paste("No files found. This indicates either your internet connection failed, or the API is temporarily unavailable, or the data available for ",
               dpID,
               " are all hosted elsewhere. Check the data portal data.neonscience.org for outage alerts, and check the ",
               dpID, " data download page for external links.", sep=""))
    return(invisible())
  }

  # get size info
  zip.urls <- data.frame(zip.urls, row.names=NULL)
  colnames(zip.urls) <- c("name", "URL", "size", "release")
  zip.urls$URL <- as.character(zip.urls$URL)
  zip.urls$name <- as.character(zip.urls$name)
  zip.urls$size <- as.character(zip.urls$size)
  zip.urls$release <- as.character(zip.urls$release)
  
  # check for bad table name
  if(tabl!="all" & length(grep(paste("[.]", tabl, "[.]", sep=""), zip.urls$name))==0) {
    message(paste("No files found for ", tabl, ". Check that this is a valid table name in ", 
               dpID, ".", sep=""))
    return(invisible())
  }

  writeLines(paste0(messages[-1], collapse = "\n"))

  return(zip.urls)
}
