R/mf_download_data.R

#' @name mf_download_data
#' @aliases mf_download_data
#' @title  Download several datasets given their URLs and destination path
#' @description This function downloads datasets. In a data import workflow, it is typically used after a call to the \link{mf_get_url} function: the output value of \link{mf_get_url} can be used as the input of the parameter \code{df_to_dl} of \link{mf_download_data}.
#'
#' The download can be parallelized.
#'
#' @inheritParams mf_get_url
#' @inheritParams mf_login
#' @param df_to_dl data.frame. URLs and destination files of the datasets to download. Typically the output of \link{mf_get_url}. See Details for the expected structure.
#' @param path string. Target folder for the downloaded data. Default: temporary folder.
#' @param parallel boolean. Parallelize the download? Default: FALSE.
#' @param num_workers integer. Number of workers in case of parallel download. Default: number of cores available on the machine minus one.
#' @param min_filesize integer. Minimum expected file size (in bytes) for a downloaded file. A file smaller than this value is considered a failed download and will be downloaded again. Default: 5000.
#'
#' @return a data.frame with the same structure as the input data.frame \code{df_to_dl}, plus columns providing details on the downloaded data. The additional columns are:
#' \describe{
#' \item{fileDl}{Boolean (whether the dataset was downloaded or not)}
#' \item{dlStatus}{Download status: 1 = download ok; 2 = download error; 3 = dataset already present in the destination folder}
#' \item{fileSize}{File size on disk}
#' }
#'
#' @details
#'
#' Parameter \code{df_to_dl} must be a data.frame with the following minimal structure:
#' \describe{
#' \item{id_roi}{An id for the ROI (character string)}
#' \item{collection}{Collection (character string)}
#' \item{name}{Name of the file to download (character string)}
#' \item{url}{URL of the file to download (character string)}
#' }
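#'
#' As an illustration, a minimal hand-built \code{df_to_dl} (with purely
#' illustrative values and a placeholder URL) could look like:
#'
#' \preformatted{
#' df_to_dl <- data.frame(
#'   id_roi = "roi_test",
#'   collection = "MOD11A1.061",
#'   name = "my_dataset.nc4",
#'   url = "https://example.com/path/to/my_dataset.nc4"
#' )
#' }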
#'
#' @import dplyr parallel httr
#' @importFrom utils write.csv URLdecode
#' @importFrom purrr map_lgl
#' @export
#'
#' @examples
#'
#' \dontrun{
#'
#' ### Login to EOSDIS Earthdata with your username and password
#' log <- mf_login(credentials = c("earthdata_un","earthdata_pw"))
#'
#' ### Set-up parameters of interest
#' coll <- "MOD11A1.061"
#'
#' bands <- c("LST_Day_1km","LST_Night_1km")
#'
#' time_range <- as.Date(c("2017-01-01","2017-01-30"))
#'
#' roi <- sf::st_as_sf(data.frame(
#'   id = "roi_test",
#'   geom = "POLYGON ((-5.82 9.54, -5.42 9.55, -5.41 8.84, -5.81 8.84, -5.82 9.54))"),
#'   wkt = "geom", crs = 4326)
#'
#' ### Get the URLs of the data
#' (urls_mod11a1 <- mf_get_url(
#'   collection = coll,
#'   variables = bands,
#'   roi = roi,
#'   time_range = time_range
#' ))
#'
#' ### Download the data
#' res_dl <- mf_download_data(urls_mod11a1)
#'
#' ### Import the data as a terra::SpatRaster
#' modis_ts <- mf_import_data(dirname(res_dl$destfile[1]), collection = coll)
#'
#' ### Plot the data
#' terra::plot(modis_ts)
#'
#'}


mf_download_data <- function(df_to_dl,
                             path = tempfile("modisfast_"),
                             parallel = FALSE,
                             num_workers = parallel::detectCores() - 1,
                             credentials = NULL,
                             verbose = TRUE,
                             min_filesize = 5000) {

  fileSize <- destfile <- fileDl <- folders <- readme_files <- source <-  NULL

  source <- "earthdata"

  # tests
  if(!inherits(verbose,"logical")){stop("verbose argument must be boolean\n")}
  if(!inherits(parallel,"logical")){stop("parallel argument must be boolean\n")}
  #if(!is.null(source) && !inherits(source,"character")){stop("source argument must be either NULL or 'earthdata' \n")}
  if(!inherits(df_to_dl,"data.frame")){stop("df_to_dl argument must be a data.frame\n")}
  if(!("url" %in% colnames(df_to_dl))){stop("df_to_dl argument must be a data.frame with at least 4 columns named 'url', 'collection', 'name', and 'id_roi' \n")}
  if(!("collection" %in% colnames(df_to_dl))){stop("df_to_dl argument must be a data.frame with at least 4 columns named 'url', 'collection', 'name, and 'id_roi' '\n")}
  if(!("name" %in% colnames(df_to_dl))){stop("df_to_dl argument must be a data.frame with at least 4 columns named 'url', 'collection', 'name, and 'id_roi' '\n")}
  if(!("id_roi" %in% colnames(df_to_dl))){stop("df_to_dl argument must be a data.frame with at least 4 columns named 'url', 'collection', 'name, and 'id_roi' '\n")}
  if(num_workers>parallel::detectCores()){stop("the number of workers that you set is greater than the number of cores available on your machine\n")}

  .testInternetConnection()

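  # build the destination path of each dataset: <path>/data/<id_roi>/<collection>/<name>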
  df_to_dl$destfile <- file.path(path,"data",df_to_dl$id_roi,df_to_dl$collection,df_to_dl$name)

 # if(dir.exists(path)){warning("Target folder already exists\n")}

  # check which data is already downloaded
  data_dl<-df_to_dl %>%
    dplyr::mutate(fileDl=file.exists(destfile)) %>%
    dplyr::mutate(fileSize=ifelse(fileDl==TRUE,file.size(destfile),NA)) %>%
    dplyr::mutate(fileDl=ifelse(fileDl==TRUE & fileSize>=min_filesize,TRUE,FALSE)) %>%
    dplyr::mutate(dlStatus=ifelse(fileDl==TRUE,3,NA))

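  # delete files smaller than min_filesize: they are considered failed
  # downloads and will be downloaded again below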
  file.remove(data_dl$destfile[which(data_dl$fileSize < min_filesize)])

  # data already downloaded
  data_already_exist<-data_dl %>%
    dplyr::filter(fileDl==TRUE)

  # data to download
  data_to_download<-data_dl %>%
    dplyr::filter(fileDl==FALSE)

  if(verbose){cat(nrow(df_to_dl),"datasets in total:",nrow(data_already_exist),"already downloaded and",nrow(data_to_download),"datasets to download\n")}

  if (nrow(data_to_download)>0){

    # Create directories if they do not exist
    unique(dirname(data_to_download$destfile)) %>%
      lapply(dir.create, recursive = TRUE, showWarnings = FALSE) # , mode = "0777"

    # download data
    #for (i in 1:nrow(data_to_download)){
    #    httr::GET(data_to_download$url[i],httr::authenticate(username,password),write_disk(data_to_download$destfile[i]))
    # }
    if(!is.null(source)){
      if(source=="earthdata"){
        .testLogin(credentials)
        username<-getOption("earthdata_user")
        password<-getOption("earthdata_pass")
      }
    } else {
      username <- password <- "no_auth"
    }

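    # download helper: a first GET resolves the final (possibly redirected) URL,
    # then an authenticated GET streams the file to disk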
    dl_func <- function(url, output, username, password) {
      u <- httr::GET(url)
      httr::GET(u$url,
                httr::authenticate(username, password),
                httr::write_disk(output),
                httr::progress(),
                config = list(maxredirs = -1))
      # GET(u$url, httr::write_disk(output), httr::progress(), config(maxredirs=-1, netrc = TRUE, netrc_file = netrc), set_cookies("LC" = "cookies"))
    }

    if(verbose){cat("Downloading the data...\n")}
    if (parallel){
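      # parallel mode: one download task per file, dynamically scheduled across the workers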
      cl <- parallel::makeCluster(num_workers)
      parallel::clusterMap(cl, dl_func, url=data_to_download$url,output=data_to_download$destfile,username=username,password=password,
                           .scheduling = 'dynamic')
      parallel::stopCluster(cl)
    } else {
      for (i in seq_len(nrow(data_to_download))){
        if(verbose){cat("[",i," of ",nrow(data_to_download),"]\n")}
        dl_func(url=data_to_download$url[i],output=data_to_download$destfile[i],username=username,password=password)
      }
    }
  }
  data_dl<-data_to_download %>%
    dplyr::mutate(fileDl=purrr::map_lgl(destfile,file.exists)) %>%
    dplyr::mutate(dlStatus=ifelse(fileDl==TRUE,1,2))  %>%
    dplyr::mutate(fileSize=file.size(destfile)) %>%
    rbind(data_already_exist)

  # to deal with pb when not all the data are downloaded
  data_downloaded <- dplyr::filter(data_dl,fileSize>=min_filesize)

  if(!(identical(data_dl,data_downloaded))){
    if(verbose){cat("Only part of the data has been downloaded. Downloading the remaining datasets one by one...\n")}
    data_dl <- mf_download_data(df_to_dl=df_to_dl,path=path,parallel=FALSE,credentials=credentials,verbose=verbose,min_filesize=min_filesize)#,source=source)
  } else {

  # 1 : download ok
  # 2 : download error
  # 3 : data already existing in output folder
    if(verbose){cat("\nData were all properly downloaded under the folder(s) ",paste(as.character(unique(dirname(df_to_dl$destfile))), collapse=" and "),"\n**To import the data in R, use the function modisfast::mf_import_data() rather than terra::rast() or stars::read_stars(). More info at help(mf_import_data)**\n")}
  }

  # write readme
  sentence <- paste0(
    "Query performed on the ", Sys.time(), "\n",
    "Use the function modisfast::mf_import_data() rather than terra::rast() or stars::read_stars() to import the data in R! More info at help(mf_import_data)\n",
    "See the file Summary_downloaded_data.csv for more information on the data downloaded"
  )
  writeLines(sentence, file.path(path,"Readme.txt"))
  # write csv dataset
  data_dl$url <- utils::URLdecode(data_dl$url)
  write.csv(data_dl, file.path(path,"Summary_downloaded_data.csv"), row.names = FALSE)

  return(data_dl)
}
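
# Illustrative usage (a sketch, assuming `urls_mod11a1` was obtained with
# mf_get_url() as in the @examples above):
#   res_dl <- mf_download_data(urls_mod11a1)
#   table(res_dl$dlStatus)            # 1 = download ok, 2 = download error, 3 = already present
#   unique(dirname(res_dl$destfile))  # folders where the files were written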
