wrappeR: A Pipeline for Biodiversity Indicators

Documented in createMetadata

#' \code{createMetadata} - Create master metadata table from objectstore data
#' repository
#' 
#' @description This function creates the metadata from a hierarchical data
#'  structure. It assumes a structure of:
#'  file_location/<taxa>/<data_type>/<dataset_name>. 
#'  It assumes that all dataset names contain the year of entry (in the
#'  format 2019 or 2019.2 for the second dataset in 2019). It also assumes
#'  that there is an equivalent folder at:
#'  file_location/<taxa>/input_data/<dataset_name>
#'  Which contains the input data.
#'  A lot of the parameters are hardcoded, and as such it is unlikely to return
#'  perfect metadata if the structure of outputs is altered.
#'  This takes about an hour to run as of 12 May 2021. You can pass it the
#'  previous metadata as a parameter, so it will not run metadata collection
#'  on any old datasets.
#'
#' @param file_location Path to root directory of data outputs on object store
#' @param oldMetadata if TRUE metadata will be updated from a dataframe object 
#'  previously created by this function, the most recent. 
#'  All rows of data in this object will not be run. Note that if data has changed
#'  in any of the datasets contained within the old metadata dataframe, these
#'  will not be re-run. Defaults to TRUE 
#' 
#' @return Dataframe of metadata
#' 
#' @import pbapply
#' @import stringr
#' @import dplyr
#'         
#' @export

createMetadata <- function(file_location, oldMetadata = TRUE){ # changed oldMetadata so it is now default to produce this from most recent file in /data-s3/most_recent_meta
  exceptions <- read.csv("/data-s3/metadata/exceptions.csv", ## need to create this!
                         stringsAsFactors = FALSE)
  metadata <- suppressWarnings({pblapply(list.files(file_location), function(group){
    cat('Creating metadata for',group,'\n')
    data_types <- list.files(file.path(file_location, group))
    # We don't want the input data yet, we'll link that later
    data_types <- data_types[data_types != 'input_data']
    
    # Create a placeholder empty dataframe
    ds_placeholder <- data.frame(group = group,
                                 data_type = 'None',
                                 dataset_name = 'None',
                                 data_location = 'None',
                                 input_file = 'None',
                                 year = 'None',
                                 most_recent_year = 'None',
                                 most_recent = FALSE,
                                 min_year  = 'NA',
                                 max_year = 'NA',
                                 n_species = 'NA',
                                 n_species_input = 'NA',
                                 regions = 'NA',
                                 regions_aggs = 'NA',
                                 sparta_v = 'NA',
                                 provenance = 'NA',
                                 user = 'NA',
                                 date = 'NA',
                                 stringsAsFactors = FALSE)
    if(length(data_types)==0){
      # No output data types, just return placeholder
      ds <- ds_placeholder
    } else {
      # There's some data types. So get the metadata
      ds <- lapply(data_types, FUN = function(data_type){
        datasets <- list.files(file.path(file_location, group, data_type))
        if(length(datasets)==0){
          # We have no datasets in this data_type folder. Return appropriate dataframe
          df <- ds_placeholder
          df$data_type <- data_type
          return(df)
        } else {
          alreadyRun <- NULL
          if(oldMetadata == TRUE){
            
            all_meta <- gsub(".csv", "", list.files("/data-s3/most_recent_meta"))
            
            most_recent_meta <- grep(as.character(max(
              as.numeric(
                substr(
                  all_meta, 
                  start = 10, 
                  stop = nchar(all_meta))))), 
              all_meta, value = TRUE)
              

            old_meta <- read.csv(file.path("/data-s3/most_recent_meta", 
                                           paste0(most_recent_meta, ".csv")))
            
            datasetsToRun <- !(file.path(file_location, group, data_type, datasets) %in%
                                 old_meta$data_location)
            alreadyRun <- old_meta[old_meta$data_location %in%
                                        file.path(file_location, group, data_type, datasets),]
            if(!any(datasetsToRun)){
              # All our datasets are in metadata already. Just return the previous metadata
              return(alreadyRun)
            } else {
              # We have new datasets. Subset the datasets to the new ones.
              datasets <- datasets[datasetsToRun]
            }
          }
          
          # Find the year of the datasets
          # Later datasets in the same year are listed as XXXX.1 or XXXX.2 etc
          strregex <- '[0-9]{4}([\\.0-9]+)?'
          datasets_with_years <- grepl(strregex, datasets)
          years <- lapply(str_extract(string = datasets,
                                      pattern = strregex), as.numeric) %>% unlist()
          # Find the maximum year, but first we have to drop NAs
          years_num <- years[!is.na(years)]
          if(length(years_num)>0){
            most_recent <- max(years_num)
          } else {
            most_recent <- 'Unknown'
          }
          # For all years where the regex failed, return "Unknown"
          years[is.na(years)] <- 'Unknown'
          
          df <- lapply(1:length(datasets), FUN = function(i){
            # Build data location
            data_location <- file.path(file_location, group, data_type, datasets[i])
            
            # Find input data
            input_path <- file.path(file_location, group, 'input_data', datasets[i])
            if(!dir.exists(input_path)){
              input_file <- 'Not Available'
              n_species_input <- 'Unknown'
            } else {
              input_file <- list.files(input_path, full.names = TRUE)
              if(length(input_file)>1){
                # There's more than 1. Perhaps one is the 'visitData' formatted file. Let's check.
                tmp_input <- input_file[!grepl('visitData',basename(input_file))]
                if(length(tmp_input)==1){
                  input_file <- tmp_input
                } else {
                  # We still have more than one. Maybe one is a source csv?
                  tmp_input <- input_file[!grepl('csv$',basename(input_file))]
                  if(length(tmp_input)==1){
                    input_file <- tmp_input
                  } else {
                    # Oh well, we still have two. Return 'Multiple Available'
                    input_file <- 'Multiple Available' 
                    n_species_input <- 'Unknown'
                  }
                }
              } else if(length(input_file)==0){
                input_file <- 'Not Available'
                n_species_input <- 'Unknown'
              }
            }
            
            if(!(input_file %in% c('Multiple Available', 'Not Available'))){
              speciesListInput <- suppressWarnings({loadRfile(input_file)})
              if('spp_vis' %in% names(speciesListInput)){
                # Visit Data format. Find the names
                speciesList <- as.character(names(speciesListInput$spp_vis)[-1])
              } else {
                # Probably standard format. Find names in first column (hopefully species)
                if('CONCEPT' %in% colnames(speciesListInput)){ ## kattur - made if, so that can use for old and new format
                  speciesList <- as.character(unique(data.frame(speciesListInput)[,'CONCEPT']))} ## kattur - changed to CONCEPT 
                ## rather than [1], because 
                ## not always the first 
                ## column (e.g., Ellcur Trichoptera)
                
                if('species_long' %in% colnames(speciesListInput)){
                  speciesList <- as.character(unique(data.frame(speciesListInput)[,'species_long']))} ## kattur new format started with this
                
                if('long_name' %in% colnames(speciesListInput)){
                  speciesList <- as.character(unique(data.frame(speciesListInput)[,'long_name']))} ## kattur new format plan to use for future
              }
              n_species_input <- length(speciesList)
            } else {
              speciesListInput <- NULL
              
              n_species_input <- 'Unknown'  ## kattur - added here to match the fact that
              ## allocated in the 'if' above
            }
            
            # Pull in some key metadata
            ourfiles <- list.files(data_location, full.names = TRUE)
            ourfiles <- ourfiles[!file.info(ourfiles)$isdir]
            if(length(ourfiles)==0){
              # There's no data. Return what we can.
              dq <- ds_placeholder
              dq$dataset_name <- datasets[i]
              dq$data_type <- data_type
              dq$data_location <- data_location
              dq$input_file <- input_file
              dq$n_species_input <- n_species_input
              dq$year <- as.character(years[i])
              dq$most_recent_year <- as.character(most_recent)
              dq$most_recent <- years[i]==most_recent
            } else {
              # We have data.
              # If the files are daisychained, we need the first item in a daisychain Check for that
              firstRun <-
                str_extract(ourfiles, pattern = '(?<=_)[0-9]+(?=_[1-3]{1}\\.[RrDdSsAaTt]+$)') %>%
                as.numeric()
              if(any(!is.na(firstRun))){
                # We have daisychains
                minDaisy <- min(firstRun[!is.na(firstRun)])
                refFile <- ourfiles[which(firstRun == minDaisy)[1]]
              } else {
                #No daisies. Just return the first file
                refFile <- ourfiles[1]
              }
              min_year <- max_year <- regions <- regions_aggs <- sparta_v <-
                provenance <- user <- submit_date <- n_species  <- 'Unknown' 
              ## kattur removed <- n_species_input, 
              ## because n_species_input has already been
              ## allocated above and this was over-riding it
              
              if(length(ourfiles)==0){
                # We have no outputs. Just set n_species to 0 so we know we tried to find data
                n_species <- 0
              } else {
                if(data_type == "occmod_outputs"){
                  tryCatch({
                    out <- suppressWarnings({loadRfile(refFile)})
                    min_year <- out$min_year
                    max_year <- out$max_year
                    
                    # This bit of code finds the input species, then checks each one off
                    # against the input species list.
                    # It's a bit slow, but only needs to be run once per metadata output,
                    # and is probably more trustworthy than the hacky regex option
                    if(!is.null(speciesListInput)){
                      n_species <- lapply(speciesList, FUN = function(species){
                        any(grepl(tolower(species),tolower(basename(ourfiles))))
                      }) %>% unlist() %>% sum()
                    } else {
                      # We don't have the relevant input data, so instead use a hacky bit of regex
                      n_species <- length(unique(sub("_[0-9]+_[1-3]{1}\\.[RrDdSsAaTt]+$",
                                                     "",
                                                     basename(ourfiles))))
                    }
                    
                    regions <- paste(out$regions, collapse = "; ")
                    regions_aggs <- paste(names(out$region_aggs), collapse = "; ")
                    if(!is.null(attr(out, "metadata"))){
                      metadata_analysis <- attr(out, "metadata")$analysis
                      if('session_info' %in% names(metadata_analysis)){
                        sparta_v <- metadata_analysis$session_info[[2]][['sparta']]
                      } else {
                        sparta_v <- metadata_analysis$session.info$otherPkgs$sparta$Version
                      }
                      params <- lapply(c('provenance','user','date'), FUN = function(parameter){
                        output_param <- metadata_analysis[[parameter]]
                        if(is.null(output_param)) output_param <- 'Unknown'
                        as.character(output_param)
                      })
                      provenance <- params[[1]]
                      user <- params[[2]]
                      submit_date <- params[[3]]
                    } else {
                      sparta_v <- provenance <- user <- submit_date <- 'Unknown'
                    }
                  }, error=function(e){
                    min_year <- max_year <- n_species <- regions <- regions_aggs <-
                      sparta_v <- provenance <- user <- submit_date <- 'Unknown' ## kattur removed <- n_species_input, 
                    ## because n_species_input has already been
                    ## allocated above
                  })
                } else {
                  tryCatch({
                    out <- suppressWarnings({loadRfile(refFile)})
                    n_species <- length(unique(sub("_[0-9]+_[1-3]{1}\\.[RrDdSsAaTt]+$",
                                                   "",
                                                   basename(ourfiles))))
                    if(class(out)=='data.frame'){
                      if('CONCEPT' %in% names(out)){
                        n_species <- length(unique(out$CONCEPT))
                      }
                      if('species_long' %in% names(out)){
                        n_species <- length(unique(out$species_long)) # kattur
                      }
                      
                      
                      if('long_name' %in% names(out)){
                        n_species <- length(unique(out$long_name)) # kattur
                      }
                      
                    } else {
                      if('spp_vis' %in% names(out)){
                        n_species <- dim(out$spp_vis[-1])[2]
                      }
                    }
                    
                    if('min_year' %in% names(out)){
                      min_year <- out$min_year
                    }
                    if('max_year' %in% names(out)){
                      max_year <- out$max_year
                    }
                    if('regions' %in% names(out)){
                      regions <- paste(out$regions, collapse = "; ")
                    }
                    if('regions_aggs' %in% names(out)){
                      regions_aggs <- paste(names(out$regions_aggs), collapse = "; ")
                    }
                  }, error=function(e){
                    min_year <- max_year <- n_species <- regions <- regions_aggs <-
                      sparta_v <- provenance <- user <- submit_date <- 'Unknown'  ## kattur removed <- n_species_input, 
                    ## because n_species_input has already been
                    ## allocated above
                  })
                }
              }
              dq <-
                data.frame(group = group,
                           data_type = data_type,
                           dataset_name = datasets[i],
                           data_location = data_location,
                           input_file = input_file,
                           year = as.character(years[i]),
                           most_recent_year = as.character(most_recent),
                           most_recent = ifelse(datasets[i] %in% exceptions$dataset_name,
                                                FALSE,
                                                years[i]==most_recent),
                           min_year = as.character(min_year),
                           max_year = as.character(max_year),
                           n_species = as.character(n_species),
                           n_species_input = as.character(n_species_input),
                           regions = regions,
                           regions_aggs = regions_aggs,
                           sparta_v = sparta_v,
                           provenance = provenance,
                           user = user,
                           date = submit_date,
                           stringsAsFactors = FALSE)
            }
            # dq is the dataframe for a specific dataset within a data type within a group
            # i.e. will have a unique 'group + data_type + dataset_name' combination
            return(dq)
          }) %>% bind_rows()
          df <- bind_rows(alreadyRun, df)
          # df is the dataframe for all datasets with a data type within a group
          # i.e. will be the dataframe for a unique 'group + data_type' combination
          
          
          # check if multiple most_recent == TRUE [kattur]
          df_true <- df[df$most_recent == TRUE,]
          
          df_false <- df[df$most_recent == FALSE,]
          
          if(nrow(df_true) > 1 ){
          
            
            df_true$most_recent <-  ifelse(df_true$dataset_name %in% alreadyRun$dataset_name, FALSE,df_true$most_recent)
            
          }
          
          df <- rbind(df_false, df_true)
          # assume that any new are most recent unless in skip [kattur]
          ## still need to work out what to do if all most_recent are FALSE
          return(df)
        }
      }) %>% bind_rows()
    }
    # ds is the dataframe for all datasets within all data types within a group
    # i.e. will be the dataframe for a 'group'
    return(ds)
  })}) %>% bind_rows()
  # metadata combines all groups, their sub-'data_types' and sub-'datasets'
  return(metadata)
}