R/Site_metadata.R


# Author: ned haughton

# Uses the rvest library for read_html, html_node, html_attr and html_table,
# and jsonlite for read_json


#################################################
# General metadata functions
################################################

#' Empty site metadata template, including only the site code
#'
#' @return metadata list
site_metadata_template <- function(site_code) {
    metadata <- list(
        SiteCode = site_code,
        Fullname = NA,
        SiteLatitude = NA,
        SiteLongitude = NA,
        SiteElevation = NA,
        IGBP_vegetation_short = NA,
        IGBP_vegetation_long = NA,
        TowerHeight = NA,
        CanopyHeight = NA,
        Tier = NA,
        Exclude = FALSE,
        Exclude_reason = NA,
        Description = NA,
        TowerStatus = NA,
        Country = NA,
        MeasurementHeight = NA,
        VegetationDescription = NA,
        SoilType = NA,
        Disturbance = NA,
        CropDescription = NA,
        Irrigation = NA
    )

    return(metadata)
}


#' Get site name from metadata
get_site_code <- function(metadata){
    return(metadata[["SiteCode"]])
}


#' Gets the git version from the installed package
#' See src/zzz.R for how git revision is discovered
get_git_version <- function() {

    #Initialise warnings
    warnings <- ""

    desc <- read.dcf(system.file("DESCRIPTION", package = "FluxnetLSM"))
    if ("git_revision" %in% colnames(desc)) {
        git_rev <- desc[1, "git_revision"]
    } else if ("RemoteSha" %in% colnames(desc)) {
        git_rev <- desc[1, "RemoteSha"]
    } else {
        git_rev <- "UNKNOWN"

        warn <- paste("Unknown git revision of FluxnetLSM. Please",
                      "visit https://github.com/aukkola/FluxnetLSM and",
                      "review the installation procedure")
        warnings <- append_and_warn(warn=warn, warnings)
    }
    return(list(git_rev=git_rev, warn=warnings))
}


#' Adds processor metadata, including processor version
#'
#' @return metadata list
add_processing_metadata <- function(metadata) {

    # Get the git revision (and a warning if the revision could not be determined)
    git_rev <- get_git_version()

    metadata$Processing <- list(
        processor = "FluxnetLSM",
        URL = "https://github.com/aukkola/FluxnetLSM",
        version = packageVersion("FluxnetLSM"),
        git_rev = git_rev$git_rev
    )

    return(list(out=metadata,warn=git_rev$warn))
}


#' Updates old metadata with new metadata, ignoring NAs, and warning on differences
#'
#' @return metadata list
update_metadata <- function(metadata, new_metadata, overwrite=TRUE) {
    for (n in names(new_metadata)) {
        if (!is.na(new_metadata[[n]])) {  # Don't overwrite with empty data
            if (n %in% names(metadata) && !is.na(metadata[[n]])) { # Old data already exists, check
                different <- FALSE
                if (!is.numeric(metadata[[n]])) {
                    if (new_metadata[[n]] != metadata[[n]]) { # non-numeric and different
                        different <- TRUE
                    }
                } else { # numeric
                    if (round(new_metadata[[n]], 4) != round(metadata[[n]], 4)) { # and different
                        different <- TRUE
                    }
                }

                if (different) {
                    overwrite_text = if (overwrite) "Overwriting" else "Not overwriting"

                    warn <- paste0("New metadata for ", n, " has different values! ",
                                   overwrite_text, ".\n",
                                   "  old: ", metadata[n],
                                   ", new: ", new_metadata[n])
                    message(warn)

                    if (overwrite) {
                        metadata[n] <- new_metadata[n]
                    }

                }
            } else {  # old data is empty or doesn't exist
                metadata[n] <- new_metadata[n]
            }

        } else if (is.na(new_metadata[[n]]) && !(n %in% names(metadata))) {
            # New value is NA but the field doesn't exist in the old metadata;
            # save it as-is so the metadata field isn't lost
            metadata[n] <- new_metadata[n]
        }
    }

    return(metadata)
}
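
# Illustrative sketch of how update_metadata() resolves conflicts (not executed;
# the site code and values below are hypothetical):
if (FALSE) {
    old <- site_metadata_template("XX-Abc")
    old$SiteLatitude <- -35.65660
    new <- list(SiteLatitude = -35.65662,  # equal after rounding to 4 decimals, so no warning
                CanopyHeight = 40,         # gap-fills the missing old value
                Fullname = NA)             # NA entries for existing fields are ignored
    old <- update_metadata(old, new)
}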

################################################
# CSV-stored metadata
################################################

# Find site info file path (not using the data() command directly because it reads
# the CSV with a semicolon separator, which leads to incorrect table headers)

#' Tries to gather metadata from the included site CSV
#'
#' @return metadata list
#' @export
get_site_metadata_from_CSV <- function(
    metadata = NA,
    incl_processing = TRUE,
    model = NA,  # NA (the default) means no model parameters are required from the CSV
    site_csv_file = system.file("extdata", "Site_metadata.csv", package = "FluxnetLSM")
    ) {

    if (!is.list(metadata)) {
        metadata <- site_metadata_template(metadata)
    }

    # Allow an external metadata file to be passed via the site_csv_file
    # argument instead of the hard-coded default
    csv_data <- try(read.csv(site_csv_file, header = TRUE,
                    stringsAsFactors = FALSE))

    if (is.na(metadata[1])) {  # [1] to skip if site_code is set
        # get all existing metadata as a list of lists
        message("Loading metadata for all sites from csv_data cache (", site_csv_file, ")")
        metadata <- lapply(row.names(csv_data), function(row) {
            as.list(csv_data[row, ])
        })
        names(metadata) <- csv_data$SiteCode
        return(metadata)
    }

    site_code <- get_site_code(metadata)

    message("Loading metadata for ", site_code, " from csv_data cache (", site_csv_file, ")")

    #Found site code in CSV
    if (site_code %in% csv_data$SiteCode) {
        csv_row <- as.list(csv_data[csv_data$SiteCode == site_code, ])
        metadata = update_metadata(metadata, csv_row)
       
    #Didn't find it (stop if trying to pass model parameters) 
    } else {
        message("    ", site_code, " not found in csv_data file")
      
        if(!is.na(model)) stop("Cannot read model parameters, site not found in CSV metadata file. ",
                             "Please amend CSV file or set model=NA")
    }

    if (incl_processing) {
        warnings <- ""
        warn_missing_metadata(metadata)

        metadata <- add_processing_metadata(metadata)
        warnings <- append_and_warn(warn=metadata$warn, warnings, call=FALSE)
        metadata <- metadata$out

        return(list(out=metadata,warn=warnings))
    } else {
        return(metadata)
    }
}
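
# Illustrative usage sketch (not executed; "AU-Tum" is just an example site code
# and is assumed to be present in the bundled CSV):
if (FALSE) {
    md <- get_site_metadata_from_CSV("AU-Tum", incl_processing = FALSE, model = NA)
    md$SiteLatitude
    # Passing metadata = NA instead returns a named list with metadata for every
    # site found in the CSV
    all_md <- get_site_metadata_from_CSV(NA, incl_processing = FALSE, model = NA)
}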


#' Writes metadata to CSV, only updating non-NA data
save_metadata_to_csv <- function(metadata) {
    save_metadata_list_to_csv(list(metadata))
}


#' Writes metadata for multiple sites to the CSV at once
save_metadata_list_to_csv <- function(metadata_lists) {
    # Assumes the bundled metadata CSV (same default as get_site_metadata_from_CSV)
    site_csv_file <- system.file("extdata", "Site_metadata.csv", package = "FluxnetLSM")

    old_csv_data <- read.csv(site_csv_file, header = TRUE,
                             stringsAsFactors = FALSE)

    new_csv_data <- metadata_list_to_dataframe(metadata_lists)

    common_names <- intersect(names(old_csv_data), names(new_csv_data))
    common_names <- common_names[common_names != "SiteCode"]

    # Merge new and existing datasets, preferring new.
    csv_data <- merge(new_csv_data, old_csv_data,
                      all = TRUE, by = "SiteCode")
    for (n in common_names) {
        # Use old data only if new data is missing.
        csv_data[[n]] <- ifelse(is.na(csv_data[[paste0(n, ".x")]]),
                                csv_data[[paste0(n, ".y")]],
                                csv_data[[paste0(n, ".x")]])
        csv_data[[paste0(n, ".x")]] <- NULL
        csv_data[[paste0(n, ".y")]] <- NULL
    }

    csv_data <- csv_data[order(csv_data$SiteCode), ]

    # Fix new lines in Descriptions
    csv_data$Description <- gsub("\\n", "\\\\n", csv_data$Description)

    write.csv(csv_data, file = site_csv_file, row.names = FALSE)
}
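
# Minimal sketch of the merge-and-prefer-new logic above (hypothetical data; it
# relies on merge() appending its default ".x"/".y" suffixes to shared columns):
if (FALSE) {
    new <- data.frame(SiteCode = "XX-Abc", CanopyHeight = 40)
    old <- data.frame(SiteCode = c("XX-Abc", "XX-Def"), CanopyHeight = c(NA, 12))
    m <- merge(new, old, all = TRUE, by = "SiteCode")  # gives CanopyHeight.x and .y
    m$CanopyHeight <- ifelse(is.na(m$CanopyHeight.x), m$CanopyHeight.y, m$CanopyHeight.x)
    # XX-Abc keeps the new value (40), XX-Def keeps the old value (12)
}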


#' Convert a list of metadata lists to a dataframe
metadata_list_to_dataframe <- function(metadata_lists) {

    to_save <- list("SiteCode", "Fullname", "Description", "TowerStatus",
                    "Country", "SiteLatitude", "SiteLongitude", "SiteElevation",
                    "IGBP_vegetation_short", "IGBP_vegetation_long",
                    "TowerHeight", "CanopyHeight", "Tier"
                    )

    csv_data <- data.frame()

    for (metadata in metadata_lists) {
        site_code <- metadata$SiteCode
        for (v in to_save) {
            if (v %in% names(metadata) && !is.na(metadata[[v]])) {
                csv_data[site_code, v] <- metadata[[v]]
            }
        }
    }

    return(csv_data)
}
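
# Illustrative sketch (not executed; hypothetical site codes and values):
if (FALSE) {
    md1 <- site_metadata_template("XX-Abc"); md1$CanopyHeight <- 40
    md2 <- site_metadata_template("XX-Def"); md2$SiteElevation <- 1200
    metadata_list_to_dataframe(list(md1, md2))  # one row per site, NA where a field is unset
}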


#' Updates the site metadata CSV with site codes and metadata gathered from the
#' web (fluxdata.org; ORNL gap-filling is currently disabled below)
#' @export
update_csv_from_web <- function() {
    csv_data <- get_site_metadata_from_CSV(incl_processing=FALSE)

    csv_site_codes <- names(csv_data)

    fluxdata_site_codes <- get_fluxdata_org_site_codes()
    ornl_site_codes <- get_ornl_site_codes()

    all_site_codes <- union(union(csv_site_codes, fluxdata_site_codes),
                            ornl_site_codes)

    metadata <- list()
    for (sc in all_site_codes) {
        site_md <- site_metadata_template(sc)
        if (sc %in% csv_site_codes) {
            site_md <- update_metadata(site_md, csv_data[[sc]])
        }
        if (any(check_missing(site_md)) && sc %in% fluxdata_site_codes) {
            # Overwrite with fluxdata.org data
            site_md <- get_fluxdata_org_site_metadata(site_md)
        }
        #if (any(check_missing(site_md)) && sc %in% ornl_site_codes) {
        #    # Don't overwrite with ORNL data, just gapfill
        #    site_md <- get_ornl_site_metadata(site_md, overwrite = FALSE)
        #}
        metadata[[sc]] <- site_md
    }

    message("Saving metadata to ", site_csv_file)
    save_metadata_list_to_csv(metadata)
}
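
# Illustrative usage sketch (not executed; needs internet access and write access
# to the installed package's extdata/Site_metadata.csv):
if (FALSE) {
    update_csv_from_web()
}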

################################################
# Web-based metadata
################################################

### fluxnet.ornl.gov ###

#' Get all available site codes from site_status table
get_ornl_site_codes <- function() {

    library(rvest)
    status_table_url <- "https://fluxnet.ornl.gov/site_status"

    page_html <- read_html(status_table_url)

    table_data <- page_html %>% html_node("#historical_site_list") %>% html_table()

    site_codes <- sort(table_data[["FLUXNET ID"]])

    return(site_codes)

}


#' Get a single ORNL site URL from site_status table
get_site_ornl_url <- function(site_code) {
    ornl_url <- get_ornl_site_url_list(list(site_code))[[site_code]]
    return(ornl_url)
}


#' Get a list of ORNL site URLs from site_status table
get_ornl_site_url_list <- function(site_code_list) {

    library(rvest)
    status_table_url <- "https://fluxnet.ornl.gov/site_status"

    page_html <- read_html(status_table_url)

    ornl_url_list <- list()

    for (site_code in site_code_list) {
        # looks for table cell with site code as contents, then looks up the parent
        # row, and finds the href of the first link.
        xpath <- paste0("//td[text()='", site_code, "']/..")
        trow <- tryCatch(page_html %>% html_node(xpath = xpath), error=function(e) NULL)
        if (inherits(trow, "xml_node")) {
            ornl_rel_url <- trow %>% html_node("a") %>% html_attr("href")
            ornl_url_list[[site_code]] <- paste0("https://fluxnet.ornl.gov/", ornl_rel_url)
        } else {
            message(site_code, " not found in table at https://fluxnet.ornl.gov/site_status")
        }
    }

    return(ornl_url_list)
}


#' Get metadata from ORNL
#'
#' @return metadata list
get_ornl_site_metadata <- function(metadata, site_url=NULL, overwrite=TRUE) {

    library(rvest)
    site_code <- get_site_code(metadata)

    if (is.null(site_url)) {
        site_url <- get_site_ornl_url(site_code)
        if (is.null(site_url)) {
            # site not found at ORNL
            return(metadata)
        }
        metadata$ORNL_URL <- site_url
    }

    message("Trying to load metadata for ", site_code, " from ORNL (", site_url, ")")

    page_html <- read_html(site_url)

    new_metadata = list()
    # General info
    table_data <- page_html %>% html_node("table#fluxnet_site_information") %>% html_table()
    new_metadata$Fullname <- table_data[table_data[1] == "Site Name:"][2]
    new_metadata$Description <- table_data[table_data[1] == "Description:"][2]
    new_metadata$TowerStatus <- table_data[table_data[1] == "Tower Status:"][2]

    # Location Information
    table_data <- page_html %>% html_node("table#fluxnet_site_location_information") %>% html_table()
    new_metadata$Country <- table_data[table_data[1] == "Country:"][2]
    lat_lon <- strsplit(table_data[table_data[1] == "Coordinates:(Lat, Long)"][2], ", ")[[1]]
    new_metadata$SiteLatitude <- round(as.numeric(lat_lon[1]), 5)  # round to ~1m
    new_metadata$SiteLongitude <- round(as.numeric(lat_lon[2]), 5)

    # Site Characteristics
    tryCatch({
        table_data <- page_html %>% html_node("table#fluxnet_site_characteristics") %>% html_table()
        elevation_text <- table_data[table_data[1] == "GTOPO30 Elevation:"][2]
        new_metadata$SiteElevation <- as.numeric(gsub("m", "", elevation_text))
        new_metadata$IGBP_vegetation_long <- table_data[table_data[1] == "IGBP Land Cover:"][2]
    }, error = function(cond) {
        message(site_code, " doesn't have a Site Characteristics table at ", site_url)
    })

    # ORNL doesn't have any of these:
    #    metadata$IGBP_vegetation_short = NULL,
    #    metadata$TowerHeight = NaN,
    #    metadata$CanopyHeight = NaN,
    #    metadata$Tier = NaN,
    #    metadata$Exclude = FALSE,
    #    metadata$Exclude_reason = NULL

    # TODO: ORNL has other potentially useful site information, affiliation info,
    # and investigator info. Should we use some?

    metadata = update_metadata(metadata, new_metadata, overwrite=overwrite)

    return(metadata)
}


### Fluxdata.org ###

#' Get all available site codes from the AmeriFlux/FLUXNET SiteSearch JSON service
get_fluxdata_org_site_codes <- function() {

    library(jsonlite)

    status_JSON_url <- "https://ameriflux-data.lbl.gov/AmeriFlux/SiteSearch.svc/SiteMapData/Fluxnet"

    site_data <- jsonlite::read_json(status_JSON_url)

    site_codes <- unlist(lapply(site_data, '[[', "SITE_ID"))

    return(site_codes)

}
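
# Illustrative sketch of the JSON structure this relies on (hypothetical excerpt;
# each element of the parsed list is assumed to carry a SITE_ID field):
if (FALSE) {
    site_data <- list(list(SITE_ID = "XX-Abc", SITE_NAME = "Example site"),
                      list(SITE_ID = "XX-Def", SITE_NAME = "Another site"))
    unlist(lapply(site_data, '[[', "SITE_ID"))  # "XX-Abc" "XX-Def"
}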


#' Construct the fluxdata.org site URL for a given site code
get_site_fluxdata_org_url <- function(site_code) {
    fluxdata_org_url <- paste0("http://sites.fluxdata.org/", site_code, "/")
    return(fluxdata_org_url)
}


#' Get metadata from Fluxdata.org
#'
#' @return metadata list
get_fluxdata_org_site_metadata <- function(metadata, site_url=NULL) {
    library(rvest)

    site_code <- get_site_code(metadata)

    new_metadata = list()

    if (is.null(site_url)) {
        site_url <- get_site_fluxdata_org_url(site_code)
        new_metadata$fluxdata_org_URL <- site_url
    }

    message("Trying to load new_metadata for ", site_code, " from Fluxdata.org (", site_url, ")")

    #Try to open site URL
    page_html <- tryCatch(read_html(site_url), error = function(e) NULL)

    # General info
    table_data <- tryCatch(page_html %>% html_node("table.maininfo") %>% html_table(),
                           error = function(e) NULL)
    if (!is.data.frame(table_data)) {
        message("No data available at ", site_url, " (", class(table_data), ")")
        return(metadata)
    }

    new_metadata$Fullname <- table_data[table_data[1] == "Site Name:"][2]
    new_metadata$SiteLatitude <- as.numeric(table_data[table_data[1] == "Latitude:"][2])
    new_metadata$SiteLongitude <- as.numeric(table_data[table_data[1] == "Longitude:"][2])

    elevation_text <- table_data[table_data[1] == "Elevation (m):"][2]
    elevation <- suppressWarnings(as.numeric(gsub("m", "", elevation_text)))
    if (!is.na(elevation)) {  # lots of Fluxdata.org elevations are missing, don't overwrite
        new_metadata$SiteElevation <- elevation
    }

    IGBP_text = strsplit(gsub("\\)", "", table_data[table_data[1] == "IGBP:"][2]), " \\(")[[1]]
    new_metadata$IGBP_vegetation_short <- IGBP_text[1]
    new_metadata$IGBP_vegetation_long <- IGBP_text[2]

    # Fluxdata.org doesn't have any of these:
    #    metadata$Description
    #    metadata$TowerStatus
    #    metadata$Country
    #    metadata$TowerHeight
    #    metadata$CanopyHeight
    #    metadata$Tier

    # TODO: fluxdata_org has other potentially useful site information, affiliation info,
    # and investigator info. Should we use some?

    metadata = update_metadata(metadata, new_metadata)

    return(metadata)
}


#' Tries to load metadata from known Fluxnet info sources on the web
#'
#' @return metadata list
#' @export
get_site_metadata_web <- function(metadata, incl_processing=TRUE) {

    if (!is.list(metadata)) {
        metadata <- site_metadata_template(metadata)
    }

    metadata <- get_fluxdata_org_site_metadata(metadata)

    #Commented this out as ORNL metadata no longer available
    #if (any(check_missing(metadata))) {
    #    # Don't overwrite with ORNL data, only gap-fill Fluxnet Data
    #    metadata <- get_ornl_site_metadata(metadata, overwrite=FALSE)
    #}

    # TODO: Add loaders for OzFlux, AmeriFlux, etc.

    if (incl_processing) {
        warnings <- ""
        warn_missing_metadata(metadata)

        metadata <- add_processing_metadata(metadata)
        warnings <- append_and_warn(warn=metadata$warn, warnings, call=FALSE)
        metadata <- metadata$out

        return(list(out=metadata,warn=warnings))
    } else {
        return(metadata)
    }
}
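
# Illustrative usage sketch (not executed; requires internet access, and the site
# code is just an example):
if (FALSE) {
    res <- get_site_metadata_web("AU-Tum", incl_processing = TRUE)
    metadata <- res$out  # the metadata list itself
    res$warn             # accumulated warnings ("" if none)
}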


################################################
# metadata checks
################################################

#' Checks which key metadata fields are missing (fields where NA is acceptable,
#' e.g. Exclude_reason, are not checked)
#'
#' @return boolean metadata availability vector
check_missing <- function(metadata) {
    key_data = c("SiteCode", "Fullname",
                 "SiteLatitude", "SiteLongitude", "SiteElevation",
                 "IGBP_vegetation_short", "IGBP_vegetation_long",
                 "TowerHeight", "CanopyHeight", "Tier")

    missing_data <- is.na(metadata[key_data])

    return(missing_data)
}
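
# Illustrative sketch (not executed): check_missing() returns a named logical
# vector covering only the key fields listed above.
if (FALSE) {
    md <- site_metadata_template("XX-Abc")
    md$SiteLatitude <- -35.6566
    check_missing(md)  # FALSE for SiteCode and SiteLatitude, TRUE for the rest
}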


#' Warns about missing metadata for the site
warn_missing_metadata <- function(metadata) {
    missing_data <- check_missing(metadata)

    if (any(missing_data)) {
        message("Missing metadata for site ", metadata$SiteCode, ":")
        message("   ", paste(names(metadata)[missing_data], collapse = ", "))
    }
}


################################################
# Main metadata functions
################################################

#' Get site metadata. Tries multiple methods to retrieve full metadata
#'
#' @return metadata list
#' @export
get_site_metadata <- function(site_code, incl_processing=TRUE,
                              use_csv=TRUE, update_csv=FALSE, ...) {
    #Initialise warnings
    warnings <- ""

    metadata <- site_metadata_template(site_code)

    if (use_csv) {
        metadata <- get_site_metadata_from_CSV(metadata, incl_processing=FALSE, ...)
    }

    if (any(check_missing(metadata))) {
        metadata <- get_site_metadata_web(metadata, incl_processing=FALSE)
    }

    warn_missing_metadata(metadata)

    if (incl_processing) {
        metadata <- add_processing_metadata(metadata)
        warnings <- append_and_warn(warn=metadata$warn, warnings, call=FALSE)
        metadata <- metadata$out
    }

    if (update_csv) {
        save_metadata_to_csv(metadata)
    }

    return(list(out=metadata,warn=warnings))
}
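
# Illustrative usage sketch (not executed; the site code is just an example):
if (FALSE) {
    res <- get_site_metadata("AU-Tum", incl_processing = TRUE, use_csv = TRUE)
    metadata <- res$out  # metadata list, including a Processing entry
    res$warn             # accumulated warnings ("" if none)
}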