In bpbond/cosore: Continuous Soil Respiration Database

This document is the core of the GCB manuscript: summarizing the database, giving stats, describing functions, etc. Most of the introductory text with citations will be in the Google Doc.

knitr::opts_chunk$set(echo = FALSE)

library(cosore)
library(dplyr)
library(lubridate)
library(readr)
library(tidyr)
library(covr)
library(magrittr)
library(ggplot2)
theme_set(theme_minimal())

# For world map
library(ggmap)
library(scales)

db <- csr_database()

# One dataset came in very late, during final revisions, and it's not continuous, so 
# messes up e.g. Figure 7. Remove it from paper figures and tables (i.e. pretend it came
# in a week later)
db <- filter(db, CSR_DATASET != "d20200825_MINIAT")

db_records <- round(sum(db$CSR_RECORDS, na.rm = TRUE) / 1e6, 2)
db_years <- max(year(db$CSR_DATE_END), na.rm = TRUE) - min(year(db$CSR_DATE_BEGIN), na.rm = TRUE) + 1
db_fields <- csr_metadata()
db_vers <- packageVersion("cosore")
igbp_abbreviations <- read_csv("igbp_abbreviations.csv")

# Generate a list of potential authors from the database
csr_table("contributors") %>% 
  left_join(select(db, CSR_DATASET, CSR_RECORDS), by = "CSR_DATASET") %>% 
  filter(CSR_RECORDS > 0) %>%  # only authors who contribute data
  group_by(CSR_DATASET) %>% 
  mutate(`Author?` = "No", 
         `Institution and address` = "") %>% 
  select(CSR_DATASET, CSR_FAMILY_NAME, CSR_FIRST_NAME, `Author?`,
         CSR_EMAIL, CSR_ORCID, `Institution and address`) %T>% 
  write_csv("authors_from_database.csv", na = "") %>% 
  group_by(CSR_DATASET) %>% 
  filter(CSR_FAMILY_NAME == first(CSR_FAMILY_NAME)) %>% 
  pull(CSR_EMAIL) %>% 
  unique() %>% 
  write_lines("authors_contact.csv")

# Read the edited list of potential authors, from the Google Sheet, and generate the
# author list and contact list
read_csv("authors_from_google_sheets.csv", skip = 1, col_types = "cccccccc") %>% 
  rename(institution = `Institution, city, postal code, country`) %>% 
  mutate(institution = if_else(is.na(institution), 
                               paste("<b>TODO missing:</b>", CSR_DATASET, CSR_FAMILY_NAME), institution)) %>% 
  filter(`Author?` %in% c("Yes", "yes", "Y")) %>%
  distinct(CSR_FAMILY_NAME, CSR_FIRST_NAME, CSR_EMAIL, institution, .keep_all = TRUE) %T>% 
  write_csv("authors_final.csv") ->
  authors

# First five authors are fixed; after that it's alphabetical
authors <- bind_rows(authors[1:5,],
                     arrange(authors[-1:-5,], CSR_FAMILY_NAME, CSR_FIRST_NAME))
a_names <- paste(authors$CSR_FIRST_NAME, authors$CSR_FAMILY_NAME)
a_inst_numbers <- as.character(as.numeric(factor(authors$institution, 
                                                 levels = unique(authors$institution))))
institution_list <- authors$institution[!duplicated(a_inst_numbers)]

# Any duplicate names at this point mean multiple institutions; concatenate
dupes <- which(duplicated(a_names))
if(length(dupes)) {
  a_inst_numbers[dupes - 1] <- paste(a_inst_numbers[dupes - 1], a_inst_numbers[dupes], sep = ",")
  a_inst_numbers <- a_inst_numbers[-dupes]
  a_names <- a_names[-dupes]
}

# Write out the author list and affiliation list. Note that this chunk is
# results='asis', which lets us write html
cat(paste0(a_names, "<sup>", a_inst_numbers, "</sup>"), sep = ", ")
cat("<ol>\n") # ordered HTML list
cat(paste0("\t<li>", institution_list, "</li>"), sep = "\n")
cat("</ol>\n")

COSORE is designed to be a relatively lightweight database: as simple as possible, but not simpler. It is targeted at continuous–i.e., measured by automated systems–soil respiration flux data, but the database design accommodates manual point (survey-style) RS fluxes, methane fluxes, and chamber measurements of net ecosystem exchange as well, paralleling the recent Soil Incubation Database (SIDb) database (Schädel et al., 2019). Its development started in April 2019, and as of this writing (r Sys.Date()) the COSORE version number is r db_vers.

Database and dataset structure

The database is structured as a collection of independent contributed datasets, all of which have been standardized to a common structure and units. Each dataset is given a reference name (internal to COSORE) that links its constituent tables, and provides a point of reference in reports. Each constituent dataset normally has a series of separate data tables:

description (Table 2) describes site and dataset characteristics;
contributors (Table 3) lists individuals who contributed to the measurement, analysis, curation, and/or submission of the dataset;
ports (Table 4) gives the different ports (generally equivalent to separate measurement chambers) in use, and what each is measuring: flux, species, and treatment, as well as characteristics of the measurement collar;
data (Table 5), the central table of the dataset, records flux observations;
ancillary (Supplementary Table S1) summarizes site-level ancillary measurements;
columns (Supplementary Table S2), maps raw data columns to standard COSORE columns, providing a record for reproducibility; and
diagnostics (Supplementary Table S3) provides statistics on the data import process: errors, columns and rows dropped, etc.

The common key linking these dataset tables is the CSR_DATASET field, which records the unique name assigned to the dataset. In addition, a CSR_PORT key field links the ports and data tables. These links make it straightforward to extract datasets that have measured particular fluxes in certain ecosystem types, or isolate only non-treatment (control) chamber fluxes, for example.

Versioning and archiving

COSORE uses semantic versioning (https://semver.org/), meaning that its version numbers generally follow an "x.y.z" format, where x is the major version number (changing only when there are major changes to the database or package structure and/or function, in a manner that may break existing scripts using the data); y is the minor version number (typically changing with significant data updates); and z the patch number (bug fixes, documentation upgrades, or other changes that are completely backwards compatible). Following each official (major) release, a DOI will be issued and the data permanently archived by Zenodo (https://zenodo.org/). All changes to the data or codebase are immediately available through the GitHub repository, but only official releases will be issued a DOI; we anticipate this happening on an approximately annual basis.

Data license and citation

The database license is CC-BY-4 (https://creativecommons.org/licenses/by/4.0/); see the “LICENSE” file in the repository. This is identical to that used by e.g. FLUXNET Tier 1 and ICOS R1. In general, this license provides that users may copy and redistribute the database and R package code in any medium or format, adapting and building upon them for any scientific or commercial purpose, as long as appropriate credit is given. We request that users cite this article and strongly encourage them to (i) cite all constituent dataset primary publications, and (ii) involve data contributors as co-authors whenever possible, as is commonly done for other global databases such as FLUXNET (Baldocchi et al., 2001; Knox et al., 2019). In addition, users should also reference the specific version of the dataset they used (e.g., v0.6.0), access date, and ideally the specific Git commit number. This supports reproducibility of any analyses.

Data access and use

Major COSORE data releases are available via Zenodo (as noted above), as well as the GitHub “Releases” page at https://github.com/bpbond/cosore/releases; we anticipate that institutional repositories such as ESS-DIVE (Environmental Systems Science Data Infrastructure for a Virtual Ecosystem, https://ess-dive.lbl.gov/) may host releases at some point in the future. Downloads via this page are flat-file CSV (comma-separated values), and readable by any modern computing system. Missing values are encoded by a blank (i.e. two successive commas in the CSV format). A release download is fully self-contained, with full data, metadata, and documentation; a file manifest; a copy of the data license; an introductory vignette; a summary report on the entire database; and an explanatory README with links to this publication.

db_memsize <- sum(vapply(db$CSR_DATASET, 
                         FUN = function(x) object.size(csr_dataset(x)$data),
                         FUN.VALUE = numeric(1))) / 1e6
db_disksize <- system2("git", 
                       args = c("count-objects", "-vH"), 
                       stdout = TRUE)
db_disksize <- db_disksize[grepl("size:",db_disksize)]
db_disksize <- gsub("size: ", "", db_disksize)

An alternative way to access COSORE data, including minor updates between major releases, is to install and use the cosore R (R Core Team, 2019) package. This provides a robust framework, including dedicated access functions, dataset and database report generation, and quality assurance and checking (see below). Because the flux data are currently included in the repository itself, the latter is quite large (compared to most Git repositories), ~215.4 MB. (Note that the data are stored in R’s compressed RDS file format; when loaded into memory, the entire database is significantly larger, ~r round(db_memsize, 0) MB.) It thus cannot easily be hosted on CRAN (the Comprehensive R Archive Network), the canonical source for R packages. Installing directly from GitHub is however straightforward using the devtools or remotes packages:

devtools::install_github("bpbond/cosore")
library(cosore)

Four primary user-facing functions are available:

csr_database() summarizes the entire database in a single convenient data frame, with one row per dataset, and is intended as a high-level overview. It returns a selection of variables summarized in Tables 2-5 and S1-S3 below, including dataset name, longitude, latitude, elevation, IGBP code, number of records, dates, and variables measured;
csr_dataset() returns a single dataset: an R list structure, each element of which is a table (description, contributors, etc., as described above);
csr_table() collects, into a single data frame, one of the tables of the database, for any or all datasets;
csr_metadata() provides metadata information about all fields in all tables.

Two additional reporting functions may also be useful to users:

csr_report_database() generates an HTML report on the entire database: number of datasets, locations, number of observations, distribution of flux values, etc.;
csr_report_dataset() generates an HTML report on a single dataset, including tabular and graphical summaries of location, flux data, and diagnostics.

Finally, a number of functions are targeted at developers, and include functionality to ingest contributed data, standardize data, and prepare a new release. See the package documentation for more details on these.

Documentation

The primary documentation for the COSORE database is this manuscript Both the flat-file releases and cosore R package include extensive documentation, including an in-depth vignette included both in the package and online (https://rpubs.com/bpbond/502069). The R package includes documentation available via R's standard help system.

Data quality and testing

When contributed data are imported into COSORE, the package code performs a number of quality assurance checks. These include:

Timestamp errors, for example illegal dates and times for the specified time zone;
Bad email addresses or ORCID identifiers;
Records with no flux value;
Records for which the analyzer recorded an error condition.

# Calculate what percent of observations are removed across all datasets
csr_table("diagnostics", quiet = TRUE) %>% 
  group_by(CSR_DATASET, CSR_RECORDS) %>% 
  summarise(removed = CSR_RECORDS_REMOVED_NA + CSR_RECORDS_REMOVED_ERR + CSR_RECORDS_REMOVED_TIMESTAMP,
            .groups = "drop") %>% 
  mutate(removed_pct = removed / (removed + CSR_RECORDS) * 100) %>% 
  pull(removed_pct) -> errs

Any errors flagged or records removed during this process are summarized in the diagnostics table that is part of each dataset (Table 7 below). Across all contributed datasets, a median of r round(median(errs), 1)% of raw observations were removed for one of these reasons. Note however that no checking on the flux values themselves is performed (e.g. for outliers, improbable values); currently this is the responsibility of the user.

# We exclude parse-others.R from the coverage calculation, because it's all
# temporary code for handling specific raw datasets, and will eventually go away
package_coverage(line_exclusions = list("R/parse-others.R")) %>% 
  percent_coverage() ->
  csr_cov

The cosore R package also has a wide variety of unit tests (CITATION TODO) that test code functionality, typically via assertions about function behavior, but also by verifying behavior of those functions when importing test datasets (of different formats and with a variety of errors, for example). In total these tests cover r round(csr_cov, 1)% of the codebase.

Current data and community contributions

The database currently has r nrow(db) contributed datasets with a total of r db_records million flux observations across r db_years years and five continents (Table X, Figures X and Y).

datasets <- db$CSR_DATASET
intervals <- lapply(datasets, function(x) {
  ds <- csr_dataset(x)
  if(is.null(ds$data)) return(NULL)
  cosore:::compute_interval(ds$data) %>% 
    mutate(CSR_DATASET = x)
})
intervals <- bind_rows(intervals)
q <- round(quantile(intervals$Interval, na.rm = TRUE), 0)
subdailies <- intervals$Interval < 24 * 60
subdaily_ds_pct <- sum(subdailies, na.rm = TRUE) / nrow(intervals) * 100
subdaily_ds_N <- sum(intervals$N[subdailies], na.rm = TRUE) / sum(intervals$N, na.rm = TRUE) * 100

The interval between measurements ranges from r round(min(intervals$Interval, na.rm = TRUE), 0) to r round(max(intervals$Interval, na.rm = TRUE), 0) minutes, with 25%-50%-75% quantile values of r q[2], r q[3], and r q[4] minutes respectively. A one-hour interval between measurements is thus by far the most common choice. Currently r round(subdaily_ds_pct, 0)% of the datasets, and r round(subdaily_ds_N, 3)% of the data, provide sub-daily temporal resolution.

Tables

Table 1. Summary of COSORE v. r db_vers datasets with deposited data by International Geosphere-Biosphere Programme land cover classification (Loveland et al., 2000) as provided by data contributors. Columns include number of datasets, total number of records (flux observations), and dates of first and last records.

smrise <- function(x) {
  x %>% 
    summarise(Datasets = n(),
              Records = format(sum(CSR_RECORDS, na.rm = TRUE), big.mark = ","),
              `First record` = min(CSR_DATE_BEGIN, na.rm = TRUE),
              `Last record` = max(CSR_DATE_END, na.rm = TRUE),
              .groups = "drop") 
}
db %>% 
  filter(CSR_RECORDS > 0) %>%
  left_join(igbp_abbreviations, by = "CSR_IGBP") %>% 
  mutate(`IGBP class` = paste(CSR_IGBP, paste0("(", IGBP, ")"))) %>% 
  select(`IGBP class`, CSR_RECORDS, CSR_DATE_BEGIN, CSR_DATE_END) %>% 
  group_by(`IGBP class`) %>% 
  smrise() %>% 
  bind_rows(bind_cols(tibble(`IGBP class` = "(Total)"), smrise(db))) %>% 
  knitr::kable(format = "markdown", align = c("l", "r", "r", "r", "r"))

Table 2. Individual datasets in COSORE have a number of sub-tables. The first of these is the description table, the fields of which are summarized below. Columns include field name, description, class (i.e. type of data), units, and whether or not the field is required.

options(knitr.kable.NA = "")
make_table <- function(db_fields, table) {
  db_fields %>% 
    rename(`Field name` = Field_name) %>% 
    filter(Table_name == table) %>%
    mutate(Required = if_else(Required, "*", "")) %>% 
    select(-Table_name) %>% 
    kableExtra::kable(format = "markdown")
}
make_table(db_fields, "description")

Table 3. Summary of COSORE's contributors table, which provides information on the researchers (at least one; there may be arbitrarily many listed) who measured and contributed each dataset. Columns include field name, description, class (i.e. type of data), units, and whether or not the field is required.

make_table(db_fields, "contributors")

Table 4. Summary of COSORE's ports table, which provides information on the various multiplexed chambers that are frequently connected to a single measurement analyzer. Columns include field name, description, class (i.e. type of data), units, and whether or not the field is required.

make_table(db_fields, "ports")

Table 5. Summary of COSORE's data table, which holds the actual flux observations and accompanying time-stamped data. Columns include field name, description, class (i.e. type of data), units, and whether or not the field is required. Note that all data in this table are acquired at the point of GHG flux measurement; see Supplementary Table S1 for site-level data.

make_table(db_fields, "data")

Supplementary Table S1. Summary of COSORE's ancillary table, which includes optional information, typically ecosystem-level soil information, carbon fluxes, and climate normals. Columns include field name, description, class (i.e. type of data), units, and whether or not the field is required.

db_fields %>% 
  #  filter(Field_name %in%  c("Variable", "Value")) %>% 
  make_table("ancillary")

Supplementary Table S2. Summary of COSORE's columns table, which maps raw dataset columns to standardized COSORE columns. Columns include field name, description, class (i.e. type of data), units, and whether or not the field is required. We expect that this table will be dropped at some point in the future when COSORE requires structurally compliant data submissions (i.e. contributors will be required to format their data to match COSORE structure before submission).

make_table(db_fields, "columns")

Supplementary Table S3. Summary of COSORE's diagnostics table, which is populated automatically when parsing and importing non-COSORE data. Columns include field name, description, class (i.e. type of data), units, and whether or not the field is required.

make_table(db_fields, "diagnostics")

Figure 2. Geographic distribution of COSORE datasets (N = 898), with point sizes corresponding to the number of records in each dataset. Map tiles show USGS land cover and national elevation data and are by Stamen Design, under CC BY 3.0; data by OpenStreetMap, under ODbL; figure rendered using R’s ggmap (Kahle and Wickham, 2013).

bbox <- make_bbox(lon = c(-160, 150), lat = c(-50, 70)) # make a coordinate box 
map <- get_map(location = bbox, source = "stamen", maptype = "terrain") # load map from online

p <- ggmap(map) + 
  geom_point(data = db, aes(x = CSR_LONGITUDE, y = CSR_LATITUDE, size = CSR_RECORDS), 
             alpha = 0.5, color = "blue4", na.rm = TRUE) + 
  labs(x = "Longitude", y = "Latitude") +
  scale_size_continuous("", labels = comma) +
  theme(legend.position = "bottom")
print(p)

ggsave_quiet <- function(...) suppressMessages(ggsave(...))
ggsave_quiet("figures/figure2-map.pdf")

Figure 3. Climate space figure.

# Note raster masks 'tidyr::extract' and 'dplyr::select'
library(raster)
library(sp)
library(RColorBrewer)

# Download worldclim data for precip and tmean if necessary, into w10/ folder
precip <- getData("worldclim", path = "wc10/", var = "prec", res = 10, download = !file.exists("wc10/wc10/precip1.hdr"))
tmean <- getData("worldclim", path = "wc10/", var = "tmean", res = 10, download = !file.exists("wc10/wc10/tmean1.hdr"))

# Pull out cosore dataset latitudes and longitudes
db %>%
  dplyr::select(CSR_DATASET, CSR_LONGITUDE, CSR_LATITUDE) -> cosore_coords

# Extract cosore location data from worldclim data for precip...
raster::extract(precip, cosore_coords[2:3]) -> precip_coords
apply(precip_coords, 1, sum) -> map_cosore
cbind(cosore_coords, map_cosore) -> map_coords

# ...and tmean
raster::extract(tmean, cosore_coords[2:3]) -> tmean_vals
apply(tmean_vals, 1, mean) -> mat_cosore
cbind(map_coords, mat_cosore)  %>%
  # Temp data is stored in degC * 10, so we need to divide to get back to degC
  mutate(mat_cosore = mat_cosore / 10) -> cosore_points

# Extract global climate space data
raster::as.data.frame(precip, xy = TRUE) %>%
  drop_na() -> precip_global

# Calculate annual sum for precip...
precip_global %>%
  dplyr::select(-x, -y) %>%
  apply(1, sum) -> map_global

raster::as.data.frame(tmean, xy = TRUE) %>%
  drop_na() -> tmean_global

# ...and mean for temperature
tmean_global %>%
  dplyr::select(-x, -y) %>%
  apply(1, mean) -> mat_global

# Create tibble with corresponding coordinates
tibble(x = tmean_global$x, y = tmean_global$y, mat = as.vector(mat_global)) -> mat
tibble(x = precip_global$x, y = precip_global$y, map = as.vector(map_global)) -> map

left_join(map, mat, by = c("x", "y")) %>%
  # Temp data is stored in degC * 10, so we need to divide to get back to degC
  mutate(mat = mat / 10) -> map_mat_global

# Try Mark Tjoelker's suggestion re Whittaker biomes
library(plotbiomes) # devtools::install_github("valentinitnelav/plotbiomes")
p_inset <- whittaker_base_plot() +
  geom_point(data = cosore_points, aes(x = mat_cosore, y = map_cosore / 10),
             color = "black", shape = 4) +
  coord_cartesian(ylim = c(0, 500)) +
  theme(axis.title = element_blank(),
        axis.text = element_text(size = 8),
        legend.text = element_text(size = 7),
        legend.key.size = unit(0.4, "lines"),
        legend.position = c(0.35, 0.75),
        legend.title = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_rect(fill = "white"),
        panel.border = element_rect(colour = "black", fill = NA, size = 0.5))

ggsave_quiet("figures/figure3-climate-inset.png")

# SP's main climate space plot
p <- ggplot() +
  geom_hex(data = map_mat_global,
           aes(x = mat, y = map / 10), bins = 100, na.rm = TRUE) +
  scale_fill_viridis_c(name = "Grid cells", begin = 0.85, end = 0) +
  geom_point(data = cosore_points, aes(x = mat_cosore, y = map_cosore / 10),
             color = "black", shape = 4, size = 1.5, na.rm = TRUE) +
  theme_minimal() +
  coord_cartesian(ylim = c(0, 500)) +
  labs(x = "MAT (°C)", y = "MAP (cm)")
print(p)
ggsave_quiet("figures/figure3-climate.pdf")

suppressMessages(library(cowplot, quietly = TRUE))
p_new <- ggdraw() +
  draw_plot(p) +
  draw_plot(p_inset, x = 0.1, y = 0.52, width = 0.4, height = 0.45)
print(p_new)
save_plot("figures/figure3-climate-whittaker.png", p_new, base_height = 5)
save_plot("figures/figure3-climate-whittaker.pdf", p_new, base_height = 5)

scaling_factor <- 5000

Figure 4. Flux observations, by IGBP (cf. Table 1), over time. Each square represents r format(scaling_factor, big.mark = ",") observations, with categories of <r format(scaling_factor, big.mark = ",") observations rounded up so that they occupy a single square.

# devtools::install_github("hrbrmstr/waffle")
library(waffle)

results <- lapply(datasets, function(x) {
  d <- csr_dataset(x, quiet = TRUE)
  if(!is.data.frame(d$data)) return(NULL)
  d$data %>% 
    mutate(CSR_DATASET = x, 
           year = year(CSR_TIMESTAMP_BEGIN)) %>% 
    left_join(db, by = "CSR_DATASET") %>% 
    group_by(year, CSR_IGBP) %>% 
    summarise(n = n(), .groups = "drop")
})

# Scale records so 1 box equals 5,000 records, and round up to capture records under 5,000
bind_rows(results) %>% 
  arrange(CSR_IGBP) %>% 
  mutate(Records_scale = ceiling(n / scaling_factor)) %>% 
  left_join(igbp_abbreviations, by = "CSR_IGBP") ->
  results_fluxes

# Waffle plot number of records by IGBP
p <- ggplot(results_fluxes, aes(fill = IGBP, values = Records_scale)) +
  geom_waffle(color = "white", size = 0.25, n_rows = 10, flip = TRUE) +
  facet_wrap(~year, nrow = 2, strip.position = "bottom") +
  coord_equal() +
  labs(x = "Year", y = "Count") +
  theme(panel.grid = element_blank(), legend.position = "bottom",
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank())
print(p)
ggsave_quiet("figures/figure4-waffle.pdf", width = 10, height = 5)

Figure 5. Temporal density of COSORE datasets, by latitude of the observational site.

# Get data density by month of year
results <- lapply(datasets, function(x) {
  d <- csr_dataset(x, quiet = TRUE)
  if(!is.data.frame(d$data)) return(NULL)
  if(!"CSR_FLUX_CO2" %in% names(d$data)) d$data$CSR_FLUX_CO2 <- NA_real_
  if(!"CSR_FLUX_CH4" %in% names(d$data)) d$data$CSR_FLUX_CH4 <- NA_real_
  d$data %>% 
    mutate(year = year(CSR_TIMESTAMP_BEGIN),
           month = month(CSR_TIMESTAMP_BEGIN)) %>% 
    group_by(year, month) %>% 
    summarise(timestamp = mean(CSR_TIMESTAMP_BEGIN),
              n_co2 = sum(!is.na(CSR_FLUX_CO2)),
              n_ch4 = sum(!is.na(CSR_FLUX_CH4)), 
              .groups = "drop") %>% 
    mutate(CSR_DATASET = x,
           CSR_LATITUDE = d$description$CSR_LATITUDE)
})
results_lat <- bind_rows(results)

p <- results_lat %>% 
  mutate(Lat_bin = floor(CSR_LATITUDE / 5) * 5) %>%
  group_by(year, Lat_bin) %>% 
  summarise(Records = sum(n_co2),
            timestamp = mean(timestamp), 
            .groups = "drop") %>% 
  ggplot(aes(timestamp, Lat_bin, color = Records / 1000)) + 
  geom_point(size = 5, na.rm = TRUE) + 
  scale_colour_viridis_c(name = "Observations\n(*1000)", begin = 0.95, end = 0) +
  labs(x = "Year", y = "Latitude")
print(p)
ggsave_quiet("figures/figure5-latitude.pdf", width = 8, height = 6)

Figure 6. Temporal data coverage by day of year and hemisphere

datasets <- db$CSR_DATASET
results <- lapply(datasets, function(x) {
  d <- csr_dataset(x, quiet = TRUE)
  if(!is.data.frame(d$data)) return(NULL)
  if(!"CSR_FLUX_CO2" %in% names(d$data)) d$data$CSR_FLUX_CO2 <- NA_real_
  if(!"CSR_FLUX_CH4" %in% names(d$data)) d$data$CSR_FLUX_CH4 <- NA_real_
  d$data %>% 
    mutate(yearday = yday(CSR_TIMESTAMP_BEGIN)) %>% 
    group_by(yearday) %>% 
    summarise(n_co2 = sum(!is.na(CSR_FLUX_CO2)),
              n_ch4 = sum(!is.na(CSR_FLUX_CH4)),
              co2 = mean(CSR_FLUX_CO2, na.rm = TRUE),
              ch4 = mean(CSR_FLUX_CH4, na.rm = TRUE),
              .groups = "drop") %>% 
    mutate(CSR_DATASET = x,
           CSR_LATITUDE = d$description$CSR_LATITUDE)
})
results_doy <- bind_rows(results)


results_doy %>% 
  mutate(Hemi = if_else(CSR_LATITUDE < 0, "Southern~hemisphere", "Northern~hemisphere")) %>% 
  group_by(Hemi, yearday) %>% 
  summarise(n_ch4 = sum(n_ch4), 
            n_co2 = sum(n_co2), 
            .groups = "drop") %>% 
  filter(yearday <= 365) %>% 
  gather(variable, value, n_ch4, n_co2) %>% 
  separate(variable, into = c("junk", "Gas")) %>% 
  mutate(Gas = case_when(Gas == "ch4" ~ "CH[4]",
                         Gas == "co2" ~ "CO[2]")) -> 
  doy_counts

p <- ggplot(data = doy_counts, aes(x = yearday, y = value)) + 
  geom_line() + 
  facet_wrap(~Hemi + Gas, scales = "free", labeller = label_parsed) +
  labs(x = "Day of year", y = "Number of observations") +
  scale_y_continuous(labels = comma)
print(p)
ggsave_quiet("figures/figure6-doy.pdf")

Figure 7. Temporal resolution (time interval between successive measurements, minutes; note logarithmic scale) of COSORE data.

results <- lapply(datasets, function(xname) {
  x <- csr_dataset(xname, quiet = TRUE)

  if(!is.null(x$data)) {
    compute_interval <- function(dsd) {
      results <- cosore:::compute_interval(dsd)
      median(round(weighted.mean(results$Interval, results$N, na.rm = TRUE), 0))
    }
    tibble(Dataset = xname,
           `Interval (min)` = as.integer(compute_interval(x$data)),
           Records = nrow(x$data))
  } else {
    return(NULL)    
  }
})
results_res <- bind_rows(results)

p <- ggplot(results_res, aes(`Interval (min)`, weights = Records)) +
  geom_histogram(bins = 50) + 
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(labels = comma_format(scale = 1e-3)) +
  annotation_logticks(sides = "b") +
  ylab("Number of observations (*1000)")
print(p)
ggsave_quiet("figures/figure7-resolution.pdf")

results <- lapply(datasets, function(x) {
  d <- csr_dataset(x, quiet = TRUE)
  if(!is.data.frame(d$data)) return(NULL)
  if(!"CSR_FLUX_CO2" %in% names(d$data)) d$data$CSR_FLUX_CO2 <- NA_real_
  if(!"CSR_FLUX_CH4" %in% names(d$data)) d$data$CSR_FLUX_CH4 <- NA_real_
  tibble(CSR_DATASET = d$description$CSR_DATASET,
         CSR_IGBP = d$description$CSR_IGBP,
         CSR_FLUX_CO2 = d$data$CSR_FLUX_CO2,
         CSR_FLUX_CH4 = d$data$CSR_FLUX_CH4)
})
bind_rows(results) %>% 
  left_join(igbp_abbreviations, by = "CSR_IGBP") ->
  results_fluxes

co2_limits <- c(-1, 10)
ch4_limits <- tribble(~IGBP, ~min, ~max,
                      "DBF", -4,    1,
                      "DNF", -5,    10,
                      "ENF", -5,    20,
                      "GRA", -2,    2,
                      "WET", -3,    90)
rf_co2 <- filter(results_fluxes, is.finite(CSR_FLUX_CO2))
rf_ch4 <- filter(results_fluxes, is.finite(CSR_FLUX_CH4))
in_co2 <- filter(rf_co2, CSR_FLUX_CO2 >= min(co2_limits), CSR_FLUX_CO2 <= max(co2_limits))
rf_ch4 %>% 
  left_join(ch4_limits, by = "IGBP") %>% 
  filter(CSR_FLUX_CH4 >= min, CSR_FLUX_CH4 <= max) ->
  in_ch4
pct_co2 <- round(100 - nrow(in_co2) / nrow(rf_co2) * 100, 1)
pct_ch4 <- round(100 - nrow(in_ch4) / nrow(rf_ch4) * 100, 1)

Figure 8. Distribution of CO2 fluxes in COSORE datasets, by IGBP classification (cf. Table 1). For visual clarity this figure excludes fluxes <r min(co2_limits) and >r max(co2_limits) µmol m-2 s-1 (r format(nrow(rf_co2) - nrow(in_co2), big.mark = ",") observations, r pct_co2% of the data). Number of datasets (sites) making up data is given in parentheses after IGBP abbreviations in each panel.

in_co2 %>% 
  group_by(IGBP) %>% 
  summarise(Ndatasets = length(unique(CSR_DATASET)), .groups = "drop") ->
  dscounts

p <- in_co2 %>% 
  left_join(dscounts, by = "IGBP") %>% 
  mutate(IGBP_datasets = paste0(IGBP, " (", Ndatasets, ")")) %>% 
  ggplot(aes(x = CSR_FLUX_CO2)) + 
  geom_histogram(bins = 30) +
  facet_wrap(~IGBP_datasets, scales = "free_y") +
  scale_y_continuous(labels = number_format(scale = 1e-3, accuracy = 0.1)) +
  labs(x = expression(CO[2]~flux~(µmol~m^-2~s^-1)), y = "Number of observations (*1000)")
print(p)
ggsave_quiet("figures/figure8-co2.pdf")

Figure 9. Distribution of CH4 fluxes in COSORE datasets, by IGBP classification (cf. Table 1). For visual clarity this figure excludes some extreme values (r format(nrow(rf_ch4) - nrow(in_ch4), big.mark = ",") observations or r pct_ch4% of the data). Number of datasets (sites) making up data is given in parentheses after IGBP abbreviations in each panel. Positive values are emissions to the atmosphere, and negative values uptake by the soil.

in_ch4 %>% 
  group_by(IGBP) %>% 
  summarise(Ndatasets = length(unique(CSR_DATASET)), .groups = "drop") ->
  dscounts

p <- in_ch4 %>% 
  left_join(dscounts, by = "IGBP") %>% 
  mutate(IGBP_datasets = paste0(IGBP, " (", Ndatasets, ")")) %>% 
  ggplot(aes(x = CSR_FLUX_CH4)) + 
  geom_histogram(bins = 30) +
  facet_wrap(~IGBP_datasets, scales = "free") +
  scale_y_continuous(labels = number_format(scale = 1e-3)) +
  labs(x = expression(CH[4]~flux~(nmol~m^-2~s^-1)), y = "Number of observations (*1000)")
print(p)
ggsave_quiet("figures/figure9-ch4.pdf")