In ISCN/SOCDRaHR2: Scripts to read in and harmonize different digital soil data archives

library(data.table)
library(SOCDRaH2)
library(tidyverse)
library(lubridate)
library(tibble)
library(ggmap)
library(maps)
library(mapdata)
knitr::opts_chunk$set(eval=FALSE)

#source('../R/makeKeys.R')
#source('../R/ISCN3.R')

#ISCN_raw <- ISCN3(dataDir = '~/Documents/Datasets/ISCN', orginalFormat = TRUE)

#layer_raw <- vroom::vroom('~/Documents/Datasets/ISCN/ISCN3_layer.csv', col_types = strrep('c', times = 95))

#profile_raw <-  vroom::vroom('~/Documents/Datasets/ISCN/ISCN3_profile.csv', col_types = strrep('c', times = 44))

citation_raw <- read_delim('ISCN/ISCN3_citation.csv', delim = ';', col_types = strrep('c', times = 12))
dataset_raw <- read_delim('ISCN/ISCN3_dataset.csv', delim = ';', col_types = strrep('c', times = 19))
layer_raw <- vroom::vroom('ISCN/ISCN3_layer.csv', col_types = strrep('c', times = 95))
profile_raw <-  vroom::vroom('ISCN/ISCN3_profile.csv', col_types = strrep('c', times = 44))

#when were ISCN SOC stocks caluclated?

#temp <- unique(ISCN_raw$layer[,c('dataset_name_sub', 'dataset_name_soc')]) %>%
#  mutate(value = TRUE) %>%
#  pivot_wider(id_cols = 'dataset_name_sub', names_from='dataset_name_soc')

#looking at temp, the dataset_name_soc doesnt' make any sense to us. So we are going dataset by dataset to reconstruct for ISCN4.

#remove NRCS to be replaced with new database
#as.data.frame(ISCN_raw$layer[dataset_name_sub != 'NRCS Sept/2014', -c('ISCN 1-1 (2015-12-10)')]) %>%
#layer_raw %>%
  #filter(dataset_name_sub != 'NRCS Sept/2014') %>%
 # select(-'ISCN 1-1 (2015-12-10)') %>%
  #group_by(dataset_name_sub) %>% 
#  tally() %>% 
  #arrange(n)

# noNRCS_ISCN3 %>%
#  filter(grepl('ISCN', dataset_name_soc)) %>%
#  select(dataset_name_soc, soc_carbon_flag, soc_method) %>%
#  unique()

##Strip out the soc values where ISCN has filled them in
not_all_na <- function(x) {!all(is.na(x))}

noNRCS_ISCN3_layer <- layer_raw %>%
  filter(dataset_name_sub != 'NRCS Sept/2014') %>% #remove NRCS contribution
  select(-'ISCN 1-1 (2015-12-10)') %>% #remove ID colume
  mutate(`soc (g cm-2)` = if_else(grepl('ISCN', dataset_name_soc), 
                                  as.character(NA), `soc (g cm-2)`),
    `soc_carbon_flag` = if_else(grepl('ISCN', dataset_name_soc),
                                as.character(NA), `soc_carbon_flag`),
    `soc_method` = if_else(grepl('ISCN', dataset_name_soc), 
                           as.character(NA), `soc_method`)) %>%
  select(-dataset_name_soc, )%>% #column now redundent with dataset_name_sub
  select_if(not_all_na)

noNRCS_ISCN3_profile <- as.data.frame(ISCN_raw$profile[dataset_name_sub != 'NRCS Sept/2014', -c('ISCN 1-1 (2015-12-10)')])  %>% #remove NRCS contribution
  mutate(`soc (g cm-2)` = if_else(grepl('ISCN', dataset_name_soc), 
                                  as.character(NA), `soc (g cm-2)`),
    `soc_carbon_flag` = if_else(grepl('ISCN', dataset_name_soc),
                                as.character(NA), `soc_carbon_flag`),
    `soc_method` = if_else(grepl('ISCN', dataset_name_soc), 
                           as.character(NA), `soc_method`)) %>%
  select(-dataset_name_soc)%>% #column now redundent with dataset_name_sub
  select_if(not_all_na)

noNRCS_ISCN3_layer %>% pivot_longer(cols = everything(), values_drop_na = TRUE) %>%
  group_by(name) %>% tally %>% arrange(n)

datasetName.tbl <- tibble::tribble( ~dataset_name_sub, ~CCBY_consistant, ~citation_check, ~notes,
'Worldwide soil carbon and nitrogen data', TRUE, 'Yes', 'Citation is consistant with CC-BY, reference w/ doi, maybe even data doi?',
'Heckman/Swanston Biscuit Burn', TRUE, 'Yes', 'Citation is consistant with CC-BY, manuscript reference needs updating',
'Lu_PIMA', TRUE, 're-send', 'citation is consistant with CC-BY, manuscript reference looks good',
'Heckman lithosequence', TRUE, 'Yes', 'Citation is consistant with CC-BY, Reference manuscript with no doi, no apparent data doi',
'Oak Ridge National Lab_Loblolly_DWJ', TRUE, 'resend', 'Citation is consistant with CC-BY, Reference manuscript with no doi, no apparent data doi',
'Lu_LTER',TRUE, 'resend', 'Citation is consistant with CC-BY, Multiple reference manuscript with some data doi',
'Lehmann Soil C&BC #1', FALSE, 'empty', 'Request contact and acknowledgement, unpublished data',
'Schuur', TRUE, 'resend', 'Citation is consistant with CC-BY, Multiple reference manuscript with some data doi',
'Lehmann NE US soils', TRUE, 'Yes', 'orginal citation superseded by explicit permission, manuscript citation of unpublished data, no data here - consider scrapping',
'Vogel', TRUE, 'resent', 'Multiple reference manuscripts with some data doi',
'Myers-Smith', TRUE, 'resend', 'Citation is consistant with CC-BY, Multiple reference manuscript with some data doi, lter data citation - possible direct data hook',
'USGS Muhs', TRUE, 'Yes', "Cryptic citation in site notes",
'USGS Harden Yazoo', TRUE, "Yes", "Two manuscript citations",
'Boby_Mack', TRUE, "Yes", "Manuscript citation",
'Kane', TRUE, "resend", "multiple manuscript and data citations but no required acknowledgement",
"Bonanza LTER", TRUE, "resend", "Manuscript and data citation",
'Jorgensen_YKDE', TRUE, "resend", "Manuscript citation",
'UMBS_FASET', TRUE, 'Yes', 'orginal citation superseded by explicit permission, unpublished',
'Jorgensen_ARCN', TRUE, 'resend', 'report citations',
'Oak Ridge National Lab_TDE', TRUE, 'resend', 'manuscript citation, possible link to orginal data',
'USGS Harden', TRUE, 'Yes', 'multiple citations including manuscript and data citations',
'USDA-FS NRS Landscape Carbon Inventory',TRUE, 'resend', 'report citation',
'Bockheim', TRUE, "Yes", 'multiple manuscript citations',
'Jorgensen_NPS', TRUE, "resend", 'report citation',
'Permafrost_RCN', TRUE, "Yes", 'single citation, compliation of other studies',
'Northern Circumpolar Soil Carbon Database (NCSCD)', TRUE, 'resend', 'data citation and manuscript',
'USGS_S3C', TRUE, 'Yes', 'link to report with database download',
'AK DSC Project SOC stock computation', TRUE, 'Yes', 'manuscript citation')

not_all_na <- function(x) {!all(is.na(x))}

datasetName <- rev(datasetName.tbl$dataset_name_sub)[1]

dataset_study <- full_join(ISCN_raw$citation[dataset_name == datasetName] %>% select_if(not_all_na),
                           ISCN_raw$dataset[dataset_name == datasetName] %>% select_if(not_all_na), 
                           suffix = c('_citation', '_dataset'))

dataset_profile <- noNRCS_ISCN3_profile  %>%
  filter(dataset_name_sub == datasetName) %>%
  select_if(not_all_na)

dataset_layer <- noNRCS_ISCN3_layer %>%
  filter(dataset_name_sub == datasetName) %>%
  select_if(not_all_na)

Worldwide soil carbon and nitrogen data
- SOC disconnected from other layer data
- cryptic soc methods notes
USGS_S3C
- problematics countrys otherwise spot check looks good
Northern Circumpolar Soil Carbon Database (NCSCD)
- layer level measurements only contains soc and qual
Permafrost_RCN
- problematic SOC but looks like that column can be removed
- oddly no profile information, check into this further
Jorgensen_NPS
- no bd/oc data but complete for what it is
Bockheim
- seems very complete, no first pass issues
USDA-FS NRS Landscape Carbon Inventory
- Country not set (clearly US)
- cryptic vegitation class number 2
- bad SOC values, but it looks like these could just be removed
USGS Harden
- BNZ LTER again citated here
- bd_tot and bd_samp are entered the same here... unclear which is right
Oak Ridge National Lab_TDE
- Country not set (clearly US)
- site note links to project site on ORNL webpage with archived data
Jorgensen_ARCN
- no soc or bd but otherwise good qual descriptions of veg and layers
UMBS_FASET
- Country not set (clearly US)
- Bad SOC values but looks like we could jsut remove that column
Jorgensen_YKDE
- only layer level observations, no quant measurements
- incomplete lat-lon-time
- detailed vegitation description
Bonanza LTER
- looks like lter data again
- Same crytpic vegitation formate, really need to get this keyed out!
Kane
- looks like lter data again, two data citations => direct reads?
- crytpic vegitation but otherwise pretty data
Boby_Mack
- replace hzn ? and hzn_desgn unknow to NA
USGS Harden Yazoo
- Country not set (clearly US)
- looks like bad soc value calculations, no notes, consider removing
USGS Muhs
- Cryptic citation in site notes
Myers-Smith
- Looks like data archived on EDI; site note refers to Bonanza Creek LTER
- Cryptic veg class
Vogel
- replace hzn ? and hzn_desgn unknow to NA
- cryptic veg class
Lehmann NE US soils
- Country not set (clearly US)
- no data here, maybe some info about hzn but it's all 'A'
Schuur
- Looks like data archived on EDI; site note refers to Bonanza Creek LTER
- Cryptic veg class
- replace hzn ? and hzn_desgn unknow to NA
Lehmann Soil C&BC #1
- unpublished data, contact requested
- Country not set (clearly US)
- replace hzn ? with NA
- sparse data only hzn descriptions, and MIR-infered C & N
Lu_LTER
- multiple citations that are not clearly tied to observations
- LTER data might be available via EDI??
- crytpic veg classes
Oak Ridge National Lab_Loblolly_DWJ
- Country not set (clearly US)
- cryptic site note in profile
- hzn in layer and profile reset needs to be reset NA
- gapfilled bd with resampling should be removed

type_cols <- list(num_cols  = c("lat (dec. deg)", "long (dec. deg)",
                                "layer_top (cm)", "layer_bot (cm)",
                                "oc (percent)", 'c_tot (percent)', 'loi (percent)',
                                'bd_samp (g cm-3)',  'bd_tot (g cm-3)', 'bd_other (g cm-3)',
                                'soc (g cm-2)', "soc_depth (cm)",
                                'wpg2 (percent)',
                                'caco3 (percent)',
                                'sand_tot_psa (percent)', 'silt_tot_psa (percent)', 'clay_tot_psa (percent)', 
                                'n_tot (percent)',
                                'cat_exch (cmol H+ kg-1)',
                                'ph_h2o', 'ph_cacl', 'ph_other',
                                "13c (‰)", "14c (‰)", '15n (‰)',
                                "elevation (m)", 
                                "aspect_deg (degree)", "slope (percent)",
                                 "thaw_depth_profile (cm)",
                                'map (mm)', 'mat (°C)'), 
                  factor_cols = c('dataset_name_sub', "datum (datum)", 
                                  "country (country)", "state (state_province)",
                                  "hzn", "hzn_desgn", 
                                  "soil_series", 'color', 'soil_taxon',
                                  'textureClass',
                                  "profile_zero_ref (profile_zero_ref)", 
                                  "ecoregion", "surface_veg",
                                  'vegclass_local', 
                                  'landform (landform)', 'landscape (landscape)', 
                                  '2d_position (2d_position)', 
                                  'drainagecl (drainage)',
                                  'root_quant_size'), 
                  date_cols = c("observation_date (YYYY-MM-DD)", 
                                "modification_date (YYYY-MM-DD)"),
                  char_cols = c("dataset_name", 
                                'site_name', 'profile_name', 'layer_name',
                                "curator_name", "curator_organization",
                                "curator_email",
                                "contact_name", "contact_email",
                                "reference","dataset_description", 
                                "c_method", 'soc_method','bd_method', 'ph_method', 'soc_carbon_flag',
                                'wpg2_method',
                                'site_note', 'landform_note', 'layer_note',
                                "locator_parent_alias"),
                  discard_cols = c("total_lcount", "carbon_lcount", "soc_lcount", "soc_lcount_ISCN",
                                   "total_pcount", "soc_pcount",
                                   'total_scount', 
                                   'dataset_type (dataset_type)'),
                  id_cols = c("dataset_name", 
                              'site_name', 'profile_name', 'layer_name',
                              'dataset_name_sub'))

not_all_na <- function(x) {!all(is.na(x))}

standardCast <- function(data){
  return(data %>%
           select_if(not_all_na) %>%
           mutate_at(intersect(c(type_cols$num_cols, type_cols$date_cols),
                               names(.)), as.numeric) %>%
           mutate_at(intersect(type_cols$factor_cols, names(.)), as.factor) %>%
           mutate_at(intersect(type_cols$date_cols, names(.)), function(xx){
             ##Both conditions will be run but things throw warnings for the wrong conditional... supressing this function
             suppressWarnings(
               ans <- case_when(is.na(xx) ~ NA_Date_,
                                as.numeric(xx) < 2020 ~ lubridate::ymd(paste0(xx, '-01-01')),
                                as.numeric(xx) >= 2020 ~ lubridate::as_date(as.numeric(xx), 
                                                              origin = lubridate::ymd('1899-12-31')),
                                TRUE ~ NA_Date_)
             )
             return(ans)
           }) %>%
           select(-any_of(type_cols$discard_cols)))
}

reduceOneOffs <- function(xx){
  ans <- unique(xx[!is.na(xx)])
  if(length(ans) == 0){
    return(NA)
  }else{
    return(ans)}
}

Heckman/Swanston Biscuit Burn

Heckman/Swanston Biscuit Burn
- Check the lat/lon, looks like an easy fix that could be certified by contributor.
- Country not set (clearly US)

datasetName <- 'Heckman/Swanston Biscuit Burn'

dataset_study <- citation_raw %>% 
  filter(dataset_name == datasetName) %>%
  select(where(function(xx){!(is.na(xx))})) %>%
  full_join(dataset_raw %>% 
              filter(dataset_name == datasetName) %>%
              select(where(function(xx){!(is.na(xx))})), suffix = c('_citation', '_dataset'))%>%
  standardCast()

noNRCS_ISCN3_profile <- profile_raw%>%
  as.data.frame() %>%
  filter(dataset_name_sub != 'NRCS Sept/2014' & dataset_name_sub != ('ISCN 1-1 (2015-12-10)')) %>%
   mutate(`soc (g cm-2)` = if_else(grepl('ISCN', dataset_name_soc), 
                                  as.character(NA), `soc (g cm-2)`),
    `soc_carbon_flag` = if_else(grepl('ISCN', dataset_name_soc),
                                as.character(NA), `soc_carbon_flag`),
    `soc_method` = if_else(grepl('ISCN', dataset_name_soc), 
                           as.character(NA), `soc_method`)) %>%
  select(-dataset_name_soc)%>% #column now redundent with dataset_name_sub
  select_if(not_all_na)

noNRCS_ISCN3_layer <- layer_raw%>%
  as.data.frame() %>%
  filter(dataset_name_sub != 'NRCS Sept/2014' & dataset_name_sub != ('ISCN 1-1 (2015-12-10)')) %>%
   mutate(`soc (g cm-2)` = if_else(grepl('ISCN', dataset_name_soc), 
                                  as.character(NA), `soc (g cm-2)`),
    `soc_carbon_flag` = if_else(grepl('ISCN', dataset_name_soc),
                                as.character(NA), `soc_carbon_flag`),
    `soc_method` = if_else(grepl('ISCN', dataset_name_soc), 
                           as.character(NA), `soc_method`)) %>%
  select(-dataset_name_soc)%>% #column now redundent with dataset_name_sub
  select_if(not_all_na)


dataset_profile <- noNRCS_ISCN3_profile  %>%
  filter(dataset_name_sub == datasetName) %>%
  mutate(`country (country)` = 'United States') %>% #hardcode country
  standardCast() %>%
  rename(dataset_name = 'dataset_name_sub') %>%
  select(dataset_name, `site_name`, `profile_name`,
         `lat (dec. deg)`, `long (dec. deg)`, `datum (datum)`,
         `profile_zero_ref (profile_zero_ref)`,
         `state (state_province)`, `country (country)`, 
         `observation_date (YYYY-MM-DD)`, 
         `soil_series`,`ecoregion`, `surface_veg`,
         `elevation (m)`,  `aspect_deg (degree)`, `slope (percent)`)

dataset_layer <- noNRCS_ISCN3_layer %>%
  filter(dataset_name_sub == datasetName) %>%
  mutate(`country (country)` = 'United States') %>%
  standardCast() %>% 
  rename(dataset_name = 'dataset_name_sub') %>%
  select(dataset_name, `site_name`, `profile_name`, layer_name,
         `lat (dec. deg)`, `long (dec. deg)`, 
         `layer_top (cm)`, `layer_bot (cm)`,
         `hzn`, `hzn_desgn`,
         `c_method`, `oc (percent)`, `13c (‰)`, `14c (‰)`)

all_study <- dataset_study
all_profile <- dataset_profile
all_layer <- dataset_layer
#setdiff(c(names(dataset_study), names(dataset_profile), names(dataset_layer)), unlist(type_cols))

Create a summary report for the data contribution including 1) profile level map of lat/lon 2) profile level tally of state-country 3) count of numerical & categorical values

#print(datasetName)
#head(dataset_study)
#head(dataset_profile)
#head(dataset_layer)

ggplot(data =  map_data('state') %>%
         full_join(dataset_profile %>% 
                     mutate(region = 
                              tolower(`state (state_province)`), by = 'region') %>%
                     group_by(region) %>% 
                     tally)) + 
  geom_polygon(aes(x=long, y = lat, group=group, fill = n), 
               color = 'black') +
  coord_fixed(1.3) + 
  theme_nothing() 

ggplot(data =  map_data('state') %>% 
         filter(region %in% c("oregon", "washington"))) + 
  geom_polygon(aes(x=long, y = lat, group = group), 
               fill = 'grey', color = 'black') + 
  geom_point(data=dataset_profile, 
             aes(x = `long (dec. deg)`, y = `lat (dec. deg)`),
             shape = 'x', color = 'red') +
  coord_fixed(1.3) +
  theme_nothing() 

dataset_layer %>%
  pivot_longer(cols = intersect(names(.), type_cols$num_cols), values_drop_na = TRUE) %>%
  group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) %>%
  bind_rows(
    dataset_layer %>%
      pivot_longer(cols = intersect(names(.), type_cols$factor_cols), values_drop_na = TRUE) %>%
      group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) ) %>%
  arrange(n) %>%
  knitr::kable()

ggplot(dataset_layer %>% 
         pivot_longer( cols=c('layer_top (cm)', 'layer_bot (cm)'),
                       values_to='depth') %>%
         pivot_longer(cols = c(`oc (percent)`,`13c (‰)`, `14c (‰)`), 
                      values_to = 'measurement', names_to = 'type')) +
         geom_line(aes(x=depth, y= measurement, group = profile_name)) +
  facet_wrap(~type, scales='free')

Lu_PIMA

Lu_PIMA
- modification date not consistant between citation & dataset?

datasetName <- 'Lu_PIMA'

dataset_study <- full_join(ISCN_raw$citation[dataset_name == datasetName] %>% select_if(not_all_na),
                           ISCN_raw$dataset[dataset_name == datasetName] %>% select_if(not_all_na), 
                           suffix = c('_citation', '_dataset')) %>%
  standardCast() %>%
  select(-ends_with('ISCN')) %>% 
  group_by(dataset_name) %>%
  summarize(across(where(is.character), function(xx){unique(xx[grepl('\\w+', xx)])}))

dataset_profile <- noNRCS_ISCN3_profile  %>%
  filter(dataset_name_sub == datasetName) %>%
  mutate(`thaw_depth_profile (cm)` = 
           gsub('none', 'Inf', `thaw_depth_profile (cm)`)) %>%
  standardCast() %>%
  rename(dataset_name = 'dataset_name_sub') %>%
  select(`dataset_name`, `site_name`, `profile_name`, 
         `lat (dec. deg)`, `long (dec. deg)`, `datum (datum)`, 
         `state (state_province)`, `country (country)`, 
         `observation_date (YYYY-MM-DD)`, 
         `profile_zero_ref (profile_zero_ref)`, 
         #`layer_top (cm)`, `layer_bot (cm)`, `soc_depth (cm)`, 
         `elevation (m)`, 
         `ecoregion`, `vegclass_local`, 
         `landform (landform)`, `2d_position (2d_position)`, 
         `aspect_deg (degree)`, `slope (percent)`, 
         `drainagecl (drainage)`, `thaw_depth_profile (cm)`,
         `site_note`)

dataset_layer <- noNRCS_ISCN3_layer %>%
  filter(dataset_name_sub == datasetName) %>%
  standardCast() %>% 
  rename(dataset_name = 'dataset_name_sub') %>%
  select(`dataset_name`, `site_name`, `profile_name`, `layer_name`,
         `lat (dec. deg)`, `long (dec. deg)`, `datum (datum)`, 
         #`state (state_province)`, `country (country)`,  `observation_date (YYYY-MM-DD)`,  `vegclass_local`, 
         `layer_top (cm)`, `layer_bot (cm)`, 
         `hzn`, `hzn_desgn`, `color`, 
         `bd_samp (g cm-3)`, `c_tot (percent)`, `n_tot (percent)`, 
         `ph_h2o`, `sand_tot_psa (percent)`, 
         `silt_tot_psa (percent)`, `clay_tot_psa (percent)`, 
         `wpg2 (percent)`, `root_quant_size`)

all_study <- bind_rows(dataset_study, all_study)
all_profile <- bind_rows(dataset_profile, all_profile)
all_layer <- bind_rows(dataset_layer, all_layer)

ggplot(data =  map_data('world') %>%
         filter(region %in% c('Canada', 'USA', 'Mexico'))) + 
  geom_polygon(aes(x=long, y = lat, group = group), 
               fill = 'grey', color = 'black') + 
  geom_point(data=dataset_profile, 
             aes(x = `long (dec. deg)`, y = `lat (dec. deg)`),
             shape = 'x', color = 'red') +
  coord_fixed(1.3) +
  xlim(-180, -50) +
  theme_nothing() 

dataset_layer %>%
  pivot_longer(cols = intersect(names(.), type_cols$num_cols), values_drop_na = TRUE) %>%
  group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) %>%
  bind_rows(
    dataset_layer %>%
      pivot_longer(cols = intersect(names(.), type_cols$factor_cols), values_drop_na = TRUE) %>%
      group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) ) %>%
  arrange(n) %>%
  knitr::kable()

ggplot(dataset_layer %>% 
         pivot_longer( cols=c('layer_top (cm)', 'layer_bot (cm)'),
                       values_to='depth') %>%
         pivot_longer(cols = `bd_samp (g cm-3)`:`wpg2 (percent)`, 
                      values_to = 'measurement', names_to = 'type')) +
         geom_line(aes(x=depth, y= measurement, group = profile_name)) +
  facet_wrap(~type, scales='free')

Heckman lithosequence

Heckman lithosequence
- Country not set (clearly US)
- SOC measurements seem to be disconnected from the rest of the data at the layer and profile level

datasetName <- 'Heckman lithosequence'

dataset_study <- full_join(ISCN_raw$citation[dataset_name == datasetName] %>% select_if(not_all_na),
                           ISCN_raw$dataset[dataset_name == datasetName] %>% select_if(not_all_na), 
                           suffix = c('_citation', '_dataset')) %>%
  standardCast()

dataset_profile <- noNRCS_ISCN3_profile  %>%
  filter(dataset_name_sub == datasetName) %>%
  mutate(`thaw_depth_profile (cm)` = 
           gsub('none', 'Inf', `thaw_depth_profile (cm)`),
         `country (country)` = 'United States') %>%
  standardCast() %>%
  rename(dataset_name = 'dataset_name_sub') %>%
  select(`dataset_name`, `site_name`, `profile_name`,
         `lat (dec. deg)`, `long (dec. deg)`, `datum (datum)`, 
         `state (state_province)`, `country (country)`, 
         `observation_date (YYYY-MM-DD)`,  
         `profile_zero_ref (profile_zero_ref)`, 
         #`layer_top (cm)`, `layer_bot (cm)`, 
         `soc_depth (cm)`, `soc (g cm-2)`, `soc_method`, 
         `soil_taxon`, `soil_series`, 
         `elevation (m)`, `2d_position (2d_position)`,
         `aspect_deg (degree)`, `slope (percent)`,
         `ecoregion`, `landscape (landscape)`, 
         `landform (landform)`, 
         `map (mm)`, `mat (°C)`) %>%
  group_by(`dataset_name`, `site_name`, `profile_name`) %>%
  summarise_all(reduceOneOffs) %>%
  ungroup()

dataset_layer <- noNRCS_ISCN3_layer %>%
  filter(dataset_name_sub == datasetName) %>%
  standardCast() %>% 
  rename(dataset_name = 'dataset_name_sub') %>%
  select(`dataset_name`, `site_name`,`profile_name`, `layer_name`, 
         #`lat (dec. deg)`, `long (dec. deg)`, `datum (datum)`, 
         #`state (state_province)`, `country (country)`,  
         `observation_date (YYYY-MM-DD)`, 
         `layer_top (cm)`, `layer_bot (cm)`, 
         `hzn`, `hzn_desgn`, `color`, `soil_taxon`, `soil_series`, 
         `bd_method`, `bd_samp (g cm-3)`, 
         `c_method`, `oc (percent)`, `soc (g cm-2)`, `13c (‰)`, `14c (‰)`, 
         `n_tot (percent)`, 
         `ph_method`, `ph_cacl`, `ph_h2o`, `ph_other`, 
         `sand_tot_psa (percent)`, `silt_tot_psa (percent)`, 
         `clay_tot_psa (percent)`, 
         `wpg2_method`, `wpg2 (percent)`, 
         `root_quant_size`) %>%
  mutate(layer_name = gsub('_[^_]+$', '', layer_name)) %>%
  group_by(`dataset_name`, `site_name`, `profile_name`, layer_name) %>%
  summarise_all(reduceOneOffs) %>%
  ungroup()


all_study <- bind_rows(dataset_study, all_study)
all_profile <- bind_rows(dataset_profile, all_profile)
all_layer <- bind_rows(dataset_layer, all_layer)

dataset_study
summary(dataset_profile %>% select_if(is.factor))
summary(dataset_layer %>% select_if(is.factor))

ggplot(data =  map_data('state') %>%
         full_join(dataset_profile %>% 
                     mutate(region = 
                              tolower(`state (state_province)`), by = 'region') %>%
                     group_by(region) %>% 
                     tally)) + 
  geom_polygon(aes(x=long, y = lat, group=group, fill = n), 
               color = 'black') +
  coord_fixed(1.3) + 
  theme_nothing() 

ggplot(data =  map_data('state') %>% 
         filter(region %in% c("arizona", "utah", "new mexico", "colorado"))) + 
  geom_polygon(aes(x=long, y = lat, group = group), 
               fill = 'grey', color = 'black') + 
  geom_point(data=dataset_profile, 
             aes(x = `long (dec. deg)`, y = `lat (dec. deg)`),
             shape = 'x', color = 'red') +
  coord_fixed(1.3) +
  theme_nothing() 

dataset_layer %>%
  pivot_longer(cols = intersect(names(.), type_cols$num_cols), values_drop_na = TRUE) %>%
  group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) %>%
  bind_rows(
    dataset_layer %>%
      pivot_longer(cols = intersect(names(.), type_cols$factor_cols), values_drop_na = TRUE) %>%
      group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) ) %>%
  arrange(n) %>%
  knitr::kable()

ggplot(dataset_layer %>% 
         pivot_longer( cols=c('layer_top (cm)', 'layer_bot (cm)'),
                       values_to='depth') %>%
         pivot_longer(cols = intersect(names(.), type_cols$num_cols), 
                      values_to = 'measurement', names_to = 'type')) +
         geom_line(aes(x=depth, y= measurement, group = profile_name)) +
  facet_wrap(~type, scales='free')

Worldwide soil carbon and nitrogen data

Worldwide soil carbon and nitrogen data
- SOC disconnected from other layer data
- cryptic soc methods notes

datasetName <- 'Worldwide soil carbon and nitrogen data'

dataset_study <- full_join(ISCN_raw$citation[dataset_name == datasetName] %>% select_if(not_all_na),
                           ISCN_raw$dataset[dataset_name == datasetName] %>% select_if(not_all_na), 
                           suffix = c('_citation', '_dataset')) %>%
  standardCast()

dataset_profile <- noNRCS_ISCN3_profile  %>%
  filter(dataset_name_sub == datasetName) %>%
  mutate(`country (country)` = 'United States') %>% #hardcode country
  mutate(`soc_depth (cm)` = if_else(is.na(`soc (g cm-2)`), as.character(NA), `soc_depth (cm)`)) %>% #if no SOC then no depth
  ##Clean up the duplicate rows
  group_by(`dataset_name_sub`, `site_name`, `profile_name`) %>% 
  summarise_all(reduceOneOffs) %>%
  ungroup() %>%
  ##cast the columns
  standardCast() %>%
  rename(dataset_name = 'dataset_name_sub') 

dataset_layer <- noNRCS_ISCN3_layer %>%
  filter(dataset_name_sub == datasetName) %>%
  #names()
  standardCast() %>% 
  rename(dataset_name = 'dataset_name_sub')

all_study <- dataset_study
all_profile <- dataset_profile
all_layer <- dataset_layer
#setdiff(c(names(dataset_study), names(dataset_profile), names(dataset_layer)), unlist(type_cols))

dataset_study
summary(dataset_profile %>% select_if(is.factor))
summary(dataset_layer %>% select_if(is.factor))

ggplot(data =  map_data('world') ) + 
  geom_polygon(aes(x=long, y = lat, group = group), 
               fill = 'grey', color = 'black') + 
  geom_point(data=dataset_profile, 
             aes(x = `long (dec. deg)`, y = `lat (dec. deg)`),
             shape = 'x', color = 'red') +
  coord_fixed(1.3) +
  theme_nothing() 

dataset_layer %>%
  pivot_longer(cols = intersect(names(.), type_cols$num_cols), values_drop_na = TRUE) %>%
  group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) %>%
  bind_rows(
    dataset_layer %>%
      pivot_longer(cols = intersect(names(.), type_cols$factor_cols), values_drop_na = TRUE) %>%
      group_by(name) %>% summarize(n = length(value), unique_n = length(unique(value))) ) %>%
  arrange(n) %>%
  knitr::kable()

ggplot(dataset_layer %>% 
         pivot_longer( cols=c('layer_top (cm)', 'layer_bot (cm)'),
                       values_to='depth') %>%
         pivot_longer(cols = intersect(names(.), type_cols$num_cols), 
                      values_to = 'measurement', names_to = 'type')) +
         geom_line(aes(x=depth, y= measurement, group = profile_name)) +
  facet_wrap(~type, scales='free')

ISCN/SOCDRaHR2 documentation built on May 26, 2023, 6:44 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ISCN/SOCDRaHR2
Scripts to read in and harmonize different digital soil data archives

In ISCN/SOCDRaHR2: Scripts to read in and harmonize different digital soil data archives

Heckman/Swanston Biscuit Burn

Lu_PIMA

Heckman lithosequence

Worldwide soil carbon and nitrogen data

R Package Documentation

Browse R Packages

We want your feedback!

ISCN/SOCDRaHR2 Scripts to read in and harmonize different digital soil data archives

In ISCN/SOCDRaHR2: Scripts to read in and harmonize different digital soil data archives

Heckman/Swanston Biscuit Burn

Lu_PIMA

Heckman lithosequence

Worldwide soil carbon and nitrogen data

R Package Documentation

Browse R Packages

We want your feedback!

ISCN/SOCDRaHR2
Scripts to read in and harmonize different digital soil data archives