#' Load ISCN Layer and Meta data
#'
#' This function first downloads the layer and meta data from the ISCN website.
#' ISCN (http://iscn.fluxdata.org/data/access-data/database-reports/) data available: ftp://ftp.fluxdata.org/.deba/ISCN/ALL-DATA/ISCN_ALL_DATA_LAYER_C1_1-1.xlsx ftp://ftp.fluxdata.org/.deba/ISCN/ALL-DATA/ISCN_ALL_DATA_LAYER_C2_1-1.xlsx ftp://ftp.fluxdata.org/.deba/ISCN/ALL-DATA/ISCN_ALL_DATA_LAYER_C3_1-1.xlsx ftp://ftp.fluxdata.org/.deba/ISCN/ALL-DATA/ISCN_ALL_DATA_LAYER_C4_1-1.xlsx
#'
#' @param dataDir path to the folder contianing ISCN_ALL_DATA_LAYER_C*_1-1.xlsx, ISCN_ALL-DATA-CITATION_1-1.xlsx and ISCN_ALL_DATA_DATASET_1-1.xlsx files. If this is left NULL then files will be downloaded to a temporary directory from the ISCN website and then deleted.
#' @param verbose boolean flag denoting whether or not to print lots of status messages
#' @param onlyISCNKey boolean flag to only return the processed ISCN3 key which is slightly different then SoilDataR::ISCNKey.df. The regular expressions in the variable column are matched to the column names.
#' @param loadVars an array of characters to read in only certain variables. If NULL then all the variables will be read in.
#'
#' @import dplyr
#' @importFrom tidyr gather spread
#' @importFrom readxl read_excel
#' @importFrom readr read_csv
#' @importFrom lubridate decimal_date
#' @export
processData_ISCN3 <- function(dataDir=NULL, verbose=FALSE, onlyISCNKey=FALSE, loadVars=NULL){
## create the layer and meta directors if needed
delete_dataDir <- is.null(dataDir)
if(is.null(dataDir)){
dataDir <- tempdir()
}
layerDataFiles.arr <- file.path(dataDir, c('ISCN_ALL_DATA_LAYER_C1_1-1.xlsx', 'ISCN_ALL_DATA_LAYER_C2_1-1.xlsx',
'ISCN_ALL_DATA_LAYER_C3_1-1.xlsx', 'ISCN_ALL_DATA_LAYER_C4_1-1.xlsx'))
dataFiles.arr <- c(layerDataFiles.arr,
file.path(dataDir, c(
'ISCN_ALL-DATA-CITATION_1-1.xlsx', 'ISCN_ALL_DATA_DATASET_1-1.xlsx')))
## Download the data
for(dataFiles.arr in dataFiles.arr){
if(!file.exists(dataFiles.arr)){
download.file(sprintf('ftp://ftp.fluxdata.org/.deba/ISCN/ALL-DATA/%s', basename(dataFiles.arr)),
dataFiles.arr, quiet=FALSE)
}
}
#### Fill in the regular expression variables ####
unitVars <- dplyr::filter(SoilDataR::ISCNKey.df, type == 'value')$var
ISCNKey <- SoilDataR::ISCNKey.df %>%
group_by(header, dataframe, class, type, unit, method) %>%
do((function(xx){
if(grepl('(\\^)|(\\|)|(\\$)', xx$var)) #check for regular expression
#return all variables that match
return(data.frame(var=as.character(unitVars[grepl(xx$var, unitVars)]), stringsAsFactors=FALSE))
else
#do nothing
return(data.frame(var=xx$var, stringsAsFactors=FALSE))
})(.)) %>%
group_by(header, dataframe, class, type, unit, method, var) %>%
arrange(var) %>%
##seperate the unit type as either hard coded (hardUnit) or references (unitCol) based on
##...starting match
mutate(hardUnit = if_else(any(grepl(paste0('^',unit), SoilDataR::ISCNKey.df$header)), as.character(NA), unit),
unitCol = if_else(any(grepl(paste0('^',unit), SoilDataR::ISCNKey.df$header)),
SoilDataR::ISCNKey.df$header[grepl(paste0('^',unit),SoilDataR::ISCNKey.df$header)][1],
as.character(NA))) %>%
ungroup()
if(onlyISCNKey) return(ISCNKey)
#### Only read in variables of interest ####
if(!is.null(loadVars)){
ISCNKey <- ISCNKey %>%
filter((dataframe == 'sample' & var %in% loadVars) |
dataframe != 'sample')
}
#### Read data files ####
if(verbose) print('Maybe go get a cup of coffee... this takes a while.')
ans <- list(study=data.frame(),
field=data.frame(),
sample=data.frame(),
measure=data.frame())
for(fileNum in 1:length(layerDataFiles.arr)){
if(verbose) print(paste('Reading', layerDataFiles.arr[fileNum]))
all.temp <- readxl::read_excel(path=layerDataFiles.arr[fileNum], sheet='layer', col_types='text') %>%
filter(!is.na(dataset_name_sub)) %>% #remove empty lines
mutate(rowNum = 1:nrow(.), ##Adding row numbers because dataset_name_soc breaks the soc variable
##...the layer name is no longer a unique row identifier
observation_date = as.numeric(`observation_date (YYYY-MM-DD)`)) %>%
##Dates start from December 20, 1899 in most versions of excel right now
mutate(`observation_date (YYYY-MM-DD)` = if_else(observation_date > 2020,
lubridate::decimal_date(as.Date(observation_date,
origin = "1899-12-30")),
observation_date))
##Check that the date is correct in the first data sheet
if(grepl('ISCN_ALL_DATA_LAYER_C1_1-1.xlsx', basename(dataFiles.arr[fileNum])) &
! floor(all.temp$`observation_date (YYYY-MM-DD)`[1]) == 2006){
##keep going but don't trust it
warning('Dates from ISCN3 layer files do not check out (SoilDataR::processData_ISCN3) ')
}
##Pull the formal name for the ISCN data provider version
datasetName <- as.character(names(all.temp)[1])
dropCols <- apply(all.temp, 2, function(xx){all(is.na(xx))}) ##select_if has issues with non-standard column names
all.temp <- all.temp[,!dropCols]
ans$study <- all.temp %>%
select(one_of(intersect(names(all.temp), (ISCNKey %>% filter(dataframe == 'study'))$header))) %>%
rename(dataset_name = dataset_name_sub) %>% ##Key the study ids on the dataset_name
mutate(dataset_name_super = datasetName) %>%
unique %>%
bind_rows(ans$study)
ans$field <- all.temp %>%
rename(dataset_name = dataset_name_sub) %>%
select(dataset_name,##pull the study IDs... and all the field variables
one_of(intersect(names(all.temp), (ISCNKey %>% filter(dataframe == 'field'))$header))) %>%
unique %>%
bind_rows(ans$field)
sampleVarNames <- intersect(names(all.temp), (ISCNKey %>% filter(dataframe == 'sample'))$header)
if(length(sampleVarNames) > 0){
sample <- all.temp %>%
rename(dataset_name = dataset_name_sub) %>%
select(dataset_name, layer_name, ##pull the study and field IDS... and all the sample variables
rowNum, one_of(sampleVarNames)) %>%
gather(header, entry, ##make the samples long table format
one_of(sampleVarNames),
na.rm=TRUE) %>%
left_join(select(ISCNKey, header, var, type), by=c('header')) %>% ##trace all headers to a variable by their type
group_by(dataset_name, layer_name, rowNum, var, add=FALSE) %>% ##for each variable
spread(type, entry) %>% ##spread out the method, unit, value, or sigma associated with it
filter(any(!is.na(value))) #remove NA data
# mutate(unit=ifelse(exists('unit', where=.), unit, NA), ##Ensure that there are units and methods
# method=ifelse(exists('method', where=.), method, NA)) %>%
##coded in base for runtime issues
if(!'unit' %in% names(sample)){
sample$unit <- as.character(NA)
}
if(!'method' %in% names(sample)){
sample$method <- as.character(NA)
}
sample <- sample %>%
summarize(method=ifelse(all(is.na(method)), as.character(NA),
paste0(paste(header, method, sep=':')[!is.na(method)],
collapse=';')), #glom multiple methods together
unit = ifelse(all(is.na(unit)), as.character(NA), unique(unit[!is.na(unit)])) ,
value = as.numeric(unique(value[!is.na(value)]))) %>%
ungroup() %>%
select(dataset_name, layer_name, var, method, unit, value) %>%
unique()
ans$sample <- sample %>%
bind_rows(ans$sample)
rm(sample)
}
rm(all.temp)
}
####Rename the headers for field####
##TODO key.ls <- rename_(df, .dots = setNames(names(key.ls), key.ls))
# rename_(df, .dots = setNames(names(key.ls), key.ls))
renameNonSampleHeaders <- ISCNKey %>% filter(dataframe != 'sample', header != var)
renameNonSampleHeaders.ls <- as.list(renameNonSampleHeaders$var)
names(renameNonSampleHeaders.ls) <- renameNonSampleHeaders$header
names(ans$field)[names(ans$field) %in% names(renameNonSampleHeaders.ls)] <-
renameNonSampleHeaders.ls[names(ans$field)[names(ans$field) %in% names(renameNonSampleHeaders.ls)]]
#### Add field and measure IDs and reindex samples to save space####
if(verbose) print('Adding field/measure IDs and factoring')
ans$field <- ans$field %>%
mutate_at(vars(one_of(intersect(unique(ISCNKey$var[ISCNKey$class == 'factor']), names(ans$field)))),
funs(factor)) %>% ##Need to rename headers as var
arrange(dataset_name) %>%
mutate(fieldID = 1:nrow(.))
if(nrow(ans$sample) > 0){
ans$measure <- ans$sample %>% ungroup() %>%
select(var, method, unit) %>% unique %>%
arrange(var) %>%
mutate(measureID = 1:nrow(.))
ans$sample <- ans$sample %>%
left_join(select(ans$field, dataset_name, layer_name, fieldID),
by=c('dataset_name', 'layer_name')) %>%
left_join(ans$measure, by=c('var', 'method', 'unit')) %>%
select(fieldID, measureID, value)
}else{
ans$measure <- data.frame()
}
#### read in meta files ####
if(verbose) print('reading in meta files')
ans$study <- read_excel(path=paste(dataDir, 'ISCN_ALL-DATA-CITATION_1-1.xlsx', sep='/'), sheet='citation') %>%
full_join(read_excel(path=paste(dataDir, 'ISCN_ALL_DATA_DATASET_1-1.xlsx', sep='/'), sheet='dataset')) %>%
mutate(`modification_date (YYYY-MM-DD)` = as.POSIXct(round(`modification_date (YYYY-MM-DD)`, unit='day'))) %>%
select(-`ISCN 1-1 (2015-12-10)`) %>%
group_by(dataset_name) %>%
gather(header, value, -dataset_name,na.rm=TRUE) %>%
unique() %>%
full_join(ans$study) %>%
select(dataset_name_super, dataset_name, header, value) %>%
arrange(dataset_name_super, dataset_name, header)
####Rename the headers for the study df####
renameNonSampleHeaders <- ISCNKey %>% filter(dataframe != 'sample', header != var)
renameNonSampleHeaders.ls <- as.list(renameNonSampleHeaders$var)
names(renameNonSampleHeaders.ls) <- renameNonSampleHeaders$header
names(ans$study)[names(ans$study) %in% names(renameNonSampleHeaders.ls)] <-
renameNonSampleHeaders.ls[names(ans$study)[names(ans$study) %in% names(renameNonSampleHeaders.ls)]]
ans$ISCNKey <- ISCNKey
#delete the files from the temepratory directorys
if(delete_dataDir){
if(verbose) print(paste('deleting: ',
paste0(file.path(dataDir, c('ISCN_ALL_DATA_LAYER_C1_1-1.xlsx', 'ISCN_ALL_DATA_LAYER_C2_1-1.xlsx',
'ISCN_ALL_DATA_LAYER_C3_1-1.xlsx', 'ISCN_ALL_DATA_LAYER_C4_1-1.xlsx')),
collapse=', ')))
file.remove(dataFiles.arr, recursive=TRUE)
}
return(ans)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.