#' Read data from a GLATOS project workbook
#'
#' Read data from a GLATOS project workbook (xlsm file) and return a list of
#' class \code{glatos_workbook}.
#'
#' @param wb_file A character string with path and name of workbook in standard
#' GLATOS format (*.xlsm). If only file name is given, then the file must be
#' located in the working directory. File must be a standard GLATOS file
#' (e.g., \emph{xxxxx_GLATOS_YYYYMMDD.xlsm}) submitted via GLATOSWeb Data
#' Portal \url{http://glatos.glos.us}.
#'
#' @param wb_version An optional character string with the workbook version
#' number. If NULL (default value) then version will be determined by
#' evaluating workbook structure. Currently, the only allowed values are
#' \code{NULL} and \code{"1.3"}. Any other values will trigger an error.
#'
#' @param read_all If TRUE, then all columns and sheets (e.g., user-created
#' "project-specific" columns or sheets) in the workbook will be imported. If
#' FALSE (default value) then only columns and sheets in the standard GLATOS
#' workbook will be imported (project-specific columns will be ignored.)
#'
#' @details In the standard glatos workbook (v1.3), data in workbook sheets
#' 'Deployment', 'Recovery', and 'Location' are merged on columns
#' 'GLATOS_PROJECT', 'GLATOS_ARRAY', 'STATION_NO', 'CONSECUTIVE_DEPLOY_NO',
#' AND 'INS_SERIAL_NO' to produce the output data frame \code{receivers}. Data
#' in workbook sheets 'Project' and 'Tagging' are passed through to new data
#' frames named 'project' and 'animals', respectively, and data from workbook
#' sheet 'Proposed' is not included in result. If \code{read_all = TRUE} then
#' each sheet in workbook will be included in result.
#'
#' @details Data are read from the input file using
#' \link[openxlsx]{readWorkbook} in the package 'openxlsx' package. If
#' \code{read_all = TRUE} then the type of data in each user-defined column
#' (and sheet) will be 'guessed' by \link[openxlsx]{readWorkbook}. Therefore,
#' if \code{read_all = TRUE} then the structure of those columnns should be
#' carefully reviewed in the result. See \link[openxlsx]{readWorkbook} for
#' details.
#'
#' @details Column \code{animal_id} is considered a required column by many
#' other functions in this package, so it will be created if any records are
#' \code{NULL}. When created, it will be constructed from \code{tag_code_space}
#' and \code{tag_id_code}, separated by '-'.
#'
#' @details Timezone attribute of all timestamp columns (class \code{POSIXct})
#' in output will be "UTC" and all 'glatos-specific' timestamp and timezone
#' columns will be omitted from result.
#'
#' @note \strong{\emph{On warnings and errors about date and timestamp
#' formats.}} Date and time columns are sometimes stored as text in Excel.
#' When those records are loaded by this function, there are two possible
#' outcomes. \cr
#' \cr
#' 1. If the records are formatted according to the GLATOS Data Dictionary
#' specification (e.g., "YYYY-MM-DD" for dates and "YYYY-MM-DD HH:MM" for
#' timestamps; see \url{https:\\glatos.glos.us}) those records should be
#' properly loaded into R, but the user is encouraged to verify that they were
#' loaded correctly, so a warning points the user to those records in the
#' workbook. Users may want to format as custom date in the workbook to avoid
#' warnings in the future. \cr
#' \cr
#' 2. If the format of a date-as-text column is not consistent with GLATOS
#' specification, then no data will be loaded and an error will alert the user
#' to this condition. \cr
#' \cr
#' \strong{\emph{On cells with locked formatting in Excel:}} Occasionally the
#' format of a cell in Excel will be locked. In those cases, it is sometimes
#' possible to force date formatting in Excel by (1) highlighting the columns
#' that need reformatting, (2) select 'Text-to-columns' in the 'Data' menu,
#' (3) select 'Delimited' and 'next', (4) uncheck all delimiters and 'next',
#' (5) choose 'Date: YMD' in the 'Column data format' box, and (6) 'Finish'.
#'
#' @return A list of class \code{glatos_workbook} with three elements (described
#' below) containing data from the standard GLATOS Workbook sheets. If
#' \code{read_all = TRUE}, then additional elements will be added with names
#' corresponding to non-standard sheet names.
#' \describe{
#' \item{metadata}{A list with data about the project and workbook.}
#' \item{animals}{A data frame of class \code{glatos_animals} with data about
#' tagged animals.}
#' \item{receivers}{A data frame of class \code{glatos_receivers} with data
#' about telemetry receivers.}
#' }
#'
#' @author C. Holbrook \email{cholbrook@usgs.gov}
#'
#' @seealso \link[openxlsx]{readWorkbook}
#'
#' @examples
#' #get path to example GLATOS Data Workbook
#' wb_file <- system.file("extdata",
#' "walleye_workbook.xlsm", package = "glatos")
#'
#' #note that code above is needed to find the example file
#' #for real glatos data, use something like below
#' #wb_file <- "c:/path_to_file/HECWL_GLATOS_20150321.csv"
#'
#' wb <- read_glatos_workbook(wb_file)
#'
#' @export
read_glatos_workbook <- function(wb_file, read_all = FALSE,
wb_version = NULL) {
#Read workbook-----------------------------------------------------------
#see version-specific file specifications
#internal glatos_workbook_spec in R/sysdata.r
#Get sheet names
sheets <- tolower(openxlsx::getSheetNames(wb_file))
#Identify workbook version (based on sheet names)
id_workbook_version <- function(wb_file, sheets){
if(all(names(glatos:::glatos_workbook_schema$v1.3) %in% sheets)) {
return("1.3")
} else {
stop(paste0("Workbook version could not be identified. Double check ",
"that you are using a standard GLATOS Workbook file. The ",
"names of sheets must match standard file."))
}
}
#Check version if specified
if(is.null(wb_version)) {
wb_version <- id_workbook_version(wb_file, sheets)
} else if (!(paste0("v",wb_version) %in%
names(glatos:::glatos_workbook_schema))) {
stop(paste0("Workbook version ", wb_version, " is not supported."))
}
wb <- list() #preallocate
if(read_all) wb[sheets] <- NA #add element for each sheet
#-Workbook v1.3--------------------------------------------------------------
if (wb_version == "1.3") {
wb[names(glatos:::glatos_workbook_schema$v1.3)] <- NA
#Get project data
tmp <- tryCatch(openxlsx::readWorkbook(wb_file, sheet = "Project",
startRow = 1,
colNames = FALSE), error = function(e){
if(e$message ==
"Expecting a single string value: [type=character; extent=0]."){
stop("There was a problem reading from input file specified. It may ",
"be protected.\n Try again after opening, saving, and closing the ",
"file.")
} else {stop(e)}
})
#tmp <- openxlsx::readWorkbook(wb_file, sheet = "Project", startRow = 1,
# colNames = FALSE)
wb$project <- list(project_code = tmp[1,2],
principle_investigator = tmp[2,2],
pi_email = tmp[3,2],
source_file=basename(wb_file),
wb_version = "1.3",
created = file.info(wb_file)$ctime)
#Read all sheets except project
if(read_all) {
sheets_to_read <- sheets
extra_sheets <- setdiff(sheets, names(glatos:::glatos_workbook_schema[[
paste0("v", wb_version)]]))
} else {
sheets_to_read <- names(glatos:::glatos_workbook_schema[[
paste0("v", wb_version)]])
}
sheets_to_read <- setdiff(sheets_to_read, "project") #exclude project
for(i in 1:length(sheets_to_read)){
schema_i <- glatos:::glatos_workbook_schema[[
paste0("v", wb_version)]][[sheets_to_read[i]]]
if(is.null(schema_i)){ xl_start_row <- 1 } else { xl_start_row <- 2 }
#read one row to get dimension and column names
tmp <- openxlsx::readWorkbook(wb_file,
sheet = match(sheets_to_read[i], tolower(sheets)),
check.names = FALSE,
startRow = xl_start_row, na.strings = c("", "NA"))
if(!is.null(schema_i)){
#check that sheet i contains all names in schema
missing_cols <- setdiff(schema_i$name, tolower(colnames(tmp)))
if(length(missing_cols) > 0){
stop(paste0("The following columns were not found in sheet named '",
sheets_to_read[i],"': ",
paste(missing_cols, collapse = ", ")))
}
if(!read_all){
#subset only columns in schema (by name)
# - use match so that first column with each name is selected if > 1
tmp <- tmp[ , match(schema_i$name, tolower(colnames(tmp)))]
} else {
#identify project-specific fields
extra_cols <- colnames(tmp)[- match(schema_i$name, tolower(colnames(tmp)))]
#identify new columns to add
if (length(extra_cols) > 0) {
#count column names to identify and rename any conflicting
col_counts <- table(tolower(colnames(tmp)))
conflict_cols <- col_counts[col_counts > 1]
if(length(conflict_cols) > 0) {
#rename conflict cols
for(k in 1:length(conflict_cols)) {
name_k <- names(conflict_cols)[k]
extra_names_k <- c(name_k,
paste0(name_k, "_x", 1:(conflict_cols[k] - 1)))
names(tmp)[tolower(colnames(tmp)) == name_k] <- extra_names_k
warning(paste0("Non-standard (project-specific) columns ",
"were found with names matching standard \n column names ",
"in sheet '", sheets_to_read[i],"'.\n\n The following ",
"column names were assigned to avoid conflicts:",
"\n ", paste0(extra_names_k, collapse = ", "), "."))
}
} #end if
}
} #end if else
#make column names lowercase
names(tmp) <- tolower(names(tmp))
#set classes; by column name since conflicts resolved above
# character
char_cols <- with(schema_i, name[type == "character"])
for(j in char_cols) tmp[ , j] <- as.character(data.frame(tmp)[ , j])
# numeric
num_cols <- with(schema_i, name[type == "numeric"])
for(j in num_cols) tmp[ , j] <- as.numeric(data.frame(tmp)[ , j])
# POSIXct
posixct_cols <- with(schema_i, name[type == "POSIXct"])
for(j in posixct_cols) {
schema_row <- match(j, schema_i$name)
#Get time zone
#function to construct time zone string from reference column tmp
REFCOL <- function(x) {
col_x <- gsub(")$", "", strsplit(x, "REFCOL\\(")[[1]][2])
x2 <- tmp[, col_x]
utc_rows <- tolower(x2) %in% c("utc", "gmt")
x2[utc_rows] <- "UTC"
x2[!utc_rows] <- with(tmp, paste0("US/", x2[!utc_rows]))
return(x2)
}
#get timezone for this column
tz_cmd <- gsub("^tz = |^tz=|\"","", schema_i$arg[schema_row])
if(grepl("REFCOL", tz_cmd)) {
tzone_j <- REFCOL(tz_cmd)
tz_cmd <- unique(tzone_j)
} else { tzone_j <- tz_cmd }
if(nrow(tmp) > 0){
#Handle mixture of timestamps as date and char
#identify timestamps that can be numeric; assume others character
posix_na <- is.na(tmp[, j]) #identify missing first
posix_as_num <- suppressWarnings(as.numeric(tmp[, j]))
posix_as_char <- !posix_na & is.na(posix_as_num)
if(any(posix_as_char)) {
bad_pc_rows <- which(posix_as_char) + 2
bad_pc_rows <- ifelse(length(bad_pc_rows) < 10,
paste0(bad_pc_rows, collapse = ", "),
paste0(paste(bad_pc_rows[1:10], collapse = ", "),
"... +", length(bad_pc_rows) - 10, " more.",
collapse = " "))
warning(paste0("Some records (see below) ",
"in '", sheets_to_read[i], "` were not recognized as Excel ",
"datetime objects.\n These should have imported correctly if ",
"formatted as 'YYYY-MM-DD HH:MM',\n but see 'Note' in ",
"help(\"read_glatos_workbook\") to avoid this warning.\n\n ",
"Column: '", j, "'\n Rows: ", bad_pc_rows, "\n "))
}
#convert numeric
posix_as_num <- openxlsx::convertToDateTime(posix_as_num,
tz = Sys.timezone())
#handle multiple time zones
for(k in 1:length(tz_cmd)){
rows_k <- tzone_j %in% tz_cmd[k] #get rows with kth tz
#round to nearest minute and force to correct timezone
posix_as_num[rows_k] <- as.POSIXct(round(posix_as_num[rows_k],
"mins"), tz = tz_cmd[k])
#do same for posix_as_char and insert into posix_as_num
if(any(posix_as_char[rows_k])){
posix_as_num[posix_as_char & rows_k] <- as.POSIXct(tmp[posix_as_char & rows_k , j],
tz = tz_cmd[k])
}
} # end k
tmp[ , j] <- posix_as_num
} else {
tmp[ , j] <- as.POSIXct(NA, tz = "UTC")[0]
}
attr(tmp[, j], "tzone") <- "UTC"
} #end j
# Date
date_cols <- with(schema_i, name[type == "Date"])
for(j in date_cols) {
schema_row <- match(j, schema_i$name)
#identify date that can be numeric; assume others character
date_na <- is.na(tmp[, j]) #identify missing
date_as_num <- suppressWarnings(as.numeric(tmp[, j]))
date_as_char <- !date_na & is.na(date_as_num)
#convert numeric
date_as_num <- openxlsx::convertToDate(date_as_num)
#do same for posix_as_char and insert into posix_as_num
if(any(date_as_char)){
bad_dc_rows <- which(date_as_char) + 2
bad_dc_rows <- ifelse(length(bad_dc_rows) < 10,
paste0(bad_dc_rows, collapse = ", "),
paste0(paste(bad_dc_rows[1:10], collapse = ", "),
"... +", length(bad_dc_rows) - 10, " more.",
collapse = " "))
date_as_num[date_as_char] <- tryCatch(as.Date(tmp[date_as_char , j]),
error = function(e) {
if(e$message == "character string is not in a standard unambiguous format"){
stop(paste0("At least one of the records identified below in '",
sheets_to_read[i], "`\n could not be coerced to Date because ",
"the format was invalid.\n Dates stored as ",
"text in GLATOS Workbooks must be formatted \n ",
"'YYYY-MM-DD'. See 'Note' in ",
"help(\"read_glatos_workbook\") \n about formatting ",
"dates and times ",
"in GLATOS Workbooks.\n\n ",
"Column: '", j, "'\n Row: ", bad_dc_rows, "\n"))
} else{ return(e) }
}
)
}
#warn user if no error
if(any(date_as_char)) warning(paste0("Some records (see below) in '",
sheets_to_read[i], "` were not recognized as Excel date objects.\n",
" These should have imported correctly if formatted as ",
"'YYYY-MM-DD',\n but see 'Note' in ",
"help(\"read_glatos_workbook\") to avoid this warning.\n\n ",
"Column: '", j, "'\n Row: ", bad_dc_rows, "\n"))
tmp[ , j] <- date_as_num
} #end j
} #end if
wb[[sheets_to_read[i]]] <- tmp
} #end i
#merge to glatos_workbook list object
ins_key <- list(by.x = c("glatos_project", "glatos_array", "station_no",
"consecutive_deploy_no", "ins_serial_no"),
by.y = c("glatos_project", "glatos_array", "station_no",
"consecutive_deploy_no", "ins_serial_number"))
wb2 <- with(wb, list(
metadata = project,
animals = tagging,
receivers = merge(deployment,
recovery[, unique(c(ins_key$by.y,
setdiff(names(recovery), names(deployment))))],
by.x = c("glatos_project", "glatos_array", "station_no",
"consecutive_deploy_no", "ins_serial_no"),
by.y = c("glatos_project", "glatos_array", "station_no",
"consecutive_deploy_no", "ins_serial_number"),
all.x=TRUE, all.y=TRUE)
))
#add location descriptions
wb2$receivers <- with(wb2, merge(receivers, wb$locations,
by = "glatos_array"))
#Drop unwanted columns from receivers
#coalesce deploy_date_time and glatos_deploy_date_time
attr(wb2$receivers$glatos_deploy_date_time, "tzone") <- "UTC"
ddt_na <- is.na(wb2$receivers$deploy_date_time)
wb2$receivers$deploy_date_time[ddt_na] <-
wb2$receivers$glatos_deploy_date_time[ddt_na]
#coalesce recover_date_time and glatos_recover_date_time
attr(wb2$receivers$glatos_recover_date_time, "tzone") <- "UTC"
rdt_na <- is.na(wb2$receivers$recover_date_time)
wb2$receivers$recover_date_time[rdt_na] <-
wb2$receivers$glatos_recover_date_time[rdt_na]
drop_cols_rec <- c("glatos_deploy_date_time", "glatos_timezone",
"glatos_recover_date_time")
wb2$receivers <- wb2$receivers[ , -match(drop_cols_rec,
names(wb2$receivers))]
#sort rows by deploy_date_time
wb2$receivers <- wb2$receivers[with(wb2$receivers,
order(deploy_date_time, glatos_array, station_no)), ]
row.names(wb2$receivers) <- NULL
#Drop unwanted columns from animals
#coalesce release_date_time and utc_release_date_time
attr(wb2$animals$glatos_release_date_time, "tzone") <- "UTC"
ardt_na <- is.na(wb2$animals$utc_release_date_time)
wb2$animals$utc_release_date_time[ardt_na] <-
wb2$animals$glatos_release_date_time[ardt_na]
drop_cols_anim <- c("glatos_release_date_time", "glatos_timezone")
wb2$animals <- wb2$animals[ , -match(drop_cols_anim,
names(wb2$animals))]
#sort animals
#sort rows by deploy_date_time
wb2$animals <- wb2$animals[with(wb2$animals,
order(utc_release_date_time, animal_id)), ]
row.names(wb2$animals) <- NULL
#create animal_id if missing
anid_na <- is.na(wb2$animals$animal_id)
wb2$animals$animal_id[anid_na] <- with(wb2$animals[anid_na, ],
paste0(tag_code_space, "-", tag_id_code))
#Append new sheets if required
if(read_all) {
for(i in 1:length(extra_sheets)){
wb2[extra_sheets[i]] <- wb[extra_sheets[i]]
}
}
}
#-end v1.3----------------------------------------------------------------
#assign classes
wb2$animals <- glatos:::glatos_animals(wb2$animals)
wb2$receivers <- glatos:::glatos_receivers(wb2$receivers)
wb2 <- glatos:::glatos_workbook(wb2)
return(wb2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.