Nothing
#' @title Export Data for a BANTER Model
#'
#' @description Exports data from an AcousticStudy into the format
#' required to run a BANTER model from the "banter" package
#'
#' @param x a \linkS4class{AcousticStudy} object or
#' a list of \linkS4class{AcousticEvent} objects
#' @param dropVars a vector of the names of any variables to remove
#' @param dropSpecies a vector of the names of any species to exclude
#' @param training logical flag whether or not this will be used as a
#' training data set, or a value between 0 and 1 specifying what percent
#' of the data should be used for training (with the rest set aside
#' for testing). If TRUE or greater than 0, must contain species ID.
#' NOTE: if value is not 0, 1, \code{TRUE}, or \code{FALSE}, output
#' will be further split into \code{training} and \code{test} items
#' within the list output
#' @param verbose logical flag to show summary and informational messages
#'
#' @return a list with three items, \code{events}, \code{detectors}, and
#' \code{na}. If value of \code{training} is not 0, 1, \code{TRUE}, or
#' \code{FALSE}, output will be split into \code{training} and
#' \code{test} lists that contain \code{events} and \code{detectors}.
#' \code{events} is a dataframe with two columns. \code{event.id} is a
#' unique identifier for each event, taken from the names of the event
#' list. \code{species} is the species classification, taken from the
#' \code{species} slot labelled \code{id}. \code{detectors} is a list
#' of data frames containing all the detections and measurements. There is
#' one list for each unique detector type found in the \code{detectors} slots
#' of \code{x}. The data frames will only have columns with class
#' \code{numeric}, \code{integer}, \code{factor}, or \code{logical}, and
#' will also have columns named \code{UID}, \code{Id}, \code{parentID},
#' \code{sampleRate}, \code{Channel}, \code{angle}, and \code{angleError},
#' removed so that these are not treated as parameters for the banter random
#' forest model. The dataframes will also have columns \code{event.id} and
#' \code{call.id} added. \code{na} contains the UIDs and Binary File names
#' for any detections that had NA values. These cannot be used in the
#' random forest model and are removed from the exported dataset.
#'
#' @author Taiki Sakai \email{taiki.sakai@@noaa.gov}
#'
#' @examples
#'
#' # setting up example data
#' data(exStudy)
#' exStudy <- setSpecies(exStudy, method='pamguard')
#' banterData <- export_banter(exStudy)
#' # drop some variables
#' banterLess <- export_banter(exStudy, dropVars = c('peak', 'duration'))
#'
#' @importFrom PAMmisc squishList
#' @importFrom dplyr distinct n group_by n_distinct summarise rename left_join bind_rows filter
#' @importFrom knitr kable
#' @importFrom purrr map reduce transpose
#' @export
#'
export_banter <- function(x, dropVars=NULL, dropSpecies=NULL, training=TRUE, verbose=TRUE) {
if(is.list(x)) {
whichStudy <- sapply(x, is.AcousticStudy) # two cases if its a list
whichEvent <- sapply(x, is.AcousticEvent)
# NEED TO UNLIST IF ITS A STUDY BUT THEN PROB SHOULDNT DO ALL IN ONE THING WHATEVER BRO
x <- c(x[whichEvent],
unlist(sapply(x[whichStudy], events)))
}
if(is.AcousticStudy(x)) {
x <- events(x)
}
if(is.AcousticEvent(x)) {
x <- list(x)
names(x) <- x[[1]]@id
}
# From here we need x to be list of AcEv
numDets <- sapply(x, function(e) length(detectors(e)))
noDets <- (numDets == 0)
if(all(noDets)) {
warning('No events had any detections.')
return(NULL)
}
if(any(noDets)) {
warning(sum(noDets), ' events have 0 detections, they will be removed.\n', call. = FALSE)
x <- x[!noDets]
}
sp <- sapply(x, function(e) species(e)$id[1])
sp[is.null(sp)] <- NA_character_
spNa <- sapply(sp, is.na)
if(training && any(spNa)) {
warning('Events ', printN(names(x)[which(spNa)], 6),
' do not have a species ID. Data can only be used for prediction, not model training.', call. = FALSE)
training <- FALSE
}
detNA <- data.frame(UID = character(0), BinaryFile = character(0),
event = character(0), detector = character(0), stringsAsFactors = FALSE)
evName <- sapply(x, id)
if(!(length(unique(evName)) == length(evName))) {
warning('Duplicate event names found, these must be unique for BANTER. Adding numbers to event names.', call. = FALSE)
for(i in unique(evName)) {
evName[evName == i] <- paste0(i, 1:(sum(evName == i)))
}
}
names(x) <- evName
events <- data.frame(event.id = names(x),
stringsAsFactors = FALSE)
# check if enough events to make train data, remove any we dont have for
events$species <- sp
if(training) {
nSpecies <- table(events$species)
# need min 2 to train banter, 3 if also making test
if(training == 1) {
noTrainSpecies <- names(nSpecies)[nSpecies < 2]
noTrainSpecies <- noTrainSpecies[!(noTrainSpecies %in% dropSpecies)]
if(length(noTrainSpecies) > 0) {
warning('Species ', printN(noTrainSpecies, 10),
' do not have enough events to train a banter model (min 2),',
' these will be removed.', call. = FALSE)
dropSpecies <- c(dropSpecies, noTrainSpecies)
}
} else {
noTrainSpecies <- names(nSpecies)[nSpecies < 3]
noTrainSpecies <- noTrainSpecies[!(noTrainSpecies %in% dropSpecies)]
if(length(noTrainSpecies) > 0) {
warning('Species ', printN(noTrainSpecies, 10),
' do not have enough events to train a banter model (min 2',
' to train plus 1 for testing), these will be removed.', call. = FALSE)
dropSpecies <- c(dropSpecies, noTrainSpecies)
}
}
}
if(!is.null(dropSpecies)) {
toDrop <- sp %in% dropSpecies
sp <- sp[!toDrop]
events <- events[!toDrop, , drop=FALSE] # dont coerce to vector ever
x <- x[!toDrop]
}
if(nrow(events) == 0) {
warning('No events left with any detections after removing species.')
return(NULL)
}
# check if any event level measures are present in all data
# or should i get all and fill NA, let banter deal with the NAs and warn you?
# if any present in all get them and cbind that ish to your events
allMeasures <- reduce(
lapply(x, function(e) names(ancillary(e)$measures)),
intersect)
# if(length(measureNames) == 0) {
# allMeasures <- NULL
# } else {
# allMeasures <- reduce(measureNames, intersect)
# }
if(length(allMeasures) > 0 ) {
measureData <- bind_rows(lapply(x[events[['event.id']]], function(e) {
ancillary(e)$measures[allMeasures]
}))
measureData <- dropCols(measureData, dropVars)
events <- cbind(events, measureData)
if(verbose) {
cat('Found ', ncol(measureData), ' event level measures (', printN(colnames(measureData), 10),
') that were present in all events, adding these to your event data.\n', sep='')
}
}
for(e in seq_along(x)) {
thisEv <- x[[e]]
for(d in seq_along(detectors(thisEv))) {
thisDet <- detectors(thisEv)[[d]]
if(is.null(thisDet)) next
thisDet$event.id <- names(x)[e]
thisDet$call.id <- paste0(names(x)[e], thisDet$UID)
if('Channel' %in% colnames(thisDet)) {
thisDet$call.id <- paste0('C', thisDet$Channel, thisDet$call.id)
}
# colsToDrop <- c('UID', 'Id', 'parentUID', 'sampleRate', 'Channel',
# 'angle', 'angleError', 'peakTime')
colsToDrop <- ppVars()$nonModelVars
colsToDrop <- unique(c(colsToDrop, dropVars))
useCols <- lapply(thisDet, class) %in% c('numeric', 'integer', 'factor', 'logical') &
!(colnames(thisDet) %in% colsToDrop) |
colnames(thisDet) %in% c('event.id', 'call.id')
whereNA <- sapply(thisDet[, useCols], is.na)
if(nrow(thisDet) == 1) {
whereNA <- matrix(whereNA, nrow=1)
}
naRow <- apply(whereNA, 1, any)
if(length(naRow) != nrow(thisDet)) browser()
thisNA <- thisDet[naRow, c('UID', 'BinaryFile')]
if(nrow(thisNA) > 0) {
thisNA$event <- names(x)[e]
thisNA$detector <- names(detectors(thisEv))[d]
thisNA$measureNames <- apply(whereNA, 1, function(x) paste0(colnames(thisDet[, useCols])[x], collapse=', '))[naRow]
}
detNA <- rbind(detNA, thisNA)
detectors(thisEv)[[d]] <- thisDet[!naRow, useCols]
}
x[[e]] <- thisEv
}
dets <- lapply(x, function(e) {
tmpDet <- detectors(e)
if(length(tmpDet) == 0) return(NULL)
tmpDet[sapply(tmpDet, function(y) !is.null(y) && ncol(y) > 2)]
})
names(dets) <- NULL
dets <- squishList(unlist(dets, recursive = FALSE))
dets <- lapply(dets, distinct)
if(nrow(detNA) > 0) {
warning('Removing ', nrow(detNA), ' NA values, to see affected UID(s) and ',
'BinaryFile(s) check the "na" item in list output.', call. = FALSE)
}
# From here train/test split and report
# result <- list(events=events, detectors=dets, na=detNA)
result <- bntSplit(list(events=events, detectors=dets), training)
# first two cases return $events and $detectors
if(verbose) {
if(training == 0) {
cat('\nCreated data for ', nrow(result$events), ' events with ',
sum(sapply(result$detectors, nrow)), ' total detections.', sep='')
} else if(training==1) {
cat('\nCreated data for ', nrow(result$events), ' events with ',
sum(sapply(result$detectors, nrow)), ' total detections', sep='')
cat(' and ', length(unique(sp)),
' unique species: ', paste0(unique(sp), collapse =', '), '.',
'\nRe-run with dropSpecies argument if any of these are not desired.', sep='')
print(kable(bntSummaryTable(result)))
} else if(training > 0) { # train-test split case has $train$events and $test$events
cat('\nCreated training data for ', nrow(result$train$events), ' events with ',
sum(sapply(result$train$detectors, nrow)), ' total detections', sep='')
cat(' and ', length(unique(sp)),
' unique species: ', paste0(unique(sp), collapse =', '), '.', sep='')
print(kable(bntSummaryTable(result$train)))
cat('\nCreated test data for ', nrow(result$test$events), ' events with ',
sum(sapply(result$test$detectors, nrow)), ' total detections', sep='')
print(kable(bntSummaryTable(result$test)))
cat('\nRe-run with dropSpecies argument if any of these are not desired.', sep='')
}
cat('\n')
}
# list(events=events, detectors=dets, na=detNA)
result$na <- detNA
result
}
# make a summary table from export_banter output
bntSummaryTable <- function(x) {
bind_rows(lapply(x$detectors, function(y) {
tmp <- left_join(y, x$events, by='event.id')
select(tmp, .data$event.id, .data$species)
}), .id = 'detector') %>% group_by(species) %>%
summarise(Events = n_distinct(.data$event.id),
Detections = n()) %>%
rename(Species = species)
}
# x is result from initial export_banter with $events and $detectors for all
bntSplit <- function(x, trainFrac) {
testFrac <- 1-trainFrac
if(testFrac == 0 ||
trainFrac == 0) {
return(x)
}
split <- split(x$events, x$events$species)
ev <- transpose(lapply(split, function(s) {
if(nrow(s) < 3) {
warning('Species ', s$species[1],
' does not have enough events to train a model (min 3).', call. = FALSE)
return(NULL)
}
if(nrow(s) * testFrac < 1) {
warning('Species ', s$species[1],
' does not have enough events for the chosen test proportion,',
' will round up to 1 event for test set.', call. = FALSE)
}
testNum <- ceiling(nrow(s) * testFrac)
testIx <- sample(1:nrow(s), testNum, replace=FALSE)
trainIx <- (1:nrow(s))[-testIx]
list(train=s[trainIx, ], test=s[testIx, ])
}))
result <- list(train = list(events = bind_rows(ev$train)),
test = list(events = bind_rows(ev$test)))
# result$train$detectors <- x$detectors$event.id %in% train$event.id #psuedodfacode
result$train$detectors <- lapply(x$detectors, function(d) {
tmp <- filter(d, .data$event.id %in% result$train$events$event.id)
# if(nrow(tmp) == 0) return(NULL)
tmp
})
result$test$detectors <- lapply(x$detectors, function(d) {
tmp <- filter(d, .data$event.id %in% result$test$events$event.id)
# if(nrow(tmp) == 0) return(NULL)
tmp
})
result
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.