#' Fix systematic problems with the tag registration database.
#'
#' Some tag registrations had problems. The list of all registered
#' tags is obtained from the motus-wts.org server. A cleaned-up
#' database suitable for the find_tags_motus program is generated,
#' including an events table which indicates when tags were activated
#' and inactivated. This allows the database to be used against any
#' receiver dataset, regardless of the dates.
#'
#' @param m data.frame with full tag info from motus, as returned by
#' motusSearchTags()
#'
#' @param s safeSQL object; database where tables "tags" and "events" will
#' be created.
#'
#' @param cleanBI logical; if TRUE, attempt to clean the burst interval;
#' otherwise, leave it as-is. Default: FALSE.
#'
#' @return returns modified m.
#' Side effect:
#' \itemize{
#' \item creates or overwrite the table "tags"
#' in \code{s}, which will contain cleaned tag registrations
#' \item creates or overwrites table "events" in \code{s}, which contains activation / deactivation
#' timestamps for each tag deployment.
#' \item updates the file "tags.csv" in the metadata history repository
#' }
#'
#' The \code{events} table looks like this:
#' \strong{events:}
#' \itemize{
#' \item ts timestamp for event
#' \item tagID motus tag ID for event
#' \item event integer: 1 is activation; 0 is deactivation
#' }
#' @note:
#'
#' For Lotek coded ID tags, the registrations are cleaned up like so:
#' \itemize{
#'
#' \item empirical gap values are replaced with nominal values from
#' the Lotek codeset
#'
#' \item burst intervals are replaced by the mean of nearby good
#' values. Nearby means within 0.05s, and good means no more than
#' 0.0005 s shorter than the longest nearby BI.
#'
#' }
#'
#' Registration problems appear to be mainly from dropped USB packets,
#' when the computer used to make tag recordings has not been able to
#' keep up with the full funcubedongle sampling rate. The result is
#' overly variable estimates of gap values and burst intervals, with a
#' bias downward from the true values (since dropped packets represent
#' lost time).
#'
#' The tag activation events are generated using these items from the
#' motus database, in order of preference (i.e. the first available
#' item is used):
#'
#' \enumerate{
#'
#' \item tsStart - the starting date for a tag deployment record;
#' tsStartCode = 1L
#'
#' \item dateBin - the start of the quarter year in which the tag was
#' expected to be deployed;
#' tsStartCode = 2L
#'
#' \item ts - the date the tag was registered;
#' tsStartCode = 3L
#'
#' }
#'
#' Tag deactivation events are generated using these items, again
#' in order of preference:
#'
#' \enumerate{
#'
#' \item tsEnd - the ending date for a tag deployment; e.g. if a tag
#' was found, or manually deactivated; tsEndCode = 1L
#'
#' \item tsStart for a different deployment of the same tag; tsEndCode = 2L
#'
#' \item tsStart + predictTagLifespan(model, BI) * marginOfError
#' if the tag model is known; tsEndCode = 3L
#'
#' \item tsStart + predictTagLifespan(guessTagModel(speciesID), BI) * marginOfError
#' if the species is known; tsEndCode = 4L
#'
#' \item 90 days if no other information is available; tsEndCode = 5L
#'
#' }
#'
#' Also, as of 6 April 2016, we're using a lifetime of 700 days for tags in the
#' Taylr 2013 project (gulls)
#'
#' @return path to an sqlite database usable by the tag finder; it will have these tables:
#'
#' \strong{tags:}
#' \itemize{
#' \item tagID motus tag ID
#' \item nomFreq nominal frequency, in MHz e.g. 166.38
#' \item offsetFreq offset from nominal, in kHz
#' \item param1 first interpulse gap
#' \item param2 second interpulse gap
#' \item param3 third interpulse gap
#' \item period burst interval
#' \item mfgID lotek ID code
#' \item codeSet Lotek codeset name
#' }
#'
#' @export
#'
#' @author John Brzustowski \email{jbrzusto@@REMOVE_THIS_PART_fastmail.fm}
#'
cleanTagRegistrations = function(m, s, cleanBI = FALSE) {
m = subset(m, projectID != 0)
## add columns we need for lookups
m = as.tbl(m) %>%
mutate (
id = as.integer(mfgID), ## id as bare mfgID, without fractional part and in integer form
bi = round(period * 20)/20 ## burst interval rounded to 0.05 s
)
## cleanup gap values from codesets
codeSets = c("Lotek3", "Lotek4")
## tags not in a known codeset are left alone (these might be beepers, e.g.)
other = m %>% filter_ (~!codeSet %in% codeSets)
## clean up gaps
clean = NULL
for (cs in codeSets) {
lt = ltGetCodeset(cs)
clean = clean %>%
bind_rows(
m %>% filter_(~codeSet==cs) %>%
left_join(lt, by=c("id"="id"))
)
}
if (cleanBI) {
## generate a best estimate of precise BI for each rounded (to 0.1s) value
bi = clean %>% select(bi, period) %>%
group_by(bi) %>%
summarise(avgPer=mean(period))
clean = clean %>% left_join(bi, by="bi")
badBI = clean %>% filter(abs(avgPer-period) > 0.08) %>% collect
if ( badBI %>% nrow > 0) {
cat("Problem; these tags have bad looking BI:\n")
print(badBI)
badBI <<- badBI
stop("go back and revisit tag burst interval cleanup")
}
## copy over better gaps, but not BI for the "uncleaned" database
unclean = clean %>%
mutate(param1 = g1, param2 = g2, param3 = g3) %>%
select(-id, -bi, -g1, -g2, -g3, -avgPer) %>%
bind_rows(other)
clean = clean %>%
mutate(param1 = g1, param2 = g2, param3 = g3, period = avgPer) %>%
select(-id, -bi, -g1, -g2, -g3, -avgPer) %>%
bind_rows(other)
} else {
## just rename g1->param1, g2->param2, g3->param3
clean = clean %>%
mutate(param1 = g1, param2 = g2, param3 = g3) %>%
select(-id, -bi, -g1, -g2, -g3) %>%
bind_rows(other)
}
## sanity check on deployment times. If tsStart and tsEnd are both
## specified in the database, make sure tsStart <= tsEnd
insane = which((! is.na(clean$tsStart)) &
(! is.na(clean$tsEnd)) &
clean$tsStart > clean$tsEnd)
if (length(insane) > 0) {
warning("One or more tag deployments have tsEnd < tsStart; these tags are involved:\n ",
paste(clean$tagID[insane], collapse=", "),
"\nFor these tags, these corrections are applied:\n
- if the start and end times are on the same day, tsEnd will be\n
set to tsStart + 24 hours\n
- otherwise, tsEnd will be calculated based on tag lifespan, as if it had not been provided\n")
sameDay = trunc(clean$tsStart[insane], "day") == trunc(clean$tsEnd[insane], "day")
if (any(sameDay))
clean$tsEnd[insane[sameDay]] = clean$tsStart[insane[sameDay]] + 24*3600
if (!all(sameDay))
clean$tsEnd[insane[! sameDay]] = NA
}
##-------------------- tsStart --------------------
clean$tsStartCode = 1L
noTsStart = is.na(clean$tsStart)
## at worst, we use the registration date
clean$tsStart[noTsStart] = clean$tsSG[noTsStart]
clean$tsStartCode[noTsStart] = 3L
## if a dateBin was specified, use that
haveDateBin = noTsStart & ! is.na(clean$dateBin)
clean$tsStartCode[haveDateBin] = 2L
clean$tsStart[haveDateBin] = subset(clean, haveDateBin) %>%
with( paste(substr(dateBin, 1, 4), (as.numeric(substring(dateBin,6)) - 1) * 3 + 1, 1, sep="-")) %>%
ymd(tz="GMT") %>% as.numeric
##-------------------- tsEnd --------------------
## Compute tsEnd for tag deployments which don't have it (i.e. most)
## If the tag model is specified, use predictTagLifespan;
## If no tag model is specified, use the species to lookup a tag model.
## Otherwise, use 90 days.
clean$tsEndCode = 1L
dayToSec = 24 * 3600
noTsEnd = is.na(clean$tsEnd)
## at worst, use tsStart + 90 days
clean$tsEnd[noTsEnd] = clean$tsStart[noTsEnd] + 90 * dayToSec
## set code for worst case
clean$tsEndCode[noTsEnd] = 5L
## see whether a tag model was specified
haveModel = noTsEnd & ! is.na(clean$model)
clean$tsEndCode[haveModel] = 3L
## when the model is missing, see whether a species was specified
haveSpecies = noTsEnd & ! haveModel & ! is.na(clean$speciesID)
clean$tsEndCode[haveSpecies] = 4L
if (sum(haveSpecies) > 0) {
clean$model[haveSpecies] = guessTagModel(clean$speciesID[haveSpecies])
haveModel = noTsEnd & ! is.na(clean$model)
}
## use the specified or guessed model to estimate lifespan, with 50 % margin of error
lifeSpan = predictTagLifespan(clean$model[haveModel], clean$period[haveModel])
clean$tsEnd[haveModel] = clean$tsStart[haveModel] + lifeSpan * dayToSec * 1.5
## look for overlapping deployments of a given tag, likely due to
## overestimating tsSEnd. When found, make tsEnd for the earlier
## deployment 1s before tsStart for the next. If we just left them
## overlapping like this:
## ------+------------------+---------------+-------------+------ (time)
## | | | |
## tsStart1 tsStart2 tsEnd1 tsEnd2
## then the tag would not be seen as active between tsEnd1 and tsEnd2.
## So we correct that situation to this:
##
## tsEnd1
## |
## ------+-----------------++-----------------------------+------ (time)
## | | |
## tsStart1 tsStart2 tsEnd2
overlap = clean %>% inner_join (clean, by="tagID") %>%
filter (tsStart.x < tsStart.y & tsStart.y <= tsEnd.x)
if (nrow(overlap) > 0) {
fixtsEnd = overlap %>% mutate (tsEnd.x = tsStart.y - 1) %>%
transmute (deployID = deployID.x, tsEnd = tsEnd.x)
clean[match(fixtsEnd$deployID, clean$deployID), "tsEnd"] = fixtsEnd$tsEnd
clean[match(fixtsEnd$deployID, clean$deployID), "tsEndCode"] = 2L
}
## remove duplicates, which are due to multiple deployments
nodups = subset(clean, ! duplicated(tagID))
## clean up the nominal frequency, which has issues upstream:
nodups$nomFreq = round(nodups$nomFreq, 3) ## upstream problems with conversion from lower-precision float
nodups$nomFreq[nodups$nomFreq==166.376] == 166.38 ## from my own registration mistakes
dbWriteTable(s$con, "tmptags", nodups %>% as.data.frame, row.names=FALSE, overwrite=TRUE)
s("delete from tags where tagID in (select distinct tagID from tmptags)")
s("insert into tags select * from tmptags")
s("drop table tmptags")
## re-record tags table summary in the history repo
s("select tagID, manufacturer, model, codeSet, nomFreq, mfgID, period from tags order by tagID") %>%
write.csv(file.path(MOTUS_PATH$METADATA_HISTORY, "tags.csv"), row.names=FALSE)
## now create events table
## This has the columns: ts, tagID, event (0 or 1)
tagOn = clean %>% filter(!is.na(tsStart)) %>% transmute(ts=tsStart, tagID=tagID, event=1L)
tagOff = clean %>% filter(!is.na(tsEnd)) %>% transmute(ts=tsEnd, tagID=tagID, event=0L)
events = tagOn %>% bind_rows(tagOff) %>% arrange(ts)
dbWriteTable(s$con, "tmpevents", events %>% as.data.frame, overwrite=TRUE, row.names=FALSE)
s("delete from events where tagID in (select distinct tagID from tmpevents)")
s("insert into events select * from tmpevents")
s("drop table tmpevents")
s("create index if not exists events_ts on events(ts)")
return (clean)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.