GLFC: Great Lakes Fishery Commission

Documented in DWEFerror

#' Error Check the Deepwater Electrofishing Data
#'
#' Error check the deepwater electofishing data (including information on the
#' lamprey catch, the lamprey lengths, and the identification of plots that
#' were treated) prior to estimation.
#' @param Dir
#'   A character scalar identifying the path where output files will be
#'   stored.  Use forward slashes, e.g., \code{Dir = "C:/temp/mydir"}.
#' @param Catch
#'   A data frame with the catch data, typically the \code{CAT} output from
#'   \code{\link{DWEFprep}}.
#' @param Lengths
#'   A data frame with the lengths data, typically the \code{LEN} output from
#'   \code{\link{DWEFprep}}.
#' @param Continue
#'   A logical scalar indicating if you want to continue adding to the rtf
#'   document after the function has run (TRUE) or if you want to end the
#'   rtf document after the error checking (FALSE).
#' @param Source
#'   A named character vector with the names of the source directory and files,
#'   c(Dir="", CatchFile="", LengthsFile="", PlotsFile=""),
#'   typically the \code{SOURCE} output from \code{\link{DWEFprep}},
#'   default NULL.
#' @details
#'   If \code{Continue=FALSE}, a rich text file will be saved to the \code{DIR}
#'   directory with error checking text, tables, and figures.  If
#'   \code{Continue=TRUE}, the same rich text file will be started, but left
#'   open, typically to add in more text, tables, and figures generated by
#'   \code{\link{DWEFreport}}.
#' @return
#'   A list with cleaned (errors removed) DWEF catch and lengths in two data
#'   frames (CAT2, LEN2), a character vector of the table references for any
#'   remaining errors (ERR), a character vector of the SOURCE directory and
#'   file names, and a character vector of the output file names (OUT).
#' @importFrom geosphere geodesic_inverse
#' @export

DWEFerror <- function(Dir, Catch, Lengths, Continue, Source=NULL) {

#   Dir <- a$SOURCE["Dir"]
#   Catch <- a$CAT
#   Lengths <- a$LEN
#   Continue <- FALSE
#   Source <- a$SOURCE
#   library(GLFC)
#   library(geosphere)
#   DWEFerror(Dir=a$SOURCE$Dir, Catch=a$CAT, Lengths=a$LEN, Continue=FALSE,
#     Source=a$SOURCE)

  if(length(unique(Catch$year))!=1) stop("There should be just one year",
    " of data in the Catch file.")
  YEAR <- Catch$year[1]

  if(length(unique(Catch$period))!=1) stop("There should be just one value",
    " for the variable period in the Catch file.")
  WHEN <- recode(Catch$period[1], c(0, 1, -1, NA), c("No treatment",
    "Post-treatment", "Pre-treatment", "Pre- and post-treatment"))

  nonmissing <- !is.na(Catch$year) & !is.na(Catch$hab.type) &
    is.element(Catch$hab.type, 1:3) & !is.na(Catch$sl.total) &
    Catch$sl.total > -1
  printvars <- c("sampid", "date", "region", "new.numb", "depth", "hab.type",
    "sl.total", "commentwrap")

  TODAY <- Sys.Date()


  # create rtf document
  docname <- paste0(YEAR, " DWEFES Report ", TODAY, ".doc")
  doc <<- startrtf(file=docname, dir=Dir)
  heading(paste(YEAR, "St. Marys River Larval Sea Lamprey Survey"))
  heading(TODAY, 2)


  heading("Data Exploration & Error Checking", 2)

  para("The figures and tables in this section should be examined for possible",
    " errors or omissions in the larval survey data.")

  # table of input files
  tab <- cbind(Source[c("CatchFile", "LengthsFile", "PlotsFile")],
    c("Current year catches", "Current year lengths",
      "Current year treated hotspots"))
  dimnames(tab) <- list(NULL, c("File", "Information"))
  tabl("Input files used.", row.names=FALSE, TAB=tab)


  # table of output files
  catchout.file.name <- paste0(YEAR, "-CatchesSMR.csv")
  lengthout.file.name <- paste0(YEAR, "-LengthsSMR.csv")
  bpestout.file.name <- paste0(YEAR, "-BPlotEstsSMR.csv")
  catch.all.name <- paste0(Sys.Date(), "-CatchALL.csv")
  length.all.name <- paste0(Sys.Date(), "-LengthALL.csv")
  plot.all.name <- paste0(Sys.Date(), "-BPlotEstsALL.csv")
  pe.all.name <- paste0(Sys.Date(), "-RiverEstsALL.csv")
  outfilenames <- c(catchout.file.name, lengthout.file.name, bpestout.file.name,
    catch.all.name, length.all.name, plot.all.name, pe.all.name)
  if(Continue) {
    tab <- cbind(
      c(docname, outfilenames),
      c("Current year report, this document",
        "Current year catches used in estimation",
        "Current year lengths used in estimation",
        "Current year hotspot estimates",
        "Updated historic catches", "Updated historic lengths",
        "Updated historic plot estimates", "Updated historic river estimates"))
  } else {
    tab <- cbind(docname, "This Document")
  }
  dimnames(tab) <- list(NULL, c("File", "Information"))
  tabl("Output file(s) generated.", row.names=FALSE, TAB=tab)

  # start off with no errors, then whenever an error is encountered that
  # requires fixing, append table numbers to the "errors" object
  errors <- NULL

  # figure of sampling depth vs missingness
  propdeep <- 100*tapply(Catch$hab.type==-9999, Catch$depth, mean)
  pdd <- as.numeric(names(propdeep))
  nsamples <- table(Catch$depth)
  fig <- function() {
    par(mar=c(4, 4, 1, 1), cex=1.2, las=1)
    plot(pdd, propdeep, type="n",
      xlab="Depth  (feet)", ylab="Samples with missing habitat  (%)")
    #my.circles(pdd, propdeep, nsamples, add=TRUE, circle.size.range=c(.1, .8))
    points(pdd, propdeep, cex=5*nsamples/max(nsamples)+1)
    lines(pdd, propdeep, lwd=2)
    abline(h=50, lty=2)
    }
  figu("Relation between sample depth and the percent of samples that had",
    " missing habitat.  Missing habitat data is typically an indication of",
    " depths that are too shallow or too deep to sample.",
    "  A circle was drawn for every depth in the catch data.",
    "  Circle size represents the number of samples at that depth.",
    FIG=fig, h=3, w=4)
  rm(propdeep, pdd, nsamples)

  # missing data
  tab <- Catch[!nonmissing, printvars]
  tab <- tab[order(tab$depth, tab$commentwrap, tab$date), ]
  if(dim(tab)[1]>0) tabl("These rows will be deleted prior to joining the ",
    YEAR, " data with data from previous years.",
    "  (And they are excluded from the graphs and summaries that follow.)",
    "  Check to make sure that these are really cases where no larval sampling",
    " was conducted.  Are there any shallow sites listed as",
    " 'too deep to sample', or any deep sites listed as",
    " 'too shallow to sample'?",
    row.names=FALSE, TAB=tab)

  # just subset the nonmissing data
  smr2 <- Catch[nonmissing, ]
  smr2[smr2==-9999] <- NA
  rm(Catch, nonmissing)

  seldeep <- grep("too deep", smr2$comment, ignore.case=TRUE)
  selshal <- grep("too shallow", smr2$comment, ignore.case=TRUE)
  selboth <- sort(unique(c(seldeep, selshal)))
  tab <- smr2[selboth, printvars]
  if(dim(tab)[1]>0) {
    tabl("Comment says too deep/shallow to sample, but habitat type and/or",
    " sl.total is filled in.  For now, DWEFES will assume that the comment is",
    " correct and delete these rows as well.  If this INCORRECT, then the",
    " comment should be changed to 'No comment'.", row.names=FALSE, TAB=tab)
    }
  if(length(selboth)>0) {
    smr2 <- smr2[-selboth, ]
    }

  # fix an error in the 2012 data
  smr2$new.numb[smr2$sampid==201240282] <- 0

  # a bunch of error checking tables
  # whenever an error is encountered that requires fixing, the table numbers are
  # appended to the "errors" object
  tab <- with(smr2, smr2[is.na(hab.type) | is.na(region) | is.na(sl.total),
    printvars])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("No sample taken - missing data.", row.names=FALSE, TAB=tab)
    }

  tab <- smr2[!(smr2$region %in% c(NA, 1:6)), printvars]
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Region not equal to 1 through 6.", row.names=FALSE, TAB=tab)
    }

  tab <- with(smr2, smr2[is.element(sampid, sampid[duplicated(sampid)]),
    printvars])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Duplicate sample ids.", row.names=FALSE, TAB=tab)
    }

  tab <- with(smr2, smr2[(is.na(inbplot) | inbplot<0.5) & !is.na(new.numb) &
      new.numb>0.5, c(printvars, "inbplot")])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Hotspot number assigned (new.numb), but inbplot not equal to 1.",
      row.names=FALSE, TAB=tab)
    }

  tab <- with(smr2, smr2[!is.na(inbplot) & inbplot>0.5 & (is.na(new.numb) |
      new.numb<0.5), c(printvars, "inbplot")])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Hotspot number not assigned (new.numb), but inbplot=1.",
      row.names=FALSE, TAB=tab)
    }

  n <- dim(smr2)[1]
  keep <- matrix(NA, nrow=0, ncol=2)
  for(i in 1:(n-1)) {
  for(j in (i+1):n) {
    if(with(smr2, abs(longitude[i] - longitude[j]) < 0.0005 &
        abs(latitude[i] - latitude[j]) < 0.0003)) {
      keep <- rbind(keep, c(i, j))
      }
    }}
  lessthan30=0
  if(dim(keep)[1] > 0) {
    for(i in 1:dim(keep)[1]) {
      tab <- smr2[keep[i, ], printvars]
      dist.in.m <- geosphere::geodesic_inverse(smr2[keep[i, 1],
        c("longitude", "latitude")],
        smr2[keep[i, 2], c("longitude", "latitude")])[, "distance"]
      if(dist.in.m < 30) {
        lessthan30 <- lessthan30 + 1
        errors <- append(errors, paste("Table", GLFCenv$tabcount))
        tabl(paste0("These two samples are very close together (",
          round(dist.in.m),
          " m).  Check to see if the same site was sampled twice."),
          row.names=FALSE, TAB=tab)
        }
      }
  }
  if((dim(keep)[1] <= 0) | (lessthan30<1)) {
    para("A test of proximity was applied to all the sample locations. None were within 30 m of each other.")
  }
  rm(n, keep, i, j, lessthan30)

  tab <- with(smr2, smr2[depth>50 | depth<1, printvars])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Extreme depths.", row.names=FALSE, TAB=tab)
    }

  tab <- with(smr2, smr2[!(sample==2), c(printvars, "sample")])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Sample is not listed as initial (sample=2).", row.names=FALSE,
      TAB=tab)
    }

  tab <- with(smr2, smr2[!(is.element(mm, 1:12)) | !(is.element(dd, 1:31)) |
      year!=YEAR, printvars])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Error in sample date.", row.names=FALSE, TAB=tab)
    }

  tab <- Lengths[is.na(match(Lengths$sampid, smr2$sampid)), ]
  tab$sl.larv.adj <- round(tab$sl.larv.adj, 1)
  tab$sl.meta.adj <- round(tab$sl.meta.adj, 1)
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Sample ID in LENGTHS file doesn't match Sample ID in CATCHES file.",
      row.names=FALSE, TAB=tab)
    }

  nll <- dim(Lengths)[1]
  ssc <- with(smr2, sum(sl.total[sl.total>-1], na.rm=TRUE))
  if((ssc - nll) != 0) para("Number of lengths in LENGTHS file, ", nll,
    ", doesn't equal sum of sl.total in CATCHES file, ", ssc, ".")
  rm(nll, ssc)

  # match up catch and length files
  # larvae
  slln <- tapply(Lengths$sl.larv.n, Lengths$sampid, sum)
  slla <- tapply(Lengths$sl.larv.adj, Lengths$sampid, sum)
  idl <- as.numeric(names(slln))
  smr2$sl.larv.n <- rep(0, dim(smr2)[1])
  smr2$sl.larv.n[match(idl, smr2$sampid)] <- slln
  smr2$sl.larv.adj <- rep(0, dim(smr2)[1])
  smr2$sl.larv.adj[match(idl, smr2$sampid)] <- slla
  # metamorphosers
  slmn <- tapply(Lengths$sl.meta.n, Lengths$sampid, sum)
  slma <- tapply(Lengths$sl.meta.adj, Lengths$sampid, sum)
  idm <- as.numeric(names(slmn))
  smr2$sl.meta.n <- rep(0, dim(smr2)[1])
  smr2$sl.meta.n[match(idm, smr2$sampid)] <- slmn
  smr2$sl.meta.adj <- rep(0, dim(smr2)[1])
  smr2$sl.meta.adj[match(idm, smr2$sampid)] <- slma
  # both
  smr2$sl.adjctch <- smr2$sl.larv.adj + smr2$sl.meta.adj
  # look for mismatches
  sel <- (abs(smr2$sl.total - smr2$sl.larv.n - smr2$sl.meta.n) > 0.0001)
  tab <- smr2[sel, c("sampid", "date", "region", "new.numb", "depth",
    "hab.type", "sl.total", "sl.larv.n", "sl.meta.n", "sl.adjctch",
    "sl.larv.adj", "sl.meta.adj", "commentwrap")]
  tab$sl.larv.adj <- round(tab$sl.larv.adj, 1)
  tab$sl.meta.adj <- round(tab$sl.meta.adj, 1)
  if(dim(tab)[1]>0) para(
    "Catch in CATCHES file doesn't equal sum of catch in LENGTHS file.")
  rm(slln, slla, idl, slmn, slma, idm)

  # list of comments
  dmin <- with(smr2, tapply(depth, comment, min))
  dmax <- with(smr2, tapply(depth, comment, max))
  nc <- with(smr2, tapply(!is.na(depth), comment, sum))
  df <- data.frame(names(nc), nc, dmin, dmax)
  names(df) <- c("Comment", "Freq", "MinDepth", "MaxDepth")
  df <- df[order(-df$Freq, -df$MaxDepth, -df$MinDepth), ]
  row.names(df) <- 1:dim(df)[1]
  tab <- df
  tabl("List of comments entered, total frequency, and range of depths.",
    TAB=tab)
  rm(dmin, dmax, nc, df)

  #### too deep / too shallow section WAS here

  tab <- with(smr2, smr2[comment=="Type 3" & hab.type!=3, printvars])
  if(dim(tab)[1]>0) {
    errors <- append(errors, paste("Table", GLFCenv$tabcount))
    tabl("Comment says Type 3, but habitat type disagrees.", row.names=FALSE,
      TAB=tab)
    }

  tab <- with(smr2, tapply(sl.total, list(boat, inbplot), sum))
  tabl("Total catch of sea lampreys by BOAT (rows) and INBPLOT (columns).",
    "  Extreme differences between boats may indicate a malfunction in one of",
    " the boats.", TAB=tab)

  tab <- with(smr2, format(signif(tapply(sl.total, list(boat, inbplot),
    mean), 2)))
  tabl("Mean (unadjusted) density (number per sample) of sea lampreys by",
    " BOAT (rows) and INBPLOT (columns).  Extreme differences between boats",
    " may indicate a malfunction in one of the boats.", TAB=tab)

  # LFRs
  Lengths$len5 <- 5*ceiling(Lengths$length/5)
  lensjustA <- Lengths[Lengths$sl.larv.n>0, ]
  all5 <- seq(min(lensjustA$len5), max(lensjustA$len5), 5)
  maxfreq <- max(tapply(lensjustA$sl.larv.adj, lensjustA$len5, sum))
  m2 <- matrix(NA, nrow=dim(lensjustA)[1], ncol=length(all5),
    dimnames=list(NULL, all5))
  am2 <- m2
  m <- tapply(rep(1, dim(lensjustA)[1]),
    list(seq(dim(lensjustA)[1]), lensjustA$len5), sum)
  m2[, match(dimnames(m)[[2]], all5)] <- m
  m2[is.na(m2)] <- 0
  am <- tapply(lensjustA$sl.larv.adj,
    list(seq(dim(lensjustA)[1]), lensjustA$len5), sum)
  am2[, match(dimnames(am)[[2]], all5)] <- am
  am2[is.na(am2)] <- 0
  fig <- function() {
    par(mar=c(3, 3, 2, 1), mfrow=c(2, 1), oma=c(1, 0, 0, 0), las=1, cex=1.2)
    barplot(m2, ylim=c(0, maxfreq+0.2), xlab="", ylab="", main="Catch")
    abline(h=0)
    barplot(am2, ylim=c(0, maxfreq+0.2), xlab="", ylab="",
      main="Adjusted Catch")
    abline(h=0)
    mtext("Length  (mm)", side=1, outer=TRUE, cex=1.2)
    }
  figu("Length frequency distributions (unadjusted and adjusted for",
    " length-based gear efficiency) of larval sea lampreys captured during",
    " the ", YEAR, " survey.", FIG=fig, h=6, w=4)
  rm(maxfreq, m2, am2, m, am)

  # survey dates
  fig <- function() {
    par(mar=c(4, 4, 1, 1), las=1, cex=1.2)
    options(warn=-1)
    with(smr2, hist(date, breaks=max(as.numeric(date), na.rm=TRUE) -
      min(as.numeric(date), na.rm=TRUE) + 1, freq=TRUE, col="gray",
      xlab="Date", ylab="Number of DWEF samples", main=""))
    options(warn=0)
    }
  figu("Number of deepwater electrofisher samples taken over time during the ",
    YEAR, " larval sea lamprey survey.", FIG=fig, h=3, w=6.5)

  # distribution of metrics
  fig <- function() {
    par(mfrow=c(4, 2), mar=c(3, 4, 2, 1), oma=c(0, 1.5, 0, 0), las=1, cex=1.2)
    with(smr2, hist(dec.time, breaks=ceiling(max(dec.time, na.rm=TRUE)) -
      floor(min(dec.time, na.rm=TRUE)) + 1, freq=TRUE, col="gray",
      xlab="", ylab="", main="Time  (h)"))
    plot(as.factor(smr2[, "period"]), xlab="", ylab="", main="Period")
    plot(as.factor(smr2[, "boat"]), xlab="", ylab="", main="Boat")
    plot(as.factor(smr2[, "sample"]), xlab="", ylab="", main="Sample")
    plot(as.factor(smr2[, "hab.type"]), xlab="", ylab="", main="Hab_Type")
    hist(smr2[, "depth"], freq=TRUE, col="gray", xlab="", ylab="",
      main="Depth  (m)")
    plot(as.factor(smr2[, "region"]), xlab="", ylab="", main="Region")
    plot(as.factor(smr2[, "inbplot"]), xlab="", ylab="", main="InBPlot")
    mtext("Number of database records", side=2, outer=TRUE, las=0, cex=1.5)
    }
  figu("Distributions of other metrics from the ", YEAR,
    " larval sea lamprey survey.\n\n", FIG=fig, newpage="port")

  if(!Continue) endrtf()

  list(CAT2=smr2, LEN2=lensjustA, ERR=errors, SOURCE=Source, OUT=outfilenames)

}