R/cpuefuncs.r

# codeutils::listFunctions("C:/A_CSIRO/Rcode/r4cpue/R/cpuefuncs.r")

#' @import graphics
#' @importFrom grDevices png rgb dev.cur dev.new dev.off
#' @importFrom stats anova as.formula dnorm lm median qnorm qqline qqnorm
#' @importFrom stats quantile sd loess
#' @importFrom utils data read.csv str tail write.table
NULL


#' @title addcount - adds a count of years and avC to a data.frame
#'
#' @description addcount - given an input data.frame containing catch_kg,
#'    Vessel, and Year as variables, this function first sums the catch by
#'    vessel by year, then counts each vessel's occurrence across the
#'    years and each vessel's average annual catch. Stepping through each
#'    vessel it then populates two new columns named 'count' and 'avC'.
#'
#' @param indat - a data.frame containing at least columns named 'catch_kg',
#'    'Vessel' (or, in fact, 'varid'), and 'Year'.
#' @param varid the variable in the data.frame for which the count and
#'    average catch per year is to be estimated. Defaults to 'Vessel',
#'    ready for the SESSF, but could be diver_id for abalone, or any other
#'    factor of interest.
#' @param group the grouping variable, typically Year in the SESSF
#' @param catch the variable name for the catch, catch_kg in the SESSF
#'
#' @return a data.frame made of the input data.frame plus two new columns
#' @export addcount
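#' @examples
#' # a minimal synthetic sketch; the vessel codes and catches are invented,
#' # and the vessel identifiers are assumed to be numeric, as in the SESSF data
#' dat <- data.frame(Year=rep(2010:2012,each=4),Vessel=rep(c(101,102),6),
#'                   catch_kg=runif(12,min=100,max=1000))
#' head(addcount(dat))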
addcount <- function(indat,varid="Vessel",group="Year",catch="catch_kg") {
   outdat <- indat
   outdat$count <- NA
   outdat$avC <- NA
   cbv <- tapply(outdat[,catch],list(outdat[,varid],outdat[,group]),sum,na.rm=TRUE)/1000
   years <- as.numeric(colnames(cbv))
   nyrs <- length(years)
   count <- apply(cbv,1,function(x) length(which(x > 0)))
   avC <- apply(cbv,1,mean,na.rm=TRUE)
   vessels <- as.numeric(names(count))
   numves <- length(vessels)
   for (i in 1:numves) {
      pickv <- which(outdat[,varid] == vessels[i])
      outdat$count[pickv] <- count[i]
      outdat$avC[pickv] <- avC[i]
   }
   return(outdat)
} # end of addcount

#' @title checkDF - checks a CPUE data.frame has all required fields
#'
#' @description checkDF - checks a CPUE data.frame contains all fields
#'    expected for analysis. These are Year, Month, Vessel, catch_kg,
#'    Long, Lat, Depth, DayNight, Zone, Effort, Method, Fishery, LnCE,
#'    and DepCat. If any are missing it identifies them; if not it confirms
#'    that all are present. Beyond checking their class, checkDF does not
#'    consider the contents of each field.
#'
#' @param x - a data.frame of CPUE data used in the SESSF
#'
#' @return only a text message to the console confirming all ok or
#'    reporting which fields are missing
#' @export  checkDF
#'
#' @examples
#' dataf <- as.data.frame(matrix(rnorm(100,mean=10,sd=1),nrow=20,ncol=5))
#' colnames(dataf) <- c("Year","catch_kg","Long","Zone","LnCE")
#' checkDF(dataf)
#' dataf <- as.data.frame(matrix(rnorm(14,mean=10,sd=1),nrow=1,ncol=14))
#' colnames(dataf) <- c("Year","Month","Vessel","catch_kg","Long","Lat",
#'                      "Depth","DepCat","DayNight","Zone","Effort",
#'                      "Method","Fishery","LnCE")
#' checkDF(dataf)
#' head(dataf)
checkDF <- function(x) {  # x <- sps1
   if (!inherits(x,"data.frame")) stop("Input to checkDF must be a data.frame \n\n")
   needed <- c("Year","Month","Vessel","catch_kg","Long","Lat","Depth","DepCat","Zone",
               "Effort","LnCE","DayNight","Method","Fishery")
   dflocation <- match(needed,colnames(x))
   pick <- which(is.na(dflocation))
   if (length(pick) == 0) {
      cat("All fields needed for SESSF work are present \n")
      whatclass <- sapply(x[dflocation],class)
      numbers <- (whatclass[1:11] == "numeric")
      nonnum <- (whatclass[12:14] == "factor")
      if (length(numbers[numbers == FALSE]) == 0) cat("All numeric fields are numeric \n")
      if (length(nonnum[nonnum == FALSE]) == 0) cat("All non-numeric fields are factors \n")
   } else {
      cat("The following fields are missing ",needed[pick],"\n\n")
   }
   if (dim(x)[1] < 100) cat("Very few records, ",dim(x)[1],", available for standardization \n\n")
}  # end of checkDF


#' @title coef.CEout - S3 method to extract the parameter values from a CEout object
#'
#' @description Extract the parameter values from a CEout object from standLnCE
#'   by extending the generic 'coef' function to apply to CEout objects, such
#'   as generated by standLnCE
#' @param inout is a CEout object produced by standLnCE
#' @return a matrix containing the optimum model parameters, including the
#'   log-space coefficients, the lower and upper 95% confidence intervals,
#'   the standard error of each parameter, and p the probability that the
#'   parameter is significantly different from zero. These coefficients
#'   would need to be back-transformed to return to the original scale. This
#'   would use x = exp(coef + (sterr*sterr)/2); note the bias correction
#' @export
#' @examples
#' \dontrun{
#' data(sps)
#' splabel = "Species"
#' labelM <- c("Year","Vessel","Month")
#' mods <- makemodels(labelM)
#' sps1 <- makecategorical(labelM,sps)
#' out <- standLM(mods,sps1,splabel)
#' coef(out)
#' }
coef.CEout <- function(inout) {   # S3 method for objects of class CEout
   ans <- NA
   test <- dim(inout$Parameters$mat)
   if (length(test) > 0) {
      warning("The input object is not a CEout object from standLnCE  \n")
   } else {
      cat("Output from standLnCE  \n")
      tmp <- inout$Parameters$coefficients
      backTran <- exp((tmp[,"Estimate"] + (tmp[,"Std. Error"]^2)/2))
      ans <- cbind(tmp,backTran)
   }
   return(ans)
} # end of coef.CEout

#' @title countgtzero counts how many values in a vector are greater than zero
#'
#' @description countgtzero counts how many values in a vector are greater
#'    than zero; it is designed to be used inside apply.
#' @param invect vector of values
#' @return a single value: the number of values greater than zero
#' @export
#' @examples
#' x <- rnorm(30,mean=1,sd=1)
#' countgtzero(x)
#' cat(30 - countgtzero(x),"out of ",30, " are less than or equal to 0  \n")
#' y <- matrix(x,nrow=6,ncol=5)
#' apply(y,1,countgtzero)
countgtzero <- function(invect) {
   pick <- which(invect > 0)
   return(length(pick))
}

#' @title fishery - generates vectors of year, catch, effort, and cpue
#'
#' @description fishery - generates vectors of year, catch, effort, and cpue
#'    This permits a summary of the basic fishery characteristics within a
#'    data.frame or matrix. It requires the columns to be labelled.
#' @param indat the input data.frame or matrix
#' @param years the name of the year factor; defaults to "Year"
#' @param catch the name of the catch variable; defaults to "catch"
#' @param effort the name of effort variable; defaults to "effort"
#' @param cpue the name of the cpue variable; defaults to "CE"
#' @return a matrix with columns Year, Catch, Effort, the bias-corrected
#'    geometric mean CPUE, the naive geometric mean, the arithmetic mean
#'    CPUE, and the number of records
#' @export
#' @examples
#' \dontrun{
#' data(sps)
#' fishery(sps,years="Year",catch="catch_kg",effort="Effort",cpue="CE")
#' }
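#' # a runnable sketch with invented records; the column names here match the
#' # argument defaults, with cpue computed as catch/effort
#' dat <- data.frame(Year=rep(2001:2003,each=10),catch=runif(30,50,500),
#'                   effort=runif(30,1,10))
#' dat$CE <- dat$catch/dat$effort
#' fishery(dat,years="Year",catch="catch",effort="effort",cpue="CE")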
fishery <- function(indat,years="Year",catch="catch",effort="effort",
                    cpue="CE") {
   yrs <- sort(unique(indat[,years]))
   catches <- tapply(indat[,catch],indat[,years],sum,na.rm=TRUE)
   eff <- tapply(indat[,effort],indat[,years],sum,na.rm=TRUE)
   geo <- tapply(indat[,cpue],indat[,years],geomean)
   ngeo <- exp(tapply(log(indat[,cpue]),indat[,years],mean,na.rm=TRUE))
   arith <- tapply(indat[,cpue],indat[,years],mean,na.rm=TRUE)
   records <- table(indat[,years])
   ans <- cbind(yrs,catches,eff,geo,ngeo,arith,records)
   colnames(ans) <- c("Years","Catch","Effort","BCGeo","NaiveGeo","Arith","Records")
   return(ans)
}  # end of Fishery


#' @title geomean: the geometric mean of a vector corrected for log-normal bias
#'
#' @description Calculate the geometric mean of a vector corrected for log-normal
#'   bias. NAs and zeros are removed from consideration.
#' @param invect is a vector of numbers in linear space.
#' @return The bias-corrected geometric mean of the vector
#' @export geomean
#' @examples
#' x <- c(1,2,3,4,5,6,7,8,9)
#' geomean(x)
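#' # the same value from the explicit bias-corrected formula (all x > 0 here)
#' exp(mean(log(x)) + (sd(log(x))^2)/2)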
## Calculate the geometric mean of a vector corrected for log-normal bias
geomean <- function(invect) {
   pick <- which((invect <= 0.0))
   if (length(pick) == 0) {
      avCE <- mean(log(invect),na.rm=TRUE)
      stdev <- sd(log(invect),na.rm=TRUE)
   } else {
      avCE <- mean(log(invect[-pick]),na.rm=TRUE)
      stdev <- sd(log(invect[-pick]),na.rm=TRUE)
   }
   gmean <- exp(avCE + (stdev^2)/2)
   return(gmean)
}  # end of geomean

#' @title getfact extracts parameter estimates for a given factor from a CEout object
#'
#' @description getfact extracts the parameter estimates for a given factor from a CEout object.
#'    It does this by searching the rownames of the output parameters of the optimum model.
#'    It also checks for interaction terms, which for categorical factors is the same as
#'    determining a trend of the two factors relative to each other. e.g. for Zone:Month
#'    the outcome is the monthly trend for each zone.
#'
#' @param inmat generally this will be a CEout object but it can be
#'    a matrix of coefficients, an lm object, or a gam object
#' @param invar the model variable whose parameters are wanted
#'
#' @return a matrix containing the parameters for invar
#' @export getfact
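#' @examples
#' # a small sketch using an lm object fitted to invented data; only the
#' # 'Year' parameters are extracted and back-transformed
#' dat <- data.frame(Year=factor(rep(2015:2018,each=25)),
#'                   LnCE=rnorm(100,mean=1,sd=0.3))
#' model <- lm(LnCE ~ Year,data=dat)
#' getfact(model,"Year")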
getfact <- function(inmat,invar) {  # inmat=model; invar="Year"
   allowable <- c("matrix","CEout","lm","gam")
   whatclass <- class(inmat)
   if (length(whatclass) > 1) {
      if ("gam" %in% whatclass) {
         whatclass <- "gam"
      } else if ("lm" %in% whatclass) {
         whatclass <- "lm"
      } else if ("matrix" %in% whatclass) {
         whatclass <- "matrix"
      } else {
         whatclass <- "no"
      }
   }
   if (whatclass %in% allowable) {
      if (whatclass == "matrix") pardat <- inmat
      if (whatclass == "CEout") pardat <- inmat$Parameters$coefficients
      if (whatclass == "lm") pardat <- summary(inmat)$coefficients
      if (whatclass == "gam") pardat <- summary(inmat)$p.table
   } else {
      stop("Input matrix is not, in fact, a matrix of coefficients")
   }
   pick <- grep(invar,rownames(pardat))
   if (length(pick) == 0) stop("The selected factor is not in the selected model!  \n")
   startmat <- pardat[pick,]               # isolate rows containing variable in question
   pickI <- grep(":",rownames(startmat))  # check for interaction terms and split off
   if (length(pickI) > 0) {               # if present
      intermat <- startmat[pickI,]
      startmat <- startmat[-pickI,]
   }
   lnce <- startmat[,"Estimate"]         # first do the non-interaction terms
   se <- startmat[,"Std. Error"]
   tval <- startmat[,"t value"]
   Prob <- startmat[,"Pr(>|t|)"]
   backtran <- exp(lnce + (se * se)/2)
   ans <- cbind(c(1.0,backtran),c(0,se),c(0,lnce),
                scaleCE(c(1.0,backtran),avCE=1.0),
                c(NA,tval),c(NA,Prob)
   )
   colnames(ans) <- c("Coeff","SE","LogCE","Scaled","t value","Prob")
   rownames(ans)[1] <- invar
   norigvar <- dim(ans)[1]
   if (length(pickI) > 0) {              # do interaction terms if they exist
      terms <- unlist(strsplit(rownames(intermat),":"))
      nrow <- dim(intermat)[1]
      firstvar <- grep(invar,terms)
      nfirst <- length(unique(terms[firstvar]))
      nsecond <- length(unique(terms[-firstvar]))
      if ((nfirst * nsecond) != nrow)
         stop(paste0("something wrong with the interactions terms for ",invar," \n",
                     "the first and second variables are not balanced"))
      start <- 1
      for (i in 1:(nrow/nfirst)) {  # i <- 1
         finish <- (start+nfirst-1)
         tmp <- intermat[start:finish,]
         lnce <- tmp[,"Estimate"]         # now the interaction terms
         se <- tmp[,"Std. Error"]
         backtran <- exp(lnce + (se * se)/2)
         tmpans <- cbind(c(1.0,backtran),c(0,se),c(0,lnce),scaleCE(c(1.0,backtran),avCE=1.0))
         ans <- rbind(ans,tmpans)
         start <- finish + 1
      }
   }
   return(ans)
} # end of getfact

#' @title getrmse calculates the rmse of the input 'invar' series
#'
#' @description getrmse calculates the rmse of the input invar series (defaults
#'     to 'cpue') against an input 'year' time series. This is primarily
#'     designed to generate a more feasible estimate of the intrinsic
#'     variability of a cpue time-series that may be obtained from a cpue
#'     standardization
#'
#' @param indat the matrix, spmdat, or data.frame containing both a 'year'
#'     column and an invar column (default to 'cpue')
#' @param invar the column whose rmse is wanted; defaults to 'cpue'
#' @param inyr the column that points to the year name
#' @return a list of the rmse and the loess predicted values of the invar for
#'     each year in the time-series
#' @export
#'
#' @examples
#' \dontrun{
#' year <- 1986:1995
#' cpue <- c(1.2006,1.3547,1.0585,1.0846,0.9738,1.0437,0.7759,1.0532,1.284,1.3327)
#' dat <- as.matrix(cbind(year,cpue))
#' getrmse(dat,invar="cpue")  # should be 0.08856596
#' getrmse(dat,invar="cpue")$rmse
#' }
getrmse <- function(indat,invar="cpue",inyr="year"){  # indat=fish; invar="cpue"; inyr="year"
   if(iscol(inyr,indat) & iscol(invar,indat)) {
      nyr <- dim(indat)[1]
      predictedCE <- rep(NA,nyr)
      varloc <- grep(invar,colnames(indat))
      nvar <- length(varloc)
      if (nvar > 1) {
         obsvar <- rep(NA,nyr)
         for (i in 1:nvar) {
            pick <- which(indat[,varloc[i]] > 0)
            obsvar[pick] <- indat[pick,varloc[i]]
         }
      } else {
         obsvar <- indat[,varloc]
      }
      picky <- which(obsvar > 0)
      model <- loess(obsvar[picky] ~ indat[picky,inyr])
      predictedCE[picky] <- model$fitted
      rmse <- sqrt(sum(model$residuals^2)/model$n)
      return(list(rmse=rmse,predictedCE=predictedCE))
   } else {
      cat("Input data should contain both 'year' and 'cpue'  \n")
   }
} # end of getrmse

#' @title getsps reads in the species catch and effort data
#'
#' @description getsps reads in the species catch and effort data that was
#'     previously extracted from the Oracle data base.
#'
#' @param filename the names of the csv file to be read
#' @param datadir the path to the directory containing the csv files
#' @param depthclass the class width of the depth categories, defaults to 25
#' @param txtout report the filename to the screen? default = FALSE
#'
#' @return invisibly returns the sps data.frame ready for selection
#' @export
#'
#' @examples
#' \dontrun{
#' print("need an example that will work")
#' }
getsps <- function(filename,datadir,depthclass=25,txtout=FALSE) {
   infile <- paste0(datadir,filename)
   if (!file.exists(infile)) {
      stop(paste0("Data file ",filename," does not exist in ",datadir))
   } else {
      if (txtout) cat("Read from ",filename,"\n")
      sps <- read.csv(infile, header=TRUE)
   }
   sps$CE <- NA
   pick <- which((sps$catch_kg > 0) & (sps$Effort > 0))
   sps$CE[pick] <- sps$catch_kg[pick]/sps$Effort[pick]
   sps$LnCE <- NA
   sps$LnCE[pick] <- log(sps$CE[pick])
   sps$DepCat <- NA
   sps$DepCat <- trunc(sps$Depth/depthclass) * depthclass
   return(invisible(sps))
}  # end of getsps

#' @title getStand - extracts major results from a CEout object
#'
#' @description getStand - extracts the major results from a CEout object,
#'    which is generated using standLM. The main results include the year, the
#'    geometric mean (the first column of the standardization), the optimum
#'    model, the standard error of the optimum, and the lower and upper
#'    confidence intervals
#'
#' @param x the CEout object derived from standLM
#' @param P the probability level for the confidence intervals, expressed as
#'    a proportion; defaults to 0.95
#'
#' @return matrix containing the primary results
#' @export getStand
#'
#' @examples
#' \dontrun{
#' data(ab)
#' splabel = "Mollusc"
#' labelM <- c("year","diverID","month","block","boatID")
#' ab1 <- makecategorical(labelM,ab)
#' mods <- makemodels(labelM)
#' out <- standLM(mods,ab1,splabel)
#' round(out$Results,4)
#' round(out$WhichM,4)
#' getStand(out)
#' }
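#' # a minimal numeric sketch of the log-normal bounds used below, assuming
#' # P = 0.95 (so z is about 1.96) and an invented optimum and standard error
#' opt <- 1.2; se <- 0.1; z <- -qnorm((1 - 0.95)/2)
#' c(lower=opt*exp(-z*se),upper=opt*exp(z*se))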
getStand <- function(x,P=0.95) {
   if (!inherits(x,"CEout")) stop("input to getStand is not a CEout object")
   yrs <- as.numeric(rownames(x$Results))
   geo <- x$Results[,1]
   opt <- x$Results[,x$Optimum]
   se <- x$StErr[,x$Optimum]
   Zmult <- -qnorm((1 - P)/2.0)  # P is a proportion, so P=0.95 gives z of about 1.96
   lower <- opt * exp(-Zmult * se)
   upper <- opt * exp(Zmult * se)
   ans <- cbind(yrs,geo,opt,se,lower,upper)
   colnames(ans) <- c("year","geom","optimum","sterr","LCI","UCI")
   return(ans)
}  # end of getStand

#' @title iscol a utility to determine whether a column is present in a matrix
#'
#' @description iscol is a utility to determine whether a named column is
#'     present in a given matrix or data.frame.
#'
#' @param incol the name of the column; defaults to "year" as an example
#' @param inmat the matrix or data.frame within which to search for incol
#'
#' @return TRUE or FALSE
#' @export
#'
#' @examples
#' \dontrun{
#' test <- matrix(c(1,2,3,4),nrow=2,ncol=2,dimnames=list(1:2,c("year","Catch")))
#' print(test)
#' iscol("year",test)
#' iscol("Catch",test)
#' iscol("catch",test)
#' iscol("ages",test)
#' }
iscol <- function(incol="year",inmat) { # incol="ages"; inmat=dat
   if (length(grep(incol,colnames(inmat))) < 1) return(FALSE)
   else return(TRUE)
}

#' @title makecategorical - converts given variables into categorical factors
#'
#' @description Given a list of variables, as character strings, this function
#'   converts each variable into a factor after checking no mistake has been
#'   made with the spelling of the variable name. A copy of the data is made
#'   so that the non factored variables can be retained.
#' @param labelModel is the set of variables from the data.frame that are to
#'   be converted into categorical factors.
#' @param indat is the data.frame that is to be analysed in the standardization
#' @return a copy of the database with the selected variables converted into
#'     factors
#' @export makecategorical
#' @examples
#' \dontrun{
#' data(sps)
#' labelM <- c("Year","Vessel","Month")
#' sps1 <- makecategorical(labelM,sps)
#' }
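#' # a runnable sketch using an invented data.frame
#' dat <- data.frame(Year=rep(2018:2020,each=4),Vessel=rep(1:4,3),LnCE=rnorm(12))
#' str(makecategorical(c("Year","Vessel"),dat))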
makecategorical <- function(labelModel,indat) {
   Interact <- grep(":",labelModel)
   nInteract <- length(Interact)
   numvars <- length(labelModel) - nInteract
   for (fac in 1:numvars) {
      if (length(indat[,labelModel[fac]]) > 0) {
         indat[,labelModel[fac]] <- factor(indat[,labelModel[fac]])
      } else { warning(paste0("Factor name ",labelModel[fac],
                              " does not appear in data.frame"))
      }
   }
   return(indat)
} # end of makecategorical

#' @title makedatasum generates a table summarizing information about data
#'
#' @description makedatasum uses the two input data.frames, plus the
#'     standardization output, to generate a table summarizing the data. This
#'     includes the Total Catch of the species, the number of records used,
#'     the catch used in the analyses, the number of Vessels, the GeoMean,
#'     the optimum standardized mean estimates, their StDev, the total
#'     catch <30Kg, and the proportion of catch <30Kg.
#'
#' @param InData the unselected species data, generally sps
#' @param InData1 the data.frame after selection has been applied, sps1
#' @param out the output from standLM or dosingle, usually out
#'
#' @return a matrix of each summary variable by year
#' @export
#'
#' @examples
#' \dontrun{
#' data(sps)
#' sps$CE <- NA
#' sps$LnCE <- NA
#' pick <- which((sps$catch_kg > 0) & (sps$Effort > 0))
#' sps$CE[pick] <- sps$catch_kg[pick]/sps$Effort[pick]
#' sps$LnCE[pick] <- log(sps$CE[pick])
#' answer <- selectdata(sps,dependent="LnCE",Ldepth=0,Udepth=600,
#'                      startyear=1986,finalyear=2015,inzones=c(10,20,30),
#'                      method=c("TW","TDO"),fishery="SET")
#' sps1 <- answer$sps1
#' labelM <- c("Year","Vessel","Month")
#' mods <- makemodels(labelM)
#' sps2 <- makecategorical(labelM,sps1)
#' out <- standLM(mods,sps2,"Species")
#' makedatasum(sps,sps1,out)
#' }
makedatasum <- function(InData, InData1,out) { # InData=sps; InData1=sps1
   years <- sort(unique(InData1$Year))
   if (is.factor(InData1$Year)) years <- as.numeric(levels(years))
   numrow <- length(years)   # Number of years of observations in sps
   firsty <- min(years)
   lasty <- max(years)
   pick <- which((InData$Year >= firsty) & (InData$Year <= lasty))
   if (length(pick) > 0) { InData <- droplevels(InData[pick,]) }
   # Set up matrix for data characterization
   cols <- c("Total","N", "Catch", "Vess","GeoM","Opt",
             "StDev","C<30Kg","P<30Kg")
   DataC <- matrix(nrow=numrow, ncol=length(cols),dimnames = list(years, cols))
   DataC[,1] <- tapply(InData$catch_kg,InData$Year,sum,na.rm=T)/1000.0   # Total Catch
   DataC[,2] <- tapply(InData1$catch_kg,InData1$Year,length)    # Number of records used
   for (n in 1:numrow) {  # Number of vessels in sps1
      DataC[n,4] <- length(unique(InData1[InData1$Year==years[n],]$Vessel))
   }
   means <- tapply(InData1$LnCE,InData1$Year,mean,na.rm=T)
   stdev <- tapply(InData1$LnCE,InData1$Year,sd,na.rm=T)
   DataC[,5] <- exp(means + (stdev*stdev)/2)
   DataC[,3] <-tapply(InData1$catch_kg,InData1$Year,sum,na.rm=T)/1000.0
   pick <- which(InData1$catch_kg <= 30)
   tmpdat <- InData1[pick,]
   tmpdatout <- tapply(tmpdat$catch_kg,tmpdat$Year,sum,na.rm=T)/1000.0
   pickrec <- match(as.numeric(names(tmpdatout)),as.numeric(rownames(DataC)))
   DataC[pickrec,8] <- tmpdatout
   DataC[,9] <- DataC[,8]/DataC[,3]
   DataC[,6:7] <- cbind(out$Results[,out$Optimum],out$StErr[,out$Optimum])
   return(DataC)
}  # End of makedatasum



#' @title makeLabel Convert a vector of numbers or strings into a single label
#'
#' @description makeLabel Convert a vector of numbers or strings into a single
#'   label
#' @param invect the vector of numbers or strings to be converted into a
#'   single string.
#' @param insep defaults to '_' but can be any selected character used to
#'   separate each value
#' @return a character string containing the invect as a single string.
#' @export makeLabel
#' @examples
#' x <- c(1,2,3,4,5)
#' makeLabel(x)
#' makeLabel(x,"-")
makeLabel <- function(invect,insep="_") {
   invect <- as.character(invect)
   invect <- invect[nchar(invect) > 0]
   nlab <- length(invect)
   ans <- invect[1]
   if (nlab > 1) for (i in 2:nlab) ans <- paste(ans, invect[i], sep=insep)
   return(ans)
}  # end of makeLabel

#' @title makemodels - makes a list of models with their labels
#'
#' @description Makes a list of models with their labels from an input vector
#'   of names. Can be used to prepare the input models ready for standLM.
#' @param labelModel is the set of variables from the data.frame that are to
#'   be included as sequentially added models. Thus, if labelModel is set equal
#'   to c('Year','Vessel') the first model will be LnCE ~ Year and the second
#'   will be LnCE ~ Year + Vessel. labelModel also provides the headings
#'   for the tables in the output list from standLnCE
#' @param dependent - the name of the dependent variable in your dataset;
#'   defaults to 'LnCE'
#' @return a list of formula representing the set of models to be fitted
#'   along with the labels for each model
#' @export makemodels
#' @examples
#' labelM <- c("Year","Vessel","Month")
#' makemodels(labelM)
makemodels <- function(labelModel,dependent="LnCE") {
   numvars <- length(labelModel)
   interterms <- grep(":",labelModel)
   ninter <- length(interterms)
   mods <- vector("list",(numvars+1))
   form <- paste0(dependent," ~ ",labelModel[1])
   mods[[1]] <- assign(paste0("ff",1),as.formula(form))
   if (numvars > 1){
      if (ninter > 0) {
         for (i in 2:(numvars - ninter)) {
            form <- paste0(form," + ",labelModel[i])
            mods[[i]] <- assign(paste0("ff",i),as.formula(form))
         }
         for (i in interterms) {
            interform <- paste0(form," + ",labelModel[i])
            mods[[i]] <- assign(paste0("ff",i),as.formula(interform))
         }
      } else {
         for (i in 2:numvars) {
            form <- paste0(form," + ",labelModel[i])
            mods[[i]] <- assign(paste0("ff",i),as.formula(form))
         }
      }
   }
   mods[[(numvars+1)]] <- labelModel
   return(mods)
} # end of makemodels


#' @title properties - used to check a data.frame before standardization
#'
#' @description properties - used to check a data.frame before
#'     standardization. It tabulates the number of NAs and the number of
#'     unique values for each variable and finds the minimum and maximum
#'     of the numeric variables
#' @param indat the data.frame containing the data fields to be used
#'     in the subsequent standardization
#' @param dimout determines whether or not the dimensions of the data.frame
#'     are printed to the screen; defaults to FALSE
#' @return a data.frame with the rows being each variable from the
#'     input data.frame and the columns being the number of NAs, the
#'     number of unique values, and minimum and maximum (where possible).
#' @export properties
#' @examples
#' \dontrun{
#' data(ab)
#' properties(ab)
#' }
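#' # a runnable sketch using an invented data.frame with an NA and a factor
#' dat <- data.frame(Year=rep(2019:2020,each=5),catch=c(runif(9,10,100),NA),
#'                   Vessel=factor(rep(c("A","B"),5)))
#' properties(dat)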
properties <- function(indat,dimout=FALSE) {
   if(dimout) print(dim(indat))
   isna <- sapply(indat,function(x) sum(is.na(x)))
   uniques <- sapply(indat,function(x) length(unique(x)))
   clas <- sapply(indat,class)
   numbers <- c("integer","numeric")
   pick <- which(clas %in% numbers)
   minimum <- numeric(length(uniques))
   maximum <- minimum
   minimum[pick] <- sapply(indat[,pick],min,na.rm=TRUE)
   maximum[pick] <- sapply(indat[,pick],max,na.rm=TRUE)
   index <- 1:length(isna)
   props <- as.data.frame(cbind(index,isna,uniques,clas,minimum,
                                maximum,t(indat[1,])))
   colnames(props) <- c("Index","isNA","Unique","Class","Min",
                        "Max","Example")
   return(props)
} # end of properties

#' @title quants calculates a standard set of quantiles of a vector
#'
#' @description quants calculates the c(0.025,0.05,0.5,0.95,0.975) quantiles
#'    of a vector; it is designed to be used with apply
#' @param invect vector of values
#' @return a vector of the c(0.025,0.05,0.5,0.95,0.975) quantiles
#' @export
#' @examples
#' x <- rnorm(100,mean=5,sd=1)
#' quants(x)
#' y <- matrix(x,nrow=10,ncol=10)
#' apply(y,2,quants)
quants <- function(invect) {
   ans <- quantile(invect,probs = c(0.025,0.05,0.5,0.95,0.975),na.rm=T)
   return(ans)
}

#' @title saveresults sends the tables generated to a text file in resultdir
#'
#' @description saveresults saves all the tables generated to a text file
#'     named after the species and fishery and puts it into resultdir. The
#'     outputs include, for all data relating to the species: catch-by-method,
#'     catch-by-zone, and catch-by-fishery; then, only for the data selected
#'     for the standardization: catch-by-method, catch-by-zone, and
#'     catch-by-DepCat; and also the number of observations modified by each
#'     of the selection criteria, the data summary, and the raw output from
#'     the standardization. Customized for use with SESSF data-sets
#'
#' @param sps the data.frame containing all data relating to the species
#' @param answer the list obtained from 'selectdata' containing the data.frame
#'     of the selected data, and the nobs table containing how different
#'     selection criteria affect the number of observations.
#' @param out the output from standLM or dosingle containing the
#'     standardization
#' @param mods the mods defined for the standardization
#' @param splabel the name of the species and fishery
#' @param resdir the path to the results directory
#'
#' @return invisibly, the data summary from makedatasum; it also writes a text
#'     file to the results directory.
#' @export
#'
#' @examples
#' \dontrun{
#' print("this will take some doing")
#' }
saveresults <- function(sps,answer,out,mods,splabel,resdir) {
   cmeth <- tapply(sps$catch_kg,list(sps$Year,sps$Method),sum,na.rm=T)/1000
   czone <- tapply(sps$catch_kg,list(sps$Year,sps$Zone),sum,na.rm=T)/1000
   cfishery <- tapply(sps$catch_kg,list(sps$Year,sps$Fishery),sum,na.rm=T)/1000
   sps1 <- answer$sps1
   nobs <- answer$nobs
   LimCfishery <- tapply(sps1$catch_kg,list(sps1$Year,sps1$Fishery),sum,na.rm=T)/1000
   LimCZone <- tapply(sps1$catch_kg,list(sps1$Year,sps1$Zone),sum,na.rm=T)/1000
   LimCDep <- tapply(sps1$catch_kg,list(sps1$Year,sps1$DepCat),sum,na.rm=T)/1000
   datasum <- makedatasum(sps,sps1,out)
   fileout <- paste(resdir,splabel,".txt",sep="")
   if (file.exists(fileout)) file.remove(fileout)
   sink(fileout)
   print(splabel,quote=F)
   cat("\n")
   print(nobs)
   cat("\n")
   print("Models Used",quote=F)
   cat("\n")
   print(mods)
   cat("\n")
   print("Fishery Summary",quote=F)
   cat("\n")
   print(datasum)
   cat("\n")
   print("Catch by Year and Method for All Data",quote=F)
   cat("\n")
   print(cmeth)
   cat("\n")
   print("Catch by Year and Zone All data",quote=F)
   cat("\n")
   print(czone)
   cat("\n")
   print("Catch by Year and by Fishery All data",quote=F)
   cat("\n")
   print(cfishery)
   cat("\n")
   print("Catch by Year and Fishery Selected data",quote=F)
   cat("\n")
   print(LimCfishery)
   cat("\n")
   print("Catch by Year and by Zone, selected data",quote=F)
   cat("\n")
   print(LimCZone)
   cat("\n")
   print("Catch by Year and by DepCat, selected data",quote=F)
   cat("\n")
   print(LimCDep)
   cat("\n")
   print("Standardization Output",quote=FALSE)
   cat("\n")
   for (i in 1:7){
      print(names(out)[i],quote=FALSE)
      cat("\n")
      print(out[[i]])
      cat("\n")
   }
   sink()
   return(invisible(datasum))
}  # end of saveresults


#' @title scaleCE - Function to scale a vector of CPUE to a mean of one or avCE
#'
#' @description Function to scale a vector of CPUE to a mean of one or avCE.
#'   The use of a mean of one means that visual comparisons between different
#'   time-series becomes visually simplified. The avCE option could be used
#'   to scale the CPUE to the average geometric mean - so as to put it on
#'   the nominal scale
#' @param invect a vector of linear scale CPUE
#' @param avCE defaults to one but can be set to a particular value
#' @return the time-series of CPUE re-scaled to a mean of one or avCE
#' @export scaleCE
#' @examples
#' ce <- c(0.4667187,1.2628564,0.8442146,0.9813531, 0.5554076,0.7426321)
#' scaleCE(ce)
#' scaleCE(ce,100.0)
scaleCE <- function(invect,avCE=1.0) {
   average <- mean(invect,na.rm=TRUE)
   ans <- avCE * (invect/average)
   return(ans)
} # end of scaleCE

#' @title selectdata - select records from the sessf data
#'
#' @description selectdata - select records from the sessf data based upon
#'    whether the dependent variable is empty or not, depth range, years,
#'    SESSF zones, fishing method, and fishery. If any are omitted then
#'    all records will be used
#'
#' @param indat the data set to be filtered - usually sps
#' @param dependent - the dependent variable in the standardization,
#'    defaults to LnCE
#' @param Ldepth the lower depth bound - defaults to 0
#' @param Udepth the upper depth bound, defaults to 1000
#' @param startyear the first year for inclusion, defaults to 0, which
#'    implies the use of all years present
#' @param finalyear the final year for inclusion, defaults to 0
#' @param inzones the sessf zones to be considered, defaults to NA,
#'    which implies either ORzone or sharkzone will be used
#' @param ORzone if the species uses the Orange Roughy zones rather than the
#'    SESSF zones then use this input variable; defaults to NA
#' @param sharkzone if the species uses the Shark Regions rather than the
#'    SESSF zones then use this input variable; defaults to NA
#' @param method the fishing method to be considered, defaults to NA,
#'    which implies all methods to be included
#' @param fishery the three letter acronym for each fishery to be
#'    considered, defaults to NA, which means include all.
#'
#' @return a list made up of the filtered data set named 'sps1', and
#'    a matrix containing the effect of the filtering on the numbers of
#'    observations; named 'nobs'.
#' @export selectdata
#'
#' @examples
#' \dontrun{
#' data(sps)
#' sort(unique(sps$Year))
#' print(range(sps$Depth))
#' sps$CE <- NA
#' sps$LnCE <- NA
#' pick <- which((sps$catch_kg > 0) & (sps$Effort > 0))
#' sps$CE[pick] <- sps$catch_kg[pick]/sps$Effort[pick]
#' sps$LnCE[pick] <- log(sps$CE[pick])
#' out <- selectdata(sps)
#' sps1 <- out$sps1
#' nobs <- out$nobs
#' nobs
#' }
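#' # a runnable sketch with a minimal invented data.frame; only the depth and
#' # year selections are exercised, all other criteria stay at their defaults
#' dat <- data.frame(Year=rep(2001:2004,each=5),Depth=runif(20,10,900),
#'                   catch_kg=runif(20,5,100),Effort=runif(20,1,10))
#' dat$LnCE <- log(dat$catch_kg/dat$Effort)
#' ans <- selectdata(dat,Ldepth=0,Udepth=600,startyear=2002,finalyear=2004)
#' ans$nobs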
selectdata <- function(indat,dependent="LnCE",Ldepth=0,Udepth=1000,
                       startyear=0,finalyear=3000,inzones=NA,ORzone=NA,
                       sharkzone=NA,method=NA,fishery=NA) {
   nobs <- matrix(0,nrow=4,ncol=7)
   colnames(nobs) <- c("Total","NoCE","Depth","Years","Zones","Method","Fishery")
   rownames(nobs) <- c("Records","Difference","Catch","Difference")
   nobs[1,"Total"] <- dim(indat)[1]
   nobs[3,"Total"] <- sum(indat$catch_kg,na.rm=TRUE)/1000
   if (!(dependent %in% colnames(indat)))
      stop(paste0(dependent," not defined in the input data.frame"))
   if (dependent == "LnCE") {
      empty <- which((is.na(indat[,dependent])))
   }
   if (dependent == "catch_kg") {
      empty <- which((is.na(indat[,dependent])) | (indat[,dependent] <= 0))
   }
   if (length(empty) > 0) {sps1 <- indat[-empty,] } else { sps1 <- indat }
   nobs[1,"NoCE"] <- dim(sps1)[1]
   nobs[3,"NoCE"] <- sum(sps1$catch_kg,na.rm=TRUE)/1000
   pick <- which((sps1$Depth >= Ldepth) & (sps1$Depth <= Udepth))
   if (length(pick) > 0) {sps1 <- sps1[pick,] }
   nobs[1,"Depth"] <- dim(sps1)[1]
   nobs[3,"Depth"] <- sum(sps1$catch_kg,na.rm=TRUE)/1000
   if ((startyear > 0) & (finalyear > 0)) {
      pick <- which((sps1$Year >= startyear) & (sps1$Year <= finalyear))
      if (length(pick) > 0) sps1 <- sps1[pick,]
   }
   nobs[1,"Years"] <- dim(sps1)[1]
   nobs[3,"Years"] <- sum(sps1$catch_kg,na.rm=TRUE)/1000
   if (!is.na(inzones)[1]) {
      pick <- which(sps1$Zone %in% inzones)
      if (length(pick) > 0) sps1 <- sps1[pick,]
   }
   if (!is.na(ORzone)[1]) {
      pick <- which(sps1$ORzone %in% ORzone)
      if (length(pick) > 0) sps1 <- sps1[pick,]
   }
   if (!is.na(sharkzone)[1]) {
      pick <- which(sps1$SharkRegion %in% sharkzone)
      if (length(pick) > 0) sps1 <- sps1[pick,]
   }
   nobs[1,"Zones"] <- dim(sps1)[1]
   nobs[3,"Zones"] <- sum(sps1$catch_kg,na.rm=TRUE)/1000
   if (!is.na(method)[1]) {
      pick <- which(sps1$Method %in% method)
      if (length(pick) > 0) sps1 <- sps1[pick,]
   }
   nobs[1,"Method"] <- dim(sps1)[1]
   nobs[3,"Method"] <- sum(sps1$catch_kg,na.rm=TRUE)/1000
   if (!is.na(fishery)[1]) {
      pick <- which(sps1$Fishery %in% fishery)
      if (length(pick) > 0) sps1 <- sps1[pick,]
   }
   sps1 <- droplevels(sps1)
   nobs[1,"Fishery"] <- dim(sps1)[1]
   nobs[3,"Fishery"] <- sum(sps1$catch_kg,na.rm=TRUE)/1000
   nobs[2,1] <- 0
   nobs[2,2:7] <- nobs[1,1:6] - nobs[1,2:7]
   nobs[4,1] <- 0
   nobs[4,2:7] <- nobs[3,1:6] - nobs[3,2:7]
   answer <- list(sps1=sps1,nobs=nobs)
   return(answer)
} # end of selectdata

#' @title standLM Conduct a standardization on log-transformed CPUE using lm
#'
#' @description Conduct a standardization on log-transformed CPUE data using lm.
#'   A number of predefined models are entered (use makemodels) with the raw
#'   data and a label. The optimum model is taken to be the one with the
#'   smallest AIC, though usually (not always) this is the same as the one
#'   with the largest adj-r2.
#' @param inmods is the list of models to be analysed along with their
#'   respective labels - can be generated using makemodels
#' @param indat is the data.frame containing the raw data, which must contain
#'   columns with the same names as the factors being included in the models.
#' @param inlab is an optional label that is added to text and plotted outputs,
#'   which defaults to the empty string
#' @param console defaults to TRUE; if TRUE write each model to the screen
#'    as each model runs, which can mess up auto-documenting documents.
#' @return a list containing the standardization of class 'CEout'
#' @export standLM
#' @examples
#' \dontrun{
#' data(ab)
#' splabel = "Abalone"
#' print(tapply(ab$catch,list(ab$year,ab$block),sum,na.rm=TRUE)/1000)
#' labelM <- c("year","diverID","month","block","boatID")
#' ab1 <- makecategorical(labelM,ab)
#' mods <- makemodels(labelM)
#' out <- standLM(mods,ab1,splabel)
#' round(out$Results,4)
#' round(out$WhichM,4)
#' plotstand(out,bars=TRUE)
#' }
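#' # a runnable sketch using invented data and two factors; the SESSF workflow
#' # with real data is shown above
#' dat <- data.frame(Year=rep(2015:2018,each=30),Vessel=rep(1:5,24),
#'                   LnCE=rnorm(120,mean=1,sd=0.4))
#' dat <- makecategorical(c("Year","Vessel"),dat)
#' mods <- makemodels(c("Year","Vessel"))
#' out <- standLM(mods,dat,"Invented",console=FALSE)
#' round(out$Results,4)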
standLM <- function(inmods,indat,inlab="",console=TRUE){ #inmods=mods;indat=sps2;inlab=""; console=TRUE
   NModels <- length(inmods)
   labelM <- inmods[[NModels]]
   NModels <- NModels - 1
   ans <- vector("list",NModels)
   names(ans) <- labelM
   Yearnames <- levels(as.factor(indat[,labelM[1]]))
   Nyrs <- length(Yearnames)
   rows <- c("AIC","RSS", "MSS","Nobs", "Npars","adj_r2","%Change")
   WhichM <- matrix(nrow=length(rows),ncol=NModels,dimnames=list(rows,labelM))
   rows <- Yearnames
   Results <- matrix(nrow=Nyrs,ncol=NModels,dimnames = list(rows,labelM))
   ResStErr <- matrix(nrow=Nyrs,ncol=NModels,dimnames = list(rows,labelM))
   modellist <- vector("list",NModels)
   names(modellist) <- labelM
   geomod <- inmods[[1]]   # the Year term
   model <- lm(geomod,data=indat)    # used to estimate the total sum of squares
   totalssq <- sum(anova(model)[2]) # used in the %variation estimates
   for (index in 1:NModels) {  # index <- 1
      if (console) cat(as.character(inmods[[index]]),"\n")
      model <-  lm(inmods[[index]],data=indat)
      modellist[[index]] <- model
      modelsum <- summary(model)
      mat <- modelsum$coefficients
      ans <- getfact(mat,labelM[1])
      Results[,index] <- scaleCE(ans[,"Coeff"])
      ResStErr[,index] <- ans[,"SE"]
      anv <- anova(model)
      RSS <- tail(anv$"Sum Sq",1)
      WhichM["RSS",index] <- RSS
      WhichM["MSS",index] <- totalssq-RSS    # Model SS
      df <- unlist(anv[1])
      ndf <- length(df)
      nobs <- sum(df) + 1
      npars <- sum(df[1:(ndf-1)]) + 1
      WhichM["AIC",index] <- nobs * log(RSS/nobs) + (2.0 * npars)
      WhichM["Nobs",index] <- nobs
      WhichM["Npars",index] <- npars   #npars
      WhichM["adj_r2",index] <- 100.0 * modelsum$adj.r.squared  # adj_r2
   }
   for (index in 2:NModels) {  # calculate the %Change
      WhichM["%Change",index] <- WhichM["adj_r2",index]-WhichM["adj_r2",index-1]
   }
   WhichM["%Change",1] <- 0.0
   pickinter <- grep(":",labelM)
   if (length(pickinter) > 0) {
      lastsimple <- pickinter[1] - 1
      WhichM["%Change",pickinter] <- WhichM["adj_r2",pickinter] - WhichM["adj_r2",lastsimple]
   }
   optimum <- which.max(WhichM["adj_r2",])
   altopt <- which.min(WhichM["AIC",])
   if (optimum != altopt) {
      label <- "adj_r2 gives a different optimum to AIC! AIC being used as optimum."
      warning("\n",label,"\n\n")
      optimum <- altopt
   }
   msg <- paste("Optimum model ",inmods[optimum],sep="")
   if (console) print(msg,quote=F)
   if (NModels >= 3) {
      count <- 0
      for (i in 3:NModels) { # i <- NModels
         if (WhichM["%Change",i] > WhichM["%Change",(i-1)]) count <- count + 1
      }
      if ((count > 0) & (console)) {
         print("Models need re-ordering with the recommended order being:",quote=F)
         msg <- c(labelM[1],labelM[order(WhichM["%Change",2:NModels],decreasing=TRUE)+1])
         print(msg,quote=F)
      }
   }
   out <- list(Results,ResStErr,WhichM,optimum,Nyrs,inlab,
               summary(modellist[[optimum]]),modellist[[optimum]],modellist)
   names(out) <- c("Results","StErr","WhichM","Optimum","Nyrs","Label",
                   "Parameters","optModel","Models")
   class(out) <- "CEout"
   return(out)
} # end of standLM

#' @title summary.CEout - S3 function to print standardization results
#'
#' @description summary.CEout An S3 function to print out the primary results
#'   from a standardization. It includes the structure of the output so other
#'   things can be obtained if wished.
#' @param x is the output from a standardization using standLM.
#'     It must be of class: CEout - an S3 class
#' @return Summarizes the contents of the output from  standardization, this
#'    includes
#' \describe{
#'   \item{Structure}{this lists the top-level structure of the object output
#'      from the standardizations from standLM}
#'   \item{Parameters by Model}{This tabulated the yearly parameters for each
#'      of the models tested - the 'mods' input to standLM}
#'   \item{Std Errors by Model}{The matching standard errors for each of the
#'     models, one column for each column of parameters; large sample sizes
#'     mean these are usually a significant under-estimate of the
#'     true variation}
#'   \item{Model Diagnostic Properties}{A table of the statistical performance
#'     of each model, allowing the identification of the optimum model}
#'   \item{Optimum model}{identifies which column relates to the optimum
#'     statistical model}
#'   \item{All parameters}{The un-back-transformed parameters for the complete
#'     optimum model}
#' }
#' @export
#' @examples
#' \dontrun{
#' data(sps)
#' splabel = "Blue-Eye"
#' labelM <- c("Year","Vessel","Month")
#' sps1 <- makecategorical(labelM,sps)
#' mods <- makemodels(labelM)
#' out <- standLM(mods,sps1,splabel)
#' summary(out)
#' }
summary.CEout <- function(x) {
   cat("Structure of Analytical Output \n")
   str(x,max.level=1)
   cat("\n")
   cat("Parameters by Model \n")
   print(x$Results)
   cat("\n")
   cat("Std Errors by Model \n")
   print(x$StErr)
   cat("\n")
   cat("Model Diagnostic Properties \n")
   print(round(x$WhichM,3))
   cat("\n")
   cat("Optimum Model \n")
   print(x$Optimum)
   cat("\n")
   cat("All Un-backtransformed parameters from the optimum model \n")
   print(x$Parameters$coefficients)
} # end of summary.CEout

#' @title toXL copies a data.frame or matrix to the clipboard
#'
#' @description toXL copies a data.frame or matrix to the clipboard
#'    so one can then switch to Excel and just type <ctrl> + V to paste the
#'    data in place. Note that the "clipboard" connection used here is only
#'    available on Windows.
#'
#' @param x a vector or matrix
#' @param output a boolean determining whether to print the object to the
#'    screen as well as the clipboard; defaults to TRUE
#' @return Places the object 'x' into the clipboard ready for pasting
#' @export
#' @examples
#' datamatrix <- matrix(data=rnorm(100),nrow=10,ncol=10)
#' colnames(datamatrix) <- paste0("A",1:10)
#' rownames(datamatrix) <- paste0("B",1:10)
#' toXL(datamatrix,output=TRUE)
toXL <- function(x,output=TRUE) {
   write.table(x,"clipboard",sep="\t")
   if(output) print(x)
}

#' @title turnover Estimate turnover of vessels from catch by vessel by year data
#'
#' @description Estimate turnover of vessels from catch by vessel by year data.
#'   To specify the minimum number of years that a vessel needs to stay in the
#'   fishery, give a value to the variable minyrs.
#' @param x A matrix of a continuous numeric property by year,
#'    the original usage was to plot catch-by-vessel against year
#' @param minyrs limits the analysis to those vessels that remain in the
#'    fishery for at least minyrs years - which would eliminate the occasional
#'    opportunistic fisher who only fishes for one or two years, or whatever
#'    minimum is selected. Vessels with zero catches are not included in case
#'    zeros and NAs are counted as starting and leaving the fishery.
#' @return a matrix of years by Continue, Leave, Start, Total
#' @export turnover
#' @examples
#' \dontrun{
#' library(r4cpue)
#' data(sps)
#' cbv <- tapply(sps$catch_kg,list(sps$Vessel,sps$Year),sum,na.rm=TRUE)/1000
#' dim(cbv)
#' early <- rowSums(cbv[,1:6],na.rm=TRUE)
#' late <- rowSums(cbv[,7:14],na.rm=TRUE)
#' cbv1 <- cbv[order(late,-early),]
#' plotprep(width=7,height=6)
#' yearBubble(cbv1,ylabel="Catch by Trawl",vline=2006.5,diam=0.2)
#' turnover(cbv)
#' }
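#' # a runnable sketch: an invented catch-by-vessel (rows) by year (columns) matrix
#' cbv <- matrix(c(5,NA,2, 3,4,NA, NA,6,7),nrow=3,byrow=TRUE,
#'               dimnames=list(1:3,2001:2003))
#' turnover(cbv)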
turnover <- function(x,minyrs=1) {
   years <- as.numeric(colnames(x))
   ny <- length(years)
   count <- apply(x,1,countgtzero)
   pick <- which(count == 0)
   if (length(pick) > 0) {
      warning("Some Rows only have zeros or NAs")
      x <- x[-pick,]
      count <- count[-pick]   # keep the counts aligned with the remaining rows
   }
   pick <- which(count > (minyrs - 1))
   if (length(pick) > 0) x <- x[pick,]
   columns <- c("Continue","Leave","Start","Total")
   turnover <- matrix(0,nrow=ny,ncol=length(columns),
                      dimnames=list(years,columns))
   turnover[1,1] <- length(which(x[,1] > 0))
   for (yr in 2:ny) {
      pair <- x[,(yr-1):yr]
      pickC <- which((pair[,1] > 0) & (pair[,2] > 0))
      pickL <- which((pair[,1] > 0) & ((is.na(pair[,2])) |
                                          (pair[,2] < 0.001)))
      pickS <- which(((is.na(pair[,1])) | (pair[,1] < 0.001)) &
                        (pair[,2] > 0))
      turnover[yr,1:3] <- c(length(pickC),length(pickL),length(pickS))
   }
   turnover[,4] <- turnover[,1] + turnover[,3]
   return(turnover)
} # end of turnover

#' @title yearNA - counts NAs per year in each numeric field in a data.frame
#'
#' @description yearNA - counts the number of NAs in each year of each numeric
#'    field in a data.frame and outputs the results as a matrix
#' @param indat the data.frame whose numeric fields are to be considered
#' @param years identifies the name of the "Year" field in the data.frame
#' @param empty logical default=FALSE, determines whether columns which have
#'    no NA present are printed or not
#' @return a matrix of years x numeric fields with the number of records per
#'    field per year as reference
#' @export yearNA
#' @examples
#' year <- sort(rep(1990:1994,5))
#' columns <- c("year","Var1","Var2","Var3")
#' dat <- matrix(runif(100),nrow=25,ncol=4,dimnames=list(year,columns))
#' dat[trunc(100*runif(20))] <- NA
#' dat[,1] <- year
#' print(dat)
#' yearNA(as.data.frame(dat),years="year")
yearNA <- function(indat,years="Year",empty=FALSE) {
   records <- table(indat[,years])
   nna <- function(x) sum(is.na(x))
   columns <- colnames(indat)
   clas <- sapply(indat, class)
   numbers <- c("integer", "numeric")
   pick <- which(clas %in% numbers)
   num <- length(pick)
   ans <- NULL
   for (i in 1:num) {
      ans <- cbind(ans,tapply(indat[,columns[pick[i]]],indat[,years],nna))
   }
   ans <- cbind(ans,records)
   colnames(ans) <- c(columns[pick],"Records")
   if (!empty) {
      pick <- which(colSums(ans,na.rm=TRUE) > 0)
      if (length(pick) > 0)  ans <- ans[,pick]
   }
   return(ans)
}  #end of yearNA

#' @title yearZero - examine listed variables for zeros and NAs
#'
#' @description yearZero - examines an input data.frame or matrix for the
#'    variables listed in the parameter 'label' and counts the zeros,
#'    NAs, and those greater than 0. It counts these relative to a 'Year' variable,
#'    or any grouping variable identified using the 'years' parameter.
#' @param indat the data.frame or matrix to be examined
#' @param years the grouping variable for counts to be compared across,
#'    defaults to 'Year'
#' @param label the collection of column names which are to be examined,
#'    defaults to c('catch_kg','Effort'), which is suitable for SESSF
#'    data sets.
#' @return A matrix of the variables in 'label' with a column for each of
#'    Zeros, >0, and NAs, with a final column of the number of records
#'    per grouping variable value.
#' @export  yearZero
#'
#' @examples
#' insps <- matrix(runif(30),nrow=15,ncol=2)
#' insps[trunc(runif(7)*30)+1] <- 0
#' insps[trunc(runif(7)*30)+1] <- NA
#' insps <- cbind(sort(rep(2000:2004,3)),insps)
#' colnames(insps) <- c("Year","catch_kg","Effort")
#' print(round(insps,4))
#' yearZero(insps)
yearZero <- function(indat,years="Year",label=c("catch_kg","Effort")) { # indat <- sps
   columns <- colnames(indat)
   pick <- which(columns %in% label)
   npick <- length(pick)
   if (npick != length(label)) warning(paste0("Only ",columns[pick]," found"))
   if (npick == 0) stop("None of the listed variables found; rename them")
   records <- table(indat[,years])
   nna <- function(x) sum(is.na(x))  # sums the logical outcomes
   gtzero <- function(x) sum(x > 0,na.rm=TRUE)
   ans <- NULL
   for (i in 1:npick) {
      gtz <- tapply(indat[,columns[pick[i]]],indat[,years],gtzero)
      nas <- tapply(indat[,columns[pick[i]]],indat[,years],nna)
      zeros <- records - gtz - nas
      ans <- cbind(ans,zeros,gtz,nas)
   }
   ans <- cbind(ans,records)
   tmplab <- sort(c(paste0(columns[pick],"NA"),paste0(columns[pick],"GT0"),paste0(columns[pick],"0")))
   colnames(ans) <- c(tmplab,"Records")
   return(ans)
} # end of yearZero