R/utils.R

Defines functions numericfy make.covb is.nullmodel make.survey.pars make.onesit truncdat lnci.nmin inv.logit logit histline nm2m cv bsample

Documented in bsample cv histline inv.logit is.nullmodel lnci.nmin logit make.covb make.onesit make.survey.pars nm2m numericfy truncdat

#                      -----------------------------
#--------------------- Start DLB's utility functions --------------------
#' @title Clone of \code{sample} without weird behaviour for length(x)==1.
#'
#' @usage bsample(x, size, replace = FALSE, prob = NULL)
#' 
#' @description
#'  Identical to \code{sample} but when x is an integer it just returns x rather than an
#'  integer in 1:x (which is what \code{sample} does).
#'  
#' @param x Either a vector of one or more elements from which to choose, or a positive integer. 
#' See \link{sample} for details.
#' @param size a positive number, the number of items to choose from.
#' @param replace Should sampling be with replacement?
#' @param prob A vector of probability weights for obtaining the elements of the vector being sampled.
#' 
bsample=function(x,size,replace=FALSE,prob=NULL) {
  if(length(x)==1) return(x)
  else return(sample(x,size,replace,prob))
}


#' @title Caclulates coefficient of variation.
#'
#' @description
#'  Utility function
#'  
#' @param x Random variable.
cv=function(x) sd(x)/mean(x) # calculates coefficient of variation

#' @title Converts nm to metres.
#'
#' @description
#'  Utility function
#'  
#' @param x Distance in nautical miles.
nm2m=function(x) return(x*1852) # converts nautical miles to metres

#mod=function(x,y) return(x-floor(x/y)*y) # returns the integer part of x/y


#' @title Draws histogram.
#'
#' @description
#'  Utility function to draw histograms with more options than \code{hist} allows.
#'  
#' @param height Height of histogram bars.
#' @param breaks Locations of boundaries of histogram bins (must be 1 longer than \code{height}).
#' @param lineonly If TRUE, uses \code{\link{lines}} to draw lines on current plot; else uses 
#' \code{\link{plot}} to draw lines on new plot.
#' @param outline If TRUE, draws only the outline (profile) of the histogram; else draws each 
#' complete bar.
#' @param fill If TRUE, uses polygon() to fill barsl in this case valid arguments to polygon() 
#' are passed via argument(s) "...". If fill==FALSE, valid arguments to plot() or lines() are 
#' passed via argument(s) "..."
#' @param ylim Range of y-axis.
#' @param xlab Label for x-axis.
#' @param ylab Label for y-axis.
#' @param ... See aargument \code{fill}.
histline=function(height,breaks,lineonly=FALSE,outline=FALSE,fill=FALSE,ylim=range(height),
                  xlab="x",ylab="y",...)
{
  n=length(height)
  if(length(breaks)!=(n+1)) stop("breaks must be 1 longer than height")
  if(outline) {
    y=c(0,rep(height,times=rep(2,n)),0)
    x=rep(breaks,times=rep(2,(n+1)))
  }   else {
    y=rep(0,4*n)
    x=rep(0,4*n+2)
    for(i in 1:n) {
      y[((i-1)*4+1):(i*4)]=c(0,rep(height[i],2),0)
      x[((i-1)*4+1):(i*4)]=c(rep(breaks[i],2),rep(breaks[i+1],2))
    }
    x=x[1:(4*n)]
  }
  if(lineonly) {
    if(!fill) lines(x,y,...)
    else polygon(x,y,...)
  } else {
    if(!fill) plot(x,y,type="l",ylim=ylim,xlab=xlab,ylab=ylab,...)
    else {
      plot(x,y,type="n",ylim=ylim,xlab=xlab,ylab=ylab)
      polygon(x,y,...)
    }
  }
}


#' @title Integration using Simpson's method.
#'
#' @description
#'  Modified version of function sintegral from library Bolstad.
#'  
#' @param fx Values at \code{x} (see below) of the function to be integrated.
#' @param x Values at which function fx has been evaluated.
#' @param n.pts The number of points to be used in integration. If \code{x} contains more than
#'\code{n.pts} then \code{n.pts} will be set to \code{length(x)}.
#' @param type if equal to `\code{cdf}' a list comprising x-values and cdf values at each x-value 
#' is returned, else the integral over the range of \code{x} is returned.
sintegral=function (fx, x, n.pts=16, type="int") 
{
  #  if (class(fx) == "function") 
  #    fx = fx(x)
  #  n.x = length(x)
  #  if (n.x != length(fx)) 
  #    stop("Unequal input vector lengths")
  #  if (n.pts < 64) 
  #    n.pts = 64
  ap = approx(x, fx, n = 2 * n.pts + 1)
  h = diff(ap$x)[1]
  integral = h*(ap$y[2*(1:n.pts)-1]+4*ap$y[2*(1:n.pts)]+ap$y[2*(1:n.pts)+1])/3
  if(type!="cdf") return(sum(integral)) 
  else return(list(x=ap$x[2*(1:n.pts)],y=cumsum(integral)))
}



#' @title Logit function.
#'
#' @description
#'  Logit function
#'  
#' @param p probability (scalar or vector).
logit=function(p) return(log(p/(1-p)))  # returns logit of p

#' @title Inverse logit function.
#'
#' @description
#'  Inverse logit function
#'  
#' @param x scalar or .
inv.logit=function(x) return(1/(1+exp(-x))) # returns p from x=(logit of p)

#' @title Lognormal confidence interval with lower bound
#'
#' @usage lnci.nmin(n,Nhat,cv)
#' 
#' @description
#'  Calculates 95\% confidence interval for abundance estimate \code{Nhat}, based on the assumption that 
#'  \code{Nhat} is lognormally distributed and that \code{n} population members were observed (so that 
#'  lower bound of confidence interval can't be below \code{n}.
#'  
#' @param n observed number of population members.
#' @param Nhat point estimate of abundance.
#' @param cv coefficient of variation of \code{Nhat}.
#' 
#' @export
lnci.nmin=function(n,Nhat,cv){
  varNhat=(Nhat*cv)^2
  cfactor=exp(1.96*sqrt(log(1+varNhat/(Nhat-n)^2)))
  lower=n+(Nhat-n)/cfactor
  upper=n+(Nhat-n)*cfactor
  return(list(lower=lower,upper=upper))
}

#' @title Data truncation.
#'
#' @description
#' Left- and/or right-truncates any variable in data frame dat, inserting effort-only record if 
#' truncation removes all detections on a transect.
#' Subtracts left-trunction point off all variable values - after all truncation.

#' 
#' @param dat distance data frame. Must have columns for stratum stratum.area, transect, transect.length, 
#' <any-observer-level_variable>, and if \code{twosit}==TRUE then object as well. These can 
#' have any names, but the names must be specified, via argument \code{colnames}.
#' @param minvalue left-truncation point.
#' @param maxvalue right-truncation point.
#' @param twosit If TRUE, assumes this is an mrds-type dataset with two lines per detection.
#' @param colnames name of columns containing stratum, stratum area, transect, transect length, 
#' <any-observer-level_variable>, and if \code{twosit}==TRUE then object as well, IN THIS ORDER, 
#' in a character vector. The default value is 
#' colnames=c("stratum","area","transect","L","x","obs").
truncdat=function(dat,minval=0,maxval=NULL,twosit=FALSE,colnames=c("stratum","area","transect","L","x","obs")){
  tdat=dat
  keepcols=rep(NA,4)
  for(i in 1:4) {
    keepcols[i]=which(names(dat)==colnames[i])
    if(is.null(keepcols[i])) stop("No column in dat called ",colnames[i])
  }
  NAs=dat[1,,drop=FALSE]
#  NAs[,-keepcols]=NA
  NAs[,c("object","x","y")]=NA
  xcol=which(names(dat)==colnames[5])
  if(is.null(xcol)) stop("No column in dat called ",colnames[5])
  if(is.null(maxval)) maxval=max(na.omit(dat[,xcol]))
  if(twosit){
    if(length(colnames)<6) stop("With double-observer data, need 6th colname for observer; only 5 colnames:",colnames)
    obscol=which(names(dat)==colnames[6])
    if(is.null(obscol)) stop("No column in dat called ",colnames[6])
    out1=which(dat[,obscol]==1 & (dat[,xcol]<minval | dat[,xcol]>maxval))
    out2=which(dat[,obscol]==2 & (dat[,xcol]<minval | dat[,xcol]>maxval))
    if(length(out1) != length(out2)) stop("Different number of obs1 and obs2 detections for left-truncation")
    if(unique(out2-out1) != 1) stop("Looks like non-consecutive obs1 and obs2 detections for left-truncation")
    nout=length(out1)
    
    if(nout>0) {
      svalues=tvalues=rep(NA,nout)
      if(is.factor(dat[,keepcols[1]])) {
        svalues=rep(levels(dat[,keepcols[1]])[1],nout)
        levels(svalues)=levels(dat[,keepcols[1]])
        NAs[keepcols[1]]=svalues[1] # put a factor in the NA row (arbitrary level, as it will be overwritten)
        levels(NAs)=levels(svalues)
      }
      if(is.factor(dat[,keepcols[3]])) {
        tvalues=rep(levels(dat[,keepcols[3]])[3],nout)
        levels(tvalues)=levels(dat[,keepcols[3]])
        NAs[keepcols[3]]=tvalues[1] # put a factor in the NA row (arbitrary level, as it will be overwritten)
        levels(NAs)=levels(tvalues)
      }
      outeff=data.frame(stratum=svalues,area=rep(NA,nout),transect=tvalues,L=rep(NA,nout))
      if(is.factor(outeff$stratum)) levels(outeff$stratum)=levels(svalues)
      if(is.factor(outeff$transect)) levels(outeff$stratum)=levels(tvalues)
      for(i in 1:nout) outeff[i,]=dat[out1[i],keepcols] # store effort info for truncated sightings
      tdat=dat[-c(out1,out2),] # remove all truncated sightings
      for(i in 1:nout) {
        got=which(tdat[,keepcols[1]]==outeff$stratum[i] & tdat[,keepcols[2]]==outeff$area[i] & 
                    tdat[,keepcols[3]]==outeff$transect[i] & tdat[,keepcols[4]]==outeff$L[i])
        if(length(got)==0) { # transect no longer in truncated dataset
          tdat=rbind(tdat,NAs) # add row of NAs
          tdat[dim(tdat)[1],keepcols]=outeff[i,] # add in missing transect info
        }
      }  
    }
  } else {
    out=which(dat[,xcol]<minval | dat[,xcol]>maxval)
    nout=length(out)
    
    if(nout>0) {
      svalues=tvalues=rep(NA,nout)
      if(is.factor(dat[,keepcols[1]])) {
        svalues=rep(levels(dat[,keepcols[1]])[1],nout)
        levels(svalues)=levels(dat[,keepcols[1]])
        NAs[keepcols[1]]=svalues[1] # put a factor in the NA row (arbitrary level, as it will be overwritten)
        levels(NAs)=levels(svalues)
      }
      if(is.factor(dat[,keepcols[3]])) {
        tvalues=rep(levels(dat[,keepcols[3]])[3],nout)
        levels(tvalues)=levels(dat[,keepcols[3]])
        NAs[keepcols[3]]=tvalues[1] # put a factor in the NA row (arbitrary level, as it will be overwritten)
        levels(NAs)=levels(tvalues)
      }
      outeff=data.frame(stratum=svalues,area=rep(NA,nout),transect=tvalues,L=rep(NA,nout))
      if(is.factor(outeff$stratum)) levels(outeff$stratum)=levels(svalues)
      if(is.factor(outeff$transect)) levels(outeff$stratum)=levels(tvalues)
      #    for(i in 1:nout) outeff[i,]=dat[out1[i],keepcols] # store effort info for truncated sightings
      for(i in 1:nout) outeff[i,]=dat[out[i],keepcols] # store effort info for truncated sightings
      tdat=dat[-out,] # remove all truncated sightings
      for(i in 1:nout) {
        got=which(tdat[,keepcols[1]]==outeff$stratum[i] & tdat[,keepcols[2]]==outeff$area[i] & 
                    tdat[,keepcols[3]]==outeff$transect[i] & tdat[,keepcols[4]]==outeff$L[i])
        if(length(got)==0) { # transect no longer in truncated dataset
          tdat=rbind(tdat,NAs) # add row of NAs
          tdat[dim(tdat)[1],keepcols]=outeff[i,] # add in missing transect info
        }
      }
    }
  }
  
  if(nout>0) {
    tdat=tdat[order(tdat[keepcols[1]],tdat[keepcols[3]]),]
    tdat[!is.na(tdat[,xcol]),xcol]=tdat[!is.na(tdat[,xcol]),xcol]-minval # shift all left
  }
  return(tdat)
}


#' @title Reduces MRDS data frame to CDS data frame.
#'
#' @param dat distance data frame in mrds form. Must have columns 'object' (unique detection 
#' identifier), 'seen' (binary indicating detecteb by observer or not) and 'y' (forward) 
#' detection distance.
#' @param prefer which of the two observers' data to prefer when forward distances are 
#'   missing/equal must be 1 or 2.
#'   
#' @description
#' Reduces mark-recapture Distance sampling (MRDS) data frame dat, with two lines per detection, to a 
#' conventional distance sampling (CDS) data frame with a single line per detection. In the case of 
#' duplicates ("recaptures"), takes the information from the detection made farthest ahead (i.e. that
#' with larger \code{y}) With duplicates that have the same \code{y} or neither of which have a 
#' \code{y}, it chooses according to the parameter \code{prefer}. If only one of the duplicates 
#' has a \code{y}, it chooses that one.
#' 
#' @param dat MRDS data frame.
make.onesit=function(dat,prefer=1) {
  if(prefer!=1 & prefer!=2) stop("Argument 'prefer' must be 1 or 2.")
  n=dim(dat)[1]
  out=rep(FALSE,n)
  i=1
  while(i<=n){
    if(!is.na(dat$object[i])){
      if(dat$seen[i]==0) out[i]=TRUE
      if(dat$seen[i+1]==0) out[i+1]=TRUE
      if(dat$seen[i]==1 & dat$seen[i+1]==1) {
        if(is.na(dat$y[i]) & is.na(dat$y[i+1])) { # no y's; remove not preferred detection
          out[i+(3-prefer)-1]=TRUE
        } else if(is.na(dat$y[i]) & !is.na(dat$y[i+1])) { # keep only non-NA y
          out[i]=TRUE
        } else if(!is.na(dat$y[i]) & is.na(dat$y[i+1])){ # keep only non-NA y
          out[i+1]=TRUE
        } else if(dat$y[i]==dat$y[i+1]){# remove not preferred detection
          out[i+(3-prefer)-1]=TRUE
        } else if(dat$y[i]<dat$y[i+1]){ # remove later (closer) detection
          out[i]=TRUE
        } else {
          out[i+1]=TRUE
        }
      }
      i=i+2
    } else i=i+1
  }
  dat=dat[!out,] # remove worse observer's lines
  return(dat)
}



#' @title Makes survey parameter list (\code{survey.pars}) suitable for passing to 
#' \code{\link{est.hmltm}}.
#'
#' @description
#' \code{make.survey.pars} just puts a bunch of stuff in a list in a format that \code{\link{est.hmltm}}
#'  expects.
#'
#' @param spd is observer speed.
#' @param W is perpendicular right-truncation distance for estimation.
#' @param ymax is maximum forward distance for estimation - it should be far enough ahead that it is
#' not possible to detect anyting beyond \code{ymax}.
#' @param Wl is perpendicular left-truncation distance for estimation. (After truncating \code{Wl} is 
#' subtracted from all perpendicular distances.)
#' @param dT is the time step on which the availability hidden Markov model operates.
#' @param theta.f is REDUNDANT and should be removed. It must equal 0. 
#' @param theta.b is REDUNDANT and should be removed. It must equal 90. 
#'
#' @details Packs the above in a list suitable for passing as \code{survey.pars} to 
#' \code{\link{est.hmltm}}.
make.survey.pars=function(spd,W,ymax,Wl=0,dT=1,theta.f=0,theta.b=90){
  return(list(spd=spd,W=W,ymax=ymax,Wl=Wl,dT=dT,dy=spd*dT,theta.f=theta.f,theta.b=theta.b))
}

#' @title Decides if model is a null model.
#'
#' @description
#'  Logical function: true if \code{models} includes no covariates
#'  
#' @param models list of characters with elements \code{$y} and \code{$x} specifying y- and 
#' x-covariate models. Either \code{NULL} or regression model format (without response on left).
is.nullmodel=function(models){
  null=TRUE
  for(i in 1:length(models)) null=null & is.null(models[[i]])
  return(null)
}


#' @title Constructs linear predictor.
#'
#' @description
#' Returns parameter vector on linear predictor scale for hazard function \code{FUN}, using 
#' \code{model} and data frame \code{dat}. Makes a matrix with each row the parameter values for an 
#' observation, then concatenates these into a single vector (so can pass as vector to C++ code).
#'  
#' @param b parameter vector.
#' @param FUN detection hazard function name (character).
#' @param models list with two components (\code{$y} and \code{$x}) specifying models for detection 
#' hazard function scale parameters in forward (y) and perpendicular (x) dimensions. If detection 
#' hazard function form does not have separate scale parameters in y- and x- dimensions, the model
#' given in \code{$y} is take as the scale parameter model. \code{$y} and \code{$x} must be either 
#' \code{NULL} or "~<regspec>", where <regspec> is a regression model spefication (e.g. 
#' \code{height+weight} or \code{height:weight} or \code{height*weight}, etc.).
#' @param dat data frame, which must have columns corresponding to variable names in \code{$y} and
#' \code{$x}.
make.covb=function(b,FUN,models,dat)
{
  nfixed=switch(FUN,
                h.IP.0 = 2,
                h.EP1.0 = 2,
                h.EP2.0 = 3,
                h.EP1x.0 = 3,
                h.EP2x.0 = 4,
                0
  )
  if(nfixed==0) stop("Hazard model ",FUN," is not progremmed (yet).")
  if(is.nullmodel(models)) {
    if(length(b) != nfixed) stop("length of b inconsistent with model")
    n=dim(dat)[1]
    #    covb=matrix(c(rep(b,rep(n,nfixed))),ncol=nfixed)
    covb=rep(b,n)
  } else {
    if(FUN=="h.EP2.0" | FUN=="h.EP1.0" | FUN=="h.IP.0"){
      X=model.matrix(as.formula(models$y),data=dat)
      n=dim(X)[1]
      nb=dim(X)[2]
      nfixed=nfixed-1
      if(length(b) != (nfixed+nb)) stop("length of b inconsistent with model")
      covb=matrix(c(rep(b[1:nfixed],rep(n,nfixed)),X%*%b[nfixed+(1:nb)]),ncol=(nfixed+1))
      covb=as.vector(t(covb))
    }
    else if(FUN=="h.EP2x.0" | FUN=="h.EP1x.0"){
      if(!is.null(models$y)){
        X=model.matrix(as.formula(models$y),data=dat)
        n=dim(X)[1]
        nb=dim(X)[2]        
        if(!is.null(models$x)){ # here if have covariates for x and y
          X.x=model.matrix(as.formula(models$x),data=dat)
          nb.x=dim(X.x)[2]
          nfixed=nfixed-2
          if(length(b) != (nfixed+nb+nb.x)) stop("length of b inconsistent with model and model.x")
          covb=matrix(c(rep(b[1:nfixed],rep(n,nfixed)),X%*%b[nfixed+(1:nb)],X.x%*%b[nfixed+nb+(1:nb.x)]),ncol=(nfixed+2))  
          covb=as.vector(t(covb))
        } else { # here if have covariates for y only
          nfixed=nfixed-1
          if(length(b) != (nfixed+nb)) stop("length of b inconsistent with model and model.x")
          #      covb=matrix(c(rep(b[1:nfixed],rep(n,nfixed)),X%*%b[nfixed+(1:nb)]),ncol=(nfixed+1))   #### bitchange
          covb=matrix(c(rep(b[1:(nfixed-1)],rep(n,(nfixed-1))),X%*%b[(nfixed-1)+(1:nb)],rep(b[length(b)],n)),ncol=(nfixed+1))  #### bitchange 
          covb=as.vector(t(covb))
        }
      } else { # here if have covariates for x only
        X.x=model.matrix(as.formula(models$x),data=dat)
        n.x=dim(X.x)[1]
        nb.x=dim(X.x)[2]
        nfixed=nfixed-1
        if(length(b) != (nfixed+nb.x)) stop("length of b inconsistent with model and model.x")
        covb=matrix(c(rep(b[1:nfixed],rep(n.x,nfixed)),X.x%*%b[nfixed+(1:nb.x)]),ncol=(nfixed+1))  
        covb=as.vector(t(covb))        
      }
    } else {
      stop("Invalid FUN.")
    }
  }
  return(covb)
}


#' @title Converts factor or character column of data frame to integer.
#'
#' @description
#' Replaces a specified column, which may contain factor or character variables, with
#' uniquely-numbered integer values. Returns the new data frame and a data frame 
#' showing which factor corresponds to which integer, in a list.
#' 
#' @param dat A data frame
#' @param column The column number or name of the column to be converted.
#' @param add If TRUE a new column is added, named `num<column>', where '<column>'
#' is the value of the \code{column} argument, else the old column is overwritten.
#' 
#' @export
numericfy = function(dat,column=1,add=FALSE){
  vals = dat[,column]
  colnames = names(dat)
  ncols = length(colnames)
  if(is.character(column)) colnum = which(colnames==column)
  else colnum = column
  if(is.numeric(vals)) stop("This column is already numeric. No action taken.")
  if(is.factor(vals)) vals = as.character(vals)
  uvals = unique(vals) # unique values
  nvals = length(uvals) # number of unique values
  ivals = rep(NA,nvals) # new data column (numeric)
  for(i in 1:length(vals)) ivals[i] = which(uvals==vals[i])
  if(add) {
    newdat = cbind(dat,ivals)
    names(newdat)[length(names(newdat))] = paste("num",column,sep="")
  } else {
    if(colnum==1) newdat = cbind(ivals,dat[,(colnum+1):ncols])
    else if(colnum==ncols) newdat = cbind(dat[,1:(colnum-1)],ivals)
    else newdat = cbind(dat[,1:(colnum-1)],ivals,dat[,(colnum+1):ncols])
    names(newdat) = colnames
  }
  lookup = data.frame(new=1:nvals, old=uvals)
  return(list(dat=newdat,lookup=lookup))
}
david-borchers/hmltm documentation built on Oct. 29, 2023, 9:07 p.m.