R/organizeBirds.R

Defines functions organizeBirds obsData.OrganizedBirds obsData visits createVisits getGridIDs simplifySpp organizeDate findCols

Documented in createVisits findCols getGridIDs obsData obsData.OrganizedBirds organizeBirds organizeDate simplifySpp visits

#' Find a text among column names
#'
#' Finds a text among column names and return it as it is
#'
#' @param pattern character string containing a string or regular expression
#' to be matched in the given character vector
#' @param df A data.frame among whichs column names to look for the pattern
#' @param exact A logical flag whether to return all matches or just that text.
#' This is always case insensitive.
#' @param value A logical flag (to be passed to grep()). If FALSE, a vector
#' containing the (integer) indices of the matches determined by grep is returned,
#' and if TRUE, a vector containing the matching elements themselves is returned.
#'
#' @return A vector with the column names that match the pattern
#' @export
#' @keywords internal
findCols <- function(pattern, df, exact=FALSE, value = TRUE){
  if(missing(pattern)) stop("The argument 'pattern' must be supplied.")
  if(missing(df)) stop("The argument 'df' must be supplied.")

  patternE <-if (exact) paste("^", pattern, "$", sep="") else pattern

  if(length(patternE) == 1){
    res <- grep(patternE, names(df), ignore.case=TRUE, value=value)
  }
  if(length(patternE) > 1){
    res<- lapply(patternE, grep, names(df), ignore.case=TRUE, value=value)
  }
  return(res)
}

#' Organize the date-column(s)
#'
#' Organize the date-column(s) in a data.frame to three columns
#'
#' @param x A data.frame with at least the columns specified in 'columns'
#' @param columns A character vector with the column names for the dates specified.
#'   It can either be one column formatted as "yyyy-mm-dd" or a vector of
#'   length=3. If the column names are "year", "month" and "day" it will take
#'   these column names. Otherwise, it will take the column names and interpret
#'   the first as year, the second as month and the third as day.
#'
#' @return A data.frame with the columns "year", "month", "day"
#' @export
#' @examples
#' ymd<-as.Date(Sys.Date())+1:5
#' id<-1:5
#' organizeDate(data.frame("id"=id,
#'                         "ymd"=as.character(ymd)),
#'              "ymd")
#' @keywords internal
organizeDate <- function(x, columns){
  if (!length(columns) %in% c(1,3)) stop("Could not create date, please specify either one or three column names")

  stdTimeCols <- c("year", "month", "day")

  cols.df <- findCols(columns, x, exact = TRUE)

  if(all(lengths(cols.df) > 0)){
    cols.df <- unlist(cols.df)

    if(ncol(x)>1) x <- x[,cols.df]
    cols.df<-tolower(cols.df)

    if(length(cols.df) == 3){
      colnames(x)<-tolower(colnames(x))
      if(all(stdTimeCols %in% cols.df)){
        ## all is just fine
        x<-x[,stdTimeCols]
        return(x)
      } else{
        ### This part here assumes the 3 defined columns are in the right order
        ### and gives the standard name
        message("This function assumes that the 3 given column names represent 'year', 'month' and 'day' in that specific order.")
        if(any(stdTimeCols %in% cols.df)){

          wColNot <- which(!(stdTimeCols %in% cols.df))
          for(i in 1:length(wColNot)){
            # x$placeholder <- as.matrix(x[, cols.df[wColNot[i]]])
            x$placeholder <- x[, cols.df[wColNot[i]]]
            names(x)[names(x) == "placeholder"] <- stdTimeCols[wColNot[i]]
          }

        } else {
          ## if none of the columns names are standard then makes them standard
          x$year  <- x[,cols.df[1]]
          x$month <- x[,cols.df[2]]
          x$day   <- x[,cols.df[3]]
        }
      }

      ## empty years are not tolerated! but days and months are given value 1
      if(sum(is.na(x$day))>0) message(paste("There were", sum(is.na(x$day)),"empty days that were given the value 1"))
      if(sum(is.na(x$month))>0) message(paste("There were", sum(is.na(x$month)),"empty months that were given the value 1"))
      x$day <- ifelse(is.na(x$day), 1, x$day)
      x$month <- ifelse(is.na(x$month), 1, x$month)
print(x)
      res<-x
    }

    if(length(cols.df) == 1){
      res <- data.frame(matrix(nrow = length(x), ncol=length(stdTimeCols)))
      colnames(res)<-stdTimeCols
      dateYMD <- as.Date(x)

      res$year  <- lubridate::year(dateYMD)
      res$month <- lubridate::month(dateYMD)
      res$day   <- lubridate::day(dateYMD)

      # res
    }
  } else {
    stop("One or more specified column names are not present in the input data set.")
  }
  ## clean unreadable dates (cleaning also the spatial points)
  wNA <- is.na(res)
  if (sum(wNA) > 1){
    res <- res[-wNA,]
    message(paste(length(wNA), " records deleted because the date was unreadable."))
  }
print(res)
  return(res)
}


#' Simplify species names
#'
#' Removes infraspecific epithets, authors and years from scientific names
#'
#' @param df A data.frame with at least the column specified in sppCol
#' @param sppCol A character vector with the column names for the species names.
#' @return A vector with data.frame with a canonical name given by taxize::gbif_parse(),
#' that is a scientific name with up to 3 elements and no authorship
#' @importFrom taxize gbif_parse
#' @export
simplifySpp <- function(df, sppCol){

  gbifdf <-  gbif_parse(df[, sppCol])
  simpleSpp <- gbifdf$canonicalname

  # splitLits <- strsplit(as.character(df[, sppCol]), "\ ")
  #
  # simpleNamesList<-lapply(splitLits, FUN=function(x){
  #   if (length(x) == 1){ # Only genus
  #     return(x)
  #   }
  #   if (length(x) >= 2){ # Genus and more
  #     if(grepl("(?!-)(?!/)[[:punct:]]|[A-Z]", x[2], perl = TRUE)){ # Genus and more (find punctuation except for - and /)
  #       return(x[1])
  #       # TODO cases like #Falco peregrinus, 1771" will wrongly end like "Falco"
  #       # but without this line it will wrongly take Falco Lineus, 1771 as Falco Lineus
  #       # is the key on the capital letter?
  #     } else {
  #       return(paste(x[1:2], collapse=" ")  ) # only species
  #     }
  #   }
  # } )
  # simpleSpp <- unlist(simpleNamesList)

  return(simpleSpp)
}


#### HANDLE THE VISITS ####
#' Create unique IDs based on a grid
#'
#' Takes a spatial points data.frame and a grid and gets the overlay IDs.
#'
#' @param x a SpatialPointsDataFrame ('sp') or sf ('sf') (with the observations).
#' @param grid a SpatialPolygon object ('sp') or sf ('sf') (a grid is expected) defining the
#' maximum extent of visits effort.
#' @param idcol column name for the grid names or ids
#'
#' @return A vector of the same length as the number of rows (observations) as x
#'   with a unique number corresponding to the grid's ID.
#'
#' @export
getGridIDs <- function(x, grid, idcol="id"){
  if(!any(class(x) %in% c("SpatialPointsDataFrame", "sf"))) {
    stop("The argument 'x' can only be of class sf or SpatialPointsDataFrame")
  }

  if(any(class(x) == "SpatialPointsDataFrame")) x <- st_as_sf(x)

  if(!any(class(grid) %in% c("SpatialPolygonsDataFrame", "SpatialPolygons", "sf"))){
    stop("The argument 'grid' can only be of class sf, SpatialPolygonsDataFrame or SpatialPolygons")
  }

  if(any(class(grid) %in% c("SpatialPolygonsDataFrame", "SpatialPolygons"))) grid <- st_as_sf(grid)

  if(is.null(idcol)) idcol="id"
  #### Rename grid
  if(is.null(colnames(grid))){
    grid <- st_sf(data.frame(paste0("ID", seq(length(grid))), st_geometry(grid)))
    colnames(grid)[1]<-idcol
  } else {
    if(!idcol %in% colnames(grid)) stop(paste0("The column '",idcol,"' was not found in the grid"))
    if (any(duplicated(grid[,idcol]))){
      grid <- renameGrid(grid, idcol)
      warning("There are duplicated cell names in your grid. We rename them internally to 'ID1'...'IDn'.
All results will use this nomenclature, but the order of the cells will remain unaltered.")
      }
  }

  x <- st_transform(x,
                    crs = st_crs(3857))
  grid <- st_transform(grid,
                       crs = st_crs(3857))
  inter <-st_intersects(x, grid)
  res <- sapply(inter,
                function(z) if (length(z)==0) NA_integer_ else z[1])
  return(res)

}



#' Create unique visits IDs
#'
#' Takes a data.frame and a vector of column names and classifies each row of the
#' data.frame based on the combination of values in the specified columns.
#'
#' What a visit should be is not always clearly defined and extractable in a
#' dataset. A reasonable assumption is that a visit could be identified from the
#' records made by one person on a certain day and at a specific location or site.
#' The default value for the variable column is therefore that a visit is identified
#' by the Darwin Core variables \code{c("locality", "day", "month", "year",
#' "recordedBy")}.
#'
#' @param x An object of class \code{data.frame}, \code{sf} or \code{SpatialPointsDataFrame}
#' including at least the columns specified that are used to identify a visit.
#' @param idCols A vector with the names of the columns other (than time columns)
#' that are used to identify a visit. This variable cannot be empty. At least the
#' recorders name or any other ID must be provided. Default is the Darwin Core
#' variables \code{c("locality", "recordedBy")}.
#' @param timeCols A vector with the names of the time columns that are used to
#' identify a visit.  If timeCols=NULL then time is ignored to create a visit ID.
#' Default is the Darwin Core variables \code{c("day", "month", "year")}.
#' @param grid Either \code{NULL} to be ignored or an object of class
#' \code{sf}, \code{SpatialPolygons} or \code{SpatialPolygonsDataFrame} defining
#' the maximum extent of visits effort. Then x must be an object of class sf
#' @param gridIdCol column in grid with ids.
#'
#' @return A vector of the same length as the number of rows as the data.frame
#'   with a unique number for each combination of the values in the specified
#'   columns.
#' @export
#' @examples
#' OB <- organizeBirds(bombusObs)
#' tmp.vis <- createVisits(bombusObs,
#'                         idCols=c("locality", "recordedBy"),
#'                         timeCols=c("day", "month", "year"),
#'                         gridIdCol = "id")
#' visits(OB, name = "visNoRecorder", useAsDefault = TRUE) <- tmp.vis
createVisits<-function(x,
                       idCols = c("locality", "recordedBy"),
                       timeCols = c("day", "month", "year"),
                       grid = NULL,
                       gridIdCol){

  if(any(class(x) %in% c("data.frame", "SpatialPointsDataFrame", "sf"))){
    if (any(class(x)=="sf")){
      df <- st_drop_geometry(x)
      sfdf <- x
    } else if (any(class(x)=="SpatialPointsDataFrame")){
      df <- x@data
      sfdf <- st_as_sf(x)
    } else if(class(x)=="data.frame"){
      df <- as.data.frame(x) ## in case it is a data.table or some other weird class
      sfdf <- NULL
    }

    if (all(idCols == "")) idCols <- NULL
    if (all(timeCols=="")) timeCols <- NULL

    if(!is.null(grid)) {
      if(any(class(grid) %in% c("SpatialPolygons", "SpatialPolygonsDataFrame", "sf"))
         & !is.null(sfdf)){
        df[,"gridID"] <- getGridIDs(sfdf, grid, gridIdCol)
      } else if(class(grid %in% c("character", "numeric"))){
        if(length(grid) == nrow(df)) {
          df[,"gridID"] <- grid
        } else stop("If grid is a vector it should be as long as the number of observations.")
      } else stop("The argument 'grid' should be a geometry or a vector with IDs.")

      gridID <- "gridID"

    } else {
      gridID <- NULL
    }

    columns <- c(gridID, idCols, timeCols)
    if (length(columns) == 0) stop("At least one of the arguments 'idCols','timeCols','grid' needs to be defined.")

    cols.df <- findCols(columns, df, exact = TRUE)

    if(all(lengths(cols.df) > 0)){
      cols.df <- unlist(cols.df)
      res <- as.integer(factor(apply(df[cols.df], 1, paste0, collapse="")))
      return(res)
    } else {
      stop("Some or any of the column names were not found in the data set.")
    }

  } else {
    stop("Argument 'x' must be a 'data.frame' or an 'OrganizedBirds'")
  }

}


#' Get/set the visits
#'
#' Gets or sets the visits identifier for a OrganizedBirds-class.
#'
#' @usage visits(x, name=NULL)
#' @aliases \sQuote{visits<-}
#' @param x An OrganizedBirds-object
#' @param name The name of the visit column. Default is \code{NULL}, which will
#'   get/write to the predefined visit column (\code{visitUID}).
#' @param useAsDefault Specifies if the defined column in \code{name} should be used as
#'   the default column for the visits in further analysis. If name is
#'   \code{NULL} and \code{useAsDefault = TRUE}, \code{value} will be written to
#'   column (\code{visitUID}) and that column will be set to default.
#' @param value the value to assign
#'
#' @export
#' @examples
#' ob <- organizeBirds(bombusObs)
#' attr(ob, "visitCol")
#' vis <- visits(ob)
#' tmp.vis <- createVisits(bombusObs,
#'                         idCols=c("locality"),
#'                         timeCols = c("day", "month", "year"),
#'                         gridIdCol = "id")
#' visits(ob, name = "visNoRecorder", useAsDefault = TRUE) <- tmp.vis
#' vis2 <- visits(ob)
#' attr(ob, "visitCol")
visits<-function(x, name=NULL){

  if(class(x)!="OrganizedBirds"){
    stop("Cannot get the visits from other than a OrganizedBirds-class")
  }

  if(is.null(name)){
    name<-attr(x, "visitCol")
  }

  return(st_drop_geometry(x[[1]][,name]))
}

#' @rdname visits
#' @export
'visits<-'<-function(x,
                     name=NULL,
                     useAsDefault = TRUE,
                     value){

  if(is.null(name)){
    name <- "visitUID"
  }

  if(class(x)=="OrganizedBirds"){

    x$spdf[,name] <- value

    if(useAsDefault){
      attr(x, "visitCol") <- name
    }

  }else if(length(dim(x)) == 2){
    if(any(colnames(x) == name)){
      x[,name] <- value
    }else{
      cName <- colnames(x)
      cName <- c(cName, name)
      x <- cbind(x, value)
      colnames(x) <- cName
    }

    if(useAsDefault){
      warning("Cannot set the default visit column for any object other than a OrganizedBirds. useAsDefault=TRUE will not be used.")
    }

  }else{
    stop("Cannot create a visit column to nothing other than a OrganizedBirds-class or a object with two dimensions")
  }

  return(x)
}




#' Extract observation data
#'
#' Extract the observation data from a OrganizedBirds-object
#'
#' @param x An OrganizedBirds-object
#' @return A data.frame
#' @export
#'
#' @examples
#' ob <- organizeBirds(bombusObs)
#' head(obsData(ob))
obsData<-function(x){
  UseMethod("obsData")
}

#' @rdname obsData
#' @export
obsData.OrganizedBirds<-function(x){
  if(any(class(x$spdf) == "SpatialPointsDataFrame")) return(x$spdf@data)
  if(any(class(x$spdf) == "sf")) return(st_drop_geometry(x$spdf))
}


#### Main Function #####

#'Organize a data.frame to a usable format
#'
#'Takes a data.frame with reported species observations and reformats it, using
#' visit identifiers, to an OrganizedBirds-class that can be used in further
#' analyses with the BIRDS-package.
#'
#'An OrganizedBirds-class is essentially a list containing a spatial element.
#'After version 0.2, the resulting element is of class \code{sf}. However, we
#'now add a parameter for backwards compatibility. It also accepts the inputs
#'in both formats. This sf has its data formatted in a way that the other
#'functions in the BIRDS-package can use further on. It also has the attribute
#'\code{"visitCol"}, which indicates which column in the data.frame holds the
#'visit identifier. The visit identifier is created by the function
#'\code{\link{createVisits}}, which creates a unique id for each combination of
#'the values in the defined columns.
#'
#'The variable \code{timeCol} can be formatted differently. If the variable is a
#'named vector with the names "Year", "Month" and "Day" (letter capitalization
#'does not matter) it will use the variable named year as the year column and so
#'on. Otherwise it will use the first variable as year, the second as month and
#'the third as day, if there is a vector of length three or more. If the vector
#'is of only length one it will interpret the column as a date column formatted
#'as "yyyy-mm-dd".
#'
#' @param x A data.frame, sf or a SpatialPointsDataFrame containing at least a
#'  column for species name, one or several columns for date of observation, one or
#'  several columns for identifying a visit and, if it is not spatial, coordinate
#'  columns.
#' @param sppCol A character string with the column name for the column for the
#' species names. Default is the Darwin Core standard name \code{"scientificName"}.
#' @param idCols A character vector of the names for the columns that
#'  are holding the information that identifies a visit. Default is the Darwin
#'  Core standard column names \code{c("locality", "day", "month", "year",
#'  "recordedBy")}.
#' @param timeCols A character vector with the names for the column(s) holding the
#' observation dates. Default is the Darwin Core standard column names
#' \code{c("year", "month", "day")}.
#' @param timeInVisits A flag indicating whether visits are defined by
#' the time definition or not, and to which resolution. Default is 'day'.
#' Alternatives are \code{c("day", "month", "year", NULL)}. Time is anyhow
#' organised into three columns year, month, day.
#' @param grid Either \code{NULL} to be ignored or an object of class
#' \code{SpatialPolygons} or \code{SpatialPolygonsDataFrame} as identifier of
#' the visits spatial extent.
#' @param presenceCol A character string with the column name for the column for the
#' presence status. Default is \code{NULL}.
#' @param xyCols A character vector of the names for the columns that are holding
#'  the coordinates for the observations. The order should be longitude(x),
#'  latitude(y). Default is the Darwin Core standard column names
#'  \code{c("decimalLongitude", "decimalLatitude")}. Only applicable to non-
#'  spatial data.frames.
#' @param dataCRS A character string or numeric for the data.frame CRS (Coordinate Reference
#'  System). Default is \code{4326}, which is WGS 84. This is only
#'  applicable to non-spatial data.frames, since a spatial data.frames already
#'  should have this information.
#' @param taxonRankCol the name of the column containing the taxonomic rank for
#' the observation.
#' That is the minimum taxonomic identification level.
#' @param taxonRank a string or vector of strings containing the taxonomic ranks to keep.
#' Only evaluated if taxonRankCol is not \code{NULL}
#' @param simplifySppName Logical. whether to remove everything else that is not the species
#' name (authors, years). Default set to FALSE, else leaves a canonical name given
#' by taxize::gbif_parse(), that is a scientific name with up to 3 elements.
#' @param spOut Logical. Whether the result should be a SpatialPolygon (sp) or an sf.
#'
#' @import sf
#' @importFrom stats IQR median na.omit  quantile var
#' @importFrom grDevices boxplot.stats
#' @importFrom graphics barplot layout legend mtext par plot
#' @return a `SpatialPointsDataFrame` wrapped into an object of class OrganizedBirds,
#' with additional attributes.
#' @export
#'
#' @examples OB <- organizeBirds(bombusObs)
#' @seealso \code{\link{createVisits}} to create unique visits IDs,
#'  \code{\link{visits}} to get or set the visit IDs to this class,
#'  \code{\link{simplifySpp}} to simplify species names,
#'  \code{\link{obsData}} to retrieve the data.frame from this class.
#' @aliases organiseBirds
organizeBirds <- function(x,
                        sppCol = "scientificName",
                        idCols = c("locality", "recordedBy"),
                        timeCols = c("year", "month", "day"),
                        timeInVisits = "day",
                        grid = NULL,
                        presenceCol = NULL,
                        xyCols = c("decimalLongitude", "decimalLatitude"),
                        dataCRS = 4326,
                        taxonRankCol=NULL,
                        taxonRank=c("SPECIES","SUBSPECIES","VARIETY"),
                        simplifySppName=FALSE,
                        spOut=FALSE){

  crs <- st_crs(as.numeric(dataCRS))
  stdTimeCols <- c("year", "month", "day")

  # Check the type of data
  if(any(class(x) == "data.frame")){
    x <- as.data.frame(x)

    xyColsl.df <- unlist(findCols(xyCols, x))
    if(length(xyColsl.df) == 0) stop("The column names defined for the coordinates could not be found in the data set")
    if(length(xyColsl.df) == 1) stop("The column names defined for the coordinates must be two. Check your values")

    if (length(xyColsl.df) > 0){
      if (length(xyColsl.df) > 2){ ## if too many matches try exact=TRUE
        xyColsl.df <- unlist(findCols(xyCols, x, exact=TRUE))
        if(length(xyColsl.df) == 0) stop("The column names defined for the coordinates could not be found in the data set")
      }
      x <- st_as_sf(x, coords = xyColsl.df)
      st_crs(x) <- st_crs(as.numeric(dataCRS))
      xdf <- st_drop_geometry(x)

    ### TODO Add message if CRS is not compatible with coordinates?? Do it with try.catch

    } else { stop("The column names defined for the coordinates could not be found in the data set")}
  } else if(any(class(x) == "SpatialPointsDataFrame")){
    x <- st_as_sf(x)
    xdf <- st_drop_geometry(x)
  } else if(any(class(x) == "sf")){
    xdf <- st_drop_geometry(x)## Just continue... :)
  } else {
    stop("The argument 'x' should be of class data.frame, sf or SpatialPointsDataFrame.")
  }

  if(st_crs(x) != crs){
    x <- st_transform(x, crs = crs)
  }

  ### Check the column names
  if (any(duplicated(tolower(names(x))))){
    stop("There are duplicated column names in the dataset (Note: case insensitive check).")
  }

  # Check if user wants to leave a certain level
  if (!is.null(taxonRankCol)){
    TRCol.df <- findCols(taxonRankCol, x, exact = TRUE)
    if (length(TRCol.df) > 0){
      exact.taxonRank <- paste0("\\b", taxonRank, "\\b") ## exact match
      wIn <- unique(
                unlist(
                  lapply(exact.taxonRank,
                         grep,
                         st_drop_geometry(x)[, TRCol.df],
                         ignore.case = TRUE,
                         value = FALSE
                        )
                )
              )

      if (length(wIn) > 0){
        nOut <- nrow(x) - length(wIn)
        x <- x[wIn,]
        if(nOut > 0) message(paste0(nOut, " observations did not match with the specified taxon rank and were removed."))
      } else { stop(paste0("No observation match with the specified taxon rank(s).")) }
    } else { stop(paste0("Taxon Rank: there is no column called ", taxonRankCol))}
  }

  # Simplify species names to reduce epithets and author names
  sppCol.df <- findCols(sppCol, x, exact = TRUE)
  if (length(sppCol.df) > 0){
    if (!is.null(simplifySppName) && simplifySppName == TRUE){
      x[, sppCol.df] <- simplifySpp(st_drop_geometry(x), sppCol.df)
    }
  } else { stop(paste0("Species name: there is no column called ", sppCol))}

  ## column name control defined in the function organizeDate()
  x[, stdTimeCols] <- organizeDate(st_drop_geometry(x), timeCols)


  ## colum name control defined in the function visitUID()
  ## Time is optional in the visits
  if (is.null(timeInVisits)) {
    timeColsVis <- timeInVisits
  } else {
    timeColsVis <- switch(timeInVisits,
                          "day" = c("year", "month", "day"),
                          "month" = c("year", "month"),
                          "year" = "year") ## Else NULL
  }

  x[,"visitUID"] <- createVisits(x,
                                 idCols = idCols,
                                 timeCols = timeColsVis,
                                 grid = grid,
                                 gridIdCol = NULL)

  #### Preparing the output as we want it
  res.df <- x[,c(sppCol.df, stdTimeCols, "visitUID", "geometry")]

  if (!is.null(presenceCol)){
    presenceCol.df <- findCols(presenceCol, x)
    if (length(presenceCol.df) > 0){
      presence <- x[, presenceCol.df]
      presence <- ifelse(presence>=1, 1, 0)
      res.df[,"presence"] <- presence
    } else {stop(paste0("Presence: there is no column called ", presenceCol))}
  }

  colnames(res.df)[1] <- "scientificName"

  #### Add the visits SLL to each visits
  x <- res.df
  if(spOut) x <- as_Spatial(x)

  res <- list(x)

  names(res)<-"spdf"

  class(res)<-c("OrganizedBirds")
  attr(res, "visitCol")<-"visitUID"

  return(res)
}

#' @rdname organizeBirds
#' @export
organiseBirds <- organizeBirds ## To include the Brits as well

Try the BIRDS package in your browser

Any scripts or data that you put into this service are public.

BIRDS documentation built on June 27, 2021, 1:06 a.m.