ERIC_Data_Proc: Functions for data processing and validation

Documented in format_and_check_input_data

#' Format the data for input and perform some checks
#'
#' 'Use this function to perform checks on incoming data
#' Flag whether iRecord, ERIC, standard data, NTBC, and
#'
#' @param raw_data Set of species data
#' @param locCheck whether to check for blank locations
#' @param inputFormat the source of the data, e.g. iRecord
#' @param recorderName name of recorder to insert into BirdTrack data
#'
#' @return
#' @export
#'
#' @examples
#'a <- c('','BirdsDir-A2.2')
#'b <- c('','')
#'c <- c('','')
#'d <- c('insect - butterfly','bird')
#'e <- c('Pieris rapae','Turdus merula')
#'f <- c('Small White','Blackbird')
#'g <- c('Mike Jeffries','Alnwick Wildlife Group')
#'h <- c('Morpeth Town','Morpeth Town')
#'i <- c('04/09/2010','04/09/2010')
#'j <- c('NZ18X','NZ18X')
#'k <- c('Butterfly Conservation','Alnwick Wildlife Group')
#'l <- c('Butterfly Conservation','Alnwick Wildlife Group')
#'m <- c('2 Count','1 Count')
#'n <- c('Considered Correct','Considered Correct')
#'o <- c(419000,419000)
#'p <- c(585000,585000)
#'q <- c(1000,1000)
#'r <- c('layer','layer')
#'s <- c('path','path')

#'df <- data.frame(a,b,c,d,e,f,a,g,h,h,i,j,k,l,m,n,o,p,q,r,s,b)


#'names(df) <- c('All.Design', 'Wildlife..', 'Wildlife_1', 'Taxon.grou', 'Taxon.Lati', 'Taxon.Comm', 'Obs.Commen', 'Sample.Rec', 'Sample.Loc', 'Sample.L_1', 'Sample.Dat', 'Sample.Spa', 'Survey.Run', 'Survey.Nam', 'Obs.Abunda', 'Determinat', 'Central_Ea', 'Central_No', 'Buffer', 'layer', 'path','Info')
#'OutputCols <- c("All.Design","Taxon.grou", "Taxon.Lati","Taxon.Comm", "Obs.Abunda", "Determinat","Obs.Commen","Sample.Rec",  "Sample.Loc",  "Sample.Dat","Sample.Spa","Survey.Run","Survey.Nam","Info")
#'newColNames <- c("Designations","Taxon group","Latin Name","Common Name","Abundances","Determination Type","Comments","Recorder","Location Name","Date","Grid Reference","Survey Run By","Survey Name","Additional Information")
#'data <- format_and_check_data(df,OutputCols,newColNames, 0)

format_and_check_input_data <- function(raw_data,locCheck,inputFormat,recorderName) {

  locsToReplace <- setup_locs_to_replace()
  locsToIgnore <- setup_locs_to_ignore()
  swearWords <- setup_profanity_config()
  speciesToFlag <- setup_species_config()
  recordersToIgnore <- setup_recorders_to_ignore()
  speciesToIgnore <- setup_speciesnames_to_ignore()
  scientific <- setup_scientific_config()
  invasives <- setup_invasives_config()

  #FORMATTING

  #If dataSource is iRecord
  if (inputFormat == "irecord") {
  # Add verifier to observer
    raw_data["Verifier"][is.na(raw_data["Verifier"])] <- ""
    raw_data$Recorder<-paste(raw_data$Recorder,raw_data$Verifier,sep=", ")


    #Replace "present" in Abundances
    raw_data["Abundances"][(raw_data["Abundances"])=="Present"] <- ""

    #Replace "to" in Dates
    raw_data$Date<-str_replace(raw_data$Date," to ", " - ")


    #Remove NA throughout
    raw_data[is.na(raw_data)]<-""

    # Build comment field

    raw_data["Sample method"][(raw_data["Sample method"])=="Unknown"] <- ""

    raw_data["Sex"][(raw_data["Sex"])=="not recorded"] <- ""

    raw_data["Stage"][(raw_data["Stage"])=="not recorded"] <- ""


    raw_data$Biotope <- ifelse(raw_data$Biotope=="","",paste("Biotope:",raw_data$Biotope))

    raw_data$Comments<-str_trim(paste(raw_data$Comments, raw_data$`Sample comment`,raw_data$Biotope,raw_data$`Sample method`,raw_data$Sex,raw_data$Stage))

  }
  #If dataSource is ERIC website
  if (inputFormat == "eric") {
    #Replace "present" in Abundances
    raw_data["Abundances"][(raw_data["Abundances"])=="Present"] <- ""


    #Remove NA throughout
    raw_data[is.na(raw_data)]<-""

    #Build comment field
    raw_data["Sample method"][(raw_data["Sample method"])=="Unknown"] <- ""
    raw_data["Sex"][(raw_data["Sex"])=="not recorded"] <- ""
    raw_data["Stage"][(raw_data["Stage"])=="not recorded"] <- ""
    raw_data["Biotope"][is.na(raw_data["Biotope"])] <- ""
    raw_data$Biotope <- ifelse(raw_data$Biotope=="","",paste("Biotope:",raw_data$Biotope))


    raw_data$Comments<-str_trim(paste(raw_data$Comments, raw_data$`Sample comment`,raw_data$Biotope,raw_data$`Sample method`,raw_data$Sex,raw_data$Stage))
  }

  #If dataSource is BirdTrack
  if (inputFormat == "bt") {

    #Do we have breeding info?
    if ("Breeding status" %in% colnames(raw_data)) {
      #Get breeding codes
      breeding_codes <- setup_bto_breeding_codes()
      raw_data <- dplyr::left_join(raw_data,breeding_codes)
    }

    #Setup a column for the observer name
    raw_data$Recorder <- recorderName

    #Remove any "present" values in abundance column
    raw_data["Abundances"][(raw_data["Abundances"])=="Present"] <- ""

    #Remove NA throughout
    raw_data[is.na(raw_data)]<-""


    #Build comment field allowing for optional fields
    #raw_data$Comments<-str_trim(paste(raw_data$Comments, raw_data$Plumage,ifelse(raw_data$`Breeding details`=="",ifelse(raw_data$`Status`=="","",paste("Breeding status:",raw_data$`Status`)),raw_data$`Breeding details`)))

    if ("Plumage" %in% colnames(raw_data)) raw_data$Comments<-str_trim(paste(raw_data$Comments, raw_data$Plumage))
    if ("Breeding details" %in% colnames(raw_data))  raw_data$Comments<-str_trim(paste(raw_data$Comments, ifelse(raw_data$'Breeding details' == "","",raw_data$`Breeding details`)))
    if ("Breeding status" %in% colnames(raw_data))  raw_data$Comments<-str_trim(paste(raw_data$Comments, ifelse(raw_data$'Status' == "","",raw_data$`Status`)))


    #Get the grid ref from one of two columns
    #Pinpoint field is optional
    if ("Pinpoint" %in% colnames(raw_data)) raw_data$`Grid Reference`<- ifelse(raw_data$Pinpoint=="",raw_data$`Grid Reference`,raw_data$Pinpoint)



    }

  #Remove NA throughout
  raw_data[is.na(raw_data)]<-""

  #Add row number column
  rowNo <- seq_len(nrow(raw_data))
  outputData<-cbind(raw_data,rowNo)
  names(outputData)[names(outputData) == 'rowNo'] <- 'Row No'


  #Strip commas and full stops from the end of the Recorder name
  outputData$Recorder <- str_replace(outputData$Recorder,"[.,]$","")


  #Check recorder for blanks, email addresses, "anon", "unknown",  ampersands or "Mr and mrs" type but ignore allowed values
  outputData$flagRec <- is.na(outputData$Recorder == "") | is.na(outputData$Recorder) | !str_detect(outputData$Recorder," ") | stringr::str_detect(outputData$Recorder,"@") | stringr::str_detect(tolower(outputData$Recorder)," and ") | stringr::str_detect(outputData$Recorder,"&") | stringr::str_detect(outputData$Recorder,"[Aa]nonymous") | stringr::str_detect(outputData$Recorder,"[Aa]non[, ]") | stringr::str_detect(outputData$Recorder,"[Uu]nknown") | stringr::str_detect(outputData$Recorder,"[Mm]r ") | stringr::str_detect(outputData$Recorder,"[Mm]s ")| stringr::str_detect(outputData$Recorder,"[Mm]rs ") | stringr::str_detect(outputData$Recorder,"[Mm]iss ")  & is.na(match(outputData$`Recorder`,table = recordersToIgnore$`Recorder`))

  #Check species
  outputData$flagSpecies <- is.na(outputData$`Common Name`== "" & outputData$`Species Name` == "") |  stringr::str_detect(tolower(outputData$'Species Name'),paste(c(tolower(scientific$term)),collapse = "|"))
  speciesToFlag$match <- "yes"

  #Check both columns against species flag list
  cnMatch <- dplyr::left_join(outputData, speciesToFlag, by=c("Common Name"="species"))
  snMatch <- dplyr::left_join(outputData, speciesToFlag, by=c("Species Name"="species"))

  outputData$flagSpecies <- outputData$flagSpecies | (snMatch$match == "yes")
  outputData$flagCommon <- (cnMatch$match == "yes")

  #Check for invasive species
  cnMatch <- dplyr::left_join(outputData, invasives, by=c("Common Name"="Common"))
  snMatch <- dplyr::left_join(outputData, invasives, by=c("Species Name"="species"))

  outputData$flagInvasives <- !is.na(cnMatch$'Alert.level') | !is.na(snMatch$'Alert.level')

  #Check abundance length & zero counts
  outputData$flagAbun <- str_length(outputData$Abundances) >=10 | outputData$Abundances == "0" | str_detect(outputData$Abundances,"-/")

  #Check comment length, for swearwords and e-mail addresses
  outputData$flagCom <- str_length(outputData$Comments) >= 150 | stringr::str_detect(tolower(outputData$Comments),paste(c(swearWords$word),collapse = "|")) | stringr::str_detect(outputData$Comments,"@")

  #Code to check house nums
  outputData <- check_house_numbers(outputData,locsToReplace,locsToIgnore)

  #Check for "Gosforth Park"
  outputData$flagGP <- stringr::str_detect(outputData$`Location Name`,'[Gg]osforth [Pp]ark') | stringr::str_detect(outputData$`Location Name`,'[Gg][Pp][Nn][Rr]')

  #Check length of location name
  outputData$flag1 <- str_length(outputData$`Location Name`) >= 100

  #Check for blank location name
  if (locCheck) {
    outputData$flagLoc <- is.na(outputData$`Location Name` == "") | outputData$flag1
  } else {

    outputData$flagLoc <- outputData$flag1
  }

  #Check grid ref
  outputData$flagGR <- is.na(outputData$`Grid Reference` =="") | is.na(str_length(outputData$`Grid Reference`) <=4) | str_length(outputData$`Grid Reference`) %% 2 | str_detect(outputData$`Grid Reference`, "[ .,-]") | !apply(sapply(c("NT","NU","NY","NZ"),grepl,str_sub(outputData$`Grid Reference`,1,2)),1,any)

  #Check date
  outputData$flagDate <- (outputData$'Date' == "") | stringr::str_detect(outputData$`Date`,' to ') | stringr::str_detect(outputData$`Date`,'[abcdefghijklmnopqrstuvwxyz]/')

  #Add dup check column


  #Return only the columns we want including the flags so we can highlight issues later
  OutputCols <- c("Recorder","Common Name","Species Name","Date","Grid Reference","Location Name","Abundances","Comments","Row No","flagRec","flagSpecies","flagAbun","flagCom","flagLoc","flagGR", "flagCommon", "flagInvasives", "flagDate","flagGP")
  data_subset <- dplyr::select(outputData,dplyr::all_of(unlist(OutputCols)))
  return(data_subset)


}