In enpjp/PrepareDataForETL: Prepares Data for Extract Transform and Load

Reading and converting Bird Journal files in the bird journal format

knitr::opts_chunk$set(echo = TRUE)
# Libraries
library(gridExtra)
library(magrittr)
library(dplyr)
library(data.table)
library(tidyr)
library(rgdal) # for spTransform
library(stringr)

# Now for some useful short cuts

### shortcuts
ukgrid <- "+init=epsg:27700"
latlong <- "+init=epsg:4326"
bng <- '+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +datum=OSGB36 +units=m +no_defs'

Reading and converting Bird Journal files in the bird journal format

Run this in the same directory as the Bird Journal file to convert.

# Read the files. This needs to be altered for each set of files
# Run from the same directory as the data
file_for_input <- c("Bird Journal Export.csv")

# And a file for output
file_for_output <- c("converted-to-natureSpot.csv")

# Now read the files
my.csv.data <- read.csv (file= file_for_input, na.strings = "NA", skip = 0, check.names = TRUE, strip.white = TRUE, blank.lines.skip = TRUE, stringsAsFactors=FALSE)

Process the files

# Convert to a data table
my.data <- my.csv.data %>% as.data.table
# The use of NA is not consistent so lets change all the empty strings into NA's
my.data[ my.data$Location. == "" & my.data == ""] <- NA


 for(my.col in 1:ncol(my.data)) {
  my.data <- fill(my.data, names(my.data)[my.col])
 }

# Create the columns we need
# Make a list of field names to ignore because we are already handling them
white.list <- c(
  "Location.",
  "Location.Latitude.",
  "Location.Longitude.",
  "Date.",
  "Start.Time.",
  "End.Time.",
  "Weather.",
  "eBird.Protocol.",
  "Number.of.Observers.",
  "All.Sightings.Recorded.",
  "Effort.Distance.Miles.",
  "Effort.Area.Acres.",
  "Name.",
  "Scientific.Name.",
  "Count.",
  "Latitude.",
  "Longitude.",
  "TimeStamp."
)

# Find the columns we will merge
my.merge.col <- setdiff(names(my.data), white.list)

# Now recode some of the entries

my.data$Heard.Only[my.data$Heard.Only %like% "Yes" ] <- "Heard only."
my.data$Good.View[my.data$Good.View %like% "Yes" ] <- "Good view."

my.data$merge.notes <- apply(my.data[ , my.merge.col , with=FALSE] , 1 , paste , collapse = " " )
# Now remove the NAs
my.data$merge.notes <- gsub("NA", "", my.data$merge.notes)

Co-ordinates

### Create coordinates variable
coords <- cbind(Easting = as.numeric(as.character(my.data$Longitude.)),
                Northing = as.numeric(as.character(my.data$Latitude.)))
coords_df <- as.data.frame(coords)
# Need to drop the rows which have NAs
coords.na <- which(is.na(coords[,1]))
if (length(coords.na) > 0) {
coords <- coords[-coords.na,]
coords_df <- as.data.frame(coords)
my.data <- my.data[-coords.na]}

### Create the SpatialPointsDataFrame
dat_SP <- SpatialPointsDataFrame(coords_df,
                                 data = my.data,
                                 proj4string = CRS(latlong))

## Convert
dat_SP_BNG <- spTransform(dat_SP, CRS(bng))
# Finally have Eastings and Northings appended to the data frame
df_working <- as.data.frame(dat_SP_BNG)
# Trim the leading digit as this is only used to set the grid ref letters
#df_working$Easting_trimmed <- round((df_working$Easting/100000)%%1,4) %>% as.character %>% substr(3,10)
df_working$Easting_trimmed <- df_working$Easting %>% as.character %>% substr(2,6)
df_working$Easting_lookup <- df_working$Easting %>% as.numeric() %>% substr(1,1)
#Easting_trimmed_vector <- df_working$Easting_trimmed
df_working$Northing_trimmed <- df_working$Northing %>% as.character  %>% substr(2,6)
df_working$Northing_lookup <- df_working$Northing %>% as.numeric() %>% substr(1,1)
#Northing_trimmed_vector <- df_working$Northing_trimmed

# Need to look up the grid letter from the National Grid Lookup
UKNG_Lookup <- array(  c(
        "SV", "SQ", "SL", "SF", "SA", "NV", "NQ", "NL", "NF", "NA", "HV", "HQ", "HL",
        "SW", "SR", "SM", "SG", "SB", "NW", "NR", "NM", "NG", "NB", "HW", "HR", "HM",
        "SX", "SS", "SN", "SH", "SC", "NX", "NS", "NN", "NH", "NC", "HX", "HS", "HN",
        "SY", "ST", "SO", "SJ", "SD", "NY", "NT", "NO", "NJ", "ND", "HY", "HT", "HO",
        "SZ", "SU", "SP", "SK", "SE", "NZ", "NU", "NP", "NK", "NE", "HZ", "HU", "HO",
        "TV", "TQ", "TL", "TF", "TA", "OV", "OQ", "OL", "OF", "OA", "JV", "JQ", "JL",
        "TW", "TR", "TM", "TG", "TB", "OW", "OR", "OM", "OG", "OB", "JW", "JR", "JM"
         ),
        dim = c(13, 7), dimnames = NULL
        ) %>% as.data.frame()
# lookup Matrix

for (my.row.number in 1: nrow(df_working) ) {

#grid_letters <- UKNG_Lookup[df_working$Northing_lookup[my.row.number],df_working$Easting_lookup[my.row.number]] %>% as.character()
easting <- df_working$Easting_lookup[my.row.number] %>% as.numeric()
northing <- df_working$Northing_lookup[my.row.number] %>% as.numeric()
grid_letters <- UKNG_Lookup[northing+1,easting+1] %>% as.character()

df_working$GridRef <- paste(grid_letters,df_working$Easting_trimmed[my.row.number], df_working$Northing_trimmed[my.row.number], sep ="")

my.data$GeoTag[my.row.number] <- df_working$GridRef[my.row.number]

}

And finaly output the file

## Now get the naturespot output
NatureSpot <- data.frame(
    Species.name = my.data$Scientific.Name.,
    Abundance = my.data$Count.,
    Confidence= c("Certain"),
    Identified.by = c("Paul Palmer"),
    Occurrence.comment = my.data$merge.notes,
    Date = my.data$Date.,
    Site.name = my.data$Location.,
    Grid.ref= my.data$GeoTag

) 
# Quirks  Recoding 
NatureSpot$Species.name <- gsub("Alopochen aegyptiaca", 
                                "Alopochen aegyptiacus", 
                                NatureSpot$Species.name)
NatureSpot$Species.name <- gsub("Scythris hexadactyla", 
                                "Alucita hexadactyla", 
                                NatureSpot$Species.name)

print(head(NatureSpot))
write.csv(NatureSpot, file = file_for_output, row.names=FALSE )
#rm(NatureSpot,data)