knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Note: You can use the Comma Chameleon app if you want to edit a .csv without opening Excel and risking the cells being reformatted.

Turn on the music and let's clean some dirty data.

1) First, let's load the package and some fake data. dirty_data contains typical data to clean; the example is a template with the desired data format, including all possible categories for categorical variables and min/max values for numerical variables. Finally, the species thesaurus is a file with the accepted genus and species names.

library(cleanR)
dirty_data <- read.csv("../inst/extdata/dirty_data.csv", stringsAsFactors = FALSE)
example <- read.csv("../inst/extdata/template.csv")  
species_tesaurus <- read.csv("../inst/extdata/species_tesaurus.csv", sep = ";")

I created a few help_ functions to provide tips and refresh my mind on what options are available. We start by checking that the variable names (colnames) match what we expect.

#To refresh the steps (and copy-paste the structure) I usually use:
help_structure()

#create template to check against
check <- define_template(example, species_tesaurus)

#Note: Use factors only for those variables for which you want to compare categories.
dirty_data$Country <- as.factor(dirty_data$Country)
dirty_data$Locality <- as.factor(dirty_data$Locality)
(comp <- compare_variables(check, dirty_data))

We observe that some variables do not match (e.g. UTM vs lat-long, or Date vs Day, Month, Year). We will look into that later. In addition, some are simply misspelled (e.g. Reference..doi.). The Worker column should be an integer, but it's a character, and the Male variable has values out of range. Finally, we need to fix some categories. This whole process is quite manual because the offending values are context dependent. Let's do that.

#Reference doi is misspelled
colnames(dirty_data)[which(colnames(dirty_data) == "Reference..doi.")] <- "Reference.doi"

#There is an O instead of a 0
dirty_data$Worker[which(dirty_data$Worker == "O")] <- 0
dirty_data$Worker <- as.numeric(dirty_data$Worker)

#Males can't be -2
dirty_data$Male[which(dirty_data$Male < 0)] <- NA

#Fix misspelled Country and Locality
dirty_data$Country <- as.character(dirty_data$Country) #this is not elegant; see the lookup sketch after this chunk.
dirty_data$Country <- ifelse(dirty_data$Country %in% as.character(comp$Country$to_fix),
                             as.character(comp$Country$fixed), dirty_data$Country) #assuming you don't want France to be included.

dirty_data$Locality <- as.character(dirty_data$Locality) #this is not elegant either.
dirty_data$Locality <- ifelse(dirty_data$Locality ==
                                as.character(comp$Locality$to_fix),
                              as.character(comp$Locality$fixed), dirty_data$Locality)
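
As noted in the comments above, those ifelse() calls are not very elegant. A minimal alternative sketch using a named lookup vector (base R only; like the ifelse() version, it assumes comp$Country$to_fix and comp$Country$fixed are parallel vectors of the same length):

#lookup-table alternative for the Country fix (illustrative sketch, same result as above)
country_fixes <- setNames(as.character(comp$Country$fixed),
                          as.character(comp$Country$to_fix))
needs_fix <- dirty_data$Country %in% names(country_fixes)
dirty_data$Country[needs_fix] <- unname(country_fixes[dirty_data$Country[needs_fix]])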

Finally, we have a function to add the missing variables and reorder the data frame:

#add missing variables
dirty_data <- add_missing_variables(check, dirty_data)

#recheck
compare_variables(check, dirty_data)

#drop_variables(check, dirty_data) I am not dropping variables yet, as I'll need those.

2) Second, we need to convert a few variables, such as Date and lat-long coordinates. We have some helpers for that:

#I never remember georeference functions, so I can ask for a tip
help_geo()
dirty_data$Latitude <- parzer::parse_lat(as.character(dirty_data$UTM.N))
dirty_data$Longitude <- parzer::parse_lon(as.character(dirty_data$UTM.S))

alternative <- mgrs::mgrs_to_latlng(as.character(dirty_data$MGRS))
#From here you can extract lat and long easily
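#For example (a sketch; assuming the returned data frame has lat and lng columns):
#dirty_data$Latitude2 <- alternative$lat
#dirty_data$Longitude2 <- alternative$lng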

#We may also be interested in geolocating some localities 
latlong <- geocode(unique(as.character(dirty_data$Locality)))
latlong ; unique(paste(dirty_data$Latitude, dirty_data$Longitude)) #not bad!
#From here you can extract lat and long easily

#and I also often forget about Date manipulation
help_date()
#When the date format is "easy" I have a wrapper

temp <- extract_date(dirty_data$Date, format = "%d-%m-%Y")
dirty_data$Year <- 2000+temp$year
dirty_data$Month <- temp$month
dirty_data$Day <- temp$day
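#If extract_date() does not fit your format, a minimal alternative sketch with the
#lubridate package (an assumption; lubridate is not part of cleanR) would be:
#d <- lubridate::dmy(dirty_data$Date)
#dirty_data$Year <- lubridate::year(d)
#dirty_data$Month <- lubridate::month(d)
#dirty_data$Day <- lubridate::day(d)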

#Another common task is cleaning or splitting species names with some heuristics:
help_species() #typical useful functions

temp <- extract_pieces(dirty_data$Species, species = TRUE) #this may fail with more dirty data
#in this case strsplit() may also work.
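#A minimal strsplit() sketch (illustrative only; assumes names look like "Genus species"):
#pieces <- strsplit(trimws(as.character(dirty_data$Species)), " ")
#genus_part <- sapply(pieces, `[`, 1)
#species_part <- sapply(pieces, `[`, 2)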

#One Genus has the Subgenus attached in brackets
temp2 <- extract_pieces(dirty_data$Genus, pattern_ = " (", i = 2, f = 1) 
dirty_data$Subgenus <- temp2$piece2
dirty_data$Genus <- temp$piece2
dirty_data$Species <- temp$piece1

#finally we check again
compare_variables(check, dirty_data) #Note we have a year in 2029 which is not possible
dirty_data$Year[which(dirty_data$Year > 2019)] <- 2019

#Now we can drop variables
clean_data <- drop_variables(check, dirty_data)
#and add a unique id
clean_data <- add_uid(newdat = clean_data, "test")

head(clean_data)

3) Finally, one always wants to check species names, as misspellings and synonyms are common.

#create a genus species column
clean_data$Gen_sp <- paste(trimws(clean_data$Genus),
                           trimws(clean_data$Species))
#use the check_sp function to compare with the thesaurus.
to_fix <- check_sp(check, clean_data$Genus, clean_data$Species, k = 2)
#We provide fuzzy matching for misspellings. The tolerance, in number of character changes, is set by the parameter k; k = 2 allows up to two changes.
to_fix #Here we can accept the proposed fixes, as those make sense.
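#For intuition only (not necessarily what check_sp() uses internally): base R adist()
#counts single-character edits between strings, so "Osmia ruffa" is 1 edit away from
#"Osmia rufa" and falls within k = 2.
adist("Osmia ruffa", "Osmia rufa")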

#We add these columns to preserve the original names.
clean_data <- merge(clean_data, to_fix, by.x = "Gen_sp", by.y = "mismatches", all.x = TRUE)

#we can now create a final used_Gen_sp column and rename the original.
clean_data$used_Gen_sp <- ifelse(is.na(clean_data$fixed), clean_data$Gen_sp,
                                 clean_data$fixed)
clean_data$fixed <- NULL

#Note that columns Genus and Species have not been fixed, so we can do it now:
temp <- extract_pieces(clean_data$used_Gen_sp, species = TRUE) 
clean_data$Genus <- temp$piece2
clean_data$Species <- temp$piece1
colnames(clean_data)[which(colnames(clean_data) == "Gen_sp")] <- "original_Gen_sp"

head(clean_data)

For more complex scientific name cleaning (e.g. detecting synonyms), you can use taxize. I also made a wrapper, but this has evolved quite a lot.

#library(taxize)
species <- c("Osmia rufa", "Osmia bicornis", "Osmia ruffa",
            "Osmia wikifluqie", "watermelon pie", "Osmia sp.")
#clean_species(species) #This is slow


#You can also check 
#install.packages('traitdataform')
#get_gbif_taxonomy()
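#A minimal sketch (assuming traitdataform is installed; it takes a character vector of names):
#traitdataform::get_gbif_taxonomy(species)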

