knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
Note: You can use coma chameleon app if you want to edit .csv without opening excel and risking cell unformatting.
Turn on the music and let's clean some dirty data.
1) First, let's load the package and some fake data. dirty_data
contains typical data to clean, the example
is a template with the desired data format, including all possible categories for categorical variables, and min/max values for numerical variables. Finally, the species thesaurus is a file with the accepted genus and species names.
library(cleanR) dirty_data <- read.csv("../inst/extdata/dirty_data.csv", stringsAsFactors = FALSE) example <- read.csv("../inst/extdata/template.csv") species_tesaurus <- read.csv("../inst/extdata/species_tesaurus.csv", sep = ";")
I created a few help_
functions to provide tips and refresh my mind on what options are available. Here we first start by checking variable names (colnames) match with what we expect.
#To refresh the steps (and copy-paste the structure) I usually use: help_structure() #create template to check against check <- define_template(example, species_tesaurus) #Note: Use factors only for those variables you want to compare categories. dirty_data$Country <- as.factor(dirty_data$Country) dirty_data$Locality <- as.factor(dirty_data$Locality) (comp <- compare_variables(check, dirty_data))
We observe that some variables do not match (e.g. UTM vs lat long, or Date vs Day, Month Year). We will look onto that later. In addition, some are simply mispelled (e.g. Reference..doi.). The worker
column should be an integer, but it's a character and the males variable has values out of range. Finally, we need to fix some categories. All this process is quite manual because the ofensive values are context dependent. Let's do that.
#Reference doi is misspelled colnames(dirty_data)[which(colnames(dirty_data) == "Reference..doi.")] <- "Reference.doi" #There is a O instead of 0 dirty_data$Worker[which(dirty_data$Worker == "O")] <- 0 dirty_data$Worker <- as.numeric(dirty_data$Worker) #Males can't be -2 dirty_data$Male[which(dirty_data$Male < 0)] <- NA #Fix misspelled Country and Locality dirty_data$Country <- as.character(dirty_data$Country) #this is not elegant, fix. dirty_data$Country <- ifelse(dirty_data$Country %in% as.character(comp$Country$to_fix), as.character(comp$Country$fixed), dirty_data$Country) #asuming you don't want France to be included. dirty_data$Locality <- as.character(dirty_data$Locality) #this is not elegant.s dirty_data$Locality <- ifelse(dirty_data$Locality == as.character(comp$Locality$to_fix), as.character(comp$Locality$fixed), dirty_data$Locality)
Finally, we have a function to add the missing variables and reorder the data frame
#add missing variables dirty_data <- add_missing_variables(check, dirty_data) #recheck compare_variables(check, dirty_data) #drop_variables(check, dirty_data) I am not droping variables yet, as I'll need those.
Second, We need to convert a few variables such as Date, Lat-Longs, etc... we have some helpers for that:
#I never remember georeference functions, so I can ask for a tip help_geo() dirty_data$Latitude <- parzer::parse_lat(as.character(dirty_data$UTM.N)) dirty_data$Longitude <- parzer::parse_lat(as.character(dirty_data$UTM.S)) alternative <- mgrs::mgrs_to_latlng(as.character(dirty_data$MGRS)) #From here you can extract lat and long easily #We may also be interested in geolocating some localities latlong <- geocode(unique(as.character(dirty_data$Locality))) latlong ; unique(paste(dirty_data$Latitude, dirty_data$Longitude)) #not bad! #From here you can extract lat and long easily #and I also often forget about Date manipulation help_date() #When the date format is "easy" I have a wrapper temp <- extract_date(dirty_data$Date, format = "%d-%m-%Y") dirty_data$Year <- 2000+temp$year dirty_data$Month <- temp$month dirty_data$Day <- temp$day #Another common task is cleaning or spliting species names with some heuristics: help_species() #tipical useful functions temp <- extract_pieces(dirty_data$Species, species = TRUE) #this may fail with more dirty data #in this case strsplit() may also work. #One Genus has the Subgenus attached in brackets temp2 <- extract_pieces(dirty_data$Genus, pattern_ = " (", i = 2, f = 1) dirty_data$Subgenus <- temp2$piece2 dirty_data$Genus <- temp$piece2 dirty_data$Species <- temp$piece1 #finally we check again compare_variables(check, dirty_data) #Note we have a year in 2029 which is not possible dirty_data$Year[which(dirty_data$Year > 2019)] <- 2019 #Now we can drop variables clean_data <- drop_variables(check, dirty_data) #and add a unique id clean_data <- add_uid(newdat = clean_data, "test") head(clean_data)
3) Finally, one always want to check species names, as misspellings and synonims are common.
#create a genus species column clean_data$Gen_sp <- paste(trimws(clean_data$Genus), trimws(clean_data$Species)) #use the check_sp function to compare with the thesaurus. to_fix <- check_sp(check, clean_data$Genus, clean_data$Species, k = 2) #We provide fuzzy matching for misspellings. The tolerance in the number of changes is set by the parameter k. k = 2 means two changes. to_fix #Here we can accept the proposed fixes, as those makes sense. #We add this columns, to preserve the original names. clean_data <- merge(clean_data, to_fix, by.x = "Gen_sp", by.y = "mismatches", all.x = TRUE) #we can now create a final used_Gen_sp column and rename the original. clean_data$used_Gen_sp <- ifelse(is.na(clean_data$fixed), clean_data$Gen_sp, clean_data$fixed) clean_data$fixed <- NULL #Note that columns Genus and Species have not been fixed, so we can do it now: temp <- extract_pieces(clean_data$used_Gen_sp, species = TRUE) clean_data$Genus <- temp$piece2 clean_data$Species <- temp$piece1 colnames(clean_data)[which(colnames(clean_data) == "Gen_sp")] <- "original_Gen_sp" head(clean_data)
For more complex scinetific name cleaning (e.g. detecting synonyms), you can use taxize. I also made a wrapper, but this has evolved quite a lot.
#library(taxize) species <- c("Osmia rufa", "Osmia bicornis", "Osmia ruffa", "Osmia wikifluqie", "watermelon pie", "Osmia sp.") #clean_species(species) #This is slow #You can also check #install.packages('traitdataform') #get_gbif_taxonomy()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.