# In ivanhanigan/exploratoryanalysistemplate: eda
#
# Testing taxonomic review code snippet

# Project root and the directory that receives the files this script writes.
projdir <- "~/tools/exploratory_analysis_template"
outdir <- file.path(projdir, "tests")
# Create the output directory on first run: setwd() below and the later
# write.csv() calls fail if it does not exist.
if (!dir.exists(outdir)) {
  dir.create(outdir, recursive = TRUE)
}
# Base file name; the output file names are derived from it.
outfile <- "test_eda.csv"
file.path(outdir, outfile)
# NOTE(review): outputs in this script are addressed via file.path(outdir, ...),
# so this setwd() looks redundant; kept in case other code relies on the
# working directory -- confirm before removing.
setwd(outdir)

library(gdata)
library(disentangle)
library(EML)
library(sqldf)
library(taxize)
# Input directory, derived from projdir so the project root is defined once.
# Resolves to "~/tools/exploratory_analysis_template/inst/extdata".
indir <- file.path(projdir, "inst", "extdata")

dir(indir)
infile <- "taxonomic_dummy_data.csv"

#### load ####
# Build the input path once so the printed path is exactly the path read.
inpath <- file.path(indir, infile)
print(inpath)
dat <- read.csv(inpath, stringsAsFactors = FALSE)

#### check ####
# Structure plus first and last rows, for a quick visual sanity check.
str(dat)

head(dat)
tail(dat)

#### Taxonomic review ####
# Frequency table of the species names exactly as they appear in the data.
tx <- as.data.frame(table(dat$species))
names(tx) <- c("species", "Frequency")

# Output path shared by both writes below. The pattern is anchored and the dot
# escaped so only a trailing ".csv" extension is replaced.
coverage_path <- file.path(
  outdir, sub("\\.csv$", "_taxonomic_coverage.csv", outfile)
)

# First-pass output: plain name counts. This file is overwritten at the end of
# this section by the name-resolution table.
write.csv(tx, coverage_path, row.names = FALSE)

# Test new version?
# table() produces a factor column; hand the resolver plain character names.
splist <- as.character(tx$species)
sources <- gnr_datasources()
sources

# Look up the numeric ids of the reference databases by their titles.
eol <- sources$id[sources$title == "EOL"]
gbif_backbone <- sources$id[sources$title == "GBIF Backbone Taxonomy"]
ipni <- sources$id[sources$title == "The International Plant Names Index"]
zk <- sources$id[sources$title == "ZooKeys"]
zb <- sources$id[sources$title == "ZooBank"]
source_ids <- c(eol, gbif_backbone, ipni, zk, zb)
source_ids
# A title that matches nothing yields a zero-length id that c() drops
# silently; surface that instead of quietly querying fewer databases.
if (any(lengths(list(eol, gbif_backbone, ipni, zk, zb)) == 0L)) {
  warning(
    "One or more requested data source titles were not found in ",
    "gnr_datasources()",
    call. = FALSE
  )
}
out <- gnr_resolve(splist, data_source_ids = source_ids, stripauthority = TRUE)
#out
out2 <- unique(out$results)

# One row per (submitted name, matched name) pair with its best score, plus two
# empty columns for the reviewer to fill in. Empty strings use standard SQL
# single quotes rather than SQLite's double-quoted-string fallback.
out3 <- sqldf(
  "select submitted_name,
          matched_name2 as match_via_database,
          max(score) as max_database_score,
          '' as change_note,
          '' as update_to
   from out2
   group by submitted_name, matched_name2"
)

# Blank out the match and its score where the database name equals the
# submitted name. The row index is computed once, before either column is
# modified, so both assignments hit the same rows.
same_name <- which(out3$submitted_name == out3$match_via_database)
out3[same_name, "max_database_score"] <- ""
out3[same_name, "match_via_database"] <- ""
out3
write.csv(out3, coverage_path, row.names = FALSE)
#### TODO:
# Open this CSV file and edit the final two columns (change_note, update_to):
# record each decision in change_note and the corrected name in update_to.
# Save the result as a new file with 2.csv at the end of its name.
####

# Post review merge fixed names and remove old names
dir(outdir)
# NOTE(review): the review step asks for the edited table to be saved under a
# name ending in 2.csv, but this reads the unedited file name -- confirm which
# file is intended here.
tx_file <- "test_eda_taxonomic_coverage.csv"
tx <- read.csv(file.path(outdir, tx_file), stringsAsFactors = FALSE)
nrow(tx)
head(tx)
# Rows the reviewer annotated. A column left entirely blank can be read back
# as NA, so NA is treated as "no note" instead of yielding all-NA rows.
note <- tx[["change_note"]]
tx[!is.na(note) & note != "", ]
str(dat)

# check that linking variable is identical
idx <- names(table(dat$species))
head(idx)
idy <- tx$submitted_name
head(idy)
# Names present on one side only. Logical negation is used because
# x[-which(cond)] returns an empty vector when no element satisfies cond,
# which would hide the case where no name matches at all.
idx[!idx %in% idy]
idy[!idy %in% idx]

# if all good then merge
# NOTE(review): the review table is grouped by submitted and matched name, so
# a species with several matched names has several rows and this merge would
# duplicate its data rows -- confirm the reviewed file has one row per species.
dat <- merge(dat, tx, by.x = "species", by.y = "submitted_name", all.x = TRUE)
str(dat)
dat
# reorder cols
# Prints the column names joined by ',' as an aid for writing namelist by hand.
paste0(names(dat), collapse = "','")
namelist <- c("update_to", "index")

# Keep only the reviewed name and the row index, then expose the reviewed name
# under the original column name.
# NOTE(review): rows whose update_to was left blank end up with a blank or NA
# species here -- confirm the reviewer fills update_to for every name.
dat <- dat[, namelist]
names(dat) <- gsub("update_to", "species", names(dat))
str(dat)
dat