#' Annotate type-material sections of a taxonomic (TaxPub) XML article
#'
#' Locates every \code{tp:taxon-treatment} whose taxonomic status matches one
#' of the taxon-discovery terms and whose type-material section mentions a
#' holotype/paratype, then enriches the document in place with:
#' \itemize{
#'   \item a \code{<type-material>} node holding the cleaned section text,
#'   \item \code{<occurrence-locations>} children with coordinates resolved
#'         by \code{coordinator()} (OpenStreetMap-based, per the original
#'         comments),
#'   \item \code{<storing-institutions>} children with institution codes and
#'         names matched against GRBio records and in-article
#'         \code{<abbrev>} definitions.
#' }
#'
#' @param filename Path to the article XML file.
#' @param grbio_records Path to a CSV of GRBio institution records
#'   (read with \code{readr::read_csv}, header row expected).
#' @param taxon_discovery Path to a newline-separated file of
#'   taxonomic-status terms that denote a newly discovered taxon.
#' @return The annotated XML document serialized to a character string
#'   (via \code{XML::saveXML}).
#' @export
type_materialize <- function(filename, grbio_records, taxon_discovery) {
  # Status terms that mark a treatment as describing a new taxon.
  new_taxons <- scan(taxon_discovery, character(), quote = "", sep = "\n")
  grbio_table <- read_csv(grbio_records, col_names = TRUE)
  grbio_df <- as.data.frame(grbio_table)
  xml <- read_xml(filename)

  # Collect in-article institution abbreviations (<abbrev> elements), if any,
  # one row per abbreviation, de-duplicated on the abbreviation text.
  abbrevs <- xml_find_all(xml, "//abbrev")
  if (length(abbrevs) > 0) {
    abr_frame <- bind_rows(lapply(
      xml_attrs(abbrevs),
      function(x) data.frame(as.list(x), stringsAsFactors = FALSE)
    ))
    abr_frame$inst_codes <- xml_text(abbrevs)
    abr_frame <- abr_frame[!duplicated(abr_frame$inst_codes), ]
  } else {
    abr_frame <- NULL
  }

  # Re-parse with the XML package because new nodes must be inserted;
  # xml2 is used for read-only querying, XML for mutation. The two nodesets
  # below are positionally parallel views of the same treatments.
  doc <- xmlParse(xml, isHTML = FALSE)
  nodeset <- getNodeSet(doc, "//tp:taxon-treatment")
  results <- xml_find_all(xml, "//tp:taxon-treatment")
  institution_codes <- NULL

  if (length(results) > 0) {
    for (r in seq_along(nodeset)) {
      # Taxonomic status of this treatment; tags are stripped textually.
      status <- xml_find_all(results[r], "tp:nomenclature/tp:taxon-status")
      status <- gsub("<.*?>", "", status)
      # any() keeps the condition scalar even when several status nodes
      # exist (a length > 1 vector on the RHS of && errors on R >= 4.3).
      if (length(status) > 0 && any(status %in% new_taxons)) {
        ns <- xml_find_all(results[r], "tp:treatment-sec[@sec-type='type material'] | tp:treatment-sec[@sec-type='material'] | tp:treatment-sec[@sec-type='Holotype'] | tp:treatment-sec[@sec-type='Types'] | tp:treatment-sec[@sec-type='Typification']")
        if (length(ns) > 0) {
          string <- toString(ns)
          # Only process sections that explicitly mention holo-/paratypes
          # (exact capitalizations preserved from the original pattern).
          if (grepl("holotype|Holotype|paratype|Paratype", string, perl = TRUE)) {
            # [[r]] extracts the node itself; [r] would yield a length-1 list.
            holotype_material <- newXMLNode("type-material", parent = nodeset[[r]])
            # Resolve coordinates/locations BEFORE stripping the XML tags,
            # since coordinator() works on the raw section markup.
            dataframe <- coordinator(string)
            # Remove all xml tags and other cleaning.
            type_string <- clean_type_string(string)
            # Attach the cleaned type-materials text to the new node.
            xmlValue(holotype_material) <- type_string
            if (!is.null(dataframe)) {
              occurrence_locations <- newXMLNode("occurrence-locations",
                                                 parent = holotype_material)
              for (i in seq_len(nrow(dataframe))) {
                location <- newXMLNode("location", parent = occurrence_locations)
                # Column 2: resolved place name (may be NA); 3/4: lat/lon
                # — assumed layout of coordinator()'s result, TODO confirm.
                if (!is.na(dataframe[i, 2])) {
                  xmlValue(location) <- dataframe[i, 2]
                }
                latitude <- newXMLNode("latitude", parent = location)
                xmlValue(latitude) <- dataframe[i, 3]
                longitude <- newXMLNode("longitude", parent = location)
                xmlValue(longitude) <- dataframe[i, 4]
              }
            }
            # Progress/debug output retained from the original implementation.
            print(type_string)
            # Institution codes from DwC markup and free-text abbreviations,
            # matched against GRBio and the in-article <abbrev> table.
            dwc_inst_codes <- extract_dwc_codes(string)
            abbrev_codes <- extract_abbreviations(string)
            inst_dataframe <- build_inst_dataframe(dwc_inst_codes, abbrev_codes,
                                                   grbio_df, abr_frame)
            inst_dataframe <- unique(inst_dataframe)
            # Guard against a NULL result, whose nrow() would make the
            # comparison logical(0) and crash the if().
            if (!is.null(inst_dataframe) && nrow(inst_dataframe) > 0) {
              institution_codes <- newXMLNode("storing-institutions",
                                              parent = holotype_material)
              for (i in seq_len(nrow(inst_dataframe))) {
                institution <- newXMLNode("institution",
                                          parent = institution_codes)
                institution_code <- newXMLNode("institution_code",
                                               parent = institution)
                xmlValue(institution_code) <-
                  toString(inst_dataframe[i, "abbreviation"])
                institution_name <- newXMLNode("institution_name",
                                               parent = institution)
                xmlValue(institution_name) <-
                  toString(inst_dataframe[i, "institution_name"])
              }
            }
          }
        }
      }
    }
  }
  # Serialize the (possibly annotated) document back to a string.
  ss <- saveXML(doc = doc)
  return(ss)
}
# NOTE(review): the two lines below are scraped web-page footer text
# (rdrr.io "embed" instructions) accidentally pasted into this source file;
# commented out so the file parses as valid R.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.