R/type_materialize.R

#' @export

type_materialize = function(filename, grbio_records, taxon_discovery){


  #load the taxon discovery terms and the grbio records
  new_taxons = scan(taxon_discovery, character(), quote = "", sep="\n")
  grbio_table = read_csv(grbio_records, col_names = TRUE)
  grbio_df = as.data.frame(grbio_table)


  xml <- read_xml(filename)
  #first load taxon_discovery.txt
  #then load grbios  institution code csv file

  #extract institution abbreviations from article (if present)
  abbrevs <- xml_find_all(xml, "//abbrev")
  if(length(abbrevs)>0){
    abr_frame = bind_rows(lapply(xml_attrs(abbrevs), function(x) data.frame(as.list(x), stringsAsFactors=FALSE)))
    abr_frame$inst_codes = xml_text(abbrevs)
    abr_frame = abr_frame[!duplicated(abr_frame$inst_codes), ]
  }else{
    abr_frame = NULL
  }


  #xmlparsing because we want to add nodes to the xml
  doc <- xmlParse(xml, isHTML = FALSE)
  nodeset <- getNodeSet(doc, "//tp:taxon-treatment")
  results <- xml_find_all(xml, "//tp:taxon-treatment")
  institution_codes = NULL
  if(!(length(results)==0))
  {
    #extract the status to determine whether the holotype is of a new species or from a collection checklist/redescription
    for (r in 1:length(nodeset))
    {
      #extract the taxonomic status and if it corresponds to taxon discovery, proceed
      status = xml_find_all(results[r], "tp:nomenclature/tp:taxon-status")
      status = gsub("<.*?>", "", status)
      if(length(status) > 0 && status %in% new_taxons ==TRUE){
        ns = xml_find_all(results[r], "tp:treatment-sec[@sec-type='type material'] | tp:treatment-sec[@sec-type='material'] | tp:treatment-sec[@sec-type='Holotype'] | tp:treatment-sec[@sec-type='Types'] | tp:treatment-sec[@sec-type='Typification']")
        if (length(ns)>0){
          string = toString(ns)
          #if the materials section contains the words "holotype" and "paratype", continue with extraction and processing
          if(grepl("holotype|Holotype|paratype|Paratype", string, perl=TRUE) == TRUE){
            holotype_material = newXMLNode("type-material", parent=nodeset[r])
            xmlValue(holotype_material) = "RANDOM SGSGSK"
            #before removing the xml nodes, extract the coordinates and resolve location (openstreetmap)
            dataframe = coordinator(string)
            #remove all xml tags and other cleaning
            type_string = clean_type_string(string)
            #add type materials string to xml
            xmlValue(holotype_material) <- type_string

            if (is.null(dataframe)==FALSE){
              occurrence_locations = newXMLNode("occurrence-locations", parent=holotype_material)
              for (i in 1:nrow(dataframe)){
                location = newXMLNode("location", parent=occurrence_locations)
                if (is.na(dataframe[i, 2]) == FALSE){
                  xmlValue(location) = dataframe[i,2]
                }
                latitude = newXMLNode("latitude", parent=location)
                xmlValue(latitude) = dataframe[i,3]
                longitude = newXMLNode("longitude", parent=location)
                xmlValue(longitude) = dataframe[i,4]
              }
            }

            print(type_string)

            #extract institutional codes
            dwc_inst_codes = extract_dwc_codes(string)
            abbrev_codes = extract_abbreviations(string)
            #create a dataframe with institution names from the dwc and abbreviations combined
            inst_dataframe = build_inst_dataframe(dwc_inst_codes, abbrev_codes, grbio_df, abr_frame)
            inst_dataframe = unique(inst_dataframe)
            if (nrow(inst_dataframe) >0){
              #add nodes to xml with institution codes and names
              institution_codes = newXMLNode("storing-institutions", parent=holotype_material)
              for (i in 1:nrow(inst_dataframe)){
                institution = newXMLNode("institution", parent=institution_codes)
                institution_code = newXMLNode("institution_code", parent=institution)
                xmlValue(institution_code) = toString(inst_dataframe[i,"abbreviation"])
                institution_name = newXMLNode("institution_name", parent=institution)
                xmlValue(institution_name) = toString(inst_dataframe[i,"institution_name"])
              }
            }
          }
        }
      }
    }
  }

  ss = saveXML(doc = doc)
  #saveXML(doc = doc, "~/new_test_file.xml")
  #ss=TRUE
  return(ss)
}
mariyad/openbiodiving documentation built on June 3, 2019, 2:18 p.m.