# R/xml.R

#' @export
mySchema = XmlSchema$new(
  schema_name = "mySchema",
  xpath = "/",
  file_pattern = ".*\\.xml",
  extension = ".xml",
  prefix = NA,
  atoms = c(
    title = "/article/front/article-meta/title-group/article-title",
    date = NA,
    pub_year = "/article/front/article-meta/pub-date[@pub-type='epub']/year",
    pub_month = "/article/front/article-meta/pub-date[@pub-type='epub']/month",
    pub_day = "/article/front/article-meta/pub-date[@pub-type='epub']/day",
    doi = "/article/front/article-meta/article-id[@pub-id-type='doi']",
    zenodo = NA,
    zoobank = NA,
    publisher = "/article/front/journal-meta/publisher/publisher-name",
    journal = "/article/front/journal-meta/journal-title-group/journal-title",
    journal_abbrev = "/article/front/journal-meta/journal-title-group/abbrev-journal-title",
    issn = "/article/front/journal-meta/issn[@pub-type='ppub']",
    eIssn = "/article/front/journal-meta/issn[@pub-type='epub']",
    issue = "/article/front/article-meta/issue",
    volume = NA,
    starting_page = NA,
    ending_page = NA,
    keyword = "/article/front/article-meta/kwd-group/kwd",
    bold_id="/article/front/article-meta/bold-ids/bold-id",
    bin="/article/front/article-meta/bins/bin"
    #pensoft_pub = NA # becasue we skip only Pensoft pubs from Plazi
  ),

  atom_lang = c(
    title = NA,
    date = NA,
    pub_year = NA,
    pub_month = NA,
    pub_day = NA,
    doi = NA,
    zenodo = NA,
    publisher = NA,
    journal = "/article/front/journal-meta/journal-title-group/journal-title/@xml:lang",
    journal_abbrev = "/article/front/journal-meta/journal-title-group/abbrev-journal-title/@xml:lang",
    issn = NA,
    eIssn = NA,
    issue = NA,
    volume = NA,
    starting_page = NA,
    ending_page = NA,
    #pensoft_pub = NA
    keyword = NA,
    bold_id = NA,
    bin = NA
  ),

  atom_types = list(
    title = rdf4r::xsd_string,
    date = rdf4r::xsd_date,
    pub_year = rdf4r::xsd_integer,
    pub_month = rdf4r::xsd_integer,
    pub_day = rdf4r::xsd_integer,
    doi = rdf4r::xsd_string,
    zenodo = rdf4r::xsd_string,
    publisher = rdf4r::xsd_string,
    journal = rdf4r::xsd_string,
    journal_abbrev = rdf4r::xsd_string,
    issn = rdf4r::xsd_string,
    eIssn = rdf4r::xsd_string,
    issue = rdf4r::xsd_integer,
    volume = rdf4r::xsd_integer,
    starting_page = rdf4r::xsd_integer,
    ending_page = rdf4r::xsd_integer,
    #pensoft_pub = rdf4r::xsd_string
    keyword = rdf4r::xsd_string,
    bold_id = rdf4r::xsd_string,
    bin = rdf4r::xsd_string
  ),

  constructor = my_metadata,

  components = list(

    #Bold-id
    XmlSchema$new(
      schema_name = "myschema_bold_id",
      xpath = "/article/front/article-meta/bold-ids/bold-id",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content = rdf4r::xsd_string
      ),

      constructor = bold_id_constr,

      components = NULL
    ),


    #BIN
    XmlSchema$new(
      schema_name = "myschema_bin",
      xpath = "/article/front/article-meta/bins",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        bin = "./bin"
      ),
      atom_lang = c(
        bin = NA
      ),

      atom_types = list(
        bin =  rdf4r::xsd_string
      ),

      constructor = bin_constr,

      components = NULL
    ),

    # Keyword
    XmlSchema$new(
      schema_name = "mySchema_keyword_group",
      xpath = "/article/front/article-meta/kwd-group",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        keyword = "./kwd"
      ),

      atom_lang = c(
        keyword = NA
      ),

      atom_types = list(
        keyword =  rdf4r::xsd_string
      ),

      constructor = keyword_group,

      components = NULL
    ),

    # Abstract
    XmlSchema$new(
      schema_name = "taxpub_abstract",
      xpath = "/article/front/article-meta/abstract",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = ".",
        trans_abstract = "../trans-abstract"
      ),

      atom_lang = c(
        text_content = NA,
        trans_abstract = "../trans-abstract/@xml:lang"
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string,
        trans_abstract = rdf4r::xsd_string
      ),

      constructor = abstract,

      components = NULL
    ),

    # Title
    XmlSchema$new(
      schema_name = "taxpub_title",
      xpath = "/article/front/article-meta/title-group/article-title",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),

      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = title,

      components = NULL
    ),

    # Author
    XmlSchema$new(
      schema_name = "taxpub_author",
      xpath = "/article/front/article-meta/contrib-group/contrib",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        full_name = NA,
        surname = "./name/surname",
        given_names = "./name/given-names",
        email = "./email",
        aff_id = "./xref/@rid",
        all_affiliations = "/article/front/article-meta/aff/addr-line"
        # role = "./mods:role/mods:roleTerm"
      ),

      atom_lang = c(
        full_name = NA,
        surname = NA,
        given_names = NA,
        email = NA,
        aff_id = NA,
        all_affiliations = NA
        #role = NA
      ),

      atom_types = list(
        full_name = rdf4r::xsd_string,
        surname = rdf4r::xsd_string,
        given_names = rdf4r::xsd_string,
        email = rdf4r::xsd_string,
        aff_id = rdf4r::xsd_integer,
        all_affiliations = rdf4r::xsd_string
      ),

      constructor = author,

      components = NULL
    ),
    # Introduction
    XmlSchema$new(
      schema_name = "taxpub_introduction_section",
      xpath = "/article/body/sec[@sec-type='Introduction']",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),

      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = introduction_section
    ),
    # DISCUSSION
    XmlSchema$new(
      schema_name = "discussion_section",
      xpath = "//sec[@sec-type='Discussion']", #rel path from treatment
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),

      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = discussion
    ),
    # Treatment
    XmlSchema$new(
      schema_name = "taxpub_treatment",
      xpath = "/article/body/sec/tp:taxon-treatment",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),

      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = treatment,

      components = list(
        # Nomenclature
        XmlSchema$new(
          schema_name = "taxpub_nomenclature_section",
          xpath = "./tp:nomenclature", #rel path from treatment
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "."
          ),

          atom_lang = c(
            text_content = NA
          ),

          atom_types = list(
            text_content =  rdf4r::xsd_string
          ),
          # nomenclature citations
          components = list(
            XmlSchema$new(
              schema_name = "taxpub_nomenclature_citations",
              xpath = "./tp:nomenclature-citation-list", #rel path from treatment
              file_pattern = ".*\\.xml",
              extension = ".xml",
              prefix = NA,
              atoms = c(
                text_content = "."
              ),

              atom_lang = c(
                text_content = NA
              ),

              atom_types = list(
                text_content =  rdf4r::xsd_string
              ),

              components = NULL,

              constructor = nomenclature_citations
            )
          ),

          constructor = nomenclature
        ),
        # Materials Examined
        XmlSchema$new(
          schema_name = "taxpub_materials_examined",
          xpath = "./tp:treatment-sec[@sec-type='materials']", #rel path from treatment
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "."
          ),

          atom_lang = c(
            text_content = NA
          ),

          atom_types = list(
            text_content =  rdf4r::xsd_string
          ),

          constructor = materials_examined
        ),

        #Diagnosis
        XmlSchema$new(
          schema_name = "taxpub_diagnosis_section",
          xpath = "./tp:treatment-sec[@sec-type='Diagnosis']", #rel path from treatment
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "."
          ),

          atom_lang = c(
            text_content = NA
          ),

          atom_types = list(
            text_content =  rdf4r::xsd_string
          ),

          constructor = diagnosis
        ),



        # distribution
        XmlSchema$new(
          schema_name = "taxpub_distribution_section",
          xpath = "./tp:treatment-sec[@sec-type='Distribution']", #rel path from treatment
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "."
          ),

          atom_lang = c(
            text_content = NA
          ),

          atom_types = list(
            text_content =  rdf4r::xsd_string
          ),

          constructor = distribution
        )

      )
    ),

    # Taxonomic Key
    XmlSchema$new(
      schema_name = "taxpub_taxonomic_key",
      xpath = "/sec[@sec-type='key']",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),

      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content = rdf4r::xsd_string
      ),

      constructor = taxonomic_key,

      components = NULL
    ),

    # Figure
    XmlSchema$new(
      schema_name = "taxpub_figure",
      xpath = "//fig|//fig-group",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = ".",
        caption = "./caption"
      ),

      atom_lang = c(
        text_content = NA,
        caption = NA
      ),

      atom_types = list(
        text_content = rdf4r::xsd_string,
        caption = rdf4r::xsd_string
      ),

      constructor = figure,

      components = NULL
    ),

    # Taxonomic Name Usage
    XmlSchema$new(
      schema_name = "taxpub_taxonomic_name_usage",
      xpath = "//tp:taxon-name",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        date = NA,
        pub_year = "/article/front/article-meta/pub-date/year",
        pub_month = "/article/front/article-meta/pub-date/month",
        pub_day = "/article/front/article-meta/pub-date/day",
        kingdom = "./tp:taxon-name-part[@taxon-name-part-type='kingdom' or @taxon-name-part-type='Kingdom' or @taxon-name-part-type='divisio' or @taxon-name-part-type='Divisio' or @taxon-name-part-type='division' or @taxon-name-part-type='Division']",
        phylum = "./tp:taxon-name-part[@taxon-name-part-type='phylum' or @taxon-name-part-type='Phylum' or @taxon-name-part-type='regnum' or @taxon-name-part-type='Regnum']",
        class = "./tp:taxon-name-part[@taxon-name-part-type='class' or @taxon-name-part-type='Class' or @taxon-name-part-type='classis' or @taxon-name-part-type='Classis']",
        order = "./tp:taxon-name-part[@taxon-name-part-type='order' or @taxon-name-part-type='Order' or @taxon-name-part-type='ordo' or @taxon-name-part-type='Ordo']",
        family = "./tp:taxon-name-part[@taxon-name-part-type='family' or @taxon-name-part-type='Family' or @taxon-name-part-type='familia' or @taxon-name-part-type='Familia' or @taxon-name-part-type='famil' or @taxon-name-part-type='Famil']",
        subfamily = "./tp:taxon-name-part[@taxon-name-part-type='subfamily' or @taxon-name-part-type='Subfamily' or @taxon-name-part-type='subfamilia' or @taxon-name-part-type='Subfamilia' or @taxon-name-part-type='subfamil' or @taxon-name-part-type='Subfamil' or @taxon-name-part-type='tribe' or @taxon-name-part-type='Tribe' or @taxon-name-part-type='tribus' or @taxon-name-part-type='Tribus' or @taxon-name-part-type='subtribe' or @taxon-name-part-type='Subtribe' or @taxon-name-part-type='subtribus' or @taxon-name-part-type='Subtribus']",
        genus = "./tp:taxon-name-part[@taxon-name-part-type='genus' or @taxon-name-part-type='Genus' or @taxon-name-part-type='genera' or @taxon-name-part-type='Genera']",
        regularzied_genus = "./tp:taxon-name-part[@taxon-name-part-type='genus' or @taxon-name-part-type='Genus' or @taxon-name-part-type='genera' or @taxon-name-part-type='Genera']/@reg",
        subgenus = "./tp:taxon-name-part[@taxon-name-part-type='subgenus' or @taxon-name-part-type='Subgenus' or @taxon-name-part-type='subgenera' or @taxon-name-part-type='Subgenera' or @taxon-name-part-type='section' or @taxon-name-part-type='Section' or @taxon-name-part-type='sectio' or @taxon-name-part-type='Sectio']",
        species = "./tp:taxon-name-part[@taxon-name-part-type='Species' or @taxon-name-part-type='species']",
        subspecies = "/tp:taxon-name-part[@taxon-name-part-type='Subspecies' or @taxon-name-part-type='subspecies' or @taxon-name-part-type='Variety' or @taxon-name-part-type='variety' or @taxon-name-part-type='varietas' or @taxon-name-part-type='Varietas' or @taxon-name-part-type='variation' or @taxon-name-part-type='Variation' or @taxon-name-part-type='subvariety' or @taxon-name-part-type='Subvariety' or @taxon-name-part-type='subvarietas' or @taxon-name-part-type='Subvarietas' or @taxon-name-part-type='subvariation' or @taxon-name-part-type='Subvariation' or @taxon-name-part-type='Form' or @taxon-name-part-type='form' or @taxon-name-part-type='forma' or @taxon-name-part-type='Forma' or @taxon-name-part-type='aberration' or @taxon-name-part-type='Aberration' or @taxon-name-part-type='race' or @taxon-name-part-type='Race' or @taxon-name-part-type='Subform' or @taxon-name-part-type='subform' or @taxon-name-part-type='subforma' or @taxon-name-part-type='Subforma' or @taxon-name-part-type='subaberation' or @taxon-name-part-type='Subaberation' or @taxon-name-part-type='subrace' or @taxon-name-part-type='Subrace' ]",
        verbatim = ".",
        verbatim_rank = "./tp:taxon-name-part[last()]/@taxon-name-part-type",
        verbatim_status = "following-sibling::tp:taxon-status",
        status = NA,
        authorship = "following-sibling::tp:taxon-authority | ./tp:taxon-name-part[@taxon-name-part-type='authority']",
        external_taxonomic_name_id = "./object-id",
        secundum_literal = NA
      ),

      atom_lang = c(
        date = NA,
        pub_year = NA,
        pub_month = NA,
        pub_day = NA,
        kingdom = NA,
        phylum = NA,
        class = NA,
        order = NA,
        family = NA,
        subfamily = NA,
        genus = NA,
        regularzied_genus = NA,
        subgenus = NA,
        species = NA,  ## This is an error in TaxonX, not DwC!
        subspecies = NA,
        verbatim_rank = NA,
        verbatim_status = NA,
        status = NA,
        authorship = NA,
        external_taxonomic_name_id = NA,
        secundum_literal = NA
      ),

      atom_types = list(
        date = rdf4r::xsd_date,
        pub_year = rdf4r::xsd_integer,
        pub_month = rdf4r::xsd_integer,
        pub_day = rdf4r::xsd_integer,
        kingdom = rdf4r::xsd_string,
        class = rdf4r::xsd_string,
        order = rdf4r::xsd_string,
        family = rdf4r::xsd_string,
        subfamily = rdf4r::xsd_string,
        genus = rdf4r::xsd_string,
        regularzied_genus = rdf4r::xsd_string,
        subgenus = rdf4r::xsd_string,
        species = rdf4r::xsd_string,
        subspecies = rdf4r::xsd_string,
        verbatim_rank = rdf4r::xsd_string,
        taxonomic_rank = rdf4r::xsd_string,
        taxonomic_status = rdf4r::xsd_string,
        status = rdf4r::xsd_string,
        authorship = rdf4r::xsd_string,
        external_taxonomic_name_id = rdf4r::xsd_string,
        secundum_literal = rdf4r::xsd_string
      ),

      constructor = taxonomic_name_usage,

      components = NULL
    ),

    # Institution Code Usage
    XmlSchema$new(
      schema_name = "institution_code_usage",
      xpath = "//named-content[@content-type='dwc:institutional_code']",
      file_pattern = "",
      extension = "",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),

      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content = rdf4r::xsd_string
      ),

      constructor = institution_code_usage,

      components = NULL
    )
  )
)





#' @export
my_xml2rdf <- function (filename, xml_schema = mySchema, access_options, serialization_dir,reprocess = FALSE, dry = TRUE)
{

  tryCatch({
    xml = xml2::read_xml(filename)
    triples = ResourceDescriptionFramework$new()
    root_id = identifier(root_id(xml, access_options, xml_schema),
                         access_options$prefix["openbiodiv"])
    triples$set_context(root_id)
    triples = node_extractor(node = xml, xml_schema = mySchema,
                             reprocess = reprocess, triples = triples, access_options = access_options,
                             dry = dry, filename = filename)

    atoms = find_literals(xml, mySchema)



    bold_identifiers = list(nid = identifier(get_or_set_obkms_id(xml),
                                        access_options$prefix["openbiodiv"]), pid = identifier(parent_id(xml),
                                                                                               access_options$prefix["openbiodiv"]), root_id = identifier(root_id(xml,
                                                                                                                                                                 access_options), access_options$prefix["openbiodiv"]))
    #print(bold_identifiers)
    #checks if both bin/clusteruri and boldid/recordid/etc are present or just one of them
    #for each case sends a curl request to bold using bold's bold_seqspec with a list of all ids
    if ((length(atoms$bin) > 0) && (length(atoms$bold_id) > 0))
    {
      bin_text <- lapply(atoms$bin, function(x) x[c('text_value')])
      bin_list <- as.character(unlist(bin_text))
      #BOLD CURL request
      new_xml <- bold_seqspec(taxon=NULL, bin = bin_list, format = "xml")
      bold_text <- lapply(atoms$bold_id, function(x) x[c('text_value')])
      bold_list <- as.character(unlist(bold_text))
      another_xml <- bold_seqspec(taxon=NULL, ids = bold_list, format = "xml")
      #add the results from bold into 1 xml
      doc1children <- xml_children(new_xml)
      for (child in doc1children)
      {
        xml_add_child(another_xml, child)
      }

      bold_file = paste0(configuration$bold_dir,"/",
                         paste0(strip_filename_extension(last_token(filename,
                                                                    split = "/")), "_bold.xml"))
      xml2::write_xml(another_xml, bold_file)






      bold_triples = ResourceDescriptionFramework$new()
      bold_triples$set_context(root_id)


      bold_triples = new_node_extractor(node = another_xml, xml_schema = newSchema,
                                        reprocess = reprocess, triples = bold_triples, access_options = access_options,
                                        dry = dry, filename = filename)
      triples$add_triples(bold_triples)
    }else
    {
      if (length(atoms$bold_id) > 0)
      {
          bold_id_text <- lapply(atoms$bold_id, function(x) x[c('text_value')])
          bold_id_list <- as.character(unlist(bold_id_text))
          bold_xml <- bold_seqspec(taxon=NULL, ids = bold_id_list, format = "xml")

          #match_records modifies bold_xml by setting the same obkms_ids

          bold_xml = match_records(xml, bold_xml, access_options)

          #save _bold file
          bold_file = paste0(configuration$bold_dir,"/",
                             paste0(strip_filename_extension(last_token(filename,
                                                                        split = "/")), "_bold.xml"))
          xml2::write_xml(bold_xml, bold_file)

          bold_triples = ResourceDescriptionFramework$new()
          bold_triples$set_context(root_id)

          bold_triples = new_node_extractor(node = bold_xml, xml_schema = newSchema,
                                            reprocess = reprocess, triples = bold_triples, access_options = access_options,
                                            dry = dry, filename = filename)
          triples$add_triples(bold_triples)

      }
      if (length(atoms$bin) > 0)
      {
        bin_text <- lapply(atoms$bin, function(x) x[c('text_value')])
        bin_list <- as.character(unlist(bin_text))
        bold_xml <- bold_seqspec(taxon=NULL, bin = bin_list, format = "xml")
        #optional: save xml from bold
        bold_file = paste0(configuration$bold_dir,"/",
                           paste0(strip_filename_extension(last_token(filename,
                                                                      split = "/")), "_bold.xml"))
        xml2::write_xml(bold_xml, bold_file)


        bold_triples = ResourceDescriptionFramework$new()

        bold_triples$set_context(root_id)
        bold_triples = new_node_extractor(node = bold_xml, xml_schema = newSchema,
                                          reprocess = reprocess, triples = bold_triples, access_options = access_options,
                                          dry = dry, filename = filename)
        triples$add_triples(bold_triples)

      }
    }


    xml2::write_xml(xml, filename)
    serialization = triples$serialize()
    cat(serialization, file = paste0(serialization_dir, "/",
                                     paste0(strip_filename_extension(last_token(filename,
                                                                                split = "/")), ".ttl")))
    return(TRUE)
  }, error = function(e) {
    warning(e)
    return(FALSE)
  })

}

#' @export
process_bold <- function(file)
{
  suppressWarnings({
    xml <- read_xml(file)
    #xpath to find all bold systems links within article!
    results <- xml_find_all(xml, "//*[starts-with(@xlink:href, 'http://www.boldsystems.org/')] | //*[starts-with(@xlink:href, 'http://boldsystems.org/')]")
    if(!(length(results)==0))
    {
      doc <- xmlParse(xml, isHTML = FALSE)
      nodeset <- getNodeSet(doc, "//article-meta")
      #creates parent nodes for the bold-ids or bins section
      parent_id <- newXMLNode("bold-ids", parent = nodeset)
      parent_bin <- newXMLNode("bins", parent = nodeset)
      empty_bin <- TRUE
      empty_bold_id <- TRUE
      for (r in results)
      {
        node_string <- sub('.*http', '', r)
        #handles cases where the node value is either a link or just an id
        #if we have 2 closing tags => the value is just an id
        #1 closing tag means the value contained a link and sub('.*http','', r) matched and replaced the second 'http' (within the value)
        if (stringi::stri_count_regex(node_string, '>') > 1){
          string <- sub('\">?.*', '', node_string)
        }
        else {
          string <- sub('<.*', '', node_string)
        }

        #adds the http part to url_parse the string => break it down into parts and only take the query part
        string <- paste0("http", string)
        result <- url_parse(string)
        query <- result$query

        #if the query part contains "bin" or "clusteruri" the uri contains a bin
        #else - bold-id (which can be anything, like record-id or process-id)
        if((sub('=.*', '', query) == "bin") || (sub('=.*', '', query) == "clusteruri" ))
        {
          empty_bin = FALSE
          tag = "bin"
          new_node <- newXMLNode(tag, parent = parent_bin)

        } else
        {
          empty_bold_id = FALSE
          tag = "bold-id"
          new_node <- newXMLNode(tag, parent = parent_id)
        }
        value <- sub('.*=', '', query)
        xmlValue(new_node) <- value
      }
      #remove the empty nodesets
      if (empty_bin == TRUE)
      {
        removeNodes(parent_bin)
      }
      if (empty_bold_id == TRUE)
      {
        removeNodes(parent_id)
      }

      saving_file_name <- paste0(dirname(file), "/modified/",basename(file), "_mod.xml" )
      saveXML(doc, saving_file_name)
    }
    else{
      saving_file_name <- NULL
    }
    return(saving_file_name)
  })
}

#' @export
#matches bold records from two xmls by adding obkms_id attributes to nodes
match_records <- function(xml, bold_xml, access_options){
  results_bold_xml <- xml_find_all(bold_xml, "//record")
  results_pensoft_xml <- xml_find_all(xml, "//bold-id")
  i = 1
  for (r in results_pensoft_xml){
    bold_identifiers[[i]] = list(
      nid = identifier(get_or_set_obkms_id(r), access_options$prefix["openbiodiv"]),
      pid = identifier(parent_id(r), access_options$prefix["openbiodiv"]),
      root_id = identifier(root_id(r, access_options), access_options$prefix["openbiodiv"])
    )
    i = i+1
  }
  count = 1
  for (rr in results_bold_xml){
    xml_set_attr(rr, "obkms_id", bold_identifiers[[count]]$nid)
    count = count+1
  }
  #returns the modified xml
  return(bold_xml)
}

#' @export
newSchema = XmlSchema$new(
  schema_name = "newSchema",
  xpath = "//record",
  file_pattern = ".*\\.xml",
  extension = ".xml",
  prefix = NA,
  atoms = c(
    record = "."
    #random = "/bold_records/record/specimen_desc/reproduction"
  ),

  atom_lang = c(
    record = NA
    #random = NA
  ),

  atom_types = list(
    record = rdf4r::xsd_string
    #random = rdf4r::xsd_string
  ),

  constructor = new_metadata,

  components = list(

    #record_id
    XmlSchema$new(
      schema_name = "newSchema_record_id",
      xpath = "./record_id",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = record_id,

      components = NULL
    ),

    #process id
    XmlSchema$new(
      schema_name = "newSchema_process_id",
      xpath = "/bold_records/record/processid",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = process_id,

      components = NULL
    ),
    #bin
    XmlSchema$new(
      schema_name = "newSchema_bin",
      xpath = "/bold_records/record/bin_uri",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = bin_constr,

      components = NULL
    ),
    #sample_id
    XmlSchema$new(
      schema_name = "newSchema_sample_id",
      xpath = "/bold_records/record/specimen_identifiers/sampleid",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),
      constructor = sample_id,

      components = NULL
    ),
    #sequence_id
    XmlSchema$new(
      schema_name = "newSchema_sequence_id",
      xpath = "/bold_records/record/sequences/sequence/sequenceID",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = sequence_id,

      components = NULL
    ),
    #institution
    XmlSchema$new(
      schema_name = "newSchema_institution",
      xpath = "/bold_records/record/specimen_identifiers/institution_storing",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = institution,

      components = NULL
    ),
    #TAXONOMY
    XmlSchema$new(
      schema_name = "newSchema_taxonomy",
      xpath = "/bold_records/record/taxonomy",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = taxonomy,
      #PHYLUM
      components = list(
        XmlSchema$new(
          schema_name = "newSchema_phylum",
          xpath = "./phylum",
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "./taxon/name",
            taxonid = "./taxon/taxID"
          ),
          atom_lang = c(
            text_content = NA,
            taxonid = NA
          ),

          atom_types = list(
            text_content = rdf4r::xsd_string,
            taxonid = rdf4r::xsd_integer
          ),

          constructor = phylum,

          components = NULL
        ),
        #CLASS
        XmlSchema$new(
          schema_name = "newSchema_class",
          xpath = "./class",
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "./taxon/name",
            taxonid = "./taxon/taxID"
          ),
          atom_lang = c(
            text_content = NA,
            taxonid = NA
          ),

          atom_types = list(
            text_content = rdf4r::xsd_string,
            taxonid = rdf4r::xsd_integer
          ),

          constructor = clas,

          components = NULL
        ),
        #ORDER
        XmlSchema$new(
          schema_name = "newSchema_order",
          xpath = "./order",
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "./taxon/name",
            taxonid = "./taxon/taxID"
          ),
          atom_lang = c(
            text_content = NA,
            taxonid = NA
          ),

          atom_types = list(
            text_content = rdf4r::xsd_string,
            taxonid = rdf4r::xsd_integer
          ),

          constructor = order,

          components = NULL
        ),
        #FAMILY
        XmlSchema$new(
          schema_name = "newSchema_family",
          xpath = "./family",
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "./taxon/name",
            taxonid = "./taxon/taxID"
          ),
          atom_lang = c(
            text_content = NA,
            taxonid = NA

          ),

          atom_types = list(
            text_content = rdf4r::xsd_string,
            taxonid = rdf4r::xsd_integer
          ),

          constructor = family,

          components = NULL
        ),
        #GENUS
        XmlSchema$new(
          schema_name = "newSchema_genus",
          xpath = "./genus",
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "./taxon/name",
            taxonid = "./taxon/taxID"
          ),
          atom_lang = c(
            text_content = NA,
            taxonid = NA
          ),

          atom_types = list(
            text_content = rdf4r::xsd_string,
            taxonid = rdf4r::xsd_integer
          ),

          constructor = genus,

          components = NULL
        ),
        #species
        XmlSchema$new(
          schema_name = "newSchema_species",
          xpath = "./species",
          file_pattern = ".*\\.xml",
          extension = ".xml",
          prefix = NA,
          atoms = c(
            text_content = "./taxon/name",
            taxonid = "./taxon/taxID"
          ),
          atom_lang = c(
            text_content = NA,
            taxonid = NA
          ),

          atom_types = list(
            text_content = rdf4r::xsd_string,
            taxonid = rdf4r::xsd_integer
          ),

          constructor = species,

          components = NULL
        )
      )),

    #nucleotides
    XmlSchema$new(
      schema_name = "newSchema_nucleotides",
      xpath = "/bold_records/record/sequences/sequence/nucleotides",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = nucleotides,

      components = NULL
    )
    ,
    #catalog number
    XmlSchema$new(
      schema_name = "newSchema_catalog_num",
      xpath = "/bold_records/record/specimen_identifiers/catalognum",
      file_pattern = ".*\\.xml",
      extension = ".xml",
      prefix = NA,
      atoms = c(
        text_content = "."
      ),
      atom_lang = c(
        text_content = NA
      ),

      atom_types = list(
        text_content =  rdf4r::xsd_string
      ),

      constructor = catalog,

      components = NULL
    )

  ))
# mariyad/openbiodiving documentation built on June 3, 2019, 2:18 p.m.