R/institution_codes.R

Defines functions add_inst_triples institutions_to_mongo abbrev_institutions_extractor dwc_institutions_extractor institutionalizer extract_inst_identifiers set_institution_id grbio_uri_parser check_grbio

#' @export
get_or_set_inst_id = function (name, url, root_id, prefix, collection, grbio = grbio)
{
  mongo_res = check_mongo_inst(tpKey = url, collection = collection)
  if (nrow(mongo_res) > 0) {
    coolURI = mongo_res$coolURI
    code = mongo_res$code
    id = grbio_uri_parser(coolURI)
  }
  else {
    grbio_res = check_grbio(grbio = grbio, url = url, name = name)
    grbio_uri = grbio_res$Cool.URI
    code = grbio_res$Institutional.Code.Acronym
    if (length(grbio_uri) == 0) {
      grbio_uri = uuid::UUIDgenerate()
      grbio_uri = toupper(grbio_uri)
      id = identifier(grbio_uri, prefix = prefix)
      code = code
    }
    else {
      id = grbio_uri_parser(grbio_uri)
    }

    if(code == 0 || length(code)<1 || is.null(code)){
      code = NA
    }

    mongo_df = data.frame(tpKey = url, coolURI = strip_angle(id$uri),
                          name = name, code = code, parent = strip_angle(root_id$uri))
    collection$insert(mongo_df)
  }
  return(id)
}

#' @export
check_grbio = function(grbio, url, name){
  grbio_df  = read.csv(grbio, header = TRUE)
  #first look for the url if present
  if (is.na(url) == FALSE){
    #res = grbio_tibble %>% filter(url == url)
    res = grbio_df[which(tolower(grbio_df$URL) == tolower(url)),]
  }else if (is.na(url) || nrow(res)==0){
    res = grbio_df[which(tolower(grbio_df$Cool.URI) == tolower(url)),]
  }else{
    res = grbio_df[which(tolower(grbio_df$Name.of.Institution) == tolower(name)),]
  }
}


#' @export
grbio_uri_parser = function(grbio_uri){
  #extract grbio uri and turn into id
  #need to detect whether its lsid or a cool uri
  lsid_regex = c(biocol =  "http:\\/\\/biocol\\.org\\/urn:lsid:biocol\\.org:")
  grbio_cool_regex = c(grbioCool = "http:\\/\\/grbio\\.org\\/cool\\/")
  biocol_cool_regex = c(biocolCool = "http:\\/\\/biocol\\.org\\/cool\\/")
  gsrscicol_regex = c(gsrscicoll = "http:\\/\\/grscicoll\\.org\\/cool\\/")
  usfc_regex = c(usfc = "http:\\/\\/usfsc\\.grscicoll\\.org\\/cool\\/")
  default_regex = c(openbiodiv = "http:\\/\\/openbiodiv\\.net\\/")

  patterns = list(lsid_regex, grbio_cool_regex, biocol_cool_regex, gsrscicol_regex, usfc_regex, default_regex)

  grep_and_id = function(pattern, grbio_uri){
    if (grepl(pattern, grbio_uri)){
      grbio_uri = gsub(pattern, "", grbio_uri)
      prefix = stringi::stri_unescape_unicode(pattern)
      names(prefix) = names(pattern)
      id = identifier(grbio_uri, prefix)
      return(id)
    }
  }
 inst_id =  sapply(patterns, grep_and_id, grbio_uri=grbio_uri)
 inst_id=inst_id[lapply(inst_id,length)>0][[1]]
 return(inst_id)
}


#' @export
set_institution_id = function(uri){
  lsid_regex = "http:\\/\\/biocol\\.org\\/urn:lsid:biocol\\.org:"
  grbio_cool_regex = "http:\\/\\/grbio\\.org\\/cool\\/"
  biocol_cool_regex = "http:\\/\\/biocol\\.org\\/cool\\/"
  gsrscicol_regex = "http:\\/\\/grscicoll\\.org\\/cool\\/"
  usfc_regex = "http:\\/\\/usfsc\\.grscicoll\\.org\\/cool\\/"

  if (grepl(lsid_regex, uri)){
    grbio_uri = gsub(lsid_regex, "", uri)
    id = identifier(grbio_uri, c(biocol = "http://biocol.org/urn:lsid:biocol.org:"))
  }else if (grepl(grbio_cool_regex, uri))
  {
    grbio_uri = gsub(grbio_cool_regex, "", uri)
    id = identifier(grbio_uri, c(grbioCool = "http://grbio.org/cool/"))
  } else if (grepl(biocol_cool_regex, uri)){
    grbio_uri = gsub(biocol_cool_regex, "", uri)
    id = identifier(grbio_uri, c(biocolCool = "http://biocol.org/cool"))
  } else if (grepl(gsrscicol_regex, uri)){
    grbio_uri = gsub(gsrscicol_regex, "", uri)
    id = identifier(grbio_uri, c(gsrscicoll = "http://grscicoll.org/cool/"))
  } else if (grepl(usfc_regex, uri)){
    grbio_uri = gsub(usfc_regex, "", uri)
    id = identifier(grbio_uri, c(usfc = "http://usfsc.grscicoll.org/cool/"))
  }else{
    grbio_uri = gsub("http:\\/\\/openbiodiv\\.net\\/", "", uri)
    id = identifier(grbio_uri, c(openbiodiv = "http://openbiodiv.net/"))
  }

  return(id)
}

#' @export
extract_inst_identifiers = function(xml, root_id, prefix, collection, grbio){


  #inst_codes = c()
  inst_names = c()
  inst_urls = c()

  xpath_institutions = "//named-content[@xlink:type='simple'][@content-type='institution']"
  inst =xml2::xml_find_all(xml, xpath_institutions)
  #xpath_codes = "//named-content[@content-type='dwc:institutional_code']"
  #code =xml2::xml_find_all(xml, xpath_codes)

  #inst_lit = xml2::xml_text(inst)
  #code_lit = xml2::xml_text(code)
  #length(unique(inst_lit))
  #length(unique(code_lit))


  #xpath_brace = "//named-content[@xlink:type='simple'][@content-type='institution']/following-sibling::text()[1]"
  #braces = xml2::xml_find_all(xml, xpath_brace)
  #if (length(inst) == length(braces)){
  #  for (r in 1:length(braces)){
  #    if (gsub("\\s+","",braces[r]) == "(" | gsub("\\s+","",braces[r]) == ","){
  #      inst_code = xml2::xml_find_first(braces[r], "following-sibling::named-content[@content-type='dwc:institutional_code']")
  #      inst_codes = c(inst_codes, xml2::xml_text(inst_code))
  #      inst_names = c(inst_names, xml2::xml_text(inst[r]))
  if (length(inst)>0){
    for (r in 1:length(inst)){
      inst_names = c(inst_names, xml2::xml_text(inst[r]))
      inst_urls = c(inst_urls, xml2::xml_attr(inst[r], "href"))
    }


    name_url_df = data.frame(
      names = inst_names,
      urls = inst_urls,
      stringsAsFactors = FALSE
    )

    name_url_df = unique(name_url_df)

    #get or set institution id (from grbio) if name + code combination is the same

    for (n in 1:nrow(name_url_df)){
      inst_id = get_or_set_inst_id(name = name_url_df$names[n], url = name_url_df$urls[n], root_id = root_id, prefix = prefix, collection = collection, grbio = grbio )
    }
  }


}

#' @export
institution_serializer = function (tt, atoms, identifiers)
{
  rdfized_codes = c()
  instNames = c()
  instCodes = c()
  nid = identifiers$nid
  if (!(is.null(unlist(atoms$institution_name)))) {
    sapply(atoms$institution_name, function(n) {
     institution_n = escape_special_json(n$text_value)
      tt$add_triple(nid, inst_names, literal(institution_n))
      instNames = c(instNames, institution_n)
      res = check_mongo_instName(name = institution_n,
                                 collection = inst_collection)
      if (nrow(res) > 0) {
        for (i in 1:nrow(res)) {
          instNames = instNames[-which(tolower(instNames) !=
                                         tolower(res$name[i]))]

          instCodes = instCodes[-which(tolower(instCodes) !=
                                         tolower(res$code[i]))]

          inst_identifier = grbio_uri_parser(res$coolURI[i])
          tt$add_triple(nid, dwc_inst_id, inst_identifier)
          tt$add_triple(inst_identifier, rdf_type, Institution)
          if (names(inst_identifier$prefix) == "grbioCool" ||
              names(inst_identifier$prefix) == "biocol" ||
              names(inst_identifier$prefix) == "biocolCool" ||
              names(inst_identifier$prefix) == "gsrscicoll" ||
              names(inst_identifier$prefix) == "usfc") {
            tt$add_triple(inst_identifier, rdf_type,
                          GrbioInst)
          }
          tt$add_triple(inst_identifier, inst_names,
                        literal(res$name[i]))
          tt$add_triple(inst_identifier, dwc_inst_code,
                        literal(res$code[i]))
          rdfized_codes = c(rdfized_codes, res$code[i])

        }
      }
    })
  }
  if (!(is.null(unlist(atoms$institution_code)))) {

    sapply(atoms$institution_code, function(n) {
      institution_c = escape_special_json(n$text_value)
      tt$add_triple(nid, dwc_inst_code, literal(institution_c))

      instCodes = c(instCodes,institution_c)
      if (!(n$text_value %in% rdfized_codes)) {

        res = check_mongo_instCode(code = institution_c,
                                   collection = inst_collection)
        if (nrow(res) > 0) {
          for (i in 1:nrow(res)) {
            instNames = instNames[-which(tolower(instNames) !=
                                           tolower(res$name[i]))]
            instCodes = instCodes[-which(tolower(instCodes) !=
                                           tolower(res$code[i]))]
            inst_identifier = grbio_uri_parser(res$coolURI[i])
            tt$add_triple(nid, dwc_inst_id, inst_identifier)
            tt$add_triple(inst_identifier, rdf_type,
                          Institution)
            if (names(inst_identifier$prefix) == "grbioCool" ||
                names(inst_identifier$prefix) == "biocol" ||
                names(inst_identifier$prefix) == "biocolCool" ||
                names(inst_identifier$prefix) == "gsrscicoll" ||
                names(inst_identifier$prefix) == "usfc") {
              tt$add_triple(inst_identifier, rdf_type,
                            GrbioInst)
            }
            tt$add_triple(inst_identifier, dwc_inst_code,
                          literal(res$code[i]))
            tt$add_triple(inst_identifier, inst_names,
                          literal(res$name[i]))
            instNames = instNames[tolower(instNames) !=
                                    tolower(res$name[i])]
          }
        }
      }
    })
  }

  if (!(is.null(unlist(atoms$institution_uri)))) {
    sapply(atoms$institution_uri, function(n) {
      inst_identifier = grbio_uri_parser(n$text_value)
      tt$add_triple(nid, dwc_inst_id, inst_identifier)
      tt$add_triple(inst_identifier, rdf_type,
                    Institution)
      if (names(inst_identifier$prefix) == "grbioCool" ||
          names(inst_identifier$prefix) == "biocol" ||
          names(inst_identifier$prefix) == "biocolCool" ||
          names(inst_identifier$prefix) == "gsrscicoll" ||
          names(inst_identifier$prefix) == "usfc") {
        tt$add_triple(inst_identifier, rdf_type,
                      GrbioInst)
      }
    })
  }

  return(tt)
}





#' @export
institutionalizer = function(xml, root_id, collection){
  df1 = dwc_institutions_extractor(xml)
  df2 = abbrev_institutions_extractor(xml)
  institutions_to_mongo(df1, df2, root_id, collection)
}

#' @export
dwc_institutions_extractor = function(xml){
  #extract institution names
  nodes = xml2::xml_find_all(xml, "//named-content[@xlink:type='simple'][@content-type='institution']")
  if(length(nodes)>0){
    codes = c()
    names = c()
    for (d in 1:length(nodes)){
      siblings = xml2::xml_siblings(nodes[d])
        for (s in 1:length(siblings)){
            if(xml2::xml_attrs(siblings[s]) == "dwc:institutional_code" || grepl("dwc:institutional_code", toString(siblings[s])) || grepl("abbrev", toString(siblings[s])) || grepl("content-type = \"institution\"", toString(siblings[s]))){
              codes = c(codes, strip_trailing_whitespace(xml2::xml_text(siblings[s])))
              names = c(names, strip_trailing_whitespace(xml2::xml_text(nodes[d])))
              break
            }
          }

    }
    if(!(length(unique(codes))==length(unique(names)))){
      institutions_df = NULL
    }else{
      identif = sapply(codes, function(x){
        id = identifier(uuid::UUIDgenerate(), prefix = c(openbiodivInstitution = "http://openbiodiv.net/resource/institution/"))
        id$uri
      })
      institutions_df = data.frame(
        key = identif,
        code = codes,
        name = names,
        type = "institution"
      )
    }
  }else{
    institutions_df = NULL
  }

return(institutions_df)
}


#' @export
abbrev_institutions_extractor = function(xml){
  nodes = xml2::xml_find_all(xml, "//abbrev[@content-type = 'institution']")
  if(length(nodes)>0){
  attribs = sapply(nodes, function(x){
    xml2::xml_attrs(x)
  })

  if(!(is.null(nrow(attribs)))){
    names = strip_trailing_whitespace(attribs[2,])
    codes = sapply(nodes, function(x){
      strip_trailing_whitespace(xml2::xml_text(x))
    })

    codes = unique(codes)
    names = unique(names)
    identif = sapply(codes, function(x){
      id = identifier(uuid::UUIDgenerate(), prefix = c(openbiodivInstitution = "http://openbiodiv.net/resource/institution/"))
      id$uri
    })

    institutions_df = data.frame(
      key = identif,
      code =  codes,
      name = names,
      type = "institution"
    )
  }else{
    institutions_df = NULL
    }
  }else{
    institutions_df = NULL
  }
  return(institutions_df)
}


#' @export
institutions_to_mongo = function(df1, df2, root_id, collection){
  df = rbind(df1, df2)
  if(is.null(df) == FALSE){
    rownames(df) = NULL
    #parent saves the context (article id) in which an institution is used
    toRemove = c()
    for (n in 1:nrow(df)){
      res = check_code_name_combo(code = df[n,]$code, name = df[n,]$name, collection = collection)
      if (length(res)==0){
        df$parent = root_id$uri #only add root_id as parent if the code+name combo is not in mongo (not every time)
      }else{
        toRemove = c(toRemove,n)
      }
    }
    removal <- which(rownames(df) %in% toRemove)
    if (length(removal)>0){
      df = df[-removal,]
    }
    collection$insert(df)
  }

  return(TRUE)
}

#' @export
add_inst_triples = function(atoms, triples, identifiers)
{
  triples$prefix_list$add(c(openbiodivHasInst = "http://openbiodiv.net/property/hasInstitution"))
  triples$prefix_list$add(c(openbiodivHasInstName = "http://openbiodiv.net/property/hasInstitutionName"))
  triples$prefix_list$add(c(openbiodivHasInstCode = "http://openbiodiv.net/property/hasInstitutionCode"))

  atoms$inst_code = unique(atoms$inst_code)
  #the inst codes within the abstract, also check mongo
  sapply(atoms$inst_code, function(i){


    res = check_mongo_inst(code = i$text_value, parent = identifiers$root_id$id, collection = inst_collection)

    if (length(res) > 0){
      inst_id = strip_angle(res$key)
      inst_id = gsub("^(.*)resource\\/(.*)\\/", "", inst_id)
      inst_name = res$name
      institut = identifier(inst_id, prefix = c(openbiodivInstitution = "http://openbiodiv.net/resource/Institution/"))
      triples$add_triple(identifiers$nid, has_inst, institut)
      triples$add_triple(institut, rdf_type, Institution)
      triples$add_triple(institut, has_instName, literal(inst_name))
      triples$add_triple(institut, has_instCode, i)

    }
  })

  return(triples)
}
pensoft/ropenbio documentation built on Dec. 17, 2020, 5:50 a.m.