#' @export
clean_type_string = function(s){
clean_string = gsub("<title>", "\n\\1", s)
clean_string = gsub("<///title>", "\n\\1", clean_string)
#remove xml nodes
clean_string = gsub("<.*?>", "", clean_string)
#re,remove double whitespaces
clean_string = gsub("\\␣+", " ", clean_string)
#remove starting whitespace
clean_string = gsub("^\\␣", "", clean_string)
#remove "type material" if present
clean_string = str_replace_all(clean_string, "^[Type material]*. ", "")
clean_string = str_replace_all(clean_string, "^[Material examined]*. ", "")
clean_string = str_replace_all(clean_string, "^[Types]*. ", "")
clean_string = str_replace_all(clean_string, "^[Typification]*. ", "")
return(clean_string)
}
#' @export
holotype_extractor = function(extracted_holotype = FALSE, holotype_string){
first_word = word(holotype_string, 1)
second_word = word(holotype_string, 2)
third_word = word(holotype_string, 3)
fourth_word = word(holotype_string, 4)
first_two = paste(first_word, second_word, sep = " ")
first_four = paste(first_word, second_word,third_word,fourth_word, sep= " ")
#try to extract only sentence containing holotype
#if it starts with holotype
if (grepl("Holotype", first_word) == TRUE)
{
if (grepl("Holotype [(male)|(female)]* and paratype",first_four) == TRUE)
{
holotype = holotype_string
extracted_holotype = TRUE
}
else{
holotype = str_extract(holotype_string, ".*?Holotype(?:(?!Paratype|paratype).)*")
extracted_holotype = TRUE
}
}
#if it starts with number + gender (1 female specimen..)
if ((grepl("[0-9] [female]*", first_two) == TRUE) | (grepl("[0-9] [male]*", first_two) == TRUE))
{
holotype = str_extract(holotype_string, "([0-9] male|female)(.*?)(?=[0-9] male|female)")
extracted_holotype = TRUE
}
#if it's impossible to extract the single sentence (sentence part) containing holotype, save the whole text
if (extracted_holotype == FALSE)
{
holotype = holotype_string
extracted_holotype = TRUE
}
return(holotype)
}
#' @export
code_deduplicator = function(x){
deduplicated = unique(x)
if(length(deduplicated)>1){
x = NULL
}
else{
x = deduplicated
}
return(x)
}
#' @export
#check whether an inst code is present in grbio csv
check_grbio = function(l, grbio){
result = NULL
if(grepl("[aA-zZ\\-()&/]*", l, perl=TRUE) == TRUE){
l = gsub("^[\\-()&/]*","", l, perl=TRUE)
if (l %in% grbio$`Institutional Code/Acronym`){
institution_codes = l
institution_name = subset(grbio, `Institutional Code/Acronym` == institution_codes)$`Name of Institution`
result = c(institution_codes, institution_name)
}
}
return(result)
}
#' @export
#if the holotype contains an institutional code which is not in the csv from grbio check the article abbreviations
check_abbreviations = function(l, abbreviations){
result = NULL
if(grepl("[aA-zZ\\-()&/]*", l, perl=TRUE) == TRUE){
starting_letters = str_extract(l, "^([aA-zZ]+)")
if (l %in% abbreviations$inst_codes)
{
institution_codes = l
if(toupper(institution_codes) == institution_codes){
institution_name = abbreviations[abbreviations$inst_codes==institution_codes, "title"]
result = c(institution_codes, institution_name)
}
}
if (starting_letters %in% abbreviations$inst_codes){
institution_codes = starting_letters
if(toupper(institution_codes) == institution_codes){
institution_name = abbreviations[abbreviations$inst_codes==institution_codes, "title"]
result = c(institution_codes, institution_name)
}
}
}
if(is.null(result)==FALSE){
return(result)
}
}
#' @export
character_zero_to_null= function(x){
if(identical(x, "character(0)") || identical(x, character(0)) || identical(unlist(x), character(0))){
x = NULL
}
return(x)
}
#' @export
build_inst_dataframe = function(dwc_codes, abbr_codes, grbio_df, abbreviations){
first_df = as.data.frame(NULL)
second_df = as.data.frame(NULL)
if (length(dwc_codes) > 0){
d = sapply(dwc_codes, check_grbio, grbio = grbio_df)
first_df = as.data.frame(d)
first_df = t(first_df)
rownames(first_df) = NULL
}
if (length(abbr_codes) > 0){
a = sapply(abbr_codes, check_abbreviations, abbreviations = abr_frame)
a[sapply(a, is.null)] <- NULL
second_df = as.data.frame(a)
second_df = t(second_df)
rownames(second_df) = NULL
}
#colnames(first_df) = c("abbreviation", "institution_name")
#colnames(second_df) = c("abbreviation", "institution_name")
if (nrow(first_df)>0 && nrow(second_df)>0){
inst_df = rbind(first_df, second_df)
colnames(inst_df) = c("abbreviation", "institution_name")
rownames(inst_df) = NULL
}
else
{
if (nrow(first_df)>0){
inst_df = first_df
colnames(inst_df) = c("abbreviation", "institution_name")
rownames(inst_df) = NULL
}
if (nrow(second_df)>0){
inst_df = second_df
colnames(inst_df) = c("abbreviation", "institution_name")
rownames(inst_df) = NULL
}else{
inst_df = as.data.frame(NULL)
}
}
return(inst_df)
}
#' @export
extract_dwc_codes = function(s){
dwc_inst_codes = str_extract_all(s, '(?<=<named-content content-type="dwc:institutional_code">).*?(?=</named-content>)')
dwc_inst_codes = character_zero_to_null(dwc_inst_codes)
dwc_inst_codes = unique(unlist(dwc_inst_codes))
return(dwc_inst_codes)
}
#' @export
extract_abbreviations = function(s){
abbrev_codes = str_extract_all(s,'<abbrev.*?</abbrev>')
abbrev_codes = lapply(abbrev_codes, function(x) gsub("<.*?>", "", x))
abbrev_codes = character_zero_to_null(abbrev_codes)
abbrev_codes = unique(unlist(abbrev_codes))
return(abbrev_codes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.