R/type_material_functions.R

#' @export
clean_type_string = function(s){
  clean_string = gsub("<title>", "\n\\1", s)
  clean_string = gsub("<///title>", "\n\\1", clean_string)


  #remove xml nodes
  clean_string = gsub("<.*?>", "", clean_string)
  #re,remove double whitespaces
  clean_string =  gsub("\\␣+", " ", clean_string)
  #remove starting whitespace
  clean_string =  gsub("^\\␣", "", clean_string)
  #remove "type material" if present
  clean_string = str_replace_all(clean_string, "^[Type material]*. ", "")
  clean_string = str_replace_all(clean_string, "^[Material examined]*. ", "")
  clean_string = str_replace_all(clean_string, "^[Types]*. ", "")
  clean_string = str_replace_all(clean_string, "^[Typification]*. ", "")
  return(clean_string)
}

#' @export
holotype_extractor = function(extracted_holotype = FALSE, holotype_string){
  first_word = word(holotype_string, 1)
  second_word = word(holotype_string, 2)
  third_word = word(holotype_string, 3)
  fourth_word = word(holotype_string, 4)
  first_two = paste(first_word, second_word, sep = " ")
  first_four = paste(first_word, second_word,third_word,fourth_word, sep= " ")

  #try to extract only sentence containing holotype
  #if it starts with holotype
  if (grepl("Holotype", first_word) == TRUE)
  {
    if (grepl("Holotype [(male)|(female)]* and paratype",first_four) == TRUE)
    {
      holotype = holotype_string
      extracted_holotype = TRUE
    }
    else{
      holotype = str_extract(holotype_string, ".*?Holotype(?:(?!Paratype|paratype).)*")
      extracted_holotype = TRUE
    }
  }
  #if it starts with number + gender (1 female specimen..)
  if ((grepl("[0-9] [female]*", first_two) == TRUE) | (grepl("[0-9] [male]*", first_two) == TRUE))
  {
    holotype = str_extract(holotype_string, "([0-9] male|female)(.*?)(?=[0-9] male|female)")
    extracted_holotype = TRUE
  }

  #if it's impossible to extract the single sentence (sentence part) containing holotype, save the whole text
  if (extracted_holotype == FALSE)
  {
    holotype = holotype_string
    extracted_holotype = TRUE
  }

  return(holotype)
}

#' @export
code_deduplicator = function(x){
  deduplicated = unique(x)
  if(length(deduplicated)>1){
    x = NULL
  }
  else{
    x = deduplicated
  }
  return(x)
}

#' @export
#check whether an inst code is present in grbio csv
check_grbio = function(l, grbio){
  result = NULL
  if(grepl("[aA-zZ\\-()&/]*", l, perl=TRUE) == TRUE){
    l = gsub("^[\\-()&/]*","", l, perl=TRUE)
    if (l %in% grbio$`Institutional Code/Acronym`){
      institution_codes = l
      institution_name = subset(grbio, `Institutional Code/Acronym` == institution_codes)$`Name of Institution`
      result = c(institution_codes, institution_name)
    }
  }
  return(result)

}

#' @export
#if the holotype contains an institutional code which is not in the csv from grbio check the article abbreviations
check_abbreviations = function(l, abbreviations){
  result = NULL
  if(grepl("[aA-zZ\\-()&/]*", l, perl=TRUE) == TRUE){
    starting_letters = str_extract(l, "^([aA-zZ]+)")
    if (l %in% abbreviations$inst_codes)
    {
      institution_codes = l

      if(toupper(institution_codes) == institution_codes){
        institution_name = abbreviations[abbreviations$inst_codes==institution_codes, "title"]
        result = c(institution_codes, institution_name)
      }
    }
    if (starting_letters %in% abbreviations$inst_codes){
      institution_codes = starting_letters
      if(toupper(institution_codes) == institution_codes){
        institution_name = abbreviations[abbreviations$inst_codes==institution_codes, "title"]
        result = c(institution_codes, institution_name)
      }
    }
  }
  if(is.null(result)==FALSE){
    return(result)
  }
}


#' @export
character_zero_to_null= function(x){
  if(identical(x, "character(0)") || identical(x, character(0)) || identical(unlist(x), character(0))){
    x = NULL
  }
  return(x)
}

#' @export
build_inst_dataframe = function(dwc_codes, abbr_codes, grbio_df, abbreviations){
  first_df = as.data.frame(NULL)
  second_df = as.data.frame(NULL)

  if (length(dwc_codes) > 0){
    d = sapply(dwc_codes, check_grbio, grbio = grbio_df)
    first_df  = as.data.frame(d)
    first_df = t(first_df)
    rownames(first_df) = NULL
  }
  if (length(abbr_codes) > 0){
    a = sapply(abbr_codes, check_abbreviations, abbreviations = abr_frame)
    a[sapply(a, is.null)] <- NULL
    second_df  = as.data.frame(a)
    second_df = t(second_df)
    rownames(second_df) = NULL
  }

  #colnames(first_df) = c("abbreviation", "institution_name")
  #colnames(second_df) = c("abbreviation", "institution_name")

  if (nrow(first_df)>0 && nrow(second_df)>0){
    inst_df = rbind(first_df, second_df)
    colnames(inst_df) = c("abbreviation", "institution_name")
    rownames(inst_df) = NULL
  }
  else
  {
    if (nrow(first_df)>0){
      inst_df = first_df
      colnames(inst_df) = c("abbreviation", "institution_name")
      rownames(inst_df) = NULL
    }
    if (nrow(second_df)>0){
      inst_df = second_df
      colnames(inst_df) = c("abbreviation", "institution_name")
      rownames(inst_df) = NULL
    }else{
      inst_df = as.data.frame(NULL)
    }
  }

  return(inst_df)
}

#' @export
extract_dwc_codes = function(s){
  dwc_inst_codes = str_extract_all(s, '(?<=<named-content content-type="dwc:institutional_code">).*?(?=</named-content>)')
  dwc_inst_codes = character_zero_to_null(dwc_inst_codes)
  dwc_inst_codes = unique(unlist(dwc_inst_codes))
  return(dwc_inst_codes)
}

#' @export
extract_abbreviations = function(s){
  abbrev_codes = str_extract_all(s,'<abbrev.*?</abbrev>')
  abbrev_codes = lapply(abbrev_codes, function(x) gsub("<.*?>", "", x))
  abbrev_codes = character_zero_to_null(abbrev_codes)
  abbrev_codes = unique(unlist(abbrev_codes))
  return(abbrev_codes)
}
mariyad/openbiodiving documentation built on June 3, 2019, 2:18 p.m.