#' Parsing scientific names
#'
#' Parse a scientific name into different parts, namely: (1)genus, (2)species,
#' (2)author, (3)infra-specific rank, i.e. var., f., subsp., (4)infra-specific
#' epithet, (5)author of infra-specific rank
#'
#' The details are explained in the value section
#'
#' @param taxa A character string, usually the scientific name of a plant
#' species
#' @return A data.frame containing the following columns:
#' \item{TAXON_PARSED}{The input taxa name}
#' \item{GENUS_PARSED}{The genus:
#' letters before the first white space}
#' \item{SPECIES_PARSED}{The specific
#' epithet: letters after the first white space but before the second white
#' spaccce}
#' \item{AUTHOR_OF_SPECIES_PARSED}{the Author of this taxa, if no
#' var./f./subsp. is fouund with, AUTHOR_OF_SPECIES_PARSED will include all
#' letters (including whitespace) from the second gap onwards }
#' \item{INFRASPECIFIC_RANK_PARSED}{either f., var. or subsp., if these values
#' were detected in the taxa name. If neighther f., var. or subsp. is not
#' detected, this field will be left empty}
#' \item{INFRASPECIFIC_EPITHET_PARSED}{if either f., var. or subsp. is
#' detected, letters after the "INFRASPECIFIC_RANK_PARSED" but before a white
#' space will be extracted as INFRASPECIFIC_EPITHET_PARSED. If neighther f.,
#' var. or subsp. is not detected, this field will be left empty }
#' \item{AUTHOR_OF_INFRASPECIFIC_RANK_PARSED}{All the letters after
#' INFRASPECIFIC_EPITHET_PARSED will be treated as
#' AUTHOR_OF_INFRASPECIFIC_RANK_PARSED. If neighther f., var. or subsp. is not
#' detected, this field will be left empty }
#' @author Jinlong Zhang
#' @seealso \code{\link{status}}, \code{\link{make_checklist}}
#' @examples
#'
#' parse_taxa("Epirixanthes elongata Blume")
#' parse_taxa("Epirixanthes elongata")
#'
#' @export parse_taxa
parse_taxa <-
function(taxa) {
parse_taxon <- function(taxon) {
replace_space <- function(x) {
gsub("[[:space:]]+",
" ",
gsub("^[[:space:]]+|[[:space:]]+$", "", x))
}
if (length(taxon) > 1) {
stop("Only one taxon allowed")
}
GENUS <- ""
SPECIES <- ""
AUTHOR_OF_SPECIES <- ""
INFRASPECIFIC_RANK <- ""
INFRASPECIFIC_EPITHET <- ""
AUTHOR_OF_INFRASPECIFIC_RANK <- ""
taxon <- gsub(" +", " ", replace_space(taxon))
### Parse Genus
gap1 <- regexpr(pattern = " ", text = taxon)
GENUS <-
replace_space(substr(taxon, start = 1, stop = (gap1 - 1)))
### The rest part
part1 <-
replace_space(substr(taxon, start = gap1 + 1, stop = nchar(taxon)))
gap2 <- regexpr(pattern = " ", text = part1)
### Parse Species
if (gap2 < 0) {
SPECIES <-
replace_space(substr(part1,
start = gap2 + 1,
stop = nchar(part1)))
author_temp <- ""
} else {
SPECIES <-
replace_space(substr(part1, start = 1, stop = (gap2 - 1)))
author_temp <-
replace_space(substr(part1,
start = gap2 + 1,
stop = nchar(part1)))
}
AUTHOR_OF_SPECIES <- author_temp
gap3 <- regexpr(pattern = " ", text = author_temp)
if (grepl("var\\. ", taxon) |
grepl("subsp\\. ", taxon) |
grepl(" f\\. ", taxon)) {
if (grepl("var\\. ", taxon)) {
INFRASPECIFIC_RANK <- "var."
gap_var <-
regexpr(pattern = "var\\. ", text = author_temp) + nchar("var.")
AUTHOR_OF_SPECIES <-
replace_space(substr(
author_temp,
start = 1,
stop = gap_var - nchar("var.") - 1
))
part_INFRASP_EP_AUTHOR_OF_INFRASP <-
replace_space(substr(
author_temp,
start = gap_var + 1,
stop = nchar(author_temp)
))
} else {
if (grepl("subsp\\. ", taxon)) {
INFRASPECIFIC_RANK <- "subsp."
gap_subsp <-
regexpr(pattern = "subsp\\. ",
text = author_temp) +
nchar("subsp.")
AUTHOR_OF_SPECIES <-
replace_space(substr(
author_temp,
start = 1,
stop = gap_subsp - nchar("subsp.") - 1
))
part_INFRASP_EP_AUTHOR_OF_INFRASP <-
replace_space(substr(
author_temp,
start = gap_subsp + 1,
stop = nchar(author_temp)
))
} else {
if (substr(author_temp,
start = 1,
stop = nchar("f.")) == "f.") {
INFRASPECIFIC_RANK <- "f."
gap_f <-
regexpr(pattern = "f\\. ",
text = author_temp) + nchar("f.")
position_f <-
regexpr(pattern = "f\\.",
text = taxon)[[1]][1] ### the first f. is the forma
position_white_space <-
regexpr(pattern = "", text = taxon)[[1]]
location_species_end <-
position_white_space[2]
AUTHOR_OF_SPECIES_temp <-
replace_space(substr(taxon,
start = location_species_end,
stop = position_f - 1))
AUTHOR_OF_SPECIES <-
ifelse(is.na(AUTHOR_OF_SPECIES_temp),
"",
AUTHOR_OF_SPECIES_temp)
part_INFRASP_EP_AUTHOR_OF_INFRASP <-
replace_space(substr(
author_temp,
start = gap_f + 1,
stop = nchar(author_temp)
))
} else {
INFRASPECIFIC_RANK <- ""
part_INFRASP_EP_AUTHOR_OF_INFRASP <- ""
AUTHOR_OF_SPECIES <-
replace_space(substr(
author_temp,
start = 1,
stop = nchar(author_temp)
))
}
}
}
} else {
part_INFRASP_EP_AUTHOR_OF_INFRASP <- ""
}
## part_INFRASP_EP_AUTHOR_OF_INFRASP <- replace_space(substr(author_temp, start = gap_var_or_subsp + 1, stop = nchar(author_temp)))
gap4 <-
regexpr(pattern = " ",
text = part_INFRASP_EP_AUTHOR_OF_INFRASP)
if (gap4 > 0) {
INFRASPECIFIC_EPITHET <-
replace_space(substr(
part_INFRASP_EP_AUTHOR_OF_INFRASP,
start = 1,
stop = gap4 - 1
))
AUTHOR_OF_INFRASPECIFIC_RANK <-
replace_space(substr(
part_INFRASP_EP_AUTHOR_OF_INFRASP,
start = gap4 + 1,
stop = nchar(part_INFRASP_EP_AUTHOR_OF_INFRASP)
))
} else {
INFRASPECIFIC_EPITHET <-
replace_space(substr(
part_INFRASP_EP_AUTHOR_OF_INFRASP,
start = 1,
stop = nchar(part_INFRASP_EP_AUTHOR_OF_INFRASP)
))
if (INFRASPECIFIC_EPITHET %in% strsplit(AUTHOR_OF_SPECIES,
" ")[[1]]) {
INFRASPECIFIC_EPITHET <- ""
}
}
if (!grepl(" ", taxon)) {
GENUS <- taxon
SPECIES <- ""
AUTHOR_OF_SPECIES <- ""
INFRASPECIFIC_RANK <- ""
INFRASPECIFIC_EPITHET <- ""
AUTHOR_OF_INFRASPECIFIC_RANK <- ""
}
res <- c(
taxon,
GENUS,
SPECIES,
AUTHOR_OF_SPECIES,
INFRASPECIFIC_RANK,
INFRASPECIFIC_EPITHET,
AUTHOR_OF_INFRASPECIFIC_RANK
)
names(res) <- c(
"TAXON_PARSED",
"GENUS_PARSED",
"SPECIES_PARSED",
"AUTHOR_OF_SPECIES_PARSED",
"INFRASPECIFIC_RANK_PARSED",
"INFRASPECIFIC_EPITHET_PARSED",
"AUTHOR_OF_INFRASPECIFIC_RANK_PARSED"
)
return(res)
}
res <-
data.frame(t(sapply(taxa, parse_taxon, USE.NAMES = FALSE)),
stringsAsFactors = FALSE)
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.