R/parse_taxa.R

parse_taxa <- function(taxa){
    parse_taxon <- function(taxon){
        replace_space <- function(x) {
            gsub("[[:space:]]+", " ", gsub("^[[:space:]]+|[[:space:]]+$", "", x))
        }

        if(length(taxon) > 1){
            stop("Only one taxon allowed")
        }

        GENUS                         <- ""
        SPECIES                       <- ""
        AUTHOR_OF_SPECIES             <- ""
        INFRASPECIFIC_RANK            <- ""
        INFRASPECIFIC_EPITHET         <- ""
        AUTHOR_OF_INFRASPECIFIC_RANK  <- ""

        taxon <- gsub(" +", " ", replace_space(taxon))
        ### Parse Genus
        gap1 <- regexpr(pattern = " ", text = taxon)
        GENUS <- replace_space(substr(taxon, start = 1, stop = (gap1-1)))
        ### The rest part
        part1 <- replace_space(substr(taxon, start = gap1 + 1, stop = nchar(taxon)))
        gap2 <- regexpr(pattern = " ", text = part1)

        ### Parse Species
        if(gap2 < 0){
            SPECIES <- replace_space(substr(part1, start = gap2 + 1, stop = nchar(part1)))
            author_temp <- ""
        } else {
            SPECIES <- replace_space(substr(part1, start = 1, stop = (gap2 - 1)))
            author_temp <- replace_space(substr(part1, start = gap2 + 1, stop = nchar(part1)))
        }
            AUTHOR_OF_SPECIES <- author_temp

            gap3 <- regexpr(pattern = " ", text = author_temp)
        if(grepl("var\\. ", taxon)|grepl("subsp\\. ", taxon)|grepl(" f\\. ", taxon)){
            if(grepl("var\\. ", taxon)){
                INFRASPECIFIC_RANK            <- "var."
                gap_var <- regexpr(pattern = "var\\. ", text = author_temp) + nchar("var.")
                AUTHOR_OF_SPECIES <- replace_space(substr(author_temp, start = 1, stop = gap_var - nchar("var.") -1))
                part_INFRASP_EP_AUTHOR_OF_INFRASP <- replace_space(substr(author_temp, start = gap_var + 1, stop = nchar(author_temp)))
            } else {
                if(grepl("subsp\\. ", taxon)){
                    INFRASPECIFIC_RANK            <- "subsp."
                    gap_subsp <- regexpr(pattern = "subsp\\. ", text = author_temp) + nchar("subsp.")
                    AUTHOR_OF_SPECIES <- replace_space(substr(author_temp, start = 1, stop = gap_subsp - nchar("subsp.") -1))
                    part_INFRASP_EP_AUTHOR_OF_INFRASP <- replace_space(substr(author_temp, start = gap_subsp + 1, stop = nchar(author_temp)))
                } else {
                    if(substr(author_temp, start = 1, stop = nchar("f.")) == "f."){
                            INFRASPECIFIC_RANK            <- "f."
                            gap_f <- regexpr(pattern = "f\\. ", text = author_temp) + nchar("f.")
                            position_f <- regexpr(pattern = "f\\.", text = taxon)[[1]][1] ### the first f. is the forma
                            position_white_space <- regexpr(pattern = "", text = taxon)[[1]]
                            location_species_end <- position_white_space[2]
                            AUTHOR_OF_SPECIES_temp <- replace_space(substr(taxon, start = location_species_end, stop = position_f - 1))
                            AUTHOR_OF_SPECIES <- ifelse(is.na(AUTHOR_OF_SPECIES_temp), "", AUTHOR_OF_SPECIES_temp)
                            part_INFRASP_EP_AUTHOR_OF_INFRASP <- replace_space(substr(author_temp, start = gap_f + 1, stop = nchar(author_temp)))
                        } else {
                            INFRASPECIFIC_RANK            <- ""
                            part_INFRASP_EP_AUTHOR_OF_INFRASP <- ""
                            AUTHOR_OF_SPECIES <- replace_space(substr(author_temp, start = 1, stop = nchar(author_temp)))
                        }
                }
             }
             } else {
                part_INFRASP_EP_AUTHOR_OF_INFRASP <- ""
         }

                ##part_INFRASP_EP_AUTHOR_OF_INFRASP <- replace_space(substr(author_temp, start = gap_var_or_subsp + 1, stop = nchar(author_temp)))
                 gap4 <- regexpr(pattern = " ", text = part_INFRASP_EP_AUTHOR_OF_INFRASP)
                 if(gap4 > 0){
                     INFRASPECIFIC_EPITHET         <- replace_space(substr(part_INFRASP_EP_AUTHOR_OF_INFRASP, start = 1, stop = gap4 - 1 ))
                     AUTHOR_OF_INFRASPECIFIC_RANK  <- replace_space(substr(part_INFRASP_EP_AUTHOR_OF_INFRASP, start = gap4 + 1, stop = nchar(part_INFRASP_EP_AUTHOR_OF_INFRASP)))
                 } else {
                     INFRASPECIFIC_EPITHET         <- replace_space(substr(part_INFRASP_EP_AUTHOR_OF_INFRASP, start = 1, stop = nchar(part_INFRASP_EP_AUTHOR_OF_INFRASP)))
                     if(INFRASPECIFIC_EPITHET %in% strsplit(AUTHOR_OF_SPECIES, " ")[[1]]){
                         INFRASPECIFIC_EPITHET <- ""
                     }
                 }

        if(!grepl(" ", taxon)){
           GENUS = taxon
           SPECIES                       <- ""
           AUTHOR_OF_SPECIES             <- ""
           INFRASPECIFIC_RANK            <- ""
           INFRASPECIFIC_EPITHET         <- ""
           AUTHOR_OF_INFRASPECIFIC_RANK  <- ""
        }

        res <- c(taxon                        ,
                 GENUS                        ,
                 SPECIES                      ,
                 AUTHOR_OF_SPECIES            ,
                 INFRASPECIFIC_RANK           ,
                 INFRASPECIFIC_EPITHET        ,
                 AUTHOR_OF_INFRASPECIFIC_RANK )
        names(res) <- c("TAXON_PARSED"                        ,
                        "GENUS_PARSED"                        ,
                        "SPECIES_PARSED"                      ,
                        "AUTHOR_OF_SPECIES_PARSED"            ,
                        "INFRASPECIFIC_RANK_PARSED"           ,
                        "INFRASPECIFIC_EPITHET_PARSED"        ,
                        "AUTHOR_OF_INFRASPECIFIC_RANK_PARSED" )
        return(res)
    }

    res <- data.frame(t(sapply(taxa, parse_taxon, USE.NAMES = FALSE)), stringsAsFactors = FALSE)
    return(res)
}

Try the herblabel package in your browser

Any scripts or data that you put into this service are public.

herblabel documentation built on May 2, 2019, 4:47 p.m.