R/namesCompare.R

Defines functions namesCompare

Documented in namesCompare

#' @title Checking species names for misspelling and synonyms
#' @description The function cross-references two vectors of species names
#'   checking for possible synonyms, misspelled names, and genus-species or
#'   species-subspecies correspondence.
#' @usage namesCompare(vec1,vec2,proportion=0.15)
#' @param vec1,vec2 a vector of species names. Genus names only are also
#'   allowed. Generic name and specific epithet must be separated by '_'. Note
#'   that \code{vec2} is used as the reference. Incomplete or suspicious names
#'   are better placed in \code{vec1} (see example below).
#' @param proportion the maximum proportion of different characters between any
#'  \code{vec1-vec2} names pair to consider it a possible misspelling.
#' @importFrom utils adist
#' @export
#' @return The function returns a \code{list} including:
#' @return \strong{$genus} if \code{vec1} includes genera names which miss
#'   specific epithet, this object lists all the species in \code{vec2}
#'   belonging to each of the genera.
#' @return \strong{$subspecies} if \code{vec1} includes subspecies (i.e. two
#'   epithets after genus name), this object lists species in \code{vec2}
#'   possibly corresponding to each of the subspecies.
#' @return \strong{$epithet} lists species with matching epithets as possible
#'   synonyms.
#' @return \strong{$misspelling} lists possible misspelled names. For each
#'   proposed mismatched names pair the proportion of characters in the
#'   \code{vec1} differing from the string in \code{vec2} is returned.
#' @author Silvia Castiglione, Carmela Serio, Antonella Esposito
#' @examples
#' \dontrun{
#' names(DataFelids$statefel)->nams
#' nams[c(19,12,37,80,43)]<-c("Puma_yagouaroundi","Felis_manul","Catopuma",
#'                            "Pseudaelurus","Panthera_zdansky")
#' nams<-nams[-81]
#'
#' namesCompare(nams,names(DataFelids$statefel))
#' namesCompare(names(DataFelids$statefel),nams)
#' }

namesCompare<-function(vec1,vec2,proportion=0.15){
  if(!is.null(ncol(vec1))){
    if(ncol(vec1)>1) warning(paste("vec1:",ncol(vec1),"columns supplied, only the first one will be used"))
    vec1[,1]->vec1
  }

  if(!is.null(ncol(vec2))){
    if(ncol(vec2)>1) warning(paste("vec2:",ncol(vec2),"columns supplied, only the first one will be used"))
    vec2[,1]->vec2
  }

  if(all(vec1%in%vec2)) stop("vec1 and vec2 species perfectly match")

  vec1[which(!vec1%in%vec2)]->vec1
  vec2[which(!vec2%in%vec1)]->vec2

  ### GENERA CHECK ###
  if(any(!grepl("_",vec1))){
    do.call(rbind,lapply(vec1[!grepl("_",vec1)],function(x)
      data.frame(vec1=rep(x,length(grep(x,vec2))),vec2=vec2[grep(x,vec2)])))->genera
    if(nrow(genera)<1) genera<-NULL
  }else genera<-NULL
  vec1[which(!vec1%in%genera[,1])]->vec1

  ### SUBSPECIES CHECK ###
  if(any(sapply(strsplit(vec1,"_"),length)>2)){
    vec1[which(sapply(strsplit(vec1,"_"),length)>2)]->suspv1
    strsplit(vec1,"_")[which(sapply(strsplit(vec1,"_"),length)>2)]->suspv1_split

    do.call(rbind,mapply(j=suspv1_split,w=suspv1,function(j,w){
      combn(j,2,function(k) paste(k,collapse="_"))[1:2]->check

      if(any(check%in%vec2)) data.frame(vec1=rep(w,length(which(vec2%in%check))),vec2=vec2[which(vec2%in%check)]) else{
        adcheck2 <- adist(check,vec2)/as.integer(nchar(check))
        adcheck2 <- data.frame(vec1=check,vec2=vec2[apply(adcheck2, 1, which.min)],proportion=apply(adcheck2, 1, min))
        if(any(adcheck2$proportion<=proportion)){
          data.frame(vec1=rep(w,length(which(adcheck2$proportion<=proportion))),
                     vec2=adcheck2[which(adcheck2$proportion<=proportion),2])
        } else NULL
      }
    },SIMPLIFY = FALSE))->subspecies
    vec1[which(!vec1%in%subspecies[,1])]->vec1
  } else subspecies<-NULL

  ### MISSPELLING CHECK ###
  ad1 <- adist(vec1,vec2)/nchar(vec1)
  ad1.dataframe <- data.frame(vec1,vec2=vec2[apply(ad1, 1, which.min)],proportion=apply(ad1, 1, min))

  ad1.dataframe[which(ad1.dataframe$proportion<=proportion),]->ad1.dataframe
  misspelling <- ad1.dataframe[order(ad1.dataframe[,3]),]
  if(nrow(misspelling)<1) misspelling<-NULL

  ### epithet CHECK ###
  vec1[which(sapply(strsplit(vec1,"_"),length)>=2)]->vec1
  vec2[which(sapply(strsplit(vec2,"_"),length)>=2)]->vec2

  do.call(rbind,lapply(1:length(vec2),function(k){
    strsplit(vec2,"_")[[k]]->ep2
    if(length(ep2)>2) {
      if(any(duplicated(ep2))) data.frame(ind=k, sp=ep2[2]) else data.frame(ind=rep(k,2), sp=ep2[2:3])
    } else data.frame(ind=k, sp=ep2[2])
  }))->episp2

  do.call(rbind,lapply(1:length(strsplit(vec1,"_")),function(k){
    if(length(strsplit(vec1,"_")[[k]])>2) {
      strsplit(vec1,"_")[[k]][2:3]->ep
      if(any(duplicated(ep))) ep[1]->ep
    } else strsplit(vec1,"_")[[k]][2]->ep

    if(any(episp2[,2]%in%ep)) data.frame(vec1=rep(vec1[k],length(which(episp2[,2]%in%ep))),
                                         vec2=vec2[episp2[which(episp2[,2]%in%ep),1]]) else {
                                           adepi <- adist(ep,episp2[,2])/nchar(ep)
                                           adepi<-data.frame(ep1=ep,ep2=vec2[episp2[apply(adepi, 1, which.min),1]],proportion=apply(adepi, 1, min))
                                           if(any(adepi$proportion<=proportion)){
                                             data.frame(vec1=rep(vec1[k],length(which(adepi$proportion<=proportion))),
                                                        vec2=adepi[which(adepi$proportion<=proportion),2])
                                           } else NULL
                                         }
  }))->epithet

  return(list(genus=genera,subspecies=subspecies,epithet=epithet,misspelling=misspelling))
}

Try the RRphylo package in your browser

Any scripts or data that you put into this service are public.

RRphylo documentation built on June 7, 2023, 5:49 p.m.