R/worms_func.R
In worms: Retriving Aphia Information from World Register of Marine Species

#' @title GET AphiaRecordsByNames 
#' @importFrom httr  GET content
#' @importFrom plyr rbind.fill
#' 
#' @description takes character vector with taxon names and retrives AphiaRecords from WoRMS 
#'
#' @param taxon_names character vector with names of taxa to look up.
#' @param chunksize there is a limit to the number of taxa that can be looked up at once, so request are split up into chunks. This limit seems to be variable. 50 is very safe.  
#' @param verbose be verbose
#' @param ids add column "id" and "name" with running id and search names
#' @param match taxon_names that could not retrieved will be retried with \code{\link{wormsbymatchnames}}. Implies "id=TRUE"
#' @param like Add a "\%"-sign after the ScientificName (SQL LIKE function). Default=true
#' @param marine_only Limit to marine taxa. Default=true
#' @param sleep_btw_chunks_in_sec pause between requests 
#' 
#' @return a data frame.
#' @details This function will take a character vector with taxon names, 
#' retrive AphiaRecords (CC-BY) from www.marinespecies.org using the 
#' GET /AphiaRecordsByName/{ScientificName} Method described at
#' http://www.marinespecies.org/rest/.
#' Results will be output to a data.frame with each row being a record.
#' For each name given, only the one AphiaRecord will be retrived. AphiaRecord with "accepted" status are preferred.
#' If  not present last entry will be taken which seems to result in best results.
#' 
#' @examples
#' taxon_names <- c( "Westwodilla caecula" ,  "Abra alba", "Chaetozone cf. setosa",  "Algae" )
#' w <- wormsbynames(taxon_names)
#' ## print unrecognized returns
#' failed_species <- rownames(w[is.na(w[,1]),])
#' 
#' ## try again with fuzzy matching turned on
#' \donttest{w <- wormsbynames(taxon_names, match=TRUE)}
#' 
#' ## this is how to load taxon_names from file
#' write.csv(taxon_names , file = "tax.csv", 
#'         row.names = FALSE,
#'         na = "")
#' ## check it out, then load it
#' read.csv(file = "tax.csv",
#'         na = "", 
#'         stringsAsFactors = FALSE,
#'         col.names = FALSE)
#' ## save results to file to inspect with, e.g. spreadsheet software
#' write.csv(w,file = "aphiainfo.csv", 
#'         na = "", 
#'         col.names = TRUE,
#'         row.names = TRUE)
#' 
#'         
#' @export
wormsbynames <- function(taxon_names,ids=FALSE,match=FALSE,verbose=TRUE,chunksize=50,like="false", marine_only="true",sleep_btw_chunks_in_sec=0.1){
  #library(httr)
  #library(plyr)
  
  stopifnot(inherits(taxon_names,"character"))
  if(match){ids<-TRUE}
  search_options<-paste0("like=",like,"&marine_only=",marine_only)
  my_worms<-list()
  request<-"http://www.marinespecies.org/rest/AphiaRecordsByNames"
  #li<-setNames(    as.list(c(a[[1]],"false","true")) , c( rep("scientificnames[]",length(a[[1]])),"like","marine_only" ) )     
  # r<-GET("http://www.marinespecies.org/rest/AphiaRecordsByName", query = list("scientificnames[]" = "Abietinaria%20abietina","scientificnames[]" = "Acanthocardia%20echinata"))
  
  wrapname<-gsub(" ", "%20", taxon_names)
  chunk<-split(wrapname, ceiling(seq_along(taxon_names)/chunksize))
  chunkid<-split(1:length(taxon_names), ceiling(seq_along(taxon_names)/chunksize))
  cat("REQUESTING ",length(taxon_names)," ITEMS BY NAME from World Register of Marine Species (www.marinespecies.org), ",format(Sys.time(), "%d/%m/%Y %X")," (CC-BY)\n",sep = "")
  for (round in 1:length(chunk)){
    
    if(verbose){
      cat(sprintf("%62s", paste0("chunk ",round,"/",length(chunk))),"\n")
    }
    m<-paste(paste(paste0("scientificnames[]=",chunk[[round]]),collapse="&"),search_options,sep="&")
    r<-GET(paste(request,m,sep="?"))
    Sys.sleep(sleep_btw_chunks_in_sec)
    stopifnot(r$status_code==200)
    
    # gather lists in master list
    r_parsed<-content(r,as="parsed")
    for (i in 1:length(r_parsed)){
      w_index<-unlist(chunkid[round])[i]
      if(length(r_parsed[[i]])==0){
        my_worms[[w_index]]<-NA
        cat(sprintf("%-46s       %-40s", taxon_names[w_index] , "no match"),"\n")
      } else if(length(r_parsed[[i]])==1){
        my_worms[[w_index]]<-r_parsed[[i]][[1]]
      } else {
        l<-length(r_parsed[[i]])
        my_worms[[w_index]]<-r_parsed[[i]][[l]]
        for(j in 1:l){
          if(r_parsed[[i]][[j]]$status == "accepted"){
            my_worms[[w_index]]<-r_parsed[[i]][[j]]
          }
        }
        # alternate representation
      }
    }
  }
  # pull dataframe out of master list
  non.null.list <- lapply(my_worms, lapply, function(x)ifelse(is.null(x), NA, x))
  worms<-rbind.fill(lapply(non.null.list, as.data.frame,stringsAsFactors = F))
  worms$NA.<-NULL
  if(ids){
    worms<-cbind(data.frame(id=1:nrow(worms) , name=taxon_names,stringsAsFactors = F),worms)
  }
  if (verbose) {cat("by names ........................................... DONE\n")}
  if(match){
    nonefound<-is.na(worms[,"AphiaID"])
    failed_species<-taxon_names[nonefound]
    if(length(failed_species)>0){
      failed_worms<-wormsbymatchnames(failed_species,verbose=verbose,ids=FALSE)
      worms[nonefound,c(F,F,rep(T,ncol(failed_worms)))]<-failed_worms
    } else {
      cat("  Nothing to match.\n")
    }
    
  } 
  
  
  
  return(worms)
}


















#' @title GET AphiaRecordsByMatchNames
#' @importFrom httr  GET content
#' @importFrom plyr rbind.fill
#' 
#' @description takes character vector with taxon names and retrives AphiaRecords from WoRMS 
#'
#' @param taxon_names character vector with names of taxa to look up.
#' @param chunksize only 50 taxa can be looked up per request, so request are split up into chunks (should be 50 or less)
#' @param verbose be verbose
#' @param ids add column "id" and "name" with running id and search names
#' @param marine_only Limit to marine taxa. Default=true
#' @param sleep_btw_chunks_in_sec pause between requests 
#' 
#' @return a data frame.
#' @details This function will take a character vector with taxon names, 
#' retrive AphiaRecords (CC-BY) from www.marinespecies.org using the 
#' GET /AphiaRecordsByName/{ScientificName} Method described at
#' http://www.marinespecies.org/rest/.
#' Results will be output to a data.frame with each row being a record.
#' For each name given, only the one AphiaRecord will be retrived. AphiaRecord with "accepted" status are preferred.
#' If  not present last entry will be taken which seems to result in best results.
#' 
#' @export
wormsbymatchnames <- function(taxon_names,verbose=TRUE,ids=FALSE,chunksize=50, marine_only="true",sleep_btw_chunks_in_sec=0.1){
  #library(httr)
  #library(plyr)
  stopifnot(inherits(taxon_names,"character"))
  
  
  search_options<-paste0("marine_only=",marine_only)
  my_worms<-list()
  request<-"http://www.marinespecies.org/rest/AphiaRecordsByMatchNames"
  #li<-setNames(    as.list(c(a[[1]],"false","true")) , c( rep("scientificnames[]",length(a[[1]])),"like","marine_only" ) )     
  # r<-GET("http://www.marinespecies.org/rest/AphiaRecordsByName", query = list("scientificnames[]" = "Abietinaria%20abietina","scientificnames[]" = "Acanthocardia%20echinata"))
  wrapname<-gsub(" ", "%20", taxon_names)
  chunk<-split(wrapname, ceiling(seq_along(taxon_names)/chunksize))
  chunkid<-split(1:length(taxon_names), ceiling(seq_along(taxon_names)/chunksize))
  if(verbose){
    cat("REQUESTING ",length(taxon_names)," ITEMS USING FUZZY from World Register of Marine Species  (www.marinespecies.org), ",format(Sys.time(), "%d/%m/%Y %X")," (CC-BY)\n",sep = "")
  }
  for (round in 1:length(chunk)){
    if(verbose){
      cat(sprintf("%62s", paste0("chunk ",round,"/",length(chunk))),"\n")
    }
    m<-paste(paste(paste0("scientificnames[]=",chunk[[round]]),collapse="&"),search_options,sep="&")
    r<-GET(paste(request,m,sep="?"))
    Sys.sleep(sleep_btw_chunks_in_sec)
    stopifnot(r$status_code==200)
    
    # gather lists in master list
    count<-0
    r_parsed<-content(r,as="parsed")
    for (i in 1:length(r_parsed)){
      w_index<-unlist(chunkid[round])[i]
      if(is.null(r_parsed[[i]][[1]])){
        my_worms[[w_index]]<-NA
        count<-count+1
        if(verbose){
          cat(sprintf("%-46s       %-40s", taxon_names[w_index] , "no match"),"\n")
        }
      } else {
        
        if(length(r_parsed[[i]])==1){
          my_worms[[w_index]]<-r_parsed[[i]][[1]]
        } else {
          my_worms[[w_index]]<-r_parsed[[i]][[1]]
          
          for(j in 1:length(r_parsed[[i]])){
            if(r_parsed[[i]][[j]]$status == "unaccepted"){
              my_worms[[w_index]]<-r_parsed[[i]][[j]]
            }
          }
          for(j in 1:length(r_parsed[[i]])){
            if(r_parsed[[i]][[j]]$status == "accepted"){
              my_worms[[w_index]]<-r_parsed[[i]][[j]]
            }
          }
        }
        if(verbose){
          cat(sprintf("%-49s -> %-40s %7s", taxon_names[w_index],my_worms[[w_index]]$scientificname,my_worms[[w_index]]$match_type),"\n")
        }
      }
    }
  }
  if(count<length(r_parsed)){
    # pull dataframe out of master list
    non.null.list <- lapply(my_worms, lapply, function(x)ifelse(is.null(x), NA, x))
    worms<-rbind.fill(lapply(non.null.list, as.data.frame,stringsAsFactors = F))
    worms$NA.<-NULL
    if(ids){
      worms<-cbind(data.frame(id=1:nrow(worms) , name=taxon_names,stringsAsFactors = F),worms)
    }
  } else {
    worms<-my_worms
  }
  if (verbose) {
    cat("matching names ..................................... DONE \n")
    cat("WARNING, fuzzy matching may give unexpected results - check results!\n\n")
  }
  return(worms)
}









#' @title GET AphiaRecordByAphiaID
#' @importFrom httr  GET content
#' @importFrom plyr rbind.fill
#' 
#' @description takes more than one AphiaID and retrives AphiaRecords from WoRMS 
#'
#' @param x AphiaIDs
#' @param verbose be verbose
#' @param ids add column "id" and "name" with running id and search names
#' @param sleep_btw_chunks_in_sec pause between requests 
#' 
#' @return a data frame.
#' @details This function will take a integer vector with AphiaIDs, 
#' retrive AphiaRecords from www.marinespecies.org using the 
#' GET /AphiaRecordByAphiaID Method described at
#' http://www.marinespecies.org/rest/.
#' Results will be output to a data.frame with each row being a record.
#' 
#' For examples, see  \code{\link{wormsaccepted}}
#' 
#' @export
wormsbyid <- function(x,verbose=TRUE,ids=FALSE,sleep_btw_chunks_in_sec=0.01){
  #library(httr)
  #library(plyr)
  stopifnot(inherits(x,c("numeric","integer")))
  my_worms<-list()
  if(verbose){      
    cat("REQUESTING ",length(x)," ITEMS BY ID from World Register of Marine Species  (www.marinespecies.org), ",format(Sys.time(), "%d/%m/%Y %X")," (CC-BY)\n",sep = "")  
  }
  for (round in 1:length(x)){
    if(verbose){      cat(",",x[round],sep = "")    }
    r<-GET(paste("http://www.marinespecies.org/rest/AphiaRecordByAphiaID",x[round],sep="/"))
    Sys.sleep(sleep_btw_chunks_in_sec)
    if (r$status_code!=200) {
      if(verbose){        cat("(",r$status_code,")",sep = "")      }
      r_parsed<-NA
    } else {
      #if(verbose){        cat("success (Status ",r$status_code,") \n",sep = "")      }
      r_parsed<-content(r,as="parsed")
    }
    if(verbose & round%%10==0){        cat("\n",sep = "")      }
    my_worms[[round]]<-r_parsed
  }
  
  
  if (verbose) {cat("\n")}
  # pull dataframe out of master list
  non.null.list <- lapply(my_worms, lapply, function(x)ifelse(is.null(x), NA, x))
  worms<-rbind.fill(lapply(non.null.list, as.data.frame,stringsAsFactors = F))
  if(ids){
    worms<-cbind(data.frame(id=1:nrow(worms) , name=x,stringsAsFactors = F),worms)
  }
  if (verbose) {cat("by id .............................................. DONE\n")}
  return(worms)
}


#' @title Recursivly retrieves respective "accepted" AphiaRecords for all synonyms if not already there
#' 
#' @description takes data.frame as output by \code{\link{wormsbynames}} , 
#' \code{\link{wormsbymatchnames}}, or \code{\link{wormsbyid}} and retrieves  additional
#' Aphia records (CC-BY) for not-"accepted" records in order to ultimately have "accepted" synonyms for all 
#' records in the dataset.
#'
#' @param x data.frame
#' @param verbose be verbose
#' @param sleep_btw_chunks_in_sec pause between requests 
#' @param once only one retrival iteration. No concatination of output with result. (For debugging)
#' 
#' @return a data frame.
#' @details This function will take a integer vector with AphiaIDs, 
#' retrive AphiaRecords from www.marinespecies.org using the 
#' GET /AphiaRecordByAphiaID Method described at
#' http://www.marinespecies.org/rest/.
#' Results will be outbut to a data.frame with each row being a record.
#' 
#' For examples, see  \code{\link{wormsaccepted}}
#'
#' @export
wormsconsolidate <- function(x,verbose=TRUE,sleep_btw_chunks_in_sec=0.01,once=FALSE){
  if(FALSE){
    x<-w
    verbose=TRUE
    sleep_btw_chunks_in_sec=0.01
  }
  count<-0
  while(TRUE){
    count<-count+1
    ids<-ifelse(names(x)[1]=="id",TRUE,FALSE)
    #cat(ids,"\n")
    
    
    unexplained<-x[x$valid_AphiaID>0 & (!is.na(x$valid_AphiaID)) & (x$status!="accepted") & (!x$valid_AphiaID%in%x$AphiaID),]
    anz_unexplaind<-nrow(unexplained)
    if(   anz_unexplaind!=0   ){
      if (verbose) {    cat("\nstill ",anz_unexplaind,"unhappy worms\n")   }
      happyworms<-wormsbyid(unexplained$valid_AphiaID,verbose=verbose) 
      if(ids){
        happyworms<-cbind(data.frame(id=(max(x$id)+1):(max(x$id)+nrow(happyworms)) , name=happyworms$scientificname), happyworms)
      }
      if(once){
        cat("rbind result with input and run maybe again!\n")
        return(happyworms)
      } else {
        x<-rbind(x,happyworms)
      }
    } else {
      if (verbose) {cat("consolidating ...................................... DONE\n")}
      break
    }
    
  }
  return(x)
}




#' @title Constructs "accepted_id" column which contains the "AphiaID" of the respective "accepted" taxon
#' 
#' @description takes data.frame as output by \code{\link{wormsbynames}} , 
#' \code{\link{wormsbymatchnames}}, or \code{\link{wormsbyid}} 
#' and add field "accepted_id" wich contains the "AphiaID" 
#' of the respective "accepted" taxon
#'
#' @param x data.frame
#' @param verbose be verbose
#' @param n_iter maximum search depth. Usually 3 is sufficient. Safety feature for breaking the \code{while} loop
#' 
#' @return a data frame.
#' @details This function helps updating you taxon information and eliminates ambiguity 
#' because the valid AphiaID is nor neccessary the AphiaID of an accepted taxon. You should run 
#' \code{\link{wormsconsolidate}} bevorhand to enshure all "accepted" taxons are present.
#' 
#' @examples 
#' ## start with IDs that are no longer up to date 
#' # get the Aphia information
#' u<-wormsbyid(c(424548,340537))
#' 
#' #recursively retrive information on the taxa they refer to
#' v<-wormsconsolidate(u)
#' 
#' # what are the currently correct "accepted" taxa? Answer: "accepted_id".
#' w<-wormsaccepted(v)
#' w[,c("scientificname","AphiaID","status","valid_AphiaID","valid_name","accepted_id")]
#' 
#' 
#' @export
wormsaccepted <- function(x,verbose=TRUE,n_iter=10){
  # fixme make shure valid_AphiaIDs are unique
  x$accepted_id<-NA 
  noacceptedid<-noacceptedrow<-NULL
  na_entries<-noval_entries<-0
  for(i in 1:nrow(x)){
    count<-0
    if(is.na(x$valid_AphiaID[i])) {
      na_entries<-na_entries+1
      if(!is.na(x$scientificname[i])){
        noval_entries<-noval_entries+1
        cat(x$scientificname[i] , " has no valid_AphiaID \n")
      }
      next
    }
    x$accepted_id[i]<-x$valid_AphiaID[i]
    if(is.na(match(x$accepted_id[i],x$AphiaID))){
      cat("ERROR! AphiaID",x$accepted_id[i],"is missing ... rerun wormsconsolidate and join with input")
      return()
    }
    while(x$status[match(x$accepted_id[i],x$AphiaID)] != "accepted"){
      count<-count+1
      x$accepted_id[i]<-x$valid_AphiaID[match(x$accepted_id[i],x$AphiaID)]
      if(count>n_iter){
        cat("no accepted AphiaID for ",x$scientificname[i] ,x$AphiaID[i], " after", n_iter," interations\n")
        noacceptedid <- cbind(noacceptedid,x$AphiaID[i])
        noacceptedrow <- cbind(noacceptedrow,i)
        break
      }
    }
  }
  if (verbose) {
    if(length(noacceptedid)>0){
      cat("AphiaID without acceptance ........................... ",length(noacceptedid),"\n")
      cat(paste(noacceptedid,","))
      cat("\n")
    }
    if(length(noacceptedrow)>0){
      cat("AphiaID without acceptance (rowsnumbers)............... \n")
      cat(paste(noacceptedrow,","))
      cat("\n")
    }
    if(na_entries>0){
    cat("other entries with no valid ID (NAs)............... ",na_entries-noval_entries,"\n")
    }
    cat("DONE ............................................... accepting\n")
    }
  return(x)
}
Any scripts or data that you put into this service are public.
worms documentation built on May 1, 2019, 9:22 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
worms
Retriving Aphia Information from World Register of Marine Species

R/worms_func.R
In worms: Retriving Aphia Information from World Register of Marine Species

Try the worms package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

worms Retriving Aphia Information from World Register of Marine Species

R/worms_func.R In worms: Retriving Aphia Information from World Register of Marine Species

Try the worms package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

worms
Retriving Aphia Information from World Register of Marine Species

R/worms_func.R
In worms: Retriving Aphia Information from World Register of Marine Species