R/match_algorithm.R
In redbookperu: Access and Analyze Data from the Red Book of Endemic Plants of Peru

Documented in .match_algorithm

#' The matching algorithm
#' @keywords internal
#'
.match_algorithm  <- function(splist_class,
                              max_distance,
                              progress_bar = FALSE,
                              keep_closest = TRUE,
                              genus_fuzzy = TRUE,
                              grammar_check = FALSE) {
  # N species
  n_sps <- nrow(splist_class)

  # N classes
  n_class <- ncol(redbookperu::redbook_sps_class)

  # Save results
  exact <- matrix(ncol = n_class + 1, nrow = n_sps)
  exact

  # Loop across species
  for (i in seq_len(n_sps)) {
    splist_class_i <- splist_class[i, ]
    check_non_defined <-
      splist_class_i[3] %in% c("SP", "SP.",
                               "SPEC.", "AGG.")

    if (!check_non_defined) {
      # Search genus position
      max_distance2 <- ifelse(genus_fuzzy, max_distance, 0)

      pos_genus_pre <- .group_ind(
        group_name = splist_class_i[2],
        group_ref = redbookperu::redbook_position$genus,
        max_distance2,
        only_one = FALSE,
        closest = TRUE
      )

      pos_genus <- .genus_search(pos_genus_pre)

      if (!any(is.na(pos_genus))) {
        # Try exact match first
        exact[i,] <- .exact_match(splist_class_i,
                                  pos_genus,
                                  n_class)
        # Try common grammar errors
        if (any(is.na(exact[i, ])) & grammar_check) {
          epis <- .sub_common(splist_class_i[3])
          if (length(epis) > 0) {
            names_grammar <- paste(splist_class_i[2],
                                   epis)
            splist_class_i_mult <- .splist_classify(names_grammar)
            n_gram <- length(names_grammar)
            temp <- matrix(nrow = n_gram,
                           ncol = length(exact[i, ]))
            for (k in seq_len(n_gram)) {
              temp[k, ] <- .exact_match(splist_class_i_mult[k, ],
                                        pos_genus,
                                        n_class)
            }
            pos_gram <- apply(temp, 1, function(x) {all(!is.na(x))})
            n_pos_gram <- sum(pos_gram)
            if (n_pos_gram > 1) {
              pos_gram_t <- pos_gram == TRUE
              pos_gram[pos_gram_t] <- c(TRUE, rep(FALSE, (n_pos_gram - 1)))
              temp[pos_gram, ncol(temp)] <- TRUE
            }
            if (any(pos_gram)) {
              exact[i,] <- temp[pos_gram, ]
            }
          }
        }

        # Try fuzzy
        if (any(is.na(exact[i, ])) & max_distance > 0) {
          exact[i,] <- .fuzzy_match(splist_class_i,
                                    pos_genus,
                                    max_distance,
                                    n_class,
                                    keep_closest = keep_closest,
                                    max_distance2 = max_distance2)
        }
      }
    }
  }
  return(exact)
}




#-------------------------------------------------------#
# Exact match function
.exact_match <- function(splist_class_i,
                         pos_genus,
                         n_class,
                         fuzzy = FALSE) {
  # Look the categories that are equal
  #redbookperu::redbook_sps_class
  sp_pos <- apply(redbookperu::redbook_sps_class[pos_genus, -n_class,
                                       drop = FALSE],
                  1,
                  function(x) {
                    x == splist_class_i
                  })
  sp_pos
  # Identify the actual number positions
  choosen <- which(sp_pos[3, ])

  # Set homonyms FALSE
  homonyms <- FALSE


  # Work when fuzzy
  if (fuzzy) {
    choosen <- seq_len(ncol(sp_pos))
  }


  n_choosen <- length(choosen)

  # IF NEEDED INSERT COMMON ERRORS HERE
  if (n_choosen == 0) {
    # No match found
    return(rep(NA, n_class + 1))
  } else {
    # if there is more than one matched genus and epithet
    # keep the one with more matches across subcategories
    if (n_choosen > 1) {
      sum_equals <- colSums(sp_pos[, choosen])
      pos_equals <- sum_equals == max(sum_equals)
      choosen <- choosen[pos_equals]
      choosen
      if (length(choosen) > 1) {
        matched_sp <-
          redbookperu::redbook_sps_class[pos_genus, ,
                                         drop = FALSE][choosen, "id"]
      # status_pos <-
      #   redbook_tab[as.numeric(matched_sp), "rank"]
      # status_pos
      # test_accepted <- status_pos == "null"
      # test_accepted
      # if (any(test_accepted)) {
      #   choosen <- choosen[test_accepted][1]
      #   homonyms <- TRUE
      # } else {
      #   homonyms <- TRUE
      #   choosen <- choosen[1]
      # }
      }
    }


    # Pick matched species ID
    matched_sp <-
      redbookperu::redbook_sps_class[pos_genus, , drop = FALSE][choosen, "id"]

    # Concatenate with the matching info
    matched_result <- c(matched_sp, sp_pos[, choosen], homonyms)

    return(matched_result)
  }
}

#-------------------------------------------------------#
# Fuzzy matching function
.fuzzy_match <- function(splist_class_i,
                         pos_genus = NULL,
                         max_distance,
                         n_class,
                         return_all = FALSE,
                         keep_closest = TRUE,
                         max_distance2 = max_distance) {
  # If we did not find an approximation of the genus
  fuzzy_match <- NULL
  if (!is.null(pos_genus)) {
    # Use the `.agrep_whole` function with the max_distance parameter
    name1 <- paste(splist_class_i[2], splist_class_i[3])
    name2 <- paste(redbookperu::redbook_sps_class[pos_genus, 2],
                   redbookperu::redbook_sps_class[pos_genus, 3])

    fuzzy_match <- .agrep_whole(name1,
                                name2,
                                max_distance = max_distance)
  }
  if (is.null(pos_genus) | length(fuzzy_match) == 0) {
    pos_genus <- seq_len(nrow(redbookperu::redbook_sps_class))
    # Use the `.agrep_whole` function with the max_distance parameter
    name1 <- paste(splist_class_i[2], splist_class_i[3])
    name2 <- paste(redbookperu::redbook_sps_class[, 2],
                   redbookperu::redbook_sps_class[, 3])

    fuzzy_match <- .agrep_whole(name1,
                                name2,
                                max_distance = max_distance2)
  }

  if (length(fuzzy_match) == 0) {
    # No match found
    return(rep(NA, n_class + 1))
  } else {
    if (keep_closest) {
      # Keep the closest
      dist_names <- utils::adist(name1, name2[fuzzy_match])
      which_closest <- which(dist_names == min(dist_names))
      fuzzy_match <- fuzzy_match[which_closest]

    }

    # Reuse the exact_match function, but look only for fuzzy matches
    pos_genus <-
      as.numeric(redbookperu::redbook_sps_class[pos_genus, "id"][fuzzy_match])
    n_pos_genus <- length(pos_genus)

    res_fuzzy <- matrix(nrow = n_pos_genus, ncol = n_class + 1)

    for (i in seq_len(n_pos_genus)) {
      res_fuzzy[1, ] <- .exact_match(splist_class_i,
                                     pos_genus,
                                     n_class,
                                     fuzzy = TRUE)
    }

    # keep only the ones with highest number of classes matches
    rights <- apply(res_fuzzy[, -1, drop = FALSE],
                    1,
                    function(x) {
                      sum(x == "TRUE")
                    })

    pos_genus2 <- which(rights == max(rights))

    # If more than one
   if (!return_all) {
    # if (length(pos_genus2) > 1) {
    #   #redbookperu::redbook_tab
    #   sub_tab <- redbook_tab[res_fuzzy[pos_genus2, 1], ]
    #   pos_genus2 <- which(sub_tab$Status == "accepted")
    #   res_fuzzy[, n_class + 1] <- TRUE # homonyms to TRUE
    #   if (length(pos_genus2) == 0) {
    #     pos_genus2 <- 1
    #   }
    # }
     return(res_fuzzy[pos_genus2[1], ])
   }
    if (return_all) {
      return(res_fuzzy[pos_genus2, 1])
    }
  }
}