R/cqc_recommend.R

Defines functions cqc_recommend.cqc_match_result cqc_recommend.cqc_match_result_gate cqc_recommend.cqc_match_result_keyword cqc_recommend.cqc_match_result_marker cqc_recommend.cqc_match_result_channel cqc_recommend.cqc_check cqc_recommend

Documented in cqc_recommend cqc_recommend.cqc_match_result

cqc_recommend <- function(x, ...) UseMethod("cqc_recommend")
cqc_recommend.cqc_check <- function(x, max.distance = 0.1, ...) {
  match_result <- cqc_match(groups, ...)
  cqc_recommend(match_result, max.distance = max.distance)
}

cqc_recommend.cqc_match_result_channel <- function(x, ...) {
  res <- cqc_recommend.cqc_match_result(x, ...)
  attr(res, "class") <- c("cqc_solution_channel", attr(res, "class"))
  res
}
cqc_recommend.cqc_match_result_marker <- function(x, ...) {
  res <- cqc_recommend.cqc_match_result(x, ...)
  attr(res, "class") <- c("cqc_solution_marker", attr(res, "class"))
  res
}
cqc_recommend.cqc_match_result_keyword <- function(x, ...) {
  res <- cqc_recommend.cqc_match_result(x, ...)
  attr(res, "class") <- c("cqc_solution_keyword", attr(res, "class"))
  res
}
cqc_recommend.cqc_match_result_gate <- function(x, ...) {
  res <- cqc_recommend.cqc_match_result(x, ...)
  attr(res, "class") <- c("cqc_solution_gate", attr(res, "class"))
  res
}
#' Find solution to resolve the discrepancy discovered by match_reference
#'
#' It tries to find the aproximate match(based on 'agrep') between the target and reference as well as the extra redundunt items that can be removed.
#' @name cqc_recommend
#' @return a table (with 'from' and 'to' columns) represents the itemized fix recommendation. When 'to' is 'NA', it means the entry is redundunt and can be removed
#' @examples
#' \dontrun{
#' solution <- cqc_recommend(groups, select = c(1, 4))
#' }
#' @param x A CQC object of some kind. See vignettes.
#' @param max.distance -- Maximum distance allowed for a match. This is passed to the max.distance argument in \code{\link{agrep}}.
#' @param partial whether to do the partial sub string matching before the approximate string matching
#' @param ... additional arguments not for the user.
#' @importFrom tibble tibble add_row
#' @importFrom utils adist
cqc_recommend.cqc_match_result <- function(x, max.distance = 0.1, partial = TRUE, ...) {
  unmatched_items <- FALSE
  type <- sub("cqc_match_result_", "", class(x)[1])
  res <- map_dfr(x, function(check_result) {
    targets_queue <- targets <- check_result[["unknown"]]
    refs_queue <- refs <- check_result[["missing"]]
    df <- tibble(from = character(), to = character())

    if (length(refs_queue) > 0 && length(targets_queue) > 0)
    {

      #(1st mat)
      # Levenshtein (edit) distance
      dist_mat <- adist(refs, targets, ignore.case = TRUE)

      #(2nd mat)
      # check if each match passes the max.distance check
      # agrep can be avoided if the formula of max.distance used by agrep is figured out
      is_pass_mat <- do.call(rbind, sapply(refs, function(ref){
                                     sapply(targets, function(target){
                                        #default definition for max.distance of agrep is non-symetric
                                        is.match <- agrepl(ref, target, ignore.case = TRUE, max.distance = max.distance)
                                        is.match || agrepl(target, ref, ignore.case = TRUE, max.distance = max.distance)

                                       })
                                    }, simplify = FALSE)
                            )
      # (3rd mat)
      #check if one if the substring of the other
      if(partial)
      {
        is_substring_mat <- do.call(rbind, sapply(refs, function(ref){
          sapply(targets, function(target){
            grepl(paste0("\\Q", ref, "\\E"), target, ignore.case = TRUE)||grepl(paste0("\\Q", target, "\\E"), ref, ignore.case = TRUE)

          })
        }, simplify = FALSE)
        )
      }else#create dummy mat
        is_substring_mat <- matrix(TRUE, nrow = length(refs), ncol = length(targets), dimnames = list(refs, targets))

      nrows <- nrow(dist_mat)
      ncols <- ncol(dist_mat)

      #combine the info from 3 matrices above to find the best match

      #first scan the substr mat to get pairs that has substring match
      is_sub_idx <- which(is_substring_mat)
      #order these substring matched pairs by the approximate string dist
      #to break tie when multiples are substr-matched to one
      is_sub_idx <- is_sub_idx[order(dist_mat[is_sub_idx])]
      #then process the pairs that has no substr relations
      #and also discard the pairs that do not pass dist threshold
      no_sub_idx <- which((!is_substring_mat)&is_pass_mat)
      #order them by dist
      no_sub_idx <- no_sub_idx[order(dist_mat[no_sub_idx])]
      #add the pairs selected/ordered by the above rules
      for(idx in c(is_sub_idx, no_sub_idx))
      {
        if (length(refs_queue) == 0 || length(targets_queue) == 0)
          break#terminate the loop if one of the queues is empty

        # try to parse out the names of the pair by its x, y coordinates within the mat
        ridx <- idx %% nrows
        if (ridx == 0) {
          ridx <- nrows
        }
        cidx <- ceiling(idx / nrows)
        # get the pair
        from <- targets[cidx]
        to <- refs[ridx]
        #check if they are already matched by previous iterations
        tind <- match(from, targets_queue)
        rind <- match(to, refs_queue)
        #if new match then add them to the solution
        if(!is.na(tind) && !is.na(rind) )
        {
          df <- add_row(df, from = from, to = to)

          # pop the processed item
          refs_queue <- refs_queue[-rind]
          targets_queue <- targets_queue[-tind]
        }

      }#end for
    }

    ## add missing items to insertion list (only for keywords task at the moment)
    if(type == "keyword")
    {
      if (length(targets_queue) == 0 && length(refs_queue) > 0) {

        df <- add_row(df, from = NA, to = refs_queue)
        refs_queue <- character()

      }
    }

    #add those extra items to deletion list
    if (length(targets_queue) > 0 && length(refs_queue) == 0) {

      df <- add_row(df, from = targets_queue, to = NA)
      targets_queue <- character()

    }

    if(length(targets_queue) > 0 || length(refs_queue) > 0)
      unmatched_items <<- TRUE

    df
  }, .id = "group_id") %>% mutate(group_id = as.integer(group_id))

  # If unmatched channels remain for any group, add warning message
  # as work must be done before calling cqc_fix
  if(unmatched_items){
    warning(paste("Unmatched items remain after cqc_match. Before using cqc_fix, please resolve these unmatched items",
                  "manually using cqc_match_update/remove/delete_unmatched or re-attempt automatic matching with cqc_match with a larger max.distance argument."),
            call. = FALSE)
  }

  attr(res, "class") <- c("cqc_solution", attr(res, "class"))
  attr(res, "group") <- attr(x, "group")
  res
}
RGLab/cytoqc documentation built on Jan. 25, 2023, 11:05 p.m.