R/findCloseMatch.R

Defines functions .compareByDiff .compareByLogRatio .compareByPPM findCloseMatch

Documented in .compareByDiff .compareByLogRatio .compareByPPM findCloseMatch

#' Find close numeric values between two vectors
#'
#' \code{findCloseMatch} finds close matches (similar values) between two numeric vectors ('x','y') based on method 'compTy' and threshold 'limit'. 
#'  Return list with close matches of 'x' to given 'y', the numeric value dependes on 'sortMatch' (if FALSE then always value of 'y' otherwise of longest of x&y).  
#'  Note: Speed & memory improvement if 'sortMatch'=TRUE (but result might be inversed!): adopt search of x->y or y->x to searching matches of each longest to each shorter  (ie flip x &y).
#'  Otherwise, if length of 'x' & 'y' are very different, it may be advantagous to use a long(er) 'x'  and short(er) 'y' (with 'sortMatch'=FALSE).  
#'  Note: Names of 'x' & 'y' or (if no names) prefix letters 'x' & 'y' are always added as names to results.
# ' Note: Takes much RAM if x & y are large 
#'
#' @param x numeric vector for comparison
#' @param y numeric vector for comparison
#' @param compTy (character) may be 'diff' or 'ppm', will be used with threshold from argument 'limit'
#' @param limit (numeric) threshold value for retaining values, used with distace-type specified in argument 'compTy'
#' @param asIndex (logical) optionally rather report index of retained values
#' @param maxFitShort (numeric) limit output to max number of elements (avoid returning high number of results if filtering was not enough stringent)
#' @param sortMatch (logical) if TRUE than matching will be preformed as 'match longer (of x & y) to closer', this may process slightly faster (eg 'x' longer: list for each 'y' all 'x' that are close, otherwise list of each 'x'), 
#' @param silent (logical) suppress messages
#' @param debug (logical) display additional messages for debugging
#' @param callFrom (character) allow easier tracking of message(s) produced
#' @return This function returns a list with close matches of 'x' to given 'y', the numeric value dependes on 'sortMatch' (if FASLE then always value of 'y' otherwise of longest of x&y)
#' @seealso \code{\link{checkSimValueInSer}} and (from this package) \code{.compareByDiff}, for convient output \code{\link{countCloseToLimits}} 
#' @examples
#' aa <- 11:14 ; bb <- c(13.1,11.5,14.3,20:21)
#' findCloseMatch(aa,bb,com="diff",lim=0.6)
#' findCloseMatch(c(a=5,b=11,c=12,d=18),c(G=2,H=11,I=12,J=13)+0.5, comp="diff", lim=2)
#' findCloseMatch(c(4,5,11,12,18),c(2,11,12,13,33)+0.5, comp="diff", lim=2)
#' findCloseMatch(c(4,5,11,12,18),c(2,11,12,13,33)+0.5, comp="diff", lim=2, sort=FALSE)
#' .compareByDiff(list(c(a=10,b=11,c=12,d=13),c(H=11,I=12,J=13,K=33)+0.5),limit=1) #' return matrix
#' 
#' a2 <- c(11:20); names(a2) <- letters[11:20]
#' b2 <- c(25:5)+c(rep(0,5),(1:10)/50000,rep(0,6)); names(b2) <- LETTERS[25:5]
#' which(abs(b2-a2[8]) < a2[8]*1e-6*5)                                     #'  find R=18 : no10
#' findCloseMatch(a2, b2, com="ppm", lim=5)                                #'  find Q,R,S,T
#' findCloseMatch(a2, b2, com="ppm", lim=5,asI=TRUE)                       #'  find Q,R,S,T
#' findCloseMatch(b2, a2, com="ppm", lim=5,asI=TRUE,sort=FALSE)
#' findCloseMatch(a2, b2, com="ratio", lim=1.000005)                       #'  find Q,R,S,T
#' findCloseMatch(a2, b2, com="diff", lim=0.00005)                         #'  find S,T
#' @export
findCloseMatch <- function(x, y, compTy="ppm", limit=5, asIndex=FALSE, maxFitShort=100, sortMatch=FALSE, silent=FALSE, debug=FALSE, callFrom=NULL){
  fxNa <- .composeCallName(callFrom,newNa="findCloseMatch")
  if(!isTRUE(silent)) silent <- FALSE
  if(isTRUE(debug)) silent <- FALSE else debug <- FALSE
  compMeth <- c("ppm","diff","ratio")
  msg <- c(paste("argument 'compTy' may be one of",pasteC(compMeth,qu="'",la=" or "))," , trimming to length=1")
  if(length(compTy) <1) stop(msg[1]) else if(length(compTy)>1) {compTy <- compTy[1]; message(msg)}
  if(!compTy %in% compMeth) stop(msg[1])
  if(is.null(names(x))) names(x) <- paste0("x",equLenNumber(1:length(x)))                   # need default names to know which of 'x' in results !
  if(is.null(names(y))) names(y) <- paste0("y",equLenNumber(1:length(y)))                   #
  if(any(names(x) %in% names(y))) message(fxNa,"Note that some names do overlap (beware not to confuse...) !")
  dat <- list(x,y)
  if(sortMatch) dat <- dat[order(sapply(dat,length), decreasing=FALSE)]
  if(is.character(maxFitShort)) if(length(grep("%$", maxFitShort)) >0) {
    maxFitShort <- ceiling(length(dat[[1]])*as.numeric(sub("%$","",maxFitShort))/100)
    } else stop("can't interpret 'maxFitShort'")
  tm2 <- switch(compTy,
    ppm=.compareByPPM(dat, limit, distVal=!asIndex),
    diff=.compareByDiff(dat, limit, distVal=!asIndex),
    ratio=.compareByLogRatio(dat, limit, distVal=!asIndex))
  if(debug) {message(fxNa,"fCM1")}  
  cSu <- colSums(!is.na(tm2))
  cPi <- which(cSu >0)
  if(length(cPi) <1) return(NULL) else {
    if(any(cSu > maxFitShort)) {     # case of fitting close to (very) large number of elements -> keep maxFitShort lowest (+ set others to NA)
      if(!silent) message(fxNa,sum(cSu > maxFitShort,na.rm=TRUE)," column elements at too many 'close elements' try to reduce ..")
      for(i in 1:which(cSu > maxFitShort)) {limDis <- sort(tm2[,i], decreasing=TRUE, na.last=TRUE)[maxFitShort]
        tmX <- which(tm2[,i] <= limDis)
        if(nrow(tm2) > maxFitShort && length(tmX) >= min(maxFitShort*1.15, floor(nrow(tm2)*0.96))) tm2[,i] <- NA else tm2[which(tm2[,i] >limDis),i] <- NA}
      cPi <- which(colSums(!is.na(tm2)) >0)        # refresh
    }
    if(debug) {message(fxNa,"fCM2")}  
    rSu <- rowSums(!is.na(tm2))     
    if(any(rSu > maxFitShort)) {               # case of fitting close to (very) large number of elements -> keep maxFitShort lowest (+ set others to NA)
      if(!silent) message(fxNa,sum(rSu > maxFitShort ,na.rm=TRUE)," row elements at too many 'close elements' try to reduce ..")
      for(i in 1:which(rSu > maxFitShort)) {limDis <- sort(tm2[i,],decreasing=TRUE,na.last=TRUE)[maxFitShort]
        tmY <- which(tm2[i,] <= limDis)
        if(ncol(tm2) > maxFitShort & length(tmY) >= min(maxFitShort*1.15,floor(ncol(tm2)*0.96))) tm2[i,] <- NA else tm2[i,which(tm2[i,] >limDis)] <- NA}
      rPi <- which(rowSums(!is.na(tm2)) >0)        # refresh (rows with some distance values)
    } else rPi <- which(rSu >0)
    if(debug) {message(fxNa,"fCM3")}  
    zz <- if(nrow(tm2) >1) as.matrix(tm2[,cPi])[rPi,] else matrix(tm2[,cPi], nrow=1, dimnames=list(rownames(tm2),colnames(tm2)[cPi]))
    if(length(dim(zz)) <2) zz <- matrix(zz, nrow=length(rPi))
    if(is.null(colnames(zz)) || is.null(rownames(zz))) dimnames(zz) <- list(rownames(tm2)[rPi], colnames(tm2)[cPi])
    out <- if(asIndex) {if(length(cPi) >1) apply(!is.na(zz), 2, which) else which(!is.na(zz))         #list of indexes   dat,function(z) which(z))
    } else {if(length(cPi) >1) apply(zz, 2, naOmit) else naOmit(zz)} 
    if(!is.list(out)) {out <- as.list(out); zn <- rownames(zz)[apply(!is.na(zz),2,which)]; for(j in 1:length(out)) names(out[[j]]) <- zn[j]} 
    if(is.null(names(out))) names(out) <- rep(names(cPi), length(out))[1:length(out)]
  out }}
 
#' Compare by PPM
#'
#' This function allows to compare by ppm
#'
#' @param dat list of 2 numerical vectors
#' @param limit (numeric, length=1) threshold value for retaining values, used with distace-type specified in argument 'compTy'
#' @param distVal (logical) to toggle outpout as matrix of numeric (distance values above 'limit', others NA) or matrix of logical 
#' @return This function returns a list with close matches of 'x' to given 'y', the numeric value dependes on 'sortMatch' (if FASLE then always value of 'y' otherwise of longest of x&y)
#' @seealso \code{\link{findCloseMatch}},  \code{\link{checkSimValueInSer}}, and also \code{.compareByDiff}, for convient output \code{\link{countCloseToLimits}} 
#' @examples
#' cc <- list(aa=11:14, bb=c(13.1,11.5,14.3,20:21))
#' .compareByPPM(cc, 1) 
#' @export
.compareByPPM <- function(dat, limit, distVal=FALSE){
  ## compare both vectors from list 'dat' for similar values based on ppm
  ## 'dat' .. list of 2 numerical vectors
  ## 'distVal'.. (logical) to toggle outpout as matrix of numeric (distance values above 'limit', others NA) or matrix of logical
  ## 'limit'.. (numeric, length=1) threshold to be applied
  ## return logical matrix with rows for 1st & cols for 2nd element of dat  (used in findCloseMatch() )
  ref <- matrix(rep(dat[[1]],each=length(dat[[2]])), nrow=length(dat[[2]]), dimnames=list(names(dat[[2]]),names(dat[[1]])))       # matr of short
  que <- matrix(rep(dat[[2]],length(dat[[1]])), nrow=length(dat[[2]]), dimnames=list(names(dat[[2]]),names(dat[[1]])))
  if(distVal) {tm2 <- (2*(que <= ref) -1)*abs(que/ref -1)/1e-6; 
    tm2[which(!abs(que/ref -1) < limit*1e-6)] <- NA
  } else tm2 <- abs(que/ref -1) < limit*1e-6
  tm2 }

#' Compare by log-ratio
#'
#' This function allows to compare by log-ratio
#'
#' @param dat list of 2 numerical vectors
#' @param limit (numeric, length=1) threshold value for retaining values, used with distace-type specified in argument 'compTy'
#' @param distVal (logical) to toggle outpout as matrix of numeric (distance values above 'limit', others NA) or matrix of logical 
#' @return This function returns a list with close matches of 'x' to given 'y', the numeric value dependes on 'sortMatch' (if FASLE then always value of 'y' otherwise of longest of x&y)
#' @seealso  \code{\link{findCloseMatch}},  \code{\link{checkSimValueInSer}}, and also \code{.compareByDiff}, for convient output \code{\link{countCloseToLimits}} 
#' @examples
#' cc <- list(aa=11:14, bb=c(13.1,11.5,14.3,20:21))
#' .compareByLogRatio(cc, 1) 
#' @export
.compareByLogRatio <- function(dat, limit, distVal=FALSE){
  ## compare both vectors from 'dat' for similar values based on (log)ratio
  ## 'dat' .. list of 2 numerical vectors
  ## 'distVal'.. (logical) to toggle outpout as matrix of numeric (distance values above 'limit', others NA) or matrix of logical
  ## 'limit'.. (numeric, length=1) threshold to be applied
  ## return logical matrix of  (used in findCloseMatch() )
  ref <- matrix(rep(dat[[1]], each=length(dat[[2]])), nrow=length(dat[[2]]), dimnames=list(names(dat[[2]]),names(dat[[1]])))       # matr of short
  que <- matrix(rep(dat[[2]], length(dat[[1]])), nrow=length(dat[[2]]), dimnames=list(names(dat[[2]]),names(dat[[1]])))
  if(distVal) {tm2 <- abs(log2(ref/que)); tm2[which(!abs(log2(ref/que)) <= log2(abs(limit)))] <- NA
  } else tm2 <- abs(log2(ref/que)) <= log2(abs(limit))
  tm2 }

#' Compare by distance/difference
#'
#' This function allows to compare by distance/difference
#'
#' @param dat list of 2 numerical vectors
#' @param limit (numeric, length=1) threshold value for retaining values, used with distace-type specified in argument 'compTy'
#' @param distVal (logical) to toggle outpout as matrix of numeric (distance values above 'limit', others NA) or matrix of logical 
#' @return This function returns a list with close matches of 'x' to given 'y', the numeric value dependes on 'sortMatch' (if FASLE then always value of 'y' otherwise of longest of x&y)
#' @seealso  \code{\link{findCloseMatch}},  \code{\link{checkSimValueInSer}}, and also \code{.compareByLogRatio}, for convient output \code{\link{countCloseToLimits}} 
#' @examples
#' cc <- list(aa=11:14, bb=c(13.1,11.5,14.3,20:21))
#' @export
.compareByDiff <- function(dat, limit, distVal=FALSE){
  ## compare both vectors from 'dat' for similar values based on (absolute) difference
  ## 'dat' .. list of 2 numerical vectors
  ## 'distVal'.. (logical) to toggle outpout as matrix of numeric (distance values above 'limit', others NA) or matrix of logical
  ## 'limit'.. (numeric, length=1) threshold to be applied
  ## return logical matrix of  (used in findCloseMatch() )
  ref <- matrix(rep(dat[[1]], each=length(dat[[2]])), nrow=length(dat[[2]]), dimnames=list(names(dat[[2]]),names(dat[[1]])))       # matr of short
  que <- matrix(rep(dat[[2]], length(dat[[1]])), nrow=length(dat[[2]]), dimnames=list(names(dat[[2]]),names(dat[[1]])))
  if(distVal) {
    tm2 <- que-ref
    chLi <- abs(tm2) > abs(limit)
    if(any(chLi)) tm2[which(chLi)] <- NA
  } else tm2 <- abs(ref -que) <= abs(limit)
  tm2 }
   

Try the wrMisc package in your browser

Any scripts or data that you put into this service are public.

wrMisc documentation built on Nov. 17, 2023, 5:09 p.m.