levenR: Simple Levenshtein Alignment and Distance Calculation

Documented in bestMismatch combineAligns complimentDna hamming .hamming leven levenAlign levenAll multiMismatch revComp reverseString

#' Simple Levenshtein alignments
#'
#' \tabular{ll}{
#' Package: \tab levenR\cr
#' Type: \tab Package\cr
#' Version: \tab 0.1\cr
#' Date: \tab 2011-11-16\cr
#' License: \tab CC BY-NC 3.0\cr
#' LazyLoad: \tab yes\cr
#' }
#' 
#' @name levenR-package
#' @aliases levenstein
#' @docType package
#' @title Simple Levenshtein alignments
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#' @references
#' \url{http://en.wikipedia.org/wiki/Levenshtein_distance}
#' @keywords alignment, string, distance, levenshtein
#' @seealso \code{Biostrings}
#' @examples
#' library(levenR)
#' leven('ACAC','ACACA')
#' levenAlign(c('ACCCTG','AACTG'),'ACTG')
NULL

#' Reverse a string
#'
#' Reverse a string e.g. ABCDE becomes EDCBA
#' @param strings Vector of strings to be revered
#' @return Vector of reversed strings
reverseString<-function(strings){
	output<-sapply(strsplit(strings,''),function(x)paste(rev(x),collapse=''))	
	return(output)
}

#' Compliment DNA
#'
#' Compliment a string of DNA e.g. A<->T,G<->C. Doesn't attempt to deal with ambigous bases.
#' @param dnas Vector of DNA sequences
#' @return Vector of complimented DNA strings
complimentDna<-function(dnas){
	finds<-'TGAC'
	replaces<-'ACTG'
	return(chartr(finds,replaces,dnas))
}

#' Reverse compliment dna
#'
#' Reverse compliment a string of DNA
#' @param dnas Vector of sequences
#' @return Vector of reverse complimented DNA strings
revComp<-function(dnas){
	return(complimentDna(reverseString(dnas)))
}

#' Find fewest mismatches for multiple patterns
#'
#' Find fewest mismatches (and position if desired) between pattern and string without allowing gaps. Could be programmed better but quick enough for now
#' @param patterns A vector of string patterns to search for in subject
#' @param subject A single subject string to search in
#' @param ... Additional arguments for bestMismatch
#' @param drop If TRUE return only return the first best match
#' @return Three column matrix with columns minMismatch, position, pattern number 
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#" @export
multiMismatch<-function(patterns, subject,drop=TRUE,...){
	matches<-do.call(rbind,mapply(function(x,y){matches<-bestMismatch(x,subject,TRUE,...);ifelse(is.null(dim(matches)),list(c(matches,y)),list(cbind(matches,y)))[[1]]},patterns,1:length(patterns),SIMPLIFY=FALSE))
	colnames(matches)<-c('mismatch','pos','number')
	rownames(matches)<-NULL
	if(drop) return(matches[matches[,1]==min(matches[,1]),,drop=FALSE][1,])
	else return(matches)
}

#' Find fewest mismatches
#'
#' Find fewest mismatches (and position if desired) between pattern and string without allowing gaps. Could be programmed better but quick enough for now.
#'
#' @param pattern String pattern to search for in subject
#' @param subject Subject string to search in
#' @param pos If TRUE return position in addition to distance
#' @param weights Integer weights for each position of pattern
#' @param findAll If TRUE return all best matches
#' @param dummyChar A character that does not appear in either string (only necessary if findAll==TRUE [and could probably program better])
#' @return Single value giving the least mismatch if pos=FALSE, c(minMismatches,position) if pos=TRUE, if findAll==TRUE matrix with columns minMismatches, position and rows for each occurrence
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#" @export
bestMismatch <- function(pattern, subject, pos=findAll, weights=rep(1,nchar(pattern)),findAll=FALSE,dummyChar='|'){
	if(nchar(pattern)>nchar(subject))stop(simpleError('Pattern longer than subject'))
	if(nchar(pattern)!=length(weights))stop(simpleError('Weights and pattern length do not match'))
	ans<-.C('bestMismatch',as.integer(c(-1,-1)),as.character(pattern),as.character(subject),as.integer(weights),PACKAGE='levenR')[[1]]
	if(any(ans<0))stop(simpleError('Something did not work in mismatch'))
	stillLooking<-TRUE;nextMatch<-out<-ans
	if(findAll){
		while(stillLooking&findAll){
			substring(subject,nextMatch[2],nextMatch[2]+nchar(pattern)-1)<-paste(rep(dummyChar,nchar(pattern)),collapse='')
			nextMatch=bestMismatch(pattern,subject,TRUE,weights)
			if(nextMatch[1]==ans[1])out<-rbind(out,nextMatch)
			else stillLooking<-FALSE
		}
		return(out)
	}else{
		if(pos)return(ans)
		else return(ans[[1]][1])
	}
}

#' Combine gapped alignments
#'
#' Combine several pairs of reference, query alignments
#'
#' @param refs The gapped reference sequences (one for each query)
#' @param aligns The gapped query sequences
#' @param starts If refs start at different locations add .'s so all start at 1
#' @return A vector of the aligned strings with gaps added
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#" @export
combineAligns<-function(refs,aligns,starts=NULL){
	nReads<-length(refs)
	if(nReads!=length(aligns))stop(simpleError('Length of refs and queries not same for aligning'))
	if(!is.null(starts)){
		dotDummy<-paste(rep('.',max(starts)),collapse='')
		refs<-sprintf('%s%s',substring(dotDummy,1,starts-1),refs)
		aligns<-sprintf('%s%s',substring(dotDummy,1,starts-1),aligns)
	}

	out<-rep('',length(aligns))

	readLengths<-nchar(refs)
	maxOutLength<-8*max(readLengths)
	out<-rep(paste(rep('Z',maxOutLength),collapse=''),nReads)

	ans<-.C('combineAligns',as.character(aligns),as.character(refs),as.character(out),as.integer(nReads),as.integer(readLengths),as.integer(maxOutLength),PACKAGE='levenR')[[3]]
	return(ans)
}


#' Align many strings to a reference
#'
#' Align strings to a reference string based on Levenshtein distance with ends-free or homopolymer alignments possible
#'
#' @param strings The query strings to be aligned
#' @param ref The reference to align to
#' @param homoLimit deletions or insertions in homopolymers > homoLimit cost 0
#' @param substring1 ends-free for query strings
#' @param substring2 ends-free for reference
#' @param prepend 1 or 2 for ends-free on front of string 1 or 2
#' @param append 1 or 2 for ends-free on back of string 1 or 2
#' @param trimOuterGaps Trim leading and trailing gaps
#' @param revComp Find and return the best alignment of identity or reverse compliment of queries
#' @param ... Additional arguments for levenAll
#' @return A list with first entry for the gapped reference and second entry all the aligned strings
#' @references \url{http://en.wikipedia.org/wiki/Levenshtein_distance}
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#" @export
levenAlign<-function(strings,ref,homoLimit=0,prepend=NULL,append=NULL,substring1=FALSE,substring2=FALSE,trimOuterGaps=FALSE,revComp=FALSE,...){
	if(length(ref)>1)stop(simpleError('Only a single reference supported'))
	if(substring1){append<-c(append,1);prepend<-c(prepend,1)}
	if(substring2){append<-c(append,2);prepend<-c(prepend,2)}
	append<-1:2 %in% append
	prepend<-1:2 %in% prepend
	aligns<-levenAll(strings,ref,homoLimit=homoLimit,prepend=prepend,append=append,align=TRUE,revComp=revComp,...)[[2]]
	refs<-c(ref,sapply(aligns,'[[',2))
	aligns<-c(ref,sapply(aligns,'[[',1))
	out<-combineAligns(refs,aligns)
	if(trimOuterGaps){
		refRange<-range(gregexpr('[^-.]',out[1])[[1]])
		nonGaps<-gregexpr('[^-.]',out[-1])
		alignRange<-c(min(sapply(nonGaps,min)),max(sapply(nonGaps,max)))
		trimRange<-c(max(refRange[1],alignRange[1]),min(refRange[2],alignRange[2]))
		out<-substring(out,trimRange[1],trimRange[2])
	}

	out<-list('ref'=out[1],'align'=out[-1])
	if(trimOuterGaps)attr(out,'start')<-trimRange[1]
	return(out)
}

#' Wrapper for .C Levenshtein function
#'
#' Align and calculate distance betweeen two strings based on Levenshtein distance with ends-free or homopolymer alignments possible (probably not called directly)
#'
#' @param string1 A single string or vector of strings
#' @param string2 Another single string or vector of strings
#' @param homoLimit deletions or insertions in homopolymers > homoLimit cost 0
# @param debug Output various debugging information #rprintf doesn't work from threads so just disable
#' @param prepend 1 or 2 for ends-free on front of string 1 or 2
#' @param append 1 or 2 for ends-free on back of string 1 or 2
#' @param align Should an alignment be calculated
#' @param revComp Should the reverse compliment of string1 also be tested and the best of identity or reverse compliment returned?
#' @param nThreads If threads >1, run nThreads threads in parallel 
#' @return Either a distance or if align==TRUE a list with distance and alignment
#' @references \url{http://en.wikipedia.org/wiki/Levenshtein_distance}
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#" @export
levenAll <- function(string1, string2,homoLimit=0,prepend=NULL,append=NULL,revComp=FALSE,align=FALSE,nThreads=1) {
  debug<-FALSE #rprintf doesn't work from threads so just set to FALSE
	if(is.null(string1)||is.null(string2))stop(simpleError('Null vectors received for alignment'))
	if(is.null(prepend))prepend<-c(FALSE,FALSE)
	if(is.null(append))append<-c(FALSE,FALSE)
	if(length(append)!=2&length(prepend!=2))stop(simpleError('Specify prepend and append as a vector of two logicals'))
	if(any(!is.logical(c(prepend,append))))stop(simpleError('Specify prepend and append as a vector of two logicals'))
	if(any(prepend & rev(append)))runs<-list(c(prepend[1],FALSE,append[1],FALSE),c(FALSE,prepend[2],FALSE,append[2]))
	else runs<-list(c(prepend,append))
	if(revComp)runs<-c(lapply(runs,function(x)c(x,FALSE)),lapply(runs,function(x)c(x,TRUE)))
	else runs<-lapply(runs,function(x)c(x,FALSE))

	if(any(is.na(string1))||any(is.na(string2)))stop(simpleError('NA string in levenAll'))

	if(align){
		n1<-nchar(string1)
		n2<-nchar(string2)
		isAlign<-1
		align1<-as.vector(outer(n1,n2,FUN=function(xx,yy)mapply(function(x,y)paste(rep('Z',2*(x+y)),collapse=''),xx,yy)))
		align2<-as.vector(outer(n1,n2,FUN=function(xx,yy)mapply(function(x,y)paste(rep('Z',2*(x+y)),collapse=''),xx,yy)))
	}else{
		isAlign<-0
		align1<-rep('',length(string1)*length(string2))
		align2<-rep('',length(string1)*length(string2))
	}
	out<-rep(-99,length(string1)*length(string2))
	nStrings<-c(length(string1),length(string2))
	ans<-lapply(runs,function(x).C('parallelLeven',as.integer(out),as.character(if(x[5]) revComp(string1) else string1),as.character(string2),as.integer(nStrings),as.integer(homoLimit),as.integer(x[1:2]),as.integer(x[3:4]),as.integer(debug),as.integer(nThreads),as.integer(isAlign),as.character(align1),as.character(align2),PACKAGE='levenR')[c(1,11,12)])
	dists<-do.call(rbind,lapply(ans,'[[',1))
	minDists<-apply(dists,2,min)
	if(!align){
		output<-matrix(minDists,nrow=nStrings[1],ncol=nStrings[2])
	}else{
		aligns<-lapply(ans,'[',2:3)
		selects<-apply(dists,2,which.min)
		#pull out the appropriate alignments
		bestAligns<-mapply(function(x,y)c(aligns[[y]][[1]][x],aligns[[y]][[2]][x]),1:length(selects),selects,SIMPLIFY=FALSE)
		output<-list('dist'=minDists,'align'=bestAligns)
	}
		
	return(output)
}

#' Find Levenshtein distance between many strings
#' @param strings1 vector of strings to compare to strings2 (or as strings1 x strings1 if strings2==NULL)
#' @param strings2 vector of strings to compare to strings1 or NULL (either strings1 or strings2 should be 1 or should have equal lengths)
#' @param oneToOne compare strings1[1] to strings2[1], strings1[2] to strings2[2],... (both vectors must be same length)
#' @param homoLimit deletions or insertions in homopolymers > homoLimit cost 0
#' @param vocal Integer to display a status message every vocal iterations
# @param debug If TRUE display various debugging information #rprintf doesnt work from threads so just remove debug
#' @param prepend 1 or 2 for ends-free on front of string 1 or 2
#' @param append 1 or 2 for ends-free on back of string 1 or 2
#' @param substring1 ends-free for strings1
#' @param substring2 ends-free for strings2
#' @param nThreads Run using threads?
#' @return Levenshtein distance matrix
#' @references \url{http://en.wikipedia.org/wiki/Levenshtein_distance}
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#" @export
leven<-function(strings1,strings2=NULL,oneToOne=FALSE,homoLimit=0,vocal=0,prepend=NULL,append=NULL,substring1=FALSE,substring2=FALSE,nThreads=1){
  debug<-FALSE #rprintf doesn't work from threads so just set to FALSE
	if(nThreads>1&oneToOne)warnings('oneToOne not currently compatible with parallel processing')
	if(any(!prepend %in% 1:2)|any(!append %in% 1:2))stop(simpleError('Specify prepend and append with 1 or 2'))
	if(is.null(strings2)){
		strings2<-strings1
	}
	nStrings1=length(strings1)
	nStrings2=length(strings2)
	if(oneToOne){
		if(nStrings1!=nStrings2)stop(simpleError('Length of strings1 and strings2 not equal for 1 to 1 dists'))
		nStrings2<-1
	}
	if(substring1){append<-c(append,1);prepend<-c(prepend,1)}
	if(substring2){append<-c(append,2);prepend<-c(prepend,2)}
	append<-1:2 %in% append
	prepend<-1:2 %in% prepend

	#doesn't seem to be any speed benefit to using lapply or doing the looping in C (strange)
	if(oneToOne){
		distMat<-rep(NA,nStrings1)
		for(i in 1:nStrings1){
			if(vocal[1]>0&i%%vocal[1]==0)message("Working on string ",i)
			distMat[i]<-levenAll(strings1[i],strings2[i],homoLimit=homoLimit,append=append,prepend=prepend)
		}
	}else{
		distMat<-matrix(NA,nrow=nStrings1,ncol=nStrings2)
		if(nThreads==1){
			for(i in 1:nStrings1){
				for(j in 1:nStrings2){
					distMat[i,j]<-levenAll(strings1[i],strings2[j],homoLimit=homoLimit,append=append,prepend=prepend)
				}
			}
		}else{
			distMat<-levenAll(strings1,strings2,homoLimit=homoLimit,append=append,prepend=prepend,nThreads=nThreads)
		}
	}
	return(distMat)
}

#' Find hamming distance between many strings
#' @param strings1 vector of strings to compare to strings2 (or as strings1 x strings1 if strings2==NULL)
#' @param strings2 vector of strings to compare to strings1 or NULL (either strings1 or strings2 should be 1 or should have equal lengths)
#' @param oneToOne compare strings1[1] to strings2[1], strings1[2] to strings2[2],... (both vectors must be same length)
#' @param cutoff an integer giving a distance to stop checking beyond and instead return the cutoff value
#' @param vocal Integer to display a status message every vocal iterations
#' @return Hamming distance matrix
#' @references \url{http://en.wikipedia.org/wiki/Hamming_distance}
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#" @export
hamming<-function(strings1,strings2=NULL,oneToOne=FALSE,cutoff=99999,vocal=0){
	if(is.null(strings2)){
		strings2<-strings1
	}
	nStrings1=length(strings1)
	nStrings2=length(strings2)
	if(oneToOne){
		if(nStrings1!=nStrings2)stop(simpleError('Length of strings1 and strings2 not equal for 1 to 1 dists'))
		nStrings2<-1
	}
	if(oneToOne){
		distMat<-rep(NA,nStrings1)
		for(i in 1:nStrings1){
			if(vocal[1]>0&i%%vocal[1]==0)message("Working on string ",i)
			distMat[i]<-.hamming(strings1[i],strings2[i],cutoff=cutoff)
		}
	}else{
		distMat<-matrix(NA,nrow=nStrings1,ncol=nStrings2)
    for(i in 1:nStrings1){
			if(vocal[1]>0&i%%vocal[1]==0)message("Working on string ",i)
      for(j in 1:nStrings2){
        distMat[i,j]<-.hamming(strings1[i],strings2[j],cutoff=cutoff)
      }
    }
	}
	return(distMat)
}

#' Wrapper for .C Hamming function
#'
#' Align and calculate distance betweeen two strings based on Hamming distance
#'
#' @param string1 A single string or vector of strings
#' @param string2 Another single string or vector of strings
#' @param cutoff an integer giving a distance to stop checking beyond and instead return the cutoff value
#' @return the hamming distance between two strings
#' @references \url{http://en.wikipedia.org/wiki/Hamming_distance}
#' @author Scott Sherrill-Mix \email{R@@sherrillmix.com}
#' @examples
#' levenR:::.hamming('aazaazaaz','aayaayaay')
#' levenR:::.hamming('aazaazaaz','aayaayaay',cutoff=2)
#' levenR:::.hamming('ss','ss')
.hamming <- function(string1, string2,cutoff=99999){
	if(is.null(string1)||is.null(string2))stop(simpleError('Null vectors received for alignment'))
	if(any(is.na(string1))||any(is.na(string2)))stop(simpleError('NA string in .hamming'))
  if(nchar(string1)!=nchar(string2))stop('String lengths do not match in .hamming')
	ans<-.C('hamming',as.integer(c(-99)),as.character(string1),as.character(string2),as.integer(nchar(string1)),as.integer(cutoff),PACKAGE='levenR')
  return(ans[[1]][1])
}
sherrillmix/levenR documentation built on Oct. 25, 2023, 11:42 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
sherrillmix/levenR
Simple Levenshtein Alignment and Distance Calculation

R/leven.R
In sherrillmix/levenR: Simple Levenshtein Alignment and Distance Calculation

Defines functions .hamming hamming leven levenAll levenAlign combineAligns bestMismatch multiMismatch revComp complimentDna reverseString

Documented in bestMismatch combineAligns complimentDna hamming .hamming leven levenAlign levenAll multiMismatch revComp reverseString

R Package Documentation

Browse R Packages

We want your feedback!

sherrillmix/levenR Simple Levenshtein Alignment and Distance Calculation

R/leven.R In sherrillmix/levenR: Simple Levenshtein Alignment and Distance Calculation

Defines functions .hamming hamming leven levenAll levenAlign combineAligns bestMismatch multiMismatch revComp complimentDna reverseString

Documented in bestMismatch combineAligns complimentDna hamming .hamming leven levenAlign levenAll multiMismatch revComp reverseString

R Package Documentation

Browse R Packages

We want your feedback!

sherrillmix/levenR
Simple Levenshtein Alignment and Distance Calculation

R/leven.R
In sherrillmix/levenR: Simple Levenshtein Alignment and Distance Calculation