R/similarity.R

Defines functions motifSimilarity nputPFMfromMatrixOrPWM tryAllMotifAlignments maxAligned

Documented in motifSimilarity

## functions to calculate motif similarity metrics

#' Returned the aligned motif parts
#'
#' This function takes the offset of first motif relative to second and 
#' chops off the end of both motifs that are not aligned. It returns a list
#' containing only the columns that align. 
#'
#' @param m1 frequency matrix of first motif
#' @param m2 frequency matrix of second motif
#' @param offset a number of nucleotides by which the first motif is offsetted compared to the second
#' @return a list of column-trimmed motifs m1, m2
maxAligned = function(m1, m2, offset){
	if(offset > 0){
		# need to cut off beginning of m2
		m2 = m2[,(offset+1):ncol(m2)]
	} else if(offset < 0){
		# net to cut off beginning of m1
		m1 = m1[,(abs(offset)+1):ncol(m1)]
	}
	
	# make sure we keep matrix 
	if(is.vector(m1))
		m1 = matrix(m1, ncol=1)
	if(is.vector(m2))
		m2 = matrix(m2, ncol=1)
	
	# maximal number of columns that align
	max.col = min(ncol(m1),ncol(m2))
	
	m1 = m1[,1:max.col]
	m2 = m2[,1:max.col]
	
	# make sure we keep it in matrix
	if(is.vector(m1))
		m1 = matrix(m1, ncol=1)
	if(is.vector(m2))
		m2 = matrix(m2, ncol=1)
	
	list(m1, m2)
}

#' Try all motif alignments and return max score
#'
#' This function tries all offsets of motif1 compared to motif2 and returns
#' the maximal (unnormalized) correlation score. 
#'
#' The correlation score is essentially the sum of correlations of individual aligned
#' columns as described in Pietrokovski (1996).
#'
#' @param m1 frequency matrix of motif 1
#' @param m2 frequency matrix of motif 2
#' @param min.align minimal number of basepairs that need to align
#' @param exclude.zero if to exclude offset=0, useful for calculating self-similarity
#' @references Pietrokovski S. Searching databases of conserved sequence regions by aligning protein multiple-alignments. Nucleic Acids Res 1996;24:3836-3845.
#' @return single maximal score
tryAllMotifAlignments = function(m1, m2, min.align=2, exclude.zero=FALSE){
	offset = (-ncol(m1)+min.align):(ncol(m2)-min.align)
	
	if(exclude.zero)
		offset = setdiff(offset, 0)
	
	scores = rep(0, length(offset))
	for(i in 1:length(offset)){
		# get aligned matrices
		aligned = maxAligned(m1, m2, offset[i])
		# convert to probabilities
		aligned.norm = lapply(aligned, function(x) divideRows(x, colSums(x)))
		# scores for first alignment (sum of column-wise correlations)
		scores[i] = sum(sapply(1:ncol(aligned.norm[[1]]), function(i) cor(aligned.norm[[1]][,i], aligned.norm[[2]][,i])))
		
	}
	
	max(scores)
}

#' Check the frequency matrix input parameter for motifSimilarity
#'
#' @param m either a PWM object or a matrix
#' @return corresponding PFM
.inputPFMfromMatrixOrPWM = function(m){
	if(class(m) == "PWM")
		return(m@pfm)
	else if(class(m) == "matrix")
		return(m)
	else
		stop("Input motif needs to be either of class PWM, or a frequency matrix")
}

#' Calculates similarity between two PFMs. 
#'
#' This function calculates the normalized motif correlation as a measure of motif
#' frequency matrix similarity. 
#'
#' This score is essentially a normalized version of the sum of column correlations
#' as proposed by Pietrokovski (1996). The sum is normalized by the average motif
#' length of m1 and m2, i.e. (ncol(m1)+ncol(m2))/2. Thus, for two idential motifs
#' this score is going to be 1. For unrelated motifs the score is going to be typically 
#' around 0. 
#'
#' Motifs need to aligned for this score to be calculated. The current implementation
#' tries all possible ungapped alignment with a minimal of two basepair matching, and 
#' the maximal score over all alignments is returned.
#'
#' Motif 1 is aligned both to Motif 2 and its reverse complement. Thus, the motif
#' similarities are the same if the reverse complement of any of the two motifs
#' is given.  
#'
#' @param m1 matrix with four rows representing the frequency matrix of first motif
#' @param m2 matrix with four rows representing the frequency matrix of second motif
#' @param trim bases with information content smaller than this value will be trimmed off both motif ends
#' @param self.sim if to calculate self similarity (i.e. without including offset=0 in alignment)
#' @references Pietrokovski S. Searching databases of conserved sequence regions by aligning protein multiple-alignments. Nucleic Acids Res 1996;24:3836-3845.
#' @export
#' @examples
#' 
#' if(require("PWMEnrich.Dmelanogaster.background")){
#'    data(MotifDb.Dmel.PFM)
#'
#'    # calculate the similarity of tin and vnd motifs (which are almost identical)
#'    motifSimilarity(MotifDb.Dmel.PFM[["tin"]], MotifDb.Dmel.PFM[["vnd"]])
#'
#'    # similarity of two unrelated motifs
#'    motifSimilarity(MotifDb.Dmel.PFM[["tin"]], MotifDb.Dmel.PFM[["ttk"]])
#' }
motifSimilarity = function(m1, m2, trim=0.4, self.sim=FALSE){
	# check input types
	m1 = .inputPFMfromMatrixOrPWM(m1)
	m2 = .inputPFMfromMatrixOrPWM(m2)

	# remove any zero-variance columns
	colSd1 = apply(m1, 2, sd)
	colSd2 = apply(m2, 2, sd)
	
	if(any(colSd1 == 0))
		m1 = m1[,colSd1!=0]

	if(any(colSd2 == 0))
		m2 = m2[,colSd2!=0]
		
	# trim bases with small motif content
	ic1 = motifIC(m1, bycol=TRUE)
	ic2 = motifIC(m2, bycol=TRUE)
	
	# find which columns to keep
	r1 = range(which(ic1 >= trim))
	r2 = range(which(ic2 >= trim))
	
	m1 = m1[,r1[1]:r1[2]]
	m2 = m2[,r2[1]:r2[2]]

	# get max score
	if(self.sim){
		# we don't want offset=0 because then the score would always be 1
		score = tryAllMotifAlignments(m1, m2, exclude.zero=TRUE)
	} else {
		score = max(tryAllMotifAlignments(m1,m2), 
			tryAllMotifAlignments(m1,reverseComplement(m2)))
	}
	
	# normalize the max score by mean motif size
	return(score / mean(c(ncol(m1), ncol(m2))))
	
}

Try the PWMEnrich package in your browser

Any scripts or data that you put into this service are public.

PWMEnrich documentation built on May 2, 2018, 4:20 a.m.