R/pwm_io.R
In cobindR: Finding Co-occuring motifs of transcription factor binding sites

Documented in input.pfm.similarity kl.dist lcm.vector normalize.matrix.colSum pfm2pwm pfm.similarity query.motifDb query.motifDb read.transfac.pfm

# TODO: IO functions for jaspar and transfac position frequency matrices (pfm) and positions weight matrices (pwm)  which can be used later in the analysis 
# 
# Author: rob <r.lehmann@biologie.hu-berlin.de>
# Author: stefan <kroeger@informatik.hu-berlin.de>
###############################################################################
##for conversion from PFM to PWM (MEME format)

## function to read the pfm files from data folder 
## and convert it to MEME PWM format (http://meme.sdsc.edu/meme/examples/sample-dna-motif.meme-io)
# 2013-01-09: Is this an internal function?
setMethod("read.pfm", signature(x="configuration"),
function(x) {
    file_path = x@pfm_path
	
	if(any(grep('^motifdb$',file_path, ignore.case=TRUE))) 
		return(query.motifDb(x))
	
	# find all files in the given folder
	pfm.files = list.files(file_path, pattern ="*.*", full.names=F, ignore.case = TRUE)
	if(length(pfm.files) < 1) {
		warning('no files in pfm_path')
		return(list())
	}
	# create empty list of pwms
	pwms.list = list()
    count.pwms = 1
	pfm.read.errors = NULL
	
	cat('\n reading pfm files:',file_path,'...\n')
	for(i in 1:length(pfm.files)){
		tmp.path = file.path(file_path,pfm.files[i])
		# test which type of file this is
		ending = tail(unlist(strsplit(pfm.files[i], '\\.')), 1)
		switch(ending,
			# case one: it's a *.pfm jaspar file (which has only one matrix as entry)
			pfm = {
				tmp.pfm = apply(as.matrix(read.table(tmp.path)),c(1,2), as.numeric);
				pfm.id = sub(".pfm","", pfm.files[i])
				rownames(tmp.pfm) = c("A", "C", "G", "T")
				pwms.list = c(pwms.list,list(tmp.pfm))
				names(pwms.list)[length(pwms.list)] = pfm.id
			},
			# case two: it's a *.cm jaspar files (which also has the row names within)	
			cm  = {
				tmp.pfm = apply(as.matrix(read.table(tmp.path), row.names=1), c(1,2), as.numeric);
				pfm.id = sub(".cm","", pfm.files[i])
				rownames(tmp.pfm) = c("A", "C", "G", "T")
				pwms.list = c(pwms.list,list(tmp.pfm))
				names(pwms.list)[length(pwms.list)] = pfm.id
			},
			# case three: it's a *.sites jaspar file (not yet implemented)
			sites = {warning('The option of reading *.sites files is not yet implemented. The file is ignored ...')
			},
			# case: it's a *.tfpfm file that contains multiple matrices
			tfpfm ={
				tmp.pfm = read.transfac.pfm(tmp.path)
# 				pfm.id = sub(".tfpfm","", pfm.files[i])
				pwms.list = c(pwms.list,tmp.pfm)
			},
			# the default case - it's a transfac-file, which might have multiple matrices 
			{
				tmp.pfm = read.transfac.pfm(tmp.path)
				cat(pfm.files[i],' ignored - unknown PFM file ending.' )
				pfm.read.errors = c(pfm.read.errors, c(pfm.files[i],"\n"))
				next; #advance loop to next file
			}
		)
	}
	cat('ignored files:\n', pfm.read.errors,'\n')
	return(pwms.list)
}
)

# function queries motifDb package for pfms which are specified
# in pairs of the configuration file 
# call motif by unique name in MotifDb via names(MotifDb)
query.motifDb <- function(x) {
	if(!require('MotifDb')) {
		warning('package MotifDb is reqired. Aborting...')
		return(list())
	}
	mat.names <- unique(unlist(sapply(x@pairs, function(p) strsplit(p,' '))))
	cat('accessing MotifDb package for the requested ',length(mat.names),' PFMs...\n')
	mat.idx <- c()
	for(ma in mat.names) {
		mat.idx <- c(mat.idx, grep(ma, names(MotifDb)))
	}
	return(as.list(MotifDb[unique(mat.idx)]))
# 	return(as.list(MotifDb[unique(match(mat.names, names(MotifDb)))]))
}

# function reads transfac's pfm format files and returns corresponding matrix
# 2013-01-09: This function does not belong to any class.
#             Is this an internal function?             
read.transfac.pfm = function(path) {
	con = file(path) 
	open(con)
	# list of pfms read
	res = list()
	pfm = NA
	# iterate through file
	in.pfm = FALSE
	while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
          if(any(grep('^//$', line, ignore.case=T))){ # new pfm section starts
            if(any(!is.na(pfm))) # save previous pfm
              res[[id]] = pfm
            pfm = matrix(NA, nrow=4, ncol=0) # initialize new pfm
			rownames(pfm) = c('A','C','G','T')
          }
          
          if(any(grep('^P0', line, ignore.case=T))) {#pfm section starts
            in.pfm = TRUE
            next
          }
          if(any(grep('^ID', line, ignore.case=T))){ #pfm name
            id = unlist(strsplit(line, split=" "))[2]
            print(id)
          }
          if(in.pfm & any(grep('^XX$', line, ignore.case=T))) #pfm section ends
            in.pfm = FALSE
          if(in.pfm)
            pfm = cbind(pfm, as.numeric(unlist(strsplit(line, split=" "))[2:5]))
	}

        # save the last/only pfm (otherwise, they are not in the list)
        res[[id]] <- pfm
        
	close(con)
# 	if(ncol(pfm) > 0) # if last pfm not terminated with //, add it from the buffer to the result
#           res[[id]] = pfm
	res
}

  
##function normalized matrix: all columns must have same column sum;
##nessecary to be appliable to other functions like Biostrings::PWM()
# 2013-01-09: This function does not belong to any class.
#             Is this an internal function?
normalize.matrix.colSum = function(matrix){
  if(length(unique(colSums(matrix))) > 1){
    lcm = as.numeric(lcm.vector(colSums(matrix)))
    matrix = t((lcm/colSums(matrix))*t(matrix))
  }
  return(matrix)
}

# #w = log2 ( ( f + sqrt(N) * p ) / ( N + sqrt(N) ) / p )
# w - is a weight for the current nucleotide we are calculating
# f - is a number of occurences of the current nucleotide in the current column (e.g., "1" for A in column 1, "8" for C etc)
# N - total number of observations, the sum of all nucleotides occurences in a column (13 in this example)
# p - [prior] [background] frequency of the current nucleotide; this one usually defaults to 0.25 (i.e. one nucleotide out of four)
#          
# 2013-01-09: This function does not belong to any class.
#             Is this an internal function?          
pfm2pwm = function(pfm) {
	pfm <- normalize.matrix.colSum(pfm)
	pwm = pfm
	p = .25
	for(x in 1:ncol(pfm)){
		for(y in 1:4){
			f = pfm[y,x]
			n = colSums(pfm)[x]
			pwm[y,x] = log2 ( ( f + sqrt(n) * p ) / ( n + sqrt(n) ) / p )
		}
	}
	if(any(is.na(pwm)))
		stop('PFM-PWM conversion produced NAs !')
	else if(any(which(pwm == Inf)))
		stop('PFM-PWM conversion produced Infs!')
	else if(any(which(pwm == -Inf)))
		stop('PFM-PWM conversion produced -Infs!')
	else if(any(which(pwm == NaN)))
		stop('PFM-PWM conversion produced NaN!')
	return(pwm)
}
###############################################################################
# function converts for each input PWM the predicted TFBS hits into a PWM. function
# is intended to be used together with the sequence logo creation function 
# 'plot.tfbslogo' in plot-methods.R 
# as.pfm - if TRUE, the plain base counts of the predicted binding sites is returned. If FALSE, normalized pwm is returned
# 2013-01-09: Is this an internal function?

setMethod("predicted2pwm", signature(x="cobindr"),
function(x, as.pfm=FALSE) {
	predPwm = list() 
	bs = x@binding_sites
	pwms = setdiff(unique(bs$pwm),'0')

	for(pwm.id in pwms) {
		#not using grep due to reserved chars in pwm ids...
		# make reverse complement of the results from the '-' strand
		hits = c(as.character(bs$seq[which(bs$pwm == pwm.id & bs$strand == 1)]), unlist(lapply(as.character(bs$seq[which(bs$pwm == pwm.id & bs$strand == 2)]), function(seq) as.character(reverseComplement(DNAString(seq))))))

		if(length(hits)<1) {
			cat('cannot make logo: PWM',pwm.id,'has no hits in any sequence...\n')
		}

		pwm = consensusMatrix(DNAStringSet(hits), baseOnly=TRUE, as.prob=FALSE)

		pwm.s = matrix(NA,nrow=0, ncol=ncol(pwm))
		ps = c('A','C','G','T')

		for(p in ps)
			pwm.s = rbind(pwm.s, pwm[grep(paste('^',p,'$',sep=''),rownames(pwm),ignore.case=T),])
		rownames(pwm.s) = ps

		if(as.pfm) { predPwm[[pwm.id]] = pwm.s
		} else {
			for(i in 1:ncol(pwm.s))
				pwm.s[,i] = pwm.s[,i]/colSums(pwm.s)[i]
			predPwm[[pwm.id]] = pfm2pwm(pwm.s)
		}
	}
	return(predPwm)
}
)
          
# function returns the input PWM in the format required by the sequence logo 
# creation function 'plot.tfbslogo' in plot-methods.R
# as.pfm - if TRUE, the plain base counts of the predicted binding sites is returned. If FALSE, normalized pwm is returned
setMethod("input.pwm", signature(x="cobindr"),
function(x, as.pfm=FALSE) {
	#get list of input pwms used
	pwms = x@pfm
	if(length(pwms)<1) stop('object is missing the input id\'s')
	# iterate over input pwms and convert to required format
	inPwm = list() 
	for(pwm.id in names(pwms)) {
		pwm = pwms[[pwm.id]]
		if(as.pfm) {
			inPwm[[pwm.id]] <- pwm
		} else {
			inPwm[[pwm.id]] <- pfm2pwm(pwm / matrix(colSums(pwm),ncol=ncol(pwm),nrow=nrow(pwm),byrow=T))
		}
	}
	return(inPwm)
}
)

#function calculates least common multiple for a numeric vector
# 2013-01-09: This function does not belong to any class.
#             Is this an internal function?

lcm.vector <- function(numvec){
        tmp_lcm = numvec[1]
        for(i in 2:length(numvec)){
                tmp_lcm <- lcm.bigz(tmp_lcm,numvec[i])
        }
        return(tmp_lcm)
}

# checks all combinations of PFMs for high similarity (i.e. redundancy)
# thr - kullback-leibler distance threshold below which two matrices are considered similar
# typical values: 0.2 high, 0.3 moderate, 0.4 low stringency
# 2013-01-09: This function does not belong to any class.
#             Is this an internal function?
input.pfm.similarity <- function(pfms, thr=0.3) {
	combs <- combn(1:length(pfms),2)
	similar.matrices <- list()
	for(i in 1:ncol(combs)) {
		if(any(pfms[[combs[1,i]]]!=round(pfms[[combs[1,i]]]))) { 
			warning('matrix',names(pfms)[combs[1,i]],'is not a PFM (i.e. contains non-integers) and will be ignored...')
			next
		}
		if(any(pfms[[combs[2,i]]]!=round(pfms[[combs[2,i]]]))) { 
			warning('matrix',names(pfms)[combs[2,i]],'is not a PFM (i.e. contains non-integers) and will be ignored...')
			next
		}
		
		kl.mean.dist <- (kl.dist(pfms[[combs[1,i]]], pfms[[combs[2,i]]]) +
				kl.dist(pfms[[combs[2,i]]], pfms[[combs[1,i]]])) / 2
		if(kl.mean.dist < thr) {
			mn <- c(names(pfms)[combs[1,i]], names(pfms)[combs[2,i]])
			similar.matrices[[paste(mn, collapse='_')]] <-  
					list(mat1=mn[1], mat2=mn[2], mean.kl=kl.mean.dist)
		}
	}
	return(similar.matrices)
}

# computes kullback-leibler distance between two matrices
# mat1 - count matrix 1
# mat2 - count matrix 2
# 2013-01-09: This function does not belong to any class.
#             Is this an internal function?
kl.dist <- function(mat1, mat2) {
	# 	if(!require('gplots')) {
	# 		warning('package gplots required for this function, returning...')
	# 		return()
	# 	}
	# functions only works with position frequency matrices, so check if only ints 
	for(i in 1:length(mat1)) {
		if(mat1[i] != round(mat1[i])){
			warning('problem with mat1: function only applicable to PFMs (integer counts)...')
			return()
		}
	}
	for(i in 1:length(mat2)) {
		if(mat2[i] != round(mat2[i])){
			warning('problem with mat2: function only applicable to PFMs (integer counts)...')
			return()
	  }
	}
	# check if pseudocounts are necessary
	if(any(mat1 == 0)) mat1 <- mat1 + 1
	if(any(mat2 == 0)) mat2 <- mat2 + 1	
	# make probabilities
	mat1 <- mat1 / matrix(rep(colSums(mat1),each=nrow(mat1)), nrow=nrow(mat1), ncol=ncol(mat1), byrow=FALSE)
	mat2 <- mat2 / matrix(rep(colSums(mat2),each=nrow(mat2)), nrow=nrow(mat2), ncol=ncol(mat2), byrow=FALSE)
	
	bases <- rownames(mat1)
	l1 <- ncol(mat1) # matrix lengths
	l2 <- ncol(mat2)
	w <- min(l1, l2)
	dists <- NULL
	
	if(l1 == l2) {
		mat1s <- list(mat1)
		mat2s <- list(mat2)
	} else if(l1 > l2) { # second mat is shorter
		A <- seq(0,l1-l2,by=1)
		mat1s <- lapply(A, function(x) mat1[,x:(x+l2)])
		mat2s <- rep(list(mat2),length(A))
	} else {  # first mat is shorter
		A <- seq(0,l2-l1,by=1)
		mat1s <- rep(list(mat2),length(A))
		mat2s <- lapply(A, function(x) mat2[,x:(x+l1)])
	}
	for(i in 1:length(mat1s))
		dists <- c(dists, 1 / w * sum(sapply(1:w, function(j) {
							sum(sapply(bases, function(b){mat1s[[i]][b,j] * log2(mat1s[[i]][b,j]/mat2s[[i]][b,j])} ))
						}))
			)
	max(dists)
}

          
# returns kullback-leibler distance matrix between two sets of matrices
# intended usage: 
# compare input and predicted motifs - pfm.similarity(input.pwm(runObj, as.pfm=TRUE), predicted2pwm(runObj, as.pfm=TRUE))
# compare input motifs: 

#heatmap.2( pfm.similarity(input.pwm(runObj, as.pfm=TRUE), input.pwm(runObj, as.pfm=TRUE)) , dendrogram='row')
# 2013-01-09: This function does not belong to any class.
#             Is this an internal function?    
pfm.similarity <- function(set1, set2) {
	rm.idx = c()
	for(i in 1:length(set1))
		if(any(set1[[i]]!=round(set1[[i]]))) { 
			warning('matrix',names(set1)[i],'is not a PFM (i.e. contains non-integers) and will be ignored...')
			rm.idx = c(rm.idx, i)
		}
	if(length(rm.idx)>0) set1 <- set1[-rm.idx]
	rm.idx = c()
	for(i in 1:length(set2))
		if(any(set2[[i]]!=round(set2[[i]]))) { 
			warning('matrix',names(set2)[i],'is not a PFM (i.e. contains non-integers) and will be ignored...')
			rm.idx = c(rm.idx, i)
		}
	if(length(rm.idx)>0) set2 <- set2[-rm.idx]
	
	sims <- matrix(NA,nrow=length(set1),ncol=length(set2), dimnames=list(names(set1), names(set2)))
	for(i in 1:length(set1))
		for(j in 1:length(set2))
			sims[i,j] <- (kl.dist(set1[[i]], set2[[j]]) + kl.dist(set2[[j]], set1[[i]])) / 2
	return(sims)
}

Any scripts or data that you put into this service are public.

cobindR documentation built on April 28, 2020, 6:40 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

cobindR
Finding Co-occuring motifs of transcription factor binding sites

R/pwm_io.R
In cobindR: Finding Co-occuring motifs of transcription factor binding sites

Defines functions pfm.similarity kl.dist input.pfm.similarity lcm.vector pfm2pwm normalize.matrix.colSum read.transfac.pfm query.motifDb

Documented in input.pfm.similarity kl.dist lcm.vector normalize.matrix.colSum pfm2pwm pfm.similarity query.motifDb query.motifDb read.transfac.pfm

Try the cobindR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cobindR Finding Co-occuring motifs of transcription factor binding sites

R/pwm_io.R In cobindR: Finding Co-occuring motifs of transcription factor binding sites

Defines functions pfm.similarity kl.dist input.pfm.similarity lcm.vector pfm2pwm normalize.matrix.colSum read.transfac.pfm query.motifDb

Documented in input.pfm.similarity kl.dist lcm.vector normalize.matrix.colSum pfm2pwm pfm.similarity query.motifDb query.motifDb read.transfac.pfm

Try the cobindR package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

cobindR
Finding Co-occuring motifs of transcription factor binding sites

R/pwm_io.R
In cobindR: Finding Co-occuring motifs of transcription factor binding sites