#!/usr/bin/env R
#
# This file is part of peakAnalysis,
# http://github.com/alexjgriffith/alpha-score/,
# and is Copyright (C) University of Ottawa, 2015. It is Licensed under
# the three-clause BSD License; see LICENSE.txt.
# Author : Alexander Griffith
# Contact: griffitaj@gmail.com
#
#
######################################################################
######################################################################
### Non Finalized Functions
#' Score List
#'
#' The preliminary investigation score mechanism
#' scoreList(lapply(h,function(x) lapply(x,function(y)y[[2]])),mots,addmotifs)
#' @export
scoreList<-function(k,mot,addmotifs){
which(unlist(mapply(function(k,mot){
if(length(k)>0){
t0<-unlist(lapply(seq(min(k),max(k)),function(i)length(which(k==i))))
vs<-unlist(lapply(min(t0):max(t0),function(x)length(which(x==t0))))
t<-(vs[length(vs)]> (vs[1]*2^(-length(vs)/sqrt(var(t0)))))
if(is.na(vs[1]*2^(-length(vs)/sqrt(var(t0))))) {
return(FALSE)
}
else if(t)
if(all(c(vs[1]*2^(-length(vs)/sqrt(var(t0)))<1,length(vs)>5,nchar(addmotifs)<abs(which.max(t0)+min(k)))))
return(TRUE)
else
return(FALSE)
}
return(FALSE)},h,motifs )))
}
#' Prelim socre function
#'
#' @export
motifScoreFunction<-function(x) {
if (length(x)<2){return(0)} else{ if(var(x)==0){ 0} else {t0<-getHeights(x);length(t0)*2^(-max(t0)/sqrt(var(t0)))}}}
#' Get Heights
#'
#' Get the heights of the motif comparison data
#'
#' @export
getHeights<-function(h,range=c(min(h),max(h))){
rep<-rep(0,(range[2]-range[1]+1))
for(i in h)
rep[i-range[1]+1]<-rep[i-range[1]+1]+1
rep
}
#unlist(lapply(range,function(i)length(which(h==i))))}
#' geometric score value
#'
#' @export
geometricScore<-function(h){
n<-length(h)/600
l<-seq(min(h),max(h))
t0<-getHeights(h)
t1<-unlist(lapply(l,function(x) 1+n*(abs(x)/300)/6))
p<-1/(1+mean(t0))
test<-function(x,p){(1-p)^x*p}
plot(unlist(lapply(t0,test,p))*t1)
plot(unlist(t0)*t1)
}
#' Intersect Count
#'
#' Determine the number of cases which 2 motifs intersect
#'
#' @export
numberIntersect<-function(motifA,motifB,Sequences,reg=rep(TRUE,length(Sequences))){
length(
intersect(
intersect(grep(unlist(IUPACtoBase(motifA)),Sequences),grep(unlist(IUPACtoBase(motifB)),Sequences))
, which(reg))
)
}
### Application Functions
#' Motifs Two View
#'
#' Visualize the similarity between two motifs,
#' Very ineficent,
#'
#' @export
#' @examples
#' FastaFile<-"~/Dropbox/UTX-Alex/jan/combined.fasta"
#' Sequences <- readDNAStringSet(FastaFile, "fasta")
#' data<-loadHeightFile("~/Dropbox/UTX-Alex/jan/combined_heights.bed")$data
#' reg<-as.matrix(ascore(data,c(1,1,1),c("top","bottom","middle"),c(1,1,1)))
#' m<-motifs2View("CANNTG","TGACCT",reg[,3],Sequences)
#' k<-motifs2View("CANNTG","GATAAG",reg[,1],Sequences)
#' t0<-unlist(lapply(seq(min(k),max(k)),function(i)length(which(k==i))))
motifs2View<-function(m1,m2,reg,Sequences,nearHeights=FALSE){
m12<-c(m1, m2)
mList<-unlist(lapply(m12,IUPACtoBase))
cList<-lapply(unlist(lapply(m12,IUPACtoBase)),compliment)
locationsM<-lapply(mList,grep,Sequences)
locationsC<-lapply(cList,grep,Sequences)
if(nearHeights==FALSE){
h<-list(motifHist(Sequences,mList,cList,locationsM,locationsC,1,2,reg))
histVisualize(h,m1,m2)
h<-unlist(h)
t0<-unlist(lapply(seq(min(h),max(h)),function(i)length(which(h==i))))}
else{
h<-nearSummit(Sequences,mList,cList,locationsM,locationsC,1,reg)}
#c((max(t0)-mean(t0))/sqrt(var(t0)),scoreFunction(t0))
h
}
#' Histogram Wrapper
#'
#' @export
#' @examples
#' FastaFile<-"~/Dropbox/UTX-Alex/jan/combined.fasta"
#' data<-loadHeightFile("~/Dropbox/UTX-Alex/jan/combined_heights.bed")$data
#' reg<-as.matrix(ascore(data,c(1,1,1),c("top","bottom","middle"),c(1,1,1)))
#' Sequences <- readDNAStringSet(FastaFile, "fasta")
#' motifFile<-"inst/data/abnormal_normal.pwm"
#' motifs<-homerWrapper(Sequences,reg[,1],reg[,2],"~/Masters/mulcal/inst/lib/homer-4.7/cpp/homer2",motifFile)
#' motifs<-loadPWM(motifFile)
#' mots<-unlist(lapply(unlist(motifs[,1]),function(x)gsub(">","",x)))
#' motifs<-loadPWM("inst/data/jaspar_motifs.txt","jaspar")
#' mots<-c(mots,unlist(lapply(1:length(motifs[,2]),function(i)PWMtoCons(motifs[i,2]$data))))
#' addmotifs<-c("CGNNGC")
#' mList<-unlist(lapply(c(mots,addmotifs),IUPACtoBase))
#' cList<-unlist(lapply(lapply(c(mots,addmotifs),IUPACtoBase),compliment))
#' locationsM<-lapply(mList,grep,Sequences)
#' locationsC<-lapply(cList,grep,Sequences)
#' l<-length(cList)
#' motifHist(Sequences,mList,cList,locationsM,locationsC,4,l,reg)
#'
#' h<-lapply(seq(3),function(k)lapply(seq(1:l-1),function(i)motifHist(Sequences,mList,cList,locationsM,locationsC,i,l,reg[,k])))
#'
#' h<-lapply(seq(10),function(k)lapply(seq(10),function(i)lapply(seq(3),function(j)motifHist(Sequences,mList,cList,locationsM,locationsC,i,k,reg[,j]))))
#' lapply(h,scoreList,c(mots,addmotifs)[1:10],addmotifs)
#' histWrapper(lapply(h,function(x) lapply(x,function(y)y[[2]])),c(mots,addmotifs), names=c("NonName"),c(1),c(30))
histWrapper<-function(h,motifNames,names=c("NoName"),...){
BuildernestedList<-function(h,locs){
fun<-function(i,h,...){NULL}
function(funbody,...){
if(is.expression(funbody))
body(fun)<-funbody
else
fun<-funbody
subset.nestedList(h,locs,fun,...)}
}
D2<-BuildernestedList(h,list(...))
vals<-D2(expression(h[[i[1]]][[i[2]]][[i[3]]]))
mot<-D2(function(i,h,motifNames) motifNames[i],motifNames=motifNames)
motx<-Map(function(x)x[1],mot)
moty<-Map(function(x)x[2],mot)
histVisualize(
vals,
motx,moty,prod(rapply(list(...),length)),names)
}
### Utility Functions
#' Turn IUPAC lables into basic neucleotides
#'
#' @param char A string of IUPAC characters
#' c("A","G","C","T","R","Y","S","W","K","M","B","D","H","V","N")
#' @param rl choice to return a value
#' \describe{
#' \item{\strong{TRUE}}{Each posible string is genearated}
#' \item{\strong{FALSE}}{Direct translation to brakets R=[AG]}}
#' @export
#' @template authorTemplate
IUPACtoBase<-function(char,rl=FALSE){
IUPAC<-strsplit(char,"")[[1]]
IUPACCharacters<-list("A","C","G","T",c("A","G"),c("C","T"),c("C","G"),c("A","T"),c("G","T"),c("A","C"),c("C","G","T"),c("A","G","T"),c("A","C","T"),c("A","C","G"),c("A","C","G","T"))
names(IUPACCharacters)<-c("A","C","G","T","R","Y","S","W","K","M","B","D","H","V","N")
if(any( ! IUPAC %in% names(IUPACCharacters)))
{
stop("Input string contains non IUPAC characters.")
}
vals<-sapply(IUPAC,function(x){(IUPACCharacters[x])})
if(rl){
Base<-c("")
for(i in vals){
kit<-c()
for(j in Base)
kit<-c(kit,paste(j,i,sep=""))
Base<-kit}}
else{
Base<-c("")
for(i in vals)
if(length(i)==1)
Base<-paste(Base,i,sep="")
else
Base<-paste(Base,"[",do.call(paste, as.list(c(i,sep=""))),"]",sep="")
}
Base}
#' reverse compliment
#'
#' Takes a string which can include base nucleaotides (AGCT) and brakets ("[","]") and returns the reverse compliment including brakets
#'
#' @export
#' @template authorTemplate
compliment<-function(string){
chars<-c("A","G","C","T","[","]")
names(chars)<-c("T","C","G","A","]","[")
paste(rev(sapply(strsplit(string,"")[[1]], function(x){(chars[x])})),collapse="")
}
#' Motif Testing
#' @export
motifTest<-function(fastaFile="~/Dropbox/UTX-Alex/jan/combined.fasta",
heightFile="~/Dropbox/UTX-Alex/jan/combined_heights.bed",
addmotifs=c("CGNNGC")){
values<-loadHeightFile(heightFile)
data<-values$data
reg4<-ascore(data,1,"top",3)
reg4<-rep(TRUE,length(data[,1]))
test<-readDNAStringSet(fastaFile,use.names = TRUE)
motifs<-as.matrix(read.table("data/normal_not_abnormal_motifs"))
mList<-unlist(lapply(c(motifs,addmotifs),IUPACtoBase))
cList<-unlist(lapply(lapply(c(motifs,addmotifs),IUPACtoBase),compliment))
locationsM<-lapply(mList,grep,test)
locationsC<-lapply(cList,grep,test)
l<-length(cList)
h<-motifHist(test,mList,cList,locationsM,locationsC,24,25,reg4)
h1<-unlist(lapply(seq(from=-300,to=300),function(x){length(which(x==h))}))
hist(h1[! h1==0],breaks=max(h1))
}
#' Motif histogram
#'
#' @examples
#'
#' values<-loadHeightFile(heightFile)
#' data<-values$data
#' reg<-ascore(data,1,"top",3)
#' test<-readDNAStringSet(fastaFile,use.names = TRUE)
#' motifs<-as.matrix(read.table("data/normal_not_abnormal_motifs"))
#' mList<-unlist(lapply(c(motifs,addmotifs),IUPACtoBase))
#' cList<-unlist(lapply(lapply(c(motifs,"CGNNGC"),IUPACtoBase),compliment))
#' locationsM<-lapply(mList,grep,test)
#' locationsC<-lapply(cList,grep,test)
#' l<-length(cList)
#' motifHist(mList,cList,locationsM,locationsC,4,l,reg)
#'
#' @export
motifHist<-function(data,mList,cList,locationsM,locationsC,n1,n2,reg,width=c(-30,30)){
lM<-intersect(intersect(locationsM[[n1]],locationsM[[n2]]),which(reg))
lC<-intersect(intersect(locationsC[[n1]],locationsC[[n2]]),which(reg))
bM<-c()
bC<-c()
if(length(lM)>0)
bM<-lapply(c(mList[n1],mList[n2]),function(x) lapply(gregexpr(x, data[lM]),as.numeric))
if(length(lC)>0)
bC<-lapply(c(cList[n1],cList[n2]),function(x) lapply(gregexpr(x, data[lC]),as.numeric))
#h<-c(getDistance(bM[[1]],bM[[2]]),-getDistance(bC[[1]],bC[[2]]))
#print(c(mList[n1],consensusIUPAC(mList[n2]))
#print(nchar(mList[n1])-nchar(mList[n2]))
print(nchar(consenusIUPAC(mList[n1]))-nchar(consenusIUPAC(mList[n2])))
h<-c(getDistance(bM[[1]],bM[[2]]),-(nchar(consenusIUPAC(mList[n1]))-nchar(consenusIUPAC(mList[n2]))+getDistance(bC[[1]],bC[[2]])))
#h<-getDistance(bM[[1]],bM[[2]])#getDistance(bM[[1]],bM[[2]])#,-getDistance(bC[[1]],bC[[2]]))
#if(length(h)>0)
# hist(h,breaks=1000,xlim=width,xlab=paste(mList[n1],mList[n2],sep=" - "))
h}
#'@export
nearSummit<-function(data,mList,cList,locationsM,locationsC,n1,reg,width=150)
{
lM<-intersect(locationsM[[n1]],which(reg))
lC<-intersect(locationsC[[n1]],which(reg))
bM<-c()
bC<-c()
if(length(lM)>0){
bM<-lapply(gregexpr(mList[n1], data[lM]),as.numeric)
#print(mList[n1])
}
if(length(lC)>0) {
bC<-lapply(gregexpr(cList[n1], data[lC]),as.numeric)
#print("b")
}
h<-c(getDistance(unlist(bM),width),getDistance(unlist(bC),width))
h}
#' @export
getDistance<-function(x,y,one=FALSE){
as.numeric(
unlist(mapply(function(x,y){
temp<-outer(x, y,"-")
temp<-temp[upper.tri(x=temp,diag=TRUE)]
#print(str(temp))
if(one ){temp[which.max(abs(temp))]}
else {temp}
},x,y)))}
#' Homer Wrapper
#'
#' This is only a wrapper for the DENOVO functionality of homer2, homer must be installed on the system before this can be used.
#'
#' @export
homerWrapper<-function(sequences,foreground,background,homerLocation,motifsFile=FALSE,opts="-S 25 -len 6"){
if (!is.character(motifsFile))
motifsFile<-tempfile()
treatmentFile<-tempfile()
controlFile<-tempfile()
writeXStringSet(sequences[foreground],treatmentFile)
writeXStringSet(sequences[background],controlFile)
cmd<-paste(homerLocation, "denovo -i ",treatmentFile," -b ",controlFile,opts," > ",motifsFile,sep=" ")
system(cmd)
loadPWM(motifsFile,"homer")
}
#' PWM to Consensus Motif
#'
#' A caller for ConsesnusIUPAC, Recives PWM and returns a consusensus motif
#' The PWMs should be loaded through the loadPWM utility.
#'
#' @seealso \link{\code{loadPWM()}}
#'
#' @export
PWMtoCons<-function(x)
consenusIUPAC(motifString(x))
#' Consensus IUPAC
#'
#'
#' @export
#'
consenusIUPAC<-function(mstring){
IUPACCharacters<-list("A","C","G","T",c("A","G"),c("C","T"),c("C","G"),c("A","T"),c("G","T"),c("A","C"),c("C","G","T"),c("A","G","T"),c("A","C","T"),c("A","C","G"),c("A","C","G","T"))
names(IUPACCharacters)<-c("A","C","G","T","R","Y","S","W","K","M","B","D","H","V","N")
IUPACc<-unlist(lapply(IUPACCharacters,paste,collapse=""))
x<-splitBlist(mstring)
paste(lapply(x,function(x)names(which(x==IUPACc))),collapse="")}
#' Split by List
#'
#' Applied to DNA nucleotides (ACGT). Returns a list with [] removed
#'
#' @export
#' @examples
#' > consensus<-"AGCT[AGCT]G"
#' > splitBlist(consensus)
#' buf buf buf buf buf buf
#' [1,] "A" "G" "C" "T" "AGCT" "G"
#'
splitBlist<-function(mstring){
ret<-c()
buf<-""
data<-strsplit(mstring,"")[[1]]
i<-1
while (i <= length(data)){
buf<-""
if (data[i]=="["){
while(data[i]!="]"){
if (data[i] %in% c("A","C","G","T"))
buf<-paste(buf,data[i],sep="")
i<-i+1}}
else if (data[i] %in% c("A","C","G","T"))
buf<-data[i]
ret<-cbind(ret,buf)
i<-i+1
}
ret
}
#' Motif String
#'
#' A scoring mechanism to convert PWM to strings
#'
#' @export
motifString<-function(x){
paste(apply(x,2,function(x){
DNA<-c("A","C","G","T")
if(sum(x)==0)
return (paste(c("[",sort(DNA),"]"),collapse=""))
x<-x/sum(x)
or<-order(x,decreasing = TRUE)
t<-or
if(x[or[1]]>=.6){return(DNA[or[1]])}
else if(sum(x[or[1:2]])>=0.8){t<-or[1:2]}
else if((sum(x[or[1:3]])>=0.95)){t<-or[1:3]}
paste(c("[",sort(DNA[t]),"]"),collapse="")
}
),collapse="")
}
### Plotting Functions
#' @title Visualize batches of height data
#' @export
histVisualize<-function(h,m1,m2,n=1,name=c("NoName")){
pushViewport(viewport(layout=grid.layout(n,2)))
if(n==1){
h<-h[[1]]
t0<-unlist(lapply(seq(min(h),max(h)),
function(i)length(which(h==i))))
p1<-heightHist(t0,which.max(t0)+min(h)-1)
p2<-locHist(h,paste(m1,m2,sep="-"))
print(p1,vp=viewport(layout.pos.row=1,layout.pos.col=1))
print(p2,vp=viewport(layout.pos.row=1,layout.pos.col=2))
}
else{
for( k in seq(n)){
t0<-unlist(lapply(seq(min(h[[k]]),max(h[[k]])),
function(i)length(which(h[[k]]==i))))
p1<-heightHist(t0,which.max(t0)+min(h[[k]])-1)+ylab(name[k])
tm2<-m2[k]
tm1<-m1[k]
p2<-locHist(h[[k]],paste(tm1,tm2,sep="-"))+ylab("")
print(p1,vp=viewport(layout.pos.row=k,layout.pos.col=1))
print(p2,vp=viewport(layout.pos.row=k,layout.pos.col=2))
}
}
}
#' @title Height Histrogram
#' A plotting tool for histograms, Requires GGPLOT2
#' @export
heightHist<-function(t0,xlab="Histogram"){
p<-ggplot(as.data.frame(t0),aes(x=t0))+geom_histogram(binwidth=1,xlab=xlab)
if((max(t0)-min(t0))<20)
p<-p+scale_x_continuous(breaks=seq(min(t0), max(t0),1))
p<-p+stat_function(fun=function(x) length(t0)*2^(-x/sqrt(var(t0))),colour="blue" )
p<-p+xlab(xlab)+ylim(c(0,length(which(t0==0))))
p}
#' @title Location Histogram
#' A plotting tool for histograms, Requires GGPLOT2
#' @export
locHist<-function(t0,xlab="Histogram",limits=c(-32,32)){
x<-seq(min(t0),max(t0))
t1<-unlist(lapply(x,function(x)length(which(x==t0))))
locHist2(t1,x,xlab,limits)
#p<-qplot(x,t1,geom="step",ylab="frequency",xlab=xlab)+stat_function(fun=function(x){0})#+xlim(limits)
#n<-4
#m<-ceiling(log2(abs(limits[2]-limits[1])/2))
#s<-seq(max(m-3,1),m)
#b<-c(-sapply(s,function(x)2^x),sapply(s,function(x)2^x))
#p+scale_x_continuous(breaks=b,limits=limits)
}
#' @export
locHist2<-function(t1,x,xlab="Histogram",limits=c(-32,32)){
p<-qplot(x,t1,geom="step",ylab="frequency",xlab=xlab)+stat_function(fun=function(x){0})#+xlim(limits)
n<-4
m<-ceiling(log2(abs(limits[2]-limits[1])/2))
s<-seq(max(m-3,1),m)
b<-c(-sapply(s,function(x)2^x),sapply(s,function(x)2^x))
p+scale_x_continuous(breaks=b,limits=limits)
}
### Import Functions
#' @title Load PWM
#' This utiltiy is designed to load several PWM formats and create a uniform layout within R for analysis
#'@export
loadPWM<-function(fileLocation,version="homer")
{
loadDataBuilder<-function(splitfun,header=FALSE,skip=1,id=">",split=" ",region=1){
function(fileLocation){
data<-read.delim(fileLocation,header=header,skip=skip,sep="\n")
out<-list()
n<-0
name<-c()
info<-c()
for(i in seq(length(t(data)))) {
d<-strsplit(as.character(data[i,]),split)[[1]]
l<-length(d)
if( id == strsplit(d[1],"")[[1]][region])
{
if (! n==0){out<-append(out,list(matrix(box,4)))}
box<-c()
name<-c(name,d[1])
info<-c(info,d[2:l])
n<-n+1}
else{
box<-c(box,splitfun(d,l))
}}
cbind(name=name,info=info,data=append(out,list(matrix(box,4))) )}}
loadFunctions<-list(
homer=c(
skip=0,
split="\t",
splitfun=function(d,l){
as.numeric(unlist(strsplit(as.character(d),"\t")))}),
jaspar=c(skip=0,
splitfun=function(d,l){
na.omit(as.numeric(unlist(strsplit(unlist(d), "[^0-9]+"))))
}))
if (version %in% names(loadFunctions))
do.call(loadDataBuilder,loadFunctions[[version]])(fileLocation)
else
warning(paste(version,"is not an option."))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.