K562.md
In lichen-lab/tfLDA: tfLDA

Apply tfLDA for K562 cell line

topPeak(rawpeakfile='rawpeak.txt',peakdir='peaks')
cmd=paste("ls -rt peaks* | grep ","\"topIP\"",">topIPpeak.txt",sep=""); system(cmd)
CTPeak(topIPpeakfile='topIPpeak.txt',peakdir='peaks')
cmd=paste("cp topIPpeak.txt topCTpeak.txt",sep=""); system(cmd)
cmd=paste("sed -i -e 's/topIP/topCT/g' topCTpeak.txt"); system(cmd)
genomeseq='hg19.fa'
genSeq(peakfile='topIPpeak.txt',peakdir='peaks',seqdir='seqdir',genomeseq=genomeseq)
genSeq(peakfile='topCTpeak.txt',peakdir='peaks',seqdir='seqdir',genomeseq=genomeseq)
cmd=paste("cp topIPpeak.txt seqIP.txt",sep=""); system(cmd)
cmd=paste("sed -i -e 's/topIP/topIP.fa.upper/g' seqIP.txt"); system(cmd)
cmd=paste("cp topCTpeak.txt seqCT.txt",sep=""); system(cmd)
cmd=paste("sed -i -e 's/topCT/topCT.fa.upper/g' seqCT.txt"); system(cmd)
seqIPfile='seqIP.txt';seqCTfile='seqCT.txt';seqdir="seqdir";motifdir="motifdir"
database='JASPAR';score="80%";ncore=4
fmotifscanbatch(seqIPfile,seqCTfile,seqdir,motifdir,database='JASPAR')
res=createDocuments(motifdir='motifcounts',database='JASPAR')
K562=createtfLDAObj(res$documents,names(JASPAR),names(res$documents))
hdpConvert(res$documents,outfile='documents.dat')

system('hdp --train_data documents.dat --directory hdpdir')
m=read.table('hdpdir/train.log',header=T)
plot(m$likelihood,type='l',main='K562',xlab='iteration',ylab='log likelihood',lwd=2)

library(pheatmap)
library(RColorBrewer)
library(lda)
library(tfLDA)
data(K562)
obj=createtfLDAObj(documents=K562@documents,wordnames=K562@wordnames,samplenames=K562@samplenames)
ntopic=14
obj=tfLDA(obj,topic=ntopic)
plotLogLikelihood(obj)

topics=obj@models[[1]]$topics
document_sums=obj@models[[1]]$document_sums
topic_sums=obj@models[[1]]$topic_sums
assignments=obj@models[[1]]$assignments
log.likelihoods=obj@models[[1]]$log.likelihoods
samplenames=obj@samplenames
wordnames=obj@wordnames


# Figure1: the whole heatmap with all samples and TFs
theta=matrix(0,length(assignments),ntopic)
for(i in 1:length(assignments)){
  a=table(assignments[[i]])
  b=match(as.numeric(names(a)),0:(ntopic-1))
  theta[i,b]=a
  theta[i,]=theta[i,]/sum(theta[i,])
}

theta_mean=aggregate(x=theta,by=list(samplenames), FUN="mean")
rownames(theta_mean)=theta_mean[,1]
theta_mean=theta_mean[,-1]
colnames(theta_mean)=paste("Topic",1:ntopic,sep="")

h=hclust(dist(theta_mean))
tfs=rownames(theta_mean)
tfs.order=tfs[h$order]

b1=c("GATA2","GATA1")
b2=c("Rad21","CTCF","SMC3")
b3=c("STAT1","STAT2")
b4=c("MafF","MafK")
b5=c("UBF","UBTF")
b6=c("cMyc","Max")
b7=c("GTF2F1","GTF2B")
b8=c("NFYA","NFYB")

p1=match(b1,tfs)
p2=match(b2,tfs)
p3=match(b3,tfs)
p4=match(b4,tfs)
p5=match(b5,tfs)
p6=match(b6,tfs)
p7=match(b7,tfs)
p8=match(b8,tfs)

df=rep(0,nrow(theta_mean))
df[p1]=1
df[p2]=2
df[p3]=3
df[p4]=4
df[p5]=5
df[p6]=6
df[p7]=7
df[p8]=8

labels=rep("others",nrow(theta_mean))
labels[p1]="block1"
labels[p2]="block2"
labels[p3]="block3"
labels[p4]="block4"
labels[p5]="block5"
labels[p6]="block6"
labels[p7]="block7"
labels[p8]="block8"

blist=plist=list()
blist[[1]]=b1
blist[[2]]=b2
blist[[3]]=b3
blist[[4]]=b4
blist[[5]]=b5
blist[[6]]=b6
blist[[7]]=b7
blist[[8]]=b8

plist[[1]]=p1
plist[[2]]=p2
plist[[3]]=p3
plist[[4]]=p4
plist[[5]]=p5
plist[[6]]=p6
plist[[7]]=p7
plist[[8]]=p8

annotation_row=data.frame(TFs = labels)
rownames(annotation_row)=rownames(theta_mean)
pdf(file='K562-heatmap.pdf')
pheatmap(theta_mean,fontsize_row =7,fontsize=8,annotation_row=annotation_row,main="K562",treeheight_row=0,treeheight_col=0,
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100))
dev.off()




# Figure2: subset of whole heatmap
tfset=c("GATA2","GATA1","Rad21","CTCF","SMC3","STAT1","STAT2","MafF","MafK","GTF2F1","GTF2B","NFYA","NFYB","cMyc","Max")
topicset=paste("Topic",c(2,12,3,14,9,10),sep="")
rowid=match(tfset,rownames(theta_mean))
colid=match(topicset,colnames(theta_mean))
theta_mean2=theta_mean[rowid,colid]
dim(theta_mean2)


labels=rep("",length(tfset))
labels[1:2]="block1"
labels[3:5]="block2"
labels[6:7]="block3"
labels[8:9]="block4"
labels[10:11]="block5"
labels[12:13]="block6"
labels[14:15]="block7"
annotation_row=data.frame(TFs = labels)
rownames(annotation_row)=rownames(theta_mean2)
pdf(file='K562-subset-heatmap.pdf')
pheatmap(theta_mean2,cluster_rows=F,cluster_cols=F,annotation_row=annotation_row,main="K562",
         color = colorRampPalette(rev(brewer.pal(n = 7, name = "Spectral")))(100))
dev.off()




# Figure3: K562-topicmotif.pdf
mypar=function (a = 2, b = 2, brewer.n = 8, brewer.name = "Dark2", ...) {
  par(mar = c(2.5, 5, 1.6, 1.1), mgp = c(1.5, 0.5, 0))
  par(mfrow = c(a, b), ...)
}

obj=topMotif(obj,ntopic=ntopic,method='rank')
topicorder=order(obj@models[[1]]$topic_sums,decreasing = T)
topic.word=obj@params$topic.word[topicorder]
prob.topic.word=obj@params$prob.topic.word[topicorder]
cols=rainbow(ntopic)
cols=cols[topicorder]

pdf(file="K562-topicmotif.pdf",width=10,height=5)
mypar(3,5)
for(i in 1:ntopic){
  rbPal=colorRampPalette(c("white",cols[i]))
  barplot(rev(prob.topic.word[[i]]),col=rbPal(length(topic.word[[i]])),horiz=T,names.arg=rev(topic.word[[i]]),las=1,cex.names=0.8,main=paste("Topic",topicorder[i],sep=""),
          col.main=cols[i],col.lab=cols[i],font.axis =2,axes=T,xlim=c(0,0.3))
}
dev.off()


obj=topMotif(obj,ntopic=ntopic,method='gamma')
topicorder=order(obj@models[[1]]$topic_sums,decreasing = T)
topic.word=obj@params$topic.word[topicorder]
prob.topic.word=obj@params$prob.topic.word[topicorder]
cols=rainbow(ntopic)
cols=cols[topicorder]

pdf(file="K562-topicmotif2.pdf",width=10,height=5)
mypar(3,5)
for(i in 1:ntopic){
  rbPal=colorRampPalette(c("white",cols[i]))
  barplot(rev(prob.topic.word[[i]]),col=rbPal(length(topic.word[[i]])),horiz=T,names.arg=rev(topic.word[[i]]),las=1,cex.names=0.8,main=paste("Topic",topicorder[i],sep=""),
          col.main=cols[i],col.lab=cols[i],font.axis =2,axes=T,xlim=c(0,0.3))
}
dev.off()

lichen-lab/tfLDA documentation built on Jan. 10, 2020, 10:13 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Tweet to @rdrrHQ

GitHub issue tracker

ian@mutexlabs.com