.peakToGRangesList <- function(a, header=FALSE) {
# message
print("Converting BED12 to GRangesList")
print("It may take a few minutes")
# read bed file
#a = read.table(filepath,sep="\t",header=header,stringsAsFactors =FALSE)
# mcols_info =a[,13:length(a[1,])]
a = a[,1:12]
# get transcripts
no_tx = length(a[,1])
tx_id = 1:no_tx;
tx_name = paste("line_",1:no_tx,sep="")
tx_chrom = a[,1]
tx_strand = a[,6]
tx_start = a[,2]+1
tx_end = a[,3]
transcripts= data.frame(tx_id,tx_name,tx_chrom,tx_strand,tx_start,tx_end)
head(transcripts)
# get genes
tx_name = tx_name
gene_id = as.character(a[,4])
gene_id[is.na(gene_id)]="NA"
gene=data.frame(tx_name,gene_id)
#
splicing <- lapply(1:no_tx, .spliceSingleTrans, a=a, tx_start=tx_start)
splicing <- .combineListOfSplicing(splicing)
# make txdb
peaks = suppressWarnings(
makeTxDb(transcripts=transcripts,
splicings=splicing,
genes=gene))
# generate GRangesList
tx <- exonsBy(peaks, "tx",use.names=TRUE)
#mcols(tx) <- a
return(tx)
}
.combineListOfSplicing <- function(t){
a <- paste("t[[",1:length(t),"]]", sep="")
a <- paste(a,collapse =",")
a <- paste("rbind(",a,")",sep="")
c <- parse(text=a)
b <- suppressWarnings(eval(c))
return(b)
}
.spliceSingleTrans <- function(i,a,tx_start) {
tx = a[i,]
tx_id = i
exon_rank=1:as.integer(tx[[10]])
# get start
temp = as.integer(strsplit(as.character(tx[[12]]), ",")[[1]]) + tx_start[i]
exon_start=temp
# get end
temp = as.integer(strsplit(as.character(tx[[11]]), ",")[[1]])
temp2 = temp + exon_start - 1
exon_end=temp2
# get CDS
cds_start = exon_start
cds_end = exon_end
# get data frame
splicing_tx = data.frame(tx_id,exon_rank,exon_start,exon_end,cds_start,cds_end)
return(splicing_tx)
}
# make Guitar Coordinates from TranscriptDb object
#' @import rtracklayer
.makeGuitarCoordsFromTxDb <- function(txdb,
maximalAmbiguity = 3,
minimalComponentLength = 100,
minimalNcRNALength = 300,
noBins=100){
parameter = list()
parameter$txdb <- txdb
parameter$maximalAmbiguity <- maximalAmbiguity # whether overlap with another transcript
parameter$minimalComponentLength <- minimalComponentLength # minimal length required for each component
parameter$minimalNcRNALength <- minimalNcRNALength
parameter$noBins <- noBins
# prepare the bins
component <- .extractComponent(parameter)
side <- .get2sides(component)
# print
print("Building Guitar Coordinates. It may take a few minutes ...")
utr3_bin <- .makeGuitarCoordsFromGRangesList(component[["utr3"]],parameter$noBins)
utr5_bin <- .makeGuitarCoordsFromGRangesList(component[["utr5"]],parameter$noBins)
cds_bin <- .makeGuitarCoordsFromGRangesList(component[["cds"]],parameter$noBins)
ncRNA_bin <- .makeGuitarCoordsFromGRangesList(component[["ncRNA"]],parameter$noBins)
mRNA_front_bin <- .makeGuitarCoordsFromGRangesList(side[["mrna_front"]],parameter$noBins)
mRNA_back_bin <- .makeGuitarCoordsFromGRangesList(side[["mrna_back"]],parameter$noBins)
ncRNA_front_bin <- .makeGuitarCoordsFromGRangesList(side[["ncrna_front"]],parameter$noBins)
ncRNA_back_bin <- .makeGuitarCoordsFromGRangesList(side[["ncrna_back"]],parameter$noBins)
print("Guitar Coordinates Built ...")
# group together
mcols(utr3_bin) <- data.frame(mcols(utr3_bin),comp="UTR3",category="mRNA")
mcols(utr5_bin) <- data.frame(mcols(utr5_bin),comp="UTR5",category="mRNA")
mcols(cds_bin) <- data.frame(mcols(cds_bin),comp="CDS",category="mRNA")
mcols(mRNA_front_bin) <- data.frame(mcols(mRNA_front_bin),comp="Front",category="mRNA")
mcols(mRNA_back_bin) <- data.frame(mcols(mRNA_back_bin),comp="Back",category="mRNA")
mcols(ncRNA_bin) <- data.frame(mcols(ncRNA_bin),comp="lncRNA",category="lncRNA")
mcols(ncRNA_front_bin) <- data.frame(mcols(ncRNA_front_bin),comp="Front",category="lncRNA")
mcols(ncRNA_back_bin) <- data.frame(mcols(ncRNA_back_bin),comp="Back",category="lncRNA")
GuitarCoords <- suppressWarnings(c(mRNA_front_bin,utr5_bin, cds_bin, utr3_bin, mRNA_back_bin,
ncRNA_front_bin,ncRNA_bin,ncRNA_back_bin))
return(GuitarCoords)}
.extractComponent <- function(parameter){
txdb <- parameter$txdb
# ambiguity filter
exons <- exonsBy(txdb, by = "tx",use.names=TRUE)
noTx <- length(exons)
print(paste("total",noTx,"transcripts extracted ..."));
temp <- countOverlaps(exons, exons)
ambiguityFilteredTx <- names(exons[temp < (parameter$maximalAmbiguity+2)])
noTxLeft <- length(ambiguityFilteredTx)
print(paste("total",noTxLeft,"transcripts left after ambiguity filter ..."))
exons <- exons[ambiguityFilteredTx]
# extract important components
cds <- cdsBy(txdb, by = "tx",use.names=TRUE)
utr5 <- fiveUTRsByTranscript(txdb, use.names=TRUE)
utr3 <- threeUTRsByTranscript(txdb, use.names=TRUE)
# extract mRNAs
flag_utr5 <- (sum(width(utr5)) > parameter$minimalComponentLength)
name_utr5 <- names(utr5)[flag_utr5]
flag_utr3 <- (sum(width(utr3)) > parameter$minimalComponentLength)
name_utr3 <- names(utr3)[flag_utr3]
flag_cds <- (sum(width(cds)) > parameter$minimalComponentLength)
name_cds <- names(cds)[flag_cds]
name_mRNA <- intersect(intersect(name_utr5,name_utr3),name_cds)
name_filtered_mRNA <- intersect(name_mRNA,names(exons))
cds_filtered <- cds[name_filtered_mRNA]
utr5_filtered <- utr5[name_filtered_mRNA]
utr3_filtered <- utr3[name_filtered_mRNA]
print(paste("total",length(cds_filtered),"mRNAs left after component length filter ..."))
# extract mRNAs
all_mRNA <- unique(c(names(utr5),names(utr3),names(cds)))
name_ncRNA <- setdiff(names(exons),all_mRNA)
ncRNA <- exons[name_ncRNA]
flag_ncRNA <-
(sum(width(ncRNA)) > parameter$minimalComponentLength) &
(sum(width(ncRNA)) > parameter$minimalNcRNALength)
name_ncRNA <- names(ncRNA)[flag_ncRNA]
ncRNA_filtered <- ncRNA[name_ncRNA]
print(paste("total",length(ncRNA_filtered),"ncRNAs left after ncRNA length filter ..."))
# return the result
comp <- list(cds=cds_filtered,utr3=utr3_filtered,utr5=utr5_filtered,ncRNA=ncRNA_filtered)
return(comp)}
.get2sides <- function(component) {
utr3 <- component[["utr3"]]
utr5 <- component[["utr5"]]
ncrna <- component[["ncRNA"]]
mrna_front <- .getNeighborhood(comp=utr5, side=5)
mrna_back <- .getNeighborhood(comp=utr3, side=3)
ncrna_front <- .getNeighborhood(comp=ncrna, side=5)
ncrna_back <- .getNeighborhood(comp=ncrna, side=3)
result <- list(mrna_front=mrna_front,
mrna_back=mrna_back,
ncrna_front=ncrna_front,
ncrna_back=ncrna_back)
return(result)
}
.makeGuitarCoordsFromGRangesList <- function(comp,
noBins=100,
collapseGene=FALSE,
width = 51) {
# get all the check points
tx_length <- as.numeric(sum(width(comp)))
checkpoints_interval <- tx_length/noBins
# get transcript name and strand
tx_name <- names(comp)
granges <- unlist(comp)
tx <- granges[tx_name]
strand <- as.character(as.data.frame(strand(tx))[[1]])
chr <- as.character(as.data.frame(seqnames(tx))[[1]])
# get coordinates
t <- lapply(X=1:noBins,
FUN=.makeGuitarCoordsForSingleIndex,
comp=comp,
noBins=noBins,
checkpoints_interval=checkpoints_interval,
strand=strand,
chr=chr,
tx_name=tx_name)
GuitarCoords <- .combineListOfGRanges(t)
if (collapseGene) {
temp <- split(GuitarCoords, mcols(GuitarCoords)$pos, drop=FALSE)
mcols(temp) <- data.frame(pos=unique(mcols(GuitarCoords)$pos))
GuitarCoords <- temp
}
# return the result
return(GuitarCoords)
}
.combineListOfGRanges <- function(t){
txt <- "c(t[[1]]"
for (i in 2:length(t)) {
txt <- paste(txt,",t[[",i,"]]",sep="")
}
txt <- paste(txt,")",sep="")
c <- parse(text=txt)
# suppressWarnings
# GuitarCoords <- eval(c)
GuitarCoords <- suppressWarnings(eval(c))
return(GuitarCoords)
}
.makeGuitarCoordsForSingleIndex <- function(
index, comp, noBins, checkpoints_interval,
strand, chr, tx_name,
binWidth=51) {
# get checkpoints
checkpoints <- ceiling(checkpoints_interval*(index-0.5))
checkpoints_transcript <- GRanges(seqnames=tx_name,
IRanges(start=checkpoints, end=checkpoints, names=tx_name),
strand=strand)
# convert to genomic coordinates
checkPoints_genomic <- mapFromTranscripts(checkpoints_transcript, comp)
# resize
# binWidth <- 4*ceiling(checkpoints_interval)+1
checkRegion_genomic <- resize(x=checkPoints_genomic,
width=binWidth,
fix="center")
# add annotation information
mcols(checkRegion_genomic) <- data.frame(txid=tx_name, pos=(index-0.5)/noBins, interval=checkpoints_interval)
return(checkRegion_genomic)
}
# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
.multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}
.getNeighborhood <- function(comp, side=5, Width=1000) {
# transcript info
tx_name <- names(comp)
granges <- unlist(comp)
tx <- granges[tx_name]
strand <- as.character(as.data.frame(strand(tx))[[1]])
chr <- as.character(as.data.frame(seqnames(tx))[[1]])
if (side == 5) {
checkpoints <- rep(1,length(tx))
checkpoints_transcript <- GRanges(seqnames=tx_name,
IRanges(start=checkpoints, end=checkpoints, names=tx_name),
strand=strand)
# convert to genomic coordinates
checkPoints_genomic <- mapFromTranscripts(checkpoints_transcript, comp)
# resize
checkRegion_genomic <- resize(x=checkPoints_genomic, width=Width, fix="end")
} else if (side == 3) {
checkpoints <- sum(width(comp))
checkpoints_transcript <- GRanges(seqnames=tx_name,
IRanges(start=checkpoints, end=checkpoints, names=tx_name),
strand=strand)
# convert to genomic coordinates
checkPoints_genomic <- mapFromTranscripts(checkpoints_transcript, comp)
# resize
checkRegion_genomic <- resize(x=checkPoints_genomic, width=Width, fix="start")
}
# convert to list
names(checkRegion_genomic) <- rep("",length(tx))
sidelist <- split(checkRegion_genomic, tx_name, drop=TRUE)
sidelist <- sidelist[tx_name]
mapped_chr <- as.character(as.data.frame(seqnames(checkRegion_genomic))[[1]])
mcols(sidelist) <- data.frame(mapped_chr)
return(sidelist)
}
combinedGuitarPlot <- function(ct, comLength = c(0.136,0.459, 0.405)){
# stack
adjust=1
# plot the figure
# extract information of mRNA and lncRNA
ct$weight <- ct$count # as numeric
ct1 <- ct[ct$category=="mRNA",] # mRNA
ct2 <- ct[ct$category=="lncRNA",] # lncRNA
# disable notes
pos=Feature=weight=NULL
# remove DNA
id1 <- which(match(ct1$comp,c("Front","Back")) >0 )
ct1 <- ct1[-id1,]
id2 <- which(match(ct2$comp,c("Front","Back")) >0 )
ct2 <- ct2[-id2,]
# normalize feature
featureSet <- as.character(unique(ct$Feature))
for (i in 1:length(featureSet)) {
id <- (ct1$Feature==featureSet[i])
ct1$weight[id] <- ct1$weight[id]/sum(ct1$weight[id])
id <- (ct2$Feature==featureSet[i])
ct2$weight[id] <- ct2$weight[id]/sum(ct2$weight[id])
}
p2 <-
ggplot(ct2, aes(x=pos, group=Feature, weight=weight)) +
ggtitle("Distribution on lncRNA") +
theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
xlab("") +
ylab("Frequency") +
geom_density(adjust=adjust,aes(fill=factor(Feature),colour=factor(Feature)),alpha=0.2) +
annotate("text", x = 0.5, y = -0.2, label = "lncRNA")+
annotate("rect", xmin = 0, xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
theme(legend.position="bottom")
weight <- comLength/sum(comLength)
names(weight) <- c("5'UTR","CDS","3'UTR")
# density
cds_id <- which(ct1$comp=="CDS")
utr3_id <- which(ct1$comp=="UTR3")
utr5_id <- which(ct1$comp=="UTR5")
ct1$count[utr5_id] <- ct1$count[utr5_id]*weight["5'UTR"]
ct1$count[cds_id] <- ct1$count[cds_id]*weight["CDS"]
ct1$count[utr3_id] <- ct1$count[utr3_id]*weight["3'UTR"]
# re-normalization
featureSet <- as.character(unique(ct$Feature))
for (i in 1:length(featureSet)) {
id <- (ct1$Feature==featureSet[i])
ct1$weight[id] <- ct1$count[id]/sum(ct1$count[id])
}
# stratch
x <- cumsum(weight)
ct1$pos[utr5_id] <- ct1$pos[utr5_id]*weight["5'UTR"] + 0
ct1$pos[cds_id] <- ct1$pos[cds_id]*weight["CDS"] + x[1]
ct1$pos[utr3_id] <- ct1$pos[utr3_id]*weight["3'UTR"] + x[2]
p1 <-
ggplot(ct1, aes(x=pos, group=Feature, weight=weight)) +
ggtitle("Distribution on mRNA") +
theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
xlab("") +
ylab("Frequency") +
geom_density(adjust=adjust,aes(fill=factor(Feature),colour=factor(Feature)),alpha=0.2) +
annotate("text", x = x[1]/2, y = -0.2, label = "5'UTR") +
annotate("text", x = x[1] + weight[2]/2, y = -0.2, label = "CDS") +
annotate("text", x = x[2] + weight[3]/2, y = -0.2, label = "3'UTR") +
theme(legend.position="bottom") +
geom_vline(xintercept= x[1:2], linetype="dotted") +
annotate("rect", xmin = 0, xmax = x[1], ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
annotate("rect", xmin = x[2], xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
annotate("rect", xmin = x[1], xmax = x[2], ymin = -0.16, ymax = -0.04, alpha = .2, colour = "black")
.multiplot(p1, p2, cols=2)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.