R/Guitar_functions.R

Defines functions combinedGuitarPlot .getNeighborhood .multiplot .makeGuitarCoordsForSingleIndex .combineListOfGRanges .makeGuitarCoordsFromGRangesList .get2sides .extractComponent .makeGuitarCoordsFromTxDb .spliceSingleTrans .combineListOfSplicing .peakToGRangesList

.peakToGRangesList <- function(a, header=FALSE) {

  # message
  print("Converting BED12 to GRangesList")
  print("It may take a few minutes")

  # read bed file
  #a = read.table(filepath,sep="\t",header=header,stringsAsFactors =FALSE)
  # mcols_info =a[,13:length(a[1,])]
  a = a[,1:12]

  # get transcripts
  no_tx = length(a[,1])
  tx_id = 1:no_tx;
  tx_name = paste("line_",1:no_tx,sep="")
  tx_chrom = a[,1]
  tx_strand = a[,6]
  tx_start = a[,2]+1
  tx_end = a[,3]
  transcripts= data.frame(tx_id,tx_name,tx_chrom,tx_strand,tx_start,tx_end)
  head(transcripts)



  # get genes
  tx_name = tx_name
  gene_id = as.character(a[,4])
  gene_id[is.na(gene_id)]="NA"
  gene=data.frame(tx_name,gene_id)

  #
  splicing <- lapply(1:no_tx, .spliceSingleTrans, a=a, tx_start=tx_start)
  splicing <- .combineListOfSplicing(splicing)

  # make txdb
  peaks = suppressWarnings(
    makeTxDb(transcripts=transcripts,
             splicings=splicing,
             genes=gene))

  # generate GRangesList
  tx <- exonsBy(peaks, "tx",use.names=TRUE)
  #mcols(tx) <- a

  return(tx)
}

.combineListOfSplicing <- function(t){

  a <- paste("t[[",1:length(t),"]]", sep="")
  a <- paste(a,collapse =",")
  a <- paste("rbind(",a,")",sep="")
  c <- parse(text=a)
  b <- suppressWarnings(eval(c))

  return(b)
}

.spliceSingleTrans <- function(i,a,tx_start) {
  tx = a[i,]
  tx_id = i
  exon_rank=1:as.integer(tx[[10]])

  # get start
  temp = as.integer(strsplit(as.character(tx[[12]]), ",")[[1]]) + tx_start[i]
  exon_start=temp

  # get end
  temp = as.integer(strsplit(as.character(tx[[11]]), ",")[[1]])
  temp2 = temp + exon_start - 1
  exon_end=temp2

  # get CDS
  cds_start = exon_start
  cds_end = exon_end

  # get data frame
  splicing_tx = data.frame(tx_id,exon_rank,exon_start,exon_end,cds_start,cds_end)
  return(splicing_tx)
}

# make Guitar Coordinates from TranscriptDb object
#' @import rtracklayer
.makeGuitarCoordsFromTxDb <- function(txdb,
                                     maximalAmbiguity = 3,
                                     minimalComponentLength = 100,
                                     minimalNcRNALength = 300,
                                     noBins=100){

  parameter = list()
  parameter$txdb <- txdb
  parameter$maximalAmbiguity <- maximalAmbiguity # whether overlap with another transcript
  parameter$minimalComponentLength <- minimalComponentLength # minimal length required for each component
  parameter$minimalNcRNALength <- minimalNcRNALength
  parameter$noBins <- noBins

  # prepare the bins
  component <- .extractComponent(parameter)
  side <- .get2sides(component)

  # print
  print("Building Guitar Coordinates. It may take a few minutes ...")
  utr3_bin <- .makeGuitarCoordsFromGRangesList(component[["utr3"]],parameter$noBins)
  utr5_bin <- .makeGuitarCoordsFromGRangesList(component[["utr5"]],parameter$noBins)
  cds_bin <- .makeGuitarCoordsFromGRangesList(component[["cds"]],parameter$noBins)
  ncRNA_bin <- .makeGuitarCoordsFromGRangesList(component[["ncRNA"]],parameter$noBins)
  mRNA_front_bin <- .makeGuitarCoordsFromGRangesList(side[["mrna_front"]],parameter$noBins)
  mRNA_back_bin <- .makeGuitarCoordsFromGRangesList(side[["mrna_back"]],parameter$noBins)
  ncRNA_front_bin <- .makeGuitarCoordsFromGRangesList(side[["ncrna_front"]],parameter$noBins)
  ncRNA_back_bin <- .makeGuitarCoordsFromGRangesList(side[["ncrna_back"]],parameter$noBins)
  print("Guitar Coordinates Built ...")

  # group together
  mcols(utr3_bin) <- data.frame(mcols(utr3_bin),comp="UTR3",category="mRNA")
  mcols(utr5_bin) <- data.frame(mcols(utr5_bin),comp="UTR5",category="mRNA")
  mcols(cds_bin) <- data.frame(mcols(cds_bin),comp="CDS",category="mRNA")
  mcols(mRNA_front_bin) <- data.frame(mcols(mRNA_front_bin),comp="Front",category="mRNA")
  mcols(mRNA_back_bin) <- data.frame(mcols(mRNA_back_bin),comp="Back",category="mRNA")

  mcols(ncRNA_bin) <- data.frame(mcols(ncRNA_bin),comp="lncRNA",category="lncRNA")
  mcols(ncRNA_front_bin) <- data.frame(mcols(ncRNA_front_bin),comp="Front",category="lncRNA")
  mcols(ncRNA_back_bin) <- data.frame(mcols(ncRNA_back_bin),comp="Back",category="lncRNA")


  GuitarCoords <- suppressWarnings(c(mRNA_front_bin,utr5_bin, cds_bin, utr3_bin, mRNA_back_bin,
                                     ncRNA_front_bin,ncRNA_bin,ncRNA_back_bin))

  return(GuitarCoords)}

.extractComponent <- function(parameter){

  txdb <- parameter$txdb
  # ambiguity filter
  exons <- exonsBy(txdb, by = "tx",use.names=TRUE)
  noTx <- length(exons)
  print(paste("total",noTx,"transcripts extracted ..."));

  temp <- countOverlaps(exons, exons)
  ambiguityFilteredTx <- names(exons[temp < (parameter$maximalAmbiguity+2)])
  noTxLeft <- length(ambiguityFilteredTx)
  print(paste("total",noTxLeft,"transcripts left after ambiguity filter ..."))
  exons <- exons[ambiguityFilteredTx]


  # extract important components
  cds <- cdsBy(txdb, by = "tx",use.names=TRUE)
  utr5 <- fiveUTRsByTranscript(txdb, use.names=TRUE)
  utr3 <- threeUTRsByTranscript(txdb, use.names=TRUE)

  # extract mRNAs
  flag_utr5 <- (sum(width(utr5)) > parameter$minimalComponentLength)
  name_utr5 <- names(utr5)[flag_utr5]
  flag_utr3 <- (sum(width(utr3)) > parameter$minimalComponentLength)
  name_utr3 <- names(utr3)[flag_utr3]
  flag_cds <- (sum(width(cds)) > parameter$minimalComponentLength)
  name_cds <- names(cds)[flag_cds]
  name_mRNA <- intersect(intersect(name_utr5,name_utr3),name_cds)
  name_filtered_mRNA <- intersect(name_mRNA,names(exons))
  cds_filtered <- cds[name_filtered_mRNA]
  utr5_filtered <- utr5[name_filtered_mRNA]
  utr3_filtered <- utr3[name_filtered_mRNA]
  print(paste("total",length(cds_filtered),"mRNAs left after component length filter ..."))

  # extract mRNAs
  all_mRNA <- unique(c(names(utr5),names(utr3),names(cds)))
  name_ncRNA <- setdiff(names(exons),all_mRNA)
  ncRNA <- exons[name_ncRNA]
  flag_ncRNA <-
    (sum(width(ncRNA)) > parameter$minimalComponentLength) &
    (sum(width(ncRNA)) > parameter$minimalNcRNALength)
  name_ncRNA <- names(ncRNA)[flag_ncRNA]
  ncRNA_filtered <- ncRNA[name_ncRNA]
  print(paste("total",length(ncRNA_filtered),"ncRNAs left after ncRNA length filter ..."))

  # return the result
  comp <- list(cds=cds_filtered,utr3=utr3_filtered,utr5=utr5_filtered,ncRNA=ncRNA_filtered)
  return(comp)}

.get2sides <- function(component) {
  utr3 <- component[["utr3"]]
  utr5 <- component[["utr5"]]
  ncrna <- component[["ncRNA"]]

  mrna_front <- .getNeighborhood(comp=utr5, side=5)
  mrna_back <- .getNeighborhood(comp=utr3, side=3)
  ncrna_front <- .getNeighborhood(comp=ncrna, side=5)
  ncrna_back <- .getNeighborhood(comp=ncrna, side=3)

  result <- list(mrna_front=mrna_front,
                 mrna_back=mrna_back,
                 ncrna_front=ncrna_front,
                 ncrna_back=ncrna_back)
  return(result)
}

.makeGuitarCoordsFromGRangesList <- function(comp,
                                            noBins=100,
                                            collapseGene=FALSE,
                                            width = 51) {

  # get all the check points
  tx_length <- as.numeric(sum(width(comp)))
  checkpoints_interval <- tx_length/noBins

  # get transcript name and strand
  tx_name <- names(comp)
  granges <- unlist(comp)
  tx <- granges[tx_name]
  strand <- as.character(as.data.frame(strand(tx))[[1]])
  chr <- as.character(as.data.frame(seqnames(tx))[[1]])

  # get coordinates
  t <- lapply(X=1:noBins,
              FUN=.makeGuitarCoordsForSingleIndex,
              comp=comp,
              noBins=noBins,
              checkpoints_interval=checkpoints_interval,
              strand=strand,
              chr=chr,
              tx_name=tx_name)

  GuitarCoords <- .combineListOfGRanges(t)

  if (collapseGene) {
    temp <- split(GuitarCoords, mcols(GuitarCoords)$pos, drop=FALSE)
    mcols(temp) <- data.frame(pos=unique(mcols(GuitarCoords)$pos))
    GuitarCoords <- temp
  }

  # return the result
  return(GuitarCoords)
}

.combineListOfGRanges <- function(t){
  txt <- "c(t[[1]]"
  for (i in 2:length(t)) {
    txt <- paste(txt,",t[[",i,"]]",sep="")
  }
  txt <- paste(txt,")",sep="")
  c <- parse(text=txt)

  # suppressWarnings
  # GuitarCoords <- eval(c)
  GuitarCoords <- suppressWarnings(eval(c))

  return(GuitarCoords)
}
.makeGuitarCoordsForSingleIndex <- function(
  index, comp, noBins, checkpoints_interval,
  strand, chr, tx_name,
  binWidth=51) {

  # get checkpoints
  checkpoints <- ceiling(checkpoints_interval*(index-0.5))
  checkpoints_transcript <- GRanges(seqnames=tx_name,
                                    IRanges(start=checkpoints, end=checkpoints, names=tx_name),
                                    strand=strand)

  # convert to genomic coordinates
  checkPoints_genomic <- mapFromTranscripts(checkpoints_transcript, comp)

  # resize
  # binWidth <- 4*ceiling(checkpoints_interval)+1
  checkRegion_genomic <- resize(x=checkPoints_genomic,
                                width=binWidth,
                                fix="center")


  # add annotation information
  mcols(checkRegion_genomic) <- data.frame(txid=tx_name, pos=(index-0.5)/noBins, interval=checkpoints_interval)

  return(checkRegion_genomic)
}

# Multiple plot function
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols:   Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
.multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)

  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }

  if (numPlots==1) {
    print(plots[[1]])

  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

.getNeighborhood <- function(comp, side=5, Width=1000) {
  # transcript info
  tx_name <- names(comp)
  granges <- unlist(comp)
  tx <- granges[tx_name]
  strand <- as.character(as.data.frame(strand(tx))[[1]])
  chr <- as.character(as.data.frame(seqnames(tx))[[1]])

  if (side == 5) {
    checkpoints <- rep(1,length(tx))
    checkpoints_transcript <- GRanges(seqnames=tx_name,
                                      IRanges(start=checkpoints, end=checkpoints, names=tx_name),
                                      strand=strand)
    # convert to genomic coordinates
    checkPoints_genomic <- mapFromTranscripts(checkpoints_transcript, comp)
    # resize
    checkRegion_genomic <- resize(x=checkPoints_genomic, width=Width, fix="end")
  } else if (side == 3) {
    checkpoints <- sum(width(comp))
    checkpoints_transcript <- GRanges(seqnames=tx_name,
                                      IRanges(start=checkpoints, end=checkpoints, names=tx_name),
                                      strand=strand)
    # convert to genomic coordinates
    checkPoints_genomic <- mapFromTranscripts(checkpoints_transcript, comp)
    # resize
    checkRegion_genomic <- resize(x=checkPoints_genomic, width=Width, fix="start")
  }

  # convert to list
  names(checkRegion_genomic) <- rep("",length(tx))
  sidelist <- split(checkRegion_genomic, tx_name, drop=TRUE)
  sidelist <- sidelist[tx_name]
  mapped_chr <- as.character(as.data.frame(seqnames(checkRegion_genomic))[[1]])
  mcols(sidelist) <- data.frame(mapped_chr)

  return(sidelist)
}

combinedGuitarPlot <- function(ct, comLength = c(0.136,0.459, 0.405)){
  # stack
  adjust=1

  # plot the figure
  # extract information of mRNA and lncRNA
  ct$weight <- ct$count # as numeric
  ct1 <- ct[ct$category=="mRNA",] # mRNA
  ct2 <- ct[ct$category=="lncRNA",] # lncRNA

  # disable notes
  pos=Feature=weight=NULL

  # remove DNA
  id1 <- which(match(ct1$comp,c("Front","Back")) >0 )
  ct1 <- ct1[-id1,]
  id2 <- which(match(ct2$comp,c("Front","Back")) >0 )
  ct2 <- ct2[-id2,]

  # normalize feature
  featureSet <- as.character(unique(ct$Feature))
  for (i in 1:length(featureSet)) {
    id <- (ct1$Feature==featureSet[i])
    ct1$weight[id] <- ct1$weight[id]/sum(ct1$weight[id])

    id <- (ct2$Feature==featureSet[i])
    ct2$weight[id] <- ct2$weight[id]/sum(ct2$weight[id])
  }

  p2 <-
    ggplot(ct2, aes(x=pos, group=Feature, weight=weight)) +
    ggtitle("Distribution on lncRNA")  +
    theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
    xlab("") +
    ylab("Frequency") +
    geom_density(adjust=adjust,aes(fill=factor(Feature),colour=factor(Feature)),alpha=0.2) +
    annotate("text", x = 0.5, y = -0.2, label = "lncRNA")+
    annotate("rect", xmin = 0, xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
    theme(legend.position="bottom")

  weight <- comLength/sum(comLength)
  names(weight) <- c("5'UTR","CDS","3'UTR")

  # density
  cds_id <- which(ct1$comp=="CDS")
  utr3_id <- which(ct1$comp=="UTR3")
  utr5_id <- which(ct1$comp=="UTR5")
  ct1$count[utr5_id] <- ct1$count[utr5_id]*weight["5'UTR"]
  ct1$count[cds_id] <- ct1$count[cds_id]*weight["CDS"]
  ct1$count[utr3_id] <- ct1$count[utr3_id]*weight["3'UTR"]

  # re-normalization
  featureSet <- as.character(unique(ct$Feature))
  for (i in 1:length(featureSet)) {
    id <- (ct1$Feature==featureSet[i])
    ct1$weight[id] <- ct1$count[id]/sum(ct1$count[id])
  }

  # stratch
  x <- cumsum(weight)
  ct1$pos[utr5_id] <- ct1$pos[utr5_id]*weight["5'UTR"] + 0
  ct1$pos[cds_id] <- ct1$pos[cds_id]*weight["CDS"] + x[1]
  ct1$pos[utr3_id] <- ct1$pos[utr3_id]*weight["3'UTR"] + x[2]

  p1 <-
    ggplot(ct1, aes(x=pos, group=Feature, weight=weight))  +
    ggtitle("Distribution on mRNA") +
    theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
    xlab("") +
    ylab("Frequency") +
    geom_density(adjust=adjust,aes(fill=factor(Feature),colour=factor(Feature)),alpha=0.2) +
    annotate("text", x = x[1]/2, y = -0.2, label = "5'UTR") +
    annotate("text", x = x[1] + weight[2]/2, y = -0.2, label = "CDS") +
    annotate("text", x = x[2] + weight[3]/2, y = -0.2, label = "3'UTR") +
    theme(legend.position="bottom") +
    geom_vline(xintercept= x[1:2], linetype="dotted") +
    annotate("rect", xmin = 0, xmax = x[1], ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
    annotate("rect", xmin = x[2], xmax = 1, ymin = -0.12, ymax = -0.08, alpha = .99, colour = "black")+
    annotate("rect", xmin = x[1], xmax = x[2], ymin = -0.16, ymax = -0.04, alpha = .2, colour = "black")

  .multiplot(p1, p2, cols=2)

}
scottzijiezhang/m6Amonster documentation built on Jan. 8, 2021, 1:37 p.m.