Nothing
#' Function processes UCSC genePred tables to generate coordinates for
#' various genic elements like introns, exons, CDS, UTRs, and upstream and
#' downstream regions. It handles these coordinates with consideration for
#' strand sensitivity and genome information.
#'
#' All the operations in here are vectorized. If the table is big, expect a
#' spike in memory. Using ncbiRefSeq table and genome hg38, the memory is
#' stable at 4-5 GB. I can utilise data.table package to process by chunk if
#' needed.
#' Original table is zero-based open-end index. The indexing system is changed
#' temporarily to follow Rs system. The output coordinate table is one-based
#' close-end index. Critical information based on UCSC Genome website:
#' Column Explanation
#' bin Indexing field to speed chromosome range queries. (Only
#' relevant to UCSC program)
#' name Name of gene (usually transcript_id from GTF)
#' chrom Reference sequence chromosome or scaffold
#' strand + or - for strand
#' txStart Transcription start position (or end position for minus
#' strand item)
#' txEnd Transcription end position (or start position for minus
#' strand item)
#' cdsStart Coding region start (or end position for minus strand item)
#' cdsEnd Coding region end (or start position for minus strand item)
#' exonCount Number of exons
#' exonEnds Exon end positions (or start positions for minus strand
#' item)
#' exonStart Exon start positions (or end positions for minus strand
#' item)
#' name2 Alternate name (e.g. gene_id from GTF)
#' cdsStartStat Status of CDS start annotation (none, unknown, incomplete,
#' or complete) = ('none','unk','incmpl','cmpl')
#' cdsEndStat Status of CDS end annotation (none, unknown, incomplete,
#' or complete)
#' exonFrames Exon frame (0,1,2), or -1 if no frame for exon (Related to
#' codon. Number represents extra bases (modulus of 3) from
#' previous exon block brought to a current exon block.)
#' If cdsStart == cdsEnd, that means non-coding sequence.
#' - maybe cdsStartStat and cdsEndStat == "none" mean the same thing.
#' maybe exonFrames == "-1," means the same thing.
#'
#' @param genepred UCSC genome name (e.g., hg19, mm39).
#' @param element.names Types of genic elements to output: "all", "intron",
#' "exon", "CDS", or "UTR". Default is "all".
#' @param upstream Length of upstream sequence (can overlap other genes).
#' @param downstream Length of downstream sequence (can overlap other genes).
#' @param genome.name UCSC genome name for trimming overflowing coordinates.
#' @param genome Genome object for coordinate resolution.
#' @param return.coor.obj Whether to return a `Coordinate` object (default: FALSE).
#' @return Genic element coordinates in a `data.table` or `Coordinate` object.
#'
#' @importFrom data.table data.table fwrite
#' @importFrom stringi stri_split_fixed
#'
#' @export
generateGenicElementCoor <- function(genepred, element.names="all",
upstream=NULL, downstream=NULL,
genome.name=NULL, genome=NULL,
return.coor.obj=FALSE) {
if (all(is.null(genome) & is.null(genome.name)) &
(!is.null(upstream) | !is.null(downstream)))
warning("No genome information. Upstream or downstream can overflow.")
if ("all" %in% element.names)
element.names <- c("exon", "intron", "CDS", "UTR")
# Convert to 1-based indexing (+1 to every start coordinate)
genepred[, c("txStart", "cdsStart") := .(txStart + 1, cdsStart + 1)]
# BEGIN
# Select exons, introns, CDS, UTR5, UTR3, upstream, downstream
gene <- genepred[, {
# Exons --------------------------------------------------------------------
# Exons coordinates (+1 to convert to 1-based indexing)
exon.starts <- as.numeric(unlist(stri_split_fixed(exonStarts, ",",
omit_empty = TRUE))) + 1
exon.ends <- as.numeric(unlist(stri_split_fixed(exonEnds, ",",
omit_empty = TRUE)))
# For vectorisation purpose
# Repeat strand, cdsStarts, cdsEnds, txStarts, and txEnds for every exons
# This is to be used in vectorisation of every genes in the table.
cds.start <- rep(cdsStart, exonCount)
cds.end <- rep(cdsEnd, exonCount)
exon.strand <- rep(strand, exonCount)
# Exon strand index for + and -
idx.exon.pos <- exon.strand == "+"
idx.exon.neg <- exon.strand == "-"
# Introns ------------------------------------------------------------------
if("intron" %in% element.names){
# First and last exon indices (All exons from different transcripts are
# combined in a vector)
idx.last.exons <- cumsum(exonCount)
idx.first.exons <- shift(idx.last.exons, fill = 0) + 1
# Introns coordinates
intron.starts <- exon.ends[-idx.last.exons] + 1
intron.ends <- exon.starts[-idx.first.exons] - 1
} else {
intron.starts <- intron.ends <- NULL
}
# CDS ----------------------------------------------------------------------
# exons without UTR
if("CDS" %in% element.names){
# Indexes of exons that contain CDS can be divided into 2 categories:
# completely CDS and partly CDS
## Upstream indexes of UTR
idx.contain.cds <- exon.ends >= cds.start & exon.starts <= cds.end
idx.completely.cds <- exon.starts >= cds.start & exon.ends <= cds.end
# complete&partial complete
# TRUE + FALSE
# idx.partly.cds <- (idx.contain.cds + idx.completely.cds) == 1
idx.partly.cds.up <- (exon.starts < cds.start & exon.ends > cds.start) |
(exon.ends == cds.start)
idx.partly.cds.down <- (exon.starts < cds.end & exon.ends > cds.end) |
(exon.starts == cds.end)
# In a situation where idx.partly.up and idx.partly.down is the same, CDS
# is in only one exon block
idx.partly.one <- idx.partly.cds.up & idx.partly.cds.down
# CDS
cds.starts <- c(cds.start[idx.partly.cds.up & !idx.partly.one],
exon.starts[idx.completely.cds],
exon.starts[idx.partly.cds.down & !idx.partly.one],
cds.start[idx.partly.one])
cds.ends <- c(exon.ends[idx.partly.cds.up & !idx.partly.one],
exon.ends[idx.completely.cds],
cds.end[idx.partly.cds.down & !idx.partly.one],
cds.end[idx.partly.one])
# Note: cds.start vs. cds.starts (cds.end vs. cds.ends)
# cds.start is the very start coordinate of CDS. It is the repetition of
# cdsStart to aid with the vectorisation operation.
# cds.starts are multiple start coordinates correspond to exon blocks
} else {
cds.starts <- cds.ends <- NULL
}
# UTR ----------------------------------------------------------------------
# UTR5, UTR3
if ("UTR" %in% element.names) {
# Indexes of exons that contain UTR can be divided into 2 categories:
# completely UTR and partly UTR
## Upstream indexes of UTR
idx.contain.UTR.up <- exon.starts < cds.start
idx.completely.UTR.up <- exon.ends < cds.start
# complete&partial complete
# TRUE + FALSE
idx.partly.UTR.up <- (idx.contain.UTR.up + idx.completely.UTR.up) == 1
## Downstream indexes of UTR
idx.contain.UTR.down <- exon.ends > cds.end
idx.completely.UTR.down <- exon.starts > cds.end
idx.partly.UTR.down <- (idx.contain.UTR.down +
idx.completely.UTR.down) == 1
# Note: The directionality of UTR5 and UTR3 are reversed in - strand.
# So, the strand information is required here.
# UTR5 coordinates
UTR5.starts <- c(exon.starts[idx.completely.UTR.up & idx.exon.pos],
exon.starts[idx.partly.UTR.up & idx.exon.pos],
exon.starts[idx.completely.UTR.down & idx.exon.neg],
cds.end[idx.partly.UTR.down & idx.exon.neg] + 1)
UTR5.ends <- c(exon.ends[idx.completely.UTR.up & idx.exon.pos],
cds.start[idx.partly.UTR.up & idx.exon.pos] - 1,
exon.ends[idx.completely.UTR.down & idx.exon.neg],
exon.ends[idx.partly.UTR.down & idx.exon.neg])
# UTR3 coordinates
UTR3.starts <- c(cds.end[idx.partly.UTR.down & idx.exon.pos]+1,
exon.starts[idx.completely.UTR.down & idx.exon.pos],
exon.starts[idx.partly.UTR.up & idx.exon.neg],
exon.starts[idx.completely.UTR.up & idx.exon.neg])
UTR3.ends <- c(exon.ends[idx.partly.UTR.down & idx.exon.pos],
exon.ends[idx.completely.UTR.down & idx.exon.pos],
cds.start[idx.partly.UTR.up & idx.exon.neg] - 1,
exon.ends[idx.completely.UTR.up & idx.exon.neg])
} else {
UTR5.starts <- UTR5.ends <- UTR3.starts <- UTR3.ends <- NULL
}
# Upstream & downstream ----------------------------------------------------
if ((!is.null(upstream)) | ((!is.null(downstream)))) {
# Strand index
idx.pos <- which(strand == "+")
idx.neg <- which(strand == "-")
# Chromosome length for every row; plus strand followed by minus strand
if (!is.null(genome.name) & is.null(genome))
genome <- loadGenome(genome.name, fasta.style = "UCSC")
if (!is.null(genome))
chr.lengths <- genome$get_length(c(chrom[idx.pos], chrom[idx.neg]))
}
if (!is.null(upstream)) {
# Upstream
upstream.starts <- c(txStart[idx.pos] - upstream, txEnd[idx.neg] + 1)
upstream.ends <- c(txStart[idx.pos] - 1, txEnd[idx.neg] + upstream)
# Trim out-of-range genomic coordinate
if (!is.null(genome)) {
idx.upstream.ends.out <- which(upstream.ends > chr.lengths)
upstream.starts[upstream.starts < 1] <- 1
upstream.ends[idx.upstream.ends.out] <-
chr.lengths[idx.upstream.ends.out]
}
} else {
upstream.starts <- upstream.ends <- NULL
}
if (!is.null(downstream)) {
# Downstream
downstream.starts <- c(txEnd[idx.pos] + 1, txStart[idx.neg] - downstream)
downstream.ends <- c(txEnd[idx.pos] + downstream, txStart[idx.neg] - 1)
# Trim out-of-range genomic coordinate
if (!is.null(genome)) {
idx.downstream.ends.out <- which(downstream.ends > chr.lengths)
downstream.starts[downstream.starts < 1] <- 1
downstream.ends[idx.downstream.ends.out] <-
chr.lengths[idx.downstream.ends.out]
}
} else {
downstream.starts <- downstream.ends <- NULL
}
# --------------------------------------------------------------------------
# Combine and organise to a table
if(!"exon" %in% element.names) exon.starts <- exon.ends <- NULL
# Combine all coordinate above in a vector. The position arrangement is
# important.
start <- c(exon.starts, intron.starts, cds.starts, UTR5.starts, UTR3.starts,
upstream.starts, downstream.starts)
end <- c(exon.ends, intron.ends, cds.ends, UTR5.ends, UTR3.ends,
upstream.ends, downstream.ends)
# Assign name, name2, chromosome, and strand to every coordinates above.
# A function to help assign. The arrangement must be exactly the same like
# above.
repeat.feature <- function(feature){
for.exons <- rep(feature, exonCount)
for.introns <- if ("intron" %in% element.names) {
rep(feature, exonCount-1)
} else NULL
for.cds <- if ("CDS" %in% element.names) {
c(for.exons[idx.partly.cds.up & !idx.partly.one],
for.exons[idx.completely.cds],
for.exons[idx.partly.cds.down & !idx.partly.one],
for.exons[idx.partly.one])
} else NULL
for.UTR5 <- if ("UTR" %in% element.names) {
c(for.exons[idx.completely.UTR.up & idx.exon.pos],
for.exons[idx.partly.UTR.up & idx.exon.pos],
for.exons[idx.completely.UTR.down & idx.exon.neg],
for.exons[idx.partly.UTR.down & idx.exon.neg])
} else NULL
for.UTR3 <- if ("UTR" %in% element.names) {
c(for.exons[idx.partly.UTR.down & idx.exon.pos],
for.exons[idx.completely.UTR.down & idx.exon.pos],
for.exons[idx.partly.UTR.up & idx.exon.neg],
for.exons[idx.completely.UTR.up & idx.exon.neg])
} else NULL
for.upstream <- if (!is.null(upstream)) {
c(feature[idx.pos], feature[idx.neg])
} else NULL
for.downstream <- if (!is.null(downstream)) {
c(feature[idx.pos], feature[idx.neg])
} else NULL
if (!"exon" %in% element.names) for.exons <- NULL
return(c(for.exons, for.introns, for.cds, for.UTR5, for.UTR3,
for.upstream, for.downstream))
}
name <- repeat.feature(name)
name2 <- repeat.feature(name2)
chromosome <- repeat.feature(chrom)
strand <- repeat.feature(strand)
# Add element: "exon", "intron", "CDS", "UTR5", "UTR3", "upstream",
# "downstream"
element <- c(rep("exon", length(exon.starts)),
rep("intron", length(intron.starts)),
rep("CDS", length(cds.ends)),
rep("5'-UTR", length(UTR5.ends)),
rep("3'-UTR", length(UTR3.ends)),
rep("upstream", length(upstream.starts)),
rep("downstream", length(downstream.starts)))
list(chromosome = chromosome, start = start, end = end, strand = strand,
name = name, name2 = name2, element = element)
}]
if (return.coor.obj) {
tmp.dir <- tempfile()
gene[, fwrite(.SD, paste0(tmp.dir, "/", chromosome, ".csv"),
showProgress = FALSE),
by = chromosome]
gene <- loadCoordinate(root.path = tmp.dir,
single.len = NULL,
is.strand.sensitive = TRUE,
merge.replicates = FALSE,
rm.dup = FALSE,
add.col.rep = FALSE,
is.kmer = FALSE,
k = NA,
ori.first.index = 1,
load.limit = 1)
}
# Convert back to 0-based indexing (-1 to every start coordinate)
genepred[, c("txStart", "cdsStart") := list(txStart-1, cdsStart-1)]
if (nrow(gene) == 0)
message(paste("Elements [", element, "] are not found in the annotation",
"table."))
return(gene)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.