#' @include proteinToX.R
#' @title Map transcript-relative coordinates to amino acid residues of the
#' encoded protein
#'
#' @description
#'
#' `transcriptToProtein` maps within-transcript coordinates to the corresponding
#' coordinates within the encoded protein sequence. The provided coordinates
#' have to be within the coding region of the transcript (excluding the stop
#' codon) but are supposed to be relative to the first nucleotide of the
#' transcript (which includes the 5' UTR). Positions relative to the CDS of a
#' transcript (e.g. /PKP2 c.1643delg/) have to be first converted to
#' transcript-relative coordinates. This can be done with the
#' [cdsToTranscript()] function.
#'
#' @details
#'
#' Transcript-relative coordinates are mapped to the amino acid residues they
#' encode. As an example, positions within the transcript that correspond to
#' nucleotides 1 to 3 in the CDS are mapped to the first position in the
#' protein sequence (see examples for more details).
#'
#' @param x `IRanges` with the coordinates within the transcript. Coordinates
#' are counted from the start of the transcript (including the 5' UTR). The
#' Ensembl IDs of the corresponding transcripts have to be provided either
#' as `names` of the `IRanges`, or in one of its metadata columns.
#'
#' @param db `EnsDb` object.
#'
#' @param id `character(1)` specifying where the transcript identifier can be
#' found. Has to be either `"name"` or one of `colnames(mcols(prng))`.
#'
#' @param proteins `DFrame` object generated by [proteins()].
#'
#' @param exons `CompressedGRangesList` object generated by [exonsBy()] where
#' by = 'tx'.
#'
#' @param transcripts `GRanges` object generated by [transcripts()].
#'
#' @return
#'
#' `IRanges` with the same length (and order) than the input `IRanges`
#' `x`. Each element in `IRanges` provides the coordinates within the
#' protein sequence, names being the (Ensembl) IDs of the protein. The
#' original transcript ID and the transcript-relative coordinates are provided
#' as metadata columns. Metadata columns `"cds_ok"` indicates whether the
#' length of the transcript's CDS matches the length of the encoded protein.
#' `IRanges` with a start coordinate of `-1` is returned for transcript
#' coordinates that can not be mapped to protein-relative coordinates
#' (either no transcript was found for the provided ID, the transcript
#' does not encode a protein or the provided coordinates are not within
#' the coding region of the transcript).
#'
#' @family coordinate mapping functions
#'
#' @seealso [cdsToTranscript()] and [transcriptToCds()] for conversion between
#' CDS- and transcript-relative coordinates.
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Restrict all further queries to chromosome x to speed up the examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## Define an IRanges with the positions of the first 2 nucleotides of the
#' ## coding region for the transcript ENST00000381578
#' txpos <- IRanges(start = 692, width = 2, names = "ENST00000381578")
#'
#' ## Map these to the corresponding residues in the protein sequence
#' ## The protein-relative coordinates are returned as an `IRanges` object,
#' ## with the original, transcript-relative coordinates provided in metadata
#' ## columns tx_start and tx_end
#' transcriptToProtein(txpos, edbx)
#'
#' ## We can also map multiple ranges. Note that for any of the 3 nucleotides
#' ## encoding the same amino acid the position of this residue in the
#' ## protein sequence is returned. To illustrate this we map below each of the
#' ## first 4 nucleotides of the CDS to the corresponding position within the
#' ## protein.
#' txpos <- IRanges(start = c(692, 693, 694, 695),
#' width = rep(1, 4), names = rep("ENST00000381578", 4))
#' transcriptToProtein(txpos, edbx)
#'
#' ## If the mapping fails, an IRanges with negative start position is returned.
#' ## Mapping can fail (as below) because the ID is not known.
#' transcriptToProtein(IRanges(1, 1, names = "unknown"), edbx)
#'
#' ## Or because the provided coordinates are not within the CDS
#' transcriptToProtein(IRanges(1, 1, names = "ENST00000381578"), edbx)
#'
#' ## Meanwhile, this function can be called in parallel processes if you preload
#' ## the protein, exons and transcripts database.
#'
#' proteins <- proteins(edbx)
#' exons <- exonsBy(edbx)
#' transcripts <- transcripts(edbx)
#'
#' txpos <- IRanges(start = c(692, 693, 694, 695),
#' width = rep(1, 4),
#' names = c(rep("ENST00000381578", 2), rep("ENST00000486554", 2)),
#' info='test')
#'
#' transcriptToProtein(txpos,edbx,proteins = proteins,exons = exons,transcripts = transcripts)
transcriptToProtein <- function(x, db, id = "name", proteins = NA,exons = NA, transcripts = NA){
if (missing(x) || !is(x, "IRanges"))
stop("Argument 'x' is required and has to be an 'IRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
if (is(id, "DFrame") & any(any(is.na(transcripts))))
stop("Your input parameters are not matching; please assign them accordingly.")
preload_ranges_missing <- which(c(
identical(proteins,NA),
identical(exons,NA),
identical(transcripts,NA)
))
if (identical(integer(0), preload_ranges_missing)){
if (!is(proteins, "DFrame"))
stop("Argument 'proteins' has to be an 'DFrame' object")
if (!"tx_id" %in% colnames(proteins))
stop("Argument 'proteins' has to have 'tx_id'")
.txs_to_proteins(x, db = db, id = id, proteins, exons, transcripts)
} else if (length(preload_ranges_missing) == 3){
.txs_to_proteins(x, db = db, id = id)
} else {
stop(paste(
"Argument",
c("'proteins'", "'exons'", "'transcripts'")[preload_ranges_missing],
'missing.'
, sep = " "
))
}
}
#' This version performs the mapping for each IRange separately! Results are
#' the same as in .txs_to_protins. While the code is easier to read and to
#' debug, it is also considerably slower.
#'
#' @noRd
.tx_to_protein <- function(x, db, id = "name") {
if (length(x) > 1)
return(unlist(IRangesList(
lapply(split(x, 1:length(x)), FUN = .tx_to_protein,
db = db, id = id)), use.names = FALSE))
id <- match.arg(id, c("name", colnames(mcols(x))))
if (id == "name")
ids <- names(x)
else ids <- mcols(x)[, id]
if (any(is.null(ids)))
stop("All IDs are NULL")
empty_ranges <- IRanges(start = -1, width = 1)
mcols(empty_ranges) <- DataFrame(tx_id = ids,
tx_start = start(x),
tx_end = end(x),
cds_ok = NA)
tx_lens <- .transcriptLengths(db,
filter = TxIdFilter(ids),
with.cds_len = TRUE,
with.utr5_len = TRUE,
with.utr3_len = TRUE)
## Does the tx exist?
if (nrow(tx_lens) == 0) {
warning("Transcript '", ids, "' can not be found", call. = FALSE)
return(empty_ranges)
}
## Is it a protein coding tx?
if (is.na(tx_lens$cds_len)) {
warning("Transcript '", ids, "' does not encode a protein",
call. = FALSE)
return(empty_ranges)
}
if (is.na(tx_lens$utr5_len))
tx_lens$utr5_len <- 0
if (is.na(tx_lens$utr3_len))
tx_lens$utr3_len <- 0
## Are the coordinates within the CDS? Exclude the stop codon!
if (start(x) <= tx_lens$utr5_len |
end(x) > (tx_lens$utr5_len + tx_lens$cds_len - 3)) {
warning("The provided coordinates for '", ids,
"' are not within the coding region", call. = FALSE)
return(empty_ranges)
}
## Define the coordinates
end(empty_ranges) <- ceiling((end(x) - tx_lens$utr5_len) / 3)
start(empty_ranges) <- ceiling((start(x) - tx_lens$utr5_len) / 3)
## Now check if the CDS length matches the protein sequence length.
prt <- proteins(db, filter = TxIdFilter(ids), columns = "protein_sequence")
names(empty_ranges) <- prt$protein_id
cds_ok <- nchar(prt$protein_sequence) * 3 == (tx_lens$cds_len - 3)
if (!cds_ok)
warning("The CDS of '", ids, "' does not match the length of the ",
"encoded protein. Returned protein coordinates might not be",
" correct", call. = FALSE)
mcols(empty_ranges)$cds_ok <- cds_ok
empty_ranges
}
#' @details
#'
#' Transcript-relative coordinates are mapped to the amino acid residues they
#' encode. As an example, positions within the transcript that corrspond to
#' nucleotides 1 to 3 in the CDS are mapped to the first position in the
#' protein sequence (see examples for more details).
#' (e.g. a transcript relative position corresponding to the first nucleotide
#' of the transcript's CDS is mapped to the first position in the protein
#' sequence, same as if the transcri(no matter which transcript relative
#' position corresponding to the first,
#' second or third nucleotide of the CDS are provided, all (any) of
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @noRd
.txs_to_proteins <- function(x, db, id = "name",
proteins = NA,
exons = NA, transcripts = NA) {
## Fetch all in one.
id <- match.arg(id, c("name", colnames(mcols(x))))
if (id == "name")
ids <- names(x)
else ids <- mcols(x)[, id]
if (any(is.null(ids)))
stop("One or more of the provided IDs are NULL", call. = FALSE)
names(x) <- ids
## Define internal IDs - we'll need them to return the result in the
## correct order.
internal_ids <- paste0(names(x), ":", start(x), ":", end(x))
## Preserve the metadata if any
if(!is.null(mcols(x)))
raw_x_metadata <- mcols(x)
tx_lens <- .transcriptLengths(db,
filter = TxIdFilter(unique(ids)),
with.cds_len = TRUE,
with.utr5_len = TRUE,
with.utr3_len = TRUE,
exons = exons, ## Pass the preloaded
transcripts = transcripts)
## Process transcripts not found
not_found <- !(ids %in% tx_lens$tx_id)
fail_res <- IRanges()
if (any(not_found)) {
## issue #69 does not apply here, even if redundant ids are used,
## all of them are considered.
warning("Transcript(s) ", paste0("'", ids[not_found], "'",
collapse = ", "),
" could not be found", call. = FALSE)
fail_res <- IRanges(start = rep(-1, sum(not_found)),
width = rep(1, sum(not_found)))
## Add metadata columnds.
mcols(fail_res) <- DataFrame(tx_id = ids[not_found],
tx_start = start(x[not_found]),
tx_end = end(x[not_found]),
cds_ok = NA)
if (all(not_found))
return(fail_res[match(internal_ids,
paste0(mcols(fail_res)$tx_id, ":",
mcols(fail_res)$tx_start, ":",
mcols(fail_res)$tx_end))])
}
## Remove non-coding transcripts
not_coding <- is.na(tx_lens$cds_len)
if (any(not_coding)) {
not_coding_id <- tx_lens$tx_id[not_coding]
warning("Transcript(s) ", paste0("'", not_coding_id, "'",
collapse = ", "),
" do/does not encode a protein", call. = FALSE)
## Subset x to the non-coding ones (issue #69)
x_sub <- x[ids %in% not_coding_id]
not_coding_res <- IRanges(start = rep(-1, length(x_sub)),
width = rep(1, length(x_sub)))
## Add metadata columnds.
mcols(not_coding_res) <- DataFrame(tx_id = names(x_sub),
tx_start = start(x_sub),
tx_end = end(x_sub),
cds_ok = NA)
fail_res <- c(fail_res, not_coding_res)
if (all(not_coding))
return(fail_res[match(internal_ids,
paste0(mcols(fail_res)$tx_id, ":",
mcols(fail_res)$tx_start, ":",
mcols(fail_res)$tx_end))])
tx_lens <- tx_lens[!not_coding, , drop = FALSE]
}
## Ensure we have no missing 3' or 5' UTRs
tx_lens$utr3_len[is.na(tx_lens$utr3_len)] <- 0
tx_lens$utr5_len[is.na(tx_lens$utr5_len)] <- 0
x <- x[ids %in% tx_lens$tx_id]
id_idx <- match(names(x), tx_lens$tx_id)
## Are the coordinates within the CDS? Exclude the stop codon!
outside_cds <- start(x) <= tx_lens$utr5_len[id_idx] |
end(x) > (tx_lens$utr5_len[id_idx] + tx_lens$cds_len[id_idx] - 3)
if (any(outside_cds)) {
## Also save against issue #69.
outside_res <- IRanges(start = rep(-1, sum(outside_cds)),
width = rep(1, sum(outside_cds)))
## Add metadata columnds.
mcols(outside_res) <- DataFrame(tx_id = names(x)[outside_cds],
tx_start = start(x[outside_cds]),
tx_end = end(x[outside_cds]),
cds_ok = NA)
fail_res <- c(fail_res, outside_res)
warning("Provided coordinates for ",
paste0("'", names(x)[outside_cds], "'", collapse = ", "),
" are not within the coding region", call. = FALSE)
if (all(outside_cds))
return(fail_res[match(internal_ids,
paste0(mcols(fail_res)$tx_id, ":",
mcols(fail_res)$tx_start, ":",
mcols(fail_res)$tx_end))])
x <- x[!outside_cds]
}
## Now check if the CDS length matches the protein sequence length.
## check if preloaded data is available.
if(!identical(proteins,NA)){
tryCatch({
prt <- proteins[proteins$tx_id %in% unique(names(x)),]
}, error = function(e){
prt <<- DataFrame()
})
} else {
prt <- proteins(db, filter = TxIdFilter(names(x)),
columns = "protein_sequence")
}
id_idx_prt <- match(names(x), prt$tx_id)
id_idx <- match(names(x), tx_lens$tx_id)
res <- IRanges(start = ceiling((start(x) - tx_lens$utr5_len[id_idx]) / 3),
end = ceiling((end(x) - tx_lens$utr5_len[id_idx]) / 3),
names = prt$protein_id[id_idx_prt])
cds_ok <- nchar(prt$protein_sequence[id_idx_prt]) * 3 ==
(tx_lens$cds_len[id_idx] - 3)
if (any(!cds_ok))
warning("The CDS of ", paste0("'", unique(names(x)[!cds_ok]), "'",
collapse = ", "),
" does not match the length of the encoded protein. Returned",
" protein coordinates for this/these transcript(s) might not",
" be correct", call. = FALSE)
mcols(res) <- DataFrame(tx_id = names(x), tx_start = start(x),
tx_end = end(x), cds_ok = cds_ok)
res <- c(fail_res, res)
res <- res[match(internal_ids, paste0(mcols(res)$tx_id, ":",
mcols(res)$tx_start, ":",
mcols(res)$tx_end))]
## Add the preserved metadata is any
if(!is.null(mcols(x)))
mcols(res) <- DataFrame(mcols(res), raw_x_metadata)
res
}
#' @title Map transcript-relative coordinates to genomic coordinates
#'
#' @description
#'
#' `transcriptToGenome` maps transcript-relative coordinates to genomic
#' coordinates. Provided coordinates are expected to be relative to the first
#' nucleotide of the **transcript**, not the **CDS**. CDS-relative coordinates
#' have to be converted to transcript-relative positions first with the
#' [cdsToTranscript()] function.
#'
#' @param x `IRanges` with the coordinates within the transcript. Coordinates
#' are counted from the start of the transcript (including the 5' UTR). The
#' Ensembl IDs of the corresponding transcripts have to be provided either
#' as `names` of the `IRanges`, or in one of its metadata columns.
#'
#' @param db `EnsDb` object or pre-loaded exons 'CompressedGRangesList' object
#' using exonsBy().
#'
#' @param id `character(1)` specifying where the transcript identifier can be
#' found. Has to be either `"name"` or one of `colnames(mcols(prng))`.
#'
#' @return
#'
#' `GRangesList` with the same length (and order) than the input `IRanges`
#' `x`. Each `GRanges` in the `GRangesList` provides the genomic coordinates
#' corresponding to the provided within-transcript coordinates. The
#' original transcript ID and the transcript-relative coordinates are provided
#' as metadata columns as well as the ID of the individual exon(s). An empty
#' `GRanges` is returned for transcripts that can not be found in the database.
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @family coordinate mapping functions
#'
#' @seealso [cdsToTranscript()] and [transcriptToCds()] for the mapping between
#' CDS- and transcript-relative coordinates.
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Restrict all further queries to chromosome x to speed up the examples
#' edbx <- filter(EnsDb.Hsapiens.v86, filter = ~ seq_name == "X")
#'
#' ## Below we map positions 1 to 5 within the transcript ENST00000381578 to
#' ## the genome. The ID of the transcript has to be provided either as names
#' ## or in one of the IRanges' metadata columns
#' txpos <- IRanges(start = 1, end = 5, names = "ENST00000381578")
#'
#' transcriptToGenome(txpos, edbx)
#' ## The object returns a GRangesList with the genomic coordinates, in this
#' ## example the coordinates are within the same exon and map to a single
#' ## genomic region.
#'
#' ## Next we map nucleotides 501 to 505 of ENST00000486554 to the genome
#' txpos <- IRanges(start = 501, end = 505, names = "ENST00000486554")
#'
#' transcriptToGenome(txpos, edbx)
#' ## The positions within the transcript are located within two of the
#' ## transcripts exons and thus a `GRanges` of length 2 is returned.
#'
#' ## Next we map multiple regions, two within the same transcript and one
#' ## in a transcript that does not exist.
#' txpos <- IRanges(start = c(501, 1, 5), end = c(505, 10, 6),
#' names = c("ENST00000486554", "ENST00000486554", "some"))
#'
#' res <- transcriptToGenome(txpos, edbx)
#'
#' ## The length of the result GRangesList has the same length than the
#' ## input IRanges
#' length(res)
#'
#' ## The result for the last region is an empty GRanges, because the
#' ## transcript could not be found in the database
#' res[[3]]
#'
#' res
#' ## If you are tring to map a huge list of transcript-relative coordinates
#' ## to genomic level, you shall use pre-loaded exons GRangesList to replace
#' ## the SQLite db edbx
#'
#' exons <- exonsBy(EnsDb.Hsapiens.v86)
#'
#' ## Below is just a lazy demo of querying 10^4 transcript-relative
#' ## coordinates without any pre-splitting
#' library(parallel)
#'
#' txpos <- IRanges(
#' start = rep(1,10),
#' end = rep(30,10),
#' names = c(rep('ENST00000486554',9),'some'),
#' note = rep('something',10))
#'
#' ## only run in Linux ##
#' # res_temp <- mclapply(1:10, function(ind){
#' # transcriptToGenome(txpos[ind], exons)
#' # }, mc.preschedule = TRUE, mc.cores = detectCores() - 1)
#'
#' # res <- do.call(c,res_temp)
#' cl <- makeCluster(detectCores() - 1)
#' clusterExport(cl,c('transcriptToGenome','txpos','exons'))
#' res <- parLapply(cl,1:10,function(ind){
#' transcriptToGenome(txpos[ind], exons)
#' })
#' stopCluster(cl)
transcriptToGenome <- function(x, db, id = "name") {
if (missing(x) || !is(x, "IRanges"))
stop("Argument 'x' is required and has to be an 'IRanges' object")
if (missing(db) || !(is(db, "EnsDb") || is(db,"CompressedGRangesList")))
stop("Argument 'db' is required and has to be an 'EnsDb' object or a 'CompressedGRangesList' object") # load the exons priorly to allow spontaneous query since RSQLite does not support
res <- .tx_to_genome(x, db, id = id)
not_found <- sum(lengths(res) == 0)
if (not_found > 0)
warning(not_found, " transcript(s) could either not be found in the ",
"database or the specified range is outside the transcript's ",
"sequence")
res
}
.tx_to_genome <- function(x, db, id = "name") {
id <- match.arg(id, c("name", colnames(mcols(x))))
if (id == "name")
ids <- names(x)
else ids <- mcols(x)[, id]
if (any(is.null(ids)))
stop("One or more of the provided IDs are NULL", call. = FALSE)
names(x) <- ids
if(is(db, "EnsDb")) {
exns <- exonsBy(db, filter = TxIdFilter(unique(ids)))
} else {
tryCatch({
exns <- db[names(db) %in% unique(ids)]
}, error = function(e){
exns <<- GRangesList()
})
}
## Loop over x
res <- lapply(split(x, f = 1:length(x)), function(z) {
if (any(names(exns) == names(z))) {
gen_map <- .to_genome(exns[[names(z)]], z)
if (length(gen_map)) {
mcols(gen_map) <- DataFrame(mcols(gen_map), tx_start = start(z),
tx_end = end(z)) # preserve the metadata from the targeted tx
if(!is.null(mcols(z))) mcols(gen_map) <- DataFrame(mcols(gen_map), mcols(z))
gen_map[order(gen_map$exon_rank)]
} else GRanges()
} else GRanges()
})
names(res) <- ids
as(res, "GRangesList")
}
#' @title Map transcript-relative coordinates to positions within the CDS
#'
#' @description
#'
#' Converts transcript-relative coordinates to positions within the CDS (if the
#' transcript encodes a protein).
#'
#' @param x `IRanges` with the coordinates within the transcript. Coordinates
#' are expected to be relative to the transcription start (the first
#' nucleotide of the transcript). The Ensembl IDs of the corresponding
#' transcripts have to be provided either as `names` of the `IRanges`, or
#' in one of its metadata columns.
#'
#' @inheritParams transcriptToProtein
#'
#' @return
#'
#' `IRanges` with the same length (and order) than the input `IRanges`
#' `x`. Each element in `IRanges` provides the coordinates within the
#' transcripts CDS. The transcript-relative coordinates are provided
#' as metadata columns.
#' `IRanges` with a start coordinate of `-1` is returned for transcripts
#' that are not known in the database, non-coding transcripts or if the
#' provided start and/or end coordinates are not within the coding region.
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Defining transcript-relative coordinates for 4 transcripts of the gene
#' ## BCL2
#' txcoords <- IRanges(start = c(1463, 3, 143, 147), width = 1,
#' names = c("ENST00000398117", "ENST00000333681",
#' "ENST00000590515", "ENST00000589955"))
#'
#' ## Map the coordinates.
#' transcriptToCds(txcoords, EnsDb.Hsapiens.v86)
#'
#' ## ENST00000590515 does not encode a protein and thus -1 is returned
#' ## The coordinates within ENST00000333681 are outside the CDS and thus also
#' ## -1 is reported.
#'
#' ## Meanwhile, this function can be called in parallel processes if you preload
#' ## the exons and transcripts database.
#'
#' exons <- exonsBy(EnsDb.Hsapiens.v86)
#' transcripts <- transcripts(EnsDb.Hsapiens.v86)
#'
#' transcriptToCds(txcoords, EnsDb.Hsapiens.v86, exons = exons,transcripts = transcripts)
transcriptToCds <- function(x, db, id = "name",
exons = NA, transcripts = NA) {
if (missing(x) || !is(x, "IRanges"))
stop("Argument 'x' is required and has to be an 'IRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
id <- match.arg(id, c("name", colnames(mcols(x))))
if (id == "name")
ids <- names(x)
else ids <- mcols(x)[, id]
tx_lens <- .transcriptLengths(db, with.utr5_len = TRUE,
with.cds_len = TRUE,
filter = ~ tx_id %in% ids,
exons = exons, ## Pass the preloaded
transcripts = transcripts)
mcols(x)$tx_start <- start(x)
mcols(x)$tx_end <- end(x)
start(x) <- -1
end(x) <- -1
if (nrow(tx_lens)) {
## IDs not found
nf <- !(ids %in% tx_lens$tx_id)
if (any(nf))
warning("Could not find ", sum(nf), " of ", length(ids),
" transcript(s): ", .ids_message(ids[nf]))
new_start <- mcols(x)$tx_start - tx_lens[ids, "utr5_len"]
new_end <- mcols(x)$tx_end - tx_lens[ids, "utr5_len"]
## non-coding
isna <- !nf & is.na(new_start)
if (any(isna))
warning(sum(isna), " of ", length(ids), " transcript(s) are non-",
"coding: ", .ids_message(ids[isna]))
## coordinate not in CDS?
notcds <- !nf & !isna & (new_start <= 0 | new_end <= 0)
if (any(notcds))
warning("Coordinates for ", sum(notcds), " of ", length(ids),
" transcript(s) are not in the coding region: ",
.ids_message(ids[notcds]))
endout <- !nf & !isna & !notcds & new_end > tx_lens[ids, "cds_len"]
if (any(endout))
warning("End coordinate for ", sum(endout), " of ", length(ids),
" transcript(s) are outside of the coding region: ",
.ids_message(ids[endout]))
## Now update coordinates
end(x[!nf &!isna & !notcds & !endout]) <-
new_end[!nf &!isna & !notcds & !endout]
start(x[!nf & !isna & !notcds & !endout]) <-
new_start[!nf & !isna & !notcds & !endout]
} else
warning("Could not find any of the provided transcript IDs")
x
}
#' @title Map positions within the CDS to coordinates relative to the start of
#' the transcript
#'
#' @description
#'
#' Converts CDS-relative coordinates to positions within the transcript, i.e.
#' relative to the start of the transcript and hence including its 5' UTR.
#'
#' @param x `IRanges` with the coordinates within the CDS. Coordinates
#' are expected to be relative to the transcription start (the first
#' nucleotide of the transcript). The Ensembl IDs of the corresponding
#' transcripts have to be provided either as `names` of the `IRanges`, or
#' in one of its metadata columns.
#'
#' @inheritParams transcriptToProtein
#'
#' @return
#'
#' `IRanges` with the same length (and order) than the input `IRanges`
#' `x`. Each element in `IRanges` provides the coordinates within the
#' transcripts CDS. The transcript-relative coordinates are provided
#' as metadata columns.
#' `IRanges` with a start coordinate of `-1` is returned for transcripts
#' that are not known in the database, non-coding transcripts or if the
#' provided start and/or end coordinates are not within the coding region.
#'
#' @family coordinate mapping functions
#'
#' @author Johannes Rainer
#'
#' @md
#'
#' @examples
#'
#' library(EnsDb.Hsapiens.v86)
#' ## Defining transcript-relative coordinates for 4 transcripts of the gene
#' ## BCL2
#' txcoords <- IRanges(start = c(4, 3, 143, 147), width = 1,
#' names = c("ENST00000398117", "ENST00000333681",
#' "ENST00000590515", "ENST00000589955"))
#'
#' cdsToTranscript(txcoords, EnsDb.Hsapiens.v86)
#'
#' ## Next we map the coordinate for variants within the gene PKP2 to the
#' ## genome. The variants is PKP2 c.1643DelG and the provided
#' ## position is thus relative to the CDS. We have to convert the
#' ## position first to transcript-relative coordinates.
#' pkp2 <- IRanges(start = 1643, width = 1, name = "ENST00000070846")
#'
#' ## Map the coordinates by first converting the CDS- to transcript-relative
#' ## coordinates
#' transcriptToGenome(cdsToTranscript(pkp2, EnsDb.Hsapiens.v86),
#' EnsDb.Hsapiens.v86)
#'
#' ## Meanwhile, this function can be called in parallel processes if you preload
#' ## the exons and transcripts database.
#'
#' exons <- exonsBy(EnsDb.Hsapiens.v86)
#' transcripts <- transcripts(EnsDb.Hsapiens.v86)
#'
#' cdsToTranscript(txcoords, EnsDb.Hsapiens.v86, exons = exons,transcripts = transcripts)
cdsToTranscript <- function(x, db, id = "name",
exons = NA, transcripts = NA) {
if (missing(x) || !is(x, "IRanges"))
stop("Argument 'x' is required and has to be an 'IRanges' object")
if (missing(db) || !is(db, "EnsDb"))
stop("Argument 'db' is required and has to be an 'EnsDb' object")
id <- match.arg(id, c("name", colnames(mcols(x))))
if (id == "name")
ids <- names(x)
else ids <- mcols(x)[, id]
tx_lens <- .transcriptLengths(db, with.utr5_len = TRUE,
with.cds_len = TRUE,
filter = ~ tx_id %in% ids,
exons = exons, ## Pass the preloaded
transcripts = transcripts)
mcols(x)$cds_start <- start(x)
mcols(x)$cds_end <- end(x)
start(x) <- -1
end(x) <- -1
if (nrow(tx_lens)) {
## IDs not found
nf <- !(ids %in% tx_lens$tx_id)
if (any(nf))
warning("Could not find ", sum(nf), " of ", length(ids),
" transcript(s): ", .ids_message(ids[nf]))
new_start <- mcols(x)$cds_start + tx_lens[ids, "utr5_len"]
new_end <- mcols(x)$cds_end + tx_lens[ids, "utr5_len"]
isna <- !nf & is.na(new_start)
if (any(isna))
warning(sum(isna), " of ", length(ids), " transcript(s) are non-",
"coding: ", .ids_message(ids[isna]))
## coordinate not in CDS?
notcds <- !nf & !isna &
(mcols(x)$cds_start > tx_lens[ids, "cds_len"] |
mcols(x)$cds_end >= tx_lens[ids, "cds_len"])
if (any(notcds))
warning("Coordinates for ", sum(notcds), " of ", length(ids),
" transcript(s) are not in the coding region: ",
.ids_message(ids[notcds]))
## Now update coordinates
end(x[!nf & !isna & !notcds]) <- new_end[!nf & !isna & !notcds]
start(x[!nf & !isna & !notcds]) <- new_start[!nf & !isna & !notcds]
} else
warning("Could not find any of the provided transcript IDs")
x
}
.ids_message <- function(x) {
mess <- paste(x[1:min(3, length(x))], collapse = ", ")
if (length(x) > 3) {
mess <- paste0(mess, " ... (", length(x) - 3, " more)")
}
mess
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.