#' LocToRefSeq
#'
#' Convert MSU identifiers to Rap-DB identifiers and map them to RefSeq mRNA
#' identifiers. Currently only conversions \strong{from} MSU ID are implemented.
#'
#' Conversion from MSU to Rap-DB identifiers uses the table of RAP-DB and MSU
#' IDs \href{http://rapdb.dna.affrc.go.jp/download/irgsp1.html}{available from
#' Rap-DB}. CSGNL symbols and gene names are retrieved from the
#' OryzabaseGeneListEn database, downloaded from
#' \href{http://www.shigen.nig.ac.jp/rice/oryzabase/download/gene}{Oryzabase}.
#' Conversion to refSeq mRNA IDs uses the Rice ID IRGSP Build5 database
#' downloaded from
#' \href{http://www.shigen.nig.ac.jp/rice/oryzabase/download/riceId}{Oryzabase}.
#' If biomaRt is installed, it can be used for secondary searches for unmatched
#' MSU IDs.
#'
#' @param LOCs Character vector of MSU locus identifiers
#' @param useBiomart Boolean (default \code{TRUE}). If biomaRt is installed, use it for
#' secondary searches for genes not matched in the Oryzabase data.
#' @return Returns a list of two \code{data.frame}s, matched for MSU loci that
#' were successfully matched to refSeq IDs and notMatched for ones that
#' weren't. The \code{data.frame}s each have four columns: tigrId (MSU ID),
#' refseq_mrna (refSeq mRNA ID), ensembl_gene_id (Rap-DB ID) and
#' symbols (CSGNL recommended gene symbol(s)).
#'
#' @export
#'
#' @import data.table
#'
#' @examples
#' LocToRefSeq(c('LOC_Os01g03340', 'LOC_Os01g09410', 'LOC_Os01g17396', 'LOC_Os01g18800', 'LOC_Os01g44069', 'LOC_Os01g45470', 'LOC_Os01g53880', 'LOC_Os01g64730', 'LOC_Os01g71820', 'LOC_Os02g07110', 'LOC_Os02g26680', 'LOC_Os02g26700', 'LOC_Os02g41904', 'LOC_Os02g42880', 'LOC_Os02g44630', 'LOC_Os03g15530', 'LOC_Os03g22270', 'LOC_Os03g61160', 'LOC_Os04g58710', 'LOC_Os06g06750', 'LOC_Os07g32170', 'LOC_Os08g24930', 'LOC_Os08g31980', 'LOC_Os09g07154', 'LOC_Os09g25330', 'LOC_Os09g31438', 'LOC_Os10g07229', 'LOC_Os10g18870', 'LOC_Os10g26340', 'LOC_Os10g42210', 'LOC_Os10g42220'))
#' LocToRefSeq(c('LOC_Os03g03034', 'LOC_Os06g36560', 'LOC_Os01g50910'))
LocToRefSeq <- function(LOCs, useBiomart = TRUE) {
# search RapMsuRefSeq by LOC
message(paste(dateF(), "Searching Oryzabase database for", length(LOCs),
"record(s)"))
# fast data.table search: sapply over LOCs with grep, unlist results and
# select columns to output
matchedRecords <- RapMsuRefSeq[
unique(unlist(sapply(toupper(LOCs), grep, x = toupper(tigrId),
fixed = TRUE))),
list(rapLocus, tigrId, refseqRnaNucleotideAccessionNo)]
# remove trailing transcript numbers from tigrId and refSeq ID
matchedRecords[, tigrId := gsub("\\..*$", "", tigrId)]
matchedRecords[, refseqRnaNucleotideAccessionNo := gsub(
"\\..*$", "", refseqRnaNucleotideAccessionNo)]
data.table::setkey(matchedRecords, tigrId, rapLocus)
# search again in RAPMSU for unmatched LOC IDs
unmatchedLocs <- matchedRecords[rapLocus == "", tigrId]
if (length(unmatchedLocs) > 0) {
message(paste(dateF(), "Searching RAP-DB ID database for", length(unmatchedLocs),
"gene(s) not matched in Oryzabase"))
rematchedLocs <- RAPMSU[MSU_ID %in% unmatchedLocs]
# remove trailng Tx numbers, remove unmatched locs
rematchedLocs[, tigrId := gsub("\\..*$", "", MSU_ID)]
rematchedLocs[, rapLocus := toupper(Rap_ID)]
rematchedLocs[, c("Rap_ID", "MSU_ID") := NULL]
rematchedLocs <- rematchedLocs[!rapLocus == "NONE"]
data.table::setkey(rematchedLocs, tigrId)
# re-join tables, remove NAs and duplicates
rematchedRecords <- rbind(matchedRecords, rematchedLocs, fill = TRUE)
rematchedRecords[is.na(refseqRnaNucleotideAccessionNo),
refseqRnaNucleotideAccessionNo := ""]
rematchedRecords <- unique(rematchedRecords,
by = c("rapLocus", "tigrId",
"refseqRnaNucleotideAccessionNo"))
} else {
rematchedRecords <- unique(matchedRecords,
by = c("rapLocus", "tigrId",
"refseqRnaNucleotideAccessionNo"))
}
# uppercase rapLocus and setkey for match with ensembl during biomaRt search
rematchedRecords[, rapLocus := toupper(rapLocus)]
data.table::setkey(rematchedRecords, rapLocus)
unmatchedRecords <- rematchedRecords[refseqRnaNucleotideAccessionNo == ""]
# only search rapLocus ids that really don't already have a refSeqID
matched.so.far <- rematchedRecords[!refseqRnaNucleotideAccessionNo == "",
rapLocus]
unmatchedIds <- unmatchedRecords[rapLocus != "" &
!rapLocus %in% matched.so.far,
rapLocus]
# biomaRt search for unmatched records
if (useBiomart & length(unmatchedIds > 0)) {
if (!requireNamespace("biomaRt", quietly = TRUE)) {
stop("biomaRt is not installed. Either install it or use useBiomart == FALSE")
}
message("Preparing biomaRt object")
ensembl <- biomaRt::useMart(host = "plants.ensembl.org",
biomart = "plants_mart",
dataset = "osativa_eg_gene")
# search BioMart
message(paste(dateF(), "Running biomaRt query for", length(unmatchedIds),
"record(s) with no RefSeqID in Oryzabase"))
refSeq <- data.table(biomaRt::getBM(
attributes = c("ensembl_gene_id", "refseq_mrna"),
filters = "ensembl_gene_id",
values = unique(unmatchedIds),
mart = ensembl))
# only process records that returned a hit
bmMatch <- refSeq[!refseq_mrna == "", .(
refseqRnaNucleotideAccessionNo = paste(refseq_mrna, collapse = "/")),
by = ensembl_gene_id]
setkey(bmMatch, ensembl_gene_id, refseqRnaNucleotideAccessionNo)
# join BM hits to rematchedRecords and remove NAs
setkey(rematchedRecords, rapLocus, refseqRnaNucleotideAccessionNo)
joinedResults <- bmMatch[rematchedRecords]
joinedResults[is.na(refseqRnaNucleotideAccessionNo),
refseqRnaNucleotideAccessionNo := ""]
} else {
joinedResults <- rematchedRecords
joinedResults$refseq_mrna <- ""
data.table::setnames(joinedResults, "rapLocus", "ensembl_gene_id")
}
setnames(joinedResults, "refseqRnaNucleotideAccessionNo", "refseq_mrna")
message(paste(dateF(), "Merging results"))
# fast join by tigrId to concatenate duplicates (tigrId with >1
# rapLocus or refSeqID)
LocToRefSeq.table <- joinedResults[, .(
refseq_mrna = paste(unique(refseq_mrna[!refseq_mrna == ""]), collapse = "/"),
ensembl_gene_id = paste(unique(ensembl_gene_id[!ensembl_gene_id == ""]),
collapse = "/")),
by = tigrId]
# add short gene names
message(paste(dateF(), "Adding short gene names from Oryzabase"))
LocToRefSeq.table[, symbols := LocToGeneName(tigrId)$symbols,
by = tigrId]
LocToRefSeq.table[is.na(symbols), symbols := '']
setkey(LocToRefSeq.table, tigrId)
# split into found / not found and return
matched <- LocToRefSeq.table[!refseq_mrna == ""]
notMatched <- LocToRefSeq.table[refseq_mrna == ""]
message(paste(dateF(), "Done. Found refSeqIDs for", dim(matched)[1],
"records"))
list(matched = matched, notMatched = notMatched)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.