R/makeEpiTxDbFromRMBase.R

Defines functions listAvailableModFromRMBase .get_RMBase_mod listAvailableGenomesFromRMBase .listAvailableGenomesFromRMBase .get_RMBase_genomes .get_RMBase_files listAvailableOrganismsFromRMBase makeEpiTxDbFromRMBaseFiles .add_sequence_check_to_metadata .remove_incompatible_mod_by_seq getRMBaseDataAsGRanges .check_tx_sequences .gene_names_to_chromosome .get_RMBase_data .extract_GRanges_from_RMBase .fix_non_standard_mod_types .simplify_chromosome_identifiers .read_RMBase_file .get_RMBase_colnames .get_RMBase_header makeEpiTxDbFromRMBase downloadRMBaseFiles .download_RMBase_files .get_RMBase_files_available .check_RMBase_files_available .get_RMBase_rnames

Documented in downloadRMBaseFiles getRMBaseDataAsGRanges listAvailableGenomesFromRMBase listAvailableModFromRMBase listAvailableOrganismsFromRMBase makeEpiTxDbFromRMBase makeEpiTxDbFromRMBaseFiles

#' @include EpiTxDb-class.R
#' @include makeEpiTxDb.R
#' @include shiftGenomicToTranscript.R
NULL

#' @name makeEpiTxDbFromRMBase
#'
#' @title Create a \code{EpiTxDb} object from RMBase v2.0 online resources
#'
#' @description
#' \code{makeEpiTxDbFromRMBase} will make use of the RMBase v2.0 online
#' resources.
#'
#' @param organism A \code{character} value, which must match an organism
#'   descriptor on the RMBase download website.
#' @param genome A \code{character} value, which must match a genome
#'   descriptor on the RMBase download website.
#' @param modtype A \code{character} value, which must match one or more
#'   modification descriptors on the RMBase download website.
#' @param files From \code{organism}, \code{genome} and \code{modtype} the
#'   available files will be downloaded using the
#'   \code{\link[BiocFileCache:BiocFileCache-class]{BiocFileCache}} interface
#'   and passed on to \code{makeEpiTxDbFromRMBaseFiles}. However, individual
#'   files can be provided as well.
#' @param tx A \code{\link[GenomicRanges:GRangesList-class]{GRangesList}} object
#'   which will be used to shift the genomic coordinates to transcript
#'   coordinates. This is optional, but highly recommended. (default:
#'   \code{tx = NULL}).
#' @param sequences A named \code{DNAStringSet} or \code{RNAStringSet}, which
#'   will be used to check whether the defined modifications are compatible with
#'   the original base. This uses
#'   \code{\link[Modstrings:separate]{removeIncompatibleModifications()}}
#'   function from the \code{Modstrings} package.
#' @param metadata,reassign.ids See \code{\link[=makeEpiTxDb]{makeEpiTxDb}}
#'
#' @return a \code{EpiTxDb} object.
#'
#' @export
NULL

#' @rdname makeEpiTxDbFromRMBase
#' @export
EPITXDB_RMBASE_URL <- "http://rna.sysu.edu.cn/rmbase/download/"

# makeEpiTxDbFromRMBase --------------------------------------------------------

.get_RMBase_rnames <- function(organism, genome, modtype){
    paste0("RMBase_",organism,"_",genome,"_",modtype)
}

.check_RMBase_files_available <- function(bfc, organism, genome, modtype){
    # get BiocFileCache information
    rnames <- .get_RMBase_rnames(organism, genome, modtype)
    bfci <- BiocFileCache::bfcinfo(bfc)
    m <- match(rnames,bfci$rname)
    m <- m[!is.na(m)]
    res <- bfci[m,]
    #
    nrow(res) == length(rnames)
}

.get_RMBase_files_available <- function(bfc, organism, genome, modtype){
    # get BiocFileCache information
    rnames <- .get_RMBase_rnames(organism, genome, modtype)
    bfci <- BiocFileCache::bfcinfo(bfc)
    m <- match(rnames,bfci$rname)
    m <- m[!is.na(m)]
    res <- bfci[m,]
    #
    files <- as.list(res$rpath)
    if(length(rnames) != length(files)){
        stop(".")
    }
    #
    m <- match(bfci$rname, rnames)
    m <- m[!is.na(m)]
    # update resource if expired
    check_update_required <- BiocFileCache::bfcneedsupdate(bfc, res$rid)
    check_update_required <- check_update_required &
        !is.na(check_update_required)
    files[check_update_required] <-
        Map(function(rid){
            BiocFileCache::bfcdownload(bfc,rid)
        },
        res$rid[check_update_required])
    unname(unlist(files))
}

#' @importFrom BiocFileCache bfcquery bfcadd BiocFileCache
.download_RMBase_files <- function(bfc, organism, genome, modtype){
    # get file names
    files <- .get_RMBase_files(organism)
    f_genome <- vapply(strsplit(files,"_"),"[",character(1),2L) == genome
    f_mod <- vapply(strsplit(files[f_genome],"_"),"[",character(1),4L) %in% modtype
    files <- files[f_genome][f_mod]
    urls <- paste0(EPITXDB_RMBASE_URL,organism,"/zip/",files)
    #
    rnames <- .get_RMBase_rnames(organism, genome, modtype)
    if(length(rnames) != length(urls)){
        stop(".")
    }
    # get BiocFileCache info
    bfci <- BiocFileCache::bfcinfo(bfc)
    # check which rnames are available
    rnames_available <- rnames %in% bfci$rname
    # get file name if available
    files <- as.list(rep("",length(rnames)))
    files[rnames_available] <-
        Map(function(rname){
            BiocFileCache::bfcquery(bfc,rname)[,"rpath"]
        },
        rnames[rnames_available])
    # download if not available and return file name
    files[!rnames_available] <-
        Map(function(url, rname){
            BiocFileCache::bfcadd(bfc,rname,url)
        },
        urls[!rnames_available],
        rnames[!rnames_available])
    unname(unlist(files))
}

#' @rdname makeEpiTxDbFromRMBase
#' @export
downloadRMBaseFiles <- function(organism, genome, modtype){
    bfc <- BiocFileCache::BiocFileCache()
    if(!.check_RMBase_files_available(bfc, organism, genome, modtype)){
        modtypes <- listAvailableModFromRMBase(organism, genome)
        if(!all(modtype %in% modtypes)){
            stop("'modtype' must be a valid modification type from ",
                 "listAvailableModFromRMBase() for the given 'organism' and ",
                 "'genome'.")
        }
        message("Downloading RMBase v2.0 files ...")
        files <- .download_RMBase_files(bfc, organism, genome, modtype)
    } else {
        files <- .get_RMBase_files_available(bfc, organism, genome, modtype)
    }
    files
}

#' @rdname makeEpiTxDbFromRMBase
#' @export
makeEpiTxDbFromRMBase <- function(organism, genome, modtype, tx = NULL,
                                  sequences = NULL, metadata = NULL,
                                  reassign.ids = FALSE){
    message("Loading RMBase files ...")
    files <- downloadRMBaseFiles(organism, genome, modtype)
    makeEpiTxDbFromRMBaseFiles(files, tx = tx, sequences = sequences,
                               metadata = metadata, reassign.ids = reassign.ids)
}

# makeEpiTxDbFromRMBaseFiles ---------------------------------------------------

.get_RMBase_header <- function(file){
    header <- readLines(file,7L)
    header
}

.get_RMBase_colnames <- function(file){
    colnames <- .get_RMBase_header(file)[7L]
    strsplit(trimws(gsub("#","",colnames)), "\\p{Zs}+|\\t", perl=TRUE)[[1L]]
}

EPITXDB_RMBASE_REQ_COLUMS <- c("chromosome", "modStart", "modEnd", "modId",
                               "score", "strand", "modName", "modType",
                               "supportList", "pubmedIds", "geneName",
                               "sequence")

#' @importFrom utils read.delim
.read_RMBase_file <- function(file){
    rmb <- try(read.delim(file, skip = 7L), silent = TRUE)
    if(is(rmb,"try-error")){
        stop("Malformated input file: ", file,".\nError message: ",
             as.character(rmb),
             call. = FALSE)
    }
    if(ncol(rmb) < 15L){
        stop(".")
    }
    colnames <- .get_RMBase_colnames(file)
    if(length(colnames) < ncol(rmb)){
        stop("Header of input could not read correctly. Mismatching number of ",
             "data columns and column labels.",
             call. = FALSE)
    }
    colnames(rmb) <- colnames[seq_len(ncol(rmb))]
    f <- EPITXDB_RMBASE_REQ_COLUMS %in% colnames(rmb)
    if(!all(f)){
        stop("Missing required column from RMBase data. (File: ",
             file, " / Missing columns: ",
             EPITXDB_RMBASE_REQ_COLUMS[!(f)],")",
             call. = FALSE)
    }
    rmb <- rmb[,colnames(rmb) %in% EPITXDB_RMBASE_REQ_COLUMS]
    rmb
}

# if a prefix in the chromosome identifier is present, try to remove it
# only if it simplifies the result
.simplify_chromosome_identifiers <- function(chromosome, seqlevels = NA){
    chromosome <- as.character(chromosome)
    # fix some weird mito chromosome annotation
    f <- chromosome %in% "Mito"
    if(any(f)){
        chromosome[f] <- gsub("Mito","M",chromosome[f])
    }
    # mismatching chromosomes
    if(all(!is.na(seqlevels))){
        mm_chr <- !(chromosome %in% seqlevels)
    } else {
        return(factor(chromosome,unique(chromosome)))
    }
    f_chr_remove <- grepl("Chr|chr",chromosome[mm_chr])
    if(any(f_chr_remove)){
        if(all(levels(factor(gsub("Chr|chr","",chromosome[mm_chr][f_chr_remove]))) %in%
               levels(chromosome[mm_chr][f_chr_remove])) ||
           all(levels(chromosome[mm_chr][f_chr_remove]) %in%
               levels(factor(gsub("Chr|chr","",chromosome[mm_chr][f_chr_remove]))))){
            tmp <- gsub("Chr|chr","",chromosome[mm_chr])
            chromosome[mm_chr] <- tmp
        }
    } else {
        chr_add1 <- paste0("chr",chromosome[mm_chr])
        if(all(chr_add1 %in% seqlevels)){
            chromosome[mm_chr] <- chr_add1
        } else {
            chr_add2 <- paste0("Chr",chromosome[mm_chr])
            if(all(chr_add2 %in% seqlevels)){
                chromosome[mm_chr] <- chr_add2
            }
        }
    }
    factor(chromosome,unique(chromosome))
}

.fix_non_standard_mod_types <- function(mod_type){
    mod_type <- gsub("m22","m2,2",mod_type)
    mod_type <- gsub("m42","m4,4",mod_type)
    mod_type <- gsub("m62","m6,6",mod_type)
    mod_type <- gsub("Tm","Um",mod_type)
    if("m5C" %in% unique(mod_type)){
        mod_type <- gsub("N","m5C",mod_type)
    }
    mod_type
}

#' @importFrom Biostrings DNAStringSet subseq
.extract_GRanges_from_RMBase <- function(rmb, seqtype = "RNA"){
    ############################################################################
    ### check modification information on correct base
    seq <- Biostrings::DNAStringSet(rmb$sequence)
    if(unique(width(seq)) != 41L){
        stop(".")
    }
    seqtype(seq) <- seqtype
    seqtype <- paste0("Mod",seqtype(seq))
    # get the type of modification annotation
    mod_type <- .fix_non_standard_mod_types(as.character(rmb$modType))
    nc_type <- Modstrings:::.get_nc_type(mod_type, seqtype)
    if(nc_type != "short"){
        stop(".")
    }
    # get original base
    codec <- Modstrings:::modscodec(seqtype)
    modValues <- Modstrings:::.norm_seqtype_modtype(mod_type, seqtype,
                                                    nc_type)
    f <- Modstrings:::values(codec)[match(modValues,
                                          Modstrings:::values(codec))]
    # check if reported base matches original base
    subseq <- Biostrings::subseq(seq,21L,21L)
    base_mm <- Modstrings:::originatingBase(codec)[f] != subseq
    # if not delete the modifications with a warning
    if(any(base_mm)){
        warning("Detected mismatch of modification and originating base in ",
                "RMBase data for '", paste(unique(mod_type), collapse = "', '"),
                "'. Removing them ... ",
                call. = FALSE)
        rmb <- rmb[!base_mm,]
    }
    ############################################################################
    # extract position data
    strand <- rmb$strand
    # pos <- rmb$modStart
    # pos[strand == "-"] <- rmb$modEnd[strand == "-"]
    pos <- rmb$modEnd
    # extract modification information
    mod_id <- rmb$modId
    mod_type <- .fix_non_standard_mod_types(as.character(rmb$modType))
    mod_name <- rmb$modName
    # extract intermediate transcript data
    chromosome <- rmb$chromosome
    gene_name <- rmb$geneName
    if(any(is.na(gene_name)) || any(is.null(gene_name))){
        stop(".")
    }
    # extract references
    f_pmid <- rmb$pubmedIds != "" & rmb$pubmedIds != "-"
    f_ref <- rmb$supportList != "" & rmb$supportList != "-"
    ref_type <- list(IRanges::CharacterList(as.list(rep("RMBASE_MOD_ID",
                                                        nrow(rmb)))),
                     IRanges::CharacterList(as.list(rep("PMID",
                                                        nrow(rmb)))),
                     IRanges::CharacterList(as.list(rep("RMBase_REF",
                                                        nrow(rmb)))))
    ref_type[[2]][!f_pmid] <- ""
    ref_type[[3]][!f_ref] <- ""
    ref_type <- do.call(pc,ref_type)
    ref_type <- ref_type[ref_type != ""]
    reference <- list(relist(as.character(rmb$modId),
                             IRanges::PartitioningByWidth(lengths(rmb$modId))),
                      relist(as.character(rmb$pubmedIds),
                             IRanges::PartitioningByWidth(lengths(rmb$pubmedIds))),
                      relist(as.character(rmb$supportList),
                             IRanges::PartitioningByWidth(lengths(rmb$supportList))))
    reference[[2]][!f_pmid] <- ""
    reference[[3]][!f_ref] <- ""
    reference <- do.call(pc,reference)
    reference <- reference[reference != ""]
    # assemble metadata columns
    gene_name <-
        IRanges::CharacterList(strsplit(as.character(gene_name),","))
    mcols <- DataFrame(mod_id = mod_id,
                       mod_type = mod_type,
                       mod_name = mod_name,
                       chromosome = chromosome,
                       gene_name = gene_name,
                       score = rmb$score,
                       ref_type = ref_type,
                       ref = reference)
    # create GRanges result
    ans <- GenomicRanges::GRanges(seqnames = chromosome,
                                  ranges = IRanges::IRanges(pos, width = 1L),
                                  strand = strand,
                                  mcols)
    #
    ans
}

.get_RMBase_data <- function(files){
    grl <- lapply(files,
                  function(file){
                      rmb <- .read_RMBase_file(file)
                      .extract_GRanges_from_RMBase(rmb)
                  })
    grl
}

.gene_names_to_chromosome <- function(gr){
    chromosome <- mcols(gr)$gene_name
    GenomicRanges::GRanges(seqnames = chromosome,
                           ranges = IRanges::ranges(gr),
                           strand = strand(gr),
                           mcols(gr))
}



.check_tx_sequences <- function(tx, sequences){
    if(is.null(sequences)){
        return()
    }
    if(!is(sequences,"DNAStringSet") & !is(sequences,"RNAStringSet")){
        stop("'sequences' must be a DNA/RNAStringSet.", call. = FALSE)
    }
    if(is.null(names(sequences)) || !all(names(sequences) == names(tx) )){
        stop("'sequences' must be named and names have to match 'tx'",
             call. = FALSE)
    }
    if(!all(sum(width(tx)) == width(sequences))){
        stop("'sequences' and 'tx' must have the same dimensions, e.g. the ",
             "length of sequences and annotation must match.",
             call. = FALSE)
    }
}

#' @rdname makeEpiTxDbFromRMBase
#' @export
getRMBaseDataAsGRanges <- function(files){
    message("Assembling RMBase data ...")
    # getting raw data from RMBase files
    grl <- .get_RMBase_data(files)
    gr <- unlist(GenomicRanges::GRangesList(grl))
    gr
}

.remove_incompatible_mod_by_seq <- function(gr, seq){
    seq <- as(seq,"RNAStringSet")
    colnames(mcols(gr)) <- gsub("mod_type","mod",colnames(mcols(gr)))
    # do plus and minus strand separatly, since removeIncompatibleModifications
    # only accepts plus strand
    gr_plus <-
        Modstrings::removeIncompatibleModifications(gr[strand(gr) == "+"], seq)
    gr_minus <- gr[strand(gr) == "-"]
    strand(gr_minus) <- "+"
    gr_minus <-
        Modstrings::removeIncompatibleModifications(gr_minus,
                                                    Biostrings::complement(seq))
    strand(gr_minus) <- "-"
    gr <- c(gr_plus,gr_minus)
    gr <- gr[order(seqnames(gr),start(gr),strand(gr))]
    colnames(mcols(gr)) <- gsub("^mod$","mod_type",colnames(mcols(gr)))
    gr
}

.add_sequence_check_to_metadata <- function(metadata){
    metadata <- rbind(metadata,
                      data.frame(name = "Checked against sequence",
                                 value = "Yes",
                                 check.names = FALSE,
                                 stringsAsFactors = FALSE))
    metadata
}

#' @rdname makeEpiTxDbFromRMBase
#' @export
makeEpiTxDbFromRMBaseFiles <- function(files, tx = NULL, sequences = NULL,
                                       metadata = NULL, reassign.ids = FALSE){
    gr <- getRMBaseDataAsGRanges(files)
    if(!is.null(tx)){
        sl <- GenomeInfoDb::seqlevels(tx)
    } else if(!is.null(sequences)) {
        sl <- names(sequences)
    } else {
        sl <- GenomeInfoDb::seqlevels(gr)
    }
    chromosome <- .simplify_chromosome_identifiers(seqnames(gr), sl)
    gr <- GenomicRanges::GRanges(seqnames = chromosome,
                                 ranges = IRanges::ranges(gr),
                                 strand = BiocGenerics::strand(gr),
                                 S4Vectors::mcols(gr))
    # shift position to transcript coordinates
    if(!is.null(tx)){
        tx <- .norm_tx(tx)
        .check_tx_sequences(tx, sequences)
        message("Shifting RMBase result's coordinates based on transcript ",
                "data ...")
        gr <- shiftGenomicToTranscript(gr, tx)
        f <- !duplicated(paste0(as.character(gr),"-",mcols(gr)$mod_type))
        gr <- gr[f]
    }
    # check if all the referenced modification match their originating base
    if(!is.null(sequences)){
        # remove any positions which do not match the originating base
        gr <- .remove_incompatible_mod_by_seq(gr, sequences)
        metadata <- .add_sequence_check_to_metadata(metadata)
    }
    #
    mcols(gr)$mod_id <- seq_along(gr)
    makeEpiTxDbFromGRanges(gr, metadata = metadata, reassign.ids = reassign.ids)
}


#' @rdname makeEpiTxDbFromRMBase
#' @importFrom curl curl
#' @export
listAvailableOrganismsFromRMBase <- function(){
    # con <- curl::curl(EPITXDB_RMBASE_URL)
    # page <- xml2::read_html(con)
    # organisms <- xml2::xml_attr(xml2::xml_find_all(page,'//img[@alt="[DIR]"]//../following::a'),"href")
    # organisms <- gsub("/","",organisms)
    # organisms[!(organisms %in% c("ajax","otherspecies"))]
    rmbase_data <- NULL
    utils::data("rmbase_data", envir = environment(), package = "EpiTxDb")
    as.character(unique(rmbase_data$organism))
}

#' @importFrom curl curl
.get_RMBase_files <- function(organism){
    con <- curl::curl(paste0(EPITXDB_RMBASE_URL,organism,"/zip/"))
    page <- xml2::read_html(con)
    files <- xml2::xml_attr(xml2::xml_find_all(page,'//img[@alt="[   ]"]//../following::a'),"href")
    files[!grepl("^old",files)]
}

.get_RMBase_genomes <- function(files){
    unique(vapply(strsplit(files,"_"),"[",character(1),2L))
}

.listAvailableGenomesFromRMBase <- function(organism){
    # files <- .get_RMBase_files(organism)
    # .get_RMBase_genomes(files)
    rmbase_data <- NULL
    utils::data("rmbase_data", envir = environment(), package = "EpiTxDb")
    as.character(unique(rmbase_data[rmbase_data$organism == organism,]$genome))
}

#' @rdname makeEpiTxDbFromRMBase
#' @export
listAvailableGenomesFromRMBase <- function(organism){
    if(!(organism %in% listAvailableOrganismsFromRMBase())){
        stop("'organism' must be a valid organism from ",
             "listAvailableOrganismsfromRMBase()")
    }
    .listAvailableGenomesFromRMBase(organism)
}

.get_RMBase_mod <- function(organism, genome){
    # f_genome <- vapply(strsplit(files,"_"),"[",character(1),2L) == genome
    # ans <- unique(vapply(strsplit(files[f_genome],"_"),"[",character(1),4L))
    # ans[!grepl("mod",ans)]
    rmbase_data <- NULL
    utils::data("rmbase_data", envir = environment(), package = "EpiTxDb")
    as.character(rmbase_data[rmbase_data$organism == organism &
                                 rmbase_data$genome == genome,]$mod)
}

#' @rdname makeEpiTxDbFromRMBase
#' @export
listAvailableModFromRMBase <- function(organism, genome){
    if(!(organism %in% listAvailableOrganismsFromRMBase())){
        stop("'organism' must be a valid organism from ",
             "listAvailableOrganismsfromRMBase()")
    }
    if(!(genome %in% listAvailableGenomesFromRMBase(organism))){
        stop("'genome' must be a valid genome for the given 'organism' from ",
             "listAvailableGenomesFromRMBase()")
    }
    .get_RMBase_mod(organism, genome)
}

Try the EpiTxDb package in your browser

Any scripts or data that you put into this service are public.

EpiTxDb documentation built on March 26, 2021, 6 p.m.