R/bedgraph2norm.R

Defines functions .add_sequence bedgraph2norm

Documented in bedgraph2norm

#' Import bedgraph to GRanges
#'
#' Function importing data from bedgraph format compatible with UCSC Genome
#' Browser to norm_GR data frame. Warning: Compatible only with bedgraph files
#' generated by norm2bedgraph function (bedgraph needs to have 2 tracks, first
#' for plus strand, second for minus strand). May be used for transforming
#' normalized data to another different annotation sets.
#'
#' @param bedgraph_file path to compatible bedgraph file
#' @param fasta_file path to fasta file which is used for a) choosing which
#' transcripts to use (transcripts absent from fasta are not reported), b)
#' providing sequence for to display in GRanges metadata
#' @param txDb TranscriptDb object with transcript definitions. Names must
#' match those in fasta_file
#' @param bed_file character containing file path to BED file with transcript
#' definitions. Supply txDb XOR bedfile
#' @param column_name How to name imported metadata in GRanges
#' @param add_to GRanges object made by other normalization function (dtcr(),
#' slograt(), swinsor(), compdata()) to which values from bedgraph should be
#' added.
#' @param track_strand specifies which genomic strand the supplied bedgraph
#' describes ("+" or "-"). Used only if the bedgraph file is composed of only
#' one track.
#' @return Function creates GRanges object or (if add_to specified) adds
#' metadata to already existing object
#' @author Lukasz Jan Kielpinski, Nikos Sidiropoulos
#' @seealso \code{\link{norm2bedgraph}}, \code{\link{GR2norm_df}},
#' \code{\link{plotRNA}}, \code{\link{BED2txDb}}, \code{\link{dtcr}},
#' \code{\link{slograt}}, \code{\link{swinsor}}, \code{\link{compdata}}
#' @examples
#' dummy_euc_GR_control <- GRanges(seqnames="DummyRNA",
#' IRanges(start=round(runif(100)*100), width=round(runif(100)*100+1)),
#'         strand="+", EUC=round(runif(100)*100))
#' dummy_euc_GR_treated <- GRanges(seqnames="DummyRNA",
#'                                 IRanges(start=round(runif(100)*100),
#'                                         width=round(runif(100)*100+1)),
#'                                 strand="+", EUC=round(runif(100)*100))
#' dummy_comp_GR_control <- comp(dummy_euc_GR_control)
#' dummy_comp_GR_treated <- comp(dummy_euc_GR_treated)
#' dummy_norm <- dtcr(control_GR=dummy_comp_GR_control,
#'                    treated_GR=dummy_comp_GR_treated)
#'
#' write(paste(c("chr1", 134212702, 134229870, "DummyRNA", 0, "+", 134212806,
#'             134228958, 0, 8, "347,121,24,152,66,120,133,1973,",
#'             "0,8827,10080,11571,12005,13832,14433,15195,"), collapse = "\t"),
#'       file="dummy.bed")
#' norm2bedgraph(norm_GR = dummy_norm, bed_file = "dummy.bed")
#'
#' write(c(">DummyRNA", paste(sample(c("A","C","G","T"), 100, replace=TRUE),
#'                            collapse="")), file="dummy.fa")
#' bedgraph2norm(bedgraph_file = "out_file.bedgraph", fasta_file = "dummy.fa",
#'               bed_file = "dummy.bed")
#'
#' @import rtracklayer Biostrings GenomicFeatures
#' @export bedgraph2norm
bedgraph2norm <- function(bedgraph_file, fasta_file, txDb,  bed_file,
                          column_name="bedgraph_score", add_to,
                          track_strand="+")
{
    ###Check conditions:
    if(missing(txDb) & missing(bed_file))
        stop("Specify gene annotation")

    if(missing(fasta_file))
        stop("Specify fasta file")

    if(!file.exists(fasta_file))
        stop("Fasta file not found")

    #Read in bedgraph file and convert both strands to single UCSCdata entry:
    bedgraph_list <- import.bedGraph(bedgraph_file)

    if(!(class(bedgraph_list) == "SimpleGenomicRangesList"))
        bedgraph_list <- GenomicRangesList(bedgraph_list)

    if(length(bedgraph_list)==1 | length(bedgraph_list)==2) {

        if(length(bedgraph_list)==1) {
            strand(bedgraph_list[[1]]) <- track_strand
            bedgraph_merged <- bedgraph_list[[1]]
        } else {
            if(length(bedgraph_list)==2){
                strand(bedgraph_list[[1]]) <- "+"
                strand(bedgraph_list[[2]]) <- "-"
                bedgraph_merged <- c(bedgraph_list[[1]], bedgraph_list[[2]])
            }
        }
    } else {
        Message <- "Bedgraph file needs to contain 2 tracks, one for
        each strand or 1 track with provided strand information."
        stop(strwrap(Message))
    }


    ##Prepare GRangesList for mapping values to:
    txs <- readDNAStringSet(fasta_file) #read in fasta file.

    #If txDb not given, make it from bed file
    if(missing(txDb))
        txDb <- BED2txDb(bed_file)

    #Build GRangesList from txDb
    all_exons <- exonsBy(txDb, "tx", use.names=TRUE)
    #Keep in GRangesList only those transcript that are present in fasta file
    my_exons <- all_exons[names(all_exons) %in% names(txs)]

    #Print warning if transcripts overlap:
    overlapping_transcripts <- which(countOverlaps(my_exons) > 1)
    if(length(overlapping_transcripts) > 0){
        message(paste("Warning: transcript",
                      names(my_exons)[overlapping_transcripts],
                      strwrap("overlaps with another transcript. Score is added
                              to more than one RNA.")))
    }

    #Map bedgraph positions to annotated transcripts and format to norm_df:
    bm_width <- width(bedgraph_merged)
    bedgraph_merged_expanded <- bedgraph_merged[rep(1:length(bedgraph_merged),
                                                    bm_width)]
    start(bedgraph_merged_expanded) <-
        start(bedgraph_merged_expanded) + unlist(sapply(bm_width,
                                                        FUN=seq_len)) - 1
    end(bedgraph_merged_expanded) <- start(bedgraph_merged_expanded)

    mapped_to_tx <- mapToTranscripts(bedgraph_merged_expanded, my_exons,
                                     ignore.strand = FALSE)
    hits_in_tx <- mapped_to_tx$transcriptsHits #Transcripts with signal
    hits_in_EF <- mapped_to_tx$xHits #Which signal given transcript has
    #Data frame with results
    normalized <-
        data.frame(RNAid=names(my_exons)[hits_in_tx],
                   Pos=start(ranges(mapped_to_tx)),
                   nt=NA,
                   bedgraph_score=score(bedgraph_merged_expanded)[hits_in_EF])

    #Change column name to one provided by user
    names(normalized)[names(normalized)=="bedgraph_score"] <- column_name

    ###Add sequence information from fasta file (on per RNA basis)
    norm_by_RNA <- split(normalized, f=normalized$RNAid, drop=TRUE)
    normalized <- do.call(rbind, lapply(norm_by_RNA, FUN=.add_sequence, txs))

    #If add_to specified, merge with existing normalized data frame:
    if(!missing(add_to)){
        add_to_df <- GR2norm_df(add_to)
        normalized <- merge(add_to_df, normalized, by=c("RNAid", "Pos", "nt"),
                            suffixes=c(".old",".new"))
    }
    ###

    normalized <- normalized[order(normalized$RNAid, normalized$Pos),]
    norm_df2GR(normalized)
}

###Auxiliary functions

#Function to import the sequence from fasta file, if no sequence -
#fill in with N's:
.add_sequence <- function(oneRNA_norm, txs){

    maxPos <- max(oneRNA_norm$Pos)
    refSeq <- txs[[as.character(oneRNA_norm$RNAid[1])]]

    if (maxPos > length(refSeq))
    {
        unexpected_length_difference <- maxPos - length(refSeq)
        dna <- paste(rep("N",unexpected_length_difference), collapse="")
        one_gene_sequence <- unlist(strsplit(as.character(c(refSeq,
                                    DNAString(dna))[oneRNA_norm$Pos]), ""))
        message(paste("For RNA ", oneRNA_norm$RNAid[1],
                      "positions outside FASTA annotation exist. N's added"))
    }
    else{
        one_gene_sequence <- unlist(strsplit(as.character(
            refSeq[oneRNA_norm$Pos]), ""))
    }
    oneRNA_norm$nt <- one_gene_sequence
    oneRNA_norm
}

Try the RNAprobR package in your browser

Any scripts or data that you put into this service are public.

RNAprobR documentation built on Nov. 8, 2020, 5:57 p.m.