R/Utilities.R
In metagene: A package to produce metagene plots

Documented in avoid_gaps_update get_promoters_txdb write_bed_file_filter_result

# Created by Charles Joly Beauparlant
# 2013-11-26

# Split a GRanges into N bins
#
# Originally posted by Martin Morgan:
# https://stat.ethz.ch/pipermail/bioconductor/2012-September/047923.html
#
# param gr A GRanges with only one seqnames value.
# param n Number of bins to produce.
#
# return
#   A GRanges object splitted into N bins
#
# examples
#   gr <- GRanges("chr1", IRanges(c(100, 300), c(200, 500))
#   gr <- intoNbins(gr)
intoNbins <- function(gr, n = 10) {
    stopifnot(is(gr, "GRanges"))
    stopifnot(length(gr) > 0)
    stopifnot(is.numeric(n))
    stopifnot(n > 0)
    if (any(GenomicRanges::width(gr) < n)) {
        stop("all 'width(gr)' must be >= 'n'")
    }
    d <- width(gr) / n
    dd <- cumsum(rep(d, each=n))
    mask <- logical(n); mask[1] <- TRUE
    dd <- dd - rep(dd[mask], each=n)

    starts <- round(rep(GenomicRanges::start(gr), each=n) + dd)
    ends <- c(starts[-1], 0) - 1L
    ends[rev(mask)] <- end(gr)

    gr <- gr[rep(seq_along(gr), each=n)]
    GenomicRanges::ranges(gr) <- IRanges(starts, ends)
    gr
}

#' Extract Entrez genes promoters from TxDb object.
#'
#' @param txdb A valid \code{TxDb} object.
#' @param upstream The number of nucleotides upstream of TSS.
#' @param downstream The number of nucleotides downstream of TSS.
#'
#' @return A \code{GRanges} object that contains the promoters infos.
#'
#' @examples
#' \dontrun{
#'    # require(TxDb.Hsapiens.UCSC.hg19.knownGene)
#'    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#'    promoters_hg19 <- get_promoters_txdb(txdb)
#' }
#'
#' @import GenomeInfoDb
get_promoters_txdb <- function(txdb, upstream = 1000, downstream = 1000) {
    stopifnot(is(txdb, "TxDb"))
    GenomicFeatures::promoters(GenomicFeatures::genes(txdb),
                               upstream = upstream, downstream = downstream)
}


#' Extract exons by genes for which data are available in quantification files
#'
#' @param adb A valid \code{EnsDb} object.
#' @param quantification_files the quantification files. A vector of pathes.
#'
#' @return A \code{GRangesList} object containing exons by genes for which 
#'                                data are available in quantification files.
#'
#' @examples
#' \dontrun{
#'    require(EnsDb.Hsapiens.v86)
#'    edb <- EnsDb.Hsapiens.v86
#'    quantification_files <- 'file_path'
#'    ebgwot <- exon_by_gene_with_observed_transcripts(edb, 
#'                                                    quantification_files)
#'    bed_file_content_gr <- GRanges("chr16",ranges = IRanges(start=23581002, 
#'                                                            end=23596356))
#'    bed_file_filter(ebgwot, bed_file_content_gr)
#' }
#'
#' @import EnsDb.Hsapiens.v86
exon_by_gene_with_observed_transcripts <- function (adb, quantification_files){
    stopifnot((is(adb, "TxDb") | is(adb, "EnsDb")))
    stopifnot(all(tolower(tools::file_ext(quantification_files)) == 'tsv'))
    
    message(paste0('Please wait, the process will end in about one minute ',
                                'depending on quantification_files size'))
    #retrieve of all transcript_id found in quantification_files (no duplicate)
    transcript_id <- unique(unlist(map(quantification_files, 
                    ~ fread(.x)[TPM > 0]$transcript_id)))
    transcript_id <- stringr::str_replace(transcript_id, '\\.[0-9]+', '')
    
    if(is(adb, "EnsDb") & substr(transcript_id[1],1,3) == 'ENS') {
        message('Please, wait few minutes during requests to EnsDB.')
        edb <- EnsDb.Hsapiens.v86
        all_exons_id <- unique(exons(edb)$exon_id)
        slct <- unique(ensembldb::select(edb, key=all_exons_id, 
                                    keytype='EXONID', 
                                    columns = c('GENEID', 'EXONID', 
                                            'EXONSEQSTART', 'EXONSEQEND',
                                            'SEQSTRAND','EXONIDX', 'TXID', 
                                            'SEQNAME')))
        
        slct <- slct[which(slct$TXID %in% transcript_id),]
        slct$SEQSTRAND <- stringr::str_replace(slct$SEQSTRAND, '-1', '-')
        slct$SEQSTRAND <- stringr::str_replace(slct$SEQSTRAND, '1', '+')
        slct$SEQSTRAND <- stringr::str_replace(slct$SEQSTRAND, 'NA', '*')
        
        gr <-
        GRanges(seqnames = paste0('chr',slct$SEQNAME),
        ranges = IRanges(start=slct$EXONSEQSTART, end=slct$EXONSEQEND),
        strand = slct$SEQNAME, exon_id = slct$EXONID, 
                                                    gene_id = slct$GENEID)
        return(split(gr, gr$gene_id))
    } else if(is(adb, "TxDb") & is.numeric(transcript_id[1])) {
        #TODO
    } else {
        message(paste0('AnnotationDataBase (adb) or quantification_files not',
                    ' supported. Only EnsDB AnnotationDataBase',
                    ' and corresponding transcripts quantification files', 
                    ' with ENST ID are supported.'))
    }
}

#' Extract a list of ranges defined by the bed_file_content_gr argument from 
#'    the ebgwot GRangesList. Equivalent to the exonsByOverlaps
#'    of GenomicFeatures.
#'
#' @param ebgwot A \code{GRangesList} object 
#'        provided by the exon_by_gene_with_observed_transcripts function.
#' @param bed_file_content_gr A \code{GRanges} object containing ranges of 
#'        interest.
#' @param reduce If the returned \code{GRanges} object will be reduce or not.
#'
#' @return A \code{GRanges} object that contains exons by genes selected.
#'
#' @examples
#' \dontrun{
#'    require(EnsDb.Hsapiens.v86)
#'    edb <- EnsDb.Hsapiens.v86
#'    quantification_files <- 'file_path'
#'    ebgwot <- exon_by_gene_with_observed_transcripts(edb, 
#'                                                    quantification_files)
#'    bed_file_content_gr <- GRanges("chr16",ranges = IRanges(start=23581002, 
#'                                                            end=23596356))
#'    bed_file_filter(ebgwot, bed_file_content_gr)
#' }
#'
bed_file_filter <- function (ebgwot, bed_file_content_gr, reduce = TRUE) {
    if(reduce){
        IRanges::reduce(unlist(ebgwot)[(
            unlist(start(ebgwot)) >= start(bed_file_content_gr) & 
            unlist(end(ebgwot)) <= end(bed_file_content_gr) & 
            as.character(unlist(seqnames(ebgwot))) == as.character(
                                                seqnames(bed_file_content_gr))
        ),])
    } else {
        unlist(ebgwot)[(
            unlist(start(ebgwot)) >= start(bed_file_content_gr) & 
            unlist(end(ebgwot)) <= end(bed_file_content_gr) & 
            as.character(unlist(seqnames(ebgwot))) == as.character(
                                                seqnames(bed_file_content_gr))
        ),]
    }
}


#' Transforms the bed_file_filter function output into a file.BED readable by
#'                                                                metagene.
#'
#' @param bed_file_filter_result A \code{GRanges} object : the output of 
#'        bed_file_filter function.
#' @param file the name of the output file without the extension 
#' @param path The path where the function will write the file
#'
#' @return output of write function
#'
#' @examples
#' \dontrun{
#' require(EnsDb.Hsapiens.v86)
#' edb <- EnsDb.Hsapiens.v86
#' quantification_files <- 'file_path'
#' ebgwot <- exon_by_gene_with_observed_transcripts(edb, 
#'                                                    quantification_files)
#' bed_file_content_gr <- GRanges("chr16",ranges = IRanges(start=23581002, 
#'                                                            end=23596356))
#' bffr <- bed_file_filter(ebgwot, bed_file_content_gr)
#' write_bed_file_filter_result(bffr, file='test','./')
#' }
#'
write_bed_file_filter_result <- function(bed_file_filter_result, 
                                            file = 'file_name', path = './'){
    string <- ''
    bffr <- bed_file_filter_result
    for (i in 1:length(bffr)){
        string <- paste0(string, as.vector(seqnames(bffr))[i],'\t',
                            start(bffr)[i],'\t',
                            end(bffr)[i],'\t',
                            '.\t.\t',
                            as.vector(strand(bffr))[i],
                            '\n')
    }
    write(string, paste0(path, file, '.BED'))
}

#' Is is a function designed to remove values <= to 'gaps_threshold'. 
#' Nucleotides local and global positions, bins, size of regions/genes and exons
#' will be recalculated. To use on metagene's table during RNA-seq analysis. 
#' Not made for ChIP-Seq analysis or to apply on matagene's data_frame. A
#' similar function is implemented in produce_data_frame() with same arguments.
#' The unique goal of this function is to allow permutation_test which match
#' the plot created using avoid_gaps, bam_name and gaps_threshold arguments
#' in the produce_data_frame function.
#'
#' @param table A data.table from produce_table(...) function of metagene.
#' @param bam_name A reference bam_name to allow the same removal 
#'                (position in bam) of values for other bam file.
#' @param gaps_threshold A threshold under which values will be removed.    
#'
#' @return A data.table with values <= to 'gaps_threshold' removed
#'
#' @examples
#' \dontrun{
#' bam_files <- c(
#' system.file("extdata/c_al4_945MLM_demo_sorted.bam", package="metagene"),
#' system.file("extdata/c_al3_362PYX_demo_sorted.bam", package="metagene"),
#' system.file("extdata/n_al4_310HII_demo_sorted.bam", package="metagene"),
#' system.file("extdata/n_al3_588WMR_demo_sorted.bam", package="metagene"))
#' region <- c(
#' system.file("extdata/ENCFF355RXX_DPM1less.bed", package="metagene"),
#' system.file("extdata/ENCFF355RXX_NDUFAB1less.bed", package="metagene"),
#' system.file("extdata/ENCFF355RXX_SLC25A5less.bed", package="metagene"))
#' mydesign <- matrix(c(1,1,0,0,0,0,1,1),ncol=2, byrow=FALSE)
#' mydesign <- cbind(c("c_al4_945MLM_demo_sorted.bam",
#'                    "c_al3_362PYX_demo_sorted.bam",
#'                    "n_al4_310HII_demo_sorted.bam",
#'                    "n_al3_588WMR_demo_sorted.bam"), mydesign)
#' colnames(mydesign) <- c('Samples', 'cyto', 'nucleo')
#' mydesign <- data.frame(mydesign)
#' mydesign[,2] <- as.numeric(mydesign[,2])-1
#' mydesign[,3] <- as.numeric(mydesign[,3])-1
#'
#' mg <- metagene$new(regions = region, bam_files = bam_files, 
#'                                                            assay = 'rnaseq')
#' mg$produce_table(flip_regions = FALSE, bin_count = 100, 
#'                                design = mydesign, normalization = 'RPM')
#' mg$produce_data_frame(avoid_gaps = TRUE, 
#'                        bam_name = "c_al4_945MLM_demo_sorted", 
#'                        gaps_threshold = 10)
#' mg$plot()
#' tab <- mg$get_table()
#' tab <- avoid_gaps_update(tab, 
#'        bam_name = 'c_al4_945MLM_demo_sorted', gaps_threshold = 10)
#' tab1 <- tab[which(tab0$design == "cyto"),]
#' tab2 <- tab[which(tab0$design == "nucleo"),]
#' 
#' library(similaRpeak)
#' perm_fun <- function(profile1, profile2) {
#'    sim <- similarity(profile1, profile2)
#'    sim[["metrics"]][["RATIO_INTERSECT"]]
#' }
#' 
#' ratio_intersect <- 
#'    perm_fun(tab1[, .(moy=mean(value)), by=bin]$moy, 
#'            tab2[, .(moy=mean(value)), by=bin]$moy)
#' ratio_intersect
#' 
#' permutation_results <- permutation_test(tab1, tab2, sample_size = 2,
#'                                sample_count = 1000, FUN = perm_fun)
#' hist(permutation_results, 
#'        main="ratio_intersect (1=total overlapping area)")
#' abline(v=ratio_intersect, col = 'red')
#' sum(ratio_intersect >= permutation_results) / length(permutation_results)
#' }
#'
avoid_gaps_update <- function(table, bam_name, gaps_threshold = 0)
{
    new_table <- data.table::copy(table)
    
	if (!is.null(new_table$bin)){
		bin_count <- max(unique(new_table$bin))
	} else {
		bin_count <- NULL
	}
            
    #how_namy_by_exon_by_design
    nb_nuc_removed <- new_table[value <= gaps_threshold 
                            & bam == bam_name, length(value),
                        by=c('exon', 'region')]
    
    #assignment of new exonsize
    for (i in 1:length(nb_nuc_removed$V1)){
        #selected = lines of the ith region and exon of nb_nuc_removed
        selected <- which(
            new_table$region == nb_nuc_removed$region[i] &
            new_table$exon == nb_nuc_removed$exon[i])
        #retrieve the exonsize value of the ith region and exon
        original_exonsize <- unique(new_table$exonsize[selected])
        #replace former exonsixe
        new_exonsize <- original_exonsize-nb_nuc_removed$V1[i]
        new_table$exonsize[selected] <- new_exonsize
    }
    
    nb_nuc_removed_by_gene <- new_table[value <= gaps_threshold 
                            & bam == bam_name, length(value),
                        by=c('region')]
    #assignment of new regionsize/genesize
    for (i in 1:length(unique(nb_nuc_removed_by_gene$region))){
        #selected = lines of the ith region of nb_nuc_removed
        selected <- which(
            new_table$region == nb_nuc_removed_by_gene$region[i])
        #retrieve the regionsize value of the ith region and exon
        original_regionsize <- unique(new_table$regionsize[selected])
        #replace former regionsize
        new_regionsize <- (original_regionsize
                            - nb_nuc_removed_by_gene$V1[i])
        new_table$regionsize[selected] <- new_regionsize
    }
    
    #assignment of new regionstartnuc (allow flip nuctot after gaps removal)
    for (i in 1:length(unique(nb_nuc_removed_by_gene$region))){
        #selected = lines of the ith region of nb_nuc_removed
        selected <- which(
            new_table$region == nb_nuc_removed_by_gene$region[i])
        #retrieve the regionsize value of the ith region and exon
        original_regionstartnuc <- unique(new_table$regionstartnuc[selected])
        #replace former regionsize
        new_regionstartnuc <- (original_regionstartnuc
                            - nb_nuc_removed_by_gene$V1[i])
		new_regionstartnuc <- replace(new_regionstartnuc, 
								which(new_regionstartnuc < 1), 1)
        new_table$regionstartnuc[selected] <- new_regionstartnuc
    }
    
    ### removal of zero values
    ## stop if all bam haven't the same amount of lines in table
    stopifnot(length(unique(new_table[, .N, by=bam]$N)) == 1)
    bam_line_count <- tab[bam == bam_name, .N]
    #lines_to_remove for bam_name
    lines_to_remove <- which(new_table$bam == bam_name &
                                    new_table$value <= gaps_threshold)
    # %% provide the idx for the first bam
    lines_to_remove <- (lines_to_remove %% bam_line_count)
    #to avoid 0 if there is a x %% x = 0
    lines_to_remove <- replace(lines_to_remove, 
                            which(lines_to_remove == 0), 
                            bam_line_count)
    bam_count <- length(unique(new_table$bam))
    #lines_to_remove for all bam
    lines_to_remove <- unlist(map((0:(bam_count-1)), 
                        ~ lines_to_remove + bam_line_count * .x))
    new_table <- new_table[-lines_to_remove,]

    
    #reinitialization of nuctot before flip in next section to 
    # clear gaps in nuctot number seauence
    new_table$nuctot <- rep(1:length(which(
                    new_table$bam == bam_name)),
                    times = length(unique(new_table$bam)))
    
    #reorder the nuc and nuctot variables
    ascending = function(nuc) {
            nuc[1] < nuc[2]
    }
    are_genes_unflipped <- unlist(lapply(map(unique(new_table$region),
                        ~ new_table[which(new_table$region == .x & 
                                    new_table$bam == new_table$bam[1]),]$nuc)
                                        , ascending))
    if(!all(are_genes_unflipped)){
        
        flip_by_bam_n_region <- map2(rep(unique(new_table$bam), 
                    each=length(unique(new_table$region))), 
            rep(unique(new_table$region),
                    times=length(unique(new_table$bam))), 
            ~which(new_table$bam == .x & new_table$region == .y 
                        & new_table$strand == '-'))
        
        not_empty_idx <- which(map(flip_by_bam_n_region, 
                                                ~length(.x)) > 0) 
        if (length(not_empty_idx) > 0){
            map(flip_by_bam_n_region[not_empty_idx],
                            ~ (new_table$nuc[.x] <- length(.x):1))
            map(flip_by_bam_n_region[not_empty_idx],
                            ~ (new_table$nuctot[.x] <- 
                                    max(new_table$nuctot[.x]):
                                        min(new_table$nuctot[.x])))
        }
        
        unflip_by_bam_n_region <- map2(rep(unique(new_table$bam), 
                    each=length(unique(new_table$region))), 
            rep(unique(new_table$region), 
                    times=length(unique(new_table$bam))), 
            ~which(new_table$bam == .x & new_table$region == .y 
                        & (new_table$strand == '+' | 
                            new_table$strand == '*')))
        not_empty_idx <- which(map(unflip_by_bam_n_region, 
                                                ~length(.x)) > 0) 
        if (length(not_empty_idx) > 0){
            map(unflip_by_bam_n_region[not_empty_idx],
                            ~ (new_table$nuc[.x] <- 1:length(.x)))
            map(unflip_by_bam_n_region[not_empty_idx],
                            ~ (new_table$nuctot[.x] <- 
                                    min(new_table$nuctot[.x]):
                                        max(new_table$nuctot[.x])))
        }
    } else { # if all(are_genes_unflipped)
        by_bam_n_region <- map2(rep(unique(new_table$bam), 
                    each=length(unique(new_table$region))), 
            rep(unique(new_table$region), 
                    times=length(unique(new_table$bam))), 
            ~which(new_table$bam == .x & new_table$region == .y))
        not_empty_idx <- which(map(by_bam_n_region, 
                                                ~length(.x)) > 0) 
        if (length(not_empty_idx) > 0){
            map(by_bam_n_region[not_empty_idx], 
                            ~ (new_table$nuc[.x] <- 1:length(.x)))
            map(by_bam_n_region[not_empty_idx], 
                            ~ (new_table$nuctot[.x] <- 
                                    min(new_table$nuctot[.x]):
                                        max(new_table$nuctot[.x])))
        }
    }
    if(!is.null(bin_count)){
        #reinitialization of region/gene_size to be able to rebuild 
        #bin column
        length_by_region_n_bam <- new_table[,length(nuc),
                                            by=c('region','bam')]$V1
        new_table$regionsize <- rep(length_by_region_n_bam, 
                                    times=length_by_region_n_bam)
        #rebuild the correct bin column
        col_bins <- trunc((new_table$nuc/(new_table$regionsize+1))
                                *bin_count)+1
        new_table$bin <- as.integer(col_bins)
    }
    return(new_table)
}