R/05_manipulate_ranges.R

Defines functions double_flank add_inverse_strand extend down_flank up_flank summarize_loci add_seq

Documented in add_inverse_strand add_seq double_flank down_flank extend up_flank

#' Add sequence to GRanges
#' @param gr           \code{\link[GenomicRanges]{GRanges-class}}
#' @param bsgenome     \code{\link[BSgenome]{BSgenome-class}}
#' @param verbose      TRUE or FALSE (default)
#' @param as.character TRUE (default) or FALSE
#' @return \code{\link[GenomicRanges]{GRanges-class}}
#' @examples 
#' # PE example
#' #-----------
#'     require(magrittr)
#'     bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#'     gr <- char_to_granges(c(PRNP = 'chr20:4699600:+',             # snp
#'                             HBB  = 'chr11:5227002:-',             # snp
#'                             HEXA = 'chr15:72346580-72346583:-',   # del
#'                             CFTR = 'chr7:117559593-117559595:+'), # ins
#'                           bsgenome)
#'    (gr %<>% add_seq(bsgenome))
#'    
#' # TFBS example
#' #-------------
#'     bsgenome <- BSgenome.Mmusculus.UCSC.mm10::BSgenome.Mmusculus.UCSC.mm10
#'     bedfile  <- system.file('extdata/SRF.bed', package='multicrispr')
#'     gr <- bed_to_granges(bedfile, 'mm10')
#'     (gr %<>% add_seq(bsgenome))
#' @export
add_seq <- function(gr, bsgenome, verbose = FALSE, as.character = TRUE){
    
    # Assert
    assert_is_all_of(gr, 'GRanges')
    assert_is_all_of(bsgenome, 'BSgenome')
    
    # Message
    if (verbose)  cmessage('\tAdd seq')
    
    # Align seqlevelsStyle if required
    if (seqlevelsStyle(bsgenome)[1] != seqlevelsStyle(gr)[1]){
            cmessage("\t\t\tSet seqlevelStyle(bsgenome) <- seqlevelStyle(gr)")
            seqlevelsStyle(bsgenome)[1] <- seqlevelsStyle(gr)[1]
    }
    
    # Add seq
    gr$seq <- unname(BSgenome::getSeq(
                        bsgenome,
                        names        = seqnames(gr),
                        start        = start(gr),
                        end          = end(gr), 
                        strand       = strand(gr), 
                        as.character = as.character))
    
    # Return
    gr
}


summarize_loci <- function(gr){
    sprintf('%s:%s-%s', 
            as.character(seqnames(gr)), 
            start(gr), 
            end(gr))
}

#' Extend or Flank GRanges
#' 
#' Returns extensions, upstream flanks, or downstream flanks
#' 
#' \code{up_flank}   returns upstream flanks, in relation to start(gr).
#' \code{down_flank} returns downstream flanks, in relation to end(gr).
#' \code{extend}     returns extensions, in relation to start(gr) and end(gr)
#' @param gr        \code{\link[GenomicRanges]{GRanges-class}}
#' @param start     number or vector (same length as gr): start definition, 
#' relative to gr start (up_flank, extend) or gr end (down_flank).
#' @param end       number or vector (same length as gr): end definition,   
#' relative to gr start (up_flank) or gr end (extend, down_flank).
#' @param strandaware  TRUE (default) or FALSE: consider strand information?
#' @param bsgenome  NULL (default) or \code{\link[BSgenome]{BSgenome-class}}.
#'                  Required to update gr$seq if present.
#' @param verbose   TRUE or FALSE (default)
#' @param plot      TRUE or FALSE (default)
#' @param linetype_var string: gr var mapped to linetype
#' @param ...       passed to \code{\link{plot_intervals}}
#' @return a \code{\link[GenomicRanges]{GRanges-class}}
#' @examples 
#' # PE example
#' #-----------
#' require(magrittr)
#' bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#' gr <- char_to_granges(c(PRNP  = 'chr20:4699600:+',         # snp
#'                          HBB  = 'chr11:5227002:-',            # snp
#'                          HEXA = 'chr15:72346580-72346583:-',  # del
#'                          CFTR = 'chr7:117559593-117559595:+'),# ins
#'                       bsgenome = bsgenome)
#' gr %>% up_flank( -22,  -1, plot=TRUE)
#' gr %>% up_flank( c(-10,-20,-30,-40),  -1, plot=TRUE)
#' gr %>% up_flank( -22,  -1, plot=TRUE, strandaware=FALSE)
#' 
#' gr %>% down_flank(+1, +22, plot=TRUE)
#' gr %>% down_flank(+1, c(10, 20, 30, 40), plot=TRUE)
#' gr %>% down_flank(+1, +22, plot=TRUE, strandaware=FALSE)
#' 
#' gr %>% extend(   -10, +20, plot=TRUE)
#' gr %>% extend(   -10, +20, plot=TRUE, strandaware=FALSE)
#'
#' # TFBS example
#' #-------------
#'     bedfile <- system.file('extdata/SRF.bed', package='multicrispr')
#'     gr <- bed_to_granges(bedfile, genome = 'mm10')
#'     gr %>% extend(plot = TRUE)
#'     gr %>% up_flank(plot = TRUE)
#'     gr %>% down_flank(plot = TRUE)
#' @export
up_flank <- function(
    gr, start = -200, end = -1, strandaware = TRUE, bsgenome = NULL, 
    verbose = FALSE, plot = FALSE, linetype_var = 'set', ...){
# Assert
    assert_is_all_of(gr, 'GRanges')
    assert_is_numeric(start)
    assert_is_numeric(end)
    assert_is_subset(length(start), c(1, length(gr)))
    assert_is_subset(length(end),   c(1, length(gr)))
    assert_is_a_bool(verbose)
# Record
    newgr <- gr
    shift <- sprintf('(%s%d,%s%d)', csign(start[1]), abs(start[1]), 
                    csign(end[1]), abs(end[1]))
    if (!is_scalar(start) | !is_scalar(end)) shift %<>% paste0(' etc.')
    txt <- sprintf('\t\t%d%supstream %s flanks', 
                    length(newgr), 
                    ifelse(!strandaware, ' (strandagnostic) ', ' '),
                    shift)
# Flank
    if (is_scalar(start))  start <- rep(start, length(gr))
    if (is_scalar(end))    end   <- rep(end,   length(gr))
    GenomicRanges::start(newgr) <- 1  # avoid integrity errors
    GenomicRanges::end(newgr)   <- GenomicRanges::start(gr) + end
    GenomicRanges::start(newgr) <- GenomicRanges::start(gr) + start
    if (strandaware){
        idx <- as.logical(strand(newgr)=='-')
        GenomicRanges::start(newgr)[idx] <- 1 # avoid integrity errors
        GenomicRanges::end(newgr)[idx] <- GenomicRanges::end(gr)[idx]-start[idx]
        GenomicRanges::start(newgr)[idx] <- GenomicRanges::end(gr)[idx]-end[idx]
    }
# Add seq
    if ('seq' %in% names(mcols(gr))){
        assert_is_all_of(bsgenome, 'BSgenome')
        newgr %<>% add_seq(bsgenome)
    }
# Plot, Message, Return
    if (plot){
        gr$set    <- 'original'
        newgr$set <- 'upstream flanks'
        allgr <- c(gr, newgr)
        allgr$set %<>% factor(c('original', 'upstream flanks'))
        print(plot_intervals(
                allgr, linetype_var = linetype_var, ..., title = txt))
        newgr$set <- NULL
    }
    if (verbose) message(txt)
    newgr
}


#' @rdname up_flank
#' @export
down_flank <- function(
    gr, start = 1,  end = 200, strandaware = TRUE, bsgenome = NULL,
    verbose = FALSE, plot = FALSE, linetype_var = 'set', ...){
# Assert
    assert_is_any_of(gr, 'GRanges')
    assert_is_numeric(start)
    assert_is_numeric(end)
    assert_is_subset(length(start), c(1, length(gr)))
    assert_is_subset(length(end),   c(1, length(gr)))
    assert_is_a_bool(verbose)
# Record 
    newgr <- gr
    shift <- sprintf('(%s%d,%s%d)', csign(start[1]), abs(start[1]), 
                    csign(end[1]), abs(end[1]))
    if (!is_scalar(start) | !is_scalar(end)) shift %<>% paste0(' etc.')
    txt <- sprintf('\t\t%d%sdownstream %s flanks', 
                    length(newgr), 
                    ifelse(!strandaware, ' (strandagnostic) ', ' '),
                    shift)
# Flank
    if (is_scalar(start))  start <- rep(start, length(gr))
    if (is_scalar(end))    end   <- rep(end,   length(gr))
    GenomicRanges::start(newgr) <- 1 # avoid integrity errors
    GenomicRanges::end(newgr)   <- GenomicRanges::end(gr) + end
    GenomicRanges::start(newgr) <- GenomicRanges::end(gr) + start
    if (strandaware){
        idx <- as.logical(strand(newgr)=='-')
        GenomicRanges::start(newgr)[idx] <- 1 # avoid integrity errors
        GenomicRanges::end(newgr)[idx]<-GenomicRanges::start(gr)[idx]-start[idx]
        GenomicRanges::start(newgr)[idx]<-GenomicRanges::start(gr)[idx]-end[idx]
    }
# Add seq
    if ('seq' %in% names(mcols(gr))){
        assert_is_all_of(bsgenome, 'BSgenome')
        newgr %<>% add_seq(bsgenome)
    }
# Plot, Message, Return
    if (plot){
        gr$set <- 'original'
        newgr$set <- 'downstream flanks'
        allgr <- c(gr, newgr)
        allgr$set %<>% factor(c('original', 'downstream flanks'))
        print(plot_intervals(
                allgr, linetype_var = linetype_var, ..., title=txt))
        newgr$set <- NULL
    }
    if (verbose) message(txt)
    newgr
}


#' @rdname up_flank
#' @export
extend <- function(
    gr, start = -22, end = 22, strandaware = TRUE, bsgenome = NULL,
    verbose = FALSE, plot = FALSE, linetype_var = 'set', ...){
# Assert
    assert_is_any_of(gr, 'GRanges')
    assert_is_numeric(start)
    assert_is_numeric(end)
    assert_is_subset(length(start), c(1, length(gr)))
    assert_is_subset(length(end),   c(1, length(gr)))
    assert_is_a_bool(verbose)
# Record
    newgr <- gr
    if (!is_scalar(start) | !is_scalar(end)) shift %<>% paste0(' etc.')
    shift <- sprintf('(%s%d,%s%d)', 
                    csign(start[1]), abs(start[1]), csign(end[1]), abs(end[1]))
    txt <- sprintf('\t\t%d%sextensions %s', 
                    length(newgr), 
                    ifelse(!strandaware, ' (strandagnostic) ', ' '),
                    shift)
# Extend
    if (is_scalar(start))  start <- rep(start, length(gr))
    if (is_scalar(end))    end   <- rep(end,   length(gr))
    GenomicRanges::start(newgr) <- 1 # avoid integrity errors
    GenomicRanges::end(  newgr) <- GenomicRanges::end(gr) + end
    GenomicRanges::start(newgr) <- GenomicRanges::start(gr) + start
    if (strandaware){
        idx <- as.logical(strand(newgr)=='-')
        GenomicRanges::start(newgr)[idx] <- 1 # avoid integrity errors
        GenomicRanges::end(  newgr)[idx]<-GenomicRanges::end(gr)[idx]-start[idx]
        GenomicRanges::start(newgr)[idx]<-GenomicRanges::start(gr)[idx]-end[idx]
    }
# Add seq
    if ('seq' %in% names(mcols(gr))){
        assert_is_all_of(bsgenome, 'BSgenome')
        newgr %<>% add_seq(bsgenome)
    }
# Plot, Message, Return
    if (plot){
        gr$set <- 'original'
        newgr$set <- 'extensions'
        allgr <- c(gr, newgr)
        allgr$set %<>% factor(c('original', 'extensions'))
        print(plot_intervals(
                allgr, linetype_var = linetype_var, ..., title=txt))
        newgr$set <- NULL
    }
    if (verbose) message(txt)
    newgr
}




#' Add inverse strand
#' @param gr         \code{\link[GenomicRanges]{GRanges-class}}
#' @param verbose    TRUE or FALSE (default)
#' @param plot       TRUE or FALSE (default)
#' @param ...         \code{\link{plot_intervals}} arguments
#' @return \code{\link[GenomicRanges]{GRanges-class}}
#' @examples
#' # PE example
#' #-----------
#'     require(magrittr)
#'     bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#'     gr <- char_to_granges(c(PRNP = 'chr20:4699600:+',             # snp
#'                             HBB  = 'chr11:5227002:-',             # snp
#'                             HEXA = 'chr15:72346580-72346583:-',   # del
#'                             CFTR = 'chr7:117559593-117559595:+'), # ins
#'                           bsgenome)
#'     add_inverse_strand(gr, plot = TRUE)
#' # TFBS example
#' #-------------
#'     bedfile <- system.file('extdata/SRF.bed', package='multicrispr')
#'     gr <- bed_to_granges(bedfile, genome = 'mm10')
#'     add_inverse_strand(gr)
#' @export
add_inverse_strand <- function(gr, verbose = FALSE, plot = FALSE, ...){

    # Assert
    assert_is_all_of(gr, 'GRanges')

    # Invert
    revcomps <- invertStrand(gr)
    if ('seq' %in% names(mcols(gr))){
        revcomps$seq <- as.character(Biostrings::reverseComplement(
                            DNAStringSet(gr$seq)))
    }
    
    # Concatenate
    newgr <- c(gr, revcomps)
    newgr %<>% unique()
    txt <- sprintf(
        '\tRetain %d unique target ranges after adding inverse strands', 
        length(newgr))
    
    # Sort
    newgr <- sortSeqlevels(newgr)
    newgr <- GenomicRanges::sort(newgr, ignore.strand = TRUE)
    names(newgr) %<>% paste0('_', 
                            c('+'='f', '-'='r')[as.character(strand(newgr))])
    
    # Plot
    if (plot){
        gr$set    <- 'original'
        revcomps$set <- 'inverse'
        print(plot_intervals(
                c(gr, revcomps), 
                color_var = 'set', linetype_var = 'set', ..., title = txt))
        gr$set <- NULL
    }
    
    # Message
    if (verbose) cmessage(txt)
    newgr
}

#' Double flank
#' 
#' @param gr \code{\link[GenomicRanges]{GRanges-class}}
#' @param upstart      upstream flank start in relation to start(gr)
#' @param upend        upstream flank end   in relation to start(gr)
#' @param downstart    downstream flank start in relation to end(gr)
#' @param downend      downstream flank end   in relation to end(gr)
#' @param strandaware  TRUE (default) or FALSE
#' @param plot         TRUE or FALSE (default)
#' @param linetype_var gr var mapped to linetype
#' @param ...          passed to plot_intervals
#' @return \code{\link[GenomicRanges]{GRanges-class}}
#' @examples 
#' # Prime Editing example
#' #----------------------
#'     require(magrittr)
#'     bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#'     gr <- char_to_granges(c(PRNP = 'chr20:4699600:+',             # snp
#'                             HBB  = 'chr11:5227002:-',             # snp
#'                             HEXA = 'chr15:72346580-72346583:-',   # del
#'                             CFTR = 'chr7:117559593-117559595:+'), # ins
#'                           bsgenome)
#'     double_flank(gr, -10,  -1, +1, +20, plot = TRUE)
#'       
#' # TFBS example
#' #-------------
#'     bedfile  <- system.file('extdata/SRF.bed', package='multicrispr')
#'     gr  <-  bed_to_granges(bedfile, genome = 'mm10', plot = FALSE)
#'     double_flank(gr, plot = TRUE)
#' @export
double_flank <- function(
    gr, 
    upstart     = -200, 
    upend       = -1, 
    downstart   = 1, 
    downend     = 200,
    strandaware = TRUE,
    plot        = FALSE, 
    linetype_var = 'set',
    ...
){

    # Up flank, down flank, concatenate
    up <- up_flank(gr,   upstart,   upend,   strandaware, verbose = FALSE)
    dn <- down_flank(gr, downstart, downend, strandaware, verbose = FALSE)
    names(up) %<>% paste0('_u') # ensure unique names
    names(dn) %<>% paste0('_d')
    newgr  <- c(up, dn)
    txt <- sprintf('\t\t%d flank ranges: %d up + %d down', 
                        length(newgr), length(up), length(dn))
    
    # Plot    
    if (plot){
        gr$set <- 'original'
        up$set <- 'upstream flank'
        dn$set <- 'downstream flank'
        allgr <- c(gr, up, dn)
        allgr$set %<>% factor(
                        c('original', 'upstream flank', 'downstream flank'))
        print(plot_intervals(allgr, linetype_var = linetype_var, title = txt, 
                            y = 'targetname', ...))
    }
    
    # Return
    message(txt)
    newgr
}

Try the multicrispr package in your browser

Any scripts or data that you put into this service are public.

multicrispr documentation built on Nov. 8, 2020, 5:10 p.m.