R/06_find_primespacers.R

Defines functions add_nickspacers find_gg extend_pe_to_gg copy vextract2 vextract1 cextract2 cextract1

Documented in extend_pe_to_gg find_gg

cextract1 <- function(y) y[, 1] %>% paste0(collapse=';')
cextract2 <- function(y) y[, 2] %>% paste0(collapse=';')
vextract1 <- function(z) vapply(z, cextract1, character(1))
vextract2 <- function(z) vapply(z, cextract2, character(1))

copy <- function(
    gr, 
    start  = GenomicRanges::start(gr), 
    end    = GenomicRanges::end(gr),
    strand = GenomicRanges::strand(gr), 
    ...
){
    newgr <- gr
    GenomicRanges::start(gr) <- start
    GenomicRanges::end(gr)   <- end
    strand(gr) <- strand
    
    dots_list <- list(...)
    for (mvar in names(dots_list)) mcols(gr)[[mvar]] <- dots_list[[mvar]]
    gr
}

#' Extend prime editing target to find GG sites
#' 
#' Extend prime editing target to find GG sites in accessible neighbourhood
#' 
#' Extends each target range to the area in which to search for a prime editing 
#' GG duplet, as shown in the sketch below.
#' 
#'                    ===============>
#'                              ----GG--------->
#'                ----GG--------->
#'                              **
#'                <---------GG---
#'                              <---------GG----
#'                          <===============    
#'         
#' @param gr   target \code{\link[GenomicRanges]{GRanges-class}}
#' @param nrt  n RT nucleotides (default 16, recommended 10-16)
#' @param plot TRUE or FALSE (default)
#' @return \code{\link[GenomicRanges]{GRanges-class}}
#' @examples
#' # PE example
#' #-----------
#'     require(magrittr)
#'     bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#'     gr <- char_to_granges(c(PRNP = 'chr20:4699600:+',             # snp
#'                            HBB  = 'chr11:5227002:-',              # snp
#'                            HEXA = 'chr15:72346580-72346583:-',    # del
#'                            CFTR = 'chr7:117559593-117559595:+'),  # ins
#'                           bsgenome)
#'     extend_pe_to_gg(gr, plot = TRUE)
#' @export
extend_pe_to_gg <- function(gr, nrt=16, plot = FALSE){

    # Extend
    fw <- copy(gr, strand='+', start = end(gr) - nrt + 5, end = start(gr) + 5)
    rv <- copy(gr, strand='-', start = end(gr) - 5, end = start(gr) + nrt - 5)
    #fw$tstart  <- rv$tstart   <- start(gr)
    #fw$tend    <- rv$tend     <- end(gr)
    #fw$tstrand <- rv$tstrand  <- strand(gr)
    ext <- sort(c(fw, rv))

    # Plot    
    if (plot){
        gr$set <- 'targets'
        fw$set <- 'possible GG of fwd strand'
        rv$set <- 'possible GG on rev strand'
        plot_intervals(c(gr, fw, rv), color_var = 'set', y = 'set')
    }
    
    # Return
    ext
}


#' Find GG
#' @param gr \code{\link[GenomicRanges]{GRanges-class}}
#' @return   \code{\link[GenomicRanges]{GRanges-class}}
#' @examples
#' # PE example
#' #-----------
#'     require(magrittr)
#'     bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#'     gr <- char_to_granges(c(PRNP = 'chr20:4699600:+',            # snp
#'                             HBB  = 'chr11:5227002:-',             # snp
#'                             HEXA = 'chr15:72346580-72346583:-',   # del
#'                             CFTR = 'chr7:117559593-117559595:+'), # ins
#'                           bsgenome)
#'     gr %<>% extend_pe_to_gg(plot = TRUE) %>% add_seq(bsgenome) 
#'     find_gg(gr)
#' @export
find_gg <- function(gr){
    
    substart <- subend <- windowstart <- windowend <- NULL
    
    res <- stri_locate_all_fixed(gr$seq, 'GG', overlap=TRUE)
    gr$substart <- vextract1(res)
    gr$subend   <- vextract2(res)
    
    # Rm GG-free gr
    idx <- gr$substart == 'NA'
    if (sum(idx)>0) gr %<>% extract(!idx)

    # Identify GG sites
    gg_dt <- as.data.table(gr) %>% 
            setnames(c('start', 'end'), c('windowstart', 'windowend')) %>% 
            separate_rows(substart, subend)                            %>%
            data.table()                                               %>% 
            extract(, substart := as.numeric(substart))                %>% 
            extract(, subend   := as.numeric(subend))                  %>% 
            # extract(, seq    := substr(seq, substart, subend))  %>%
            extract(strand=='+', start := windowstart + substart - 1)  %>% 
            extract(strand=='+', end   := windowstart + subend   - 1)  %>% 
            extract(strand=='-', end   := windowend   - substart + 1)  %>% 
            extract(strand=='-', start := windowend   - subend   + 1)  %>% 
            extract()
    gg_dt %<>% setorderv(c('seqnames', 'start', 'strand'))
    gg_dt[, c('windowstart', 'windowend', 'seq', 'substart', 'subend') := NULL]
    gg_ranges   <- GRanges(gg_dt, seqinfo = seqinfo(gr))
    #gg_ranges %<>% add_seq(bsgenome, verbose = FALSE)
    return(gg_ranges)
}



add_nickspacers <- function(primespacers, bsgenome, 
    ontargetmethod  = c('Doench2014', 'Doench2016'), 
    offtargetmethod = c('bowtie', 'pdict')[1],
    nickmatches = 3, indexedgenomesdir = INDEXEDGENOMESDIR, outdir = OUTDIR, 
    plot = TRUE, verbose = TRUE
){
# Check / Initialize
    . <- crisprname <- crisprspacer <- crisprpam <- nickoff <- NULL
    off0 <- off1 <- off2 <- pename <- off <- NULL
    primespacers$pename <- primespacers$crisprname
    primespacers$type <- 'pespacer'
# Get nick spacers
    message('Find nickspacers')
    nickzone <- invertStrand(down_flank(primespacers, -3+40-5, -3+90+17))
    mcols(nickzone) %<>% extract(, c('targetname', 'pename'), drop = FALSE)
    nickspacers <- find_spacers(nickzone, bsgenome, complement = FALSE, 
        ontargetmethod = ontargetmethod, offtargetmethod = offtargetmethod, 
        offtargetfilterby = 'pename',
        mismatches = nickmatches, indexedgenomesdir = indexedgenomesdir, 
        outdir = outdir, plot=FALSE)
    nickspacers$type <- 'nickspacer'
    #mcols(nickspacers)[[ontargetmethod]] %<>% round(digits = 2)
# Merge
    nickdt  <-  gr2dt(nickspacers) %>% 
                extract( , .(
                    pename     = pename,
                    nickrange  = as.character(granges(nickspacers)), 
                    nickspacer = crisprspacer, 
                    nickpam    = crisprpam))
    if (!is.null(offtargetmethod)){
        nickdt[, nickoff := nickspacers$off]
        for (i in seq(0, nickmatches)){
            offvalues <- mcols(nickspacers)[[paste0('off',i)]]
            nickdt[, (paste0('nickoff',i)) := offvalues]}}
    if (!is.null(ontargetmethod)){
        method <- ontargetmethod
        nickdt[, (paste0('nick', method)) := mcols(nickspacers)[[method]]]}
    nickdt %<>% extract( , lapply(.SD, pastelapse), by = 'pename')

    pedt <- gr2dt(primespacers)
    pedt$type <- NULL
    mergedranges <- merge(pedt, nickdt, by = 'pename', sort = FALSE, all = TRUE)
    mergedranges$pename <- NULL
    mergedranges %<>% dt2gr(seqinfo(primespacers))
# Plot and Return
    if (plot)   print(plot_intervals(mergedranges))
    return(mergedranges)
}


pastelapse <- function(x) paste0(x, collapse = ';')

#' Find prime editing spacers
#' 
#' Find prime editing spacers around target ranges
#' 
#' Below the architecture of a prime editing site.
#' Edits can be performed anywhere in the revtranscript area.
#' 
#'         spacer        pam
#'   --------------------===
#'           primer     revtranscript
#'       -------------================
#'   1..............17....GG..........
#'   .....................CC..........
#'       ----------extension----------
#' 
#' @param gr        \code{\link[GenomicRanges]{GRanges-class}}
#' @param bsgenome  \code{\link[BSgenome]{BSgenome-class}}
#' @param edits     character vector: desired edits on '+' strand.
#'                  If named, names should be identical to those of \code{gr}
#' @param nprimer   n primer nucleotides (default 13, max 17)
#' @param nrt       n rev transcr nucleotides (default 16, recomm. 10-16)
#' @param ontargetmethod  'Doench2014' or 'Doench2016': on-target scoring method
#' @param offtargetmethod  'bowtie' or 'pdict'
#' @param mismatches  no of primespacer mismatches 
#'                   (default 0, to suppress offtarget analysis: -1)
#' @param nickmatches no of nickspacer offtarget mismatches 
#'                   (default 2, to suppresses offtarget analysis: -1)
#' @param indexedgenomesdir  directory with indexed genomes 
#'                           (as created by \code{\link{index_genome}})
#' @param outdir    directory whre offtarget analysis output is written
#' @param verbose   TRUE (default) or FALSE
#' @param plot      TRUE (default) or FALSE
#' @param ...       passed to plot_intervals
#' @return  \code{\link[GenomicRanges]{GRanges-class}} with prime editing spacer
#' ranges and following mcols: 
#'   * crisprspacer: N20 spacers
#'   * crisprpam:    NGG PAMs
#'   * crisprprimer: primer (on PAM strand)
#'   * crisprtranscript: reverse transcript (on PAM strand)
#'   * crisprextension:  3' extension of gRNA
#'               contains: reverse transcription template + primer binding site
#'               sequence can be found on non-PAM strand
#'   * crisprextrange: genomic range of crispr extension
#'   * Doench2016|4: on-target efficiency scores
#'   * off0, off1, off2: number of offtargets with 0, 1, 2 mismatches
#'   * off: total number of offtargets: off = off0 + off1 + ...
#'   * nickrange:        nickspacer range
#'   * nickspacer:       nickspacer sequence
#'   * nickDoench2016|4: nickspacer Doench scores
#'   * nickoff:          nickspacer offtarget counts
#' @examples
#' # Find PE spacers for 4 clinically relevant loci (Anzalone et al, 2019)
#'     bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#'     gr <- char_to_granges(c(
#'         PRNP = 'chr20:4699600:+',             # snp: prion disease
#'         HBB  = 'chr11:5227002:-',             # snp: sickle cell anemia
#'         HEXA = 'chr15:72346580-72346583:-',   # del: tay sachs disease
#'         CFTR = 'chr7:117559593-117559595:+'), # ins: cystic fibrosis
#'         bsgenome)
#'     spacers <- find_primespacers(gr, bsgenome)
#'     spacers <- find_spacers(extend_for_pe(gr), bsgenome, complement = FALSE)
#'     
#' # Edit PRNP locus for resistance against prion disease (Anzalone et al, 2019)
#'     bsgenome <- BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38  
#'     gr <- char_to_granges(c(PRNP = 'chr20:4699600:+'), bsgenome)
#'     find_primespacers(gr, bsgenome)
#'     find_primespacers(gr, bsgenome, edits = 'T')
#' @seealso \code{\link{find_spacers}} to find standard crispr sites
#' @export
find_primespacers <- function(gr, bsgenome, edits = get_plus_seq(bsgenome, gr), 
    nprimer = 13, nrt = 16, ontargetmethod  = c('Doench2014', 'Doench2016')[1], 
    offtargetmethod = c('bowtie', 'pdict')[1], 
    mismatches = 0, nickmatches = 2, indexedgenomesdir = INDEXEDGENOMESDIR, 
    outdir = OUTDIR, verbose = TRUE, plot = TRUE, ...){
# Assert
    assert_is_all_of(gr, 'GRanges')
    assert_is_all_of(bsgenome, 'BSgenome')
    assert_is_character(edits)
    assert_all_are_matching_regex(edits, '^[ACGTacgt]+$')
    assert_are_same_length(gr, edits)
    if (has_names(edits))   assert_are_identical(names(gr), names(edits))
    assert_is_a_number(nprimer); assert_is_a_number(nrt)
    assert_all_are_less_than(nprimer, 17)
    assert_is_a_bool(verbose);   assert_is_a_bool(plot)
# Find GG in nrt window around target site
    if (verbose)  message('Find primespacers for ', length(gr), ' targets')
    gr %<>% name_uniquely(); names(edits) <- names(gr)
    gg  <-  gr %>% extend_pe_to_gg(nrt) %>% add_seq(bsgenome) %>% find_gg()
    names(gg) <- gg$crisprname <- uniquify(gg$targetname)
# Extract from these other components
    bs <- bsgenome; invstr <- invertStrand
    spacers       <- up_flank(gg, -21,        -2)    %>% add_seq(bs)
    pams          <- up_flank(gg,  -1,        +1)    %>% add_seq(bs) 
    primers       <- up_flank(gg, -4-nprimer, -5)    %>% add_seq(bs)
    transcripts   <- up_flank(gg, -4, -5+nrt) %>% add_fixed_seqs(bs, edits)
    exts  <- up_flank(gg, -4-nprimer, -5+nrt) %>% invertStrand() %>% 
            add_fixed_seqs(bs, edits) # Revcomp for "-" seqs
# Add sequences
    names(mcols(spacers)) %<>% stri_replace_first_fixed('seq', 'crisprspacer')
    spacers$crisprpam        <- pams$seq
    spacers$crisprprimer     <- primers$seq
    spacers$crisprtranscript <- transcripts$seq
    spacers$crisprextension  <- exts$seq
    spacers$crisprextrange   <- unname(as.character(granges(exts)))
# Count offtargets, score ontargets, add nickspacers, plot, return
    if (verbose) message("\tFound ", length(spacers), " primespacers")
    spacers %<>% count_offtargets(bsgenome, mismatches=mismatches, pam='NGG',
        offtargetmethod = offtargetmethod, outdir = outdir, indexedgenomesdir = 
        indexedgenomesdir, verbose= TRUE, plot = FALSE)
    spacers %<>% score_ontargets(
                    bsgenome, ontargetmethod = ontargetmethod, plot = FALSE)
    spacers %<>% add_nickspacers(bsgenome, ontargetmethod = ontargetmethod, 
        offtargetmethod = offtargetmethod, 
        nickmatches = nickmatches, indexedgenomesdir = indexedgenomesdir,
        outdir = outdir, plot = FALSE)
    if (plot) print(plot_intervals(spacers, ...))
    spacers
}

find_pe_spacers <- function(...){
    .Deprecated('find_primespacers')
    find_primespacers(...)
}

add_fixed_seqs <- function(gr, bsgenome, edits){
    # Get '+' seq
    gr$seq <- get_plus_seq(bsgenome, gr)
    substr(gr$seq, gr$targetstart - start(gr)+1, 
                    gr$targetend  - start(gr)+1) <- edits[gr$targetname]
    gr$seq[as.logical(strand(gr)=='-')] %<>% revcomp()     
    gr
}

Try the multicrispr package in your browser

Any scripts or data that you put into this service are public.

multicrispr documentation built on Nov. 8, 2020, 5:10 p.m.