R/filtergRNAs.R

Defines functions filtergRNAs

Documented in filtergRNAs

#' Filter gRNAs
#' 
#' Filter gRNAs containing restriction enzyme cut site
#' 
#' %% ~~ If necessary, more details than the description above ~~
#' 
#' @param all.gRNAs gRNAs as DNAStringSet, such as the output from findgRNAs
#' @param pairOutputFile File path with paired gRNAs
#' @param findgRNAsWithREcutOnly Indicate whether to find gRNAs overlap with
#' restriction enzyme recognition pattern
#' @param REpatternFile File path containing restriction enzyme cut patters
#' @param format Format of the REpatternFile, default as fasta
#' @param minREpatternSize Minimum restriction enzyme recognition pattern
#' length required for the enzyme pattern to be searched for, default 4
#' @param overlap.gRNA.positions The required overlap positions of gRNA and
#' restriction enzyme cut site, default 17 and 18
#' @param overlap.allpos Default TRUE, meaning that only gRNAs overlap with all
#' the positions are retained FALSE, meaning that gRNAs overlap with one or
#' both of the positions are retained
#' @return \item{gRNAs.withRE }{gRNAs as DNAStringSet that passed the filter
#' criteria} \item{gRNAREcutDetails}{a data frame that contains a set of gRNAs
#' annotated with restriction enzyme cut details}
#' @note %% ~~further notes~~
#' @author Lihua Julie Zhu
#' @seealso offTargetAnalysis
#' @references %% ~put references to the literature/web site here ~
#' @keywords misc
#' @examples
#' 
#'     all.gRNAs <- findgRNAs(
#'         inputFilePath = system.file("extdata", "inputseq.fa", 
#'         package = "CRISPRseek"),
#'         pairOutputFile = "testpairedgRNAs.xls",
#'         findPairedgRNAOnly = TRUE)
#' 
#'     gRNAs.RE <- filtergRNAs(all.gRNAs = all.gRNAs,
#'         pairOutputFile = "testpairedgRNAs.xls",minREpatternSize = 6,
#'         REpatternFile = system.file("extdata", "NEBenzymes.fa", 
#'         package = "CRISPRseek"), overlap.allpos = TRUE)
#' 
#'     gRNAs  <- gRNAs.RE$gRNAs.withRE
#'     restriction.enzyme.cut.sites <- gRNAs.RE$gRNAREcutDetails
#' @importFrom Biostrings readDNAStringSet reverseComplement DNAStringSet
#' @importFrom BiocGenerics do.call rbind lapply cbind
#' @importFrom utils read.table
#' @importFrom S4Vectors merge
#' @importFrom IRanges width
#' @export
filtergRNAs <-
    function(all.gRNAs, pairOutputFile = "", 
        findgRNAsWithREcutOnly = FALSE, 
	REpatternFile = system.file("extdata", "NEBenzymes.fa",
            package = "CRISPRseek"), format = "fasta",
        minREpatternSize = 4, overlap.gRNA.positions = c(17, 18),
		overlap.allpos = TRUE)
{
    if (length(all.gRNAs) == 0)
	stop("all.gRNAs contains no gRNAs!")
    if (missing(REpatternFile)) {
        stop("REpatternFile containing the restriction enzyme cut pattern 
            is required!")
    }
    if (!file.exists(REpatternFile)) {
        stop("REpatternFile specified as ", REpatternFile, " does not exists!")
    }
    if (format != "fasta" && format != "fastq") {
        stop("format needs to be either fasta or fastq!")
    }
    patterns <- unique(readDNAStringSet(REpatternFile, format, 
        use.names = TRUE))
    patterns <- patterns[width(patterns) >= minREpatternSize, ]
    countRE <- as.data.frame(table(names(patterns)))
    singleRE <- patterns[names(patterns) %in% countRE[countRE[,2] == 1, 1]]
    duplicateRE <- patterns[names(patterns) %in% countRE[countRE[,2] > 1, 1]]
    duplicateRE <- duplicateRE[order(names(duplicateRE))]
    if (length(duplicateRE) > 0)
    {
        print("More than one pattern found for the following REs and these RE
            will be skipped! If you want to include them in the search, please 
            correct the REpattern file and run this again!")
        print(unique(names(duplicateRE)))
    }
    patterns <- singleRE
    seqs <- as.character(all.gRNAs)
    seq.names <- names(all.gRNAs)
    min.pStart.plus <- min(overlap.gRNA.positions)
    max.pStart.plus <- max(overlap.gRNA.positions)
    gRNAs.RE <- data.frame(gRNAPlusPAM = "", REcutgRNAName = "", 
        REname = "", REpattern = "", REcutStart = "", REcutEnd = "")
    for (j in 1:length(patterns))
    {
        pattern.name <- gsub("'", "", names(patterns)[j])
        pattern <- patterns[[j]]
	this.pattern.size <- length(pattern)
        revpattern <- reverseComplement(pattern)
        if (revpattern != pattern)
        {
	    revpattern <- translatePattern(revpattern)
            minus.gRNAs <- do.call(rbind, lapply(1:length(all.gRNAs), 
                function(i){
                    res1  <- as.numeric(gregexpr(revpattern, seqs[i],
                        perl = TRUE,fixed = FALSE,ignore.case = TRUE)[[1]])
                    do.call(rbind, lapply(1:length(res1), function(k) {
						if (res1[k] >0 && !overlap.allpos && 
							((min.pStart.plus >= res1[k] && 
							min.pStart.plus <= (res1[k] + this.pattern.size -1)) || 
							 (max.pStart.plus >= res1[k] && 
							  max.pStart.plus <= (res1[k] + this.pattern.size -1))))
                            c(as.character(seqs[i]),seq.names[i], pattern.name,
                                as.character(reverseComplement(patterns[[j]])), 
                                this.pattern.size - 1 + res1[k], res1[k])
						else if (res1[k] >0 && overlap.allpos && 
							min.pStart.plus >= res1[k] && 
							max.pStart.plus <= (res1[k] + this.pattern.size -1))
							c(as.character(seqs[i]),seq.names[i], pattern.name,
								as.character(reverseComplement(patterns[[j]])), 
							    this.pattern.size - 1 + res1[k], res1[k])
                   }))
                }
            ))
            if (length(minus.gRNAs) >1)
            {
                #print(minus.gRNAs)
                # print(j)
                # print(dim(minus.gRNAs))
                colnames(minus.gRNAs) <- c("gRNAPlusPAM", 
                    "REcutgRNAName", "REname", "REpattern",
                    "REcutStart","REcutEnd")
                gRNAs.RE <- rbind(gRNAs.RE,  minus.gRNAs)	
            }
        }		 
        pattern <- translatePattern(pattern)
        pos.gRNAs <- do.call(rbind, lapply(1:length(all.gRNAs), 
            function(i){
                res1  <- as.numeric(gregexpr(pattern, seqs[i], perl = TRUE,
                    fixed = FALSE, ignore.case = TRUE)[[1]])
                do.call(rbind, lapply(1:length(res1), function(k) {
			if (res1[k] >0 && !overlap.allpos && 
				((min.pStart.plus >= res1[k] && 
				min.pStart.plus <= (res1[k] + this.pattern.size -1)) || 
				(max.pStart.plus >= res1[k] && 
				max.pStart.plus <= (res1[k] + this.pattern.size -1))))
                        c(as.character(seqs[i]), seq.names[i], pattern.name, 
                            as.character(patterns[[j]]), res1[k], 
                            res1[k] + this.pattern.size - 1)
					else if (res1[k] >0 && overlap.allpos && 
						min.pStart.plus >= res1[k] && 
						max.pStart.plus <= (res1[k] + this.pattern.size -1))
						c(as.character(seqs[i]), seq.names[i], pattern.name, 
						    as.character(patterns[[j]]), res1[k], 
							res1[k] + this.pattern.size - 1)
                }))
            }
        ))
        if (length(pos.gRNAs) > 0)
        {
            #print("pos.gRNAs")
            #print(j)
            #print(pos.gRNAs)
            colnames(pos.gRNAs) <- c("gRNAPlusPAM", "REcutgRNAName", 
                "REname", "REpattern", "REcutStart", "REcutEnd")
            gRNAs.RE <- rbind(gRNAs.RE,  pos.gRNAs)	
        }
    }
    gRNAs.RE <- gRNAs.RE[gRNAs.RE[,1] != "", ]
    if (! missing(pairOutputFile) && file.exists(pairOutputFile)) {
        gRNAs.RE.plus <- gRNAs.RE
        gRNAs.RE.minus <- gRNAs.RE
        colnames(gRNAs.RE.plus) <- paste("Forward", 
            colnames(gRNAs.RE.plus), sep = "")
        colnames(gRNAs.RE.minus) <- paste("Reverse", 
            colnames(gRNAs.RE.minus), sep = "")
        pairgRNAs <- read.table(pairOutputFile, sep = "\t", header = TRUE,
            stringsAsFactors = FALSE)
        ann.gRNAs <- merge(pairgRNAs, gRNAs.RE.plus, 
            by = "ForwardgRNAPlusPAM", all.x = TRUE)
        ann.gRNAs <- merge(ann.gRNAs, gRNAs.RE.minus, 
            by = "ReversegRNAPlusPAM", all.x = TRUE)
        ann.gRNAs <- cbind(ann.gRNAs[,1], ann.gRNAs[,3], ann.gRNAs[,2],
            ann.gRNAs[,4:dim(ann.gRNAs)[2]])
        colnames(ann.gRNAs)[1:3] <- c("ReversegRNAPlusPAM", 
            "ReversegRNAName", "ForwardgRNAPlusPAM")
        ann.gRNAs <- ann.gRNAs[order(ann.gRNAs[,1]), ]
        withRE <- ann.gRNAs[ ! is.na(ann.gRNAs$ForwardREname) | 
            ! is.na(ann.gRNAs$ReverseREname), ]
        withRE <- unique(cbind(withRE[,1:4]))
        if (dim(withRE)[1] == 0 && findgRNAsWithREcutOnly)
            stop("No pairs with RE sites!")
        gRNAs  <- DNAStringSet(c(as.character(withRE$ForwardgRNAPlusPAM),
            as.character(withRE$ReversegRNAPlusPAM)))
        names(gRNAs) <- c(as.character(withRE$ForwardgRNAName),
            as.character(withRE$ReversegRNAName))
    }
    else ### from unpaired search
    {
        if (dim(gRNAs.RE)[1] == 0 && findgRNAsWithREcutOnly)
            stop("No gRNAs with RE sites!")
        temp <- unique(cbind(as.character(gRNAs.RE$gRNAPlusPAM), 
            as.character(gRNAs.RE$REcutgRNAName)))
        gRNAs <- DNAStringSet(temp[, 1])
        names(gRNAs) <- temp[, 2]
        ann.gRNAs <- gRNAs.RE
    }
    list(gRNAs.withRE = unique(gRNAs), gRNAREcutDetails = ann.gRNAs)
}
LihuaJulieZhu/CRISPRseek documentation built on Feb. 3, 2024, 2:44 p.m.