R/cas9.gRNA.oligo1m.R

Defines functions cas9.gRNA.oligo1m

Documented in cas9.gRNA.oligo1m

## cas9.gRNA.oligo1m.R
## An automated R script for searching of CRISPR-Cas9 compatible gRNA-target-site oligos based on a input fasta sequence. It would generate a table that lists all possible gRNA-target-site oligos, includes their sequences, target range to micro-eukaryotes and host taxonomic group, and etc.
## Compared to cas9.gRNA.oligo1.R, this functions is built to deal with the situation that in the input fasta file, there are more than one sequence. While the cas9.gRNA.oligo1.R can only deal with the input fasta file with only one sequence. Here cas9.gRNA.oligo1m.R could deal with multiple sequenes in the input fasta file.
## Kevin Xu ZHONG, kevinzhong2006@gmail.com; xzhong@eoas.ubc.ca
## depends on R libraries biostrings, reshape2 and ape being installed
## depends on a custom V4 region of 18S rRNA genes flanked by 18S primers "TAReuk454FWD1 and TAReukREV3" (Stoeck et al., 2010) that is prepared from the SILVA 18S SSU V119 99rep database.
## The input fasta needs to be the same V4 region flanked by 18S primers "TAReuk454FWD1 and TAReukREV3" (Stoeck et al., 2010).


#' A cas9.gRNA.oligo1m Function
#'
#' This function allows you to design gRNA-target-site oligos for the guding RNA to direct Cas9 nuclease to specifically cut hosts 18S rRNA genes but not of protists and fungi.
#'
#' If you aim to cut host 18S rRNA genes of V4 region that are generated using 18S primers "TAReuk454FWD1 and TAReukREV3" (Stoeck et al., 2010), please use this cas9.gRNA.oligo1m() function as it based on reference database of this region
#'
#' If you aim to target another region of 18S rRNA gene and amplified by different primer sets, or different different genes, please use cas9.gRNA.oligo2() function by providing your own reference database
#'
#' @param inseq 'Path/To/Your/Input_sequence_fasta_file'; For example: inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4.fasta".
#'
#' @param target The host taxonomic group (from species to kingdom) that you think the obtained sgRNA plan to target; If there is space between two word, the sapce should be replace using "_". For example, target="Homo sapiens" need to change to be target="Homo_sapiens".
#' The target is aiming to obtain target range of your probe (how good it is in helping cleaving the sequence of reference database either within same group or higher taxonomy level).
#'
#' @keywords cas9.gRNA.oligo1m
#' @export
#'
#' @examples
#' ##### If you want to predict the gRNA's target range among sequences of a host taxonomic group in SILVA and all sequences in input fasta file
#' cas9.gRNA.oligo1m(inseq="Path/To/Your/Input_sequence_fasta_file.fasta", target="Taxonomic_group_of_a_host")
#'
#' ##### If you do NOT want to predict the gRNA's target range among sequences of a host taxonomic group in SILVA, but among all sequences in input fasta file
#' cas9.gRNA.oligo1m(inseq="Path/To/Your/Input_sequence_fasta_file.fasta")
#'
#'
#'###############################################
#' ##### To design sgRNA for your human 18S (V4 region of 18S rRNA gene) and predict the sgRNA's host-target range among other "Homo_sapiens" sequences in SILVA and among all sequences in input fasta file.
#' cas9.gRNA.oligo1m(inseq="/home/kevin/Desktop/data/human_3seqs.fasta", target="Homo_sapiens")
#'
#' ##### To design sgRNA for your human 18S (V4 region of 18S rRNA gene), but if you do not want to predict the sgRNA's host-target range among other taxonomic groups
#' cas9.gRNA.oligo1m(inseq="/home/kevin/Desktop/data/human_3seqs.fasta")
#'
#' ##### To design sgRNA for your oyster 18S (V4 region of 18S rRNA gene) and predict the sgRNA's host-target range among other "Crassostrea_gigas" sequences in SILVA and among all sequences in input fasta file.
#' cas9.gRNA.oligo1m(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4_3seqs.fasta", target="Crassostrea_gigas")
#'
#' ##### To design sgRNA for your oyster 18S (V4 region of 18S rRNA gene) and predict the sgRNA's host-target range among other "Ostreidae" sequences in SILVA and among all sequences in input fasta file.
#' cas9.gRNA.oligo1m(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4_3seqs.fasta", target="Ostreidae")
#'
#' ##### To design sgRNA for your oyster 18S (V4 region of 18S rRNA gene) and predict the sgRNA's host-target range among other "Mollusca" sequences in SILVA and among all sequences in input fasta file.
#' cas9.gRNA.oligo1m(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4_3seqs.fasta", target="Mollusca")
#'
#' ##### To design sgRNA for your oyster 18S (V4 region of 18S rRNA gene), but if you do not want to predict the sgRNA's host-target range among other taxonomic groups.
#' cas9.gRNA.oligo1m(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4_3seqs.fasta")
#'
#'
#'
#'
#'
#'
#'
#'


cas9.gRNA.oligo1m <- function(inseq, target=NULL) {
  library(ape)
  library(Biostrings)
  library(reshape2)

  inseq <- as.character(inseq)#PATH to the input 18S sequences of V4 region of target host, the output file named after the sequence file


  if(is.null(target)) {
    print("target is empty")
    print("The R is running, please be patient!")

    #myseqs0 <- readDNAStringSet(refseq, "fasta")
    data(silva_v119_18S_99rep_V4)
    #myseqs0 <- readDNAStringSet(system.file("extdata", "silva_v119_18S_99rep_V4.fasta", package = "CasOligo"), "fasta")
    myseqs0 <- silva_v119_18S_99rep_V4

    myseqs0
    length(myseqs0)

    myseqs1 <- myseqs0[!grepl("Metazoa", names(myseqs0))]
    myseqs1 <- myseqs1[!grepl("Embryophyta", names(myseqs1))]

    myseqs1.rc <- reverseComplement(myseqs1)

    myseqs <- c(myseqs1, myseqs1.rc)
    myseqs

    #############input seq ###########################
    sanger <- readDNAStringSet(inseq, "fasta")
    sanger

    sanger.rc <- reverseComplement(sanger)

    if(length(sanger)>1) {

      #######################################
      myseqs0 <- sanger

      myseqs0.rc <- reverseComplement(sanger)

      myseqs10 <- c(myseqs0, myseqs0.rc)
      myseqs10



      #######################################
      myseqs0.D4 <- myseqs0
      qur1 <- sanger[1]
      oyt1 <- myseqs0.D4[!names(myseqs0.D4)==names(qur1)]
      myseqs0.rc.D4 <- myseqs0.rc
      qur2 <- sanger.rc[1]
      oyt2 <- myseqs0.rc.D4[!names(myseqs0.rc.D4)==names(qur2)]


      tab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
      colnames(tab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
      tab$strand <- "forward"
      for (j in 1:(width(qur1)[1]-23)) {
        res <- subseq(qur1, start = j, end = 19+j)
        bps <- subseq(qur1, start = 20+j, end = 20+j)
        ps <- subseq(qur1, start = 21+j, end = 22+j)
        if (grepl("GG", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
        if (grepl("gg", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
        if (grepl("Gg", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
        if (grepl("gG", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
      }
      tab <- na.omit(tab)
      tab$id <- tab$start-1

      rctab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
      colnames(rctab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
      rctab$strand <- "reverse"
      for (j in 1:(width(qur2)[1]-23)) {
        res <- subseq(qur2, start = j, end = 19+j)
        bps <- subseq(qur2, start = 20+j, end = 20+j)
        ps <- subseq(qur2, start = 21+j, end = 22+j)
        if (grepl("GG", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
        if (grepl("gg", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
        if (grepl("Gg", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
        if (grepl("gG", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
      }
      rctab <- na.omit(rctab)
      rctab$id <- length(qur2[[1]])-23+1-rctab$start

      tab$target.nb <- "NA"
      for (j in 1:nrow(tab)) {
        int <- 0
        for (i in 1:length(oyt1)){
          if (grepl(tab$seq[j], oyt1[[i]]) == TRUE){int <- int+1}
        }
        tab$target.nb[j] <- int
      }#

      rctab$target.nb <- "NA"
      for (j in 1:nrow(rctab)) {
        int <- 0
        for (i in 1:length(oyt2)){
          if (grepl(rctab$seq[j], oyt2[[i]]) == TRUE){int <- int+1}
        }
        rctab$target.nb[j] <- int
      }#


      for (j in 1:nrow(tab)) {
        int <- 0
        res <- tab$seq[j]
        for (i in 1:length(myseqs)){
          if (grepl(as.character(res), myseqs[[i]]) == TRUE){
            int <- int+1}
        }
        pec <- int/length(myseqs)
        tab$hits[j] <- int
        tab$percentage[j] <- pec
      }
      tab$target.nb <- as.numeric(tab$target.nb)
      tab$hits <- as.numeric(tab$hits)
      tab1 <- subset(tab, tab$target.nb > length(oyt1)-1)
      tab2 <- subset(tab1, tab1$hits=="0")
      tab2 <- tab2[order(tab2$percentage), ]

      for (j in 1:nrow(rctab)) {
        int <- 0
        res <- rctab$seq[j]
        for (i in 1:length(myseqs)){
          if (grepl(as.character(res), myseqs[[i]]) == TRUE){
            int <- int+1}
        }
        pec <- int/length(myseqs)
        rctab$hits[j] <- int
        rctab$percentage[j] <- pec
      }
      rctab$target.nb <- as.numeric(rctab$target.nb)
      rctab$hits <- as.numeric(rctab$hits)
      rctab1 <- subset(rctab, rctab$target.nb > length(oyt1)-1)
      rctab2 <- subset(rctab1, rctab1$hits=="0")
      rctab2 <- rctab2[order(rctab2$percentage), ]

      meg <- rbind(tab, rctab)
      meg1 <- rbind(tab1, rctab1)
      meg2 <- rbind(tab2, rctab2)

      meg$direction <- ifelse(meg$strand=="forward", "Plus", "Minus")
      meg$idt <- paste(meg$direction, meg$id, sep="_")
      meg$class <- target
      meg$total.host <- length(myseqs0)
      meg$target.range <- meg$target.nb / meg$total.host
      meg$ref.seq <- as.character(names(qur1))

      ###############
      meg$sgRNA.ID <- meg$idt
      colnames(meg)[colnames(meg)=="seq"] <- "gRNA-target-site-oligo"
      colnames(meg)[colnames(meg)=="idt"] <- "oligo.ID"
      colnames(meg)[colnames(meg)=="target.nb"] <- "hits.to.host"
      colnames(meg)[colnames(meg)=="hits"] <- "hits.to.microeukaryotes"
      colnames(meg)[colnames(meg)=="total.host"] <- "total.number.host"
      colnames(meg)[colnames(meg)=="target.range"] <- "host.target.range"
      colnames(meg)[colnames(meg)=="percentage"] <- "microeukaryotes.target.range"
      colnames(meg)[colnames(meg)=="class"] <- "host.group"

      write.csv(meg, "output_gRNA-target-site-oligo.csv", row.names = F, col.names = TRUE, quote=F)
      write.table(meg, "output_gRNA-target-site-oligo.txt", row.names = F, col.names = TRUE, quote=F, sep="\t")

    }
    else{
      ##################search the sgRNA #####################
      myseqs0.D4 <- sanger
      qur1 <- sanger[1]
      oyt1 <- myseqs0.D4[!names(myseqs0.D4)==names(qur1)]
      myseqs0.rc.D4 <- sanger.rc
      qur2 <- sanger.rc[1]
      oyt2 <- myseqs0.rc.D4[!names(myseqs0.rc.D4)==names(qur2)]

      #screen the probes which has PAM (NGG) at his downstream sequences
      tab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
      colnames(tab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
      tab$strand <- "forward"
      for (j in 1:(width(qur1)[1]-23)) {
        res <- subseq(qur1, start = j, end = 19+j)
        bps <- subseq(qur1, start = 20+j, end = 20+j)
        ps <- subseq(qur1, start = 21+j, end = 22+j)
        if (grepl("GG", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
        if (grepl("gg", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
        if (grepl("Gg", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
        if (grepl("gG", ps[[1]]) == TRUE)
        {tab$pam[j] <- "yes"
        tab$seq[j] <- as.character(res)
        tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        tab$start[j] <- j
        tab$end[j] <- j+19
        }
      }
      tab <- na.omit(tab)
      tab$id <- tab$start-1

      rctab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
      colnames(rctab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
      rctab$strand <- "reverse"
      for (j in 1:(width(qur2)[1]-23)) {
        res <- subseq(qur2, start = j, end = 19+j)
        bps <- subseq(qur2, start = 20+j, end = 20+j)
        ps <- subseq(qur2, start = 21+j, end = 22+j)
        if (grepl("GG", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
        if (grepl("gg", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
        if (grepl("Gg", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
        if (grepl("gG", ps[[1]]) == TRUE)
        {rctab$pam[j] <- "yes"
        rctab$seq[j] <- as.character(res)
        rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
        rctab$start[j] <- j
        rctab$end[j] <- j+19
        }
      }
      rctab <- na.omit(rctab)
      rctab$id <- length(qur2[[1]])-23+1-rctab$start

      #tab$target.nb <- "NA"
      #for (j in 1:nrow(tab)) {
      #  int <- 0
      #  for (i in 1:length(oyt1)){
      #    if (grepl(tab$seq[j], oyt1[[i]]) == TRUE){int <- int+1}
      #  }
      #  tab$target.nb[j] <- int
      #}

      #rctab$target.nb <- "NA"
      #for (j in 1:nrow(rctab)) {
      #  int <- 0
      #  for (i in 1:length(oyt2)){
      #   if (grepl(rctab$seq[j], oyt2[[i]]) == TRUE){int <- int+1}
      #  }
      #  rctab$target.nb[j] <- int
      #}


      for (j in 1:nrow(tab)) {
        int <- 0
        res <- tab$seq[j]
        for (i in 1:length(myseqs)){
          if (grepl(as.character(res), myseqs[[i]]) == TRUE){
            int <- int+1}
        }
        pec <- int/length(myseqs)
        tab$hits[j] <- int
        tab$percentage[j] <- pec
      }

      #tab$target.nb <- as.numeric(tab$target.nb)
      #tab$hits <- as.numeric(tab$hits)
      #tab1 <- subset(tab, tab$target.nb > length(oyt1)-1)
      #tab2 <- subset(tab1, tab1$hits=="0")
      #tab2 <- tab2[order(tab2$percentage), ]

      for (j in 1:nrow(rctab)) {
        int <- 0
        res <- rctab$seq[j]
        for (i in 1:length(myseqs)){
          if (grepl(as.character(res), myseqs[[i]]) == TRUE){
            int <- int+1}
        }
        pec <- int/length(myseqs)
        rctab$hits[j] <- int
        rctab$percentage[j] <- pec
      }
      #rctab$target.nb <- as.numeric(rctab$target.nb)
      #rctab$hits <- as.numeric(rctab$hits)
      #rctab1 <- subset(rctab, rctab$target.nb > length(oyt1)-1)
      #rctab2 <- subset(rctab1, rctab1$hits=="0")
      #rctab2 <- rctab2[order(rctab2$percentage), ]

      meg <- rbind(tab, rctab)
      #meg1 <- rbind(tab1, rctab1)
      #meg2 <- rbind(tab2, rctab2)

      meg$direction <- ifelse(meg$strand=="forward", "Plus", "Minus")
      meg$idt <- paste(meg$direction, meg$id, sep="_")
      meg$class <- target
      #meg$total.host <- length(myseqs0)
      #meg$target.range <- meg$target.nb / meg$total.host
      meg$ref.seq <- as.character(names(qur1))

      ###############
      meg$sgRNA.ID <- meg$idt
      colnames(meg)[colnames(meg)=="seq"] <- "gRNA-target-site-oligo"
      colnames(meg)[colnames(meg)=="idt"] <- "oligo.ID"
      #colnames(meg)[colnames(meg)=="target.nb"] <- "hits.to.host"
      colnames(meg)[colnames(meg)=="hits"] <- "hits.to.microeukaryotes"
      #colnames(meg)[colnames(meg)=="total.host"] <- "total.number.host"
      #colnames(meg)[colnames(meg)=="target.range"] <- "host.target.range"
      colnames(meg)[colnames(meg)=="percentage"] <- "microeukaryotes.target.range"
      colnames(meg)[colnames(meg)=="class"] <- "host.group"

      write.csv(meg, "output_gRNA-target-site-oligo.csv", row.names = F, col.names = TRUE, quote=F)
      write.table(meg, "output_gRNA-target-site-oligo.txt", row.names = F, col.names = TRUE, quote=F, sep="\t")
    }


  }



  else {
    print("target is not empty")
    target <- as.character(target)#targeted species
    print(paste0("target is: ", target))
    print("The R is running, please be patient!")

    #myseqs0 <- readDNAStringSet(refseq, "fasta")
    data(silva_v119_18S_99rep_V4)
    #myseqs0 <- readDNAStringSet(system.file("extdata", "silva_v119_18S_99rep_V4.fasta", package = "CasOligo"), "fasta")
    myseqs0 <- silva_v119_18S_99rep_V4

    myseqs0
    length(myseqs0)

    myseqs1 <- myseqs0[!grepl("Metazoa", names(myseqs0))]
    myseqs1 <- myseqs1[!grepl("Embryophyta", names(myseqs1))]

    myseqs1.rc <- reverseComplement(myseqs1)

    myseqs <- c(myseqs1, myseqs1.rc)
    myseqs

    ############ ###########################
    target <- gsub("_", " ", target)

    myseqs0 <- myseqs0[grepl(target, names(myseqs0))]
    myseqs0
    length(myseqs0)
    names(myseqs0)

    sanger <- readDNAStringSet(inseq, "fasta")
    sanger

    sanger.rc <- reverseComplement(sanger)

    myseqs0 <- c(DNAStringSet(sanger), DNAStringSet(myseqs0))

    myseqs0.rc <- reverseComplement(myseqs0)

    myseqs10 <- c(myseqs0, myseqs0.rc)
    myseqs10


    #######################################
    myseqs0.D4 <- myseqs0
    qur1 <- sanger[1]
    oyt1 <- myseqs0.D4[!names(myseqs0.D4)==names(qur1)]
    myseqs0.rc.D4 <- myseqs0.rc
    qur2 <- sanger.rc[1]
    oyt2 <- myseqs0.rc.D4[!names(myseqs0.rc.D4)==names(qur2)]


    tab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
    colnames(tab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
    tab$strand <- "forward"
    for (j in 1:(width(qur1)[1]-23)) {
      res <- subseq(qur1, start = j, end = 19+j)
      bps <- subseq(qur1, start = 20+j, end = 20+j)
      ps <- subseq(qur1, start = 21+j, end = 22+j)
      if (grepl("GG", ps[[1]]) == TRUE)
      {tab$pam[j] <- "yes"
      tab$seq[j] <- as.character(res)
      tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      tab$start[j] <- j
      tab$end[j] <- j+19
      }
      if (grepl("gg", ps[[1]]) == TRUE)
      {tab$pam[j] <- "yes"
      tab$seq[j] <- as.character(res)
      tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      tab$start[j] <- j
      tab$end[j] <- j+19
      }
      if (grepl("Gg", ps[[1]]) == TRUE)
      {tab$pam[j] <- "yes"
      tab$seq[j] <- as.character(res)
      tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      tab$start[j] <- j
      tab$end[j] <- j+19
      }
      if (grepl("gG", ps[[1]]) == TRUE)
      {tab$pam[j] <- "yes"
      tab$seq[j] <- as.character(res)
      tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      tab$start[j] <- j
      tab$end[j] <- j+19
      }
    }
    tab <- na.omit(tab)
    tab$id <- tab$start-1

    rctab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
    colnames(rctab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
    rctab$strand <- "reverse"
    for (j in 1:(width(qur2)[1]-23)) {
      res <- subseq(qur2, start = j, end = 19+j)
      bps <- subseq(qur2, start = 20+j, end = 20+j)
      ps <- subseq(qur2, start = 21+j, end = 22+j)
      if (grepl("GG", ps[[1]]) == TRUE)
      {rctab$pam[j] <- "yes"
      rctab$seq[j] <- as.character(res)
      rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      rctab$start[j] <- j
      rctab$end[j] <- j+19
      }
      if (grepl("gg", ps[[1]]) == TRUE)
      {rctab$pam[j] <- "yes"
      rctab$seq[j] <- as.character(res)
      rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      rctab$start[j] <- j
      rctab$end[j] <- j+19
      }
      if (grepl("Gg", ps[[1]]) == TRUE)
      {rctab$pam[j] <- "yes"
      rctab$seq[j] <- as.character(res)
      rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      rctab$start[j] <- j
      rctab$end[j] <- j+19
      }
      if (grepl("gG", ps[[1]]) == TRUE)
      {rctab$pam[j] <- "yes"
      rctab$seq[j] <- as.character(res)
      rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
      rctab$start[j] <- j
      rctab$end[j] <- j+19
      }
    }
    rctab <- na.omit(rctab)
    rctab$id <- length(qur2[[1]])-23+1-rctab$start

    tab$target.nb <- "NA"
    for (j in 1:nrow(tab)) {
      int <- 0
      for (i in 1:length(oyt1)){
        if (grepl(tab$seq[j], oyt1[[i]]) == TRUE){int <- int+1}
      }
      tab$target.nb[j] <- int
    }

    rctab$target.nb <- "NA"
    for (j in 1:nrow(rctab)) {
      int <- 0
      for (i in 1:length(oyt2)){
        if (grepl(rctab$seq[j], oyt2[[i]]) == TRUE){int <- int+1}
      }
      rctab$target.nb[j] <- int
    }


    for (j in 1:nrow(tab)) {
      int <- 0
      res <- tab$seq[j]
      for (i in 1:length(myseqs)){
        if (grepl(as.character(res), myseqs[[i]]) == TRUE){
          int <- int+1}
      }
      pec <- int/length(myseqs)
      tab$hits[j] <- int
      tab$percentage[j] <- pec
    }
    tab$target.nb <- as.numeric(tab$target.nb)
    tab$hits <- as.numeric(tab$hits)
    tab1 <- subset(tab, tab$target.nb > length(oyt1)-1)
    tab2 <- subset(tab1, tab1$hits=="0")
    tab2 <- tab2[order(tab2$percentage), ]

    for (j in 1:nrow(rctab)) {
      int <- 0
      res <- rctab$seq[j]
      for (i in 1:length(myseqs)){
        if (grepl(as.character(res), myseqs[[i]]) == TRUE){
          int <- int+1}
      }
      pec <- int/length(myseqs)
      rctab$hits[j] <- int
      rctab$percentage[j] <- pec
    }
    rctab$target.nb <- as.numeric(rctab$target.nb)
    rctab$hits <- as.numeric(rctab$hits)
    rctab1 <- subset(rctab, rctab$target.nb > length(oyt1)-1)
    rctab2 <- subset(rctab1, rctab1$hits=="0")
    rctab2 <- rctab2[order(rctab2$percentage), ]

    meg <- rbind(tab, rctab)
    meg1 <- rbind(tab1, rctab1)
    meg2 <- rbind(tab2, rctab2)

    meg$direction <- ifelse(meg$strand=="forward", "Plus", "Minus")
    meg$idt <- paste(meg$direction, meg$id, sep="_")
    meg$class <- target
    meg$total.host <- length(myseqs0)
    meg$target.range <- meg$target.nb / meg$total.host
    meg$ref.seq <- as.character(names(qur1))

    ###############
    meg$sgRNA.ID <- meg$idt
    colnames(meg)[colnames(meg)=="seq"] <- "gRNA-target-site-oligo"
    colnames(meg)[colnames(meg)=="idt"] <- "oligo.ID"
    colnames(meg)[colnames(meg)=="target.nb"] <- "hits.to.host"
    colnames(meg)[colnames(meg)=="hits"] <- "hits.to.microeukaryotes"
    colnames(meg)[colnames(meg)=="total.host"] <- "total.number.host"
    colnames(meg)[colnames(meg)=="target.range"] <- "host.target.range"
    colnames(meg)[colnames(meg)=="percentage"] <- "microeukaryotes.target.range"
    colnames(meg)[colnames(meg)=="class"] <- "host.group"

    write.csv(meg, "output_gRNA-target-site-oligo.csv", row.names = F, col.names = TRUE, quote=F)
    write.table(meg, "output_gRNA-target-site-oligo.txt", row.names = F, col.names = TRUE, quote=F, sep="\t")

  }
}
kevinzhongxu/CasOligo documentation built on Feb. 25, 2025, 8:43 a.m.