## cas9.gRNA.oligo2.R
## An automated R script for searching of CRISPR-Cas9 compatible gRNA-target-site oligos based on a input fasta sequence. It would generate a table that lists all possible gRNA-target-site oligos, includes their sequences, target range to micro-eukaryotes and host taxonomic group, and etc.
## Kevin Xu ZHONG, kevinzhong2006@gmail.com; xzhong@eoas.ubc.ca
## depends on R libraries biostrings, reshape2 and ape being installed
## depends on a custom reference that comprises sequences of the primers-spanned region of a gene. For example, for the V4 region of 18S rRNA genes generated by primer Euk454TAR_fwd/Euk454TAR_fwd, we prepared the reference sequences from the SILVA 18S SSU V119 99rep database.
## The input fasta needs to be the same primers-spanned region as the reference sequences.
#' A cas9.gRNA.oligo2 Function
#'
#' This function allows you to design gRNA-target-site oligos for the guding RNA to direct Cas9 nuclease to specifically cut a gene (e.g. the rRNA genes of 16S, 18S, 23S, etc. Or any other genes) of hosts but not of protists and fungi, as long as a reference sequences database of that gene is provided.
#'
#' To conduct this function, you need to generate your own reference sequence database for the genes that you want to target.
#'
#' Please keep in mind that your input sequence and reference sequences must be within same gene region.
#'
#' If you aim to cut host 18S rRNA genes of V4 region that are generated using 18S primers "TAReuk454FWD1 and TAReukREV3" (Stoeck et al., 2010), you could use easily cas9.gRNA.oligo1() function as it based on reference database of this region.
#'
#'
#'
#' @param inseq 'Path/To/Your/Input_sequence_fasta_file'; For example: inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4.fasta".
#'
#' @param refseq 'Path/To/Your/Reference_sequences_database_fasta_file', For example: database="/home/kevin/Desktop/data/silva_v119_18S_99rep_V4.fasta".
#'
#' @param target The host taxonomic group (from species to kingdom) that you think the obtained sgRNA plan to target; If there is space between two word, the sapce should be replace using "_". For example, target="Homo sapiens" need to change to be target="Homo_sapiens".
#' The target is aiming to obtain target range of your probe (how good it is in helping cleaving the sequence of reference database either within same group or higher taxonomy level).
#'
#' @keywords cas9.gRNA.oligo2
#' @export
#'
#' @examples
#' ##### If you want to predict the gRNA's target range among a host taxonomic group
#' cas9.gRNA.oligo2(inseq="Path/To/Your/Input_sequence_fasta_file.fasta", refseq="PATH/To/Your/Reference_database_file.fasta", target="Taxonomic_group_of_a_host")
#'
#' ##### If you do NOT want to predict the gRNA's target range among a host taxonomic group
#' cas9.gRNA.oligo2(inseq="Path/To/Your/Input_sequence_fasta_file.fasta", refseq="PATH/To/Your/Reference_database_file.fasta")
#'
#'
#'###############################################
#' ##### To design sgRNA for your human 18S (other regions of 18S rRNA gene) and predict the sgRNA's host-target range among other "Homo_sapiens" sequences in SILVA.
#' cas9.gRNA.oligo2(inseq="/home/kevin/Desktop/data/human.fasta", refseq="PATH/To/Your/Reference_database_file.fasta", target="Homo_sapiens")
#'
#' ##### To design sgRNA for your human 18S (other regions of 18S rRNA gene), but if you do not want to predict the sgRNA's host-target range among other taxonomic groups.
#' cas9.gRNA.oligo2(inseq="/home/kevin/Desktop/data/human.fasta", refseq="PATH/To/Your/Reference_database_file.fasta")
#'
#' ##### To design sgRNA for your oyster 18S (other regions of 18S rRNA gene) and predict the sgRNA's host-target range among other "Crassostrea_gigas" sequences in SILVA.
#' cas9.gRNA.oligo2(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4.fasta", refseq="PATH/To/Your/Reference_database_file.fasta", target="Crassostrea_gigas")
#'
#' ##### To design sgRNA for your oyster 18S (other regions of 18S rRNA gene) and predict the sgRNA's host-target range among other "Ostreidae" sequences in SILVA.
#' cas9.gRNA.oligo2(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4.fasta", refseq="PATH/To/Your/Reference_database_file.fasta", target="Ostreidae")
#'
#' ##### To design sgRNA for your oyster 18S (other regions of 18S rRNA gene) and predict the sgRNA's host-target range among other "Mollusca" sequences in SILVA.
#' cas9.gRNA.oligo2(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4.fasta", refseq="PATH/To/Your/Reference_database_file.fasta", target="Mollusca")
#'
#' ##### To design sgRNA for your oyster 18S (other regions of 18S rRNA gene), but if you do not want to predict the sgRNA's host-target range among other taxonomic groups.
#' cas9.gRNA.oligo2(inseq="/home/kevin/Desktop/data/pacific_oyster_18S_V4.fasta", refseq="PATH/To/Your/Reference_database_file.fasta")
#'
cas9.gRNA.oligo2 <- function(inseq, refseq, target=NULL) {
library(ape)
library(Biostrings)
library(reshape2)
inseq <- as.character(inseq)#PATH to the input 18S sequences of V4 region of target host, the output file named after the sequence file
refseq <- as.character(refseq)#PATH to the reference 18S sequences of V4 region, or other region of 18S, or other genes
if(is.null(target)) {
print("target is empty")
print("The R is running, please be patient!")
myseqs0 <- readDNAStringSet(refseq, "fasta")
myseqs0
length(myseqs0)
myseqs1 <- myseqs0[!grepl("Metazoa", names(myseqs0))]
myseqs1 <- myseqs1[!grepl("Embryophyta", names(myseqs1))]
myseqs1.rc <- reverseComplement(myseqs1)
myseqs <- c(myseqs1, myseqs1.rc)
myseqs
#############input seq ###########################
sanger <- readDNAStringSet(inseq, "fasta")
sanger
sanger.rc <- reverseComplement(sanger)
##################search the sgRNA #####################
myseqs0.D4 <- sanger
qur1 <- sanger[1]
oyt1 <- myseqs0.D4[!names(myseqs0.D4)==names(qur1)]
myseqs0.rc.D4 <- sanger.rc
qur2 <- sanger.rc[1]
oyt2 <- myseqs0.rc.D4[!names(myseqs0.rc.D4)==names(qur2)]
tab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
colnames(tab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
tab$strand <- "forward"
for (j in 1:(width(qur1)[1]-23)) {
res <- subseq(qur1, start = j, end = 19+j)
bps <- subseq(qur1, start = 20+j, end = 20+j)
ps <- subseq(qur1, start = 21+j, end = 22+j)
if (grepl("GG", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
if (grepl("gg", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
if (grepl("Gg", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
if (grepl("gG", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
}
tab <- na.omit(tab)
tab$id <- tab$start-1
rctab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
colnames(rctab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
rctab$strand <- "reverse"
for (j in 1:(width(qur2)[1]-23)) {
res <- subseq(qur2, start = j, end = 19+j)
bps <- subseq(qur2, start = 20+j, end = 20+j)
ps <- subseq(qur2, start = 21+j, end = 22+j)
if (grepl("GG", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
if (grepl("gg", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
if (grepl("Gg", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
if (grepl("gG", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
}
rctab <- na.omit(rctab)
rctab$id <- length(qur2[[1]])-23+1-rctab$start
#tab$target.nb <- "NA"
#for (j in 1:nrow(tab)) {
# int <- 0
# for (i in 1:length(oyt1)){
# if (grepl(tab$seq[j], oyt1[[i]]) == TRUE){int <- int+1}
# }
# tab$target.nb[j] <- int
#}
#rctab$target.nb <- "NA"
#for (j in 1:nrow(rctab)) {
# int <- 0
# for (i in 1:length(oyt2)){
# if (grepl(rctab$seq[j], oyt2[[i]]) == TRUE){int <- int+1}
# }
# rctab$target.nb[j] <- int
#}
for (j in 1:nrow(tab)) {
int <- 0
res <- tab$seq[j]
for (i in 1:length(myseqs)){
if (grepl(as.character(res), myseqs[[i]]) == TRUE){
int <- int+1}
}
pec <- int/length(myseqs)
tab$hits[j] <- int
tab$percentage[j] <- pec
}
#tab$target.nb <- as.numeric(tab$target.nb)
#tab$hits <- as.numeric(tab$hits)
#tab1 <- subset(tab, tab$target.nb > length(oyt1)-1)
#tab2 <- subset(tab1, tab1$hits=="0")
#tab2 <- tab2[order(tab2$percentage), ]
for (j in 1:nrow(rctab)) {
int <- 0
res <- rctab$seq[j]
for (i in 1:length(myseqs)){
if (grepl(as.character(res), myseqs[[i]]) == TRUE){
int <- int+1}
}
pec <- int/length(myseqs)
rctab$hits[j] <- int
rctab$percentage[j] <- pec
}
#rctab$target.nb <- as.numeric(rctab$target.nb)
#rctab$hits <- as.numeric(rctab$hits)
#rctab1 <- subset(rctab, rctab$target.nb > length(oyt1)-1)
#rctab2 <- subset(rctab1, rctab1$hits=="0")
#rctab2 <- rctab2[order(rctab2$percentage), ]
meg <- rbind(tab, rctab)
#meg1 <- rbind(tab1, rctab1)
#meg2 <- rbind(tab2, rctab2)
meg$direction <- ifelse(meg$strand=="forward", "Plus", "Minus")
meg$idt <- paste(meg$direction, meg$id, sep="_")
meg$class <- target
#meg$total.host <- length(myseqs0)
#meg$target.range <- meg$target.nb / meg$total.host
meg$ref.seq <- as.character(names(qur1))
###############
meg$sgRNA.ID <- meg$idt
colnames(meg)[colnames(meg)=="seq"] <- "gRNA-target-site-oligo"
colnames(meg)[colnames(meg)=="idt"] <- "oligo.ID"
#colnames(meg)[colnames(meg)=="target.nb"] <- "hits.to.host"
colnames(meg)[colnames(meg)=="hits"] <- "hits.to.microeukaryotes"
#colnames(meg)[colnames(meg)=="total.host"] <- "total.number.host"
#colnames(meg)[colnames(meg)=="target.range"] <- "host.target.range"
colnames(meg)[colnames(meg)=="percentage"] <- "microeukaryotes.target.range"
colnames(meg)[colnames(meg)=="class"] <- "host.group"
write.csv(meg, "output_gRNA-target-site-oligo.csv", row.names = F, col.names = TRUE, quote=F)
write.table(meg, "output_gRNA-target-site-oligo.txt", row.names = F, col.names = TRUE, quote=F, sep="\t")
}
else {
print("target is not empty")
target <- as.character(target)#targeted species
print(paste0("target is: ", target))
print("The R is running, please be patient!")
myseqs0 <- readDNAStringSet(refseq, "fasta")
myseqs0
length(myseqs0)
myseqs1 <- myseqs0[!grepl("Metazoa", names(myseqs0))]
myseqs1 <- myseqs1[!grepl("Embryophyta", names(myseqs1))]
myseqs1.rc <- reverseComplement(myseqs1)
myseqs <- c(myseqs1, myseqs1.rc)
myseqs
#######################################
target <- gsub("_", " ", target)
myseqs0 <- myseqs0[grepl(target, names(myseqs0))]
myseqs0
length(myseqs0)
names(myseqs0)
sanger <- readDNAStringSet(inseq, "fasta")
sanger
sanger.rc <- reverseComplement(sanger)
myseqs0 <- c(DNAStringSet(sanger), DNAStringSet(myseqs0))
myseqs0.rc <- reverseComplement(myseqs0)
myseqs10 <- c(myseqs0, myseqs0.rc)
myseqs10
##################search the sgRNA #####################
myseqs0.D4 <- myseqs0
qur1 <- sanger[1]
oyt1 <- myseqs0.D4[!names(myseqs0.D4)==names(qur1)]
myseqs0.rc.D4 <- myseqs0.rc
qur2 <- sanger.rc[1]
oyt2 <- myseqs0.rc.D4[!names(myseqs0.rc.D4)==names(qur2)]
tab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
colnames(tab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
tab$strand <- "forward"
for (j in 1:(width(qur1)[1]-23)) {
res <- subseq(qur1, start = j, end = 19+j)
bps <- subseq(qur1, start = 20+j, end = 20+j)
ps <- subseq(qur1, start = 21+j, end = 22+j)
if (grepl("GG", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
if (grepl("gg", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
if (grepl("Gg", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
if (grepl("gG", ps[[1]]) == TRUE)
{tab$pam[j] <- "yes"
tab$seq[j] <- as.character(res)
tab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
tab$start[j] <- j
tab$end[j] <- j+19
}
}
tab <- na.omit(tab)
tab$id <- tab$start-1
rctab <- data.frame(matrix(NA, nrow = width(qur1)[1]-20, ncol = 6))
colnames(rctab) <- c("strand","start", "end", "seq", "pam.seq", "pam")
rctab$strand <- "reverse"
for (j in 1:(width(qur2)[1]-23)) {
res <- subseq(qur2, start = j, end = 19+j)
bps <- subseq(qur2, start = 20+j, end = 20+j)
ps <- subseq(qur2, start = 21+j, end = 22+j)
if (grepl("GG", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
if (grepl("gg", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
if (grepl("Gg", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
if (grepl("gG", ps[[1]]) == TRUE)
{rctab$pam[j] <- "yes"
rctab$seq[j] <- as.character(res)
rctab$pam.seq[j] <- paste(as.character(bps), as.character(ps), sep="")
rctab$start[j] <- j
rctab$end[j] <- j+19
}
}
rctab <- na.omit(rctab)
rctab$id <- length(qur2[[1]])-23+1-rctab$start
tab$target.nb <- "NA"
for (j in 1:nrow(tab)) {
int <- 0
for (i in 1:length(oyt1)){
if (grepl(tab$seq[j], oyt1[[i]]) == TRUE){int <- int+1}
}
tab$target.nb[j] <- int
}
rctab$target.nb <- "NA"
for (j in 1:nrow(rctab)) {
int <- 0
for (i in 1:length(oyt2)){
if (grepl(rctab$seq[j], oyt2[[i]]) == TRUE){int <- int+1}
}
rctab$target.nb[j] <- int
}
for (j in 1:nrow(tab)) {
int <- 0
res <- tab$seq[j]
for (i in 1:length(myseqs)){
if (grepl(as.character(res), myseqs[[i]]) == TRUE){
int <- int+1}
}
pec <- int/length(myseqs)
tab$hits[j] <- int
tab$percentage[j] <- pec
}
tab$target.nb <- as.numeric(tab$target.nb)
tab$hits <- as.numeric(tab$hits)
tab1 <- subset(tab, tab$target.nb > length(oyt1)-1)
tab2 <- subset(tab1, tab1$hits=="0")
tab2 <- tab2[order(tab2$percentage), ]
for (j in 1:nrow(rctab)) {
int <- 0
res <- rctab$seq[j]
for (i in 1:length(myseqs)){
if (grepl(as.character(res), myseqs[[i]]) == TRUE){
int <- int+1}
}
pec <- int/length(myseqs)
rctab$hits[j] <- int
rctab$percentage[j] <- pec
}
rctab$target.nb <- as.numeric(rctab$target.nb)
rctab$hits <- as.numeric(rctab$hits)
rctab1 <- subset(rctab, rctab$target.nb > length(oyt1)-1)
rctab2 <- subset(rctab1, rctab1$hits=="0")
rctab2 <- rctab2[order(rctab2$percentage), ]
meg <- rbind(tab, rctab)
meg1 <- rbind(tab1, rctab1)
meg2 <- rbind(tab2, rctab2)
meg$direction <- ifelse(meg$strand=="forward", "Plus", "Minus")
meg$idt <- paste(meg$direction, meg$id, sep="_")
meg$class <- target
meg$total.host <- length(myseqs0)
meg$target.range <- meg$target.nb / meg$total.host
meg$ref.seq <- as.character(names(qur1))
###############
meg$sgRNA.ID <- meg$idt
colnames(meg)[colnames(meg)=="seq"] <- "gRNA-target-site-oligo"
colnames(meg)[colnames(meg)=="idt"] <- "oligo.ID"
colnames(meg)[colnames(meg)=="target.nb"] <- "hits.to.host"
colnames(meg)[colnames(meg)=="hits"] <- "hits.to.microeukaryotes"
colnames(meg)[colnames(meg)=="total.host"] <- "total.number.host"
colnames(meg)[colnames(meg)=="target.range"] <- "host.target.range"
colnames(meg)[colnames(meg)=="percentage"] <- "microeukaryotes.target.range"
colnames(meg)[colnames(meg)=="class"] <- "host.group"
write.csv(meg, "output_gRNA-target-site-oligo.csv", row.names = F, col.names = TRUE, quote=F)
write.table(meg, "output_gRNA-target-site-oligo.txt", row.names = F, col.names = TRUE, quote=F, sep="\t")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.