RGO2TR update for NCBI structurally ANNOTATED genomes using UniProt/Swiss-Prot for functional annotation
Example with cheetah immune on a Linux System
mkdir -p ~/cheetah
cd ~/cheetah
# see https://github.com/mamba-org/mamba about mamba installation
# create mamba/conda environment
mamba create -n cheetah -c conda-forge -c bioconda \
r-tidyr bioconductor-GO.db \
r-XML bioconductor-GenomicRanges \
bioconductor-GenomeInfoDb=1 \
bioconductor-GOstats bioconductor-graph bioconductor-Category \
r-base=3.6.3 \
r-Matrix \
bioconductor-AnnotationDbi \
bioconductor-IRanges \
bioconductor-Biobase \
bioconductor-BiocGenerics \
r-rvest \
r-seqinr \
r-RCurl \
r-httr \
conda-ecosystem-user-package-isolation \
r-devtools \
r-data.table \
blast \
parallel
# activate conda environment
conda activate cheetah
# enter R shell
R
```R
library("devtools") devtools::install_github("jelber2/rGO2TR",force=TRUE)
library("data.table") library("httr") library("RCurl") library("seqinr") library("rvest") library("GOstats") library("GenomicRanges") library("XML") library("GO.db") library("tidyr") library("rGO2TR")
system("wget -c https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/32536/101/GCF_003709585.1_Aci_jub_2/GCF_003709585.1_Aci_jub_2_genomic.gff.gz")
system("gunzip GCF_003709585.1_Aci_jub_2_genomic.gff.gz")
gff3 <-read.maker.gff3("./GCF_003709585.1_Aci_jub_2_genomic.gff")
gff3.filtered <- filter.gff3(gff3, 'exon')
mRNA.acc <- get.mRNA.acc(gff3.filtered)
mRNA.translated <- get.mRNA.translated(mRNA.acc, mRNA.translated, "jean.elbers@gmail.com", # put your email address here "mRNA.fasta")
system("wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz")
system("gunzip uniprot_sprot.fasta.gz")
system("makeblastdb -dbtype prot -in uniprot_sprot.fasta")
system("cat mRNA.fasta | parallel --no-notice --jobs 96 --block 10k --recstart '>' --pipe blastp -evalue 1e-20 -outfmt 6 -db uniprot_sprot.fasta -query - >> blastp.result")
system("cat blastp.result | awk '!seen[$1]++' > blastp.result.filtered")
system("cut -f 2 blastp.result.filtered |cut -f 2 -d '|' |sort -u > UniProtKB-entry-ids-unique.txt")
system("cut -f 1 blastp.result.filtered > UniProtKB-protein-ids.txt")
system("cut -f 2 blastp.result.filtered |cut -f 2 -d '|' > UniProtKB-entry-ids.txt")
system("paste UniProtKB-protein-ids.txt UniProtKB-entry-ids.txt > UniProtKB-protein-entry-list.txt")
system('while read i;do wget -q -O - "http://www.uniprot.org/uniprot/?query=${i}&format=tab&columns=id%2Cgo-id" |grep -v "Entry" >> UniProtKB-entry-ids-unique-go-ids.txt ;done < UniProtKB-entry-ids-unique.txt')
uniprot.output <- read.table("UniProtKB-entry-ids-unique-go-ids.txt",sep = "\t",header=F,na.strings = "")
uniprot.output$V1 <- as.character(uniprot.output$V1)
uniprot.output$V2 <- as.character(uniprot.output$V2)
uniprot.output$V2 <- gsub(" ", "", uniprot.output$V2)
uniprot.output <- tidyr::separate_rows(data = uniprot.output,V2,sep = ";")
uniprot.output <- na.omit(uniprot.output)
protein.entry.df <- read.table("UniProtKB-protein-entry-list.txt")
protein.entry.df$V1 <- as.character(protein.entry.df$V1) protein.entry.df$V2 <- as.character(protein.entry.df$V2)
protein.go.id.df <- uniprot.output
for (i in 1:nrow(protein.entry.df)){ protein.go.id.df$V1 <- sub(pattern=protein.entry.df$V2[i], replacement=protein.entry.df$V1[i], protein.go.id.df$V1) cat("Protein", i, "of", nrow(protein.entry.df), "\n") }
GO.term <- "GO:0006955" # sets GO term as immune response GO.term.descendants <- GOBPOFFSPRING$"GO:0006955" # gets descendants for immune response GO.id.list <- c(GO.term, GO.term.descendants) # makes GO id list
retained.protein.list <- filter.mRNA.GO.list(protein.go.id.df, GO.id.list)
target.region <- create.maker.target.region(retained.protein.list, gff3.filtered)
cat("There are at least", length(unique(sub("XM_\d+.\d;(\w+)","\1",target.region$tags,perl=TRUE))), "unique genes in the target region, but possibly more because genes in this target region might overlap genes in the gff3 file.")
cat("There are at least", length(target.region$tags), "exons in the target region, but possibly more because exons in this target region might overlap exons in the gff3 file.")
final.target.region <- reduce(target.region)
cat("The target region after removing overlaps is", sum(width(final.target.region)), "bp.")
save.target.region.as.bed.file(final.target.region, "cheetah.immunome.bed")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.