RGO2TR update for NCBI structurally ANNOTATED genomes using UniProt/Swiss-Prot for functional annotation
Example with cheetah immune on a Linux System
mkdir -p ~/cheetah
cd ~/cheetah
# see https://github.com/mamba-org/mamba about mamba installation
# create mamba/conda environment
mamba create -n cheetah -c conda-forge -c bioconda \
r-tidyr bioconductor-GO.db \
r-XML bioconductor-GenomicRanges \
bioconductor-GenomeInfoDb=1 \
bioconductor-GOstats bioconductor-graph bioconductor-Category \
r-base=3.6.3 \
r-Matrix \
bioconductor-AnnotationDbi \
bioconductor-IRanges \
bioconductor-Biobase \
bioconductor-BiocGenerics \
r-rvest \
r-seqinr \
r-RCurl \
r-httr \
conda-ecosystem-user-package-isolation \
r-devtools \
r-data.table \
blast \
parallel \
julia=1.10.1
# activate conda environment
conda activate cheetah
see mp4 file in email message
awk '$3 == "CDS"' genomic.gff |less -S|cut -f 9|less -S|cut -f 1-2 -d ";"|tr ';' '\t'|\\
perl -pe "s/ID=cds-//g"|perl -pe "s/Parent=rna-//g"|sort -u |fgrep -v "LOC"| \
grep -P "\w+\.\w+\t\w+\.\w+" > prot-2-mRNA
julia
_
_ _ _(_)_ | Documentation: https://docs.julialang.org
(_) | (_) (_) |
_ _ _| |_ __ _ | Type "?" for help, "]?" for Pkg help.
| | | | | | |/ _` | |
| | |_| | | | (_| | | Version 1.10.1 (2024-02-13)
_/ |\__'_|_|_|\__'_| | Official https://julialang.org/ release
|__/ |
julia>
using Pkg
Pkg.add("CSV")
Pkg.add("DataFrames")
Pkg.add("FASTX")
using CSV, DataFrames
# Replace "path/to/your_file.tsv" with the actual path to your TSV file
data = CSV.read("prot-2-mRNA", DataFrame, header=false, delim='\t')
# Assuming the columns are named Column1 and Column2 in the DataFrame
id_map = Dict(row.Column1 => row.Column2 for row in eachrow(data))
using FASTX
function replace_identifiers(fasta_file, id_map, output_file)
# Open the input FASTA file
reader = FASTA.Reader(open(fasta_file, "r"))
# Open the output FASTA file
writer = FASTA.Writer(open(output_file, "w"))
# Iterate over each record in the FASTA file
for record in reader
# Extract the protein identifier from the sequence description
protein_id = string(FASTA.identifier(record)) # Use FASTX.identifier
# Search for the protein identifier in the array
new_id = string(get(id_map, protein_id, protein_id)) # Fallback to the original ID if not found
if new_id != protein_id
# Replace the sequence identifier with the mRNA identifier
# Write the modified record to the output FASTA file
FASTA.write(writer, FASTA.Record(new_id, string(FASTA.sequence(record)))) # Use FASTX.write and FASTX.sequence
end # Close the if block
end
# Close the FASTA files
close(reader)
close(writer)
end
# Example usage
fasta_file = "protein.faa"
output_file = "mRNA.translated.fasta"
replace_identifiers(fasta_file, id_map, output_file)
type "ctrl+d" to exit the REPL
R ```
```R
library("devtools") devtools::install_github("jelber2/rGO2TR",force=TRUE)
library("data.table") library("httr") library("RCurl") library("seqinr") library("rvest") library("GOstats") library("GenomicRanges") library("XML") library("GO.db") library("tidyr") library("rGO2TR")
gff3 <-read.maker.gff3("genomic.gff")
gff3.filtered <- filter.gff3(gff3, 'exon')
system("wget ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz")
system("gunzip uniprot_sprot.fasta.gz")
system("makeblastdb -dbtype prot -in uniprot_sprot.fasta")
system("cat mRNA.translated.fasta | parallel --no-notice --jobs 96 --block 10k --recstart '>' --pipe blastp -evalue 1e-20 -outfmt 6 -db uniprot_sprot.fasta -query - >> blastp.result")
system("cat blastp.result | awk '!seen[$1]++' > blastp.result.filtered")
system("cut -f 2 blastp.result.filtered |cut -f 2 -d '|' |sort -u > UniProtKB-entry-ids-unique.txt")
system("cut -f 1 blastp.result.filtered > UniProtKB-protein-ids.txt")
system("cut -f 2 blastp.result.filtered |cut -f 2 -d '|' > UniProtKB-entry-ids.txt")
system("paste UniProtKB-protein-ids.txt UniProtKB-entry-ids.txt > UniProtKB-protein-entry-list.txt")
system('rm -f UniProtKB-entry-ids-unique-go-ids.txt') system('while read i;do curl -s -H "Accept: text/plain; format=flatfile" "https://rest.uniprot.org/uniprotkb/${i}"|grep -P "GO:"|cut -f 5 -d " "|tr '\n' ' ' |tr ';' ','|perl -pe "s/, $/\n/g"|perl -pe "s/^/${i}\t/g" >> UniProtKB-entry-ids-unique-go-ids.txt ;done < UniProtKB-entry-ids-unique.txt')
uniprot.output <- read.table("UniProtKB-entry-ids-unique-go-ids.txt",sep = "\t",header=F,na.strings = "")
uniprot.output$V1 <- as.character(uniprot.output$V1)
uniprot.output$V2 <- as.character(uniprot.output$V2)
uniprot.output$V2 <- gsub(" ", "", uniprot.output$V2)
uniprot.output <- tidyr::separate_rows(data = uniprot.output,V2,sep = ",")
uniprot.output <- na.omit(uniprot.output)
protein.entry.df <- read.table("UniProtKB-protein-entry-list.txt")
protein.entry.df$V1 <- as.character(protein.entry.df$V1) protein.entry.df$V2 <- as.character(protein.entry.df$V2)
protein.go.id.df <- uniprot.output
for (i in 1:nrow(protein.entry.df)){ protein.go.id.df$V1 <- sub(pattern=protein.entry.df$V2[i], replacement=protein.entry.df$V1[i], protein.go.id.df$V1) cat("Protein", i, "of", nrow(protein.entry.df), "\n") }
GO.term <- "GO:0006955" # sets GO term as immune response GO.term.descendants <- GOBPOFFSPRING$"GO:0006955" # gets descendants for immune response GO.id.list <- c(GO.term, GO.term.descendants) # makes GO id list
retained.protein.list <- filter.mRNA.GO.list(protein.go.id.df, GO.id.list)
target.region <- create.maker.target.region(retained.protein.list, gff3.filtered)
cat("There are at least", length(unique(sub("XM_\d+.\d;(\w+)","\1",target.region$tags,perl=TRUE))), "unique genes in the target region, but possibly more because genes in this target region might overlap genes in the gff3 file.")
cat("There are at least", length(target.region$tags), "exons in the target region, but possibly more because exons in this target region might overlap exons in the gff3 file.")
final.target.region <- reduce(target.region)
cat("The target region after removing overlaps is", sum(width(final.target.region)), "bp.")
save.target.region.as.bed.file(final.target.region, "cheetah.immunome.bed")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.