gff2longest: gff2longest

View source: R/gff2longest.R

gff2longestR Documentation

gff2longest

Description

This function extracts the gene positions from GFF3 input and optionally extracts the longest isoform.

Usage

gff2longest(gff3file, cds = NULL, removeNonCoding = TRUE, source = "NCBI")

Arguments

gff3file

GFF3 path [mandatory]

cds

DNAStringSet [optional]

removeNonCoding

specify whether non-coding transcripts should be removed [default: TRUE]

source

source indicating either NCBI or ENSEMBL [default: NCBI]

Value

list

Author(s)

Kristian K Ullrich

See Also

XStringSet-class

Examples

## Not run: 
## Example 1: NCBI annotation (GFF3 + CDS) of Arabidopsis thaliana TAIR10.1
## NOTE: both examples download files and therefore need network access
## set NCBI GFF3 URL
NCBI <- "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/"
ARATHA.NCBI.gff3.url <- paste0(NCBI,
    "GCF/000/001/735/GCF_000001735.4_TAIR10.1/",
    "GCF_000001735.4_TAIR10.1_genomic.gff.gz")
ARATHA.NCBI.gff3.file <- tempfile()
## download GFF3 file
download.file(ARATHA.NCBI.gff3.url, ARATHA.NCBI.gff3.file, quiet=FALSE)
## set NCBI CDS URL
ARATHA.NCBI.cds.url <- paste0(NCBI,
    "GCF/000/001/735/GCF_000001735.4_TAIR10.1/",
    "GCF_000001735.4_TAIR10.1_cds_from_genomic.fna.gz")
ARATHA.NCBI.cds.file <- tempfile()
## download CDS file
download.file(ARATHA.NCBI.cds.url, ARATHA.NCBI.cds.file, quiet=FALSE)
## load CDS
ARATHA.NCBI.cds <- Biostrings::readDNAStringSet(ARATHA.NCBI.cds.file)
## get genepos and longest isoform
## (the path argument is named 'gff3file', see Usage)
ARATHA.NCBI.gff3.longest <- gff2longest(gff3file=ARATHA.NCBI.gff3.file,
    cds=ARATHA.NCBI.cds, source="NCBI")
ARATHA.NCBI.gff3.longest$genepos
ARATHA.NCBI.gff3.longest$cds
## Example 2: ENSEMBL annotation (GFF3 + CDS) of Arabidopsis thaliana TAIR10
## set ENSEMBL GFF3 URL
ensembl <- "http://ftp.ensemblgenomes.org/pub/plants/release-52/"
ARATHA.ENSEMBL.gff3.url <- paste0(ensembl,
    "gff3/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.52.gff3.gz")
ARATHA.ENSEMBL.gff3.file <- tempfile()
## download GFF3 file
download.file(ARATHA.ENSEMBL.gff3.url, ARATHA.ENSEMBL.gff3.file, quiet=FALSE)
## set ENSEMBL CDS URL
ARATHA.ENSEMBL.cds.url <- paste0(ensembl,
    "fasta/arabidopsis_thaliana/cds/",
    "Arabidopsis_thaliana.TAIR10.cds.all.fa.gz")
ARATHA.ENSEMBL.cds.file <- tempfile()
## download CDS file
download.file(ARATHA.ENSEMBL.cds.url, ARATHA.ENSEMBL.cds.file, quiet=FALSE)
## load CDS
ARATHA.ENSEMBL.cds <- Biostrings::readDNAStringSet(ARATHA.ENSEMBL.cds.file)
## get genepos and longest isoform
## (source must be given explicitly here, since the default is "NCBI")
ARATHA.ENSEMBL.gff3.longest <- gff2longest(gff3file=ARATHA.ENSEMBL.gff3.file,
    cds=ARATHA.ENSEMBL.cds, source="ENSEMBL")
ARATHA.ENSEMBL.gff3.longest$genepos
ARATHA.ENSEMBL.gff3.longest$cds

## End(Not run)

kullrich/CRBHits documentation built on March 29, 2024, 11:34 a.m.