#' getFeatures
#'
#' @description Creates feature loci from a gff file
#'
#' @details Authour: Chris Ward \cr
#' Get genomic features such as 'genes', 'coding sequence', or 'exons' from gff.
#'
#' @param gffName \code{character} \cr
#' Path to gff to extract features from
#' @param contigMD \code{data.frame} \cr
#' Contig metadata. Contains two columns: \code{ID} with scaffold names, \code{length} with scaffold length. \cr
#' Used to select intergenic regions.
#' @param feature \code{character}. \cr
#' Loci to import genotypes for. c("gene", "gene:exon", "gene:cds", "pseudogene", "lncRNA", "intergenic")
#' @param nCores \code{numeric} \cr
#' Number of cores to run in parallel
#' @param includeRange \code{GRanges} or \code{GRangesList} \cr
#' Target ranges to extract features over.
#' @param longestIsoform \code{logical} \cr
#' Only effects feature = "gene:exon". \cr
#' By default function will select the first entry for each gene. \cr
#' If \code{TRUE} is passed, will select the longest isoform for that gene. \cr
#' If there is a \code{"biotype"} field in the GFF it will only deal with those labeled as 'protein coding.' \cr
#' Selecting the longest isoform may be problematic if the \code{"biotype"} is missing.
#' @param geneIdField \code{character} \cr
#' Name of the gene id field in the gff, \code{"Name"} by default.
#'
#' @import rtracklayer
#' @importFrom purrr map
#' @importFrom furrr future_map
#' @importFrom future plan
#'
#' @return A \code{GRangesList} object where each element is a genomic feature
#'
#'
#' @examples
#'
#' @name makeFeatures
#' @rdname makeFeatures-methods
#' @export
setGeneric("makeFeatures", function(gffName, feature, nCores = 1,
longestIsoform, includeRange, geneIDField, ...){
standardGeneric("makeFeatures")
})
#' @aliases makeFeatures,which-Granges
#' @export
setMethod("makeFeatures", signature(gffName = "character",
includeRange = "GRanges"),
function(gffName, feature, nCores,
longestIsoform, includeRange, geneIDField){
G_range <- import.gff(gffName, which = includeRange)
makeFeatures(gffName = G_range, feature = feature, nCores = nCores, longestIsoform = longestIsoform, geneIDField = geneIDField)
})
#' @aliases makeFeatures,which-GrangesList
#' @export
setMethod("makeFeatures", signature(gffName = "character",
includeRange = "GRangesList"),
function(gffName, feature, nCores,
longestIsoform, includeRange, geneIDField){
G_range <- import.gff(gffName, which = includeRange)
makeFeatures(gffName = G_range, feature = feature, nCores = nCores, longestIsoform = longestIsoform, geneIDField = geneIDField)
})
#' @aliases makeFeatures,which
#' @export
setMethod("makeFeatures", signature(gffName = "character"),
function(gffName, feature, nCores,
longestIsoform, geneIDField){
G_range <- import.gff(gffName)
makeFeatures(gffName = G_range, feature = feature, nCores = nCores, longestIsoform = longestIsoform, geneIDField = geneIDField)
})
#' @aliases makeFeatures,import
#' @export
setMethod("makeFeatures", signature = "GRanges",
function(gffName, feature, nCores,
longestIsoform, geneIDField){
# check ig gene is specified in the feature parameter
if(grepl("gene", feature)){
if(missing(geneIDField)) geneIDField <- "ID"
gffName$gene <- gffName@elementMetadata[[geneIDField]]
gffName@elementMetadata <- gffName@elementMetadata[,colnames(gffName@elementMetadata) %in% c("type", "ID", "Name", "gene", "Parent")]
# filter all GRanges to contain only those with type == gene
genes <- gffName[gffName$type == "gene",]
mRNA <- gffName[gffName$type == "mRNA"]
# get ID from Granges using the ID field
geneID <- genes$gene
# make into GRangesList containing only genes
genesList <- GRangesList(
split(genes,
geneID))
# if only genes are specified then return the genes GRangeList
if(feature == "gene") {
return(genesList)
}
# Continue if you also want exons
if(feature == "gene:exons"){
# get all exons from all GRanges
all <- gffName[gffName$type == "exon",]
all@elementMetadata <- all@elementMetadata[colnames(all@elementMetadata) %in% c("gene", "Parent")]
if(length(unlist(mRNA$Parent))){
mRNA_df <- tibble::tibble(geneID = unlist(mRNA$Parent), mrnaID = mRNA$ID)
gene_df <- tibble::tibble(geneName = genes$gene, geneID = genes$ID)
comb_df <- dplyr::full_join(mRNA_df, gene_df)
comb_df$mrnaID[is.na(comb_df$mrnaID)] <- comb_df$geneID[is.na(comb_df$mrnaID)]
mRNAlookup <- comb_df$geneID
names(mRNAlookup) <- comb_df$mrnaID
gNames <- unlist(all$Parent)
gNames <- unname(mRNAlookup[gNames])
geneLookup <- comb_df$geneName
names(geneLookup) <- comb_df$geneID
gNames <- unname(geneLookup[gNames])
all$gene <- gNames
} else all$gene <- unlist(all$Parent)
all <- split(all, all$gene)
# get mRNA using the gene ID field
ExonName <- names(all)
plan(multisession, workers = nCores)
all <- future_map(seq_along(all), .f = .mapFun, all, longestIsoform)
plan(sequential)
all <- dplyr::bind_rows(all)
all <- makeGRangesListFromDataFrame(all, split.field = "Name", keep.extra.columns = TRUE)
return(all)
}
if(feature == "gene:cds"){
# get all exons from all GRanges
all <- gffName[gffName$type == "CDS",]
all@elementMetadata <- all@elementMetadata[colnames(all@elementMetadata) %in% c("gene", "Parent")]
#make lookup table for CDS parent tracking
if(length(unlist(mRNA$Parent))){
mRNA_df <- tibble::tibble(geneID = unlist(mRNA$Parent), mrnaID = mRNA$ID)
gene_df <- tibble::tibble(geneName = genes$gene, geneID = genes$ID)
comb_df <- dplyr::full_join(mRNA_df, gene_df)
comb_df$mrnaID[is.na(comb_df$mrnaID)] <- comb_df$geneID[is.na(comb_df$mrnaID)]
mRNAlookup <- comb_df$geneID
names(mRNAlookup) <- comb_df$mrnaID
gNames <- unlist(all$Parent)
gNames <- unname(mRNAlookup[gNames])
geneLookup <- comb_df$geneName
names(geneLookup) <- comb_df$geneID
gNames <- unname(geneLookup[gNames])
all$gene <- gNames
} else all$gene <- unlist(all$Parent)
all <- split(all, all$gene)
plan(multisession, workers = nCores)
all <- future_map(seq_along(all), .f = .mapFun, all, longestIsoform)
plan(sequential)
all <- dplyr::bind_rows(all)
all$Name <- all$gene
all <- makeGRangesListFromDataFrame(all, split.field = "Name", keep.extra.columns = TRUE)
#CDS <- Filter(length, CDS)
return(all)
}
}
#get the psuedogenes out
if(feature == "pseudogene"){
psGenes <- gffName[gffName$type == "pseudogene"]
psGenes <- GRangesList(split(psGenes, psGenes$Name))
return(psGenes)
}
#get the lncRNA out
if(feature == "lncRNA"){
lncRNA <- gffName[gffName$type == "lnc_RNA"]
lncRNA <- GRangesList(split(lncRNA, lncRNA$Name))
return(lncRNA)
}
# get the intergenic sequences out
if(feature == "intergenic"){
#get contig range from the contigMD object
contigRange <- makeGRangesFromDataFrame(tibble(chr = contigMD[["ID"]], start = 1, strand = ".", end = contigMD[["length"]]))
# remove any chromosome annotations
noChr <- gffName[gffName$type != "chromosome"]
# join all annotations that overlap
noChr <- disjoin(noChr)
# get hits on each sequence and get overlap
hits <- findOverlaps(contigRange, noChr)
overlapHits <- extractList(noChr, as(hits, "List"))
#use hits to get intergenic regions
intergenic <- psetdiff(contigRange, overlapHits)
return(intergenic)
}
})
.mapFun <- function(y, all, longestIsoform){
rec <- all[[y]]
mRNA <- split(rec, as.character(rec$Parent))
#select longest isoform
if(longestIsoform){
#check for biotype to only select isoforms with 'protien_coding' and 'TEC'
#if("biotype" %in% names(mRNA@elementMetadata)){
# if(length(mRNA$biotype) != 1) mRNA <- subset(mRNA, biotype %in% c("protein_coding", "TEC"))
#
# }
#
isoformLengths <- unlist(lapply(mRNA, function(x){
sum(width(x))
}))
# get the max length sequence, extract first element in longest if there are multiple longest
longest <- which(isoformLengths == max(isoformLengths))[1]
mRNA <- mRNA[[longest]]} else mRNA <- mRNA[[1]]
mRNA <- tibble::as_tibble(mRNA)
mRNA <- mRNA[colnames(mRNA) != "Parent"]
mRNA
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.