knitr::opts_chunk$set(echo = TRUE) library(tidyverse) library(stringr) library(Biostrings) library(GenomicFeatures) library(GenomicRanges) library("BSgenome.Hsapiens.NCBI.GRCh38") library("BSgenome.Mmusculus.UCSC.mm10")
#This reads GENCODE GFF3 files for mouse and human into a dataframe with options to filter out undesireable transcripts #undesireable transcripts = transcript_support_level > 2 or contains a "not_found" tag for either mRNA start/end or CDS start/end #It also contains the option to filter for protein coding transcripts #this requires the package stringr ##TODO: ## expand to non-GENCODE GFF3s? GFF2df <- function(genomeGFF3, filter = TRUE, protein.coding = FALSE){ gff_df <- read.table(file = genomeGFF3, col.names = c("seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"), colClasses = c("character", "character", "character", "integer", "integer", "character", "character", "character", "character")) %>% as_tibble() %>% mutate(ID = ifelse(grepl("ID=", attributes), str_match(attributes, "ID=(.*?);")[,2], "NA"), gene_id = ifelse(grepl("gene_id=", attributes), str_match(attributes, "gene_id=(.*?);")[,2], "NA"), gene_name = ifelse(grepl("gene_name=", attributes), str_match(attributes, "gene_name=(.*?);")[,2], "NA"), gene_type = ifelse(grepl("gene_type=", attributes), str_match(attributes, "gene_type=(.*?);")[,2], "NA"), transcript_id = ifelse(grepl("transcript_id=", attributes), str_match(attributes, "transcript_id=(.*?);")[,2], "NA"), transcript_name = ifelse(grepl("transcript_name=", attributes), str_match(attributes, "transcript_name=(.*?);")[,2], "NA"), transcript_type = ifelse(grepl("transcript_type=", attributes), str_match(attributes, "transcript_type=(.*?);")[,2], "NA"), Parent = ifelse(grepl("Parent=", attributes), str_match(attributes, "Parent=(.*?);gene")[,2], "NA"), tag = ifelse(grepl("tag=", attributes), str_match(attributes, "tag=(.*?);")[,2], "NA"), transcript_support_level = ifelse(grepl("transcript_support_level=", attributes), str_match(attributes, "transcript_support_level=(.*?);")[,2], "NA"), exon_id = ifelse(grepl("exon_id=", attributes), str_match(attributes, "exon_id=(.*?);")[,2], "NA"), exon_number = ifelse(grepl("exon_number=", attributes), str_match(attributes, "exon_number=(.*?);")[,2], "NA")) ifelse(filter == TRUE, gff_df <- gff_df %>% filter(transcript_support_level == "1" | transcript_support_level == "2" | transcript_support_level == "NA", !grepl("_NF", tag)), "") ifelse(protein.coding == TRUE, gff_df <- gff_df %>% filter(transcript_type == "protein_coding" | transcript_type == "NA"), "") gff_df } #These take less than 4 minutes each. #mmGFF <- GFF2df("mydata/gencode.vM20.annotation.gff3.gz") #hsGFF <- GFF2df("mydata/gencode.v33.annotation.gff3.gz")
#This subsets the GFF dataframe by a list of transcripts of interest. #it also can return a sequence type of "three_prime_UTR", "five_prime_UTR", "exon", or "CDS" mm_tx <- c("ENSMUST00000159265.1", "ENSMUST00000027032.5", "ENSMUST00000130201.7", "ENSMUST00000157375.1") hs_tx <- c("ENST00000456328.2", "ENST00000338338.9", "ENST00000478641.5", "ENST00000356026.10", "ENST00000607222.1", "ENST00000342066.8") subsetGFFdf <- function(GFF_df, transcript_list, seq_type){ s_gff_df <- GFF_df %>% filter(transcript_id %in% transcript_list) %>% filter(type == seq_type) s_gff_df } #sGFF <- subsetGFFdf(hsGFF, hs_tx, "CDS") #sGFF <- subsetGFFdf(mmGFF, mm_tx, "CDS")
#This writes a gff3 file from a gff_df writeGFF <- function(gff_df, file_name){ gff_df <- gff_df %>% select(seqid, source, type, start, end, score, strand, phase, attributes) colnames(gff_df) <- NULL rownames(gff_df) <- NULL write.table(gff_df, paste(file_name, ".gff3", sep = ""), row.names = FALSE, quote = FALSE) } #writeGFF(sGFF, "mydata/test")
#this will read large genome fasta files into a DNA string set #this requires the package Biostrings ##TODO ## Big problem here, human genome fasta is too large to make a string set...maybe fixable? maybe not... ## this is where I switched to using BSgenome and other packages fa2DNAstringSet <- function(fa){ fa <- readDNAStringSet(fa) names(fa) <- sub(" .*", "", names(fa)) fa } #mmGenome_DNAstringSet <- fa2DNAstringSet("mydata/GRCm38.p6.genome.fa") #hsGenome_DNAstringSet <- fa2DNAstringSet("mydata/GRCh38.p13.genome.fa")
#This will write a fasta of sequences from a gff data frame grouping by both gene and transcript ##TODO: ## need to finish this ## maybe wont though because fasta genomes are so big... GFF2FA <- function(GFF_df, genome_DNAstringSet, fa_name){ coord_list <- GFF_df %>% mutate(range = paste(start, end, sep = ":")) %>% group_by(gene_id, transcript_id) %>% summarize(coord_list = list(range)) %>% pull(., coord_list) chr_list <- GFF %>% group_by(gene_id, transcript_id) %>% summarize(chr_list = list(seqid)) %>% pull(., chr_list) strand_list <- GFF %>% group_by(gene_id, transcript_id) %>% summarize(strand_list = list(strand)) %>% pull(., strand_list) } #gFF2FA(sGFF, DNAstringset, "mydata/test.fa")
#filter_TX() filters out transcripts from gencode gff files #this is compatible with human and mouse gencode annotations #downloaded from https://www.gencodegenes.org/human/ or https://www.gencodegenes.org/mouse/ #there are two filter options, a general filter and a protein coding filter. #The general filter removes any transcripts with undefined ends or with a support level higher than 2 (low confidence transcripts) #The protein coding filter removes all transcripts that lack a protein coding tag. #The default is to remove low confidence transcripts (general filter) but keep all transcripts regardless of protein coding status #the output from filter_Tx() is a TxDb object containing a gene model of the filtered transcripts #this Txdb object is required for downstream functions #if filtering is not desired, the below functions will create TxDb objects for human and mouse Gencode data #hs_gff <- GenomicFeatures::makeTxDbFromGFF(file = "mydata/Gencodedat/gencode.v33.annotation.gff3.gz", format = "gff3", organism = "Homo sapiens") # 11min #GenomeInfoDb::seqlevelsStyle(hs_gff) <- "NCBI" #mm_gff <- GenomicFeatures::makeTxDbFromGFF(file = "mydata/Gencodedat/gencode.vM20.annotation.gff3.gz", format = "gff3", organism = "Mus musculus") filter_Tx <- function(gff, filter = TRUE, protein.coding = FALSE){ gff <- rtracklayer::import(gff) if (filter == TRUE){ print("filtering out low confidence transcripts...", quote = FALSE) #remove low confidence transcripts gff <- gff[gff$transcript_support_level == "1" | gff$transcript_support_level == "2" | is.na(gff$transcript_support_level)] #remove transcripts with undefined starts or ends NF <- c("cds_start_NF", "cds_end_NF", "mRNA_start_NF", "mRNA_end_NF") `%out%` = Negate(`%in%`) gff <- gff[as.list(gff$tag) %out% NF] print("filtering complete.", quote = FALSE) } if (protein.coding == TRUE){ print("filtering out non-coding transcripts...", quote = FALSE) #remove non-coding transcripts gff <- gff[gff$transcript_type == "protein_coding" | is.na(gff$transcript_type)] print("filtering complete.", quote = FALSE) } #create txdb gene model from filtered gff if (all(grepl("ENSG", gff$gene_id))){ txdbGFF <- GenomicFeatures::makeTxDbFromGRanges(gff) GenomeInfoDb::seqlevelsStyle(txdbGFF) <- "NCBI" } else if (all(grepl("ENSMUSG", gff$gene_id))){ txdbGFF <- GenomicFeatures::makeTxDbFromGRanges(gff) } return(txdbGFF) } mm_f_gff <- filter_Tx("mydata/Gencodedat/gencode.vM20.annotation.gff3.gz") #3.5 min hs_f_gff <- filter_Tx("mydata/Gencodedat/gencode.v33.annotation.gff3.gz") #1.5 min
#RNA_breakdown is a simple analysis that reveals the types of RNAs present in a transcript list #this can be used to better understand your case and control sets of transcripts #If you instead have sets of genes, we recommend using expression data to find expresed transcripts within your experiment #otherwise, use gene2tx() to create transcript sets from either make_longest_df or make_median_df RNA_breakdown <- function(gff, tx_list) { # Check that tx_list doesn't contain Ensembl gene IDs print("Ensuring compatibility of GFF and transcript list...", quote = FALSE) if (any(grepl("ENSG", tx_list)) | any(grepl("ENSMUSG", tx_list))) { stop("Please ensure the transcript list contains transcript IDs and not gene IDs.") } # Determine the species of tx_list ifelse(all(grepl("ENST", tx_list)), (list_species = "human"), ifelse(all(grepl("ENSMUST", tx_list)), (list_species = "mouse"), (list_species = "unknown"))) # Import gencode gff as GRanges object gff <- rtracklayer::import(gff) # Determine species of gff annotations gff_species = "unknown" if (all(grepl("ENSG", gff$gene_id))) { gff_species = "human" } else if (all(grepl("ENSMUSG", gff$gene_id))) { gff_species = "mouse" } # Ensure tx_list and gff are of the same species if (list_species != gff_species) { stop("Please ensure the transcript list and gff are of the same species.") } print("Gff and transcript list are compatible.", quote = FALSE) print("Getting RNA type information...") gff <- gff[gff$type == "transcript"] gff <- gff[gff$transcript_id %in% tx_list] if (length(gff) == 0) { stop("No transcript IDs in tx_list were not found in gff.") } RNA_breakdown <- gff@elementMetadata %>% dplyr::as_tibble() %>% dplyr::group_by(transcript_type) %>% dplyr::summarize(count = n(), percent = n() / nrow(.) * 100) return(RNA_breakdown) } RNA_breakdown("mydata/Gencodedat/gencode.vM20.annotation.gff3.gz", mm_tx)
##This function writes gff3 files or fasta files from genome TxDb object subsetted with a transcript list #sequences extracted can be entire transcripts (whole) or just the CDS, 3`UTR or 5`UTR sequences. #If you instead have sets of genes, we recommend using expression data to find expresed transcripts within your experiment to represent your genes of interest. #otherwise, use gene2tx() to create transcript sets from either make_longest_df or make_median_df write_Sequence <- function(TxDb_gff, tx_list, seq_type, file_name, output_type){ #Check that transcript list contains transcripts print("Ensuring compatibility of GFF and transcript list...", quote = FALSE) if (any(grepl("ENSG", tx_list)) | any(grepl("ENSMUSG", tx_list))){ stop("Please ensure the transcript list contains transcript IDs and not gene IDs.") } #Make sure gff and transcript list are the same species ifelse(all(grepl("ENST", tx_list)), (list_species = "human"), ifelse(all(grepl("ENSMUST", tx_list)), (list_species = "mouse"), (list_species = "unknown"))) ifelse(grepl("ENSG", GenomicFeatures::genes(TxDb_gff)[1]$gene_id), (gff_species = "human"), ifelse(grepl("ENSMUSG", GenomicFeatures::genes(TxDb_gff)[1]$gene_id), (gff_species = "mouse"), (gff_species = "unknown"))) if (list_species != gff_species){ stop("Please ensure the transcript list and gff are of the same species.") } print("GFF and transcript list are compatible.", quote = FALSE) #Subset the gffs by transcript and seq type tx_gff <- NULL if (seq_type == "whole"){ print("filtering whole transcripts...", quote = FALSE) tx_gff <- GenomicFeatures::exonsBy(TxDb_gff, "tx", use.names = TRUE) tx_gff <- tx_gff[names(tx_gff) %in% tx_list] print("filtering complete.", quote = FALSE) } else if (seq_type == "CDS"){ print("filtering CDS...", quote = FALSE) tx_gff <- GenomicFeatures::cdsBy(TxDb_gff, "tx", use.names = TRUE) tx_gff <- tx_gff[names(tx_gff) %in% tx_list] print("filtering complete.", quote = FALSE) } else if (seq_type == "UTR5"){ print("filtering 5' UTRs...", quote = FALSE) tx_gff <- GenomicFeatures::fiveUTRsByTranscript(TxDb_gff, use.names = TRUE) tx_gff <- tx_gff[names(tx_gff) %in% tx_list] print("filtering complete.", quote = FALSE) } else if (seq_type == "UTR3"){ print("filtering 3' UTRs...", quote = FALSE) tx_gff <- GenomicFeatures::threeUTRsByTranscript(TxDb_gff, use.names = TRUE) tx_gff <- tx_gff[names(tx_gff) %in% tx_list] print("filtering complete.", quote = FALSE) } else if(is.null(tx_gff) == TRUE){ stop(paste("no sequences of type \"", seq_type, "\" found in transcript list", sep = ""), quote = FALSE) } else stop("not appropriate seq type. please use one of \"whole\", \"CDS\", \"UTR5\" or \"UTR3\"", quote = FALSE) #Get the sequences from subset GFF if (list_species == "human" & gff_species == "human"){ print("extracting sequences...", quote = FALSE) seq <- GenomicFeatures::extractTranscriptSeqs(BSgenome.Hsapiens.NCBI.GRCh38::Hsapiens, tx_gff) } else if (list_species == "mouse" & gff_species == "mouse"){ print("extracting sequences...", quote = FALSE) seq <- GenomicFeatures::extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10::Mmusculus, tx_gff) } else stop("Please ensure the transcript list and gff are of the same species.") names(seq) <- paste(names(seq), seq_type, sep = "_") print("extracting sequences complete.", quote = FALSE) #output sequences in desired format if (output_type == "fa"){ print("writing fasta...", quote = FALSE) Biostrings::writeXStringSet(seq, paste(file_name, ".fa", sep = ""), format = "fasta") print("fasta writen.", quote = FALSE) } else if (output_type == "gff3"){ print("writing GFF3...", quote = FALSE) rtracklayer::export(tx_gff, paste(file_name, ".gff3", sep = ""), format = "gff3") print("GFF3 written.", quote = FALSE) } else if (output_type == "both"){ print("writing fasta...", quote = FALSE) Biostrings::writeXStringSet(seq, paste(file_name, ".fa", sep = ""), format = "fasta") print("fasta writen.", quote = FALSE) print("writing GFF3...", quote = FALSE) rtracklayer::export(tx_gff, paste(file_name, ".gff3", sep = ""), format = "gff3") print("GFF3 written.", quote = FALSE) } else stop("not appropriate format, please use \"fa\", \"gff3\" or \"both\"", quote = FALSE) print(paste("Querried ", length(tx_gff), " of ", length(tx_list), " requested genes.", sep = ""), quote = FALSE) print(paste("Querried ", sum(Biostrings::width(seq))/1000, " KB of sequence.", sep = ""), quote = FALSE) } #write_Sequence(hs_gff, hs_tx, "CDS", "mydata/hs_test", "gff3") #write_Sequence(mm_gff, mm_tx, "CDS", "mydata/mm_test", "gff3")
#This creates a dataframe of the transcripts with the longest features for each gene in a TxDb object #this takes a minute or two so its better to run it once. make_longest_df <- function(TxDb_gff){ #identify transcript with longest feature for each gene len_df <- GenomicFeatures::transcriptLengths(TxDb_gff, with.cds_len = TRUE, with.utr5_len = TRUE, with.utr3_len = TRUE) %>% dplyr::as_tibble() %>% dplyr::mutate(gene_id = sub("\\..*", "", gene_id)) longest_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::top_n(1, tx_len) %>% dplyr::rename("whole" = "tx_name") %>% dplyr::select(gene_id, whole) longest_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::top_n(1, cds_len) %>% dplyr::rename("CDS" = "tx_name") %>% dplyr::select(gene_id, CDS) %>% dplyr::left_join(., longest_tx) longest_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::top_n(1, utr5_len) %>% dplyr::rename("UTR5" = "tx_name") %>% dplyr::select(gene_id, UTR5) %>% dplyr::left_join(., longest_tx) longest_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::top_n(1, utr3_len) %>% dplyr::rename("UTR3" = "tx_name") %>% dplyr::select(gene_id, UTR3) %>% dplyr::left_join(., longest_tx) longest_tx <- longest_tx %>% dplyr::group_by(gene_id) %>% dplyr::summarize(whole = dplyr::first(whole), CDS = dplyr::first(CDS), UTR5 = dplyr::first(UTR5), UTR3 = dplyr::first(UTR3)) } #longest_mm <- make_longest_df(mm_f_gff) #18s #longest_hs <- make_longest_df(hs_f_gff) #5.5min
#This creates a dataframe of the transcripts with the median length features for each gene in a TxDb object #this takes a minute or two so its better to run it once. make_median_df <- function(TxDb_gff){ #identify transcript with longest feature for each gene len_df <- GenomicFeatures::transcriptLengths(TxDb_gff, with.cds_len = TRUE, with.utr5_len = TRUE, with.utr3_len = TRUE) %>% dplyr::as_tibble() %>% dplyr::mutate(gene_id = sub("\\..*", "", gene_id)) #ignore isoforms with no feature when calculating median (many transcripts lack CDS, UTR3 and UTR5 features) len_df[len_df == 0] <- NA median_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::filter(abs(tx_len - median(tx_len, na.rm = TRUE)) == min(abs(tx_len - median(tx_len, na.rm = TRUE)), na.rm = TRUE)) %>% dplyr::rename("whole" = tx_name) %>% dplyr::select(gene_id, whole) median_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::filter(abs(cds_len - median(cds_len, na.rm = TRUE)) == min(abs(cds_len - median(cds_len, na.rm = TRUE)), na.rm = TRUE)) %>% dplyr::rename("CDS" = tx_name) %>% dplyr::select(gene_id, CDS) %>% dplyr::left_join(., median_tx) median_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::filter(abs(utr5_len - median(utr5_len, na.rm = TRUE)) == min(abs(utr5_len - median(utr5_len, na.rm = TRUE)), na.rm = TRUE)) %>% dplyr::rename("UTR5" = tx_name) %>% dplyr::select(gene_id, UTR5) %>% dplyr::left_join(., median_tx) median_tx <- len_df %>% dplyr::group_by(gene_id) %>% dplyr::filter(abs(utr3_len - median(utr3_len, na.rm = TRUE)) == min(abs(utr3_len - median(utr3_len, na.rm = TRUE)), na.rm = TRUE)) %>% dplyr::rename("UTR3" = tx_name) %>% dplyr::select(gene_id, UTR3) %>% dplyr::left_join(., median_tx) median_tx <- median_tx %>% dplyr::group_by(gene_id) %>% dplyr:: summarize(whole = dplyr::first(whole), CDS = dplyr::first(CDS), UTR5 = dplyr::first(UTR5), UTR3 = dplyr::first(UTR3)) } #median_mm <- make_median_df(mm_f_gff) #median_hs <- make_median_df(hs_f_gff)
#This takes a gene list and returns a list of transcripts with the longest or median length feature of interest #hs_gene <- c("ENSG00000000419.12", "ENSG00000001167.14", "ENSG00000000938.13") gene2Tx <- function(length_df, gene_list, seq_type){ #Check that gene list contains genes if (all(grepl("ENST", gene_list)) | all(grepl("ENSMUST", gene_list))){ stop("Please ensure the gene list contains gene IDs and not transcript IDs.") } #Make sure df and gene list are the same species ifelse (all(grepl("ENSG", gene_list)), (list_species = "human"), ifelse (all(grepl("ENSMUSG", gene_list)), (list_species = "mouse"), (list_species = "unknown"))) ifelse (grepl("ENSG", length_df$gene_id[1]), (gff_species = "human"), ifelse (grepl("ENSMUSG", length_df$gene_id[1]), (gff_species = "mouse"), (gff_species = "unknown"))) if (list_species != gff_species){ stop("Please ensure the gene list and gff are of the same species.") } #create longest transcript list from gene list tx_list <- length_df %>% dplyr::filter(gene_id %in% gene_list) %>% dplyr::pull(., seq_type) return(tx_list) } #gene2Tx(longest_hs, hs_gene, "UTR3")
# This will return transcripts represetning all other genes not in gene_list #this isnt really recommended as not all genes will be exprsesed in a given system. get_ctrl_tx <- function(length_df, gene_list, seq_type){ #Check that gene list contains genes if (all(grepl("ENST", gene_list)) | all(grepl("ENSMUST", gene_list))){ stop("Please ensure the gene list contains gene IDs and not transcript IDs.") } #Make sure df and gene list are the same species ifelse (all(grepl("ENSG", gene_list)), (list_species = "human"), ifelse (all(grepl("ENSMUSG", gene_list)), (list_species = "mouse"), (list_species = "unknown"))) ifelse (grepl("ENSG", length_df$gene_id[1]), (gff_species = "human"), ifelse (grepl("ENSMUSG", length_df$gene_id[1]), (gff_species = "mouse"), (gff_species = "unknown"))) if (list_species != gff_species){ stop("Please ensure the gene list and gff are of the same species.") } #get longest transcript list of alll other genes not in gene list `%out%` = Negate(`%in%`) ctrl_tx_list <- length_df %>% dplyr::filter(gene_id %out% gene_list) %>% dplyr::pull(., seq_type) return(ctrl_tx_list) } mm_case_gene <- c("ENSMUSG00000097392", "ENSMUSG00000025607", "ENSMUSG00000030671", "ENSMUSG00000034764", "ENSMUSG00000116215", "ENSMUSG00000039556", "ENSMUSG00000066510", "ENSMUSG00000018160", "ENSMUSG00000114306", "ENSMUSG00000028277", "ENSMUSG00000037216", "ENSMUSG00000032299") #12genes get_ctrl_tx(longest_mm, mm_case_gene, "UTR3")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.