knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(stringr)
library(Biostrings)
library(GenomicFeatures)
library(GenomicRanges)
library("BSgenome.Hsapiens.NCBI.GRCh38")
library("BSgenome.Mmusculus.UCSC.mm10")

Here are my attempts to re-invent the wheel with a few self written functions

it goes ok for a while

#This reads GENCODE GFF3 files for mouse and human into a dataframe with options to filter out undesireable transcripts
#undesireable transcripts = transcript_support_level > 2 or contains a "not_found" tag for either mRNA start/end or CDS start/end
#It also contains the option to filter for protein coding transcripts
#this requires the package stringr
##TODO: 
##      expand to non-GENCODE GFF3s? 
GFF2df <- function(genomeGFF3, filter = TRUE, protein.coding = FALSE){

  gff_df <- read.table(file = genomeGFF3, 
                    col.names = c("seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"),
                    colClasses = c("character", "character", "character", "integer", "integer", "character", "character", "character", "character")) %>% 
    as_tibble() %>% 
    mutate(ID = ifelse(grepl("ID=", attributes), str_match(attributes, "ID=(.*?);")[,2], "NA"), 
           gene_id = ifelse(grepl("gene_id=", attributes), str_match(attributes, "gene_id=(.*?);")[,2], "NA"),
           gene_name = ifelse(grepl("gene_name=", attributes), str_match(attributes, "gene_name=(.*?);")[,2], "NA"),
           gene_type = ifelse(grepl("gene_type=", attributes), str_match(attributes, "gene_type=(.*?);")[,2], "NA"),
           transcript_id = ifelse(grepl("transcript_id=", attributes), str_match(attributes, "transcript_id=(.*?);")[,2], "NA"),
           transcript_name = ifelse(grepl("transcript_name=", attributes), str_match(attributes, "transcript_name=(.*?);")[,2], "NA"),
           transcript_type = ifelse(grepl("transcript_type=", attributes), str_match(attributes, "transcript_type=(.*?);")[,2], "NA"),
           Parent = ifelse(grepl("Parent=", attributes), str_match(attributes, "Parent=(.*?);gene")[,2], "NA"),
           tag = ifelse(grepl("tag=", attributes), str_match(attributes, "tag=(.*?);")[,2], "NA"),
           transcript_support_level = ifelse(grepl("transcript_support_level=", attributes), str_match(attributes, "transcript_support_level=(.*?);")[,2], "NA"),
           exon_id = ifelse(grepl("exon_id=", attributes), str_match(attributes, "exon_id=(.*?);")[,2], "NA"),
           exon_number = ifelse(grepl("exon_number=", attributes), str_match(attributes, "exon_number=(.*?);")[,2], "NA")) 

  ifelse(filter == TRUE,
         gff_df <- gff_df %>% filter(transcript_support_level == "1" | transcript_support_level == "2" | transcript_support_level == "NA", !grepl("_NF", tag)), "")

  ifelse(protein.coding == TRUE,
         gff_df <- gff_df %>% filter(transcript_type == "protein_coding" | transcript_type == "NA"), "")

   gff_df

}
#These take less than 4 minutes each.
#mmGFF <- GFF2df("mydata/gencode.vM20.annotation.gff3.gz")
#hsGFF <- GFF2df("mydata/gencode.v33.annotation.gff3.gz")
#This subsets the GFF dataframe by a list of transcripts of interest.
#it also can return a sequence type of "three_prime_UTR", "five_prime_UTR", "exon", or "CDS"
mm_tx <- c("ENSMUST00000159265.1", "ENSMUST00000027032.5", "ENSMUST00000130201.7", "ENSMUST00000157375.1")
hs_tx <- c("ENST00000456328.2", "ENST00000338338.9", "ENST00000478641.5", "ENST00000356026.10", "ENST00000607222.1", "ENST00000342066.8")
subsetGFFdf <- function(GFF_df, transcript_list, seq_type){

  s_gff_df <- GFF_df %>% filter(transcript_id %in% transcript_list) %>% filter(type == seq_type)

  s_gff_df

}
#sGFF <- subsetGFFdf(hsGFF, hs_tx, "CDS")
#sGFF <- subsetGFFdf(mmGFF, mm_tx, "CDS")
#This writes a gff3 file from a gff_df
writeGFF <- function(gff_df, file_name){
  gff_df <- gff_df %>% select(seqid, source, type, start, end, score, strand, phase, attributes) 
  colnames(gff_df) <- NULL
  rownames(gff_df) <- NULL
  write.table(gff_df, paste(file_name, ".gff3", sep = ""), row.names = FALSE, quote = FALSE)
}
#writeGFF(sGFF, "mydata/test")

Here is where my attempts to re-invent the wheel fail

turns out genomes are big...

please ignore this...

#this will read large genome fasta files into a DNA string set
#this requires the package Biostrings
##TODO
##    Big problem here, human genome fasta is too large to make a string set...maybe fixable? maybe not...
##    this is where I switched to using BSgenome and other packages
fa2DNAstringSet <- function(fa){
  fa <-  readDNAStringSet(fa)
  names(fa) <- sub(" .*", "", names(fa))
  fa
}
#mmGenome_DNAstringSet <- fa2DNAstringSet("mydata/GRCm38.p6.genome.fa")
#hsGenome_DNAstringSet <- fa2DNAstringSet("mydata/GRCh38.p13.genome.fa")
#This will write a fasta of sequences from a gff data frame grouping by both gene and transcript
##TODO:
##     need to finish this
##     maybe wont though because fasta genomes are so big...
GFF2FA <- function(GFF_df, genome_DNAstringSet, fa_name){

  coord_list <- GFF_df %>% mutate(range = paste(start, end, sep = ":")) %>% group_by(gene_id, transcript_id) %>% summarize(coord_list = list(range)) %>% pull(., coord_list)
  chr_list <- GFF %>% group_by(gene_id, transcript_id) %>% summarize(chr_list = list(seqid)) %>% pull(., chr_list)
  strand_list <- GFF %>% group_by(gene_id, transcript_id) %>% summarize(strand_list = list(strand)) %>% pull(., strand_list)


}
#gFF2FA(sGFF, DNAstringset, "mydata/test.fa")

Heres where the good stuff starts!

Back to writing functions with more packages


#filter_TX() filters out transcripts from gencode gff files
#this is compatible with human and mouse gencode annotations
#downloaded from https://www.gencodegenes.org/human/ or https://www.gencodegenes.org/mouse/
#there are two filter options, a general filter and a protein coding filter.
#The general filter removes any transcripts with undefined ends or with a support level higher than 2 (low confidence transcripts)
#The protein coding filter removes all transcripts that lack a protein coding tag.
#The default is to remove low confidence transcripts (general filter) but keep all transcripts regardless of protein coding status
#the output from filter_Tx() is a TxDb object containing a gene model of the filtered transcripts
#this Txdb object is required for downstream functions
#if filtering is not desired, the below functions will create TxDb objects for human and mouse Gencode data
#hs_gff <- GenomicFeatures::makeTxDbFromGFF(file = "mydata/Gencodedat/gencode.v33.annotation.gff3.gz", format = "gff3", organism = "Homo sapiens") # 11min 
#GenomeInfoDb::seqlevelsStyle(hs_gff) <- "NCBI"
#mm_gff <- GenomicFeatures::makeTxDbFromGFF(file = "mydata/Gencodedat/gencode.vM20.annotation.gff3.gz", format = "gff3", organism = "Mus musculus")

filter_Tx <- function(gff, filter = TRUE, protein.coding = FALSE){
  gff <- rtracklayer::import(gff)
  if (filter == TRUE){
    print("filtering out low confidence transcripts...", quote = FALSE)
    #remove low confidence transcripts
    gff <- gff[gff$transcript_support_level == "1" | gff$transcript_support_level == "2" | is.na(gff$transcript_support_level)]
    #remove transcripts with undefined starts or ends
    NF <- c("cds_start_NF", "cds_end_NF", "mRNA_start_NF", "mRNA_end_NF")
    `%out%` = Negate(`%in%`)
    gff <- gff[as.list(gff$tag) %out% NF]
    print("filtering complete.", quote = FALSE)
  }

  if (protein.coding == TRUE){
    print("filtering out non-coding transcripts...", quote = FALSE)
    #remove non-coding transcripts
    gff <- gff[gff$transcript_type == "protein_coding" | is.na(gff$transcript_type)]
    print("filtering complete.", quote = FALSE)
  }

  #create txdb gene model from filtered gff
  if (all(grepl("ENSG", gff$gene_id))){
    txdbGFF <- GenomicFeatures::makeTxDbFromGRanges(gff)
    GenomeInfoDb::seqlevelsStyle(txdbGFF) <- "NCBI"
  } else if (all(grepl("ENSMUSG", gff$gene_id))){
    txdbGFF <- GenomicFeatures::makeTxDbFromGRanges(gff)
  }

  return(txdbGFF)

}

mm_f_gff <- filter_Tx("mydata/Gencodedat/gencode.vM20.annotation.gff3.gz") #3.5 min
hs_f_gff <- filter_Tx("mydata/Gencodedat/gencode.v33.annotation.gff3.gz") #1.5 min
#RNA_breakdown is a simple analysis that reveals the types of RNAs present in a transcript list
#this can be used to better understand your case and control sets of transcripts
#If you instead have sets of genes, we recommend using expression data to find expresed transcripts within your experiment
#otherwise, use gene2tx() to create transcript sets from either make_longest_df or make_median_df

RNA_breakdown <- function(gff, tx_list) {
  # Check that tx_list doesn't contain Ensembl gene IDs
  print("Ensuring compatibility of GFF and transcript list...", quote = FALSE)
  if (any(grepl("ENSG", tx_list)) | any(grepl("ENSMUSG", tx_list))) {
    stop("Please ensure the transcript list contains transcript IDs and not gene IDs.")
  }
  # Determine the species of tx_list
  ifelse(all(grepl("ENST", tx_list)),
         (list_species = "human"),
         ifelse(all(grepl("ENSMUST", tx_list)),
                (list_species = "mouse"),
                (list_species = "unknown")))
  # Import gencode gff as GRanges object
  gff <- rtracklayer::import(gff)
  # Determine species of gff annotations
  gff_species = "unknown"
  if (all(grepl("ENSG", gff$gene_id))) {
    gff_species = "human"
  } else if (all(grepl("ENSMUSG", gff$gene_id))) {
    gff_species = "mouse"
  }
  # Ensure tx_list and gff are of the same species
  if (list_species != gff_species) {
    stop("Please ensure the transcript list and gff are of the same species.")
  }
  print("Gff and transcript list are compatible.", quote = FALSE)

  print("Getting RNA type information...")
  gff <- gff[gff$type == "transcript"]
  gff <- gff[gff$transcript_id %in% tx_list]

  if (length(gff) == 0) {
    stop("No transcript IDs in tx_list were not found in gff.")
  }

  RNA_breakdown <- gff@elementMetadata %>%
    dplyr::as_tibble() %>%
    dplyr::group_by(transcript_type) %>%
    dplyr::summarize(count = n(),
                     percent = n() / nrow(.) * 100)

  return(RNA_breakdown)
}
RNA_breakdown("mydata/Gencodedat/gencode.vM20.annotation.gff3.gz", mm_tx)
##This function writes gff3 files or fasta files from genome TxDb object subsetted with a transcript list
#sequences extracted can be entire transcripts (whole) or just the CDS, 3`UTR or 5`UTR sequences.
#If you instead have sets of genes, we recommend using expression data to find expresed transcripts within your experiment to represent your genes of interest.
#otherwise, use gene2tx() to create transcript sets from either make_longest_df or make_median_df

write_Sequence <- function(TxDb_gff, tx_list, seq_type, file_name, output_type){
  #Check that transcript list contains transcripts

  print("Ensuring compatibility of GFF and transcript list...", quote = FALSE)

  if (any(grepl("ENSG", tx_list)) | any(grepl("ENSMUSG", tx_list))){
         stop("Please ensure the transcript list contains transcript IDs and not gene IDs.")
  }

  #Make sure gff and transcript list are the same species

  ifelse(all(grepl("ENST", tx_list)), 
         (list_species = "human"), 
         ifelse(all(grepl("ENSMUST", tx_list)), 
                (list_species = "mouse"), 
                (list_species = "unknown")))

  ifelse(grepl("ENSG", GenomicFeatures::genes(TxDb_gff)[1]$gene_id), 
         (gff_species = "human"), 
         ifelse(grepl("ENSMUSG", GenomicFeatures::genes(TxDb_gff)[1]$gene_id),
                (gff_species = "mouse"), 
                (gff_species = "unknown")))

  if (list_species != gff_species){ 
    stop("Please ensure the transcript list and gff are of the same species.")
  }

  print("GFF and transcript list are compatible.", quote = FALSE)

  #Subset the gffs by transcript and seq type

  tx_gff <- NULL

  if (seq_type == "whole"){
  print("filtering whole transcripts...", quote = FALSE)
  tx_gff <- GenomicFeatures::exonsBy(TxDb_gff, "tx", use.names = TRUE)
  tx_gff <- tx_gff[names(tx_gff) %in% tx_list]
  print("filtering complete.", quote = FALSE)

  } else if (seq_type == "CDS"){
  print("filtering CDS...", quote = FALSE)
  tx_gff <- GenomicFeatures::cdsBy(TxDb_gff, "tx", use.names = TRUE)
  tx_gff <- tx_gff[names(tx_gff) %in% tx_list]
  print("filtering complete.", quote = FALSE)

  } else if (seq_type == "UTR5"){
  print("filtering 5' UTRs...", quote = FALSE)
  tx_gff <- GenomicFeatures::fiveUTRsByTranscript(TxDb_gff, use.names = TRUE)
  tx_gff <- tx_gff[names(tx_gff) %in% tx_list]
  print("filtering complete.", quote = FALSE)

  } else if (seq_type == "UTR3"){
  print("filtering 3' UTRs...", quote = FALSE)
  tx_gff <- GenomicFeatures::threeUTRsByTranscript(TxDb_gff, use.names = TRUE)
  tx_gff <- tx_gff[names(tx_gff) %in% tx_list]
  print("filtering complete.", quote = FALSE)

  } else if(is.null(tx_gff) == TRUE){
  stop(paste("no sequences of type \"", seq_type, "\" found in transcript list", sep = ""), quote = FALSE)

    } else 
  stop("not appropriate seq type. please use one of \"whole\", \"CDS\", \"UTR5\" or \"UTR3\"", quote = FALSE)

  #Get the sequences from subset GFF

  if (list_species == "human" & gff_species == "human"){  
  print("extracting sequences...", quote = FALSE)
  seq <- GenomicFeatures::extractTranscriptSeqs(BSgenome.Hsapiens.NCBI.GRCh38::Hsapiens, tx_gff)

  } else if (list_species == "mouse" & gff_species == "mouse"){
  print("extracting sequences...", quote = FALSE)
  seq <- GenomicFeatures::extractTranscriptSeqs(BSgenome.Mmusculus.UCSC.mm10::Mmusculus, tx_gff)

  } else
  stop("Please ensure the transcript list and gff are of the same species.")

  names(seq) <- paste(names(seq), seq_type, sep = "_")

  print("extracting sequences complete.", quote = FALSE)

  #output sequences in desired format

  if (output_type == "fa"){
    print("writing fasta...", quote = FALSE)
    Biostrings::writeXStringSet(seq, paste(file_name, ".fa", sep = ""), format = "fasta")
    print("fasta writen.", quote = FALSE)

    } else if (output_type == "gff3"){
    print("writing GFF3...", quote = FALSE)
    rtracklayer::export(tx_gff, paste(file_name, ".gff3", sep = ""), format = "gff3")
    print("GFF3 written.", quote = FALSE)

    } else if (output_type == "both"){
    print("writing fasta...", quote = FALSE)
    Biostrings::writeXStringSet(seq, paste(file_name, ".fa", sep = ""), format = "fasta")
    print("fasta writen.", quote = FALSE)

    print("writing GFF3...", quote = FALSE)
    rtracklayer::export(tx_gff, paste(file_name, ".gff3", sep = ""), format = "gff3")
    print("GFF3 written.", quote = FALSE)

    } else
    stop("not appropriate format, please use \"fa\", \"gff3\" or \"both\"", quote = FALSE)


  print(paste("Querried ", length(tx_gff), " of ", length(tx_list), " requested genes.", sep = ""), quote = FALSE)
  print(paste("Querried ", sum(Biostrings::width(seq))/1000, " KB of sequence.", sep = ""), quote = FALSE)
}
#write_Sequence(hs_gff, hs_tx, "CDS", "mydata/hs_test", "gff3")
#write_Sequence(mm_gff, mm_tx, "CDS", "mydata/mm_test", "gff3")
#This creates a dataframe of the transcripts with the longest features for each gene in a TxDb object
#this takes a minute or two so its better to run it once.

make_longest_df <- function(TxDb_gff){ 
  #identify transcript with longest feature for each gene

  len_df <- GenomicFeatures::transcriptLengths(TxDb_gff, 
                              with.cds_len = TRUE, 
                              with.utr5_len = TRUE, 
                              with.utr3_len = TRUE) %>% 
    dplyr::as_tibble() %>% 
    dplyr::mutate(gene_id = sub("\\..*", "", gene_id))

  longest_tx <- len_df %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::top_n(1, tx_len) %>% 
    dplyr::rename("whole" = "tx_name") %>% 
    dplyr::select(gene_id, whole)

  longest_tx <- len_df %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::top_n(1, cds_len) %>% 
    dplyr::rename("CDS" = "tx_name") %>% 
    dplyr::select(gene_id, CDS) %>% 
    dplyr::left_join(., longest_tx)

  longest_tx <- len_df %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::top_n(1, utr5_len) %>% 
    dplyr::rename("UTR5" = "tx_name") %>% 
    dplyr::select(gene_id, UTR5) %>% 
    dplyr::left_join(., longest_tx)

  longest_tx <- len_df %>%
    dplyr::group_by(gene_id) %>% 
    dplyr::top_n(1, utr3_len) %>% 
    dplyr::rename("UTR3" = "tx_name") %>% 
    dplyr::select(gene_id, UTR3) %>% 
    dplyr::left_join(., longest_tx)

  longest_tx <- longest_tx %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::summarize(whole = dplyr::first(whole), 
              CDS = dplyr::first(CDS), 
              UTR5 = dplyr::first(UTR5), 
              UTR3 = dplyr::first(UTR3))

}
#longest_mm <- make_longest_df(mm_f_gff) #18s
#longest_hs <- make_longest_df(hs_f_gff) #5.5min
#This creates a dataframe of the transcripts with the median length features for each gene in a TxDb object
#this takes a minute or two so its better to run it once.
make_median_df <- function(TxDb_gff){ 
  #identify transcript with longest feature for each gene

  len_df <- GenomicFeatures::transcriptLengths(TxDb_gff, 
                              with.cds_len = TRUE, 
                              with.utr5_len = TRUE, 
                              with.utr3_len = TRUE) %>% 
    dplyr::as_tibble() %>% 
    dplyr::mutate(gene_id = sub("\\..*", "", gene_id))

  #ignore isoforms with no feature when calculating median (many transcripts lack CDS, UTR3 and UTR5 features)
  len_df[len_df == 0] <- NA

  median_tx <- len_df %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::filter(abs(tx_len - median(tx_len, na.rm = TRUE)) == min(abs(tx_len - median(tx_len, na.rm = TRUE)), na.rm = TRUE)) %>% 
    dplyr::rename("whole" = tx_name) %>% 
    dplyr::select(gene_id, whole)

  median_tx <- len_df %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::filter(abs(cds_len - median(cds_len, na.rm = TRUE)) == min(abs(cds_len - median(cds_len, na.rm = TRUE)), na.rm = TRUE)) %>% 
    dplyr::rename("CDS" = tx_name) %>% 
    dplyr::select(gene_id, CDS) %>% 
    dplyr::left_join(., median_tx)

  median_tx <- len_df %>%
    dplyr::group_by(gene_id) %>% 
    dplyr::filter(abs(utr5_len - median(utr5_len, na.rm = TRUE)) == min(abs(utr5_len - median(utr5_len, na.rm = TRUE)), na.rm = TRUE)) %>% 
    dplyr::rename("UTR5" = tx_name) %>% 
    dplyr::select(gene_id, UTR5) %>% 
    dplyr::left_join(., median_tx)

  median_tx <- len_df %>% 
    dplyr::group_by(gene_id) %>% 
    dplyr::filter(abs(utr3_len - median(utr3_len, na.rm = TRUE)) == min(abs(utr3_len - median(utr3_len, na.rm = TRUE)), na.rm = TRUE)) %>% 
    dplyr::rename("UTR3" = tx_name) %>% 
    dplyr::select(gene_id, UTR3) %>% 
    dplyr::left_join(., median_tx)


  median_tx <- median_tx %>% 
    dplyr::group_by(gene_id) %>% 
   dplyr:: summarize(whole = dplyr::first(whole), 
              CDS = dplyr::first(CDS), 
              UTR5 = dplyr::first(UTR5), 
              UTR3 = dplyr::first(UTR3))

}
#median_mm <- make_median_df(mm_f_gff)
#median_hs <- make_median_df(hs_f_gff)
#This takes a gene list and returns a list of transcripts with the longest or median length feature of interest

#hs_gene <- c("ENSG00000000419.12", "ENSG00000001167.14", "ENSG00000000938.13")

gene2Tx <- function(length_df, gene_list, seq_type){
   #Check that gene list contains genes

  if (all(grepl("ENST", gene_list)) | all(grepl("ENSMUST", gene_list))){
         stop("Please ensure the gene list contains gene IDs and not transcript IDs.") 
  }

  #Make sure df and gene list are the same species

  ifelse (all(grepl("ENSG", gene_list)), 
         (list_species = "human"), 
         ifelse (all(grepl("ENSMUSG", gene_list)), 
                (list_species = "mouse"), 
                (list_species = "unknown")))

  ifelse (grepl("ENSG", length_df$gene_id[1]), 
         (gff_species = "human"), 
         ifelse (grepl("ENSMUSG", length_df$gene_id[1]),
                (gff_species = "mouse"), 
                (gff_species = "unknown")))

  if (list_species != gff_species){
    stop("Please ensure the gene list and gff are of the same species.")
  }

  #create longest transcript list from gene list

  tx_list <- length_df %>% dplyr::filter(gene_id %in% gene_list) %>% dplyr::pull(., seq_type)
  return(tx_list)

}
#gene2Tx(longest_hs, hs_gene, "UTR3")
# This will return transcripts represetning all other genes not in gene_list
#this isnt really recommended as not all genes will be exprsesed in a given system.

get_ctrl_tx <- function(length_df, gene_list, seq_type){
  #Check that gene list contains genes

  if (all(grepl("ENST", gene_list)) | all(grepl("ENSMUST", gene_list))){
         stop("Please ensure the gene list contains gene IDs and not transcript IDs.") 
  }
  #Make sure df and gene list are the same species

  ifelse (all(grepl("ENSG", gene_list)), 
         (list_species = "human"), 
         ifelse (all(grepl("ENSMUSG", gene_list)), 
                (list_species = "mouse"), 
                (list_species = "unknown")))

  ifelse (grepl("ENSG", length_df$gene_id[1]), 
         (gff_species = "human"), 
         ifelse (grepl("ENSMUSG", length_df$gene_id[1]),
                (gff_species = "mouse"), 
                (gff_species = "unknown")))

  if (list_species != gff_species){
    stop("Please ensure the gene list and gff are of the same species.")
  }

  #get longest transcript list of alll other genes not in gene list
  `%out%` = Negate(`%in%`)
  ctrl_tx_list <- length_df %>% dplyr::filter(gene_id %out% gene_list) %>% dplyr::pull(., seq_type)
  return(ctrl_tx_list)

}
mm_case_gene <- c("ENSMUSG00000097392", "ENSMUSG00000025607", "ENSMUSG00000030671", "ENSMUSG00000034764", "ENSMUSG00000116215", "ENSMUSG00000039556", "ENSMUSG00000066510", "ENSMUSG00000018160", "ENSMUSG00000114306", "ENSMUSG00000028277", "ENSMUSG00000037216", "ENSMUSG00000032299") #12genes
get_ctrl_tx(longest_mm, mm_case_gene, "UTR3")


TaliaferroLab/FeatureReachR documentation built on Aug. 15, 2021, 2:21 p.m.