R/gtf_process.R

#' @title get annotation from GTF
#'
#' @description  extract tx2gene and gene annotation file from gtf file
#'
#' @param gtf_file a gtf file, typically downloaded from ensembl(http://www.ensembl.org/info/data/ftp/index.html)
#'
#' @param species it must be one of hs, mm
#'
#' @param datasource dataSrouce, like "ensembl GRCh38.95"
#'
#' @param server define a server which biomaRt will use
#'
#' @param fp if NULL, adding fp info will be omitted, if suppied with a vector(Default), then it will be added.
#'
#' @import biomaRt
#'
#' @return a list with two items, tx2gene was used for conversion between transcript and gene, annotation was gene annotation info
#'
#' @export
#'
#' @examples
#' gtf <- "~/Desktop/Mus_musculus.GRCm38.95.gtf"
#' grcm38 <- gtf_to_annotation(gtf, "mm", "ensembl_GRCm38.95") %>% .[[2]] %>% head()

gtf_to_annotation <- function(gtf_file,
                              species="hs",
                              datasource="ensembl",
                              server="asia.ensembl.org",
                              fp=c("EGFP", "mCherry", "tdTomato", "ZsGreen",
                                   "turboGFP", "mNeonGreen", "mScarleti")) {

  if(!(species %in% c("hs", "mm"))) {
    stop("Incorrect species parameter, please check it!")
  }

  if(species=="hs") {
    mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
                             host=server, ssl.verifypeer=F)
    attrs <- c("ensembl_gene_id", "external_gene_name", "hgnc_symbol", "entrezgene_id", "gene_biotype",
               "description", "chromosome_name", "strand")
    organism <- "Homo sapiens"
  } else {
    mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "mmusculus_gene_ensembl",
                             host=server, ssl.verifypeer=F)
    attrs <- c("ensembl_gene_id", "external_gene_name", "mgi_symbol", "entrezgene_id", "gene_biotype",
               "description", "chromosome_name", "strand")
    organism <- "Mus musculus"
  }

  cat("Prepare txdb object\n")
  txdb <- GenomicFeatures::makeTxDbFromGFF(file=gtf_file, format="auto",
                             dataSource=datasource, organism=organism)
  # AnnotationDbi::keytypes(txdb)
  # AnnotationDbi::columns(txdb)
  cat("Extract transcripts and gene ids\n")
  txname <- AnnotationDbi::keys(txdb, keytype="TXNAME")
  tx2gene <- AnnotationDbi::select(txdb, keys=txname,
                                   columns=c("TXNAME", "GENEID"), keytype="TXNAME") %>%
    dplyr::distinct(TXNAME, .keep_all=T) %>%
    tibble::as_tibble()


  # all.attrs <- listAttributes(mart) %>% as_tibble()
  # all.filters <- listFilters(mart) %>% as_tibble()
  # get attrs
  # attrs to return
  cat("Annotate genes from biomaRt \n")
  annotation <- biomaRt::getBM(attributes=attrs, filters="ensembl_gene_id",
                         values=unique(tx2gene$GENEID), bmHeader=F, uniqueRows=T, mart=mart) %>%
    dplyr::distinct(ensembl_gene_id, .keep_all=T) %>%
    tibble::as_tibble()
  # annotation$description <- sapply(annotation$description, function(x) strsplit(x, " [[]")[[1]][1],
  #                        simplify=T, USE.NAMES=F)
  # colnames(annotation)[7] <- "chromosome"

  if(!is.null(fp)) {
    ## add custom fluorescence protein info
#   fp <- c("EGFP","mCherry", "tdTomato")
    cat("Add fluorescence protein info\n")
    t2g.modify <- dplyr::tibble(TXNAME=fp, GENEID=fp)
    tx2gene <- dplyr::bind_rows(tx2gene, t2g.modify)

    if(species=="hs") {
      annotation.modify <- tibble::tibble(ensembl_gene_id=fp,
                                          external_gene_name=fp,
                                          hgnc_symbol=fp,
                                          entrezgene_id=rep(NA, length(fp)),
                                          gene_biotype=rep("protein-coding", length(fp)),
                                          description=rep("use defined protein", length(fp)),
                                          chromosome_name=NA,
                                          strand=NA
                                          )
    } else {
      annotation.modify <- tibble::tibble(ensembl_gene_id=fp,
                                          external_gene_name=fp,
                                          mgi_symbol=fp,
                                          entrezgene_id=rep(NA, length(fp)),
                                          gene_biotype=rep("protein-coding", length(fp)),
                                          description=rep("use defined protein", length(fp)),
                                          chromosome_name=NA,
                                          strand=NA
                                          )
    }
    annotation <- dplyr::bind_rows(annotation, annotation.modify)

  }

  cat("Successfully created tx2gene and annotation files\n")

  return(list(tx2gene=tx2gene, annotation=annotation))
}



# # # for first time use
#
# gtf_file <- "~/Desktop/Homo_sapiens.GRCh38.95.gtf"
# GRCh38 <- gtf_to_annotation(gtf_file, species="hs")
# GRCh38_tx2gene <- GRCh38$tx2gene
# GRCh38_annotation <- GRCh38$annotation
#
# gtf_file <- "~/Desktop/Mus_musculus.GRCm38.95.gtf"
# GRCm38 <- gtf_to_annotation(gtf_file, species="mm")
# GRCm38_tx2gene <- GRCm38$tx2gene
# GRCm38_annotation <- GRCm38$annotation
#
#
# usethis::use_data(GRCh38_tx2gene, GRCh38_annotation, internal=F, overwrite=T)
# usethis::use_data(GRCm38_tx2gene, GRCm38_annotation, internal=F, overwrite=T)
#
#
# writexl::write_xlsx(list(GRCh38_tx2gene=GRCh38_tx2gene, GRCh38_annotation=GRCh38_annotation,
#                          GRCm38_tx2gene=GRCm38_tx2gene, GRCm38_annotation=GRCm38_annotation),
#                      path="~/Desktop/annotation_GRC38_ensembl.95.xlsx")
soulong/AnnotationHub documentation built on July 6, 2019, 3:17 a.m.