R/phrase_gtf.R

Defines functions pharse_gtf

Documented in pharse_gtf

#' @name pharse_gtf
#' @title pharse annotation from local gtf file
#' @description prase txdb, gene annotation from local gtf file
#'
#' @param gtf character, file path to GTF file
#' @param species character, one of hs, mm
#' @param geonme character, genome build version like "GRCh38"
#' @param datasource character, source of data
#' @param version character, data source version
#' @param server character, data source links
#' @param fp character vector, add fluoresecent proteins info
#'
#' @importFrom GenomicFeatures makeTxDbFromGFF
#' @importFrom AnnotationDbi keys saveDb
#' @importFrom biomaRt useMart getBM getLDS
#' @importFrom dplyr as_tibble distinct bind_rows
#' @importFrom tibble tibble
#'
#' @return a list, contains txdb, tx2gene, annotation
#'
#' @export
#'
#' @examples
#' GRCh38 <- gtf_to_annotation("Homo_sapiens.GRCh38.95.gtf", species="hs", geonme="GRCh38")
#' GRCh38_txdb <- GRCh38$txdb
#' AnnotationDbi::saveDb(GRCh38_txdb, "txdb.ensembl.95.GRCh38.sqlite")
#' GRCh38_tx2gene <- GRCh38$tx2gene
#' GRCh38_annotation <- GRCh38$annotation
#'
pharse_gtf <- function(gtf,
                       species="hs",
                       geonme="GRCh38",
                       datasource="Ensembl",
                       version="Ensembl v95",
                       server="asia.ensembl.org",
                       fp=c("EGFP", "mCherry", "tdTomato",
                            "ZsGreen", "turboGFP", "mNeonGreen",
                            "mScarleti", "WPRE")
                       ) {

  if(!(species %in% c("hs", "mm"))) {
    stop("incorrect species parameter, please check it") }

  print("prepere biomart")
  if(species=="hs") {
    mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
                             host=server)
    attrs <- c("ensembl_gene_id", "external_gene_name", "hgnc_symbol",
               "entrezgene_id", "gene_biotype",
               "description", "chromosome_name", "strand")
    organism <- "Homo sapiens"
  } else {
    mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "mmusculus_gene_ensembl",
                             host=server)
    attrs <- c("ensembl_gene_id", "external_gene_name", "mgi_symbol",
               "entrezgene_id", "gene_biotype",
               "description", "chromosome_name", "strand")
    organism <- "Mus musculus"
  }

  print("Prepare txdb object")
  meta_data <- tibble(name=c("Genome", "Version", "URL"),
                              value=c(geonme, version, server))
  txdb <- GenomicFeatures::makeTxDbFromGFF(file=gtf, format="auto",
                                           metadata=meta_data,
                                           dataSource=datasource,
                                           organism=organism)

  print("extract transcripts and gene ids")
  txname <- AnnotationDbi::keys(txdb, keytype="TXNAME")
  tx2gene <- AnnotationDbi::select(txdb, keys=txname,
                                   columns=c("TXNAME", "GENEID"),
                                   keytype="TXNAME") %>%
    distinct(TXNAME, .keep_all=T) %>% as_tibble()

  print("annotate genes from biomaRt")
  annotation <- biomaRt::getBM(attributes=attrs, filters="ensembl_gene_id",
                               values=unique(tx2gene$GENEID), bmHeader=F,
                               uniqueRows=T, mart=mart) %>%
    distinct(ensembl_gene_id, .keep_all=T) %>% as_tibble()

  # add custom fluorescence protein info
  if(!is.null(fp)) {
    print("add fluorescence protein info")
    t2g.modify <- tibble(TXNAME=fp, GENEID=fp)
    tx2gene <- bind_rows(tx2gene, t2g.modify)

    if(species=="hs") {
      annotation.modify <- tibble(ensembl_gene_id=fp,
                                  external_gene_name=fp,
                                  hgnc_symbol=fp,
                                  entrezgene_id=rep(NA, length(fp)),
                                  gene_biotype=rep("protein-coding", length(fp)),
                                  description=rep("use defined protein", length(fp)),
                                  chromosome_name=NA,
                                  strand=NA
      )
    } else {
      annotation.modify <- tibble(ensembl_gene_id=fp,
                                  external_gene_name=fp,
                                  mgi_symbol=fp,
                                  entrezgene_id=rep(NA, length(fp)),
                                  gene_biotype=rep("protein-coding", length(fp)),
                                  description=rep("use defined protein", length(fp)),
                                  chromosome_name=NA,
                                  strand=NA
      )
    }
    annotation <- bind_rows(annotation, annotation.modify)
  }

  print("successfully created tx2gene and annotation files")
  return(list(txdb=txdb, tx2gene=tx2gene, annotation=annotation))
}





## not run by user
if(FALSE) {
  ## gene annotation

  # human
  GRCh38 <- gtf_to_annotation("~/Desktop/Homo_sapiens.GRCh38.95.gtf",
                              species="hs", geonme="GRCh38")
  GRCh38_txdb <- GRCh38$txdb
  GRCh38_tx2gene <- GRCh38$tx2gene
  GRCh38_annotation <- GRCh38$annotation

  # mouse
  GRCm38 <- gtf_to_annotation("~/Desktop/Mus_musculus.GRCm38.95.gtf",
                              species="mm", geonme="GRCm38")
  GRCm38_txdb <- GRCm38$txdb
  GRCm38_tx2gene <- GRCm38$tx2gene
  GRCm38_annotation <- GRCm38$annotation

  # save data to disk
  save(GRCh38_txdb, file="data/GRCh38_txdb.rda")
  save(GRCh38_tx2gene, file="data/GRCh38_tx2gene.rda")
  save(GRCh38_annotation, file="data/GRCh38_annotation.rda")
  save(GRCm38_txdb, file="data/GRCm38_txdb.rda")
  save(GRCm38_tx2gene, file="data/GRCm38_tx2gene.rda")
  save(GRCm38_annotation, file="data/GRCm38_annotation.rda")
  AnnotationDbi::saveDb(GRCh38_txdb, file="~/Desktop/txdb.ensembl.95.GRCh38.sqlite")
  AnnotationDbi::saveDb(GRCm38_txdb, file="~/Desktop/txdb.ensembl.95.GRCm38.sqlite")

  writexl::write_xlsx(list(GRCh38_tx2gene=GRCh38_tx2gene, GRCh38_annotation=GRCh38_annotation,
                           GRCm38_tx2gene=GRCm38_tx2gene, GRCm38_annotation=GRCm38_annotation),
                      path="~/Desktop/annotation.ensembl.95.xlsx")


  ## orthlog conversion
  attrs <- c("ensembl_gene_id", "external_gene_name",  "entrezgene_id")
  mart.hs <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
                              host="asia.ensembl.org")
  mart.mm <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "mmusculus_gene_ensembl",
                              host="asia.ensembl.org")
  hs_mm <- biomaRt::getLDS(attributes=attrs, filters="ensembl_gene_id",
                           values=unique(GRCh38_tx2gene$GENEID), mart=mart.hs,
                           attributesL=attrs, martL=mart.mm)
  colnames(hs_mm) <- c("ensembl_gene_id.hs", "external_gene_name.hs", "entrezgene_id.hs",
                       "ensembl_gene_id.mm", "external_gene_name.mm", "entrezgene_id.mm")
  mm_hs <- biomaRt::getLDS(attributes=attrs, filters="ensembl_gene_id",
                           values=unique(GRCm38_tx2gene$GENEID), mart=mart.mm,
                           attributesL=attrs, martL=mart.hs)
  colnames(mm_hs) <- c( "ensembl_gene_id.mm", "external_gene_name.mm", "entrezgene_id.mm",
                        "ensembl_gene_id.hs", "external_gene_name.hs", "entrezgene_id.hs")
  mm_hs <- dplyr::select(mm_hs, 4:6, 1:3)
  ortholog <- bind_rows(hs_mm, mm_hs) %>% as_tibble()
  ortholog <-   distinct(ortholog, ensembl_gene_id.hs, ensembl_gene_id.mm,
                         external_gene_name.hs, external_gene_name.mm,
                         entrezgene_id.hs, entrezgene_id.mm, .keep_all=T)

  # save data to disk
  save(ortholog, file="data/GRC38_ortholog.rda")

  writexl::write_xlsx(ortholog,
                      path="~/Desktop/ortholog.ensembl.95.xlsx")
}
soulong/annoHub documentation built on May 12, 2020, 7:22 a.m.