#' @name pharse_gtf
#' @title pharse annotation from local gtf file
#' @description prase txdb, gene annotation from local gtf file
#'
#' @param gtf character, file path to GTF file
#' @param species character, one of hs, mm
#' @param geonme character, genome build version like "GRCh38"
#' @param datasource character, source of data
#' @param version character, data source version
#' @param server character, data source links
#' @param fp character vector, add fluoresecent proteins info
#'
#' @importFrom GenomicFeatures makeTxDbFromGFF
#' @importFrom AnnotationDbi keys saveDb
#' @importFrom biomaRt useMart getBM getLDS
#' @importFrom dplyr as_tibble distinct bind_rows
#' @importFrom tibble tibble
#'
#' @return a list, contains txdb, tx2gene, annotation
#'
#' @export
#'
#' @examples
#' GRCh38 <- gtf_to_annotation("Homo_sapiens.GRCh38.95.gtf", species="hs", geonme="GRCh38")
#' GRCh38_txdb <- GRCh38$txdb
#' AnnotationDbi::saveDb(GRCh38_txdb, "txdb.ensembl.95.GRCh38.sqlite")
#' GRCh38_tx2gene <- GRCh38$tx2gene
#' GRCh38_annotation <- GRCh38$annotation
#'
pharse_gtf <- function(gtf,
species="hs",
geonme="GRCh38",
datasource="Ensembl",
version="Ensembl v95",
server="asia.ensembl.org",
fp=c("EGFP", "mCherry", "tdTomato",
"ZsGreen", "turboGFP", "mNeonGreen",
"mScarleti", "WPRE")
) {
if(!(species %in% c("hs", "mm"))) {
stop("incorrect species parameter, please check it") }
print("prepere biomart")
if(species=="hs") {
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
host=server)
attrs <- c("ensembl_gene_id", "external_gene_name", "hgnc_symbol",
"entrezgene_id", "gene_biotype",
"description", "chromosome_name", "strand")
organism <- "Homo sapiens"
} else {
mart <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "mmusculus_gene_ensembl",
host=server)
attrs <- c("ensembl_gene_id", "external_gene_name", "mgi_symbol",
"entrezgene_id", "gene_biotype",
"description", "chromosome_name", "strand")
organism <- "Mus musculus"
}
print("Prepare txdb object")
meta_data <- tibble(name=c("Genome", "Version", "URL"),
value=c(geonme, version, server))
txdb <- GenomicFeatures::makeTxDbFromGFF(file=gtf, format="auto",
metadata=meta_data,
dataSource=datasource,
organism=organism)
print("extract transcripts and gene ids")
txname <- AnnotationDbi::keys(txdb, keytype="TXNAME")
tx2gene <- AnnotationDbi::select(txdb, keys=txname,
columns=c("TXNAME", "GENEID"),
keytype="TXNAME") %>%
distinct(TXNAME, .keep_all=T) %>% as_tibble()
print("annotate genes from biomaRt")
annotation <- biomaRt::getBM(attributes=attrs, filters="ensembl_gene_id",
values=unique(tx2gene$GENEID), bmHeader=F,
uniqueRows=T, mart=mart) %>%
distinct(ensembl_gene_id, .keep_all=T) %>% as_tibble()
# add custom fluorescence protein info
if(!is.null(fp)) {
print("add fluorescence protein info")
t2g.modify <- tibble(TXNAME=fp, GENEID=fp)
tx2gene <- bind_rows(tx2gene, t2g.modify)
if(species=="hs") {
annotation.modify <- tibble(ensembl_gene_id=fp,
external_gene_name=fp,
hgnc_symbol=fp,
entrezgene_id=rep(NA, length(fp)),
gene_biotype=rep("protein-coding", length(fp)),
description=rep("use defined protein", length(fp)),
chromosome_name=NA,
strand=NA
)
} else {
annotation.modify <- tibble(ensembl_gene_id=fp,
external_gene_name=fp,
mgi_symbol=fp,
entrezgene_id=rep(NA, length(fp)),
gene_biotype=rep("protein-coding", length(fp)),
description=rep("use defined protein", length(fp)),
chromosome_name=NA,
strand=NA
)
}
annotation <- bind_rows(annotation, annotation.modify)
}
print("successfully created tx2gene and annotation files")
return(list(txdb=txdb, tx2gene=tx2gene, annotation=annotation))
}
## not run by user
if(FALSE) {
## gene annotation
# human
GRCh38 <- gtf_to_annotation("~/Desktop/Homo_sapiens.GRCh38.95.gtf",
species="hs", geonme="GRCh38")
GRCh38_txdb <- GRCh38$txdb
GRCh38_tx2gene <- GRCh38$tx2gene
GRCh38_annotation <- GRCh38$annotation
# mouse
GRCm38 <- gtf_to_annotation("~/Desktop/Mus_musculus.GRCm38.95.gtf",
species="mm", geonme="GRCm38")
GRCm38_txdb <- GRCm38$txdb
GRCm38_tx2gene <- GRCm38$tx2gene
GRCm38_annotation <- GRCm38$annotation
# save data to disk
save(GRCh38_txdb, file="data/GRCh38_txdb.rda")
save(GRCh38_tx2gene, file="data/GRCh38_tx2gene.rda")
save(GRCh38_annotation, file="data/GRCh38_annotation.rda")
save(GRCm38_txdb, file="data/GRCm38_txdb.rda")
save(GRCm38_tx2gene, file="data/GRCm38_tx2gene.rda")
save(GRCm38_annotation, file="data/GRCm38_annotation.rda")
AnnotationDbi::saveDb(GRCh38_txdb, file="~/Desktop/txdb.ensembl.95.GRCh38.sqlite")
AnnotationDbi::saveDb(GRCm38_txdb, file="~/Desktop/txdb.ensembl.95.GRCm38.sqlite")
writexl::write_xlsx(list(GRCh38_tx2gene=GRCh38_tx2gene, GRCh38_annotation=GRCh38_annotation,
GRCm38_tx2gene=GRCm38_tx2gene, GRCm38_annotation=GRCm38_annotation),
path="~/Desktop/annotation.ensembl.95.xlsx")
## orthlog conversion
attrs <- c("ensembl_gene_id", "external_gene_name", "entrezgene_id")
mart.hs <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "hsapiens_gene_ensembl",
host="asia.ensembl.org")
mart.mm <- biomaRt::useMart("ENSEMBL_MART_ENSEMBL", "mmusculus_gene_ensembl",
host="asia.ensembl.org")
hs_mm <- biomaRt::getLDS(attributes=attrs, filters="ensembl_gene_id",
values=unique(GRCh38_tx2gene$GENEID), mart=mart.hs,
attributesL=attrs, martL=mart.mm)
colnames(hs_mm) <- c("ensembl_gene_id.hs", "external_gene_name.hs", "entrezgene_id.hs",
"ensembl_gene_id.mm", "external_gene_name.mm", "entrezgene_id.mm")
mm_hs <- biomaRt::getLDS(attributes=attrs, filters="ensembl_gene_id",
values=unique(GRCm38_tx2gene$GENEID), mart=mart.mm,
attributesL=attrs, martL=mart.hs)
colnames(mm_hs) <- c( "ensembl_gene_id.mm", "external_gene_name.mm", "entrezgene_id.mm",
"ensembl_gene_id.hs", "external_gene_name.hs", "entrezgene_id.hs")
mm_hs <- dplyr::select(mm_hs, 4:6, 1:3)
ortholog <- bind_rows(hs_mm, mm_hs) %>% as_tibble()
ortholog <- distinct(ortholog, ensembl_gene_id.hs, ensembl_gene_id.mm,
external_gene_name.hs, external_gene_name.mm,
entrezgene_id.hs, entrezgene_id.mm, .keep_all=T)
# save data to disk
save(ortholog, file="data/GRC38_ortholog.rda")
writexl::write_xlsx(ortholog,
path="~/Desktop/ortholog.ensembl.95.xlsx")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.