inst/scripts/buildGeneInfoTable/buildGeneInfoTable.R

library(biomaRt)
library(TrenaProjectIGAP)

igap <- TrenaProjectIGAP()

mtx.names <- getExpressionMatrixNames(igap)
geneSymbols <- c()
ens.genes <- c()

for(mtx.name in mtx.names){
   mtx <- getExpressionMatrix(igap, mtx.name)
   gene.1 <- rownames(mtx)[1]
   if(grepl("^ENSG", gene.1)){
     ens.genes <- unique(c(ens.genes, rownames(mtx)))
    } else {
      geneSymbols <- unique(c(geneSymbols, rownames(mtx)))
      }
   } # for mtx.name
length(ens.genes)    # 17003
length(geneSymbols)  # 15328

ensembl.hg38 <- useMart("ensembl", dataset="hsapiens_gene_ensembl")

coi <- c("ensembl_gene_id", "entrezgene", "chromosome_name", "transcription_start_site",
         "strand", "hgnc_symbol", "transcript_biotype", "gene_biotype", "ensembl_transcript_id",
         "transcript_appris", "transcript_tsl", "transcript_gencode_basic")

key <- "ensembl_gene_id"
printf("--------- calling biomart for ensg/symbol/chrom/tss mapping")

trem2.ensg <- "ENSG00000095970"

tbl.geneInfo <- getBM(attributes=coi, filters=key,
                      values=ens.genes,
                      mart=ensembl.hg38)
dim(tbl.geneInfo)

appris <- tbl.geneInfo$transcript_appris
appris <- sub("alternative", "B_alternative", appris)
appris <- sub("principal"  , "A_principal",   appris)
missing <- which(appris == "")
appris[missing] <- "C_missing"

tsl <- tbl.geneInfo$transcript_tsl
missing <- which(tsl == "")
tsl[missing] <- "z_missing"

tbl.geneInfo$appris <- appris
tbl.geneInfo$tsl <- tsl

new.order <- with(tbl.geneInfo, order(ensembl_gene_id, appris, tsl))

tbl <- tbl.geneInfo[new.order,]

dups <- which(duplicated(tbl$ensembl_gene_id))
length(dups)
tbl <- tbl[-dups,]
dim(tbl)
symbols.missing <- which(nchar(tbl$hgnc_symbol) == 0)
length(symbols.missing)  # 1507
tbl$hgnc_symbol[symbols.missing] <- tbl$ensembl_gene_id[symbols.missing]

#  [1] "ensembl_gene_id"
#  [2] "entrezgene"
#  [3] "chromosome_name"
#  [4] "transcription_start_site"
#  [5] "strand"
#  [6] "hgnc_symbol"
#  [7] "transcript_biotype"
#  [8] "gene_biotype"
#  [9] "ensembl_transcript_id"
# [10] "transcript_appris"
# [11] "transcript_tsl"
# [12] "transcript_gencode_basic"
# [13] "appris"
# [14] "tsl"

colnames.oi <- c(1, 3, 4, 5, 6, 2, 13, 14, 9, 7)
tbl.1 <- tbl[, colnames.oi]
colnames(tbl.1) <- c("ensg", "chrom", "tss", "strand", "geneSymbol", "entrez", "appris", "tsl", "transcript", "type")
rownames(tbl.1) <- NULL
tbl.1$chrom <- paste("chr", tbl.1$chrom, sep="")

tbl.geneInfo <- tbl.1
save(tbl.geneInfo, file="../../extdata/geneInfoTable.RData")

annotated.genes <- tbl.geneInfo$ensg
coi.2 <- c("ensembl_gene_id", "transcript_start", "transcript_end") # "genomic_coding_start", "genomic_coding_end", "5_utr_start", "cds_start")

ensg.bug <- "ENSG00000067601"   # no start/end

tbl.geneBounds <- getBM(attributes=coi.2, filters=key,
                        values=annotated.genes,
                        mart=ensembl.hg38)
dim(tbl.geneBounds)


endpoints <- function(ensg.id) {
   tbl <- subset(tbl.geneBounds, ensembl_gene_id==ensg.id)
   data.frame(ensg=ensg.id,
              start=min(tbl$transcript_start, na.rm=TRUE),
              end=max(tbl$transcript_end, na.rm=TRUE),
              stringsAsFactors=FALSE)
   }

ensg.with.bounds <- unique(tbl.geneBounds$ensembl_gene_id)
length(ensg.with.bounds)

x <- lapply(ensg.with.bounds, endpoints)
tbl.bounds <- do.call(rbind, x)
dim(tbl.bounds)
dim(tbl.geneInfo)

tbl.wide <- merge(tbl.geneInfo, tbl.bounds, by="ensg")
preferred.column.order <- c("ensg",
                            "chrom",
                            "start",
                            "end",
                            "tss",
                            "strand",
                            "geneSymbol",
                            "entrez",
                            "appris",
                            "tsl",
                            "transcript",
                            "type"
                            )
tbl.geneInfo <- tbl.wide[, preferred.column.order]
save(tbl.geneInfo, file="../../extdata/geneInfoTable.RData")

# does this ordering work for MEF2C
# tbl.mef2c <- subset(tbl.geneInfo, hgnc_symbol=="MEF2C")
# x <- with(tbl.mef2c, order(appris, tsl, decreasing=FALSE))
# tbl.mef2c <- tbl.mef2c[x,]
# tbl.mef2c$transcription_start_site[1]  # [1] 88904257
#
#   # INPP5D
# tbl.inpp5d <- subset(tbl.geneInfo, hgnc_symbol=="INPP5D")
# x <- with(tbl.inpp5d, order(appris, tsl, decreasing=FALSE))
# tbl.inpp5d <- tbl.inpp5d[x,]
# tbl.inpp5d$transcription_start_site[1]  # [1] 88904257
#
#
#
# tbl.geneInfo$chromosome_name <- sprintf("chr%s", tbl.geneInfo$chromosome_name)
# colnames(tbl.geneInfo)[4] <- "tss"
#
# failures <- which(nchar(tbl.geneInfo$hgnc_symbol) == 0)
# length(failures)
# tbl.geneInfo$hgnc_symbol[failures] <- tbl.geneInfo$ensembl_gene_id[failures]
#
#
# dim(mtx)
# dim(tbl.geneInfo)
# all(rownames(mtx) %in% tbl.geneInfo$ensembl_gene_id)
# length(setdiff(rownames(mtx), unique(tbl.geneInfo$ensembl_gene_id)))  # [1] 61
#
# ensembl.genes.without.geneSymbol <- unique(setdiff(rownames(mtx), tbl.geneInfo$ensembl_gene_id))
# printf("mtx ensembl genes without geneSymbol: %d", length(ensembl.genes.without.geneSymbol))
#
# print(load(system.file(package="TrenaProject", "extdata", "epigenome", "geneHancer.v4.7.allGenes.RData"))) # 61
# dim(tbl.enhancers)
# geneSymbols.without.enhancers <- setdiff(tbl.geneInfo$hgnc_symbol, unique(tbl.enhancers$geneSymbol))
# printf("mtx ensembl genes without geneSymbol in tbl.enhancers: %d", length(geneSymbols.without.enhancers))  # 141
# ensembl.genes.without.enhancers <- sort(unique(c(ensembl.genes.without.geneSymbol,
#                                                  subset(tbl.geneInfo, hgnc_symbol %in% geneSymbols.without.enhancers)$ensembl_gene_id)))
# printf("mtx ensembl genes without enhancers (via geneSymbol): %d", length(ensembl.genes.without.enhancers)) # 1708
#
# goi.syms <- c("MEF2C", "INPP5D")
# goi <- subset(tbl.geneInfo, hgnc_symbol %in% goi.syms)$ensembl_gene_id
#
# configurationFileRead <- TRUE
#
PriceLab/TrenaProjectAD documentation built on July 11, 2022, 3:42 a.m.