# biomaRt Conversion between Gene Symbol and Entrez IDs
# Gabriel Odom
# 2018-07-26
###### Create Matching Table ################################################
# Use this walkthrough as a guide:
# https://bioconductor.org/packages/devel/bioc/vignettes/biomaRt/inst/doc/biomaRt.html
### Vector of Genes ###
library(pathwayPCA)
wikipHuman_pathwaySet <- read_gmt("extdata/wikipathways_human.gmt")
allEntrezIDs_char <- unique(
unname(unlist(wikipHuman_pathwaySet$pathways))
)
allEntrezIDs_char <- gsub("\r", "", allEntrezIDs_char)
### BioMart Database and Dataset ###
library(biomaRt)
# Code from James:
mart <- useEnsembl(
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl"
)
### BioMart Query ###
martAttr <- listAttributes(mart = mart)
martFiltr <- listFilters(mart = mart)
humanID_df <- getBM(
attributes = c("entrezgene", "hgnc_symbol", "ensembl_gene_id", "go_id"),
filter = "entrezgene",
values = allEntrezIDs_char,
mart = mart
)
write.csv(humanID_df,
file = "gene_conversion_table.csv",
row.names = FALSE)
###### Convert from EntrezID to Gene Symbol #################################
### Setup ###
library(pathwayPCA)
wikipHuman_pathwaySet <- read_gmt("extdata/wikipathways_human.gmt")
allEntrezIDs_char <- unique(
unname(unlist(wikipHuman_pathwaySet$pathways))
)
allEntrezIDs_char <- gsub("\r", "", allEntrezIDs_char)
library(tidyverse)
humanID_tbl <- read_csv(file = "gene_conversion_table.csv")
### Test Conversion ###
humanID_tbl %>%
filter(entrezgene == as.integer(allEntrezIDs_char[1])) %>%
select(hgnc_symbol) %>%
unique %>%
unlist %>%
unname()
### Test Vector Conversion ###
# EntrezID 640 matches "BLK" and "NA". Other than that, we're all good
humanID_tbl %>%
filter(hgnc_symbol == "BLK") %>%
select(entrezgene, hgnc_symbol) %>%
distinct()
humanID_tbl %>%
filter(entrezgene == 640) %>%
select(entrezgene, hgnc_symbol) %>%
distinct()
humanID_tbl %>%
filter(entrezgene == 640) %>%
distinct() %>%
as.matrix()
# The issue is that there may not be matches to the Ensembl or GO IDs. We'll
# just omit NAs
humanID_tbl %>%
filter(entrezgene %in% as.integer(wikipHuman_pathwaySet$pathways[[1]])) %>%
select(hgnc_symbol) %>%
na.omit %>%
unique %>%
unlist %>%
unname()
### Make a Function ###
convertGene <- function(pathway_vec,
conversion_tbl,
from_char = entrezgene,
to_char = hgnc_symbol){
# browser()
from_char <- enquo(from_char)
to_char <- enquo(to_char)
conversion_tbl %>%
filter(!!from_char %in% pathway_vec) %>%
select(!!to_char) %>%
na.omit %>%
unique %>%
unlist %>%
unname()
}
# Test
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[1]]),
conversion_tbl = humanID_tbl
)
###### Apply the Conversion Function ########################################
### Setup ###
library(pathwayPCA)
library(tidyverse)
wikipHuman_pathwaySet <- read_gmt("extdata/wikipathways_human.gmt")
humanID_tbl <- read_csv(file = "gene_conversion_table.csv")
### Function ###
convertGene <- function(pathway_vec,
conversion_tbl,
from_char = entrezgene,
to_char = hgnc_symbol){
# browser()
from_char <- enquo(from_char)
to_char <- enquo(to_char)
conversion_tbl %>%
filter(!!from_char %in% pathway_vec) %>%
select(!!to_char) %>%
na.omit %>%
unique %>%
unlist %>%
unname()
}
### Apply the Function ###
lapply(wikipHuman_pathwaySet$pathways[1:5], convertGene, humanID_tbl)
cleanWikiP_pathwaySet <- wikipHuman_pathwaySet
a <- Sys.time()
cleanWikiP_pathwaySet$pathways <-
lapply(wikipHuman_pathwaySet$pathways, convertGene, humanID_tbl)
Sys.time() - a # 34 sec
### Clean Wikipathways TERMS Field ###
terms_ls <- strsplit(wikipHuman_pathwaySet$TERMS, "%")
cleanWikiP_pathwaySet$TERMS <- sapply(terms_ls, `[`, 3)
cleanWikiP_pathwaySet$description <- sapply(terms_ls, `[`, 1)
### Save ###
saveRDS(cleanWikiP_pathwaySet, file = "extdata/wikipathways_human_clean.RDS")
###### Write a write_gmt Function ###########################################
### Setup ###
library(pathwayPCA)
library(tidyverse)
wikipHuman_pathwaySet <- read_gmt("extdata/wikipathways_human.gmt")
test_pathwaySet <- wikipHuman_pathwaySet
test_pathwaySet$TERMS <- sapply(terms_ls, `[`, 3)
test_pathwaySet$description <- sapply(terms_ls, `[`, 1)
### Create a File Vector ###
# First create the list in .gmt form:
a1 <- Sys.time()
test_ls <- lapply(1:457, function(i){
desc_char <- wikipHuman_pathwaySet$description[i]
desc_char <- ifelse(is.null(desc_char), "", desc_char)
c(wikipHuman_pathwaySet$TERMS[i],
desc_char,
wikipHuman_pathwaySet$pathways[[i]])
})
Sys.time() - a1 # 0.04464793 sec for 100; 0.04784989, 0.04785013 sec for 457
# Now collapse the list by tabs
a2 <- Sys.time()
test_char <- sapply(test_ls, paste, collapse = "\t")
Sys.time() - a2 # 0.032233 sec; 0.007007837, 0.03924084 sec for 457
# Test writing functions
a3 <- Sys.time()
writeLines(test_char, con = "extdata/test_writeLines.txt")
Sys.time() - a3 # 0.00100112 sec; 0 sec for 457
tWL <- c(0, 0.00100112, 0.03223419, 0.032233, 0.04524803)
mean(tWL)
a4 <- Sys.time()
cat(test_char, file = "extdata/test_cat.txt", sep = "\n")
Sys.time() - a4 # 0.03223395 sec; 0.03123283 sec for 457
tCat <- c(0.03123283, 0.001000881, 0.04884982, 0.06446719, 0.03123307)
mean(tCat)
# writeChar is comparable in speed to the other two methods, but doesn't write
# the file in the proper ragged format
# a5 <- Sys.time()
# writeChar(test_char, con = "extdata/test_writeChar.txt")
# Sys.time() - a5 # x.xx sec for 457
### Make it a Function ###
write_gmt <- function(pathwaySet, path){
### Setup ###
pathways_ls <- pathwaySet$pathways
TERMS_char <- pathwaySet$TERMS
desc_char <- pathwaySet$description
nPaths <- length(pathways_ls)
if(is.null(desc_char)){
desc_char <- rep(NA, nPaths)
}
### Write a list in .gmt form ###
# See the Broad Institute wiki page on .gmt file formats:
# https://tinyurl.com/yaf3exlk
out_ls <- lapply(1:nPaths, function(i){
c(TERMS_char[i], desc_char[i], pathways_ls[[i]])
})
### Collapse the list ###
out_char <- sapply(out_ls, paste, collapse = "\t")
### Write the File ###
writeLines(out_char, con = path)
}
# Test
write_gmt(wikipHuman_pathwaySet, path = "extdata/test.gmt")
###### Write the Cleaned and Translated Wikipathways .gmt File ##############
library(pathwayPCA)
library(tidyverse)
clean_pathwaySet <- readRDS(file = "extdata/wikipathways_human_clean.RDS")
write_gmt <- function(pathwaySet, path){
### Setup ###
pathways_ls <- pathwaySet$pathways
TERMS_char <- pathwaySet$TERMS
desc_char <- pathwaySet$description
nPaths <- length(pathways_ls)
if(is.null(desc_char)){
desc_char <- rep(NA, nPaths)
}
### Write a list in .gmt form ###
# See the Broad Institute wiki page on .gmt file formats:
# https://tinyurl.com/yaf3exlk
out_ls <- lapply(1:nPaths, function(i){
c(TERMS_char[i], desc_char[i], pathways_ls[[i]])
})
### Collapse the list ###
out_char <- sapply(out_ls, paste, collapse = "\t")
### Write the File ###
writeLines(out_char, con = path)
}
# Test
write_gmt(clean_pathwaySet, path = "extdata/wikipathways_human_clean.gmt")
clean2_pathwaySet <- read_gmt("extdata/wikipathways_human_clean.gmt",
description = TRUE)
all.equal(clean_pathwaySet, clean2_pathwaySet) # something is wrong...
all.equal(clean_pathwaySet$pathways[[1]],
clean2_pathwaySet$pathways[[1]])
clean_pathwaySet$pathways[[1]] %in% clean2_pathwaySet$pathways[[1]]
clean_pathwaySet$pathways[[1]][96]
clean2_pathwaySet$pathways[[1]][96]
# It's the "\r" expression. It's coming from the read_gmt function, so I fixed
# that function in the devel branch.
# Try again:
clean2_pathwaySet <- read_gmt("extdata/wikipathways_human_clean.gmt",
description = TRUE)
all.equal(clean_pathwaySet, clean2_pathwaySet) # less stuff is wrong, but still
all.equal(clean_pathwaySet$pathways[[167]],
clean2_pathwaySet$pathways[[167]])
clean_pathwaySet$pathways[[167]] # character(0)
clean2_pathwaySet$pathways[[167]] # NA and something that isn't a gene...
# What's the problem?
original_pathwaySet <- read_gmt("extdata/wikipathways_human.gmt")
original_pathwaySet$pathways[[167]] # matches gene CYP2E1 from uniprot.org
clean_pathwaySet$pathways[[167]] # wrong
clean2_pathwaySet$pathways[[167]] # SUPER WRONG
original_pathwaySet$TERMS[[167]]
clean_pathwaySet$TERMS[[167]]
clean_pathwaySet$description[[167]]
clean2_pathwaySet$TERMS[[167]]
clean2_pathwaySet$description[[167]]
# So I need to check my lookup table for this gene.
humanID_tbl <- read_csv("gene_conversion_table.csv")
humanID_tbl %>%
filter(entrezgene == 1571) %>%
select(entrezgene, hgnc_symbol) %>%
distinct()
# So my gene table is screwy, but the write_gmt function should never write
# character(0) to the pathways field. Let's try it again.
###### write_gmt() Version 2 ################################################
library(pathwayPCA)
library(tidyverse)
clean_pathwaySet <- readRDS(file = "extdata/wikipathways_human_clean.RDS")
write_gmt <- function(pathwaySet, path){
### Setup ###
pathways_ls <- pathwaySet$pathways
TERMS_char <- pathwaySet$TERMS
desc_char <- pathwaySet$description
nPaths <- length(pathways_ls)
if(is.null(desc_char)){
desc_char <- rep(NA, nPaths)
}
### Error Checking ###
stopifnot(nPaths == length(TERMS_char), nPaths == length(desc_char))
pathways_ls[which(lengths(pathways_ls) == 0)] <- NA_character_
TERMS_char[length(TERMS_char) == 0] <- NA_character_
desc_char[length(desc_char) == 0] <- NA_character_
### Write a list in .gmt form ###
# See the Broad Institute wiki page on .gmt file formats:
# https://tinyurl.com/yaf3exlk
out_ls <- lapply(1:nPaths, function(i){
c(TERMS_char[i], desc_char[i], pathways_ls[[i]])
})
### Collapse the list ###
out_char <- sapply(out_ls, paste, collapse = "\t")
### Write the File ###
writeLines(out_char, con = path)
}
# Test
write_gmt(clean_pathwaySet, path = "extdata/wikipathways_human_clean2.gmt")
clean3_pathwaySet <- read_gmt("extdata/wikipathways_human_clean2.gmt",
description = TRUE)
all.equal(clean_pathwaySet, clean3_pathwaySet)
# This difference is due to the length of character(0) vs NA_character. I'm ok
# with that.
###### Troubleshoot Matching Table ##########################################
### Setup ###
library(pathwayPCA)
library(tidyverse)
wikipHuman_pathwaySet <- read_gmt("extdata/wikipathways_human.gmt")
humanID_tbl <- read_csv(file = "gene_conversion_table.csv")
### The Issue ###
# We have problems with the following pathways: 167, 180, 184, 244, 314, and
# 450. They have the following entries in the original pathway set:
wikipHuman_pathwaySet$pathways[[167]] # Matching gene = CYP2E1
wikipHuman_pathwaySet$pathways[[180]] # Matching gene = CYP2E1
wikipHuman_pathwaySet$pathways[[184]] # Matching gene = CYP3A4
wikipHuman_pathwaySet$pathways[[244]] # Matching gene = MTHFR
wikipHuman_pathwaySet$pathways[[314]] # Matching gene = INS
wikipHuman_pathwaySet$pathways[[450]] # Matching gene = CYP2E1
# Can we find the correct matches?
humanID_tbl %>%
filter(entrezgene == 1571) %>%
select(entrezgene, hgnc_symbol) %>%
distinct()
humanID_tbl %>%
filter(entrezgene == 1576) %>%
select(entrezgene, hgnc_symbol) %>%
distinct()
humanID_tbl %>%
filter(entrezgene == 4524) %>%
select(entrezgene, hgnc_symbol) %>%
distinct()
humanID_tbl %>%
filter(entrezgene == 3630) %>%
select(entrezgene, hgnc_symbol) %>%
distinct()
# Yes.
# So what in the hell is the problem???
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[167]]),
conversion_tbl = humanID_tbl
)
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[180]]),
conversion_tbl = humanID_tbl
)
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[184]]),
conversion_tbl = humanID_tbl
)
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[244]]),
conversion_tbl = humanID_tbl
)
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[314]]),
conversion_tbl = humanID_tbl
)
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[450]]),
conversion_tbl = humanID_tbl
)
# My conversion function is even working properly?
convertGene(
pathway_vec = as.integer(wikipHuman_pathwaySet$pathways[[1]]),
conversion_tbl = humanID_tbl
)
# Rebuild "clean" list
cleanWikiP_pathwaySet <- wikipHuman_pathwaySet
cleanWikiP_pathwaySet$pathways <-
lapply(wikipHuman_pathwaySet$pathways, convertGene, humanID_tbl)
cleanWikiP_pathwaySet$pathways[[167]]
cleanWikiP_pathwaySet$pathways[[180]]
cleanWikiP_pathwaySet$pathways[[184]]
cleanWikiP_pathwaySet$pathways[[244]]
cleanWikiP_pathwaySet$pathways[[314]]
cleanWikiP_pathwaySet$pathways[[450]]
# these all work. What the HELL IS GOING ON???
# Clean Wikipathways TERMS Field
terms_ls <- strsplit(wikipHuman_pathwaySet$TERMS, "%")
cleanWikiP_pathwaySet$TERMS <- sapply(terms_ls, `[`, 3)
cleanWikiP_pathwaySet$description <- sapply(terms_ls, `[`, 1)
cleanWikiP2_pathwaySet <- readRDS(file = "extdata/wikipathways_human_clean.RDS")
all.equal(cleanWikiP_pathwaySet, cleanWikiP2_pathwaySet)
all.equal(cleanWikiP_pathwaySet$pathways[[1]],
cleanWikiP2_pathwaySet$pathways[[1]])
all.equal(cleanWikiP_pathwaySet$pathways[[1]][1:89],
cleanWikiP2_pathwaySet$pathways[[1]][1:89])
cleanWikiP_pathwaySet$pathways[[1]][90:97]
cleanWikiP2_pathwaySet$pathways[[1]][90:96]
# Where the hell did the "BCL10" gene come from?
# Also
cleanWikiP2_pathwaySet$pathways[[167]]
cleanWikiP2_pathwaySet$pathways[[180]]
cleanWikiP2_pathwaySet$pathways[[184]]
cleanWikiP2_pathwaySet$pathways[[244]]
cleanWikiP2_pathwaySet$pathways[[314]]
cleanWikiP2_pathwaySet$pathways[[450]]
# Overall, something screwy happened to these fields. I'm going to rerun the
# write_gmt function with a browser
write_gmt <- function(pathwaySet, path){
### Setup ###
pathways_ls <- pathwaySet$pathways
TERMS_char <- pathwaySet$TERMS
desc_char <- pathwaySet$description
nPaths <- length(pathways_ls)
if(is.null(desc_char)){
desc_char <- rep("", nPaths)
}
### Error Checking ###
stopifnot(nPaths == length(TERMS_char), nPaths == length(desc_char))
pathways_ls[which(lengths(pathways_ls) == 0)] <- NA_character_
### Write a list in .gmt form ###
# See the Broad Institute wiki page on .gmt file formats:
# https://tinyurl.com/yaf3exlk
out_ls <- lapply(1:nPaths, function(i){
c(TERMS_char[i], desc_char[i], pathways_ls[[i]])
})
### Collapse the list ###
out_char <- sapply(out_ls, paste, collapse = "\t")
### Write the File ###
writeLines(out_char, con = path)
}
# Test
write_gmt(cleanWikiP_pathwaySet, path = "extdata/test.gmt")
# There's nothing wrong with this file, so I'm just going to overwrite the first
# pathway set .gmt file I wrote.
write_gmt(cleanWikiP_pathwaySet, path = "extdata/wikipathways_human_clean.gmt")
cleanWikiP3_pathwaySet <- read_gmt("extdata/wikipathways_human_clean.gmt",
description = TRUE)
all.equal(cleanWikiP_pathwaySet, cleanWikiP3_pathwaySet)
# YES!!!!!!!!!!!!!!!!!
# Ok, rename it as wikipathways_human_symbol.gmt
write_gmt(cleanWikiP_pathwaySet, path = "extdata/wikipathways_human_symbol.gmt")
# NOTE: the write_gmt function has been added to the devel branch of pathwayPCA
# version 0.0.0.9000.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.