knitr::opts_chunk$set(echo = TRUE, collapse = TRUE)
Excel-mogrified gene symbols (mog_map.csv
) was catalogued by importing all gene
symbols into Excel and exporting them in all available date formats. Also, 38,040
gene symbols used in MSigDB database is downloaded and saved in msigdb.v7.0.symbols.gmt
.
library(HGNChelper) readGMT <- function (fname, name.column = 1) { if (!(name.column == 1 | name.column == 2)) stop("name.column should be 1 or 2") x <- readLines(fname) x <- strsplit(x, split = "\t") output <- lapply(x, function(y) y[-1:-2]) names(output) <- make.names(sapply(x, function(y) y[name.column]), unique = TRUE) return(output) } # gmt.files <- dir(pattern = "\\.gmt$") # if(length(gmt.files) == 0) stop(paste("No .gmt files found in", getwd())) # Load all .gmt files gmt.files <- "msigdb.v7.0.symbols.gmt" all.sigs <- lapply(gmt.files, readGMT) all.sigs <- lapply(all.sigs, function(x) unique(unlist(x))) # all.sigs <- lapply(all.sigs, function(x) x[!grepl("^LOC", x)]) names(all.sigs) <- gmt.files all.sigs$OVERALL <- unique(do.call(c, all.sigs)) # Load Excel-mogrified gene symbols mog <- read.csv(system.file("extdata", "mog_map.csv", package = "HGNChelper"), as.is=TRUE)
HGNChelper
Fetch the up-to-date human gene symbol map using HGNChelper::getCurrentHumanMap
.
CurrentHumanMap = getCurrentHumanMap()
Update gene symbols using HGNChelper and check any Excel-mogrified gene symbol.
simplecheck <- sapply(all.sigs, function(x){ tmp <- checkGeneSymbols(x, map = CurrentHumanMap) output <- c(sum(tmp$Approved)/nrow(tmp), sum(!is.na(tmp[, 3]))/nrow(tmp), any(toupper(x) %in% toupper(mog[, 2]))) names(output) <- c("valid.frac", "valid.after.hgnchelper.frac", "any.excel") return(output) }) simplecheck <- t(simplecheck) simplecheck # checkGeneSymbols tmp = checkGeneSymbols(all.sigs[[1]], map = CurrentHumanMap) # the number of invalid gene symbols before sum(tmp$Approved != "TRUE") # the number of invalid gene symbols after sum(is.na(tmp$Suggested.Symbol)) # Example of gene symbols that can't be corrected by HGNGhelper unfixables <- sort(tmp[is.na(tmp[, 3]), 1]) length(unfixables) set.seed(1) sample(unfixables, 10)
Most of unfixable cases are lncRNA. Some are withdrawn gene symbols, LOC- genes. (From NCBI: "Symbols beginning with LOC. When a published symbol is not available, and orthologs have not yet been determined, Gene will provide a symbol that is constructed as 'LOC' + the GeneID. This is not retained when a replacement symbol has been identified, although queries by the LOC term are still supported. In other words, a record with the symbol LOC12345 is equivalent to GeneID = 12345. So if the symbol changes, the record can still be retrieved on the web using LOC12345 as a query, or from any file using GeneID = 12345.")
HGNChelper
GEO platform information is downloaded from https://www.ncbi.nlm.nih.gov/geo/browse/?view=platforms
in five .csv
files and combined/saved as platforms.csv
. As of March 27th, 2020,
there are 20,716 platforms exist in GEO.
dir <- "inst/extdata/GPL_list" gpl_files <- list.files(dir) first <- read.csv(file.path(dir, gpl_files[1]), as.is=TRUE) platform <- as.data.frame(matrix(ncol = ncol(first), nrow = 0)) colnames(platform) <- colnames(first) for (i in seq_along(gpl_files)) { gpl = read.csv(file.path(dir, gpl_files[i]), as.is=TRUE) platform = rbind(platform, gpl) } write.csv(platform, "inst/extdata/platform.csv", row.names = FALSE)
Fetch all the gene symbols found in GPL, store in column 1 of platform.csv
.
library(GEOquery) all.platforms <- read.csv("platform.csv", as.is=TRUE) # human Entrez Gene identifier symbols library(org.Hs.eg.db) hgnc.reference <- as.character(org.Hs.egSYMBOL) names(hgnc.reference) <- NULL
gpls_already_tested.csv
file(This step takes long --> separate R script available as test_GPLs.R
)
library(HGNChelper) raw.platform.dir <- "platforms" dir.create(raw.platform.dir) hgnc.vec.dir <- "hgnc.vecs" dir.create(hgnc.vec.dir) if (file.exists("gpls_already_tested.csv")) { gpls.already.tested <- read.csv("gpls_already_tested.csv", header=TRUE) } else { write.table(t(c("platform", "colname", "frac.hgnc", "nrow", "valid.frac", "valid.after.hgnchelper.frac", "distribution", "submission_date")), file="gpls_already_tested.csv", row.names=FALSE, col.names=FALSE, sep=",") } for (i in 1:nrow(all.platforms)){ gpl <- all.platforms[i, 1] hgnc.vec.file <- paste(hgnc.vec.dir, "/", gpl, "_hgnc.vec.RData", sep="") if(exists("gpls.already.tested") && gpl %in% gpls.already.tested$platform) next gpldat <- try(getGEO(gpl, destdir=raw.platform.dir)) if(class(gpldat) == "try-error") next gpltable <- try(Table(gpldat)) if(class(gpltable) == "try-error") next hgnc.frac <- apply(gpltable, 2, function(x) sum(unique(x) %in% hgnc.reference) / length(unique(x))) # any column containing gene symbol will have >0 value if(any(hgnc.frac > 0.5)) { # updated from `hgnc.frac > 0` # assumed that there is only one 'symbol' column hgnc.vec <- unique(as.character(gpltable[, which.max(hgnc.frac)])) # get rid of anything after a space hgnc.vec <- gsub("[ ].+", "", hgnc.vec) # convert to ascii HGNChelper.output <- checkGeneSymbols(iconv(hgnc.vec, "latin1", "ASCII", ""), map = CurrentHumanMap) # fraction of valid gene symbols BEFORE HGNChelper valid.frac <- sum(HGNChelper.output$Approved) / length(hgnc.vec) # fraction of valid gene symbols AFTER HGNChelper after.HGNChelper.valid.frac <- sum(!is.na(HGNChelper.output$Suggested.Symbol)) / length(hgnc.vec) hgnc.vec <- c(gpldat@header$distribution, gpldat@header$submission_date, hgnc.vec) save(hgnc.vec, file=hgnc.vec.file, compress="bzip2") info.for.file <- t(c(gpl, colnames(gpltable)[which.max(hgnc.frac)], max(hgnc.frac), nrow(gpltable), valid.frac, after.HGNChelper.valid.frac, gpldat@header$distribution, gpldat@header$submission_date)) print(paste(i, gpl, length(hgnc.vec), "HGNC symbols found and saved.")) } else { info.for.file <- t(c(gpl, colnames(gpltable)[which.max(hgnc.frac)], max(hgnc.frac), nrow(gpltable), NA, NA, gpldat@header$distribution, gpldat@header$submission_date)) print(paste(i, gpl, ": No HGNC symbols.")) } write.table(info.for.file, file="gpls_already_tested.csv", append=TRUE, row.names=FALSE, col.names=FALSE, sep=",") }
x <- read.csv("gpls_already_tested.csv", as.is=TRUE) x$valid.after.hgnchelper.frac <- as.numeric(x$valid.after.hgnchelper.frac) x <- x[!is.na(x$valid.frac), ] x <- x[x$valid.frac > 0, ] df <- data.frame(before=cut(100*x$valid.frac, breaks=seq(0, 100, by=20)), fixed=(x$valid.after.hgnchelper.frac - x$valid.frac)/(1-x$valid.frac)) par(mar=c(4,5,2,0.5)) boxplot(fixed ~ before, data=df, main=paste("Correcting the annotations of", nrow(df), "GEO platforms"), xlab="% Valid Before HGNChelper", ylab="Fraction of invalid \n gene symbols fixed", col="grey", boxwex=1.1, varwidth=TRUE)
raw <- read.csv("gpls_already_tested.csv", as.is=TRUE) # 20,713 raws raw$valid.after.hgnchelper.frac <- as.numeric(raw$valid.after.hgnchelper.frac) raw <- raw[!is.na(raw$valid.frac), ] # 2,044 rows raw <- raw[raw$valid.frac > 0.2, ] # 2,043 rows raw$year <- as.numeric(gsub(".+[ ]+", "", raw$submission_date)) n.years <- length(unique(raw$year)) boxplot(valid.frac ~ year, data=raw, boxwex=0.3, ylim = c(0.4, 1), # main="Fig 2. Valid gene symbols for submission year", xlab="Submission Year", ylab="Fraction of valid gene symbols", names=sub("20", "'", sort(unique(raw$year))), col="white") boxplot(valid.after.hgnchelper.frac ~ year, data=raw, ylim = c(0.4, 1), boxwex=0.3, xaxt='n', at=(1:n.years)+0.35, add=TRUE, col="grey") legend("bottomleft", legend=c("Before", "After"), pch=c(0, 15), col=c("black", "grey"), lty=-1, bty='n', cex=1)
png("~/data2/HGNChelper/inst/analyses/Fig1.png", width = 1400, height = 740) boxplot(valid.frac ~ year, data=raw, boxwex=0.3, ylim = c(0.4, 1), # main="Fig 2. Valid gene symbols for submission year", xlab="Submission Year", ylab="Fraction of valid gene symbols", names=sub("20", "'", sort(unique(raw$year))), col="white") boxplot(valid.after.hgnchelper.frac ~ year, data=raw, ylim = c(0.4, 1), boxwex=0.3, xaxt='n', at=(1:n.years)+0.35, add=TRUE, col="grey") legend("bottomleft", legend=c("Before", "After"), pch=c(0, 15), col=c("black", "grey"), lty=-1, bty='n', cex=1) dev.off()
The number of unique platforms (= each dot in the above plot) each year
table(raw$year)
Mouse gene symbols begin with an uppercase letter followed by all lowercase letters except for recessive mutations, which begin with a lowercaase letter.
mouse = c("Pzp", "A2m", "A2mr","SEPT7", "1-Feb", "lrp1", "9/7") checkGeneSymbols(mouse, species = "mouse")
HGNChelper cannot fix mouse gene alias containing uppercase letter in the middle of gene name. For example, "E430008G22Rik" (valid symbol is "Abl1") or "AA536808" (valid symbol is "Abl2").
mouse = c("abl2", "AA536808", "E430008G22Rik", "AbLl", "Cpamd8", "cpamd8", "mug2", "Mug2") checkGeneSymbols(mouse, species = "mouse")
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.