Nothing
#'GEO download/processing through GEOquery
#' @export
#' @author Francesco Vallania, Andrew Tam, Ravi Shankar, Aditya M. Rao
#'
#' @description Creates MetaIntegrator formatted objects by downloading and formatting data from GEO.
#' @param gseVector a vector of GSE ids (each a string)
#' @param formattedNames a vector of formatted names corresponding to the GSE ids. Default: gseVector
#' @param qNorm perform quantile normalization of expression data within a dataset or not. Default: FALSE
#' @param ... will pass additional parameters to getGEO, including \code{destdir}, which specifies download location
#'
#' @details Note: if you get the error "Error: Couldn't find driver MySQL" then just library(RMySQL) and then re-run getGEOData
#' @return a Pre-Analysis MetaObject containing the datasets loaded in $originalData
#' @export
getGEOData <- function(gseVector, formattedNames=gseVector, qNorm=FALSE, ...){
#bugfix - sometimes there's a MySQL error if this isn't run
requireNamespace(package = "RMySQL", quietly=TRUE)
#Correct gses to upper case
gseVector <- toupper(gseVector)
originalData <- lapply(gseVector,.GEOqueryCreateGEM, qNorm, ...)
names(originalData) <- gseVector
originalData <- .geoquery_gems_2_singlelist(originalData, formattedNames)
#add sample names on class
originalData <- lapply(originalData,
function(i){
names(i$class) <- row.names(i$pheno)
return(i)
})
#return the data in a format compatible with MetaIntegrator
return(list(originalData = originalData))
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#.GEOqueryCreateGEM
#~by Andrew Tam + Francesco Vallania [July 2014]
#~this will fetch and annotate a GEM object(s) directly from GEO using the GSE ID!!!
#~[requires GEOquery package]
#######################################################################################
.GEOqueryCreateGEM <- function(id, qNorm, ...){
#make sure id starts with GSE or GDS
if(length(grep("^GSE",id,invert=T))==0 & length(grep("^GDS",id,invert=T))==0){
id <- paste("GSE",id,sep="");
}
#fetch GEO object from GSEID
#run function with tryCatch
#note-> picking up only one GSE now. Fix this for multiple GPLs
GEO <- tryCatch(GEOquery::getGEO(id, ...),
error = function(e) e);
if(class(GEO)[[1]]=="GDS") {
GEO <- list(a=GEOquery::GDS2eSet(GEO))
names(GEO)[[1]] <- id
}
#declare array object
gem <- NULL;
#if object was downloaded then go for it!
if(class(GEO[[1]])[1] == "ExpressionSet"){
#lapply function to GEM object
gem <- lapply(GEO,
function(i){
.GEOqueryGEO2GEM(i,id, qNorm);
});
}
#return dataset
return(gem);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#GEOqueryGEO2GEM
#~by Andrew Tam + Francesco Vallania [July 2014] + Ravi Shankar [August 2018] + Aditya M. Rao [2019]
#~this will convert the GEO object to GEM for our convenience!
#~[requires GEOquery package]
#######################################################################################
.GEOqueryGEO2GEM <- function(GEO,id,qNorm){
#generate pheno matrix
pheno = Biobase::pData(GEO);
#generate expression matrix
expr = Biobase::exprs(GEO);
if(class(expr[,1])=='character'){
#return the new numeric matrix
exprN = matrix(as.numeric(expr),nrow = nrow(expr))
row.names(exprN) <- row.names(expr)
colnames(exprN) <- colnames(expr)
#replace original expression matrix
expr <- exprN
}
#create gem object
gem = list(name = id,
load_comment = "Loaded with GEOquery");
#parse out fData
fData_out <- .GEO_fData_key_parser(Biobase::fData(GEO))
#get platform information
platform_info <- NA
if (length(GEO@annotation) > 0 && grepl("^GPL", GEO@annotation)) {
platform_info <- GEO@annotation
}else {
if ("platform" %in% names(GEO@experimentData@other)) {
platform_info <- GEO@experimentData@other$platform
}
}
#update gem object
gem$class = rep(0, nrow(pheno))
gem$keys = fData_out$keys
if(length(gem$keys)>0){
gem$key_comment = fData_out$comment
}else{
gem$keys = NULL
gem$key_comment = "Annotation absent"
}
gem$pheno = pheno
gem$platform = platform_info
#check if there is a expression matrix
if (dim(expr)[1] > 0) {
gem$expr = expr
gem$exp_comment = "Data in log scale"
#check for log scale
if (.GEM_log_check(gem) == FALSE) {
min_value <- min(gem$expr, na.rm = T)
# There is possibility of negative values in processed non-normalized
# data from the background subtraction algorithm. For e.g., GSE103842 (Illumina data)
# if minimum value is negative, add small offset to all values to make them all positive
if (min_value < 0) {
gem$expr <- gem$expr - min_value + 1
}
gem$expr <- log2(gem$expr)
gem$exp_comment = "Data was not in log scale originally"
}
# perform quantile normalization if desired
if (qNorm == TRUE) {
col_names <- colnames(gem$expr)
row_names <- rownames(gem$expr)
gem$expr <- preprocessCore::normalize.quantiles(gem$expr)
colnames(gem$expr) <- col_names
rownames(gem$expr) <- row_names
}
}else{
gem$exp_comment = "Expression data is missing"
}
#return dataset
return(gem);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#GEO_fData_key_parser
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to convert the fData input matrix into usable annotation
#~[note-> GEO_fData is fData(GEO) where GEO-> output of getGEO on a GSE object, this
#~corresponds to its SOFT file]
#######################################################################################
.GEO_fData_key_parser <- function(GEO_fData){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Set input variables
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
symbol_pos <- NULL;
comment <- NULL;
keys <- NULL;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Check fData length
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
check_len <- 0;
if(dim(GEO_fData)[2]>0){check_len <- 1};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Case1-> Simple search
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if(check_len==1){
#identify position that contains
#gene position
#added further term of search
symbol_pos <- which(sapply(1:ncol(GEO_fData),
function(i)
length(grep("^GAPDH(S)*$|^ICAM1$|^Gapdh(s)*$",GEO_fData[,i]))>0));
#redundant fields-> pick the first one
if(length(symbol_pos)>1){
#if there are multiple ones pick one that matches most of the probes
symbol_pos <- symbol_pos[which.max(sapply(symbol_pos,
function(i)
length(grep("^GAPDH(S)*$|^ICAM1$|^Gapdh(s)*$",
GEO_fData[,i]))))];
#symbol_pos <- symbol_pos[1];
}
#generate keys vector if succesfull
if(length(symbol_pos)==1){
keys <- as.character(GEO_fData[[symbol_pos]]);
names(keys) <- as.character(GEO_fData[[1]]);
comment <- "Typical annotation";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Case2-> //[GENE NAME]// format
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if(is.null(keys) && check_len==1){
#absent field
if(length(symbol_pos)<1){
#identify whether it follows the //[GENE NAME]// format
symbol_pos <- which(sapply(1:ncol(GEO_fData),
function(i)
length(grep("\\/\\/ GAPDH(S)* \\/\\/",GEO_fData[,i]))>0));
#pick first one
if(length(symbol_pos)> 1){symbol_pos <- symbol_pos[1]};
if(length(symbol_pos)==1){
#get row [get first one is there are multiples]
row_pos <- grep("\\/\\/ GAPDH(S)* \\/\\/",GEO_fData[,symbol_pos]);
if(length(row_pos)>1){row_pos <- row_pos[1]};
#get exact split position [get first one is there are multiples]
split_pos <- sapply(strsplit(as.character(GEO_fData[row_pos,symbol_pos])," /+ "),
function(i)
grep("^GAPDH(S)*$",i));
if(length(split_pos)>1){split_pos <- split_pos[1]};
#now split all lines and generate new vector
keys <- sapply(strsplit(as.character(GEO_fData[,symbol_pos])," /+ "),
function(i)
i[split_pos]);
names(keys) <- as.character(GEO_fData[[1]]);
comment <- "//[GENE NAME]// format";
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Case3-> genes are encoded in ENSG ID [use Human for this]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if(is.null(keys) && check_len==1){
if(length(symbol_pos)<1){
#identify whether it is stored as ENSG
symbol_pos <- which(sapply(1:ncol(GEO_fData),
function(i)
length(grep("^ENSG00000111640$",GEO_fData[,i]))>0));
if(length(symbol_pos)> 1){symbol_pos <- symbol_pos[1]};
if(length(symbol_pos)==1){
#convert genes into gene names
ens_table <- .ensembl_ensgID_table();
probe2table <- match(GEO_fData[,symbol_pos],ens_table$stable_id);
#store data
keys <- ens_table$display_label[probe2table];
names(keys) <- as.character(GEO_fData[[1]]);
comment <- "Human ENSG format";
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Case4-> genes are encoded in REFSEQ format [use Human for this]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if(is.null(keys) && check_len==1){
if(length(symbol_pos)<1){
#identify whether it is stored as REFSEQ
symbol_pos <- which(sapply(1:ncol(GEO_fData),
function(i)#bug fix-> create pattern match for correctly mapping REFSEQ
length(grep("^NM_002046.*$",GEO_fData[,i]))>0))
if(length(symbol_pos)> 1){symbol_pos <- symbol_pos[1]}
if(length(symbol_pos)==1){
#convert genes into gene names by matching accession numbers
#bug fix -> removed versioning so that it is now possible to match stuff
ucsc_table <- .ucsc_refseq_table()
#if REFSEQ IDs have _at at the end add it to the table
if(length(grep("_at",GEO_fData[1,symbol_pos]))>0){
ucsc_table$name <- gsub("$","_at",ucsc_table$name)
}
probe2table <- match(gsub("\\..$","",GEO_fData[,symbol_pos]),
ucsc_table$name)
#store data
keys <- ucsc_table$name2[probe2table]
names(keys) <- as.character(GEO_fData[[1]])
comment <- "Human REFSEQ format"
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Case5-> genes are encoded in GenBank format [use Human for this]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if(is.null(keys) && check_len==1){
if(length(symbol_pos)<1){
#identify whether it is stored as ENSG
symbol_pos <- which(sapply(1:ncol(GEO_fData),
function(i)
length(grep("^(M61854|M29696)$",GEO_fData[,i]))>0));
if(length(symbol_pos)> 1){
symbol_pos <- symbol_pos[1]
}
if(length(symbol_pos)==1){
#bug fix-> certain probes map to multiple genes
gb_input_table <- GEO_fData[,symbol_pos]
id_input_table <- GEO_fData[[1]]
#
if(length(grep(",",GEO_fData[,symbol_pos]))>0){
#create temp lists
gb_input_tl <- sapply(gb_input_table, function(i) strsplit(as.character(i),",")[[1]])
id_input_tl <- sapply(1:length(gb_input_tl),function(i) rep(id_input_table[i],length(gb_input_tl[[i]])))
#replace final values
gb_input_table <- unlist(gb_input_tl)
id_input_table <- unlist(id_input_tl)
}
#convert genes into gene names by matching accession numbers
ucsc_table <- .ucsc_genbank_table()
probe2table <- match(gsub("\\..*$","",gb_input_table),ucsc_table$acc)
#store data
keys <- ucsc_table$name[probe2table]
names(keys) <- as.character(id_input_table)
comment <- "Human GenBank format"
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Case6-> genes are encoded in EntrezID format [use Human for this]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if(is.null(keys) && check_len==1){
if(length(symbol_pos)<1){
#intentify the presense of EntrezID entries
entrez_pos <- which(sapply(1:ncol(GEO_fData),
function(i)
length(grep("^960$",GEO_fData[,i]))>0))
if(length(entrez_pos)==1){
#get table
entrezid_table <- .ensembl_entrez_connector()
probe2table <- match(GEO_fData[,entrez_pos],entrezid_table$EntrezID)
#store data
keys <- entrezid_table$GeneName[probe2table]
names(keys) <- as.character(GEO_fData[[1]])
comment <- "Human EntrezID format"
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Case7-> MTb case handler (Rohit) using ORF column
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if(is.null(keys) && check_len==1){
if(length(symbol_pos)<1){
#look for ORF column
orf_position <- which(names(GEO_fData)=='ORF')
#store data in case ORF column was found
if(length(orf_position)>0){
keys <- GEO_fData[,orf_position]
names(keys) <- as.character(GEO_fData[[1]])
comment <- "MTb annotation with ORF"
#remove empty crap
keys[which(keys=="")] <- NA
}
}
}
#Case 0a-> return absence even if there is a fData file
if(is.null(keys) && check_len==1){
symbol_pos <- 1;
comment <- "Annotation absent";
keys <- as.character(GEO_fData[[symbol_pos]]);
names(keys) <- as.character(GEO_fData[[1]]);
}
#Case 0b-> return absence of fData file
if(is.null(keys) && check_len==0){
comment <- "fData file not available";
}
#replace /// into a , to make it compatible
keys <- gsub(";| /// ",",",keys)
#remove probes that do not map back to genes
if(comment!= "MTb annotation with ORF"){
genes <- .ensembl_ensgID_table()
fake_genes <- which(!(unlist(sapply(keys,function(i)strsplit(i,",")[[1]][1]))
%in%
genes$display_label))
keys[fake_genes] <- NA
}
#if key vector is indeed present
if(!is.null(keys)){
#gene_list <- '/projects/fvallania/global_files';
#get gene list form ensembl [replace with our own]
#ens_g <- ensembl_gene_connector_hs76();
#geneset <- unique(ens_g$display_label);
#set non gene keys
#key_map <- match(geneset,keys);
#keys[which(is.na(key_map))] <- NA;
}
#return output
return(list(comment=comment,
keys = keys));
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#GEM GPL annotation functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#.ensembl_ensgID_table
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to load an ENSGid/HUGOid table [from EnsEMBL v84]
#######################################################################################
.ensembl_ensgID_table <- function(){
#return table from data
return(ens_ensgID_table)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#.ensembl_entrez_table
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to load an Entrez/HUGOid table [from EnsEMBL v84]
#######################################################################################
.ensembl_entrez_table <- function(){
#return table from data
return(ens_entrez_table)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#.ucsc_genbank_table
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to load an GENBANK/HUGOid table [from UCSC]
#######################################################################################
.ucsc_genbank_table <- function(){
#return table from data
return(ucsc_genbank_table)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#.ucsc_refseq_table
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to load an REFSEQ/HUGOid table [from UCSC]
#######################################################################################
.ucsc_refseq_table <- function(){
#return table from data
return(ucsc_refseq_table)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ensembl_gene_connector
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to fetch a table to convert from
#######################################################################################
.ensembl_gene_connector_hs76 <- function(){
#Establish connection to EnsEMBL
con <- DBI::dbConnect(DBI::dbDriver("MySQL"),
username= 'anonymous',
host = 'ensembldb.ensembl.org',
dbname = 'homo_sapiens_core_76_38',
password= '',
port = 3306);
#build the parts for the query string
query <- "select gene.stable_id,
xref.display_label
from xref
inner join gene
where xref.xref_id = gene.display_xref_id";
#run query and get table
ens_table <- DBI::dbGetQuery(con,query);
#Close connection
DBI::dbDisconnect(con);
#return table
return(ens_table);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ensembl_entrez_connector
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to fetch a table to convert from
#######################################################################################
.ensembl_entrez_connector <- function(){
#Establish connection to EnsEMBL
con <- DBI::dbConnect(DBI::dbDriver("MySQL"),
username= 'anonymous',
host = 'ensembldb.ensembl.org',
dbname = 'homo_sapiens_core_76_38',
password= '',
port = 3306)
#build the parts for the query string
query <- "select xref.dbprimary_acc,
xref.display_label
from external_db
join xref
where external_db.db_name = 'EntrezGene' AND
xref.external_db_id = external_db.external_db_id"
#run query and get table
ens_table <- DBI::dbGetQuery(con,query)
colnames(ens_table) <- c("EntrezID","GeneName")
#Close connection
DBI::dbDisconnect(con)
#return table
return(ens_table)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ucsc_genbank_connector_hg19
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to fetch a table to convert from
#######################################################################################
.ucsc_genbank_connector_hg19 <- function(){
#Establish connection to UCSC
con <- DBI::dbConnect(DBI::dbDriver("MySQL"),
username= 'genome',
host = 'genome-mysql.cse.ucsc.edu',
dbname = 'hgFixed',
password= '',
port = 3306);
#build the parts for the query string
query <- "select geneName.name,
gbCdnaInfo.acc
from gbCdnaInfo
inner join geneName
where geneName.id = gbCdnaInfo.geneName AND
gbCdnaInfo.organism = 448 AND
geneName.name != 'n/a' AND
geneName.name != 'unknown'";
#run query and get table
ucsc_tab <- DBI::dbGetQuery(con,query);
#Close connection
DBI::dbDisconnect(con);
#return table
return(ucsc_tab);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#ucsc_refseq_connector_hg19
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function is used to fetch a table to convert from
#######################################################################################
.ucsc_refseq_connector_hg19 <- function(){
#Establish connection to UCSC
con <- DBI::dbConnect(DBI::dbDriver("MySQL"),
username= 'genome',
host = 'genome-mysql.cse.ucsc.edu',
dbname = 'hg19',
password= '',
port = 3306);
#build the parts for the query string
query <- "select name,name2 from refGene";
#run query and get table
ucsc_tab <- DBI::dbGetQuery(con,query);
#Close connection
DBI::dbDisconnect(con);
#return table
return(ucsc_tab);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#GEM general check functions
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#GEM_log_check_v1
#~[by Francesco 2014/03/31 + Ravi Shankar August 2018]
#~this function checks if GEM is in log scale and returns TRUE/FALSE
#######################################################################################
.GEM_log_check <- function(GEM){
#remove columns that have just the same value [sd of 0] [see in GSE21126]
goodCols <- which(sapply(1:ncol(GEM$expr),
function(i)
!stats::sd(GEM$expr[,i],na.rm=T)==0))
#pick the first column that is not all NAs
ref_p <- goodCols[which(sapply(goodCols,
function(i)
!all(is.na(GEM$expr[,i]))))[1]]
#define input vector
inputC <- GEM$expr[,ref_p]
# Note we cannot assume that if there are negative expression values, the data is log2 normalized
# For e.g., in the case of Illumina data, there is possibility of negative values in processed non-normalized
# data from the background subtraction algorithm. (GSE103842)
# Therefore, make all values positive and apply qq_norm to check if data is log2 normalized
min_value <- min(inputC, na.rm = T);
# if minimum value is negative, add small offset to all values to make them all positive
if (min_value < 0) {
inputC <- inputC - min_value + 1;
}
#The data should be normally distributed
#once in log scale
#Get qq-norm objects
obj_real <- stats::qqnorm(GEM$expr[,ref_p],plot.it=FALSE);
obj_exp <- stats::qqnorm(exp(GEM$expr[,ref_p]),plot.it=FALSE);
obj_log <- stats::qqnorm(log(abs(GEM$expr[,ref_p])+0.00001),plot.it=FALSE);
#remove infinite
obj_exp$x[which(is.infinite(obj_exp$x))] <- NA;
obj_exp$y[which(is.infinite(obj_exp$y))] <- NA;
#look at correlations
cor_real <- stats::cor(obj_real$x,obj_real$y,use='pairwise.complete');
cor_exp <- stats::cor(obj_exp$x, obj_exp$y, use='pairwise.complete');
cor_log <- stats::cor(obj_log$x ,obj_log$y ,use='pairwise.complete');
#compute R^2 difference and take ratio
log_check_score <- 0
if(!all(is.na(cor_exp))){
log_check_score <- log2(abs(cor_real**2 - cor_log**2)/abs(cor_real**2 - cor_exp**2))
if(length(log_check_score) > 0 && !is.nan(log_check_score)){
#dataset is in log scale
if(log_check_score<=0){
return(TRUE)
}
#dataset is not in log scale
else{
return(FALSE)
}
}else{
warning("log check failed - check to make sure this dataset is log2 normalized")
}
}else{
if(cor_real > cor_log){
#dataset is in log scale
return(TRUE)
}else{
#dataset is not in log scale
return(FALSE)
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#geoquery_gems_2_singlelist
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~this function converts a geo_query gem list into a simple list of GEM objects
#~directly compatible with the rest of the meta-analysis function
#######################################################################################
.geoquery_gems_2_singlelist <- function(geo_gem_list, formattedNames){
print(summary(geo_gem_list))
#make a new output list
gem_format <- list();
#get only the gems of interest
for(i in 1:length(names(geo_gem_list))){
if(length(geo_gem_list[[i]])>0) {
#get inside the object
for(j in 1:length(geo_gem_list[[i]])){
print(names(geo_gem_list[[i]]))
#extract name
set_name <- gsub("_series_matrix.txt.gz",
"",
names(geo_gem_list[[i]])[j]);
#replace dashes with underscores
set_name <- gsub("-","_",set_name);
#save gem object in list
gem_format[[set_name]] <- .GEM_inf_gene_remover(geo_gem_list[[i]][[j]]);
gem_format[[set_name]]$formattedName <- formattedNames[[i]]
if(length(geo_gem_list[[i]])>1) {
gem_format[[set_name]]$formattedName <- paste(gem_format[[set_name]]$formattedName, stringr::str_match_all(set_name,"(GPL[0-9]+)")[[1]][1,2][[1]])
}
}
} else {
warning(paste("Unable to correctly download",formattedNames[[i]],". Dataset will be excluded from this object."))
}
}
#
return(gem_format)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#GEM_inf_gene_remover
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
#######################################################################################
.GEM_inf_gene_remover <- function(GEM){
#remove completely samples with -Inf, Inf, or they have some buffer overflow/underflow issues
# Note: Due to a deficiency in the which() function, the commented out code fails on very large datasets
# bad_samples <- c(which(GEM$expr==Inf),which(GEM$expr==-Inf),which(GEM$expr >= 1.741796*10**308));
# GEM$expr[bad_samples] <- NA;
if (is.null(GEM$expr))
return(GEM)
if (ncol(GEM$expr) > 0) {
for (i in 1:ncol(GEM$expr)) {
GEM$expr[GEM$expr[,i]==Inf] <- NA;
GEM$expr[GEM$expr[,i]==-Inf] <- NA;
GEM$expr[GEM$expr[,i]>=1.741796*10**308] <- NA;
}
}
#return GEM
return(GEM);
}
#declare global variables for variables in data.table/with notation to avoid R CMD CHECK notes
utils::globalVariables(c("ens_ensgID_table","ens_entrez_table","ucsc_genbank_table","ucsc_refseq_table"))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.