R/GEOHandler.R

Defines functions connectToGEODB experiment_types organism_types getGEOMetadata

Documented in connectToGEODB experiment_types getGEOMetadata organism_types

#' \code{connectToGEODB}
#' @name connectToGEODB
#' @aliases connectToGEODB, GEOHandler-function
#' @rdname GEOHandler-functions
#' @param sqliteFilePath optional SQLite full file path of the SQLite database if already downloaded
#' @param download If TRUE allow the automatic downloading of the database file.
#' @param destdir optional destination directory. Current working directory is the default
#' @return A connection to the GEOmetadb
#' @description This method allows users to connect to the GEOmetadb downloaded. If no parameter is provided than the function retrieves the database in sqlite format and returns a connection to query the database
#' @examples
#' \dontrun{
#'  geo_connection <- connectToGEODB(download=TRUE)
#'  }
#'  if(file.exists('GEOmetadb.sqlite')){
#'     geo_con <- connectToGEODB()
#'  } else {
#'  message('Please provide GEOmetadb.sqlite file')
#'  }
#' @export
#' @importFrom GEOmetadb getSQLiteFile
#' @importFrom RSQLite dbConnect SQLite
connectToGEODB <- function(sqliteFilePath = NULL, download = FALSE, destdir = getwd()) {
    geo_con <- NA
    if(download==TRUE){
      if(dir.exists(destdir)){
        sqliteFileName <- "GEOmetadb.sqlite"
        sqlfile <- GEOmetadb::getSQLiteFile(destdir = destdir)
        sqliteFilePath = file.path(destdir, sqliteFileName)
      }
    }
    if(file.exists(sqliteFilePath))
        geo_con <- RSQLite::dbConnect(RSQLite::SQLite(), sqliteFilePath)
    else
      message('Please provide the valid full path for GEOmetadb.sqlite file')
    return(geo_con)
}




#' \code{experiment_types}
#'
#' @rdname experiment_types
#' @param GEOcon connection to the SQLite GEOmetadb databse
#' @return A character vector with all the possible experiment values
#' @description This method retrierves the experiment types stored in GEOmetadb
#' @examples
#' if(file.exists('GEOmetadb.sqlite')){
#'  geo_con <- connectToGEODB('GEOmetadb.sqlite')
#'  experiments <- experiment_types(geo_con)
#' }else{
#'  print('You need to download GEOmetadb.sqlite to run this example')
#' }
#' @export
#' @importFrom RSQLite dbGetQuery
experiment_types <- function(GEOcon) {
    gse_types <- dbGetQuery(GEOcon, "select distinct type from gse order by type")
    gse_types <- unlist(sapply(as.character(as.vector(gse_types[, 1])), function(list_of_types) {
        if (grepl(";", list_of_types)) 
            NA else list_of_types
    }))
    names(gse_types) <- rep(NA, length(gse_types))
    gse_types <- unique(sapply(gse_types, function(type_string) gsub(";", "", type_string)))
    gse_types <- unique(gse_types[which(!is.na(gse_types))])
    return(gse_types)
}

#' \code{organism_types}
#'
#' @rdname organism_types
#' @param geo_con connection to the SQLite GEOmetadb databse
#' @return A character vector with all the possible organism values
#' @description This method retrierves the allowed organisms in GEOmetadb
#' @examples
#' if(file.exists('GEOmetadb.sqlite')){
#'  geo_con <- connectToGEODB('GEOmetadb.sqlite')
#'  species <- organism_types(geo_con)
#' }else{
#'  print('You need to download GEOmetadb.sqlite to run this example')
#' }
#' @export

organism_types <- function(geo_con) {
    organisms <- dbGetQuery(geo_con, "select distinct organism_ch1 from gsm")
    organisms <- unlist(sapply(as.character(as.vector(organisms[, 1])), function(list_of_types) {
        if (grepl(";", list_of_types)) 
            NA else list_of_types
    }))
    organisms <- unique(organisms[which(!is.na(organisms))])
    return(organisms)
}



#' \code{getGEOMetadata}
#'
#' @rdname getGEOMetadata
#' @param geo_con connection to the SQLite GEOmetadb databse
#' @param experiment_type The type of experiment. Allowed values can be obtained through the function \code{experiment_types}
#' @param organism Optional type of organism. Allowed species can be obtained using the function \code{organism_types}. If no organism is passed as parameter the query will retrieve all the organisms
#' @param gpl Optional platform identifier in case a platform based query has to be executed
#' @return A data frame with the queried samples' metadata
#' @description This method retrierves the descriptive fields of the samples in GEO for a given experiment_type, organism or platform.
#' @examples
#' if(file.exists('GEOmetadb.sqlite')){
#'  geo_con <- connectToGEODB('GEOmetadb.sqlite')
#' methilation <- getGEOMetadata(geo_con,
#' 'Methylation profiling by high throughput sequencing', 'Homo sapiens')
#' expression <- getGEOMetadata(geo_con,
#'  'Expression profiling by array', 'Homo sapiens', 'GPL570')
#' }else{
#'  print('You need to download GEOmetadb.sqlite to run this example')
#' }
#' @export

getGEOMetadata <- function(geo_con, experiment_type = NA, organism = NA, gpl = NA) {
    experiment_acs <- NA
    experiment_metadata <- NA
    statement <- " where "
    if (!is.na(experiment_type)) {
        exp_types <- experiment_types(geo_con)
        if (!experiment_type %in% exp_types) 
            stop("Invalid experiment type. Please run experiment_types(GEOcon) to view valid values")
        experiment_query <- paste0("select gse, title, summary from gse where type ='", 
            experiment_type, "'")
        experiment_metadata <- RSQLite::dbGetQuery(geo_con, experiment_query)
        colnames(experiment_metadata) <- c("gse", "experiment_title", "experiment_summary")
        experiment_acs <- unique(experiment_metadata$gse)
    }
    sample_query <- "select series_id, gsm, title, gpl, source_name_ch1, organism_ch1, characteristics_ch1, description from gsm"
    
    if (!is.na(experiment_acs[1])) {
        sample_query <- paste0(sample_query, statement, "series_id in ('", paste(unique(experiment_acs), 
            collapse = "','"), "')")
        statement <- " and "
    }
    
    if (!is.na(organism)) {
        organism_types <- organism_types(geo_con)
        if (!organism %in% organism_types) 
            stop("Invalid organism. Please run organism_types to visualize valid values") else {
            sample_query <- paste0(sample_query, statement, " organism_ch1 ='", organism, 
                "'")
            statement <- " and "
        }
    }
    if (!is.na(gpl)) {
        sample_query <- paste0(sample_query, statement, " gpl = '", gpl, "'")
    }
    sample_metadata <- RSQLite::dbGetQuery(geo_con, sample_query)
    columns <- c(2, 1, 3:ncol(sample_metadata))
    if (is.data.frame(experiment_metadata) & nrow(experiment_metadata)>=1) {
        geo_metadata <- merge(sample_metadata, experiment_metadata, by.x = "series_id", 
            by.y = "gse")
        columns <- c(2, 3, 5, 7, 8, 9, 10, 1, 4, 6)
    } else {
        geo_metadata <- sample_metadata
        
    }
    geo_metadata$characteristics_ch1 <- sapply(geo_metadata$characteristics_ch1, 
        function(value) {
            gsub(";\t", "  ", value)
        })
    geo_metadata$description <- sapply(geo_metadata$description, function(value) {
        gsub(";\t", "  ", value)
    })
    
    return(geo_metadata[, columns])
}
eugeniaeueu/Onassis documentation built on March 9, 2020, 8:17 a.m.