R/gtxbase.R

Defines functions gtxversion gtxlog label_variant gtxwhere gtxwhat where_from gtxfilter gtxfilter_label gtxanalysis_label gtxanalyses gtxregion gtxentity gtxlabel

Documented in gtxanalyses gtxregion gtxversion gtxwhere

#' gtxversion - Report GTX package info
#' 
#' Internal function to extract and report version and build information, 
#' including ref and sha1 if built from github using devtools.
#' Intent is for this to be printed to help time/versionstamp outputs
#' 
#' @author Karsten Sieber \email{karsten.b.sieber@@gsk.com}
#' @import utils

gtxversion <- function() {
  # Note, not all the listed elements will be present, depending on whether built 
  # using devtools::install_git() or devtools::install_github()
  desc <- unlist(packageDescription('gtx')[c('Package', 'Version', 'Built', 'Packaged', 
                                             'RemoteSha', 'GithubRepo', 'GithubRef', 'GithubSHA1')])
  return(paste0(names(desc), ': ', desc, collapse = '|'))
}

## DEPREACTED
## for debugging, we have an internal function to print messages depending on a global option gtx.debug
gtxlog <- function(...) {
    .Deprecated("gtx_debug")
    # Deprecated version on line below.
    # if (getOption('gtx.debug', FALSE)) return(message(...))
    # Slightly updated version with flog_debug.
    if (getOption('gtx.debug', FALSE)) futile.logger::flog.debug(...)
    return(invisible(NULL))
}

# internal function to produce human-readable labels
# for one or a set of variants, based on optional
# arguments ref, alt, rs
# for data.frame x, use do.call(label_variant, x)
label_variant <- function(chrom, pos, ref, alt, rs, ...) {
  if (missing(chrom)) gtx_fatal2('gtx:::label_variant missing chrom(osome) argument')
  if (missing(pos)) gtx_fatal2('gtx:::label_variant missing pos(ition) argument')
  return(
    sprintf('chr%s:%i%s%s', 
            chrom, pos, 
            if (!missing(alt) && !missing(ref)) sprintf(':%s>%s', ref, alt) else '',
            if (!missing(rs)) ifelse(!is.na(rs), sprintf(' (rs%i)', rs), '') else '')
  )
}

## convenience function to construct WHERE
## part of SQL for genomic data tables
##
#' @export
gtxwhere <- function(chrom, 
                     pos, pos_ge, pos_le, 
                     pos_end_ge, pos_start_le, 
                     pos_start_ge, pos_end_le, 
                     ref, alt, rs, hgncid, ensemblid,
                     entity, 
                     tablename) {
  ## function to construct a WHERE string for constructing SQL queries
  ## Notes:
  ##  if arguments have length > 1, WHERE string OR's over values, producing e.g.
  ##      "hgncid='ABCA1' OR hgncid='ABCA2'
  ##  if multiple arguments, WHERE string AND's over arguments, producing e.g.
  ##      "chrom='1' AND pos_start>=123456 AND pos_end<=123789"

  ## For a query region of interest, to find segments (e.g. genes, recombination rate) that:
  ## wholly *or partially* overlap, use:   pos_end_ge=query_start, pos_start_le=query_end
  ## wholly overlap (only), use:           pos_start_ge=query_start, pos_end_le=query_end

    if (!missing(tablename)) {
        tablename <- paste0(sanitize1(tablename, type = 'alphanum'), '.')
    } else {
        tablename <- ''
    }
    
    ws1 <- list(
        if (missing(chrom)) NULL
        else sprintf("chrom='%s'", sanitize1(chrom, values = c(as.character(1:22), "X", "Y"))),
        
        if (missing(pos)) NULL
        else sprintf("pos=%s", sanitize(pos, type = "int")), 
        
        if (missing(pos_ge)) NULL
        else sprintf("pos>=%s", sanitize(pos_ge, type = "int")),
        
        if (missing(pos_le)) NULL
        else sprintf("pos<=%s", sanitize(pos_le, type = "int")),
        
        if (missing(pos_end_ge)) NULL
        else sprintf("pos_end>=%s", sanitize(pos_end_ge, type = "int")),
        
        if (missing(pos_start_le)) NULL
        else sprintf("pos_start<=%s", sanitize(pos_start_le, type = "int")),
        
        if (missing(pos_start_ge)) NULL
        else sprintf("pos_start>=%s", sanitize(pos_start_ge, type = "int")),
        
        if (missing(pos_end_le)) NULL
        else sprintf("pos_end<=%s", sanitize(pos_end_le, type = "int")),
        
        if (missing(ref)) NULL
        else sprintf("ref='%s'", sanitize(ref, type = "ACGT+")),
        
        if (missing(alt)) NULL
        else sprintf("alt='%s'", sanitize(alt, type = "ACGT+")),
        
        if (missing(rs)) NULL
        else sprintf("rsid=%s", sanitize(rs, type = "rs")), # note sanitize(type="rs") strips the rs parts hence returns integers
        
        if (missing(hgncid)) NULL
        else sprintf("hgncid='%s'", sanitize(hgncid, type = "alphanum-")), # thousands of HGNC ids contain hyphens, e.g. HLA-A
        
        if (missing(ensemblid)) NULL
        else sprintf("ensemblid='%s'", sanitize(ensemblid, type = "ENSG")),

        if (missing(entity)) NULL
        else sprintf("entity='%s'", sanitize(entity, type = "alphanum"))
    )
    ws2 <- paste0("(", 
                  unlist(sapply(ws1, function(x) if (is.null(x)) NULL else paste0(tablename, x, collapse = " OR "))), 
                  ")", collapse = " AND ")
    return(ws2)
}

##
## convenience function to construct WHERE
## part of SQL for analyses table
## Note behaviour for most arguments here is OR/OR, different to gtxwhere()
## (but AND with ncase_ge and ncohort_ge filters, document this better, FIXME)
gtxwhat <- function(analysis1, # rename to analysis_u or analysis_uniq FIXME
                    analysis,
                    analysis_not, 
                    description_contains,
                    phenotype_contains,
                    has_tag, 
                    ncase_ge,
                    ncohort_ge,
                    tablename) {
  ## function to construct a WHERE string for constructing SQL queries
  ## Notes:
  ##  if arguments have length > 1, WHERE string OR's over values
  ##  if multiple arguments, WHERE string OR's over arguments

    if (!missing(tablename)) {
        tablename <- paste0(sanitize1(tablename, type = 'alphanum'), '.')
    } else {
        tablename <- ''
    }

    ## use of analysis1 is a special case where exactly one analysis key is being provided
    ## ignore all other arguments
    if (!missing(analysis1)) {
        if (getOption('gtx.analysisIsString', TRUE)) { # Future release will change default to FALSE
            return(sprintf("(%sanalysis='%s')", tablename, sanitize1(analysis1, type = "alphanum")))
        } else {
            return(sprintf("(%sanalysis=%s)", tablename, sanitize1(analysis1, type = "count"))) # note no quotes
        }
    }
    
    ##
    ## analysis, description_contains, phenotype_contains etc. are OR'ed within and between arguments
    ##
    ## FIXME, in Impala, LIKE is case sensitive and ILIKE is case insensitive
    ##    but in SQLite, LIKE is case insensitive
    ##    so either we switch on the class of the dbc, or use "lower(a) LIKE lower(b)"
    ws1 <- list(
        if (missing(analysis)) NULL
        else {
            if (getOption('gtx.analysisIsString', TRUE)) { # Future release will change default to FALSE
                sprintf("%sanalysis='%s'", 
                        tablename, 
                        sanitize(analysis, type = "alphanum"))
            } else {
                sprintf("%sanalysis=%s", 
                        tablename, 
                        sanitize1(analysis, type = "count")) # note no quotes
            }
        },
        
        if (missing(description_contains)) NULL
        else sprintf("lower(%sdescription) LIKE '%%%s%%'", 
                     tablename, 
                     sanitize(tolower(description_contains), type = "text")), # Sanitation may be too restrictive, should do something intelligent with whitespace

        if (missing(phenotype_contains)) NULL
        else sprintf("lower(%sphenotype) LIKE '%%%s%%'", 
                     tablename, 
                     sanitize(phenotype_contains, type = "text")), # Sanitation may be too restrictive

        if (missing(has_tag)) NULL
        else sprintf("%stag='%s'", tablename, 
                     sanitize(has_tag, type = "alphanum")) # Sanitation may be too restrictive
    )
    ## format
    ws1f <- paste0("(", 
                  unlist(sapply(ws1, function(x) if (is.null(x)) NULL else paste0(x, collapse = " OR "))), 
                  ")", collapse = " OR ")

    ##
    ## analysis_not, ncase_ge, ncohort_ge etc. are AND'ed within and between arguments
    ws2 <- list(
        if (missing(analysis_not)) NULL
        else {
            if (getOption('gtx.analysisIsString', TRUE)) { # Future release will change default to FALSE
                sprintf("analysis!='%s'", sanitize(analysis_not, type = "alphanum"))
            } else {
                sprintf("analysis!=%s", sanitize1(analysis_not, type = "count")) # note no quotes
            }
        },
        
        if (missing(ncase_ge)) NULL
        else sprintf("ncase >= %s", sanitize(ncase_ge, type = "int")),
        
        if (missing(ncohort_ge)) NULL
        else sprintf("ncohort >= %s", sanitize(ncohort_ge, type = "int"))
    )
    ws2f <- paste0("(", 
                  unlist(sapply(ws2, function(x) if (is.null(x)) NULL else paste0(tablename, x, collapse = " AND "))), 
                  ")", collapse = " AND ")

    ## FIXME bad hack make this better
    if (ws1f == '()' && ws2f == '()') {
        return('(True)')
    } else if (ws1f != '()' && ws2f == '()') {
        return(paste0('(', ws1f, ')'))
    } else if (ws1f == '()' && ws2f != '()') {
        return(paste0('(', ws2f, ')'))
    } else if (ws1f != '()' && ws2f != '()') {
        return(paste0('((', ws1f, ') AND (', ws2f, '))'))
    } else {
        stop('gtxwhat() logical error')
    }
}

##
## convenience function to construct WHERE
## part of SQL for gwas_results_[joint|cond]
## note that it is meaningless to select signals without also selecting chrom
##
where_from <- function(analysis, analysisu,
                       entity, entityu,
                       chrom, 
                       signal, signalu, 
                       tablename) {

    if (!missing(tablename)) {
        tablename <- paste0(sanitize1(tablename, type = 'alphanum'), '.')
    } else {
        tablename <- ''
    }

    ##
    ## analysis, entity, signal are OR within, AND between
    ws1 <- list(
        if (missing(analysis)) NULL
        else {
            if (getOption('gtx.analysisIsString', TRUE)) { # Future release will change default to FALSE
                sprintf("analysis='%s'", sanitize(analysis, type = "alphanum"))
            } else {
                sprintf("analysis=%s", sanitize(analysis, type = "count")) # note no quotes
            }
        },

        if (missing(analysisu)) NULL
        else {
            if (getOption('gtx.analysisIsString', TRUE)) { # Future release will change default to FALSE
                sprintf("analysis='%s'", sanitize1(analysisu, type = "alphanum"))
            } else {
                sprintf("analysis=%s", sanitize1(analysisu, type = "count")) # note no quotes
            }
        },

        if (missing(entity)) NULL
        else sprintf("entity='%s'", sanitize(entity, type = "alphanum")),

        if (missing(entityu)) NULL
        else sprintf("entity='%s'", sanitize1(entityu, type = "alphanum")),
        
        if (missing(chrom)) NULL
        else sprintf("chrom='%s'", sanitize1(chrom, values = c(as.character(1:22), "X", "Y"))),
        
        if (missing(signal)) NULL
        else sprintf("signal=%s", sanitize(signal, type = "count")),
        
        if (missing(signalu)) NULL
        else sprintf("signal=%s", sanitize1(signalu, type = "count"))
    )
    ## format
    ws1f <- paste0("(",
                  unlist(sapply(ws1, function(x) if (is.null(x)) NULL else paste0(tablename, x, collapse = " OR "))),
                  ")", collapse = " AND ")
    return(ws1f)
}


gtxfilter <- function(pval_le, pval_gt,
                      maf_ge, maf_lt,
                      rsq_ge, rsq_lt,
                      emac_ge, case_emac_ge, 
                      analysis,
                      tablename,
                      dbc = getOption("gtx.dbConnection", NULL)) {
    ## function to construct a WHERE string for constructing SQL queries
    ## logical AND within and between arguments
    ## queries TABLE analyses to determine results_db table name and
    ## to efficiently handle whether freq and rsq are NULL

    ## query analysis metadata, using cache if it exists
    amd <- sqlWrapper(getOption('gtx.dbConnection_cache_analyses', dbc),
                      sprintf('SELECT results_db, ncase, ncontrol, ncohort, has_freq, has_rsq
                                    FROM analyses
                                    WHERE %s;',
                              gtxwhat(analysis1 = analysis)))
    ## if ncohort is NA, replace with ncase+ncontrol, should not be needed with new schema definition
    amd <- within(amd, ncohort <- ifelse(!is.na(ncohort), ncohort, ncase + ncontrol))
    
    if (!missing(tablename)) {
        tablename <- paste0(sanitize1(tablename, type = 'alphanum'), '.')
    } else {
        tablename <- ''
    }

    ## set flags for whether to print warning messages
    ## can be set at multiple places in code, but we only want to print messages once
    warn_freq <- FALSE
    warn_rsq <- FALSE
    warn_emac <- FALSE

    ## unlike similar functions, gtxwhat(), gtxwhere(), that add the tablename at the
    ## end, for this function the specific OR logic needed for maf_lt means we have to
    ## paste the tablename within every clause (grrr)

    ws1 <- list(
        if (missing(pval_le)) NULL
        else sprintf('%spval<=%s', tablename, sanitize1(pval_le, type = 'double')),

        if (missing(pval_gt)) NULL
        else sprintf('%spval>%s', tablename, sanitize1(pval_gt, type = 'double')),

        if (missing(maf_ge)) NULL # Two list elements for freq>= and freq<= if maf_ge argument used
        else {
            if (amd$has_freq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('(%sfreq>=%s AND %sfreq<=%s)',
                        tablename, sanitize1(maf_ge, type = 'double'),
                        tablename, sanitize1(1 - maf_ge, type = 'double'))
            } else {
                warn_freq <- TRUE
                NULL
            }
        },

        ## Note, maf_lt needs OR logic, where everything else uses AND
        if (missing(maf_lt)) NULL # Two list elements for freq< and freq> if maf_lt argument used
        else {
            if (amd$has_freq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('(%sfreq<%s OR %sfreq>%s)',
                        tablename, sanitize1(maf_lt, type = 'double'),
                        tablename, sanitize1(1 - maf_lt, type = 'double'))
            } else {
                warn_freq <- TRUE
                NULL
            }
        },

        if (missing(rsq_ge)) NULL
        else {
            if (amd$has_rsq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('%srsq>=%s', tablename, sanitize1(rsq_ge, type = 'double'))
            } else {
                warn_rsq <- TRUE
                NULL
            }
        },

        if (missing(rsq_lt)) NULL
        else {
            if (amd$has_rsq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('%srsq<%s', tablename, sanitize1(rsq_lt, type = 'double'))
            } else {
                warn_rsq <- TRUE
                NULL
            }
        },

        if (missing(emac_ge)) NULL
        else {
            if (amd$has_freq == 1 && amd$has_rsq == 1 && !is.na(amd$ncohort)) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                ## sanitizing emac_ge converts to string, so convert back to double for calculation, convert to string again
                rfthresh <- sanitize1(as.double(sanitize1(emac_ge, type = 'double'))/(2*amd$ncohort), type = 'double')
                sprintf('((%sfreq*%srsq)>=%s AND ((1.-%sfreq)*%srsq)>=%s)',
                        tablename, tablename, rfthresh,
                        tablename, tablename, rfthresh)
            } else {
                warn_emac <- TRUE
                NULL
            }
        },
        
        if (missing(case_emac_ge)) NULL
        else {
            if (amd$has_freq == 1 && amd$has_rsq == 1 && !is.na(amd$ncase) && !is.na(amd$ncontrol)) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                rfthresh <- sanitize1(as.double(sanitize1(case_emac_ge, type = 'double'))/(2*min(amd$ncase, amd$ncontrol)), type = 'double')
                sprintf('((%sfreq*%srsq)>=%s AND ((1.-%sfreq)*%srsq)>=%s)',
                        tablename, tablename, rfthresh,
                        tablename, tablename, rfthresh)
            } else {
                warn_emac <- TRUE
                NULL
            }
        }
    )
    if (warn_freq) warning('gtxfilter MAF filtering skipped because has_freq=False for analysis [ ', analysis, ' ]')
    if (warn_rsq) warning('gtxfilter Rsq filtering skipped because has_rsq=False for analysis [ ', analysis, ' ]')
    if (warn_emac) warning('gtxfilter EMAC filtering skipped because ', 
                           paste(c(if (amd$has_freq == 0) 'has_freq=False' else NULL, 
                                   if (amd$has_rsq == 0) 'has_rsq=False' else NULL, 
                                   if (is.na(amd$ncohort)) 'ncohort is NA' else NULL, 
                                   if (is.na(amd$ncase)) 'ncase is NA' else NULL, 
                                   if (is.na(amd$ncontrol)) 'ncontrol is NA' else NULL),
                                 collapse = ', '),
                           ' for analysis [ ', analysis, ' ]')

    ## format, can simplify the inner paste0 since all list elements have length 1
    ws1f <- paste0("(", 
                   unlist(ws1),
#                   unlist(sapply(ws1, function(x) if (is.null(x)) NULL else paste0(tablename, x))), 
                   ")", collapse = " AND ")
    ## handle case where no arguments were non-missing but still need to generate
    ## valid SQL clause for substitution into '... WHERE %s' or '... WHERE %s AND %s'
    if (ws1f == '()') ws1f <- '(True)'
    return(ws1f)
}

## Should be combined into gtxfilter, to return SQL WHERE clause, and label, by one function call
gtxfilter_label <- function(maf_ge, maf_lt,
                            rsq_ge, rsq_lt,
                            emac_ge, case_emac_ge, 
                            analysis,
                            dbc = getOption("gtx.dbConnection", NULL)) {
    ## function to construct a nice label for plot

    ## query analysis metadata, note this is inefficient since gtxfilter() may
    ## be making the same/similar query (potentially multiple times)
    ## added, using cache if it exists
    amd <- sqlWrapper(getOption('gtx.dbConnection_cache_analyses', dbc),
                      sprintf('SELECT ncase, ncontrol, ncohort, has_freq, has_rsq
                                    FROM analyses
                                    WHERE %s;',
                            gtxwhat(analysis1 = analysis)))
    amd <- within(amd, ncohort <- ifelse(!is.na(ncohort), ncohort, ncase + ncontrol))

    ## set flags for whether to print warning messages
    ## can be set at multiple places in code, but we only want to print messages once
    warn_freq <- FALSE
    warn_rsq <- FALSE
    warn_emac <- FALSE

    ws1 <- list(
        if (missing(maf_ge)) NULL
        else {
            if (amd$has_freq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('MAF>=%s', sanitize1(maf_ge, type = 'double'))
            } else {
                warn_freq <- TRUE
                NULL
            }
        },

        if (missing(maf_lt)) NULL
        else {
            if (amd$has_freq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('MAF<%s', sanitize1(maf_lt, type = 'double'))
            } else {
                warn_freq <- TRUE
                NULL
            }
        },

        if (missing(rsq_ge)) NULL
        else {
            if (amd$has_rsq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('rsq>=%s', sanitize1(rsq_ge, type = 'double'))
            } else {
                warn_rsq <- TRUE
                NULL
            }
        },

        if (missing(rsq_lt)) NULL
        else {
            if (amd$has_rsq == 1) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('rsq<%s', sanitize1(rsq_lt, type = 'double'))
            } else {
                warn_rsq <- TRUE
                NULL
            }
        },

        if (missing(emac_ge)) NULL
        else {
            if (amd$has_freq == 1 && amd$has_rsq == 1 && !is.na(amd$ncohort)) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('EMAC>=%s', sanitize1(emac_ge, type = 'double'))
            } else {
                warn_emac <- TRUE
                NULL
            }
        },
        
        if (missing(case_emac_ge)) NULL
        else {
            if (amd$has_freq == 1 && amd$has_rsq == 1 && !is.na(amd$ncase) && !is.na(amd$ncontrol)) { # type BOOLEAN in Hive/Impala is returned to R as 0/1
                sprintf('case EMAC >=%s', sanitize1(case_emac_ge, type = 'double'))
            } else {
                warn_emac <- TRUE
                NULL
            }
        }
    )

    ## FIXME make nicer formatting like 0.01>MAF>=0.001
    
    ws1f <- paste0(unlist(ws1), collapse = " and ")
    if (ws1f == '') {
        ws1f <- 'All variants'
    } else {
        ws1f <- paste('Variants with', ws1f)
    }
    return(ws1f)

    ## FIXME add text if filtering requested but could not be applied
    ##    if (warn_freq) warning('gtxfilter MAF filtering skipped because has_freq=False for analysis [ ', analysis, ' ]')
    ##    if (warn_rsq) warning('gtxfilter Rsq filtering skipped because has_rsq=False for analysis [ ', analysis, ' ]')


}

## function to return pretty printing label for an analysis
## analysis is the analysis id
## entity is the result of a call to gtxentity (i.e. a list with elements entity, entity_label)
## signal is the integer index of a CLEO signal from gwas_results_cond
gtxanalysis_label <- function(analysis, entity, signal, nlabel = TRUE,
                              dbc = getOption("gtx.dbConnection", NULL)) {
    ares <- sqlWrapper(getOption('gtx.dbConnection_cache_analyses', dbc), 
                       sprintf('SELECT label, ncase, ncontrol, ncohort FROM analyses WHERE %s;',
                               gtxwhat(analysis1 = analysis)))
    if (nlabel) {
        if (!is.na(ares$ncase) && !is.na(ares$ncontrol)) {
            alabel <- sprintf('%s, n=%i vs %i', ares$label, ares$ncase, ares$ncontrol)
        } else if (!is.na(ares$ncohort)) {
            alabel <- sprintf('%s, n=%i', ares$label, ares$ncohort)
        } else {
            alabel <- sprintf('%s, n=?', ares$label)
        }
    } else {
        alabel <- ares$label
    }
    if (!missing(entity) && !is.null(entity$entity)) alabel <- paste(entity$entity_label, alabel)
    if (!missing(signal)) alabel <- paste0(alabel, ' signal #', signal)
    return(alabel)
}

#' Find GWAS analyses
#'
#' Find GWAS analyses matching specified criteria.
#'
#' In the near future, a better high level interface will be 
#' provided to find GWAS analyses of interest.  The \code{gtxanalyses()} 
#' and \code{gtxwhat()} functions were originally intended as internal
#' functions, to abstract part of the functionality required within the
#' \code{phewas()} family of functions.  However, in the absence of a
#' better interface, these functions are currently exported for users to
#' use.
#'
#' @param analysis Key value(s) for GWAS analysis/es required
#' @param analysis_not Key value(s) for GWAS analysis/es not required
#' @param phenotype_contains String to search for in phenotype metadata
#' @param description_contains String to search for in description metadata
#' @param has_tag Tag key value(s) for GWAS analysis/es required
#' @param ncase_ge Numeric to select number of cases greater-or-equal
#' @param ncohort_ge Numeric to select number in cohort greater-or-equal
#' @param has_cleo Logical, select only analyses with \link{CLEO} results
#' @param analysis_fields Names of columns/fields to return
#' @param sort_by Column name for descending sort, either ncohort or ncase
#' @param tag_is Internal use only
#' @param with_tags Internal use only
#' @param has_access_only Deprecated
#' @param dbc Database connection
#' @return A data frame of selected rows and columns from \code{TABLE analyses}.
#' @author Toby Johnson \email{Toby.x.Johnson@gsk.com}
#' @export
gtxanalyses <- function(analysis, analysis_not, 
                        phenotype_contains,
                        description_contains,
                        has_tag,
                        ncase_ge,
                        ncohort_ge,
                        has_cleo = FALSE,                        
                        ## if extra filters are added, be sure to update definition of all_analyses below
                        analysis_fields = c('description', 'phenotype', 'covariates', 'cohort', 'unit',
                                    'ncase', 'ncontrol', 'ncohort'),
                        sort_by, 
                        tag_is, with_tags = FALSE,
                        has_access_only = FALSE, 
                        dbc = getOption("gtx.dbConnection", NULL)) {

    ## Determine databases available
    if ('Impala' %in% class(dbc)) {
        ## Avoid frequently issuing SHOW DATABASES; commands
        dbs <- getOption('gtx.dbConnection_SHOW_DATABASES', NULL)
        if (is.null(dbs)) {
            gtxdbcheck(dbc)
            dbs <- sqlWrapper(dbc, 'SHOW DATABASES;', uniq = FALSE)
            options(gtx.dbConnection_SHOW_DATABASES = dbs)
        }
    } else if ('SQLiteConnection' %in% class(dbc)) {
        ## When dbc is an SQLite handle, there is no concept of a database
        dbs <- NULL
    } else if ('KineticaConnection' %in% class(dbc)) {
      ## When dbc is a Kinetica handle, there is no concept of a database
      dbs <- NULL
    } else {
        stop('dbc class [ ', paste(class(dbc), collapse = ', '), ' ] not recognized')
    }

    gtxdbcheck(dbc) # FIXME should check getOption('gtx.dbConnection_cache_analyses')

    ## sanitize and check desired analysis_fields, create sanitized string for SQL
    ## added, using cache if it exists
    acols <- names(sqlWrapper(getOption('gtx.dbConnection_cache_analyses', dbc),
                              'SELECT * FROM analyses LIMIT 0', zrok = TRUE)) # columns present in TABLE analyses
    analysis_fields <- sanitize(union(c('analysis', 'entity_type', 'results_db'),
                              analysis_fields),
                        type = 'alphanum')
    analysis_fields_bad <- !(analysis_fields %in% acols)
    if (any(analysis_fields_bad)) {
        error('analysis_fields [ ', paste(analysis_fields[analysis_fields_bad], collapse = ', '), ' ] not found in TABLE analyses')
    }
    rm(acols) # clean up
    analysis_fields <- paste0('analyses.', analysis_fields, collapse = ', ')

    if (!missing(has_tag)) with_tags <- TRUE
    
    ## sanitize and check desired tag_is logicals
    if (!missing(tag_is)) {
        acols <- names(sqlWrapper(dbc, 'SELECT * FROM analyses_tags LIMIT 0', zrok = TRUE)) # columns present in TABLE analyses
        tag_is <- sanitize(tag_is, type = 'alphanum')
        tag_is_bad <- !(tag_is %in% acols)
        if (any(tag_is_bad)) {
            error('tag_is logicals [ ', paste(tag_is[tag_is_bad], collapse = ', '), ' ] not found in TABLE analyses_tags')
        }
        rm(acols)
        tag_is <- paste0('(analyses_tags.', tag_is, ' OR analyses_tags.', tag_is, ' IS NULL)', collapse = ' AND ')
        with_tags <- TRUE
    } else {
        tag_is <- '(True)'
    }
    
    #all_analyses <- (missing(analysis) && missing(analysis_not) && missing(phenotype_contains) &&
    #                 missing(description_contains) && missing(ncase_ge) && missing(ncohort_ge) &&
    #                 missing(has_tag))
    res <- sqlWrapper(dbc,
                      sprintf('SELECT %s %s FROM analyses %s %s WHERE %s AND %s AND %s',
                              analysis_fields, 
                              if (with_tags) ', analyses_tags.tag' else '',
                              if (has_cleo) 'LEFT SEMI JOIN gwas_results_joint ON (analyses.analysis=gwas_results_joint.analysis)' else '',
                              if (with_tags) 'LEFT JOIN analyses_tags ON (analyses.analysis=analyses_tags.analysis)' else '',
                              gtxwhat(analysis = analysis, analysis_not = analysis_not, 
                                      description_contains = description_contains,
                                      phenotype_contains = phenotype_contains,
                                      ncase_ge = ncase_ge, ncohort_ge = ncohort_ge,
                                      tablename = 'analyses'),
                              gtxwhat(has_tag = has_tag, tablename = 'analyses_tags'),
                              tag_is),
                      uniq = FALSE, zrok = TRUE)
    
    ## If accessible databases known, add 'has_access' column and filter if requested
    if (!is.null(dbs)) {
        res$has_access <- is.na(res$results_db) | res$results_db == '' | res$results_db %in% dbs$name
        if (has_access_only) {
            res <- subset(res, has_access)
        }
    }
    
    if (!missing(sort_by) && identical(tolower(sort_by), 'ncohort') && 'ncohort' %in% names(res)) {
      gtx_debug('Sorting by ncohort')
      res <- res[order(res$ncohort, decreasing = TRUE), ]
    }
    if (!missing(sort_by) && identical(tolower(sort_by), 'ncase') && 'ncase' %in% names(res)) {
      gtx_debug('Sorting by ncase')
      res <- res[order(res$ncase, decreasing = TRUE), ]
    }
    
    if (identical(nrow(res), 0L)) {
      gtx_warn('No analyses match the search criteria')
    }
    return(res)
}

#' @export
gtxregion <- function(chrom, pos_start, pos_end, 
                      hgncid, ensemblid, pos, rs, 
                      signal, analysis, entity, 
                      surround = 500000, 
                      dbc = getOption("gtx.dbConnection", NULL)) {
  gtxdbcheck(dbc)
  if (!missing(chrom) && !missing(pos_start) && !missing(pos_end)) {
    stopifnot(identical(length(chrom), 1L))
    stopifnot(identical(length(pos_start), 1L))
    stopifnot(identical(length(pos_end), 1L))
    stopifnot(pos_end > pos_start)
  } else if (!missing(hgncid)) {
    gp <- sqlWrapper(dbc,
                     sprintf('SELECT chrom, min(pos_start) as pos_start, max(pos_end) as pos_end FROM genes WHERE %s GROUP BY chrom',
                             gtxwhere(hgncid = hgncid)),
                     uniq = FALSE) # require unique result but catch below to provide more informative error
    if (!identical(nrow(gp), 1L)) stop('hgncid(s) [ ', paste(hgncid, collapse = ', '), ' ] do not map to unique chromosome')
    chrom <- gp$chrom[1] # do we need the [1] here?
    pos_start <- gp$pos_start[1] - surround
    pos_end <- gp$pos_end[1] + surround
  } else if (!missing(ensemblid)) {
    gp <- sqlWrapper(dbc,
                     sprintf('SELECT chrom, min(pos_start) as pos_start, max(pos_end) as pos_end FROM genes WHERE %s GROUP BY chrom',
                             gtxwhere(ensemblid = ensemblid)),
                     uniq = FALSE) # require unique result but catch below to provide more informative error
    if (!identical(nrow(gp), 1L)) stop('ensemblid(s) [ ', paste(ensemblid, collapse = ', '), ' ] do not map to unique chromosome')
    chrom <- gp$chrom[1]
    pos_start <- gp$pos_start[1] - surround
    pos_end <- gp$pos_end[1] + surround
    ## Note pos and rs low in priority order to allow secondary use in regionplot()
  } else if (!missing(chrom) && !missing(pos)) {
    stopifnot(identical(length(chrom), 1L))
    stopifnot(identical(length(pos), 1L))
    stopifnot(identical(length(surround), 1L))
    pos_start <- pos - surround
    pos_end <- pos + surround
  } else if (!missing(rs)) {
    gp <- sqlWrapper(dbc,
                     sprintf('SELECT chrom, pos FROM sites WHERE %s;',
                             gtxwhere(rs = rs))) # default uniq = TRUE
    chrom <- gp$chrom
    pos_start <- gp$pos - surround
    pos_end <- gp$pos + surround
  } else if (!missing(signal) && !missing(chrom) && !missing(analysis)) {
    xentity <- gtxentity(analysis, entity = entity)
    sp <- sqlWrapper(dbc,
                sprintf('SELECT chrom, pos FROM %sgwas_results_joint WHERE %s AND %s', 
                        gtxanalysisdb(analysis), 
                        where_from(analysisu = analysis, chrom = chrom, signalu = signal),
                        xentity$entity_where),
                uniq = TRUE, zrok = FALSE)
    chrom <- sp$chrom
    pos_start <- sp$pos - surround
    pos_end <- sp$pos + surround
  } else {
    stop('gtxregion() failed due to inadequate arguments')
  }

  return(list(chrom = chrom, pos_start = pos_start, pos_end = pos_end,
              label =   sprintf('chr%s:%s-%s', chrom, pos_start, pos_end)))
}


## internal function to infer the entity id according to the type required for the analysis
##
## note optional tablename argument, is not completely flexible since we
## might want to apply the WHERE clause on multiple tables
gtxentity <- function(analysis, entity, hgncid, ensemblid, tablename, 
                      dbc = getOption("gtx.dbConnection", NULL)) {
    ## Check the db connections we will actually use (genes will always be queried before exit, unless error)
    gtxdbcheck(getOption('gtx.dbConnection_cache_analyses', dbc), tables_required = 'analyses')
    gtxdbcheck(getOption('gtx.dbConnection_cache_genes', dbc), tables_required = 'genes')

    entity_type <- sqlWrapper(getOption('gtx.dbConnection_cache_analyses', dbc),
                              sprintf('SELECT entity_type FROM analyses WHERE %s;',
                                      gtxwhat(analysis1 = analysis))
                              )$entity_type
    ## FIXME this logging message could be improved when entity_type is null or empty string
    gtx_debug('analysis [ {analysis} ] requires entity type [ {entity_type} ]')

    if (!missing(tablename)) {
      tablename <- paste0(sanitize1(tablename, type = 'alphanum'), '.')
    } else {
      tablename <- ''
    }
        
    if (identical(entity_type, 'ensg')) {
        ## analysis requires an entity that is Ensembl gene id
        if (!missing(entity)) {
            ## entity argument supplied, infer whether Ensembl gene id or HGNC id
            entity <- sanitize1(entity, type = 'alphanum')
            if (grepl('^ENSG[0-9]+$', entity)) {
                gtx_debug('Recognized entity [ {entity} ] as Ensembl gene id')
                ## do nothing
            } else {
                ## attempt to convert assumed HGNC id to Ensembl gene id
                gtx_debug('Looking for Ensembl gene id for entity [ {entity} ]')
                ## Check the db connections we will actually use
                entity <- sqlWrapper(getOption('gtx.dbConnection_cache_genes', dbc), 
                                     sprintf('SELECT ensemblid FROM genes WHERE %s;', 
                                             gtxwhere(hgncid = entity))
                                     )$ensemblid
                gtx_debug('Converted entity to Ensembl gene id [ {entity} ]')
            }
        } else {
            ## infer entity from other arguments if supplied
            if (!missing(hgncid)) {
                entity <- sqlWrapper(getOption('gtx.dbConnection_cache_genes', dbc), 
                                     sprintf('SELECT ensemblid FROM genes WHERE %s;', 
                                             gtxwhere(hgncid = hgncid))
                                     )$ensemblid
                gtx_debug('Inferred entity [ {entity} ] from HGNC id [ {hgncid} ]')
            } else if (!missing(ensemblid)) {
                entity <- sanitize1(ensemblid, type = 'ENSG')
                gtx_debug('Inferred entity [ {entity} ] from Ensembl gene id [ {ensemblid} ]')
            } else {
                stop('Analysis [ ', analysis, ' ] requires an Ensembl gene id entity but none could be inferred')
            }
        }
        entity_label <- sqlWrapper(getOption('gtx.dbConnection_cache_genes', dbc), 
                                   sprintf('SELECT hgncid FROM genes WHERE %s;',
                                           gtxwhere(ensemblid = entity))
                                   )$hgncid
        if (is.na(entity_label) || entity_label == '') entity_label <- entity # may have to paste 'ENSG' back on when switch to ints
        return(list(entity = entity,
                    entity_label = entity_label,
                    entity_where = sprintf('(%sentity = \'%s\')', tablename, entity)))
    } else {
        return(list(entity = NULL,
                    entity_label = '',
                    entity_where = '(True)'))
        # entity_type is NULL or unrecognized type
    }
    stop('internal error in gtxentity')
}

# for certain types, sanitize and gtxlabel are (almost) inverses
gtxlabel <- function(x, type) {
    if (identical(type, 'ensg') || identical(type, 'rs')) { ## could nest up other ENS[PGT] types...
        x <- na.omit(x) ## silently drop missing values
        xi <- suppressWarnings(as.integer(x))
        if (any(is.na(xi) | xi < 0)) {
            stop('gtxlabel invalid input [ ', paste(x[is.na(xi) | xi < 0], collapse = ', '),
                 ' ] for type [ ', type, ' ]')
        }
        if (identical(type, 'ensg')) {
            return(sprintf('ENSG%012i', x))
        } else if (identical(type, 'rs')) {
            return(sprintf('rs%i', x))
        } else {
            stop('internal error in gtxlabel()')
        }
    } else {
        stop('invalid type [ ', type, ' ] in gtxlabel()')
    }
}
tobyjohnson/gtx documentation built on Aug. 30, 2019, 8:07 p.m.