gtx: Genetics ToolboX

Documented in regionplot regionplot.new

## regionplot.new and associated functions
## assumes database connection is provided by getOption("gtx.dbConnection")

#' @export
regionplot <- function(analysis, # what analysis (entity should be next)
                       entity, # optional
                       signal, # optional
                       chrom, pos_start, pos_end, pos,  
                       hgncid, ensemblid, rs, 
                       surround = 500000, # where
                       maf_ge, rsq_ge, emac_ge, case_emac_ge, # which variants to include
                       priorsd = 1, priorc = 1e-5, cs_size = 0.95, # parameters for finemapping
                       plot_ymax = 300, # plot output control starts here... # 300 is too high FIXME
                       style = 'signals',
                       protein_coding_only = TRUE, # whether to only show protein coding genes in annotation below plot
                       highlight_style = 'circle', 
                       dbc = getOption("gtx.dbConnection", NULL)) {

  if (isTRUE(getOption('gtx.debug'))) {
	futile.logger::flog.threshold(DEBUG)
  }

  # check database connection
  gtxdbcheck(dbc)

  # if optional signal argument used, check other arguments for compatibility
  if (!missing(signal)) {
    if (!(missing(maf_ge) && missing(rsq_ge) && missing(emac_ge) && missing(case_emac_ge))) {
      gtx_warn('Variant filtering will have NO EFFECT for conditional results (signal = {signal})')
    }
    if ('signals' %in% tolower(style)) {
      gtx_warn('style = \'signals\' incompatible with conditional results (signal = {signal}), using style = \'signal\' instead')
      style[tolower(style) == 'signals'] <- 'signal' 
    }
  }
  
  ## Obtain pvals by passing arguments through to regionplot.data()
  pvals <- regionplot.data(analysis = analysis, entity = entity, signal = signal, 
                           chrom = chrom, pos_start = pos_start, pos_end = pos_end, pos = pos, 
                           hgncid = hgncid, ensemblid = ensemblid, rs = rs, 
                           surround = surround,                          
                           style = style,
                           priorsd = priorsd, priorc = priorc, cs_size = cs_size, 
                           maf_ge = maf_ge, rsq_ge = rsq_ge,
                           emac_ge = emac_ge, case_emac_ge = case_emac_ge, 
                           dbc = dbc)

  ## Determine x-axis range from attr()ibutes of data
  ## was re-resolved from command line args
  xregion <- attr(pvals, 'region')
  #chrom <- xregion$chrom
  #pos_start = xregion$pos_start
  #pos_end = xregion$pos_end
  
  ## Determine entity from attr()ibutes of data
  xentity <- attr(pvals, 'entity')

  pvals <- within(pvals, impact[impact == ''] <- NA)
  ## Use na.rm in min, even though these should not be present,
  ##   because we want regionplot() to have robust behaviour
  ## Note, regionplot.data SQL includes pval IS NOT NULL clause
  ##   (whcih removes NULL but not NaN)
  ## Also, regionplot.data removes and warns if there are
  ##   non-finite pval
  pmin <- max(min(pvals$pval, na.rm = TRUE), 10^-plot_ymax)

  main <- gtxanalysis_label(analysis = analysis, entity = attr(pvals, 'entity'), signal = signal, nlabel = TRUE, dbc = dbc)

  ## in future we may need to pass maf_lt and rsq_lt as well  
  if (missing(signal)) {
    fdesc <- gtxfilter_label(maf_ge = maf_ge, rsq_ge = rsq_ge, emac_ge = emac_ge, case_emac_ge = case_emac_ge, analysis = analysis, dbc = dbc)
  } else {
    fdesc <- 'All variants (after pre-CLEO filtering)'
  }
  
  ## Highlight index SNP(s) if pos or rs argument was used
  gp <- data.frame(NULL)
  if (!missing(pos)) {
      ## funny syntax to avoid subset(pvals, pos=pos)
      gp <- rbind(gp, pvals[pvals$pos %in% pos, c('pos', 'pval')])
  }
  if (!missing(rs)) {
      # query this rs id
      qrs <- sqlWrapper(dbc,
                       sprintf('SELECT chrom, pos, ref, alt FROM sites WHERE %s;',
                               gtxwhere(chrom = xregion$chrom, rs = rs)),
                        uniq = FALSE, zrok = TRUE)
      gp <- rbind(gp, merge(pvals, qrs, all.x = FALSE, all.y = TRUE)[ , c('pos', 'pval')])
  }

  ## set par to ensure large enough margins, internal xaxs, regular yaxs,
  ## should apply for all plots generated
  oldpar <- par(mar = pmax(c(4, 4, 4, 4) + 0.1, par('mar')), 
                xaxs = 'i', yaxs = 'i') # xaxs='i' stops R expanding x axis; manual *1.04 for yaxs in regionplot.new

  if ('classic' %in% tolower(style)) {
    gl <- regionplot.new(chrom = xregion$chrom, pos_start = xregion$pos_start, pos_end = xregion$pos_end,
                 pmin = pmin, 
                 main = main, fdesc = fdesc, 
                 protein_coding_only = protein_coding_only, 
                 dbc = dbc)
    if (!missing(signal)) mtext(paste0('conditional for signal #', signal), 2, 2)
    ## best order for plotting
    ## Plot all variants with VEP annotation as blue diamonds in top layer
    pvals <- pvals[order(!is.na(pvals$impact), -log10(pvals$pval)), ]
    ## Next statement, captures the actual yvalues used when plotting
    pvals <- within(pvals, ploty <- regionplot.points(pos, pval,
                                  pch = ifelse(!is.na(impact), 23, 21),
                                  col = ifelse(!is.na(impact), rgb(0, 0, 1, .75), rgb(.33, .33, .33, .5)),
                                  bg = ifelse(!is.na(impact), rgb(.5, .5, 1, .75), rgb(.67, .67, .67, .5))))
    regionplot.highlight(gp, highlight_style = highlight_style)
  }
  if ('ld' %in% tolower(style)) {
    gl <- regionplot.new(chrom = xregion$chrom, pos_start = xregion$pos_start, pos_end = xregion$pos_end,
                 pmin = pmin, 
                 main = main, fdesc = fdesc, 
                 protein_coding_only = protein_coding_only, 
                 dbc = dbc)
    if (!missing(signal)) mtext(paste0('conditional for signal #', signal), 2, 2) # DRY should this go into regionplot.new?
    ## best order for plotting
    ## Plot variants shaded by LD
    pvals <- pvals[order(!is.na(pvals$impact), -log10(pvals$pval)), ]
    ## current db format means r<0.01 is not included in table, hence substitute missing with zero
    pvals$r[is.na(pvals$r)] <- 0. #FIXME dangerous to return incorrect numerical data
    pvals$alpha <- .5 + .25*pvals$r^2 # currently use a single alpha for col and bg for coding and noncoding, so precalculate
    with(pvals, regionplot.points(pos, pval,
                                  pch = ifelse(!is.na(impact), 23, 21),
                                  col = ifelse(!is.na(impact), rgb(0, 0, 0, alpha), rgb(.33, .33, .33, alpha)),
                                  bg = ifelse(!is.na(impact), 
                                              rgb((1 - r^2)*.67, (1 - r^2)*.67, .33*r^2 + .67, alpha),
                                              rgb(.33*r^2 + .67, (1 - r^2)*.67, (1 - r^2)*.67, alpha))))
    ## (re)draw the index variant for pairwise LD over the top, in larger size and with no transparency
    pval1 <- merge(pvals, attr(pvals, 'index_ld')[ , c('chrom', 'pos', 'ref', 'alt')])
    if (!is.null(pval1) && !is.na(pval1) && nrow(pval1) > 0) {
      with(pval1, {
        text((xregion$pos_start + xregion$pos_end)*0.5, 
              .75*gl$yline[5] + .25*gl$yline[4],
            labels = glue('Pairwise LD with index chr{chrom}:{pos}:{ref}>{alt}'),
            pos = 3, cex = 0.75)
        arrows(x0 = (xregion$pos_start + xregion$pos_end)*0.5, 
               y0 = .75*gl$yline[5] + .25*gl$yline[4], 
               y1 = .5*gl$yline[5] + .5*gl$yline[4], 
               length = 0, lty = 'dotted')
        arrows(x0 = (xregion$pos_start + xregion$pos_end)*0.5, 
               y0 = .5*gl$yline[5] + .5*gl$yline[4],
               x1 = pos, 
               y1 = .25*gl$yline[5] + .75*gl$yline[4],
               length = 0, lty = 'dotted')
        arrows(x0 = pos, 
               y0 = .25*gl$yline[5] + .75*gl$yline[4],
               y1 = pmin(-log10(pval), plot_ymax), # FIXME still not quite right
               length = 0, lty = 'dotted')
        regionplot.points(pos, pval, 
                           cex = 1.5, 
                           pch = ifelse(!is.na(impact), 23, 21),
                           col = ifelse(!is.na(impact), rgb(0, 0, 0, alpha = 1), rgb(.33, .33, .33, alpha = 1)),
                           bg = ifelse(!is.na(impact), 
                                       rgb(0, 0, 1, alpha = 1),
                                       rgb(1, 0, 0, alpha = 1)))
      })
    }

    regionplot.highlight(gp, highlight_style = highlight_style)
  }
  if ('signal' %in% tolower(style)) {
    gl <- regionplot.new(chrom = xregion$chrom, pos_start = xregion$pos_start, pos_end = xregion$pos_end,
                 pmin = pmin, 
                 main = main, fdesc = fdesc, 
                 protein_coding_only = protein_coding_only, 
                 dbc = dbc)
    if (!missing(signal)) mtext(paste0('conditional for signal #', signal), 2, 2) # DRY should this go into regionplot.new?
    ## best order for plotting is highest pp on top, with impact on top of that
    pvals <- pvals[order(!is.na(pvals$impact), pvals$pp_signal), ]
    pvals <- within(pvals, {
        pprel <- pp_signal/max(pp_signal, na.rm = TRUE) # relative posterior prob for colouring
        alpha <- .5 + .25*pprel # currently use a single alpha for col and bg for coding and noncoding, so precalculate
    })
    with(pvals, regionplot.points(pos, pval,
                                  pch = ifelse(!is.na(impact), 23, 21),
                                  cex = ifelse(cs_signal, 1.25, .75), 
                                  col = ifelse(!is.na(impact), rgb(0, 0, 0, alpha), rgb(.33, .33, .33, alpha)),
                                  bg = ifelse(cs_signal,
                                              ifelse(!is.na(impact),
                                                     rgb((1 - pprel)*.67, (1 - pprel)*.67, .33*pprel + .67, alpha),
                                                     rgb(.33*pprel + .67, (1 - pprel)*.67, (1 - pprel)*.67, alpha)),
                                              rgb(.67, .67, .67, .5))))
    regionplot.highlight(gp, highlight_style = highlight_style)
    # Add extra annotation to indicate CLEO conditional signal is being plotted
    if (!missing(signal)) {
      with(pvals[which.max(pvals$pp_signal), ], {
        #arrows(x0 = pos, y0 = .75*gl$yline[5] + .25*gl$yline[4], 
        #       y1 = -log10(pval), 
        #       length = 0, lty = 'dotted') 
        text(pos, -log10(max(pval, pmin)), labels = paste0('#', signal), pos = 3, cex = 0.75)
        # FIXME this may not work when yaxis truncation is on
      })
    }
  }
  if ('signals' %in% tolower(style)) {
    gl <- regionplot.new(chrom = xregion$chrom, pos_start = xregion$pos_start, pos_end = xregion$pos_end,
                 pmin = pmin, 
                 main = main, fdesc = fdesc, 
                 protein_coding_only = protein_coding_only, 
                 dbc = dbc)

    ## catch situation where CLEO results are present but no signals
    if ('signal' %in% names(pvals)) {
      signals <- sort(unique(na.omit(pvals$signal)))
    } else {
      signals <- NULL
    }
    if (length(signals) == 0L) {
      futile.logger::flog.warn('No CLEO results, using single signal results instead')
      ## no signals, so use single signal results as if they were the CLEO results
      pvals <- within(pvals, {
        cs_cleo <- cs_signal
        pp_cleo <- pp_signal
        signal <- ifelse(cs_signal, 'default', NA)
      })
      signals <- 'default'
    }
    
    ## best order for plotting
    ## Plot variants coloured/sized by credible set, with VEP annotation is diamond shape in top layer
    pvals <- pvals[order(!is.na(pvals$impact), pvals$pp_cleo), ]

    ## colvec is vector of identifying colour for each signal
    if (length(signals) == 1L) {
        colvec <- rgb(1, 0, 0) # red
    } else {
        colvec <- rainbow(length(signals), start = .6667, end = .3333) # blue-magenta-red-yellow-green avoiding cyan part of spectrum
    }

    bg_cleo <- rep(rgb(.67, .67, .67, alpha = .5), nrow(pvals))
    for (idx in 1:length(signals)) {
        ww <- which(pvals$signal == signals[idx])
        if (length(ww) > 0L) {
            pprel <- pvals$pp_cleo[ww]
            pprel <- pprel/max(pprel, na.rm = TRUE) # relative pp *within signal*
            bg_cleo[ww] <- rgb((col2rgb(colvec[idx])[1]/255 - .67)*pprel + .67, 
                               (col2rgb(colvec[idx])[2]/255 - .67)*pprel + .67,
                               (col2rgb(colvec[idx])[3]/255 - .67)*pprel + .67, 
                               alpha = .5)
        }
    }

    with(pvals, regionplot.points(pos, pval,
                                  pch = ifelse(!is.na(impact), 23, 21), 
                                  cex = ifelse(cs_cleo, 1.25, .75),
                                  bg = bg_cleo, 
                                  col = ifelse(!is.na(impact), rgb(0, 0, 0, .5), rgb(.33, .33, .33, .5))))

    ## legend indicating signals
    ## ssi = summary stats for index variants
    ssi <- attr(pvals, 'index_cleo')
    if (!is.null(ssi) && nrow(ssi) > 1) {
      ssi <- ssi[match(signals, ssi$signal), ] # order to match signals and colvec
      ssi$keypos <- seq(from = xregion$pos_start, to = xregion$pos_end, length.out = nrow(ssi) + 2)[rank(ssi$pos) + 1]
      arrows(x0 = ssi$keypos, y0 = .75*gl$yline[5] + .25*gl$yline[4], 
             y1 = .5*gl$yline[5] + .5*gl$yline[4], 
             length = 0, lty = 'dotted')
      arrows(x0 = ssi$keypos, y0 = .5*gl$yline[5] + .5*gl$yline[4],
             x1 = ssi$pos, y1 = .25*gl$yline[5] + .75*gl$yline[4],
             length = 0, lty = 'dotted')
      arrows(x0 = ssi$pos, y0 = .25*gl$yline[5] + .75*gl$yline[4],
             y1 = pmin(-log10(ssi$pval), plot_ymax), # FIXME still not quite right
             length = 0, lty = 'dotted')
      points(ssi$keypos, rep(.75*gl$yline[5] + .25*gl$yline[4], nrow(ssi)),
             pch = 21, col = rgb(.33, .33, .33, .5), bg = colvec, cex = 1)
      text(ssi$keypos, rep(.75*gl$yline[5] + .25*gl$yline[4], nrow(ssi)),
           labels = paste0('#', signals), pos = 4, cex = 0.75)
      text(mean(ssi$keypos), .75*gl$yline[5] + .25*gl$yline[4],
           labels = 'CLEO index variants', pos = 3, cex = 0.75)
      #legend("bottomleft", pch = 21, ,
      #       pt.bg = colvec, legend=paste0('#', signals),
      #       horiz=T, bty="n", cex=.5)
    } else {
      text((xregion$pos_start + xregion$pos_end)/2., .75*gl$yline[5] + .25*gl$yline[4],
           labels = 'No CLEO index variants', pos = 3, cex = 0.75)
    }
    regionplot.highlight(gp, highlight_style = highlight_style)
  } 

  ## restore par
  par(oldpar)

  ## for when called from within GUI tools, return 
  ## the pvalue dataframe.  Should also return information on the genelayout. 
  return(invisible(pvals))
}

#' @export
regionplot.data <- function(analysis, entity, signal, 
                            chrom, pos_start, pos_end, pos, 
                            hgncid, ensemblid, rs, 
                            surround = 500000,
                            style = 'signals',
                            priorsd = 1, priorc = 1e-5, cs_size = 0.95, 
                            maf_ge, rsq_ge,
                            emac_ge, case_emac_ge, 
                            dbc = getOption("gtx.dbConnection", NULL)) {
    ## check database connection
    gtxdbcheck(dbc)

    style <- tolower(style)
    
    ## Determine x-axis range from arguments
    xregion <- gtxregion(chrom = chrom, pos_start = pos_start, pos_end = pos_end, pos = pos, 
                         hgncid = hgncid, ensemblid = ensemblid, rs = rs, 
                         signal = signal, analysis = analysis, entity = entity, 
                         surround = surround,
                         dbc = dbc)
    chrom = xregion$chrom # note, overwriting command line arguments
    pos_start = xregion$pos_start
    pos_end = xregion$pos_end
    
    ## If required, determine entity and associated info including entity_label
    xentity <- gtxentity(analysis, entity = entity, hgncid = hgncid, ensemblid = ensemblid, dbc = dbc)
    
    ## always query marginal p-values
    ## seems more flexible to query CLEO results separately and merge within R code
    if (missing(signal)) {
      gtx_info('Querying marginal p-values')
      t0 <- as.double(Sys.time())
      pvals <- sqlWrapper(dbc,
                          sprintf('SELECT gwas_results.chrom, gwas_results.pos, gwas_results.ref, gwas_results.alt, pval, impact %s FROM %sgwas_results LEFT JOIN vep USING (chrom, pos, ref, alt) WHERE %s AND %s AND %s AND %s AND pval IS NOT NULL;',
                                  if (any(c('signal', 'signals') %in% tolower(style))) ', beta, se, rsq, freq' else '', 
                                  gtxanalysisdb(analysis, dbc = dbc), 
                                  gtxwhat(analysis1 = analysis),
                                  xentity$entity_where, # (entity=...) or (True)
                                  gtxwhere(chrom, pos_ge = pos_start, pos_le = pos_end, tablename = 'gwas_results'),
                                  gtxfilter(maf_ge = maf_ge, rsq_ge = rsq_ge, emac_ge = emac_ge, case_emac_ge = case_emac_ge, analysis = analysis, dbc = dbc)),
                          uniq = FALSE)
      t1 <- as.double(Sys.time())
      gtx_info('Query returned {nrow(pvals)} variants in query region {xregion$label} in {round(t1 - t0, 3)}s.')
    } else {
      gtx_info('Querying conditional p-values')
      t0 <- as.double(Sys.time())
      pvals <- sqlWrapper(dbc,
                          sprintf('SELECT gwas_results_cond.chrom, gwas_results_cond.pos, 
                                          gwas_results_cond.ref, gwas_results_cond.alt, 
                                          beta_cond AS beta, se_cond AS se, impact 
                                   FROM %sgwas_results_cond 
                                   LEFT JOIN vep 
                                   USING (chrom, pos, ref, alt) 
                                   WHERE %s AND %s AND %s;',
                                  gtxanalysisdb(analysis, dbc = dbc), 
                                  where_from(analysisu = analysis, signalu = signal, tablename = 'gwas_results_cond'), 
                                  xentity$entity_where, # (entity=...) or (True)
                                  gtxwhere(chrom, pos_ge = pos_start, pos_le = pos_end, tablename = 'gwas_results_cond')),
                          uniq = FALSE)
      t1 <- as.double(Sys.time())
      gtx_info('Query returned {nrow(pvals)} variants in query region {xregion$label} in {round(t1 - t0, 3)}s.')
      pvals$pval <- with(pvals, pchisq((beta/se)^2, df = 1, lower.tail = FALSE))
    }
      
    if (any(!is.finite(pvals$pval))) {
      gtx_warn('Removing {sum(!is.finite(pvals$pval))} variants with non-finite P-values')
      pvals <- pvals[is.finite(pvals$pval), ]      
      gtx_debug('After non-finite P-values removed, {nrow(pvals)} variants in query region {xregion$label}')
    }
    
    if (any(c('signal', 'signals') %in% tolower(style))) {
        futile.logger::flog.debug('Finemapping under single signal assumption')
        ## cs_only = FALSE since we still want to plot/return variants not in the credible set
        pvals <- fm_signal(pvals, priorsd = priorsd, priorc = priorc, cs_size = cs_size, cs_only = FALSE)
    }
    
    if ('signals' %in% tolower(style)) {
        ## get CLEO credible sets only, since we effectively left join with this, for plot colouring
        futile.logger::flog.debug('Finemapping under multiple signals (CLEO) assumptions')
        fmres <- fm_cleo(analysis = analysis, chrom = chrom, pos_start = pos_start, pos_end = pos_end,
                         priorsd = priorsd, priorc = priorc, cs_size = cs_size, cs_only = TRUE,
                         dbc = dbc)
        ## note fm_cleo already prints logging messages about number of signals
        if (nrow(fmres) > 0) {
            ## sort by decreasing posterior probability, match each
            ## row or pvals with *first* match in fmres
            ## thus linking any variants in more than one
            ## credible set, with the one for the signal for which
            ## it has higher probability
            fmres <- fmres[order(fmres$pp_cleo, decreasing = TRUE), ]
            pvals <- cbind(pvals,
                           fmres[match(with(pvals, paste(chrom, pos, ref, alt, sep = '_')),
                                       with(fmres, paste(chrom, pos, ref, alt, sep = '_'))),
                                 c('signal', 'cs_cleo', 'pp_cleo')])
            ## FIXME in case one variant is in more than one credible
            ## set, it would be better to cbind like we do above with signal,
            ## but then cbind with the *marginal* pp's from aggregating over signals
            pvals$pp_cleo[is.na(pvals$pp_cleo)] <- 0. # make zero to be safe with sorting/plotting
            pvals$cs_cleo[is.na(pvals$cs_cleo)] <- FALSE # make FALSE to be safe with tables/plotting
            ## Propagate CLEO index variants as attr()ibute
            attr(pvals, 'index_cleo') <- attr(fmres, 'index_cleo')
        } else {
            # do nothing
        }
    }

    if ('ld' %in% tolower(style)) {
        # merge with LD information in userspace code since we need to
        # pull the p-values first to find the top hit or otherwise chosen variant

        # note if style='ld', sort order is by ld with index variant,
        # otherwise sort by pval (see else block)

        # Determine Index Variant
        pval1 <- NULL # will be used to store chrom/pos/ref/alt of the index variant
                      # and using NULL to indicate not successfully selected
        # First, if a single pos argument was provided, try to use this as index variant
        if (!missing(pos)) {
          if (identical(length(pos), 1L)) {
            ## funny syntax to avoid subset(pvals, pos %in% pos)
            pval1 <- pvals[pvals$pos == pos, c('chrom', 'pos', 'ref', 'alt'), drop = FALSE]
            if (identical(nrow(pval1), 1L)) {
              pval1$r <- 1
              gtx_debug('Selected pairwise LD index chr{pval1$chrom}:{pval1$pos}:{pval1$ref}>{pval1$alt} (selected by pos argument)')
            } else {
              gtx_warn('Skipping pos argument [ {paste(pos, collapse = \', \')} ] for pairwise LD index selection because {nrow(pval1)} variants match')
              pval1 <- NULL
            }
          } else {
            gtx_warn('Skipping pos argument [ {paste(pos, collapse = \', \')} ] for pairwise LD index selection because multiple values')
          }
        }
        # Next, if a single rs argument was provided, try to use this as index variant
        if (is.null(pval1) && !missing(rs)) {
          if (identical(length(rs), 1L)) {
            # query this rs id
            qrs <- sqlWrapper(dbc,
                       sprintf('SELECT chrom, pos, ref, alt FROM sites WHERE %s;',
                               gtxwhere(chrom = chrom, rs = rs)),
                        uniq = FALSE, zrok = TRUE)
            # use merge as a way to subset on match by all of chrom/pos/ref/alt
            # note default merge is all.x = FALSE, all.y = FALSE
            pval1 <- merge(pvals, qrs)[ , c('chrom', 'pos', 'ref', 'alt'), drop = FALSE]
            if (identical(nrow(pval1), 1L)) {
              pval1$r <- 1
              gtx_debug('Selected pairwise LD index chr{pval1$chrom}:{pval1$pos}:{pval1$ref}>{pval1$alt} (selected by rs argument)')
            } else {
              gtx_warn('Skipping rs argument [ {paste(rs, collapse = \', \')} ] for pairwise LD index selection because {nrow(pval1)} variants match')
              pval1 <- NULL
            }
          } else {
            gtx_warn('Skipping rs argument [ {paste(rs, collapse = \', \')} ] for pairwise LD index selection because multiple values')            
          }
        }
        # Next, pick variant with at least one ld value and with smallest P-value 
        if (is.null(pval1)) {
          # Query all variants present on LHS (chrom1, pos1, ref1, alt1) of pairwise LD table
          # Note that if db guarantees 1:1 between variants in the pairwise TABLE ld, and the
          # per-variant TABLE ldref, this can be done more efficiently
          # Notes:  cannot use gtxwhere because of nonstandard names chrom1, pos1
          has_ld <- sqlWrapper(dbc, 
                          sprintf('SELECT chrom1 AS chrom, pos1 AS pos, ref1 AS ref, alt1 AS alt, True AS has_ld \
                                       FROM ld \
                                       WHERE chrom1 = \'%s\' AND pos1 >= %s AND pos1 <= %s \
                                       GROUP BY chrom1, pos1, ref1, alt1;',
                                  sanitize1(xregion$chrom, values = c(1:22, 'X')), # should be a type for chrom  
                                  sanitize1(xregion$pos_start, type = 'int'),
                                  sanitize1(xregion$pos_end, type = 'int')),
                               uniq = FALSE, zrok = TRUE)
          has_ld <- within(merge(has_ld, pvals[ , c('chrom', 'pos', 'ref', 'alt', 'pval'), drop = FALSE],
                                 all.x = FALSE, all.y = TRUE),
                           has_ld[is.na(has_ld)] <- FALSE)
          if (nrow(has_ld) == 0L || all(!has_ld$has_ld)) {
            gtx_warn('Skipping pairwise LD index selection because no variants have pairwise LD data')
          } else {
            # order so we can warn how many smaller P-value variants were skippedcount 
            has_ld <- has_ld[order(has_ld$pval), , drop = FALSE]
            if (has_ld$has_ld[1]) {
              # smallest P-value variant has LD, so okay to use this
              pval1 <- has_ld[1, c('chrom', 'pos', 'ref', 'alt'), drop = FALSE]
              pval1$r <- 1
              gtx_debug('Selected pairwise LD index chr{pval1$chrom}:{pval1$pos}:{pval1$ref}>{pval1$alt} (smallest P-value)')
            } else {
              w_has_ld <- which(has_ld$has_ld)[1] # guaranteed length >=1 by checks above
              pval1 <- has_ld[w_has_ld, c('chrom', 'pos', 'ref', 'alt'), drop = FALSE]
              pval1$r <- 1
              gtx_warn('Pairwise LD index selection skipped {w_has_ld - 1} variants with -log10(p)<={round(log10(has_ld$pval[w_has_ld]/has_ld$pval[1]), 2)} smaller, with no pairwise LD data')
              gtx_debug('Selected pairwise LD index chr{pval1$chrom}:{pval1$pos}:{pval1$ref}>{pval1$alt} (smallest P-value with pairwise LD data)')
            }
          }
        }
        if (!is.null(pval1)) {
          stopifnot(identical(nrow(pval1), 1L)) # This will fail if has_ld was all FALSE or had zero rows, see above
          gtx_debug('Querying pairwise LD with index chr{pval1$chrom}:{pval1$pos}:{pval1$ref}>{pval1$alt}')
          ld1 <- sqlWrapper(dbc, 
                          sprintf('SELECT chrom2 AS chrom, pos2 AS pos, ref2 AS ref, alt2 AS alt, r FROM ld
                                   WHERE chrom1=\'%s\' AND pos1=%s AND ref1=\'%s\' AND alt1=\'%s\';',
                                  pval1$chrom, pval1$pos, pval1$ref, pval1$alt), # should we sanitize
                          uniq = FALSE, zrok = TRUE)
          pvals <- merge(pvals, rbind(pval1, ld1), all.x = TRUE, all.y = FALSE)
          # sort by decreasing r^2 (will be resorted for plotting, so makes 
          # a .data() call work as a useful proxy search
          pvals <- pvals[order(pvals$r^2, decreasing = TRUE), ]
          attr(pvals, 'index_ld') <- pval1
          if (mean(is.na(pvals$r)) > 0.25) { # threshold seems high but pairwise LD often missing form low frequency variants
            gtx_warn('Pairwise LD with index chr{pval1$chrom}:{pval1$pos}:{pval1$ref}>{pval1$alt} missing for {round(mean(is.na(pvals$r))*100)}% of variants, check warnings and debugging messages for possible cause')
          }
        } else {
          gtx_warn('Selection of pairwise LD index failed, check warnings and debugging messages for possible cause')
          pvals$r <- NA
          attr(pvals, 'index_ld') <- NA
        }
    } else {
        ## sort by increasing pval
        pvals <- pvals[order(pvals$pval), ]
    }
    
    attr(pvals, 'analysis') <- analysis
    attr(pvals, 'region') <- xregion
    attr(pvals, 'entity') <- xentity
    return(pvals)
}

#' @export
regionplot.new <- function(chrom, pos_start, pos_end, pos, 
                           hgncid, ensemblid, rs, surround = 500000, 
                           pmin = 1e-10, main, fdesc, 
                           protein_coding_only = TRUE,   
                           dbc = getOption("gtx.dbConnection", NULL)) {
  gtxdbcheck(dbc)

  ## Determine x-axis range from arguments
  xregion <- gtxregion(chrom = chrom, pos_start = pos_start, pos_end = pos_end, pos = pos, 
                       hgncid = hgncid, ensemblid = ensemblid, rs = rs, surround = surround,
                       dbc = dbc)
  chrom = xregion$chrom
  pos_start = xregion$pos_start
  pos_end = xregion$pos_end

  ## Determine y-axis upper limit
  ## removed +0.5 since space above allocated by regionplot.genelayout()
  ymax <- ceiling(-log10(pmin))

  ## Determine amount of y-axis space needed for gene annotation
  gl <- regionplot.genelayout(chrom, pos_start, pos_end, ymax, protein_coding_only = protein_coding_only, dbc = dbc)
  gtx_debug('gene layout set up for ymax={gl$ymax} yline[1]={gl$yline[1]} yline[4]={gl$yline[4]} yline[5]={gl$yline[5]}')
  ## Set up plotting area
  plot.new()
  plot.window(c(pos_start, pos_end), range(gl$yline * 1.04, na.rm=TRUE)) # y axis *1.04 adjustment for using par(yaxs='i')

  ## Draw axes, and axis labels
  abline(h = 0, col = "grey")
  with(list(xpretty = pretty(c(pos_start, pos_end)*1e-6)),
       axis(1, at = xpretty*1e6, labels = xpretty))
  with(list(ypretty = pretty(c(0, ymax))),
       axis(2, at = ypretty[ypretty <= ymax], las = 1))
  mtext(paste0("chr", chrom, " position (Mb)"), 1, 3)
  mtext(expression(paste("Association ", -log[10](paste(P, "-value")))), 2, 3)

  ## Add recombination rate and gene annotation
  ##  regionplot.recombination determines position range from par("usr")
  regionplot.recombination(chrom, yoff = mean(gl$yline[2:3]), dbc = dbc)
  ##  regionplot.genedraw uses previously determined layout
  regionplot.genedraw(gl)

  ## Add title, scaled to fit if necessary, CHANGE TO USE mtext.fit() FIXME
  if (!missing(main)) {
    xplt <- par("plt")[2] - par("plt")[1] # figure as fraction of plot, assumes no subsequent changes to par("mar")
    mtext(main, 3, 1,
          cex = min(1., xplt/strwidth(main, units = 'figure')))
  }
  if (!missing(fdesc)) {
      mtext(fdesc, 3, 0, cex = 0.5)
  }

  ## Draw box last to overdraw any edge marks
  box()

  return(invisible(gl))
}

#' @export
regionplot.genedraw <- function(gl) {
  arrows(gl$genelayout$pos_start, gl$genelayout$y, x1 = gl$genelayout$pos_end,
         length = 0, lwd = 2, col = "blue")
  if (length(gl$genelayout$label) > 0) {
    text(gl$genelayout$pos_mid, gl$genelayout$y - .25*strheight("M", cex = gl$cex),
         gl$genelayout$label,
         adj = c(.5, 1), font = 3, cex = gl$cex, col = "blue")
  }
  return(invisible(NULL))
}

# each gene is assigned to a line (iline) such that the horizontal line
# and label will not overlap neighboring genes.  The y positions are
# then evenly distributed [add more explanation]
#
# returns a list with elements:
# cex:  value of cex used when computing the layout
# ymax: value of ymax used when computing the layout
# yline:  vector of length 5 delimiting the regions for the genes track, 
#                   zero [min for points], max for points, max for whole plot   
#' @export
regionplot.genelayout <- function (chrom, pos_start, pos_end, ymax, cex = 0.75, 
				   protein_coding_only = TRUE, 
                                   dbc = getOption("gtx.dbConnection", NULL)) {
  gtxdbcheck(dbc)
  
  yplt <- par("plt")[4] - par("plt")[3] # figure as fraction of plot, assumes no subsequent changes to par("mar")
  xplt <- par("plt")[2] - par("plt")[1]
  xusr <- pos_end - pos_start # par("usr")[2] - par("usr")[1] 
  return(with(sqlWrapper(getOption('gtx.dbConnection_cache_genes', dbc), 
                       ## use SQL query to aggregate over multiple rows with same name to get whole span
                       sprintf('SELECT min(pos_start) AS pos_start, max(pos_end) AS pos_end, hgncid, ensemblid FROM genes WHERE %s %s GROUP BY ensemblid, hgncid ORDER BY pos_start', 
                               gtxwhere(chrom = chrom, pos_end_ge = pos_start, pos_start_le = pos_end),
			       if (protein_coding_only) 'AND genetype=\'protein_coding\'' else ''),
			uniq = FALSE, zrok = TRUE),
              {
                label <- ifelse(hgncid != '', as.character(hgncid), as.character(ensemblid)) # could also check NULL or NA?
                ## compute start and end plot positions for each gene, using larger of transcript line and gene name annotation
                pos_mid <- .5*(pos_start + pos_end)
                xwidth <- strwidth(label, units = 'figure', cex = cex)*xusr/xplt
                xpad <- max(10000, strwidth('M', units = 'figure', cex = cex)*xusr/xplt)
                xstart <- pmin(pos_start, pos_mid - .5*xwidth) - xpad
                xend <- pmax(pos_end, pos_mid + .5*xwidth) + xpad
                ## layout engine based on putting each gene on the highest line possible
                iline <- rep(NA, length(label))
                if (length(label) > 0) {
                  iline[1] <- 0
                  if (length(label) > 1) {
                    for (idx in 2:length(label)) {
                      iline[idx] <- with(rbind(aggregate(xend[1:(idx - 1)], by = list(tline = iline[1:(idx - 1)]), FUN = max),
                                               data.frame(tline = max(iline[1:(idx - 1)]) + 1, x = -Inf)),
                                         min(tline[x < xstart[idx]]))
                    }
                  }
                }
                fy <- 0 # fraction of total figure y space needed
                if (length(iline) > 0) {
                  fy <- max(iline + 1)*2*strheight("M", units = "figure", cex = cex)/yplt # fraction of total figure region needed
                  if (fy > 0.5) {
                    futile.logger::flog.warn('Squashing gene annotation to fit within .5 of plot area')
                    fy <- 0.5
                  }
                }
                yd1 = ymax/(0.8-fy) * 0.1
                yd2 = ymax/(0.8-fy) * fy
                yline <- c(-(yd1 + yd2), -yd1, 0, ymax, ymax + yd1)
                y <- numeric(0)
                if (length(iline) > 0) {
                  y <- -(iline/max(iline + 1)*yd2 + yd1)
                }
                list(cex = cex, ymax = ymax, yline = yline, 
                     genelayout = data.frame(label, pos_start, pos_end, pos_mid, iline, y))
              }))
}

regionplot.points <- function(pos, pval, labels, 
                              pch = 21, bg = rgb(.67, .67, .67, .5), col = rgb(.33, .33, .33, .5), cex = 1, 
                              pos.labels = 3, col.labels = rgb(0, 0, 0), cex.labels = 0.5, 
                              ymax, 
                              suppressWarning = FALSE) {
  # auto-detecting ymax (==yline[4]), works because:
  # par('usr')[5] == yline[5] # true since changed to par(yaxs='i')
  # par('usr')[4] == yline[1] # ibid
  # yline[5]-yline[4] is 10% of plot always (by genelayout())
  # reverse the *1.04 applied in regionplot.new plus
  # plus a 0.5 'epsilon' to avoid effect of numerical imprecision on floor()  
  if (missing(ymax)) ymax <- floor(sum(par('usr')[c(3, 4)]*c(.1, .9)/1.04) + 0.5)
  y <- -log10(pval)
  f <- y > ymax
  if (missing(labels)) {
    points(pos, ifelse(f, ymax, y), pch = pch, col = col, bg = bg, cex = cex)
  } else {
    text(x = pos, y = ifelse(f, ymax, y), labels = labels, 
         pos = pos.labels, col = col.labels, cex = cex.labels)
  }
  if (any(f) && !suppressWarning) {
    # would be nice to more cleanly overwrite y axis label
    axis(2, at = ymax, labels = substitute({}>=ymax, list(ymax = ymax)), las = 1)
    # draw at left edge, otherwise multiple regionplot.points() generate 
    # partly overlapping  (and hence illegible) warnings
    text(par('usr')[1] + strwidth('M', cex = 0.5), 
         ymax, '(plot truncated)', adj = c(0, 0.5), cex = 0.5)
    return(FALSE)
  }
  return(ifelse(f, ymax, y))
}

regionplot.highlight <- function(pvals, highlight_style) {
  stopifnot(is.data.frame(pvals))
  if (all(c('pos', 'pval') %in% names(pvals))) {
        pvals <- subset(pvals, !is.na(pos) & !is.na(pval))
        if (nrow(pvals) > 0) {
            if ('circle' %in% tolower(highlight_style)) {
                regionplot.points(pvals$pos, pvals$pval, 
                                  pch = 1, cex = 2, col = rgb(0, 0, 0, .75),
                                  suppressWarning = TRUE) # suppress since points already drawn
            }
            if ('pos' %in% tolower(highlight_style)) {
                regionplot.points(pvals$pos, pvals$pval, 
                                  labels = pvals$pos,
                                  suppressWarning = TRUE) # suppress since points already drawn
            }
            ## in future will add options if 'rs' %in% highlight_style etc.
        }
    } else {
        # silently do nothing, e.g. if pvals==data.frame(NULL)
    }
    return(invisible(NULL))
}

regionplot.recombination <- function(chrom, pos_start, pos_end, yoff = -.5, 
                                     dbc = getOption("gtx.dbConnection", NULL)) {
  gtxdbcheck(dbc)

  ## Recombination segments (r) that fall wholly or partly within [pos_start, pos_end]
  ## have r.pos_end >= pos_start and r.pos_start <= pos_end
  if (missing(pos_start)) pos_start = floor(par("usr")[1])
  if (missing(pos_end)) pos_end = ceiling(par("usr")[2])

  with(sqlWrapper(dbc, 
                sprintf('SELECT pos_start, pos_end, recombination_rate FROM genetic_map WHERE %s', 
                        gtxwhere(chrom = chrom, pos_end_ge = pos_start, pos_start_le = pos_end)),
		uniq = FALSE, zrok = TRUE),
       {
         abline(h = yoff, col = "grey")
         yscale <- (par("usr")[4] - yoff)*.75/max(recombination_rate)
         lines(c(pos_start, pos_end[length(pos_end)]),
               yoff + c(recombination_rate, recombination_rate[length(recombination_rate)])*yscale,
               type = "s", col = "cyan3")
         with(list(ypretty = pretty(c(0, max(recombination_rate)))),
              axis(4, at = yoff + ypretty*yscale, labels = ypretty, las = 1))
       })
  mtext("Recombination rate (cM/Mb)", 4, 3)
  return(invisible(NULL))
}

#library for region plot
#Author: Li Li, modified based on Toby's code



#draw recomination rate 
#' @export
recomb.draw <- function(chr, from.bp, to.bp, ylo, yhi, recomb){
  if (is.integer(chr)) chr <- as.character(chr)
  if(chr == "23") chr <- "X"
  chr <- ifelse(substr(chr, 1, 3) == "chr", chr, paste("chr", chr, sep = ""))
  #recomb <- read.table(paste("genetic_map_chr", chr, ".txt", sep=""), header=T)
  #load recombination info
  recomb.chr <- recomb[[chr]][-1]
  idx.start<- which(recomb.chr[,1] < from.bp)
  idx.end<- which(recomb.chr[,1] > to.bp)
  keep.recomb <-recomb.chr[idx.start[length(idx.start)]: idx.end[1],]
  big.range <- yhi -ylo
  max.curr<- max(keep.recomb[,2], na.rm = T)
  #print (paste("Max recomb rate in the region is", max.curr))
  ticks.curr <- pretty(c(0, max.curr))
  max.curr<- ticks.curr[length(ticks.curr )]
  lines(keep.recomb[,1], ylo + ( ( keep.recomb[,2] / max.curr ) * big.range), type="l", 
        col="lightblue", lwd=1)
  axis(4, at=ylo + big.range /max.curr * ticks.curr, labels=ticks.curr, las=1, hadj = 0)
  mtext("Recomb rate (cM/Mb)", side=4, at=(ylo+big.range/2), line=2.5) 
}



################################################################################
#gwas: gwas results with columns for MARKER, chr, POS (bp), qcflag, genoflag, pvalues
#chr pos: hit snp
#flanking: flanking rang (kb) for plotting
#col.pvals: column names for p values to be plotted by different shapes (21:25, 3,4,7:14) 
#plabel: label for p value column
#col.r2: column name for r2 to index SNP. Background colored by LD to index SNP (white to red)
################################################################################
#' @export
regionplot.multiP <- function(gwas, chr, pos, flanking = 250, col.r2=NULL,
                              col.pvals = c("P1","P2"), plabel = NULL, gencode = gencode, recomb=recomb,
                              main = "", main.sub = NULL, miny = 8, crity = -log10(5e-08)) {
  to.bp <- min(pos+flanking *1000, max(gwas$POS, na.rm=T))
  from.bp <- max(pos - flanking *1000, min(gwas$POS, na.rm = T))
  stopifnot(to.bp > from.bp)
  stopifnot(all(col.pvals %in% names(gwas)))
  sel <- gwas$CHROM == chr & gwas$POS >= from.bp & gwas$POS <= to.bp
  cat(sum(sel), "SNPs selected\n")
  pos.sel<- gwas$POS[sel]
  minP <- 1e-50 #minimum value for p value
  for( c in col.pvals) gwas[[c]][!is.na(gwas[[c]]) & gwas[[c]] < minP] <- minP
  lp <- -log10(as.matrix(subset(gwas, sel, col.pvals))) # lp = -log10(p)
  if (is.null(crity)) crity <- -log10(0.05/nrow(lp))
  if (is.null(plabel)) plabel <- col.pvals
  r2.sel <- rep(1, sum(sel))
  if(!is.null(col.r2)) {
    r2.sel<- gwas[[col.r2]][sel]
    r2.sel[is.na(r2.sel)]<- 0
    r2.sel <- pmin(1, pmax(0, 1-r2.sel))
  }
 
 
  plot.new()
  yhi<- max(c(miny, 1.05 * max(lp, na.rm = TRUE)))
  y.tick<- pretty(c(0, yhi))
  yhi <- y.tick[length(y.tick)]
  scale  <- yhi/8
  plot.window(c(from.bp, to.bp), c(-6 *scale,  yhi))
  
  abline(h = seq(0, 7, 1), col = "lightgrey", lty = "dotted")
  abline(h = crity, col = "red", lty = "dashed")
  ppch <- c(21:25, 3,4,7:14) #rep(21:25, 5)
  index <- pos.sel == pos 
  for (idx in 1:ncol(lp)) {
    points(pos.sel[-index], lp[-index , idx], pch = ppch[idx], col = "black",  bg = rgb(1, r2.sel[-index], r2.sel[-index]), cex = 1)
    if(any(index)) #index snp
      points(pos, lp[index, idx],  pch = ppch[idx], col = rgb(0, 0, 1),  bg = rgb(0, 0, 1), cex = 1.3)
  }   
  legend("topright",   ncol = 2,   bty = "n",  pch = ppch[1:ncol(lp)], 
           text.col =  "black", cex = 0.8,  legend = plabel)
  for(i in 0:10){
    polygon(from.bp + (c(0, 1, 1, 0)+i) *(to.bp-from.bp)/80, yhi- c(0.5, 0.5, 1, 1)*scale, 
            border = "grey", col = rgb(1, 1-i/10, 1-i/10)) 
  }
  text(from.bp, yhi - 0 *scale , expression(paste("LD Map Type:","r"^"2")), adj = 0, cex = 0.8)
  text(from.bp+seq(0.5, 10.5, 2) * (to.bp-from.bp)/80, rep(yhi, 6)- 1.5*scale, 
       labels =c(0, 0.2, 0.4, 0.6, 0.8,1),  cex = 0.7)
  
  xm <- pretty(c(from.bp, to.bp)*1e-6) # x marks
  axis(1, at = xm*1e6, labels = xm)
  axis(2, at = y.tick, las = 1)
  mtext(expression(-log[10](italic(P))), side=2, at=yhi/2, line=2.5) 
  recomb.draw(chr, from.bp, to.bp, -1*scale, yhi, recomb)                        
  gene.draw(paste("chr", chr, sep = ""), from.bp, to.bp, gencode, 
            yhi = -2*scale, ylo = -6*scale,exony = 0.05*scale, genecex = 0.7)
  mtext(paste("Chromosome ", chr, " genomic position (Mb)", sep = ""), side = 1, line = 2.5)
  title(main = main)  # ylab = expression(-log[10](italic(P))),
  if(!is.null(main.sub))
    title(main = main.sub, cex.main = 1, col.main = "blue", line = 0.5)
  box()
  return(invisible(NULL))
}