motifBreakR: A Package For Predicting The Disruptiveness Of Single Nucleotide Polymorphisms On Transcription Factor Binding Sites

Documented in calculatePvalue exportMBbed exportMBtable findSupportingRemapPeaks motifbreakR plotMB shiny_motifbreakR

## scoreMotif.R this module will score an arbitrary list of SNPs for motif
## disruption using a buffet of menu options for some sets of transcription
## factors

## optional: produce random SNP sets of equivalent size to the index SNP list to
## simulate background for enrichment calculations.

## define utility functions 'defaultOmega' (and related fxns) and maxPwm and
## minPwm

prepareVariants <- function(fsnplist, genome.bsgenome, max.pwm.width) {
  k <- max.pwm.width; rm(max.pwm.width)
  ref_len <- nchar(fsnplist$REF)
  alt_len <- nchar(fsnplist$ALT)
  is.indel <- ref_len > 1L | alt_len > 1L
  ## check that reference matches ref genome
  equals.ref <- getSeq(genome.bsgenome, fsnplist) == fsnplist$REF
  if (!all(equals.ref)) {
    stop(paste(names(fsnplist[!equals.ref]), "reference allele does not match value in reference genome. ",
               sep = " "))
  }
  if (sum(is.indel) < length(is.indel) & sum(is.indel) > 0L) {
    fsnplist.indel <- fsnplist[is.indel]
    fsnplist.snv <- fsnplist[!is.indel]
  } else if (sum(is.indel) == length(is.indel)) {
    fsnplist.indel <- fsnplist
    fsnplist.snv <- NULL
  } else if (sum(is.indel) == 0L) {
    fsnplist.indel <- NULL
    fsnplist.snv <- fsnplist
  }
  if (!is.null(fsnplist.indel)) {
    snp.sequence.ref.indel <- getSeq(genome.bsgenome, promoters(fsnplist.indel, upstream = k - 1,
                                                                downstream = k + max(nchar(fsnplist.indel$REF))))
    at <- as(IRanges(start = k, width = width(fsnplist.indel)), "IRangesList")
    snp.sequence.alt.indel <- DNAStringSet(Map(replaceAt,
                                               x = snp.sequence.ref.indel,
                                               at = at,
                                               fsnplist.indel$ALT))

    need.alignment <- !(lengths(fsnplist.indel$REF) == 1 | lengths(fsnplist.indel$ALT) == 1)
    insertion.var <- lengths(fsnplist.indel$REF) < lengths(fsnplist.indel$ALT)
    other.var <- lengths(fsnplist.indel$REF) == lengths(fsnplist.indel$ALT)
    fsnplist.indel$ALT_loc <- 1L
    fsnplist.indel[insertion.var]$ALT_loc <- Map(seq,
                                                 from = nchar(fsnplist.indel[insertion.var]$REF) + 1L,
                                                 to = nchar(fsnplist.indel[insertion.var]$ALT))
    fsnplist.indel[!insertion.var]$ALT_loc <- Map(seq,
                                                  from = nchar(fsnplist.indel[!insertion.var]$ALT) + 1L,
                                                  to = nchar(fsnplist.indel[!insertion.var]$REF))
    ref.len <- nchar(fsnplist.indel[nchar(fsnplist.indel$REF) == nchar(fsnplist.indel$ALT)]$REF)
    fsnplist.indel[nchar(fsnplist.indel$REF) == nchar(fsnplist.indel$ALT)]$ALT_loc <- lapply(ref.len,
                                                                                             function(x) {
                                                                                               seq(from = 1L,
                                                                                                   to = x)
                                                                                             })
    fsnplist.indel$varType <- "Other"
    if (sum(insertion.var) > 0) {
      fsnplist.indel[insertion.var, ]$varType <- "Insertion"
    }
    if (sum(other.var) > 0){
      need.alignment[other.var] <- FALSE
    }
    if (sum(lengths(fsnplist.indel$REF) > lengths(fsnplist.indel$ALT)) > 0) {
      fsnplist.indel[lengths(fsnplist.indel$REF) > lengths(fsnplist.indel$ALT)]$varType <- "Deletion"
    }
    if (any(need.alignment)) {
      need.del <- any(!insertion.var & need.alignment)
      need.ins <- any(insertion.var & need.alignment)
      if (need.del) {
        pattern.del <- Map(matchPattern,
                           fsnplist.indel[!insertion.var & need.alignment]$ALT,
                           fsnplist.indel[!insertion.var & need.alignment]$REF,
                           with.indels = FALSE, max.mismatch = 0)
        names(pattern.del) <- names(fsnplist.indel[!insertion.var & need.alignment])
        pattern.del <- sapply(pattern.del, function(x) {
          slen <- length(subject(x))
          x <- x[start(x) == 1 | end(x) == slen]
          if (length(x) > 0) {
            x <- x[1]
            x <- gaps(x)
            x <- as(x, "IRanges")
          }
        })
        pattern.del.valid <- sapply(pattern.del, function(x) {length(x) > 0})
        nr.pattern.del <- lapply(pattern.del[pattern.del.valid], function(x) {start(x):end(x)})
        fsnplist.indel[!insertion.var & need.alignment][names(pattern.del[pattern.del.valid])]$ALT_loc <- nr.pattern.del
        if (any((!insertion.var & need.alignment)[!pattern.del.valid])) {
          alignment.del <- Map(pairwiseAlignment,
                               fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$ALT,
                               fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$REF,
                               type = "global")
          alt.width <- width(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$ALT)
          ref.width <- width(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid]$REF)
          names(alignment.del) <- names(fsnplist.indel[!insertion.var & need.alignment][!pattern.del.valid])
          alignment.ins <- sapply(sapply(alignment.del, insertion), unlist)
          alignment.del <- sapply(sapply(alignment.del, deletion), unlist)
          alignment.del.valid <- sapply(alignment.del, function(x) {length(x) > 0})
          alignment.ins.valid <- sapply(alignment.ins, function(x) {length(x) > 0})
          full.replace <- (alignment.del.valid & alignment.ins.valid & alt.width == ref.width)
          alignment.del.valid <- alignment.del.valid & !full.replace
          nr.alignment.del <- lapply(alignment.del[alignment.del.valid], function(x) {start(x):end(x)})
          fsnplist.indel[!insertion.var & need.alignment][names(alignment.del[alignment.del.valid])]$ALT_loc <- nr.alignment.del
          rm(alignment.del, nr.alignment.del)
        }
        rm(pattern.del, nr.pattern.del, pattern.del.valid, need.del)
      }
      if (need.ins) {
        pattern.ins <- Map(matchPattern,
                           fsnplist.indel[insertion.var & need.alignment]$REF,
                           fsnplist.indel[insertion.var & need.alignment]$ALT,
                           with.indels = FALSE, max.mismatch = 0)
        names(pattern.ins) <- names(fsnplist.indel[insertion.var & need.alignment])
        pattern.ins <- sapply(pattern.ins, function(x) {
          slen <- length(subject(x))
          x <- x[start(x) == 1 | end(x) == slen]
          if (length(x) > 0) {
            x <- x[1]
            x <- gaps(x)
            x <- as(x, "IRanges")
          }
        })
        pattern.ins.valid <- sapply(pattern.ins, function(x) {length(x) > 0})
        nr.pattern.ins <- lapply(pattern.ins[pattern.ins.valid], function(x) {start(x):end(x)})
        fsnplist.indel[insertion.var & need.alignment][names(pattern.ins[pattern.ins.valid])]$ALT_loc <- nr.pattern.ins
        if (any((insertion.var & need.alignment)[!pattern.ins.valid])) {
          alignment.ins <- Map(pairwiseAlignment,
                               fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid]$ALT,
                               fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid]$REF,
                               type = "global")
          names(alignment.ins) <- names(fsnplist.indel[insertion.var & need.alignment][!pattern.ins.valid])
          alignment.ins <- sapply(sapply(alignment.ins, insertion), unlist)
          alignment.ins.valid <- sapply(alignment.ins, function(x) {length(x) > 0})
          nr.alignment.ins <- lapply(alignment.ins[alignment.ins.valid], function(x) {start(x):end(x)})
          fsnplist.indel[insertion.var & need.alignment][names(alignment.ins[alignment.ins.valid])]$ALT_loc <- nr.alignment.ins
          rm(alignment.ins, nr.alignment.ins)
        }
        rm(pattern.ins, nr.pattern.ins, pattern.ins.valid, need.ins)
      }
      rm(ref.len, insertion.var, need.alignment)
    }
  }
  if (!is.null(fsnplist.snv)) {
    snp.sequence.ref.snv <- getSeq(genome.bsgenome, promoters(fsnplist.snv, upstream = k - 1,
                                                              downstream = k + 1))
    at <- matrix(FALSE, nrow = length(snp.sequence.ref.snv), ncol = (k * 2))
    at[, k] <- TRUE
    snp.sequence.alt.snv <- replaceLetterAt(snp.sequence.ref.snv, at, fsnplist.snv$ALT)
    fsnplist.snv$ALT_loc <- 1L
    fsnplist.snv$varType <- "SNV"
  }
  if (sum(is.indel) < length(is.indel) & sum(is.indel) > 0L) {
    fsnplist <- c(fsnplist.indel, fsnplist.snv)
    snp.sequence.alt <- strsplit(as.character(c(snp.sequence.alt.indel,
                                                snp.sequence.alt.snv)), "")
    snp.sequence.ref <- strsplit(as.character(c(snp.sequence.ref.indel,
                                                snp.sequence.ref.snv)), "")
    rm(fsnplist.indel, fsnplist.snv,
       snp.sequence.alt.indel, snp.sequence.ref.indel,
       snp.sequence.alt.snv, snp.sequence.ref.snv)
  } else if (sum(is.indel) == length(is.indel)) {
    fsnplist <- fsnplist.indel
    snp.sequence.alt <- strsplit(as.character(snp.sequence.alt.indel), "")
    snp.sequence.ref <- strsplit(as.character(snp.sequence.ref.indel), "")
    rm(fsnplist.indel, snp.sequence.alt.indel, snp.sequence.ref.indel)
  } else if (sum(is.indel) == 0L) {
    fsnplist <- fsnplist.snv
    snp.sequence.alt <- strsplit(as.character(snp.sequence.alt.snv), "")
    snp.sequence.ref <- strsplit(as.character(snp.sequence.ref.snv), "")
    rm(fsnplist.snv, snp.sequence.alt.snv, snp.sequence.ref.snv)
  }
  rm(at); gc()
  return(list(fsnplist = fsnplist,
              ref.seq = snp.sequence.ref,
              alt.seq = snp.sequence.alt))
}

## An evaluator function for SNP effect

varEff <- function(allelR, allelA) {
  score <- allelA - allelR
  if (abs(score) >= 0.7) {
    return(list(score = score, effect = "strong"))
  } else if (abs(score) > 0.4) {
    return(list(score = score, effect = "weak"))
  } else {
    return(list(score = score, effect = "neut"))
  }
}

reverseComplementMotif <- function(pwm) {
  rows <- rownames(pwm)
  cols <- colnames(pwm)
  Ns <- pwm["N", ]
  pwm <- pwm[4:1, length(cols):1]
  pwm <- rbind(pwm, Ns)
  rownames(pwm) <- rows
  colnames(pwm) <- cols
  return(pwm)
}


scoreSeqWindows <- function(ppm, seq) {
  ppm.width <- ncol(ppm)
  seq.len <- length(seq)
  diag.ind <- rep.int(ppm.width, seq.len - ppm.width)
  ranges <- vapply(c(0L, cumsum(diag.ind)),
                   function(x,
                            range = (1L + 0L:(ppm.width - 1L) * (ppm.width + 1L)))
                   {
                     x + range
                   },
                   integer(ppm.width))
  scores <- t(ppm[seq, ])[ranges]
  scores_rc <- t(reverseComplementMotif(ppm)[seq, ])[ranges]
  scores <- split(scores, ceiling(seq_along(scores)/ppm.width))
  scores_rc <- split(scores_rc, ceiling(seq_along(scores_rc)/ppm.width))
  res <- vapply(Map(function(x, y) {matrix(data = c(x, y), nrow = 2,
                                           byrow = TRUE, dimnames = list(c(1, 2)))},
                    scores, scores_rc),
                rowSums,
                numeric(2))
  return(res)
}

maxThresholdWindows <- function(window.frame) {
  start.ind <- as.integer(colnames(window.frame)[1]) - 1L
  max.win <- arrayInd(which.max(window.frame), dim(window.frame))
  return(list(window = as.integer(colnames(window.frame)[max.win[, 2] + start.ind]),
              strand = c(1, 2)[max.win[, 1]]))
}


#' @import methods
#' @import GenomicRanges
#' @import S4Vectors
#' @import BiocGenerics
#' @import IRanges
#' @importFrom Biostrings getSeq replaceLetterAt reverseComplement complement replaceAt matchPattern
#' @importFrom pwalign pairwiseAlignment insertion deletion
#' @importFrom TFMPvalue TFMpv2sc
#' @importFrom stringr str_locate_all str_sub
scoreSnpList <- function(fsnplist, pwmList, method = "default", bkg = NULL,
                         threshold = 1e-3, show.neutral = FALSE, verbose = FALSE,
                         genome.bsgenome=NULL, pwmList.pc = NULL, pwmRanges = NULL, filterp=TRUE) {
  k <- max(sapply(pwmList, ncol))
  snp.sequence.alt <- fsnplist$alt.seq
  snp.sequence.ref <- fsnplist$ref.seq
  fsnplist <- fsnplist$fsnplist

  res.el.e <- new.env()
  for (snp.map.i in seq_along(snp.sequence.alt)) {
    snp.ref <- snp.sequence.ref[[snp.map.i]]
    snp.alt <- snp.sequence.alt[[snp.map.i]]
    ref.len <- nchar(fsnplist[snp.map.i]$REF)
    alt.len <- nchar(fsnplist[snp.map.i]$ALT)
    alt.loc <- fsnplist[snp.map.i]$ALT_loc[[1]]
    res.el <- rep(fsnplist[snp.map.i], length(pwmList))
    res.el$motifPos <- as.integer(NA)
    res.el$motifID <- mcols(pwmList)$providerID
    res.el$geneSymbol <- mcols(pwmList)$geneSymbol
    res.el$dataSource <- mcols(pwmList)$dataSource
    res.el$providerName <- mcols(pwmList)$providerName
    res.el$providerId <- mcols(pwmList)$providerId
    res.el$seqMatch <- as.character(NA)
    res.el$pctRef <- as.numeric(NA)
    res.el$pctAlt <- as.numeric(NA)
    res.el$scoreRef <- as.numeric(NA)
    res.el$scoreAlt <- as.numeric(NA)
    if (filterp) {
      res.el$Refpvalue <- as.numeric(NA)
      res.el$Altpvalue <- as.numeric(NA)
    }
    res.el$altPos <- as.numeric(NA)
    res.el$alleleDiff <- as.numeric(NA)
    res.el$alleleEffectSize <- as.numeric(NA)
    res.el$effect <- as.character(NA)
    for (pwm.i in seq_along(pwmList)) {
      pwm.basic <- pwmList[[pwm.i]]
      pwm <- pwmList.pc[[pwm.i]]
      len <- ncol(pwm)
      thresh <- threshold[[pwm.i]]
      seq.start <- min(alt.loc)
      seq.len <- length(alt.loc)
      alt.range <- ref.range <- (k - (ncol(pwm) - seq.start)):(k + ncol(pwm) + seq.start + seq.len - 2)
      if (!show.neutral & identical(snp.ref[ref.range], snp.alt[alt.range])) next()
      seq.remove <- ref.len - alt.len
      if (seq.remove < 0) {
        ref.range <- ref.range[1:(length(ref.range) + seq.remove)]
      } else {
        alt.range <- alt.range[1:(length(alt.range) - seq.remove)]
      }
      ref.windows <- scoreSeqWindows(ppm = pwm, seq = snp.ref[ref.range])
      alt.windows <- scoreSeqWindows(ppm = pwm, seq = snp.alt[alt.range])
      pass.effect <- ifelse(filterp,
                            any(alt.windows > thresh) | any(ref.windows > thresh),
                            any(((alt.windows - pwmRanges[[pwm.i]][1]) / (pwmRanges[[pwm.i]][2] - pwmRanges[[pwm.i]][1]) > thresh)) |
                                any((ref.windows - pwmRanges[[pwm.i]][1]) / (pwmRanges[[pwm.i]][2] - pwmRanges[[pwm.i]][1]) > thresh))
      if (pass.effect) {
        hit.alt <- maxThresholdWindows(alt.windows)
        hit.ref <- maxThresholdWindows(ref.windows)
        bigger <- ref.windows[hit.ref$strand, hit.ref$window] >= alt.windows[hit.alt$strand, hit.alt$window]
        if (bigger) {
          hit <- hit.ref
        } else {
          hit <- hit.alt
        }
      } else {
        hit.alt <- list(window = 0L, strand = 0L)
        hit.ref <- list(window = 0L, strand = 0L)
        hit <- NULL
      }
      if (!show.neutral) {
        if (identical(alt.windows[hit.alt$strand, hit.alt$window],
                      ref.windows[hit.ref$strand, hit.ref$window])) next()
      }
      if (!is.null(hit)) {
        result <- res.el[pwm.i]
        uniquename <- paste(names(result), result$dataSource, result$providerName, result$providerId, sep = "%%")
        allelR <- ref.windows[hit.ref$strand, hit.ref$window]
        allelA <- alt.windows[hit.alt$strand, hit.alt$window]
        scorediff <- varEff(allelR, allelA)
        effect <- scorediff$effect
        score <- scorediff$score
        ref.pos <- k:(k + nchar(result$REF) - 1L)
        alt.pos <- k:(k + nchar(result$ALT) - 1L)
        if ((effect == "neut" & show.neutral) | effect != "neut") {
          res.el.e[[uniquename]] <- updateResultsIndel(result,
                                                       snp.ref, snp.alt,
                                                       ref.pos, alt.pos,
                                                       hit.ref, hit.alt,
                                                       ref.windows, alt.windows,
                                                       score, effect, len,
                                                       k, pwm, calcp = filterp)
        }
      }
    }
  }
  resultSet <- unlist(GRangesList(as.list.environment(res.el.e)), use.names = FALSE)
  if (length(resultSet) < 1) {
    if (verbose) {
      message(paste("reached end of SNPs list length =", length(fsnplist),
                    "with 0 potentially disruptive matches to", length(unique(resultSet$geneSymbol)),
                    "of", length(pwmList), "motifs."))
    }
    return(NULL)
  } else {
    if ("ALT_loc" %in% names(mcols(resultSet))) mcols(resultSet)$ALT_loc <- NULL
    max.match <- max(vapply(str_locate_all(resultSet$seqMatch, "\\w"), max, integer(1)))
    min.match <- min(vapply(str_locate_all(resultSet$seqMatch, "\\w"), min, integer(1)))
    resultSet$seqMatch <- str_sub(resultSet$seqMatch,
                                  start = min.match + 1,
                                  end = max.match + 1)
    if (verbose) {
      message(paste("reached end of SNPs list length =", length(fsnplist),
                    "with", length(resultSet), "potentially disruptive matches to", length(unique(resultSet$geneSymbol)),
                    "of", length(pwmList), "motifs."))
    }
    return(resultSet)
  }
}

#' @importFrom matrixStats colRanges
#' @importFrom stringr str_pad
#' @importFrom TFMPvalue TFMsc2pv
updateResultsIndel <- function(result,
                               ref.seq, alt.seq,
                               ref.pos, alt.pos,
                               hit.ref, hit.alt,
                               ref.windows, alt.windows,
                               score, effect, len, k, pwm, calcp) {
  strand.opt <- c("+", "-")
  if (score > 0L) {
    best.hit <- hit.alt
    matchs <- alt.seq
    snp.pos <- alt.pos
  } else {
    best.hit <- hit.ref
    matchs <- ref.seq
    snp.pos <- ref.pos
  }

  strand(result) <- strand.opt[[best.hit$strand]]
  best.hit$window <- as.integer(best.hit$window)
  mresult <- mcols(result)
  alt_loc <- range(mresult$ALT_loc)
  ref_start <- (1 - alt_loc[[1]])
  ref_start <- ifelse(ref_start <= 0, ref_start - 1, ref_start)
  motif.start <- (alt_loc[[1]]) + (-len) + (best.hit$window) + ref_start
  motif.start <- ifelse(motif.start >= 0, motif.start + 1, motif.start)
  if ((mresult$varType == "Insertion" & score < 0) |
      (mresult$varType == "Deletion" & score > 0)) {
    motif.end <- motif.start + len
  } else {
    if (motif.start > 0) {
      motif.end <- len - length(motif.start:length(alt_loc[1]:alt_loc[2]))
    } else {
      motif.end <- motif.start + len - length(alt_loc[1]:alt_loc[2])
    }
  }
  motif.end <- ifelse(motif.end <= 0, motif.end - 1, motif.end)
  mresult$motifPos <- list(c(motif.start, motif.end))
  mresult$altPos <- mresult$ALT_loc
  seq.range <- (k - (len - alt_loc[[1]])):(k + len + alt_loc[[2]] - 2)
  matchs[-(snp.pos)] <- tolower(matchs[-(snp.pos)])
  matchs <- paste(matchs[seq.range], collapse = "")
  mresult[["seqMatch"]] <- str_pad(matchs, width = (k * 2) + alt_loc[[2]], side = "both")
  pwmrange <- colSums(colRanges(pwm[-5,]))
  mresult[["scoreRef"]] <- ref.windows[hit.ref$strand, hit.ref$window]
  mresult[["scoreAlt"]] <- alt.windows[hit.alt$strand, hit.alt$window]
  mresult[["pctRef"]] <- (mresult[["scoreRef"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
  mresult[["pctAlt"]] <- (mresult[["scoreAlt"]] - pwmrange[[1]]) / (pwmrange[[2]] - pwmrange[[1]])
  if (calcp) {
    mresult[["Refpvalue"]] <- NA
    mresult[["Altpvalue"]] <- NA
  }
  mresult[["alleleDiff"]] <- score
  mresult[["effect"]] <- effect
  mresult[["alleleEffectSize"]] <- score/pwmrange[[2]]
  mcols(result) <- mresult
  return(result)
}

#' @importFrom matrixStats colMaxs colMins
preparePWM <- function(pwmList,
                       filterp,
                       bkg,
                       scoreThresh,
                       method = "default") {

  bkg <- bkg[c('A', 'C', 'G', 'T')]
  scounts <- as.integer(mcols(pwmList)$sequenceCount)
  mCGMAT <- pwmList@manuallyCuratedGeneMotifAssociationTable
  scounts[is.na(scounts)] <- 20L
  pwmList.pc <- Map(function(pwm, scount) {
    pwm <- (pwm * scount + 0.25)/(scount + 1)
  }, pwmList, scounts)
  if (method == "ic") {
    pwmOmegas <- lapply(pwmList.pc, function(pwm, b=bkg) {
      omegaic <- colSums(pwm * log2(pwm/b))
    })
  }
  if (method == "default") {
    pwmOmegas <- lapply(pwmList.pc, function(pwm) {
      omegadefault <- colMaxs(pwm) - colMins(pwm)
    })
  }
  if (method == "log") {
    pwmList.pc <- lapply(pwmList.pc, function(pwm, b) {
      pwm <- log(pwm) - log(b)
    }, b = bkg)
    pwmOmegas <- 1
  }
  if (method == "notrans") {
    pwmOmegas <- 1
  }
  pwmList.pc <- Map(function(pwm, omega) {
    if (length(omega) == 1 && omega == 1) {
      return(pwm)
    } else {
      omegamatrix <- matrix(rep(omega, 4), nrow = 4, byrow = TRUE)
      pwm <- pwm * omegamatrix
    }
  }, pwmList.pc, pwmOmegas)
  pwmRanges <- Map(function(pwm, omega) {
    x <- colSums(colRanges(pwm))
    return(x)
  }, pwmList.pc, pwmOmegas)
  if (filterp) {
    pwmList.pc2 <- lapply(pwmList.pc, round, digits = 2)
    pwmThresh <- lapply(pwmList.pc2, TFMpv2sc, pvalue = scoreThresh, bg = bkg, type = "PWM")
    pwmThresh <- Map("+", pwmThresh, -0.02)
  } else {
    pwmThresh <- rep.int(scoreThresh, times = length(pwmRanges))
  }
  pwmList@listData <- lapply(pwmList, function(pwm) {
    pwm <- pwm[c("A", "C", "G", "T"), ]
    pwm <- rbind(pwm, N = 0)
    colnames(pwm) <- as.character(1:ncol(pwm))
    return(pwm) })
  pwmList.pc <- lapply(pwmList.pc, function(pwm) {
    pwm <- pwm[c("A", "C", "G", "T"), ]
    pwm <- rbind(pwm, N = 0)
    colnames(pwm) <- as.character(1:ncol(pwm))
    return(pwm) })
  return(list(pwmList = pwmList,
              pwmListPseudoCount = pwmList.pc,
              pwmRange = pwmRanges,
              pwmThreshold = pwmThresh))
}


#' Predict The Disruptiveness Of Single Nucleotide Polymorphisms On
#' Transcription Factor Binding Sites.
#'
#' @param snpList The output of \code{snps.from.rsid} or \code{snps.from.file}
#' @param pwmList An object of class \code{MotifList} containing the motifs that
#'   you wish to interrogate
#' @param threshold Numeric; the maximum p-value for a match to be called or a minimum score threshold
#' @param method Character; one of \code{default}, \code{log}, \code{ic}, or \code{notrans}; see
#'   details.
#' @param bkg Numeric Vector; the background probabilites of the nucleotides
#'   used with method=\code{log} method=\code{ic}
#' @param filterp Logical; filter by p-value instead of by pct score.
#' @param show.neutral Logical; include neutral changes in the output
#' @param verbose Logical; if running serially, show verbose messages
#' @param BPPARAM a BiocParallel object see \code{\link[BiocParallel]{register}}
#'   and see \code{getClass("BiocParallelParam")} for additional parameter
#'   classes.  Try \code{BiocParallel::registered()} to see what's availible and
#'   for example \code{BiocParallel::bpparam("SerialParam")} would allow serial
#'   evaluation.
#' @seealso See \code{\link{snps.from.rsid}} and \code{\link{snps.from.file}} for
#'   information about how to generate the input to this function and
#'   \code{\link{plotMB}} for information on how to visualize it's output
#' @details \pkg{motifbreakR} works with position probability matrices (PPM). PPM
#' are derived as the fractional occurrence of nucleotides A,C,G, and T at
#' each position of a position frequency matrix (PFM). PFM are simply the
#' tally of each nucleotide at each position across a set of aligned
#' sequences. With a PPM, one can generate probabilities based on the
#' genome, or more practically, create any number of position specific
#' scoring matrices (PSSM) based on the principle that the PPM contains
#' information about the likelihood of observing a particular nucleotide at
#' a particular position of a true transcription factor binding site. What
#' follows is a discussion of the three different algorithms that may be
#' employed in calls to the \pkg{motifbreakR} function via the \code{method}
#' argument.
#'
#' Suppose we have a frequency matrix \eqn{M} of width \eqn{n} (\emph{i.e.} a
#' PPM as described above). Furthermore, we have a sequence \eqn{s} also of
#' length \eqn{n}, such that
#' \eqn{s_{i} \in \{ A,T,C,G \}, i = 1,\ldots n}{s_i in {A,T,G,C}, i = 1 \ldots n}.
#' Each column of
#' \eqn{M} contains the frequencies of each letter in each position.
#'
#' Commonly in the literature sequences are scored as the sum of log
#' probabilities:
#'
#' \strong{Equation 1}
#'
#' \deqn{F( s,M ) = \sum_{i = 1}^{n}{\log( \frac{M_{s_{i},i}}{b_{s_{i}}} )}}{
#' F( s,M ) = \sum_(i = 1)^n log ((M_s_i,_i)/b_s_i)}
#'
#' where \eqn{b_{s_{i}}}{b_s_i} is the background frequency of letter \eqn{s_{i}}{s_i} in
#' the genome of interest. This method can be specified by the user as
#' \code{method='log'}.
#'
#' As an alternative to this method, we introduced a scoring method to
#' directly weight the score by the importance of the position within the
#' match sequence. This method of weighting is accessed by specifying
#' \code{method='ic'} (information content). A general representation
#' of this scoring method is given by:
#'
#' \strong{Equation 2}
#'
#' \deqn{F( s,M ) = p_{s} \cdot \omega_{M}}{F( s,M ) = p_s . \omega_M}
#'
#' where \eqn{p_{s}}{p_s} is the scoring vector derived from sequence \eqn{s} and matrix
#' \eqn{M}, and \eqn{w_{M}}{w_M} is a weight vector derived from \eqn{M}. First, we
#' compute the scoring vector of position scores \eqn{p}
#'
#' \strong{Equation 3}
#'
#' \deqn{p_{s} = ( M_{s_{i},i} ) \textrm{\ \ \ where\ \ \ } \frac{i = 1,\ldots n}{s_{i} \in \{ A,C,G,T \}}}{
#' p_s = ( M_s_i,_i ) where (i = 1 \ldots n)/(s_i in {A,C,G,T})}
#'
#' and second, for each \eqn{M} a constant vector of weights
#' \eqn{\omega_{M} = ( \omega_{1},\omega_{2},\ldots,\omega_{n} )}{\omega_M = ( \omega_1, \omega_2, \ldots, \omega_n)}.
#'
#' There are two methods for producing \eqn{\omega_{M}}{\omega_M}. The first, which we
#' call weighted sum, is the difference in the probabilities for the two
#' letters of the polymorphism (or variant), \emph{i.e.}
#' \eqn{\Delta p_{s_{i}}}{\Delta p_s_i}, or the difference of the maximum and minimum
#' values for each column of \eqn{M}:
#'
#' \strong{Equation 4.1}
#'
#' \deqn{\omega_{i} = \max \{ M_{i} \} - \min \{ M_{i} \}\textrm{\ \ \ \ where\ \ \ \ \ \ }i = 1,\ldots n}{
#' \omega_i = max{M_i} - min{M_i} where i = 1 \ldots n}
#'
#' The second variation of this theme is to weight by relative entropy.
#' Thus the relative entropy weight for each column \eqn{i} of the matrix is
#' given by:
#'
#' \strong{Equation 4.2}
#'
#' \deqn{\omega_{i} = \sum_{j \in \{ A,C,G,T \}}^{}{M_{j,i}\log_2( \frac{M_{j,i}}{b_{i}} )}\textrm{\ \ \ \ \ where\ \ \ \ \ }i = 1,\ldots n}{
#' \omega_i = \sum_{j in {A,C,G,T}} {M_(j,i)} log2(M_(j,i)/b_i) where i = 1 \ldots n}
#'
#' where \eqn{b_{i}}{b_i} is again the background frequency of the letter \eqn{i}.
#'
#' Thus, there are 3 possible algorithms to apply via the \code{method}
#' argument. The first is the standard summation of log probabilities
#' (\code{method='log'}). The second and third are the weighted sum and
#' information content methods (\code{method='default'} and \code{method='ic'}) specified by
#' equations 4.1 and 4.2, respectively. \pkg{motifbreakR} assumes a
#' uniform background nucleotide distribution (\eqn{b}) in equations 1 and
#' 4.2 unless otherwise specified by the user. Since we are primarily
#' interested in the difference between alleles, background frequency is
#' not a major factor, although it can change the results. Additionally,
#' inclusion of background frequency introduces potential bias when
#' collections of motifs are employed, since motifs are themselves
#' unbalanced with respect to nucleotide composition. With these cautions
#' in mind, users may override the uniform distribution if so desired. For
#' all three methods, \pkg{motifbreakR} scores and reports the reference
#' and alternate alleles of the sequence
#' (\eqn{F( s_{\textsc{ref}},M )}{F( s_ref,M )} and
#' \eqn{F( s_{\textsc{alt}},M )}{F( s_alt,M )}), and provides the matrix scores
#' \eqn{p_{s_{\textsc{ref}}}}{p_s_ref} and \eqn{p_{s_{\textsc{alt}}}}{p_s_alt} of the SNP (or
#' variant). The scores are scaled as a fraction of scoring range 0-1 of
#' the motif matrix, \eqn{M}. If either of
#' \eqn{F( s_{\textsc{ref}},M )}{F( s_ref,M )} and
#' \eqn{F( s_{\textsc{alt}},M )}{F( s_alt,M )} is greater than a user-specified
#' threshold (default value of 0.85) the SNP is reported. By default
#' \pkg{motifbreakR} does not display neutral effects,
#' (\eqn{\Delta p_{i} < 0.4}{\Delta p_i < 0.4}) but this behaviour can be
#' overridden.
#'
#' Additionally, now, with the use of \code{\link{TFMPvalue-package}}, we may filter by p-value of the match.
#' This is unfortunately a two step process. First, by invoking \code{filterp=TRUE} and setting a threshold at
#' a desired p-value e.g 1e-4, we perform a rough filter on the results by rounding all values in the PWM to two
#' decimal place, and calculating a scoring threshold based upon that. The second step is to use the function \code{\link{calculatePvalue}()}
#' on a selection of results which will change the \code{Refpvalue} and \code{Altpvalue} columns in the output from \code{NA} to the p-value
#' calculated by \code{\link{TFMsc2pv}}.  This can be (although not always) a very memory and time intensive process if the algorithm doesn't converge rapidly.
#'
#' @return a GRanges object containing:
#'  \item{REF}{the reference allele for the variant}
#'  \item{ALT}{the alternate allele for the variant}
#'  \item{snpPos}{the coordinates of the variant}
#'  \item{motifPos}{The position of the motif relative the the variant}
#'  \item{geneSymbol}{the geneSymbol corresponding to the TF of the TF binding motif}
#'  \item{dataSource}{the source of the TF binding motif}
#'  \item{providerName, providerId}{the name and id provided by the source}
#'  \item{seqMatch}{the sequence on the 5' -> 3' direction of the "+" strand
#'  that corresponds to DNA at the position that the TF binding motif was found.}
#'  \item{pctRef}{The score as determined by the scoring method, when the sequence contains the reference variant allele, normalized to a scale from 0 - 1. If \code{filterp = FALSE},
#'  this is the value that is thresholded.}
#'  \item{pctAlt}{The score as determined by the scoring method, when the sequence contains the alternate variant allele, normalized to a scale from 0 - 1. If \code{filterp = FALSE},
#'  this is the value that is thresholded.}
#'  \item{scoreRef}{The score as determined by the scoring method, when the sequence contains the reference variant allele}
#'  \item{scoreAlt}{The score as determined by the scoring method, when the sequence contains the alternate variant allele}
#'  \item{Refpvalue}{p-value for the match for the pctRef score, initially set to \code{NA}. see \code{\link{calculatePvalue}} for more information}
#'  \item{Altpvalue}{p-value for the match for the pctAlt score, initially set to \code{NA}. see \code{\link{calculatePvalue}} for more information}
#'  \item{altPos}{the position, relative to the reference allele, of the alternate allele}
#'  \item{alleleDiff}{The difference between the score on the reference allele and the score on the alternate allele}
#'  \item{alleleEffectSize}{The ratio of the \code{alleleDiff} and the maximal score of a sequence under the PWM}
#'  \item{effect}{one of weak, strong, or neutral indicating the strength of the effect.}
#'  each SNP in this object may be plotted with \code{\link{plotMB}}
#' @examples
#'  library(BSgenome.Hsapiens.UCSC.hg19)
#'  # prepare variants
#'  load(system.file("extdata",
#'                   "pca.enhancer.snps.rda",
#'                   package = "motifbreakR")) # loads snps.mb
#'  pca.enhancer.snps <- sample(snps.mb, 20)
#'  # Get motifs to interrogate
#'  data(hocomoco)
#'  motifs <- sample(hocomoco, 50)
#'  # run motifbreakR
#'  results <- motifbreakR(pca.enhancer.snps,
#'                         motifs, threshold = 0.85,
#'                         method = "ic",
#'                         BPPARAM=BiocParallel::SerialParam())
#' @import BiocParallel
#' @import parallel
#' @importFrom parallel clusterEvalQ
#' @importFrom BiocParallel bplapply
#' @importFrom stringr str_length str_trim
#' @export
motifbreakR <- function(snpList, pwmList, threshold = 0.85, filterp = FALSE,
                        method = "default", show.neutral = FALSE, verbose = FALSE,
                        bkg = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
                        BPPARAM = bpparam()) {
  ## Cluster / MC setup
  if (.Platform$OS.type == "windows" && inherits(BPPARAM, "MulticoreParam")) {
    warning(paste0("Serial evaluation under effect, to achive parallel evaluation under\n",
            "Windows, please supply an alternative BPPARAM"))
  }
  cores <- bpnworkers(BPPARAM)
  num.snps <- length(snpList)
  if (num.snps < cores) {
    cores <- num.snps
  }
  if (!(is(BPPARAM, "MulticoreParam")|is(BPPARAM, "SerialParam"))) {
    bpstart(BPPARAM)
    cl <- bpbackend(BPPARAM)
    clusterEvalQ(cl, library("MotifDb"))
  }
  ##
  ## Genome Setup
  genome.package <- attributes(snpList)$genome.package
  if (requireNamespace(eval(genome.package), quietly = TRUE, character.only = TRUE)) {
    genome.bsgenome <- eval(parse(text = paste(genome.package, genome.package, sep = "::")))
  } else {
    stop(paste0(eval(genome.package), " is the genome selected for this snp list and \n",
                "is not present on your environment. Please load it and try again."))
  }
  ##

  pwms <- preparePWM(pwmList = pwmList, filterp = filterp,
                     scoreThresh = threshold, bkg = bkg,
                     method = method)

  k <- max(sapply(pwms$pwmList, ncol))

  snpList <- prepareVariants(fsnplist = snpList,
                             genome.bsgenome = genome.bsgenome,
                             max.pwm.width = k)

  snpList_cores <- split(as.list(rep(names(snpList), times = cores)), 1:cores)
  for (splitr in seq_along(snpList)) {
    splitcores <- sapply(suppressWarnings(split(snpList[[splitr]], 1:cores)), list)
    for (splitcore in seq_along(snpList_cores)) {
      snpList_cores[[splitcore]][[splitr]] <- splitcores[[splitcore]]
      names(snpList_cores[[splitcore]])[splitr] <- names(snpList)[splitr]
    }
  }
  snpList <- snpList_cores; rm(snpList_cores)

  x <- bplapply(snpList, scoreSnpList,
                pwmList = pwms$pwmList, threshold = pwms$pwmThreshold,
                pwmList.pc = pwms$pwmListPseudoCount, pwmRanges = pwms$pwmRange,
                method = method, bkg = bkg, show.neutral = show.neutral,
                verbose = ifelse(cores == 1, verbose, FALSE), genome.bsgenome = genome.bsgenome,
                filterp = filterp, BPPARAM = BPPARAM)

  ## Cluster / MC cleanup
  if (inherits(x, "try-error")) {
    if (is(BPPARAM, "SnowParam")) {
      bpstop(BPPARAM)
    }
    stop(attributes(x)$condition)
  }
  if (is(BPPARAM, "SnowParam")) {
    bpstop(BPPARAM)
  }

  drops <- sapply(x, is.null)
  x <- x[!drops]
  pwmList <- pwms$pwmList
  pwmList@listData <- lapply(pwms$pwmList, function(pwm) { pwm <- pwm[c("A", "C", "G", "T"), ]; return(pwm) })
  pwmList.pc <- lapply(pwms$pwmListPseudoCount, function(pwm) { pwm <- pwm[c("A", "C", "G", "T"), ]; return(pwm) })

  if (length(x) > 1) {
    x <- unlist(GRangesList(unname(x)))
    snpList <- unlist(GRangesList(lapply(snpList, `[[`, "fsnplist")), use.names = FALSE)
    x <- x[order(match(names(x), names(snpList)), x$geneSymbol), ]
    attributes(x)$genome.package <- genome.package
    attributes(x)$motifs <- pwmList[mcols(pwmList)$providerId %in% unique(x$providerId) &
                                      mcols(pwmList)$providerName %in% unique(x$providerName), ]
    attributes(x)$scoremotifs <- pwmList.pc[names(attributes(x)$motifs)]
  } else {
    if (length(x) == 1L) {
      x <- x[[1]]
      attributes(x)$genome.package <- genome.package
      attributes(x)$motifs <- pwmList[mcols(pwmList)$providerId %in% unique(x$providerId) &
                                        mcols(pwmList)$providerName %in% unique(x$providerName), ]
      attributes(x)$scoremotifs <- pwmList.pc[names(attributes(x)$motifs)]
    } else {
      warning("No SNP/Motif Interactions reached threshold")
      x <- NULL
    }
  }
  if (verbose && cores > 1) {
    if (is.null(x)) {
      message(paste("reached end of SNPs list length =", num.snps, "with 0 potentially disruptive matches to",
                    length(unique(x$geneSymbol)), "of", length(pwmList), "motifs."))

    } else {
      message(paste("reached end of SNPs list length =", num.snps, "with",
                    length(x), "potentially disruptive matches to", length(unique(x$geneSymbol)),
                    "of", length(pwmList), "motifs."))
    }
  }
  return(x)
}


#' Calculate the significance of the matches for the reference and alternate alleles for the for their PWM
#'
#' @param results The output of \code{motifbreakR} that was run with \code{filterp=TRUE}
#' @param background Numeric Vector; the background probabilities of the nucleotides
#' @param granularity Numeric Vector; the granularity to which to round the PWM,
#'  larger values compromise full accuracy for speed of calculation. A value of
#'  \code{NULL} does no rounding.
#' @param BPPARAM a BiocParallel object see \code{\link[BiocParallel]{register}}
#'   and see \code{getClass("BiocParallelParam")} for additional parameter
#'   classes.  Try \code{BiocParallel::registered()} to see what's available and
#'   for example \code{BiocParallel::bpparam("SerialParam")} would allow serial
#'   evaluation.
#' @return a GRanges object. The same Granges object that was input as \code{results}, but with
#'  \code{Refpvalue} and \code{Altpvalue} columns in the output modified from \code{NA} to the p-value
#'  calculated by \code{\link{TFMsc2pv}}. Additionally a \code{pvalueEffect} column that indicates "strong"
#'  when the lower p-value (between ref and alt) is an order of magnitude or more different from
#'  the higher p-value, otherwise weak.
#' @seealso See \code{\link{TFMsc2pv}} from the \pkg{TFMPvalue} package for
#'   information about how the p-values are calculated.
#' @details This function is intended to be used on a selection of results produced by \code{\link{motifbreakR}}, and
#' this can be (although not always) a very memory and time intensive process if the algorithm doesn't converge rapidly.
#' @source H{\'e}l{\`e}ne Touzet and Jean-St{\'e}phane Varr{\'e} (2007) Efficient and accurate P-value computation for Position Weight Matrices.
#'  Algorithms for Molecular Biology, \bold{2: 15}.
#' @examples
#' data(example.results)
#' rs1006140 <- example.results[example.results$SNP_id %in% "rs1006140"]
#' # low granularity for speed; 1e-6 or 1e-7 recommended for accuracy
#' rs1006140 <- calculatePvalue(rs1006140, BPPARAM=BiocParallel::SerialParam(), granularity = 1e-4)
#'
# #' @importFrom qvalue qvalue
#'
#' @export
calculatePvalue <- function(results,
                            background = c(A = 0.25, C = 0.25, G = 0.25, T = 0.25),
                            granularity = NULL,
                            BPPARAM = BiocParallel::SerialParam()) {

  ## Cluster / MC setup
  if (.Platform$OS.type == "windows" && inherits(BPPARAM, "MulticoreParam")) {
    warning(paste0("Serial evaluation under effect, to achive parallel evaluation under\n",
                   "Windows, please supply an alternative BPPARAM"))
  }
  cores <- bpnworkers(BPPARAM)
  num.res <- length(results)
  if (num.res < cores) {
    cores <- num.res
  }
  if (!(is(BPPARAM, "MulticoreParam")|is(BPPARAM, "SerialParam"))) {
    bpstart(BPPARAM)
    cl <- bpbackend(BPPARAM)
    clusterEvalQ(cl, library("MotifDb"))
  }
  if(!("Refpvalue" %in% names(mcols(results)))) {
    stop('incorrect results format; please rerun analysis with filterp=TRUE')
  } else {
    pwmListmeta <- mcols(attributes(results)$motifs, use.names=TRUE)
    pwmList <- attributes(results)$scoremotifs
    if(!is.null(granularity)) {
      pwmList <- lapply(pwmList, function(x, g) {x <- floor(x/g)*g; return(x)}, g = granularity)
    }
    results_sp <- split(results, 1:length(results))
    pvalues <- bplapply(results_sp, function(i, pwmList, pwmListmeta, bkg) {
      result <- i
      pwm.id <- result$providerId
      pwm.name.f <- result$providerName
      pwmmeta <- pwmListmeta[pwmListmeta$providerId == pwm.id & pwmListmeta$providerName == pwm.name.f, ]
      pwm <- pwmList[[rownames(pwmmeta)[1]]]
      ref <- TFMsc2pv(pwm, mcols(result)[["scoreRef"]], bg = bkg, type="PWM")
      alt <- TFMsc2pv(pwm, mcols(result)[["scoreAlt"]], bg = bkg, type="PWM")
      gc()
      return(data.frame(ref=ref, alt=alt))
    }, pwmList=pwmList, pwmListmeta=pwmListmeta, bkg = background, BPPARAM = BPPARAM)
    ## Cluster / MC cleanup
    if (inherits(pvalues, "try-error")) {
      if (is(BPPARAM, "SnowParam")) {
        bpstop(BPPARAM)
      }
      stop(attributes(pvalues)$condition)
    }
    pvalues.df <- base::do.call("rbind", c(pvalues, make.row.names = FALSE))
    results$Refpvalue <- pvalues.df[, "ref"]
    results$Altpvalue <- pvalues.df[, "alt"]
    pscore <- with(results, ifelse(Refpvalue < Altpvalue, Altpvalue/Refpvalue, Refpvalue/Altpvalue))
    results$pvalueEffect <- ifelse(pscore > 10, "strong", "weak")

    if (is(BPPARAM, "SnowParam")) {
      bpstop(BPPARAM)
    }

    return(results)
  }
}

addPWM.stack <- function(identifier, index, GdObject, pwm_stack, ...) {
  plotMotifLogoStack.3(pwm_stack)
}

selcor <- function(identifier, index, GdObject, ... ) {
  if (identical(index, 1L)) {
    return(TRUE)
  } else {
    return(FALSE)
  }
}

selall <- function(identifier, GdObject, ... ) {
    return(TRUE)
}

#' @importFrom grid grid.newpage pushViewport viewport popViewport
plotMotifLogoStack.3 <- function(pfms, ...) {
  n <- length(pfms)
  lapply(pfms, function(.ele) {
    #if (class(.ele) != "pfm")
    if (!is(.ele, 'pfm'))
      stop("pfms must be a list of class pfm")
  })
  assign("tmp_motifStack_symbolsCache", list(), pos = ".GlobalEnv")
  # grid.newpage()
  ht <- 1/n
  y0 <- 0.5 * ht
  for (i in rev(seq.int(n))) {
    pushViewport(viewport(y = y0, height = ht))
    plotMotifLogo(pfms[[i]], motifName = pfms[[i]]@name, ncex = 1,
                  p = pfms[[i]]@background, colset = pfms[[i]]@color,
                  xlab = NA, newpage = FALSE, margins = c(1.5, 4.1,
                                                          1.1, 0.1), ...)
    popViewport()
    y0 <- y0 + ht
  }
  rm(list = "tmp_motifStack_symbolsCache", pos = ".GlobalEnv")
  return()
}

#' @importFrom stringr str_replace
#' @importFrom motifStack addBlank
DNAmotifAlignment.2snp <- function(pwms, result) {
  from <- min(sapply(result$motifPos, `[`, 1))
  to <- max(sapply(result$motifPos, `[`, 2))
  for (pwm.i in seq_along(pwms)) {
    ## get pwm info from result data
    pwm.name <- pwms[[pwm.i]]@name
    pwm.name <- str_replace(pwm.name, pattern = "-:rc$", replacement = "")
    pwm.name <- str_replace(pwm.name, pattern = "-:r$", replacement = "")
    pwm.info <- attributes(result)$motifs
    pwm.id <- mcols(pwm.info[pwm.name, ])$providerId
    pwm.name <- mcols(pwm.info[pwm.name, ])$providerName
    mresult <- result[result$providerId == pwm.id & result$providerName == pwm.name, ]
    mstart <- mresult$motifPos[[1]][1]
    mend <- mresult$motifPos[[1]][2]
    if ((mcols(mresult)$varType == "Insertion" & mcols(mresult)$alleleDiff < 0) |
        (mcols(mresult)$varType == "Deletion" & mcols(mresult)$alleleDiff > 0)) {
      new.mat <- cbind(pwms[[pwm.i]]@mat[, 1:abs(mresult$motifPos[[1]][1])],
                       matrix(c(0.25, 0.25, 0.25, 0.25), ncol = length(mcols(mresult)$altPos[[1]]), nrow = 4),
                       pwms[[pwm.i]]@mat[, (abs(mresult$motifPos[[1]][1]) + 1):ncol(pwms[[pwm.i]]@mat)])
      pwms[[pwm.i]]@mat <- new.mat
      start.offset <- mstart - from
      end.offset <- to - mend
    } else {
      if (mstart < 0 | from > 0) {
        start.offset <- mstart - from
      } else {
        start.offset <- (mstart - 1) - from
      }
      if (mend > 0 | to < 0) {
        end.offset <- to - mend
      } else {
        end.offset <- to - (mend + 1)
      }
    }
    if (start.offset > 0) {
      pwms[[pwm.i]] <- addBlank(x = pwms[[pwm.i]], n = start.offset, b = FALSE)
    }
    if (end.offset > 0) {
      pwms[[pwm.i]] <- addBlank(x = pwms[[pwm.i]], n = end.offset, b = TRUE)
    }
  }
  return(pwms)
}



#' Plot a genomic region surrounding a genomic variant, and potentially disrupted
#' motifs
#'
#' @param results The output of \code{motifbreakR}
#' @param rsid Character; the identifier of the variant to be visualized
#' @param reverseMotif Logical; if the motif is on the "-" strand show the
#'   the motifs as reversed \code{FALSE} or reverse complement \code{TRUE}
#' @param effect Character; show motifs that are strongly effected \code{c("strong")},
#'   weakly effected \code{c("weak")}, or both \code{c("strong", "weak")}
#' @param altAllele Character; The default value of \code{NULL} uses the first (or only)
#'   alternative allele for the SNP to be plotted.
#' @seealso See \code{\link{motifbreakR}} for the function that produces output to be
#'   visualized here, also \code{\link{snps.from.rsid}} and \code{\link{snps.from.file}}
#'   for information about how to generate the input to \code{\link{motifbreakR}}
#'   function.
#' @details \code{plotMB} produces output showing the location of the SNP on the
#'   chromosome, the surrounding sequence of the + strand, the footprint of any
#'   motif that is disrupted by the SNP or SNV, and the DNA sequence motif(s).
#'   The \code{altAllele} argument is included for variants like rs1006140 where
#'   multiple alternate alleles exist, the reference allele is A, and the alternate
#'   can be G,T, or C. \code{plotMB} only plots one alternate allele at a time.
#' @return plots a figure representing the results of \code{motifbreakR} at the
#'   location of a single SNP, returns invisible \code{NULL}.
#' @examples
#' data(example.results)
#' example.results
#' \donttest{
#' library(BSgenome.Hsapiens.UCSC.hg19)
#' plotMB(results = example.results, rsid = "rs1006140", effect = "strong", altAllele = "C")
#' }
#' @importFrom motifStack DNAmotifAlignment colorset motifStack plotMotifLogo plotMotifLogoStack
#' @importClassesFrom motifStack pfm marker
#' @import grDevices
#' @importFrom grid gpar
#' @importFrom Gviz IdeogramTrack SequenceTrack GenomeAxisTrack HighlightTrack
#'   AnnotationTrack plotTracks
#' @export
plotMB <- function(results, rsid, reverseMotif = TRUE, effect = c("strong", "weak"), altAllele = NULL) {
  motif.starts <- sapply(results$motifPos, `[`, 1)
  motif.starts <- start(results) + motif.starts
  motif.starts <- order(motif.starts)
  results <- results[motif.starts]
  g <- genome(results)[[1]]
  result <- results[results$SNP_id %in% rsid]
  if(is.null(altAllele)) {
    altAllele <- result$ALT[[1]]
  }
  result <- result[result$ALT == altAllele]
  result <- result[order(sapply(result$motifPos, min), sapply(result$motifPos, max)), ]
  result <- result[result$effect %in% effect]
  chromosome <- as.character(seqnames(result))[[1]]
  genome.package <- attributes(result)$genome.package
  genome.bsgenome <- eval(parse(text = genome.package))
  seq.len <- max(length(result$REF[[1]]), length(result$ALT[[1]]))
  distance.to.edge <- max(abs(c(sapply(result$motifPos, min),
                                sapply(result$motifPos, max)))) + 4
  from <- start(result)[[1]] - distance.to.edge + 1
  to <- end(result)[[1]] + distance.to.edge
  pwmList <- attributes(result)$motifs
  pwm.names <- result$providerId
  results_motifs <- paste0(result$providerId, result$providerName)
  list_motifs <- paste0(mcols(pwmList)$providerId, mcols(pwmList)$providerName)
  pwms <- pwmList <- pwmList[match(results_motifs, list_motifs)]
  if (reverseMotif) {
    for (pwm.i in seq_along(pwms)) {
      pwm.name <- names(pwms[pwm.i])
      pwm.id <- mcols(pwms[pwm.name, ])$providerId
      pwm.name.f <- mcols(pwms[pwm.name, ])$providerName
      doRev <- as.logical(strand(result[result$providerId == pwm.id & result$providerName == pwm.name.f, ]) == "-")
      if (doRev) {
        pwm <- pwms[[pwm.i]]
        pwm <- pwm[, rev(1:ncol(pwm))]
        rownames(pwm) <- c("T", "G", "C", "A")
        pwm <- pwm[c("A", "C", "G", "T"), ]
        pwms[[pwm.i]] <- pwm
        names(pwms)[pwm.i] <- paste0(names(pwms)[pwm.i], "-:rc")
      }
    }
  } else {
    for (pwm.i in seq_along(pwms)) {
      pwm.name <- names(pwms[pwm.i])
      pwm.id <- mcols(pwms[pwm.name, ])$providerId
      pwm.name.f <- mcols(pwms[pwm.name, ])$providerName
      doRev <- as.logical(strand(result[result$providerId == pwm.id & result$providerName == pwm.name.f, ]) == "-")
      if (doRev) {
        pwm <- pwms[[pwm.i]]
        pwm <- pwm[, rev(1:ncol(pwm))]
        pwms[[pwm.i]] <- pwm
        names(pwms)[pwm.i] <- paste0(names(pwms)[pwm.i], "-:r")
      }
    }
  }
  pwms <- lapply(names(pwms), function(x, pwms=pwms) {new("pfm", mat = pwms[[x]],
                                                          name = x)}, pwms)
  pwms <- DNAmotifAlignment.2snp(pwms, result)
  pwmwide <- max(sapply(pwms, function(x) { ncol(x@mat)}))

  markerStart <- result$motifPos[[1]][1]
  if (markerStart > 0) {
    markerEnd <- length(result$altPos[[1]]) + 1
    markerEnd <- markerEnd - markerStart
    markerStart <- 1
  } else {
    markerStart <- -1 * markerStart
    markerEnd <- markerStart + length(result$altPos[[1]])
    if (result$varType[[1]] %in% c("Other", "SNV")) {
      markerStart <- markerStart + 1
    }
  }
  varType <- result$varType[[1]]
  varType <- switch(varType,
                    Deletion = "firebrick",
                    Insertion = "springgreen4",
                    Other = "gray13")
  markerRect <- new("marker", type = "rect",
                    start = markerStart,
                    stop = markerEnd,
                    gp = gpar(lty = 2,
                              fill = NA,
                              lwd = 3,
                              col = varType))
  for (pwm.i in seq_along(pwms)) {
    pwms[[pwm.i]]@markers <- list(markerRect)
  }
  ideoT <- try(IdeogramTrack(genome = g, chromosome = chromosome), silent = TRUE)
  if (inherits(ideoT, "try-error")) {
    backup.band <- data.frame(chrom = chromosome, chromStart = 0,
                              chromEnd = length(genome.bsgenome[[chromosome]]),
                              name = chromosome, gieStain = "gneg")
    ideoT <- IdeogramTrack(genome = g, chromosome = chromosome, bands = backup.band)
  }

  ### blank alt sequence
  altseq <- genome.bsgenome[[chromosome]]

  ### Replace longer sections
  at <- IRanges(start = start(result[1]), width = width(result[1]))
  if (result$varType[[1]] == "Deletion") {
    reflen <- length(result$REF[[1]])
    addedN <- DNAString(paste0(rep.int(".", reflen), collapse = ""))
    addedN <- replaceLetterAt(addedN, at = (1:reflen)[-result$altPos[[1]]], result$ALT[[1]])
    axisT <- GenomeAxisTrack(exponent = 0)
    seqT <- SequenceTrack(genome.bsgenome, fontcolor = colorset("DNA", "auto"))
    altseq <- replaceAt(x = altseq, at = at, addedN)
  } else if (result$varType[[1]] == "Insertion") {
    altlen <- length(result$ALT[[1]])
    addedN <- DNAString(paste0(rep.int(".", altlen), collapse = ""))
    addedN <- replaceLetterAt(addedN, at = (1:altlen)[-result$altPos[[1]]], result$REF[[1]])
    refseq <- genome.bsgenome[[chromosome]]
    refseq <- DNAStringSet(replaceAt(x = refseq, at = at, addedN))
    altseq <- replaceAt(x = altseq, at = at, result$ALT[[1]])
    names(refseq) <- chromosome
    seqT <- SequenceTrack(refseq,
                          fontcolor = c(colorset("DNA", "auto"), N = "#FFFFFF", . = "#FFE3E6"),
                          chromosome = chromosome)
  } else {
    axisT <- GenomeAxisTrack(exponent = 0)
    altseq <- replaceAt(x = altseq, at = at, result$ALT[[1]])
    seqT <- SequenceTrack(genome.bsgenome, fontcolor = colorset("DNA", "auto"))
  }
  altseq <- DNAStringSet(altseq)
  names(altseq) <- chromosome
  seqAltT <- SequenceTrack(altseq,
                           fontcolor = c(colorset("DNA", "auto"), N = "#FFFFFF", . = "#FFE3E6"),
                           chromosome = chromosome)

  histart <- start(result[1]) + min(result[1]$altPos[[1]]) - 2
  histart <- ifelse(result[1]$varType %in% c("Other", "SNV"), histart + 1, histart)
  hiend <- start(result[1]) + min(result[1]$altPos[[1]]) - 2 + length(result[1]$altPos[[1]])
  hiT <- HighlightTrack(trackList = list(seqT, seqAltT),
                        start = histart,
                        end = hiend,
                        chromosome = chromosome)

  selectingfun <- selcor
  detailfun <- addPWM.stack

  motif_ids <- names(pwmList)
  names(motif_ids) <- mcols(pwmList)$providerName

  for (mymotif_i in seq_along(result)) {
    mymotif <- result[mymotif_i]
    start(mymotif) <- start(mymotif) + min(mymotif$altPos[[1]]) - 1
    width(mymotif) <- length(mymotif$altPos[[1]])
    variant.start <- start(mymotif)
    variant.end <- end(mymotif)
    if (mymotif$motifPos[[1]][1] < 0) {
      start(mymotif) <- start(mymotif) + (mymotif$motifPos[[1]][1])
    } else {
      start(mymotif) <- start(mymotif) + (mymotif$motifPos[[1]][1] - 1)
    }
    if (mymotif$motifPos[[1]][2] < 0) {
      end(mymotif) <- end(mymotif) + (mymotif$motifPos[[1]][2] + 1)
    } else {
      end(mymotif) <- end(mymotif) + (mymotif$motifPos[[1]][2])
    }
    if ((result[mymotif_i]$varType == "Deletion" & result[mymotif_i]$alleleDiff > 0) |
        (result[mymotif_i]$varType == "Insertion" & result[mymotif_i]$alleleDiff < 0)) {
      mymotif <- c(mymotif, mymotif)
      end(mymotif)[1] <- variant.start - 1
      start(mymotif)[2] <- variant.end + 1
      mymotif[which.min(width(mymotif))]$motifPos <- NA
    }
    if (exists("mres")) {
      mres <- c(mres, mymotif)
    } else {
      mres <- mymotif
    }
  }
  result <- mres; rm(mres)
  motif_ids <- motif_ids[result$providerName]
  presult <- result
  strand(presult) <- "*"
  pres_cols <- DataFrame(feature = ifelse(!is.na(result$motifPos),
                                          paste(result$geneSymbol, "motif", sep = "_"), ""),
                         group = result$providerName,
                         id = motif_ids)
  presult <- GRanges(seqnames = seqnames(result[1]),
                     ranges = ranges(result))
  mcols(presult) <- pres_cols

  motifT <- AnnotationTrack(presult,
                            fun = detailfun,
                            detailsFunArgs = list(pwm_stack = pwms),
                            name = names(result)[[1]],
                            selectFun = selectingfun,
                            reverseStacking = FALSE,
                            stacking = "squish")

  if (exists("axisT")) {
    track_list <- list(ideoT, motifT, hiT, axisT)
  } else {
    track_list <- list(ideoT, motifT, hiT)
  }
  plotTracks(track_list, from = from, to = to, showBandId = TRUE,
             cex.main = 0.8, col.main = "darkgrey",
             add53 = TRUE, labelpos = "below", chromosome = chromosome,
             fontcolor.item="black",
             collapse = FALSE, min.width = 1, featureAnnotation = "feature", cex.feature = 0.8,
             details.size = 0.85, detailsConnector.pch = NA, detailsConnector.lty = 0,
             shape = "box", cex.group = 0.8, fonts = c("sans", "Helvetica"))
  return(invisible(NULL))
}

#' Run Shiny version of the motifbreakR package.
#' @return returns a \code{\link{shinyAppDir}} that launches the shiny app when printed.
#' @examples
#' library(motifbreakR)
#'
#' app <- shiny_motifbreakR()
#'
#' if (interactive()) {
#'   shiny::runApp(app)
#' }
#' @import shiny bslib bsicons
#' @importFrom DT renderDT DTOutput datatable formatRound JS
#' @importFrom BSgenome available.genomes
#' @export
shiny_motifbreakR <- function() {
  appDir <- system.file("shiny", "shiny_motif", package = "motifbreakR")
  if (appDir == "") {
    stop("Could not find example directory. Try re-installing `motifbreakR`.", call. = FALSE)
  }
  shinyAppDir(appDir)
}

#' Export motifbreakR results to csv or tsv
#'
#' @param results The output of \code{motifbreakR}
#' @param file Character; the file name of the destination file
#' @param format Character; one of tsv (tab separated values) or csv (comma separated values)
#' @return \code{exportMBtable} produces an output file containing the output
#' of the \code{motifbreakR} function.
#' @seealso See \code{\link{exportMBbed}} for the function that exports the
#' \code{motifbreakR} results as a BED file, colored by selected score.
#' @examples
#' data(example.results)
#' example.results
#' \donttest{
#' exportMBtable(example.results, file = "mb_test_output.tsv", format = "tsv")
#' }
#' @importFrom utils write.csv write.table
#' @export
exportMBtable <- function(results, file, format = "tsv") {
  if(missing(file)) {stop("select output file location")}
  format <- match.arg(format, c("csv", "tsv"))
  sep <- switch(format,
                csv = ",",
                tsv = "\t")
  results <- as.data.frame(results, row.names = NULL)
  results <- results[, !colnames(results) %in% "width"]
  results$start <- results$start - 1
  results$motifPos <- vapply(results$motifPos, function(x) { paste0(x[1], ";", x[2]) }, FUN.VALUE = character(1))
  if("matchingCellType" %in% colnames(results)) {
    results$matchingBindingEvent <- vapply(results$matchingCellType, function(x) {
      collapsed_data <- lapply(x, function(y) {
        paste0(y, collapse = "; ")})
      collapsed_data <- paste0(names(collapsed_data), ":(", collapsed_data, ")")
      collapsed_data <- paste(collapsed_data, collapse = "; ")
      ifelse(collapsed_data == ":(NA)", NA_character_, collapsed_data)
    }, FUN.VALUE = character(1))
    results$matchingCellType <- NULL
  }
  if(format == "tsv") {
    write.table(x = results, file = file, quote = FALSE, sep = sep, row.names = FALSE)
  } else {
    write.csv(x = results, file = file, row.names = FALSE)
  }
}

get_color_values <- function(bed_score, color_set) {
  color_values <- quantile(bed_score, probs = seq(0,1,length.out = 9))
  cut_scores <- cut(bed_score, color_values, include.lowest = TRUE)
  cut_points <- levels(cut_scores)
  bed_colors <- factor(cut_scores,
                       labels = color_set)
  color_scale <- levels(bed_colors)
  names(color_scale) <- cut_points
  return(list(bed_colors = bed_colors, color_scale = color_scale))
}

#' Export motifbreakR variants to bed file
#'
#' @param results The output of \code{\link{motifbreakR}}
#' @param file Character; the file name of the destination file
#' @param name Character; name for the BED track, defaults to "motifbreakR results"
#' @param color Character; one of ref_sig (\code{Refpvalue}), alt_sig
#' (\code{Altpvalue}), best_sig (lowest between \code{Refpvalue} and
#' \code{Altpvalue}), (each of which require pre-computation of p-values with
#' \code{\link{calculatePvalue}}), or ref_score (\code{pctRef}), alt_score
#' (\code{pctAlt}), best_score (highest between \code{pctRef} and \code{pctAlt}),
#' or the default value of effect_size (\code{alleleDiff}).
#' @return \code{exportMBbed} produces an output BED file, with diverging color
#' scale for effect_size (blue representing stronger binding in \code{REF}, red
#' representing stronger binding in \code{ALT}), or a sequential color scale
#' otherwise (low values as purple, high values as yellow). The score column is
#' either the effect_size (\code{alleleDiff} column), the -log10(p-value)
#' (capped at 10), corresponding to \code{Refpvalue}, \code{Altpvalue}, or the
#' best match of the two, or the score \code{pctRef}, \code{pctAlt}, or the
#' highest match of the two. The name column is formatted
#' \code{SNP_id:REF/ALT:providerId}. Additionally a color key is returned
#' indicating the range of values for each color output.
#' @seealso See \code{\link{exportMBtable}} for the function that exports the
#' full \code{motifbreakR} results as a tab or comma separated table file.
#' @examples
#' data(example.results)
#' example.results
#' \donttest{
#' exportMBbed(example.results, file = "mb_test_output.bed", color = "effect_size")
#' }
#' @export
exportMBbed <- function(results, file, name = NULL, color = "effect_size") {
  if(missing(file)) {stop("select output file location")}
  color <- match.arg(color, c("ref_sig", "alt_sig", "best_sig",
                              "ref_score", "alt_score", "best_score",
                              "effect_size"))
  sequential_colors <- c("#440154",
                         "#46337E",
                         "#365C8D",
                         "#277F8E",
                         "#1FA187",
                         "#4AC16D",
                         "#9FDA3A",
                         "#FDE725")
  diverging_colors <- c("#001260",
                        "#034A85",
                        "#4A8FB2",
                        "#BDD6E3",
                        "#E7C6B2",
                        "#C98157",
                        "#9D3709",
                        "#590007")
  bed_name <- paste(results$SNP_id, paste(results$REF, results$ALT, sep = "/"), sep = ":")
  bed_name <- paste(bed_name, results$providerId, sep = ":")
  results_bed <- results
  mcols(results_bed) <- NULL
  results_bed$name <- bed_name

  if(color == "effect_size") {
    bed_score <- results$alleleDiff
    color_values <- get_color_values(bed_score, diverging_colors)
  } else {
    if(color %in% c("ref_sig", "alt_sig", "best_sig")) {
      if(!("Refpvalue" %in% names(mcols(results))))
        stop('incorrect results format; please rerun analysis with filterp=TRUE')
      if(any(is.na(results$Refpvalue)))
        stop('run computePvalue before exporting data with p-values')
    }
    bed_score <- switch(color,
                        ref_sig = -log10(results$Refpvalue),
                        alt_sig = -log10(results$Altpvalue),
                        best_sig = ifelse(results$pctRef < results$pctAlt,
                                          -log10(results$Refpvalue),
                                          -log10(results$Altpvalue)),
                        ref_score = results$pctRef,
                        alt_score = results$pctAlt,
                        best_score = ifelse(results$pctAlt < results$pctRef,
                                            results$pctRef,
                                            results$pctAlt))
    color_values <- get_color_values(bed_score, sequential_colors)
  }
  results_bed$score <- bed_score
  results_bed$itemRgb <- color_values$bed_colors
  motif_sources <- levels(factor(results$dataSource))
  data_source <- unique(genome(results))
  results_trackline <- new("BasicTrackLine",
                           itemRgb = TRUE,
                           useScore = TRUE,
                           visibility = "pack",
                           name = ifelse(is.null(name), "motifbreakR results", name),
                           description = paste0("motifbreakR results colored by ", color, " in ",
                                                paste0(data_source, collapse = ", "),
                                                " from motif sources: ",
                                                paste0(motif_sources, collapse = ", ")))
  export.bed(results_bed, con = file, trackLine = results_trackline)
  return(color_values$color_scale)
}

#' Find Corresponding TF Binding From The ReMap2022 Project
#'
#' @param results The output of \code{motifbreakR}
#' @param genome Character; one of:
#' \code{hg38} or \code{hg19} for Homo sapiens,
#' \code{mm10} or \code{mm39} for Mus musculus,
#' \code{dm6} for Drosophila melanogaster,
#' \code{TAIR10_TF} or \code{TAIR10_HISTONE} for Arabidopsis thaliana
#' @param TFClass Logical;  The user may optionally query an expanded
#' motif/transcription factor relationship encompassing the entire potential
#' transcription factor family as implemented by \code{\link{MotifDb}} based on
#' TFClass.
#' @details \code{TFClass} argument works for objects loaded in from the
#' \code{MotifDb} package. \code{hg19} and \code{mm39} are data from liftOver.
#'
#' The ReMap catalogues (2022, 2020, 2018, 2015) are under CC BY-NC 4.0
#' international license, as described in ReMap.
#'
#' The CC BY-NC 4.0 license correspond to the following terms:
#' Attribution — You must give appropriate credit, provide a link to the
#' license, and indicate if changes were made. You may do so in any reasonable
#' manner, but not in any way that suggests the licensor endorses you or your
#' use.
#' NonCommercial — You may not use the material for commercial purposes.
#' No additional restrictions — You may not apply legal terms or technological
#' measures that legally restrict others from doing anything the license
#' permits.
#'
#' @seealso \code{\link{associateTranscriptionFactors}} for information about
#' TFClass. \url{https://remap.univ-amu.fr/} for details about ReMap2022.
#' @return the results GenomicRanges object output by \code{\link{motifbreakR}}
#' containing with the additional columns:
#'  \item{matchingBindingEvent}{The name of the transcription factor that binds
#'  over the motif, or \code{NA} if none}
#'  \item{matchingCellType}{A list corresponding in length to the number of
#'  transcription factors in \code{matchingBindingEvent} indicating the
#'  biotype/celltype that the transcription factor binding was found in.}
#' @examples
#' data(example.results)
#' \donttest{
#' example.results <- findSupportingRemapPeaks(example.results,
#'                                             genome = "hg19",
#'                                             TFClass = TRUE)
#' }
#' @export

findSupportingRemapPeaks <- function(results, genome, TFClass = FALSE) {
  genome <- match.arg(genome, c("hg38", "hg19", "mm10", "mm39", "dm6", "TAIR10_TF", "TAIR10_HISTONE"))
  switch(genome,
         hg19 = message("hg19 peaks have been lifted over from hg38"),
         mm39 = message("mm39 peaks have been lifted over from mm10"))

  remap_links <- list(
    hg38      = "https://remap.simoncoetzee.com/remap2022_nr_macs2_hg38_v1_0.bed.gz",
    hg19      = "https://remap.simoncoetzee.com/remap2022_nr_macs2_hg19_v1_0.bed.gz",
    mm10      = "https://remap.simoncoetzee.com/remap2022_nr_macs2_mm10_v1_0.bed.gz",
    mm39      = "https://remap.simoncoetzee.com/remap2022_nr_macs2_mm39_v1_0.bed.gz",
    dm6       = "https://remap.simoncoetzee.com/remap2022_nr_macs2_dm6_v1_0.bed.gz",
    TAIR10_TF = "https://remap.simoncoetzee.com/remap2022_nr_macs2_TAIR10_v1_0.bed.gz",
    TAIR10_HISTONE = "https://remap.simoncoetzee.com/remap2022_histone_nr_macs2_TAIR10_v1_0.bed.gz"
  )

  results$matchingBindingEvent <- NA
  results$matchingCellType <- NA

  uniqmb <- results
  mcols(uniqmb) <- NULL
  names(uniqmb) <- NULL
  uniqmb <- sort(unique(uniqmb))

  remap_binding <- loadPeakFile(remap_links, genome)
  remap_binding <- subsetByOverlaps(remap_binding, uniqmb)

  if(length(remap_binding) < 1) return(results)

  mb_overlaps <- findOverlaps(results, remap_binding)
  variant_to_peak <- unique(data.frame(variant = names(results[queryHits(mb_overlaps), ]), tf.binding = remap_binding[subjectHits(mb_overlaps), ]$name))
  tf_ct <- stringr::str_split(variant_to_peak$tf.binding, ":")
  variant_to_peak$tf.binding <- vapply(tf_ct, FUN = `[`, character(1), 1)
  variant_to_peak$tf.celltype <- vapply(tf_ct, FUN = `[`, character(1), 2)
  variant_to_peak$tf.celltype <- lapply(variant_to_peak$tf.celltype, function(x) {stringr::str_split(x, ",", simplify = T)[1, ]})
  mcol_variant_to_gene <- data.frame(variant = names(results), tf.gene = results$geneSymbol, motif = results$providerId)

  if(("manuallyCuratedGeneMotifAssociationTable" %in% slotNames(attributes(results)$motifs)) &
      TFClass &
      (nrow(attributes(results)$motifs@manuallyCuratedGeneMotifAssociationTable) > 1)) {
    mb_genelist <- attributes(results)$motifs@manuallyCuratedGeneMotifAssociationTable
    mb_genelist <- unique(mb_genelist[mb_genelist$motif %in% results$providerId, c("motif", "tf.gene")])
    mb_genelist_aug <- unique(data.frame(motif = results$providerId, tf.gene = results$geneSymbol))
    mb_genelist <- unique(rbind(mb_genelist, mb_genelist_aug))
    variant_to_gene <- data.frame(variant = names(results), motif = results$providerId)
    variant_to_gene_master <- merge(variant_to_gene, mb_genelist, all.x = TRUE, sort = FALSE)
    variant_to_gene <- unique(variant_to_gene_master[, c("variant", "tf.gene")])
  } else {
    variant_to_gene_master <- mcol_variant_to_gene
    variant_to_gene <- unique(variant_to_gene_master[, c("variant", "tf.gene")])
  }

  variant_mb_peak <- merge(variant_to_gene_master, variant_to_peak, all.x = TRUE, sort = FALSE)
  variant_mb_peak <- unique(variant_mb_peak[variant_mb_peak$tf.gene == variant_mb_peak$tf.binding, ])
  variant_mb_peak <- variant_mb_peak[!is.na(variant_mb_peak$variant), c("variant", "tf.binding", "tf.celltype", "motif")]

  split_vmbp <- split(variant_mb_peak, as.factor(variant_mb_peak$variant))

  for(variant in seq_along(split_vmbp)) {
    pvariant <- names(split_vmbp[variant])
    vgm <- split_vmbp[[variant]]
    mbv_sel <- which(mcol_variant_to_gene$variant == pvariant)
    vgm$motif <- factor(vgm$motif, levels = unique(mcol_variant_to_gene[mbv_sel, "motif"]))
    names(vgm$tf.celltype) <- vgm$tf.binding
    vgmm <- vgm[which(vgm$motif %in% mcol_variant_to_gene$motif), ]
    vgmm <- split(vgmm, vgmm$motif)
    vgmg <- vgm[which(vgm$tf.gene %in% mcol_variant_to_gene$tf.gene), ]
    vgmg <- split(vgmg, vgmg$motif)
    vgm <- mapply(rbind, vgmm, vgmg, SIMPLIFY = F)

    results[mbv_sel]$matchingBindingEvent <- lapply(vgm, function(x) {unique(x$tf.binding)})[results[mbv_sel]$providerId]
    results[mbv_sel]$matchingCellType <- lapply(vgm, function(x) {x$tf.celltype})[results[mbv_sel]$providerId]
    if(any(lengths(results[mbv_sel]$matchingBindingEvent) == 0)) {
      results[mbv_sel][lengths(results[mbv_sel]$matchingBindingEvent) == 0]$matchingBindingEvent <- NA
      results[mbv_sel][lengths(results[mbv_sel]$matchingCellType) == 0]$matchingCellType <- NA
    }
  }
  attributes(results)$peaks <- remap_binding
  return(results)
}

#' @importFrom tools R_user_dir
#' @importFrom BiocFileCache BiocFileCache
.get_cache <- function() {
  cache <- tools::R_user_dir("motifbreakR", which="cache")
  BiocFileCache(cache = cache, ask = F)
}

#' @importFrom BiocFileCache bfcquery bfcadd bfcneedsupdate bfcdownload bfcrpath bfcnew
cachePeakFile <- function(fileURL, genome) {
  bfc <- .get_cache()
  rname <- paste("remap2022", genome, sep = "_")
  rid <- bfcquery(bfc, rname, "rname")$rid
  if (!length(rid)) {
    rid <- names(bfcadd(bfc, rname, fileURL, ext = ".Rdata", download = FALSE))
  }
  needupdate <- tryCatch(bfcneedsupdate(bfc, rid),
                         error = function(cond) {
                           message("URL is down, using cache if availible")
                           FALSE
                         })
  if (!isFALSE(needupdate)) {
    message("downloading peak file")
    bfcdownload(bfc, rid, ask = FALSE, FUN=convertPeakFile)
    message("peak download complete")
  }
  bfcrpath(bfc, rids = rid)
}

cacheMartObj <- function(genome) {
  bfc <- .get_cache()
  genome_ver <- ifelse(is.null(genome), "hg38", genome)
  rname <- paste("ensembl_mart", genome_ver, sep = "_")
  rid <- bfcquery(bfc, rname, "rname")$rid
  if (!length(rid)) {
    rid <- names(bfcnew(bfc, rname, ext = ".Rdata"))
    bmsnp <- useEnsembl(biomart = 'snps', dataset = "hsapiens_snp", version = genome)
    saveRDS(bmsnp, file = bfcrpath(bfc, rids = rid))
  }
  days.old <- as.numeric(Sys.Date() - as.Date(bfcquery(bfc, field = "rid", rid)$create_time))
  if (days.old > 30) {
    bmsnp <- useEnsembl(biomart = 'snps', dataset = "hsapiens_snp", version = genome)
    saveRDS(bmsnp, file = bfcrpath(bfc, rids = rid))
  }
  bfcrpath(bfc, rids = rid)
}

loadPeakFile <- function(url_list, genome) {
  peak_path <- cachePeakFile(url_list[[genome]], genome)
  remap_peaks <- readRDS(peak_path)
  return(remap_peaks)
}

loadMartObj <- function(genome) {
  mart_path <- cacheMartObj(genome)
  mart_obj <- readRDS(mart_path)
  return(mart_obj)
}

#' @importFrom vroom vroom
convertPeakFile <- function(from, to) {
  message("processing peak file")
  remap_peaks <- GRanges(vroom(from, delim = "\t",
                               col_names = c("chr", "start", "end", "name",
                                             "score", "strand", "tstart",
                                             "tend", "color"),
                               col_types = c("ciiciciic"),
                               col_select = c(chr, start, end, name),
                               progress = FALSE))
  saveRDS(remap_peaks, file = to)
  TRUE
}
Simon-Coetzee/motifBreakR documentation built on Aug. 6, 2024, 5:17 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Simon-Coetzee/motifBreakR
A Package For Predicting The Disruptiveness Of Single Nucleotide Polymorphisms On Transcription Factor Binding Sites

R/scoreMotif.R
In Simon-Coetzee/motifBreakR: A Package For Predicting The Disruptiveness Of Single Nucleotide Polymorphisms On Transcription Factor Binding Sites

Documented in calculatePvalue exportMBbed exportMBtable findSupportingRemapPeaks motifbreakR plotMB shiny_motifbreakR

R Package Documentation

Browse R Packages

We want your feedback!

Simon-Coetzee/motifBreakR A Package For Predicting The Disruptiveness Of Single Nucleotide Polymorphisms On Transcription Factor Binding Sites

R/scoreMotif.R In Simon-Coetzee/motifBreakR: A Package For Predicting The Disruptiveness Of Single Nucleotide Polymorphisms On Transcription Factor Binding Sites

Documented in calculatePvalue exportMBbed exportMBtable findSupportingRemapPeaks motifbreakR plotMB shiny_motifbreakR

R Package Documentation

Browse R Packages

We want your feedback!

Simon-Coetzee/motifBreakR
A Package For Predicting The Disruptiveness Of Single Nucleotide Polymorphisms On Transcription Factor Binding Sites

R/scoreMotif.R
In Simon-Coetzee/motifBreakR: A Package For Predicting The Disruptiveness Of Single Nucleotide Polymorphisms On Transcription Factor Binding Sites