R/motif_analysis.R

Defines functions GetIUPACSequence ComputePValues MatchSubsequence ComputeMotifScore LoadFastaData LoadSNPData LoadMotifLibrary

Documented in ComputeMotifScore ComputePValues GetIUPACSequence LoadFastaData LoadMotifLibrary LoadSNPData MatchSubsequence

#' @name LoadMotifLibrary
#' @title Load position weight matrices.
#' @description Load the file for position weight matrices for motifs.
#' @param filename a MEME format file name.
#' @param urlname URL containing a MEME format file.
#' @param tag A string that marks the description line of the position weight
#' matrix.
#' @param skiprows Number of description lines before each position weight
#' matrix.
#' @param skipcols Number of columns to be skipped in the position weight
#' matrix.
#' @param transpose If TRUE (default), then the position weight matrix should
#' have 4 columns. Otherwise, it should have 4 rows.
#' @param field The index of the field in the description line, seperated by
#' space, that indicates the motif name.
#' @param sep A vector of chars for the string separators to parse each lines of
#'  the matrix. Default: c(" ", "\\t").
#' @param pseudocount An integer for the pseudocount added to each of the
#' original matrices. Default: 0. Recommended to be 1 if the original matrices
#' are position frequency matrices.
#' @details This function reads the formatted file containing motif information
#' and convert them into a list of position weight matrices. The list of
#' arguments should provide enough flexibility of importing a varying number of
#' formats. Some examples are the following:
#' For MEME format, the suggested arguments are: tag = 'Motif', skiprows = 2,
#' skipcols = 0, transpose = FALSE, field = 2, sep = ' ';
#' For motif files from JOHNSON lab (i.e.
#' http://johnsonlab.ucsf.edu/mochi_files/JASPAR_motifs_H_sapiens.txt),
#' the suggested arguments are: tag = '/NAME', skiprows = 1, skipcols = 0,
#' transpose = FALSE, field = 2, sep = "\\t";
#' For JASPAR pfm matrices (i.e. http://jaspar.genereg.net/download/CORE/JASPAR
#' 2018_CORE_vertebrates_non-redundant_pfms_jaspar.txt), the suggested arguments
#' are: tag = ">", skiprows = 1, skipcols = 0, transpose = TRUE, field = 1,
#' sep = "\\t"; For the TRANSFAC library provided by UCF bioinformatics groups
#' (i.e. http://gibbs.biomed.ucf.edu/PreDREM/download/nonredundantmotif.transfac
#' ), the suggested arguments are: tag = "DE", skiprows = 1, skipcols = 1,
#' transpose = FALSE, field = 2, sep = "\\t".
#' @return A list object of position weight matrices.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' pwms <- LoadMotifLibrary(
#' urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/pfm_vertebrates.txt",
#' tag = ">", transpose = FALSE, field = 1, sep = c("\t", " ", ">"),
#' skipcols = 1, skiprows = 1, pseudocount = 1)
#' @useDynLib atSNP
#' @import BiocFileCache
#' @import rappdirs
#' @export
LoadMotifLibrary <-
  function(filename = NULL,
           urlname = NULL,
           tag = "MOTIF",
           transpose = FALSE,
           field = 2,
           sep = c("\t", " "),
           skipcols = 0,
           skiprows = 2,
           pseudocount = 0) {
    if (is.null(filename) & is.null(urlname))  {
      stop("one argument among 'filename' and 'urlname' should be provided.")
    } else {
      if (!is.null(filename) & !is.null(urlname))
        stop("only one argument among 'filename' and 'urlname' should be provided.")
    }
    if (is.null(filename)) {
      bfc <-
        BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"),
                      ask = FALSE)
      rid <-
        bfcrid(bfcquery(
          bfc,
          query = basename(urlname),
          exact = TRUE,
          field = "rname"
        ))
      if (!length(rid))
        rid <-
        names(bfcadd(bfc, rname = basename(urlname), urlname))
      lines <- readLines(bfcrpath(rids = rid))
    } else {
      lines <- readLines(filename)
    }
    
    motifLineNums <- grep(tag, lines)
    if (length(myStrSplit(lines[motifLineNums[1]], sep)[[1]]) >= field) {
      motifnames <-
        sapply(myStrSplit(lines[motifLineNums], sep), function(x)
          x[field])
    } else {
      motifnames <-
        sapply(myStrSplit(lines[motifLineNums], sep), function(x)
          x[field])
    }
    allmats <- as.list(seq_along(motifnames))
    
    for (matrixId in seq_along(motifLineNums)) {
      motifLineNum <- motifLineNums[matrixId] + skiprows
      if (!transpose) {
        pwm <- NULL
        nrows <- 0
        tmp <-
          myStrSplit(lines[nrows + motifLineNum], split = sep)[[1]]
        tmp <- tmp[nchar(tmp) > 0]
        while (length(tmp) >= 4 + skipcols) {
          tmp <- as.numeric(tmp[skipcols + seq(4)])
          if (sum(is.na(tmp)) == 0) {
            pwm <- rbind(pwm, tmp)
          }
          nrows <- nrows + 1
          tmp <-
            myStrSplit(lines[nrows + motifLineNum], split = sep)[[1]]
          tmp <- tmp[nchar(tmp) > 0]
        }
      } else {
        nrows <- 4
        if (skipcols == 0) {
          pwm <-
            matrix(as.numeric(unlist(myStrSplit(
              lines[seq(nrows) + motifLineNum - 1], split = sep
            ))), ncol = 4)
        } else {
          pwm <-
            matrix(as.numeric(unlist(
              sapply(myStrSplit(lines[seq(nrows) + motifLineNum - 1], split = sep), function(x)
                x[-seq(skipcols)])
            )), ncol = 4)
        }
      }
      pwm <- pwm + pseudocount
      pwm <- pwm / apply(pwm, 1, sum)
      pwm <- t(apply(pwm, 1,
                     function(x) {
                       x[x < 1e-10] <- 1e-10 / (1 - 1e-10 * sum(x < 1e-10)) * sum(x)
                       return(x / sum(x))
                     }))
      rownames(pwm) <- NULL
      allmats[[matrixId]] <- pwm
    }
    names(allmats) <- motifnames
    return(allmats)
  }


#' @name LoadSNPData
#' @title Load the SNP information and code the genome sequences around the SNP
#' locations.
#' @description Load the SNP data.
#' @param filename A table containing the SNP information. Must contain at least
#'  five columns with exactly the following names:
#' \tabular{ll}{
#' chr \tab chromosome.\cr
#' snp \tab The nucleotide position of the SNP.\cr
#' snpid \tab The names of the SNPs.\cr
#' a1 \tab The deoxyribose for one allele.\cr
#' a2 \tab The deoxyribose for the other allele.\cr
#' }
#' If this file exists already, it is used to extract the SNP information.
#' Otherwise, SNP information extracted using argument 'snpids' is outputted to
#' this file.
#' @param snpids A vector of rs ids for the SNPs. This argument is overidden
#' if the file with name \code{filename} exists.
#' @param snp.lib A string of the library name to obtain the SNP information
#' based on rs ids. Default: "SNPlocs.Hsapiens.dbSNP144.GRCh38".
#' @param genome.lib A string of the library name for the genome version.
#' Default: "BSgenome.Hsapiens.UCSC.hg38".
#' @param half.window.size An integer for the half window size around the SNP
#' within which the motifs are matched. Default: 30.
#' @param default.par A boolean for whether using the default Markov parameters.
#'  Default: FALSE.
#' @param mutation A boolean for whether this is mutation data. See details for
#' more information. Default: FALSE.
#' @param ... Other parameters passed to \code{\link[utils]{read.table}}.
#' @details This function extracts the nucleotide sequence within a window
#' around each SNP and code them using 1-A, 2-C, 3-G, 4-T.\cr
#' There are two ways of obtaining the nucleotide sequences. If \code{filename}
#' is not NULL and the file exists, it should contain the positions and alleles
#' for each SNP. Based on such information, the sequences around SNP positions
#' are extracted using the Bioconductor annotation package specified by
#' \code{genome.lib}. Users should make sure that this annotation package
#' corresponds to the correct species and genome version of the actual data.
#' Alternatively, users can also provide a vector of rs ids via the argument
#' \code{snpids}. The SNP locations and allele information is then obtained via
#' the Bioconductor annotation package specified by \code{snp.lib}, and passed
#' on to the package specified by \code{genome.lib} to further obtain the
#' nucleotide sequences.\cr
#' If \code{mutation=FALSE} (default), this function assumes that the data is
#' for SNP analysis, and the reference genome should be consistent with either
#' the a1 or a2 nucleotide. When extracting the genome sequence around each SNP
#' position, this function compares the nucleotide at the SNP location on the
#' reference genome with both a1 and a2 to distinguish between the reference
#' allele and the SNP allele. If the nucleotide extracted from the reference
#' genome does not match either a1 or a2, the SNP is discarded. The discarded
#' SNPs are in the 'rsid.rm' field in the output.\cr
#' Alternatively, if \code{mutation=TRUE}, this function assumes that the data
#' is for general single nucleotide mutation analysis. After extracting the
#' genome sequence around each SNP position, it replaces the nucleotide at the
#' SNP location by the a1 nucleotide as the 'reference' allele sequence, and by
#' the a2 nucleotide as the 'snp' allele sequence. It does NOT discard the
#' sequence even if neither a1 or a2 matches the reference genome. When this
#' data set is used in other functions, such as \code{\link{ComputeMotifScore}},
#'  \code{\link{ComputePValues}}, all the results (i.e. affinity scores and
#'  their p-values) for the reference allele are indeed for the a1 allele, and
#'  results for the SNP allele are indeed for the a2 allele.\cr
#' If the input is a list of rsid's, the SNP information extracted from
#' \code{snp.lib} may contain more than two alleles for a single location. For
#' such cases, \code{\link{LoadSNPData}} first extracts all pairs of alleles
#' associated with those locations. If 'mutation=TRUE', all those pairs are
#' considered as pairs of reference and SNP alleles, and their information is
#' contained in 'sequence_matrix', 'a1', 'a2' and 'snpid'. If 'mutation=FALSE',
#' \code{\link{LoadSNPData}} further filters these pairs based on whether one
#' allele matches to the reference genome nucleotide extracted from
#' \code{genome.lib}. Only those pairs with one allele matching the reference
#' genome nucleotide is considered as pairs of reference and SNP alleles, with
#' their information contained in 'sequence_matrix', 'a1', 'a2' and 'snpid'.\cr
#' @return A list object containing the following components:
#' \tabular{ll}{
#' sequence_matrix \tab A list of integer vectors representing the deroxyribose
#' sequence around each SNP.\cr
#' a1 \tab An integer vector for the deroxyribose at the SNP location on the
#' reference genome.\cr
#' a2 \tab An integer vector for the deroxyribose at the SNP location on the
#' SNP genome.\cr
#' snpid \tab A string vector for the SNP rsids.\cr
#' rsid.missing \tab If the data source is a list of rsids, this field records
#' rsids for SNPs that are discarded because they are not in the SNPlocs package.\cr
#' rsid.duplicate \tab If the data source is a list of rsids, this field records
#'  rsids for SNPs that based on the SNPlocs package, this locus has more than
#'  2 alleles. \cr
#' rsid.na \tab This field records rsids for SNPs that are discarded because the
#'  nucleotide sequences contain none ACGT characters.\cr
#' rsid.rm \tab If the data source is a table and \code{mutation=FALSE}, this
#' field records rsids for SNPs that are discarded because the nucleotide on the
#'  reference genome matches neither 'a1' or 'a2' in the data source.\cr
#' }
#' The results are coded as: "A"-1, "C"-2, "G"-3, "T"-4.
#' @author Chandler Zuo \email{chandler.c.zuo@@gmail.com}
#' @examples
#' \dontrun{LoadSNPData(snpids = c("rs53576", "rs7412"),
#' genome.lib ="BSgenome.Hsapiens.UCSC.hg38", snp.lib =
#' "SNPlocs.Hsapiens.dbSNP144.GRCh38", half.window.size = 30, default.par = TRUE
#' , mutation = FALSE)}
#' @import BSgenome Biostrings
#' @useDynLib atSNP
#' @export
LoadSNPData <-
  function(filename = NULL,
           genome.lib = "BSgenome.Hsapiens.UCSC.hg38",
           snp.lib = "SNPlocs.Hsapiens.dbSNP144.GRCh38",
           snpids = NULL,
           half.window.size = 30,
           default.par = FALSE,
           mutation = FALSE,
           ...) {
    useFile <- FALSE
    rsid.rm <- rsid.missing <- rsid.duplicate <- rsid.na <- NULL
    if (!is.null(filename)) {
      if (file.exists(filename)) {
        useFile <- TRUE
      }
    }
    if (useFile) {
      if (!is.null(snpids)) {
        message(
          "Warning: load SNP information from 'filename' only. The argument 'snpids' is overridden."
        )
      }
      tbl <-
        read.table(
          filename,
          header = TRUE,
          stringsAsFactors = FALSE,
          colClasses = c("character"),
          ...
        )
      tbl <- tbl[c("snpid", "chr", "snp", "a1", "a2")]
      tbl$snp <- as.integer(tbl$snp)
      ## check if the input file has the required information
      if (sum(!c("snp", "chr", "a1", "a2", "snpid") %in% names(tbl)) > 0) {
        stop("'filename' must be a table containing 'snp' and 'chr' columns.")
      }
      snpid.index <- seq(nrow(tbl))
    } else {
      if (is.null(snpids)) {
        stop(
          "either 'snpids' should be a vector, or 'filename' should be the file name that contains the SNP information."
        )
      }
      snpid.index <- seq_along(snpids)
      ## load the corresponding snp library
      library(package = snp.lib, character.only = TRUE)
      rsid.missing.all <- NULL
      snps <- get(snp.lib)
      snp.loc <-
        tryCatch({
          snpsById(snps, snpids, ifnotfound = "error")
        }, error = function(e)
          return(e$message))
      ## remove rsids not included in the database
      if (is(snp.loc, "character")) {
        rsid.missing <- myStrSplit(snp.loc, split = c(": ", "\n"))[[1]][-1]
        rsid.missing.all <-
          myStrSplit(rsid.missing, split = c(",", " "))[[1]]
        snpids <- snpids[!snpids %in% rsid.missing]
        snp.loc <- snpsById(snps, snpids, ifnotfound = "drop")
      }
      
      if (!is.null(rsid.missing.all)) {
        message("Warning: the following rsids are not included in the database and discarded: ")
        message(paste(rsid.missing.all, collapse = ", "))
        rsid.missing <- rsid.missing.all
      }
      
      snp.alleles <-
        IUPAC_CODE_MAP[snp.loc@elementMetadata@listData$alleles_as_ambig]
      snp.strands <- as.character(snp.loc@strand)
      if (sum(nchar(snp.alleles) > 2) > 0) {
        message(
          "Warning: the following SNPs have more than 2 alleles. All pairs of nucleotides are considered as pairs of the SNP and the reference allele:"
        )
        rsid.duplicate <- snpids[nchar(snp.alleles) > 2]
        message(paste(rsid.duplicate, collapse = ", "))
      }
      ## retain only SNPs with >= 2 alleles
      tbl <- NULL
      for (nalleles in 2:4) {
        ids <- which(sapply(snp.alleles, nchar) == nalleles)
        if (length(ids) == 0) {
          next
        }
        snp.loc.n <- snp.loc[ids]
        snp.alleles.n <- snp.alleles[ids]
        snp.ids.n <- snpids[ids]
        snp.alleles.n <- strsplit(snp.alleles.n, "")
        snp.strands.n <- snp.strands[ids]
        ## get all pairs of alleles
        for (i_allele1 in seq(nalleles - 1)) {
          for (i_allele2 in (i_allele1 + 1):nalleles) {
            a1 <- sapply(snp.alleles.n, function(x)
              x[i_allele1])
            a2 <- sapply(snp.alleles.n, function(x)
              x[i_allele2])
            
            ## revert the alleles on the reverse strand
            id.rev <- which(snp.strands.n == "-")
            if (length(id.rev) > 0) {
              rev.codes <- c("A", "C", "G", "T")
              names(rev.codes) <- rev(rev.codes)
              a1[id.rev] <- rev.codes[a1[id.rev]]
              a2[id.rev] <- rev.codes[a2[id.rev]]
            }
            tbl <- rbind(
              tbl,
              data.frame(
                snp = as.integer(snp.loc.n@ranges),
                chr = as.character(paste0(
                  "chr", gsub("ch", "", snp.loc.n@seqnames)
                )),
                a1 = as.character(a1),
                a2 = as.character(a2),
                snpid = as.character(snp.ids.n),
                index = snpid.index[ids]
              )
            )
          }
        }
      }
      
      tbl <- tbl[order(tbl$index),]
      if (!is.null(filename)) {
        write.table(
          tbl[, c('snp', 'chr', 'a1', 'a2', 'snpid')],
          file = filename,
          row.names = FALSE,
          col.names = TRUE,
          quote = FALSE
        )
      }
      snpid.index <- tbl$index
    }
    
    ## load the corresponding genome version
    library(package = genome.lib, character.only = TRUE)
    species <- get(strsplit(genome.lib, "[.]")[[1]][2])
    seqvec <- getSeq(
      species,
      as.character(tbl$chr),
      start = tbl$snp - half.window.size,
      end = tbl$snp + half.window.size,
      as.character = TRUE
    )
    codes <- seq(4)
    names(codes) <- c("A", "C", "G", "T")
    sequences <-
      sapply(seqvec, function(x)
        codes[strsplit(x, "")[[1]]])
    sequences <- matrix(sequences, ncol = length(seqvec))
    snpid.output <- as.character(tbl$snpid)
    rownames(sequences) <- NULL
    a1 <- codes[as.character(tbl$a1)]
    a2 <- codes[as.character(tbl$a2)]
    names(a1) <- names(a2) <- NULL
    keep.id <- which(colSums(is.na(sequences)) == 0)
    if (length(keep.id) < nrow(tbl)) {
      message(
        "Warning: the following rows are discarded because the reference genome sequences contain non ACGT characters:"
      )
      rsid.na <- tbl[-keep.id,]$snpid
      print(tbl[-keep.id,])
    }
    ## remove sequences containing non ACGT characters
    sequences <- sequences[, keep.id, drop = FALSE]
    a1 <- a1[keep.id]
    a2 <- a2[keep.id]
    snpid.output <- snpid.output[keep.id]
    snpid.index <- snpid.index[keep.id]
    ## whether use the default parameters
    if (!default.par) {
      transition <-
        .Call("transition_matrix", sequences, package = "atSNP")
      prior <- rowSums(transition)
      prior <- prior / sum(prior)
      transition <- transition / rowSums(transition)
      names(prior) <-
        colnames(transition) <-
        rownames(transition) <- c("A", "C", "G", "T")
    } else {
      data(default_par, envir = environment())
    }
    if (!mutation) {
      ## real SNP data
      a1.ref.base.id <-
        which(a1 == sequences[half.window.size + 1,])
      a2.ref.base.id <-
        which(a2 == sequences[half.window.size + 1,])
      ## store SNPs that have the same base in SNP and REF alleles only once
      a2.ref.base.id <-
        a2.ref.base.id[!a2.ref.base.id %in% a1.ref.base.id]
      discard.id <-
        setdiff(seq_along(a1), c(a1.ref.base.id, a2.ref.base.id))
      if (length(discard.id) > 0) {
        message(
          "Warning: the following sequences are discarded because the reference nucleotide matches to neither a1 nor a2:"
        )
        rsid.rm <-
          unique(as.character(tbl[keep.id[discard.id],]$snpid))
        message("snpid\tchr\tsnp\ta1\ta2")
        message(paste(apply(tbl[keep.id[discard.id], c("snpid", "chr", "snp", "a1", "a2")], 1, function(x)
          paste(x, collapse = "\t")), collapse = "\n"))
      }
    } else {
      ## single nucleotide mutation data
      a1.ref.base.id <- seq_along(a1)
      a2.ref.base.id <- numeric(0)
    }
    sequences <-
      sequences[, c(a1.ref.base.id, a2.ref.base.id), drop = FALSE]
    snpid.output <- snpid.output[c(a1.ref.base.id, a2.ref.base.id)]
    ref.base <- c(a1[a1.ref.base.id], a2[a2.ref.base.id])
    snp.base <- c(a2[a1.ref.base.id], a1[a2.ref.base.id])
    snpid.index <- snpid.index[c(a1.ref.base.id, a2.ref.base.id)]
    ## Keep the order of SNPs as in the input file
    if (useFile) {
      output.index = seq(ncol(sequences))
    } else {
      output.index = order(snpid.index)
    }
    return(
      list(
        sequence_matrix = matrix(sequences[, output.index], nrow = 2 * half.window.size +
                                   1),
        ref_base = ref.base[output.index],
        snp_base = snp.base[output.index],
        snpids = snpid.output[output.index],
        transition = transition,
        prior = prior,
        rsid.na = rsid.na,
        rsid.rm = rsid.rm,
        rsid.duplicate = rsid.duplicate,
        rsid.missing = rsid.missing
      )
    )
  }

#' @name LoadFastaData
#' @title Load the SNP data from fasta files.
#' @description Load SNP data.
#' @param ref.filename a fastq file name for the reference allele sequences.
#' @param snp.filename a fastq file name for the SNP allele sequences.
#'@param ref.urlname URL of a fastq file for the reference allele sequences.
#' @param snp.urlname URL of a fastq file for the SNP allele sequences.
#' @param snpids SNP IDs
#' @param default.par A boolean for whether using the default Markov parameters.
#'  Default: FALSE.
#' @return A list object containing the following components:
#' \tabular{ll}{
#' sequence_matrix \tab A list of integer vectors representing the deroxyribose
#' sequence around each SNP.\cr
#' a1 \tab An integer vector for the deroxyribose at the SNP location on the
#' reference genome.\cr
#' a2 \tab An integer vector for the deroxyribose at the SNP location on the SNP
#'  genome.\cr
#' }
#' The results are coded as: "A"-1, "C"-2, "G"-3, "T"-4.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples LoadFastaData(
#' ref.urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/sample_1.fasta",
#' snp.urlname="http://pages.stat.wisc.edu/~keles/atSNP-Data/sample_2.fasta")
#' @useDynLib atSNP
#' @import BiocFileCache
#' @import rappdirs
#' @export
LoadFastaData <- function(ref.filename = NULL,
                          snp.filename = NULL,
                          ref.urlname = NULL,
                          snp.urlname = NULL,
                          snpids = NULL,
                          default.par = FALSE) {
  if (is.null(ref.filename) & is.null(ref.urlname))  {
    stop("one argument among 'ref.filename' and 'ref.urlname' should be provided.")
  } else {
    if (!is.null(ref.filename) & !is.null(ref.urlname))
      stop("only one argument among 'ref.filename' and 'ref.urlname' should be provided.")
  }
  if (is.null(ref.filename)) {
    bfc <-
      BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"),
                    ask = FALSE)
    ref.rid <-
      bfcrid(bfcquery(
        bfc,
        query = basename(ref.urlname),
        exact = TRUE,
        field = "rname"
      ))
    if (!length(ref.rid))
      ref.rid <-
      names(bfcadd(bfc, rname = basename(ref.urlname), ref.urlname))
    refdat <- read.table(bfcrpath(rids = ref.rid))
  } else {
    refdat <- read.table(ref.filename)
  }
  
  if (is.null(snp.filename) & is.null(snp.urlname))  {
    stop("one argument among 'snp.filename' and 'snp.urlname' should be provided.")
  } else {
    if (!is.null(snp.filename) & !is.null(snp.urlname))
      stop("only one argument among 'snp.filename' and 'snp.urlname' should be provided.")
  }
  if (is.null(snp.filename)) {
    bfc <-
      BiocFileCache(cache = user_cache_dir(appname = "BiocFileCache"),
                    ask = FALSE)
    snp.rid <-
      bfcrid(bfcquery(
        bfc,
        query = basename(snp.urlname),
        exact = TRUE,
        field = "rname"
      ))
    if (!length(snp.rid))
      snp.rid <-
      names(bfcadd(bfc, rname = basename(snp.urlname), snp.urlname))
    snpdat <- read.table(bfcrpath(rids = snp.rid))
  } else {
    snpdat <- read.table(snp.filename)
  }
  
  if (nrow(refdat) != nrow(snpdat)) {
    stop("'ref.data' and 'snp.data' should have the same number of rows.")
  }
  n <- nrow(refdat)
  if (n %% 2 == 1) {
    stop("'ref.data' and 'snp.data' should have an even number of rows.")
  }
  
  ids <- 2 * seq(n / 2)
  refseqs <- as.character(refdat[ids, 1])
  snpseqs <- as.character(snpdat[ids, 1])
  
  if (is.null(snpids)) {
    snpids <- gsub(">", "", as.character(refdat[ids - 1, 1]))
  }
  
  if (var(sapply(refseqs, nchar)) != 0) {
    stop("sequences in 'ref.data' have different lengths.")
  }
  if (var(sapply(snpseqs, nchar)) != 0) {
    stop("sequences in 'snp.data' have different lengths.")
  }
  codes <- seq(4)
  names(codes) <- c("A", "C", "G", "T")
  refmat <- sapply(refseqs,
                   function(x)
                     codes[strsplit(x, "")[[1]]])
  snpmat <- sapply(snpseqs,
                   function(x)
                     codes[strsplit(x, "")[[1]]])
  colnames(refmat) <-
    colnames(snpmat) <- rownames(refmat) <- rownames(snpmat) <- NULL
  m <- nrow(refmat)
  if (nrow(refmat) != nrow(snpmat)) {
    stop("the sequences for the SNP alleles and the reference alleles have different lengths.")
  }
  if (m %% 2 == 0) {
    stop("each sequence must have an odd number of length.")
  }
  
  id.na1 <- which(apply(refmat, 2, function(x)
    sum(is.na(x))) > 0)
  id.na2 <- which(is.na(snpmat[(m + 1) / 2,]))
  id.na <- union(id.na1, id.na2)
  if (length(id.na) > 0) {
    message(
      "Warning: the following sequences are discarded, because they include bases other than A, C, G, T: "
    )
    message(paste(id.na, collapse = ", "))
  }
  
  id.wrong <-
    which(apply((refmat != snpmat)[-(m + 1) / 2,], 2, sum) > 0)
  if (length(id.wrong) > 0) {
    message(
      "Warning: the following sequences are discarded, because they have unidentical nucleotides between the SNP and the reference allele at positions other than the central location: "
    )
    message(paste(id.wrong, collapse = ", "))
  }
  
  ids <- union(id.wrong, id.na)
  if (length(ids) > 0) {
    sequences <- refmat[,-ids, drop = FALSE]
    ref.base <- refmat[(m + 1) / 2,-ids]
    snp.base <- snpmat[(m + 1) / 2,-ids]
  } else {
    sequences <- refmat
    ref.base <- refmat[(m + 1) / 2,]
    snp.base <- snpmat[(m + 1) / 2,]
  }
  
  if (!default.par) {
    transition <-
      .Call("transition_matrix", sequences, package = "atSNP")
    prior <- rowSums(transition)
    prior <- prior / sum(prior)
    transition <- transition / rowSums(transition)
    names(prior) <-
      colnames(transition) <-
      rownames(transition) <- c("A", "C", "G", "T")
  } else {
    data(default_par, envir = environment())
  }
  colnames(sequences) <- snpids
  
  return(
    list(
      sequence_matrix = sequences,
      ref_base = ref.base,
      snp_base = snp.base,
      transition = transition,
      prior = prior,
      snpids = snpids
    )
  )
}

#' @name ComputeMotifScore
#' @title Compute the scores for SNP effects on motifs.
#' @description Compute the log-likelihood scores for motifs.
#' @param motif.lib A list object with the output format of function
#' \code{\link{LoadMotifLibrary}}.
#' @param snp.info A list object with the output format of function
#' \code{\link{LoadSNPData}}.
#' @param ncores An integer for the number of parallel process. Default: 1.
#' @details This function computes the binding affinity scores for both alleles
#' at each SNP window. For each pair of SNP and motif, it finds the subsequence
#' from both strand that maximizes the affinity binding score. It returns both
#' the matching positions and the maximized affinity scores.
#' @return A list of two data.frame's. Field \code{snp.tbl} contains:
#' \tabular{cc}{
#' snpid \tab SNP id.\cr
#' ref_seq \tab Reference allele nucleotide sequence.\cr
#' snp_seq \tab SNP allele nucleotide sequence.\cr
#' ref_seq_rev \tab Reference allele nucleotide sequence on the reverse
#' strand.\cr
#' snp_seq_rev \tab SNP allele nucleotide sequence on the reverse strand.\cr}
#' Field \code{motif.score} contains:
#' \tabular{cc}{
#' motif \tab Name of the motif.\cr
#' motif_len \tab Length of the motif.\cr
#' ref_start, ref_end, ref_strand \tab Location of the best matching subsequence
#'  on the reference allele.\cr
#' snp_start, snp_end, snp_strand \tab Location of the best matching subsequence
#'  on the SNP allele.\cr
#' log_lik_ref \tab Log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab Log-likelihood score for the SNP allele.\cr
#' log_lik_ratio \tab The log-likelihood ratio.\cr
#' mean_log_lik_ref \tab Mean subsequence log-likelihood score for the reference allele.\cr
#' mean_log_lik_snp \tab Mean subsequence log-likelihood score for the SNP allele.\cr
#' mean_log_lik_ratio \tab The ratio of mean subsequence log-likelihood between the SNP
#'  and the reference allele.\cr
#' median_log_lik_ref \tab Median subsequence log-likelihood score for the reference allele.\cr
#' median_log_lik_snp \tab Median subsequence log-likelihood score for the SNP allele.\cr
#' median_log_lik_ratio \tab The ratio of median subsequence log-likelihood between the SNP
#'  and the reference allele.\cr
#' log_enhance_odds \tab Difference in log-likelihood ratio between SNP allele
#' and reference allele based on the best matching subsequence on the reference
#' allele.\cr
#' log_reduce_odds \tab Difference in log-likelihood ratio between reference
#' allele and SNP allele based on the best matching subsequence on the SNP
#' allele.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' ComputeMotifScore(motif_library, snpInfo, ncores = 2)
#' @useDynLib atSNP
#' @import data.table
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @export
ComputeMotifScore <- function(motif.lib, snp.info, ncores = 1) {
  ## check arguments
  if (sum(!unlist(sapply(motif.lib, is.matrix))) > 0 |
      sum(unlist(sapply(motif.lib, ncol)) != 4) > 0) {
    stop("'motif.lib' must be a list of numeric matrices each with 4 columns.")
  }
  if (sum(!c("sequence_matrix", "snp_base", "ref_base") %in% names(snp.info)) > 0) {
    stop(
      "'snp.info' must contain three components: 'ref_base', 'snp_base', 'sequence_matrix'."
    )
  }
  if (ncol(snp.info$sequence_matrix) != length(snp.info$ref_base) |
      length(snp.info$ref_base) != length(snp.info$snp_base)) {
    stop(
      "the number of columns of 'snp.info$sequence_matrix', the length of 'snp.info$ref_base' and the length of 'snp.info$snp_base' must be the same."
    )
  }
  if (!all(sort(unique(
    c(
      c(snp.info$sequence_matrix),
      snp.info$ref_base,
      snp.info$snp_base
    )
  )) %in% seq(4))) {
    stop(
      "'snp.info$sequence_matrix', 'snp.info$ref_base', 'snp.info$snp_base' can only contain entries in 1, 2, 3, 4."
    )
  }
  if (nrow(snp.info$sequence_matrix) / 2 == as.integer(nrow(snp.info$sequence_matrix) / 2)) {
    stop(
      "'snp.info$sequence_matrix' must have an odd number of rows so that the central row refers to the SNP nucleotide."
    )
  }
  
  motifs <- names(motif.lib)
  snpids <- snp.info$snpids
  snpbases <-
    ifelse(snp.info$snp_base == 1,
           "A",
           ifelse(
             snp.info$snp_base == 2,
             "C",
             ifelse(snp.info$snp_base == 3, "G", "T")
           ))
  nmotifs <- length(motif.lib)
  len_seq <- nrow(snp.info$sequence_matrix)
  snp.info$sequence_matrix[(len_seq + 1) / 2, ] <- snp.info$ref_base
  ncores <- min(c(ncores, length(snp.info$ref_base)))
  
  k <- as.integer(length(snp.info$ref_base) / ncores)
  if (ncores > 1) {
    if (Sys.info()[["sysname"]] == "Windows") {
      snow <- SnowParam(workers = ncores, type = "SOCK")
      motif_score_par_list <-
        bpmapply(
          function(x)
            motif_score_par(
              i = x,
              par.k = k,
              par.ncores = ncores,
              par.motifs = motifs,
              par.nmotifs = nmotifs,
              par.snpids = snpids,
              par.snpbases = snpbases,
              par.len_seq = len_seq,
              par.motif.lib = motif.lib,
              par.snp.info = snp.info
            ),
          seq(ncores),
          BPPARAM = snow,
          SIMPLIFY = FALSE
        )
    } else {
      motif_score_par_list <-
        bpmapply(
          function(x)
            motif_score_par(
              i = x,
              par.k = k,
              par.ncores = ncores,
              par.motifs = motifs,
              par.nmotifs = nmotifs,
              par.snpids = snpids,
              par.snpbases = snpbases,
              par.len_seq = len_seq,
              par.motif.lib = motif.lib,
              par.snp.info = snp.info
            ),
          seq(ncores),
          BPPARAM = MulticoreParam(workers = ncores),
          SIMPLIFY = FALSE
        )
    }
  }
  else {
    motif_score_par_list <-
      list(
        motif_score_par(
          i = 1,
          par.k = k,
          par.ncores = ncores,
          par.motifs = motifs,
          par.nmotifs = nmotifs,
          par.snpids = snpids,
          par.snpbases = snpbases,
          par.len_seq = len_seq,
          par.motif.lib = motif.lib,
          par.snp.info = snp.info
        )
      )
  }
  
  motif.scores_dt <- motif_score_par_list[[1]]
  if (ncores > 1) {
    for (i in 2:ncores) {
      motif.scores_dt <- rbind(motif.scores_dt, motif_score_par_list[[i]])
    }
  }
  #  setkey(motif.scores_dt, motif, snpid, snpbase)
  motif.scores <-
    as.data.frame(motif.scores_dt, stringAsFactors = FALSE)
  motif.scores <-
    motif.scores[order(motif.scores$motif,
                       motif.scores$snpid,
                       motif.scores$snpbase),]
  # motif.scores[with(motif.scores, order(motif, snpid, snpbase)), ]
  ## sequences on the reference genome
  ref_seqs <-
    apply(snp.info$sequence_matrix, 2, function(x)
      paste(c("A", "C", "G", "T")[x], collapse = ""))
  ref_seqs_rev <-
    apply(snp.info$sequence_matrix, 2, function(x)
      paste(c("A", "C", "G", "T")[5 - rev(x)], collapse = ""))
  ## sequences on the snp allele
  id1 <- seq(as.integer(nrow(snp.info$sequence_matrix) / 2))
  id2 <- id1 + (nrow(snp.info$sequence_matrix) + 1) / 2
  snp_seqs <-
    apply(rbind(snp.info$sequence_matrix, snp.info$snp_base), 2,
          function(x)
            paste(c("A", "C", "G", "T")[x[c(id1, length(x), id2)]],
                  collapse = ""))
  snp_seqs_rev <-
    apply(rbind(snp.info$sequence_matrix, snp.info$snp_base), 2,
          function(x)
            paste(c("A", "C", "G", "T")[5 - rev(x[c(id1, length(x), id2)])],
                  collapse = ""))
  
  #  snp_tbl <- data.table(snpid = snpids,
  #                        ref_seq = ref_seqs,
  #                        snp_seq = snp_seqs,
  #                        ref_seq_rev = ref_seqs_rev,
  #                        snp_seq_rev = snp_seqs_rev,
  #                        snpbase=snpbases)
  #  setkey(snp_tbl, snpid, snpbase)
  snp_tbl <-
    data.frame(
      snpid = snpids,
      ref_seq = ref_seqs,
      snp_seq = snp_seqs,
      ref_seq_rev = ref_seqs_rev,
      snp_seq_rev = snp_seqs_rev,
      snpbase = snpbases,
      stringsAsFactors = FALSE
    )
  snpid <- snpbase <- NULL
  snp_tbl[with(snp_tbl, order(snpid, snpbase)), ]
  return(list(snp.tbl = snp_tbl, motif.scores = motif.scores))
}


#' @name MatchSubsequence
#' @title Compute the matching subsequence.
#' @description This function combines the SNP set, the motif library and the
#' affinity score table and produce the matching subsequence found at each SNP
#' location for each motif.
#' @param snp.tbl A data.frame with the following information:
#' \tabular{cc}{
#' snpid \tab SNP id.\cr
#' ref_seq \tab Reference allele nucleotide sequence.\cr
#' snp_seq \tab SNP allele nucleotide sequence.\cr
#' ref_seq_rev \tab Reference allele nucleotide sequence on the reverse
#' strand.\cr
#' snp_seq_rev \tab SNP allele nucleotide sequence on the reverse strand.\cr}
#' @param motif.scores A data.frame with the following information:
#' \tabular{cc}{
#' motif \tab Name of the motif.\cr
#' motif_len \tab Length of the motif.\cr
#' ref_start, ref_end, ref_strand \tab Location of the best matching subsequence
#'  on the reference allele.\cr
#' snp_start, snp_end, snp_strand \tab Location of the best matching subsequence
#'  on the SNP allele.\cr
#' log_lik_ref \tab Log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab Log-likelihood score for the SNP allele.\cr
#' log_lik_ratio \tab The log-likelihood ratio.\cr
#' log_enhance_odds \tab Difference in log-likelihood ratio between SNP allele
#' and reference allele based on the best matching subsequence on the reference
#' allele.\cr
#' log_reduce_odds \tab Difference in log-likelihood ratio between reference
#' allele and SNP allele based on the best matching subsequence on the SNP
#' allele.\cr
#' }
#' @param motif.lib A list of the position weight matrices for the motifs.
#' @param snpids A subset of snpids to compute the subsequences. Default: NULL,
#' when all snps are computed.
#' @param motifs A subset of motifs to compute the subsequences. Default: NULL,
#' when all motifs are computed.
#' @param ncores The number of cores used for parallel computing.
#' @return A data.frame containing all columns in both \code{snp.tbl} and
#' \code{motif.scores}. In addition, the following columns are added:
#' \tabular{ll}{
#' ref_match_seq \tab Best matching subsequence on the reference allele.\cr
#' snp_match_seq \tab Best matching subsequence on the SNP allele.\cr
#' ref_seq_snp_match \tab Subsequence on the reference allele corresponding to
#' the best matching location on the SNP allele.\cr
#' snp_seq_ref_match \tab Subsequence on the SNP allele corresponding to the
#' best matching location on the reference allele.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' MatchSubsequence(motif_scores$snp.tbl, motif_scores$motif.scores,
#' motif_library, ncores=2)
#' @useDynLib atSNP
#' @import data.table
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @export
MatchSubsequence <-
  function(snp.tbl,
           motif.scores,
           motif.lib,
           snpids = NULL,
           motifs = NULL,
           ncores = 1) {
    if (is.null(snpids)) {
      snpids <- unique(snp.tbl$snpid)
    }
    if (is.null(motifs)) {
      motifs <- unique(motif.scores$motif)
    }
    if (sum(!motifs %in% names(motif.lib)) > 0) {
      motif.discard <- setdiff(motifs, unique(names(motif.lib)))
      message("Warning: the following motifs are not included in 'motif.lib' and are discarded: ")
      message(paste(motif.discard, collapse = ", "))
      motifs <- setdiff(motifs, motif.discard)
    }
    if (sum(!snpids %in% motif.scores$snpid) > 0) {
      snp.discard <- setdiff(snpids, unique(motif.scores$snpid))
      message("Warning: the following snpids are not included in 'motif.scores' and are discarded: ")
      message(paste(snp.discard, collapse = ", "))
      snpids <- setdiff(snpids, snp.discard)
    }
    snpids <- unique(snpids)
    motifs <- unique(motifs)
    motif.scores_dt <- as.data.table(motif.scores)
    setkey(motif.scores_dt, motif, snpid, snpbase)
    snp.tbl <- as.data.table(snp.tbl)
    setkey(snp.tbl, snpid, snpbase)
    motif.scores <-
      motif.scores_dt[snpid %in% snpids & motif %in% motifs,]
    snp.tbl <- snp.tbl[snpid %in% snpids,]
    len_seq <- ref_seq <- NULL
    snp.tbl[, len_seq := nchar(ref_seq)]
    
    ## get the IUPAC subsequence for the motifs
    motif.tbl <- data.table(motif = motifs,
                            IUPAC = sapply(motif.lib[motifs],
                                           function(x)
                                             GetIUPACSequence(x, prob = 0.25)))
    motif <- snpid <- snpbase <- NULL
    setkey(motif.tbl, motif)
    setkey(snp.tbl, snpid, snpbase)
    
    ncores <- min(c(ncores, length(snpids)))
    
    k <- as.integer(length(snpids) / ncores)
    
    if (ncores > 1) {
      if (Sys.info()[["sysname"]] == "Windows") {
        snow <- SnowParam(workers = ncores, type = "SOCK")
        motif_score_par_list <-
          bpmapply(
            function(x)
              match_subseq_par(
                i = x,
                par.k = k,
                par.ncores = ncores,
                par.snp.tbl = snp.tbl,
                par.snpids = snpids,
                par.motif.scores = motif.scores,
                par.motif = motif,
                par.motif.tbl = motif.tbl
              ),
            seq(ncores),
            BPPARAM = snow,
            SIMPLIFY = FALSE
          )
      } else{
        motif_score_par_list <-
          bpmapply(
            function(x)
              match_subseq_par(
                i = x,
                par.k = k,
                par.ncores = ncores,
                par.snp.tbl = snp.tbl,
                par.snpids = snpids,
                par.motif.scores = motif.scores,
                par.motif = motif,
                par.motif.tbl = motif.tbl
              ),
            seq(ncores),
            BPPARAM = MulticoreParam(workers = ncores),
            SIMPLIFY = FALSE
          )
      }
    } else
    {
      motif_score_par_list <-
        list(
          match_subseq_par(
            1,
            par.k = k,
            par.ncores = ncores,
            par.snp.tbl = snp.tbl,
            par.snpids = snpids,
            par.motif.scores = motif.scores,
            par.motif = motif,
            par.motif.tbl = motif.tbl
          )
        )
    }
    
    motif_score_tbl <- motif_score_par_list[[1]]
    if (ncores > 1) {
      for (i in 2:ncores) {
        motif_score_tbl <- rbind(motif_score_tbl,
                                 motif_score_par_list[[i]])
      }
    }
    motif_score_tbl <-
      as.data.frame(motif_score_tbl, stringAsFacotrs = FALSE)
    return(motif_score_tbl)
  }

#' @name ComputePValues
#' @title Compute p-values for affinity scores.
#' @description This function computes the p-values for allele-specific affinity
#'  scores and between-allele affinity score changes using the importance
#'  sampling technique.
#' @param motif.lib A list object with the output format of function
#' \code{\link{LoadMotifLibrary}}.
#' @param snp.info A list object with the output format of function
#' \code{\link{LoadSNPData}}.
#' @param motif.scores A data.frame object containing at least the following
#' columns:
#' \tabular{ll}{
#' motif \tab The name of the motif.\cr
#' log_lik_ref \tab The log-likelihood score for the reference allele.\cr
#' log_lik_snp \tab The log-likelihood score for the SNP allele.\cr
#' }
#' @param ncores An integer for the number of parallel process. Default: 1.
#' @param  testing.mc If FALSE, Monte Carlo sample size is adaptively chosen
#'  between 2000 and 100K. If TRUE, the sample size is 100, which is used for
#'  testing purposes. Default: FALSE
#' @param loglik.type A string of 'max', 'mean', 'median'.
#' @param figdir A string for the path to print p-value plots for monitoring
#' results. Default: NULL (no figure).
#' @return A data.frame extending \code{motif.scores} by the following
#' additional columns:
#' \tabular{ll}{
#' pval_ref \tab P-values for scores on the reference allele.\cr
#' pval_snp \tab P-values for scores on the SNP allele.\cr
#' pval_cond_ref \tab Conditional p-values for scores on the reference
#' allele.\cr
#' pval_cond_snp \tab Conditional p-values for scores on the SNP allele.\cr
#' pval_diff \tab P-values for the difference in scores between the reference
#' and the SNP alleles.\cr
#' pval_rank \tab P-values for the log rank ratio between the reference and the
#' SNP alleles.\cr
#' }
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' ComputePValues(motif_library, snpInfo, motif_scores$motif.scores, ncores = 2, testing.mc=TRUE)
#' @import Rcpp
#' @import data.table
#' @import ggplot2
#' @importFrom BiocParallel bpmapply MulticoreParam SnowParam
#' @useDynLib atSNP
#' @export
ComputePValues <-
  function(motif.lib,
           snp.info,
           motif.scores,
           ncores = 1,
           testing.mc = FALSE,
           loglik.type = "max",
           figdir = NULL) {
    ncores <- min(c(ncores, length(motif.lib)))
    results <- as.list(seq_along(motif.lib))
    nsets <- as.integer(length(motif.lib) / ncores)
    motif.scores <- as.data.table(motif.scores)
    prior <- snp.info$prior
    transition <- snp.info$transition
    
    if (!loglik.type %in% LOGLIK_TYPES) {
      stop(loglik.type, "is not one of", LOGLIK_TYPES)
    }
    
    if (Sys.info()[["sysname"]] == "Windows") {
      snow <- SnowParam(workers = ncores, type = "SOCK")
      results <-
        bpmapply(
          function(x)
            p_values_for_motif(
              motif.id = x,
              par.prior = prior,
              par.transition = transition,
              par.motif.lib = motif.lib,
              par.motif.scores = motif.scores,
              par.testing.mc = testing.mc,
              par.loglik.type = loglik.type,
              par.figdir = figdir
            ),
          seq_along(motif.lib),
          BPPARAM = snow,
          SIMPLIFY = FALSE
        )
    } else{
      results <-
        bpmapply(
          function(x)
            p_values_for_motif(
              motif.id = x,
              par.prior = prior,
              par.transition = transition,
              par.motif.lib = motif.lib,
              par.motif.scores = motif.scores,
              par.testing.mc = testing.mc,
              par.loglik.type = loglik.type,
              par.figdir = figdir
            ),
          seq_along(motif.lib),
          BPPARAM = MulticoreParam(workers = ncores),
          SIMPLIFY = FALSE
        )
    }
    
    motif.scores_dt <- as.data.table(motif.scores)
    motif <- snpid <- snpbase <- NULL
    setkey(motif.scores_dt, motif, snpid, snpbase)
    
    pval_ref <-
      pval_snp <-
      pval_cond_ref <- 
      pval_cond_snp <- 
      pval_diff <- 
      pval_rank <- NULL
    for (i in seq_along(results)) {
      motif.scores_dt[results[[i]]$rowids, pval_ref := results[[i]]$pval_a[, 1]]
      motif.scores_dt[results[[i]]$rowids, pval_snp := results[[i]]$pval_a[, 2]]
      motif.scores_dt[results[[i]]$rowids, pval_cond_ref := results[[i]]$pval_cond[, 1]]
      motif.scores_dt[results[[i]]$rowids, pval_cond_snp := results[[i]]$pval_cond[, 2]]
      motif.scores_dt[results[[i]]$rowids, pval_diff := results[[i]]$pval_diff[, 1]]
      motif.scores_dt[results[[i]]$rowids, pval_rank := results[[i]]$pval_rank[, 1]]
    }
    motif.scores.pval <-
      as.data.frame(motif.scores_dt, stringsAsFactors = FALSE)
    return(motif.scores.pval)
  }

#' @name GetIUPACSequence
#' @title Get the IUPAC sequence of a motif.
#' @description Convert the posotion weight matrix of a motif to the IUPAC
#' sequence.
#' @param pwm The position weight matrix, with the columns representing
#' A, C, G, T.
#' @param prob The probability threshold. Default: 0.25.
#' @return A character string.
#' @author Sunyoung Shin \email{sunyoung.shin@@utdallas.edu}, Chandler Zuo
#' \email{chandler.c.zuo@@gmail.com}
#' @examples
#' data(example)
#' GetIUPACSequence(motif_library[[1]], prob = 0.2)
#' @export
GetIUPACSequence <- function(pwm, prob = 0.25) {
  iupac.table <-
    c(".",
      "A",
      "C",
      "M",
      "G",
      "R",
      "S",
      "V",
      "T",
      "W",
      "Y",
      "H",
      "K",
      "D",
      "B",
      "N")
  iupac.value <- (pwm >= prob) %*% c(1, 2, 4, 8) + 1
  return(paste(iupac.table[iupac.value], collapse = ""))
}
chandlerzuo/atSNP documentation built on Oct. 31, 2020, 4:33 a.m.