universalmotif: Import, Modify, and Export Motifs with R

Documented in read_meme

#' Import MEME motifs.
#'
#' Import MEME formatted motifs, as well as original motif sequences. See
#' \url{http://meme-suite.org/doc/meme-format.html}. Both 'full' and 'minimal'
#' formats are supported. DREME and STREME motifs can also be imported, but note
#' that `readsites` and `readsites.meta` arguments do nothing.
#'
#' @param file `character(1)` File name.
#' @param skip `numeric(1)` If not zero, will skip however many desired lines in the
#'    file before starting to read.
#' @param readsites `logical(1)` If `TRUE`, the motif sites will be read as well.
#' @param readsites.meta `logical(1)` If `readsites = TRUE`, then additionally
#'    read site positions and P-values.
#' @param readsites.meta.tidy `logical(1)` If `readsites.meta = TRUE`, merge the
#'    position site information for all motifs into a single tidy `data.frame`.
#'
#' @return `list` [universalmotif-class] objects. If `readsites = TRUE`, a list
#'    comprising of a sub-list of motif objects and a sub-list of
#'    motif sites will be returned. If `readsites.meta = TRUE`, then two
#'    additional list items will be present, one containing site positions
#'    and P-values, and another containing combined sequence p-values. If
#'    `readsites.meta.tidy = TRUE`, an additional list entry named
#'    `sites.meta.tidy` will be added containing a `data.frame`.
#'
#' @details
#' Please note that the typical number precision limit in R is around 1e-308.
#' This means that motif P-values in MEME files below this limit are rounded
#' automatically to 0. To get around this, the E-value is also stored as a
#' string in the `extrainfo` slot. If you require a numeric value for analysis,
#' use the [log_string_pval()] function to get the log of the string-formatted
#' p-value.
#'
#' @examples
#' meme.minimal <- read_meme(system.file("extdata", "meme_minimal.txt",
#'                                       package = "universalmotif"))
#' meme.full <- read_meme(system.file("extdata", "meme_full.txt",
#'                                    package = "universalmotif"))
#' ## Get numeric p-value:
#' log_string_pval(meme.minimal[[1]]["extrainfo"]["eval.string"])
#'
#' @references
#'
#' Bailey TL, Boden M, Buske FA, Frith M, Grant CE, Clementi L, Ren
#' J, Li WW, Noble WS (2009). “MEME SUITE: tools for motif discovery
#' and searching.” *Nucleic Acids Research*, **37**, W202-W208.
#'
#' @family read_motifs
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @export
read_meme <- function(file, skip = 0, readsites = FALSE,
                      readsites.meta = FALSE, readsites.meta.tidy = FALSE) {

  # param check --------------------------------------------
  args <- as.list(environment())
  char_check <- check_fun_params(list(file = args$file),
                                 1, FALSE, TYPE_CHAR)
  num_check <- check_fun_params(list(skip = args$skip), 1, FALSE, TYPE_NUM)
  logi_check <- check_fun_params(list(readsites = args$readsites,
                                      readsites.meta.tidy = args$readsites.meta.tidy,
                                      readsites.meta = args$readsites.meta),
                                 numeric(), logical(), TYPE_LOGI)
  all_checks <- c(char_check, num_check, logi_check)
  if (length(all_checks) > 0) stop(all_checks_collapse(all_checks))
  #---------------------------------------------------------

  raw_lines <- readLines(con <- file(file))
  close(con)
  if (skip > 0) raw_lines <- raw_lines[-seq_len(skip)]
  raw_lines <- raw_lines[raw_lines != ""]
  raw_lines <- raw_lines[!grepl("^#", raw_lines)]
  raw_lines <- raw_lines[!grepl("\\*\\*\\*\\*\\*\\*\\*\\*\\*\\*", raw_lines)]
  raw_lines <- raw_lines[!grepl("------------", raw_lines)]

  check_meme_version(raw_lines)

  alph <- get_meme_alph(raw_lines)
  alph.len <- get_meme_alph_len(alph)
  alph.split <- switch(alph, DNA = DNA_BASES, RNA = RNA_BASES, AA = AA_STANDARD2,
      safeExplode(alph))

  strands <- raw_lines[grepl("^strands:", raw_lines)]
  if (length(strands) > 0) {
    strands <- strsplit(strands, "\\s+")[[1]][-1]
  } else {
    message("Could not find strand info, assuming +.")
    strands <- "+"
  }
  if (all(c("+", "-") %in% strands)) {
    strands <- "+-"
  }
  bkg.start <- grep("^Background letter frequencies", raw_lines)
  if (length(bkg.start)) {
    bkg.offset <- 1
    bkg <- raw_lines[bkg.start + bkg.offset]
    bkg <- strsplit(bkg, "\\s+")[[1]]
    bkg <- bkg[bkg != ""]  # if the line if prepended with a space
    bkg <- as.numeric(bkg[seq_len(length(bkg)) %% 2 == 0])
    while (length(bkg) < alph.len) {
      bkg.offset <- bkg.offset + 1
      bkg.tmp <- raw_lines[bkg.start + bkg.offset]
      bkg.tmp <- strsplit(bkg.tmp, "\\s+")[[1]]
      bkg.tmp <- bkg.tmp[bkg.tmp != ""]
      bkg.tmp <- as.numeric(bkg.tmp[seq_along(bkg.tmp) %% 2 == 0])
      bkg <- c(bkg, bkg.tmp)
    }
    if (anyNA(bkg))
      stop("Could not parse background frequencies, check that they match alphabet")
  } else {
    message("Could not find background, assuming uniform frequencies.")
    bkg <- rep(1 / length(alph.split), length(alph.split))
  }

  motif_meta <- grep("^letter-probability matrix:", raw_lines)
  motif_names_i <- grep("^MOTIF ", raw_lines)
  motif_names <- lapply(raw_lines[motif_names_i], function(x) {
                            x <- strsplit(x, "\\s+")[[1]]
                            if (x[1] == "") x[3] else x[2]
                          })
  motif_altnames <- lapply(raw_lines[motif_names_i], function(x) {
                            x <- strsplit(x, "\\s+")[[1]]
                            if (x[1] == "") x[4] else x[3]
                          })
  motif_starts <- motif_meta + 1
  motif_stops <- sapply(raw_lines[motif_meta],
                        function(x) strsplit(x, "\\s+")[[1]][6])
  motif_stops <- motif_meta + as.numeric(motif_stops)

  motif_meta <- lapply(raw_lines[motif_meta],
                         function(x) {
                           x <- strsplit(x, "\\s+")[[1]]
  #                          c(nsites = as.numeric(x[8]),
  # # Add a C++ function here to convert long double to log10 values?
  #                            eval = as.numeric(x[10]))
                           c(nsites = x[8], eval = x[10])
                         })
  motif_list <- mapply(function(x, y) {
                           z <- trimws(raw_lines[x:y])
                           z <- sapply(z, function(x) strsplit(x, "\\s+")[[1]])
                           if (nrow(z) != alph.len)
                             stop("Alphabet length does not match motif length")
                           z <- z[order(alph.split, method = "radix"), ]
                           as.numeric(z)
                         }, motif_starts, motif_stops, SIMPLIFY = FALSE)

  motif_list <- mapply(function(x, y, z, x2) {
                          mot <- universalmotif_cpp(name = x,
                                           type = "PPM",
                                           altname = x2,
                                           nsites = as.numeric(y[1]),
                                           eval = as.numeric(y[2]),
                                           bkg = bkg,
                                           alphabet = alph,
                                           strand = strands,
                                           extrainfo = c(eval.string = unname(y[2])),
                                           motif = t(matrix(z, ncol = alph.len,
                                                            byrow = TRUE)))
                          validObject_universalmotif(mot)
                          mot
                         }, motif_names, motif_meta, motif_list, motif_altnames,
                         SIMPLIFY = FALSE)

  if (length(motif_list) == 1) motif_list <- motif_list[[1]]

  if (readsites) {
    if (is.list(motif_list))
      mot.names <- vapply(motif_list, function(x) x@name, character(1))
    else
      mot.names <- motif_list@name
    block.starts <- grep("in BLOCKS format", raw_lines)
    if (length(block.starts) == 0) {
      warning("could not find BLOCKS formatted motifs in MEME file")
      motif_list <- list(motifs = motif_list, sites = NULL)
    } else {
      block.len <- vapply(block.starts,
                          function(x) strsplit(raw_lines[x + 1], "seqs=")[[1]][2],
                          character(1))
      block.len <- as.numeric(block.len)
      block.starts <- block.starts + 2
      block.stops <- block.starts +  block.len - 1
      blocks <- mapply(function(x, y) read.table(text = raw_lines[x:y],
                                                 stringsAsFactors = FALSE),
                        block.starts, block.stops,
                        SIMPLIFY = FALSE)
      sites <- lapply(blocks, function(x) x$V4)
      site.names <- lapply(blocks, function(x) x$V1)
      sites <- switch(alph, "DNA" = lapply(sites, DNAStringSet),
                      "RNA" = lapply(sites, RNAStringSet),
                      "AA" = lapply(sites, AAStringSet),
                      lapply(sites, BStringSet))
      sites <- mapply(function(x, y) {names(x) <- y; x},
                        sites, site.names,
                        SIMPLIFY = FALSE)
      names(sites) <- mot.names
      if (length(sites) == 1) sites <- sites[[1]]
      if (is.list(sites) && is.list(motif_list))  # TODO: this is a bug..
        if (length(sites) != length(motif_list))
          sites <- sites[seq_len(length(motif_list))]
      motif_list <- list(motifs = motif_list, sites = sites)
    }

    if (!readsites.meta && readsites.meta.tidy) {
      warning("'readsites.meta.tidy' is not valid if 'readsites.meta = FALSE'")
    } else if (readsites.meta) {
      site.starts <- grep("sites sorted by position p-value", raw_lines)
      site.stops <- grep("block diagrams$", raw_lines)
      if (length(site.starts) == 0 || length(site.stops) == 0) {
        warning("Could not find site P-values in MEME file")
      } else {
        site.starts <- site.starts + 2
        site.stops <- site.stops - 1
        site.tables <- mapply(function(x, y) {
                                z <- raw_lines[x:y]
                                lapply(z, function(x) strsplit(x, "\\s+")[[1]])
                              }, site.starts, site.stops, SIMPLIFY = FALSE)
        col.seqname <- 1
        if (all(grepl("Strand", raw_lines[site.starts - 1]))) {
          col.pos <- 3
          col.pval <- 4
          col.seq <- 6
        } else {
          col.pos <- 2
          col.pval <- 3
          col.seq <- 5
        }
        site.tables <- lapply(site.tables,
                              function(x) {
                                z1 <- vapply(x, function(x) x[col.seqname], character(1))
                                z2 <- vapply(x, function(x) x[col.pos], character(1))
                                z3 <- vapply(x, function(x) x[col.pval], character(1))
                                z4 <- vapply(x, function(x) x[col.seq], character(1))
                                data.frame(Sequence = z1,
                                           Position = as.numeric(z2),
                                           Pvalue = as.numeric(z3),
                                           Site = z4,
                                           stringsAsFactors = FALSE)
                              })
        names(site.tables) <- mot.names
        if (length(site.tables) == 1) site.tables <- site.tables[[1]]
        motif_list <- list(motifs = motif_list$motifs,
                           sites = motif_list$sites,
                           sites.meta = site.tables)

        summ.start <- grep("Combined block diagrams: non-overlapping sites",
                           raw_lines)
        if (length(summ.start) == 0) {
          warning("Could not find combined P-values in MEME file")
        } else {
          summ.start <- summ.start + 2
          summ.raw <- raw_lines[summ.start:(length(raw_lines) - 2)]
          need.fix <- grep("\\", summ.raw, fixed = TRUE)
          if (length(need.fix) > 0) {
            need.fix2 <- need.fix + 1
            summ.raw[need.fix] <- vapply(summ.raw[need.fix],
                                         function(x) strsplit(x, "\\",
                                                              fixed = TRUE)[[1]],
                                         character(1))
            summ.raw[need.fix2] <- gsub("\\s+", "", summ.raw[need.fix2])
            summ.raw[need.fix] <- mapply(function(x, y) paste0(x, y),
                                         summ.raw[need.fix], summ.raw[need.fix2])
            summ.raw <- summ.raw[-need.fix2]
          }
          summ.tab <- read.table(text = summ.raw, stringsAsFactors = FALSE)
          colnames(summ.tab) <- c("Sequence", "Combined.Pvalue", "Diagram")
          motif_list <- list(motifs = motif_list$motifs,
                             sites = motif_list$sites,
                             sites.meta = motif_list$sites.meta,
                             sites.meta.combined = summ.tab)
        }
      }
    }

    if (readsites.meta.tidy) {
      motif_list$sites.meta.tidy <- tidy_meme_meta(motif_list$sites.meta)
    }

  } else if (readsites.meta) {
    warning("'readsites.meta' is not valid if 'readsites = FALSE'")
  }

  motif_list

}

tidy_meme_meta <- function(sites.meta) {
  m <- names(sites.meta)
  for (i in seq_along(sites.meta)) sites.meta[[i]]$Motif <- m[i]
  sites.meta <- do.call(rbind, sites.meta)
  sites.meta[order(sites.meta$Sequence, sites.meta$Position), ]
}

#' Returns type of MEME alphabet to use
#'
#' Used to deploy a switch() statement in get_meme_alph()
#'
#' @param raw_lines raw lines from .meme file
#'
#' @return Alphabet type
#'
#' @noRd
check_meme_alph_type <- function(raw_lines){
  if (any(grepl("^ALPHABET=", raw_lines))) {
    "default"
  } else if (any(grepl("^ALPHABET", raw_lines))) {
    "custom"
  } else {
    message("Could not find alphabet, assuming DNA.")
    "none"
  }
}

#' Grabs alphabet type for defined alphabet entries
#'
#' @param raw_lines raw lines from .meme file
#'
#' @return The alphabet
#'
#' @noRd
get_default_meme_alph <- function(raw_lines) {
  alph <- raw_lines[grepl("^ALPHABET=", raw_lines)]
  alph <- strsplit(alph, "\\s+")[[1]][2]
  alph <- switch(alph, "ACGT" = "DNA", "ACGU" = "RNA",
                 "ACDEFGHIKLMNPQRSTVWY" = "AA", alph)
  return(alph)
}

#' Get custom alphabet defintion from .meme file
#'
#' If this ever runs, import will probably break downstream because custom
#' alphabets typically define a lookup table  which doesn't correspond to
#' alphabet entries in the final matrix. (ie so alph.len will no longer be the
#' correct matrix width)
#'
#' This is kind of a placeholder. So I short-circuit it with an informative
#' error about custom alphabets not being supported.
#'
#' @param raw_lines raw lines from .meme file
#'
#' @return The alphabet
#'
#' @noRd
get_custom_meme_alph <- function(raw_lines) {

  i <- grep("^ALPHABET", raw_lines) + 1
  n <- grep("^END ALPHABET", raw_lines) - 1
  
  if (length(n) == 0){
    # For parsing meme.txt output when run w/ custom alph file,
    # Might not be the best fix.
    # https://github.com/bjmt/universalmotif/issues/24
    n <- grep("^TRAINING SET", raw_lines) - 1
  }
  
  if (length(n) == 0){
    stop("Problem parsing custom alphabet.\nPlease file a bug report at: github.com/bjmt/universalmotif/issues/", call. = FALSE)
  }

  alph_lines <- raw_lines[i:n]

  alph_lines <- alph_lines[!grepl("=", alph_lines, fixed = TRUE)]
  alph_lines <- trimws(unlist(strsplit(alph_lines, "~", fixed = TRUE)))
  alph <- vapply(strsplit(alph_lines, " ", fixed = TRUE), function(x) x[1], character(1))

  alph.order <- c(LETTERS, letters, 0:9, "*", "-", ".")
  alph <- alph[order(match(alph, alph.order))]
  alph <- collapse_cpp(alph)

  switch(alph, "ACGT" = "DNA", "ACGU" = "RNA", "ACDEFGHIKLMNPQRSTVWY" = "AA", alph)

}

#' Return MEME alphabet string
#'
#' @param raw_lines raw lines from .meme file
#'
#' @return `character(1)` alphabet string
#'
#' @noRd
get_meme_alph <- function(raw_lines) {

  switch(check_meme_alph_type(raw_lines),
         default = get_default_meme_alph(raw_lines),
         custom = get_custom_meme_alph(raw_lines),
         none = "DNA")

}

#' Return alphabet length
#'
#' Uses lookup table for default alphabets, otherwise just counts number of
#' letters
#'
#' @param alph alphabet string from get_meme_alph
#'
#' @return `integer(1)` alphabet length
#'
#' @noRd
get_meme_alph_len <- function(alph) {

  switch(alph, "DNA" = 4L, "RNA" = 4L, "AA" = 20L, nchar(alph))

}

check_meme_version <- function(raw_lines) {
  ver <- raw_lines[grep("^MEME version", raw_lines)]
  if (!length(ver)) {
    warning(wmsg("Could not find MEME version. Note that this is considered",
        " a requirement by MEME programs."),
      call. = FALSE)
  } else{
    ver <- strsplit(ver, "\\s+")[[1]][3]
    ver <- strsplit(ver, "[.]")[[1]][1]
    ver <- as.numeric(ver)
    if (!ver %in% 4:5) {
      warning(wmsg("read_meme() is configured for MEME versions 4-5."),
        call. = FALSE)
    }
  }
}