R/universalmotif_df.R

Defines functions clean_up_extrainfo_df vec_to_df vec_to_df_mot extrainfo_to_df bkgs_are_different summarise_motifs_with_extras requires_update to_list update_motifs to_df print.universalmotif_df as.character.universalmotif

Documented in requires_update to_df to_list update_motifs

#' Tidy manipulation of motifs.
#'
#' @return
#'    For [to_df()]: a `data.frame` with the exposed slots as columns.
#'
#'    For [update_motifs()]: the updated `data.frame`.
#'
#'    For [requires_update()]: `TRUE` if the motifs are out of date,
#'    `FALSE` if otherwise. Note that this function uses `identical()`
#'    to check for this, which can be quite slow for large datasets. It
#'    is usually just as fast to simply run [update_motifs()] in such
#'    cases.
#'
#'    For [to_list()]: a list of motifs.
#'
#' @details
#' To turn off the informative messages/warnings when printing the object to
#' the console, set `options(universalmotif_df.warning=FALSE)`.
#'
#' @examples
#' \dontrun{
#' library(universalmotif)
#' library(dplyr)
#'
#' m <- c(create_motif(name = "motif A"), create_motif(name = "motif B"))
#'
#' # Change the names of the motifs using the tidy way:
#' m <- m %>%
#'    to_df() %>%
#'    mutate(name = paste0(name, "-2")) %>%
#'    to_list()
#'
#' # Add your own metadata to be stored in the extrainfo slot:
#' m_df <- to_df(m)
#' m_df$MyMetadata <- c("Info_1", "Info_2")
#' m <- to_list(m_df, extrainfo = TRUE)
#' }
#'
#' @author Benjamin Jean-Marie Tremblay, \email{benjamin.tremblay@@uwaterloo.ca}
#' @name tidy-motifs
NULL

#' @export
as.character.universalmotif <- function(x, maxchar = 12, ...) {
  name <- x["name"]
  if (nchar(name) > (maxchar - 6)) name <- paste0(substr(name, 1, maxchar - 8), "..")
  paste0("<mot:", name, ">")
}

#' @export
print.universalmotif_df <- function(x, na.rm = TRUE, ...) {
  y <- x
  x_og <- x
  if (na.rm) {
    empty_cols <- vapply(x, function(x) all(is.na(x)), logical(1))
    x <- x[, !empty_cols, drop = FALSE]
  }
  # For skinny terminals, this value will be larger than necessary, but I'm
  # not sure how to check for this without easily breakable strsplit() stuff.
  # The point of this is to avoid diff-ing dataframes with thousands of rows
  # if only a fraction of those will be printed. Could probably use some
  # cleaning up.
  toprint <- length(capture.output(print(as.data.frame(x))))
  if (toprint < nrow(x)) {
    y <- x[seq_len(toprint), ]
  }
  checktry <- try(checkdf <- suppressWarnings(update_motifs(y)), silent = TRUE)
  founderr <- FALSE
  founddiff <- FALSE
  if (inherits(checktry, "try-error")) {
    founderr <- TRUE
  } else {
    diffd <- mapply(identical, as.list(y$motif), as.list(checkdf$motif))
    if (any(!diffd)) {
      x <- as.data.frame(cbind(" " = character(nrow(x)), x))
      x[[1]][seq_along(diffd)] <- c("*", "")[diffd + 1]
      founddiff <- TRUE
    }
  }
  print.data.frame(x)
  printNL <- FALSE
  if (na.rm && any(empty_cols) && !isFALSE(getOption("universalmotif_df.warning"))) {
    empty_cols <- names(empty_cols)[empty_cols]
    empty_cols <- paste0(empty_cols, collapse = ", ")
    cat("\n", wmsg("[Hidden empty columns: ", empty_cols, ".]"), sep = "")
    printNL <- TRUE
  }
  if (founderr && !isFALSE(getOption("universalmotif_df.warning"))) {
    cat("\n", wmsg("[Note: incomplete universalmotif_df object.]"), sep = "")
    printNL <- TRUE
  }
  if (founddiff && !isFALSE(getOption("universalmotif_df.warning"))) {
    cat("\n", wmsg("[Rows marked with * are changed. Run update_motifs()",
        " or to_list() to apply changes.]"), sep = "")
    printNL <- TRUE
  }
  if (printNL) cat("\n")
  invisible(x_og)
}

#' @export
#' @param motifs List of motifs.
#' @rdname tidy-motifs
to_df <- function(motifs, extrainfo = TRUE) {
  x <- convert_motifs(motifs)
  if (!is.list(x)) x <- list(x)
  y <- summarise_motifs_with_extras(x)
  if (extrainfo) {
    einfo <- extrainfo_to_df(x)
    if (!is.null(einfo) && nrow(einfo))
      y <- cbind(y, einfo)
  }
  # interesting side effect of using I(x): trying to use the motif column directly
  # in functions such as view_motifs(df$motif) will throw an error.
  # Maybe provide a method for "AsIs" in convert_motifs() which will throw a more
  # informative error message? (Add extra class alongside "AsIs"?)
  y <- cbind(motif = I(x), y)
  # some functions may drop the extra class, but it should only affect pretty printing
  structure(y, class = c("universalmotif_df", "data.frame"))
}

#' @export
#' @param motif_df Motif `data.frame` generated by [to_df()].
#' @param extrainfo Use the `extrainfo` slot in the tidy `data.frame`. The
#'    column names will be taken from the character vectors themselves, and
#'    unnamed elements will be assigned a unique name. To add elements to the
#'    slot, simply create new columns in the `data.frame`. Note that these will
#'    be coerced into characters. If `extrainfo` is not set to `TRUE` in
#'    `to_df()`, then the contents of the slot will not be transferred to the
#'    `data.frame`. If `extrainfo` is not set to `TRUE` in `update_motifs()`
#'    or `to_list()`, then the extra columns will be discarded.
#' @param force Whether to coerce non-character data types into characters for
#'   inclusion in `extrainfo`. If `force` is `FALSE` (the default), columns which
#'   are not of type "character", "numeric", or "integer" (for example, list
#'   columns, or logical values), will not be added to the motif `extrainfo`
#'   slot, but will be passed onto the returned `universalmotif_df` unchanged.
#'   Setting `force = TRUE` coerces these values into a character, adding them
#'   to the `extrainfo` slot, and updating the `universalmotif_df` columns to
#'   reflect this coercion. In other words, forcing inclusion of these data is
#'   destructive and will change the column values. Use with caution.
#' @rdname tidy-motifs
update_motifs <- function(motif_df, extrainfo = TRUE, force = FALSE) {
  # TODO: extrainfo implementation is very messy...
  # TODO: come back and add alphabet change support once switch_alph() is updated
  # TODO: performance
  updated_df <- as.data.frame(motif_df)
  if (!"motif" %in% colnames(updated_df))
    stop("Could not find 'motif' column.")
  m <- updated_df$motif
  class(m) <- NULL
  old_df <- as.data.frame(to_df(m, extrainfo = FALSE))
  cols_new <- colnames(updated_df)
  cols_new <- cols_new[cols_new != "motif"]
  cols_old <- colnames(old_df)
  cols_old <- cols_old[cols_old != "motif"]
  
  holdout <- !force # makes it easier to reason about the code
  extrainfo_holdout_cols <- NA_character_
  if (extrainfo) {
    # for now, just always update extrainfo...
    cols_extrainfo <- cols_new[!cols_new %in% cols_old]
    cols_extrainfo <- cols_extrainfo[cols_extrainfo != "bkg"]
    if (length(cols_extrainfo)) {
      # Keep id_cols temporarily to ensure sort order is OK
      # TODO: in future use a better unique identifier
      extrainfo_new <- updated_df[, cols_extrainfo, drop = FALSE]
      
      if (holdout) {
        # hold out unsupported datatypes
        # Current "supported" types are "character", "numeric" and "integer"
        extrainfo_classes <- vapply(extrainfo_new, class, character(1))
        
        extrainfo_holdout_cols <- names(extrainfo_classes[!(extrainfo_classes %in% c("character", "numeric", "integer"))])
        
        extrainfo_holdouts <- extrainfo_new[, extrainfo_holdout_cols, drop = FALSE]
        # TODO: Consider a message for holdouts? I think it's unnecessary.
      }
      
      # if holding out columns, remove them from the extrainfo passed onto the motifs
      if (holdout & length(extrainfo_holdout_cols) != 0) {
        extrainfo_new <- extrainfo_new[, -which(names(extrainfo_new) %in% extrainfo_holdout_cols), drop = FALSE]
      } 
      
      # Pass extrainfo to motif
      for (i in seq_along(m)) {
        m[[i]]["extrainfo"] <- clean_up_extrainfo_df(extrainfo_new[i, , drop = FALSE])
      }
    } else {
      for (i in seq_along(m)) {
        m[[i]]@extrainfo <- character()
      }
    }
  } else if (any(!cols_new %in% cols_old)) {
    # maybe remove this warning based on how extrainfo is implemented
    message(wmsg(
      "Discarding unknown slot(s) ",
      paste0(paste0("'", cols_new[!cols_new %in% cols_old], "'"), collapse = ", "),
      " (set `extrainfo=TRUE` to preserve these)."))
  }
  if (!extrainfo) {
    for (i in seq_along(m)) {
      m[[i]]@extrainfo <- character()
    }
  }
  if (any(!cols_old %in% cols_new)) {
    # hide this warning when called in to_list()?
    message(wmsg(
      "Restoring missing slot(s) ",
      paste0(paste0("'", cols_old[!cols_old %in% cols_new], "'"), collapse = ", "),
      "."))
  }
  cols_to_check <- cols_new[cols_new %in% cols_old]
  if ("bkg" %in% cols_to_check) {
    for (i in which(mapply(bkgs_are_different, old_df$bkg, updated_df$bkg, SIMPLIFY = TRUE))) {
      m[[i]]["bkg"] <- updated_df$bkg[[i]]
    }
    cols_to_check <- cols_to_check[cols_to_check != "bkg"]
  }
  for (i in seq_along(cols_to_check)) {
    checking <- cols_to_check[i]
    # need a better way to deal with NAs...
    # TODO: this causes 0s to be permanently introduced when altname is missing:
    # dplyr::rename(mydf, name = altname, altname  name)
    updated_df[[checking]][is.na(updated_df[[checking]])] <- 0
    old_df[[checking]][is.na(old_df[[checking]])] <- 0
    if (any(updated_df[[checking]] != old_df[[checking]])) {
      if (checking == "type") {
        for (j in which(updated_df[[checking]] != old_df[[checking]])) {
          m[[j]] <- convert_type(m[[j]], updated_df[[checking]][j])
        }
      } else if (checking %in% c("icscore", "consensus", "alphabet")) {
        warning("Discarding changes in unmodifiable slot(s) '", checking, "'.",
          immediate. = TRUE, call. = FALSE)
      } else {
        for (j in which(updated_df[[checking]] != old_df[[checking]])) {
          msg <- try(m[[j]][checking] <- updated_df[[checking]][j], silent = TRUE)
          if (inherits(msg, "try-error")) {
            stop("Got the following error for motif in row ", j, ":\n", msg,
              call. = FALSE)
          }
        }
      }
    }
  }
  if (holdout & extrainfo & all(!is.na(extrainfo_holdout_cols))){
    # Add back any heldout info
    # TODO: is `cbind` safe here? will row order always preserve?
    new_df <- structure(cbind(to_df(m, extrainfo), extrainfo_holdouts), 
                        class = c("universalmotif_df", "data.frame"))
    return(new_df)
  } else {
    return(to_df(m, extrainfo))
  }
}

#' @export
#' @rdname tidy-motifs
to_list <- function(motif_df, extrainfo = TRUE, force = FALSE) {
  structure(update_motifs(motif_df, extrainfo, force)$motif, class = NULL)
}

#' @export
#' @rdname tidy-motifs
requires_update <- function(motifs, extrainfo = TRUE) {
# identical() is really quite slow...
  motifs2 <- update_motifs(motifs, extrainfo)
  any(!mapply(identical,
      structure(motifs$motif, class = NULL),
      structure(motifs2$motif, class = NULL)))
}

summarise_motifs_with_extras <- function(x) {
  y <- summarise_motifs(x, na.rm = FALSE)
  y$type <- vapply(x, function(x) x@type, character(1))
  y$pseudocount <- vapply(x, function(x) x@pseudocount, numeric(1))
  y$bkg <- lapply(x, function(x) x@bkg)
  y
}

bkgs_are_different <- function(x, y) {
  if (length(x) != length(y)) TRUE
  else !all(x == y)
}

extrainfo_to_df <- function(x) {
  y <- lapply(x, vec_to_df_mot)
  cnames <- unique(unlist(lapply(y, colnames)))
  for (i in seq_along(y)) {
    if (!ncol(y[[i]])) {
      y[[i]] <- vec_to_df(structure(rep(NA_character_, length(cnames)), names = cnames))
    } else {
      for (j in which(!cnames %in% colnames(y[[i]]))) {
        y[[i]][[cnames[j]]] <- rep(NA_character_, nrow(y[[i]]))
      }
      y[[i]] <- y[[i]][, cnames, drop = FALSE]
    }
  }
  do.call(rbind, y)
}

vec_to_df_mot <- function(x) {
  vec_to_df(x@extrainfo)
}

vec_to_df <- function(x) {
  if (!length(x)) data.frame()
  else {
    # TODO: need to mitigate risk of duplicate names here
    if (is.null(names(x))) names(x) <- paste0("extrainfo.", seq_along(x))
    names(x)[names(x) == ""] <- paste0("extrainfo.", seq_len(sum(names(x) == "")))
    x <- list2DF(as.list(x))
    x
  }
}

clean_up_extrainfo_df <- function(x) {
  y <- names(x)
  x <- vapply(x, as.character, character(1))
  x <- as.character(x)
  if (all(is.na(x))) character()
  else {
    names(x) <- y
    x[!is.na(x)]
  }
}
bjmt/universalmotif documentation built on March 18, 2024, 8:32 a.m.