## helpers_misc.r: Some random helper functions.

#' Make a backup of an existing file with n revisions, like VMS!
#'
#' Sometimes I just want to kick myself for overwriting important files and then
#' I remember using VMS and wish modern computers were a little more like it.
#'
#' @param backup_file Filename to backup.
#' @param backups How many revisions?
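#' @return Result of file.copy (TRUE on success), or invisible NULL when the
#'  file does not yet exist.
#' @examples
#' \dontrun{
#'  ## A sketch with a hypothetical file; older copies rotate to
#'  ## counts.csv.02, counts.csv.03, ...
#'  backup_file("counts.csv")
#' }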
backup_file <- function(backup_file, backups = 4) {
  if (file.exists(backup_file)) {
    for (i in seq(from = backups, to = 1)) {
      j <- i + 1
      i <- sprintf("%02d", i)
      j <- sprintf("%02d", j)
      test <- glue("{backup_file}.{i}")
      new <- glue("{backup_file}.{j}")
      if (file.exists(test)) {
        file.rename(test, new)
      }
    }
    newfile <- glue("{backup_file}.01")
    message("Copying ", backup_file, " to ", newfile, ".")
    file.copy(backup_file, newfile)
  } else {
    message("The file ", backup_file, " does not yet exist.")
  }
}

#' Grab a copy of all bioconductor packages and install them by type
#'
#' This uses jsonlite to get the names of all bioconductor packages and then
#' iterates through them with BiocManager to install each one.  It sleeps
#' between installations in an attempt to avoid being obnoxious.  As a result,
#' it will of necessity take forever.
#'
#' @param release Bioconductor release to use, should probably be adjusted to
#'  automatically find it.
#' @param mirror Bioconductor mirror to use.
#' @param base Base directory on the mirror to download from.
#' @param type Type in the tree to use (software or annotation)
#' @param suppress_updates Passed, inverted, to the update argument of
#'  BiocManager::install().
#' @param suppress_auto Historical argument for BiocLite(), currently unused.
#' @param force Install if already installed?
#' @return List of the packages which succeeded, failed, or were already installed.
#' @seealso [BiocManager] [jsonlite]
#' @examples
#' \dontrun{
#'  go_get_some_coffee_this_will_take_a_while <- bioc_all()
#' }
#' @export
bioc_all <- function(release = NULL,
                     mirror = "bioconductor.statistik.tu-dortmund.de",
                     base = "packages", type = "software",
                     suppress_updates = TRUE, suppress_auto = TRUE, force = FALSE) {
  if (is.null(release)) {
    release <- as.character(BiocManager::version())
  }
  dl_url <- glue("https://{mirror}/{base}/json/{release}/tree.json")
  ## dl_url <- "https://bioc.ism.ac.jp/packages/json/3.3/tree.json"
  ## Sadly, the installer does not give different returns for a successful vs.
  ## failed install; instead it always returns a character of the package(s)
  ## requested.  That seems dumb to me, but what do I know.
  ## Something is weird with the libcurl on the cluster...
  test <- download.file(url = dl_url, destfile = "test.json")
  pkg_list <- jsonlite::fromJSON("test.json")
  pkg_names <- pkg_list[["attr"]][["packageList"]]
  software_names <- strsplit(x = pkg_names[[1]], split = ",")[[1]]
  annotation_names <- strsplit(x = pkg_names[[2]], split = ",")[[1]]
  ## experiment_names <- strsplit(x = pkg_names[[3]], split = ",")[[1]]
  ## It appears that with bioconductor release 3.4,
  ## experiment has been folded into annotation.
  installed <- list(succeeded = c(), failed = c(), already = c())
  attempt <- function(pkg, update = suppress_updates, auto = suppress_auto,
                      forceme = force,
                      state = list(succeeded = c(),
                                   failed = c(),
                                   already = c())) {
    sleep <- 10
    suc <- state[["succeeded"]]
    fail <- state[["failed"]]
    alr <- state[["already"]]
    message("Installing: ", pkg)
    if (isTRUE(forceme)) {
      installedp <- sm(try(BiocManager::install(pkg, ask = FALSE,
                                                update = !update)))
      if (inherits(installedp, "try-error")) {
        fail <- append(fail, pkg)
      } else {
        install_directory <- glue("{.libPaths()[1]}/{pkg}")
        if (file.exists(install_directory)) {
          suc <- append(suc, pkg)
        } else {
          fail <- append(fail, pkg)
        }
      }
    } else {
      if (isTRUE(pkg %in% .packages(all.available = TRUE))) {
        message("Package ", pkg,  " is already installed.")
        alr <- append(alr, pkg)
        sleep <- 0
      } else {
        installedp <- try(sm(BiocManager::install(pkg, ask = FALSE,
                                                  update = !update)))
        if (inherits(installedp, "try-error")) {
          fail <- append(fail, pkg)
        } else {
          install_directory <- glue("{.libPaths()[1]}/{pkg}")
          if (file.exists(install_directory)) {
            suc <- append(suc, pkg)
          } else {
            fail <- append(fail, pkg)
          }
        }
      }
    }
    Sys.sleep(sleep)
    ret <- list(
      "succeeded" = suc,
      "failed" = fail,
      "already" = alr)
    return(ret)
  } ## End attempt
  if (type == "software") {
    for (pkg in software_names) {
      installed <- attempt(pkg, state = installed)
    }
  } else if (type == "annotation") {
    for (pkg in annotation_names) {
      installed <- attempt(pkg, state = installed)
    }
  } else {
    software_installed <- bioc_all(release = release,
                                   mirror = mirror, base = base,
                                   type = "software")
    annotation_installed <- bioc_all(release = release,
                                     mirror = mirror, base = base,
                                     type = "annotation")
    installed <- list(
      "software" = software_installed,
      "annotation" = annotation_installed)
  }
  return(installed)
}

#' Clear an R session; this is probably unwise given what I have read about R.
#'
#' @param keepers List of namespaces to leave alone (unimplemented).
#' @param depth Cheesy forloop of attempts to remove packages stops after this
#'  many tries.
#' @return A spring-fresh R session, hopefully.
#' @seealso [R.utils]
#' @export
clear_session <- function(keepers = NULL, depth = 10) {
  ## Partially taken from:
  ## https://stackoverflow.com/questions/7505547/detach-all-packages-while-working-in-r
  basic_packages <- c("package:stats", "package:graphics", "package:grDevices",
                      "package:utils", "package:datasets", "package:methods",
                      "package:base")
  package_list <- search()[grepl("^package:", search())]
  package_list <- setdiff(package_list, basic_packages)
  result <- R.utils::gcDLLs()
  if (length(package_list) > 0) {
    for (package in package_list) {
      detach(package, character.only = TRUE)
    }
  }
  tt <- sm(rm(list = ls(all.names = TRUE), envir = globalenv()))
}

#' Similarity measure which combines elements from Pearson correlation and
#' Euclidean distance.
#'
#' Here is Keith's summary:
#' cor() returns the Pearson correlation matrix for the input matrix, and
#' dist() returns the Euclidean distance matrix for the input matrix. The LHS
#' of the equation is simply the sign of the correlation function, which
#' serves to preserve the sign of the interaction. The RHS combines the
#' Pearson correlation and the log inverse Euclidean distance with equal
#' weights. The result is a number in the range from -1 to 1, where values
#' close to -1 indicate a strong negative correlation and values close to 1
#' indicate a strong positive correlation.  While the Pearson correlation and
#' Euclidean distance each contribute equally in the above equation, one could
#' also assign tuning parameters to each of the metrics to allow for unequal
#' contributions.
#'
#' @param data Matrix of data
#' @param cor_method Which correlation method to use?
#' @param dist_method Which distance method to use?
#' @param cor_weight 0-1 weight of the correlation, the distance weight
#'   will be 1-cor_weight.
#' @param ... extra arguments for cor/dist
#' @author Keith Hughitt
#' @return Matrix of the correlation-modified distances of the original matrix.
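#' @examples
#' \dontrun{
#'  ## A sketch with simulated data: rows are genes, columns are samples.
#'  mtrx <- matrix(rnorm(600), nrow = 100)
#'  similarity <- cordist(mtrx, cor_weight = 0.7)
#' }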
#' @export
cordist <- function(data, cor_method = "pearson", dist_method = "euclidean",
                    cor_weight = 0.5, ...) {
  if (cor_weight > 1 || cor_weight < 0) {
    stop("Invalid correlation weight.")
  }
  cor_matrix  <- hpgl_cor(t(data), method = cor_method, ...)
  dist_matrix <- as.matrix(dist(data, method = dist_method, diag = TRUE,
                                upper = TRUE))
  dist_matrix <- log1p(dist_matrix)
  dist_matrix <- 1 - (dist_matrix / max(dist_matrix))
  dist_weight <- 1 - cor_weight
  result <- (cor_weight * abs(cor_matrix)) + (dist_weight * dist_matrix)
  result <- sign(cor_matrix) * result
  return(result)
}

#' Get the current git commit for hpgltools
#'
#' One might reasonably ask about this function: "Why?"  I invoke this function
#' at the end of my various knitr documents so that if necessary I can do a
#' > git reset <commit id> and get back to the exact state of my code.
#'
#' @param gitdir Directory containing the git repository.
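#' @return String containing the date and commit id.
#' @examples
#' \dontrun{
#'  ## Assuming a git checkout of hpgltools in the home directory:
#'  current <- get_git_commit(gitdir = "~/hpgltools")
#' }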
#' @export
get_git_commit <- function(gitdir = "~/hpgltools") {
  cmdline <- glue("cd {gitdir} && git log -1 2>&1 | grep 'commit' | awk '{{print $2}}'")
  commit_result <- system(cmdline, intern = TRUE)
  cmdline <- glue("cd {gitdir} && git log -1 2>&1 | grep 'Date' | perl -pe 's/^Date:\\s+//g'")
  date_result <- system(cmdline, intern = TRUE)
  result <- glue("{date_result}: {commit_result}")
  message("If you wish to reproduce this exact build of hpgltools, invoke the following:")
  message("> git clone http://github.com/abelew/hpgltools.git")
  message("> git reset ", commit_result)
  return(result)
}

#' Find the git commit closest to the given yyyymmdd.
#'
#' @param gitdir Location of the git repository, I assume hpgltools.
#' @param version String containing a yyyymmdd date.
#' @param year Chosen year which will be coerced to yyyy.
#' @param month Chosen month coerced to mm.
#' @param day Chosen day coerced to dd.
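#' @return Result of pull_git_commit() for the commit found.
#' @examples
#' \dontrun{
#'  ## A sketch: reset a hypothetical checkout to the commit closest to 2020-01-01.
#'  pulled <- get_yyyymm_commit(gitdir = "~/hpgltools", version = "20200101")
#' }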
#' @export
get_yyyymm_commit <- function(gitdir = "~/hpgltools", version = NULL,
                              year = NULL, month = NULL, day = NULL) {
  if (is.null(version) && is.null(year) && is.null(month) && is.null(day)) {
    version <- as.Date(lubridate::now())
    year <- format(version, "%Y")
    month <- format(version, "%m")
    day <- format(version, "%d")
  } else {
    version <- as.Date(version, format = "%Y%m%d")
  }
  message("Looking for the commit closest to ", version, ".")
  closest_commit <- glue(
    "cd {gitdir} && git log --since {version} | grep commit | tail -n 1 | awk '{{print $2}}'")
  wanted_commit <- system(closest_commit, intern = TRUE)
  pulled <- pull_git_commit(gitdir, commit = wanted_commit)
  return(pulled)
}

#' Reset the chosen git repository to a chosen commit.
#'
#' @param gitdir Desired repository, defaulting to my hpgltools copy.
#' @param commit commit ID to which to reset.
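#' @return Output from git.
#' @examples
#' \dontrun{
#'  ## 'abc1234' stands in for a real commit id.
#'  pull_git_commit(gitdir = "~/hpgltools", commit = "abc1234")
#' }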
#' @export
pull_git_commit <- function(gitdir = "~/hpgltools", commit = NULL) {
  if (is.null(commit)) {
    cmdline <- glue("cd {gitdir} && git log | head -n 3 | grep commit | awk '{{print $2}}'")
    commit <- system(cmdline, intern = TRUE)
  }
  cmdline <- glue("cd {gitdir} && git pull && git reset {commit}")
  result <- system(cmdline, intern = TRUE)
  return(result)
}

#' Create a contrast matrix suitable for MSstats and similar tools.
#'
#' I rather like makeContrasts() from limma.  It troubled me to have to
#' manually create a contrast matrix when using MSstats.  It turns out it
#' troubled me for good reason, because I managed to reverse the terms and end
#' up with the opposite of the contrasts I intended.  Ergo this function.
#'
#' Feed make_simplified_contrast_matrix() a series of numerator and denominator
#' names matching the conditions of interest in an experiment and it returns a
#' contrast matrix in a format acceptable to MSstats.
#'
#' @param numerators Character list of conditions which are the numerators of a
#'  series of a/b comparisons.
#' @param denominators Character list of conditions which are the denominators of a
#'  series of a/b comparisons.
#' @return Contrast matrix suitable for use in tools like MSstats.
#' @seealso [MSstats]
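#' @examples
#' \dontrun{
#'  ## A sketch with hypothetical conditions, giving the contrasts
#'  ## mutant_vs_wt and drug_vs_mock:
#'  contrasts <- make_simplified_contrast_matrix(
#'    numerators = c("mutant", "drug"),
#'    denominators = c("wt", "mock"))
#' }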
#' @export
make_simplified_contrast_matrix <- function(numerators, denominators) {
  if (length(numerators) != length(denominators)) {
    stop("Need a constant number of numerators and denominators.")
  }
  conditions <- unique(c(numerators, denominators))
  contrasts <- matrix(0, nrow = length(numerators), ncol = length(conditions))
  colnames(contrasts) <- conditions
  rownames(contrasts) <- numerators
  for (n in seq_along(numerators)) {
    num <- numerators[[n]]
    den <- denominators[[n]]
    cont <- glue("{num}_vs_{den}")
    contrasts[n, num] <- 1
    contrasts[n, den] <- -1
    rownames(contrasts)[n] <- cont
  }
  return(contrasts)
}

#' Implement the arescan function in R
#'
#' This function was taken almost verbatim from AREScore() in SeqTools,
#' available at: https://github.com/lianos/seqtools.git
#' At least on my computer I could not make that implementation work, so I
#' rewrapped its apply() calls and am now hoping to extend its logic a little
#' to make it more sensitive and to get rid of some of the spurious parameters,
#' or at least make them more transparent.
#'
#' @param x DNA/RNA StringSet containing the UTR sequences of interest
#' @param basal I dunno.
#' @param overlapping default = 1.5
#' @param d1.3 default = 0.75.  These parameter names are so stupid, let's be realistic.
#' @param d4.6 default = 0.4
#' @param d7.9 default = 0.2
#' @param within.AU default = 0.3
#' @param aub.min.length default = 10
#' @param aub.p.to.start default = 0.8
#' @param aub.p.to.end default = 0.55
#' @return a DataFrame of scores
#' @seealso [IRanges] [Biostrings] [GenomicRanges]
#' @examples
#' \dontrun{
#'  ## Extract all the genes from my genome, pull a static region 120nt following the stop
#'  ## and test them for potential ARE sequences.
#'  ## FIXME: There may be an error in this example, another version I have
#'  ## handles the +/- strand genes separately, I need to return to this and check
#'  ## if it is providing the 5' UTR for 1/2 the genome, which would be
#'  ## unfortunate -- but the logic for testing remains the same.
#'  are_candidates <- hpgl_arescore(genome)
#'  utr_genes <- subset(lmajor_annotations, type == 'gene')
#'  threep <- GenomicRanges::GRanges(seqnames = Rle(utr_genes[,1]),
#'                                   ranges = IRanges(utr_genes[,3], end=(utr_genes[,3] + 120)),
#'                                   strand = Rle(utr_genes[,5]),
#'                                   name = Rle(utr_genes[,10]))
#'  threep_seqstrings <- Biostrings::getSeq(lm, threep)
#'  are_test <- hpgltools::hpgl_arescore(x = threep_seqstrings)
#'  are_genes <- rownames(are_test[ which(are_test$score > 0), ])
#' }
#' @export
hpgl_arescore <- function(x, basal = 1, overlapping = 1.5, d1.3=0.75, d4.6=0.4,
                          d7.9=0.2, within.AU = 0.3, aub.min.length = 10, aub.p.to.start = 0.8,
                          aub.p.to.end = 0.55) {
  xtype <- match.arg(substr(class(x)[1], 1, 3), c("DNA", "RNA"))
  ## Coerce everything to DNA before pattern matching; an RNAStringSet is
  ## reverse-transcribed (U -> T) by the coercion, so the DNA pentamer works
  ## for both input types (searching for 'AUUUA' in a DNAStringSet fails).
  x <- as(x, "DNAStringSet")
  pentamer <- "ATTTA"
  overmer <- "ATTTATTTA"
  pmatches <- Biostrings::vmatchPattern(pentamer, x)
  omatches <- Biostrings::vmatchPattern(overmer, x)
  basal.score <- S4Vectors::elementNROWS(pmatches) * basal
  over.score <- S4Vectors::elementNROWS(omatches) * overlapping
  no.cluster <- data.frame(d1.3 = 0, d4.6 = 0, d7.9 = 0)
  clust <- lapply(pmatches, function(m) {
    if (length(m) < 2) {
      return(no.cluster)
    }
    wg <- BiocGenerics::width(IRanges::gaps(m))
    data.frame(d1.3=sum(wg <= 3), d4.6=sum(wg >= 4 & wg <= 6), d7.9=sum(wg >= 7 & wg <= 9))
  })
  clust <- do.call(rbind, clust)
  dscores <- clust$d1.3 * d1.3 + clust$d4.6 * d4.6 + clust$d7.9 *  d7.9
  ## require.auto("Biostrings")
  au.blocks <- my_identifyAUBlocks(x, min.length = aub.min.length,
                                   p.to.start = aub.p.to.start, p.to.end = aub.p.to.end)
  aub.score <- sum(IRanges::countOverlaps(pmatches, au.blocks) * within.AU)
  score <- basal.score + over.score + dscores + aub.score
  ans <- S4Vectors::DataFrame(score = score,
                              n.pentamer = S4Vectors::elementNROWS(pmatches),
                              n.overmer = S4Vectors::elementNROWS(omatches),
                              au.blocks = au.blocks,
                              n.au.blocks = S4Vectors::elementNROWS(au.blocks))
  cbind(ans, S4Vectors::DataFrame(clust))
}

#' Wrap cor() to include robust correlations.
#'
#' Take covRob's robust correlation coefficient and add it to the set of
#' correlations available when one calls cor().  I should reimplement this using
#' S4.
#'
#' @param df Data frame to test.
#' @param method Correlation method to use: pearson, spearman, kendall, robust, or cordist.
#' @param ... Other options to pass to stats::cor().
#' @return Some fun correlation statistics.
#' @seealso [robust]
#' @examples
#' \dontrun{
#'  hpgl_cor(df = df)
#'  hpgl_cor(df = df, method = "robust")
#' }
#' @export
hpgl_cor <- function(df, method = "pearson", ...) {
  if (method == "robust") {
    robust_cov <- robust::covRob(df, corr = TRUE)
    correlation <- robust_cov[["cov"]]
  } else if (method == "cordist") {
    correlation <- cordist(df, ...)
  } else {
    correlation <- stats::cor(df, method = method, ...)
  }
  return(correlation)
}

#' Wrap p.adjust to add IHW adjustments as an option.
#'
#' IHW and apeglm are the two new toys I found; this adds the former as a way
#' to adjust p-values.
#'
#' @param data Column or table containing values to adjust.
#' @param pvalue_column Name of the column in a table containing the p-values.
#' @param mean_column Name of the column in a table containing the mean count
#'   values to weight.
#' @param method p adjustment method to apply.
#' @param significance Passed to IHW
#' @param type Assuming a DE table, what type of DE is this?
#' @return Newly adjusted p-values using either p.adjust() or IHW.
#' @seealso [IHW]
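#' @examples
#' \dontrun{
#'  ## A sketch assuming a hypothetical DE table with 'pvalue' and 'base_mean' columns:
#'  de_table[["fdr"]] <- hpgl_padjust(de_table, pvalue_column = "pvalue", method = "fdr")
#'  de_table[["ihw"]] <- hpgl_padjust(de_table, pvalue_column = "pvalue",
#'                                    mean_column = "base_mean", method = "ihw")
#' }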
#' @export
hpgl_padjust <- function(data, pvalue_column = "pvalue", mean_column = "base_mean",
                         method = "fdr", significance = 0.05, type = NULL) {
  if (method == "ihw") {
    result <- ihw_adjust(data, pvalue_column = pvalue_column, type = type,
                         mean_column = mean_column, significance = significance)
  } else {
    result <- p.adjust(data[[pvalue_column]], method = method)
  }
  return(result)
}

#' Because I am not smart enough to remember t()
#'
#' It seems to me there should be a function as easy for distances as there is for correlations.
#'
#' @param df data frame from which to calculate distances.
#' @param method Which distance calculation to use?
#' @param ... Extra arguments for dist.
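#' @return Distance matrix calculated across the transposed input.
#' @examples
#' \dontrun{
#'  ## Distances between the columns (samples) of a hypothetical expression matrix:
#'  sample_distances <- hpgl_dist(expt_matrix, method = "manhattan")
#' }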
#' @export
hpgl_dist <- function(df, method = "euclidean", ...) {
  input <- t(as.matrix(df))
  result <- as.matrix(dist(input, method = method, ...))
  return(result)
}

#' Load a backup rdata file
#'
#' I often use R over a sshfs connection, sometimes with significant latency,
#' and I want to be able to save/load my R sessions relatively quickly. Thus
#' this function uses my backup directory to load its R environment.
#'
#' @param directory Directory containing the RData.rda.xz file.
#' @param filename Filename from which to load.
#' @return a bigger global environment
#' @seealso [saveme()]
#' @examples
#' \dontrun{
#'  loadme()
#' }
#' @export
loadme <- function(directory = "savefiles", filename = "Rdata.rda.xz") {
  savefile <- file.path(getwd(), directory, filename)
  message("Loading the savefile: ", savefile)
  ## load() takes an environment directly, no need for eval(parse()) gymnastics.
  loaded <- load(savefile, envir = globalenv())
  invisible(loaded)
}

#' Perform a get_value for delimited files
#'
#' Keith wrote this as .get_value() but functions which start with . trouble me.
#'
#' @param x Some stuff to split
#' @param delimiter The tritrypdb uses ': ' ergo the default.
#' @return A value!
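#' @examples
#' \dontrun{
#'  ## Pull the value from a tritrypdb-style 'key: value' string:
#'  local_get_value("ontology: GO:0001234")
#'  ## [1] "GO:0001234"
#' }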
local_get_value <- function(x, delimiter = ": ") {
  last <- tail(unlist(strsplit(x, delimiter)), n = 1)
  ## fixed = TRUE would treat '^ ' as a literal caret-space, so use a regex here.
  return(gsub(pattern = "^ +", replacement = "", x = last))
}

#' copy/paste the function from SeqTools and figure out where it falls on its ass.
#'
#' Yeah, I do not remember what I changed in this function.
#'
#' @param x Sequence object
#' @param min.length I dunno.
#' @param p.to.start P to start of course
#' @param p.to.end The p to end -- wtf who makes names like this?
#' @return a list of IRanges which contain a bunch of As and Us.
my_identifyAUBlocks <- function (x, min.length = 20, p.to.start = 0.8, p.to.end = 0.55) {
  xtype <- match.arg(substr(class(x), 1, 3), c("DNA", "RNA"))
  stopifnot(S4Vectors::isSingleNumber(min.length) & min.length >= 5 &  min.length <= 50)
  stopifnot(S4Vectors::isSingleNumber(p.to.start) & p.to.start >= 0.5 & p.to.start <= 0.95)
  stopifnot(S4Vectors::isSingleNumber(p.to.end) & p.to.end >= 0.2 & p.to.end <= 0.7)
  stopifnot(p.to.start > p.to.end)
  if (xtype == "DNA") {
    AU <- "AT"
  } else {
    AU <- "AU"
  }
  y <- as(x, sprintf("%sStringSet", xtype))
  widths <- BiocGenerics::width(x)

  ## I don't want to steal lianos' C from his repository.
  test_seqtools <- "SeqTools" %in% rownames(installed.packages())
  if (!isTRUE(test_seqtools)) {
    message("Installing lianos/seqtools from github to get the find_au_start_end() C function.")
    test <- devtools::install_github("lianos/seqtools/R/pkg")
  }
  lib_result <- sm(requireNamespace("SeqTools"))
  att_result <- sm(try(attachNamespace("SeqTools"), silent = TRUE))
  ## library(SeqTools)
  fun <- function(i) {
    one_seq <- x[[i]]
    au <- Biostrings::letterFrequencyInSlidingView(one_seq, min.length, AU, as.prob = TRUE)
    if (is.null(au) || nrow(au) == 0) {
      return(IRanges::IRanges())
    }
    au <- as.numeric(au)
    can.start <- au >= p.to.start
    can.end <- au <= p.to.end
    posts <- .Call("find_au_start_end", au, p.to.start, p.to.end, PACKAGE = "SeqTools")
    blocks <- IRanges::IRanges(posts$start, posts$end + min.length -  1L)
    IRanges::end(blocks) <- ifelse(IRanges::end(blocks) > widths[i],
                                   widths[i],
                                   IRanges::end(blocks))
    IRanges::reduce(blocks)
  }
  au.blocks <- lapply(seq_along(x), fun)
  ret <- IRanges::IRangesList(au.blocks)
  return(ret)
}

#' Automatic loading and/or installing of packages.
#'
#' Load a library, install it first if necessary.
#'
#' This was taken from:
#' http://sbamin.com/2012/11/05/tips-for-working-in-r-automatically-install-missing-package/
#' and initially provided by Ramzi Temanni.
#'
#' @param lib String name of a library to check/install.
#' @param update Update packages?
#' @return 0 or 1, whether a package was installed or not.
#' @seealso [BiocManager] [install.packages()]
#' @examples
#' \dontrun{
#'  please_install("ggplot2")
#' }
#' @export
please_install <- function(lib, update = FALSE) {
  count <- 0
  local({
    r <- getOption("repos")
    r["CRAN"] <- "http://cran.r-project.org"
    options(repos = r)
  })
  if (isTRUE(update)) {
    utils::update.packages(ask = FALSE)
  }
  github_path <- NULL
  ## If there is a / in the library's name, assume it is a github path
  split_lib <- strsplit(x = lib, split = "/")[[1]]
  if (length(split_lib) == 2) {
    github_path <- lib
    lib <- split_lib[[2]]
  }
  if (!isTRUE(lib %in% .packages(all.available = TRUE))) {
    if (is.null(github_path)) {
      ##eval(parse(text = paste("BiocManager::install('", lib, "', update = FALSE)", sep = "")))
      installed <- BiocManager::install(lib, update = FALSE)
      count <- 1
    } else {
      ret <- try(devtools::install_github(github_path, upgrade = "never"))
      if (!inherits(ret, "try-error")) {
        count <- 1
      }
    }
  }
  return(count)
}

#' Append rows containing summary() information.
#'
#' @param df Data frame to summarize.
#' @return The original df with eight summary rows appended at the bottom.
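#' @examples
#' \dontrun{
#'  ## A minimal sketch: append the eight summary rows to a numeric data frame.
#'  summarized <- rbind_summary_rows(data.frame(a = rnorm(10), b = runif(10)))
#'  tail(summarized, n = 8)
#' }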
rbind_summary_rows <- function(df) {
  new_rows <- c("minimum", "first_quart", "median", "mean",
                "third_quart", "max", "standard_deviation", "cv")
  new_stuff <- matrix(NA_real_, nrow = length(new_rows), ncol = ncol(df),
                      dimnames = list(new_rows, colnames(df)))
  for (col in colnames(df)) {
    if (is.numeric(df[[col]])) {
      ## summary() provides min/1st quartile/median/mean/3rd quartile/max;
      ## append the standard deviation and coefficient of variation.
      column_summary <- as.numeric(summary(df[[col]]))[1:6]
      column_sd <- sd(df[[col]], na.rm = TRUE)
      cv <- column_sd / column_summary[4]
      new_stuff[, col] <- c(column_summary, column_sd, cv)
    }
  }
  new_df <- rbind(df, new_stuff)
  return(new_df)
}

#' Send the R plotter to the computer of your choice!
#'
#' Resets the display and xauthority variables to the new computer I am using so
#' that plot() works.
#'
#' @param display DISPLAY variable to use; if NULL, it is read from ~/.displays/$(host).last.
#' @return Fresh plotting window to the display of your choice!
#' @export
rex <- function(display = ":0") {
  if (!is.null(dev.list())) {
    ## graphics.off() closes every open device in one call.
    grDevices::graphics.off()
  }
  home <- Sys.getenv("HOME")
  host <- Sys.info()[["nodename"]]
  if (is.null(display)) {
    display <- read.table(file.path(home, ".displays", glue("{host}.last")))[1, 1]
  }
  auth <- file.path(home, ".Xauthority")
  message("Setting display to: ", display)
  result <- Sys.setenv("DISPLAY" = display, "XAUTHORITY" = auth)
  X11(display = display)
  return(NULL)
}

#' Add a little logic to rmarkdown::render to date the final outputs as per a
#' request from Najib.
#'
#' @param file Rmd file to render.
#' @param format Chosen file format.
#' @param overwrite Overwrite an existing file?
#' @return Final filename including the prefix rundate.
#' @seealso [rmarkdown]
#' @export
renderme <- function(file, format = "html_document", overwrite = TRUE) {
  original <- Sys.getenv("TMPDIR")
  new_tmpdir <- paste0("render_tmp_", file)
  if (file.exists(new_tmpdir)) {
    if (isTRUE(overwrite)) {
      message("The TMPDIR: ", new_tmpdir, " exists, removing and recreating it.")
      removed <- unlink(new_tmpdir, recursive = TRUE)
      recreated <- dir.create(new_tmpdir)
    } else {
      message("The TMPDIR: ", new_tmpdir, " exists, leaving it alone.")
    }
  } else {
    ## Create the scratch directory when it does not already exist.
    created <- dir.create(new_tmpdir)
  }
  Sys.setenv("TMPDIR" = new_tmpdir)
  ret <- rmarkdown::render(file, output_format = format, envir = globalenv())
  removed <- unlink(new_tmpdir, recursive = TRUE)
  Sys.setenv("TMPDIR" = original)
  rundate <- format(Sys.Date(), format = "%Y%m%d")
  outdir <- dirname(ret)
  base <- basename(ret)
  b <- tools::file_path_sans_ext(base)
  ex <- tools::file_ext(ret)
  from <- file.path(outdir, glue::glue("{b}.{ex}"))
  to <- file.path(outdir, glue::glue("{rundate}_{b}.{ex}"))
  message("Moving ", from, " to\n", basename(to), ".")
  final <- file.rename(from, to)
  return(to)
}
}

#' Make a backup rdata file for future reference
#'
#' I often use R over a sshfs connection, sometimes with significant latency, and
#' I want to be able to save/load my R sessions relatively quickly.
#' Thus this function uses pxz to compress the R session maximally and relatively fast.
#' This assumes you have pxz installed and >= 4 CPUs.
#'
#' @param directory Directory to save the Rdata file.
#' @param backups How many revisions?
#' @param cpus How many cpus to use for the pxz call.
#' @param filename Choose a filename.
#' @return Command string used to save the global environment.
#' @seealso [loadme()]
#' @examples
#' \dontrun{
#'  saveme()
#' }
#' @export
saveme <- function(directory = "savefiles", backups = 2, cpus = 6, filename = "Rdata.rda.xz") {
  environment()
  if (!file.exists(directory)) {
    dir.create(directory)
  }
  ##savefile <- glue("{getwd()}/{directory}/{filename}")
  savefile <- file.path(getwd(), directory, filename)
  message("The savefile is: ", savefile)
  backup_file(savefile, backups = backups)
  ## Compress through pxz, using the requested number of cpus.
  save_string <- glue(
    "con <- pipe(paste0('pxz -T{cpus} > {savefile}'), 'wb'); \\
    save(list = ls(all.names = TRUE, envir = globalenv()),
         envir = globalenv(), file = con, compress = FALSE); \\
    close(con)")
  message("The save string is: ", save_string)
  eval(parse(text = save_string))
}

#' Calculate a simplistic distance function of a point against two axes.
#'
#' Sillydist provides a distance of any point vs. the axes of a plot.
#' This just takes the abs(distances) of each point to the axes,
#' normalizes them against the largest point on the axes, multiplies
#' the results, and normalizes against the max of all points.
#'
#' @param firstterm X-values of the points.
#' @param secondterm Y-values of the points.
#' @param firstaxis X-value of the vertical axis.
#' @param secondaxis Y-value of the second axis.
#' @return Dataframe of the distances.
#' @seealso [ggplot2]
#' @examples
#' \dontrun{
#'  mydist <- sillydist(df[,1], df[,2], first_median, second_median)
#'  first_vs_second <- ggplot(df, aes(x = .data[["first"]], y = .data[["second"]]),
#'                                     environment = hpgl_env) +
#'    ggplot2::xlab(paste("Expression of", df_x_axis)) +
#'    ggplot2::ylab(paste("Expression of", df_y_axis)) +
#'    ggplot2::geom_vline(color = "grey", xintercept=(first_median - first_mad), size = line_size) +
#'    ggplot2::geom_vline(color = "grey", xintercept=(first_median + first_mad), size = line_size) +
#'    ggplot2::geom_vline(color = "darkgrey", xintercept = first_median, size = line_size) +
#'    ggplot2::geom_hline(color = "grey", yintercept=(second_median - second_mad), size = line_size) +
#'    ggplot2::geom_hline(color = "grey", yintercept=(second_median + second_mad), size = line_size) +
#'    ggplot2::geom_hline(color = "darkgrey", yintercept = second_median, size = line_size) +
#'    ggplot2::geom_point(colour = grDevices::hsv(mydist$dist, 1, mydist$dist),
#'                        alpha = 0.6, size = size) +
#'    ggplot2::theme(legend.position = "none")
#'  first_vs_second  ## dots get colored according to how far they are from the medians
#'  ## replace first_median, second_median with 0,0 for the axes
#' }
#' @export
sillydist <- function(firstterm, secondterm, firstaxis = 0, secondaxis = 0) {
  dataframe <- data.frame(firstterm, secondterm)
  dataframe[["x"]] <- (abs(dataframe[, 1]) - abs(firstaxis)) / abs(firstaxis)
  dataframe[["y"]] <- abs((dataframe[, 2] - secondaxis) / secondaxis)
  dataframe[["x"]] <- abs(dataframe[, 1] / max(dataframe[["x"]]))
  dataframe[["y"]] <- abs(dataframe[, 2] / max(dataframe[["y"]]))
  dataframe[["dist"]] <- abs(dataframe$x * dataframe[["y"]])
  dataframe[["dist"]] <- dataframe$dist / max(dataframe[["dist"]])
  return(dataframe)
}

#' Silence
#'
#' Some libraries/functions just won't shut up.  Ergo, silence, peasant!
#' This is a simpler silence peasant.
#'
#' @param ... Some code to shut up.
#' @return Whatever the code would have returned.
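#' @examples
#' \dontrun{
#'  ## Silence a chatty library load:
#'  quiet <- sm(library(Biostrings))
#' }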
#' @export
sm <- function(...) {
  ret <- NULL
  output <- capture.output(type = "output", {
    ret <- suppressWarnings(suppressMessages(...))
  })
  return(ret)
}

#' A hopefully more robust version of tempfile().
#'
#' @param pattern Filename prefix.
#' @param suffix Filename suffix.
#' @param digits Number of sub-second digits to request from Sys.time().
#' @param body Not implemented; intended to use other sources for digest().
#' @param fileext Filename extension as per tempfile().
#' @return md5 based tempfilename.
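#' @examples
#' \dontrun{
#'  ## Produces something like /tmp/RtmpXXXXXX/plot_<md5>.png:
#'  out <- tmpmd5file(pattern = "plot_", fileext = ".png")
#' }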
#' @export
tmpmd5file <- function(pattern = "", suffix = "", digits = 6,
                       body = NULL, fileext = "") {
  if (!grepl(pattern = "^\\.", x = fileext)) {
    pattern <- paste0(".", pattern)
  }
  op <- options(digits.secs = digits)
  body_string <- digest::digest(Sys.time())
  new <- options(op)
  outdir <- tempdir()
  if (!file.exists(outdir)) {
    created <- dir.create(outdir, recursive = TRUE)
  }
  file_string <- paste0(pattern, body_string, suffix, fileext)
  file_path <- file.path(outdir, file_string)
  return(file_path)
}

#' A cheater redefinition of tempfile.
#'
#' I found this at:
#' https://stackoverflow.com/questions/5262332/parallel-processing-and-temporary-files
#' and was intrigued.  I did not think to overwrite the tempfile definition.
#' Something in me says this is a terrible idea.
#' The same page suggests creating all the tempfile names _before_
#' beginning the parallel operations.  I think this might be the way
#' to go; however I do not know how that will affect the tempfile
#' names produced by knitr when it is making its images.
#'
#' @param pattern starting string of each tempfile.
#' @param tmpdir Location to put the file.
#' @param fileext suffix.
#' @export
tempfile <- function(pattern = "file", tmpdir = tempdir(), fileext = "") {
  .Internal(tempfile(paste0("pid", Sys.getpid(), pattern), tmpdir, fileext))
}

#' Remove the AsIs attribute from some data structure.
#'
#' Notably, when using some gene ontology libraries, the returned data
#' structures include information which is set to type 'AsIs' which turns out to
#' be more than slightly difficult to work with.
#'
#' @param stuff  The data from which to remove the AsIs classification.
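#' @return The input without its AsIs class.
#' @examples
#' \dontrun{
#'  ## A small sketch: I() marks a vector AsIs, unAsIs removes the mark.
#'  column <- I(c("a", "b"))
#'  class(unAsIs(column))
#'  ## [1] "character"
#' }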
#' @export
unAsIs <- function(stuff) {
  if("AsIs" %in% class(stuff)) {
    class(stuff) <- class(stuff)[-match("AsIs", class(stuff))]
  }
  return(stuff)
}

#' Print a model as y = mx + b just like in grade school!
#'
#' Because, why not!?
#'
#' @param lm_model Model to print from glm/lm/robustbase.
#' @param as Return type: 'glue' gives a y = mx + b string, anything else a list with m and int.
#' @return a string representation of that model.
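#' @examples
#' \dontrun{
#'  ## Using a dataset from base R:
#'  model <- lm(mpg ~ wt, data = mtcars)
#'  ymxb_print(model)
#'  ## y = -5.34x + 37.3
#' }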
#' @export
ymxb_print <- function(lm_model, as = "glue") {
  coefficients <- summary(lm_model)[["coefficients"]]
  int <- signif(x = coefficients["(Intercept)", 1], digits = 3)
  ## Some invocations of lm() provide different row/column names.
  m <- signif(x = coefficients[2, 1], digits = 3)
  ret <- NULL
  if (as != "glue") {
    retlst <- list(
      "m" = m,
      "int" = int)
    return(retlst)
  }
  if (as.numeric(int) >= 0) {
    ret <- glue::glue("y = {m}x + {int}")
  } else {
    int <- int * -1
    ret <- glue::glue("y = {m}x - {int}")
  }
  return(ret)
}


## EOF