R/pcl_utils.r

Defines functions pcl.group.f pcl.merge pcl.merge.meta pcl.match pcl.match.s pcl.match.f pcl.reorder.f pcl.reorder.s pcl.top.f pcl.top.s pcl.filter.f pcl.filter.s pcl.only pcl.apply.f pcl.apply.s pcl.make pcl.write pcl.read

for (lib in c("diffusionMap", "labdsv", "seriation", "signal", "tsne")) {
  if (!suppressPackageStartupMessages(require(lib, character.only = TRUE))) {
    stop(paste("Please install the R package: ", lib))
  }
}
# -----------------------------------------------------------------------------
#  PCL I/O

# library(plyr)
# library(dplyr)

pcl.read <- function(datafile, metadata.rows = NA, rowfeatures = T, sep = "\t") {
  # Read a pcl structure from a file
  # if metadata.rows is NA, it tries to guess the number of rows automatically
  # if metadata.rows is " ", a blank line is expected after the metadata

  library(data.table)
  starttime <- proc.time()["elapsed"]

  # Read in everything
  if (endsWith(datafile, ".gz")) {
    file <- gzfile(datafile, "r")
    raw <- read.delim(file, stringsAsFactors = F, fill = T, blank.lines.skip = F, sep = sep)
    close(file)
  } else {
    raw <- fread(datafile, data.table = F, stringsAsFactors = F, fill = T, sep = sep, header = T)
  }
  featnames <- raw[, 1]
  dups <- duplicated(featnames)
  if (any(dups)) {
    featnames[dups] <- sprintf("%s_#%d", featnames[dups], cumsum(dups)[dups])
  }
  rownames(raw) <- featnames
  raw <- raw[, -1]

  if (rowfeatures) {
    # Transpose if rows are features in the datafile
    raw <- as.data.frame(t(raw), stringsAsFactors = F)
  }

  # Which columns are numeric?
  sampnames <- rownames(raw)
  if (any(colwise(mode)(raw) != "numeric")) {
    raw <- colwise(type.convert)(raw)
  }
  rownames(raw) <- sampnames

  if (is.na(metadata.rows)) {
    # Try to figure out how many metadata there are
    first_allnum <- suppressWarnings(max(which(!colwise(is.numeric)(raw)))) + 1
    first_k__ <- suppressWarnings(min(which(grepl("^k__", colnames(raw)))))
    first_pwy <- suppressWarnings(min(which(grepl("PWY", colnames(raw)))))
    first_nonchar <- min(which(grepl("[^a-zA-Z0-9_]", colnames(raw))))
    first_feat <- min(first_k__, first_pwy, first_nonchar)

    metadata.rows <- max(first_allnum, ifelse(is.finite(first_feat), first_feat, 1) - 1)
  } else if (is.character(metadata.rows) && metadata.rows == " ") {
    blank <- min(which(featnames == ""))
    if (!is.finite(blank)) {
      stop(sprintf("metadata.rows is \" \", but no blank line was found in %s to denote the end of the metadata block.", datafile))
    }
    metadata.rows <- blank - 1
    featnames <- featnames[-blank]
    raw <- raw[, -blank]
  }

  # Pull out data and metadata
  metadata <- raw[, seq_len(metadata.rows)]
  x <- as.matrix(if (metadata.rows > 0) {
    raw[, -seq_len(metadata.rows)]
  } else {
    raw
  })

  # Report time taken
  runtime <- proc.time()["elapsed"] - starttime
  cat(sprintf("Loaded %s in %gs\n", datafile, runtime))

  return(list(x = x, meta = metadata, ns = dim(x)[1], nf = dim(x)[2]))
}

pcl.write <- function(dat, datafile, rowfeatures = T) {
  # Write a pcl structure to a file

  if (rowfeatures) {
    # Write the sample names
    write.table(cbind("", matrix(rownames(dat$x), nrow = 1)),
      file = datafile, sep = "\t",
      quote = F, col.names = F, row.names = F
    )

    # Write the metadata
    if (dim(dat$meta)[2] > 0) {
      write.table(t(as.matrix(dat$meta)),
        file = datafile, sep = "\t",
        quote = F, append = T, col.names = F
      )
    }

    # Append the data
    write.table(t(dat$x),
      file = datafile, sep = "\t",
      quote = F, append = T, col.names = F
    )
  } else {
    # Merge into one big table
    write.table(dat$x,
      file = datafile, sep = "\t",
      quote = F, append = F, col.names = NA
    )
  }
}

pcl.empty <- list(nf = 0, ns = 0, x = matrix(0, 0, 0), meta = data.frame())

pcl.make <- function(x, meta, metaf) {
  x <- as.matrix(x)
  dat <- list(x = x, nf = dim(x)[2], ns = dim(x)[1])
  dat$meta <- if (missing(meta)) {
    data.frame(row.names = rownames(x))
  } else {
    stopifnot(nrow(meta) == nrow(x)) # Metadata does not have the same number of samples as the data
    meta[match(rownames(x), rownames(meta)), ]
  }
  if (!missing(metaf)) {
    stopifnot(nrow(metaf) == ncol(x)) # Feature metadata does not have the same number of features as the data
    dat$metaf <- metaf[match(colnames(x), rownames(metaf)), ]
  }
  return(dat)
}

# -----------------------------------------------------------------------------
#  Function application over samples and features

pcl.apply.s <- function(dat, expr, fn = substitute(expr), enclos = parent.frame()) {
  # Function application over samples

  # Todo: add metaf support

  evalfor <- function(i) {
    e <- c(as.list(dat$meta[i, , drop = F]), as.list(dat$x[i, , drop = F]))
    e$Name <- rownames(dat$x)[i]
    e$Index <- i
    e$x <- dat$x[i, ]
    return(eval(fn, e, enclos))
  }

  if (dat$ns > 0) {
    firstval <- evalfor(1)
    res <- rep(firstval, dat$ns)
    if (dat$ns > 1) {
      for (i in 2:dat$ns) {
        res[i] <- evalfor(i)
      }
    }
    names(res) <- rownames(dat$x)
    return(res)
  } else {
    return(c())
  }
}

pcl.apply.f <- function(dat, expr, fn = substitute(expr), enclos = parent.frame()) {
  # Function application over features

  # Todo: add metaf support

  evalfor <- function(i) {
    e <- c(
      list(
        Name = colnames(dat$x)[i],
        x = dat$x[, i],
        Index = i
      ),
      as.list(dat$meta),
      as.list(dat$x[, i])
    )
    return(eval(fn, e, enclos))
  }

  if (dat$nf > 0) {
    firstval <- evalfor(1)
    res <- rep(firstval, dat$nf)
    if (dat$nf > 1) {
      for (i in 2:dat$nf) {
        res[i] <- evalfor(i)
      }
    }
    names(res) <- colnames(dat$x)
    return(res)
  } else {
    return(c())
  }
}


# -----------------------------------------------------------------------------
#  PCL Filtering

pcl.only <- function(x, clade, rank, keepname = F) {
  # Slice out a subset of the features by clade and rank
  # Assumes features represent bug abundances stored in k__Bacteria format

  lx <- is.list(x)
  if (lx) {
    stopifnot(!("metaf" %in% names(x))) # This function has not yet had metaf support added
    l <- x
    x <- l$x
  }

  if (!missing(clade)) {
    # Pull out the columns with only that clade
    keep <- grepl(sprintf("(^|\\|)%s($|\\|)", clade), colnames(x))
    x <- x[, keep, drop = F]
    killpat <- sprintf("^(|.*\\|)%s", clade)
    if (!keepname) {
      colnames(x) <- mapply(function(n) {
        gsub(killpat, clade, n)
      }, colnames(x))
    }
  }

  if (!missing(rank)) {
    # Pull out the columns with only that taxonomic rank
    keep <- grepl(sprintf("(^|\\|)%s__[^\\|]+$", rank), colnames(x))
    x <- x[, keep, drop = F]
    if (!keepname) {
      colnames(x) <- mapply(function(n) {
        gsub("^.*\\|", "", n)
      }, colnames(x))
    }
  }

  if (lx) {
    l$x <- x
    l$nf <- dim(x)[2]
    return(l)
  } else {
    return(x)
  }
}

pcl.filter.s <- function(dat, keepwhat, keep, enclos = parent.frame()) {
  # Filter samples based on some criterion
  if (missing(keep)) {
    keep <- pcl.apply.s(dat, fn = substitute(keepwhat), enclos = enclos)
  }
  dat$x <- dat$x[keep, , drop = F]
  dat$meta <- dat$meta[keep, , drop = F]
  dat$ns <- dim(dat$x)[1]
  return(dat)
}

pcl.filter.f <- function(dat, keepwhat, keep, enclos = parent.frame()) {
  # Filter features based on some criterion
  if (missing(keep)) {
    keep <- pcl.apply.f(dat, fn = substitute(keepwhat), enclos = enclos)
  }
  dat$x <- dat$x[, keep, drop = F]
  if ("metaf" %in% names(dat)) {
    dat$metaf <- dat$metaf[keep, , drop = F]
  }
  dat$nf <- dim(dat$x)[2]
  return(dat)
}

pcl.top.s <- function(dat, expr = mean(x), n = 10, fn = substitute(expr),
                      enclos = parent.frame(),
                      statistic = pcl.apply.s(dat, fn = fn, enclos = enclos)) {
  # Keep only the top n of the samples
  nonNA <- sum(!is.na(statistic))
  thresh <- if (nonNA == 0) {
    0
  } else {
    quantile(statistic, max(nonNA - n, 0) / nonNA, na.rm = T)
  }
  return(pcl.filter.s(dat, keep = !is.na(statistic) & statistic >= thresh))
}

pcl.top.f <- function(dat, expr = mean(x), n = 10, fn = substitute(expr),
                      enclos = parent.frame(),
                      statistic = pcl.apply.f(dat, fn = fn, enclos = enclos)) {
  # Keep only the top n of the features
  nonNA <- sum(!is.na(statistic))
  thresh <- if (nonNA == 0) {
    0
  } else {
    quantile(statistic, max(nonNA - n, 0) / nonNA, na.rm = T)
  }
  return(pcl.filter.f(dat, keep = !is.na(statistic) & statistic >= thresh))
}

pcl.reorder.s <- function(dat, ord) {
  # Reorders the samples according to the new ordering
  ord <- ord[!is.na(ord)]
  dat$x <- dat$x[ord, , drop = F]
  dat$meta <- dat$meta[ord, , drop = F]
  dat$ns <- length(ord)
  return(dat)
}

pcl.reorder.f <- function(dat, ord, na.val = NA, keep.na = !is.na(na.val)) {
  # Reorders the features according to the new ordering
  setNames <- F
  if (!is.numeric(ord)) {
    featNames <- ord
    setNames <- T
    ord <- match(ord, colnames(dat$x))
  }
  if (!keep.na) {
    ord <- ord[!is.na(ord)]
  }
  dat$x <- dat$x[, ord, drop = F]
  if (keep.na && !is.na(na.val)) {
    dat$x[, is.na(ord)] <- na.val
  }
  if ("metaf" %in% names(dat)) {
    dat$metaf <- dat$metaf[ord, , drop = F]
  }
  dat$nf <- length(ord)
  if (setNames) {
    colnames(dat$x) <- featNames
  }
  return(dat)
}

pcl.match.f <- function(dat1, dat2, unmatched.val = NULL) {
  # Matches dat1 to the columns of dat2

  stopifnot(!("metaf" %in% names(dat1))) # This function has not yet had metaf support added

  mt <- match(colnames(dat2$x), colnames(dat1$x))
  if (!is.null(unmatched.val)) {
    datm <- pcl.reorder.f(dat1, mt, keep.na = T)
    datm$x[, is.na(mt)] <- unmatched.val
    colnames(datm$x) <- colnames(dat2$x)
    return(datm)
  } else {
    return(pcl.reorder.f(dat1, mt))
  }
}

pcl.match.s <- function(dat1, dat2) {
  # Matches dat1 to the rows of dat2
  return(pcl.reorder.s(dat1, match(rownames(dat2$x), rownames(dat1$x))))
}

pcl.match <- function(dat1, dat2, unmatched.val = NULL) {
  # Matches dat1 to the rows and columns of dat2
  return(pcl.match.f(pcl.match.s(dat1, dat2), dat2, unmatched.val))
}

pcl.merge.meta <- function(dat, datm) {
  # Merges metadata from datm into dat
  # Data tables are untouched

  stopifnot(!("metaf" %in% names(datm))) # This function has not yet had metaf support added

  sm <- match(rownames(dat$meta), rownames(datm$meta))
  unm2m <- !(colnames(datm$meta) %in% colnames(dat$meta))
  dat$meta <- cbind(dat$meta, datm$meta[sm, which(unm2m), drop = F])

  return(dat)
}

pcl.merge <- function(..., lst = NULL) {
  # Merges a bunch of pcl's

  merge2 <- function(dat1, dat2) {
    # Merges two pcl's

    # This function has not yet had metaf support added
    stopifnot(!(("metaf" %in% names(dat1)) || ("metaf" %in% names(dat2))))

    # Find the features, metadata and samples that are common
    fm <- match(colnames(dat1$x), colnames(dat2$x))
    mm <- match(colnames(dat1$meta), colnames(dat2$meta))
    sm <- match(rownames(dat1$x), rownames(dat2$x))

    # Find the features, metadata and samples in dat2 that aren't in dat1
    unm2f <- !(colnames(dat2$x) %in% colnames(dat1$x))
    unm2m <- !(colnames(dat2$meta) %in% colnames(dat1$meta))
    unm2s <- !(rownames(dat2$x) %in% rownames(dat1$x))

    if (any(!is.na(fm)) && any(!is.na(sm))) {
      # There are samples AND features in common
      # Make sure they're equal
      stopifnot(
        all(dat1$x[!is.na(sm), !is.na(fm), drop = F] ==
          dat2$x[sm[!is.na(sm)], fm[!is.na(fm)], drop = F]),
        "Overlapping data must be equal to merge PCL's"
      )
    }

    if (any(!is.na(mm)) && any(!is.na(sm))) {
      # There are samples AND metadata in common
      # Make sure they're equal
      stopifnot(all(dat1$meta[!is.na(sm), !is.na(mm), drop = F] ==
        dat2$meta[sm[!is.na(sm)], mm[!is.na(mm)], drop = F]))
    }

    # All overlapping values are consistent between the two tables - merge!

    # Create the new merged dat
    dat <- list(
      ns = dat1$ns + sum(unm2s),
      nf = dat1$nf + sum(unm2f)
    )

    # Merge the data tables
    dat$x <- rbind(
      cbind(dat1$x, dat2$x[sm, which(unm2f), drop = F]),
      dat2$x[which(unm2s), c(fm, which(unm2f)), drop = F]
    )

    # Merge the metadata
    # NOTE: apparently data frames cannot be indexed with NAs, so the statement
    #       above for the data matrix cannot be used, and the bottom left part
    #       of the metadata data frame must be generated separately:
    bl <- dat1$meta[rep(1, sum(unm2s)), , drop = F]
    bl[, is.na(mm)] <- NA
    bl[, !is.na(mm)] <- dat2$meta[which(unm2s), mm[!is.na(mm)], drop = F]
    rownames(bl) <- rownames(dat2$meta)[which(unm2s)]
    if (dim(dat1$meta)[2] == 0) {
      # NOTE 2: rbind with a 0-column data.matrix does not produce the
      #         expected result.. hence this special case:
      dat$meta <- dat2$meta[c(sm, which(unm2s)), which(unm2m), drop = F]
    } else {
      dat$meta <- cbind(
        rbind(dat1$meta, bl),
        dat2$meta[c(sm, which(unm2s)), which(unm2m), drop = F]
      )
    }

    return(dat)
  }

  lst <- c(list(...), lst)
  if (length(lst) == 0) {
    return(pcl.empty)
  } else {
    merge_sublist <- function(lst) {
      if (length(lst) > 1) {
        midpt <- floor(length(lst) / 2)
        return(merge2(
          merge_sublist(lst[1:midpt]),
          merge_sublist(lst[(midpt + 1):length(lst)])
        ))
      } else {
        stopifnot(length(lst) == 1)
        return(lst[[1]])
      }
    }
    return(merge_sublist(lst))
  }
}

pcl.group.f <- function(dat, map, avg = F, mapping = F) {
  # Group features according to a mapping
  # If avg is T, features are averaged rather than summed
  # If mapping is T, then map is treated as a named vector providing
  # a feature name -> output name map, rather than a direct output name for
  # each feature.

  library(dplyr)

  if (mapping) {
    map <- map[match(colnames(dat$x), names(map))]
  }
  xm <- split(as.data.frame(t(dat$x)), map) %>%
    lapply(colwise(if (avg) {
      mean
    } else {
      sum
    })) %>%
    do.call(rbind, .) %>%
    t()

  if ("metaf" %in% names(dat)) {
    allsame <- function(x) length(x) == 0 || all(x == x[1])
    mfm <- split(pcl$metaf, map) %>%
      lapply(colwise(function(x) {
        if (any(!is.na(x)) && allsame(x[!is.na(x)])) {
          x[min(which(!is.na(x)))]
        } else {
          NA
        }
      })) %>%
      do.call(rbind, .)

    dat$metaf <- mfm
  }

  dat$x <- xm
  dat$nf <- ncol(xm)

  return(dat)
}

pcl.group.s <- function(dat, map, avg = F) {
  # Group samples according to a mapping
  # If avg is T, samples are averaged rather than summed
  # Metadata which is consistent across all grouped samples is kept
  # inconsistent metadata is tossed

  library(dplyr)

  xm <- split(as.data.frame(dat$x), map) %>%
    lapply(colwise(if (avg) {
      mean
    } else {
      sum
    })) %>%
    do.call(rbind, .)

  metaclean <- dat$meta[, colnames(dat$meta) != ""]
  mm <- split(metaclean, map) %>%
    lapply(., colwise(function(x) {
      if (any(!is.na(x)) && all(x == na.omit(x)[1], na.rm = T)) {
        na.omit(x)[1]
      } else {
        NA
      }
    })) %>%
    do.call(rbind, .)

  dat$x <- xm
  dat$meta <- mm
  dat$ns <- nrow(xm)

  return(dat)
}

pcl.t <- function(dat) {
  return(pcl.make(t(dat$x)))
}

# -----------------------------------------------------------------------------
#  Misc PCL Helpers

pcl.map.fnames <- function(dat, mapexpr, enclos = parent.frame()) {
  # Map a function to the feature names
  colnames(dat$x) <- pcl.apply.f(dat, fn = substitute(mapexpr), enclos = enclos)
  return(dat)
}

pcl.map.snames <- function(dat, mapexprenclos = parent.frame()) {
  # Map a function to the sample names
  rownames(dat$x) <- pcl.apply.s(dat, fn = substitute(mapexpr), enclos = enclos)
  if (dim(dat$meta)[2] > 0) {
    rownames(dat$meta) <- rownames(dat$x)
  } else {
    dat$meta <- data.frame(row.names = rownames(dat$x))
  }
  return(dat)
}

pcl.nicenames <- function(dat) {
  # Prettifies feature names by removing stratification and transforming
  # s__Xyz_Abc_1 to Xyz Abc 1
  # Works on pathway names as well as taxonomic trees
  # Also works on ;-separated taxonomies
  return(pcl.map.fnames(dat, gsub("_", " ", gsub(
    "^((.*\\|)?\\w__(.*\\.\\w__)?)|^.*\\||^.*; ", "", Name
  ))))
}

pcl.normalize <- function(dat, s = rowSums(dat$x, na.rm = T)) {
  # Normalize samples to sum to 1
  s[s == 0] <- 1
  if (dat$nf >= 1) {
    dat$x <- dat$x / s
  }
  return(dat)
}

pcl.sort.f <- function(dat, feat, fn = substitute(feat), enclos = parent.frame()) {
  return(pcl.reorder.f(dat, order(pcl.apply.f(dat, fn = fn, enclos = enclos))))
}

pcl.sort.s <- function(dat, feat, fn = substitute(feat), enclos = parent.frame()) {
  return(pcl.reorder.s(dat, order(pcl.apply.s(dat, fn = fn, enclos = enclos))))
}


# -----------------------------------------------------------------------------
#  Common analysis functions

pcl.compmatrix.s <- function(dat, fun, symmetric = TRUE) {
  # Sample-sample comparison matrix
  # FUNCTION LARGELY UNTESTED..

  val1 <- fun(dat$x[1, ], dat$x[1, ])
  comp <- matrix(val1, dat$ns, dat$ns)
  for (j in 2:dat$ns) {
    comp[1, j] <- fun(dat$x[1, ], dat$x[j, ])
  }
  if (symmetric) {
    for (i in 2:dat$ns) {
      for (j in i:dat$ns) {
        comp[i, j] <- fun(dat$x[i, ], dat$x[j, ])
        comp[j, i] <- comp[i, j]
      }
    }
  } else {
    for (i in 2:dat$ns) {
      for (j in 1:dat$ns) {
        comp[i, j] <- fun(dat$x[i, ], dat$x[j, ])
      }
    }
  }
  return(comp)
}

# -----------------------------------------------------------------------------
#  Heatmap visualization function
#' @export
pcl.heatmap <- function(dat, ..., meta = F, row_meta = F, logspace = F, pseudocount = 0,
                        zerocolor = NA, sqrtspace = F, gamma = 2, as = F,
                        ev = 10 / (dat$ns * dat$nf), # Extreme values
                        minx = quantile(dat$x[is.finite(dat$x) & (dat$x > 0)], ev),
                        maxx = quantile(dat$x[is.finite(dat$x) & (dat$x > 0)], 1 - ev),
                        metanames = NA, row_metanames = NA, reorder = T, divergent0 = NA,
                        show_colnames = T, show_rownames = T, treeheight_row = 100, treeheight_col = 100) {
  if (dat$ns == 0 || dat$nf == 0) {
    return(plot.new())
  }

  # Pretty heatmap of the data in dat
  library(pheatmap)

  params <- list(...)

  neg <- any(dat$x[is.finite(dat$x)] < 0)
  if (is.na(divergent0)) {
    divergent0 <- neg
  }

  def.params <- list()
  def.params$show_colnames <- show_colnames # dat$ns <= 50
  def.params$show_rownames <- show_rownames # dat$nf <= 40#40
  def.params$clustering_distance_rows <- if (divergent0) {
    "euclidean"
  } else {
    "bray/curtis"
  }
  def.params$clustering_distance_cols <- def.params$clustering_distance_rows
  def.params$cluster_rows <- dat$nf > 1
  def.params$cluster_cols <- dat$ns > 1
  # def.params$color <- colorRampPalette(c("black","blue","cyan","yellow","red"))(100)
  if (divergent0) {
    def.params$color <- colorRampPalette(c("dodgerblue", "white", "firebrick"))(101)
    mv <- max(1e-10, max(-max(minx, min(dat$x, na.rm = T)), min(maxx, max(dat$x, na.rm = T))))
    def.params$breaks <- c(seq(from = -mv, to = mv, length = 101))
  } else {
    # library(viridis)
    # def.params$color <- rev(mako(100)) #viridis(100) #
    def.params$color <- colorRampPalette(c("gray97", "goldenrod1", "firebrick"))(100)
  }
  def.params$treeheight_row <- treeheight_row
  def.params$treeheight_col <- treeheight_col
  def.params$border_color <- NA
  def.params$cuttree_rows <- 5
  def.params$cuttree_cols <- 5
  def.params$fontsize <- 6

  x <- t(dat$x)
  x[!is.finite(x)] <- 0
  zero <- x == 0

  # Override the clustering
  override_clustering <- function(mat, distance, def) {
    if (class(distance) == "dist") {
      return(distance)
    } else {
      method <- if (is.null(distance)) {
        def
      } else {
        distance
      }
      if (as) {
        mat <- asin(sqrt(mat / 100))
      }
      if (method == "correlation") {
        D <- as.dist(1 - cor(t(mat)))
      } else if (method %in% c(
        "manhattan", "euclidean", "canberra",
        "bray", "kulczynski", "jaccard", "gower",
        "altGower", "morisita", "horn", "mountford",
        "raup", "binomial", "chao", "cao", "mahalanobis"
      )) {
        library(vegan)
        D <- vegdist(mat, method)
      } else if (method %in% c(
        "steinhaus", "sorensen", "ochiai",
        "ruzicka", "bray/curtis", "roberts", "chisq"
      )) {
        library(labdsv)
        D <- dsvdis(mat, method)
      } else {
        D <- dist(mat, method = method)
      }
      return(D)
    }
  }

  params$clustering_distance_rows <- override_clustering(
    x,
    params$clustering_distance_rows, def.params$clustering_distance_rows
  )
  params$clustering_distance_cols <- override_clustering(
    t(x),
    params$clustering_distance_cols, def.params$clustering_distance_cols
  )

  # Perform data transformations
  x <- pmin(pmax(x, minx), maxx) + pseudocount
  if (logspace) {
    x <- log10(x)
  } else if (sqrtspace) {
    x <- x**(1 / gamma)
  }

  # Make zero special
  if (!divergent0 && (length(zerocolor) > 1 || !is.na(zerocolor) && any(zero))) {
    x[zero] <- min(x) - (max(x) - min(x)) * (2 / length(def.params$color))
    if ("color" %in% names(params)) {
      params$color <- c(
        if (is.character(zerocolor)) {
          zerocolor
        } else {
          "gray95"
        },
        params$color[1], params$color
      )
    } else {
      def.params$color <- c(
        if (is.character(zerocolor)) {
          zerocolor
        } else {
          "gray95"
        },
        def.params$color[1], def.params$color
      )
    }
  }

  # Add metadata annotation
  if (length(meta) > 0 && (length(meta) > 1 || !is.na(meta))) {
    if (length(meta) == 1 && is.logical(meta)) {
      if (meta) {
        def.params$annotation_col <- dat$meta[, , drop = F]
      }
    } else {
      def.params$annotation_col <- dat$meta[, match(meta, colnames(dat$meta)), drop = F]
    }

    for (i in seq_along(def.params$annotation_col)) {
      if (class(def.params$annotation_col[, i]) == "logical") {
        def.params$annotation_col[, i] <- factor(def.params$annotation_col[, i])
      }
    }

    if (length(metanames) > 1 || !is.na(metanames)) {
      names2 <- names(def.params$annotation_col)
      nmatch <- match(names2, names(metanames))
      names2[!is.na(nmatch)] <- metanames[nmatch[!is.na(nmatch)]]
      names(def.params$annotation_col) <- names2
    }
  }
  # Add row metadata annotation
  if (length(row_meta) > 0 && (length(row_meta) > 1 || !is.na(row_meta))) {
    if (length(row_meta) == 1 && is.logical(row_meta)) {
      if (row_meta) {
        def.params$annotation_row <- dat$row_meta[, , drop = F]
      }
    } else {
      def.params$annotation_row <- dat$row_meta[, match(row_meta, colnames(dat$row_meta)), drop = F]
    }

    for (i in seq_along(def.params$annotation_row)) {
      if (class(def.params$annotation_row[, i]) == "logical") {
        def.params$annotation_row[, i] <- factor(def.params$annotation_row[, i])
      }
    }

    if (length(row_metanames) > 1 || !is.na(row_metanames)) {
      names2 <- names(def.params$annotation_row)
      nmatch <- match(names2, names(row_metanames))
      names2[!is.na(nmatch)] <- row_metanames[nmatch[!is.na(nmatch)]]
      names(def.params$annotation_row) <- names2
    }
  }
  if (reorder) {
    def.params$clustering_callback <- function(hc, u) {
      library(seriation)
      m <- if (all(dim(u) == dim(x)) && all(u == x)) {
        params$clustering_distance_rows
      } else {
        params$clustering_distance_cols
      }
      ord <- get_order(seriate(m, control = list(hclust = hc), method = "OLO"), 1)
      hc$order <- ord
      return(hc)
    }
  }

  #     def.params$clustering_callback <- function(hc, x) {
  #         library(vegan)
  #         D <- dsvdis(x, "bray/curtis")
  #         return (reorder.hclust(hc, D)) # D is not what is expected here
  #     }


  args <- c(list(mat = x), params, def.params)
  args <- args[unique(names(args))]
  do.call(pheatmap, args)
}

# -----------------------------------------------------------------------------
#  Ordination functions

pcl.nmds <- function(dat, asinsqrt = T, remove_zeros = T, ...) {
  # Performs a NMDS ordination

  x <- dat$x

  # Arcsin Sqrt transform
  if (asinsqrt) {
    maxx <- max(x)
    if (maxx > 1) {
      if (maxx > 100) {
        x <- x / maxx
      } else {
        x <- x / 100
      }
    }
    x <- asin(sqrt(x))
  }

  # Warn on zero samples to avoid some confusion about the wierd error
  # vegan gives in this condition
  zerosamples <- apply(x, 1, function(xx) {
    all(xx == 0)
  })
  if (any(zerosamples)) {
    if (remove_zeros) {
      x <- x[!zerosamples, ]
    } else {
      warning("There are samples with zero abundances. This may cause errors in metaMDS.")
    }
  }

  library(vegan)
  ord <- metaMDS(x, autotransform = F, ...)

  return(ord)
}

#' Perform Principal Coordinate Analysis (PCoA)
#'
#' This function performs Principal Coordinate Analysis (PCoA) on the input data
#' or distance matrix.
#'
#' @param dat A list containing at least an element 'x' which is the data matrix.
#' @param D Distance matrix. If NA, it will be computed from dat$x. Default is NA.
#' @param as Logical. If TRUE, performs arcsine transformation (deprecated parameter). Default is FALSE.
#' @param asinsqrt Logical. If TRUE, performs arcsine square root transformation. Default is the value of 'as'.
#' @param index Character string specifying the distance index to use when computing D. Default is "bray/curtis".
#' @param k Integer. The number of dimensions for PCoA. Must be at least 2. Default is 2.
#'
#' @return A list containing PCoA results, including:
#'   \item{points}{A matrix of PCoA coordinates}
#'   \item{eig}{Eigenvalues}
#'   \item{ordnames}{A character vector of dimension names with variance explained}
#'
#' @details
#' If a distance matrix D is not provided, the function computes it from the input data
#' using the specified index. If asinsqrt is TRUE, the data is transformed using an
#' arcsine square root transformation before computing distances. The PCoA is then
#' performed on the distance matrix.
#'
#' @note
#' This function requires the 'vegan', 'labdsv', and 'ecodist' packages.
#'
#' @examples
#' \dontrun{
#' data <- list(x = matrix(rnorm(1000), ncol = 10))
#' rownames(data$x) <- paste0("Sample", 1:100)
#' result <- pcl.pcoa(data, k = 3)
#' plot(result$points[, 1], result$points[, 2], main = "PCoA plot")
#' }
#'
#' @export
pcl.pcoa <- function(
    dat, D = NA, as = FALSE, asinsqrt = as,
    index = "bray/curtis", k = 2) {
  if (length(D) == 1 && is.na(D)) {
    x <- dat$x
    x[!is.finite(x)] <- 0

    if (asinsqrt) {
      x <- asin_sqrt_transform(x)
    }

    k <- max(k, 2)
    D <- if (index == "euclidean") {
      vegan::vegdist(x, index)
    } else {
      labdsv::dsvdis(x, index)
    }
  }

  pc <- vegan::pco(D, k)

  toteig <- sum(pc$eig[pc$eig > 0])
  pc$ordnames <- sprintf("PCo %d (%0.1f%%)", seq_len(k), 100 * pc$eig[seq_len(k)] / toteig)

  return(pc)
}

#' Perform Arcsine Square Root Transformation
#'
#' @param x Numeric vector or matrix
#' @return Transformed data
#' @keywords internal
asin_sqrt_transform <- function(x) {
  maxx <- max(x)
  if (maxx > 1) {
    x <- if (maxx > 100) x / maxx else x / 100
  }
  asin(sqrt(x))
}

pcl.pcoa_sharpel <- function(dat, D = NA, as = F, asinsqrt = as, index = "bray/curtis", k = 2, var_capture = 0.9) {
  # Performs a PCoA ordination

  if (length(D) == 1 && is.na(D)) {
    x <- dat$x
    x[!is.finite(x)] <- 0

    # Arcsin Sqrt transform
    if (asinsqrt) {
      maxx <- max(x)
      if (maxx > 1) {
        if (maxx > 100) {
          x <- x / maxx
        } else {
          x <- x / 100
        }
      }
      x <- asin(sqrt(x))
    }

    k <- max(k, 2)
    D <- labdsv::dsvdis(x, index)
  }

  pc <- vegan::pco(D, dat$ns - 1)

  varfrac <- cumsum(pmax(0, pc$eig))
  varfrac <- varfrac / max(varfrac)
  qk <- min(which(varfrac > var_capture))

  spc <- pcaL1::sharpel1pca(sweep(pc$points[, 1:qk], 2, varfrac[1:qk], FUN = "*"), k, projections = "l1")
  ord <- list(
    points = spc$scores,
    ordnames = sprintf("Axis %d (%0.1f%%)", 1:k, 100 * spc$dispExp[1:k] * varfrac[qk])
  )

  return(ord)
}

pcl.pca_sharpel <- function(dat, k = 2) {
  spc <- pcaL1::sharpel1pca(dat$x, k, projections = "l1")
  ord <- list(
    points = spc$scores,
    ordnames = sprintf("Axis %d (%0.1f%%)", 1:k, 100 * spc$dispExp[1:k])
  )

  return(ord)
}

pcl.dmap <- function(dat, D = NA, as = F, asinsqrt = as, index = "bray/curtis", k = 2) {
  # Performs a PCoA ordination
  if (length(D) == 1 && is.na(D)) {
    x <- dat$x
    x[!is.finite(x)] <- 0

    # Arcsin Sqrt transform
    if (asinsqrt) {
      maxx <- max(x)
      if (maxx > 1) {
        if (maxx > 100) {
          x <- x / maxx
        } else {
          x <- x / 100
        }
      }
      x <- asin(sqrt(x))
    }

    k <- max(k, 2)
    D <- labdsv::dsvdis(x, index)
  }

  dm <- diffusionMap::diffuse(D, maxdim = k)

  ord <- dm
  ord$points <- dm$X
  ord$ordnames <- sprintf("DMap %d", 1:k)
  rownames(ord$points) <- rownames(dat$x)
  return(ord)
}

pcl.lda <- function(dat, meta, as = T, asinsqrt = as, k = 2) {
  # Linear Discriminant Analysis

  x <- dat$x
  groups <- dat$meta[, meta]

  if (any(is.na(groups))) {
    warning("NAs in the metadata")
    x <- x[!is.na(groups), ]
    groups <- groups[!is.na(groups)]
  }

  # Arcsin Sqrt transform
  if (asinsqrt) {
    maxx <- max(x)
    if (maxx > 1) {
      if (maxx > 100) {
        x <- x / maxx
      } else {
        x <- x / 100
      }
    }
    x <- asin(sqrt(x))
  }

  # lda can't handle variables with zero variance.. remove them
  for (i in seq_along(levels(groups))) {
    xvar <- apply(x[groups == levels(groups)[i], ], 2, var)
    x <- x[, xvar > 0]
  }

  res <- MASS::lda(x, groups)
  p <- predict(res, x)

  ord <- list(lda_res = res)
  if (dim(p$x)[2] >= 2) {
    ord$points <- p$x
    ord$ordnames <- sprintf("LD %d", 1:dim(p$x)[2])
  } else {
    ord2 <- pcl.pcoa(dat, k = 1)

    ord$points <- cbind(p$x, ord2$points[, 1])
    ord$ordnames <- c("LD 1", ord2$ordnames[1])
  }

  return(ord)
}

#' Perform t-SNE Ordination
#'
#' This function performs t-Distributed Stochastic Neighbor Embedding (t-SNE) ordination
#' on the input data or distance matrix.
#'
#' @param dat A list containing at least an element 'x' which is the data matrix.
#' @param D Distance matrix. If NA, it will be computed from dat$x. Default is NA.
#' @param as Logical. If TRUE, performs arcsine transformation (deprecated parameter). Default is FALSE.
#' @param asinsqrt Logical. If TRUE, performs arcsine square root transformation. Default is the value of 'as'.
#' @param index Character string specifying the distance index to use when computing D. Default is "bray/curtis".
#' @param k Integer. The number of dimensions for t-SNE. Must be at least 2. Default is 2.
#' @param seed Integer. Random seed for reproducibility. Default is 1234.
#'
#' @return A list containing two elements:
#'   \item{points}{A matrix of t-SNE coordinates}
#'   \item{ordnames}{A character vector of dimension names}
#'
#' @details
#' The function first checks if a distance matrix D is provided. If not, it computes
#' the distance matrix from the input data using the specified index.
#' If asinsqrt is TRUE, the data is transformed using an arcsine square root transformation
#' before computing distances. The t-SNE algorithm is then applied to the distance matrix.
#'
#' @note
#' This function requires the 'tsne' and 'labdsv' packages.
#'
#' @examples
#' \dontrun{
#' data <- list(x = matrix(rnorm(1000), ncol = 10))
#' rownames(data$x) <- paste0("Sample", 1:100)
#' result <- pcl.tsne(data, k = 3)
#' plot(result$points[, 1], result$points[, 2], main = "t-SNE plot")
#' }
#'
#' @export
pcl.tsne <- function(
    dat, D = NA, as = FALSE, asinsqrt = as,
    index = "bray/curtis", k = 2, seed = 1234) {
  k <- max(k, 2) # Ensure k is at least 2

  if (is.na(D)) {
    x <- dat$x
    x[!is.finite(x)] <- 0

    if (asinsqrt) {
      x <- asin_sqrt_transform(x)
    }

    D <- labdsv::dsvdis(x, index)
  }

  set.seed(seed)
  tsn <- tsne::tsne(D, k = k)

  ord <- list(
    points = tsn,
    ordnames = sprintf("t-SNE %d", seq_len(k))
  )
  rownames(ord$points) <- rownames(dat$x)

  return(ord)
}

#' Perform Arcsine Square Root Transformation
#'
#' @param x Numeric vector or matrix
#' @return Transformed data
#' @keywords internal
asin_sqrt_transform <- function(x) {
  maxx <- max(x)
  if (maxx > 1) {
    x <- if (maxx > 100) x / maxx else x / 100
  }
  asin(sqrt(x))
}

#' @export
pcl.ordplot <- function(dat, ord, pcos = 2, pointoutline = T,
                        colour = NA, colour_title = NA, colour_names = NA, colour_override = NA,
                        shape = NA, shape_title = NA, shape_names = NA, shape_override = NA,
                        surface = NA, surf_maj_lines = 2, surf_min_lines = 4, surf_extent = 1,
                        surf_smoothness = 5, surf_quality = 100, size_abs = NA, outline_size = NA,
                        centroid = NA, loading = NA, sigloadings = NA,
                        enriched = NA, text_halo = T,
                        colour_log = F, colour_log_pseudocount = 0, colour_log_base = 10,
                        arrows_fixed = NULL, arrows = NULL, arrow_text = T, arrow_sqrtnorm = T,
                        connect = NA, sequence = NA, connectwidth = NA, sortby = NA, decreasing = F) {
  # ggplot2-based function to visualize the results of an ordination

  # Set up the plot
  ggp <- ggplot2::ggplot() +
    ggplot2::theme_classic() +
    ggplot2::theme(axis.text.x = ggplot2::element_blank(), axis.ticks.x = ggplot2::element_blank()) +
    ggplot2::theme(axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank())
  gptopt <- list()
  gptaes <- list(x = "dim1", y = "dim2")

  # Get the dimensions to plot, and set the axes appropriately
  if (length(pcos) == 1) {
    pcos <- max(pcos, 2)
    pcos <- c(pcos - 1, pcos)
  } else {
    stopifnot(length(pcos) == 2)
  }
  stopifnot(max(pcos) <= dim(ord$points)[2])
  pts <- as.data.frame(ord$points[, pcos])
  if (is.null(ord$ordnames)) {
    dims <- colnames(pts)
  } else {
    dims <- ord$ordnames[pcos]
  }
  colnames(pts) <- c("dim1", "dim2")
  ggp <- ggp + xlab(dims[1]) + ylab(dims[2])

  # Use this mapping to allow the ordination to be done on a subset of the
  # data without requiring that dat also be that subset
  metamatch <- match(rownames(pts), rownames(dat$x))
  if (any(is.na(metamatch))) {
    pts <- pts[!is.na(metamatch), ]
    metamatch <- metamatch[!is.na(metamatch)]
  }
  if (is.na(size_abs)) {
    gptopt$size <- min(3, 50 / sqrt(length(metamatch)))
  } else {
    gptopt$size <- size_abs
  }

  getmeta <- function(name) {
    tgt <- name == colnames(dat$meta)
    if (any(tgt)) {
      return(dat$meta[metamatch, tgt])
    }
    tgt <- name == colnames(dat$x)
    if (any(tgt)) {
      return(dat$x[metamatch, min(which(tgt))])
    }

    stop(sprintf("%s is not a metadata or feature.", name))
  }

  # Apply sorting
  if (!is.na(sortby)) {
    s <- getmeta(sortby)
    pts <- pts[order(s, decreasing = decreasing), ]
    metamatch <- match(rownames(pts), rownames(dat$x))
  }

  # Draw connectors first
  if (!is.na(connect) || !is.na(sequence)) {
    df <- data.frame(x = pts$dim1, y = pts$dim2)

    if (!is.na(connect)) {
      connectby <- getmeta(connect)
      df$group <- connectby
    }
    if (!is.na(sequence)) {
      seqby <- getmeta(sequence)
      df <- df[order(seqby), ]
    }
    if (!is.na(connect)) {
      ggp <- ggp + ggplot2::geom_path(data = df, ggplot2::aes(x = x, y = y, group = group), size = ifelse(is.na(connectwidth), 1, connectwidth))
    } else {
      ggp <- ggp + ggplot2::geom_path(data = df, ggplot2::aes(x = x, y = y), size = ifelse(is.na(connectwidth), 1, connectwidth))
    }
  }

  #     set_by <- function(bywhat, names, override, title,
  #                        aeskey, default, ggscale_manual, legoverride) {
  #         if (!is.na(bywhat)) {
  #             byval <- dat$meta[[bywhat]][metamatch]
  #             if (length(names) > 1 || !is.na(names)) {
  #                 # reorder the factors
  #                 byval <- factor(levels(shapeby)[as.numeric(shapeby)],
  #                                   levels=names(names))
  #             }
  #             pts[[aeskey]] <- byval
  #             gptaes[[aeskey]] <- aeskey
  #             if (is.na(title)) {
  #                 title <- bywhat
  #             }
  #
  #             guideopt <- list()
  #             guideopt[[aeskey]] <- guide_legend(title = shape_title,
  #                                                override.aes = legoverride)
  #             ggp <- ggp + do.call(guides, guideopt)
  #             if (length(override) > 1 || !is.na(override)) {
  #                 ggp <- ggp + ggscale_manual(values=unlist(override),
  #                                             breaks=names(names),
  #                                             labels=unlist(names))
  #             }
  #
  #         } else if (length(override) == 1 && !is.na(override)) {
  #             gptopt[[aeskey]] <- override
  #         } else {
  #             gptopt[[aeskey]] <- default
  #         }
  #     }

  # Draw the surface
  if (!is.na(surface)) {
    surfby <- getmeta(surface)

    # pick an appropriate bandwidth
    bw1 <- surf_smoothness * 1.06 * sd(pts$dim1) * length(pts$dim1)^-0.2
    bw2 <- surf_smoothness * 1.06 * sd(pts$dim2) * length(pts$dim2)^-0.2

    # only make the surface from finite values
    keep <- !is.na(surfby) & is.finite(surfby)
    surfby <- surfby[keep]
    xp <- pts$dim1[keep]
    yp <- pts$dim2[keep]

    # distance from the points to draw
    extent <- 0.5 * surf_extent^2 / surf_smoothness

    # surface from a Gaussian weighted mean
    gkmeansurf <- function(x, y) {
      dx <- x - xp
      dy <- y - yp
      chi2 <- (dx / bw1)^2 + (dy / bw2)^2
      kern <- exp(-chi2)

      theta <- signal::unwrap(atan2(dy, dx))
      mnchi2 <- min(chi2)
      if (mnchi2 > extent && (max(theta) - min(theta)) < pi) {
        return(NA)
      }

      return(-sum(surfby * kern) / sum(kern))
    }

    # drop a grid on the ordination
    n <- surf_quality + 1
    border <- surf_extent
    xx <- matrix(seq(
      from = min(pts$dim1) - border * bw1,
      to = max(pts$dim1) + border * bw1,
      length = n
    ), n, n, byrow = F)
    yy <- matrix(seq(
      from = min(pts$dim2) - border * bw2,
      to = max(pts$dim2) + border * bw2,
      length = n
    ), n, n, byrow = T)

    # measure the surface at all points
    z <- mapply(gkmeansurf, as.vector(xx), as.vector(yy))

    # only keep points in range
    keep <- !is.na(z)

    # plot contours
    zrange <- max(z[keep]) - min(z[keep])
    cdata <- data.frame(x = as.vector(xx)[keep], y = as.vector(yy)[keep], z = z[keep])
    ggp <- ggp + ggplot2::stat_contour(ggplot2::aes(x = x, y = y, z = z),
      data = cdata,
      size = 0.5, colour = "grey50", binwidth = zrange / (surf_maj_lines * (surf_min_lines + 1))
    )
    ggp <- ggp + ggplot2::stat_contour(ggplot2::aes(x = x, y = y, z = z),
      data = cdata,
      size = 1, colour = "black", binwidth = zrange / surf_maj_lines
    )
  }

  # Set up the plot to shape by some metadata
  if (!is.na(shape)) {
    shapeby <- getmeta(shape)
    if (length(shape_names) > 1 || !is.na(shape_names)) {
      # reorder the factors to match the order in shape_names
      # so that the legend is displayed in the correct order
      shapeby <- factor(levels(shapeby)[as.numeric(shapeby)],
        levels = names(shape_names)
      )
    }
    pts$shape <- shapeby
    gptaes$shape <- "shape"
    if (is.na(shape_title)) {
      shape_title <- shape
    }

    ggp <- ggp + ggplot2::guides(shape = ggplot2::guide_legend(
      title = shape_title,
      override.aes = list(size = 4, fill = "gray")
    ))

    if (length(shape_override) > 1 || !is.na(shape_override)) {
      # User-defined shapes
      if (length(shape_names) == 1 && is.na(shape_names)) {
        shape_names <- shape_override
        shape_names[names(shape_override)] <- names(shape_override)
      }
      ggp <- ggp + ggplot2::scale_shape_manual(
        values = unlist(shape_override),
        breaks = names(shape_names),
        labels = unlist(shape_names)
      )
    } else if (pointoutline) {
      # Automatic shapes - select the "fillable" ones
      shps <- list()
      shapeby <- factor(shapeby)
      for (i in 1:length(levels(shapeby))) {
        shps[[levels(shapeby)[i]]] <- 20 + i
      }
      ggp <- ggp + ggplot2::scale_shape_manual(values = unlist(shps))
    } else {
      # Automatic shapes - no outlines
      shps <- list()
      shapeby <- factor(shapeby)
      for (i in 1:length(levels(shapeby))) {
        shps[[levels(shapeby)[i]]] <- 15 + i
      }
      ggp <- ggp + ggplot2::scale_shape_manual(values = unlist(shps))
    }
  } else if (length(shape_override) == 1 && !is.na(shape_override)) {
    gptopt$shape <- shape_override
  } else {
    gptopt$shape <- if (pointoutline) {
      21
    } else {
      19
    }
  }

  # Set up the plot to colour based on metadata
  if (pointoutline) {
    ggscale_manual <- scale_fill_manual
    ggscale_gradientn <- scale_fill_gradientn
    ggscale_brewer <- scale_fill_brewer
    ggcolorfield <- "fill"
    if (!is.na(outline_size)) {
      gptopt[["stroke"]] <- outline_size
    }
  } else {
    ggscale_manual <- scale_colour_manual
    ggscale_gradientn <- scale_colour_gradientn
    ggscale_brewer <- scale_colour_brewer
    ggcolorfield <- "colour"
  }
  if (!is.na(colour)) {
    colby <- getmeta(colour)
    if (length(colour_names) > 1 || !is.na(colour_names)) {
      # reorder the factors to match the order in colour_names
      # so that the legend is displayed in the correct order
      colby <- factor(as.character(colby),
        levels = names(colour_names)
      )
    }
    if (is.numeric(colby) && colour_log) {
      colby <- log(colby + colour_log_pseudocount, base = colour_log_base)
    }
    pts$colour <- colby
    gptaes[[ggcolorfield]] <- "colour"

    if (is.na(colour_title)) {
      colour_title <- colour
    }

    if (length(colour_override) > 1 || !is.na(colour_override)) {
      # Manual colouring
      if (length(colour_names) == 1 && is.na(colour_names)) {
        colour_names <- colour_override
        colour_names[names(colour_override)] <- names(colour_override)
      }
      if (is.numeric(colby)) {
        ggp <- ggp + ggscale_gradientn(colours = colour_override, na.value = "grey80")
      } else {
        ggp <- ggp + ggscale_manual(
          values = unlist(colour_override),
          breaks = names(colour_names),
          labels = unlist(colour_names)
        )
      }
      col_guide <- guide_legend(
        title = colour_title,
        override.aes = list(size = 3, shape = 21)
      )
    } else if (is.numeric(colby)) {
      # Continuous data uses the RdYlGn palette
      # library(RColorBrewer)
      # ggp <- ggp + ggscale_gradientn(colours = colorRampPalette(c("dodgerblue","goldenrod1","firebrick"))(100))
      library(viridis)
      ggp <- ggp + ggscale_gradientn(colours = viridis(100), na.value = "grey80")
      # brewer.pal(n = 11, name = "YlGnBu"))
      # ggp <- ggp + ggscale_gradientn(colours = rev(brewer.pal(n = 11, name = "RdYlGn")))
      col_guide <- guide_colourbar(title = colour_title)
    } else if (length(levels(colby)) > 9) {
      library(RColorBrewer)
      ggp <- ggp + ggscale_manual(values = colorRampPalette(
        brewer.pal(n = 7, name = "Spectral")
      ))(length(levels(colby)))
      col_guide <- guide_legend(
        title = colour_title,
        override.aes = list(size = 3, shape = 21)
      )
    } else {
      # Discrete data uses the Set1 palette
      ggp <- ggp + scale_fill_brewer(palette = "Spectral")
      col_guide <- guide_legend(
        title = colour_title,
        override.aes = list(size = 3, shape = 21)
      )
    }
    if (pointoutline) {
      ggp <- ggp + guides(fill = col_guide)
    } else {
      ggp <- ggp + guides(colour = col_guide)
    }
  } else if (length(colour_override) == 1 && !is.na(colour_override)) {
    gptopt[[ggcolorfield]] <- colour_override
  } else {
    gptopt[[ggcolorfield]] <- "darkslategray4"
  }

  # Draw the points
  gptopt$data <- pts
  gptopt <- c(list(do.call(aes_string, gptaes)), gptopt)
  ggp <- ggp + do.call(geom_point, gptopt)

  # Helper to add text annotations
  xr <- 0.0018
  dx <- xr * (max(pts[, 1]) - min(pts[, 1]))
  dy <- 1.1 * xr * (max(pts[, 2]) - min(pts[, 2]))
  halo_quality <- 12
  put_text <- function(x, y, text, ...) {
    if (text_halo) {
      phi <- 2 * pi / halo_quality
      for (i in 1:halo_quality) {
        ggp <<- ggp + annotate("text",
          label = text, color = "white",
          x = x + dx * cos((i - 0.5) * phi),
          y = y + dy * sin((i - 0.5) * phi), ...
        )
      }
    }
    ggp <<- ggp + annotate("text", label = text, x = x, y = y, ..., color = "black")
  }

  # Add metadata centroids
  if (length(centroid) > 1 || !is.na(centroid)) {
    for (i in seq_along(centroid)) {
      meta <- getmeta(centroid[i])

      if (is.numeric(meta)) {
        d1 <- sum(pts$dim1 * meta) / sum(meta)
        d2 <- sum(pts$dim2 * meta) / sum(meta)
        put_text(d1, d2, centroid[i])
      } else {
        meta <- factor(meta) # remove unused
        for (j in seq_along(levels(meta))) {
          d1 <- mean(pts$dim1[meta == levels(meta)[j]])
          d2 <- mean(pts$dim2[meta == levels(meta)[j]])
          put_text(d1, d2, levels(meta)[j])
        }
      }
    }
  }

  # Add metadata enrichment modes
  if (length(enriched) > 1 || !is.na(enriched)) {
    library(neldermead)
    for (i in seq_along(enriched)) {
      meta <- getmeta(enriched[i])

      if (!is.numeric(meta)) {
        stop(sprintf("%s must be numeric to find its enrichment mode"))
      }

      # select appropriate bandwidths
      bw1 <- 1.06 * sd(pts$dim1) * length(pts$dim1)^-0.2
      bw2 <- 1.06 * sd(pts$dim2) * length(pts$dim2)^-0.2

      # only use finite values
      keep <- !is.na(meta) & is.finite(meta)
      meta <- meta[keep]
      xp <- pts$dim1[keep]
      yp <- pts$dim2[keep]

      enr <- function(xy) {
        chi2 <- ((xy[1] - xp) / bw1)^2 + ((xy[2] - yp) / bw2)^2
        kern <- exp(-chi2)
        nsamps <- sum(kern)
        modulator <- nsamps / (nsamps + 1)
        return(modulator * -sum(meta * kern) / sum(kern))
      }

      bestxy <- 0
      bestnenr <- Inf
      for (x0 in seq(from = min(pts$dim1), to = max(pts$dim1), length = 5)) {
        for (y0 in seq(from = min(pts$dim2), to = max(pts$dim2), length = 5)) {
          res <- fminsearch(enr, c(x0, y0))
          if (neldermead.get(res, "fopt") < bestnenr) {
            bestxy <- neldermead.get(res, "xopt")
            bestnenr <- neldermead.get(res, "fopt")
          }
        }
      }

      put_text(bestxy[1], bestxy[2], enriched[i])
    }
  }

  # Add arrows
  if (!is.null(arrows) || !is.null(arrows_fixed)) {
    if (is.null(arrows_fixed)) {
      arr_pcos <- matrix(0, 0, 2, dimnames = list(c(), dims))
    } else {
      arr_pcos <- arrows_fixed[, pcos]
    }
    if (!is.null(arrows)) {
      if (is.numeric(arrows)) {
        arrSet <- c()
        feats <- c(colnames(dat$x))
        for (i in seq_along(feats)) {
          meta <- getmeta(feats[i])

          if (is.numeric(meta)) {
            d1 <- sum(pts$dim1 * meta) / sum(meta)
            d2 <- sum(pts$dim2 * meta) / sum(meta)
            arr <- matrix(c(d1, d2), 1, 2)
            rownames(arr) <- feats[i]
            arrSet <- rbind(arrSet, arr)
          } else {
            meta <- factor(meta) # remove unused
            for (j in seq_along(levels(meta))) {
              d1 <- mean(pts$dim1[meta == levels(meta)[j]])
              d2 <- mean(pts$dim2[meta == levels(meta)[j]])
              arr <- matrix(c(d1, d2), 1, 2)
              rownames(arr) <- levels(meta)[j]
              arrSet <- rbind(arrSet, arr)
            }
          }
        }
        arrSz <- arrSet[, 1]^2 + arrSet[, 2]^2
        I <- order(-arrSz)
        arr_pcos <- rbind(arr_pcos, arrSet[I[1:arrows], ])
      } else {
        for (i in seq_along(arrows)) {
          meta <- getmeta(arrows[i])

          if (is.numeric(meta)) {
            d1 <- sum(pts$dim1 * meta) / sum(meta)
            d2 <- sum(pts$dim2 * meta) / sum(meta)
            arr <- matrix(c(d1, d2), 1, 2)
            rownames(arr) <- arrows[i]
            arr_pcos <- rbind(arr_pcos, arr)
          } else {
            meta <- factor(meta) # remove unused
            for (j in seq_along(levels(meta))) {
              d1 <- mean(pts$dim1[meta == levels(meta)[j]])
              d2 <- mean(pts$dim2[meta == levels(meta)[j]])
              arr <- matrix(c(d1, d2), 1, 2)
              rownames(arr) <- levels(meta)[j]
              arr_pcos <- rbind(arr_pcos, arr)
            }
          }
        }
      }
    }

    ptsRS <- rowSums(pts[, 1:2]^2)
    maxRadius <- sqrt(max(ptsRS[is.finite(ptsRS)]))
    maxArrRadius <- sqrt(max(rowSums(arr_pcos^2)))
    arr_pcos <- arr_pcos * (0.85 * maxRadius / maxArrRadius)
    if (arrow_sqrtnorm) {
      arr_len <- sqrt(rowSums(arr_pcos^2))
      maxarr_len <- max(arr_len)
      narr_len <- maxarr_len * sqrt(arr_len / maxarr_len)
      arr_pcos <- arr_pcos * (narr_len / arr_len)
    }
    colnames(arr_pcos) <- paste("dim", 1:length(pcos), sep = "")

    ggp <- ggp + geom_segment(
      data = as.data.frame(arr_pcos),
      aes(x = 0, y = 0, xend = dim1, yend = dim2),
      colour = "red", arrow = arrow()
    )

    if (arrow_text) {
      for (i in seq_len(nrow(arr_pcos))) {
        rr <- arr_pcos[i, ]
        hjust <- 0.5
        vjust <- 0.5
        if (abs(rr[1]) > abs(rr[2])) {
          hjust <- if (rr[1] > 0) {
            0
          } else {
            1
          }
        } else {
          vjust <- if (rr[2] > 0) {
            0
          } else {
            1
          }
        }
        tc <- rr * (1 + 0.015 * maxRadius / sqrt(sum(rr^2)))
        put_text(tc[1], tc[2], rownames(arr_pcos)[i],
          hjust = hjust, vjust = vjust, color = "red"
        )
      }
    }
  }

  return(ggp)
}

# -----------------------------------------------------------------------------
#  Efficient massive list/vector creation

blmerge <- function(bl, v) {
  # Merge a list/vector v into a list-of-v's bl
  # The result is a new bl
  # To create a new, empty bl, use list()
  # To finalize the large list/vector, unlist the bl. Elements are guaranteed
  # to be in the same order as if they had been c'd together by this function

  i <- 1
  while (i <= length(bl) && !is.null(bl[[i]])) {
    v <- c(bl[[i]], v)
    bl[[i]] <- NULL
    i <- i + 1
  }

  bl[[i]] <- v
  return(bl)
}

# -----------------------------------------------------------------------------
#  Match function using binary search

bmatch <- function(needle, haystack, le = F, nomatch = NA) {
  # Searches for needles in the vector/list haystack, which must be sorted
  # Returns the indices or nomatch if it does not exist
  # If le is TRUE, then the indices of the highest values that are <=needle
  # are returned (which may be zero all of haystack is greater than needle)
  #
  # If there are ties in haystack, then the index of the last element of tie
  # is returned

  lo <- rep(1, length(needle))
  hi <- rep(length(haystack), length(needle))

  # maintain an active set of needles which we have not yet found
  active <- seq_along(lo)
  while (length(active) > 0) {
    # get the midpoints of each active needle
    mid <- floor((lo[active] + hi[active]) / 2)
    midv <- haystack[mid]

    # limit the range of each active needle based on whether it's higher
    # than the midpoint of its current range in the haystack
    gt <- midv > needle[active]
    lo[active[!gt]] <- mid[!gt] + 1
    hi[active[gt]] <- mid[gt] - 1

    # deactivate found needles
    active <- active[lo[active] <= hi[active]]
  }

  # hi contains the indices we are interested in - replace inexact matches
  # with nomatch
  if (!le) {
    found <- hi > 0
    hi[!found] <- nomatch
    hi[which(found)[haystack[hi[found]] != needle[found]]] <- nomatch
  }

  return(hi)
}

bmatch_sort <- function(x) {
  # Sorts x by its names so that it can be used in bmatch

  x[order(names(x))]
}

# -----------------------------------------------------------------------------
#  Plotting helpers

geom_boxplot_n <- function(offset = 0, ...) {
  # Put the number of observations over a boxplot

  # Adapted from:
  # http://stackoverflow.com/questions/28846348/add-number-of-observations-per-group-in-ggplot2-boxplot
  return(stat_summary(fun.data = function(x) {
    return(c(y = max(x) + offset, label = length(x)))
  }, geom = "text", position = position_dodge(width = 0.75), vjust = "bottom", ...))
}
omicsEye/omicsArt documentation built on Oct. 8, 2024, 5:46 p.m.