R/diffsom.R

Defines functions Load ds_hclust ds_kmeans ds_consensus ds_mhca Embed ExportMap NiceMfrow NicePrettyColnames PlotScatter PlotDiff Plot PlotAnnotated PlotClusters CellFile PlotExprs Gate GetProbs SignificanceColors PlotSignificance PlotCorrelations ExportData

Documented in CellFile Embed ExportData ExportMap Gate GetProbs Load NiceMfrow NicePrettyColnames Plot PlotAnnotated PlotClusters PlotDiff PlotExprs PlotScatter PlotSignificance SignificanceColors

#' Load stuff into analysis object
#'
#' @examples
#' ds <- Load(c("1.fcs", "2.fcs"), transform = T, toTransform = c(7:14), cells = 100000)
#' @export
Load <- function(files, scale = T, transform = F, toTransform = NULL, compensate = F, cells = 1e5, ...) {
  fs <- FlowSOM::ReadInput(
    FlowSOM::AggregateFlowFrames(files, cTotal = cells),
    transform = transform,
    toTransform = toTransform,
    compensate = compensate,
    scale = scale,
    ...
  )

  # unscale fs$data[,'File'] (the rough way; FlowSOM does not export the scaling information anymore)
  fs$data[, "File"] <- as.numeric(factor(fs$data[, "File"]))

  list(fs = fs, files = files)
}

#' Hierarchical clustering for Embed()
#'
#' @export
ds_hclust <- function(codes, data, importance = NULL, mapping, k = 7) {
  # This code was proudly adapted from FlowSOM
  d <- stats::dist(codes, method = "euclidean")
  fit <- stats::hclust(d, method = "ward.D2")
  stats::cutree(fit, k = k)
  # end of FlowSOM copy
}

#' k-means clustering for Embed()
#'
#' @export
ds_kmeans <- function(codes, data, importance = NULL, mapping, k = 7) {
  kmeans(x = codes, centers = k)$cluster
}

#' consensus clustering for Embed()
#'
#' warning: the internally called ConsensusClusterPlus may break the state of
#' random number generator; use caution if the results should be reproducible.
#'
#' @export
ds_consensus <- function(codes, data, importance = NULL, mapping, k = 7) {
  seed <- sample(1000000000, 1) # force "normal" behavior
  # another FlowSOM copy begins
  results <- suppressMessages(ConsensusClusterPlus::ConsensusClusterPlus(
    t(codes),
    maxK = k, reps = 100, pItem = 0.9, pFeature = 1,
    title = tempdir(), plot = "pdf", verbose = FALSE,
    clusterAlg = "hc",
    distance = "euclidean",
    seed = seed
  ))
  results[[k]]$consensusClass
  # FlowSOM copy ends
}

#' Mahalanobis-linked-average clustering for Embed()
#'
#' @export
ds_mhca <- function(codes, data, importance = NULL, mapping, k = 7) {
  cl <- mhca::cutreeApriori(mhca::mhclust(x = data, g = mapping, quick = T, gIntra = F))
  stats::cutree(cl, k = k)
}

#' Embed the analysis object
#'
#' This is the computationally intensive part. You may want to provide
#' a clustering seed.
#'
#' @examples
#' ds <- Embed(ds, colsToUse = c(1, 4, 7:14), nclust = 5, importance = c(1, 1, 2, 1, 3, 1, 1, 1, 1, 1))
#' @export
Embed <- function(ds, importance = NULL, colsToUse = NULL,
                  xdim = 16, ydim = 16, map = NULL, rlen = 10,
                  distf = 2, nhbr.method = "manhattan",
                  nclust = NULL, clustf = NULL,
                  esmooth = NULL, ek = NULL, eadjust = NULL,
                  emcoords = "flat", emcoords.pow = 1) {
  if (is.null(colsToUse)) colsToUse <- colnames(ds$fs$data)
  r <- ds
  if (!is.null(colsToUse)) r$colsUsed <- colsToUse
  if (!is.null(importance)) r$importance <- importance

  if (!is.null(map)) {
    if (is.character(map)) {
      load(map)
      map <- DiffSOM_map$map
    }
    r$importance <- map$importance
    r$colsUsed <- map$colsUsed
    map$importance <- NULL
    map$colsUsed <- NULL
    r$map <- FlowSOM::SOM(r$fs$data[, r$colsUsed],
      xdim = map$xdim, ydim = map$ydim,
      codes = map$codes,
      importance = r$importance,
      rlen = 0, distf = distf, nhbr.method = nhbr.method
    )
  } else {
    r$map <- FlowSOM::SOM(r$fs$data[, r$colsUsed],
      xdim = xdim, ydim = ydim,
      importance = r$importance,
      rlen = rlen
    )
  }

  if (!is.null(emcoords)) { # sometimes this helps saving computation time
    r$e <- EmbedSOM::EmbedSOM(
      fsom = r$fs,
      data = r$fs$data[, r$colsUsed], map = r$map,
      importance = r$importance,
      smooth = esmooth, k = ek, adjust = eadjust,
      emcoords = emcoords, emcoords.pow = emcoords.pow
    )
  }

  if (!is.null(map) && is.null(clustf) && is.null(nclust)) {
    r$nclust <- map$nclust
    r$clust <- map$clust
  } else {
    if (is.null(nclust)) nclust <- 10
    if (is.null(clustf)) clustf <- ds_hclust
    r$clust <- clustf(
      codes = r$map$codes,
      data = r$fs$data[, colsToUse],
      importance = r$importance,
      mapping = r$map$mapping[, 1],
      k = nclust
    )
    r$nclust <- max(r$clust)
  }
  r
}

#' Export the FlowSOM map object from the analysis
#'
#' @param filename File name for saving the map (the same can be used for loading the map in Embed()).
#' @return the map object, if filename is not supplied
#'
#' @example ds <- Embed(ds, map=ExportMap(ds), ...) #re-embed without recomputing SOM
#'
#' @export
ExportMap <- function(ds, filename = NULL) {
  map <- ds$map
  map$importance <- ds$importance
  map$colsUsed <- ds$colsUsed
  map$mapping <- NULL
  map$nclust <- ds$nclust
  map$clust <- ds$clust

  if (!is.null(filename)) {
    DiffSOM_map <- new.env()
    DiffSOM_map$map <- map
    save(DiffSOM_map, file = filename)
  } else {
    map
  }
}

#' Guess a good square-ish mfrow for multiplots
#'
#' @export
NiceMfrow <- function(n) {
  y <- as.integer(sqrt(n))
  x <- as.integer((n + y - 1) / y)
  c(y, x)
}

#' Guess good marker names (ie. just take the first whitesep-word)
#'
#' @export
NicePrettyColnames <- function(fs, sel = T) {
  r <- unlist(lapply(strsplit(split = " ", fs$prettyColnames[sel]), function(x) {
    x[1]
  }))
  names(r) <- names(fs$prettyColnames[sel])
  r
}

#' Plot a scatter plot in markers 'x' and (optionally) 'y'.
#' @param alpha used as plot color
#' @param pch forwarded to plot
#' @param cex forwarded to plot
#'
#' @examples
#' PlotScatter(ds, 1, 4)
#' @export
PlotScatter <- function(ds,
                        fs = 1:(length(ds$files)),
                        x = NULL,
                        y = NULL,
                        alpha = 0.5,
                        ...) {
  fid <- CellFile(ds)

  par(mar = c(0, 0, 0, 0), mfrow = NiceMfrow(length(fs)))

  for (i in fs) {
    xa <- if (is.null(x)) {
      NULL
    } else {
      ds$fs$data[fid == i, x]
    }
    ya <- if (is.null(y)) {
      NULL
    } else {
      ds$fs$data[fid == i, y]
    }
    EmbedSOM::PlotEmbed(cbind(xa, ya), alpha = alpha, ...)
    mtext(ds$files[i], line = -2, cex = 1, col = "red")
  }
}

#' Plot aligned sample density plots. Params are forwarded to PlotEmbed.
#'
#' @examples
#' PlotDiff(ds)
#' @export
PlotDiff <- function(ds, ...) {
  fid <- CellFile(ds)
  fs <- 1:(length(ds$files))

  par(mar = c(0, 0, 0, 0), mfrow = NiceMfrow(length(fs)))

  for (i in fs) {
    EmbedSOM::PlotEmbed(ds$e[fid == i, ], data = ds$fs$data[fid == i, ], ...)
    mtext(ds$files[i], cex = 1, col = "#008000", line = -1.3, side = 1, adj = 0.01)
  }
}

#' Plot aggregate sample information
#'
#' This plots out aggregate density, all marker expression levels, and
#' clustering with cluster numbers. Extra params are forwarded to PlotEmbed.
#'
#' @param colNames column names to be used for plotting
#'
#' @examples
#' Plot(ds)
#' @export
Plot <- function(ds,
                 colNames = NULL,
                 alpha = 0.5,
                 mar = c(0, 0, 0, 0),
                 density = T,
                 clusters = T,
                 cols = ds$colsUsed,
                 mfrow = NiceMfrow(length(cols) + density + clusters),
                 ...) {
  if (!is.null(mfrow) && !is.null(mar)) par(mar = mar, mfrow = mfrow)

  if (density) EmbedSOM::PlotEmbed(ds$e, fsom = ds$fs, alpha = 1, ...)

  if (is.null(colNames)) colNames <- NicePrettyColnames(ds$fs)

  for (i in cols) {
    EmbedSOM::PlotEmbed(ds$e, fsom = ds$fs, value = i, alpha = alpha, ...)
    mtext(colNames[i], cex = 2, line = -1.3, side = 1, adj = 0.01)
  }

  if (clusters) PlotAnnotated(ds, pos = 1, size.text = 2, alpha = alpha, ...)
}


#' Plot annotated version of the plot with clusters
#'
#' This plots out aggregate density, all marker expression levels, and
#' clustering with cluster numbers.
#'
#' @param clusterMap mapping of metaclusters to a new clustering used to join some of them
#' @param annotation vector of strings with annotation
#' @param pos position of the annotation
#'
#' @examples
#' PlotAnnotated(ds)
#' @export
PlotAnnotated <- function(ds,
                          clusterMap = c(1:ds$nclust),
                          annotation = c(1:nMapClust),
                          pos = NULL,
                          size.point = 1,
                          size.text = 1,
                          col.text = "black",
                          cluster.colors = EmbedSOM::ClusterPalette,
                          ...) {
  nMapClust <- max(clusterMap)

  EmbedSOM::PlotEmbed(ds$e,
    fsom = ds$fs,
    clust = clusterMap[ds$clust[ds$map$mapping[, 1]]],
    cluster.colors = cluster.colors,
    ...
  )

  for (i in c(1:(nMapClust))) {
    ex <- ds$e[clusterMap[ds$clust[ds$map$mapping[, 1]]] == i, ]
    loc <- c(median(ex[, 1]), median(ex[, 2]))
    if (!is.null(pos) || size.text == 0) {
      points(loc[1], loc[2], pch = 20, cex = 3 * size.point, col = "white")
      points(loc[1], loc[2], pch = 20, cex = 2 * size.point, col = cluster.colors(nMapClust)[i])
    }
    if (size.text > 0) {
      text(loc[1], loc[2], annotation[i], pos = pos, cex = size.text, col = col.text)
    }
  }
}

#' Plot separated clusters
#'
#' @param alpha alpha for the expression plots
#'
#' @export
PlotClusters <- function(ds, alpha = 1, bgcol = rgb(.75, .75, .75, alpha / 3), ...) {
  fgcols <- EmbedSOM::ClusterPalette(ds$nclust, alpha = alpha)
  par(mar = c(0, 0, 0, 0), mfrow = NiceMfrow(ds$nclust))
  for (i in 1:ds$nclust) {
    col <- rep(bgcol, dim(ds$e)[1])
    col[ds$clust[ds$map$mapping[, 1]] == i] <- fgcols[i]
    EmbedSOM::PlotEmbed(ds$e, fsom = ds$fs, col = col, ...)
    mtext(paste0("Cluster ", i), cex = 2, line = -1.3, side = 1, adj = 0.01)
  }
}

#' Get the number of input file for each cell
#'
#' @export
CellFile <- function(ds) {
  ds$fs$data[, "File"]
}

#' Plot marker expressions for histogram-style comparison of multiple samples/clusters.
#'
#' @param cols Which columns to plot
#' @param names Which names to use for the columns
#' @param files T/F -- plot separate histograms for files
#' @param clusters T/F -- plot separate histograms for clusters
#' @param ref Reference-distribution DiffSOM object
#' @param rdist Scatter distribution generator
#' @param pch,cex,alpha Used for plotting as usual
#'
#' @examples
#' PlotExprs(ds2, ref = ds, clusters = T)
#' @export
PlotExprs <- function(ds,
                      cols = ds$colsUsed,
                      names = NicePrettyColnames(ds$fs, sel = cols),
                      files = F, clusters = F,
                      ref = NULL,
                      alpha = 0.1,
                      rdist = function(n) (0.1 * rnorm(n)),
                      ...) {
  if (is.null(cols)) cols <- colnames(ds$fs$data)

  nplots <- length(cols)

  get.clust <- function(ds) {
    ds$clust[ds$map$mapping[, 1]]
  }

  mrref <- 1
  nrref <- 1
  rref <- rep(0, length(ds$fs$data[, 1]))
  if (!is.null(ref)) {
    rref <- c(rref, rep(1, length(ref$fs$data[, 1])))
  } else if (clusters || files) {
    rref <- c(rref, rep(1, length(rref)))
  }
  col <- rep(rgb(0, 0, 0, alpha), length(rref))

  file <- 0
  nfiles <- 1
  if (files) {
    nfiles <- length(ds$files)
    file <- CellFile(ds)
    if (!is.null(ref)) {
      file <- c(file, CellFile(ref))
      nrref <- 2
    } else {
      if (clusters) {
        file <- c(file, file)
      } else {
        file <- c(file, rep(nfiles + 1, length(file)))
      }
      mrref <- 0
    }

    col <- EmbedSOM::ClusterPalette(nfiles, alpha = alpha, vcycle = .7, scycle = .7)[file]
    file <- file - 1
  }

  clust <- 0
  nclust <- 1
  if (clusters) {
    clust <- get.clust(ds)
    if (!is.null(ref)) {
      clust <- c(clust, rep(ds$nclust + 1, length(ref$fs$data[, 1])))
      mrref <- 0
    } else {
      clust <- c(clust, rep(ds$nclust + 1, length(clust)))
      mrref <- 0
    }

    c2 <- clust
    c2[c2 > ds$nclust] <- NA
    col <- EmbedSOM::ClusterPalette(ds$nclust, alpha = alpha)[c2]
    clust <- clust - 1
    nclust <- ds$nclust
  }

  col[rref > 0] <- rgb(.5, .5, .5, alpha)

  par(mar = c(0, 0, 2, 0), mfrow = NiceMfrow(length(cols)))
  for (i in 1:length(cols)) {
    cc <- cols[i]
    d <- ds$fs$data[, cc]
    if (!is.null(ref)) {
      d <- c(d, ref$fs$data[, cc])
    }
    else if (clusters || files) {
      d <- c(d, d)
    }
    EmbedSOM::PlotEmbed(
      cbind(d, mrref * rref + nrref * (file + nfiles * clust) + rdist(length(d))),
      col = col, ...
    )
    mtext(names[i], outer = F)
  }
}

#' Apply gating and output a new diff-som object (to be embedded)
#'
#' @param clusters The cluster numbers to be selected
#' @param negative Invert the selection
#'
#' @examples
#' ds2 <- Gate(ds, clusters = c(5), negative = T) # removes cluster 5
#' @export
Gate <- function(ds, clusters = NULL, negative = F) {
  flt <- ds$clust[ds$map$mapping[, 1]] %in% clusters
  if (negative) flt <- !flt
  r <- list(fs = ds$fs, files = ds$files)
  r$fs$data <- r$fs$data[flt, ]
  r
}

#' Get a matrix with relative population abundances in clusters (cols) vs files (rows)
#'
#' Useful for exporting the stats and working with them elsewhere.
#'
#' @export
GetProbs <- function(ds, meta = T) {
  pad.zero <- function(v, l) {
    if (length(v) == l) {
      v
    } else if (length(v) > l) {
      v[1:l]
    } else {
      c(v, rep(0, times = l - length(v)))
    }
  }

  fid <- CellFile(ds)
  nf <- length(ds$files)
  ncl <- if (meta) {
    ds$nclust
  } else {
    ds$map$xdim * ds$map$ydim
  }
  cl <- if (meta) {
    ds$clust[ds$map$mapping[, 1]]
  }
  else {
    ds$map$mapping[, 1]
  }

  probs <- matrix(0, ncol = ncl, nrow = nf)

  for (i in (1:nf)) probs[i, ] <- pad.zero(tabulate(cl[fid == i]), ncl) / sum(fid == i)

  list(probs = probs, cl = cl)
}


#' Produce a list of colors for plotting significance with each cell.
#'
#' @param control Numbers of files with the base-line samples
#' @param experiment Numbers of files that are expected to differ
#' @param meta Use metacluster. Set to F for lower granularity
#' @param pow Adjustment to p-vals before plotting. Use a lower value for seeing even insignificant stuff.
#' @param alpha default alpha
#' @param col.inconclusive color for inconclusive outcome
#' @param col.less color for the 'less' test significance
#' @param col.greater the other color
#'
#' @export
SignificanceColors <- function(ds, control, experiment,
                               meta = T, paired = F, pow = 10, alpha = .5,
                               probs = GetProbs(ds, meta),
                               col.inconclusive = rgb(.75, .75, .75, alpha / 2),
                               col.greater = rgb(1, .5, 0, alpha),
                               col.less = rgb(0, .5, 1, alpha)) {
  cl <- probs$cl
  probs <- probs$probs

  ncl <- ncol(probs)
  clusters <- 1:ncl
  p_less <- 0 * clusters
  p_greater <- 0 * clusters
  for (i in clusters) {
    p_less[i] <- (1 - wilcox.test(
      probs[experiment, i],
      probs[control, i],
      alternative = "less",
      paired = paired
    )$p.value)^pow
    p_greater[i] <- (1 - wilcox.test(
      probs[experiment, i],
      probs[control, i],
      alternative = "greater",
      paired = paired
    )$p.value)^pow
  }

  colv.greater <- col2rgb(col.greater, alpha = T)
  colv.less <- col2rgb(col.less, alpha = T)
  colv.inconclusive <- col2rgb(col.inconclusive, alpha = T)
  colv <- outer(p_greater, colv.greater) +
    outer(p_less, colv.less) +
    outer(1 - p_greater - p_less, colv.inconclusive)
  colv <- colv[, , 1] / 255 # TODO explain the 3rd dimension... :]
  colv[colv > 1] <- 1 # fixup rounding errors that sometimes kill rgb()
  colv[colv < 0] <- 0
  rgb(colv[cl, 1], colv[cl, 2], colv[cl, 3], colv[cl, 4])
}

#' Produce the plot with change significances
#'
#' Extra parameters are forwarded to SignificanceColors function.
#' @param pch,cex,plotf forwarded to PlotEmbed
#' @param filter T/F remove the cells that do not belong in any tested group
#'
#' @examples
#' PlotSignificance(ds, control = c(1, 2, 3), experiment = c(4, 5, 6))
#' @export
PlotSignificance <- function(ds,
                             control,
                             experiment,
                             pch = ".",
                             cex = 1,
                             plotf = EmbedSOM::PlotDefault,
                             filter = F, ...) {
  mask <- T
  if (filter) {
    mask <- CellFile(ds) %in% c(control, experiment)
  }
  EmbedSOM::PlotEmbed(ds$e[mask, ],
    col = SignificanceColors(ds, control, experiment, ...)[mask],
    cex = cex, pch = pch, plotf = plotf
  )
}

#' Plot correlations between cluster populations in samples
#'
#' @param meta use metaclusters (default TRUE)
#' @param samples limit the choice of samples (default: all)
#' @param clusterMap a map for grouping clusters (default: identity)
#' @param annotation names of clusters
#' @param filter remove cells from samples that are not present (default FALSE)
#'
#' @export
PlotCorrelations <- function(ds,
                             samples = 1:length(ds$files),
                             meta = T,
                             mfrow = if (meta) {
                               (NiceMfrow(nMapClust))
                             } else {
                               c(ds$map$ydim, ds$map$xdim)
                             },
                             clusterMap = if (meta) {
                               (ds$clust)
                             } else {
                               (1:(ds$map$xdim * ds$map$ydim))
                             },
                             nMapClust = max(clusterMap),
                             plotOrder = if (meta) {
                               (
                                 1:nMapClust
                               )
                             } else {
                               (
                                 as.vector(matrix(
                                   ncol = ds$map$ydim,
                                   1:(ds$map$xdim * ds$map$ydim)
                                 )
                                 [, ds$map$ydim:1])
                               )
                             },
                             annotation = if (meta) paste("Cluster", c(1:nMapClust)) else c(),
                             filter = F, alpha = .5, ...) {
  mask <- T
  if (filter) {
    mask <- CellFile(ds) %in% samples
  }

  probs <- GetProbs(ds, meta = F)$probs[samples, ]
  probs <- sapply(1:nMapClust, function(x) {
    apply(matrix(nrow = dim(probs)[1], probs[, clusterMap == x]), 1, sum)
  })

  par(mar = c(0, 0, 0, 0), mfrow = NiceMfrow(nMapClust))
  for (i in plotOrder) {
    cors <- as.vector(cor(probs[, i], probs))
    EmbedSOM::PlotEmbed(
      ds$e[mask, ],
      col = EmbedSOM::ExpressionPalette(101, alpha = alpha)
      [1 + 50 * (1 + cors[clusterMap[ds$map$mapping[mask, 1]]])],
      ...
    )
    mtext(annotation[i], cex = 2, line = -1.3, side = 1, adj = 0.01)
  }
}

#' Export a data frame with stuff from the DiffSOM object
#'
#' @param colNames column names to use instead of NicePrettyColnames
#'
#' @export
ExportData <- function(ds, colNames = NULL) {
  df <- data.frame(ds$fs$data)
  df <- df[, 1:(length(df) - 2)] # strip off the file+file scatter cols from AggregateFlowFrames

  if (is.null(colNames)) colNames <- NicePrettyColnames(ds$fs)[1:length(df)]
  colnames(df) <- colNames

  df <- data.frame(df, File = CellFile(ds))

  if (is.null(ds$e)) {
    df
  } else {
    data.frame(
      df,
      EmbedSOM1 = ds$e[, 1],
      EmbedSOM2 = ds$e[, 2],
      Precluster = ds$map$mapping[, 1],
      Precluster1 = 1 + ((ds$map$mapping[, 1] - 1) %% ds$map$xdim),
      Precluster2 = 1 + ((ds$map$mapping[, 1] - 1) %/% ds$map$xdim),
      Metacluster = ds$clust[ds$map$mapping[, 1]]
    )
  }
}
exaexa/DiffSOM documentation built on Aug. 22, 2019, 2:46 p.m.