coulteR: Make AccuComp-datasets readable in R

Documented in bulk_peak_detect

################################
#
# Extract Peaks
#
################################

# purpose: Extract a target peak using an a priori diameter
# author: Lukas Muenter
# date: 29.07.2021


## analyses ===================

#' Get target peaks per group
#'
#' A wrapper around `coulteR::detect_peaks`. Per group in a dataframe with identifiers, peaks are reported.
#' @param df `dataframe` of coerced measurements-`modules` as produced by `coulteR::bulk_read`.
#' @param diameter The expected diameter of particles
#' @param full_df Should all peaks be reported? default = `FALSE`
#' @importFrom dplyr group_by
#' @importFrom dplyr group_split
#' @return A `dataframe`.
#' @export
bulk_peak_detect = function(df, diameter, full_df = FALSE){

  if(!is.data.frame(df)){

    stop("Error: argument `df` is no dataframe. Did you coerce your modules when using `coulter::bulk_read`?")

  }

  df %>%
    group_by(sample) %>%
    group_split() %>%
    lapply(peak_detect, diameter = diameter, full_df = full_df) %>%
    do.call("rbind", .)

}

#' Detect target peaks
#'
#' This function detects a specific target peak. Optionally, all peaks are reported.
#' @param df A measurements-`module` as produced by `coulteR::read_accucomp`
#' @param diameter The expected diameter of particles
#' @param full_df Should all peaks be reported? default = `FALSE`
#' @return A `dataframe` with the diameter of the peak, the diameter range, and the total number of cells in the range.
#' @importFrom dplyr left_join
#' @importFrom dplyr select
#' @importFrom dplyr filter
#' @importFrom dplyr pull
#' @importFrom dplyr rename
#' @importFrom pracma findpeaks
#' @importFrom Hmisc wtd.mean
#' @importFrom Hmisc wtd.var
#' @export
peak_detect = function(df, diameter, full_df = FALSE){

  peaks = df %>%
    na.omit() %>%
    pull(number.diff) %>%
    findpeaks() %>%
    as.data.frame() %>%
    setNames(c("number.diff", "bin", "bin.start", "bin.end")) %>%
    left_join(df)

  ## identify Target peak
  peaks$target = FALSE
  peaks$target[get_nearest_peak(peaks$diameter.bin.um, diameter)] = TRUE

  ## get size ranges of peaks
  peaks$d.range.start = df$diameter.bin.um[peaks$bin.start]
  peaks$d.range.end = df$diameter.bin.um[peaks$bin.end]

  ## get weighted mean of peaks
  #peaks.wtdmean = wtd.mean(df$diameter.bin.um[peaks$bin.start], weights = df$number.diff[peaks$bin.start])
  peaks$d.wtdmean = get_wtd_mean(df, peaks)

  ## get weighted sd of means
  peaks$d.wtdsd = get_wtd_sd(df, peaks)

  ## get number of cells in distribution
  peaks$n.cells = get_number_cells(df, starts = peaks$bin.start, ends = peaks$bin.end)

  if(full_df == FALSE){

    return(

      peaks %>%
        filter(target == TRUE) %>%
        select(sample, "d.peak" = diameter.bin.um, d.range.start, d.range.end, n.cells, d.wtdmean, d.wtdsd)

    )

  } else {

    return(

      peaks %>% rename("d.peak" = diameter.bin.um)

    )

  }

}

#' Obtain peak nearest to target
#'
#' @param x A numeric vector
#' @param y The target number
#' @return A `number`
get_nearest_peak = function(x, y){

  which(abs(x - y) == min(abs(x - y)))

}

#' Get number of cells in range
#' @param df A `measurements`-module
#' @param starts Indices of start bins
#' @param ends Indices of end bins
#' @return A `vector` of summed cells
get_number_cells = function(df, starts,ends){

  Map(":", starts, ends) %>%
    lapply(function(x,y) sum(df$number.diff[x]),y = df) %>%
    do.call("c", .)

}

#' Get weighted means
#' Computes weighted means for all detected peaks
#' @param df A `measurements`-module
#' @param peaks A `peaks` dataframe
#' @return A numeric `vector`
get_wtd_mean = function(df, peaks){

  lapply(c(1:nrow(peaks)), function(x,y,z)
    wtd.mean(y$diameter.bin.um[c(x$bin.start[z]:x$bin.end[z])],
             weights = y$number.diff[c(x$bin.start[z]:x$bin.end[z])]),
    x = peaks,
    y = df) %>% do.call("c", .)

}

#' Get weighted SDs
#' Computes weighted standard deviations for all detected peaks
#' @param df A `measurements`-module
#' @param peaks A `peaks` dataframe
#' @return A numeric `vector`
get_wtd_sd = function(df, peaks){

  lapply(c(1:nrow(peaks)), function(x,y,z)
    sqrt(
      wtd.var(y$diameter.bin.um[c(x$bin.start[z]:x$bin.end[z])],
              weights = y$number.diff[c(x$bin.start[z]:x$bin.end[z])])
    ),
    x = peaks,
    y = df) %>% do.call("c", .)

}

#' Plot target tracks
#' Plots tracks of selected samples. Note that file extensions and the character `#` are to be removed from sample names!
#' This function either plots selected samples, or selected indices, or selects and plots 9 randomly selected tracks.
#' @param df A `A dataframe` as generated by `coulteR::bulk_read`, with the parameter `module = "measurements"`
#' @param peaks A `dataframe` as generated by `coulteR::bulk_peak_detect`
#' @param indices Indices of plots to be plotted. Default: `NULL`
#' @param seed Seed for random selection. Default: `123`
#' @param N Number of plots to be randomly selected. Default: 9
#' @param ... Other variables passed on to geom_line
#' @return A `ggplot`-object
#' @import dplyr
#' @import ggplot2
#' @export
ggtracks = function(df, peaks = NULL, samples = NULL, seed = 123, N = 1, show.legend = TRUE){

  if(is.null(samples)){

    set.seed(seed)
    samples.idx = sample(c(1:nrow(peaks)), N, replace = FALSE)
    samples = peaks$sample[samples.idx]
  }

  a = df %>%
    filter(sample%in% samples)

  p = ggplot() +
    geom_line(data = a, aes(x = diameter.bin.um, y = number.diff, color = sample), show.legend = show.legend) +
    theme_bw() +
    facet_wrap(sample ~ .)

  if(is.null(peaks)) {

    p

  } else {

    ## filter the peak dataframe
    b = peaks %>%
      filter(sample %in% samples)

    p  +  geom_rect(data = b, aes(ymin=0, ymax=Inf, xmin=d.range.start, xmax=d.range.end), alpha = 0.1) +
      geom_vline(data = b, aes(xintercept = d.wtdmean)) +
      geom_vline(data = b, aes(xintercept = d.wtdmean + d.wtdsd), linetype = "dotted") +
      geom_vline(data = b, aes(xintercept = d.wtdmean - d.wtdsd), linetype = "dotted")

  }

}