R/PeakRegions.R

Defines functions create_frequency_regions frequency_points_to_frequency_regions mz_points_to_frequency_regions

Documented in create_frequency_regions frequency_points_to_frequency_regions mz_points_to_frequency_regions

#' mz points to frequency regions
#'
#' Given M/Z point data in a data.frame, create IRanges based point "regions" of
#' width 1, using the `frequency_multiplier` argument to convert from the floating
#' point double to an `integer`.
#'
#' @param mz_data_list a list of `data.frame` containing `mz`
#' @param frequency_fit_description the description for frequency ~ mz
#' @param mz_fit_description the description for mz ~ frequency
#' @param frequency_multiplier a value used to convert to integers.
#'
#' @importFrom IRanges IRanges
#' @importFrom S4Vectors mcols
#' @export
mz_points_to_frequency_regions <- function(mz_data_list, frequency_fit_description = c(0, -1/2, -1/3),
                                           mz_fit_description = c(0, -1, -2, -3), frequency_multiplier = 400){
  log_message("Converting scans to frequency")
  frequency_list = mz_scans_to_frequency(mz_data_list, frequency_fit_description, mz_fit_description)
  log_message("Done converting to frequency")

  # this is a check to make sure we will be able to convert
  # to multiply and still get integers out
  log_message("Getting / checking ranges")
  all_range = internal_map$map_function(frequency_list$frequency, ~ range(.x$frequency, na.rm = TRUE))
  range_freq = range(unlist(all_range))

  if (any(is.na(range_freq))) {
    stop("NA entries in conversion from M/Z to frequency! Stopping!")
  }

  init_conversion = suppressWarnings(as.integer(range_freq * frequency_multiplier))
  na_conversion = any(is.na(init_conversion))
  reduce_value = 1
  if (na_conversion) {
    while (na_conversion && (reduce_value >= 0.29)) {
      reduce_value = reduce_value - 0.05
      new_multiplier = (frequency_multiplier * reduce_value)
      #message(new_multiplier)
      new_conversion = suppressWarnings(as.integer(range_freq * new_multiplier))
      na_conversion = any(is.na(new_conversion))
    }

    if (!na_conversion && (reduce_value >= 0.29)) {
      log_message("Frequency point multiplier modified to avoid NA values ...")
      frequency_multiplier = new_multiplier
    } else {
      stop("No good point multiplier found, stopping.")
    }
  }
  log_message("Done getting / checking ranges")

  log_message("Creating point regions")
  frequency_regions = internal_map$map_function(frequency_list$frequency, frequency_points_to_frequency_regions, multiplier = frequency_multiplier)
  log_message("Done creating point regions")
  log_memory()
  frequency_list$frequency_multiplier = frequency_multiplier
  return(list(frequency = frequency_regions,
       metadata = frequency_list[seq(2, length(frequency_list))]))
}

#' frequency points to frequency_regions
#'
#' Given a set of frequency points in a data.frame, create IRanges based point "regions"
#' of width 1, useing the `multiplier` to convert from a floating point double
#' to an `integer`
#'
#' @param frequency_data a `data.frame`
#' @param frequency_variable which column is the `frequency` stored in
#' @param multiplier value used to convert to integers
#'
#' @export
frequency_points_to_frequency_regions = function(frequency_data, frequency_variable = "frequency", multiplier = 400){
  frequency_regions <- IRanges::IRanges(start = round(frequency_data[, frequency_variable] * multiplier), width = 1)
  if (is.null(frequency_data$point)) {
    frequency_data$point <- seq(1, nrow(frequency_data))
  }
  S4Vectors::mcols(frequency_regions) <- frequency_data
  frequency_regions@metadata = list(multiplier = multiplier)
  frequency_regions
}

#' create frequency regions
#'
#' Given a point-point spacing and a frequency range, create IRanges based regions
#' of specified width. Overlapping sliding regions can be creating by specifying
#' a `region_size` bigger than `delta`, adjacent tiled regions can be created
#' by specifying a `region_size` == `delta`.
#'
#' @param point_spacing how far away are subsequent points.
#' @param frequency_range the range of frequency to use
#' @param n_point how many points you want to cover
#' @param delta_point the *step* size between the beginning of each subsequent region
#' @param multiplier multiplier to convert from frequency to integer space
#'
#' @details For Fourier-transform mass spec, points are equally spaced in
#'   frequency space, which will lead to unequal spacing in M/Z space. Therefore,
#'   we create regions using the point-point differences in frequency space.
#'
#'   What will be returned is an `IRanges` object, where the widths are constantly
#'   increasing over M/Z space.
#'
#' @export
#' @return IRanges
create_frequency_regions <- function(point_spacing = 0.5, frequency_range = NULL,
                              n_point = 10, delta_point = 1,
                              multiplier = 500){
  if (is.null(frequency_range)) {
    stop("A valid range of frequencies must be supplied!")
  }

  region_size = n_point * point_spacing
  step_size = point_spacing * delta_point
  use_frequencies = round(frequency_range)

  frequency_start = round(seq(use_frequencies[1], use_frequencies[2] - region_size, by = step_size) * multiplier)
  frequency_end = round(seq(use_frequencies[1] + region_size, use_frequencies[2], by = step_size) * multiplier)

  regions <- IRanges::IRanges(start = frequency_start, end = frequency_end)

  #S4Vectors::mcols(regions) <- list(mz_start = mz_start, mz_end = mz_end)
  regions@metadata <- list(multiplier = multiplier,
                           point_spacing = point_spacing,
                           n_point = n_point,
                           delta_point = delta_point)
  regions

}

#' @export
PeakRegions <- R6::R6Class("PeakRegions",
  public = list(
    frequency_point_regions = NULL,
    frequency_fit_description = NULL,
    mz_fit_description = NULL,

    peak_regions = NULL,
    sliding_regions = NULL,
    tiled_regions = NULL,

    peak_region_list = NULL,

    frequency_multiplier = NULL,
    scan_peaks = NULL,
    peak_data = NULL,
    scan_level_arrays = NULL,
    is_normalized = FALSE,
    normalization_factors = NULL,

    n_scan = NULL,
    scans_per_peak = NULL,
    scan_perc = NULL,
    min_scan = NULL,
    max_subsets = NULL,
    scan_subsets = NULL,

    frequency_range = NULL,

    scan_correlation = NULL,
    keep_peaks = NULL,
    peak_index = NULL,

    scan_indices = NULL,
    instrument = NULL,

    set_min_scan = function(){
      if (!is.null(self$frequency_point_regions)) {
        self$n_scan <- length(unique(self$frequency_point_regions$frequency))
        self$min_scan <- min(c(round(self$scan_perc * self$n_scan), 25))
        if (self$min_scan <= 3) {
          self$min_scan <- 3
        }
      } else {
        warning("No frequency points to pull scan data from!")
      }

      invisible(self)
    },

    add_data = function(raw_ms){
      if (!is.null(raw_ms)) {
        if (inherits(raw_ms, "RawMS")) {
          raw_mz_data = raw_ms$extract_raw_data()
          self$instrument = raw_ms$get_instrument()
        } else if (inherits(raw_ms, "list")) {
          raw_mz_data = raw_ms
        }
        self$frequency_point_regions = mz_points_to_frequency_regions(mz_data = raw_mz_data,
                                                                      frequency_fit_description = self$frequency_fit_description,
                                                                      mz_fit_description = self$mz_fit_description,
                                                                      frequency_multiplier = self$frequency_multiplier)
        self$frequency_multiplier = self$frequency_point_regions$metadata$frequency_multiplier
        self$frequency_range = purrr::map(self$frequency_point_regions$frequency, ~ range(S4Vectors::mcols(.x)$frequency)) %>% do.call(c, .) %>% range(.)
        self$set_min_scan()

      }
      invisible(self)
    },

    initialize = function(raw_ms = NULL,
                          frequency_fit_description = c(0, -1/2, -1/3),
                          mz_fit_description = c(0, -1, -2, -3),
                          frequency_multiplier = 400,
                          scan_perc = 0.1, max_subsets = 100){
      #browser(expr = TRUE)
      self$frequency_multiplier <- frequency_multiplier
      self$frequency_fit_description = frequency_fit_description
      self$mz_fit_description = mz_fit_description
      self$scan_perc <- scan_perc
      self$max_subsets <- max_subsets

      self$add_data(raw_ms)

      invisible(self)
    }
  )
)


#' @export
PeakRegionFinder <- R6::R6Class("PeakRegionFinder",
  public = list(
    run_time = NULL,
    start_time = NULL,
    stop_time = NULL,
    peak_regions = NULL,

    sliding_region_size = NULL,
    sliding_region_delta = NULL,
    quantile_multiplier = NULL,
    n_point_region = NULL,

    tiled_region_size = NULL,
    tiled_region_delta = NULL,

    region_percentage = NULL,

    peak_method = NULL,
    min_points = NULL,
    sample_id = NULL,

    zero_normalization = NULL,

    progress = NULL,

    add_regions = function(){
      log_message("Addling sliding and tiled regions ...")
      sliding_regions <- function(self){
        create_frequency_regions(frequency_range = self$peak_regions$frequency_range, n_point = self$sliding_region_size,
                                           delta_point = self$sliding_region_delta,
                                           multiplier = self$peak_regions$frequency_multiplier)
      }
      tiled_regions <- function(self){
        create_frequency_regions(frequency_range = self$peak_regions$frequency_range, n_point = self$tiled_region_size,
                          delta_point = self$tiled_region_delta,
                          multiplier = self$peak_regions$frequency_multiplier)
      }
      run_regions <- list(sliding = sliding_regions,
                          tiled = tiled_regions)
      new_regions <- internal_map$map_function(names(run_regions), function(region){
        run_regions[[region]](self)
      })
      names(new_regions) <- names(run_regions)
      self$peak_regions$sliding_regions <- new_regions[["sliding"]]
      self$peak_regions$tiled_regions <- new_regions[["tiled"]]
      invisible(self)
    },



    reduce_sliding_regions = function(){
     log_message("Finding initial signal regions ...")
      self$peak_regions$peak_regions <- find_signal_regions(self$peak_regions$sliding_regions, self$peak_regions$frequency_point_regions$frequency, self$quantile_multiplier, self$n_point_region)
      log_memory()
    },

    split_peak_regions = function(use_regions = NULL){
      log_message("Splitting signal regions by peaks ...")
      if (is.null(use_regions)) {
        use_regions <- seq_len(length(self$peak_regions$peak_regions))
      }
      peak_data <- split_regions(self$peak_regions$peak_regions[use_regions], self$peak_regions$frequency_point_regions, self$peak_regions$tiled_regions, self$peak_regions$min_scan, peak_method = self$peak_method, min_points = self$min_points)

      self$peak_regions$peak_region_list = peak_data

      self$peak_regions$peak_index <- seq_len(length(peak_data))
      #self$peak_regions$peak_regions <- subset_signal_regions(self$)
    },

    remove_double_peaks_in_scans = function(){
      peak_region_list = self$peak_regions$peak_region_list

      scan_peaks <-  internal_map$map_function(peak_region_list, function(in_list){
        tmp_peaks = in_list$peaks
        dup_scans <- tmp_peaks[, "scan"][duplicated(tmp_peaks[, "scan"])]
        tmp_peaks = tmp_peaks[!(tmp_peaks[, "scan"] %in% dup_scans), ]
        tmp_points = in_list$points
        tmp_points = tmp_points[as.character(tmp_peaks$scan)]
        in_list$points = tmp_points
        in_list$peaks = tmp_peaks
        in_list
      })

      n_remain <- purrr::map_int(scan_peaks, ~ nrow(.x$peaks))
      keep_remain <- n_remain > 0
      scan_peaks <- scan_peaks[keep_remain]
      self$peak_regions$peak_region_list <- scan_peaks

      self$peak_regions$scan_correlation <- self$peak_regions$scan_correlation[keep_remain, ]
      self$peak_regions$peak_index <- self$peak_regions$peak_index[keep_remain]

      if (!is.null(self$peak_regions$scans_per_peak)) {
        self$peak_regions$scans_per_peak <- self$peak_regions$scans_per_peak[keep_remain]
      }
      invisible(self)
    },

    normalize_data = function(which_data = "both"){
      log_message("Normalizing scans ...")

      if (!self$zero_normalization) {
        self$peak_regions <- two_pass_normalization(self$peak_regions, summary_function = median,
                                                    normalize_peaks = which_data)
      } else {
        self$peak_regions = zero_normalization(self$peak_regions, summary_function = median,
                                               normalize_peaks = which_data)
      }

    },

    find_peaks_in_regions = function(which_data = "raw"){
      log_message("Finding peaks in regions ...")
      self$peak_regions <- characterize_peaks(self$peak_regions)
    },

    model_mzsd = function(){
      self$peak_regions$peak_data$Log10ObservedMZSDModel <- mz_sd_model(self$peak_regions$peak_data)
      invisible(self)
    },

    model_heightsd = function(){
      self$peak_regions$peak_data$Log10HeightSDModel <-
        int_sd_model(self$peak_regions$peak_data)
      invisible(self)
    },

    indicate_high_frequency_sd = function() {
      peak_region = self$peak_regions

      sd_cutoff = max(boxplot.stats(peak_region$peak_data$ObservedFrequencySD)$stats)

      high_peaks = peak_region$peak_data$ObservedFrequencySD > sd_cutoff

      self$peak_regions$peak_data$HighSD = high_peaks
      invisible(self)
    },

    add_data = function(raw_ms) {
      if (inherits(raw_ms, "RawMS")) {
        self$peak_regions$add_data(raw_ms)
      }
      invisible(self)
    },

    summarize_peaks = function(){
      list(TIC = sum(self$peak_regions$peak_data$Height),
           Sample = self$sample_id,
           Peaks = self$peak_regions$peak_data,
           ScanLevel = self$peak_regions$scan_level_arrays)
    },

    add_offset = function(){
      peak_data = self$peak_regions$peak_data
      frequency_offset = mean(unlist(self$peak_regions$frequency_point_regions$metadata$difference_range))
      mz_coefficients = self$peak_regions$frequency_point_regions$metadata$mz_coefficients
      mz_description = self$peak_regions$frequency_point_regions$metadata$mz_fit_description
      mz_1 = predict_exponentials(peak_data$ObservedFrequency, mz_coefficients, mz_description)
      mz_2 = predict_exponentials(peak_data$ObservedFrequency + frequency_offset, mz_coefficients, mz_description)
      peak_data$Offset = (mz_1 - mz_2) * self$offset_multiplier
      self$peak_regions$peak_data = peak_data
      invisible(self)
    },

    sort_ascending_mz = function(){
      self$peak_regions$peak_data = self$peak_regions$peak_data[order(self$peak_regions$peak_data$ObservedMZ), ]
      invisible(self)
    },

    characterize_peaks = function(){
      self$add_regions()
      self$reduce_sliding_regions()
      self$split_peak_regions()
      self$remove_double_peaks_in_scans()
      self$normalize_data()
      self$find_peaks_in_regions()
      self$indicate_high_frequency_sd()
      self$add_offset()
      self$sort_ascending_mz()
      if (nrow(self$peak_regions$peak_data) == 0) {
        stop("No peaks meeting criteria!")
      }
      #self$add_offsets()
      self$model_mzsd()
      #self$model_heightsd()
    },

    summarize = function(package_used = "package:FTMS.peakCharacterization"){
      self$stop_time <- Sys.time()
      self$run_time <- as.numeric(difftime(self$stop_time, self$start_time, units = "s"))
      # generate information about our objects
      pkg_description <- utils::packageDescription(substring(package_used, 9))

      if (!is.null(pkg_description$RemoteSha)) {
        pkg_sha <- pkg_description$RemoteSha
      } else {
        pkg_sha <- NA
      }

      p_finder <- as.list(self)
      p_finder[[".__enclos_env__"]] <- NULL
      p_finder$clone <- NULL

      p_regions <- as.list(self$peak_regions)
      p_finder$peak_regions <- NULL

      p_regions[[".__enclos_env__"]] <- NULL
      p_regions$clone <- NULL
      p_regions$frequency_point_regions <- NULL
      #p_regions$mz_model <- NULL
      p_regions$sliding_regions <- NULL
      p_regions$tiled_regions <- NULL
      p_regions$scan_peaks <- NULL
      p_regions$peak_data <- NULL
      p_regions$peak_regions <- NULL
      p_regions$scan_level_arrays <- NULL
      p_regions$keep_peaks <- NULL
      p_regions$normalization_factors <- NULL
      p_regions$peak_index <- NULL
      p_regions$scan_correlation <- NULL
      p_regions$peak_region_list <- NULL

      processing_info <- list(Package = package_used,
                              Version = pkg_description$Version,
                              Sha = pkg_sha,
                              RunTime = self$run_time,
                              PeakFinder = p_finder,
                              PeakRegions = p_regions
      )
      # and then about the peaks we had
      list(processing_metadata.json = processing_info,
           peak_list.json = self$summarize_peaks())

    },

    peak_meta = function(){
      mz_point_data <- self$peak_regions$frequency_point_regions$frequency
      mz_point_data <- mz_point_data[names(mz_point_data) %in% as.character(self$peak_regions$normalization_factors$scan)]

      tmp_ms_info = purrr::map_df(mz_point_data, function(in_scan){
        tmp_data = S4Vectors::mcols(in_scan)
        data.frame(scan = tmp_data[1, "scan"],
                   tic = sum(tmp_data[, "intensity"]),
                   raw_tic = sum(tmp_data[, "RawIntensity"]))
      })

      list(run_time_info = list(
              run_time = as.numeric(difftime(self$stop_time, self$start_time, units = "s")),
              n_peak_regions = length(self$peak_regions$peak_region_list),
              n_scans = length(self$peak_regions$normalization_factors$scan)
              ),
           ms_info = tmp_ms_info,
           frequency_mz = self$peak_regions$frequency_point_regions$metadata,
           peaks = list(n_peaks = nrow(self$peak_regions$peak_data),
                        mz_range = range(self$peak_regions$peak_data$ObservedMZ, na.rm = TRUE)),
           other_info = list(
             n_scans = length(self$peak_regions$normalization_factors$scan),
             min_scan = self$peak_regions$min_scan,
             n_peaks = nrow(self$peak_regions$peak_data),
             intensity_range = range(self$peak_regions$peak_data$Height),
             dynamic_range = max(self$peak_regions$peak_data$Height) / min(self$peak_regions$peak_data$Height),
             median_intensity = median(self$peak_regions$peak_data$Height),
             indicated_standards = !is.null(self$peak_regions$peak_data$StandardContaminant),
             instrument = self$peak_regions$instrument
           ))
    },

    initialize = function(raw_ms = NULL,
                          sliding_region_size = 10,
                          sliding_region_delta = 1,
                          tiled_region_size = 1,
                          tiled_region_delta = 1,
                          region_percentage = 0.99,
                          offset_multiplier = 1,
                          frequency_multiplier = 400,
                          quantile_multiplier = 1.5,
                          n_point_region = 2000,
                          peak_method = "lm_weighted",
                          min_points = 4,
                          zero_normalization = FALSE,
                          frequency_fit_description = c(0, -1/2, -1/3),
                          mz_fit_description = c(0, -1, -2, -3)){
      if (inherits(raw_ms, "RawMS")) {
        self$peak_regions <- PeakRegions$new(raw_ms = raw_ms$extract_raw_data(), frequency_fit_description = frequency_fit_description,
                                             mz_fit_description = mz_fit_description, frequency_multiplier = frequency_multiplier)
      } else if (inherits(raw_ms, "PeakRegions")) {
        self$peak_regions <- raw_ms
      } else {
        self$peak_regions <- PeakRegions$new(raw_ms = NULL, frequency_fit_description = frequency_fit_description,
                                             mz_fit_description = mz_fit_description, frequency_multiplier = frequency_multiplier)
      }

      self$sliding_region_size <- sliding_region_size
      self$sliding_region_delta <- sliding_region_delta
      self$tiled_region_size <- tiled_region_size
      self$tiled_region_delta <- tiled_region_delta
      self$region_percentage <- region_percentage
      self$quantile_multiplier = quantile_multiplier
      self$n_point_region = n_point_region

      self$peak_method = peak_method
      self$min_points = min_points
      self$zero_normalization = zero_normalization
      self$offset_multiplier = offset_multiplier

      invisible(self)
    }

  ),
  lock_objects = FALSE,
  lock_class = FALSE
)

count_overlaps <- function(regions, point_regions){
  zero_intensities <- point_regions@elementMetadata$intensity == 0

  nonzero_counts <- IRanges::countOverlaps(regions, point_regions[!zero_intensities])
  nonzero_counts
}

#' find signal
#'
#' Given some regions and point_regions, find the regions that actually should
#' contain **real** data. See *details* for an explanation of what is considered
#' **real**.
#'
#' @param regions the regions we want to query
#' @param point_regions_list the individual points
#' @param multiplier how much above base quantiles to use (default = 1.5)
#'
#' @export
#' @return IRanges
find_signal_regions <- function(regions, point_regions_list, multiplier = 1.5, n_point_region = 2000){
  nz_counts = count_overlaps(regions, point_regions_list[[1]])
  n_region = seq(2, length(point_regions_list))
  for (iregion in n_region) {
    nz_counts_iregion = count_overlaps(regions, point_regions_list[[iregion]])
    nz_counts = nz_counts + nz_counts_iregion
  }

  chunk_indices = seq(1, length(nz_counts), by = n_point_region)

  chunk_perc = purrr::map_dbl(chunk_indices, function(in_index){
    use_counts = nz_counts[seq(in_index, min(in_index + (n_point_region - 1), length(nz_counts)))]
    if (max(use_counts) > 0) {
      return(stats::quantile(use_counts, 0.99))
    } else {
      return(0)
    }
  })

  cutoff_value = ceiling(median(chunk_perc) * multiplier)

  regions = regions[nz_counts > cutoff_value]
  IRanges::reduce(regions)
}

create_na_peak <- function(peak_method = "lm_weighted"){
  data.frame(ObservedCenter = as.numeric(NA),
             Height = as.numeric(NA),
             Area = as.numeric(NA),
             SSR = as.numeric(NA),
             type = peak_method,
             stringsAsFactors = FALSE)
}

get_reduced_peaks <- function(in_range, peak_method = "lm_weighted", min_points = 4,
                              which = c("mz", "frequency")){
  range_point_data <- in_range@elementMetadata

  possible_peaks <- pracma::findpeaks(range_point_data$log_int, nups = round(min_points / 2))

  if (!is.null(possible_peaks)) {
    n_peak <- nrow(possible_peaks)
    peaks <- purrr::map_df(seq(1, n_peak), function(in_peak){
      #print(in_peak)
      "!DEBUG Peak `in_peak`"
      peak_loc <- seq(possible_peaks[in_peak, 3], possible_peaks[in_peak, 4])
      peak_data <- range_point_data[peak_loc, ]
      weights <- peak_data$intensity / max(peak_data$intensity)
      out_peak = purrr::map_dfc(which, function(in_which){
        tmp_peak = get_fitted_peak_info(peak_data, use_loc = in_which, w = weights)
        names(tmp_peak) = paste0(names(tmp_peak), ".", in_which)
        tmp_peak
      })
      #out_peak <- get_fitted_peak_info(peak_data, w = weights)
      #out_peak <- get_peak_info(range_data[peak_loc, ], peak_method = peak_method, min_points = min_points)
      out_peak$points <- I(list(IRanges::start(in_range)[peak_loc]))
      out_peak$scan <- range_point_data[peak_loc[1], "scan"]
      out_peak
    })
  } else {
    peaks <- purrr::map_dfc(which, function(in_which){
      tmp_peak = create_na_peak()
      names(tmp_peak) = paste0(names(tmp_peak), ".", in_which)
      tmp_peak
    })
    peaks$points <- NA
    peaks$scan <- range_point_data$scan[1]
  }
  peaks
}

#' split signal region
#'
#' Given a region that should contain signal, and the point data within it,
#' find the peaks, and return the region, and the set of points that make
#' up each point from each scan.
#'
#' @param region_list a list with points and tiles IRanges objects
#' @param peak_method the method for getting the peaks
#' @param min_points how many points are needed for a peak
#'
#' @export
#' @return list
split_region_by_peaks <- function(region_list, peak_method = "lm_weighted", min_points = 4,
                                  metadata = NULL){
  log_memory()
  if (is.null(metadata)) {
    stop("You must pass metadata!")
  }
  frequency_point_list = region_list$points
  tiled_regions = region_list$tiles
  frequency_point_list = purrr::map(frequency_point_list, function(.x){
    .x@elementMetadata$log_int <- log(.x@elementMetadata$intensity + 1e-8)
    .x
  })

  reduced_peaks <- purrr::map_df(names(frequency_point_list), function(in_scan){
    get_reduced_peaks(frequency_point_list[[in_scan]], peak_method = peak_method, min_points = min_points)
  })

  reduced_peaks <- reduced_peaks[!is.na(reduced_peaks$ObservedCenter.frequency), ]
  rownames(reduced_peaks) = NULL

  if (nrow(reduced_peaks) > 0) {
    #reduced_peaks = convert_found_peaks(as.data.frame(S4Vectors::mcols(frequency_point_regions)), reduced_peaks)
    reduced_points <- frequency_points_to_frequency_regions(reduced_peaks, "ObservedCenter.frequency", metadata$frequency_multiplier)

    secondary_regions = split_reduced_points(reduced_points, tiled_regions, n_zero = 1)

    out_regions = purrr::imap(secondary_regions$region, function(in_region, region_name){
      f_data_list = purrr::map(names(in_region), function(.x){
        IRanges::subsetByOverlaps(frequency_point_list[[.x]], in_region[[.x]])
      })
      names(f_data_list) = names(in_region)
      t_data_list = purrr::map(names(in_region), function(.x){
        IRanges::subsetByOverlaps(tiled_regions, in_region[[.x]])
      })
      names(t_data_list) = names(in_region)

      list(points = f_data_list, tiles = t_data_list, region = in_region,
           peaks = rename_peak_data(secondary_regions$peaks[[region_name]]))
    })

  } else {
    out_regions = vector("list", length = 1)
    out_regions[[1]] = list(points = NULL, tiles = NULL,
                            region = NULL, peaks = NULL)
  }
  out_regions
}


convert_peaks_with_consensus_model = function(reduced_points){
  point_data = as.data.frame(S4Vectors::mcols(reduced_points))
  all_models = point_data[, c("intercept", "slope")]
  use_model = data.frame(intercept = median(all_models$intercept, na.rm = TRUE),
                         slope = median(all_models$slope, na.rm = TRUE))
  new_frequency = purrr::map_df(.x = point_data$ObservedCenter.mz,
                                .f = ~ mz_frequency_interpolation(.x, model = use_model))
  new_frequency$frequency = new_frequency$predicted_frequency
  new_frequency$predicted_frequency = NULL
  point_data[, c("frequency", "intercept", "slope")] = new_frequency[, c("frequency", "intercept", "slope")]

  new_points = mz_points_to_frequency_regions(point_data, reduced_points@metadata$frequency_multiplier)
  new_points
}

split_reduced_points = function(reduced_points, tiled_regions, n_zero = 2){
  overlap_counts <- data.frame(peak_count = IRanges::countOverlaps(tiled_regions, reduced_points))
  rle_counts = rle(overlap_counts$peak_count)
  rle_df = data.frame(lengths = rle_counts$lengths, values = rle_counts$values,
                      row = seq(1, length(rle_counts$lengths)))
  rle_index = vector("list", nrow(rle_df))
  start_index = 1
  for (irow in seq(1, nrow(rle_df))) {
    rle_index[[irow]] = seq(from = start_index, length.out = rle_df[irow, "lengths"])
    start_index = max(rle_index[[irow]]) + 1
  }

  mask_values = dplyr::filter(rle_df, values == 0, lengths >= n_zero)
  exclude_indices = unlist(rle_index[mask_values$row])

  sub_tiles <- IRanges::reduce(tiled_regions[-exclude_indices])
  S4Vectors::mcols(sub_tiles) <- list(region = seq(1, length(sub_tiles)))

  reduced_points <- IRanges::mergeByOverlaps(reduced_points, sub_tiles)

  sub_point <- as.list(S4Vectors::split(reduced_points, reduced_points$region))

  sub_region = purrr::map(sub_point, function(iregion){
    all_points <- iregion$points
    names(all_points) = iregion$scan
    all_points_ir = purrr::imap(all_points, function(.x, .y){
      tmp = IRanges::IRanges(start = .x, width = 1)
      S4Vectors::mcols(tmp) = S4Vectors::DataFrame(scan = .y)
      tmp
    })
    all_points_ir
  })

  return(list(region = sub_region, peaks = sub_point))
}



split_regions <- function(signal_regions, frequency_point_regions, tiled_regions, min_scan, peak_method = "lm_weighted", min_points = 4) {
  # alternative idea to current implementation:
  #   take each scan, and then do the subsetting in parallel
  #   the object that needs to be cloned is "in_points", which is points from "frequency"
  #   hmmm, so maybe not.
  signal_list = as.list(split(signal_regions, seq(1, length(signal_regions))))
  min_scan2 = floor(min_scan / 2)
  log_message("Separating sub regions:")
  log_message("Finding non-zero regions")
  scan_regions_list = internal_map$map_function(frequency_point_regions$frequency, function(in_points){
    points_list = purrr::map(signal_list, function(in_region){
      IRanges::subsetByOverlaps(in_points, in_region)
    })
    null_points = purrr::map_lgl(points_list, ~ length(.x) == 0)
    log_memory()
    points_list[!null_points]
  })

  log_message("Mapping back to regions")

  all_regions = vector("list", length(scan_regions_list))
  names(all_regions) = names(scan_regions_list)
  all_points = purrr::map(scan_regions_list, ~ names(.x)) %>% unlist(.) %>% unique(.)
  point_regions_list = vector("list", length(all_points))
  names(point_regions_list) = all_points

  if (get("status", envir = pc_progress)) {
    pb = knitrProgressBar::progress_estimated(length(point_regions_list))
  } else {
    pb = NULL
  }
  for (ipoints in names(point_regions_list)) {
    tmp_scans = all_regions
    for (iscan in names(tmp_scans)) {
      tmp_scans[[iscan]] = scan_regions_list[[iscan]][[ipoints]]
    }
    nonzero_scans = purrr::map_dbl(tmp_scans, function(in_points){
      as.data.frame(in_points@elementMetadata) %>%
        dplyr::filter(intensity > 0) %>%
        dplyr::pull(scan) %>% unique(.) %>% length(.)
    })
    if (sum(nonzero_scans) >= min_scan2) {
      tiles = IRanges::subsetByOverlaps(tiled_regions, signal_list[[ipoints]])
      point_regions_list[[ipoints]] = list(
        points = tmp_scans[nonzero_scans > 0],
        tiles = tiles, region = signal_list[[ipoints]]
      )
    }
    knitrProgressBar::update_progress(pb)
  }

  not_null = purrr::map_lgl(point_regions_list, ~ !is.null(.x))
  point_regions_list = point_regions_list[not_null]
  point_regions_list = point_regions_list[sample(length(point_regions_list))]

  metadata = frequency_point_regions$metadata

  # split_data = purrr::imap(point_regions_list, function(.x, .y){
  #   message(.y)
  #   split_region_by_peaks(.x, peak_method = peak_method, min_points = min_points)
  # })
  log_message("Finding peaks within them")
  # safe_split_region = purrr::possibly(split_region_by_peaks, otherwise = NULL, quiet = TRUE)
  split_data = internal_map$map_function(point_regions_list, split_region_by_peaks,
                                          peak_method = peak_method, min_points = min_points,
                                         metadata = metadata)

  null_regions = purrr::map_lgl(split_data, ~ is.null(.x[[1]]$points))
  split_data = split_data[!null_regions]

  out_data = unlist(split_data, recursive = FALSE, use.names = FALSE)
  return(out_data)
}

two_pass_normalization <- function(peak_regions, intensity_measure = c("RawHeight", "Height"), summary_function = median, normalize_peaks = "both"){
  scan_peaks <- purrr::map(peak_regions$peak_region_list, "peaks")

  normalization_factors <- single_pass_normalization(scan_peaks, intensity_measure = intensity_measure, summary_function = summary_function)

  normed_peaks <- internal_map$map_function(scan_peaks, normalize_scan_peaks, normalization_factors)

  keep_after_round1 = which(purrr::map_lgl(normed_peaks, ~!is.null(.x)))

  normed_scan_cor <- purrr::map_dbl(normed_peaks[keep_after_round1], intensity_scan_correlation)
  normed_scan_cor[is.na(normed_scan_cor)] <- 0
  low_cor <- abs(normed_scan_cor) <= 0.5

  normalization_factors <- single_pass_normalization(scan_peaks[keep_after_round1], intensity_measure = intensity_measure, summary_function = summary_function, use_peaks = low_cor)

  normed_peaks2 <- internal_map$map_function(scan_peaks[keep_after_round1], normalize_scan_peaks, normalization_factors)
  logical_round2 = purrr::map_lgl(normed_peaks2, ~!is.null(.x))
  keep_after_round2 = keep_after_round1[logical_round2]

  named_norm = normalization_factors$normalization
  names(named_norm) = as.character(normalization_factors$scan)
  normed_list_regions = internal_map$map_function(peak_regions$peak_region_list[keep_after_round2], function(in_region){
    in_region$points = normalize_raw_points(in_region$points, named_norm)
    in_region$peaks = normalize_scan_peaks(in_region$peaks, normalization_factors)
    in_region
  })
  normed_raw <- normalize_raw_points(peak_regions$frequency_point_regions$frequency, named_norm)

  peak_regions$frequency_point_regions$frequency <- normed_raw
  peak_regions$is_normalized <- "both"
  peak_regions$normalization_factors <- normalization_factors
  peak_regions$peak_index = peak_regions$peak_index[keep_after_round2]
  peak_regions$peak_region_list = normed_list_regions

  normed_scan_cor <- data.frame(ScanCorrelation = normed_scan_cor[logical_round2])
  peak_regions$scan_correlation <- normed_scan_cor
  log_memory()
  peak_regions

}

calculate_number_of_scans <- function(in_peak){
  length(unique(in_peak$scan))
}

calculate_number_of_scans_normalized <- function(in_peak, normalized_scans){
  sum(unique(in_peak$scan) %in% normalized_scans)
}

intensity_scan_correlation <- function(scan_peak){
  if (nrow(scan_peak) < 3) {
    return(NA)
  }
  cor(scan_peak$Height, scan_peak$scan, method = "spearman", use = "complete.obs")
}

zero_normalization = function(peak_regions, intensity_measure = c("RawHeight", "Height"), summary_function = median, normalize_peaks = "both"){
  scan_peaks <- purrr::map(peak_regions$peak_region_list, "peaks")

  all_scans = unique(unlist(purrr::map(scan_peaks, ~ .x$scan)))
  normalization_factors <- data.frame(scan = sort(all_scans), normalization = 0)

  normed_peaks <- internal_map$map_function(scan_peaks, normalize_scan_peaks, normalization_factors)

  named_norm = normalization_factors$normalization
  names(named_norm) = as.character(normalization_factors$scan)
  normed_list_regions = internal_map$map_function(peak_regions$peak_region_list, function(in_region){
    in_region$points = normalize_raw_points(in_region$points, named_norm)
    in_region$peaks = normalize_scan_peaks(in_region$peaks, normalization_factors)
    in_region
  })


  normed_scan_cor <- purrr::map_dbl(normed_peaks, intensity_scan_correlation)
  normed_scan_cor[is.na(normed_scan_cor)] <- 0
  low_cor <- abs(normed_scan_cor) <= 0.5

  normed_raw <- normalize_raw_points(peak_regions$frequency_point_regions$frequency, normalization_factors)

  peak_regions$peak_region_list = normed_list_regions
  peak_regions$frequency_point_regions$frequency <- normed_raw
  peak_regions$is_normalized <- "both"
  peak_regions$normalization_factors <- normalization_factors

  normed_scan_cor <- data.frame(ScanCorrelation = normed_scan_cor,
                                HighCor = !low_cor)
  n_scans <- purrr::map_int(scan_peaks, calculate_number_of_scans)
  peak_regions$scan_correlation <- normed_scan_cor
  peak_regions
}

single_pass_normalization <- function(scan_peaks, intensity_measure = c("RawHeight", "Height"), summary_function = median, use_peaks = NULL, min_ratio = 0.7){
  n_scan_per_peak <- purrr::map_int(scan_peaks, function(x){
    if (sum(duplicated(x$scan)) == 0) {
      return(length(x$scan))
    } else {
      return(0L)
    }
  })

  use_measure <- NULL
  for (imeasure in intensity_measure){
    if (imeasure %in% names(scan_peaks[[1]])) {
      use_measure <- imeasure
      break()
    }
  }

  if (is.null(use_measure)) {
    stop("The desired intensity measure for normalization is not present!")
  }

  scan_cutoff <- quantile(n_scan_per_peak, 0.95)

  if (is.null(use_peaks)) {
    use_peaks <- rep(TRUE, length(scan_peaks))
  }

  normalize_peaks <- which((n_scan_per_peak >= scan_cutoff) & use_peaks)

  all_scans <- data.frame(scan = unique(unlist(purrr::map(scan_peaks, function(x){unique(x$scan)}))))

  peak_intensity_list = lapply(scan_peaks[normalize_peaks], function(x){
    tmp_data = dplyr::left_join(all_scans, as.data.frame(x[, c("scan", use_measure)]), by = "scan")
    log(tmp_data[, use_measure])
  })
  peak_intensity = as.data.frame(do.call(cbind, peak_intensity_list))

  all_na = purrr::map_lgl(seq_len(nrow(peak_intensity)), ~ sum(is.na(peak_intensity[.x, ])) == ncol(peak_intensity))

  all_scans = all_scans[!all_na, , drop = FALSE]
  peak_intensity = peak_intensity[!all_na, , drop = FALSE]

  intensity_ratio <- purrr::map_dfr(seq_len(nrow(peak_intensity)), function(x){
    #message(x)
    peak_intensity[x, ] / max(peak_intensity[x, ], na.rm = TRUE)
  })
  peak_intensity[intensity_ratio < min_ratio] <- NA

  intensity_scans <- purrr::map_int(seq_len(ncol(peak_intensity)), function(x){
    sum(!is.na(peak_intensity[, x]))
  })

  peak_intensity <- peak_intensity[, intensity_scans >= scan_cutoff, drop = FALSE]

  notna_scans <- rowSums(!is.na(as.matrix(peak_intensity)))
  keep_scans <- notna_scans >= 25

  if (sum(keep_scans) == 0) {
    stop("No scans left to use in normalization!")
  }

  all_scans <- all_scans[keep_scans, , drop = FALSE]
  peak_intensity <- peak_intensity[keep_scans, ]

  scan_distances <- purrr::map_dbl(seq(1, nrow(peak_intensity)), function(in_scan){
    scan_peaks <- peak_intensity[in_scan, ,drop = FALSE]
    scan_peaks_matrix <- matrix(unlist(scan_peaks), nrow = nrow(peak_intensity) - 1, ncol = ncol(scan_peaks), byrow = TRUE)

    other_matrix <- as.matrix(peak_intensity[-in_scan, , drop = FALSE])
    scan_other_diff <- scan_peaks_matrix - other_matrix
    scan_distances <- purrr::map_dbl(seq(1, nrow(scan_other_diff)), function(x){
      sqrt(sum(scan_other_diff[x, ]^2, na.rm = TRUE))
    })
    sum(scan_distances)
  })

  normalize_scan <- which.min(scan_distances)

  scan_norm_matrix <- matrix(unlist(peak_intensity[normalize_scan, , drop = FALSE]),
                             nrow = nrow(peak_intensity), ncol = ncol(peak_intensity), byrow = TRUE)

  diff_matrix <- as.matrix(peak_intensity) - scan_norm_matrix

  normalization_factors <- purrr::map_dbl(seq_len(nrow(diff_matrix)), function(in_row){
    summary_function(diff_matrix[in_row, ], na.rm = TRUE)
  })
  data.frame(scan = all_scans$scan, normalization = normalization_factors)
}

normalize_raw_points <- function(raw_points, named_factors){

  to_norm = intersect(names(raw_points), names(named_factors))
  raw_points = raw_points[to_norm]

  normed_points = purrr::imap(raw_points, function(in_points, scan){
    if (is.null(in_points@elementMetadata$RawIntensity)) {
      in_points@elementMetadata$RawIntensity <- in_points@elementMetadata$intensity
    }
    in_points@elementMetadata$intensity = exp(log(in_points@elementMetadata$RawIntensity) -
                                                named_factors[scan])
    in_points
  })

  normed_points
}

normalize_scan_peaks <- function(in_peak, normalization_factors){
  #split_normalization <- split(normalization_factors$normalization, normalization_factors$scan)

  #normed_peaks <- internal_map$map_function(scan_peaks, function(in_peak){
  if (is.null(in_peak$RawHeight)) {
    in_peak$RawHeight <- in_peak$Height
  }
  in_peak$index <- seq(1, nrow(in_peak))
  height_scan <- as.data.frame(in_peak[, c("index", "RawHeight", "scan")])
  height_scan <- dplyr::right_join(height_scan, normalization_factors, by = "scan")
  height_scan <- height_scan[!is.na(height_scan$RawHeight), ]
  if (nrow(height_scan) > 0) {
    height_scan <- height_scan[order(height_scan$index), ]
    in_peak <- in_peak[height_scan$index, ]
    stopifnot(in_peak@elementMetadata$index == height_scan$index)
    in_peak$Height <- exp(log(height_scan$RawHeight) - height_scan$normalization)
  } else {
    in_peak = NULL
  }
  in_peak
}

apply_normalization_peak_regions <- function(peak_regions, normalization_factors, which_data = "both") {
  to_normalize <- switch(which_data,
                            both = c("frequency_point_regions", "scan_peaks"),
                            raw = "frequency_point_regions",
                            scan_peaks = "scan_peaks")

  split_normalization <- split(normalization_factors$normalization, normalization_factors$scan)

  if ("frequency_point_regions" %in% to_normalize) {
    split_points <- as.list(split(peak_regions$frequency_point_regions, peak_regions$frequency_point_regions@elementMetadata$scan))
    split_points <- split_points[names(split_points) %in% names(split_normalization)]

    normed_points <- purrr::map2(split_points, split_normalization, function(in_points, in_norm){
      in_points@elementMetadata$intensity <- exp(log(in_points@elementMetadata$intensity) - in_norm)
      in_points
    })

    peak_regions$frequency_point_regions <- unlist(IRanges::IRangesList(normed_points))
  }

  if ("scan_peaks" %in% to_normalize) {
    scan_peaks <- peak_regions$scan_peaks

    normed_peaks <- internal_map$map_function(scan_peaks, function(in_peak){
      peak_index <- seq_len(nrow(in_peak))
      split_peak_by_scan <- split(peak_index, in_peak$scan)

      keep_scans <- names(split_peak_by_scan)[names(split_peak_by_scan) %in% names(split_normalization)]
      in_peak <- in_peak[in_peak$scan %in% as.numeric(keep_scans), ]

      for (iscan in keep_scans) {
        in_peak[in_peak$scan %in% as.numeric(iscan), "Height"] <- exp(log(in_peak[in_peak$scan %in% as.numeric(iscan), "Height"]) - split_normalization[[iscan]])
      }
      in_peak
    })
    peak_regions$scan_peaks <- normed_peaks
  }
  peak_regions$is_normalized <- to_normalize
  peak_regions$normalization_factors <- normalization_factors

  return(peak_regions)
}

get_merged_peak_info <- function(point_data, peak_method = "lm_weighted", min_points = 4){

  point_data <- point_data[point_data$intensity > 0, ]
    point_data$log_int <- log(point_data$intensity + 1e-8)
    weights <- point_data$intensity / max(point_data$intensity)
    mz_peak_info <- get_fitted_peak_info(point_data, use_loc = "mz", w = weights)
    freq_peak_info <- get_fitted_peak_info(point_data, use_loc = "frequency", w = weights)
    mz_peak_info$ObservedMZ <- mz_peak_info$ObservedCenter
    mz_peak_info$ObservedCenter <- NULL
    mz_peak_info$ObservedMZMean <- mean(point_data[, "mz"])
    mz_peak_info$ObservedMZMedian <- median(point_data[, "mz"])
    mz_peak_info$ObservedFrequency <- freq_peak_info$ObservedCenter
    mz_peak_info$ObservedFrequencyMean <- mean(point_data[, "frequency"])
    mz_peak_info$ObservedFrequencyMedian <- median(point_data[, "frequency"])

  mz_peak_info

}


#' characterize peaks from points and picked peaks
#'
#' @param peak_region the PeakRegion object to work on
#'
#' @return list
#' @export
characterize_peaks <- function(peak_region){

  stopifnot(peak_region$is_normalized == "both")

  #peak_ranges <- peak_region$peak_regions
  #picked_peaks <- peak_region$scan_peaks
  #frequency_point_regions <- peak_region$frequency_point_regions
  use_scans <- peak_region$normalization_factors$scan
  peak_region$n_scan <- n_scan <- length(use_scans)
  peak_region$set_min_scan()

  peak_region_list = peak_region$peak_region_list
  peak_nscan = purrr::map_int(peak_region_list, ~ length(base::intersect(use_scans, .x$peaks$scan)))
  keep_peaks = which(peak_nscan >= floor(peak_region$min_scan / 2))
  use_region_list = peak_region_list[keep_peaks]
  log_memory()

  # peak_data <- purrr::imap(use_region_list, function(.x, .y){
  #   message(.y)
  #   characterize_mz_points(.x, peak_scans = use_scans)
  # })
  peak_data <- internal_map$map_function(
    use_region_list, characterize_mz_points, peak_scans = use_scans)

  peak_height = purrr::map_df(peak_data, ~ .x$peak_info[, c("Height", "Log10Height")])
  bad_peaks = is.na(peak_height$Height) | is.na(peak_height$Log10Height)

  peak_data = peak_data[!bad_peaks]

  log_memory()

  individual_peak_heights <- log10(purrr::map_dbl(peak_data, function(x){x$peak_info$Height}))
  scan_peak_heights <- purrr::map(peak_data, function(x){x$scan_data$LogHeight})

  individual_peak_nscan <- purrr::map_int(peak_data, function(x){x$peak_info$NScan})

  # update the minimum number of scans required, and then filter
  # all of the relevant bits
  #### This needs to be updated to account for the fact that we filtered above too.
  passing_data = which(individual_peak_nscan >= peak_region$min_scan)
  peak_region$keep_peaks <- keep_peaks[passing_data]

  peak_data <- peak_data[passing_data]
  individual_peak_heights <- individual_peak_heights[passing_data]
  scan_peak_heights <- scan_peak_heights[passing_data]
  individual_peak_nscan <- individual_peak_nscan[passing_data]
  peak_index <- peak_region$peak_index[passing_data]

  corrected_data <- correct_peak_sd_height(individual_peak_heights,
                                           scan_peak_heights,
                                           individual_peak_nscan,
                                           n_scan)

  template_scan_data <- matrix(NA, nrow = 1, ncol = n_scan)
  colnames(template_scan_data) <- sort(use_scans)

  correction_ratios <- corrected_data$OriginalHeight - corrected_data$CorrectedHeight

  corrected_peak_info <- purrr::map(seq_len(length(peak_data)),
                                      function(in_peak){
      original_data <- peak_data[[in_peak]]
      scan_data <- original_data$scan_data
      corrected_scan_data <- scan_data
      corrected_scan_data$LogHeight <- scan_data$LogHeight - correction_ratios[in_peak]
      peak_info <- original_data$peak_info
      peak_info$CorrectedLog10Height <- corrected_data[in_peak, "CorrectedHeight"]
      peak_info$CorrectedLog10HeightSD <- corrected_data[in_peak, "CorrectedSD"]
      peak_info$PeakID <- peak_index[in_peak]
      rownames(peak_info) <- NULL

      frequency_scans <- mz_scans <- original_scan_heights <- corrected_scan_heights <- template_scan_data
      original_scan_heights[1, as.character(scan_data$Scan)] <- scan_data$LogHeight
      rownames(original_scan_heights) <- peak_index[in_peak]
      corrected_scan_heights[1, as.character(corrected_scan_data$Scan)] <- corrected_scan_data$LogHeight
      rownames(corrected_scan_heights) <- peak_index[in_peak]

      mz_scans[1, as.character(scan_data$Scan)] <- scan_data$ObservedMZ
      rownames(mz_scans) <- peak_index[in_peak]

      frequency_scans[1, as.character(scan_data$Scan)] <- scan_data$ObservedFrequency

      list(peak = peak_info, original_scan = original_scan_heights,
           corrected_scan = corrected_scan_heights,
           mz_scan = mz_scans,
           frequency_scan = frequency_scans)
                                      })

  peak_info <- purrr::map_df(corrected_peak_info, "peak")
  #peak_info <- add_offset(peak_info, peak_region$mz_model)
  peak_info$ScanCorrelation <- peak_region$scan_correlation[peak_region$keep_peaks, "ScanCorrelation"]
  peak_info$HighScan = peak_info$NScan >= quantile(peak_info$NScan, 0.9)

  original_height <- do.call(rbind, purrr::map(corrected_peak_info, "original_scan"))
  corrected_height <- do.call(rbind, purrr::map(corrected_peak_info, "corrected_scan"))
  mz_scan <- do.call(rbind, purrr::map(corrected_peak_info, "mz_scan"))
  frequency_scan <- do.call(rbind, purrr::map(corrected_peak_info, "frequency_scan"))

  peak_region$peak_data <- peak_info
  peak_region$scan_level_arrays <- list(Log10Height = original_height,
                                        CorrectedLog10Height = corrected_height,
                                        ObservedMZ = mz_scan,
                                        ObservedFrequency = frequency_scan,
                                        Scan = colnames(original_height),
                                        PeakID = rownames(original_height))
  log_memory()

  invisible(peak_region)
}

characterize_mz_points <- function(in_region, peak_scans = NULL){
  log_memory()
  in_points = in_region$points
  scan_peaks = in_region$peaks
  if (is.null(peak_scans)) {
    peak_scans <- unique(names(in_points))
  }

  # first trim to the scans actually available from the scan peaks
  peak_scans <- base::intersect(peak_scans, scan_peaks$scan)
  char_scans = as.character(peak_scans)

  in_points <- in_points[char_scans]
  null_points = purrr::map_lgl(in_points, is.null)
  in_points = in_points[!null_points]

  if ((nrow(scan_peaks) == 0) || (length(peak_scans) == 0) || (length(in_points) == 0)) {
    peak_info <- data.frame(Height = NA,
                            Area = NA,
                            SSR = NA,
                            ObservedMZ = NA,
                            ObservedMZMean = NA,
                            ObservedMZMedian = NA,
                            ObservedFrequency = NA,
                            ObservedFrequencyMean = NA,
                            ObservedFrequencyMedian = NA,
                            ObservedMZSD = NA,
                            ObservedFrequencySD = NA,
                            Log10ObservedMZSD = NA,
                            Log10Height = NA,
                            HeightSD = NA,
                            Log10HeightSD = NA,
                            Start = NA,
                            Stop = NA,
                            NScan = 0L,
                            NPoint = NA)
    scan_heights <- data.frame(Scan = NA,
                               LogHeight = NA,
                               ObservedMZ = NA,
                               ObservedFrequency = NA)
  } else {

    all_points = purrr::map_df(in_points, ~ as.data.frame(S4Vectors::mcols(.x)))

    peak_info <- get_merged_peak_info(all_points)
    peak_info$ObservedMZSD <- sd(scan_peaks$ObservedMZ)
    peak_info$ObservedFrequencySD <- sd(scan_peaks$ObservedFrequency)
    peak_info$Log10ObservedMZSD <- sd(log10(scan_peaks$ObservedMZ))
    if (peak_info$Height < 1) {
      peak_info$Log10Height = NA
    } else {
      peak_info$Log10Height <- log10(peak_info$Height)
    }
    peak_info$HeightSD <- sd(scan_peaks$Height)
    peak_info$Log10HeightSD <- sd(log10(scan_peaks$Height))

    peak_start <- min(all_points[all_points$intensity > 0, "mz"])
    peak_stop <- max(all_points[all_points$intensity > 0, "mz"])

    peak_info$Start <- peak_start
    peak_info$Stop <- peak_stop
    peak_info$NScan <- length(peak_scans)

    peak_info$NPoint <- median(purrr::map_int(in_points, length))

    scan_heights <- data.frame(Scan = scan_peaks$scan, LogHeight = log10(scan_peaks$Height), ObservedMZ = scan_peaks$ObservedMZ,
                               ObservedFrequency = scan_peaks$ObservedFrequency)
  }

  list(peak_info = peak_info, scan_data = scan_heights)
}


characterize_peaks_in_regions <- function(frequency_point_regions, peak_regions, n_scans, max_subsets = 100, peak_index = NULL){
  peak_region_scans <- peak_regions@elementMetadata$X



  peak_data <- dplyr::bind_rows(peak_data)
  peak_data$NScan <- n_scans

  if (is.null(peak_index)) {
    peak_data$PeakID <- seq_len(nrow(peak_data))
  } else {
    peak_data$PeakID <- peak_index
  }
  peak_data
}

characterize_picked_peaks <- function(scan_peaks, n_scans, peak_index = NULL){

  peak_data <- purrr::map_df(scan_peaks, function(in_peak){
    n_point <- purrr::map_int(in_peak$points, length)
    data.frame(ObservedMZ = mean(in_peak$ObservedMZ),
               Height = mean(in_peak$Height),
               Log10Height = mean(log10(in_peak$Height)),
               Area = mean(in_peak$Area),
               Log10Area = mean(log10(in_peak$Area)),
               ObservedMZSD = sd(in_peak$ObservedMZ),
               Log10ObservedMZSD = sd(log10(in_peak$ObservedMZ)),
               HeightSD = sd(in_peak$Height),
               Log10HeightSD = sd(log10(in_peak$Height)),
               AreaSD = sd(in_peak$Area),
               Log10AreaSD = sd(log10(in_peak$Area)),
               Start = min(in_peak$ObservedMZ),
               Stop = max(in_peak$ObservedMZ),
               NPoint = mean(n_point))
  })
  peak_data$NSubset <- n_scans
  peak_data$NScan <- n_scans
  peak_data$PeakID <- seq_len(nrow(peak_data))
  peak_data
}

add_offset <- function(peak_data, mz_model){
  offsets <- purrr::map_df(peak_data$PeakID, function(in_id){
    #print(in_id)
    peak_mz <- peak_data$ObservedMZ[peak_data$PeakID %in% in_id]
    offset_value <- mz_model$y[which.min(abs(peak_mz - mz_model$x))]
    data.frame(PeakID = in_id, Offset = offset_value)
  })
  dplyr::left_join(peak_data, offsets, by = "PeakID")
}

mz_sd_model <- function(in_data){
  min_scan <- max(in_data$NScan)
  fit_data <- in_data[(in_data$NScan >= min_scan) & (in_data$Height > 1), ]
  if ("ScanCorrelated" %in% names(fit_data)) {
    fit_data <- fit_data[!fit_data$ScanCorrelated, ]
  }

  lm_fit <- lm(Log10ObservedMZSD ~ ObservedMZ + CorrectedLog10Height + CorrectedLog10HeightSD + NPoint, data = fit_data)
  sd_pred <- abs(predict(lm_fit, newdata = in_data))
  sd_pred
}

int_sd_model <- function(in_data){
  min_scan <- quantile(in_data$NScan, 0.75)
  in_data$logHeight <- log10(in_data$Height)
  fit_data <- in_data[in_data$NScan >= min_scan, ]
  if ("Ignore" %in% names(fit_data)) {
    fit_data <- fit_data[!fit_data$Ignore, ]
  }
  fit_data <- fit_data[!is.na(fit_data$Log10HeightSD), ]
  fit_data <- fit_data[fit_data$Height > 10, ]
  fit_data$width <- fit_data$Stop - fit_data$Start
  fit_data$weight_2 <- 1 - (fit_data$width / max(fit_data$width))
  sd_fit <- lm(I(1/Log10HeightSD) ~ I(1/logHeight), data = fit_data, weights = fit_data$weight_2)
  sd_pred <- 1/predict(sd_fit, newdata = in_data)
  sd_pred
}


model_sds <- function(values, sds, loess_span = 0.75){
  sd_frame <- data.frame(x = values, y = sds)
  loess_fit <- stats::loess(y ~ x, data = sd_frame, span = loess_span, control = loess.control(surface = "direct"))
  loess_pred <- predict(loess_fit, values)
  loess_pred
}


#' generate bootstrap samples
#'
#' Generates `n_bootstrap` samples, each with `n_sample` entries, from the provided
#' `indices`. See **Details** for more information.
#'
#' @param n_indices the indices to take the bootstrap sample from
#' @param n_bootstrap how many bootstrap samples to generate? (default is 1000)
#' @param n_sample how many items should be in each sample? See **Details**.
#' @param min_indices the minimum number of indices to be willing to work with
#'
#' @details A *bootstrap* sample is a sample where the entries have been sampled
#'  **[with replacement](https://en.wikipedia.org/wiki/Bootstrapping_(statistics))**.
#'  In general, a *bootstrap* sample has the same number of entries as the original
#'  sample. However, there may be cases where *upsampling* to specific number
#'  may be useful. Therefore, the parameter `n_sample` is provided to enable
#'  *upsampling* the original indices to a larger number. If `n_sample` is NULL,
#'  then no *upsampling* will be done.
#'
#' @return list of bootstrap sampled indices
#' @export
bootstrap_samples <- function(n_indices, n_bootstrap = 100, n_sample = NULL, min_indices = 4){
  if (n_indices < min_indices) {
    return(NULL)
  }

  if (is.null(n_sample)) {
    n_sample <- n_indices
  }
  purrr::map(seq_len(n_bootstrap), function(x){
    sample(n_indices, n_sample, replace = TRUE)
  })
}

rename_peak_data = function(data_frame){
  name_convert = c("ObservedCenter.mz" = "ObservedMZ",
                   "Height.mz" = "Height",
                   "Area.mz" = "Area",
                   "ObservedCenter.frequency" = "ObservedFrequency"
  )
  data_names = names(data_frame)
  for (iname in names(name_convert)) {
    name_match = data_names %in% iname
    if (sum(name_match) == 1) {
      data_names[which(name_match)] = name_convert[iname]
    }
  }
  names(data_frame) = data_names
  data_frame
}
MoseleyBioinformaticsLab/FTMS.peakCharacterization documentation built on April 27, 2022, 3:32 a.m.