R/bin_median_filter.R

Defines functions bin_median_filter breakpoint_filter

Documented in bin_median_filter breakpoint_filter

#' Bin Median filter
#'
#' @param bin_data
#' @param seg_data
#' @param bincount_threshold
#' @param sd_threshold
#' @param lower_quantile
#' @param exploratory_plots
#' @param return_removed_data
#'
#' @return
#' @export
#'
#' @examples
bin_median_filter <- function(bin_data,
                              seg_data,
                              bincount_threshold = NULL,
                              sd_threshold = 2,
                              lower_quantile = NULL,
                              exploratory_plots = FALSE,
                              return_removed_data = FALSE) {

  # removing chrom, chrompos and abspos columns
  if (any(names(bin_data) %in% c("chrom", "chrompos", "abspos")) == TRUE) {
    bin_data <- bin_data[-which(names(bin_data) %in% c("chrom", "chrompos", "abspos"))]
  }

  # Calculating median bins for each row
  median_bins <- apply(bin_data, 2, median)
  mean_mbins <- mean(median_bins)
  sd_mbins <- sd(median_bins)

  sd_threshold <- mean_mbins - sd_threshold*sd_mbins

  # removing samples from bin

  if (sd_threshold) {
    bin_data_kept <- bin_data[-which(median_bins < sd_threshold)]
  }

  if (!is.null(lower_quantile)) {
    bin_data_kept <- bin_data[-which(median_bins < quantile(median_bins, lower_quantile))]
  }

  if (!is.null(bincount_threshold)) {
    bin_data_kept <- bin_data[-which(median_bins < bincount_threshold)]
  }

  # keeping the same cells on seg and bin file
  seg_file_filtered <- seg_data[(names(seg_data) %in% names(bin_data_kept))]

  # attaching bin and seg to a list
  bin_seg_list <- list(seg_data_filtered = seg_file_filtered,
                       bin_data_filtered = bin_data_kept)


  # keeping the removed data

  if (!is.null(lower_quantile)) {
    bin_data_removed <- bin_data[which(median_bins < quantile(median_bins, lower_quantile))]
  }

  if (!is.null(bincount_threshold)) {
    bin_data_removed <- bin_data[which(median_bins < bincount_threshold)]
  }

  if(sd_threshold) {
    bin_data_removed <- bin_data[which(median_bins < sd_threshold)]
  }

  print(paste(dim(bin_data_removed)[2], "cells were removed, to return them please set return_removed_data as TRUE", sep = " "))

  # printing exploratory plots for median bins
  if (exploratory_plots == TRUE) {
    par(mfrow= c(1,2))
    hist(median_bins,
         breaks = 200)
    if (!is.null(lower_quantile)) {
      abline(v = quantile(median_bins, lower_quantile), col = "red")
    }

    if (!is.null(bincount_threshold)) {
      abline(v = bincount_threshold, col = "blue")
    }

    if (sd_threshold) {
      abline(v = sd_threshold, col = "green")
    }

    boxplot(median_bins)
  }

  if (return_removed_data == TRUE) {
    return(bin_data_removed)
  }

  return(seg_file_filtered)

}

#######################################
# breakpoint filter
#######################################

#' Breakpoint Filtering
#'
#' @param seg_data
#' @param exploratory_plots
#' @param sd_threshold
#' @param keep_only_removed_data
#'
#' @return
#' @export
#'
#' @examples
breakpoint_filter <- function(seg_data,
                              exploratory_plots = FALSE,
                              sd_threshold = TRUE,
                              keep_only_removed_data = FALSE) {

  # removing chrom, chrompos and abspos columns
  if (any(names(seg_data) %in% c("chrom", "chrompos", "abspos")) == TRUE) {
    seg_data <- seg_data[-which(names(seg_data) %in% c("chrom", "chrompos", "abspos"))]
  }

  rle_all <- apply(seg_data, 2, rle)

  rle_length <- lapply(rle_all, function(x,i) length(x[[i]][2]))

  rle_length <- vector()
  for (i in seq_along(rle_all)) {
    rle_length[i] <- length(unlist(rle_all[[i]][2]))
  }

  rle_mean <- mean(rle_length)
  rle_sd <- sd(rle_length)

  if (sd_threshold) {
    seg_filtered_data <- seg_data[which(rle_length < (rle_mean + 2*rle_sd))]

    seg_filtered_data_removed <- seg_data[which(rle_length > rle_mean + 2*rle_sd)]
  }


  # printing exploratory plots
  if (exploratory_plots == TRUE) {
    plot.new()
    par(mfrow = c(1,2))
    hist(rle_length,
         breaks = 100)

    if (sd_threshold) {
      abline(v = rle_mean + 2*rle_sd, col = "blue")
    }

    plot(density(rle_length))
  }

  # in case you want to analyze the data that is going to be removed
  if (keep_only_removed_data == TRUE) {
    seg_filtered_data_removed <- seg_data[which( rle_length < quantile(rle_length, lower_quantile) | rle_length > quantile(rle_length, top_quantile))]
    return(seg_filtered_data_removed)
  }

  print(paste(ncol(seg_filtered_data_removed), "cells were removed", sep = " "))

  return(seg_filtered_data)

}
whtns/varbintools documentation built on Dec. 1, 2019, 5:15 a.m.