R/prune.R

Defines functions bamf_prune

Documented in bamf_prune

#' Binned Anomaly, Mitigate and Fit Pruning
#'
#' Prune based on binned IQR multiples emanating from the center of the
#' distribution. Anomalies are not removed if they account for more than 5\% of
#' the population.
#'
#' @param data A melted data frame to be analyzed for outliers.
#' @param drop A boolean stating whether observations from outlier data points
#' should be dropped from the returned data set. Defaults to \code{FALSE}.
#' @return A data frame either with (\code{drop = TRUE}) all of the outlier data
#' points removed from the data frame or (\code{drop = FALSE}) three additional
#' columns stating whether the data point was identified as an outlier in any of
#' the three conditions.
#' @importFrom dplyr %>%
#' @export

bamf_prune <- function(data, drop = FALSE) {
    
    # Make sure that the data being fed into the pruning function is in long
    # format
    
    data <- ensure_long(data)
    
    napheno <- data[is.na(data[, "phenotype"]), ] %>%
        dplyr::mutate(bamfoutlier1 = NA, bamfoutlier2 = NA, bamfoutlier3 = NA)
    
    datacuts <- data %>%
        
        # Filter out all of the wash and/or empty wells
        
        dplyr::filter(!is.na(strain)) %>%
        
        # Group on condition and trait, the, calculate the first and third
        # quartiles for each of the traits
        
        dplyr::group_by(condition, trait) %>%
        dplyr::summarise(iqr = IQR(phenotype, na.rm = TRUE),
                         q1 = quantile(phenotype, probs = .25, na.rm = TRUE),
                         q3 = quantile(phenotype, probs = .75,
                                       na.rm = TRUE)) %>%
        
        # Add a column for the boundaries of each of the bins
        
        dplyr::mutate(cut1h = q3 + (iqr * 2),
                      cut1l =q1 - (iqr * 2),
                      cut2h = q3 + (iqr * 3),
                      cut2l =q1 - (iqr * 3),
                      cut3h = q3 + (iqr * 4),
                      cut3l =q1 - (iqr * 4),
                      cut4h = q3 + (iqr * 5),
                      cut4l =q1 - (iqr * 5),
                      cut5l = q1 - (iqr * 7),
                      cut5h = q3 + (iqr * 7),
                      cut6l = q1 - (iqr * 10),
                      cut6h = q3 + (iqr * 10)) %>%
        
        # Join the bin boundaries back to the original data frame
        
        dplyr::left_join(data, ., by=c("condition", "trait"))
    
    # Add columns tallying the total number of points in each of the bins
    
    datawithoutliers <- datacuts %>%
        dplyr::mutate(onehs = as.numeric(cut2h > phenotype &
                                             phenotype >= cut1h)) %>%
        dplyr::mutate(onels = as.numeric(cut2l < phenotype &
                                             phenotype <= cut1l)) %>%
        dplyr::mutate(twohs = as.numeric(cut3h > phenotype &
                                             phenotype >= cut2h)) %>%
        dplyr::mutate(twols = as.numeric(cut3l < phenotype &
                                             phenotype <= cut2l)) %>%
        dplyr::mutate(threehs = as.numeric(cut4h > phenotype &
                                               phenotype >= cut3h)) %>%
        dplyr::mutate(threels = as.numeric(cut4l < phenotype &
                                               phenotype <= cut3l)) %>%
        dplyr::mutate(fourhs = as.numeric(cut5h > phenotype &
                                              phenotype >= cut4h)) %>%
        dplyr::mutate(fourls = as.numeric(cut5l < phenotype &
                                              phenotype <= cut4l)) %>%
        dplyr::mutate(fivehs = as.numeric(cut6h > phenotype &
                                              phenotype >= cut5h)) %>%
        dplyr::mutate(fivels = as.numeric(cut6l < phenotype &
                                              phenotype <= cut5l)) %>%
        dplyr::mutate(sixhs = as.numeric(phenotype >= cut6h)) %>%
        dplyr::mutate(sixls = as.numeric(phenotype <= cut6l))
    
    # Group on condition and trait, then sum the total number of data points
    # in each of the IQR multiple bins
    
    datawithoutliers <- datawithoutliers %>%
        dplyr::group_by(condition, trait) %>%
        dplyr::mutate(s1h = sum(onehs, na.rm = TRUE),
                      s2h = sum(twohs, na.rm = TRUE),
                      s3h = sum(threehs, na.rm = TRUE),
                      s4h = sum(fourhs, na.rm = TRUE),
                      s5h = sum(fivehs, na.rm = TRUE),
                      s1l = sum(onels, na.rm = TRUE),
                      s2l = sum(twols, na.rm = TRUE),
                      s3l = sum(threels, na.rm = TRUE),
                      s4l = sum(fourls, na.rm = TRUE),
                      s5l = sum(fivels, na.rm = TRUE),
                      s6h = sum(sixhs, na.rm = TRUE),
                      s6l = sum(sixls, na.rm = TRUE))
        
    # Group on condition and trait, then check to see if the number of
    # points in each bin is more than 5% of the total number of data points
    
    datawithoutliers <- datawithoutliers %>%
        dplyr::group_by(condition, trait) %>%
        dplyr::mutate(p1h = ifelse(sum(onehs, na.rm = TRUE) / n() >= .05,1,0),
                      p2h = ifelse(sum(twohs, na.rm = TRUE) / n() >= .05,1,0),
                      p3h = ifelse(sum(threehs, na.rm = TRUE) / n() >= .05,1,0),
                      p4h = ifelse(sum(fourhs, na.rm = TRUE) / n() >= .05,1,0),
                      p5h = ifelse(sum(fivehs, na.rm = TRUE) / n() >= .05,1,0),
                      p6h = ifelse(sum(sixhs, na.rm = TRUE) / n() >= .05,1,0),
                      p1l = ifelse(sum(onels, na.rm = TRUE) / n() >= .05,1,0),
                      p2l = ifelse(sum(twols, na.rm = TRUE) / n() >= .05,1,0),
                      p3l = ifelse(sum(threels, na.rm = TRUE) / n() >= .05,1,0),
                      p4l = ifelse(sum(fourls, na.rm = TRUE) / n() >= .05,1,0),
                      p5l = ifelse(sum(fivels, na.rm = TRUE) / n() >= .05,1,0),
                      p6l = ifelse(sum(sixls,
                                       na.rm = TRUE) / n() >= .05,1,0))
        
    # Count the number of observations in each condition/trait combination
    
    datawithoutliers <- datawithoutliers %>%
        dplyr::mutate(numst = n())
        
    # Group on condition and trait, then filter out NAs in any of the added
    # columns
    
    datawithoutliers <- datawithoutliers %>%
        dplyr::group_by(condition, trait) %>%
        dplyr::filter(!is.na(trait), !is.na(phenotype), !is.na(iqr), !is.na(q1),
                      !is.na(q3), !is.na(cut1h), !is.na(cut1l), !is.na(cut2h),
                      !is.na(cut2l), !is.na(cut3h), !is.na(cut3l),
                      !is.na(cut4h), !is.na(cut4l), !is.na(cut5l),
                      !is.na(cut5h), !is.na(cut6l), !is.na(cut6h),
                      !is.na(onehs), !is.na(onels), !is.na(twohs),
                      !is.na(twols), !is.na(threehs), !is.na(threels),
                      !is.na(fourhs), !is.na(fourls), !is.na(fivehs),
                      !is.na(fivels), !is.na(sixhs), !is.na(sixls),
                      !is.na(s1h), !is.na(s2h), !is.na(s3h), !is.na(s4h),
                      !is.na(s5h), !is.na(s1l), !is.na(s2l), !is.na(s3l),
                      !is.na(s4l), !is.na(s5l), !is.na(s6h), !is.na(s6l),
                      !is.na(p1h), !is.na(p2h), !is.na(p3h), !is.na(p4h),
                      !is.na(p5h), !is.na(p6h), !is.na(p1l), !is.na(p2l),
                      !is.na(p3l), !is.na(p4l), !is.na(p5l), !is.na(p6l),
                      !is.na(numst))
        
    # Add three columns stating whether the observation is an outlier
    # based the three outlier detection functions below
    
    datawithoutliers <- datawithoutliers %>%
        dplyr::ungroup()
    
    datawithoutliers <- datawithoutliers %>%
        dplyr::mutate(cuts = categorize1(.))

    datawithoutliers <- datawithoutliers %>%
        dplyr::mutate(cuts1 = categorize2(.))
    
    datawithoutliers <- datawithoutliers %>%
        dplyr::mutate(cuts2 = categorize3(.))
    
    # Select the necessary columns, rename the outlier calling columns, and
    # arrange by condition, row, and columns. Generally clean things up and bind
    # it back to the original data frame to regain lost data.
    primaryselect <- datawithoutliers %>%
        dplyr::select(date, experiment, round, assay, condition, plate, row,
                      col, trait, phenotype, cuts, cuts1, cuts2)
    
    renamedcols <- primaryselect %>%
        dplyr::rename(bamfoutlier1 = cuts, bamfoutlier2 = cuts1,
                      bamfoutlier3 = cuts2)
    
    joineddata <- dplyr::left_join(renamedcols, data,
                                   by = c("date", "experiment", "round",
                                          "assay", "condition", "plate", "row",
                                          "col", "trait", "phenotype"))

    arrangedcols <- joineddata %>%
        dplyr::arrange(plate, row, as.numeric(col), trait)

    output <- arrangedcols %>%
        dplyr::select(date, experiment, round, assay, condition, control, plate,
                      row, col, strain, trait, phenotype, bamfoutlier1,
                      bamfoutlier2, bamfoutlier3)
    
    # If the user wants to drop the outliers, filter against all three columns

    if (drop) {
        output <- output %>%
            dplyr::filter(!bamfoutlier1 & !bamfoutlier2 & !bamfoutlier3) %>%
            dplyr::select(-bamfoutlier1, -bamfoutlier2, -bamfoutlier3)
    }
    
    # Return the output data frame

    return(output)
}

# If the observation is in the sixth bin (>10x IQR outside the distribution)
# and the three outermost bins are discontinuous and make up less than 5% of the
# distribution, mark the observation as an outlier.

categorize1 <- function(data) {
    with(data,
         (sixhs >= 1 & ( (s6h + s5h + s4h ) / numst) <= .05
          & (s5h == 0 | s4h == 0))
         | (sixls >= 1 & ( (s6l + s5l + s4l) / numst) <= .05
            & (s5l == 0 | s4l == 0))
    )
}

# If the 5 innermost bins are discontinuous by more than a 1 bin gap, the
# observation is in the fifth bin (between 7 and 10x IQR outside the
# distribution), and the four outermost bins make up less than 5% of the
# population, mark the observation an outlier

categorize2 <- function(data) {
    with(data,
         ( ( ! ( (s5h >= 1 & s4h >= 1 & s3h >= 1 & s2h >= 1 & s1h >= 1)
                 | (s5h >= 1 & s3h >= 1 & s2h >= 1 & s1h >= 1)
                 | (s5h >= 1 & s4h >= 1 & s2h >= 1 & s1h >= 1)
                 | (s5h >= 1 & s4h >= 1 & s3h >= 1 & s1h >= 1)
                 | (s5h >= 1 & s4h >= 1 & s3h >= 1 & s2h >= 1)))
           & (fivehs == 1 & ( (s6h + s5h + s4h + s3h) / numst) <= .05))
         | ( ( ! ( (s5h >= 1 & s4l >= 1 & s3l >= 1 & s2l >= 1 & s1l >= 1)
                   | (s5h >= 1 & s3l >= 1 & s2l >= 1 & s1l >= 1)
                   | (s5h >= 1 & s4l >= 1 & s2l >= 1 & s1l >= 1)
                   | (s5h >= 1 & s4l >= 1 & s3l >= 1 & s1l >= 1)
                   | (s5h >= 1 & s4l >= 1 & s3l >= 1 & s2l >= 1)))
             & (fivels == 1 & ( (s6l + s5l + s4l + s3l) / numst) <= .05))
    )
}

# If the 4 innermost bins are discontinuous by more than a 1 bin gap, the
# observation is in the fourth bin (between 5 and 7x IQR outside the
# distribution), and the four outermost bins make up less than 5% of the
# population, mark the observation an outlier

categorize3 <- function(data) {
    with(data,
         ( ( ! ( (s4h >= 1 & s3h >= 1 & s2h >= 1 & s1h >= 1)
                 | (s4h >= 1 & s2h >= 1 & s1h >= 1)
                 | (s4h >= 1 & s3h >= 1 & s1h >= 1)
                 | (s4h >= 1 & s3h >= 1 & s2h >= 1)))
           & (fourhs == 1 & (s5h + s4h + s3h + s2h) / numst <= .05))
         | ( ( ! ( (s4l >= 1 & s3l >= 1 & s2l >= 1 & s1l >= 1)
                   | (s4l >= 1 & s2l >= 1 & s1l >= 1)
                   | (s4l >= 1 & s3l >= 1 & s1l >= 1)
                   | (s4l >= 1 & s3l >= 1 & s2l >= 1)))
             & (fourls == 1 & (s5l + s4l + s3l + s2l) / numst <= .05))
    )
}

#' Prune data based on biological cutoffs
#'
#' Prune the data based on brood size and/or normalized brood size
#'
#' @param data A melted data frame to be analyzed for outliers.
#' @param drop A boolean stating whether observations from outlier data points
#' should be dropped from the returned data set. Defaults to \code{FALSE}.
#' @return A data frame either with (\code{drop = TRUE}) all of the outlier data
#' points removed from the data frame or (\code{drop = FALSE}) three additional
#' columns stating whether the data point was identified as an outlier in any of
#' the three conditions.
#' @importFrom dplyr %>%
#' @export

bioprune <- function(data){
    if ("norm.n" %in% colnames(data)){
        biopruneddata <- data %>%
            dplyr::filter(n > 5, n < 1000, norm.n < 350)
    } else {
        biopruneddata <- data %>%
            dplyr::filter(n > 5, n < 1000)
    }
    return(biopruneddata)
}

#' Prune based on outliers
#'
#' Prune based on outliers being +/- 2 IQR from the median or +/- 2 standard deviations
#' from the mean for each strain. Use for NIL assays with replicates, not mapping experiments.
#'
#' @param data A melted data frame to be analyzed for outliers.
#' @param iqr A boolean stating whether observations should be called outlier data points
#' from med +/- 2*IQR. Defaults to \code{FALSE} - uses mean +/- 2*sd.
#' @return A data frame either with all of the outlier data
#' points removed from the data frame
#' @importFrom dplyr %>%
#' @export
#' 

prune_outliers <- function(data, iqr = FALSE) {
    
    # Make sure that the data being fed into the pruning function is in long
    # format
    
    data <- ensure_long(data)
    
    # remove NA phenotypes
    # not needed
    # napheno <- data[is.na(data[, "phenotype"]), ]
    
    # if type is IQR: 
    if(iqr == TRUE) {
        
        datacuts <- data %>%
            
            # Filter out all of the wash and/or empty wells
            dplyr::filter(!is.na(strain)) %>%
            
            # Group on condition, trait, and strain
            dplyr::group_by(condition, trait, strain) %>%
            
            # calculate the median, mean, 2IQR and 2sd
            dplyr::mutate(med = median(phenotype), 
                          iqr = IQR(phenotype),
                          mph = mean(phenotype), 
                          sph = sd(phenotype)) %>%
            dplyr::ungroup() %>%
            
            # Filter phenotype +/- 2*IQR
            dplyr::filter(phenotype >= med - 2*iqr, phenotype <= 2*iqr + med) %>%
            
            # select out columns not needed
            dplyr::select(-c(med:sph))
        
        # if type is not IQR, it should be SD
    } else {
        
        datacuts <- data %>%
            
            # Filter out all of the wash and/or empty wells
            dplyr::filter(!is.na(strain)) %>%
            
            # Group on condition, trait, and strain
            dplyr::group_by(condition, trait, strain) %>%
            
            # calculate the median, mean, 2IQR and 2sd
            dplyr::mutate(med = median(phenotype), 
                          iqr = IQR(phenotype),
                          mph = mean(phenotype), 
                          sph = sd(phenotype)) %>%
            dplyr::ungroup() %>%
            
            # Filter phenotype +/- 2*sd
            dplyr::filter(phenotype >= mph - 2*sph, phenotype <= 2*sph + mph) %>%
            
            # select out columns not needed
            dplyr::select(-c(med:sph))
    }
    
    return(datacuts)
}
AndersenLab/easysorter documentation built on Nov. 27, 2020, 4:26 a.m.