R/auxiliary.R

Defines functions remove_nonstandard_chromosomes2d remove_nonstandard_chromosomes1d establish_overlap2d establish_overlap1d calculate_relative_overlap2d calculate_relative_overlap1d calculate_midpoint_distance2d calculate_midpoint_distance1d determine_anchor_overlap

Documented in calculate_midpoint_distance1d calculate_midpoint_distance2d calculate_relative_overlap1d calculate_relative_overlap2d determine_anchor_overlap establish_overlap1d establish_overlap2d remove_nonstandard_chromosomes1d remove_nonstandard_chromosomes2d

#' @title Identifies Overlapping Anchors
#'
#' @description
#' Identifies all overlapping anchor pairs (m:n mapping).
#'
#' @param rep1_anchor data frame with the following columns:
#' \tabular{rll}{
#'   column 1: \tab \code{chr} \tab character; genomic location of anchor in
#'   replicate 1 - chromosome (e.g., \code{"chr3"})\cr
#'   column 2: \tab \code{start} \tab integer; genomic location of anchor in
#'   replicate 1 - start coordinate\cr
#'   column 3: \tab \code{end} \tab integer; genomic location of anchor in
#'   replicate 1 - end coordinate
#' }
#' @param rep2_anchor data frame with the following columns:
#' \tabular{rll}{
#'   column 1: \tab \code{chr} \tab character; genomic location of anchor in
#'   replicate 2 - chromosome (e.g., \code{"chr3"})\cr
#'   column 2: \tab \code{start} \tab integer; genomic location of anchor in
#'   replicate 2 - start coordinate\cr
#'   column 3: \tab \code{end} \tab integer; genomic location of anchor in
#'   replicate 2 - end coordinate
#' }
#' @param max_gap integer; maximum gap in nucleotides allowed between two
#' anchors for them to be considered as overlapping
#' (defaults to -1, i.e., overlapping anchors)
#'
#' @return A data frame containing overlapping
#'  anchor pairs with the following columns:
#' \tabular{rll}{
#'   column 1: \tab \code{rep1_idx} \tab anchor index in data frame
#'   \code{rep1_anchor} \cr
#'   column 2: \tab \code{rep2_idx} \tab anchor index in data frame
#'   \code{rep2_anchor}
#' }
#'
#' @examples
#' rep1_df <- idr2d:::chiapet$rep1_df
#' rep2_df <- idr2d:::chiapet$rep2_df
#'
#' rep1_anchor_a <- data.frame(chr = rep1_df[, 1],
#'                             start = rep1_df[, 2],
#'                             end = rep1_df[, 3])
#' rep2_anchor_a <- data.frame(chr = rep2_df[, 1],
#'                             start = rep2_df[, 2],
#'                             end = rep2_df[, 3])
#'
#' anchor_a_overlap <- determine_anchor_overlap(rep1_anchor_a, rep2_anchor_a)
#'
#'
#' @importFrom GenomicRanges makeGRangesFromDataFrame
#' @importFrom GenomeInfoDb seqlevels
#' @importFrom stringr str_sort
#' @importFrom GenomicRanges findOverlaps
#' @export
determine_anchor_overlap <- function(rep1_anchor, rep2_anchor, max_gap = -1L) {
    rep1_ranges <- GenomicRanges::makeGRangesFromDataFrame(rep1_anchor)
    rep2_ranges <- GenomicRanges::makeGRangesFromDataFrame(rep2_anchor)

    # adjust seq levels
    seq_levels_r1 <- GenomeInfoDb::seqlevels(rep1_ranges)
    seq_levels_r2 <- GenomeInfoDb::seqlevels(rep2_ranges)
    combined_seq_levels <- stringr::str_sort(union(seq_levels_r1,
                                                   seq_levels_r2))
    GenomeInfoDb::seqlevels(rep1_ranges) <- combined_seq_levels
    GenomeInfoDb::seqlevels(rep2_ranges) <- combined_seq_levels

    # get overlap between replicates, accept 1000 bp gap
    overlap_df <- data.frame(GenomicRanges::findOverlaps(rep1_ranges,
                                                         rep2_ranges,
                                                         maxgap = max_gap))
    colnames(overlap_df) <- c("rep1_idx", "rep2_idx")
    return(overlap_df)
}

#' @title Distance between Midpoints of two Peaks
#'
#' @description
#' Calculates the distance in nucleotides between the midpoints of
#' two peaks.
#'
#' Note: peaks must be on the same
#' chromosome; start coordinate is always less than end coordinate
#'
#' @inheritParams calculate_relative_overlap1d
#'
#' @return positive integer vector; distances between peak pairs
#'
#' @examples
#' # identical, zero distance
#' calculate_midpoint_distance1d(100, 120,
#'                           100, 120)
#'
#' # centered, zero distance
#' calculate_midpoint_distance1d(100, 120,
#'                           90, 130)
#'
#' # off by 10 per anchor
#' calculate_midpoint_distance1d(100, 120,
#'                          110, 130)
#'
#' # vectorized example
#' calculate_midpoint_distance1d(c(100, 100, 100),
#'                           c(120, 120, 120),
#'                           c(100, 90, 110),
#'                           c(120, 130, 130))
#'
#' @export
calculate_midpoint_distance1d <- function(peak1_start, peak1_end,
                                          peak2_start, peak2_end) {
    midpoint_peak1 <- abs(peak1_start + (peak1_end - peak1_start) / 2)
    midpoint_peak2 <- abs(peak2_start + (peak2_end - peak2_start) / 2)
    return(as.integer(abs(midpoint_peak1 - midpoint_peak2)))
}

#' @title Distance between Anchor Midpoints of two Interactions
#'
#' @description
#' Calculates the distance in nucleotides between the anchor midpoints of
#' two interactions, which is the sum of the distance between midpoints of
#' anchor A in interaction 1 and anchor A in interaction 2, and the distance
#' between midpoints of
#' anchor B in interaction 1 and anchor B in interaction 2.
#'
#' Note: all anchors must be on the same
#' chromosome; start coordinate is always less than end coordinate
#'
#' @inheritParams calculate_relative_overlap2d
#'
#' @return positive integer vector; distances between interaction pairs
#'
#' @examples
#' # identical, zero distance
#' calculate_midpoint_distance2d(100, 120, 240, 260,
#'                               100, 120, 240, 260)
#'
#' # centered, zero distance
#' calculate_midpoint_distance2d(100, 120, 240, 260,
#'                               90, 130, 230, 270)
#'
#' # off by 10 per anchor
#' calculate_midpoint_distance2d(100, 120, 240, 250,
#'                               110, 130, 230, 240)
#'
#' # off by 10 (anchor B only)
#' calculate_midpoint_distance2d(100, 120, 240, 250,
#'                               90, 130, 250, 260)
#'
#' # vectorized example
#' calculate_midpoint_distance2d(c(100, 100, 100, 100),
#'                               c(120, 120, 120, 120),
#'                               c(240, 240, 240, 240),
#'                               c(260, 260, 250, 250),
#'                               c(100, 90, 110, 90),
#'                               c(120, 130, 130, 130),
#'                               c(240, 230, 230, 250),
#'                               c(260, 270, 240, 260))
#'
#' @export
calculate_midpoint_distance2d <- function(int1_anchor_a_start,
                                          int1_anchor_a_end,
                                          int1_anchor_b_start,
                                          int1_anchor_b_end,
                                          int2_anchor_a_start,
                                          int2_anchor_a_end,
                                          int2_anchor_b_start,
                                          int2_anchor_b_end) {
    midpoint_int1_anchor_a <- abs(int1_anchor_a_start +
                                      (int1_anchor_a_end -
                                           int1_anchor_a_start) / 2)
    midpoint_int1_anchor_b <- abs(int1_anchor_b_start +
                                      (int1_anchor_b_end -
                                           int1_anchor_b_start) / 2)
    midpoint_int2_anchor_a <- abs(int2_anchor_a_start +
                                      (int2_anchor_a_end -
                                           int2_anchor_a_start) / 2)
    midpoint_int2_anchor_b <- abs(int2_anchor_b_start +
                                      (int2_anchor_b_end -
                                           int2_anchor_b_start) / 2)
    return(as.integer(abs(midpoint_int1_anchor_a - midpoint_int2_anchor_a) +
                          abs(midpoint_int1_anchor_b - midpoint_int2_anchor_b)))
}

#' @title Relative Anchor Overlap of two Peaks
#'
#' @description
#' Calculates the overlap between anchor A of interaction 1 and anchor
#' A of interaction 2, as well as anchor B of interaction 1 and anchor B of
#' interaction 2. The overlap (in nucleotides) is then normalized by the length
#' of the anchors.
#'
#' @param peak1_start integer vector; genomic start coordinate(s)
#' of peak in replicate 1
#' @param peak1_end integer vector; genomic end coordinate(s)
#' of peak in replicate 1
#' @param peak2_start integer vector; genomic start coordinate(s)
#' of peak in replicate 2
#' @param peak2_end integer vector; genomic end coordinate(s)
#' of peak in replicate 2
#'
#' @return numeric vector; relative overlaps between peak pairs
#'
#' @examples
#' # 100% overlap
#' calculate_relative_overlap1d(100, 120,
#'                          100, 120)
#'
#' # 50% overlap
#' calculate_relative_overlap1d(100, 120,
#'                          100, 110)
#'
#' # negative overlap
#' calculate_relative_overlap1d(100, 120,
#'                          130, 140)
#'
#' # larger negative overlap
#' calculate_relative_overlap1d(100, 120,
#'                          200, 220)
#'
#' # vectorized example
#' calculate_relative_overlap1d(c(100, 100, 100, 100),
#'                          c(120, 120, 120, 120),
#'                          c(100, 100, 130, 200),
#'                          c(120, 110, 140, 220))
#' @export
calculate_relative_overlap1d <- function(peak1_start, peak1_end,
                                         peak2_start, peak2_end) {
    peak_overlap <- pmin(peak1_end, peak2_end) -
        pmax(peak1_start, peak2_start)

    peak_combined_length <- pmax(peak1_end, peak2_end) -
        pmin(peak1_start, peak2_start)

    return(peak_overlap / peak_combined_length)
}

#' @title Relative Anchor Overlap of two Interactions
#'
#' @description
#' Calculates the overlap between anchor A of interaction 1 and anchor
#' A of interaction 2, as well as anchor B of interaction 1 and anchor B of
#' interaction 2. The overlap (in nucleotides) is then normalized by the length
#' of the anchors.
#'
#' Note: anchors A and B of the same interaction have to be on the same
#' chromosome; start coordinate is always less than end coordinate
#'
#' @param int1_anchor_a_start integer vector; genomic start coordinate(s)
#' of anchor A in replicate 1 interaction
#' @param int1_anchor_a_end integer vector; genomic end coordinate(s)
#' of anchor A in replicate 1 interaction
#' @param int1_anchor_b_start integer vector; genomic start coordinate(s)
#' of anchor B in replicate 1 interaction
#' @param int1_anchor_b_end integer vector; genomic end coordinate(s)
#' of anchor B in replicate 1 interaction
#' @param int2_anchor_a_start integer vector; genomic start coordinate(s)
#' of anchor A in replicate 2 interaction
#' @param int2_anchor_a_end integer vector; genomic end coordinate(s)
#' of anchor A in replicate 2 interaction
#' @param int2_anchor_b_start integer vector; genomic start coordinate(s)
#' of anchor B in replicate 2 interaction
#' @param int2_anchor_b_end integer vector; genomic end coordinate(s)
#' of anchor B in replicate 2 interaction
#'
#' @return numeric vector; relative overlaps between interaction pairs
#'
#' @examples
#' # 100% overlap
#' calculate_relative_overlap2d(100, 120, 240, 260,
#'                              100, 120, 240, 260)
#'
#' # 50% overlap
#' calculate_relative_overlap2d(100, 120, 240, 250,
#'                              100, 110, 240, 260)
#'
#' # negative overlap
#' calculate_relative_overlap2d(100, 120, 240, 250,
#'                              130, 140, 260, 280)
#'
#' # larger negative overlap
#' calculate_relative_overlap2d(100, 120, 240, 250,
#'                              200, 220, 340, 350)
#'
#' # vectorized example
#' calculate_relative_overlap2d(c(100, 100, 100, 100),
#'                              c(120, 120, 120, 120),
#'                              c(240, 240, 240, 240),
#'                              c(260, 250, 250, 250),
#'                              c(100, 100, 130, 200),
#'                              c(120, 110, 140, 220),
#'                              c(240, 240, 260, 340),
#'                              c(260, 260, 280, 350))
#' @export
calculate_relative_overlap2d <- function(int1_anchor_a_start,
                                         int1_anchor_a_end,
                                         int1_anchor_b_start,
                                         int1_anchor_b_end,
                                         int2_anchor_a_start,
                                         int2_anchor_a_end,
                                         int2_anchor_b_start,
                                         int2_anchor_b_end) {
    anchor_a_overlap <- pmin(int1_anchor_a_end, int2_anchor_a_end) -
        pmax(int1_anchor_a_start, int2_anchor_a_start)
    anchor_b_overlap <- pmin(int1_anchor_b_end, int2_anchor_b_end) -
        pmax(int1_anchor_b_start, int2_anchor_b_start)

    anchor_a_combined_length <- pmax(int1_anchor_a_end, int2_anchor_a_end) -
        pmin(int1_anchor_a_start, int2_anchor_a_start)
    anchor_b_combined_length <- pmax(int1_anchor_b_end, int2_anchor_b_end) -
        pmin(int1_anchor_b_start, int2_anchor_b_start)

    return((anchor_a_overlap + anchor_b_overlap) /
               (anchor_a_combined_length + anchor_b_combined_length))
}

#' @title Establish m:n Mapping Between Peaks from Replicate 1 and 2
#'
#' @description
#' This method returns all overlapping interactions between two replicates.
#' For each pair of overlapping interactions, the
#' \emph{ambiguity resolution value} (ARV) is calculated, which helps to reduce
#' the m:n mapping to a 1:1 mapping. The semantics of the ARV depend on the
#' specified \code{ambiguity_resolution_method}, but in general interaction
#' pairs with lower ARVs have priority over interaction pairs with higher ARVs
#' when the bijective mapping is established.
#'
#' @param rep1_df data frame of observations (i.e., genomic peaks) of
#' replicate 1, with at least the following columns (position of columns
#' matter, column names are irrelevant):
#' \tabular{rll}{
#'   column 1:  \tab \code{chr} \tab character; genomic location of peak -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 2:  \tab \code{start} \tab integer; genomic location of peak -
#'   start coordinate\cr
#'   column 3:  \tab \code{end} \tab integer; genomic location of peak -
#'   end coordinate\cr
#'   column 4:  \tab \code{value} \tab numeric; p-value, FDR, or heuristic used
#'   to rank the interactions
#' }
#' @param rep2_df data frame of observations (i.e., genomic peaks) of
#' replicate 2, with the following columns (position of columns
#' matter, column names are irrelevant):
#' \tabular{rll}{
#'   column 1:  \tab \code{chr} \tab character; genomic location of peak -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 2:  \tab \code{start} \tab integer; genomic location of peak -
#'   start coordinate\cr
#'   column 3:  \tab \code{end} \tab integer; genomic location of peak -
#'   end coordinate\cr
#'   column 4:  \tab \code{value} \tab numeric; p-value, FDR, or heuristic used
#'   to rank the interactions
#' }
#' @param ambiguity_resolution_method defines how ambiguous assignments
#' (when one interaction in replicate 1 overlaps with multiple interactions in
#' replicate 2 or vice versa)
#' are resolved. Available methods:
#' \tabular{rl}{
#'   \code{"value"} \tab interactions are prioritized by ascending or descending
#'   \code{value} column (see \code{sorting_direction}), e.g., if two
#'   interactions in replicate 1 overlap with one interaction in replicate 2,
#'   the interaction from replicate 1 is chosen which has a lower (if
#'   \code{sorting_direction} is \code{"ascending"}) or higher (if
#'   \code{"descending"}) value \cr
#'   \code{"overlap"} \tab the interaction pair is chosen which has the highest
#'   relative overlap, i.e., overlap in nucleotides of replicate 1 interaction
#'   anchor A and replicate 2 interaction anchor A,
#'   plus replicate 1 interaction anchor B and replicate 2 interaction anchor B,
#'   normalized by their lengths\cr
#'   \code{"midpoint"} \tab the interaction pair is chosen which has the
#'   smallest
#'   distance between their anchor midpoints, i.e., distance from midpoint of
#'   replicate 1 interaction anchor A to midpoint of
#'   replicate 2 interaction anchor A, plus distance from midpoint of
#'   replicate 1 interaction anchor B to midpoint of
#'   replicate 2 interaction anchor B
#' }
#' @inheritParams determine_anchor_overlap
#'
#' @return data frame with the following columns:
#' \tabular{rll}{
#'   column 1:  \tab \code{rep1_idx} \tab index of interaction in replicate 1
#'   (i.e., row index in \code{rep1_df})\cr
#'   column 2:  \tab \code{rep2_idx} \tab index of interaction in replicate 2
#'   (i.e., row index in \code{rep2_df})\cr
#'   column 3:  \tab \code{arv} \tab ambiguity resolution value used turn
#'   m:n mapping into 1:1 mapping. Interaction pairs with lower \code{arv}
#'   are prioritized.
#' }
#'
#' @examples
#' rep1_df <- idr2d:::chipseq$rep1_df
#' rep1_df$value <- preprocess(rep1_df$value, "log_additive_inverse")
#'
#' rep2_df <- idr2d:::chipseq$rep2_df
#' rep2_df$value <- preprocess(rep2_df$value, "log_additive_inverse")
#'
#' # shuffle to break preexisting order
#' rep1_df <- rep1_df[sample.int(nrow(rep1_df)), ]
#' rep2_df <- rep2_df[sample.int(nrow(rep2_df)), ]
#'
#' # sort by value column
#' rep1_df <- dplyr::arrange(rep1_df, value)
#' rep2_df <- dplyr::arrange(rep2_df, value)
#'
#' pairs_df <- establish_overlap1d(rep1_df, rep2_df)
#'
#' @export
establish_overlap1d <- function(rep1_df, rep2_df,
                                ambiguity_resolution_method = c("overlap",
                                                                "midpoint",
                                                                "value"),
                                max_gap = -1L) {
    # argument handling
    ambiguity_resolution_method <- match.arg(ambiguity_resolution_method,
                                             choices = c("overlap",
                                                         "midpoint",
                                                         "value"))

    if (nrow(rep1_df) > 0 && nrow(rep2_df) > 0) {
        overlaps_df <- determine_anchor_overlap(
            data.frame(chr = rep1_df[, 1],
                       start = rep1_df[, 2],
                       end = rep1_df[, 3]),
            data.frame(chr = rep2_df[, 1],
                       start = rep2_df[, 2],
                       end = rep2_df[, 3]),
            max_gap = max_gap
        )

        idx_rep1 <- overlaps_df$rep1_idx
        idx_rep2 <- overlaps_df$rep2_idx

        # arv is ambiguity resolution value
        # the lower, the better
        if (ambiguity_resolution_method == "value") {
            arv <- rep1_df[idx_rep1, 4] + rep2_df[idx_rep2, 4]
            arv <- (-1) * arv
        } else if (ambiguity_resolution_method == "overlap") {
            arv <- calculate_relative_overlap1d(
                rep1_df[idx_rep1, 2], rep1_df[idx_rep1, 3],
                rep2_df[idx_rep2, 2], rep2_df[idx_rep2, 3]
            )
            arv <- (-1) * arv
        } else if (ambiguity_resolution_method == "midpoint") {
            arv <- calculate_midpoint_distance1d(
                rep1_df[idx_rep1, 2], rep1_df[idx_rep1, 3],
                rep2_df[idx_rep2, 2], rep2_df[idx_rep2, 3]
            )
        }

        pairs_df <- data.frame(rep1_idx = idx_rep1,
                               rep2_idx = idx_rep2,
                               arv = arv)
    } else {
        pairs_df <- data.frame()
    }

    return(pairs_df)
}


#' @title Establish m:n mapping between interactions from replicate 1 and 2
#'
#' @description
#' This method returns all overlapping interactions between two replicates.
#' For each pair of overlapping interactions, the
#' \emph{ambiguity resolution value} (ARV) is calculated, which helps to reduce
#' the m:n mapping to a 1:1 mapping. The semantics of the ARV depend on the
#' specified \code{ambiguity_resolution_method}, but in general interaction
#' pairs with lower ARVs have priority over interaction pairs with higher ARVs
#' when the bijective mapping is established.
#'
#' @param rep1_df data frame of observations (i.e., genomic interactions) of
#' replicate 1, with at least the following columns (position of columns
#' matter, column names are irrelevant):
#' \tabular{rll}{
#'   column 1: \tab \code{chr_a} \tab character; genomic location of anchor A -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 2: \tab \code{start_a} \tab integer; genomic location of anchor A -
#'   start coordinate\cr
#'   column 3: \tab \code{end_a} \tab integer; genomic location of anchor A -
#'   end coordinate\cr
#'   column 4: \tab \code{chr_b} \tab character; genomic location of anchor B -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 5: \tab \code{start_b} \tab integer; genomic location of anchor B -
#'   start coordinate\cr
#'   column 6: \tab \code{end_b} \tab integer; genomic location of anchor B -
#'   end coordinate\cr
#'   column 7: \tab \code{value} \tab numeric; p-value, FDR, or heuristic
#'   used to rank the interactions
#' }
#' @param rep2_df data frame of observations (i.e., genomic interactions) of
#' replicate 2, with the following columns (position of columns
#' matter, column names are irrelevant):
#' \tabular{rll}{
#'   column 1: \tab \code{chr_a} \tab character; genomic location of anchor A -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 2: \tab \code{start_a} \tab integer; genomic location of anchor A -
#'   start coordinate\cr
#'   column 3: \tab \code{end_a} \tab integer; genomic location of anchor A -
#'   end coordinate\cr
#'   column 4: \tab \code{chr_b} \tab character; genomic location of anchor B -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 5: \tab \code{start_b} \tab integer; genomic location of anchor B -
#'   start coordinate\cr
#'   column 6: \tab \code{end_b} \tab integer; genomic location of anchor B -
#'   end coordinate\cr
#'   column 7: \tab \code{value} \tab numeric; p-value, FDR, or heuristic used
#'   to rank the interactions
#' }
#' @param ambiguity_resolution_method defines how ambiguous assignments
#' (when one interaction in replicate 1 overlaps with multiple interactions in
#' replicate 2 or vice versa)
#' are resolved. Available methods:
#' \tabular{rl}{
#'   \code{"value"} \tab interactions are prioritized by ascending or descending
#'   \code{value} column (see \code{sorting_direction}), e.g., if two
#'   interactions in replicate 1 overlap with one interaction in replicate 2,
#'   the interaction from replicate 1 is chosen which has a lower (if
#'   \code{sorting_direction} is \code{"ascending"}) or higher (if
#'   \code{"descending"}) value \cr
#'   \code{"overlap"} \tab the interaction pair is chosen which has the highest
#'   relative overlap, i.e., overlap in nucleotides of replicate 1 interaction
#'   anchor A and replicate 2 interaction anchor A,
#'   plus replicate 1 interaction anchor B and replicate 2 interaction anchor B,
#'   normalized by their lengths\cr
#'   \code{"midpoint"} \tab the interaction pair is chosen which has the
#'   smallest
#'   distance between their anchor midpoints, i.e., distance from midpoint of
#'   replicate 1 interaction anchor A to midpoint of
#'   replicate 2 interaction anchor A, plus distance from midpoint of
#'   replicate 1 interaction anchor B to midpoint of
#'   replicate 2 interaction anchor B
#' }
#' @inheritParams determine_anchor_overlap
#'
#' @return data frame with the following columns:
#' \tabular{rll}{
#'   column 1: \tab \code{rep1_idx} \tab index of interaction in replicate 1
#'   (i.e., row index in \code{rep1_df})\cr
#'   column 2: \tab \code{rep2_idx} \tab index of interaction in replicate 2
#'   (i.e., row index in \code{rep2_df})\cr
#'   column 3: \tab \code{arv} \tab ambiguity resolution value used turn
#'   m:n mapping into 1:1 mapping. Interaction pairs with lower \code{arv}
#'   are prioritized.
#' }
#'
#' @examples
#' rep1_df <- idr2d:::chiapet$rep1_df
#' rep1_df$fdr <- preprocess(rep1_df$fdr, "log_additive_inverse")
#'
#' rep2_df <- idr2d:::chiapet$rep2_df
#' rep2_df$fdr <- preprocess(rep2_df$fdr, "log_additive_inverse")
#'
#' # shuffle to break preexisting order
#' rep1_df <- rep1_df[sample.int(nrow(rep1_df)), ]
#' rep2_df <- rep2_df[sample.int(nrow(rep2_df)), ]
#'
#' # sort by value column
#' rep1_df <- dplyr::arrange(rep1_df, rep1_df$fdr)
#' rep2_df <- dplyr::arrange(rep2_df, rep2_df$fdr)
#'
#' pairs_df <- establish_overlap2d(rep1_df, rep2_df)
#'
#' @export
establish_overlap2d <- function(rep1_df, rep2_df,
                                ambiguity_resolution_method = c("overlap",
                                                                "midpoint",
                                                                "value"),
                                max_gap = -1L) {
    # argument handling
    ambiguity_resolution_method <- match.arg(ambiguity_resolution_method,
                                             choices = c("overlap",
                                                         "midpoint",
                                                         "value"))
    if (nrow(rep1_df) > 0 && nrow(rep2_df) > 0) {
        overlaps_anchors_a_df <- determine_anchor_overlap(
            data.frame(chr = rep1_df[, 1],
                       start = rep1_df[, 2],
                       end = rep1_df[, 3]),
            data.frame(chr = rep2_df[, 1],
                       start = rep2_df[, 2],
                       end = rep2_df[, 3]),
            max_gap = max_gap
        )
        overlaps_anchors_b_df <- determine_anchor_overlap(
            data.frame(chr = rep1_df[, 4],
                       start = rep1_df[, 5],
                       end = rep1_df[, 6]),
            data.frame(chr = rep2_df[, 4],
                       start = rep2_df[, 5],
                       end = rep2_df[, 6]),
            max_gap = max_gap
        )

        a <- paste0(overlaps_anchors_a_df$rep1_idx, "-",
                    overlaps_anchors_a_df$rep2_idx)
        b <- paste0(overlaps_anchors_b_df$rep1_idx, "-",
                    overlaps_anchors_b_df$rep2_idx)

        replicates <- intersect(a, b)

        idx_rep1 <- unlist(lapply(strsplit(replicates, "-", fixed = TRUE),
                                  function(interaction) {
                                      return(as.integer(interaction[1]))
                                  }))

        idx_rep2 <- unlist(lapply(strsplit(replicates, "-", fixed = TRUE),
                                  function(interaction) {
                                      return(as.integer(interaction[2]))
                                  }))

        # arv is ambiguity resolution value
        # the lower, the better
        if (ambiguity_resolution_method == "value") {
            arv <- rep1_df[idx_rep1, 7] + rep2_df[idx_rep2, 7]
            arv <- (-1) * arv
        } else if (ambiguity_resolution_method == "overlap") {
            arv <- calculate_relative_overlap2d(
                rep1_df[idx_rep1, 2], rep1_df[idx_rep1, 3],
                rep1_df[idx_rep1, 5], rep1_df[idx_rep1, 6],
                rep2_df[idx_rep2, 2], rep2_df[idx_rep2, 3],
                rep2_df[idx_rep2, 5], rep2_df[idx_rep2, 6]
            )
            arv <- (-1) * arv
        } else if (ambiguity_resolution_method == "midpoint") {
            arv <- calculate_midpoint_distance2d(
                rep1_df[idx_rep1, 2], rep1_df[idx_rep1, 3],
                rep1_df[idx_rep1, 5], rep1_df[idx_rep1, 6],
                rep2_df[idx_rep2, 2], rep2_df[idx_rep2, 3],
                rep2_df[idx_rep2, 5], rep2_df[idx_rep2, 6]
            )
        }

        pairs_df <- data.frame(rep1_idx = idx_rep1,
                               rep2_idx = idx_rep2,
                               arv = arv)
    } else {
        pairs_df <- data.frame()
    }

    return(pairs_df)
}


#' @title Removes Peaks on Non-standard Chromosomes
#' @param x data frame of genomic peaks, with the following columns
#' (position of columns matter, column names are irrelevant):
#' \tabular{rll}{
#'   column 1: \tab \code{chr} \tab character; genomic location of peak -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 2: \tab \code{start} \tab integer; genomic location of peak  -
#'   start coordinate\cr
#'   column 3: \tab \code{end} \tab integer; genomic location of peak -
#'   end coordinate\cr
#'   column 4: \tab \code{value} \tab numeric; p-value, FDR, or heuristic used
#'   to rank the peaks
#' }
#'
#' @return \code{x} without non-standard chromosomes.
#'
#' @examples
#' rep1_df <- remove_nonstandard_chromosomes1d(idr2d:::chipseq$rep1_df)
#'
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom GenomeInfoDb keepStandardChromosomes
#' @export
remove_nonstandard_chromosomes1d <- function(x) {
    x <- GenomicRanges::GRanges(x[, 1],
                                IRanges::IRanges(x[, 2], x[, 3]),
                                value = x[, 4])
    x <- GenomeInfoDb::keepStandardChromosomes(x, pruning.mode = "coarse")
    x_df <- as.data.frame(x)
    x_df$width <- NULL
    x_df$strand <- NULL
    colnames(x_df) <- c("chr", "start", "end", "value")
    return(x_df)
}

#' @title Removes Interactions on Non-standard Chromosomes
#' @param x data frame of genomic interactions, with the following columns
#' (position of columns matter, column names are irrelevant):
#' \tabular{rll}{
#'   column 1: \tab \code{chr_a} \tab character; genomic location of anchor A -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 2: \tab \code{start_a} \tab integer; genomic location of anchor A -
#'   start coordinate\cr
#'   column 3: \tab \code{end_a} \tab integer; genomic location of anchor A -
#'   end coordinate\cr
#'   column 4: \tab \code{chr_b} \tab character; genomic location of anchor B -
#'   chromosome (e.g., \code{"chr3"})\cr
#'   column 5: \tab \code{start_b} \tab integer; genomic location of anchor B -
#'   start coordinate\cr
#'   column 6: \tab \code{end_b} \tab integer; genomic location of anchor B -
#'   end coordinate\cr
#'   column 7: \tab \code{value} \tab numeric; p-value, FDR, or heuristic used
#'   to rank the interactions
#' }
#'
#' @examples
#' rep1_df <- remove_nonstandard_chromosomes2d(idr2d:::chiapet$rep1_df)
#'
#' @return \code{x} without non-standard chromosomes.
#'
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom GenomeInfoDb keepStandardChromosomes
#' @importFrom dplyr inner_join
#' @importFrom dplyr select
#' @export
remove_nonstandard_chromosomes2d <- function(x) {
    # avoid CRAN warnings
    chr_a <- start_a <- end_a <- chr_b <- start_b <- end_b <- value <- NULL

    x$idx <- seq_len(nrow(x))
    anchor_a <- GenomicRanges::GRanges(x[, 1],
                                       IRanges::IRanges(x[, 2], x[, 3]),
                                       value = x[, 7],
                                       idx = x$idx)
    anchor_a <- GenomeInfoDb::keepStandardChromosomes(anchor_a,
                                                      pruning.mode = "coarse")
    anchor_a_df <- as.data.frame(anchor_a)
    anchor_a_df$width <- NULL
    anchor_a_df$strand <- NULL
    colnames(anchor_a_df) <- c("chr_a", "start_a", "end_a", "value", "idx")

    anchor_b <- GenomicRanges::GRanges(x[, 4],
                                       IRanges::IRanges(x[, 5], x[, 6]),
                                       idx = x$idx)
    anchor_b <- GenomeInfoDb::keepStandardChromosomes(anchor_b,
                                                      pruning.mode = "coarse")
    anchor_b_df <- as.data.frame(anchor_b)
    anchor_b_df$width <- NULL
    anchor_b_df$strand <- NULL
    colnames(anchor_b_df) <- c("chr_b", "start_b", "end_b", "idx")

    x_df <- dplyr::inner_join(anchor_a_df, anchor_b_df, by = "idx")
    x_df <- dplyr::select(x_df, chr_a, start_a, end_a,
                          chr_b, start_b, end_b, value)

    return(x_df)
}

Try the idr2d package in your browser

Any scripts or data that you put into this service are public.

idr2d documentation built on Nov. 8, 2020, 6:16 p.m.