#' Determine all unique rows in GRanges object
#'
#' \code{unique_granges} returns a GRanges object of only unique observations (
#' removing all duplicated rows), yet considers the meta data information.
#'
#' @description Given a GRanges object, this function returns all unique rows,
#' or observations, including the meta data information. Calling the function
#' \code{unique()} on a GRanges object only returns unique ranges, and does not
#' account for meta data information, unlike to a data.frame. Using the option
#' sum.counts = TRUE and specifying the counts.col = "" name, will sum the
#' numerical values within the column for all combined rows.
#'
#' @usage
#' unique_granges(sites)
#'
#' unique_granges(sites, sum.cols = FALSE, rm.cols = NULL, rm.dup.cols = NULL)
#'
#' @param sites A GRanges object with or without metadata columns.
#' @param sum.cols a logical or character vector of column name(s) in the
#' metadata of the input GRanges object to sum across unique observations.
#' These column(s) will not be considered when identifying unique observations.
#' Default is FALSE, if set to TRUE, then will attempt to sum the column
#' 'counts'.
#' @param rm.cols a character vector of column name(s) to remove from the
#' metadata of the input GRanges object. Removal of columns will occur prior to
#' identifying unique observations, and therefore these columns will not only
#' be dropped from the output, but will not be considered in identifying unique
#' observations.
#' @param rm.dup.cols logical or a regular expression to identify duplicate
#' columns and remove the duplicates. This will only remove columns that are
#' true duplicates, meaning they have identical content. Removal of duplicated
#' columns will occur prior to identifying unique observations, and therefore
#' identified duplicates will not only be dropped from the output, but will not
#' be considered in identifying unique observations.
#'
#' @examples
#' gr <- gintools:::generate_test_granges(
#' n_sites = 1,
#' n_reads_p_site = 12,
#' site_range = 1:20,
#' read_width_range = 20:30
#' )
#' gr <- refine_breakpoints(gr)
#' gr <- standardize_sites(gr)
#' gr$sample <- rep(c("A","B"), 6)
#' gr$sample.1 <- rep(c("A","B"), 6)
#' gr$counts <- rep(1:4, c(3,3,3,3))
#'
#' # Calling unique() on gr returns a miss interpreted data set
#' unique(gr)
#'
#' # Using unique_granges() without options returns all distinct rows
#' unique_granges(gr)
#'
#' # Using the options for sum.cols, rows 'counts' are added when combined
#' # together.
#' unique_granges(gr, sum.cols = TRUE)
#'
#' # Or multiple columns can be added simultaneously.
#' gr$tags <- rep(1:2, c(6,6))
#' unique_granges(gr, sum.cols = c("counts", "tags"))
#'
#' # Remove specific columns by passing a character vector to 'rm.cols'
#' unique_granges(gr, sum.cols = c("counts", "tags"), rm.cols = "sample.1")
#'
#' # Remove any column that may be a duplicate of another with a specific
#' # name pattern. Column content will be checked for identity before removal.
#' unique_granges(gr, sum.cols = c("counts", "tags"), rm.dup.cols = "[\\w]+")
#'
#' @author Christopher L. Nobles, Ph.D.
#' @importFrom magrittr %>%
#' @export
unique_granges <- function(sites, sum.cols = FALSE,
rm.cols = NULL, rm.dup.cols = NULL){
# Checks and balance
if( !class(sites) == "GRanges" ){
stop("\n Sites object is not a GRanges class.\n")
}
if( any(sum.cols != FALSE) ){
if( all(sum.cols == TRUE) ){
counts_col <- "counts"
}else{
counts_col <- sum.cols
}
if( any(!counts_col %in% names(GenomicRanges::mcols(sites))) ){
missing_cols <- counts_col[
which(!counts_col %in% names(GenomicRanges::mcols(sites)))
]
stop(
"\n Could not find column name(s) in sites object: ",
paste(missing_cols, collapse = ", "),
"\n"
)
}
}
if( !is.null(rm.cols) ){
if( any(!rm.cols %in% names(GenomicRanges::mcols(sites))) ){
missing_cols <- rm.cols[
which(!rm.cols %in% names(GenomicRanges::mcols(sites)))
]
message(
"Could not find column name(s) to remove in sites object: ",
paste(missing_cols, collapse = ", ")
)
rm.cols <- rm.cols[rm.cols %in% missing_cols]
}
if( length(rm.cols) > 0 ){
GenomicRanges::mcols(sites)[, rm.cols] <- NULL
}
}
if( !is.null(rm.dup.cols) ){
col_names <- names(GenomicRanges::mcols(sites))
if( is.character(rm.dup.cols) ){
col_names <- stringr::str_extract(col_names, rm.dup.cols)
}
dup_cols <- names(table(col_names))[table(col_names) > 1]
dup_idxs <- lapply(dup_cols, function(x) which(col_names == x))
rm_col_idx <- unlist(lapply(dup_idxs, function(x){
col_digests <- unlist(lapply(x, function(y){
digest::digest(GenomicRanges::mcols(sites)[,y, drop = TRUE])
}))
x[which(duplicated(col_digests))]
}))
GenomicRanges::mcols(sites)[,rm_col_idx] <- NULL
if( is.character(rm.dup.cols) ){
names(GenomicRanges::mcols(sites)) <- stringr::str_extract(
names(GenomicRanges::mcols(sites)), rm.dup.cols
)
}
}
# Convert sites to a data.frame and remove duplicates
if( !length(names(sites)) == length(unique(names(sites))) ){
message("Dropping rownames for data.frame conversion.")
df <- GenomicRanges::as.data.frame(sites, row.names = NULL)
}else{
df <- GenomicRanges::as.data.frame(sites)
}
cols <- names(df)
if( any(sum.cols != FALSE) ){
counts_pos <- match(counts_col, cols)
}
# Sum counts if needed
if( !any(sum.cols != FALSE) ){
df <- dplyr::distinct(df)
}else{
groups <- lapply(cols[-counts_pos], as.symbol)
df <- dplyr::group_by(df, .dots = groups) %>%
dplyr::summarise_all(.funs = "sum") %>%
dplyr::ungroup()
df <- df[,cols]
}
# Rebuild GRanges object
gr <- GenomicRanges::GRanges(
seqnames = df$seqnames,
ranges = IRanges::IRanges(start = df$start, end = df$end),
strand = df$strand,
seqinfo = GenomicRanges::seqinfo(sites)
)
GenomicRanges::mcols(gr) <- dplyr::select(df, 6:length(df))
gr
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.