R/venndir-overlap.R

#' Signed overlaps
#' 
#' Calculate signed, directional overlaps across sets
#' 
#' This function is the core function to summarize overlaps
#' that include signed directionality. It is intended for
#' situations where two sets may share items, but where the
#' signed direction associated with those items may or may
#' not also be shared.
#' 
#' One motivating example is with biological data, where
#' a subset of genes, proteins, or regions of genome, may be
#' regulated up or down, and this direction is relevant
#' to understanding the biological process. Two experiments
#' may identify similar genes, proteins, or regions of
#' genome, but they may not regulate them in the same
#' direction. This function is intended to help summarize
#' item overlaps alongside the directionality of each item.
#' 
#' The directional counts can be summarized in slightly different
#' ways, defined by the argument `overlap_type`:
#' 
#' * `overlap_type="detect"` - default behavior: each vector in `setlist`
#'    is handled independently:
#'     * a vector with no names will use the vector
#'     values as items after converting them to `character`;
#'     * a named vector with `character` or `factor` values
#'     will will use the vector names as items,
#'     and character values as item values;
#'     * a named vector with `numeric` or `integer` values
#'     will use vector names as items, and will convert
#'     numeric values to `sign()`.
#' * `overlap_type="each"` - this option returns all possible
#' directions individually counted.
#' * `overlap_type="concordance"` - this option returns the counts
#' for each consistent direction, for example `"up-up-up"` would
#' be counted, and `"down-down-down"` would be counted, but any
#' mixture of `"up"` and `"down"` would be summarized and counted
#' as `"mixed"`. For 3-way overlaps, there are 8 possible directions,
#' the labels are difficult to place in the Venn diagram, and are not
#' altogether meaningful. Note that this option is the default
#' for `venndir()`.
#' * `overlap_type="overlap"` - this option only summarizes overlaps
#' without regard to direction. This option returns standard Venn
#' overlap counts.
#' * `overlap_type="agreement"` - this option groups all directions
#' that agree and returns them as `"concordant"`, all others are
#' returned as `"mixed"`.
#' 
#' Note that `overlap_type="agreement"` and `overlap_type="concordance"`
#' will not convert `numeric` values to `sign()`, so if the input
#' contains `numeric` values such as `1.2435` they should probably be
#' converted to `sign()` before calling `signed_overlaps()`, for example:
#' `signed_overlaps(lapply(setlist, sign))`
#' 
#' @family venndir core
#' 
#' @param setlist `list` of named vectors, whose names represent
#'    set items, and whose values represent direction using 
#'    values `c(-1, 0, 1)`.
#' @param overlap_type `character` value indicating the type of
#'    overlap logic:
#'    * `"each"` records each combination of signs;
#'    * `"overlap"` disregards the sign and returns any match
#'    item overlap;
#'    * `"concordance"` represents counts for full
#'    agreement, or `"mixed"` for any inconsistent overlapping
#'    direction;
#'    * `"agreement"` represents full agreement in direction
#'    as `"agreement"`, and `"mixed"` for any inconsistent
#'    direction.
#' @param return_items `logical` indicating whether to return
#'    the items within each overlap set.
#' @param return_item_labels `logical` indicating whether to return
#'    the directional label associated with each item. A directional
#'    label combines the direction from `setlist` by item.
#' @param sep `character` used as a delimiter between set names,
#'    the default is `"&"`.
#' @param trim_label `logical` indicating whether to trim the
#'    directional label, for example instead of returning `"0 1 -1"`
#'    it will return `"1 -1"` because the overlap name already
#'    indicates the sets involved.
#' @param include_blanks `logical` indicating whether each set overlap
#'    should be represented at least once even when no items are
#'    present in the overlap. When `include_blanks=TRUE` is useful
#'    in that it guarantees all possible combinations of overlaps
#'    are represented consistently in the output.
#' @param ... additional arguments are passed to `list2imsigned()`.
#' 
#' @examples
#' setlist <- make_venn_test(100, 2, do_signed=FALSE);
#' 
#' # detect overlap_type
#' attr(signed_overlaps(setlist, "detect"), "overlap_type")
#' 
#' setlist <- make_venn_test(100, 2, do_signed=TRUE);
#' 
#' # detect overlap_type
#' attr(signed_overlaps(setlist, "detect"), "overlap_type")
#' 
#' # straight overlap counts
#' signed_overlaps(setlist, "overlap");
#' 
#' # each directional overlap count
#' signed_overlaps(setlist, "each");
#' 
#' # concordance overlap counts
#' signed_overlaps(setlist, "concordance");
#' 
#' # agreement overlap counts
#' signed_overlaps(setlist, "agreement");
#' 
#' # test to ensure factor input is handled properly
#' inputlist <- list(setA=factor(c("A", "B", "D")),
#'    setB=factor(c("A", "C", "E", "F")))
#' signed_overlaps(inputlist, return_items=TRUE)
#' 
#' # check to verify
#' signed_overlaps(inputlist, return_items=TRUE)$items
#' 
#' @export
signed_overlaps <- function
(setlist,
 overlap_type=c("detect",
    "each",
    "overlap",
    "concordance",
    "agreement"),
 return_items=FALSE,
 return_item_labels=return_items,
 sep="&",
 trim_label=TRUE,
 include_blanks=TRUE,
 ...)
{
   ##
   overlap_type <- match.arg(overlap_type,
      several.ok=TRUE);
   
   ## 1sec
   # convert setlist to signed incidence matrix
   #svims <- list2im_value(setlist, ...);
   if (inherits(setlist, "Matrix") || inherits(setlist, "matrix")) {
      svims <- setlist;
   } else {
      # handle list input
      setlist <- lapply(setlist, function(i){
         if (length(names(i)) == 0) {
            if (is.numeric(i)) {
               warning("signed_overlaps(): input list contains vector with un-named numeric values.");
            }
            if (is.factor(i)) {
               warning("signed_overlaps(): input list contains factor values which will be coerced to character.");
            }
            i <- jamba::nameVector(rep(1, length(i)),
               as.character(i),
               makeNamesFunc=c);
         } else {
            if ("detect" %in% overlap_type) {
               # for "detect", convert numeric or integer input to sign(i)
               if ( (is.numeric(i) || is.integer(i)) && !all(i %in% c(-1, 0, 1, NA)) ) {
                  i[] <- sign(i);
               } else if (is.character(i) || is.factor(i)) {
                  if (!any(duplicated(i)) && length(i) > 3) {
                     warning("signed_overlaps(): named character vector, non-duplicate items, length > 3, the vector values are used as items.");
                     i <- jamba::nameVector(rep(1, length(i)),
                        as.character(i),
                        makeNamesFunc=c);
                  }
               }
            }
         }
         i
      });
      svims <- list2im_value(setlist);
   }
   
   # handle overlap_type="detect"
   if ("detect" %in% overlap_type) {
      if (all(unique(as.vector(svims)) %in% c(0, 1, NA))) {
         overlap_type <- "overlap";
      } else {
         overlap_type <- head(setdiff(overlap_type, "detect"), 1);
         if (length(overlap_type) == 0) {
            overlap_type <- "concordance";
         }
      }
   }
   
   ## 0.02sec
   # convert to overlap vector (signed)
   # 0 0 1, 1 1 0, 1 1 1, etc.
   if (is.character(svims[1,1])) {
      # if data is stored as character, change NA to ""
      svimsl <- (svims != "") * 1
      svimsl[is.na(svims)] <- 0;
      svimss <- do.call(paste, lapply(seq_len(ncol(svims)), function(i){
         jamba::rmNA(naValue="", svims[,i]);
      }))
   } else {
      # if data is stored as numeric, change NA to "0"
      svimsl <- (svims != 0) * 1
      svimsl[is.na(svims)] <- 0;
      svimss <- do.call(paste, lapply(seq_len(ncol(svims)), function(i){
         jamba::rmNA(naValue="0", svims[,i]);
      }))
   }
   
   ## 1.2sec
   # convert to overlap vector (un-signed)
   # 0 0 1, 1 1 0, 1 1 1, etc.
   svimsv <- do.call(paste, lapply(seq_len(ncol(svimsl)), function(i){svimsl[,i]}))
   
   ## concordance
   ## 0.02sec
   # for observed overlap vectors, determine which are concordant
   # (should be concordant* since up-up and down-down both get assigned TRUE)
   svimssu <- unique(svimss);
   svimssu_concordance <- jamba::nameVector(sapply(strsplit(svimssu, " "), function(i){
      j <- setdiff(i, c("","0"))
      length(unique(j)) == 1;
   }), svimssu);
   
   # alternate approach, split using overlap_type upfront, avoid data.table
   svims_split_names <- sapply(jamba::nameVector(unique(svimsv)), function(i){
      paste(collapse=sep,
         colnames(svims)[strsplit(i, " ")[[1]] %in% "1"])
   });
   svims_split_name_ct <- sapply(strsplit(jamba::nameVectorN(svims_split_names), " "), function(i){
      sum(as.numeric(i))
   })
   if ("overlap" %in% overlap_type) {
      svimsv_olt <- paste(svims_split_names[svimsv],
         sep="|",
         svimsv);
   } else if ("concordance" %in% overlap_type) {
      svimsv_olt <- paste(svims_split_names[svimsv],
         sep="|",
         ifelse(svimssu_concordance[svimss], svimss, "mixed"));
   } else if ("agreement" %in% overlap_type) {
      svimsv_olt <- paste(svims_split_names[svimsv],
         sep="|",
         ifelse(svimssu_concordance[svimss], "agreement", "mixed"));
   } else {
      svimsv_olt <- paste(svims_split_names[svimsv],
         sep="|",
         svimss);
   }
   
   # split by observed directions within each overlap set
   ## 0.07 sec
   svims_split <- split(svimss, svimsv_olt);

   # Create labels for each split
   svims_df <- data.frame(
      sets=gsub("^(.+)[|]([^|]+)$",
         "\\1",
         names(svims_split)),
      overlap_type=gsub("^(.+)[|]([^|]+)$",
         "\\2",
         names(svims_split)),
      stringsAsFactors=FALSE);
   rownames(svims_df) <- names(svims_split);
   colnames(svims_df) <- c("sets", overlap_type);
   svims_df$overlap <- names(svims_split_names)[match(svims_df$sets, svims_split_names)];
   svims_df$num_sets <- svims_split_name_ct[svims_df$overlap];
   svims_df$count <- lengths(svims_split[names(svims_split)]);

   # add one column per setlist name
   sldf <- data.frame(jamba::rbindList(strsplit(svims_df$overlap, " ")),
      stringsAsFactors=FALSE);
   colnames(sldf) <- colnames(svims);
   svims_df[,colnames(svims)] <- sldf;
   
   # optionally include blank entries where no overlaps are present
   if (include_blanks) {
      blank_df <- make_venn_combn_df(colnames(svims));
      blank_df_num <- rowSums(blank_df);
      blank_df$sets <- rownames(blank_df);
      blank_df$count <- 0;
      blank_df$overlap <- jamba::pasteByRow(blank_df[,colnames(svims),drop=FALSE], sep=" ");
      blank_df$num_sets <- blank_df_num;
      rownames(blank_df) <- paste0(blank_df$sets, "|", blank_df$overlap);
      if ("agreement" %in% overlap_type) {
         blank_df[[overlap_type]] <- "agreement";
      } else {
         blank_df[[overlap_type]] <- blank_df$overlap;
      }
      if (any(!blank_df$overlap %in% svims_df$overlap)) {
         missing_overlap <- setdiff(blank_df$overlap, svims_df$overlap);
         add_df <- subset(blank_df, overlap %in% missing_overlap)[,colnames(svims_df),drop=FALSE];
         svims_df <- rbind(svims_df, add_df);
      }
   }

   # order rows by number of overlaps, then by set
   svims_df2 <- jamba::mixedSortDF(svims_df, byCols=c("num_sets", paste0("-", names(setlist))))
   svims_split_names2 <- jamba::nameVector(svims_df2[,c("sets", "overlap")]);
   svims_split_names3 <- jamba::nameVector(svims_df2[,c("overlap", "sets")]);
   
   # optionally clean up the label
   if (trim_label) {
      svims_df2[,"overlap_label"] <- gsub("^[ ]+|[ ]+$", "",
         gsub("[ ]+", " ",
         gsub("0", "",
         svims_df2[,overlap_type])));
   }
   
   # optionally include items
   if (return_items) {
      svitems_split <- split(rownames(svims), svimsv_olt);
      imatch <- match(rownames(svims_df2), names(svitems_split));
      svims_df2$items <- I(svitems_split[imatch]);
   }
   attr(svims_df2, "overlap_type") <- overlap_type;
   return(svims_df2);
}

#' Make full Venn combination data.frame
#' 
#' Make full Venn combination data.frame
#' 
#' This function returns a `data.frame` with all possible
#' combinations of set overlaps for the number of sets
#' provided.
#' 
#' @return `data.frame` where rownames indicate each possible
#'    Venn overlap, colnames indicate each set name, and values
#'    are `0` or `1` indicating whether each set should have
#'    a value in each row.
#' 
#' @family venndir utility
#' 
#' @param x either character vector of set names, or integer number
#'    of sets.
#' @param include_zero logical indicating whether to include one
#'    row with all zeros, in the event of counting factors where
#'    some factor levels may not be present in any sets.
#' @param sep character separator used when combining set names into
#'    a single label.
#' @param ... additional arguments are ignored.
#' 
#' @examples
#' make_venn_combn_df(3);
#' make_venn_combn_df(3, include_zero=TRUE);
#' 
#' make_venn_combn_df(letters[1:3]);
#' 
#' @export
make_venn_combn_df <- function
(x,
 include_zero=FALSE,
 sep="&",
 ...)
{
   if (is.numeric(x) && length(x) == 1 && round(x) == x) {
      x <- paste0("set_", seq_len(x));
   }
   setdf <- data.frame(
      check.names=FALSE,
      stringsAsFactors=FALSE,
      as.list(
         jamba::nameVector(
            rep(0, length(x)),
            x)));
   setdfs <- jamba::rbindList(lapply(seq_along(x), function(i){
      combn_m <- t(combn(x, m=i));
      rownames(combn_m) <- unname(jamba::pasteByRow(combn_m, sep=sep));
      jamba::rbindList(lapply(rownames(combn_m), function(k){
         j <- combn_m[k,];
         setdf1 <- setdf;
         setdf1[,j] <- 1;
         rownames(setdf1) <- k;
         setdf1;
      }))
   }));
   if (include_zero) {
      rownames(setdf) <- "none";
      setdfs <- rbind(setdf, setdfs);
   }
   return(setdfs);
}
jmw86069/venndir documentation built on Nov. 14, 2024, 10:12 a.m.