R/misc_utils.R

Defines functions error seq_along2 rep_len2

Documented in rep_len2 seq_along2

##########
########## KH additional functions
##########

## #' @name eclusters2
## #' @title gGraph R6 public method eclusters2
## #' 
## #' @description
## #' Marks ALT edges belonging (quasi) reciprocal cycles
## #'
## #' 
## #' @param juncs GRangesList of junctions
## #' @param mc.cores parallel
## #' @param only_chains TRUE will only pair breakend to its nearest nearest neighbor IFF the nearest neighbor is reciprocal, see arguments to "strict" for 3 different matching heuristics
## #' @param max.small size below which simple dups and dels are excluded
## #' @param weak logical flag if TRUE will not differentiate between cycles and paths and will return all weakly connected clusters in the junction graph [FALSE]
## #' @param strict Only active if only_chains = TRUE. Can be one of "strict", "one_to_one", or "loose". \cr
## #' "strict": each breakend can only be "monogamously" matched to one other breakend and if the nearest breakend is of the wrong orientation, it is thrown out. \cr
## #' "one_to_one" breakends can only be coupled to a single "monogamous" match but without considering breakends of the wrong orientation. If breakend B is nearest to C, B will only be matched to C. If A's nearest breakend is B but is further away than C, A will not be matched to B. \cr
## #' "loose": the nearest breakend in the correct orientation under the threshold is considered. The same as one_to_one except A will be matched to B, while B will be matched to C. \cr
## #' @param ignore.isolated If TRUE, all simple duplications, duplications nested with only other duplications, and simple deletions without any breakends will be thrown out.
## #' @return gGraph object with edges marked with ecluster id and metadata in gEdge, gNode, and $meta
## #' @author Marcin Imielinski
## eclusters2 = function (thresh = 1000, weak = TRUE, paths = !weak,
##                            mc.cores = 1, verbose = FALSE, chunksize = 1e+30, method = "single",
##                            return_pairs = FALSE, ignore.small = TRUE,
##                            max.small = 1e4, ignore.isolated = TRUE,
##                            strict = c("strict", "one_to_one", "loose"),
##                            min.isolated = max.small,
##                            only_chains = FALSE) {


##   if (!is.character(strict) || any(!strict %in% c("strict", "one_to_one", "loose"))) {
##     stop("strict must be one of 'strict', 'one_to_one', or 'loose'")
##   } else if (length(strict) > 1) {
##     strict = "one_to_one"
##   }
##   self$edges$mark(ecluster = as.integer(NA))
##   altedges = self$edges[type == "ALT", ]
##   if (length(altedges) == 0) {
##     if (verbose) {
##       gmessage("No junction in this graph")
##     }
##     return(NULL)
##   }
##   if (ignore.small) {
##       altedges = altedges[!((class == "DUP-like" | class == "DEL-like") & altedges$span <= max.small)]
##   }
##   if (length(altedges) == 0) {
##     if (verbose) {
##       gmessage("No junction in this graph")
##     }
##     return(NULL)
##   }
##   deldup = altedges[class %in% c("DUP-like", "DEL-like")]
##   ## below removes non-nested events below size threshold
##   ## and simple or nested duplications with no subsumed breakends
##   if (length(deldup) > 0 && ignore.isolated) {
##     altes = deldup$shadow
##     ## altes$sedge.id = altedges[class %in% c("DUP-like", "DEL-like")]$dt[altes$id]$sedge.id
##     bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix", "class", "sedge.id")]
##     bp$sedge.id.y = bp$sedge.id; bp$sedge.id = NULL
##     addon = deldup$dt[altes$id][, .(sedge.id, class)]
##     altes$sedge.id = addon$sedge.id
##     altes$class = addon$class
##     altes$nbp = altes %N% bp # number of breakpoints of any SV that fall within segment
##     numsum = altedges$shadow %>% gr.sum # using the shadows of all of the SVs not just dels and dups
##     altes = altes %$% numsum
##     iso = ((altes) %Q% (score == 1.0))$id
##     ## rm.edges = unique(altes[iso] %Q% (width < thresh))$sedge.id ## old
##     rm.edges = unique(altes[iso] %Q% (width < min.isolated))$sedge.id
##     rm.dups = S4Vectors::with(altes, sedge.id[class == "DUP-like" & nbp <= 2])
##     rm.dups = c(rm.dups, dedup.cols(gr2dt(altes %*% bp))[sedge.id != sedge.id.y][class == "DUP-like"][, .(all(1:2 %in% grl.iix), class.1 = class.1[1]), by = .(sedge.id, sedge.id.y)][, all(V1 == TRUE) & all(class.1 == "DUP-like"), by = sedge.id][V1 == TRUE]$sedge.id) # removing dups that have only other nested dups 
##     rm.edges = union(rm.edges, rm.dups)
##     keepeid = setdiff(altedges$dt$sedge.id, rm.edges)
##     altedges = altedges[as.character(keepeid)]
##   } # ignoring isolated dup and del edges that are smaller than threshold
##   if (verbose & weak)
##     message("Computing weak eclusters")
##   if (length(altedges) == 0) {
##     if (verbose) {
##       gmessage("No junction in this graph")
##     }
##     return(NULL)
##   }
##   bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix", "edge.id")]
##   bp$m.ix = seq_along(bp)
##   bp.dt = gr2dt(bp)
##   ix = split(1:length(bp), ceiling(runif(length(bp)) * ceiling(length(bp)/chunksize)))
##   ixu = unlist(ix)
##   eps = 1e-09
##   ## ij = do.call(rbind, split(1:length(bp), bp$grl.ix))
##   xt.adj = xt.adj0 = Matrix::sparseMatrix(1, 1, x = 0, dims = rep(length(bp),
##                                                         2))
##   if (verbose) {
##     message(sprintf("Computing junction graph across %s ALT edges with distance threshold %s",
##                     length(altedges), thresh))
##   }
##   if (!exists(".INF")) {
##     .INF = pmax(sum(seqlengths(self)), 1e+09)
##   }
##   bp.pair = as.matrix(dcast(data.table(ix = bp$m.ix,
##                              grl.ix = bp$grl.ix,
##                              grl.iix = bp$grl.iix),
##                   grl.ix ~ grl.iix, value.var = "ix")[,2:3])
##   bp.pair = rbind(bp.pair, cbind(bp.pair[,2], bp.pair[,1]))
##   ifun = function(iix, ignore.strand = FALSE,
##                   verbose = FALSE, eps = 1e-9) {
##     if (verbose > 1)
##       cat(".")
##     tmpm = gr.dist(bp[iix], gr.flipstrand(bp), ignore.strand = ignore.strand) +
##       eps
##     return(as(tmpm, "Matrix"))
##   }
##   xt.adj[ixu, ] = do.call(rbind,
##                           mclapply(ix, ifun, ignore.strand = FALSE,
##                                    mc.cores = mc.cores))
##   diag(xt.adj) = NA_real_
##   ## only_chains = TRUE, enforcing that nearest breakpoints are only considered
##   if (only_chains) {
    
##     ## enforcing that no distances between breakends from the same junction  are considered
##     xt.adj[bp.pair] = NA_real_
##     xt.adj0[ixu, ] = do.call(rbind,
##                              mclapply(ix, ifun, ignore.strand = TRUE,
##                                       mc.cores = mc.cores)) # this is to find nearest breakends, regardless of orientation
##     ## enforcing that self-to-self breakend distances are not considered
##     diag(xt.adj0) = NA_real_
##     ## enforcing that no distances between breakends from the same junction  are considered
##     xt.adj0[bp.pair] = NA_real_
##     suppressWarnings({
##       nearest_ix = dunlist(lapply(
##         seq_len(nrow(xt.adj)),
##         function(x) which(xt.adj[x,] == min(xt.adj[x,], na.rm = T)))) %>%
##         as.matrix
##       nearest0_ix = dunlist(lapply(
##         seq_len(nrow(xt.adj0)),
##         function(x) which(xt.adj0[x,] == min(xt.adj0[x,], na.rm = T)))) %>%
##         as.matrix
##     })
##     if (nrow(nearest_ix) & nrow(nearest0_ix)) {
##       if (strict == "strict") {
##         nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
##         nearest0_ix = cbind(rowMins(nearest0_ix), rowMaxs(nearest0_ix))
##         nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
##         nearest0_ix = nearest0_ix[duplicated(nearest0_ix),,drop = FALSE]
##         nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
##         nearest0_ix = rbind(nearest0_ix, cbind(nearest0_ix[,2], nearest0_ix[,1]))
##         nearest_ix = as.matrix(merge(nearest_ix, nearest0_ix)) # if there is a breakend closer but in the wrong orientation, that distance will be thrown out downstream, i.e. there is a one to one match of breakend and any nearest breakend that is in the wrong orientation disqualifies the clustser
##       } else if (strict == "one_to_one") {
##         nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
##         nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
##         nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
##         nearest_ix = as.matrix(nearest_ix) # only one-to-one breakends are considered
##       } else if (strict == "loose") {
##         nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
##         nearest_ix = nearest_ix[!duplicated(nearest_ix),,drop = FALSE]
##       }
##     }
##   } else if (!only_chains) {
##     xt.adj[bp.pair] = 1
##   }
##   ## rm(xt.adj0)
##   adj = xt.adj
##   xt.adj[which(is.na(as.matrix(xt.adj)))] = .INF + 1
##   adj[which(is.na(as.matrix(adj)))] = 0
##   adj[which(as.matrix(adj) > thresh)] = 0
##   if (only_chains && nrow(nearest_ix)) {
##     tmp = xt.adj[nearest_ix]
##     xt.adj[] = .INF + 1
##     adj[] = 0
##     xt.adj[nearest_ix] = tmp
##     adj[nearest_ix] = tmp
##   }
##   dt = Matrix::which(xt.adj < thresh, arr.ind = T)
##   dt = unique(data.table(cbind(rowMins(dt), rowMaxs(dt))))
##   dt[, bp.dist := xt.adj[dt[, cbind(V1, V2)]]]
##   dt$sign = ifelse(strand(bp[dt$V1]) == "+",
##             ifelse(gr.flipstrand(bp[dt$V1]) > bp[dt$V2], -1, 1),
##             ifelse(gr.flipstrand(bp[dt$V1]) < bp[dt$V2], -1, 1))
##   dt2 = dt[,idx := seq_len(.N)] %>% melt(measure.vars = c("V1", "V2"))
##   meta = cbind(gr2dt(bp)[dt2$value], dt2)[order(idx)]
##   meta = rbind(meta,
##               gr2dt(bp)[paste(grl.ix, grl.iix) %nin% meta[, paste(grl.ix, grl.iix)]],
##               fill = T)
##   if (return_pairs) {
##     return(meta)
##   }
##   hcl = stats::hclust(as.dist(xt.adj), method = "single")
##   hcl.lbl = cutree(hcl, h = thresh)
##   bp.dt$hcl = hcl.lbl
##   bp.hcl = bp.dt[, .(hcl.1 = .SD[grl.iix == 1, hcl], hcl.2 = .SD[grl.iix ==
##                                                                  2, hcl]), keyby = grl.ix]
##   altedges$mark(hcl.1 = bp.hcl[.(seq_along(altedges)), hcl.1])
##   altedges$mark(hcl.2 = bp.hcl[.(seq_along(altedges)), hcl.2])
##   hcl.ig = igraph::graph_from_edgelist(bp.hcl[, unique(cbind(hcl.1,
##                                                              hcl.2))], directed = FALSE)
##   hcl.comp = components(hcl.ig)
##   altedges$mark(ehcl = as.integer(hcl.comp$membership)[bp.hcl[,
##                                                               hcl.1]])
##   adj[adj > thresh] = 0
##   refg = self[, type == "REF"]
##   bpp = Matrix::which(adj != 0, arr.ind = TRUE)
##   dref = pdist(bp[bpp[, 1]], bp[bpp[, 2]])
##   drefg = diag(refg$dist(bp[bpp[, 1]], bp[bpp[, 2]]))
##   ix = which(drefg > dref)
##   if (length(ix))
##     adj[bpp[ix, , drop = FALSE]] = FALSE
##   if (verbose > 1)
##     cat("\n")
##   adj = adj | t(adj)
##   junpos = bp1 = bp$grl.iix == 1
##   junneg = bp2 = bp$grl.iix == 2
##   adj2 = adj & FALSE
##   adj2[junpos, junpos] = adj[bp2, bp1]
##   adj2[junpos, junneg] = adj[bp2, bp2]
##   adj2[junneg, junpos] = adj[bp1, bp1]
##   adj2[junneg, junneg] = adj[bp1, bp2]
##   if (verbose)
##     message(sprintf("Created basic junction graph using distance threshold of %s",
##                     thresh))
##   cl = split(1:length(bp), igraph::clusters(graph.adjacency(adj2),
##                                             ifelse(weak, "weak", "strong"))$membership)
##   cl = cl[S4Vectors::elementNROWS(cl) > 1]
##   cl = cl[order(S4Vectors::elementNROWS(cl))]
##   ## browser()
##   ## jcl = lapply(cl, function(x) unique(sort(bp$grl.ix[x])))
##   jcl = lapply(cl, function(x) unique(sort(bp$edge.id[x])))
##   jcls = sapply(jcl, paste, collapse = " ")
##   jcl = jcl[!duplicated(jcls)]
##   adj3 = adj2
##   altedges$mark(ecycle = as.character(NA))
##   if (length(jcl) > 0) {
##     dcl = dunlist(unname(jcl))[, `:=`(listid, paste0(ifelse(weak,
##                                                             "", "c"), listid))]
##     if (!weak)
##         ## altedges[dcl$V1]$mark(ecycle = dcl$listid)
##         altedges[as.character(dcl$V1)]$mark(ecycle = dcl$listid)
##     ## altedges[dcl$V1]$mark(ecluster = dcl$listid)
##     altedges[as.character(dcl$V1)]$mark(ecluster = dcl$listid)
##     meta = merge(meta, altedges$dt[, .(edge.id, ecluster, hcl.1, hcl.2, ehcl, ecycle)], by = "edge.id")
##     meta[!is.na(ecluster),
##          `:=`(
##            nclust = length(unique(edge.id)),
##            all_positive = all(replace(sign, is.na(sign),  3e9) > 0),
##            all_negative = all(replace(sign, is.na(sign), -3e9) < 0),
##            mixed = {naom = na.omit(sign); any(naom > 0) & any(naom < 0)},
##            bridge = anyNA(bp.dist)
##            ),
##          by = ecluster]
##     meta[, `:=`(
##         num_positive = sum(sign[!duplicated(idx)] > 0, na.rm = T),
##         num_negative = sum(sign[!duplicated(idx)] < 0, na.rm = T)
##     ),
##     by = ecluster]
##     self$set(recip_bp = meta)
##     if (verbose) {
##         message("Annotated weakly connected junction clusters and added ecluster pair metadata")
##     }
##   }
##   if (verbose)
##     message(sprintf("Annotated %s junction cycles in edge field $ecycle",
##                     length(jcl)))
##   if (paths & !weak) {
##     if (verbose)
##       message("Analyzing paths")
##     if (length(jcl) > 0) {
##       adj3[unlist(jcl), unlist(jcl)] = FALSE
##     }
##     sinks = Matrix::which(Matrix::rowSums(adj3) == 0)
##     sources = Matrix::which(Matrix::colSums(adj3) == 0)
##     cl2 = split(1:length(bp), igraph::clusters(graph.adjacency(adj3),
##                                                "weak")$membership)
##     cl2 = cl2[S4Vectors::elementNROWS(cl2) > 1]
##     if (any(ix <- S4Vectors::elementNROWS(cl2) > 2)) {
##       cl3 = do.call(c, mclapply(cl2[ix], function(x) {
##         tmp.adj = adj3[x, x]
##         lapply(all.paths(tmp.adj, sources = sources,
##                          sinks = sinks, verbose = verbose)$paths, function(i) x[i])
##       }, mc.cores = mc.cores))
##       cl2 = c(cl2[!ix], cl3)
##     }
##     jcl2 = lapply(cl2, function(x) unique(sort(bp$grl.ix[x])))
##     jcls2 = sapply(jcl2, paste, collapse = " ")
##     jcl2 = jcl2[!duplicated(jcls2)]
##     altedges$mark(epath = as.character(NA))
##     if (length(jcl2) > 0) {
##       dcl2 = dunlist(unname(jcl2))[, `:=`(listid, paste0("p",
##                                                          listid))]
##       altedges[dcl2$V1]$mark(epath = dcl2$listid)
##       self$edges$mark(ecluster =
##                         ifelse(is.na(self$edges$dt$ecycle) &
##                                is.na(self$edges$dt$epath),
##                                as.character(NA),
##                                paste0(ifelse(is.na(self$edges$dt$ecycle), "",
##                                              self$edges$dt$ecycle),
##                                       ifelse(is.na(self$edges$dt$epath),
##                                              "", self$edges$dt$epath))))
##     }
##     if (verbose)
##       message(sprintf("Annotated %s paths in edge field $epath",
##                       length(jcl2)))
##   }
##   return(invisible(self))
## }
## ## gGraph$public_methods$eclusters2 = tmpeclustpairs # if replacing a binding

## #' @description
## #' make eclusters
## gGraph$public_methods$eclusters2 = NULL; gGraph$set("public", "eclusters2", eclusters2)



#' @name copy3
#' @title make deep copy, recursively
#'
#' useful for dev
#' makes deep copy of R6 object, S4 object, or anything else really
#'
copy3 = function (x, recurse_list = TRUE) {
    if (inherits(x, "R6")) {
        x2 = rlang::duplicate(x$clone(deep = T))
        for (name in intersect(names(x2$.__enclos_env__), c("private", 
            "public"))) for (nname in names(x2$.__enclos_env__[[name]])) tryCatch({
            x2$.__enclos_env__[[name]][[nname]] = copy3(x2$.__enclos_env__[[name]][[nname]])
        }, error = function(e) NULL)
        return(x2)
    } else if (isS4(x)) {
        x2 = rlang::duplicate(x)
        slns = slotNames(x2)
        for (sln in slns) {
            tryCatch({
                slot(x2, sln) = copy3(slot(x2, sln))
            }, error = function(e) NULL)
        }
        return(x2)
    } else if (inherits(x, c("list"))) {
        x2 = rlang::duplicate(x)
        x2 = rapply(x2, copy3, how = "replace")
        return(x2)
    } else {
        x2 = rlang::duplicate(x)
        return(x2)
    }
}


#' @name rep_len2
#' @title recycle vector along length OR nrow of object
#'
#'
#' @author Kevin Hadi
#' @param x data
#' @param objalong any object to recycle x along if uselen = TRUE, or an actual integer value if uselen = FALSE
#' @return vector
rep_len2 = function(x, objalong, uselen = TRUE) {
    if (uselen)
        rep(x, length.out = NROW(objalong))
    else
        rep(x, length.out = objalong)
}

#' @name seq_along2
#' @title seq along either row of table or length of vector
#'
#'
#' @author Kevin Hadi
#' @param x data
#' @return vector
seq_along2 = function(x)  {
  seq_len(NROW(x))
}


#' @name rleseq
#' @title numbers up within repeating elements of a vector
#'
#' @description
#' returns unique id within each unique element of a vector or set of provided vectors
#' and also a running id within each unique element
#'
#' @param ... Vector(s) to identify with unique id and a running id within each unique id
#' @param clump a logical specifying if duplicates are to be counted together
#' @param recurs a logical that is meant to only be set by the function when using clump = TRUE
#' @return a list of idx and seq
#' @author Kevin Hadi
rleseq = function (..., clump = TRUE, recurs = FALSE, na.clump = TRUE, 
                   na.ignore = FALSE, sep = paste0(" ", rand.string(length = 6), 
                     " "), use.data.table = TRUE) 
{
  force(sep)
  out = if (use.data.table) {
    tryCatch(
    {
      dt = data.table(...)
      setnames(dt, make.names(rep("", ncol(dt)), unique = T))
      ## make.unique
      cmd = sprintf("dt[, I := .I][, .(idx = .GRP, seq = seq_len(.N), lns = .N, I), by = %s]", mkst(colnames(dt), "list"))
      dt = eval(parse(text = cmd))
      setkey(dt, I)[, .(idx, seq, lns)]
    }, error = function(e) structure("data table didn't work...", class = "err"))
  }
  if (!(is.null(out) || class(out)[1] == "err"))
    return(as.list(out))
  rand.string <- function(n = 1, length = 12) {
    randomString <- c(1:n)
    for (i in 1:n) {
      randomString[i] <- paste(sample(c(0:9, letters, LETTERS), 
        length, replace = TRUE), collapse = "")
    }
    return(randomString)
  }
  if (isTRUE(na.clump)) 
    paste = function(..., sep) base::paste(..., sep = sep)
  else paste = function(..., sep) base::paste(stringr::str_c(..., 
    sep = sep))
  lns = base::lengths(list(...))
  if (!all(lns == lns[1])) 
    warning("not all vectors provided have same length")
  fulllens = max(lns, na.rm = T)
  vec = setNames(paste(..., sep = sep), seq_len(fulllens))
  if (length(vec) == 0) {
    out = list(idx = integer(0), seq = integer(0), lns = integer(0))
    return(out)
  }
  if (na.ignore) {
    isnotna = which(rowSums(as.data.frame(lapply(list(...), 
      is.na))) == 0)
    out = list(idx = rep(NA, fulllens), seq = rep(NA, fulllens), 
      lns = rep(NA, fulllens))
    if (length(isnotna)) 
      vec = vec[isnotna]
    tmpout = do.call(rleseq, c(alist(... = vec), alist(clump = clump, 
      recurs = recurs, na.clump = na.clump, na.ignore = FALSE, use.data.table = FALSE)))
    for (i in seq_along(out)) out[[i]][isnotna] = tmpout[[i]]
    return(out)
  }
  if (!isTRUE(clump)) {
    rlev = rle(vec)
    if (isTRUE(recurs)) {
      return(unlist(unname(lapply(rlev$lengths, seq_len))))
    }
    else {
      out = list(idx = rep(seq_along(rlev$lengths), times = rlev$lengths), 
        seq = unlist(unname(lapply(rlev$lengths, seq_len))))
      out$lns = ave(out[[1]], out[[1]], FUN = length)
      return(out)
    }
  }
  else {
    if (!isTRUE(na.clump)) {
      vec = replace2(vec, which(x == "NA"), dedup(dg(x)[dg(x) == 
                                                          "NA"]))
    }
    vec = setNames(vec, seq_along(vec))
    lst = split(vec, factor(vec, levels = unique(vec)))
    ord = as.integer(names(unlist(unname(lst))))
    idx = rep(seq_along(lst), times = base::lengths(lst))
    out = list(idx = idx[order(ord)], seq = rleseq(idx, clump = FALSE, 
      recurs = TRUE, use.data.table = FALSE)[order(ord)])
    out$lns = ave(out[[1]], out[[1]], FUN = length)
    return(out)
  }
}


#' @name match3
#' @title similar to setkey except a general use utility
#'
#' very slow version of keying a la data.table
#' but for general/interactive use
#' @author Kevin Hadi
match3 = function(x, table, nomatch = NA_integer_, old = TRUE, use.data.table = TRUE) {
  out = if (use.data.table) {
    tryCatch({
      dx = data.table(x = x)[, id.x := seq_len(.N)]
      dtb = data.table(table = table)[, id.tb := seq_len(.N)]
      ## setkey(dx, x)[list(dtb$table)]$id.x
      setkey(dtb, table)[list(dx$x)]$id.tb
    }, error = function(e) structure("err", class = "err"))
  }
  if (!is.null(out) && !inherits(out, "err")) return(out)
  if (old) {
    dx = within(data.frame(x = x), {id.x = seq_along(x)})
    dtb = within(data.frame(table = table), {id.tb = seq_along(table)})
    res = merge(dx, dtb, by.x = "x", by.y = "table", all.x = TRUE,
      allow.cartesian = TRUE)
    return(res$id.tb[order(res$id.x)])
  } else  {
    m = match(table,x)
    mat = cbind(m, seq_along(m))
    mat = mat[!is.na(mat[,1]),,drop=FALSE]
    mat = mat[order(mat[,1], na.last = FALSE),,drop = FALSE]
    mat = cbind(mat, seq_len(dim(mat)[1]))
    m2 = match(x,table)
    ix = which(!duplicated(m2) & !is.na(m2))
    mat_rix = unlist(rep(split(mat[,3], mat[,1]), base::tabulate(m2)[m2][ix]))
    ## mat_rix = unlist(rep(split(mat[,3], mat[,1]), base::tabulate(m2)[m2][ix]))
    ix = rep(1, length.out = length(m2))
    ## original line
    ## ix[!is.na(m2)] = base::tabulate(m)[!is.na(m2)]
    ix[!is.na(m2)] = base::tabulate(m)[m][m2][!is.na(m2)]
    out = rep(m2, ix)
    out[!is.na(out)] = mat[mat_rix,,drop=F][,2]
    return(out)
    ## m = match(table, x)
    ## mat = cbind(m, seq_along(m))
    ## mat = mat[!is.na(mat[, 1]), , drop = FALSE]
    ## mat = mat[order(mat[, 1]), , drop = FALSE]
    ## mat = cbind(mat, seq_len(dim(mat)[1]))
    ## m2 = match(x, table)
    ## ix = which(!duplicated(m2))
    ## mat_rix = unlist(rep(split(mat[, 3], mat[, 1]), base::tabulate(m2)[m2][ix]))
    ## mat[mat_rix, , drop = F][, 2]
  }
}


#' @name dedup.cols
#' @title applies dedup to colnames
#'
#' dedup the column names of a data.frame/data.table
#'
#' @return A data.table or data.frame
dedup.cols = function(tbl, remove = FALSE) {
    if (remove) {
        if (!inherits(tbl, "data.table"))
            return(tbl[, match(unique(colnames(tbl)), colnames(tbl))])
        else
            return(tbl[, match(unique(colnames(tbl)), colnames(tbl)), with = FALSE])
    } else {
            colnames(tbl) = base::make.unique(colnames(tbl))
            return(tbl)
    }
}


#' @name rowMins
#' @title rowMins hack
#'
#'
#' @return A vector
rowMins = function(x) {
  do.call(pmin, as.data.frame(x))
}

#' @name rowMaxs
#' @title rowMaxs hack
#'
#'
#' @return A vector
rowMaxs = function(x) {
  do.call(pmax, as.data.frame(x))
}

#' @name not.in
#' @title not.in
#'
#' @description
#' not in
#'
#' @param x value to test
#' @param table table to test x against
#'
#' @noRd
`%nin%` = function (x, table)
{
    match(x, table, nomatch = 0L) == 0L
}

#' @name gr.noval
#' @title get rid of mcols on GRanges/GRangesLists
#' @description
#'
#' remove all metadata from GRanges or GRangesList
#'
#' @return GRanges or GRangesList
#' @author Kevin Hadi
gr.noval = function(gr, keep.col = NULL, drop.col = NULL) {
    if (is.null(keep.col) & is.null(drop.col)) {
        select_col = NULL
    } else {
        all_col = colnames(gr@elementMetadata)
        if (inherits(gr, "GRangesList")) {
            all_col = c(all_col, colnames(gr@unlistData@elementMetadata))
        }

        if (!is.null(keep.col) & is.null(drop.col)) {
            select_col = intersect(all_col, keep.col)
        } else if (is.null(keep.col) & !is.null(drop.col)) {
            select_col = setdiff(all_col, drop.col)
        } else if (!is.null(keep.col) && !is.null(drop.col)) {
            if (intersect(keep.col, drop.col) > 0) {
                warning("drop.col and keep.col args have overlapping elements\nkeeping the columns that overlap")
                select_col = intersect(setdiff(all_col, setdiff(drop.col, keep.col)), keep.col)
            }
        }
    }
    if (inherits(gr, "GRangesList")) {
        tmp_query = intersect(select_col, colnames(gr@unlistData@elementMetadata))
        gr@unlistData@elementMetadata = gr@unlistData@elementMetadata[,c(tmp_query), drop = FALSE]
    }
    tmp_query = intersect(select_col, colnames(gr@elementMetadata))
    gr@elementMetadata = gr@elementMetadata[,c(tmp_query),drop = FALSE]
    return(gr)
}


#' @name setcols
#' @title convenience function to set columns
#'
#' sets columns of an object
#'
#' @param dt data frame/table or matrix
#' @param old integer or character or logical vector corresponding to current colnames in dt
#' @param new character vector for new column names
#' @return colnamed object
#' @author Kevin Hadi
setcols = function(dt, old, new) {
  if (inherits(dt, c("GRanges", "GRangesList"))) {
    mcols(dt) = setcols(mcols(dt), old, new)
    return(dt)
  }
  cnames = colnames2(dt)
  if (missing(new) || missing(old)) {
    if (missing(old)) {
      old = new
    }
    if (is.character(old) && length(old) == length(cnames)) {
      colnames(dt) = old
      return(dt)
    } else {
      stop("names provided must be same length as ncol(dt)")
    }
  }
  if (is.character(old)) {
    out = merge(data.frame(cnames, seq_along(cnames)), data.frame(cnames = old, new = new),
      allow.cartesian = T)
    cnames[out[[2]]] = out[[3]]
    colnames(dt) = cnames
    return(dt)
  }
  if (is.logical(old)) {
    if (! length(old) == length(cnames)) stop("logical vector must be same length as ncol(dt)")
    old = which(old)
  }
  cnames[old] = new
  colnames(dt) = cnames
  return(dt)
}

#' @name colnames2
#' @title robust colnames
#'
#' gives back character vector same number of columns of input regardless whether named or not
#'
#' @param str a path string
#' @return a string with multiple parentheses replaced with a single parenthesis
#' @author Kevin Hadi
colnames2 = function(x) {
    nm = colnames(x)
    if (is.null(nm))
        return(rep("", length.out = NCOL(x)))
    else
        return(nm)
}




#' @name %K%
#' @title similar to setkey except a general use utility
#'
#' slower version of setkey, but for interactive use
#'
#' @author Kevin Hadi
`%K%` = function(thisx,thisy, old = TRUE) {
    if (old)
        return(match3(table = thisx, x = thisy, old = TRUE))
    else
        return(match3(table = thisx, x = thisy, old = FALSE))
}


##' @name enframe_list
##' @title data table-ize list elements and add name column
##'
##' @description
##'
##' @author Kevin Hadi
#enframe_list = function(lst, name = "name", value = "value", as.data.table = TRUE, rbind = TRUE, mc.cores = 1) {
#    nms = names(lst)
#    expr = parse(text = sprintf("cbind(%s = nm, df)", name, value))
#    out = mcmapply(function(el, nm) {
#        if (NROW(el)) {
#            if (is.null(dim(el)))
#                df = setColnames(as.data.frame(el), value)
#            else
#                df = as.data.frame(el)
#            if (as.data.table)
#                setDT(df)
#            eval(expr)
#        }
#    }, lst, nms, SIMPLIFY = FALSE, mc.cores = mc.cores)
#    if (rbind) {
#        if (as.data.table)
#            return(rbindlist(out))
#        else
#            return(do.call(rbind, out))
#    }
#}

##' @name setColnames
##' @title convenience function to set column names
##'
##' @param object tabled object
##' @param nm names of the new columns
##' @return colnamed object
##' @author Kevin Hadi
#setColnames = function(object = nm, nm = NULL, pattern = NULL, replacement = "") {
#    if (!is.null(nm)) {
#        if (is.null(names(nm)))
#            colnames2(object)  = nm
#        else {
#            ix = match3(names(nm), colnames(object))
#            colnames2(object)[ix] = nm
#        }
#    } else if (!is.null(pattern)) {
#        colnames2(object) = gsub(pattern, replacement, colnames2(object))
#    }
#    return(object)
#}

#' @name isNA
#' @title is.na but also tests for "NA" character
#'
#' @description
#'
#' @author Kevin Hadi
isNA = function(x, na.char = c("NA", "NULL", "na", "null")) {
    if (is.character(x)) {
        return(is.na(x) | x %in% na.char)
    } else {
        return(is.na(x))
    }
}

#' @name na2false
#' @title replace logical vector with NA to FALSE
#'
#' @description
#' A convenience function to set a logical vector with NAs to false
#'
#' @return A logical vector with NAs set to FALSE
#' @author Kevin Hadi
na2false = function(v)
{
    v[isNA(v)] = FALSE
    ## mode(v) = "logical"
    v
}

#' @name rand.string
#' @title make a random string
#'
#' @return random string
#' @author Someone from Stackoverflow
rand.string <- function(n=1, length=12)
{
    randomString <- c(1:n)                  # initialize vector
    for (i in 1:n)
    {
        randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
                                        length, replace=TRUE),
                                 collapse="")
    }
    return(randomString)
}

#' @name gr.spreduce
#' @title reduce based on a field(s) to split by in elementMetadata of GRanges, or given vector
#' @description
#'
#' split and reduce GRanges by field(s)
#' if providing a variable not already within the GRanges,
#' may need to use dynget(variable_name)
#'
#' @return GRanges
#' @author Kevin Hadi
gr.spreduce = function(gr,  ..., ignore.strand = FALSE, pad = 0, return.grl = FALSE, sep = paste0(" ", rand.string(length = 8), " ")) {
  lst = as.list(match.call())[-1]
  ix = which(!names(lst) %in% c("gr", "sep", "pad", "ignore.strand", "return.grl"))
  vars = unlist(sapply(lst[ix], function(x) unlist(sapply(x, toString))))
  if (length(vars) == 1) {
    if (!vars %in% colnames(mcols(gr)))
      vars = tryCatch(unlist(list(...)), error = function(e) vars)
  }
  if (!all(vars %in% colnames(mcols(gr))))
    stop("Must specify valid metadata columns in gr")
  tmpix = do.call(
    function(...) paste(..., sep = sep),
    as.list(mcols(gr)[,vars, drop = F]))
  unix = which(!duplicated(tmpix))
  tmpix = factor(tmpix, levels = tmpix[unix])
  grl = unname(gr.noval(gr) %>% GenomicRanges::split(tmpix))
  grl = GenomicRanges::reduce(grl + pad, ignore.strand = ignore.strand)
  if (return.grl) {
    mcols(grl) = mcols(gr)[unix,vars,drop = F]
    return(grl)
  } else {
    out = unlist(grl)
    mcols(out) = mcols(gr)[rep(unix, times = IRanges::width(grl@partitioning)),
      vars,drop = F]
    return(out)
  }
}

#' @name duped
#' @title duped
#'
#'
#'
#' @param ... vectors to paste by
#' @author Kevin Hadi
duped = function(..., binder = "data.table") {
    duplicated(tryCatch(et(sprintf("%s(...)", binder)),
                        ## error = function(e) do.call(cbind, list(...))))
                        error = function(e) paste(...)))
}

#' @name merge.repl
#' @title merging data tables with collapsing columns with the same name
#'
#' Merge two data tables with various replacing strategies
#' for columns common between x and y that are not used to merge
#' (i.e. not specified in the "by" argument)
#'
#' @param replace_NA logical, only use values in dt.y, any dt.x not in dt.y is clobbered (NA)
#' @param force_y logical, should x and y common columns be merged?
#' @param overwrite_x logical, if force_y = TRUE, should NA values in y replace x?
#' @return A data.table
#' @author Kevin Hadi
merge.repl = function(dt.x,
                      dt.y,
                      sep = "_",
                      replace_NA = TRUE,
                      force_y = TRUE,
                      overwrite_x = FALSE,
                      keep_order = FALSE,
                      keep_colorder = TRUE,
                      keep_factor = TRUE,
                      ...) {
    arg_lst = as.list(match.call())
    by.y = eval(arg_lst[['by.y']], parent.frame())
    by.x = eval(arg_lst[['by.x']], parent.frame())
    by = eval(arg_lst[['by']], parent.frame())
    all.x = eval(arg_lst[['all.x']], parent.frame())
    all.y = eval(arg_lst[['all.y']], parent.frame())
    all = eval(arg_lst[['all']], parent.frame())
    allow.cartesian = eval(arg_lst[['allow.cartesian']])
    key_x = key(dt.x)
    if (is.null(all.x)) {
        all.x = TRUE
    }
    if (is.null(all.y)) {
        all.y = FALSE
    }
    if (!is.null(all) && all) {
        all.y = TRUE
        all.x = TRUE
    }
    if (is.null(allow.cartesian)) {
        allow.cartesian = FALSE
    }
    if (!inherits(dt.x, "data.table")) {
        dt.x = as.data.table(dt.x)
    }
    if (!inherits(dt.y, "data.table")) {
        dt.y = as.data.table(dt.y)
    }
    if (keep_order == TRUE) {
        dt.x[['tmp.2345098712340987']] = seq_len(nrow(dt.x))
    }

    dt.x[['in.x.2345098712340987']] = rep(TRUE, length.out = nrow(dt.x))
    dt.y[['in.y.2345098712340987']] = rep(TRUE, length.out = nrow(dt.y))

    new_ddd_args = list(by = by, by.x = by.x, by.y = by.y, all.x = all.x, all.y = all.y, allow.cartesian = allow.cartesian)

    if (is.null(by.y) & is.null(by.x) & is.null(by)) {

        if (length(attributes(dt.x)[['sorted']]) > 0 &&
            length(attributes(dt.y)[['sorted']]) > 0) {
            k.x = key(dt.x)
            k.y = key(dt.y)
        } else {
            k.y = k.x = intersect(names2(dt.x), names2(dt.y))
            if (length(k.x) == 0)
                stop("no common columns to merge by!")
            message("intersecting by: ", paste(k.x, collapse = ", "))
            new_ddd_args[['by']] = k.x
        }
        if (is.null(k.x) | is.null(k.y) || (k.x != k.y)) {
            stop("neither by.x/by.y  nor by are supplied, keys of dt.x and dt.y must be identical and non NULL")
        }
        x.cols = setdiff(names(dt.x), k.x)
        y.cols = setdiff(names(dt.y), k.y)

    } else if (!is.null(by.x) & !is.null(by.y)) {

        x.cols = setdiff(names(dt.x), by.x)
        y.cols = setdiff(names(dt.y), by.y)
        new_ddd_args = new_ddd_args[setdiff(names(new_ddd_args), c("by"))]

    } else if (!is.null(by)) {

        x.cols = setdiff(names(dt.x), by)
        y.cols = setdiff(names(dt.y), by)
        if (! all(by %in% colnames(dt.x)) | ! all(by %in% colnames(dt.y))) {
            stop("column ", by, " does not exist in one of the tables supplied \nCheck the column names")
        }
        new_ddd_args = new_ddd_args[setdiff(names(new_ddd_args), c("by.y", "by.x"))]

    }
    these_cols = intersect(x.cols, y.cols)
    ## if (replace_in_x) {
    if (!replace_NA) {
        dt.x.tmp = copy(dt.x)
        for (this_col in these_cols) {
            data.table::set(dt.x.tmp, i = NULL, j = this_col, value = NULL)
        }
        dt.repl = suppressWarnings(do.call("merge", args = c(list(x = dt.x.tmp, y = dt.y), new_ddd_args)))
        ## dt_na2false(dt.repl, c("in.x.2345098712340987", "in.y.2345098712340987"))
    } else {
        dt.repl = suppressWarnings(do.call("merge", args = c(list(x = dt.x, y = dt.y), new_ddd_args)))
        dt_na2false(dt.repl, c("in.x.2345098712340987", "in.y.2345098712340987"))
        in.x = which(dt.repl[["in.x.2345098712340987"]])
        in.y = which(dt.repl[["in.y.2345098712340987"]])
        this_env = environment()
        for (this_col in these_cols) {
            x_cname = paste0(this_col, ".x")
            y_cname = paste0(this_col, ".y")
            x_col = dt.repl[[x_cname]]
            y_col = dt.repl[[y_cname]]
            xf = inherits(x_col, "factor")
            yf = inherits(y_col, "factor")
            if ( {xf || yf} && keep_factor) {
                if (!xf) { x_col = factor(x_col); xf = TRUE }
                if (!yf) { y_col = factor(y_col); yf = TRUE }
            }
            if (xf && !keep_factor) { x_col = as.character(x_col); xf = FALSE } 
            if (yf && !keep_factor) { y_col = as.character(y_col); yf = FALSE }
            if (force_y) {
                if (!overwrite_x) {
                    ## if (inherits(x_col, "factor") & inherits(y_col, "factor")) {
                    ##     new_col = factor(y_col, forcats::lvls_union(list(y_col, x_col)))
                    ##     new_col[is.na(new_col)] = x_col[is.na(new_col)]
                    ## } else {
                    ##     new_col = ifelse(!is.na(y_col), y_col, x_col)
                    ## }                    
                    if (xf || yf) {
                        new_col = factor(y_col, forcats::lvls_union(list(y_col, x_col)))
                        new_col[is.na(new_col)] = x_col[is.na(new_col)]
                    } else {
                        new_col = ifelse(!is.na(y_col), y_col, x_col)
                    }
                } else {
                    ## if (inherits(x_col, "factor") & inherits(y_col, "factor")) {
                    ##     new_col = factor(x_col, forcats::lvls_union(list(y_col, x_col)))
                    ## } else {
                    ##     new_col = x_col
                    ## }
                    ## new_col[dt.repl[['in.y.2345098712340987']]] = y_col[dt.repl[['in.y.2345098712340987']]]
                    if (xf || yf) {
                        new_col = factor(x_col, forcats::lvls_union(list(y_col, x_col)))
                    } else {
                        new_col = x_col
                    }
                    new_col[in.y] = y_col[in.y]
                }
            } else {
                ## if (inherits(x_col, "factor") & inherits(y_col, "factor")) {
                ##     new_col = factor(x_col, forcats::lvls_union(list(x_col, y_col)))
                ##     new_col[is.na(new_col) & !is.na(y_col)] = y_col[is.na(new_col) & !is.na(y_col)]
                ## } else {
                ##     new_col = ifelse(is.na(x_col) & !is.na(y_col), y_col, x_col)
                ## }
                if (xf | yf) {
                    new_col = factor(x_col, forcats::lvls_union(list(x_col, y_col)))
                    new_col[is.na(new_col) & !is.na(y_col)] = y_col[is.na(new_col) & !is.na(y_col)]
                } else {
                    new_col = ifelse(is.na(x_col) & !is.na(y_col), y_col, x_col)
                }
            }
            data.table::set(dt.repl, j = c(x_cname, y_cname, this_col), value = list(NULL, NULL, this_env[["new_col"]]))
        }
    }
    ## } else if (!replace_in_x & !is.null(suffix)) {
    ##     y.suff.cols = paste0(y.cols, sep, suffix)
    ##     ## dt.y.tmp = copy(dt.y)[, eval(dc(y.suff.cols)) := eval(dl(y.cols))][, eval(dc(y.cols)) := NULL]
    ##     dt.y.tmp = copy(dt.y)
    ##     data.table::set(dt.y, j = y.suff.cols, value = dt.y[, y.cols, with = FALSE])
    ##     data.table::set(dt.y, j = y.cols, value = NULL)
    ##     ## dt.repl = merge(dt.x, dt.y.tmp, all.x = TRUE, ...)
    ##     dt.repl = do.call("merge", args = c(list(x = dt.x, y = dt.y.tmp), new_ddd_args))
    ## }
    if (keep_order == TRUE) {
        data.table::setorderv(dt.repl, "tmp.2345098712340987")
        dt.repl[['tmp.2345098712340987']] = NULL
    }
    data.table::set(dt.repl, j = c("in.y.2345098712340987", "in.x.2345098712340987"),
                    value = list(NULL, NULL))
    if (keep_colorder) {
        x_cols = colnames(dt.x)
        ## get the order of columns in dt.repl in order of X with
        ## additional columns tacked on end
        setcolorder(dt.repl,
                    intersect(union(colnames(dt.x), colnames(dt.repl)),
                              colnames(dt.repl)))
    }
    return(dt.repl)
}


#' @name dt_na2false
#' @title convert columns with NA to false
#'
#' coerce NA in columns of class "logical" to FALSE
#'
#' @param dt data.table
#' @param these_cols NULL by default, will select columns of class logical, otherwise will be specified
#' @return A data.table
#' @author Kevin Hadi
dt_na2false = function(dt, these_cols = NULL) {
    na2false = function(v)
    {
        ## v = ifelse(is.na(v), v, FALSE)
        v[is.na(v)] = FALSE
        as.logical(v)
    }
    if (is.null(these_cols)) {
        these_cols = which(sapply(dt, class) == "logical")
    }
    for (this_col in these_cols) {
        ## this_val = as.data.frame(dt[, this_col, with = FALSE])[,1]
        this_val = dt[[this_col]]
        data.table::set(dt, j = this_col, value = na2false(this_val))
    }
    return(dt)
}
mskilab/gGnome documentation built on May 8, 2024, 4:25 p.m.