gGnome: gGnome: reference based assembly graph for analyzing rearranged genomes

Documented in colnames2 dedup.cols dt_na2false duped gr.noval gr.spreduce isNA isNA match3 na2false rand.string rep_len2 rowMaxs rowMins seq_along2 setcols

##########
########## KH additional functions
##########

## #' @name eclusters2
## #' @title gGraph R6 public method eclusters2
## #' 
## #' @description
## #' Marks ALT edges belonging (quasi) reciprocal cycles
## #'
## #' 
## #' @param juncs GRangesList of junctions
## #' @param mc.cores parallel
## #' @param only_chains TRUE will only pair breakend to its nearest nearest neighbor IFF the nearest neighbor is reciprocal, see arguments to "strict" for 3 different matching heuristics
## #' @param max.small size below which simple dups and dels are excluded
## #' @param weak logical flag if TRUE will not differentiate between cycles and paths and will return all weakly connected clusters in the junction graph [FALSE]
## #' @param strict Only active if only_chains = TRUE. Can be one of "strict", "one_to_one", or "loose". \cr
## #' "strict": each breakend can only be "monogamously" matched to one other breakend and if the nearest breakend is of the wrong orientation, it is thrown out. \cr
## #' "one_to_one" breakends can only be coupled to a single "monogamous" match but without considering breakends of the wrong orientation. If breakend B is nearest to C, B will only be matched to C. If A's nearest breakend is B but is further away than C, A will not be matched to B. \cr
## #' "loose": the nearest breakend in the correct orientation under the threshold is considered. The same as one_to_one except A will be matched to B, while B will be matched to C. \cr
## #' @param ignore.isolated If TRUE, all simple duplications, duplications nested with only other duplications, and simple deletions without any breakends will be thrown out.
## #' @return gGraph object with edges marked with ecluster id and metadata in gEdge, gNode, and $meta
## #' @author Marcin Imielinski
## eclusters2 = function (thresh = 1000, weak = TRUE, paths = !weak,
##                            mc.cores = 1, verbose = FALSE, chunksize = 1e+30, method = "single",
##                            return_pairs = FALSE, ignore.small = TRUE,
##                            max.small = 1e4, ignore.isolated = TRUE,
##                            strict = c("strict", "one_to_one", "loose"),
##                            min.isolated = max.small,
##                            only_chains = FALSE) {


##   if (!is.character(strict) || any(!strict %in% c("strict", "one_to_one", "loose"))) {
##     stop("strict must be one of 'strict', 'one_to_one', or 'loose'")
##   } else if (length(strict) > 1) {
##     strict = "one_to_one"
##   }
##   self$edges$mark(ecluster = as.integer(NA))
##   altedges = self$edges[type == "ALT", ]
##   if (length(altedges) == 0) {
##     if (verbose) {
##       gmessage("No junction in this graph")
##     }
##     return(NULL)
##   }
##   if (ignore.small) {
##       altedges = altedges[!((class == "DUP-like" | class == "DEL-like") & altedges$span <= max.small)]
##   }
##   if (length(altedges) == 0) {
##     if (verbose) {
##       gmessage("No junction in this graph")
##     }
##     return(NULL)
##   }
##   deldup = altedges[class %in% c("DUP-like", "DEL-like")]
##   ## below removes non-nested events below size threshold
##   ## and simple or nested duplications with no subsumed breakends
##   if (length(deldup) > 0 && ignore.isolated) {
##     altes = deldup$shadow
##     ## altes$sedge.id = altedges[class %in% c("DUP-like", "DEL-like")]$dt[altes$id]$sedge.id
##     bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix", "class", "sedge.id")]
##     bp$sedge.id.y = bp$sedge.id; bp$sedge.id = NULL
##     addon = deldup$dt[altes$id][, .(sedge.id, class)]
##     altes$sedge.id = addon$sedge.id
##     altes$class = addon$class
##     altes$nbp = altes %N% bp # number of breakpoints of any SV that fall within segment
##     numsum = altedges$shadow %>% gr.sum # using the shadows of all of the SVs not just dels and dups
##     altes = altes %$% numsum
##     iso = ((altes) %Q% (score == 1.0))$id
##     ## rm.edges = unique(altes[iso] %Q% (width < thresh))$sedge.id ## old
##     rm.edges = unique(altes[iso] %Q% (width < min.isolated))$sedge.id
##     rm.dups = S4Vectors::with(altes, sedge.id[class == "DUP-like" & nbp <= 2])
##     rm.dups = c(rm.dups, dedup.cols(gr2dt(altes %*% bp))[sedge.id != sedge.id.y][class == "DUP-like"][, .(all(1:2 %in% grl.iix), class.1 = class.1[1]), by = .(sedge.id, sedge.id.y)][, all(V1 == TRUE) & all(class.1 == "DUP-like"), by = sedge.id][V1 == TRUE]$sedge.id) # removing dups that have only other nested dups 
##     rm.edges = union(rm.edges, rm.dups)
##     keepeid = setdiff(altedges$dt$sedge.id, rm.edges)
##     altedges = altedges[as.character(keepeid)]
##   } # ignoring isolated dup and del edges that are smaller than threshold
##   if (verbose & weak)
##     message("Computing weak eclusters")
##   if (length(altedges) == 0) {
##     if (verbose) {
##       gmessage("No junction in this graph")
##     }
##     return(NULL)
##   }
##   bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix", "edge.id")]
##   bp$m.ix = seq_along(bp)
##   bp.dt = gr2dt(bp)
##   ix = split(1:length(bp), ceiling(runif(length(bp)) * ceiling(length(bp)/chunksize)))
##   ixu = unlist(ix)
##   eps = 1e-09
##   ## ij = do.call(rbind, split(1:length(bp), bp$grl.ix))
##   xt.adj = xt.adj0 = Matrix::sparseMatrix(1, 1, x = 0, dims = rep(length(bp),
##                                                         2))
##   if (verbose) {
##     message(sprintf("Computing junction graph across %s ALT edges with distance threshold %s",
##                     length(altedges), thresh))
##   }
##   if (!exists(".INF")) {
##     .INF = pmax(sum(seqlengths(self)), 1e+09)
##   }
##   bp.pair = as.matrix(dcast(data.table(ix = bp$m.ix,
##                              grl.ix = bp$grl.ix,
##                              grl.iix = bp$grl.iix),
##                   grl.ix ~ grl.iix, value.var = "ix")[,2:3])
##   bp.pair = rbind(bp.pair, cbind(bp.pair[,2], bp.pair[,1]))
##   ifun = function(iix, ignore.strand = FALSE,
##                   verbose = FALSE, eps = 1e-9) {
##     if (verbose > 1)
##       cat(".")
##     tmpm = gr.dist(bp[iix], gr.flipstrand(bp), ignore.strand = ignore.strand) +
##       eps
##     return(as(tmpm, "Matrix"))
##   }
##   xt.adj[ixu, ] = do.call(rbind,
##                           mclapply(ix, ifun, ignore.strand = FALSE,
##                                    mc.cores = mc.cores))
##   diag(xt.adj) = NA_real_
##   ## only_chains = TRUE, enforcing that nearest breakpoints are only considered
##   if (only_chains) {
    
##     ## enforcing that no distances between breakends from the same junction  are considered
##     xt.adj[bp.pair] = NA_real_
##     xt.adj0[ixu, ] = do.call(rbind,
##                              mclapply(ix, ifun, ignore.strand = TRUE,
##                                       mc.cores = mc.cores)) # this is to find nearest breakends, regardless of orientation
##     ## enforcing that self-to-self breakend distances are not considered
##     diag(xt.adj0) = NA_real_
##     ## enforcing that no distances between breakends from the same junction  are considered
##     xt.adj0[bp.pair] = NA_real_
##     suppressWarnings({
##       nearest_ix = dunlist(lapply(
##         seq_len(nrow(xt.adj)),
##         function(x) which(xt.adj[x,] == min(xt.adj[x,], na.rm = T)))) %>%
##         as.matrix
##       nearest0_ix = dunlist(lapply(
##         seq_len(nrow(xt.adj0)),
##         function(x) which(xt.adj0[x,] == min(xt.adj0[x,], na.rm = T)))) %>%
##         as.matrix
##     })
##     if (nrow(nearest_ix) & nrow(nearest0_ix)) {
##       if (strict == "strict") {
##         nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
##         nearest0_ix = cbind(rowMins(nearest0_ix), rowMaxs(nearest0_ix))
##         nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
##         nearest0_ix = nearest0_ix[duplicated(nearest0_ix),,drop = FALSE]
##         nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
##         nearest0_ix = rbind(nearest0_ix, cbind(nearest0_ix[,2], nearest0_ix[,1]))
##         nearest_ix = as.matrix(merge(nearest_ix, nearest0_ix)) # if there is a breakend closer but in the wrong orientation, that distance will be thrown out downstream, i.e. there is a one to one match of breakend and any nearest breakend that is in the wrong orientation disqualifies the clustser
##       } else if (strict == "one_to_one") {
##         nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
##         nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
##         nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
##         nearest_ix = as.matrix(nearest_ix) # only one-to-one breakends are considered
##       } else if (strict == "loose") {
##         nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
##         nearest_ix = nearest_ix[!duplicated(nearest_ix),,drop = FALSE]
##       }
##     }
##   } else if (!only_chains) {
##     xt.adj[bp.pair] = 1
##   }
##   ## rm(xt.adj0)
##   adj = xt.adj
##   xt.adj[which(is.na(as.matrix(xt.adj)))] = .INF + 1
##   adj[which(is.na(as.matrix(adj)))] = 0
##   adj[which(as.matrix(adj) > thresh)] = 0
##   if (only_chains && nrow(nearest_ix)) {
##     tmp = xt.adj[nearest_ix]
##     xt.adj[] = .INF + 1
##     adj[] = 0
##     xt.adj[nearest_ix] = tmp
##     adj[nearest_ix] = tmp
##   }
##   dt = Matrix::which(xt.adj < thresh, arr.ind = T)
##   dt = unique(data.table(cbind(rowMins(dt), rowMaxs(dt))))
##   dt[, bp.dist := xt.adj[dt[, cbind(V1, V2)]]]
##   dt$sign = ifelse(strand(bp[dt$V1]) == "+",
##             ifelse(gr.flipstrand(bp[dt$V1]) > bp[dt$V2], -1, 1),
##             ifelse(gr.flipstrand(bp[dt$V1]) < bp[dt$V2], -1, 1))
##   dt2 = dt[,idx := seq_len(.N)] %>% melt(measure.vars = c("V1", "V2"))
##   meta = cbind(gr2dt(bp)[dt2$value], dt2)[order(idx)]
##   meta = rbind(meta,
##               gr2dt(bp)[paste(grl.ix, grl.iix) %nin% meta[, paste(grl.ix, grl.iix)]],
##               fill = T)
##   if (return_pairs) {
##     return(meta)
##   }
##   hcl = stats::hclust(as.dist(xt.adj), method = "single")
##   hcl.lbl = cutree(hcl, h = thresh)
##   bp.dt$hcl = hcl.lbl
##   bp.hcl = bp.dt[, .(hcl.1 = .SD[grl.iix == 1, hcl], hcl.2 = .SD[grl.iix ==
##                                                                  2, hcl]), keyby = grl.ix]
##   altedges$mark(hcl.1 = bp.hcl[.(seq_along(altedges)), hcl.1])
##   altedges$mark(hcl.2 = bp.hcl[.(seq_along(altedges)), hcl.2])
##   hcl.ig = igraph::graph_from_edgelist(bp.hcl[, unique(cbind(hcl.1,
##                                                              hcl.2))], directed = FALSE)
##   hcl.comp = components(hcl.ig)
##   altedges$mark(ehcl = as.integer(hcl.comp$membership)[bp.hcl[,
##                                                               hcl.1]])
##   adj[adj > thresh] = 0
##   refg = self[, type == "REF"]
##   bpp = Matrix::which(adj != 0, arr.ind = TRUE)
##   dref = pdist(bp[bpp[, 1]], bp[bpp[, 2]])
##   drefg = diag(refg$dist(bp[bpp[, 1]], bp[bpp[, 2]]))
##   ix = which(drefg > dref)
##   if (length(ix))
##     adj[bpp[ix, , drop = FALSE]] = FALSE
##   if (verbose > 1)
##     cat("\n")
##   adj = adj | t(adj)
##   junpos = bp1 = bp$grl.iix == 1
##   junneg = bp2 = bp$grl.iix == 2
##   adj2 = adj & FALSE
##   adj2[junpos, junpos] = adj[bp2, bp1]
##   adj2[junpos, junneg] = adj[bp2, bp2]
##   adj2[junneg, junpos] = adj[bp1, bp1]
##   adj2[junneg, junneg] = adj[bp1, bp2]
##   if (verbose)
##     message(sprintf("Created basic junction graph using distance threshold of %s",
##                     thresh))
##   cl = split(1:length(bp), igraph::clusters(graph.adjacency(adj2),
##                                             ifelse(weak, "weak", "strong"))$membership)
##   cl = cl[S4Vectors::elementNROWS(cl) > 1]
##   cl = cl[order(S4Vectors::elementNROWS(cl))]
##   ## browser()
##   ## jcl = lapply(cl, function(x) unique(sort(bp$grl.ix[x])))
##   jcl = lapply(cl, function(x) unique(sort(bp$edge.id[x])))
##   jcls = sapply(jcl, paste, collapse = " ")
##   jcl = jcl[!duplicated(jcls)]
##   adj3 = adj2
##   altedges$mark(ecycle = as.character(NA))
##   if (length(jcl) > 0) {
##     dcl = dunlist(unname(jcl))[, `:=`(listid, paste0(ifelse(weak,
##                                                             "", "c"), listid))]
##     if (!weak)
##         ## altedges[dcl$V1]$mark(ecycle = dcl$listid)
##         altedges[as.character(dcl$V1)]$mark(ecycle = dcl$listid)
##     ## altedges[dcl$V1]$mark(ecluster = dcl$listid)
##     altedges[as.character(dcl$V1)]$mark(ecluster = dcl$listid)
##     meta = merge(meta, altedges$dt[, .(edge.id, ecluster, hcl.1, hcl.2, ehcl, ecycle)], by = "edge.id")
##     meta[!is.na(ecluster),
##          `:=`(
##            nclust = length(unique(edge.id)),
##            all_positive = all(replace(sign, is.na(sign),  3e9) > 0),
##            all_negative = all(replace(sign, is.na(sign), -3e9) < 0),
##            mixed = {naom = na.omit(sign); any(naom > 0) & any(naom < 0)},
##            bridge = anyNA(bp.dist)
##            ),
##          by = ecluster]
##     meta[, `:=`(
##         num_positive = sum(sign[!duplicated(idx)] > 0, na.rm = T),
##         num_negative = sum(sign[!duplicated(idx)] < 0, na.rm = T)
##     ),
##     by = ecluster]
##     self$set(recip_bp = meta)
##     if (verbose) {
##         message("Annotated weakly connected junction clusters and added ecluster pair metadata")
##     }
##   }
##   if (verbose)
##     message(sprintf("Annotated %s junction cycles in edge field $ecycle",
##                     length(jcl)))
##   if (paths & !weak) {
##     if (verbose)
##       message("Analyzing paths")
##     if (length(jcl) > 0) {
##       adj3[unlist(jcl), unlist(jcl)] = FALSE
##     }
##     sinks = Matrix::which(Matrix::rowSums(adj3) == 0)
##     sources = Matrix::which(Matrix::colSums(adj3) == 0)
##     cl2 = split(1:length(bp), igraph::clusters(graph.adjacency(adj3),
##                                                "weak")$membership)
##     cl2 = cl2[S4Vectors::elementNROWS(cl2) > 1]
##     if (any(ix <- S4Vectors::elementNROWS(cl2) > 2)) {
##       cl3 = do.call(c, mclapply(cl2[ix], function(x) {
##         tmp.adj = adj3[x, x]
##         lapply(all.paths(tmp.adj, sources = sources,
##                          sinks = sinks, verbose = verbose)$paths, function(i) x[i])
##       }, mc.cores = mc.cores))
##       cl2 = c(cl2[!ix], cl3)
##     }
##     jcl2 = lapply(cl2, function(x) unique(sort(bp$grl.ix[x])))
##     jcls2 = sapply(jcl2, paste, collapse = " ")
##     jcl2 = jcl2[!duplicated(jcls2)]
##     altedges$mark(epath = as.character(NA))
##     if (length(jcl2) > 0) {
##       dcl2 = dunlist(unname(jcl2))[, `:=`(listid, paste0("p",
##                                                          listid))]
##       altedges[dcl2$V1]$mark(epath = dcl2$listid)
##       self$edges$mark(ecluster =
##                         ifelse(is.na(self$edges$dt$ecycle) &
##                                is.na(self$edges$dt$epath),
##                                as.character(NA),
##                                paste0(ifelse(is.na(self$edges$dt$ecycle), "",
##                                              self$edges$dt$ecycle),
##                                       ifelse(is.na(self$edges$dt$epath),
##                                              "", self$edges$dt$epath))))
##     }
##     if (verbose)
##       message(sprintf("Annotated %s paths in edge field $epath",
##                       length(jcl2)))
##   }
##   return(invisible(self))
## }
## ## gGraph$public_methods$eclusters2 = tmpeclustpairs # if replacing a binding

## #' @description
## #' make eclusters
## gGraph$public_methods$eclusters2 = NULL; gGraph$set("public", "eclusters2", eclusters2)



#' @name rep_len2
#' @title recycle vector along length OR nrow of object
#'
#'
#' @author Kevin Hadi
#' @param x data
#' @param objalong any object to recycle x along if uselen = TRUE, or an actual integer value if uselen = FALSE
#' @return vector
rep_len2 = function(x, objalong, uselen = TRUE) {
    if (uselen)
        rep(x, length.out = NROW(objalong))
    else
        rep(x, length.out = objalong)
}

#' @name seq_along2
#' @title seq along either row of table or length of vector
#'
#'
#' @author Kevin Hadi
#' @param x data
#' @return vector
seq_along2 = function(x)  {
  seq_len(NROW(x))
}


#' @name match3
#' @title similar to setkey except a general use utility
#'
#' very slow version of keying a la data.table
#' but for general/interactive use
#' @author Kevin Hadi
match3 = function(x, table, nomatch = NA_integer_, old = TRUE, use.data.table = TRUE) {
  out = if (use.data.table) {
    tryCatch({
      dx = data.table(x = x)[, id.x := seq_len(.N)]
      dtb = data.table(table = table)[, id.tb := seq_len(.N)]
      ## setkey(dx, x)[list(dtb$table)]$id.x
      setkey(dtb, table)[list(dx$x)]$id.tb
    }, error = function(e) structure("err", class = "err"))
  }
  if (!is.null(out) && !inherits(out, "err")) return(out)
  if (old) {
    dx = within(data.frame(x = x), {id.x = seq_along(x)})
    dtb = within(data.frame(table = table), {id.tb = seq_along(table)})
    res = merge(dx, dtb, by.x = "x", by.y = "table", all.x = TRUE,
      allow.cartesian = TRUE)
    return(res$id.tb[order(res$id.x)])
  } else  {
    m = match(table,x)
    mat = cbind(m, seq_along(m))
    mat = mat[!is.na(mat[,1]),,drop=FALSE]
    mat = mat[order(mat[,1], na.last = FALSE),,drop = FALSE]
    mat = cbind(mat, seq_len(dim(mat)[1]))
    m2 = match(x,table)
    ix = which(!duplicated(m2) & !is.na(m2))
    mat_rix = unlist(rep(split(mat[,3], mat[,1]), base::tabulate(m2)[m2][ix]))
    ## mat_rix = unlist(rep(split(mat[,3], mat[,1]), base::tabulate(m2)[m2][ix]))
    ix = rep(1, length.out = length(m2))
    ## original line
    ## ix[!is.na(m2)] = base::tabulate(m)[!is.na(m2)]
    ix[!is.na(m2)] = base::tabulate(m)[m][m2][!is.na(m2)]
    out = rep(m2, ix)
    out[!is.na(out)] = mat[mat_rix,,drop=F][,2]
    return(out)
    ## m = match(table, x)
    ## mat = cbind(m, seq_along(m))
    ## mat = mat[!is.na(mat[, 1]), , drop = FALSE]
    ## mat = mat[order(mat[, 1]), , drop = FALSE]
    ## mat = cbind(mat, seq_len(dim(mat)[1]))
    ## m2 = match(x, table)
    ## ix = which(!duplicated(m2))
    ## mat_rix = unlist(rep(split(mat[, 3], mat[, 1]), base::tabulate(m2)[m2][ix]))
    ## mat[mat_rix, , drop = F][, 2]
  }
}


#' @name dedup.cols
#' @title applies dedup to colnames
#'
#' dedup the column names of a data.frame/data.table
#'
#' @return A data.table or data.frame
dedup.cols = function(tbl, remove = FALSE) {
    if (remove) {
        if (!inherits(tbl, "data.table"))
            return(tbl[, match(unique(colnames(tbl)), colnames(tbl))])
        else
            return(tbl[, match(unique(colnames(tbl)), colnames(tbl)), with = FALSE])
    } else {
            colnames(tbl) = base::make.unique(colnames(tbl))
            return(tbl)
    }
}


#' @name rowMins
#' @title rowMins hack
#'
#'
#' @return A vector
rowMins = function(x) {
  do.call(pmin, as.data.frame(x))
}

#' @name rowMaxs
#' @title rowMaxs hack
#'
#'
#' @return A vector
rowMaxs = function(x) {
  do.call(pmax, as.data.frame(x))
}

#' @name not.in
#' @title not.in
#'
#' @description
#' not in
#'
#' @param x value to test
#' @param table table to test x against
#'
#' @noRd
`%nin%` = function (x, table)
{
    match(x, table, nomatch = 0L) == 0L
}

#' @name gr.noval
#' @title get rid of mcols on GRanges/GRangesLists
#' @description
#'
#' remove all metadata from GRanges or GRangesList
#'
#' @return GRanges or GRangesList
#' @author Kevin Hadi
gr.noval = function(gr, keep.col = NULL, drop.col = NULL) {
    if (is.null(keep.col) & is.null(drop.col)) {
        select_col = NULL
    } else {
        all_col = colnames(gr@elementMetadata)
        if (inherits(gr, "GRangesList")) {
            all_col = c(all_col, colnames(gr@unlistData@elementMetadata))
        }

        if (!is.null(keep.col) & is.null(drop.col)) {
            select_col = intersect(all_col, keep.col)
        } else if (is.null(keep.col) & !is.null(drop.col)) {
            select_col = setdiff(all_col, drop.col)
        } else if (!is.null(keep.col) && !is.null(drop.col)) {
            if (intersect(keep.col, drop.col) > 0) {
                warning("drop.col and keep.col args have overlapping elements\nkeeping the columns that overlap")
                select_col = intersect(setdiff(all_col, setdiff(drop.col, keep.col)), keep.col)
            }
        }
    }
    if (inherits(gr, "GRangesList")) {
        tmp_query = intersect(select_col, colnames(gr@unlistData@elementMetadata))
        gr@unlistData@elementMetadata = gr@unlistData@elementMetadata[,c(tmp_query), drop = FALSE]
    }
    tmp_query = intersect(select_col, colnames(gr@elementMetadata))
    gr@elementMetadata = gr@elementMetadata[,c(tmp_query),drop = FALSE]
    return(gr)
}


#' @name setcols
#' @title convenience function to set columns
#'
#' sets columns of an object
#'
#' @param dt data frame/table or matrix
#' @param old integer or character or logical vector corresponding to current colnames in dt
#' @param new character vector for new column names
#' @return colnamed object
#' @author Kevin Hadi
setcols = function(dt, old, new) {
  if (inherits(dt, c("GRanges", "GRangesList"))) {
    mcols(dt) = setcols(mcols(dt), old, new)
    return(dt)
  }
  cnames = colnames2(dt)
  if (missing(new) || missing(old)) {
    if (missing(old)) {
      old = new
    }
    if (is.character(old) && length(old) == length(cnames)) {
      colnames(dt) = old
      return(dt)
    } else {
      stop("names provided must be same length as ncol(dt)")
    }
  }
  if (is.character(old)) {
    out = merge(data.frame(cnames, seq_along(cnames)), data.frame(cnames = old, new = new),
      allow.cartesian = T)
    cnames[out[[2]]] = out[[3]]
    colnames(dt) = cnames
    return(dt)
  }
  if (is.logical(old)) {
    if (! length(old) == length(cnames)) stop("logical vector must be same length as ncol(dt)")
    old = which(old)
  }
  cnames[old] = new
  colnames(dt) = cnames
  return(dt)
}

#' @name colnames2
#' @title robust colnames
#'
#' gives back character vector same number of columns of input regardless whether named or not
#'
#' @param str a path string
#' @return a string with multiple parentheses replaced with a single parenthesis
#' @author Kevin Hadi
colnames2 = function(x) {
    nm = colnames(x)
    if (is.null(nm))
        return(rep("", length.out = NCOL(x)))
    else
        return(nm)
}




#' @name %K%
#' @title similar to setkey except a general use utility
#'
#' slower version of setkey, but for interactive use
#'
#' @author Kevin Hadi
`%K%` = function(thisx,thisy, old = TRUE) {
    if (old)
        return(match3(table = thisx, x = thisy, old = TRUE))
    else
        return(match3(table = thisx, x = thisy, old = FALSE))
}


##' @name enframe_list
##' @title data table-ize list elements and add name column
##'
##' @description
##'
##' @author Kevin Hadi
#enframe_list = function(lst, name = "name", value = "value", as.data.table = TRUE, rbind = TRUE, mc.cores = 1) {
#    nms = names(lst)
#    expr = parse(text = sprintf("cbind(%s = nm, df)", name, value))
#    out = mcmapply(function(el, nm) {
#        if (NROW(el)) {
#            if (is.null(dim(el)))
#                df = setColnames(as.data.frame(el), value)
#            else
#                df = as.data.frame(el)
#            if (as.data.table)
#                setDT(df)
#            eval(expr)
#        }
#    }, lst, nms, SIMPLIFY = FALSE, mc.cores = mc.cores)
#    if (rbind) {
#        if (as.data.table)
#            return(rbindlist(out))
#        else
#            return(do.call(rbind, out))
#    }
#}

##' @name setColnames
##' @title convenience function to set column names
##'
##' @param object tabled object
##' @param nm names of the new columns
##' @return colnamed object
##' @author Kevin Hadi
#setColnames = function(object = nm, nm = NULL, pattern = NULL, replacement = "") {
#    if (!is.null(nm)) {
#        if (is.null(names(nm)))
#            colnames2(object)  = nm
#        else {
#            ix = match3(names(nm), colnames(object))
#            colnames2(object)[ix] = nm
#        }
#    } else if (!is.null(pattern)) {
#        colnames2(object) = gsub(pattern, replacement, colnames2(object))
#    }
#    return(object)
#}

#' @name isNA
#' @title is.na but also tests for "NA" character
#'
#' @description
#'
#' @author Kevin Hadi
isNA = function(x, na.char = c("NA", "NULL", "na", "null")) {
    if (is.character(x)) {
        return(is.na(x) | x %in% na.char)
    } else {
        return(is.na(x))
    }
}

#' @name na2false
#' @title replace logical vector with NA to FALSE
#'
#' @description
#' A convenience function to set a logical vector with NAs to false
#'
#' @return A logical vector with NAs set to FALSE
#' @author Kevin Hadi
na2false = function(v)
{
    v[isNA(v)] = FALSE
    ## mode(v) = "logical"
    v
}

#' @name rand.string
#' @title make a random string
#'
#' @return random string
#' @author Someone from Stackoverflow
rand.string <- function(n=1, length=12)
{
    randomString <- c(1:n)                  # initialize vector
    for (i in 1:n)
    {
        randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
                                        length, replace=TRUE),
                                 collapse="")
    }
    return(randomString)
}

#' @name gr.spreduce
#' @title reduce based on a field(s) to split by in elementMetadata of GRanges, or given vector
#' @description
#'
#' split and reduce GRanges by field(s)
#' if providing a variable not already within the GRanges,
#' may need to use dynget(variable_name)
#'
#' @return GRanges
#' @author Kevin Hadi
gr.spreduce = function(gr,  ..., ignore.strand = FALSE, pad = 0, return.grl = FALSE, sep = paste0(" ", rand.string(length = 8), " ")) {
  lst = as.list(match.call())[-1]
  ix = which(!names(lst) %in% c("gr", "sep", "pad", "ignore.strand", "return.grl"))
  vars = unlist(sapply(lst[ix], function(x) unlist(sapply(x, toString))))
  if (length(vars) == 1) {
    if (!vars %in% colnames(mcols(gr)))
      vars = tryCatch(unlist(list(...)), error = function(e) vars)
  }
  if (!all(vars %in% colnames(mcols(gr))))
    stop("Must specify valid metadata columns in gr")
  tmpix = do.call(
    function(...) paste(..., sep = sep),
    as.list(mcols(gr)[,vars, drop = F]))
  unix = which(!duplicated(tmpix))
  tmpix = factor(tmpix, levels = tmpix[unix])
  grl = unname(gr.noval(gr) %>% GenomicRanges::split(tmpix))
  grl = GenomicRanges::reduce(grl + pad, ignore.strand = ignore.strand)
  if (return.grl) {
    mcols(grl) = mcols(gr)[unix,vars,drop = F]
    return(grl)
  } else {
    out = unlist(grl)
    mcols(out) = mcols(gr)[rep(unix, times = IRanges::width(grl@partitioning)),
      vars,drop = F]
    return(out)
  }
}

#' @name duped
#' @title duped
#'
#'
#'
#' @param ... vectors to paste by
#' @author Kevin Hadi
duped = function(..., binder = "data.table") {
    duplicated(tryCatch(et(sprintf("%s(...)", binder)),
                        ## error = function(e) do.call(cbind, list(...))))
                        error = function(e) paste(...)))
}

#' @name dt_na2false
#' @title convert columns with NA to false
#'
#' coerce NA in columns of class "logical" to FALSE
#'
#' @param dt data.table
#' @param these_cols NULL by default, will select columns of class logical, otherwise will be specified
#' @return A data.table
#' @author Kevin Hadi
dt_na2false = function(dt, these_cols = NULL) {
    na2false = function(v)
    {
        ## v = ifelse(is.na(v), v, FALSE)
        v[is.na(v)] = FALSE
        as.logical(v)
    }
    if (is.null(these_cols)) {
        these_cols = which(sapply(dt, class) == "logical")
    }
    for (this_col in these_cols) {
        ## this_val = as.data.frame(dt[, this_col, with = FALSE])[,1]
        this_val = dt[[this_col]]
        data.table::set(dt, j = this_col, value = na2false(this_val))
    }
    return(dt)
}
mskilab/gGnome documentation built on June 10, 2025, 11:29 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
mskilab/gGnome
gGnome: reference based assembly graph for analyzing rearranged genomes

R/misc_utils.R
In mskilab/gGnome: gGnome: reference based assembly graph for analyzing rearranged genomes

Defines functions dt_na2false duped gr.spreduce rand.string na2false isNA `%K%` colnames2 setcols gr.noval rowMaxs rowMins dedup.cols match3 seq_along2 rep_len2

Documented in colnames2 dedup.cols dt_na2false duped gr.noval gr.spreduce isNA isNA match3 na2false rand.string rep_len2 rowMaxs rowMins seq_along2 setcols

R Package Documentation

Browse R Packages

We want your feedback!

mskilab/gGnome gGnome: reference based assembly graph for analyzing rearranged genomes

R/misc_utils.R In mskilab/gGnome: gGnome: reference based assembly graph for analyzing rearranged genomes

Defines functions dt_na2false duped gr.spreduce rand.string na2false isNA `%K%` colnames2 setcols gr.noval rowMaxs rowMins dedup.cols match3 seq_along2 rep_len2

Documented in colnames2 dedup.cols dt_na2false duped gr.noval gr.spreduce isNA isNA match3 na2false rand.string rep_len2 rowMaxs rowMins seq_along2 setcols

R Package Documentation

Browse R Packages

We want your feedback!

mskilab/gGnome
gGnome: reference based assembly graph for analyzing rearranged genomes

R/misc_utils.R
In mskilab/gGnome: gGnome: reference based assembly graph for analyzing rearranged genomes