#' @name gGnome
#' @title gGnome
#' @description
#' Reference-based graph representation of structurally altered genome
#' employing GenomicRanges framework.
#'
#'
#' Copyright (C) 2018 Xiaotong Yao, Joseph DeRose, Marcin Imielinski
#'
#' This program is free software: you can redistribute it and/or modify
#' it under the terms of the GNU General Public License as published by
#' the Free Software Foundation, either version 3 of the License, or
#' (at your option) any later version.
#'
#' This program is distributed in the hope that it will be useful,
#' but WITHOUT ANY WARRANTY; without even the implied warranty of
#' MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#' GNU General Public License for more details.
#'
#' You should have received a copy of the GNU General Public License
#' along with this program. If not, see <http://www.gnu.org/licenses/>.
#'
#' Github: https://github.com/mskilab/gGnome
#' For questions: xiaotong.yao23@gmail.com
#'
#' @importFrom parallel mclapply
#' @importFrom reshape2 melt
#' @importFrom VariantAnnotation readVcf info
#'
#' @import methods
#' @import R6
#' @import data.table
#' @import Matrix
#' @import jsonlite
#' @import GenomicRanges
#' @import igraph
#' @import gUtils
#' @import gTrack
#' @import fishHook
#' @useDynLib gGnome
"_PACKAGE"
#' @name cbind
#' @title cbind wrapper
#'
#' @description
#' Forcing correct call of cbind
#'
#' @param ... arguments to cbind
#'
#' @return vector of combined arguments
cbind = function(..., deparse.level = 1) {
lst_ = list(...)
## anyS4 = any(vapply(lst_, inherits, FALSE, c("DFrame", "DataFrame", "List")))
anyS4 = any(vapply(lst_, isS4, FALSE))
if (anyS4) cbind.DataFrame(...) else BiocGenerics::cbind(..., deparse.level = deparse.level)
}
## ================= gNode class definition ================== ##
#' @name gNode
#' @title gNode
#' @description
#' gNode object.
#' @export
gNode = setClass("gNode")
gNode = R6::R6Class("gNode",
public = list(
#' @name initialize
#' @description
#' Set up the constructor GNODE
#' index - snode.id
#' graph
#' @param snode.id signed node id
#' @param graph gGraph associated with this sNode
#' @author Joe DeRose
initialize = function(snode.id = NULL, graph)
{
private$pgraph = graph
private$ptimestamp = graph$timestamp ## inherit timestamp from graph object
private$porientation = private$prindex = private$pindex = c()
if (is.null(snode.id)) {
return(self)
}
if (!all(snode.id %in% graph$gr$snode.id)) {
stop("Not valid snode.id(s)")
}
private$pnode.id = abs(snode.id)
lookup = private$pgraph$queryLookup(snode.id)
private$pindex = lookup[, index]
private$prindex = lookup[, rindex]
private$porientation = sign(snode.id)
},
#' @name mark
#' @description
#'
#' Marks the nodes pointed at by this gNode by adding metadata in the gGraph they came from.
#' Metadata names can be arbitrary, such as "lwd = 7" or "highlight = TRUE".
#' gGraph nodes have metadata specified appended to them
#'
#' @param ... metadata names = data to store in metadata columns
#'
#' @details
#' ```
#' gg$nodes[1:5]$mark(col = "purple")
#' gg$nodes$mark(changed = FALSE)
#' @param ... name = value pairs of scalar or vector (length edges in graph) arguments
#' ```
#' @author Marcin Imielinski
mark = function(...)
{
self$check
if (length(self)>0)
{
NONO.FIELDS = c('node.id', 'snode.id', 'index', 'loose.left', 'loose.right')
args = list(...)
if (any(names(args) %in% NONO.FIELDS)){
stop(paste('Cannot modify these reserved gNode metadata fields:', paste(NONO.FIELDS, collapse = ', ')))
}
for (nm in names(args)){
private$pgraph$annotate(nm, args[[nm]],
private$pnode.id, "node")
}
}
},
#' @name rep
#' @description
#'
#' Creates "bubbles" in the graph by replicating the nodes in the argument. Node replication
#' replicates edges going in and out of all replicated nodes. If an edge connects a pair of replicated nodes
#' that edge will be replicated across all pairs of those replciated nodes. Walk replication will create "longer bubbles"
#' with fewer edges getting replicated i.e. it will only replicate intra-walk edges within each walk replicate (but not between
#' separate walk replicates).
#'
#' New graph keeps track of the parent node and edge ids in the original graph using node metadata parent.node.id
#' and edge metadata parent.edge.id i.e. the replicated nodes will be connected to the sources of the original nodes
#' and if replicated nodes connect to each other, then there will exist an edge connecting
#' all of their instances to each other.
#'
#' @param nodes = gNode object must point to a node in the graph, can also be an index of a node (but not a metadata expression), can also be a gWalk object
#' @param times scalar or vector of length self$length specifying how many times to replicate each of the nodes.
#' @author Marcin Imielinski
#' @return Returns a pointer to the new nodes
rep = function(times)
{
if (length(self)>0)
{
private$pgraph$rep(self, times)
}
return(invisible(self))
},
#' @name swap
#' @description
#'
#' Swap nodes with granges, grl, or Gwalks.
#' Provided replacement vector must be the same length as the inputted nodes, resulting in each node being "swapped" by the provided
#' interval, node, grl (representing a walk), or gWalk. The replacement will inherit left and right edges for the removed node.
#' If the replacement is a walk, then the left side of the first node in the walk will inherit the edges that were
#' previously to the left of the node being replaced, and right side of the last node of the walk will inherit the edges
#' that were previously to the right of the node being replaced.
#'
#' Note: these replacement obey the orientation of the arguments. So if the node to be replaced is flipped (- orientation with
#' respect to the reference, then it's "left" is to the right on the reference. Similarly for walks whose first interval
#' is flipped with respect to the reference, the left edges will be attached to the right of the node on the reference.)
#'
#' @param replacement GRanges, GRangesList, or gWalk object whose length is the length(nodes)
#' @author Marcin Imielinski
#' @return gGraph (also modified in place) with nodes annotated with parent.node.id, parent.rep
swap = function(replacement)
{
if (length(self)>0)
{
private$pgraph$swap(self, replacement)
}
},
#' @name subset
#' @description
#' Allows subseting of the Node object using bracket notation
#'
#' @param i integer or logical index
#' @return subset of nodes indexed by i
#' @author Joe DeRose
subset = function(i)
{
self$check
if (is.null(i)){
i = integer()
}
if (is.logical(i)){
i = which(i)
}
if (!is.null(dim(i)))
stop('subscript dimensions malformed')
if (length(i)>0 && (is.numeric(i) | is.integer(i))) {
if (max(abs(i), na.rm = TRUE) > self$length || max(abs(i), na.rm = TRUE) == 0) {
stop('index out of bounds')
}
}
## if character is provided as the query, we will
## use signed node id to look up signed node.ids in this object
## *** we will find the first match of the absolute value of the queried
## snode id and flip that node.id if necessary
if (is.character(i))
{
i = as.integer(i)
i = sign(i)*match(abs(i), abs(private$pnode.id))
if (any(is.na(i)))
{
stop('one or more signed node ids are not found in this object')
}
}
private$pnode.id = private$pnode.id[abs(i)]
new.pindex = ifelse(i>0, private$pindex[abs(i)], private$prindex[abs(i)])
new.prindex = ifelse(i>0, private$prindex[abs(i)], private$pindex[abs(i)])
private$pindex = new.pindex
private$prindex = new.prindex
private$porientation = private$porientation[abs(i)]*sign(i)
return(self)
},
#' @name eval
#' @description
#'
#' Evaluates expression over edge metadata for all nodes in self
#' with a keyby node.id returning a self$length vector
#' @param x expression over edge metadata
eval = function(x, right = TRUE, all = FALSE)
{
self$check
if (self$length==0)
return(NULL)
tmpdt = data.table()
if (right)
tmpdt = rbind(tmpdt, self$eright$dt[, node.id := n1])
if (!right | all)
tmpdt = rbind(tmpdt, self$eleft$dt[, node.id := n2])
if (nrow(tmpdt)==0)
return(rep(NA, self$length))
## remove any dups which will inflate the function
## eg if the same edges happen to be on the left and
## right of the given node
tmpdt = unique(tmpdt, by = c("edge.id", "node.id"))
out = eval(parse(text = paste("tmpdt[, ", lazyeval::expr_text(x), ", keyby = node.id]")))[.(self$id), V1]
return(out)
},
#' value for "cluster"
#'
#'
#' @param weak character scalar that can take one of the following possible values - "weak" or "strong" specifying weakly or strongly connected components, walktrap specifying cluster_walktrap community detection
#' @author Marcin Imielinski
#' @name clusters
#' @description
#'
#' Marks nodes in graph with metadata field $cluster
#' based on one of several algorithms, selected by mode.
#'
#' Unlike the gGraph version of this function, this usage enables computation of clusters
#' based on a subset of nodes in the graph (i.e. ignoring certain nodes)
#' while still marking the original graph (and leaving the excluded graph with an NA)
#' value for "cluster"
#'
#'
#' @param weak character scalar that can take one of the following possible values - "weak" or "strong" specifying weakly or strongly connected components, walktrap specifying cluster_walktrap community detection
#' @author Marcin Imielinski
clusters = function(mode = 'weak')
{
graph = self$graph$copy
graph$nodes$mark(parent.node.id = graph$nodes$dt$node.id)
sgraph = graph[self$dt$node.id, ]
sgraph$clusters(mode = mode)
self$graph$nodes$mark(cluster = as.numeric(NA))
self$graph$nodes[sgraph$nodes$dt$parent.node.id]$mark(cluster = sgraph$nodes$dt$cluster)
return(invisible(self$graph))
},
#' @name ego
#' @description
#'
#' returns all nodes with k degrees of separation (i.e. "order")
#' from these
#'
#' @param order integer edge distance from these (seed) nodes to return
#' @param mindist minimum distance from these (seed) nodes
#' @return gNode object comprising nodes within the ego of this node
#' @author Marcin Imielinski
ego = function(order = 0, mindist = 0)
{
self$check
G = self$graph$igraph
nodes = c(private$pindex, private$prindex)
egoi = unique(unlist(ego(G, order = order, nodes = nodes, mindist = mindist)))
egoini = unique(self$graph$gr[egoi]$node.id)
return(self$graph$nodes[egoini])
},
#' @name print
#' @description
#'
#' Prints out the gNode Object. Prints the length and the GRanges of the nodes.
print = function()
{
self$check
message('gNode object of length ', self$length)
if (self$length>0)
{
HEAD = 1:pmin(4, self$length)
print(self$gr[HEAD, ])
more = self$length-HEAD[length(HEAD)]
if (more>0)
{
message('... (', more,' additional nodes)')
}
}
},
#' @name loose.degree
#'
#' @param orientation (character) one of 'left' or 'right'
#'
#' @return number of loose ends with given orientation
loose.degree = function(orientation)
{
if (!(orientation %in% c('right', 'left'))){
stop('Bad orientation: "', orientation, '". orientation must be either "right", or "left"')
}
self$check
if (!nrow(private$pgraph$edgesdt))
return(0)
op.orientation = 'right'
if (orientation == 'right'){
op.orientation = 'left'
}
deg = rowSums(cbind(private$pgraph$edgesdt[, sum(n1.side == orientation), keyby = n1][.(private$pnode.id), V1], private$pgraph$edgesdt[, sum(n2.side == orientation), keyby = n2][.(private$pnode.id), V1]), na.rm = TRUE)
op.deg = deg
if (any(private$porientation<0)){
op.deg = rowSums(cbind(private$pgraph$edgesdt[ , sum(n1.side == op.orientation), keyby = n1][.(private$pnode.id), V1], private$pgraph$edgesdt[, sum(n2.side == op.orientation), keyby = n2][.(private$pnode.id), V1]), na.rm = TRUE)}
return(pmax(0, ifelse(private$porientation>0, deg, op.deg), na.rm = TRUE))
}
),
private = list(
## stores the unsigned node.id's of the nodes in private$pgraph in this gNode
pnode.id = NULL,
ptimestamp = NULL,
## Stores the index and reverse index of the pnode.id's in this gNode
pindex = NULL,
prindex = NULL,
## Stores the signs of the nodes associated with indices in private$pindex
porientation = NULL,
## Pointer to gGraph that this gNode belongs to
pgraph = NULL
),
active = list(
## object is stale if the recorded timestamp of the gGraph
## != timestamp the actual gGraph pointed to by pgraph
## suggesting that the indices are no longer valid
stale = function() private$ptimestamp != private$pgraph$timestamp,
## checks if object is stale i.e.
check = function() if (self$stale) stop('object is stale, underlying gGraph has changed. You will need to re-instantiate.'),
#' @name length
#' @description
#'
#' Returns the number of nodes in this gNode Object.
#'
#' @return Number of nodes in this gNode Object
length = function()
{
self$check
return(length(private$pnode.id))
},
#' @name copy
#' @title copy
#' @description
#'
#' Return a deep copy of the graph
#'
#' @return copy of the object
copy = function() self$clone(),
#' @name graph
#' @description
#'
#' Returns the gGraph this gNode points at.
#'
#' @return gGraph this gNode points at
graph = function()
{
self$check
return(private$pgraph)
},
#' @name footprint
#' @description
#'
#' Returns the reduced genomic footprint of this object
#'
#' @return GRanges of footprint of this object
footprint = function()
{
self$check
return(sort(reduce(self$gr)))
},
#' @name dt
#' @description
#'
#' Returns the GRanges of the nodes in the gNode as a data.table.
#'
#' @return data.table GRanges of the nodes coverted to a data.table
dt = function() {
self$check
return(as.data.table(private$pgraph$gr[private$pindex]))
},
#' @name gr
#' @description
#'
#' Returns a GRanges of the nodes in the gNode.
#'
#' @return GRanges of the nodes
gr = function() {
self$check
return(private$pgraph$gr[private$pindex])
},
#' @name id
#' @description
#'
#' Returns the node.id's of the nodes in this gNode
#'
#' @return Vector of the node.id's in this gNode
id = function()
{
self$check
return(private$pnode.id)
},
#' @name sid
#' @description
#'
#' Returns the snode.id's of the nodes in this gNode
#'
#' @return Vector of the snode.id's in this gNode
sid = function()
{
self$check
return(ifelse(private$porientation == 1, private$pnode.id, -private$pnode.id))
},
#' @name flip
#' @title flip
#' @description
#'
#' returns flipped version of this node
#'
#' @return reverse complemented gNode
flip = function()
{
self$check
sid = self$dt$snode.id
return(self$graph$nodes[-sid])
},
#' @name left
#' @description
#'
#' Returns a gNode containing the nodes connected to the left of the nodes in this
#' gNode. If there are no nodes to the left, returns an empty gNode.
#'
#' @return gNode Nodes connected to the left of the nodes in this gNode Object
left = function()
{
self$check
dt = copy(private$pgraph$sedgesdt)
setkeyv(dt, "to")
tmp = dt[.(private$pindex), from]
tmp = tmp[!is.na(tmp)]
leftNodes = gNode$new(snode.id = private$pgraph$gr$snode.id[tmp],
graph = private$pgraph)
return(leftNodes)
},
#' @name right
#' @description
#'
#' Returns a gNode containing the nodes connected to the right of the nodes in this
#' gNode. If there are no nodes to the right, returns an empty gNode.
#'
#' @return gNode Nodes connected to the right of the nodes in this gNode
right = function()
{
self$check
dt = copy(private$pgraph$sedgesdt)
setkeyv(dt, "from")
tmp = dt[.(private$pindex), to]
tmp = tmp[!is.na(tmp)]
rightNodes = gNode$new(snode.id = private$pgraph$gr$snode.id[tmp],
graph = private$pgraph)
return(rightNodes)
},
#' @name edges
#' @description
#'
#' Returns a gEdge containing the all edges connected to the nodes in this gNode.
#' If no edges are associated with this gNode, returns an empty gEdge.
#'
#' @return gEdge Edges connected to the nodes in this gNode
edges = function()
{
self$check
ed = c(self$eleft, self$eright)
if (length(ed)>0)
{
ed =ed[!duplicated(edge.id)]
}
return(ed)
},
#' @name eleft
#' @description
#'
#' Returns a gEdge containing the all edges connected to the left of the nodes in this gNode.
#' If no edges are left of this gNode, returns an empty gEdge.
#'
#' @return gEdge Edges connected to the left of the nodes in this gNode
eleft = function()
{
self$check
if (nrow(private$pgraph$sedgesdt)==0)
return(gEdge$new(graph = private$pgraph))
sedge.id = private$pgraph$sedgesdt[to %in% private$pindex, sedge.id]
return(gEdge$new(sedge.id, graph = private$pgraph))
},
#' @name eright
#' @description
#'
#' Returns a gEdge containing the all edges connected to the right of the nodes in this gNode.
#' If no edges are right of this gNode, returns an empty gEdge.
#'
#' @return gEdge Edges connected to the left of the nodes in this gNode
eright = function()
{
self$check
if (nrow(private$pgraph$sedgesdt)==0)
return(gEdge$new(graph = private$pgraph))
sedge.id = private$pgraph$sedgesdt[from %in% private$pindex, sedge.id]
return(gEdge$new(sedge.id, graph = private$pgraph))
},
#' @name loose
#' @description
#' Returns a GRanges containing loose ends information
#' @details
#' Returns info on loose ends connected to the nodes in this gNode. If there are no loose ends, returns an empty GRanges.
#'
#' @return GRanges Loose ends connected to the nodes in this gNode Object
loose = function()
{
return(c(self$lleft, self$lright))
},
#' @name loose.left
#' @description
#' sets or accesses a logical vector specifying whether the given node has a loose.end on its left
#'
#' @param value logical value to replace loose end value by
#' @author Marcin Imielinski
loose.left = function(value)
{
self$check
if (!missing(value))
{
if (!is.logical(value)){
stop('replacement must be logical vector')}
if (any(is.na(value))){
stop('replacement cannot contain NAs')}
ldeg = self$ldegree
value = value | ldeg==0
ix = private$porientation>0
if (any(ix)){
private$pgraph$annotate('loose.left', value[ix],
private$pnode.id[ix], "node")}
if (any(!ix)){
private$pgraph$annotate('loose.right', value[!ix],
private$pnode.id[!ix], "node")}
##FIXME: R6 doesn't like something wrong w setting active binding here
##FIXED: R6 won't let you do this with nested active bindings, i.e.
## gg$nodes$loose.left = TRUE won't work; but nodes = gg$nodes; nodse$loose.left = TRUE will
} else
{
return(ifelse(private$porientation>0,
self$gr$loose.left, self$gr$loose.right))
}
},
#' @name loose.right
#' @description
#' sets or accesses a logical vector specifying whether the given node has a loose.end on its right
#'
#' @param value logical value to replace loose end value by
#' @author Marcin Imielinski
loose.right = function(value)
{
self$check
if (!missing(value))
{
if (!is.logical(value)){
stop('replacement must be logical vector')
}
if (any(is.na(value))){
stop('replacement cannot contain NAs')
}
rdeg = self$rdegree
value = value | rdeg==0
ix = private$porientation>0
if (any(ix)){
private$pgraph$annotate('loose.right', value[ix],
private$pnode.id[ix], "node")
}
if (any(!ix)){
private$pgraph$annotate('loose.left', value[!ix],
private$pnode.id[!ix], "node")
}
##FIXME: R6 doesn't like something wrong w setting active binding here
##FIXED: R6 won't let you do this with nested active bindings, i.e.
## gg$nodes$loose.left = TRUE won't work; but nodes = gg$nodes; nodse$loose.left = TRUE will
} else
{
return(ifelse(private$porientation>0,
self$gr$loose.right, self$gr$loose.left))
}
},
degree = function()
{
self$check
deg = rowSums(cbind(
private$pgraph$edgesdt[n1.side == 'left', .N, keyby = n1][.(private$pnode.id), N],
private$pgraph$edgesdt[n2.side == 'left', .N, keyby = n2][.(private$pnode.id), N],
private$pgraph$edgesdt[n1.side == 'right', .N, keyby = n1][.(private$pnode.id), N],
private$pgraph$edgesdt[n2.side == 'right', .N, keyby = n2][.(private$pnode.id), N]),
na.rm = TRUE)
return(deg)
},
#' @name terminal
#' @title terminal
#' @description
#' Get a GRanges containing all terminal loose ends
#' @return GRanges
terminal = function()
{
self$check
if (self$length==0)
{
return(GRanges(seqlengths = seqlengths(self)))
}
ix = which(self$ldegree==0 | self$rdegree==0)
terminal.left = self[self$ldegree==0]$lleft
terminal.right = self[self$rdegree==0]$lright
return(c(terminal.left, terminal.right))
},
ldegree = function()
{
return(self$loose.degree(orientation = 'left'))
},
rdegree = function()
{
return(self$loose.degree(orientation = 'right'))
},
#' @name lleft
#' @description
#'
#' Returns a gNode containing the loose ends connected to the left of the nodes in this
#' gNode. If there are no loose ends to the left, returns an empty gNode.
#'
#' @return GRanges Loose ends connected to the left of the nodes in this gNode Object
lleft = function()
{
return(gNode.loose(self, orientation = 'left'))
},
#' @name lright
#' @description
#'
#' Returns a GRanges containing the loose ends connected to the right of the nodes in this
#' gNode. If there are no loose ends to the right, returns an empty GRanges
#'
#' @return GRanges Loose ends connected to the right of the nodes in this gNode Object
lright = function()
{
return(gNode.loose(self, orientation = 'right'))
},
#' @name dist
#' @description
#'
#' returns a distance matrix of all node pairs in this gGnode
#' representing base pairs separating each node pair
#'
#' @author Marcin Imielinski
dist = function()
{
self$check
return(private$pgraph$dist(private$pnode.id))
},
#' @name subgraph
#' @description
#'
#' Generates a new gGraph from the gGraph pointed at by this gNode containing only the nodes
#' in this gNode and the edges that connect them. The only edges included in this subgraph are
#' those that only contain the nodes in this gNode.
#'
#' @return gGraph Subgraph of all of the nodes in this gNode and the edges containing them
subgraph = function()
{
self$check
## Get all the edges that connect our nodes and remove the duplicate edge.id's as they will
## be automatically filled in in the Edge Constructor
edge.id = private$pgraph$sedgesdt[to %in% c(private$pindex, private$prindex)
& from %in% c(private$pindex, private$prindex), edge.id]
edge.id = edge.id[!duplicated(edge.id)]
edgeObj = gEdge$new(seid = edge.id,
graph = private$pgraph)
return(gGraph$new(nodeObj = self,
edgeObj = edgeObj,
meta = private$pgraph$meta))
}
)
)
## ================== Non-Member Functions for gNode ================== ##
#' @name length
#' @title length.gNode
#' @description
#'
#' The number of nodes in the gNode Object
#'
#' @param gNode a gNode object
#' @return the number of nodes in the gNode Object
#' @author Joe DeRose
#' @export
`length.gNode` = function(gNode)
{
if(!inherits(gNode, "gNode")) {
stop("Error: Invalid input")
}
return(gNode$length)
}
#' @name c
#' @title c.gNode
#' @description
#'
#' Concatenates gNode objects by node.id's stored in them. All arguments
#' must point at the same graph or an error will be thrown.
#'
#' @param gNode objects
#' @return a new concatenated gNode
#' @author Joe DeRose
#' @export
`c.gNode` = function(...)
{
gNode.list=list(...)
isg = sapply(gNode.list, function(x) class(x)[1]=='gNode')
if(any(!isg)){
stop('Error: All inputs must be of class gNode.')
}
##Check to make sure they all come from the same graph
graphs = lapply(gNode.list, function(x) x$graph)
if(length(unique(graphs))>1){
stop('Error: All gNodes must come from the same graph')
}
##Get all the pnode.id's to create new gNode
ids = lapply(gNode.list, function(x) x$id)
return (gNode$new(unlist(ids), graphs[[1]]))
}
#' @name setdiff
#' @title setdiff.gNode
#' @description
#'
#' Returns a new gNode which is the difference between x and y (node.id's).
#' All arguments must point at the same graph or an error will be thrown.
#'
#' @param x a gNode Object
#' @param y a gNode Object
#' @return new gNode Object containing the difference between x and y
#' @author Joe DeRose
#' @export
setMethod("setdiff", c("gNode", "gNode"),
function(x, y)
{
if(!identical(x$graph, y$graph)) {
stop("Arguments do not point to the same graph")
}
new.ids = setdiff(x$id, y$id)
return(gNode$new(new.ids, x$graph))
})
#' @name union
#' @title union.gNode
#' @description
#'
#' Returns a new gNode which is the union of x and y (node.id's). All
#' arguments must point at the same graph or an error will be thrown.
#'
#' @param x a gNode Object
#' @param y a gNode Object
#' @return new gNode Object containing the union of x and y
#' @author Joe DeRose
#' @export
#'
setMethod("union", c("gNode", "gNode"), function(x, y, ...)
{
if(!identical(x$graph, y$graph)) {
stop("Arguments do not point to the same graph")
}
new.ids = union(x$id, y$id)
return(gNode$new(new.ids, x$graph))
})
#' @name intersect.gNode
#' @title intersect.gNode
#' @description
#'
#' Returns a new gNode which is the intersection of x and y (node.id's).
#' All arguments must point at the same graph or an error will be thrown.
#'
#' @param x a gNode Object
#' @param y a gNode Object
#' @return new gNode Object containing the intersection of x and y
#' @author Joe DeRose
#' @export
"intersect.gNode" = function(x, y)
{
if(!identical(x$graph, y$graph)) {
stop("Arguments do not point to the same graph")
}
new.ids = intersect(x$id, y$id)
return(gNode$new(new.ids, x$graph))
}
registerS3method("intersect", "gNode", intersect.gNode, envir = globalenv())
#' @name [.gNode
#' @title gNode
#' @description
#'
#' Overloads subset operator for gNode. Allows subsetting of gNode via index or
#' snode.id (as a key). Also, allows for data.table style queries on gNode GRanges
#' and corresponding metadata.
#'
#' @param obj This is the gNode object to be subset
#' @param i integer, logical, or expression in gNode metadata used to subset gNodes
#' @return A new gNode that contains only the given id's
#' @author Joe DeRose
#' @export
'[.gNode' = function(obj, i = NULL, with = TRUE, ...)
{
nodes = obj$clone()
if (with)
{
## yes I finally figured it out!!!! grrrrrr
inew = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(i)))), parent.frame()), nodes$dt, parent.frame(2)), error = function(e) NULL)
if (is.null(inew))
{
inew = i ## just give up
}
}
else
{
inew = i
}
nodes$subset(inew)
return(nodes)
}
## ================= gEdge class definition ================== ##
#' @name gEdge
#' @title gEdge
#'
#' @description
#' gEdge obejct
#' @export
gEdge = setClass("gEdge")
gEdge = R6::R6Class("gEdge",
public = list(
#' @name gEdge constructor
#' @description
#' Constructor GEDGE
#' Builds an gEdge Class from a given edge.id
#' graph - the gGraph you want to build this class from
#' @param seid vector of signed integer edge ids
#' @param graph gGraph that this edge belongs to
#' @author Joe DeRose
initialize = function(seid = NULL, graph)
{
private$pgraph = graph ## reference to graph that these edges belong to
private$porientation = private$pedge.id = private$psedge.id = c()
private$ptimestamp = private$pgraph$timestamp
if (is.null(seid)) {
return(self)
}
if (any(is.na(private$pgraph$sedgesdt[.(seid), edge.id]))) {
stop("one or more provided signed edge ids are out of bounds")
}
private$porientation = rep(1, length(seid))
private$psedge.id = seid ## signed edge id, referring to the edge in the directed graph for which the $left direction is 5' and $right direction is '
private$pedge.id = abs(seid) ## unsigned edge id
private$pedges = private$pgraph$sedgesdt[list(seid), ]
return(self)
},
#' @name mark
#' @description
#'
#' Marks the nodes pointed at by this gEdge by adding metadata in the gGraph they came from.
#' Metadata names can be arbitrary, such as "lwd = 7" or "highlight = TRUE".
#' gGraph nodes have metadata specified appended to them
#'
#' @param ... metadata names = data to store in metadata columns
#' @param ... name = value pairs of scalar or vector (length edges in graph) arguments
#'
#' @details
#' ```
#' gg$edges[1:5]$mark(col = "purple")
#' gg$edges$mark(changed = FALSE)
#' ```
#' @author Marcin Imielinski
mark = function(...)
{
self$check
if (length(self)>0)
{
NONO.FIELDS = c('from', 'to', 'sedge.id', 'edge.id')
args = list(...)
if (any(names(args) %in% NONO.FIELDS)){
stop(paste('Cannot modify the following reserved gEdge metadata fields:', paste(NONO.FIELDS, collapse = ', ')))
}
for (nm in names(args)){
private$pgraph$annotate(nm, args[[nm]],
c(private$psedge.id, -private$psedge.id), "edge")
}
}
},
#' @name subset
#' @description
#' Allows subseting of the gNode object using bracket notation
#'
#' @param i integer or logical index
#' @return subset of nodes indexed by i
#' @author Joe DeRose
#' Allows for subsetting of the gEdge Object using bracket notation
subset = function(i)
{
self$check
if (is.null(i)){
i = integer()
}
if (is.logical(i)){
i = which(i)
}
if (!is.null(dim(i)))
stop('subscript dimensions malformed')
if ((length(i)>0) && (is.numeric(i) | is.integer(i)))
{
if (max(i, na.rm = TRUE)>self$length){
stop('index out of bounds')
}
}
if (is.character(i))
{
i = as.integer(i)
i = sign(i)*match(abs(i), abs(private$psedge.id))
}
private$psedge.id = sign(i)*private$psedge.id[abs(i)]
private$pedges = private$pgraph$sedgesdt[.(private$psedge.id), ]
private$pedge.id = private$pedges$edge.id
private$porientation = sign(i)*private$porientation[abs(i)]
return(self)
},
#' @name print
#' @description
#'
#' Prints out the gEdge. Prints the length and the data.table of the edges.
#' Prints the edges in the form n1, n2, n1.side, n2.side instead of to, from
print = function()
{
self$check
message("gEdge object with ", self$length, " edges")
if (self$length>0)
{
HEAD = 1:pmin(4, self$length)
tmpdt = self$dt[HEAD, ]
more = self$length-HEAD[length(HEAD)]
print(tmpdt)
if (more>0)
message('... (', more,' additional edges)')
}
}
),
private = list(
## data.table of the edges in this gEdge
pedges = NULL,
## edge.id's and sedge.id's of the edges in this gEdge
pedge.id = NULL,
psedge.id = NULL,
ptimestamp = NULL,
## Sign of our sedge.id's
porientation = NULL,
## Pointer to the gGraph
pgraph = NULL
),
active = list(
## every inter-edge in the bidirected graph is connection of the form a <-> b
## in the directed graph each inter edge is represented by
## two edges a->b (+ sign), b_->a_ (- sign) versions of that edge
## for the + signed edge a is "left" and b is "right"
## for the - signed edge b_ is "left" and a_ is "right"
#' @name graph
#' @description
#'
#' Returns the gGraph this gEdge points at.
#'
#' @return gGraph this gEdge points at
graph = function()
{
self$check
return(private$pgraph)
},
copy = function() self$clone(),
shadow = function() self$junctions$shadow,
## object is stale if the recorded timestamp of the gGraph
## != timestamp the actual gGraph pointed to by pgraph
## suggesting that the indices are no longer valid
stale = function() private$ptimestamp != private$pgraph$timestamp,
## checks if object is stale i.e.
check = function() if (self$stale) stop('object is stale, underlying gGraph has changed. You will need to re-instantiate.'),
#' @name length
#' @description
#'
#' Returns the number of edge pairs in this gEdge
#'
#' @return Number of edge pairs in this gEdge
length = function()
{
self$check
return(length(private$pedge.id))
},
#' @name footprint
#' @description
#'
#' Returns the reduced genomic footprint of this object
#'
#' @return GRanges of footprint of this object
footprint = function()
{
self$check
return(sort(reduce(gr.stripstrand(unlist(self$grl)))))
},
#' @name span
#' @description
#'
#' Returns a vector of spans of the junctions referred to by these
#' edges (i.e. reference distance between the breakpoints)
#'
#' @return Number of edge pairs in this gEdge
span = function()
{
self$check
return(self$junctions$span)
},
#' @name sign
#' @description
#'
#' Returns a vector of signs of the junctions referred to by these
#' edges (i.e. the product of the numeric interpretation of the strands)
#'
#' @return Number of edge pairs in this gEdge
sign = function()
{
self$check
return(self$junctions$sign)
},
#' @name left
#' @description
#'
#' Returns a gNode containing the nodes connected to the left (from's) of the edges in this
#' gEdges. If there are no nodes to the left, returns an empty gNode.
#'
#' @return gNode Nodes connected to the left of the edges in this gNode Object
left = function()
{
self$check
leftNodes = private$pedges[, from]
leftNodes = gNode$new(snode.id=private$pgraph$dt[, snode.id[leftNodes]], private$pgraph)
return(leftNodes)
},
#' @name right
#' @description
#'
#' Returns a gNode containing the nodes connected to the right (to's) of the edges in this
#' gNode. If there are no nodes to the right, returns an empty gNode.
#'
#' @return gNode Nodes connected to the right of the nodes in this gNode
## Returns the nodes connected to the right of the nodes
right = function()
{
self$check
rightNodes = private$pedges[, to]
return(gNode$new(snode.id = private$pgraph$dt[,snode.id[rightNodes]],
private$pgraph))
},
#' @name nodes
#' @description
#'
#' Returns a gNode containing the all nodes connected to the edges in this gEdges.
#' If no nodes are associated with this gEdge, returns an empty gNode.
#'
#' @return gNode Nodes connected to the nodes in this gEdge
nodes = function()
{
self$check
no = c(self$left, self$right)
if (length(no)>0)
no = no[!duplicated(node.id)]
return(no)
},
#' @name sdt
#' @description
#'
#' Returns a data.table of the signed edges in this gEdge in backend format (to, from, type, edge.id)
#'
#' @return data.table of the signed edges in this gEdge in to,from,type,edge.id form
sdt = function()
{
self$check
out = copy(private$pedges)
if (nrow(out))
{
out$n1 = self$graph$dt$snode.id[out$from]
out$n2 = self$graph$dt$snode.id[out$to]
}
return(out)
},
#' @name dt
#' @description
#'
#' Returns a data.table of the unsigned edges in this gEdge in front end format
#'
#' @return data.table of the unsigned edges in this gEdge
dt = function()
{
self$check
sides = c('left', 'right')
if (length(private$psedge.id)==0)
{
pedges = copy(private$pedges)
}
else
{
pedges = private$pgraph$sedgesdt[.(private$psedge.id), ]
}
return(copy(convertEdges(private$pgraph$gr, pedges, metacols = TRUE, cleanup = FALSE)[, n1.side := sides[n1.side+1]][, n2.side := sides[n2.side+1]]))
},
#' @name class
#' @description
#'
#' recomputes the "class" of each gEdge (note: automatically done during
#' gGRaph instantiation and added to edge metadata table as $class column
#' using the standard taxonomy of "DUP", "DEL", "TRA", and "INV" but
#' based on the orientation and reference location of junctions
#' but appending suffix "-like" to enunciate the fact that these are only
#' preliminary classifications of these junctions
class = function()
{
self$check
nodes = self$graph$nodes$gr
ed = self$dt[, .(n1, n1.side, n2, n2.side)]
ed[, intrachr := as.logical(seqnames(nodes)[n1] == seqnames(nodes)[n2])]
ed[intrachr == TRUE,
":="(
n1.coord = ifelse(n1.side == 'right', end(nodes)[n1], start(nodes)[n1]),
n2.coord = ifelse(n2.side == 'right', end(nodes)[n2], start(nodes)[n2])
)]
ed[intrachr == TRUE, span12 := n1.coord-n2.coord]
ed[, class := ifelse(intrachr,
ifelse(n1.side == n2.side, 'INV-like',
ifelse(span12<0 & n1.side == 'right' | span12>0 & n2.side == 'right',
ifelse(abs(span12)==1, 'REF', 'DEL-like'),
'DUP-like')
), 'TRA-like')]
return(ed$class)
},
#' @name junctions
#' @description
#'
#' Coverts the edges in this gEdge to junctions and returns these as a Junctions Object.
#' See Junctions Object documentation to see how junctions are defined in this package.
junctions = function()
{
self$check
grl = self$grl
return(Junction$new(grl))
},
grl = function()
{
self$check
if (length(self)==0)
{
## awkward but best constructor for empty grl with proper seqinfo
return(GRangesList(self$graph$gr[c()])[c()])
}
gr1 = gr.flipstrand(gr.end(private$pgraph$gr[private$pedges$from], ignore.strand = FALSE))
gr2 = gr.start(private$pgraph$gr[private$pedges$to], ignore.strand = FALSE)
grl = GenomicRanges::split(c(gr1, gr2), rep(1:length(gr1), 2))[as.character(1:length(gr1))]
names(grl) = private$edges$sedge.id
pedges = private$pgraph$sedgesdt[.(private$psedge.id), ]
meta = cbind(pedges, data.table(bp1 = gr.string(gr1), bp2 = gr.string(gr2)))
values(grl) = meta[, unique(c("edge.id", "sedge.id", "from", "to", "bp1", "bp2", colnames(meta))), with = FALSE]
return(grl)
},
#' @name id
#'
#' Returns the edge.id's of the edges in this gEdge
#'
#' @return edge.id's of the edges in this gEdge
id = function()
{
self$check
return(private$pedge.id)
},
#' @name subgraph
#' @description
#'
#' Generates a new gGraph from the gGraph pointed at by this gEdge containing only the edges
#' in this gEdgges and the nodes that connect them. The only nodes included in this subgraph are
#' those that only contain the edges in this gEdge.
#'
#' @return gGraph Subgraph of all of the edges in this gEdges and the nodes containing them
subgraph = function()
{
self$check
## Get all of the nodes in either to or from
nodeIndex = c(private$pedges[,to], private$pedges[,from])
## Get the unsigned node.id's and remove any duplicate nodes
node.id = private$pgraph$dt[index %in% nodeIndex, node.id]
node.id = node.id[!duplicated(node.id)]
nodeObj = gNode$new(snode.id = node.id,
graph = private$pgraph)
return(gGraph$new(nodeObj = nodeObj, edgeObj = self, meta = private$pgraph$meta))
}
)
)
## ================== Non-Member Functions for gEdge ================== ##
#' @name [.gEdge
#' @title gEdge
#' @description
#'
#' Overloads subset operator for gEdge. Allows subsetting of gEdge via index or
#' sedge.id (as a key). Also, allows for data.table style queries on gEdge and
#' corresponding metadata.
#'
#' @param obj This is the gEdge object to be subset
#' @param i integer, logical, or expression in gEdge metadata used to subset gEdge
#' @return A new gEdges that contains only the given id's
#' @author Joe DeRose
#' @export
'[.gEdge' = function(obj, i = NULL, from = NULL, to = NULL, with = TRUE, ...)
{
edges = obj$clone()
if (!is.null(from) | !is.null(to))
{
tmpdt = edges$graph$sedgesdt
sid = tmpdt$sedge.id
if (!is.null(from))
{
n1 = edges$graph$nodes[from]
setkeyv(tmpdt, c("from"))
sid = intersect(sid, tmpdt[.(n1$dt$index), sedge.id])
}
if (!is.null(to))
{
n2 = edges$graph$nodes[to]
setkeyv(tmpdt, c("to"))
sid = intersect(sid, tmpdt[.(n2$dt$index), sedge.id])
}
sid = setdiff(sid, NA)
edges$subset(sid)
}
if (length(edges)==0)
return(edges)
if (any(deparse(substitute(i)) == "NULL"))
return(edges)
if (with)
{
## yes I finally figured it out grrrrrr
inew = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(i)))), parent.frame()),edges$dt, parent.frame(2)), error = function(e) NULL)
if (is.null(inew))
inew = i ## just give up
} else
{
inew = i
}
edges$subset(inew)
return(edges)
}
#' @name length
#' @title length.gEdge
#' @description
#'
#' The number of edge pairs in the gEdge Object
#'
#' @param gNode a gEdge object
#' @return the number of edge pairs in the gEdge Object
#' @author Joe DeRose
#' @export
`length.gEdge` = function(gEdge)
{
if(!inherits(gEdge, "gEdge")) {
stop("Error: Invalid input")
}
return(gEdge$length)
}
#' @name c
#' @title c.gEdge
#' @description
#'
#' Concatenates gEdge by edge.id's stored in them. All arguments
#' must point at the same graph or an error will be thrown.
#'
#' @param gEdge objects
#' @return a new concatenated gEdge Object
#' @author Joe DeRose
#' @export
`c.gEdge` = function(...)
{
gEdge.list=list(...)
isg = sapply(gEdge.list, function(x) class(x)[1]=='gEdge')
if(any(!isg)){
stop('Error: All inputs must be of class gEdge.')
}
##Check to make sure they all come from the same graph
graphs = lapply(gEdge.list, function(x) x$graph)
if(length(unique(graphs))>1){
stop('Error: All gEdges must come from the same graph')
}
##Get all the pnode.id's to create new gNode
ids = lapply(gEdge.list, function(x) x$id)
return (gEdge$new(unlist(ids), graphs[[1]]))
}
#' @name setdiff
#' @title setdiff.gEdge
#' @description
#'
#' Returns a new gEdge which is the difference between x and y (edge.id's).
#' All arguments must point at the same graph or an error will be thrown.
#'
#' @param x a gEdge Object
#' @param y a gEdge Object
#' @export
#' @author Joe DeRose
#' @return new gEdge containing the difference between x and y
setMethod("setdiff", c("gEdge", "gEdge"),
function(x, y)
{
if(!identical(x$graph, y$graph)) {
stop("Arguments do not point to the same graph")
}
new.ids = setdiff(x$id, y$id)
return(gEdge$new(new.ids, x$graph))
})
#' @name union
#' @title union.gEdge
#' @description
#'
#' Returns a new gEdge which is the union of x and y (edge.id's).
#' All arguments must point at the same graph or an error will be thrown.
#'
#' @param x a gEdge Object
#' @param y a gEdge Object
#' @export
#' @return new gEdge containing the union of x and y
setMethod("union", c("gEdge", "gEdge"), function(x, y, ...)
{
if(!identical(x$graph, y$graph)) {
stop("Arguments do not point to the same graph")
}
new.ids = union(x$id, y$id)
return(gEdge$new(new.ids, x$graph))
})
#' @name intersect.gEdge
#' @title intersect.gEdge
#' @description
#'
#' Returns a new gEdge which is the intersection of x and y (edge.id's).
#' All arguments must point at the same graph or an error will be thrown.
#'
#' @param x a gEdge Object
#' @param y a gEdge Object
#' @export
#' @author Joe DeRose
#' @return new gEdge containing the intersection of x and y
"intersect.gEdge" = function(x, y)
{
if(!identical(x$graph, y$graph)) {
stop("Arguments do not point to the same graph")
}
new.ids = intersect(x$id, y$id)
return(gEdge$new(new.ids, x$graph))
}
registerS3method("intersect", "gEdge", intersect.gEdge, envir = globalenv())
## ================== Junction class definition ================== ##
#' @name Junction
#' @title Junction
#'
#' @description
#' Junction object
#'
#' @details
#' signed adjacency between two genomic loci
#'
#' @export
Junction = setClass("Junction")
Junction = R6::R6Class("Junction",
public = list(
#' @name Junction class constructor
#' @description
#' Builds junction class, grl must be GRangesList with each GRanges of length 2 JUNCTION
#' Empty junctions are removed
#' If grl is empty GRangesList, Junctions class is empty
#' @param grl GRangesList of signed breakpoint pairs
#' @author Joe DeRose and Rick Mortensen
initialize = function(grl = NULL, ...)
{
## Check to make sure input is GRangesList with each element of length 2
if (is.character(grl))
grl = read.juncs(grl, ...)
if (is.null(grl))
grl = GRangesList()
## Will allow elements to be empty and will just remove them
if (!inherits(grl, "GRangesList")) {
stop("Input is not a GRangesList")
}
widths = elementNROWS(grl)
empty = widths == 0
ix = widths != 2
if(any(ix[!empty])) {
stop(paste0("grl contains junctions that are of improper length. Indices are: ", paste(ix[!empty], collapse=" ")))
}
private$pjuncs = grl[!empty]
return(self)
},
#' @name removeDups
#' @description
#'
#' Removes duplicate junctions in this Junction Object
#'
#' this Junction Object has no duplicate junctions
#' @author Rick Mortensen
removeDups = function()
{
tmp = gr2dt(sort(grl.unlist(private$pjuncs)))
tmp = tmp[, paste(seqnames, start, end, strand, collapse = ','), by = grl.ix]
uix = tmp[!duplicated(V1), grl.ix]
private$pjuncs = private$pjuncs[uix]
return(self)
},
#' @name gw
#' @description
#'
#' Returns a gWalk representing the left -> right traversal of this junction +- padding
#'
#' @return GRangesList of the junctions in this Junction Object
gw = function(pad = 0)
{
left = gr.flipstrand(flank(self$left, -pad+1))
right = flank(self$right, -pad+1)
grl = grl.pivot(GRangesList(left, right))
values(grl) = self$meta
return(gW(grl = grl))
},
#' @name subset
#' @description
#' Allows subseting of the Junction object using bracket notation
#' @param i integer or self$length logical vector specifying subset
subset = function(i)
{
if (is.null(i)){
i = integer()
}
if (is.logical(i)){
i = which(i)
}
if (!is.null(dim(i)))
stop('subscript dimensions malformed')
if (length(i)>0 && (is.numeric(i) | is.integer(i))) {
if (max(i, na.rm = TRUE)>self$length) {
stop('index out of bounds')
}
if (any(i<0))
{
if (!all(i<0)){
stop('cannot mix positive with negative subscripts for Junction object')
}
i = setdiff(1:self$length, abs(i))
}
}
private$pjuncs = private$pjuncs[i]
return(self)
},
#' @name set
#' @title set
#' @description
#' set metadata for junction object
#'
#' @details
#' sets metadata to either a scalar or vector
#' where the vector is the same length as the junction object
#' ```
#' jj$set(cn = 7)
#' jj$set(cn = c(1,2,3))
#' jj$set(col = c("red", "blue", "green"))
#' ```
#' (after setting, metadata is accessible through $dt accessor)
set = function(...)
{
self$check
args = list(...)
NONO.FIELDS = c('junc','ix')
args = list(...)
meta = as.data.table(values(private$pjuncs))
if (any(names(args) %in% NONO.FIELDS)){
stop(paste('Cannot modify the following reserved gWalk metadata fields:', paste(NONO.FIELDS, collapse = ', ')))
}
for (arg in names(args)){
meta[[arg]] = args[[arg]]
}
values(private$pjuncs) = as.data.frame(meta)
return(invisible(self))
},
#' @name print
#' @description
#'
#' Prints out the Junction Object. Prints the length and the GRangesList of the junctions.
print = function() ## junction::print
{
message("Junction Object with ", self$length, " junctions\n")
if (self$length>0)
{
HEAD = 1:pmin(4, self$length)
bp = grl.unlist(self$grl[HEAD])
bpstr = gr.string(bp)
bpdt = data.table(str = bpstr, ix = bp$grl.ix)[, .(junc = paste(str, collapse = ' <-> ')), keyby = ix][.(HEAD), ]
if (ncol(self$dt)>0)
{
bpdt = cbind(bpdt[, "junc", with = FALSE], self$dt[HEAD, ])
}
print(bpdt)
more = self$length-HEAD[length(HEAD)]
if (more>0)
message('... (', more,' additional junctions)')
}
}
),
private = list(
## GRangesList of junctions, each element has length 2
pjuncs = NULL
),
active = list(
#' @name grl
#' @description
#'
#' Returns the GRangesList of the junctions in this Junction Object
#'
#' @return GRangesList of the junctions in this Junction Object
grl = function()
{
return(private$pjuncs)
},
#' @name length
#' @description
#'
#' Returns the number of junctions in this Junction Object.
#'
#' @return Number of junctions in this Junction Object
length = function()
{
return(length(private$pjuncs))
},
#' @name footprint
#' @description
#'
#' Returns the reduced genomic footprint of this object
#'
#' @return GRanges of footprint of this object
footprint = function()
{
self$check
return(sort(reduce(gr.stripstrand(unlist(self$grl)))))
},
#' @name shadow
#' @description
#'
#' Returns the "shadow" of this junction object
#' which for intrachromosomal junctions is the interval between the breakpoints
#' and for interchromosomal junctions is just the locations of the end points
#' Note that this is different from footprint: which is only the locations
#' of the breakpoints, and the output is not reduced but each junction will produce
#' either one or two ranges. The id can be traced by $id metadata.
#' @return GRanges of footprint of this object
shadow = function()
{
self$check
if (!length(self))
return(unlist(self$grl))
gru = gr2dt(grl.unlist(self$grl))
shadow = dt2gr(gru[, .(start = min(start), end = max(start)), by = .(seqnames, grl.ix)], seqlengths = seqlengths(self))[, 'grl.ix']
names(values(shadow)) = 'id'
return(shadow)
},
#' @name junc
#' @description
#' short character blurb of this junction
#'
junc = function()
{
bp = grl.unlist(self$grl)
bpstr = gr.string(bp)
data.table(str = bpstr, ix = bp$grl.ix)[, .(junc = paste(str, collapse = ' <-> ')), keyby = ix]$junc
},
copy = function() self$clone(),
#' @name dt
#' @description
#'
#' Returns the GRangesList of the junctions in the Junction Object as a data.table.
#'
#' @return data.table GRangesList of the junctions coverted to a data.table
dt = function()
{
return(as.data.table(values(private$pjuncs)))
},
#' @name breakpoints
#' @description
#'
#' Returns a GRanges representing the spots a genome needs to be broken for these
#' junctions to be added to the genome. If there is a pad on a junctions or
#' there is an approximate junction, the left end of a "-" strand and the right
#' most edge of a "+" strand will be used. Will remove duplicate breakpoints.
#'
#' @return GRanges of breakpoints generated from the junctions in this Junction Object
breakpoints = function()
{
## Get the breakpoints, left most end of - strand, right most end of + strand
bps = gr.end(granges(unlist(private$pjuncs)), ignore.strand = FALSE)
names(bps) = NULL
## Deal with shift at value one
gr = bps %Q% (strand == "+")
gr = GenomicRanges::shift(gr, -1)
gr = c(bps %Q% (strand == "-"), gr)
gr = gr[!duplicated(gr)]
return(gr)
},
#' @name left
#' @description
#'
#' Returns a GRanges of "left" ends of this junction object. Left / right is just an arbitrary
#' designation (left refers to the "first" endpoint in each junction
#'
#' @return GRanges of left end points of this junction object
left = function()
{
## Get the breakpoints, left most end of - strand, right most end of + strand
return(grl.pivot(private$pjuncs)[[1]])
},
#' @name right
#' @description
#'
#' Returns a GRanges of "right" ends of this junction object. Left / right is just an arbitrary
#' designation (right refers to the "second" endpoint in each junction)
#'
#' @return GRanges of right end points of this junction object
right = function()
{
## Get the breakpoints, left most end of - strand, right most end of + strand
return(grl.pivot(private$pjuncs)[[2]])
},
#' @name span
#' @description
#'
#' Returns the distance between breakpoint pairs on the genome
#'
#' @return vector of spans of all junctions in junction object
span = function()
{
if (!length(self))
return(c())
grl.eval(private$pjuncs, ifelse(seqnames[1]==seqnames[2], as.numeric(abs(diff(start))), Inf))
},
#' @name flip
#' @description
#'
#' Flips strand of junctions in junction object, i.e.
#' returning the reciprocal juncion
#'
#' @return flipped Junction object
flip = function()
{
tmp = grl.pivot(private$pjuncs)
tmp[[1]] = gr.flipstrand(tmp[[1]])
tmp[[2]] = gr.flipstrand(tmp[[2]])
newjunc = grl.pivot(tmp)
values(newjunc) = values(private$pjuncs)
private$pjuncs = newjunc
return(self)
},
#' @name sign
#' @description
#'
#' Sign is the product of strands, i.e. ++ and -- is 1
#' and "+-" and "-+" is -1
#'
#' @return vector of orientations of each junction in junction object
sign = function()
{
if (!self$length)
return(c())
return(grl.eval(private$pjuncs, sign((sign(strand[1]=='+')-0.5)*(sign(strand[2]=='+')-0.5))))
},
#' @name graph
#' @description
#'
#' Returns the gGraph this Junction Object points at.
#'
#' @return gGraph this Junction Object points at
graph = function()
{
return(gGraph$new(juncs = self))
}
)
)
## ================== Non-Member Functions for Junction ================== ##
#' @name length
#' @title length.Junction
#' @description
#'
#' The number of junctions in this Junction Object
#'
#' @param Junction a Junction Object
#' @return the number of junctions in the Junction Object
#' @export
`length.Junction` = function(Junction)
{
return(Junction$length)
}
#' @name c
#' @title c.Junction
#' @description
#'
#' Concatenates Junction Objects
#'
#' @param Junction object
#'
#' @return a new concatenated Junction Object
#' @author Rick Mortensen
#' @export
`c.Junction` = function(...)
{
juncs.list=list(...)
isg = sapply(juncs.list, inherits, 'Junction')
if(any(!isg)){
stop('Error: All inputs must be of class Junction.')
}
## Get all the pjuncs to create new Junction Object
grll = lapply(juncs.list, function(x) x$grl)
grlm = lapply(grll, function(x) as.data.table(values(x)))
newgrl = dodo.call(grl.bind, lapply(grll, function(x) {values(x) = NULL; x}))
values(newgrl) = rbindlist(grlm, fill = TRUE)
return (Junction$new(newgrl))
}
#' @name union
#' @title union.Junction
#' @description
#o'
#' Returns a new Junction Object which is the union of x and y.
#'
#' @param x a Junction Object
#' @param y a Junction Object
#' @author Rick Mortensen
#' @return new Junction Object containing the union of x and y
#' @export
setMethod("union", c('Junction', "Junction"), function(x, y, pad = 0, ignore.strand = FALSE, ...)
{
newJunc=c(x, y)
return(unique(newJunc, pad, ignore.strand))
})
#' @name unique
#' @title unique.Junction
#' @description
#'
#' Returns the subset of Junction object that it is unique
#'
#' @param x a Junction Object
#' @author Rick Mortensen
#' @return new Junction Object containing the union of x and y
#' @export
"unique.Junction" = function(x, pad = 0, ignore.strand = FALSE)
{
if (pad==0)
return(x$copy$removeDups())
else
return(x[!ra.duplicated(x$grl)])
}
setMethod("unique", c('Junction'), unique.Junction)
#' @name setdiff
#' @title setdiff.Junction
#' @description
#'
#' Returns a new Junction Object which is the difference between x and y.
#'
#' @param x a Junction Object
#' @param y a Junction Object
#' @author Rick Mortensen
#' @exportMethod setdiff
#' @return new Junction Object containing the difference between x and y
#' @export
setMethod("setdiff", c('Junction', "Junction"), function(x, y, pad = 0, ...)
{
ov = ra.overlaps(x$grl, y$grl, pad = pad)
ix = setdiff(1:length(x$grl), ov[,1])
return(x[ix])
})
#' @name intersect.Junction
#' @title intersect.Junction
#' @description
#'
#' Returns a new Junction Object which is the intersection of x and y.
#'
#' @param x a Junction Object
#' @param y a Junction Object
#' @author Rick Mortensen
#' @return new Junction Object containing the intersection of x and y
#' @export
#setMethod("intersect", c('Junction', 'Junction'), function(x, y, pad = 0, ...) {
"intersect.Junction" = function(x, y, pad = 0, ...) {
ov = ra.overlaps(x$grl, y$grl, pad = pad)
return(unique(x[ov[, 'ra1.ix']], pad = pad))
}
registerS3method("intersect", "Junction", intersect.Junction, envir = globalenv())
#' @name [.Junction
#' @title Junction
#' @description
#'
#' Overloads subset operator for Junction. Allows subsetting of Junction via index.
#' Also, allows for data.table style queries on Junction GRangesList and
#' corresponding metadata.
#'
#' @param obj Junction object This is the Junction object to be subset
#' @param i integer, logical, or expression in Junction metadata used to subset Junction
#' @return A new Junction object that contains only the given id's
#' @export
'[.Junction' = function(obj, i, j, with = TRUE){
juncs = obj$clone()
if (!missing(i))
{
if (with)
{
inew = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(i)))), parent.frame()),juncs$dt, parent.frame(2)), error = function(e) NULL)
if (is.null(inew))
inew = i ## just give up
}
else
{
inew = i
}
juncs$subset(inew)
}
if (!missing("j"))
{
grl = juncs$grl
values(grl) = values(grl)[, j, drop = FALSE]
juncs = jJ(grl)
}
return(juncs)
}
#' @name +.Junction
#' @description
#'
#' Allows padding of junctions. The rpad will be added to the left of a "+" junction
#' and to the right of "-" junction.
#'
#' @param jj Junction Object
#' @param pad Positive number representing amount to pad the Junction Object.
#' @return a new Junction class with the padding applied
#' @export
'+.Junction' = function(jj, pad)
{
new.grl = resize(jj$grl, pad+1, fix="end")
values(new.grl) = values(jj$grl)
return(Junction$new(new.grl))
}
#' @name union
#' Returns a new Junction object which is the union of x and y.
#'
#' @param x a Junction Object
#' @param y a Junction Object
#' @author Rick Mortenson
#' @exportMethod union
#' @return new Junction Object containing the union of x and y
setMethod("union", c("Junction", "Junction"),
function(x, y) {
newJunc=c(x, y)
newJunc$removeDups()
return(newJunc)
})
#' @name setdiff
#' Returns a new Junction object which is the difference between x and y (id's).
#'
#' @param x a Junction Object
#' @param y a Junction Object
#' @author Rick Mortenson
#' @exportMethod setdiff
#' @return new Junction containing the difference between x and y
setMethod("setdiff", c("Junction", "Junction"),
function(x, y) {
## Make sure that both come from the same graph
overlaps=ra.overlaps(x$grl, y$grl)
overlaps=overlaps[, "ra1.ix"]
all.ix=c(1:length(x$grl))
dif.ix=setdiff(all.ix, overlaps)
return(Junction$new(x$grl[dif.ix]))
})
#' @name refresh
#' @description
#'
#' Updates Junction object to reflect changes in source code
#'
#' @param Junction object
#' @return Junction object
#' @exportMethod refresh
#' @export
setGeneric("refresh", function(x) standardGeneric("refresh"))
setMethod("refresh", "Junction",
function(x) {
return(Junction$new(x$juncs))
})
## ================== gGraph class definition ================== ##
#' @name gGraph
#' @title gGraph
#'
#' @description
#' a genome graph object
#'
#' @export
gGraph = setClass("gGraph")
gGraph = R6::R6Class("gGraph",
public = list(
#' @name gGraph constructor
#' @description
#' All purpose constructor of gGraphs from
#' nodes, edges, junctions or various input formats (JaBbA, Weaver, etc)
#' @param genome Seqinfo or object coercible to seqinfo
#' @param breaks GRanges whose endpoints specify breakpoints in the genome
#' @param juncs Junction object or GRangesList coercible to Junction
#' @param jabba JaBbA graph rds file
#' @param cougar CouGar output directory path
#' @param weaver Weaver output directory path
#' @param remixt RemiXT output directory path
#' @param prego PREGO output directory path
#' @param rck RCK output directory path
#' @author Joe DeRose
initialize = function(genome = NULL,
breaks = NULL,
juncs = NULL,
alignments = NULL,
prego = NULL,
jabba = NULL,
cougar = NULL,
weaver = NULL,
remixt = NULL,
rck = NULL,
walks = NULL,
nodes = NULL,
edges = NULL,
nodeObj = NULL,
edgeObj = NULL,
meta = NULL,
verbose = FALSE
)
{
## control how to construct
meta = c(meta, list())
if (is.null(nodes) & is.null(edges))
{
if (!is.null(nodeObj))
{
private$gGraphFromNodeObj(nodeObj, edgeObj)
meta = meta[!duplicated(names(meta))]
private$pmeta = meta
private$stamp()
return(self)
}
else if (!is.null(alignments))
{
ne = alignments2gg(alignments, verbose)
}
else if (!is.null(prego))
{
ne = pr2gg(prego)
meta[['y.field']] = 'cn'
meta[['name']] = 'PREGO'
meta[['y0']] = 0
meta[['by']] = 'cn'
}
else if (!is.null(jabba))
{
ne = jab2gg(jabba)
meta[['y.field']] = 'cn'
meta[['name']] = 'JaBbA'
meta[['y0']] = 0
meta[['by']] = 'cn'
meta$purity = ne$purity
meta$ploidy = ne$ploidy
}
else if (!is.null(cougar))
{
ne = cougar2gg(cougar)
cgg = gG(breaks = ne$breaks, juncs = ne$juncs)
ne$nodes = cgg$gr
ne$edges = cgg$edgesdt
ne$nodes = inferLoose(ne$nodes, ne$edges)
meta[['name']] = 'CouGAR'
meta[['y.field']] = 'cn'
meta[['y0']] = 0
meta[['by']] = 'cn'
}
else if (!is.null(weaver))
{
ne = wv2gg(weaver)
ne$nodes = inferLoose(ne$nodes, ne$edges)
meta[['name']] = 'Weaver'
meta[['y.field']] = 'cn'
meta[['y0']] = 0
meta[['by']] = 'cn'
}
else if (!is.null(remixt))
{
ne = remixt2gg(remixt)
ne$nodes = inferLoose(ne$nodes, ne$edges)
meta[['name']] = 'ReMiXt'
meta[['y.field']] = 'cn'
meta[['y0']] = 0
meta[['by']] = 'cn'
}
else if (!is.null(rck))
{
ne = rck2gg(rck, haploid = TRUE) ## for now only unphased graphs
meta[['name']] = 'RCK'
meta[['y.field']] = 'cn'
meta[['y0']] = 0
meta[['by']] = 'cn'
}
else if (!is.null(walks))
{
gn = haplograph(walks, breaks)
ne = list(nodes = gn$nodes$gr, edges = gn$edges$dt)
}
else if(!is.null(breaks) || !is.null(juncs))
{
ne = breakgraph(breaks, juncs, genome)
}
else
{
private$emptyGGraph(genome)
return(self)
}
nodes = ne$nodes
edges = ne$edges
}
private$gGraphFromNodes(nodes, edges)
## set gGraph metadata
## (mostly gTrack settings at this point)
meta = meta[!duplicated(names(meta))]
private$pmeta = meta
return(self)
},
#' @name walks
#' @description
#' Exhaustively generates walks (if greedy = FALSE)
#' or otherwise applies a greedy heuristic (greedy = TRUE)
#' returns a gWalk object tied to this graph
#' warning: greedy = FALSE will not scale to large graphs
#' (i.e. may even hang on a couple of hundred nodes, depending
#' on the topology)
#' @param greedy logical scalar specifying whether to generate greedy walks
#' @param verbose logical scalar
#' @author Marcin Imielinski
walks = function(field = NULL, greedy = FALSE, verbose = FALSE)
{
if (greedy==TRUE){
stop('Greedy walks not yet implemented - please stay tuned')
}
A = self$adj
colnames(A) = rownames(A) = self$gr$snode.id
lleft = which(self$nodes$loose.left)
lright = which(self$nodes$loose.right)
llefti = self$queryLookup(lleft)$index
lrighti = self$queryLookup(lright)$index
lleftir = self$queryLookup(-lleft)$index
lrightir = self$queryLookup(-lright)$index
sources = c(llefti, lrightir)
sinks = c(lrighti, lleftir)
ap = all.paths(A, sources = sources, sinks = sinks, verbose = verbose)
## first make paths from any nodes that are both
## sources and sinks
tmp = self$gr$snode.id[intersect(sources, sinks)]
paths = split(tmp, seq_along(tmp))
paths = c(paths, lapply(ap$paths, function(x) self$gr$snode.id[x]))
cycles = lapply(ap$cycles, function(x) self$gr$snode.id[x])
## dedup reciprocals
## gets tricky for cycles since may be
## out of phase, however the (unordered) set of vertices / antivertices
## will be unique to that path / cycle and it's reciprocal
## so we don't need to worry about the sequence to match these up
## can just sort and match
## label each path / cycle
pstr = sapply(paths, function(x) paste(sort(x), collapse = ' '))
cstr = sapply(cycles, function(x) paste(sort(x), collapse = ' '))
## label reverse complement version
rpstr = sapply(paths, function(x) paste(sort(-x), collapse = ' '))
rcstr = sapply(cycles, function(x) paste(sort(-x), collapse = ' '))
## match up paths and their reverse complement
ped = cbind(seq_along(rpstr), match(pstr, rpstr))
pcl = igraph::clusters(igraph::graph.edgelist(ped), 'weak')$membership
## match up cycles and their reverse complement
## use vgrep here since cycles generated by all paths
## may be "out of phase" with their reverse complements
ced = cbind(seq_along(rcstr), match(cstr, rcstr))
ccl = igraph::clusters(igraph::graph.edgelist(ced), 'weak')$membership
paths = paths[!duplicated(pcl)]
cycles = cycles[!duplicated(ccl)]
circular = c(rep(FALSE, length(paths)),
rep(TRUE, length(cycles)))
return(gWalk$new(snode.id = c(paths, cycles), graph = self, circular = circular))
},
#' @name set
#' @description
#'
#' set metadata of gGraph
#' right now mainly useful for gTrack defaults
#' such as "name" or "colormaps", and also used
#' for setting default "by" field for simplify
#' but can be used for configuring other settings in the future
#' (note that this is graph level and not gNode or gEdge level
#' metadata)
#' @param ... name value pairs
#' @author Marcin Imielinski
set = function(...)
{
args = list(...)
for (arg in names(args)){
private$pmeta[[arg]] = args[[arg]]
}
return(invisible(self))
},
#' @name queryLookup
#' @description
#'
#' Returned a data.table of the provided snode.ids, their indicies and the indicies of
#' their reverse complements in the graph. data.table is keyed on snode.id.
#'
#' @param id snode.ids to look up
#' @param id signed node ids in graph
#' @return data.table of snode.ids, indicies and reverse complement indicies
#' @author Joe DeRose
queryLookup = function(id) {
dt = private$lookup[list(id)]
return(dt)
},
#' @name disjoin
#' @description
#' disjoins (i.e. collapses) all overlapping nodes in graph (subject to "by" argument), and aggregates node and edge
#' metadata among them using FUN
#' modifies the current graph
#' optional input gr will first concatenate a reference graph with GRanges gr prior to disjoining
#' collapse argument (if TRUE) will output a graph where there is a single
#' node per reference interval
#' and if collapse = FALSE will only disjoin all the nodes in the graph but keep all overlapping nodes separate
#' i.e. so that overlapping graphs are composed of a common set of disjoint intervals, but we allow there
#' to be several instances of a given interval among the different graphs
#' @param gr GRanges around which to disjoin the graph
#' @param by metadata field of current graph around which to limit disjoining
#' @param collapse logical scalar specifying whether to collapse graph nodes after disjoining
#' @param na.rm logical scalar specifying whether to remove NA's when aggregating metadata after collapsing
#' @param avg logical scalar specifying whether to average (if TRUE) or sum (if FALSE) numeric metadata during aggregation (default = FALSE)
#' @param FUN function which should take (numeric or character) x and na.rm = TRUE and return a scalar value
#' @author Marcin Imielinski
disjoin = function(gr = NULL, by = NULL, collapse = TRUE, na.rm = TRUE, avg = FALSE, sep = ',', FUN = default.agg.fun.generator(na.rm = na.rm, sep = sep, avg = avg))
{
this = self
if (length(self)==0)
return(invisible(self))
# if (!is.null(gr))
# this = c(self, gG(breaks = gr))
dgr = this$nodes$gr
if (!is.null(gr))
{
dgr = gr.stripstrand(grbind(dgr, gr %&% reduce(gr.stripstrand(dgr))))
}
## now we disjoin all the nodes and merge them back with their parents
dnodes = disjoin(dgr) %*% this$nodes$gr
## nmap maps our original node ids to the new node ids
## (but we throw out all "internal nodes" created by the disjoin
## here since we will use nmap to map edges onto the new disjoined nodes
nmap = gr2dt(dnodes)[, .(subject.id)][, dnode.id := 1:.N]
nmap[, left := start(this$nodes$gr)[subject.id] == start(dnodes)]
nmap[, right := end(this$nodes$gr)[subject.id] == end(dnodes)]
## clean up loose ends dnodes
## i.e. internal nodes can't be loose ends
dnodes$loose.left[nmap[left == FALSE, dnode.id]] = FALSE
dnodes$loose.right[nmap[right == FALSE, dnode.id]] = FALSE
## remove all non left / non right (i.e. "internal") nodes
## since we don't need them to remap edges
nmap = nmap[left | right, ]
## nmap will now be a one to 1/2 mapping
setkey(nmap, subject.id)
## update edge table to new dnodes n1 and n2
edges = copy(this$edges$dt)
## first merge n1 and n2 with nmap
## converting n1 / n2 edge pairs
## into candidate dnode.id.x / dnode.id.y edge pairs
## since we've already labeled nodes as left or right (or both)
## then we just need to subset on the merged dnode pairs
## that are abutting the correct side of the parent node
dedges = data.table(
n1 = integer(),
n1.side = character(),
n2 = integer(),
n2.side = character(),
type = character(),
sedge.id = integer())
if (nrow(edges)>0)
{
dedges = merge(edges[, .(sedge.id, n1, n1.side, n2, n2.side, type)],
nmap, by.x = 'n1', by.y = 'subject.id', allow.cartesian = TRUE)
setnames(dedges, c('left', 'right'), c('n1.left', 'n1.right'))
dedges = merge(dedges, nmap, by.x = 'n2', by.y = 'subject.id', allow.cartesian = TRUE)
setnames(dedges, c('left', 'right'), c('n2.left', 'n2.right'))
## remove edges to internal nodes
dedges = dedges[((n1.side == 'left' & n1.left) |
(n1.side == 'right' & n1.right)) &
((n2.side == 'left' & n2.left) |
(n2.side == 'right' & n2.right)), ]
dedges = dedges[, .(n1 = dnode.id.x, n2 = dnode.id.y, n1.side, n2.side, sedge.id, type)]
}
## we now add new reference
## adjacent edges connecting disjoined nodes
## in sequence to each other
tmp = gr2dt(dnodes)[, dnode.id := 1:.N][, .(dnode.id, seqnames, start, end, subject.id)]
setkeyv(tmp, c("subject.id", "start"))
missing = tmp[, .(
n1 = dnode.id[-.N],
n1.side = 'right',
n2 = dnode.id[-1],
n2.side = 'left'), by = subject.id][!is.na(n1) & !is.na(n2), ][, type := 'REF'][, sedge.id := NA]
## add these internal reference edges to new edges
newedges = rbind(dedges, missing[, -1, with = FALSE])
## identify unique dnodes in the new graph
## (+/- applying by = argument)
if (is.null(by))
{
udnodes = dnodes[!duplicated(dnodes$query.id), ]
## now we want to map, then project the newedges onto the the udnodes
## applying the aggregation function
dnodes$udnode.id = match(dnodes$query.id, udnodes$query.id)
}
else
{
if (!is.character(by))
stop('by must be a character vector specifying one or more node metadata columns')
tmp = gr2dt(dnodes)[, dup := duplicated(query.id), by = eval(by)]
udnodes = dt2gr(tmp[dup == FALSE, ][, -ncol(tmp), with = FALSE], seqlengths = seqlengths(dnodes))
dnodes$udnode.id =
match(do.call('paste', c(as.list(values(dnodes)[, c('query.id', by)]),
list(sep = ','))),
do.call('paste', c(as.list(values(udnodes)[, c('query.id', by)]),
list(sep = ','))))
}
## now project
colsn = c('node.id', 'snode.id', 'index', 'loose.left', 'loose.right')
colse = c('sedge.id', 'n1', 'n1.side', 'n2', 'n2.side', 'type')
metacolsn = setdiff(names(values(this$gr)), colsn)
metacolse = setdiff(names(this$edgesdt), colse)
final.nodes = gr2dt(dnodes[, c('udnode.id', 'loose.left', 'loose.right')])
if (length(metacolsn)>0){
final.nodes = cbind(final.nodes, as.data.table(values(this$gr))[dnodes$subject.id, metacolsn, with = FALSE])
}
final.edges = gr2dt(newedges[, colse, with = FALSE])
if (nrow(final.edges)>0 & nrow(edges)>0)
{
if (length(metacolse)>0)
{
final.edges = cbind(final.edges, edges[.(newedges$sedge.id), metacolse, with = FALSE])
}
final.edges[, type := ifelse(is.na(edges[.(newedges$sedge.id), type]), type, edges[.(newedges$sedge.id), type])]
}
if (!collapse)
{
private$gGraphFromNodes(dt2gr(final.nodes, seqlengths = seqlengths(dnodes)), final.edges)
return(invisible(self))
}
final.edges[, ":="(n1 = final.nodes$udnode.id[n1], n2 = final.nodes$udnode.id[n2])]
## fix edge type
if (is.null(FUN)){
FUN = function(x) x[1]
}
id.col = c("seqnames", "start", "end", "strand", "width", "udnode.id")
other.col = c(id.col, metacolsn)
loose.col = c(id.col, 'loose.left', 'loose.right')
## merge loose ends separately
## i.e. for gGraph we take their logical OR
## in bGraph, loose.agg.fun will be replaced
## by addition
tmp.nodes =
final.nodes[, loose.col, with = FALSE][, lapply(.SD, private$loose.agg.fun), by = .(seqnames, start, end, strand, width, udnode.id)]
## merge / aggregate metadata columns
## if any exist
if (length(metacolsn)>0)
{
final.nodes = skrub(final.nodes)
meta = final.nodes[, other.col, with = FALSE][, lapply(.SD, FUN), by = .(seqnames, start, end, strand, width, udnode.id)][, metacolsn, with = FALSE]
tmp.nodes = cbind(tmp.nodes,
meta)
}
## add annotations from gr if those exist
if (length(gr) && ncol(values(gr)))
{
tmp.nodes = cbind(tmp.nodes, as.data.table(values(gr)[gr.match(dt2gr(tmp.nodes), gr, by = by), ]))
}
final.nodes = tmp.nodes
final.edges = skrub(final.edges)
## here we will aggregate metadata for "identical"
## edges ... i.e. those that connect the same node / sides
## here we need to dedup based on those edge pairs that are identical
## but have n1 and n2 flipped (ie they are the identical edge
## but our aggregation will not recognize it)
## to do this we standardized the edge notation
## so that n1 < n2, where the "order" is arbitrary but standard / unambiguous
## (ordering also includes the "side")
## tmp.n1 = final.edges$n1
## tmp.n2 = final.edges$n2
## tmp.n1.side = final.edges$n1.side
## tmp.n2.side = final.edges$n2.side
old.final.edges = copy(final.edges)
## factor then convert to integer so we can apply an (arbitrary) order
n1.tag = final.edges[, paste(n1, n1.side)]
n2.tag = final.edges[, paste(n2, n2.side)]
ulev = union(n1.tag, n2.tag)
n1.tag = as.integer(factor(n1.tag, ulev))
n2.tag = as.integer(factor(n2.tag, ulev))
## final.edges[, n1 := pmin(tmp.n1, tmp.n2)]
## final.edges[, n2 := pmax(tmp.n1, tmp.n2)]
## final.edges[, n1.side := ifelse(tmp.n1<tmp.n2, tmp.n1.side, tmp.n2.side)]
## final.edges[, n2.side := ifelse(tmp.n1>=tmp.n2, tmp.n1.side, tmp.n2.side)]
n1 = ifelse(n1.tag<n2.tag, final.edges$n1, final.edges$n2)
n1.side = ifelse(n1.tag<n2.tag, final.edges$n1.side, final.edges$n2.side)
n2 = ifelse(n1.tag<n2.tag, final.edges$n2, final.edges$n1)
n2.side = ifelse(n1.tag<n2.tag, final.edges$n2.side, final.edges$n1.side)
final.edges$n1 = n1
final.edges$n2 = n2
final.edges$n1.side = n1.side
final.edges$n2.side = n2.side
## final merging of edges
final.edges = final.edges[, lapply(.SD, FUN), by = .(n1, n1.side, n2, n2.side)]
private$gGraphFromNodes(dt2gr(final.nodes, seqlengths = seqlengths(dnodes)), final.edges)
return(invisible(self))
},
#' @name simplify
#' @title simplify
#' @description
#' Simplifies gGraph by collapsing reference adjacent nodes
#' that lack a junction or loose end (ignore.loose = FALSE)
#' between them.
#'
#' Takes an optional "by" column. If by is not NULL simplify
#' will only collapse adjacent nodes if they share metadata
#' in the columns specified by "by"
#' @param by metadata field of current graph around which to limit simplification
#' @param na.rm logical scalar specifying whether to remove NA's when aggregating metadata after collapsing
#' @param avg logical scalar specifying whether to average (if TRUE) or sum (if FALSE) numeric metadata during aggregation (default = FALSE)
#' @param FUN function which should take (numeric or character) x and na.rm = TRUE and return a scalar value
#' @author Marcin Imielinski
simplify = function(by = private$pmeta$by, na.rm = TRUE, avg = TRUE, sep = ',', FUN = default.agg.fun.generator(na.rm = na.rm, sep = sep, avg = avg), ignore.loose = FALSE)
{
if (length(self)==0)
return(invisible(self))
if (length(self$edges)==0)
return(invisible(self))
edges = copy(self$edgesdt)
nodes = self$nodes$gr
## let's figure out reference adjacent dnode pairs
nodes.dt = gr2dt(nodes)[, c("node.id", "seqnames", "start", "end", by), with = FALSE]
nodes.dt[, endnext := end +1]
refadj = merge(nodes.dt, nodes.dt, by.x = 'endnext', by.y = 'start',
allow.cartesian = TRUE)[, .(n1 = node.id.x, n1.side = 'right', n2 = node.id.y, n2.side = 'left', qsedge.id = 1:.N)]
refadjr = copy(refadj)
refadjr$qsedge.id = -refadjr$qsedge.id
setnames(refadjr, c('n2', 'n2.side', 'n1', 'n1.side', 'qsedge.id'))
refadj = rbind(refadj, refadjr)
## label ref edges in current edges using refadj
setkeyv(edges, c("n1", "n1.side", "n2", "n2.side"))
edges[, ref := FALSE]
edges[refadj, ref := TRUE]
## collect non ref edges
abed = edges[is.na(ref), ]
edcount = rbind(edges[, .(node = n1, side = n1.side)],
edges[, .(node = n2, side = n2.side)]
)[, .(count = .N), keyby = .(node, side)]
abcount = rbind(edges[ref==FALSE, .(node = n1, side = n1.side)],
edges[ref==FALSE, .(node = n2, side = n2.side)]
)[, .(count = .N), keyby = .(node, side)]
## tabulate all edges (including "ref")
## now mark node sides that abut a non-reference adjacency
## or have more than one adjacency emanating from them
nodes.dt[, ab.right := abcount[.(node.id, 'right'), ifelse(is.na(count), 0, count)]>0 |
edcount[.(node.id, 'right'), ifelse(is.na(count), 0, count)]>1]
nodes.dt[, ab.left := abcount[.(node.id, 'left'), ifelse(is.na(count), 0, count)]>0 |
edcount[.(node.id, 'left'), ifelse(is.na(count), 0, count)]>1]
## also mark nodes that abut a loose end
if (!ignore.loose)
{
nodes.dt[, ab.left := ab.left | self$nodes$loose.left]
nodes.dt[, ab.right := ab.right | self$nodes$loose.right]
}
setkey(nodes.dt, node.id)
## keep track of interval sides
sides = rbind(nodes.dt[, .(node.id, side = 'right', ab = ab.right)],
nodes.dt[, .(node.id, side = 'left', ab = ab.left)])
setkeyv(sides, c('node.id', 'side'))
## an internal edge is a ref edge that connects two non-ab "sides"
edges[, n1.ab := sides[.(n1, n1.side), ab]]
edges[, n2.ab := sides[.(n2, n2.side), ab]]
edges[, internal := ref & !n1.ab & !n2.ab]
## if "by" provided, then we take this also into account
## to place additional constrain on internal edges
if (!is.null(by) && (by %in% names(nodes.dt)))
{
edges$n1.by = nodes.dt[.(edges$n1), ][[by]]
edges$n2.by = nodes.dt[.(edges$n2), ][[by]]
edges[, internal := internal & n1.by == n2.by]
}
## now make a quick graph from only internal edges and find clusters
## and their "ends", of which there is guaranteed to be one left
## and one right due to our definitions above
tmp.gr = self$nodes$gr
tmp.gr$node.id.og = tmp.gr$node.id
igg = gG(nodes = tmp.gr, edges = edges[internal == TRUE, ])
igg$clusters(mode = 'weak')
nodemap = data.table(node.id = igg$nodes$gr$node.id.og,
new.node.id = as.integer(factor(igg$nodes$gr$cluster, unique(igg$nodes$gr$cluster))), key = 'node.id',
left.end = igg$nodes$gr$loose.left,
right.end = igg$nodes$gr$loose.right
)
## start a fresh dt just in case we overwrote
## some user metadata columns
nodes.dt2 = gr2dt(nodes)
metadata.cols = setdiff(names(nodes.dt2),
c('seqnames', 'start', 'end', 'strand', 'width',
'snode.id', 'node.id', 'index', 'loose.left',
'new.node.id', 'left.end', 'right.end',
'loose.right'))
nodes.dt2 = cbind(nodemap[.(nodes.dt2$node.id), .(new.node.id, left.end, right.end)],
nodes.dt2)
if (is.null(FUN)) ## if no fun specified just take the first
FUN = function(x) x[1]
new.nodes =
nodes.dt2[, .(seqnames = seqnames[left.end],
start = start[left.end], end = end[right.end],
node.id.left = node.id[left.end],
node.id.right = node.id[right.end],
loose.left = loose.left[left.end],
loose.right = loose.right[right.end]), keyby = new.node.id]
if (length(metadata.cols)>0)
{
new.nodes = cbind(new.nodes, skrub(nodes.dt2)[, lapply(.SD, FUN), .SDcols = metadata.cols, keyby = new.node.id][.(new.node.id), metadata.cols, with = FALSE])
}
## now find the old nodes that comprise the "sides" of the new nodes
## and reconnect edges to these new ids
new.sides = rbind(
nodes.dt2[left.end == TRUE, .(node = node.id, side = 'left', new.node.id)],
nodes.dt2[right.end == TRUE, .(node = node.id, side = 'right', new.node.id)])
setkeyv(new.sides, c("node", "side"))
## only edges that remain are external edges
new.edges = NULL
if (nrow(edges)>0)
{
new.edges = self$edges$dt[abs(sedge.id) %in% abs(edges$sedge.id[!edges$internal])]
if (nrow(new.edges)>0)
{
new.edges$n1 = new.sides[.(new.edges$n1, new.edges$n1.side), new.node.id]
new.edges$n2 = new.sides[.(new.edges$n2, new.edges$n2.side), new.node.id]
}
}
## clean up new.nodes metadata
new.nodes$new.node.id = NULL
new.nodes$node.id.left = NULL
new.nodes$node.id.right = NULL
private$gGraphFromNodes(dt2gr(new.nodes, seqlengths = seqlengths(nodes)), new.edges)
return(invisible(self))
},
#' @name reduce
#' @title reduce
#' @description
#'
#' Reduces graph which is $disjoin() followed by a simplify()$
#' i.e. collapsing overlapping nodes, then merging
#' adjacent ones subject to (optional) matching on some
#' metadata field
#' @param by metadata field of current graph around which to limit reduction (i.e. disjoining and simplification)
#' @param na.rm logical scalar specifying whether to remove NA's when aggregating metadata after collapsing
#' @param avg logical scalar specifying whether to average (if TRUE) or sum (if FALSE) numeric metadata during aggregation (default = FALSE)
#' @param FUN function which should take (numeric or character) x and na.rm = TRUE and return a scalar value
#' @author Marcin Imielinski
reduce = function(...)
{
self$disjoin(...)
self$simplify(...)
},
#' @name subgraph
#' @description
#' compute subgraph within a certain distance or degree of separation
#' of (all nodes) intersection given GRanges "seed" window win
#' @param seed GRanges around which to subgraph
#' @param d distance in bp around which to subgraph
#' @param k order in number of edges which to subgraph
#' @param pad positive integer scalar padding to add to seed
#' @param ignore.strand logical scalar specifying whether to ignore.strand
#' @param verbose logical scalar
#' @author Xiaotong Yao
subgraph = function(seed = si2gr(self),
d=NULL,
k=0,
bagel=FALSE,
mod = FALSE,
ignore.strand=T,
verbose=FALSE
)
{
win = seed;
if (is.character(win))
win = parse.gr(win)
if (ignore.strand){
win = gr.stripstrand(win)
}
win = gr.fix(win, private$pnodes, drop = TRUE)
## DONE: what to do when win is larger than segs?????
## ans: return self
if (length(setdiff(streduce(private$pnodes), win))==0){
return(invisible(self))
}
## overlapping window and segs, removing loose ends
interGr = gr.findoverlaps(private$pnodes, win, ignore.strand=ignore.strand)
qix = interGr$query.id
if (!is.null(d)){
## no k, use distance
if (is.null(d) | d < 0){
stop("Must provide either valid k or d.")
}
## blend window with segs
win = gr.fix(win, private$pnodes)## fix seqinfo
ss = tryCatch(c(private$pnodes[, c()],
win[, c()]), error = function(e) NULL)
if (is.null(ss)){
ss = grbind(c(private$pnodes[, c()],
win[, c()]))
}
if (ignore.strand){
ss = gr.stripstrand(ss)
}
## TODO: can we take overlapping segs into the picture??
## break it into non-overlapping segs
ss = disjoin(ss)
## update win
win = gr.findoverlaps(ss, win, ignore.strand = ignore.strand)
## start/end of ss
seg.s = suppressWarnings(gr.start(ss, ignore.strand = TRUE))
seg.e = suppressWarnings(gr.end(ss, ignore.strand = TRUE))
## distance from all of win to start/end of ss
## DONE: connect with dist
D.s = suppressWarnings(self$dist(win, seg.s, verbose = verbose))
D.e = suppressWarnings(self$dist(win, seg.e, verbose = verbose))
## shortest path distance
min.s = apply(D.s, 2, min, na.rm = TRUE)
min.e = apply(D.e, 2, min, na.rm = TRUE)
## which idx bear them?
s.close = min.s<=d
e.close = min.e<=d
## generate new gGraph, trim the subgraph
out = GRanges(seqlengths = seqlengths(self))
if (any(s.close)){
out = c(out,
GenomicRanges::flank(seg.s[s.close],
-(d-min.s[s.close]))
)
}
if (any(e.close)){
out = c(out,
GenomicRanges::shift(flank(seg.e[e.close],
d-min.e[e.close]),1)
)
}
if (!bagel){
## out = streduce(c(win[, c()], out[, c()]))
out = streduce(gUtils::grbind(win[, c()], out[, c()]))
}
hoodRange = streduce(out)
return(self$trim(hoodRange))
}
else {
## with k, go no more k steps
kNeighbors = unique(unlist(igraph::ego(self$igraph, order=k, nodes=qix)))
ix = unique(private$pnodes[kNeighbors]$node.id)
return(self$nodes[ix]$subgraph)
}
},
#' @name clusters
#' @description
#'
#' Marks nodes in graph with metadata field $cluster
#' based on one of several algorithms, selected by mode
#' If i and j are specified, graph is first subsetted then
#' clusters computed, then cluster ids are lifted back to mark
#' the original graph.
#'
#' @param weak character scalar that can take one of the following possible values - "weak" or "strong" specifying weakly or strongly connected components, walktrap specifying cluster_walktrap community detection
#' @param i node filter to apply to graph prior to clustering
#' @param j edge filter to apply to graph prior to clustering
#' @author Marcin Imielinski
clusters = function(i = NULL,
j = NULL,
mode = 'weak'
)
{
filtered = FALSE
self$nodes$mark(cluster = NULL)
self$nodes$mark(pcluster = NULL)
self$nodes$mark(ncluster = NULL)
self$nodes$mark(cluster = as.integer(NA))
self$nodes$mark(pcluster = as.integer(NA))
self$nodes$mark(ncluster = as.integer(NA))
graph = self$copy
if (length(graph)==0)
return(invisible(self))
graph$nodes$mark(og.node.id = 1:length(graph))
if (all(deparse(substitute(i)) != "NULL")
| all(deparse(substitute(j)) != "NULL"))
{
## NSE voodoo to fill in the parent.frame values before passing on
i = deparse(eval(parse(text = substitute(deparse(substitute(i)))), parent.frame()))
j = deparse(eval(parse(text = substitute(deparse(substitute(j)))), parent.frame()))
if (length(i)>1)
i = paste(i, collapse = ' ')
if (length(j)>1)
j = paste(j, collapse = ' ')
graph = eval(parse(text = sprintf('graph[%s, %s]', i, j)))
graph$clusters(mode = mode)
self$nodes[graph$nodes$dt$og.node.id]$mark(
cluster = graph$nodes$dt$cluster,
pcluster = graph$nodes$dt$pcluster,
ncluster = graph$nodes$dt$ncluster
)
return(invisible(self))
}
algos = c('weak','strong','walktrap')
mode = as.character(factor(mode,algos))
if (is.na(mode))
{
stop(sprintf('mode argument must be one of the following possibilities', paste(allowed.levels, collapse = ',')))
}
G = self$igraph
if (mode %in% c("strong", "weak")){
membership = igraph::clusters(G, mode)$membership
}
else if (mode== 'walktrap'){
membership = igraph::cluster_walktrap(G)$membership
}
## note that membership may be different
## (for certain algorithms) for a node and its
## reverse complement, so we keep track of both
names(membership) = as.character(as.integer(self$gr$snode.id))
## "positive membership" membership of the positive strand of the node
pmembership = membership[as.character(self$nodes$dt$node.id)]
## "reverse" i.e. negative strand membership (will be different unless there is
## a palindromic community / component
nmembership = membership[as.character(-self$nodes$dt$node.id)]
## rename clusters so most popular is first
rename = data.table(names(rev(sort(table(c(pmembership, nmembership))))))[, structure(1:.N, names = V1)]
## rename again so that positive clusters are first
names(rename) = names(rename)[order(!(names(rename) %in% as.character(pmembership)))]
pmembership = rename[as.character(pmembership)]
nmembership = rename[as.character(nmembership)]
## final cluster membership of the node is the min of the two clusters
## this will ensure that the node inherits the most popular cluster of its
## two "sides"
## compose strand agnostic cluster name choosing most popular of the signed clusters
## for each node
tmp.cluster = pmin(pmembership, nmembership)
## then rename these according to their popularity so they are in decreasing
## order of popularity
tab.cluster = rev(sort(table(tmp.cluster)))
rename = structure(1:length(tab.cluster), names = names(tab.cluster))
cluster = rename[as.character(tmp.cluster)]
self$annotate('cluster', data = cluster, id = self$nodes$dt$node.id,
class = 'node')
self$annotate('pcluster', data = pmembership, id = self$nodes$dt$node.id,
class = 'node')
self$annotate('ncluster', data = nmembership, id = self$nodes$dt$node.id,
class = 'node')
return(invisible(self))
},
#' @name eclusters
#' #' @description
#' Marks ALT edges belonging (quasi) reciprocal cycles
#' @param thresh the distance threshold with which to group nearby quasi-reciprocal junctions - i.e. if thresh=0 then we only consider clusters of exactly reciprocal junctions.
#' @param mc.cores parallel
#' @param weak logical flag if TRUE will not differentiate between cycles and paths and will return all weakly connected clusters in the graph [FALSE]
#' @return numerical vector of the same length, Inf means they r not facing each other
#' @author Marcin Imielinski
eclusters = function(thresh = 1e3,
range = 1e6,
weak = TRUE,
paths = !weak,
mc.cores = 1,
verbose = FALSE,
chunksize = 1e30,
method = "single")
{
self$edges$mark(ecluster = as.integer(NA))
altedges = self$edges[type == "ALT", ]
if (verbose & weak)
message('Computing weak eclusters')
if (length(altedges)==0){
if (verbose){
message("No junction in this graph")
}
return(NULL)
}
bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix")]
bp.dt = gr2dt(bp)
ix = split(1:length(bp), ceiling(runif(length(bp))*ceiling(length(bp)/chunksize)))
ixu = unlist(ix)
eps = 1e-9
ij = do.call(rbind, split(1:length(bp), bp$grl.ix))
xt.adj = old.adj = Matrix::sparseMatrix(1, 1, x = 0, dims = rep(length(bp), 2))
if (verbose){
message(sprintf('Computing junction graph across %s ALT edges with distance threshold %s', length(altedges), thresh))
}
if (!exists(".INF")){
.INF = pmax(sum(seqlengths(self)), 1e9)
}
xt.adj[ixu, ] = do.call(rbind,
mclapply(ix,
function(iix)
{
if (verbose>1)
cat('.')
tmpm =
gr.dist(bp[iix],
gr.flipstrand(bp),
ignore.strand = FALSE)+eps
return(as(tmpm, "Matrix"))
},
mc.cores = mc.cores))
## get back to marcin's version
adj = xt.adj
adj[which(is.na(as.matrix(adj)))] = 0
adj[which(as.matrix(adj)>thresh)] = 0
## message(identical(as.logical(adj), as.logical(as.matrix(old.adj))))
xt.adj[which(is.na(as.matrix(xt.adj)))] = .INF + 1
## two breakpoints of the same junction should be distance 1
bp.pair = t(
sapply(unique(bp$grl.ix),
function(ix){
matrix(which(bp$grl.ix==ix), ncol=2, nrow=1)
}))
xt.adj[bp.pair] = 1
## do single linkage hierarchical clustering within `range`
hcl = stats::hclust(as.dist(xt.adj), method = "single")
hcl.lbl = cutree(hcl, h = thresh)
bp.dt$hcl = hcl.lbl
bp.hcl =
bp.dt[,.(hcl.1 = .SD[grl.iix==1, hcl],
hcl.2 = .SD[grl.iix==2, hcl]),
keyby=grl.ix]
## sometimes two breakpoints belong to diff hcl
## merge them!
altedges$mark(hcl.1 = bp.hcl[.(seq_along(altedges)), hcl.1])
altedges$mark(hcl.2 = bp.hcl[.(seq_along(altedges)), hcl.2])
hcl.ig = igraph::graph_from_edgelist(
bp.hcl[, unique(cbind(hcl.1, hcl.2))], directed = FALSE)
hcl.comp = components(hcl.ig)
altedges$mark(ehcl = as.integer(hcl.comp$membership)[bp.hcl[, hcl.1]])
## connect to MI's code
adj[adj>thresh] = 0
## check bp pairs to see if they are actually reference connected (ignore.strand = TRUE)
## on the given graphs ...
## which if we have many graphs overlapping the same intervals
## may not actually be the case
## we only check connectivity using ref edges
## compute reference graph distance and
## remove any bp pairs that are farther away
## on the reference graph than on the
## linear reference
##FIX ME: can't handle when there are no reference edges
refg = self[, type == 'REF']
bpp = Matrix::which(adj!=0, arr.ind = TRUE)
dref = pdist(bp[bpp[,1]], bp[bpp[,2]])
drefg = diag(refg$dist(bp[bpp[,1]], bp[bpp[,2]]))
ix = drefg>dref
if (any(ix))
adj[bpp[ix,, drop = FALSE]] = FALSE
if (verbose>1)
cat('\n')
adj = adj | t(adj) ## symmetrize
## bidirected graph --> skew symmetric directed graph conversion
## split each junction (bp pair) into two nodes, one + and -
## arbitrarily call each bp1-->bp2 junction is "+" orientation
## then all odd proximities adjacent to bp1 will enter the "+"
## version of that junction and exit the "-" version
## new matrix will be same dimension as adj
## however the nodes will represents + and -
## orientation of junctions
## using the foollowing conversion
## i.e.
## bp2 --> bp1 + +
## bp2 --> bp2 + -
## bp1 --> bp1 - +
## bp1 --> bp2 - -
## we'll use the same indices just to keep things confusing
junpos = bp1 = bp$grl.iix == 1
junneg = bp2 = bp$grl.iix == 2
adj2 = adj & FALSE ## clear out adj for new skew symmetric version
adj2[junpos, junpos] = adj[bp2, bp1]
adj2[junpos, junneg] = adj[bp2, bp2]
adj2[junneg, junpos] = adj[bp1, bp1]
adj2[junneg, junneg] = adj[bp1, bp2]
if (verbose)
message(sprintf('Created basic junction graph using distance threshold of %s', thresh))
## strongly connected components consists of (possibly nested) cycles
cl = split(1:length(bp), igraph::clusters(graph.adjacency(adj2), ifelse(weak, 'weak', 'strong'))$membership)
## choose only clusters with length > 1
cl = cl[S4Vectors::elementNROWS(cl)>1]
cl = cl[order(S4Vectors::elementNROWS(cl))]
jcl = lapply(cl, function(x) unique(sort(bp$grl.ix[x])))
jcls = sapply(jcl, paste, collapse = ' ')
jcl = jcl[!duplicated(jcls)]
adj3 = adj2
altedges$mark(ecycle = as.character(NA))
if (length(jcl)>0)
{
dcl = dunlist(unname(jcl))[, listid := paste0(ifelse(weak, '', 'c'), listid)]
if (!weak)
altedges[dcl$V1]$mark(ecycle = dcl$listid)
altedges[dcl$V1]$mark(ecluster = dcl$listid)
}
if (verbose)
message(sprintf('Annotated %s junction cycles in edge field $ecycle', length(jcl)))
if (paths & !weak)
{
if (verbose)
message('Analyzing paths')
## remove all cycles and enumerate remaining paths > 1
if (length(jcl)>0)
{
adj3[unlist(jcl), unlist(jcl)] = FALSE
}
sinks = Matrix::which(Matrix::rowSums(adj3)==0)
sources = Matrix::which(Matrix::colSums(adj3)==0)
cl2 = split(1:length(bp), igraph::clusters(graph.adjacency(adj3), 'weak')$membership)
cl2 = cl2[S4Vectors::elementNROWS(cl2)>1]
if (any(ix <- S4Vectors::elementNROWS(cl2)>2))
{ ## only need to do this for connected components that have 3 or more junctions
cl3 = do.call(c, mclapply(cl2[ix], function(x)
{
tmp.adj = adj3[x, x]
lapply(all.paths(tmp.adj, sources = sources, sinks = sinks, verbose = verbose)$paths, function(i) x[i])
}, mc.cores = mc.cores))
cl2 = c(cl2[!ix], cl3)
}
jcl2 = lapply(cl2, function(x) unique(sort(bp$grl.ix[x])))
jcls2 = sapply(jcl2, paste, collapse = ' ')
jcl2 = jcl2[!duplicated(jcls2)]
altedges$mark(epath = as.character(NA))
if (length(jcl2)>0)
{
dcl2 = dunlist(unname(jcl2))[, listid := paste0('p', listid)]
altedges[dcl2$V1]$mark(epath = dcl2$listid)
## also mark ecluster, though they may have
## overlapping edges
self$edges$mark(ecluster =
ifelse(
is.na(self$edges$dt$ecycle) &
is.na(self$edges$dt$epath), as.character(NA),
paste0(
ifelse(is.na(self$edges$dt$ecycle),
'',
self$edges$dt$ecycle),
ifelse(is.na(self$edges$dt$epath),
'',
self$edges$dt$epath))))
}
if (verbose)
message(sprintf('Annotated %s paths in edge field $epath', length(jcl2)))
}
return(invisible(self))
},
#' @name eclusters
#' @description
#' Marks ALT edges belonging (quasi) reciprocal cycles
#' @param juncs GRangesList of junctions
#' @param mc.cores parallel
#' @param only_chains TRUE will only pair breakend to its nearest nearest neighbor IFF the nearest neighbor is reciprocal
#' @param max.small size below which simple dups and dels are excluded
#' @param ignore.strand usually TRUE
#' @param weak logical flag if TRUE will not differentiate between cycles and paths and will return all weakly connected clusters in the junction graph [FALSE]
#' @return numerical vector of the same length, Inf means they r not facing each other
#' @author Marcin Imielinski
eclusters2 = function(thresh = 1e3,
range = 1e6,
weak = TRUE,
paths = !weak,
mc.cores = 1,
verbose = FALSE,
chunksize = 1e30,
method = "single",
return_pairs = FALSE,
ignore.small = TRUE,
max.small = 1e4,
ignore.isolated = TRUE,
strict = c("strict", "one_to_one", "loose"),
min.isolated = max.small,
only_chains = FALSE
)
{
if (!is.character(strict) ||
any(!strict %in% c("strict", "one_to_one", "loose"))) {
stop("strict must be one of 'strict', 'one_to_one', or 'loose'")
} else if (length(strict) > 1) {
strict = "one_to_one"
}
self$edges$mark(ecluster = as.integer(NA))
altedges = self$edges[type == "ALT", ]
if (ignore.small &&
!is.null(max.small) &&
!is.na(max.small) &&
max.small > 0) {
altedges = altedges[!((class == "DUP-like" |
class == "DEL-like") &
altedges$span <= max.small)]
}
if (verbose & weak)
message('Computing weak eclusters')
if (length(altedges)==0){
if (verbose){
message("No junction in this graph")
}
return(NULL)
}
deldup = copy3(altedges)[class %in% c("DUP-like", "DEL-like", "INV-like")]
if (length(deldup) > 0 && ignore.isolated) {
altes = deldup$shadow
## altes$sedge.id = altedges[class %in% c("DUP-like", "DEL-like")]$dt[altes$id]$sedge.id
bp = gr.noval(grl.unlist(altedges$grl), keep.col = c("grl.ix", "grl.iix", "class", "sedge.id"))
bp$sedge.id.y = bp$sedge.id; bp$sedge.id = NULL
addon = deldup$dt[altes$id][, .(sedge.id, class)]
altes$sedge.id = addon$sedge.id
altes$class = addon$class
altes$nbp = altes %N% bp # number of breakpoints of any SV that fall within segment
numsum = altedges$shadow %>% gr.sum # using the shadows of all of the SVs not just dels and dups
altes = altes %$% numsum
iso = ((altes) %Q% (score == 1.0))$id
## rm.edges = unique(altes[iso] %Q% (width < thresh))$sedge.id ## old
rm.edges = unique(altes[iso] %Q% (width < min.isolated))$sedge.id
rm.dups = S4Vectors::with(altes, sedge.id[class == "DUP-like" & nbp <= 2])
rm.dups = c(rm.dups, dedup.cols(gr2dt(altes %*% bp))[sedge.id != sedge.id.y][class == "DUP-like"][, .(all(1:2 %in% grl.iix), class.1 = class.1[1]), by = .(sedge.id, sedge.id.y)][, all(V1 == TRUE) & all(class.1 == "DUP-like"), by = sedge.id][V1 == TRUE]$sedge.id) # removing dups that have only other nested dups
rm.inv = S4Vectors::with(altes, sedge.id[class == "INV-like" & nbp <= 2 & width < min.isolated]) # removing inv that are isolated
rm.edges = unique(c(rm.edges, rm.dups, rm.inv))
keepeid = setdiff(altedges$dt$sedge.id, rm.edges)
altedges = altedges[as.character(keepeid)]
} # ignoring isolated dup and del edges that are smaller than threshold
if (verbose & weak)
message("Computing weak eclusters")
if (length(altedges) == 0) {
if (verbose) {
message("No junction in this graph")
}
return(NULL)
}
bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix", "class", "edge.id")]
bp$m.ix = seq_along(bp)
bp.dt = gr2dt(bp)
ix = split(1:length(bp), ceiling(runif(length(bp))*ceiling(length(bp)/chunksize)))
ixu = unlist(ix)
eps = 1e-9
ij = do.call(rbind, split(1:length(bp), bp$grl.ix))
xt.adj = xt.adj0 = Matrix::sparseMatrix(1, 1, x = 0, dims = rep(length(bp), 2))
if (verbose){
message(sprintf('Computing junction graph across %s ALT edges with distance threshold %s', length(altedges), thresh))
}
if (!exists(".INF")){
.INF = pmax(sum(seqlengths(self)), 1e9)
}
bp.pair = as.matrix(dcast(data.table(ix = seq_along(bp),
grl.ix = bp$grl.ix,
grl.iix = bp$grl.iix),
grl.ix ~ grl.iix, value.var = "ix")[,2:3])
bp.pair = rbind(bp.pair, cbind(bp.pair[,2], bp.pair[,1]))
ifun = function(iix, ignore.strand = FALSE,
verbose = FALSE, eps = 1e-9) {
if (verbose > 1)
cat(".")
tmpm = gr.dist(bp[iix], gr.flipstrand(bp), ignore.strand = ignore.strand) +
eps
return(as(tmpm, "Matrix"))
}
xt.adj[ixu, ] = do.call(rbind,
mclapply(ix, ifun, ignore.strand = FALSE,
mc.cores = mc.cores))
## only_chains = TRUE, enforcing that nearest breakpoints are only considered
diag(xt.adj) = NA_real_
if (only_chains) {
## enforcing that no distances between breakends from the same junction are considered
xt.adj[bp.pair] = NA_real_
xt.adj0[ixu, ] = do.call(rbind,
mclapply(ix, ifun, ignore.strand = TRUE,
mc.cores = mc.cores))
## enforcing that self-to-self breakend distances are not considered
diag(xt.adj0) = NA_real_
## enforcing that no distances between breakends from the same junction are considered
xt.adj0[bp.pair] = NA_real_
suppressWarnings({
nearest_ix = dunlist(lapply(
seq_len(nrow(xt.adj)),
function(x) which(xt.adj[x,] == min(xt.adj[x,], na.rm = T)))) %>%
as.matrix
nearest0_ix = dunlist(lapply(
seq_len(nrow(xt.adj0)),
function(x) which(xt.adj0[x,] == min(xt.adj0[x,], na.rm = T)))) %>%
as.matrix
})
if (nrow(nearest_ix) & nrow(nearest0_ix)) {
if (strict == "strict") {
nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
nearest0_ix = cbind(rowMins(nearest0_ix), rowMaxs(nearest0_ix))
nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
nearest0_ix = nearest0_ix[duplicated(nearest0_ix),,drop = FALSE]
nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
nearest0_ix = rbind(nearest0_ix, cbind(nearest0_ix[,2], nearest0_ix[,1]))
nearest_ix = as.matrix(merge(nearest_ix, nearest0_ix)) # if there is a breakend closer but in the wrong orientation, that distance will be thrown out downstream, i.e. there is a one to one match of breakend and any nearest breakend that is in the wrong orientation disqualifies the clustser
} else if (strict == "one_to_one") {
nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
nearest_ix = as.matrix(nearest_ix) # only one-to-one breakends are considered
} else if (strict == "loose") {
nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
nearest_ix = nearest_ix[!duplicated(nearest_ix),,drop = FALSE]
}
}
} else if (!only_chains) {
## two breakpoints of the same junction should be distance 1
xt.adj[bp.pair] = 1
}
rm(xt.adj0)
## get back to marcin's version
adj = xt.adj
adj[which(is.na(as.matrix(adj)))] = 0
adj[which(as.matrix(adj)>thresh)] = 0
## message(identical(as.logical(adj), as.logical(as.matrix(old.adj))))
xt.adj[which(is.na(as.matrix(xt.adj)))] = .INF + 1
if (only_chains && nrow(nearest_ix)) {
tmp = xt.adj[nearest_ix]
xt.adj[] = .INF + 1
adj[] = 0
xt.adj[nearest_ix] = tmp
adj[nearest_ix] = tmp
}
## below update - grab all the properly oriented links regardless
## of distance threshold - use metadata
## here we are making the reciprocal breakend metadata...
## creating a breakend table of reciprocal distances
dt = Matrix::which(xt.adj < thresh, arr.ind = T)
dt = unique(data.table(V1 = rowMins(dt), V2 = rowMaxs(dt)))
dt[, bp.dist := xt.adj[dt[, cbind(V1, V2)]]]
dt$sign = ifelse(strand(bp[dt$V1]) == "+",
ifelse(gr.flipstrand(bp[dt$V1]) > bp[dt$V2], -1, 1),
ifelse(gr.flipstrand(bp[dt$V1]) < bp[dt$V2], -1, 1))
dt2 = dt[,idx := seq_len(.N)] %>%
melt(measure.vars = c("V1", "V2"))
meta = cbind(gr2dt(bp)[dt2$value], dt2)[order(idx)]
meta = rbind(meta,
gr2dt(bp)[!paste(grl.ix, grl.iix) %in%
meta[, paste(grl.ix, grl.iix)]],
fill = T)
meta$thresh = thresh
dt = Matrix::which(adj > 0, arr.ind = T)
dt = unique(data.table(cbind(rowMins(dt), rowMaxs(dt))))
dt[, bp.dist := xt.adj[dt[, cbind(V1, V2)]]]
dt$sign = ifelse(strand(bp[dt$V1]) == "+",
ifelse(gr.flipstrand(bp[dt$V1]) > bp[dt$V2], -1, 1),
ifelse(gr.flipstrand(bp[dt$V1]) < bp[dt$V2], -1, 1))
dt2 = dt[,idx := seq_len(.N)] %>% melt(measure.vars = c("V1", "V2"))
meta.links = cbind(gr2dt(bp)[dt2$value], dt2)[order(idx)]
meta.links = rbind(meta.links,
gr2dt(bp)[paste(grl.ix, grl.iix) %nin% meta.links[, paste(grl.ix, grl.iix)]],
fill = T)
meta.links$thresh = thresh
## do single linkage hierarchical clustering within `range`
hcl = stats::hclust(as.dist(xt.adj), method = "single")
hcl.lbl = cutree(hcl, h = thresh)
bp.dt$hcl = hcl.lbl
bp.hcl =
bp.dt[,.(hcl.1 = .SD[grl.iix==1, hcl],
hcl.2 = .SD[grl.iix==2, hcl]),
keyby=grl.ix]
## sometimes two breakpoints belong to diff hcl
## merge them!
altedges$mark(hcl.1 = bp.hcl[.(seq_along(altedges)), hcl.1])
altedges$mark(hcl.2 = bp.hcl[.(seq_along(altedges)), hcl.2])
hcl.ig = igraph::graph_from_edgelist(
bp.hcl[, unique(cbind(hcl.1, hcl.2))], directed = FALSE)
hcl.comp = components(hcl.ig)
altedges$mark(ehcl = as.integer(hcl.comp$membership)[bp.hcl[, hcl.1]])
## connect to MI's code
adj[adj>thresh] = 0
## check bp pairs to see if they are actually reference connected (ignore.strand = TRUE)
## on the given graphs ...
## which if we have many graphs overlapping the same intervals
## may not actually be the case
## we only check connectivity using ref edges
## compute reference graph distance and
## remove any bp pairs that are farther away
## on the reference graph than on the
## linear reference
##FIX ME: can't handle when there are no reference edges
refg = self[, type == 'REF']
bpp = Matrix::which(adj!=0, arr.ind = TRUE)
dref = pdist(bp[bpp[,1]], bp[bpp[,2]])
drefg = diag(refg$dist(bp[bpp[,1]], bp[bpp[,2]]))
ix = drefg>dref
if (length(ix))
adj[bpp[ix,, drop = FALSE]] = FALSE
if (verbose>1)
cat('\n')
adj = adj | t(adj) ## symmetrize
## bidirected graph --> skew symmetric directed graph conversion
## split each junction (bp pair) into two nodes, one + and -
## arbitrarily call each bp1-->bp2 junction is "+" orientation
## then all odd proximities adjacent to bp1 will enter the "+"
## version of that junction and exit the "-" version
## new matrix will be same dimension as adj
## however the nodes will represents + and -
## orientation of junctions
## using the foollowing conversion
## i.e.
## bp2 --> bp1 + +
## bp2 --> bp2 + -
## bp1 --> bp1 - +
## bp1 --> bp2 - -
## we'll use the same indices just to keep things confusing
junpos = bp1 = bp$grl.iix == 1
junneg = bp2 = bp$grl.iix == 2
adj2 = adj & FALSE ## clear out adj for new skew symmetric version
adj2[junpos, junpos] = adj[bp2, bp1]
adj2[junpos, junneg] = adj[bp2, bp2]
adj2[junneg, junpos] = adj[bp1, bp1]
adj2[junneg, junneg] = adj[bp1, bp2]
if (verbose)
message(sprintf('Created basic junction graph using distance threshold of %s', thresh))
## strongly connected components consists of (possibly nested) cycles
cl = split(1:length(bp), igraph::clusters(graph.adjacency(adj2), ifelse(weak, 'weak', 'strong'))$membership)
## choose only clusters with length > 1
cl = cl[S4Vectors::elementNROWS(cl)>1]
cl = cl[order(S4Vectors::elementNROWS(cl))]
jcl.0 = lapply(cl, function(x) unique(sort(bp$edge.id[x])))
jcls = sapply(jcl.0, paste, collapse = " ")
jcl = jcl.0[!duplicated(jcls)]
adj3 = adj2
altedges$mark(ecycle = as.character(NA))
self$nodes$mark(ecluster = NA_integer_)
self$set(recip_event = data.table())
if (NROW(jcl)>0)
{
## dcl = dunlist(unname(jcl))[, listid := paste0(ifelse(weak, '', 'c'), listid)]
dcl.0 = dunlist(unname(jcl.0))
setnames(dcl.0, "listid", "jcl.0")
dcl = dunlist(unname(jcl))[, `:=`(listid, paste0(ifelse(weak,
"", "c"), listid))]
dcl.0$listid = dcl[V1 %K% dcl.0$V1]$listid
cl.1 = dunlist(unname(cl))
setnames(cl.1, "listid", "jcl.0")
cl.eclust = merge(cl.1,
unique(dcl.0[, .(jcl.0, listid)]), by = "jcl.0")
if (!weak)
altedges[as.character(dcl$V1)]$mark(ecycle = dcl$listid)
altedges[as.character(dcl$V1)]$mark(ecluster = dcl$listid)
## mark edge id's with ecluster id's
meta = merge(meta,
altedges$dt[
,.(edge.id,
ecluster,
hcl.1,
hcl.2,
ehcl,
ecycle)], by = "edge.id")
meta.thresh = merge(meta.links[bp.dist < thresh], altedges$dt[, .(edge.id, ecluster, hcl.1, hcl.2, ehcl, ecycle)], by = "edge.id")
meta.links = rbind(meta.thresh, meta.links[!na2false(bp.dist < thresh)], fill = T)
meta[, `:=`(
num_positive = sum(sign[!duplicated(idx)] > 0, na.rm = T),
num_negative = sum(sign[!duplicated(idx)] < 0, na.rm = T)
),
by = ecluster]
## self$set(recip_bp = meta)
meta[!is.na(ecluster),
`:=`(
nclust = length(unique(edge.id)),
all_positive = all(replace(sign, is.na(sign), 3e9) > 0),
all_negative = all(replace(sign, is.na(sign), -3e9) < 0),
mixed = {naom = na.omit(sign); any(naom > 0) & any(naom < 0)},
bridge = anyNA(bp.dist)
),
by = ecluster]
self$set(recip_links = meta.links)
recip_bp = copy(meta)
eclust.edge = self$edges[!is.na(ecluster)]
ndt = data.table::melt(eclust.edge$dt, id.vars = c("ecluster"),
measure.vars = c("n1", "n2"))[!duplicated(cbind(ecluster, value))]
ndt$nodefp = gr.string(gr.stripstrand(self$nodes[as.character(ndt$value)]$gr)) # footprint of the involved node
self$nodes[ndt$value]$mark(ecluster = ndt$ecluster)
nnodes = ndt[, .(nnodes = length(unique(value))), by = ecluster]
recip_bp = merge(recip_bp[!is.na(ecluster)], nnodes, by = "ecluster")
recip_bp[intersect((GenomicRanges::order(gr.stripstrand(dt2gr(recip_bp)))),
which(!is.na(sign))), `:=`(fstart = start[1], fend = start[2]), by = .(idx)]
recip_bp[!is.na(fstart), `:=`(footprint = (unique(paste0(unique(paste0(seqnames, ":", fstart, "-", fend, ifelse(sign > 0, "+", "-"))), collapse = ",")))), by = .(ecluster)] # footprint of the reciprocal bridge
recip_bp[, footprint := na.omit(footprint)[1], by = .(ecluster)]
recip_event = recip_bp[
,.(njuncs = nclust[1], nnodes = nclust[1],
num_positive = num_positive[1], num_negative = num_negative[1],
all_positive = all(replace(sign, is.na(sign), 3e9) > 0),
all_negative = all(replace(sign, is.na(sign), -3e9) < 0),
mixed = unique(na.omit(mixed)),
bridge = unique(na.omit(bridge)),
footprint = footprint[1]), by = ecluster]
edt = copy3(self$edges$dt)[
, `:=`(unfused.n1 = n1 + ifelse(n1.side == "right", 1L, -1L),
unfused.n2 = n2 + ifelse(n2.side == "right", 1L, -1L))]
edt$fp1 = self$nodes[as.character(edt$n1)]$gr %>% gr.stripstrand %>% gr.string
edt$fp2 = self$nodes[as.character(edt$n2)]$gr %>% gr.stripstrand %>% gr.string
edt$unfused.fp1 = self$nodes[as.character(edt$unfused.n1)]$gr %>% gr.stripstrand %>% gr.string
edt$unfused.fp2 = self$nodes[as.character(edt$unfused.n2)]$gr %>% gr.stripstrand %>% gr.string
recip_bp = merge(recip_bp,
edt,
by = "edge.id",
all.x = T,
suffixes = c("", ".y_remove_"))## %>%
## dplyr::select(-dplyr::matches(".y_remove_"))
rmcols = grepl(".y_remove_", colnames(recip_bp))
recip_bp = recip_bp[,!rmcols, with = F]
ufp = recip_bp %>% melt(id.vars = c("ecluster"), measure.vars = c("unfused.fp1", "unfused.fp2"))
## ufp2 = with(ufp, parse.gr(value, meta = g2())) %>% gr.spreduce(ecluster) %>% within({unrfp = gr.string(g2())}) %>% gr2df
ufp2 = parse.gr(ufp$value, meta = ufp) %>% gr.spreduce(ecluster)
ufp2$unrfp = gr.string(ufp2)
ufp2 = gr2dt(ufp2)
ufp3 = ufp2[!duped(ecluster, unrfp)][, .(unrfp = paste(sort(unique(unrfp)), collapse = ",")), by = .(ecluster)]
un = recip_bp %>% melt(id.vars = c("ecluster"), measure.vars = c("unfused.n1", "unfused.n2"))
un2 = un[!duped(ecluster, value)][, paste(sort(unique(value)), collapse = ","), by = .(ecluster)]
un2 = setcols(un2, "V1", "un.snode.ids")
recip_bp = merge.repl(merge.repl(recip_bp, ufp3, by = "ecluster"), un2, by = "ecluster")
self$set(recip_bp = recip_bp)
this = ndt[order(ecluster)][GenomicRanges::order(parse.gr(nodefp))]
ndredf = parse.gr(this$nodefp, meta = this) %>% gr.spreduce(ecluster)
## ndredf = with(this, parse.gr(nodefp, meta = g2())) %>% gr.spreduce(ecluster) # reduced involved node footprint
dt = data.table(ecluster = ndredf$ecluster, wid = width(ndredf), tmp = gr.string(ndredf))[, .(nodefp = paste(tmp, collapse = ","), nrwid = sum(wid), nrfp = .N), keyby = ecluster] # nrfp is the number of reduced footprints
recip_event = merge(recip_event, dt, by = "ecluster") %>% merge(this[, .(snode.ids = paste(value, collapse = ",")), keyby = ecluster], by = "ecluster")
recip_event = merge.repl(merge.repl(recip_event, ufp3, by = "ecluster"), un2, by = "ecluster")
self$set(recip_event = recip_event)
}
if (verbose)
message(sprintf('Annotated %s junction cycles in edge field $ecycle', length(jcl)))
if (paths & !weak)
{
if (verbose)
message('Analyzing paths')
## remove all cycles and enumerate remaining paths > 1
if (length(jcl)>0)
{
adj3[unlist(jcl), unlist(jcl)] = FALSE
}
sinks = Matrix::which(Matrix::rowSums(adj3)==0)
sources = Matrix::which(Matrix::colSums(adj3)==0)
cl2 = split(1:length(bp), igraph::clusters(graph.adjacency(adj3), 'weak')$membership)
cl2 = cl2[S4Vectors::elementNROWS(cl2)>1]
if (any(ix <- S4Vectors::elementNROWS(cl2)>2))
{ ## only need to do this for connected components that have 3 or more junctions
cl3 = do.call(c, mclapply(cl2[ix], function(x)
{
tmp.adj = adj3[x, x]
lapply(all.paths(tmp.adj, sources = sources, sinks = sinks, verbose = verbose)$paths, function(i) x[i])
}, mc.cores = mc.cores))
cl2 = c(cl2[!ix], cl3)
}
jcl2 = lapply(cl2, function(x) unique(sort(bp$grl.ix[x])))
jcls2 = sapply(jcl2, paste, collapse = ' ')
jcl2 = jcl2[!duplicated(jcls2)]
altedges$mark(epath = as.character(NA))
if (length(jcl2)>0)
{
dcl2 = dunlist(unname(jcl2))[, listid := paste0('p', listid)]
altedges[dcl2$V1]$mark(epath = dcl2$listid)
## also mark ecluster, though they may have
## overlapping edges
self$edges$mark(ecluster =
ifelse(
is.na(self$edges$dt$ecycle) &
is.na(self$edges$dt$epath), as.character(NA),
paste0(
ifelse(is.na(self$edges$dt$ecycle),
'',
self$edges$dt$ecycle),
ifelse(is.na(self$edges$dt$epath),
'',
self$edges$dt$epath))))
}
if (verbose)
message(sprintf('Annotated %s paths in edge field $epath', length(jcl2)))
}
return(invisible(self))
},
#' @name paths
#' @description
#' Returns shortest paths from query gNode to subject gNode in graph in the form of gWalks
#' (note: the gNodes must exist in the graph, unlike in the related but more general proximity function)
#'
#' Each output path is a gWalk that connects query-subject on the genome
#' described by gGraph gg. Each gWalk is annotated by the metadata of the
#' corresponding query-subject GRanges pair as well as fields "altdist" and "refdist"
#' specifying the "alternate and "reference" gGraph distance of the
#' query-subject pair. The gWalk metadata field "reldist" specifies
#' the relative distance (i.e. ratio of altdist to refdist) for that walk.
#'
#' NOTE: this operation can be quite expensive for large combinations of
#' of query and subject, so max.dist parameter will by default only compute
#' paths for query-subject pairs that are less then max.dist apart (default 1MB).
#' That default is chosen for large queries (eg >10K on each side), however
#' for smaller queries (eg length <100) the user may want to set max.dist = Inf
#'
#' By default performs a "cartesian" search, i.e. all pairs of query and subject
#' but if cartesian is set to FALSE will only search for specified pairs of query
#' and subject (then query and subject must be of the same length)
#'
#' @param query gNode object or snode.id of gNode in this graph
#' @param subject gNode object or snode.id of gNode in this graph
#' @param weight edge column to use as the weight (instead of standard weight column)
#' @param mc.cores how many cores (default 1)
#' @return gWalk object each representing a query-subject shortest path (if any exist)
#' @author Marcin Imielinski
paths = function(query,
subject = query,
mc.cores = 1,
weight = NULL,
meta = NULL,
ignore.strand = TRUE, cartesian = TRUE)
{
if (!inherits(query, 'gNode'))
query = self$nodes[query]
if (!inherits(subject, 'gNode'))
subject = self$nodes[subject]
if (!cartesian && length(query) != length(subject))
{
stop('If cartesian = FALSE then query and subject must have the same length')
}
## get igraph whose edge weights correspond to the width of the target interval
G = self$igraph
if (length(query)==0 | length(subject)==0)
return(gW(graph = self))
if (!is.null(weight)) ## replace standard weight with user provided column
{
weight = weight[1]
if (!(weight %in% names(self$sedgesdt)))
stop(paste('column', weight, 'not found in edges metadata'))
E(G)$weight = self$sedgesdt[.(E(G)$sedge.id),weight, with = FALSE][[1]]
}
## only takes a length 1 query, since igraph::get.shortest.paths only supports scalar queries
## for the source, so then we will have to iterate this query on all the input query nodes
## do.trim is there to do.trim the last interval from edge weight is we are using the
## plain vanilla edge weights i.e. where each edge is weighted by the width of its target
.path = function(qid, G, query, subject, ignore.strand = TRUE, do.trim = TRUE, cartesian = TRUE)
{
query = query[qid, with = FALSE]
if (!cartesian) ## only choose one of the subject intervals to run
{
subject = subject[qid, with = FALSE]
}
.get.shortest.paths = function(G, query, subject, do.trim = TRUE)
{
snode.id = self$gr$snode.id
p = suppressWarnings(igraph::get.shortest.paths(G, query, subject,
weight = igraph::E(G)$weight, output = 'both'))
## convert to integer .. and back to snode.id
## if path does not exist get.shortest.paths (for some reason) will return a single vector with the target
## so we clean out all paths that don't have the query
v = lapply(p$vpath, function(x) {y = as.integer(x); if (length(y) == 0 | is.na(y[1]) | y[1] != query) return(c()) else return(snode.id[y])})
## chop off the last interval to get the "true" distance from query to subject
if (do.trim)
{
w.all = sapply(p$epath, function(x) sum(x$weight[-length(x)])+1)
}
else
{
w.all = sapply(p$epath, function(x) sum(x$weight))
}
w.all[elementNROWS(v)==0] = Inf
return(data.table(source = snode.id[query], sink = snode.id[subject], path = v, dist = w.all)[!is.infinite(dist), ])
}
p.dt = .get.shortest.paths(G, query$dt$index, subject$dt$index, do.trim)
## compute the three other flavors of paths - i.e. ++, +-, -+, ++
if (ignore.strand)
{
p.fr = .get.shortest.paths(G, query$dt$index, subject$flip$dt$index, do.trim)
p.rr = .get.shortest.paths(G, query$flip$dt$index, subject$flip$dt$index, do.trim)
p.rf = .get.shortest.paths(G, query$flip$dt$index, subject$dt$index, do.trim)
p.dt = rbind(p.dt, p.fr, p.rr, p.rf)[order(dist), ][!duplicated(abs(sink)), ]
}
return(p.dt)
}
do.trim = is.null(weight)
walks.dt = rbindlist(mclapply(1:length(query), .path, query = query,
subject = subject, G = G, ignore.strand = ignore.strand,
cartesian = cartesian,
mc.cores = mc.cores, do.trim = do.trim))
walks = gW(snode.id = walks.dt$path, graph = self, meta = walks.dt[, setdiff(colnames(walks.dt), 'path'), with = FALSE])
return(walks)
},
#' @name dist
#' @description
#' Computes a distance matrix of query and subject
#' intervals (in base pairs) on the gGraph
#' between any arbitrary pairs of granges gr1 and gr2.
#' @param gr1 GRanges query
#' @param gr2 GRanges query (if NULL, will set to gr1)
#' @param weight metadata field of gEdges to use as weight (instead of distance of target node)
#' @param dt returns data.table if TRUE, excluding all Inf distances
#' @param include.internal logical flag whether to allow
#' paths that begin or end inside teh query or subject
#' @author Marcin Imielinski
dist = function(query,
subject,
weight = NULL,
ignore.strand = TRUE,
include.internal = TRUE,
verbose=FALSE)
{
if (!missing('query'))
{
if (inherits(query, 'gNode'))
{
queryf = query
}
else
{
## first try evaluating query and subject as node indices or filters
queryf = tryCatch(self$nodes[query], error = function(e) NULL)
}
}
else
{
queryf = self$nodes
}
## primitive function to compute distance on graph, called by
## both gNode and GRanges queries below
.dist = function(gg, query, subject, weight = NULL, ignore.strand = TRUE)
{
## get igraph and (populated with edge weights)
G = gg$igraph
if (!nrow(gg$sedgesdt))
{
return(matrix(Inf, nrow = length(gg$nodes), ncol = length(gg$nodes)))
}
if (!is.null(weight)) ## replace standard weight with user provided column if provided
{
weight = weight[1]
if (!(weight %in% names(gg$sedgesdt)))
stop(paste('column', weight, 'not found in edges metadata'))
igraph::E(G)$weight = gg$sedgesdt[
.(E(G)$sedge.id),weight, with = FALSE][[1]]
}
qmap = data.table(
query.ix = query$dt$index,
query.rix = query$flip$dt$index)
smap = data.table(
subject.ix = subject$dt$index,
subject.rix = subject$flip$dt$index)
## shortest.paths does not like dups so we have to dedup here
## will redup later
qmap[, dup := duplicated(query.ix)]
smap[, dup := duplicated(subject.ix)]
query.ix = qmap[dup == FALSE, query.ix]
query.rix = qmap[dup == FALSE, query.rix]
subject.ix = smap[dup == FALSE, subject.ix]
subject.rix = smap[dup == FALSE, subject.rix]
Dff = igraph::shortest.paths(G, query.ix, subject.ix,
weights = igraph::E(G)$weight,
mode = 'out')
if (ignore.strand)
{
Dfr = igraph::shortest.paths(G, query.ix, subject.rix,
weights = igraph::E(G)$weight,
mode = 'out')
Drr = igraph::shortest.paths(G, query.rix, subject.rix,
weights = igraph::E(G)$weight,
mode = 'out')
Drf = igraph::shortest.paths(G, query.rix, subject.ix,
weights = igraph::E(G)$weight,
mode = 'out')
D = pmin(Dff, Dfr, Drr, Drf)
}
else
D = Dff
## remap to original values
rownames(D) = query.ix
colnames(D) = subject.ix
D = D[as.character(qmap$query.ix), as.character(smap$subject.ix), drop = FALSE]
if (is.null(weight)) ## we only need to sweep for the standard usage, but not for a custom weight
{
D = pmax(sweep(D, 2, width(subject)-1, '-'), 0)
}
return(D)
}
## we have gNodes or gNode indices as arguments
if (!is.null(queryf))
{
if (missing('subject'))
subjectf = queryf
else
{
if (inherits(subject, 'gNode'))
{
subjectf = subject
}
else
{
subjectf = tryCatch(self$nodes[subject], error = function(e) NULL)
}
}
if (is.null(subjectf))
stop('Error in parsing query or subject filters')
D = .dist(self, queryf, subjectf, ignore.strand = ignore.strand, weight = weight)
return(D)
}
## we have GRanges as arguments
if (!ignore.strand) ##FIXME
{
stop('Stranded distance not yet implemented for $dist method with GRanges arguments')
}
## means that query and subject are GRanges
## so we have to chop up graph
if (missing("subject"))
subject = query
if (length(query)==0 || length(subject)==0)
return(matrix()[c(), c()])
if (!any(query %^% self$nodes$gr) | !any(subject %^% self$nodes$gr))
return(matrix(Inf, nrow = length(query), ncol = length(subject)))
## strip metadata from query and subject
values(query) = NULL
values(subject) = NULL
## copy current graph without metadata
## and simplify
query$id = 1:length(query)
subject$id = 1:length(subject)
query.og = query
subject.og = subject
## if (nrow(self$edgesdt)>0){
ed = self$edgesdt[,.(n1,n2,n1.side,n2.side,type)]
## } else {
## ed = data.table(n1 = numeric(0), n2 = numeric(0), n1.side = character(0), n2.side = character(0))
## }
## if no edges, then infinite distance
if (!nrow(ed))
return(matrix(Inf, nrow = length(query), ncol = length(subject)))
if (!is.null(weight))
{
ed = cbind(ed, self$edgesdt[, weight, with = FALSE])
}
simpleg = gG(nodes = self$nodes$gr[, c()],
edges = ed)$simplify()
simpleg$nodes$mark(simpleg = TRUE)
## split query and subject if we want to allow
## paths that originate or end inside a node
## to constitute a "distance"
if (include.internal)
{
query = query %*% simpleg$nodes$gr[, c()]
subject = subject %*% simpleg$nodes$gr[, c()]
}
grd = unique(gr.stripstrand(grbind(query, subject)))
## this will disjoin by grd without collapsing
## and only keep the nodes and edges that were previously in simpleg
simpleg$disjoin(gr = grd, collapse = FALSE)
simpleg = simpleg[which(simpleg==TRUE), ]
## if custom weight field is specified then we will have
## created many possibly internal edges that we have to assume
## be zero weight
if (!is.null(weight))
{
wval = simpleg$sedgesdt[[weight]]
sed = simpleg$sedgesdt
simpleg$annotate(weight, 0,
sed$sedge.id[is.na(wval)], "edge")
}
grd$is.query = grd %^% query
grd$is.subject = grd %^% subject
grds = grd %*% simpleg$nodes$gr[, 'snode.id']
values(grds) = cbind(values(grds),
simpleg$queryLookup(grds$snode.id)[, .(index, rindex)])
qnode = simpleg$nodes[grds$snode.id[grds$is.query]]
snode = simpleg$nodes[grds$snode.id[grds$is.subject]]
D = .dist(simpleg, qnode, snode,
ignore.strand = TRUE, weight = weight)
## now we melt D and merge back with input
## using overlaps of query / subject with grds
Dt = as.data.table(reshape2::melt(D))[!is.infinite(value), ]
setkeyv(Dt, c("Var1", "Var2"))
queryl = query[, 'id'] %*% grds[, 'index']
subjectl = subject[, 'id'] %*% grds[, 'index']
out.dt = merge(gr2dt(queryl)[, .(id, qindex = index)],
merge(gr2dt(subjectl)[, .(id, sindex = index)],
Dt, by.x = 'sindex', by.y = 'Var2',
allow.cartesian = TRUE),
by.x = 'qindex', by.y = 'Var1',
allow.cartesian = TRUE)
setkey(out.dt, value) ## sorts by value i.e. distance
out.dt = unique(out.dt, by = c("id.x", "id.y"))
D = matrix(Inf, nrow = length(query.og), ncol = length(subject.og))
D[cbind(out.dt$id.x, out.dt$id.y)] = out.dt$value
return(D)
},
#' @name rep
#' @description
#'
#' Creates "bubbles" in the graph by replicating the nodes or gwalks in the argument. Node replication
#' replicates edges going in and out of all replicated nodes. If an edge connects a pair of replicated nodes
#' that edge will be replicated across all pairs of those replciated nodes. Walk replication will create "longer bubbles"
#' with fewer edges getting replicated i.e. it will only replicate intra-walk edges within each walk replicate (but not between
#' separate walk replicates).
#'
#' (note that this changes the current gGraph in place, and thus the input gNode or gWalk will no longer
#' apply to the new altered gWalk)
#'
#' New graph keeps track of the parent node and edge ids in the original graph using node metadata parent.node.id
#' and edge metadata parent.edge.id i.e. the replicated nodes will be connected to the sources of the original nodes
#' and if replicated nodes connect to each other, then there will exist an edge connecting
#' all of their instances to each other.
#'
#' @param nodes = gNode object must point to a node in the graph, can also be an index of a node (but not a metadata expression), can also be a gWalk object
#' @param times scalar or vector of length self$length specifying how many times to replicate each of the nodes.
#' @author Marcin Imielinski
#' @return gGraph (also modified in place) with nodes annotated with parent.node.id, parent.rep
rep = function(nodes = NULL, times)
{
## by default i.e. if nodes not specified assume times applies to the whole graph
if (is.null(nodes))
{
nodes = self$nodes
}
## convert everything to integer list
else if (is(nodes, 'gNode'))
{
if (!(nodes$graph == self))
{
stop('gWalk input must point to the nodes of this gGraph')
}
nlist = split(nodes$dt$snode.id, 1:length(nodes))
}
else if (is(nodes, 'gWalk'))
{
if (!(nodes$graph == self))
{
stop('gWalk input must point to this gGraph')
}
nlist = nodes$snode.id
}
else if (is.integer(nodes) | is.numeric(nodes))
{
nlist = split(nodes, 1:length(nodes))
}
else ## otherwise user provided list of integer node indices as input
{
nlist = nodes
}
if (length(times) != 1 && length(nlist) != length(times))
{
stop('Length of times must be either length 1 or length of nodes')
}
times = cbind(1:length(nlist), times)[,2]
dtu = dunlist(unname(nlist))
setnames(dtu, c('walk.id', 'node.id'))
dtu$times = times[dtu$walk.id]
dtu[, node.id := abs(node.id)]
if (!is.integer(dtu$node.id) && any(is.na(as.integer(dtu$node.id))))
{
stop('Raw node indices (or lists of node indices) must be integers')
}
if (any(dtu$V1>length(self)))
{
stop('One or more provided node indices are outside the bounds of the graph')
}
gr = self$nodes$gr
ed = self$edges$dt
ed[, parent.edge.id := edge.id]
gr$parent.node.id = gr$node.id
other = setdiff(1:length(self), dtu$node.id)
## parent.node.id = is the node.id in the original graph
## walk.id = is the original walk.id
## walk.rep = is a local id unique instance of each walk replicate inside each walk.id
nmap = data.table(parent.node.id = c(other, rep(dtu$node.id, dtu$times)),
walk.id = c(rep(NA, length(other)), paste(rep(dtu$walk.id, dtu$times))),
walk.rep = c(rep(NA, length(other)), data.table(ind = 1:nrow(dtu), time = dtu$times)[, .(1:time), by = ind][, V1])
)[, node.id := 1:.N]
new.gr = gr[nmap$parent.node.id]
new.gr$parent.rep = nmap$walk.rep
## now we only replicate edges if parent.node.id match and either
## (1) walk.id of either node is NA
## (2) walk.ids and walk.rep match
## (3) walk.ids are non NA and they don't match
## first we merge everything (may create a ton of edges
ed$parent.edge.id = abs(ed$edge.id)
new.ed = merge(merge(ed, nmap, by.x = 'n1', by.y = 'parent.node.id', allow.cartesian = TRUE),
nmap, by.x = 'n2', by.y = 'parent.node.id', allow.cartesian = TRUE)
new.ed = new.ed[which(is.na(walk.id.x) | is.na(walk.id.y) | ## condition 1
(walk.id.x != walk.id.y) | ## condition 2
(walk.id.x == walk.id.y & walk.rep.x == walk.rep.y) ## condition 3
), ]
new.ed[, n1 := node.id.x]
new.ed[, n2 := node.id.y]
new.ed$walk.rep.y = new.ed$walk.rep.x = new.ed$walk.id.y = new.ed$walk.id.x = new.ed$node.id.x = new.ed$node.id.y = NULL
private$gGraphFromNodes(new.gr, new.ed)
return(invisible(self))
},
#' @name swap
#' @description
#'
#' Swap nodes with granges, grl, or Gwalks.
#' Provided replacement vector must be the same length as the inputted nodes, resulting in each node being "swapped" by the provided
#' interval, node, grl (representing a walk), or gWalk. The replacement will inherit left and right edges for the removed node.
#' If the replacement is a walk, then the left side of the first node in the walk will inherit the edges that were
#' previously to the left of the node being replaced, and right side of the last node of the walk will inherit the edges
#' that were previously to the right of the node being replaced.
#'
#' Note: these replacement obey the orientation of the arguments. So if the node to be replaced is flipped (- orientation with
#' respect to the reference, then it's "left" is to the right on the reference. Similarly for walks whose first interval
#' is flipped with respect to the reference, the left edges will be attached to the right of the node on the reference.
#'
#' @param nodes = gNode object must point to a node in this graph, can also be an index of a node (but not a metadata expression), can also be a gWalk object
#' @param replacement GRanges, GRangesList, or gWalk object whose length is the length(nodes)
#' @author Marcin Imielinski
#' @return gGraph (also modified in place) with nodes annotated with parent.node.id, parent.rep
swap = function(nodes, replacement)
{
if (length(nodes) != length(replacement))
stop('nodes to be swapped must be the same length as their replacement')
if (is.integer(nodes) | is.numeric(nodes))
nodes = self$nodes[nodes]
if (nodes$graph != self)
stop('nodes must be matched to the current graph')
if (inherits(replacement, 'GRangesList'))
{
replacement = gW(grl = replacement, disjoin = FALSE)
## automatically label all inserted edges as ALT by default
replacement$graph$edges$mark(type = 'ALT')
}
## at this point should be either gRanges or gWalk
if (inherits(replacement, 'GRanges'))
{
nodes.rep = replacement
nodes.rep$parent.node.id = nodes$dt$node.id
nodes.rep$parent.rep = 1:length(replacement)
ed.rep = data.table() ## there are no edges in this case
}
else if (inherits(replacement, 'gWalk'))
{
nodes.rep = replacement$graph$nodes$gr
tmp = dunlist(unname(replacement$snode.id))
nodes.rep$parent.node.id[abs(tmp$V1)] = nodes$dt$node.id[tmp$listid]
nodes.rep$parent.rep = tmp$listid
ed.rep = replacement$graph$edges$dt[, n1 := paste('replacement', n1)][, n2 := paste('replacement', n2)]
}
## relabel the nodes that we are swapping into the graph
nodes.rep$node.id = paste('replacement', nodes.rep$node.id)
## extract the node GRanges from the current graph, we will swap a subset of these
nodes.old = self$nodes$gr
nodes.old$parent.rep = NA
## extract the edges
ed = self$edges$dt[, n1 := as.character(n1)][, n2 := as.character(n2)]
## these vectors will be all of the same length
swap = data.table(
toswap = nodes$dt$node.id,
firsts = sapply(replacement$snode.id, '[', 1),
lasts = sapply(replacement$snode.id, function(x) x[length(x)]),
key = 'toswap'
)
eswap.left = nodes$eleft$dt ## these edges will have the node to be swapped as n2
eswap.right = nodes$eright$dt ## these edges will have the node to be swapped as n1
## firsts get left edges attached to their left, unless they are flipped (i.e. have negative snode.id)
eswap.left[, n2.swap := swap[.(n2), paste('replacement', abs(firsts))]]
eswap.left[, n2.swap.side := swap[.(n2), ifelse(firsts>0, 'left', 'right')]]
## lasts get right edges attached to their right, unless they are flipped (i.e. have negative snode.id)
eswap.right[, n1.swap := swap[.(n1), paste('replacement', abs(lasts))]]
eswap.right[, n1.swap.side := swap[.(n1), ifelse(lasts>0, 'right', 'left')]]
## ed1 only has edges in "forward" orientation so need to make sure to swap the correct "end" of each edge
## we will use the sign to guide us - i.e. we swap n2 for eswap.left, unless they are flipped, in which
## case we swap n1
ed[.(abs(eswap.left$sedge.id)), ":="(
n1 = ifelse(eswap.left$sedge.id<0, eswap.left$n2.swap, n1),
n1.side = ifelse(eswap.left$sedge.id<0, eswap.left$n2.swap.side, n1.side),
n2 = ifelse(eswap.left$sedge.id>0, eswap.left$n2.swap, n2),
n2.side = ifelse(eswap.left$sedge.id>0, eswap.left$n2.swap.side, n2.side))]
## for eswap.right we do the reverse, i.e. forward orientation edges get their n1 switched, and n2 if flipped
ed[.(abs(eswap.right$sedge.id)), ":="(
n1 = ifelse(eswap.right$sedge.id>0, eswap.right$n1.swap, n1),
n1.side = ifelse(eswap.right$sedge.id>0, eswap.right$n1.swap.side, n1.side),
n2 = ifelse(eswap.right$sedge.id<0, eswap.right$n1.swap, n2),
n2.side = ifelse(eswap.right$sedge.id<0, eswap.right$n1.swap.side, n2.side))]
## if there are additional internal edges (i.e. inside the provided gWalk) then we add these to the new graph
## along with any metadata
if (nrow(ed.rep)>0)
{
ed = rbind(ed, ed.rep, fill = TRUE)
ed$sedge.id = ed$edge.id = NULL
}
## by now all instances of swap$toswap node.ids should be missing from ed
## and replaced with nodes from n2
## we now concatenate the remaining n1 with n2, find a common
nodes.new = grbind(nodes.old[-swap$toswap, ], nodes.rep)
ed[, n1 := as.integer(factor(n1, nodes.new$node.id))]
ed[, n2 := as.integer(factor(n2, nodes.new$node.id))]
## remove previous node.id and snode.id so as not to confuse instantiator
nodes.new$node.id = NULL
nodes.new$snode.id = NULL
## reinstantiate
private$gGraphFromNodes(nodes.new, ed)
return(invisible(self))
},
#' @name connect
#' @description
#'
#' Connect node pairs in the gGraph by adding (optional) edge metadata and (optionally) inserting nodes or grl / walks
#' in between the given edge. Note: the connections are made with respect to the provided node orientation
#' so if the node is provided in a "flipped" orientation then it's right direction will point left on the reference.
#'
#' @param n1 = gNode object must point to a node in this graph, can also be an index of a node (but not a metadata expression), can also be a gWalk object
#' @param n2 = gNode object must point to a node in this graph, can also be an index of a node (but not a metadata expression), can also be a gWalk object
#' @param n1.side character vector of length length(n1) whose value is either "left" or "right" (default 'right')
#' @param n2.side character vector of length length(n1) whose value is either "left" or "right" (default 'left')
#' @author Marcin Imielinski
#' @return gGraph (also modified in place) with nodes annotated with parent.node.id, parent.rep
connect = function(n1, n2, n1.side = 'right', n2.side = 'left', type = 'ALT', meta = NULL, insert = NULL)
{
if (length(n1) != length(n2))
stop('length of n1 and n2 must be the same')
if (inherits(n1, 'gNode'))
{
if (n1$graph != self)
stop('n1 must match gGraph')
n1 = n1$dt$snode.id
}
if (inherits(n2, 'gNode'))
{
if (n2$graph != self)
stop('n2 must match gGraph')
n2 = n2$dt$snode.id
}
newedges = data.table(n1 = abs(n1), n2 = abs(n2),
n1.side = ifelse(sign(n1)*sign((n1.side == 'right')-0.5)>0, 'right', 'left'),
n2.side = ifelse(sign(n2)*sign((n2.side == 'right')-0.5)>0, 'right', 'left'))
if (!is.null(meta))
newedges = cbind(newedges, meta)
if (is.null(newedges$type))
newedges$type = type
newnodes = self$nodes$gr
newnodes$parent.node.id = newnodes$node.id
newnodes$parent.rep = NA
## if insert is provided then we add more nodes and edges and reconnect them to the provided
if (!is.null(insert))
{
if (length(insert) != nrow(newedges))
stop('length of insert must be equal to the number of inserted edges')
if (inherits(insert, 'GRanges'))
insert = split(insert, 1:length(insert))
if (inherits(insert, 'GRangesList'))
{
insert = gW(grl = insert, disjoin = FALSE)
## automatically label all inserted edges as ALT by default
insert$graph$edges$mark(type = 'ALT')
}
## at this point insert is a gWalk
insert.gr = insert$graph$nodes$gr
tmp = dunlist(unname(insert$snode.id))
insert.gr$parent.node.id[abs(tmp$V1)] = insert$dt$node.id[tmp$listid]
insert.gr$parent.rep = tmp$listid
ed.insert = insert$graph$edges$dt[, n1 := paste('insert', n1)][, n2 := paste('insert', n2)]
## relabel the insert that we are swapping into the graph
insert.gr$node.id = paste('insert', insert.gr$node.id)
## these vectors will be all of the same length
inodes = data.table(
firsts = sapply(insert$snode.id, '[', 1),
lasts = sapply(insert$snode.id, function(x) x[length(x)])
)
## we split newedges into 2
newedges1 = copy(newedges)
newedges2 = copy(newedges)
## connect n2 side of newedges1 to (the correct side) of firsts
## i.e. if firsts is + then left side otherwise right side
## note: this does not depend on the provided n1.side or n2.side
newedges1[, n2 := insert.gr$node.id[abs(inodes$firsts)]]
newedges1[, n2.side := ifelse(inodes$firsts>0, 'left', 'right')]
## connect n1 of newedges2 to (the correct side) of lasts
## i.e. if first is + then right side otherwise left side
newedges2[, n1 := insert.gr$node.id[abs(inodes$lasts)]]
newedges2[, n1.side := ifelse(inodes$lasts>0, 'right', 'left')]
newedges = rbind(newedges1, newedges2, ed.insert, fill = TRUE)
## by now all instances of insert$toinsert node.ids should be missing from ed
## and replaced with nodes from n2
## we now concatenate the remaining n1 with n2, find a common
newnodes$node.id = as.character(newnodes$node.id)
## reset the loose end status of the inserted walk node GRanges
insert.gr$loose.right = insert.gr$loose.left = FALSE
newnodes = grbind(newnodes, insert.gr)
## factor across new levels and convert to integers to include the newnodes
newedges[, n1 := as.integer(factor(n1, newnodes$node.id))]
newedges[, n2 := as.integer(factor(n2, newnodes$node.id))]
}
if (nrow(self$edges$dt)>0)
newedges = rbind(self$edges$dt, newedges, fill = TRUE)
newnodes$node.id = newnodes$snode.id = NULL
newedges$edge.id = newedges$sedge.id = NULL
private$gGraphFromNodes(newnodes, newedges)
return(invisible(self))
},
#' @name toposort
#'
#' marks node metadata with topo.order after topological sort
#' if graph has cycles, will do partial sort, leave some nodes as NA
#'
#' @return current graph, modified with nodes marked according to topological sort
toposort = function()
{
## check if graph is a DAG (non-DAGs break topo-sort in newer igraph versions)
if (igraph::is_dag(self$igraph)) {
v = unique(abs(self$gr$snode.id[as.vector(
suppressWarnings(
igraph::topo_sort(self$igraph)))]))
} else {
v = unique(abs(self$gr$snode.id[as.vector(
suppressWarnings(
igraph::topo_sort(igraph::mst(self$igraph))))]))
}
self$nodes$mark(topo.sort = NULL)
self$nodes[v]$mark(topo.order = 1:length(v))
return(invisible(self))
},
#' @name gGraph$print
#' @description
#'
#' Prints out this gGraph. Prints number of nodes and edges, the gNode associated
#' with this gGraph and the gEdge associated with this gGraph
print = function()
{
nl = length(self$loose)
nt = length(self$terminal)
nalt = sum(private$pedges$type == "ALT")/2
nref = sum(private$pedges$type == "REF")/2
ne = nalt+nref
message(sprintf('gGraph with %s nodes, %s loose ends (%s terminal and %s internal), and %s edges (%s REF and %s ALT)',
self$length, nl, nt, nl-nt, ne, nref, nalt),
appendLF = FALSE)
if (self$nodes$length > 0){
message(' comprising:\n')
}
else
message('\n')
self$nodes$print()
if (nrow(private$pedges))
{
message()
self$edges$print()
}
},
#' @name annotate
#' @description
#'
#' Used by the mark() functions in gNode, gEdge and gWalks to alter the metadata
#' associated with the nodes and edges in this gGraph. Not recommended to use this
#' function. It is much safer to use mark.
#'
#' FYI
#' id for nodes is the node.id (not snode.id)
#' id for edges is the edge.id (not sedge.id)
annotate = function(colName, data, id, class)
{
if (class == "node") {
# NONO.FIELDS = c('node.id', 'snode.id', 'index', 'loose.left', 'loose.right', 'loose.left', 'loose.right')
NONO.FIELDS = c('node.id', 'snode.id', 'index')
if (colName %in% NONO.FIELDS)
stop(paste('Cannot alter these protected gNode fields: ', paste(NONO.FIELDS, collapse = ', ')))
if (is.null(data))
{
values(private$pnodes)[[colName]] = NULL
return(invisible(self))
}
id = self$queryLookup(id)
id$data = data
index = c(id[, index], id[, rindex])
data = c(id[, data], id[, data])
gr.dt = as.data.table(private$pnodes)
suppressWarnings(gr.dt[index, paste(colName) := data])
values(private$pnodes)[[colName]] = gr.dt[[colName]]
} else if (class == "edge") {
NONO.FIELDS = c('from', 'to', 'sedge.id', 'edge.id','n1', 'n2', 'n1.side', 'n2.side')
if (colName %in% NONO.FIELDS)
stop(paste('Cannot alter these protected gEdge fields: ', paste(NONO.FIELDS, collapse = ', ')))
if (is.null(data))
{
private$pedges[[colName]] = NULL
return(invisible(self))
}
if (colName == "type" && (!is.character(data) || !all(data %in% c('REF', 'ALT'))))
stop('type is a reserved gEdge metadata field and can only be replaced with values REF and ALT')
id2 = id ## for some reason data.table needs this to assign correctly (SCARY!!) ... must be a substitution / promise / NSE issue
suppressWarnings(private$pedges[.(id2), paste(colName) := rep(data, length.out = .N)])
} else {
stop("Not sure how we got to this error at all, we should never be here")
}
return (self)
},
#' @name maxflow
#' @description
#'
#' Computes the "max flow" between every node pair in self
#' for some metadata field.
#'
#' The "max flow" for a node pair i, j is the
#' maximum value m of node and/or edge metadata for which there
#' exists a path p between i and j whose nodes n and/or edges e
#' obey field(n)>=m and/or field(e)>=m for all n,e \eqn{\in} p.
#' (i.e. m is the maximum lower bound of the value of
#' nodes / edges across all paths connecting ij)
#'
#' The "min version" of this problem (max = FALSE) will
#' determine the min value m for which there exists p
#' whose nodes n and edges e
#' obey field(n)>=m and/or field(e)>=m for all n,e \eqn{\in} p.
#'
#' The user can also do the problem with lower.bound = FALSE
#' i.e. where m is the (maximum or minimum) <upper> bound
#' value in each path.
#'
#' By default will try to solve problem across both node and
#' edge metadata if the field is present in either. If the field
#' is only present in one then will solve for that. This property
#' can be toggled using edges.only and nodes.only parameters.
#'
#' @param field metadata field to run maxflow on
#' @param max logical flag whether to find maximum path or (if max = FALSE) minimum path
#' @param walk if TRUE will return the single walk that maximizes the sum of metadata fields
#' @param nfield field to specify a node field to maximize across paths
#' @param efield field to specify an edge field to maximize across paths
#' @param cfield field to specify a node / edge field that limits / caps the dosage at nodes / edges
#' @param path.only logical flag relevant only if walk = TRUE, if path is TRUE will only allow path based maxflows (TRUE) ie will not return a solution when the graph contains only cycles
#' @param multi logical flag (FALSE) if TRUE will allow the optimization to compute a solution that outputs multiple disjoint paths
#' @param ncopies positive integer representing the number of copies of the flow that we want the graph to support
#' @param reverse complement will compute maximum flow between each node i and the reverse complement of node j in a strand specific way
#' @author Marcin Imielinski
maxflow = function(field = NA, walk = FALSE, max = TRUE,
lower.bound = TRUE,
nfield = NA,
efield = NA,
cfield = NA,
path.only = TRUE,
require.nodes = NULL,
multi = FALSE,
ncopies = 1,
reverse.complement = FALSE,
verbose = FALSE
)
{
do.edges = FALSE
do.nodes = FALSE
if (is.na(field))
field = 'cn'
if (is.na(cfield))
cfield = field
if (is.na(efield) & is.na(nfield))
efield = nfield = field
if (!is.na(efield) && efield %in% names(self$edges$dt))
do.edges = TRUE
if (!is.na(nfield) && nfield %in% names(values(self$nodes$gr)))
do.nodes = TRUE
if (!do.edges & !do.nodes)
{
warning(sprintf('field %s not found in node and field %s not found in edge metadata, marking each with dummy values (1)', efield, nfield))
nval = eval = list(1)
names(eval) = efield
names(nval) = nfield
do.call(self$nodes$mark, nval)
do.call(self$edges$mark, eval)
}
## if (walk)
## {
## if (is.null(self$edges$dt$cn) | is.null(self$nodes$dt$cn))
## {
## do.edges = FALSE
## do.nodes = FALSE
## if (is.null(self$nodes$dt$loose.cn.left) | is.null(self$nodes$dt$loose.cn.right))
## {
## warning('loose cn left and right is currently required for maxflow(walk = TRUE), putting in dummy values using $loose.left and $loose.right node features')
## if (is.na(field))
## field = 'cn'
## if (is.na(cfield))
## cfield = field
## if (is.na(efield) & is.na(nfield))
## efield = nfield = field
## if (!is.na(efield) && efield %in% names(self$edges$dt))
## do.edges = TRUE
## if (!is.na(nfield) && nfield %in% names(values(self$nodes$gr)))
## do.nodes = TRUE
## if (!do.edges & !do.nodes)
## {
## warning(sprintf('field %s not found in node and field %s not found in edge metadata, marking each with dummy values (1)', efield, nfield))
## nval = eval = list(1)
## names(eval) = efield
## names(nval) = nfield
## do.call(self$nodes$mark, nval)
## do.call(self$edges$mark, eval)
## }
if (walk)
{
if (is.null(self$edges$dt$cn) | is.null(self$nodes$dt$cn))
{
warning('cn is required for maxflow(walk = TRUE), putting in dummy values')
if (is.null(self$edges$dt$cn))
self$edges$mark(cn = 1)
if (is.null(self$nodes$dt$cn))
self$nodes$mark(cn = 1)
}
if (is.null(self$nodes$dt$loose.cn.left) | is.null(self$nodes$dt$loose.cn.right))
{
warning('loose cn left and right is currently required for maxflow(walk = TRUE), putting in dummy values using $loose.left and $loose.right node features')
self$nodes$mark(loose.cn.left = self$nodes$dt$cn*sign(self$nodes$dt$loose.left))
self$nodes$mark(loose.cn.right = self$nodes$dt$cn*sign(self$nodes$dt$loose.right))
}
if (any(is.na(self$edges$dt$cn)))
stop('cn has NA values, cn is a required edge data field for maxflow(walk = TRUE) so either make blank or fill in with non NA values')
if (any(is.na(self$nodes$dt$cn)))
stop('cn has NA values, cn is a required node data field for maxflow(walk = TRUE) so either make blank or fill in with non NA values')
ed = copy(private$pedges)
## graph needs loose ends to output a walk
## since all walks must begin and end at a loose end
if (!nrow(ed) & !length(self$loose))
return(gW(c(), graph = self))
## make incidence matrix nodes x edges + loose ends
Inc = sparseMatrix(1, 1, x = 0,
dims = c(length(self$nodes),
length(self$edges) + length(self$loose))*2)
rownames(Inc) = c(1:length(self$nodes), -(1:length(self$nodes)))
colnames(Inc) = 1:ncol(Inc)
colnames(Inc)[1:nrow(ed)] = ed$sedge.id
Inc[cbind(ed$from, 1:nrow(ed))] = -1
Inc[cbind(ed$to,1:nrow(ed))] = 1 + Inc[cbind(ed$to,1:nrow(ed))]
lleft = which(self$nodes$loose.left)
lright = which(self$nodes$loose.right)
Inc[cbind(match(lleft, rownames(Inc)),
nrow(ed) + 1:length(lleft))] = 1
Inc[cbind(match(-lleft, rownames(Inc)),
nrow(ed) + length(lleft) + 1:length(lleft))] = -1
Inc[cbind(match(lright, rownames(Inc)),
nrow(ed) + 2*length(lleft)+ 1:length(lright))] = -1
Inc[cbind(match(-lright, rownames(Inc)),
nrow(ed) + 2*length(lleft) + length(lright) + 1:length(lright))] = 1
## metadata for edges including loose ends
meta = data.table(index = 1:ncol(Inc),
sedge.id = NA_integer_,
lledge.id = NA_integer_, ## left loose end
lredge.id = NA_integer_ ## right loose end
)
meta[1:nrow(ed), sedge.id := ed$sedge.id %>% as.integer]
if (length(lleft))
meta[1:(2*length(lleft)) + nrow(ed), lledge.id := c(lleft, -lleft)]
if (length(lright))
meta[1:(2*length(lright)) + nrow(ed) + 2*length(lleft), lredge.id := c(lright, -lright)]
meta$cn = ed[.(meta$sedge.id), cn]
meta[!is.na(lledge.id), cn := self$nodes[lledge.id]$dt$loose.cn.left]
meta[!is.na(lredge.id), cn := self$nodes[lredge.id]$dt$loose.cn.right]
## these ids will allow us to group rc edges
meta[, id := ifelse(!is.na(sedge.id), abs(sedge.id),
ifelse(!is.na(lledge.id), paste0(abs(lledge.id), 'l'),
paste0(abs(lredge.id), 'r')))]
meta[, id := factor(id) %>% as.integer]
cvec = rep(0, ncol(Inc))
## do.nodes determines whether to create the objective from a node or edge metadata
if (do.nodes)
{
cvec[1:nrow(ed)] = values(private$pnodes)[ed$from,][[nfield]]
}
else
{
cvec[1:nrow(ed)] = ed[[efield]]
}
## reward the use of loose ends
lix = meta[is.na(sedge.id), index]
cvec[lix] = 1
Amat = Inc
b = data.table(
type = 'flow',
bvec = rep(0, nrow(Amat)),
sense = 'E'
)
## if multi we don't place any constraints on the # of loose ends ie paths
if (!multi)
{
## add loose end constraint ie require total weight 2 on loose ends
lec = ifelse(1:ncol(Inc) %in% 1:nrow(ed), 0, 1) ## lec = 1 if loose end, 0 otherwise
Amat = rbind(Inc, lec)
if (path.only) ## require single path
{
b = rbind(b,
data.table(
type = 'pathonly',
bvec = 2,
sense = 'E'))
}
else ## allow max one path
{
b = rbind(b,
data.table(
type = 'pathonly',
bvec = 2,
sense = 'L'))
}
}
## add basic utilization constraints
## limiting the flow through each signed node to 1
## each edge utilizes half of each node
Umat = sparseMatrix(1,1, x = 0,
dims = c(length(self$nodes),
length(self$edges) + length(self$loose))*2)
rownames(Umat) = c(1:length(self$nodes), -(1:length(self$nodes)))
colnames(Umat) = 1:ncol(Umat)
colnames(Umat)[1:nrow(ed)] = ed$sedge.id
Umat.so = Umat.si = Umat ## make separate source and sink versions
## Umat.so catalogues the edges for which that node is a source
## Umat.si ... for which that node is a sink
Umat.so[cbind(ed$from, 1:nrow(ed))] = 1
Umat.si[cbind(ed$to,1:nrow(ed))] = 1
Amat = rbind(
Amat,
Umat.so,
Umat.si
)
## this constrains each node to be used at most once
## as a sink or a source
b = rbind(b,
data.table(
type = rep(
c('utilization.so', 'utilization.si'),
each = nrow(Umat)),
bvec = rep(1, 2*nrow(Umat)),
sense = 'L')
)
ub = rep(1, ncol(Amat))
## add node and edge capacity constraints if cfield is provided
if (!is.null(cfield) & cfield %in% names(self$nodes$dt))
{
## we want to figure out the
## strand free usage of a node in any flow
## meaning add the number of times that node
## "side" is used in any flow
## meaning that we want to add the usage of
## node sinks and the sources of their reverse complements
## ie have a constraint that
## limits the sum of copy number across the union of
## sink_edges(node_i) and source_edges(rc(node_i))
## to the node copy number
Umatc.left = Umat.si[self$nodes$dt$node.id %>% as.character, ] +
Umat.so[-self$nodes$dt$node.id %>% as.character, ]
Umatc.right = Umat.so[self$nodes$dt$node.id %>% as.character, ] +
Umat.si[-self$nodes$dt$node.id %>% as.character, ]
Amat = rbind(
Amat,
Umatc.left,
Umatc.right
)
## cap the signed node usage to node copy number
## divided by ncopies
b = rbind(b,
data.table(
type = 'capacity.left',
bvec = self$nodes$dt[[cfield]]/ncopies,
sense = 'L'),
data.table(
type = 'capacity.right',
bvec = self$nodes$dt[[cfield]]/ncopies,
sense = 'L')
)
if (!is.null(require.nodes))
{
Amat = rbind(
Amat,
Umatc.left[require.nodes,,drop = FALSE] +
Umatc.right[require.nodes,,drop = FALSE]
)
b = rbind(b,
## data.table(
## type = 'require.left',
## bvec = rep(1, length(require.nodes)),
## sense = 'G'),
data.table(
type = 'require.right',
bvec = rep(1, length(require.nodes)),
sense = 'G')
)
}
## now strand collapse edges
Emat = sparseMatrix(
meta$id,
meta$index, x = as.numeric(1), dims = c(nrow(meta)/2,ncol(Amat))) %>% as.matrix
## this matrix now summarizes the copy number of this flow
## across pairs of stranded edges
## which can't be bigger than capacity constraints
## on the unsigned edge
Amat = rbind(
Amat,
Emat
)
## cap the signed node usage to node copy number
## these capacity constraints are stored in meta
b = rbind(b,
data.table(
type = 'ecapacity',
bvec = meta[match(1:nrow(Emat), id), cn]/ncopies,
sense = 'L')
)
## cap individual signed edge.ids to their capacity
ub[1:nrow(ed)] = pmin(ub[1:nrow(ed)], ed[[cfield]])
}
## sol = Rcplex::Rcplex(Amat = Amat)
sol = Rcplex2(Amat = Amat,
lb = rep(0, ncol(Amat)),
ub = ub,
bvec = b$bvec,
sense = b$sense,
vtype = 'I',
control = list(trace = verbose > 1),
objsense = ifelse(max, 'max', 'min'),
cvec = cvec)
## find edges used in the solution
opted = ed[round(sol$xopt[1:.N])!=0, ]
## now need to convert pile of edges to walk(s)
## (note: there can be more than one walk because of cycles)
## (however the edge pile is constrained because of the above
## MIP to have at most one edge entering and leaving each
## node)
## will use dfs
## pick a source (or random node if none exists) for the DFS
## find source nodes by mining the loose ends that were utilized
## add also additional sources from the used edges
lix = meta[is.na(sedge.id), index]
so = which(((Inc[, lix]>0) %*% sol$xopt[lix])!=0) %>%
c(opted$from)%>% unique %>% as.character
## combine with all other utilized vertices to build graph
allv = union(so, opted$to)
if (!length(so))
return(gW(graph = self))
## we have to make adjacency matrix here to take care of edge cases
## where there are graphs with no edges but loose ends
adj = sparseMatrix(1, 1, x = 0, dim = rep(length(allv),2), dimnames = rep(list(allv), 2))
adj[cbind(match(opted$from, allv), match(opted$to, allv))] = 1
G = graph.adjacency(adj)
## we are using dfs to convert this flow into paths / cycles
## we do a separate dfs for each source, prioritizing the true "source" nodes first
## cyclic paths will begin from an arbitrary internal source
## (this will create some redundant walks, which we prune out below)
walksi = lapply(so, function(x) V(G)$name[dfs(G, x, unreachable = FALSE)$order %>% as.integer %>% setdiff(NA)] %>% as.integer)
## for each node, we only keep the first walk that has that node
## (this will remove reverse complements as well as redundant paths from the dfs)
snode.id = lapply(walksi, function(x) private$pnodes$snode.id[x])
keep = dunlist(snode.id)[!duplicated(abs(V1)), listid] %>% unique
walksi = walksi[keep]
snode.id = snode.id[keep]
## mark all cycles
setkeyv(opted, c('from', 'to'))
is.cycle = opted[lapply(walksi, function(x) data.table(x[length(x)], x[1])) %>% rbindlist, !is.na(edge.id)]
## now first let's instantiate a graph around opted so we can get the edge lists
tmp.gg = gG(nodes = self$nodes$gr, edges = self$edges[abs(opted$sedge.id)]$dt)
tmp.gw = gW(snode.id = snode.id, circular = is.cycle, graph = tmp.gg)
sedge.id = lapply(tmp.gw$sedge.id, function(x) if (length(x)) sign(x)*abs(opted$sedge.id)[abs(x)])
gw = gW(sedge.id = sedge.id, snode.id = snode.id, circular = is.cycle, graph = self)
if (path.only) ## keep only the paths
{
gw = gw[!gw$circular]
}
return(gw)
}
uval = c()
if (do.edges)
uval = c(uval, self$edges$dt[[nfield]])
if (do.nodes)
uval = c(uval, values(self$nodes$gr)[[nfield]])
uval = sort(unique(setdiff(uval, NA)))
## if max we will begin with high values
## and determine the
## connectivity of graph nodes at all nodes / edges with
## field>=this.val
## if !max we will begin low values and determine
## the connectivity of all nodes / edges with field<=this.val
if (lower.bound)
uval = rev(uval)
out = matrix(ifelse(max, -Inf, Inf), nrow = length(self$nodes),
ncol = length(self$nodes))
if (length(uval)==0)
return(out)
comparator = ifelse(lower.bound, '>=', '<=')
estring = ifelse(do.edges,
sprintf('%s %s this.val', efield, comparator),
'')
nstring = ifelse(do.nodes,
sprintf('%s %s this.val', nfield, comparator),
'')
cmd = sprintf("graph[%s,%s]", nstring, estring)
## copy self and label og nodes
graph = self$copy
graph$nodes$mark(og.node.id = graph$nodes$dt$node.id)
for (i in 1:length(uval))
{
this.val = uval[i]
if (verbose)
message(sprintf('trying values %s %s (%s of %s)',
comparator, this.val,
i, length(uval)))
this.graph = eval(parse(text = cmd))
node.ids = this.graph$nodes$dt$og.node.id
if (reverse.complement)
{
nix = this.graph$nodes %>% seq_along
mat = pmin(
this.graph$dist(nix, -nix, ignore.strand = FALSE),
this.graph$dist(-nix, nix, ignore.strand = FALSE))
reachable = as.matrix(
as.data.table(
which(mat<Inf,
arr.ind = TRUE))[,
.(nid1 = node.ids[row], nid2 = node.ids[col])])
}
else
{
reachable = as.matrix(
as.data.table(
which(this.graph$dist()<Inf,
arr.ind = TRUE))[,
.(nid1 = node.ids[row], nid2 = node.ids[col])])
}
if (max)
out[reachable] = pmax(out[reachable], this.val)
else
out[reachable] = pmin(out[reachable], this.val)
}
return(out)
},
#' @name window
#' @description
#'
#' Returns the region this gGraph spans as a GRanges
#'
#' @param pad A positive amount to pad the window by
#' @return GRanges of the region this gGraph covers
window = function(pad = 0)
{
return(streduce(gr.stripstrand(private$pnodes + pad)))
},
gtrack = function(y.field = NULL, lwd.loose = 3, col.loose = alpha("blue",0.6), col.alt = alpha("red",0.4), ...)
{
if (!length(self$nodes))
return(gTrack(GRanges(seqlengths = seqlengths(self))))
ss = private$pnodes
ed = private$pedges
lleft = self$nodes$lleft[, c('snode.id', 'node.id')]
lright = self$nodes$lright[, c('snode.id', 'node.id')]
if ((length(lleft) + length(lright))>0)
{
last = length(ss)
lleft$index = seq_along(lleft) + last
last = last + length(lleft)
lright$index = seq_along(lright) + last
loose.ed = rbind(
data.table(from = lleft$index,
to = self$queryLookup(lleft$snode.id)$index),
data.table(from = self$queryLookup(lright$snode.id)$index,
to = lright$index)
)[, type := 'loose']
loose = gr.flipstrand(grbind(lleft, lright))
loose$loose = TRUE
ss$loose = FALSE
ss = grbind(ss, loose)
ed = rbind(ed, loose.ed, fill = TRUE)
}
if (!is.null(ed))
{
## set edge apperaances
## lwd, lty, col, cex.arrow, v, not.flat, h, dangle.w
if (is.null(y.field) || !is.element(y.field, colnames(ed))) {
ed[, y := 1]
} else
{
ed$y = ed[[y.field]]
}
if (!("col" %in% names(ed)))
{
ed$col = as.character(NA)
}
if (!("lty" %in% names(ed)))
{
ed$lty = as.numeric(NA)
}
if (!("v" %in% names(ed)))
{
ed$v = as.numeric(NA)
}
if (!("h" %in% names(ed)))
{
ed$h = as.numeric(NA)
}
if (!("cex.arrow" %in% names(ed)))
{
ed$cex.arrow = as.numeric(NA)
}
if (!("lwd" %in% names(ed)))
{
ed$lwd = as.numeric(NA)
}
if (!("dangle.w" %in% names(ed)))
{
ed$dangle.w = as.numeric(NA)
}
if (!("not.flat" %in% names(ed)))
{
ed$not.flat = as.logical(NA)
}
ed$col = as.character(ed$col)
ed$border = as.character(ed$border)
ed[, ":="(lwd = ifelse(is.na(lwd), ifelse(type=="ALT", log2(0.2*pmax(0, y, na.rm = TRUE)+2)+1, ifelse(type == 'loose', lwd.loose, 1)),lwd),
lty = ifelse(is.na(lty), ifelse(type=='loose', 1, 1),lty),
col = ifelse(is.na(col), ifelse(is.na(col), ifelse(type=="ALT",
ifelse(is.na(y),
col.alt,
ifelse(y>0,
col.alt,
alpha("purple", 0.3))),
ifelse(type=="loose",
col.loose,
alpha("grey",0.5))), col), col),
cex.arrow = ifelse(is.na(cex.arrow), 0, cex.arrow),
not.flat = ifelse(is.na(not.flat), type=="ALT", not.flat),
v = ifelse(is.na(v), ifelse(type=="ALT", 2, 1),v),
h = ifelse(is.na(h), ifelse(type=="ALT", 2, 1),h),
dangle.w = ifelse(is.na(dangle.w), 0.5, dangle.w))]
ed = ed[!is.na(from) & !is.na(to)]
}
## DONE: handle so/si-less edges, omit for now
## col, border, ywid
if (!("col" %in% names(values(ss)))){
ss$col = as.character(NA)
}
## col, border, ywid
if (!("border" %in% names(values(ss)))){
ss$border = as.character(NA)
}
ss$border = as.character(ss$border)
ss$col = as.character(ss$col)
if (!("lwd.border" %in% names(values(ss)))){
ss$lwd.border = as.numeric(NA)
}
if (!("ywid" %in% names(values(ss)))){
ss$ywid = as.numeric(NA)
}
ss$col = ifelse(is.na(ss$col), ifelse(ss$loose, alpha("white", 0), alpha("grey", 0.5)), ss$col)
ss$border = ifelse(is.na(ss$border), ifelse(ss$loose, ss$col, alpha("black", 0.5)), ss$border)
ss$lwd.border = ifelse(is.na(ss$lwd.border), 1, ss$lwd.border)
ss$ywid = ifelse(is.na(ss$ywid), ifelse(ss$loose, 0.001, 0.8), ss$ywid)
gt.args = private$pmeta[intersect(c('colormap',
'gr.colorfield',
'gr.labelfield',
'y.field',
'col',
'border',
'name',
"height",
'y1',
'y0',
'lwd.border',
'labels.suppress',
'labels.suppress.gr',
'label.suppress.grl',
'yaxis'), names(private$pmeta))]
args = list(...)
for (arg in names(args))
{
gt.args[[arg]] = args[[arg]]
}
if (!is.null(y.field)){
gt.args[['y.field']] = y.field
}
if (is.null(gt.args$stack.gap)){
gt.args[['stack.gap']] = 0
}
stack.gap = gt.args$stack.gap
y.field = gt.args$y.field
if (is.null(stack.gap)){
stack.gap = 1e5
}
if (is.null(gt.args$angle)){
gt.args[['angle']] = 0
}
if (is.null(gt.args$y0)){
gt.args[['y0']] = 0
}
if (is.null(gt.args$ylab)){
gt.args[['ylab']] = y.field
}
if (is.null(gt.args$col)){
gt.args$col = NA
}
gt.args$edges = ed
if (is.null(gt.args$name)){
gt.args$name = 'gGraph'
}
if (!is.null(y.field) && y.field %in% colnames(values(ss))){
ss$y = values(ss)[, y.field]
gt.args$y.field = 'y'
## let loose ends inherit their y value from their parent nodes
if (any(lix <- ss$loose))
{
ss$y[lix] = ss$y[ss$node.id[lix]] + 0.25
}
gt.args[['data']] = unname(ss)
gt = do.call(gTrack::gTrack, gt.args)
} else {
gt.args$y.field = "y"
gt.args$yaxis = FALSE
## stack node pairs via stack.gap
tmp.ss = ss[ss$snode.id>0]
tmp.ss$y = suppressWarnings(disjointBins(tmp.ss+stack.gap))-0.5
ss$y = tmp.ss$y[match(ss$node.id, tmp.ss$node.id)]
## stack parent graphs
## if the data here is the result of a
## concatenation
if (!is.null(ss$parent.graph) | !is.null(ss$parent.rep))
{
if (is.null(ss$parent.graph))
{
ss$parent.graph = ss$parent.rep
}
else if (!is.null(ss$parent.rep))
{
ss$parent.graph = paste(ss$parent.graph, ss$parent.rep)
}
## fill in any blanks using igraph
if(any(is.na(ss$parent.graph)))
{
cl = igraph::clusters(self$igraph, 'weak')$membership
tmp = data.table(pg = ss$parent.graph,
cl = cl[ss$node.id])
## use weak components to impute parent.graph
pg.map = tmp[!is.na(pg), pg[1], keyby = cl]
ss$parent.graph = pg.map[.(tmp$cl), V1]
}
mx = as.data.table(ss)[, max(y), keyby = .(seqnames, parent.graph)]
mx[, offset := c(0, cumsum(V1)[-.N]), by = .(seqnames)]
setkeyv(mx, c('seqnames', 'parent.graph'))
ss$y = ss$y + mx[.(as.character(seqnames(ss)), ss$parent.graph), offset]
}
## let loose ends inherit their y value from their parent nodes
if (any(lix <- ss$loose))
ss$y[lix] = ss$y[ss$node.id[lix]] + 0.2
gt.args[['data']] = unname(ss)
gt = do.call(gTrack::gTrack, gt.args)
}
return(gt)
},
#' @name trim
#' @description
#'
#' Trims the current gGraph to the provided GRanges and returns this as a new
#' gGraph.
#'
#' @param tile GRanges to trim on
#' @param mod Defaults to FALSE, set to TRUE to modify this gGraph
#' @param tile interval around which to trim the gGraph
#' @return new gGraph trimmed to tile, unless mod is set to TRUE
#'
#' @details
#' ```
#' gr = c(GRanges("1", IRanges(10000,100000), "+"), GRanges("2", IRanges(10000,100000), "+"))
#' new.gg = gg$trim(gr)
#' ```
#'
#' @author Joe DeRose
trim = function(tile)
{
## Some quick catch cases
if (length(tile) == 0) {
stop("tile cannot contain no nodes, nothing to trim around")
}
if(length(self) == 0) {
return(invisible(self))
}
if (is.character(tile))
tile = parse.gr(tile)
es = convertEdges(private$pnodes, private$pedges, metacols=T)
tile = gr.fix(tile, private$pnodes)
tile = streduce(tile)
## Get positive overlaps
nodes = unname(private$pnodes) %Q% (strand == "+")
new.nodes = gr.findoverlaps(nodes, tile)
## If the new trim has no nodes, return empty graph
if(length(new.nodes) == 0) {
return(gGraph$new())
}
nodes = nodes[new.nodes$query.id]
## Find which nodes had their ends changed
## Edges only will be kept if the end of a node was not trimmed
new.nodes$left = start(new.nodes)==start(nodes)
new.nodes$right = end(new.nodes)==end(nodes)
mcols(new.nodes) = cbind(mcols(new.nodes), mcols(nodes)) ## carry over the metadata
## map the edges
if (nrow(private$pedges)==0){
es = private$pedges
}
else {
## Get edges that have n1 or n2 == query.id
validNodes = new.nodes$query.id
es[, keep := n1 %in% validNodes & n2 %in% validNodes]
## Want to remove edges that went into nodes that were trimmed on one side
## Find query.id's that have all FALSE in left or right - might still be multiple trims but one side is trimmed
sg = gr2dt(new.nodes)[, .(mean(left)>0, mean(right)>0), by=query.id]
leftRemove = sg[V1 == FALSE, query.id]
rightRemove = sg[V2 == FALSE, query.id]
## Remove edges that had one of their sides trimmed
es = es[, keep := keep & !(n1 %in% leftRemove & n1.side == 0) & !(n2 %in% leftRemove & n2.side == 0)]
es = es[, keep := keep & !(n1 %in% rightRemove & n1.side == 1) & !(n2 %in% rightRemove & n2.side == 1)]
new.es = es[keep == TRUE, ]
## keep track of node sides that will now acquire loose ends
new.loose = unique(es[keep == FALSE, .(n = c(n1, n2), side = c(n1.side, n2.side))][n %in% (new.nodes$node.id), ])
if (nrow(new.loose)>0)
{
new.loose$present = TRUE
setkeyv(new.loose, c("n", "side"))
new.nodes$loose.right = new.nodes$loose.right | new.loose[.(new.nodes$node.id, 1), !is.na(present)]
new.nodes$loose.left = new.nodes$loose.left | new.loose[.(new.nodes$node.id, 0), !is.na(present)]
}
## map left==TRUE to n1 or n2 side == 0
## map right==TRUE to n1 or n2 side == 1
map = data.table(old = c(new.nodes[new.nodes$left]$query.id, new.nodes[new.nodes$right]$query.id),
side = c(rep(0, length(which(new.nodes$left))), rep(1, length(which(new.nodes$right)))),
new = c(which(new.nodes$left), which(new.nodes$right)))
setkeyv(map, c("old", "side"))
new.es[, ":="(n1 = map[.(n1,n1.side), new], n2 = map[.(n2,n2.side), new])]
es = new.es
}
## Remove miscellaneous metacols added in this function
new.nodes$left = NULL
new.nodes$right = NULL
new.nodes$query.id = NULL
new.nodes$subject.id = NULL
pmeta = copy(private$pmeta);
private$gGraphFromNodes(nodes = new.nodes,
edges = es)
private$pmeta = pmeta
return(invisible(self))
},
#' @name fix
#' @description
#'
#' Modifies (in place) the current seqlevels of the gGraph, including
#' keeping only certain seqlevels, dropping certain seqlevels, and replacing seqlevels.
#'
#' Warning: this may modify the graph including getting rid of nodes and edges (i.e. those outside
#' the retained seqlevels) and also change coordinates (ie move ranges that were previously on different
#' chromosomes to the same chromosome etc.). Use with caution!
#'
#' Default behavior is to replace 'chr', with ''.
#'
#' @param pattern character pattern to replace in seqlevels (used in a gsub, can have backreferences)
#' @param replacement character to replace pattern with (used in a gsub, can have backreferences)
#' @param drop character vector of seqlevels to drop or logical TRUE to drop all seqlevels that are unused (TRUE)
#' @param seqlengths new seqlengths i.e. named integer vector of seqlevels to drop or embed graph into
#' @author Marcin Imielinski
#' @return current graph modified in place with additional nodes and edges, as specified by user
fix = function(pattern = NULL, replacement = NULL, drop = TRUE, seqlengths = NULL)
{
if (is.logical(drop))
{
if (drop)
drop = setdiff(seqlevels(self), as.character(seqnames(private$pnodes)))
else
drop = NULL
}
if (!is.null(drop) | !is.null(seqlengths))
{
tmp = as.data.table(self$nodes$dt)
if (!is.null(seqlengths))
sl.new = seqlengths
else
sl.new = seqlengths(self)
ix = which(tmp$seqnames %in% names(sl.new) & !(tmp$seqnames %in% drop))
sl.new = sl.new[setdiff(names(sl.new), drop)]
gg = self
gg = self[ix, ]
sl.new = sl.new[setdiff(names(sl.new), drop)]
if (length(sl.new))
private$pnodes = dt2gr(as.data.table(gg$gr), seqlengths = sl.new)
else
private$pnodes = GRanges(seqlengths = sl.new)
private$pedges = gg$sedgesdt
}
if (!is.null(pattern) & !is.null(replacement))
{
tmp = as.data.table(private$pnodes)
tmp$seqnames = gsub(pattern, replacement, tmp$seqnames)
sl.new = seqlengths(self)
names(sl.new) = gsub(pattern, replacement, names(sl.new))
sl.new = data.table(sn = names(sl.new), sl = sl.new)[, max(sl), by = sn][, structure(V1, names = sn)]
private$pnodes = dt2gr(tmp, seqlengths = sl.new)
}
private$buildLookupTable() ## essential for graph integrity!!
private$stamp()
return(invisible(self))
},
#' @name add
#' @description
#'
#' Adds GRanges nodes, edges (data.table), or junctions to graph
#' Only one of the below parameters can be specified at a time (since
#' the graph is modified in place, order matters)
#'
#' @param nodes GRanges, strand is ignored
#' @param edges data.table specifying edges in existing table with field n1, n2, n1.side, n2.side
#' @param junctions Junction object or GRangesList coercible to junction object
#' @author Marcin Imielinski
#' @return current graph modified in place with additional nodes and edges, as specified by user
add = function(nodes = NULL, edges = NULL, junctions = NULL)
{
if (sum(c(!is.null(nodes), !is.null(edges), !is.null(junctions)))!=1)
stop('Exactly one of the following can be added in a given call to add: nodes, edges, junctions')
if (!is.null(nodes))
{
newnodes = grbind(self$nodes$gr, nodes)
private$gGraphFromNodes(nodes = newnodes,
edges = self$edgesdt)
return(invisible(self))
}
else if (!is.null(edges))
{
if (!is.data.table(edges))
edges = as.data.table(edges)
if (!all(c('n1', 'n2', 'n1.side', 'n2.side') %in% names(edges)))
{
stop('edges must be data.table with fields n1, n2, n1.side, n2.side')
}
if (any(duplicated(names(edges))))
{
stop('edges has duplicated column names')
}
if (is.null(edges$type))
edges$type = 'ALT'
if (any(iix <- is.na(edges$type)))
edges[iix, type := 'ALT']
newedges = rbind(self$edgesdt, edges, fill = TRUE)
private$gGraphFromNodes(nodes = self$nodes$gr,
edges = newedges)
return(invisible(self))
}
else if (!is.null(junctions))
{
if (!inherits(junctions, 'Junction'))
junctions = jJ(junctions)
bp = grl.unlist(junctions$grl)
self$disjoin(bp[, c()], collapse = FALSE)
ov = bp[, c('grl.ix', 'grl.iix')] %*% self$nodes$gr[, 'node.id']
strand(ov) = strand(bp)[ov$query.id]
ov1 = ov[ov$grl.iix == 1]
ov2 = ov[ov$grl.iix == 2]
newedges =
merge(gr2dt(ov1),
gr2dt(ov2),
by = 'grl.ix', allow.cartesian = TRUE)[, ":="(
n1 = node.id.x,
n1.side = ifelse(strand.x == '+', 'left', 'right'),
n2 = node.id.y,
n2.side = ifelse(strand.y == '+', 'left', 'right'))][, .(n1, n1.side, n2, n2.side, grl.ix, type = 'ALT')]
if (ncol(junctions$dt)>0)
{
newedges = cbind(newedges, junctions$dt)
newedges = newedges[, unique(colnames(newedges)), with = FALSE]
}
private$gGraphFromNodes(nodes = self$nodes$gr,
edges = rbind(self$edgesdt, newedges, fill = TRUE))
return(invisible(self))
}
},
#' @name json
#' @description
#' Creates a json file for active visualization using gGnome.js
#' annotations are node / edge features that will be dumped to json
#' @param filename character path to save to
#' @param save whether to save or return list object representing json contents
#' @param nfields which node fields to dump to json (NULL)
#' @param efields which edge fields to dump to json (NULL)
#' @param annotations which graph annotations to dump to json
#' @param settings gGnome.js settings values to add to the output JSON files (list)
#' @param cid.field field in the graph edges that should be used for setting the cid values in the JSON (default: 'sedge.id').
#' This is useful for cases in which there is some unique identifier used across samples to identify
#' identical junctions (for example "merged.ix" field, which is generated by merge.Junction())
#' @author Marcin Imielinski
json = function(filename='.',
maxcn=100,
maxweight=100,
save = TRUE,
verbose = FALSE,
annotations = NULL,
nfields = NULL, ## node metadata fields to dump
efields = NULL, ## edge metadata fields to dump
settings = list(y_axis = list(title = "copy number",
visible = TRUE)),
cid.field = NULL,
no.y = FALSE)
{
## Make sure that our nodes are not empty before visualizing
if (is.null(private$pnodes) || length(private$pnodes) == 0) {
stop("Cannot have empty graph and visualize it")
}
if (save){
if (file.exists(filename) && (file.info(filename)$isdir))
{
basedir = filename
filename = paste(basedir, "data.json", sep = '/')
} else
{
if (grepl('\\.js(on)*$', filename)){
basedir = dirname(filename)
} else
{
stop('filename must be directory in which data.json will be created or .json filepath')
}
}
if (!file.exists(basedir)) {
message('Creating directory ', basedir)
system(paste('mkdir -p', basedir))
}
}
## range of CN
ymin=0
ymax=maxcn
node.json =
gr2dt(self$nodes$gr[, "snode.id"])[, .(chromosome = seqnames, startPoint = start, endPoint = end, iid = snode.id, y = 1)]
if (length(nfields))
{
nfields = setdiff(intersect(nfields, names(self$nodes$dt)), names(node.json))
node.json = cbind(node.json, gr2dt(self$nodes$gr)[, nfields, with = FALSE])
}
.dtstring = function(dt)
dt[, gsub('\\|+', '|', gsub('\\|+$', '', gsub('^\\|+', '', do.call(paste, c(lapply(names(.SD), function(x) ifelse(!is.na(.SD[[x]]), paste0(x, '=', .SD[[x]]), '')), sep = '|')))))]
if (!is.null(annotations))
{
nodes.overlap.annotations = intersect(annotations, names(values(self$nodes$gr)))
if (length(nodes.overlap.annotations) == 0){
warning('There is no overlap between the provided annotations and the annotaions available in the nodes in your gGraph.')
node.json$annotation = ''
} else {
node.json = cbind(node.json, data.table(annotation = .dtstring(as.data.table(values(self$nodes$gr))[, intersect(annotations, names(values(self$nodes$gr))), with = FALSE])))
}
}
if (is.null(cid.field)){
cid.field = 'sedge.id'
} else {
if (!(cid.field %in% names(private$pedges))){
stop('Invalid cid.field: "', cid.field, '"')
}
if (any(is.na(get_cids(self, cid.field)))){
stop('Invliad cid.field: "', cid.field)
}
}
ed = data.table()
efields = setdiff(intersect(efields, names(private$pedges)), c("sedge.id", "class", "from", "to", "type", annotations))
if (nrow(private$pedges))
{
ed = copy(private$pedges)[sedge.id>0, intersect(names(private$pedges), c("sedge.id", "class", "from", "to", "type", cid.field, efields, annotations)), with = FALSE] ## otherwise change by reference!
# cid.field is sometimes only defines for ALT edges so here we add values for REF (We also need to add this later to LOOSE edges. See below)
cid_na = ed[is.na(get(cid.field)), .N]
if (cid_na > 0){
max.cid = max(ed[, get(cid.field)], na.rm = T)
ed[is.na(get(cid.field)), (cid.field) := (max.cid + .I)]
}
if (!is.null(annotations)){
edges.overlap.annotations = ed[, intersect(names(ed), annotations), with = FALSE]
if (length(edges.overlap.annotations) == 0){
warning('There is no overlap between the provided annotations and the annotaions available in the edges in your gGraph.')
ed$annotation = ''
} else {
ed$annotation = .dtstring(ed[, intersect(names(ed), annotations), with = FALSE])
}
ed$from = private$pnodes$snode.id[ed$from]
ed$to = -private$pnodes$snode.id[ed$to]
}
}
yf = NULL
if (!no.y && !is.null(yf <- self$meta$y.field) && yf %in% names(values(self$nodes$gr)))
{
settings$y.axis = yf
node.json[['y']] = pmin(pmax(ymin, values(self$nodes$gr)[, yf]), ymax)
node.json[['y']] = ifelse(is.na(node.json[['y']]), 0, node.json[['y']])
if (yf %in% names(private$pedges))
ed$weight = private$pedges[.(ed$sedge.id), yf, with = FALSE]
}
else
{
no.y = TRUE
}
## make loose end nodes
if (length(self$loose)>0)
{
lleft = self$nodes$lleft[, c('snode.id')]
lright = self$nodes$lright[, c('snode.id')]
last = length(self)
lleft$index = seq_along(lleft) + last
last = last + length(self)
lright$index = seq_along(lright) + last
lleft$weight = lright$weight = as.numeric(NA)
if (!is.null(yf))
{
values(lleft)$weight = values(self$nodes[lleft$snode.id]$gr)[[yf]]
values(lright)$weight = values(self$nodes[lright$snode.id]$gr)[[yf]]
}
loose.ed = rbind(
data.table(from = -lleft$snode.id,
to = NA, weight = lleft$weight),
data.table(from = lright$snode.id,
to = NA, weight = lright$weight)
)[, type := 'LOOSE']
## remove zero or NA weight loose ends
loose.ed = loose.ed[!is.na(weight), ][weight>0, ]
## add cid values for loose ends
max.cid = max(ed[, get(cid.field)], na.rm = T)
loose.ed[, (cid.field) := 1:.N + max.cid]
loose.ed[, class := '']
if (!is.null(annotations))
loose.ed$annotation = ''
ed = rbind(ed, loose.ed, fill = TRUE)
}
## remove nodes and edges that are off the map
## good.nodes = node.json[chromosome %in% seqlevels, iid]
## node.json = node.json[iid %in% good.nodes, ]
## ed[, good.count := rowSums(cbind(abs(from) %in% good.nodes, abs(to) %in% good.nodes), na.rm = TRUE)]
## ed[, node.count := rowSums(cbind(!is.na(from), !is.na(to)))]
## ed = ed[good.count == node.count, ]
# KSK: temporary fix for old-style notation (large values instead of negatives)
# of whether edge is going out/coming in from left or right side of node
# so that json is rendered correctly in gGnome.js:
# max.node.id <- max(self$dt$node.id)
# ed[from > max.node.id, from := (from - max.node.id)*-1]
# ed[to <= max.node.id, to := -1*to]
# ed[to > max.node.id, to := (to - max.node.id)]
# KSK end
## TODO: do not assume things are paired up
## do not assume the cn field in the segs is correct
node.json$title = as.character(node.json$iid)
node.json[, type := "interval"]
node.json[, strand := "*"]
## if any edge left, process
if (nrow(ed)>0){
## EDGE.JSON
ed[is.na(weight), weight := 0]
ed.json = ed[,
.(cid = get(cid.field),
source = from,
sink = to,
title = class,
type = type,
weight)]
if (!is.null(annotations))
ed.json = cbind(ed.json, ed[, "annotation", with = FALSE])
## append list of edge metadata features if efields is specified
if (length(efields))
{
.fix = function(x) as.list(x)[!is.na(unlist(x))]
## list of main features
mainl = lapply(split(ed.json, 1:nrow(ed.json)), .fix)
metal = lapply(split(ed[, efields, with = FALSE], 1:nrow(ed.json)), .fix)
## combine the two lists one inside the other
ed.json = unname(mapply(function(x,y) c(x, metadata = list(y)), mainl, metal, SIMPLIFY = FALSE))
}
} else {
ed.json = data.table(cid = numeric(0),
source = numeric(0),
sink = numeric(0),
title = character(0),
type = character(0),
weight = numeric(0))
}
## append list of node metadata features if nfields is specified
if (length(nfields))
{
.fix = function(x) as.list(x)[!is.na(unlist(x))]
mainl = lapply(split(node.json[, setdiff(names(node.json), nfields), with = FALSE], 1:nrow(node.json)), .fix)
metal = lapply(split(node.json[, nfields, with = FALSE], 1:nrow(node.json)), .fix)
## combine the two lists one inside the other
node.json = unname(mapply(function(x,y) c(x, metadata = list(y)), mainl, metal, SIMPLIFY = FALSE))
}
gg.js = list(intervals = node.json, connections = ed.json)
if (no.y){
settings$y_axis = list(visible=FALSE)
}
if (!is.null(self$meta$name) | !is.null(self$meta$description))
{
name = paste0('<h3>', self$meta$name, '</h3>')
description = paste0('', self$meta$description)
settings$description = paste(name, description)
}
if (!is.null(settings)){
gg.js = c(list(settings = settings), gg.js)
}
if (save){
if (verbose)
{
message("Saving JSON to: ", filename)
}
jsonlite::write_json(gg.js, filename,
pretty=TRUE, auto_unbox=TRUE, digits=4)
return(normalizePath(filename))
} else {
return(gg.js)
}
},
get.diameter = function(weights = NULL){
self$check
if (length(self$nodes)==0){
return(gW())
}
if (length(self$edges)>0){
tmp.ig = igraph::remove.edge.attribute(self$igraph, "weight")
} else {
tmp.ig = self$igraph
}
if (!is.null(weights)){
if (length(weights)==1 &&
is.character(weights) &&
is.element(weights, colnames(self$sedgesdt))){
tmp.ig = self$igraph
igraph::set.edge.attribute(
tmp.ig, name="weight",
value = self$sedgesdt[get.edge.attribute(tmp.ig)$sedge.id,
get("weights")])
} else if (length(weights)==nrow(self$sedgesdt)){
names(weights) = self$sedgesdt$sedge.id
igraph::set.edge.attribute(
tmp.ig, name="weight",
value = weights[
as.character(get.edge.attribute(tmp.ig)$sedge.id)])
}
}
diam = igraph::get_diameter(tmp.ig, weights = weights)
snd = self$gr[as.numeric(diam)]$snode.id
return(gW(snode.id = list(snd), graph = self))
},
circos = function(...){
have.circlize = !inherits(try(find.package("circlize")), "try-error")
if (!have.circlize){
stop("You need the packages 'circlize' to use this function.")
}
args = list(...)
## some important pars
labels.cex = ifelse(is.null(args$labels.cex), 1, args$labels.cex)
bands.height = ifelse(is.null(args$bands.height), 0.1, args$bands.height)
cn.height = ifelse(is.null(args$cn.height), 0.1, args$cn.height)
link.h.ratio = ifelse(is.null(args$link.h.ratio), 0.75, args$link.h.ratio)
junc.col = setNames(c('#e41a1c','#377eb8','#4daf4a','#984ea3'),
c('DUP-like', "DEL-like", "INV-like", "TRA-like"))
div.col3 = c('#67a9cf','#f7f7f7','#ef8a62')
juncs = gr2dt(gr.chr(grl.unlist(self$edges[type=="ALT"]$grl)))
bp1 = juncs[grl.iix==1][order(grl.ix)]
bp2 = juncs[grl.iix==2][order(grl.ix)]
cn.bed = self$nodes$dt[!is.na(cn)]
if (cn.bed[, all(!grepl("chr", seqnames))]){
cn.bed[, seqnames := paste0("chr", seqnames)]
}
circlize::circos.initializeWithIdeogram(
track.height = bands.height,
labels.cex = labels.cex)
## add CN
f = circlize::colorRamp2(
breaks = c(0, 2, cn.bed[, max(cn)]),
colors = div.col3)
circlize::circos.genomicTrackPlotRegion(
ylim = cn.bed[!is.na(cn), range(cn)],
track.height = cn.height,
cn.bed[, .(seqnames, start, end, cn)],
bg.border = NA,
panel.fun = function(region, value, ...) {
i = circlize::get.cell.meta.data(
"sector.numeric.index")
circlize::circos.genomicRect(
region,
value,
area = TRUE,
border = f(value),
baseline = 0,
col = f(value),
...)
})
## add junctions
circlize::circos.genomicLink(
bp1[, 1:3, with=FALSE],
bp2[, 1:3, with=FALSE],
col = setNames(junc.col[as.character(bp1$class)],
rownames(bp1)),
border=NA,
h.ratio = link.h.ratio)
circlize::circos.clear()
},
split = function(by = "parent.graph"){
if (!is.element(by, colnames(self$nodes$dt))){
stop(by, " not found among node metadata!")
}
val = self$nodes$dt[[by]]
uval = unique(val)
names(uval) = as.character(uval)
if (length(na.ix <- which(is.na(uval))) > 0){
names(uval)[na.ix] = "NA"
}
ggs = lapply(uval, function(x){
if (!is.na(x)){
return(self$copy[which(val==x),])
} else {
return(self$copy[which(is.na(val)),])
}
})
return(ggs)
}
),
private = list( #### PRIVATE GGRAPH
## ----- private fields
## ===== required
## node/vertex, a GRanges obj of strand-specific ranges
## pnodes GRanges has these reserved metadata fields:
## $index $loose.left $loose.right $node.id, $snode.id
pnodes = NULL,
## data.table of all edges in g, from, to, cn, type
## type can be REF, ALT
## pedges data.table has these reserved columns
## $from $to sedge.id $type $edge.id
pedges = NULL,
## timestamp of object helps us identify when the node edge / structure was last updated
ptimestamp = NULL,
## Lookup table - must be reset with buildLookupTable
lookup = NULL,
## ===== optional slots
## ALERT: whenever segs or es changes, all of these need to be reset!
## igraph obj representing the graph structure DEPRECATED
## pgraph = NULL,
## list of metadata including gTrack default settings
## (e.g. colormaps) which can be saved with the object
pmeta = list(),
loose.agg.fun = function(x) as.logical(sum(x)),
stamp = function() private$ptimestamp = tstamp(),
## ----- private methods
#' @name buildLookupTable
#' Builds the lookup table for this gGraph which is stored in private$lookup
#' Lookup table contains the columns:
#' snode.id - signed node id of the nodes
#' index - index corresponding to the input snode.id
#' rindex - index corresponding to the complement snode.id
#' @param genome Optional seqinfo if the user wants to
#' @author Joe DeRose
buildLookupTable = function() {
private$lookup = data.table()
if (length(private$pnodes) == 0) {
return(invisible(self))
}
lt = match(private$pnodes$snode.id, -private$pnodes$snode.id)
private$lookup = data.table(snode.id = private$pnodes$snode.id,
index = seq_along(private$pnodes),
rindex = lt)
setkey(private$lookup, snode.id)
return(invisible(self))
},
#' @name emptyGGraph
#' @description
#' Constructor, initializes an empty gGraph object. If the user does not provide genome
#' and using to update this class, this will try to inherit the current gGraph's seqinfo.
#' See class documentation for usage.
#' @param genome Optional seqinfo if the user wants to
#' @author Joe DeRose
emptyGGraph = function(genome=NULL)
{
## If there is an old private segs and it has seqinfo, inherit that seqinfo
## If the user provides genome, skip this step
if (is.null(genome)) {
if(!is.null(private$pnodes) && length(seqinfo(private$pnodes)@seqlengths) > 0) {
genome = seqinfo(private$pnodes)
}
}
if (inherits(genome, 'GRanges') | inherits(genome, 'GRangesList'))
genome = seqinfo(genome)
if (!is.null(genome) && !inherits(genome, 'Seqinfo'))
genome = Seqinfo(seqnames = names(genome), seqlengths = genome)
## Set up the private fields to be empty
private$pnodes = GRanges(seqinfo = genome)
private$pedges = data.table(from=integer(0),
to=integer(0),
type=character(0))
private$stamp()
return(invisible(self))
},
#' @name gGraphFromNodes
#' @description
#'
#' This is the "master" gGraph constructor
#' all other constructors funnel into this one.
#'
#' @param nodes granges of intervals, <sign is IGNORED>, optional metadata column $loose.left and $loose.right is a logical argument specifying whether the given side of the interval is a "loose end". (Note: all terminal node "sides" will be automatically labeled as loose ends.
#' @param edges optional
#' data.table with required fields n1, n1.side, n2, n2.side
#' representing node 1 and 2 index and side, where side = 0
#' is left side, side = 1 is right
#'
#' sets private fields $pnodes and $pedges field of gGraph
#' object.
#' @param nodes GRanges, strand is ignored
#' @param edges data.table with fields n1, n2, n1.side, and n2.side (see description for details)
#' @author Joe DeRose, Marcin Imielinski
gGraphFromNodes = function(nodes,
edges = NULL)
{
if (is.null(edges) || nrow(edges) == 0)
{
private$pedges = data.table(from = integer(0),
to = integer(0),
type = character(0))
## HOW TO ADD empty fields to self$edgesdt
## make sure empty edge data table also has the following default fields
## edges = data.table(n1 = numeric(0), n2 = numeric(0), n1.side = character(0), n2.side = character(0))
if (length(nodes)==0){
private$pnodes = GRanges(seqinfo = seqinfo(nodes))
return(self)
}
}
else
{
if (!all(c('n1', 'n1.side', 'n2', 'n2.side') %in% names(edges))) {
stop('edges table not in proper format: requires columns n1, n2, n1.side, n2.side, where n1 and n2 index the provided nodes and the side arguments specify right side if 1 and left side if 0')
}
edges = as.data.table(edges)
if (any(edges$n1>length(nodes) | edges$n2>length(nodes), na.rm = TRUE))
{
stop('n1 or n2 fields in edges table indexing out of range nodes')
}
if (any(ix <- is.na(edges$n1) & is.na(edges$n2)))
{
warning(paste('Removed', sum(ix), 'edges from graph that have NA in both n1 and n2, edges should have either n1 or n2 NA'))
edges = edges[!ix, ]
}
if (any(ix <- (!is.na(edges$n1) & is.na(edges$n1.side)) | (!is.na(edges$n2) & is.na(edges$n2.side))))
{
stop(paste('All non NA n1 or n2 must have an n1.side or n2.side, but we found', sum(ix), 'edges that violate this requirement'))
}
## Make sure to remove these columns as we will add them again later
if ("sedge.id" %in% names(edges)) {
edges[, sedge.id := NULL]
}
if("edge.id" %in% names(edges)) {
edges[, edge.id := NULL]
}
}
if (any(is.na(seqlengths(nodes)))){
nodes = gUtils::gr.fix(nodes)
}
if (is.null(nodes$loose.left)){
nodes$loose.left = FALSE
}
if (is.null(nodes$loose.right)){
nodes$loose.right = FALSE
}
if (!is.null(edges) && nrow(edges)>0)
{
if (is.character(edges$n1.side)) ## convert to numeric
{
edges[, n1.side := sign(n1.side == 'right')]
edges[, n2.side := sign(n2.side == 'right')]
}
nodes$loose.left = nodes$loose.left | !(1:length(nodes) %in% union(edges$n1[edges$n1.side==0], edges$n2[edges$n2.side==0]))
nodes$loose.right = nodes$loose.right | !(1:length(nodes) %in% union(edges$n1[edges$n1.side==1], edges$n2[edges$n2.side==1]))
} else
{
nodes$loose.left = TRUE
nodes$loose.right = TRUE
}
strand(nodes) = '+'
nodes$node.id = 1:length(nodes) ## for reverse compatibility, to get rid
names(nodes) = NULL
segs = c(nodes, gr.flipstrand(nodes))
segs$snode.id = ifelse(as.logical(strand(segs)=='+'), 1, -1)*segs$node.id
segs$index = 1:length(segs)
names(segs) = segs$snode.id
segs$loose.left = ifelse(is.na(segs$loose.left), FALSE, segs$loose.left)
segs$loose.right = ifelse(is.na(segs$loose.right), FALSE, segs$loose.right)
private$pnodes = segs
if (!is.null(edges) && nrow(edges)>0)
{
## FIXME: Need to change this to make sure it labels edges
type.provided = TRUE
if(!"type" %in% names(edges)) {
type.provided = FALSE
edges[, type := NA_character_]
}
## n1.side, n2.side --> strand combo
## 1, 0 --> n1+,n2+ and n2-,n1-
## 0, 1 --> n1-,n2- and n2+,n1+
## 1, 1 --> n1+,n2- and n2+,n1-
## 0, 0 --> n1-,n2+ and n2-,n1+
map = data.table(pid = private$pnodes$node.id, str = as.character(strand(private$pnodes)), id = 1:length(private$pnodes))
setkeyv(map, c('pid', 'str'))
edges[, jid := 1:.N]
## remove reserved fields or else we mess up merge below
if (!is.null(edges[['from']]))
edges$from = NULL
if (!is.null(edges[['to']]))
edges$to = NULL
## See if there is a cn in the edges, in which case we should carry it over
## is.cn = 'cn' %in% names(edges)
## is.type = 'type' %in% names(edges)
tmp = rbind(
edges[, .(jid, from = ifelse(n1.side == 1, map[.(n1, '+'), id], map[.(n1, '-'), id]),
to = ifelse(n2.side == 1, map[.(n2, '-'), id], map[.(n2, '+'), id]),
sedge.id = 1:.N)],
edges[, .(jid, from = ifelse(n2.side == 1, map[.(n2, '+'), id], map[.(n2, '-'), id]),
to = ifelse(n1.side == 1, map[.(n1, '-'), id], map[.(n1, '+'), id]),
sedge.id = -1*(1:.N))]
)
cols = c('jid', setdiff(names(edges), names(tmp)))
tmp = merge(tmp, edges[, cols, with = FALSE], by = "jid")
tmp[, edge.id := abs(sedge.id)]
## Drop unused columns
tmp[, c("jid","n1","n2","n1.side","n2.side") := NULL]
private$pedges = tmp
## if not REF then ALT
private$pedges[, type := ifelse(is.na(type), type, ifelse(type == 'REF', 'REF', 'ALT'))]
setkey(private$pedges, sedge.id)
}
private$buildLookupTable()
private$stamp()
if (is.null(edges) || nrow(edges)==0){
self$edges$mark(n1 = numeric(0), n2 = numeric(0), n1.side = character(0), n2.side = character(0))
}
## label edges with class and type
self$edges$mark(class = self$edges$class)
## label remaining edges with NA type as REF or ALT based on class
self$edges$mark(type = ifelse(is.na(self$edges$dt$type),
ifelse(self$edges$class == 'REF',
'REF', 'ALT'),
self$edges$dt$type)
)
return(self)
},
#' @name gGraphFromNodeObj
#' @description
#' Operates only on positive strand, treats all nodes in NodeObj as strandless -- so if a node is marked
#' negative it will be treated as a duplicate positive node
#' Possibly remove them idk
#' @author Joe DeRose
#' @param NodeObj gNode object e.g. one obtained from an existing gGraph
#' @param EdgeObj gEdge object e.g. one obtained from an existing gGraph
gGraphFromNodeObj = function(NodeObj, EdgeObj = NULL, looseterm = TRUE)
{
## Make sure that NodeObj and EdgeObj are gNode and gEdge respectively
if(!inherits(NodeObj, "gNode")) {
stop("NodeObj is not a gNode Object")
}
if(!is.null(EdgeObj) && !inherits(EdgeObj, "gEdge")) {
stop("EdgeObj is not a gEdge Object")
}
if(!identical(NodeObj$graph, EdgeObj$graph)) {
stop("NodeObj and EdgeObj do not point to the same graph")
}
## If we have no edges, just call the constructor on the Nodes Object
if(is.null(EdgeObj) || EdgeObj$length == 0) {
private$gGraphFromNodes(nodes = NodeObj$gr)
return(self)
}
## If we have Node Obj but no EdgeObj, throw an error
if(NodeObj$length == 0) {
stop("Error: Edges cannot non-empty if nodes are empty")
}
nodes = c(NodeObj$gr, NodeObj$clone()$flip$gr)
## Validate EdgeObj, no indices not present in index column of
edges = EdgeObj$sdt
if(!all(nix <- c(edges[,to], edges[,from]) %in% nodes$index)) {
stop(paste0("Edge Object contains indices not in NodeObj. Indicies to remove are: ",
paste(c(edges[,to], edges[,from])[!nix], collapse = " ")))
}
## FIXME: this only works because of how our nodes are set up but might fail later, need to use $index within convertEdges
nodes = nodes[!duplicated(nodes$index), ] ## MARCIN FIX .. happens with subgraphs from node / edge neibhbors
## identify any nodes that previously had edges
## on left or right and make these loose on that side
eid = EdgeObj$dt$edge.id
nid = NodeObj$dt$node.id
lleft = lright = c()
if (length(NodeObj$eleft)>0)
{
lleft = NodeObj$eleft$dt[!(edge.id %in% eid),
intersect(c(n1, n2), nid)]
}
if (length(NodeObj$eright)>0)
{
lright = NodeObj$eright$dt[!(edge.id %in% eid),
intersect(c(n1, n2), nid)]
}
edges = convertEdges(nodes, edges, metacols = TRUE)
nodes = NodeObj$gr
## assign any new loose ends caused by breaking
## these nodes from their previous partners
nodes$loose.left = nodes$loose.left | (nodes$node.id %in% lleft)
nodes$loose.right = nodes$loose.right | (nodes$node.id %in% lright)
private$gGraphFromNodes(nodes = nodes, edges = edges)
private$stamp()
return(self)
}
),
active = list(
## Returns a Node Object of the nodes in the graph
nodes = function()
{
if (length(self$gr)>0){
tmp = private$pnodes$snode.id[private$lookup[.(1:(length(private$pnodes)/2)), index]]
}
else{
tmp = NULL
}
return(gNode$new(tmp, self))
},
#' @name length
#' @description
#'
#' Returns the number of nodes in this gGraph (pair of nodes counts as 1 node).
#'
#' @return Number of nodes in this gGraph
length = function()
{
return(self$nodes$length)
},
copy = function() self$clone(),
timestamp = function() private$ptimestamp,
## Returns an Edge Object of the edges in the graph
edges = function()
{
if (nrow(self$sedgesdt)>0){
tmp = private$pedges[.(1:(nrow(private$pedges)/2)), sedge.id]
}
else{
tmp = NULL
}
return(gEdge$new(tmp, self))
},
## Returns all the loose nodes in the graph
## calls gNode $loose which uses $loose.left and $loose.right
## $pnodes metadata to determine the nodes that are loose on
## the left and loose on the right
loose = function()
{
return(self$nodes$loose)
},
## returns all terminal nodes in the graph
## (this is a subset of loose ends that have either
## 0 left or right degree)
terminal = function()
{
return(self$nodes$terminal)
},
#' returns adjacency matrix of graph as sparseMatrix
adj = function(){
adjMat = igraph::as_adj(self$igraph)
## if (is.element("cn", colnames(private$pedges))){
## adjMat[private$pedges[,cbind(from, to)]] = private$pedges$cn
## }
return(adjMat)
},
meta = function()
{
return(private$pmeta)
},
## Returns the igraph associated with this graph
## the edge weights of this igraph will be populated by the width of the target interval
igraph = function()
{
if (length(self$gr)==0)
return(igraph::graph_from_adjacency_matrix(sparseMatrix(1,1, x = 0)))
v = as.data.frame(
values(self$gr)[, c('index', 'node.id', 'snode.id')])
if (nrow(self$sedgesdt)==0)
{
G = igraph::graph_from_adjacency_matrix(sparseMatrix(1,1, x = 0, dims = rep(nrow(v), 2)))
V(G)$index = v$index
V(G)$node.id = v$node.id
V(G)$snode.id = v$snode.id
return(G)
}
ed = as.data.frame(self$sedgesdt)[, c("from", "to", "sedge.id", "edge.id", "type")]
G = igraph::graph_from_data_frame(ed,
vertices = v,
directed = TRUE)
edG = self$sedgesdt[.(igraph::E(G)$sedge.id), ]
E(G)$weight = width(self$gr)[edG$to]
return(G)
},
## Returns all of the nodes in the graph as a GRanges
gr = function() {
return(private$pnodes)
},
## Returns all of the nodes in the graph as a data.table
dt = function() {
return(as.data.table(private$pnodes))
},
## Returns an Junction Object of the edges in the graph
junctions = function()
{
return(self$edges$junctions)
},
## Returns all the edges in the graph as a data.table
sedgesdt = function() {
return(copy(private$pedges))
},
## Returns all the edges in the graph as a data.table
edgesdt = function() {
sides = c('left', 'right')
if (!nrow(private$pedges))
return(data.table(n1 = numeric(0), n2 = numeric(0), n1.side = character(0), n2.side = character(0)))
return(copy(convertEdges(self$gr, private$pedges[.(1:(nrow(private$pedges)/2)), ], metacols = TRUE)[, n1.side := sides[n1.side+1]][, n2.side := sides[n2.side+1]]))
},
## Returns a gTrack
gt = function()
{
stack.gap = private$pmeta$stack.gap
if (is.null(stack.gap)){
stack.gap = 5e4
}
self$gtrack(y.field = private$pmeta$y.field, stack.gap = stack.gap)
},
## Returns the footprint of this graph
footprint = function()
{
return(self$window())
},
diameter = function(){
self$check
return(self$get.diameter())
}
)
)
#' @name c.gGraph
#' @title c.gGraph
#' @description
#'
#' Concatenates gGraphs without doing any merging or aggregating
#'
#' @param ... set of gGraph arguments (names will be incorporated as graph metadata $parent.graph)
#' @return A new gGraph object that is the union of the nodes and edges in the input gGraphs
#' @author Marcin Imielinski
#' @export
'c.gGraph' = function(...){
args = list(...)
args = args[sapply(args, inherits, 'gGraph')]
if (is.null(names(args))){
names(args) = paste0('gGraph', 1:length(args))
}
all.nodes = lapply(names(args), function(nm)
{
gr = args[[nm]]$nodes$gr
if (!length(gr))
return(gr)
gr$parent.graph = nm
gr$og.node.id = gr$node.id
values(gr) = values(gr)[, setdiff(names(values(gr)), c('snode.id', 'index', 'node.id'))]
return(gr)
})
## need to offset node and edge ids in edge vector so they don't collide in the union graph
ns = elementNROWS(all.nodes)
noffset = c(0, cumsum(ns))[1:length(args)]
all.edges = mapply(function(nm, noffset)
{
gr = args[[nm]]$gr
edt = args[[nm]]$sedgesdt
edt$parent.graph = nm
edt$og.edge.id = edt$edge.id
edges = convertEdges(gr, edt, metacols = TRUE)
edges$n1 = edges$n1 + noffset
edges$n2 = edges$n2 + noffset
return(edges)
}, names(args), noffset, SIMPLIFY = FALSE)
meta = do.call('c', lapply(names(args), function(nm) args[[nm]]$meta))
meta = meta[!duplicated(names(meta))]
nodes = do.call(grbind, all.nodes)
edges = rbindlist(all.edges, fill = TRUE)
return(gGraph$new(nodes = nodes, edges = edges, meta = meta))
}
#' @name split.gGraph
#' @title split.gGraph
#' @description
#'
#' Split gGraph by a node metadata field
#'
#' @param by field name to split by, default "parent.graph"
#' @return A new gGraph object that is the union of the nodes and edges in the input gGraphs
#' @author Marcin Imielinski
#' @export
'split.gGraph' = function(gg, by = "parent.graph"){
return(gg$split(by = by))
}
#' @name gG
#' @title create gGraph
#' @description
#'
#' Wrapper that instantiates a gGraph object from a variety of different inputs.
#'
#' Only select parameters combos need to be / should be specified simultaneously.
#'
#' @examples
#' mygenome = c('chr1' = 1e6, 'chr2' = 1e7)
#' myjuncs = system.file('extdata', "delly.final.vcf.gz", package = "gGnome") ## junctions
#' mybreaks = gr.tile(mygenome, 10000) ## 1kb tiles across my genome
#' myweaver = system.file('extdata', 'weaver', package='gGnome')
#' myremixt = system.file('extdata', 'remixt', package='gGnome')
#' myprego = system.file('extdata', 'intervalFile.results', package='gGnome')
#' myjab = system.file('extdata', 'jabba.simple.rds', package="gGnome")
#'
#' gG(genome = mygenome)
#'
#' ## create a graph from junctions
#' gG(breaks = mybreaks, juncs = myjuncs)
#'
#' ## import a genome graph from popular callers
#' gG(jabba = myjab)
#' gG(weaver = myweaver)
#' gG(prego = myprego)
#' gG(remixt = myremixt)
#'
#'
#' ## hard code a graph from GRanges and edges
#' nodes = c(GRanges("1",IRanges(1,100),"*"), GRanges("1",IRanges(101,200),"*"),
#' GRanges("1",IRanges(201,300),"*"), GRanges("1",IRanges(301,400),"*"),
#' GRanges("1",IRanges(401,500),"*"))
#' edges = data.table(n1 = c(3,2,4,1,3),
#' n2 = c(3,4,2,5,4),
#' n1.side = c(1,1,0,0,1),
#' n2.side = c(0,0,0,1,0))
#' gg = gG(nodes = nodes, edges = edges, meta = list(gr.colorfield = 'type'))
#'
#' ## mostly for developer use
#' gG(nodeObj = gg$nodes, edgeObj = gg$edges)
#'
#' @param genome seqlengths or seqinfo object around which to build an empty gGraph
#' @param breaks GRanges around which to build a reference gGraph
#' @param junctions BND vcf or bedpe file path, gGraph::Junctions object, or GRangesList specifying pairs of locations to reconnect (can be used in conjunction with breaks)
#' @param alignments GRanges with fields $cigar $flag and $qname (e.g. output of bamUtils::read.bam or RSeqLib::BWA specifying alignments to a reference, the graph will then represent the (possibly multimapped) implicit nodes and edges representing an "end to end" walk of all queries lifted to the reference. Insertions are represented as 0-width nodes and multi-maps yield branches. The resulting graph can be walked to exhaustively or greedily enumerate all possible linear embeddings of the queries on the reference
#' @param jabba path to JaBbA .rds output file, if specified instantiates a gGraph object from JaBbA output with y.field "cn" specifying copy number
#' @param prego path to PREGO output file. if specified, instantiates a gGraph object from PREGO output with y.field "cn" specifying copy number
#' @param weaver path to Weaver output. if specified, instantiates a gGraph object from Weaver output with y.field "cn" specifying copy number
#' @param remixt path to RemiXT output, if specified, instantiates a gGraph object from ReMiXT output with y.field "cn" specifying copy number
#' @param rck path to RCK output, if specified, instantiates a gGraph object from RCK output with y.field "cn" specifying total copy number
#' @param nodes GRanges of unsigned intervals to be rejoined in gGRaph, used in conjunction with edges argument below
#' @param walks GRangesList or gWalk to build a "haplograph" around, see ?haplograph
#' @param edges data.table with field n1, n2, n1.side, n2.side with each row specifying an edge, i.e. which sides ("left" vs "right") of which integer node indices to connect in the gGRaph
#' @param nodeObj gNode object to create a gGraph around (similar to nodes input above), except generated via the $nodes accessor of an existing gGraph object, used in cojunction with gEdge input to create a new gGnome object from an existing one
#' @param edgeeObj gEdge object to create a gGraph around (similar to edges input above), except generated via the $edges accessor of an existing gGraph object
#' @param meta list of metadata to associate with this gGraph object, for now mostly used to populate gTrack visualization parameters
#' @return A new gGraph object that is the sum of the component gGraphs
#' @export
gG = function(genome = NULL,
breaks = NULL,
junctions = NULL,
alignments = NULL,
juncs = NULL,
prego = NULL,
jabba = NULL,
cougar = NULL,
weaver = NULL,
remixt = NULL,
rck = NULL,
nodes = NULL,
edges = NULL,
walks = NULL,
nodeObj = NULL,
edgeObj = NULL,
meta = NULL
)
{
if (!is.null(junctions))
juncs = junctions;
return(gGraph$new(genome = genome,
breaks = breaks,
juncs = juncs,
alignments = alignments,
prego = prego,
jabba = jabba,
cougar = cougar,
weaver = weaver,
remixt = remixt,
rck = rck,
nodes = nodes,
edges = edges,
walks = walks,
nodeObj = nodeObj,
edgeObj = edgeObj,
meta))
}
#' @name sum.gGraph
#' @title sum.gGraph
#' @description
#'
#' Sums gGraphs using default aggregation operation for metadata (sum for numeric and paste(..., collapse = ',') for character, and c() for list)
#'
#' @param ... set of gGraph arguments (names will be incorporated as graph metadata $parent.graph)
#' @param y gGraph
#' @return A new gGraph object that is the sum of the component gGraphs
#' @export
'sum.gGraph' = function(..., by = NULL, na.rm = TRUE){
cgg = c(...)
return(cgg$disjoin(by = NULL, na.rm = na.rm))
}
#' @name ==.gGraph
#' @rdname equals.gGraph
#' @title equals.gGraph
#' @description
#'
#' Returns TRUE if two graphs are equivalent by value based on having equivalent nodes and edges
#' (note this is different - intentionally - from identical, which will determine whether
#' the two variables are identical by reference i.e. point to the exact same location in memory)
#'
#' @param x gGraph
#' @param y gGraph
#' @return TRUE if objects are equivalent
#' @export
'==.gGraph' = function(x,y){
return(x$nodes == y$nodes & x$edges == y$edges)
}
#' @name ==.gNode
#' @rdname equals.gNode
#' @description
#'
#' Returns TRUE if two gNode objects are equivalent, based on having identical node grs
#' (note this is different - intentionally - from identical, which will determine whether
#' the two variables point to the exact same location in memory)
#'
#' @param x gNode
#' @param y gNode
#' @return TRUE if objects are equivalent
#' @export
'==.gNode' = function(x,y){
return(identical(x$gr, y$gr))
}
#' @name ==.gEdge
#' @rdname equals.gEdge
#' @description
#'
#' Returns TRUE if two gEdge objects are equivalent, based on having identical edge tables
#' (note this is different - intentionally - from identical, which will determine whether
#' the two variables point to the exact same location in memory)
#'
#' @param x gNode
#' @param y gNode
#' @return TRUE if objects are equivalent
#' @export
'==.gEdge' = function(x,y){
return(identical(x$dt, y$dt))
}
#' @name !=.gGraph
#' @rdname notequals.gGraph
#' @description
#'
#' Returns TRUE if two graphs are not equivalent based on having non equivalent nodes and edges
#' (note this is different - intentionally - from identical, which will determine whether
#' the two variables point to the exact same location in memory)
#'
#' @param x gGraph
#' @param y gGraph
#' @return TRUE if objects are not equivalent
#' @export
'!=.gGraph' = function(x,y){
return(x$nodes != y$nodes & x$edges != y$edges)
}
#' @name !=.gNode
#' @rdname notequals.gNode
#' @description
#'
#' Returns TRUE if two gNode objects are not equivalent, based on having non identical node grs
#' (note this is different - intentionally - from identical, which will determine whether
#' the two variables point to the exact same location in memory)
#'
#' @param x gNode
#' @param y gNode
#' @return TRUE if objects are not equivalent
#' @export
'!=.gNode' = function(x,y){
return(!identical(x$gr, y$gr))
}
#' @name !=.gEdge
#' @rdname notequals.gEdge
#' @description
#'
#' Returns TRUE if two gEdge objects are non equivalent, based on having non identical edge tables
#' (note this is different - intentionally - from identical, which will determine whether
#' the two variables point to the exact same location in memory)
#'
#' @param x gNode
#' @param y gNode
#' @return TRUE if objects are not equivalent
#' @export
'!=.gEdge' = function(x,y){
return(!identical(x$dt, y$dt))
}
#' @name +.gGraph
#' @rdname plus.gGraph
#' @description
#'
#' Adds two gGraphs using default aggregation operation for metadata (sum for numeric and paste(..., collapse = ',') for character, and c() for list)
#'
#' @param x gGraph
#' @param y gGraph
#' @return A new gGraph object that is the sum of the component gGRaphs
#' @export
'+.gGraph' = function(x,y){
return(sum(x,y))
}
#' @name [.gGraph
#' @rdname subset.gGraph
#' @description
#'
#' Overloads subset operator for gGraph.
#'
#' Subsetting nodes will remove any edges associated with non-existent nodes
#' Subsetting edges will not remove the associated nodes
#'
#' @param obj gGraph
#' @param i integer, logical, or expression in gNode metadata used to subset gNodes
#' @param j integer, logical, or expression in gEdge metadata used to subset gEdges
#' @return A new gGraph object that only contains the given nodes and edges
#' @author Marcin Imielinski
#' @export
'[.gGraph' = function(obj, i = NULL, j = NULL, with = TRUE, ...){
if (any(deparse(substitute(j)) != "NULL"))
{
edges = obj$edges[j, with = with]
if (all(deparse(substitute(i)) == "NULL")){
nodes = obj$nodes
}
else
{
nodes = obj$nodes[i, with = with]
nids = c(nodes$dt$index, nodes$flip$dt$index);
eid = edges$sdt[to %in% nids & from %in% nids, sedge.id]
if (length(edges)>0)
{
eix = edges$dt[, ix := 1:.N][.(eid), ix]
edges = edges[eix, with = with]
}
}
} else
{
if (all(deparse(substitute(i)) == "NULL")){
nodes = obj$nodes
}
else{
nodes = obj$nodes[i, with = with]
}
nids = c(nodes$dt$index, nodes$flip$dt$index);
if (length(nids) && length(nodes$edges))
{
eid = nodes$edges$sdt[to %in% nids & from %in% nids, sedge.id]
edges = obj$edges[eid, with = with]
}
else
edges = obj$edges[c()]
}
return(gGraph$new(nodeObj = nodes, edgeObj = edges, meta = obj$meta))
}
## ================== Non-Member Functions for gGraph ================== ##
#' @name length
#' @title length
#' @description
#' The number of nodes in the gGraph
#'
#' @param gGraph a \code{gGraph} object
#'
#' @return the number of nodes in the gGraph
#' @export
`length.gGraph` = function(x){
return(x$length)
}
#' @name length
#' @title length
#' @description
#' The number of walks in the gWalk
#'
#' @param gWalk a \code{gWalk} object
#'
#' @return the number of nodes in the gWalk
#' @export
`length.gWalk` = function(x){
return(x$length)
}
#' @name lengths
#' @title lengths
#'
#' @description
#'
#' establish s3 method for lengths
#'
#' @param gWalk a \code{gWalk} object
#'
#' @return the number of nodes per walk in the gWalk
#' @export
lengths = function(x, use.names = T) {
UseMethod("lengths")
}
#' @name lengths
#' @title lengths
#' @description
#' A vector of walk lengths associated with this walk
#'
#' @param gWalk a \code{gWalk} object
#'
#' @return the number of nodes per walk in the gWalk
#' @export
`lengths.gWalk` = function(x, use.names = FALSE){
return(x$lengths)
}
#' @name length
#' @title length
#' @description
#' The number of nodes in the gNode
#'
#' @param gGraph a \code{gNode} object
#'
#' @return the number of nodes in the gNode
#' @export
`length.gNode` = function(x){
return(x$length)
}
#' @name length
#' @title length
#' @description
#' The number of edges in the gEdge
#'
#' @param gGraph a \code{gEdge} object
#'
#' @return the number of edges in the gEdge
#' @export
`length.gEdge` = function(x){
return(x$length)
}
#' @name dim
#' @title dim
#' @description
#' The number of nodes and edges in the gGraph
#'
#' @param gGraph a \code{gGraph} object
#'
#' @return the number of nodes and edges in the gGraph
#' @export
`dim.gGraph` = function(x){
## input must be a gGraph!
if (!inherits(x, "gGraph")){
stop("Error: Invalid input.")
}
return(c(x$length, nrow(x$sedgesdt)/2))
}
#' @name refresh
#' @title refresh
#' @description
#' Updates gGraph object to reflect changes in source code
#'
#' @param gGraph object
#'
#' @return gGraph object
#' @exportMethod refresh
#' @export
setMethod("refresh", "gGraph",
function(x) {
return(gGraph$new(nodeObj = x$nodes,
edgeObj = x$edges,
meta = x$meta
))
})
#' @name convertEdges
#' @title convertEdges
#' @description
#' Takes nodes and edges and converts the edge table for the nodes if they were strandless
#' gEdge table will be the appropriate table for if we did:
#' nodes[which(as.logical(strand(nodes)=="+"))]
#' strand(nodes) = "*"
#' pre: Nodes have snode.id and index column (indicates index)
#' edges have sedge.id
#'
#' @author Joe DeRose
#' @param nodes GRanges of signed nodes (ie $pnodes in gGraph object)
#' @param edges data.table of signed edges (ie $pedges in gGraph object)
#' @keywords internal
convertEdges = function(nodes, edges, metacols = FALSE, cleanup = TRUE)
{
## Check to make sure we have some edge table, if not return empty
if(is.null(edges) || nrow(edges) == 0) {
return(data.table(n1 = integer(), n2 = integer(), n1.side = numeric(), n2.side = numeric()))
}
es = copy(edges)
nodedt = gr2dt(nodes)
setkey(nodedt, index)
## Map between to/from and n1.side, n2.side
## to (+) ---- n2.side = 0
## to (-) ---- n2.side = 1
## from (+) -- n1.side = 1
## from (-) -- n1.side = 0
es[, ":="(n1.side = ifelse(nodedt[.(es[,from]), strand] == "+", 1, 0),
n2.side = ifelse(nodedt[.(es[,to]), strand] == "+", 0, 1))]
## Get positive nodes
new.nodes = unname(nodes) %Q% (strand == "+")
## Create map between old id positions and new positions (pos - pos, neg - pos)
## Assumes no two values are not NA which they shouldnt be if everything is set up right on backend
indexMap = pmin(match(nodes$snode.id, new.nodes$snode.id), match(nodes$snode.id, -new.nodes$snode.id), na.rm=T)
indexMap = data.table(index = nodes$index, new.index = indexMap)
setkey(indexMap, index)
## Map the edges to their new location
es[, ":="(n1 = indexMap[.(from), new.index],
n2 = indexMap[.(to), new.index])]
if (is.null(es$sedge.id) || any(is.na(es$sedge.id))) ## infer sedge.id if not provided
{
## arbitrary order for edges
esign = es[, ifelse(n1+0.1*n1.side < n2+0.1*n2.side, 1, -1)]
nutag = factor(es[, ifelse(esign>0, paste(n1, n1.side, n2, n2.side), paste(n2, n2.side, n1, n1.side))])
es[, sedge.id := esign*as.integer(nutag)]
}
## remove duplicate edges ie those that are present in both positive and negative orientations hence have
## abs(sedge.id) present twice
if (cleanup)
{
es = es[!duplicated(abs(sedge.id)), ]
es = es[!is.na(n1) & !is.na(n2)]
}
## If the user chooses to keep metacols, just remove to and from, otherwise, only keep the essentials
if (metacols) {
es[, c("to","from") := NULL]
return(es)
} else {
return(es[, .(n1, n2, n1.side, n2.side,
cn = if('cn' %in% names(es)) cn,
type = if('type' %in% names(es)) type)])
}
}
## ================= gWalk class definition ================== ##
#' @name gWalk
#' @title gWalk
#' @description
#' gWalk object
#' @export
gWalk = setClass("gWalk")
gWalk = R6::R6Class("gWalk", ## GWALKS
public = list(
## snode.id is a list of snode.id vectors
## sedge.id is a list of sedge.id vectors
## grl is an arbitrary GRangesList representing a walk
## graph is a pointer to the graph that snode.id/sedge.id comes from if being used
## Only one of these can be non-NULL when initializing this object (graph must be used with snode.id and sedge.id)
## circular is a logical vector of length walks, specifying which walk should be interpreted as a circular contig, in which case an edge is implied from the last interval / node / edge in the walk to the first (i.e. do not include this in the walk definition)
## however, for circular walks defined through the sedge.id, we specify the edge between the final and the first node
initialize = function(snode.id = NULL,
sedge.id = NULL,
grl = NULL,
graph = NULL,
meta = NULL, ## metadata with one row per walk, can include every metadata field except for $circular, $walk.id
circular = NULL, ## logical vector specifying which contigs are circular, i.e. have an implied edge from the final node to the first
disjoin = FALSE, ## flag whether to collapse ie disjoin the intervals
drop = FALSE ## flag whether in the case of snode.id should we just drop any walks with non-existent edges (as opposed to just erroring out)
)
{
dgraph = NULL ## only used if grl and graph are both non NULL
if (all(is.null(snode.id), is.null(sedge.id), is.null(grl))) {
## initializing empty gWalk
if (is.null(graph)){
graph = gGraph$new()
}
## data.table of node.ids in the walk
private$pnode = gGraph$new()$nodes
## data.table of edge.ids in the walk
private$pedge = gGraph$new()$edges
## data.table of walk metadata
private$pmeta = data.table()
## Pointer to the graph the snode.id/sedge.ids come from
## If initialized with grl, this is the graph from the gWalk
private$pgraph = graph
return(self)
} else if (sum(c(!is.null(snode.id) | !is.null(sedge.id), !is.null(grl))) > 1) {
stop("More than one of snode.id, sedge.id and grl cannot be non-NULL")
}
if (!is.null(grl)) {
## will create the graph first using the unsigned nodes / edges constructor
## doing it differently depending on whether disjoin is flagged
grlu = grl.unlist(grl)
if (any(ix <- as.character(strand(grlu))=='*'))
strand(grlu)[ix] = '+'
## we disjoin by default if a graph is already provided
disjoin = disjoin | !is.null(graph)
if (length(grlu)>0)
{
if (disjoin)
{
## here we disjoin all nodes first and then build the graph
nodes = disjoin(gr.stripstrand(grlu))
tmpdt = as.data.table(grlu[, c('grl.ix', 'grl.iix')] %*% nodes)[, strand := as.character(strand(grlu)[query.id])]
}
else ## otherwise we keep all nodes separate
{
NONO.FIELDS = c('node.id', 'snode.id', 'loose.left', 'loose.right', 'index')
nodes = gr.stripstrand(grlu[, setdiff(names(values(grlu)), NONO.FIELDS)])
nodes$parent.graph = grlu$grl.ix
tmpdt = cbind(as.data.table(grlu[, c('grl.ix', 'grl.iix')]),
data.table(query.id = 1:length(nodes), subject.id = 1:length(nodes)))
}
## pos will take of duplicated created in the "disjoin" case where a single walk interval comprises multiple nodes
## i.e. has teh same grl.ix and grl.iix but different pos
## we want to make sure that these get ordered properly in the key call below
## so that on negative strands the leftward segments come after the rightward in the same grl.iix
tmpdt[, pos := (sign(strand=='+')-0.5)*start]
setkeyv(tmpdt, c("grl.ix", "grl.iix", 'pos'))
edges = tmpdt[, .(n1 = subject.id[-.N], n2 = subject.id[-1],
n1.side = sign(strand[-.N] == '+'),
n2.side = sign(strand[-1] == '-')), by = grl.ix]
if (!is.null(circular))
{
if (!is.logical(circular)){
stop('circular must be a logical vector')
}
circular = data.table(walk.id = 1:length(grl),
circular = circular)$circular
}
else {
circular = rep(FALSE, length(grl))
}
if (any(circular))
{
ix = which(circular)
setkeyv(tmpdt, c("grl.ix", "grl.iix"))
edges = rbind(edges, tmpdt[grl.ix %in% ix, .(n1 = subject.id[.N],
n1.side = sign(strand[.N] == '+'),
n2 = subject.id[1],
n2.side = sign(strand[1] == '-')
), by = grl.ix])
}
edges$grl.ix = NULL
snode.id = split(ifelse(tmpdt$strand=='+', 1, -1)*tmpdt$subject.id, tmpdt$grl.ix)[as.character(1:length(grl))]
if (!is.null(names(grl)))
names(snode.id) = names(grl)
if (!is.null(graph)) ## disjoin the input graph over the provided grls
{
dgraph = graph$clone()
## first we disjoin this graph over the intervals in ours
gr = disjoin(unlist(grl))
dgraph$disjoin(gr = gr)
}
if (ncol(values(grl))>0)
{
if (is.null(meta))
{
meta = as.data.table(values(grl))
}
else
{
meta = cbind(meta, as.data.table(values(grl)))
}
}
graph = gGraph$new(nodes = nodes, edges = edges)
## mark all class != 'REF' edges in the walks as ALT
graph$edges$mark(type = ifelse(graph$edges$dt$class == 'REF', 'REF', 'ALT'))
}
else
{
snode.id = list()
graph = gGraph$new(genome = grl)
}
}
if ((!is.null(snode.id) | !is.null(sedge.id)) & is.null(graph))
{
stop('graph must be non NULL if snode.id or sedge.id are provided')
}
if (!is.null(snode.id)) {
walk.names = names(snode.id)
if (is.null(walk.names))
walk.names = seq_along(snode.id)
names(snode.id) = seq_along(snode.id)
private$gWalkFromNodes(snode.id = snode.id,
graph = graph,
circular = circular,
drop = drop,
meta
)
}
## overwrite above if edges also provided
if (!is.null(sedge.id)) {
walk.names = names(sedge.id)
if (is.null(walk.names))
walk.names = seq_along(sedge.id)
names(sedge.id) = seq_along(sedge.id)
private$gWalkFromEdges(sedge.id = sedge.id,
snode.id = snode.id, ## provide node input to over-ride edges and also check if compatible
graph = graph,
circular = circular,
drop = drop,
meta = meta)
}
if (nrow(private$pmeta)>0)
{
setkey(private$pmeta, walk.id)
if (nrow(private$pnode))
setkey(private$pnode, walk.id)
if (nrow(private$pedge))
setkey(private$pedge, walk.id)
}
if (!is.null(dgraph)) ## in this case both grl and graph were provided, so now we will try to reconcile
{
## the trick is to label all edges of dgraph and then see
## if the walks use any edges that are <not> inherited from dgraph
## in which case we will break
dgraph$edges$mark(in.dgraph = TRUE)
self$graph$edges$mark(pid = as.character(self$graph$edges$dt$sedge.id))
cloned = self$copy
self$disjoin(graph = dgraph)
if (any(is.na(self$graph$edgesdt$in.dgraph)))
{
stop('Some grl edges are not present in provided graph, please check inputs')
}
self$graph$edges$mark(in.dgraph = NULL)
}
## fill in remaining metadata
if (self$length>0 && nrow(private$pnode))
{
tmp = private$pnode[, .(walk.id, snode.id)]
tmp$wid = width(self$graph$nodes$gr)[abs(tmp$snode.id)]
if (!is.null(walk.names))
private$pmeta$name = walk.names[private$pmeta$walk.id]
private$pmeta$wid = tmp[, .(wid = sum(as.numeric(wid), na.rm = TRUE)), keyby = walk.id][.(private$pmeta$walk.id), wid]
## we need to fix factors if drop == TRUE and there are missing walk.ids
if (drop == TRUE & any(private$pmeta$walk.id>self$length)){
lev = private$pmeta$walk.id
private$pmeta[, walk.id := as.integer(factor(as.character(walk.id), lev))]
private$pnode[, walk.id := as.integer(factor(as.character(walk.id), lev))]
if (nrow(private$pnode))
setkey(private$pnode, walk.id)
if (nrow(private$pmeta))
setkey(private$pmeta, walk.id)
if (nrow(private$pedge)>0)
{
private$pedge[, walk.id := as.integer(factor(as.character(walk.id), lev))]
setkey(private$pedge, walk.id)
}
}
}
private$ptimestamp = private$pgraph$timestamp
return(self)
},
#' sets metadata of gWalk object
#' (accessible through $dt accessor)
set = function(...)
{
self$check
args = list(...)
NONO.FIELDS = c('walk.id', 'circular', 'sedge.id', 'snode.id', 'length', 'wid')
args = list(...)
if (any(names(args) %in% NONO.FIELDS)){
stop(paste('Cannot modify the following reserved gWalk metadata fields:', paste(NONO.FIELDS, collapse = ', ')))
}
for (arg in names(args)){
private$pmeta[[arg]] = args[[arg]]
}
return(invisible(self))
},
#' @name gWalk.subset
#' @description
#' Allows for subsetting of the gWalk Object using bracket notation
#' @author Marcin Imielinski
subset = function(i)
{
self$check
if (is.null(i)){
i = integer()
}
if (is.logical(i)){
i = which(i)
}
if (!is.null(dim(i)))
stop('subscript dimensions malformed')
if (length(i)>0 && (is.numeric(i) | is.integer(i)))
{
if (max(i, na.rm = TRUE)>self$length){
stop('index out of bounds')
}
}
if (length(i)==0)
{
private$pnode = data.table()
private$pedge = data.table()
private$pmeta = data.table()
return(self)
}
## create nodes and their "mirrors" allows "walk flipping" using negative indices
ui = unique(i) ## need unique indices otherwise dups get created below!!
tmp.node = rbind(
private$pnode[.(abs(ui)), , allow.cartesian = TRUE],
copy(private$pnode[.(abs(ui)), ])[,
":="(walk.id = -walk.id, snode.id = -snode.id)][rev(1:.N), ])
new.edge = tmp.edge = data.table()
if (nrow(private$pedge)>0)
{
tmp.edge = rbind(
private$pedge[.(abs(i)), , allow.cartesian = TRUE],
copy(private$pedge[.(abs(i)), , allow.cartesian = TRUE])[,
":="(walk.id = -walk.id, sedge.id = -sedge.id)][rev(1:.N), ])
setkey(tmp.edge, walk.id)
}
tmp.meta = private$pmeta[.(abs(i)), , allow.cartesian = TRUE]
setkey(tmp.node, walk.id)
new.node = merge(data.table(query.id = 1:length(i), walk.id = i), tmp.node, by = 'walk.id', allow.cartesian = TRUE)
if (nrow(tmp.edge)>0)
{
new.edge = merge(data.table(query.id = 1:length(i), walk.id = i), tmp.edge, by = 'walk.id',, allow.cartesian = TRUE)
}
new.meta = merge(data.table(query.id = 1:length(i), walk.id = abs(i)), private$pmeta, by = 'walk.id', , allow.cartesian = TRUE)
new.node[, walk.id := query.id]
new.meta[, walk.id := query.id]
if (nrow(new.edge)>0)
{
new.edge[, walk.id := query.id]
private$pedge = new.edge[,-2]
}
else
private$pedge = data.table()
private$pnode = new.node[,-2]
private$pmeta = new.meta[, -2]
if (nrow(private$pnode))
setkey(private$pnode, walk.id)
if (nrow(private$pmeta))
setkey(private$pmeta, walk.id)
if (nrow(new.edge)>0)
{
setkey(private$pedge, walk.id)
}
return(invisible(self))
},
dts = function(ix = 1:self$length, makelists = TRUE)
{
self$check
if (self$length==0)
return(data.table())
ix = ix[ix %in% 1:self$length]
out = private$pmeta[.(ix), ]
if (makelists == FALSE) ## just return raw pmeta subset
return(out)
node.sum = data.table(
snode.id = rep(list(),self$length),
walk.id = private$pmeta$walk.id,
key = 'walk.id')
if (nrow(private$pnode))
{
ix2 = ix[ix %in% private$pnode$walk.id]
node.sum = private$pnode[.(ix2), .(snode.id = list(c(snode.id))), keyby = walk.id][.(ix),][, -1]
}
out = cbind(out, node.sum)
if (nrow(private$pedge)>0)
out = cbind(out, private$pedge[.(ix), .(sedge.id = list(c(sedge.id))), keyby = walk.id][.(ix),][, -1])
return(out)
},
#' @name fix
#' @description
#'
#' Returns a modified version of the gWalk with seqlevels "fixed", including
#' keeping only certain seqlevels, dropping certain seqlevels, and replacing seqlevels.
#'
#' Warning: this may modify the walks including getting rid of nodes and edges (i.e. those outside
#' the retained seqlevels) and also change coordinates (ie move ranges that were previously on different
#' chromosomes to the same chromosome etc.). Use with caution!
#'
#' Default behavior is to replace 'chr', with ''.
#'
#' @param pattern character pattern to replace in seqlevels (used in a gsub, can have backreferences)
#' @param replacement character to replace pattern with (used in a gsub, can have backreferences)
#' @param drop character vector of seqlevels to drop or logical TRUE to drop all seqlevels that are unused (TRUE)
#' @param seqlengths new seqlengths i.e. named integer vector of seqlevels to drop or embed graph into
#' @author Marcin Imielinski
#' @return current graph modified in place with additional nodes and edges, as specified by user
fix = function(pattern = NULL, replacement = NULL, drop = TRUE, seqlengths = NULL)
{
if (is.logical(drop))
{
if (drop)
drop = setdiff(seqlevels(self), as.character(seqnames(self$nodes$gr)))
else
drop = NULL
}
if (!is.null(drop) | !is.null(seqlengths))
{
if (!is.null(seqlengths))
sl.new = seqlengths
else
sl.new = seqlengths(self)
tmp = as.data.table(self$nodes$dt)
ix = tmp$seqnames %in% names(sl.new) & !(tmp$seqnames %in% drop)
sl.new = sl.new[intersect(names(sl.new), tmp$seqnames[ix])]
gg = refresh(self$graph$copy)
gg$nodes$mark(og.node.id = gg$nodes$dt$node.id)
gg$fix(seqlengths = sl.new, pattern = pattern, replacement = replacement, drop = drop)
if (length(sl.new))
{
map = unique(gg$dt[, .(snode.id = c(og.node.id, -og.node.id), new.snode.id = c(node.id, -node.id))], by = 'snode.id')
pnode.new = merge(private$pnode, map, by = 'snode.id', all.x = TRUE)[, .(snode.id, walk.id, walk.iid, new.snode.id)]
setkeyv(pnode.new, c('walk.id', 'walk.iid'))
## walks may need to be split or made empty because there are no longer seqlengths associated with them
## basically each NA to non NA should give a new walk id
pnode.new[, new.walk := cumsum(c(0, diff(!is.na(new.snode.id))>0)), by = walk.id]
## broken walks ie those with NA lose the right to be circular if they were before
pnode.new[, broken := any(is.na(new.snode.id))]
pnode.new[, new.walk.id := paste(walk.id, new.walk)]
pnode.new[, new.walk.iid := cumsum(!is.na(new.snode.id)), by = new.walk.id]
wmap = unique(pnode.new[, .(walk.id, new.walk.id, broken)])
meta.new = merge(private$pmeta, wmap, by = 'walk.id')
meta.new[, circular := circular & !broken]
setkey(meta.new, new.walk.id)
tmp = pnode.new[!is.na(new.snode.id), ]
snode.id = split(tmp$new.snode.id, factor(tmp$new.walk.id, meta.new$new.walk.id))
}
else
{
meta.new = private$pmeta
snode.id = lapply(meta.new$walk.id, function(x) c())
names(snode.id) = meta.new$walk.id
}
private$gWalkFromNodes(snode.id = snode.id,
graph = gg,
circular = meta.new$circular,
meta = meta.new
)
if (nrow(private$pnode))
setkey(private$pnode, walk.id)
if (nrow(private$pedge))
setkey(private$pedge, walk.id)
if (nrow(private$pmeta))
setkey(private$pmeta, walk.id)
private$ptimestamp = private$pgraph$timestamp
}
return(invisible(self))
},
#' @name rep
#' @description
#'
#' Creates "bubbles" in the underlying graph by replicating the gwalks. Node replication
#' replicates edges going in and out of all replicated nodes. If an edge connects a pair of replicated nodes
#' that edge will be replicated across all pairs of those replciated nodes. Walk replication will create "longer bubbles"
#' with fewer edges getting replicated i.e. it will only replicate intra-walk edges within each walk replicate (but not between
#' separate walk replicates).
#'
#' (note that this returns a copy of the gGraph that this node is linked to, i.e. it does not make
#' this gWalk stale)
#'
#' New graph keeps track of the parent node and edge ids in the original graph using node metadata parent.node.id
#' and edge metadata parent.edge.id i.e. the replicated nodes will be connected to the sources of the original nodes
#' and if replicated nodes connect to each other, then there will exist an edge connecting
#' all of their instances to each other.
#'
#' @param nodes = gNode object must point to a node in the graph, can also be an index of a node (but not a metadata expression), can also be a gWalk object
#' @param times scalar or vector of length self$length specifying how many times to replicate each of the nodes.
#' @author Marcin Imielinski
#' @return Returns a pointer to the new nodes
rep = function(times)
{
self$check
if (length(self)>0)
{
self$graph$clone()$rep(self, times)
}
},
print = function()
{
self$check
if (self$length==0)
{
message('empty gWalk object')
return()
}
TOOBIG = 5
TEASER = 3
.tease = function(stuff, toobig = TOOBIG, teaser = TEASER)
{
if (length(stuff)>toobig){
paste(paste(stuff[1:teaser], collapse = ' -> '), '-> ... ->', paste(stuff[length(stuff)-TEASER:1+1], collapse = ' -> '))
}
else{
paste(stuff, collapse = ' -> ')
}
}
message('gWalk object with ', self$length, ' walks (',
sum(!self$circular), ' linear and ',
sum(self$circular), ' circular)')
if (self$length>0)
{
ix = 1:min(self$length, TOOBIG)
cols = setdiff(colnames(private$pmeta), c('snode.id', 'sedge.id'))
out = self$dts(ix)[ , cols, with = FALSE]
if (nrow(private$pnode))
{
tmp.node = private$pnode[.(ix), .(walk.id, snode.id)]
tmp.node[, nix := private$pgraph$queryLookup(snode.id)$index]
setkey(tmp.node, walk.id)
teaser = tmp.node[.(ix), ][,
.(gr =
{
if (any(is.na(nix)))
''
else
.tease(gr.string(private$pgraph$gr[nix],
mb = FALSE))
}
), keyby = walk.id][.(ix),][, -1]
out = cbind(out, teaser)
}
print(out)
if (self$length>TOOBIG)
{
message('\n ... \n(', self$length-TOOBIG, ' more walks )')
}
}
},
#' @name gWalk json
#' @description
#' Dumps walk (and underlying graph)
#' to json file
#' @param filename character path to save to
#' @param save whether to save or return list object representing json contents
#' @param nfields which node fields to dump to json (NULL)
#' @param efields which edge fields to dump to json (NULL)
#' @param annotations which graph annotations to dump to json
#' @param include.graph if set to TRUE, then the output will include the underlying graph for the gWalk (TRUE)
#' @param settings for details see gGraph json() module
#' @param no.y for details see gGraph json() module
#' @param stack.gap this currently does not do anything, but is intended to control the spread of intervals on the y-axis
#' @param verbose
#' @author Marcin Imielinski
json = function(filename = '.',
save = TRUE,
verbose = FALSE,
annotations = NULL,
nfields = NULL,
efields = NULL,
stack.gap = 1e5,
include.graph = TRUE,
settings = list(y_axis = list(title = "copy number",
visible = TRUE)),
cid.field = NULL,
no.y = FALSE)
{
if (length(self) == 0){
warning('This is an empty gWalk so no JSON will be produced.')
return(NA)
}
if (length(self$edges) == 0){
warning('There are no edges in this gWalk so no JSON will be produced.')
return(NA)
}
# check if the gWalk includes walks with no ALT edges and if so then remove these and call json again
# non.alt.exist = any(self$dt[,sapply(sedge.id, length) == 0])
non.alt.exist = any(self$dt[,sapply(sedge.id, length) == 0] | self$dt[,sapply(sedge.id, anyNA) == TRUE]) ## kat: I had to add this bandaid because I was using subsetted gWalk objects, where walks of length one have NAs in place of NULLs for sedge.id... ultimately this should be patched under the hood in the gWalk subset function so that subsetted walk sedge.id output is EXACTLY the same style as original...
if (non.alt.exist){
# we call json function again but only including the walks that include at least one ALT edge (anyNA condition added to fix subsetting issue as described in the comments just above)
return(refresh(self[self$dt[,sapply(sedge.id, length) > 0] & self$dt[,sapply(sedge.id, anyNA) == FALSE]])$json(filename = filename,
save = save,
verbose = verbose,
annotations = annotations,
nfields = nfields,
efields = efields,
stack.gap = stack.gap,
include.graph = include.graph,
settings = settings,
no.y = no.y))
}
if (include.graph){
## build graph level js
graph.js = refresh(self$graph)$json(filename = NA, save = FALSE, verbose = verbose,
annotations = annotations, nfields = nfields, efields = efields,
settings = settings, no.y = no.y)
}
pids = split(self$dt[, .(pid = walk.id,
strand = '+',
type = ifelse(self$circular, 'cycle', 'path'))], 1:self$length)
efields = unique(c('type', efields))
protected_efields = c('cid', 'source', 'sink', 'title', 'weight')
rejected_efields = intersect(efields, protected_efields)
if (length(rejected_efields) > 0){
warning(sprintf('The following fields were included in efields: "%s", but since these are conserved fields in the json walks output then they will be not be included in efields. If these fields contain important metadata that you want included in the json output, then consider renaming these field names in your gWalk object.', paste(rejected_efields, collapse = '" ,"')))
efields = setdiff(efields, rejected_efields)
}
missing_efields = setdiff(efields, names(self$edges$dt))
if (length(missing_efields) > 0){
warning(sprintf('Invalid efields value/s provided: "%s". These fields were not found in the gWalk and since will be ignored.', paste(missing_efields, collapse = '" ,"')))
efields = intersect(efields, names(self$edges$dt))
}
sedu = dunlist(self$sedge.id)
cids = lapply(unname(split(cbind(data.table(cid = sedu$V1,
source = self$graph$edges[sedu$V1]$left$dt$snode.id,
sink = -self$graph$edges[sedu$V1]$right$dt$snode.id, # notice that we need to add negative sign here to meet the gGnome.js expectations
title = "",
weight = 1),
self$graph$edges[sedu$V1]$dt[, ..efields]
), sedu$listid)),
function(x) unname(split(x, 1:nrow(x))))
snu = dunlist(self$snode.id)
snu$ys = gGnome:::draw.paths.y(self$grl) %>% unlist
protected_nfields = c('chromosome', 'startPoint', 'endPoint',
'y', 'type', 'strand', 'title')
rejected_nfields = intersect(nfields, protected_nfields)
if (length(rejected_nfields) > 0){
warning(sprintf('The following fields were included in nfields: "%s", but since these are conserved fields in the json walks output then they will be not be included in nfields. If these fields contain important metadata that you want included in the json output, then consider renaming these field names in your gWalk object.', paste(rejected_nfields, collapse = '" ,"')))
nfields = setdiff(nfields, rejected_nfields)
}
missing_nfields = setdiff(nfields, names(self$nodes$dt))
if (length(missing_nfields) > 0){
warning(sprintf('Invalid nfields value/s provided: "%s". These fields were not found in the gWalk and since will be ignored.', paste(missing_nfields, collapse = '" ,"')))
nfields = intersect(nfields, names(self$edges$dt))
}
iids = lapply(unname(split(cbind(
data.table(iid = abs(snu$V1)),
self$graph$nodes[snu$V1]$dt[,
.(chromosome = seqnames,
startPoint = start,
endPoint = end,
y = snu$ys,
type = "interval",
strand = ifelse(snu$V1 > 0, "+", "-"),
title = abs(snu$V1))],
self$graph$nodes[snu$V1]$dt[,..nfields]), snu$listid)),
function(x) unname(split(x, 1:nrow(x))))
walks.js = lapply(1:length(self), function(x)
c(as.list(pids[[x]]),
list(cids = rbindlist(cids[[x]])),
list(iids = rbindlist(iids[[x]]))))
if (include.graph){
out = c(graph.js, list(walks = walks.js))
} else {
out = list(walks = walks.js)
}
if (save){
if (verbose)
{
message("Saving JSON to: ", filename)
}
jsonlite::write_json(out, filename,
pretty=TRUE, auto_unbox=TRUE, digits=4)
return(normalizePath(filename))
} else {
return(out)
}
},
#' @name gWalk eval
#' @description
#'
#' Evaluates an expression on gWalk node or edge metadata
#' and returns a vector of length length(self) one per walk.
#'
#' Ideally the expression should generate a scalar value for each walk.id, though will dedup and re-order
#' even if it does not.
#' @param x data.table style expression on node or edge metadata (will try each)
#' @param node data.table style expresion on node metadata
#' @param edge data.table style expresion on edge metadata
#' @author Marcin Imielinski
eval = function(x, node, edge)
{
self$check
if (self$length==0)
return(NULL)
out = NULL
if (missing('node') & !missing('x'))
delayedAssign("node", x)
if (missing('edge') & !missing('x'))
delayedAssign("edge", x)
if (!missing("node"))
{
out = tryCatch({
pdt = private$pgraph$dt
pdt = pdt[, setdiff(names(pdt), c('walk.id', 'walk.iid')), with = FALSE]
tmpdt = merge(private$pnode, pdt, by = 'snode.id')
setkeyv(tmpdt, c("walk.id", "walk.iid")) #fix order to match private$pnode
tmpdt = tmpdt[.(private$pnode$walk.id, private$pnode$walk.iid), ]
out = eval(parse(text = paste("tmpdt[, ", lazyeval::expr_text(node), ", keyby = walk.id]")))
out = unique(out, by = "walk.id")
out[.(1:self$length), ][[2]]
}, error = function(e) NULL)
}
if (is.null(out) & !missing('edge'))
{
out = tryCatch({
ped = private$pgraph$sedgesdt
ped = ped[, setdiff(names(ped), c('walk.id', 'walk.iid')),
with = FALSE]
tmpdt = merge(private$pedge, ped, by = 'sedge.id')
setkeyv(tmpdt, c("walk.id", "walk.iid")) #fix order to match private$pnode
tmpdt = tmpdt[.(private$pedge$walk.id, private$pedge$walk.iid), ]
out = eval(parse(text = paste("tmpdt[, ", lazyeval::expr_text(edge), ", keyby = walk.id]")))
out = unique(out, by = "walk.id")
out[.(1:self$length)][[2]]
}, error = function(e) NULL)
}
if (is.null(out))
stop('error with gWalk eval expression, check against node and edge metadata to see that all referenced fields exist')
return(out)
},
#' @name gWalk disjoin
#' @description
#' similar to gGraph disjoin but also updates according gWalk
#' and allows disjoining around a set of gRanges, essentially
#' adding "breaks" to the walks and allowing the ranges in those
#' walks to inherit metadata from overlapping GRanges
disjoin = function(gr = NULL, graph = NULL, by = NULL, collapse = TRUE, na.rm = TRUE, avg = FALSE, sep = ',', FUN = default.agg.fun.generator(na.rm = na.rm, sep = sep, avg = avg))
{
self$check
if (self$length==0){
return(invisible(self))
}
node.id = private$pgraph$nodes$dt$node.id
## disjoin current graph
tmpg = private$pgraph$copy
## mark old nodes in current graph
if (!collapse) ## keep track of node ids
{
tmpg$nodes$mark(node.id.old = 1:length(tmpg))
}
old.gr = tmpg$nodes$gr
if (!is.null(graph))
{
if (!collapse) ## unmark node.id.old to prevent collisions
graph$nodes$mark(node.id.old = NA_integer_)
tmpg = c(tmpg, graph)
}
if (!is.null(gr))
{
gr$node.id.old = NULL
## grb = gG(breaks = gr)
## if (!collapse)
## grb$nodes$mark(node.id.old = NA_integer_)
## tmpg = c(tmpg, grb)
}
tmpg$disjoin(gr = gr, by = by, na.rm = na.rm, collapse = collapse,
avg = avg, FUN = FUN)
## we will redo overlaps here
## since we know tmpg$nodes$gr are disjoint
## and we need coordinates to properly reorder
## any disjoint walk nodes that came from a parent
## node
old.gr$snode.id.old = old.gr$snode.id
old.gr$node.id.old = old.gr$node.id
if (collapse) ## we match on intervals since many input nodes collapsed into a single node
map = gr2dt(sort(gr.findoverlaps(old.gr, tmpg$nodes$gr, qcol = 'snode.id.old', scol = 'snode.id', by = by)))
else
{
old.gr$query.id = 1:length(old.gr)
new.gr = tmpg$nodes$gr %>% gr2dt
new.gr$subject.id = 1:nrow(new.gr)
map = merge(old.gr[, c("query.id", "snode.id.old", "node.id.old")] %>% gr2dt,
new.gr[, .(subject.id, node.id.old, snode.id)],
by = 'node.id.old')[, .(query.id, subject.id, snode.id.old, snode.id)]
}
map = map[, .(query.id, subject.id, snode.id.old, snode.id.new = snode.id)]
map[, query.iid := 1:.N, by = query.id]
setkey(map, snode.id.old)
pn = copy(private$pnode)[, walk.iid := 1:.N, by = walk.id]
pn[, node.id := abs(snode.id)]
pn.new = merge(pn, map, by.x = 'node.id',
by.y = 'snode.id.old', allow.cartesian = TRUE)
pn.new[, snode.id.new := sign(snode.id)*snode.id.new]
pn.new[, query.iid := sign(snode.id)*query.iid]
## sorting on (now signed) query.iid
## should cause the more rightwards nodes
## in a query.id to be later in the path
setkeyv(pn.new, c("walk.id", "walk.iid", "query.iid"))
pn.new = pn.new[, .(walk.id, snode.id = snode.id.new)]
private$gWalkFromNodes(
snode.id = split(pn.new$snode.id, pn.new$walk.id),
graph = tmpg,
meta = copy(private$pmeta),
circular = private$pmeta$circular)
private$ptimestamp = private$pgraph$timestamp
if (nrow(private$pnode))
setkey(private$pnode, walk.id)
if (nrow(private$pedge))
setkey(private$pedge, walk.id)
if (nrow(private$pmeta))
setkey(private$pmeta, walk.id)
return(invisible(self))
},
#' @name simplify
#' @description
#'
#' gWalk simplify will merge reference adjacent nodes in walk +/- "by field"
#' (i.e. which requires the nodes to additional match on by field)
#' and simplify the underlying graph.
#'
#' Note that this only simplifies walks;
#' the associated graph may not be completely simplified
#' since we retain all walk endpoints
#'
simplify = function(by = NULL, na.rm = TRUE, avg = FALSE, sep = ',', FUN = default.agg.fun.generator(na.rm = na.rm, sep = sep, avg = avg))
{
self$check
tmpgr = private$pgraph$nodes$gr
## we will first simplify tmpg
## then rematch the walk nodes to tmpg
newgraph = private$pgraph$copy
newgraph$nodes$mark(parent.node.id = 1:length(newgraph))
newgraph$simplify(FUN = NULL, by = by)
## we do simplify here just to keep track of nodes
## will do a "real" simplify downstream
## if the user wants to aggregate metadata from the walks
## now we just need to match up walk nodes
## to simplified graph
## some of our current walk nodes will become
## "internal" so this just requires doing a simple
## match up and then providing an updated set
## snode.ids after removing NAs
newnode = copy(private$pnode)
newnode$snode.id = sign(newnode$snode.id)*
match(abs(newnode$snode.id), newgraph$nodes$dt$parent.node.id)
newnode = newnode[!is.na(snode.id), ]
private$gWalkFromNodes(
snode.id = split(newnode[, snode.id], newnode[, walk.id]),
graph = newgraph,
meta = private$pmeta)
if (nrow(private$pnode))
setkey(private$pnode, walk.id)
if (nrow(private$pedge))
setkey(private$pedge, walk.id)
if (nrow(private$pmeta))
setkey(private$pmeta, walk.id)
private$ptimestamp = private$pgraph$timestamp
return(invisible(self))
},
#' @name mark
#' @description
#'
#' marks the nodes and edges in the graph corresponding to this walk
#' @author Marcin Imielinski
mark = function(...)
{
self$check
self$nodes$mark(...)
self$edges$mark(...)
},
#' @name fitcn
#' @description
#'
#' Given gWalks on a graph with node and edge
#' metadata "cn" attempts to find a non-negative integer
#' combination of those walks that generates those walks
#' minimizes the number of unique walks used (L0 norm).
#'
#' Modifications to this objecti include (default behavior)
#' if min.alt = TRUE then solution will minimizes the number of
#' walks that contain an ALT edge. If weight is provided
#' then the objective function be \eqn{\sum_i w_i*I[x_i>0]} where
#' w_i is the provided weight for walk i and I[x_i>0] is indicator
#' that is 1 if walk i has a nonzero copy number in the solution
#' and 0 otherwise.
#'
#' Complains if "cn" is not a metadata of nodes and edges
#' and if the graph is not "balanced" ie either
#' every node cn >= sum(edge cn) for both incoming
#' and outgoing edges.
#'
#' @param logical flag trim whether to "trim" the graph relative to the footprint of the walks
#' @param weight either a numeric metadata field of self or a self$length integer weight
#' @param min.alt logical flag whether to make objective function
#' @param verbose logical flag whether to output CPLEX output
#' @param edgeonly logical
#' @param evolve logical
#' @param n.sol numeric
#' @author Julie Behr
fitcn = function(trim=TRUE,
weight=NULL,
obs.mat=NULL,
verbose = FALSE,
min.alt=TRUE,
edgeonly = FALSE,
evolve=FALSE,
n.sol=2)
{
gw = self
gg = self$graph
if (is.null(gw$graph$nodes$dt$cn) |
is.null(gw$graph$edges$dt$cn))
{
stop('cn field is missing from node and edge metadata')
}
## quick check to see if graph is junction balanced
## ie that node cn >= sum(junction cn) for each node side
## and node cn == sum(junction cn) on all node sides
## where there is no loose end
rcn = gg$nodes$eval(sum(cn));
lcn = gg$nodes$eval(sum(cn), FALSE);
jbal.left = all((lcn == gg$nodes$dt$cn &
!gg$nodes$dt$loose.left) |
lcn <= gg$nodes$dt$cn &
gg$nodes$dt$loose.left, na.rm = TRUE)
jbal.right = all((rcn == gg$nodes$dt$cn &
!gg$nodes$dt$loose.right) |
rcn <= gg$nodes$dt$cn &
gg$nodes$dt$loose.right, na.rm = TRUE)
if (!jbal.left | !jbal.right)
warning('graph does not appear to be junction balanced, please check inputs')
constrain.evolution = function(K, gw, A, b, sense){
h = K[gw$edges[type=="ALT"]$dt[!duplicated(edge.id), edge.id],]
A = rbind(A, cbind(sparseMatrix(1, 1, x=0, dims=dim(h)), h))
b = c(b, rep(1, nrow(h)))
sense = c(sense, rep("L", nrow(h)))
return(list(A=A, b=b, sense=sense))
}
## only affects which x will optimize min(x * c),
## currently doesn't matter because all solutions
## (regardless of how optimal) are returned
constrain.observations = function(obs.mat, A, b, cvec, sense, vtype){
if(!(ncol(obs.mat)*2)==ncol(A))
stop('input obs.mat contains the wrong number of columns; should match length of gw')
p = nrow(obs.mat)
w = ncol(obs.mat)
Zero = sparseMatrix(1, 1, x=0, dims=c(2*w*p, 2*w*p))
A0 = Zero[rep(1, nrow(A)), 1:(2*p)]
Ap = cbind(Zero[rep(1, p), 1:w], sign(obs.mat), diag(rep(-1, p)), Zero[rep(1,p), 1:p])
Mpub = cbind(Zero[rep(1,p), 1:(2*w)], diag(rep(1, p)), diag(rep(-1e7, p)))
Mplb = cbind(Zero[rep(1,p), 1:(2*w)], diag(rep(1, p)), diag(rep(-0.1, p)))
Amp = rbind(cbind(A, A0), Ap, Mpub, Mplb)
b = c(b, rep(0, 3*p))
cvec = c(cvec, rep(0, p), -1*rowMax(obs.mat))
sense = c(sense, rep("E", p), rep("L", p), rep("G", p))
vtype = c(vtype, rep("I", p), rep("B", p))
return(list(A=Amp, b=b, c=cvec, sense=sense, vtype=vtype))
}
## walks x edges matrix
generate.Ke = function(gw){
dt = gw$edgesdt[, c("walk.id", "sedge.id")][, edge.id := abs(sedge.id)]
dt$listid = factor(dt$walk.id, 1:length(gw))
cdt = dcast(dt[!is.na(sedge.id),], listid ~ edge.id, fun.aggregate=length, value.var="edge.id", drop=FALSE)
mat = cdt[, -1]
rownames(mat) = cdt$listid
return(t(mat))
}
## walks x nodes matrix
generate.Kn = function(gw){
dt = gw$nodesdt[, c("walk.id", "snode.id")][, node.id := abs(snode.id)]
dt$listid = factor(dt$walk.id, 1:length(gw))
cdt = dcast(dt[!is.na(snode.id),], listid ~ node.id, fun.aggregate=length, value.var="node.id", drop=FALSE)
mat = cdt[, -1]
rownames(mat) = cdt$listid
return(t(mat))
}
generate.Amat = function(K){
M = 1e7
K = as(K, 'sparseMatrix')
w = ncol(K)
## upper bound is infinity if indicator is positive
Zero = sparseMatrix(1, 1, x=0, dims=c(2*w, 2*w))
Amub = cbind(diag(rep(1, w)), diag(rep(-M, w)))
## lower bound > 0 if indicator is positive
Amlb = cbind(diag(rep(1, w)), diag(rep(-0.1, w)))
A = rbind(cbind2(K, Zero[rep(1, nrow(K)), (w+1:w)]), Amub, Amlb)
return(A)
}
generate.bvec = function(e, K){
w = ncol(K)
bvec = c(e, rep(0, 2*w))
return(bvec)
}
generate.cvec = function(K, weight, min.alt, gw){
if(!is.null(weight) | min.alt) weight = prep.weight(weight, min.alt, gw)
w = ncol(K)
if(is.null(weight)) weight=rep(1, w)
cvec = c(rep(0, w), weight)
return(cvec)
}
prep.weight = function(weight, min.alt, gw){
if(!is.null(weight)){
if(length(weight)==1 && is.character(weight) && weight %in% colnames(gw$dt)) weight = gw$dt[, weight, with=F]
if(!(is.numeric(weight))){
stop("weight must either be numeric vector of same length as gw or the name of a single numeric annotation in gw")
}
}
if(min.alt){
if(!is.null(weight)){
warning("modifying input weight to satisfy min.alt=TRUE")
} else weight = rep(1, ncol(K))
numalt = gw$eval(edge = sum(type=='ALT'))
## walks with no edges or only REF edges will not add to objective function
weight[is.na(numalt) | numalt==0] = 0
}
return(weight)
}
generate.vtype = function(K){
w = ncol(K)
vtype = c(rep("I", w), rep("B", w))
return(vtype)
}
generate.sense = function(K){
w = ncol(K)
e = nrow(K)
sense = c(rep("E", e), rep("L", w), rep("G", w))
return(sense)
}
if(!"cn" %in% colnames(gw$graph$edges$dt)){
stop('cn field must be populated in the input graph node and edges metadata')
# warning("gw$graph does not have cn field on edges. Defaulting to all cn=1...")
if(trim){
e = rep(1, length(unique(abs(unlist(gw$sedge.id)))))
e2 = rep(1, length(unique(abs(unlist(gw$snode.id)))))
} else{
e = rep(1, length(gw$graph$edges))
e2 = rep(1, length(gw$graph$nodes))
}
} else{
if(trim){
e = gw$graph$edges[sedge.id %in% abs(unlist(gw$sedge.id))]$dt$cn
e2 = gw$graph$nodes[snode.id %in% abs(unlist(gw$snode.id))]$dt$cn
} else{
e = gw$graph$edges$dt$cn
e2 = gw$graph$nodes$dt$cn
}
}
if (edgeonly)
{
K = generate.Ke(gw)
}
else
{
K = rbind(generate.Ke(gw), generate.Kn(gw))
e = c(e, e2)
}
if(nrow(K) != length(e)){
stop("Mismatch between size of A matrix and length of b vector. Some edges in gw$graph are not covered by gw. If this was intended, try trim=TRUE")
}
A = generate.Amat(unname(K))
b = generate.bvec(e, K)
c = generate.cvec(K, weight, min.alt, gw)
sense = generate.sense(K)
vtype = generate.vtype(K)
if(evolve){
ll = constrain.evolution(K, gw, A, b, sense)
A = ll$A
b = ll$b
sense = ll$sense
}
if(!is.null(obs.mat)){
ll = constrain.observations(obs.mat, A, b, c, sense, vtype)
A = ll$A
b = ll$b
c = ll$c
sense = ll$sense
vtype = ll$vtype
}
## sol = Rcplex::Rcplex(cvec = c,
sol = Rcplex2(cvec = c,
Amat = A,
bvec = b,
sense = sense,
Qmat = NULL,
lb = 0,
ub = Inf,
n = n.sol,
objsense = "min",
vtype = vtype,
control = list(
trace = ifelse(verbose>=1, 1, 0),
tilim = 100,
epgap = 1))
if (!is.null(sol$xopt)){
sol = list(sol)
}
if(length(sol)==0){
stop("No solutions found satisfying given constraints")
}
rerun=T
while(rerun){
z = sign(vtype == 'B')
P = do.call(rbind, lapply(sol, function(x) x$xopt*z))
p = rowSums(P)-1
Ahat = rbind(A, P)
bhat = c(b, p)
sensehat = c(sense, rep('L', length(p)))
## sol.new = Rcplex::Rcplex(cvec = c,
sol.new = Rcplex2(cvec = c,
Amat = Ahat,
bvec = bhat,
sense = sensehat,
Qmat = NULL,
lb = 0,
ub = Inf,
n = n.sol,
objsense = "min",
vtype = vtype,
control = list(
trace = ifelse(verbose>=1, 1, 0),
tilim = 100,
epgap = 1))
## this could be optional?
## sol.new = sol.new[round(sapply(sol.new, function(x) x$obj))==round(sol[[1]]$obj)]
if(length(sol.new)==0){
rerun=F
} else{
sol = c(sol, sol.new)
if(length(sol)>=n.sol){
sol = sol[1:n.sol]
rerun=F
}
}
}
gw$set(cn = round(sol[[1]]$xopt[1:ncol(K)]))
if(length(sol)>1){
all = setNames(lapply(sol,
function(s)
round(s$xopt[1:ncol(K)])),
paste("cn", 1:length(sol), sep="."))
do.call(gw$set, all)
}
return(invisible(gw))
},
gtrack = function(name = NULL, stack.gap = 1e5, ...)
{
self$check
## inherit gTrack arguments from gGraph
gt.args = private$pgraph$meta[intersect(c('colormap',
'gr.colorfield',
'col',
'border',
'name',
'y1',
'y0',
'lwd.border',
'labels.suppress',
'labels.suppress.gr',
'label.suppress.grl',
'yaxis'), names(private$pgraph$meta))]
args = list(...)
for (arg in names(args))
{
gt.args[[arg]] = args[[arg]]
}
if (!is.null(name)){
gt.args[["name"]] = name
}
if (is.null(gt.args[['name']])){
gt.args[['name']] = 'gWalk'
}
gt.args[['draw.paths']] = TRUE
tmp.grl = unname(self$grl)
if (length(tmp.grl)>0){
values(tmp.grl)$is.cycle = self$dt$circular
}
gt.args[['data']] = tmp.grl
gt.args[['grl.labelfield']] = "name"
do.call(gTrack, gt.args)
}
),
private = list(
## data.table of node.ids in the walk
pnode = NULL,
## data.table of edge.ids in the walk
pedge = NULL,
## data.table of walk metadata
pmeta = NULL,
## timestamp of underlying graph, used to check staleness
ptimestamp = NULL,
## Pointer to the graph the snode.id/sedge.ids come from
## If initialized with grl, this is the graph from the gWalk
pgraph = NULL,
gWalkFromNodes = function(snode.id,
graph,
meta = NULL, ## metadata with one row per walk, can include every metadata field except for $circular, $walk.id
circular = NULL, ## specifies whether the given walk is circular
drop = FALSE ## logical flag specifying whether to drop provided walks traversing non-existent edges
) ## logical vector specifying which contigs are circular, i.e. have an implied edge from the final node to the first
{
pnode.empty = as.data.table(list(walk.id = c(), snode.id = c()))
pedge.empty = as.data.table(list(walk.id = c(), sedge.id = c()))
if (length(snode.id)==0)
{
private$pnode = pnode.empty
private$pedge = pedge.empty
private$pmeta = data.table()
private$pgraph = graph
return()
}
## List of snode.id's
## Get list of sedge.ids with only snode.ids from each element
if (!is.null(circular))
{
if (!is.logical(circular)){
stop('circular must be a logical vector')
}
}
else {
circular = rep(FALSE, length(snode.id))
}
if (!is.null(meta)) ## need this for some reason to get rid of pass by reference stickiness (doesn't work higher in call stack)
{
if (!is.data.table(meta))
meta = as.data.table(meta)
meta = copy(meta)
}
if (is.null(names(snode.id)))
names(snode.id) = seq_along(snode.id)
private$pmeta = data.table(walk.id = seq_along(snode.id), name = names(snode.id), length = elementNROWS(snode.id), wid = 0,
circular = circular)
if (!is.null(meta))
{
if (nrow(meta) != length(snode.id))
{
stop('data.table of meta.data must be same length and order as node, edge, or GRangesList list input to gWalk constructor')
}
GW.NONO = c('walk.id', 'walk.iid', 'circular', 'snode.id', 'sedge.id', 'wid', 'length', 'name', 'subject.id', 'query.id', 'i')
good.cols = setdiff(names(meta), GW.NONO)
if (length(good.cols)>0)
{
private$pmeta = cbind(private$pmeta, meta[, good.cols, with = FALSE])
}
}
## always faster to unlist than lapply
tmp = dunlist(unname(snode.id))
if (nrow(tmp)==0)
{
private$pnode = pnode.empty
private$pedge = pedge.empty
private$pgraph = graph
return()
}
pnode = tmp[, .(walk.id = listid, snode.id = V1)]
## first need to check if the edges corresponding to the consecutive
## node pairs in the input lists exist
pedge = pnode[, .(from = snode.id[-.N], to = snode.id[-1]), by = walk.id]
sedgesdt = copy(graph$sedgesdt)[, from := graph$gr$snode.id[from]][, to := graph$gr$snode.id[to]]
if (any(private$pmeta$circular))
{
ix = private$pmeta$circular
wid = private$pmeta$walk.id[ix]
last = sapply(snode.id[ix], function(x) x[length(x)])
first = sapply(snode.id[ix], "[", 1)
pedge = rbind(pedge,
data.table(
walk.id = wid,
from = last, to = first))
}
setkeyv(sedgesdt, c('from', 'to'))
setkey(pnode, walk.id)
## just in case there is more than one edge
## connecting a node pair (possible)
sedgesdt = unique(sedgesdt, by = c("from", "to"))
if (nrow(pedge)>0){
setkey(pedge, walk.id)
pedge$sedge.id = NA
if (nrow(sedgesdt))
pedge$sedge.id = sedgesdt[.(pedge$from, pedge$to), sedge.id]
if (!drop) ## default behavior is we stop if the provided node lists refer to non-existent edges
{
if (any(is.na(pedge$sedge.id))){
stop('One or more provided walks refers to a non-existent edge, please check your input or re-instantiate with drop = TRUE to ignore this error and remove those walks')
}
}
else
{
good.walks = setdiff(pnode$walk.id, pedge[is.na(sedge.id), walk.id])
pnode = pnode[.(good.walks), ]
pedge = pedge[.(good.walks), ]
private$pmeta = private$pmeta[good.walks, ]
}
if (nrow(pedge)>0)
{
pedge[, walk.iid := 1:.N, by = walk.id]
}
}
if (nrow(pnode)>0)
{
pnode[, walk.iid := 1:.N, by = walk.id]
}
private$pnode = pnode
private$pedge = pedge
private$pgraph = graph
private$ptimestamp = private$pgraph$timestamp
tmp = copy(pnode)
tmp$wid = width(self$graph$nodes$gr)[abs(tmp$snode.id)]
private$pmeta$wid = tmp[, .(wid = sum(as.numeric(wid), na.rm = TRUE)), keyby = walk.id][.(private$pmeta$walk.id), wid]
if (nrow(private$pmeta) && any(ix <- is.na(private$pmeta$wid)))
private$pmeta$wid[ix] = 0
},
gWalkFromEdges = function(sedge.id,
snode.id = NULL,
graph = NULL,
meta = NULL, ## metadata with one row per walk, can include every metadata field except for $circular, $walk.id
drop = FALSE,
circular = NULL) ## logical vector specifying which contigs are circular, i.e. have an implied edge from the final node to the first
{
pnode.empty = as.data.table(list(walk.id = c(), walk.iid = c(), snode.id = c()))
pedge.empty = as.data.table(list(walk.id = c(), walk.iid = c(), sedge.id = c()))
if (length(sedge.id)==0)
{
private$pnode = pnode.empty
private$pedge = pedge.empty
private$pmeta = data.table()
private$pgraph = graph
return()
}
if (!is.null(circular))
{
if (!is.logical(circular))
stop('circular must be a logical vector')
}
else {
circular = rep(FALSE, length(sedge.id))
}
private$pmeta = data.table(walk.id = seq_along(sedge.id), name = names(sedge.id), length = elementNROWS(sedge.id), wid = 0,
circular = circular)
if (!is.null(meta)) ## need this for some reason to get rid of pass by reference stickiness (doesn't work higher in call stack)
{
if (!is.data.table(meta))
meta = as.data.table(meta)
meta = copy(meta)
if (nrow(meta) != length(sedge.id))
{
stop('data.table of meta data must be same length and order as node, edge, or GRangesList list input')
}
GW.NONO = c('walk.id', 'walk.iid', 'circular', 'snode.id', 'sedge.id', 'wid', 'length', 'name', 'query.id', 'subject.id', 'i')
good.cols = setdiff(names(meta), GW.NONO)
if (length(good.cols)>0)
{
private$pmeta = cbind(private$pmeta,
meta[, good.cols, with = FALSE])
}
}
private$pgraph = graph
tmp = dunlist(unname(sedge.id))
if (nrow(tmp)==0)
{
private$pnode = pnode.empty
private$pedge = pedge.empty
return()
}
setkey(private$pmeta, walk.id)
pedge = tmp[, .(walk.id = listid, sedge.id = V1)]
tmpe = graph$sedgesdt[.(pedge$sedge.id), .(from, to)]
if (any(is.na(tmpe$to))){
stop('one or more provided sedge.ids are out of bounds')
}
pedge = cbind(pedge, tmpe)
## first need to check if consecutive edges in the provided
## walk share sink and source nodes
pedge[, matches.next := c(to[-.N] == from[-1], NA), by = walk.id]
if (any(!pedge$matches.next, na.rm = TRUE)){
stop('at least one pair of consecutive edges i and i+1 in the walk do not share a sink and source node, respectively')
}
pedge[, to := graph$dt$snode.id[to]]
pedge[, from := graph$dt$snode.id[from]]
pnode = merge(pedge, private$pmeta, by = 'walk.id')[, .(snode.id = {if (circular) from else c(from, to[.N])}), by = .(walk.id, circular)][, .(walk.id, snode.id)]
if (is.null(snode.id))
{
snode.id = split(pnode$snode.id, pnode$walk.id)
}
else ## check to see if provided node ids are compatible for walks with 1 or more edges
{
.f = function(y) sapply(y, function(x) paste(x, collapse = ', '))
ix = names(sedge.id)[base::lengths(sedge.id)>0]
if (length(ix))
{
if (!identical(.f(split(pnode$snode.id, pnode$walk.id)[ix]), .f(snode.id[ix])))
stop('plesae check inputs - provided snode.id and sedge.ids are not compatible')
}
tmp = dunlist(unname(snode.id))
## if so let's populate pnode with the snode.id version
pnode = tmp[, .(walk.id = listid, snode.id = V1)]
}
if (nrow(pedge)>0)
pedge[, walk.iid := 1:.N, by = walk.id]
if (nrow(pnode)>0)
pnode[, walk.iid := 1:.N, by = walk.id]
setkey(pnode, walk.id)
setkey(pedge, walk.id)
## check to see if circular are truly circular
## i.e. does the final edge touch the first node
if (any(private$pmeta$circular))
{
tmp = merge(pedge, private$pmeta[, .(walk.id, circular)], by = 'walk.id') %>%
merge(pnode[walk.iid == 1, .(walk.id, snode.id)], by = 'walk.id')
tmp = tmp[circular == TRUE, .(check = to[.N] == snode.id), by = .(walk.id, snode.id)]
if (length(notcircle <- tmp[check == FALSE, walk.id]))
stop(paste('Walk', paste(notcircle, collapse = ', '), "not circular (last edge doesn't touch the first node), please check circular flag"))
}
private$pedge = pedge
private$pnode = pnode
## run everything through gWalkFromNodes to make sure compatible
## private$gWalkFromNodes(snode.id = snode.id,
## graph = graph,
## circular = circular,
## drop = drop,
## private$pmeta
## )
}
),
active = list(
## Returns a GRangesList of walks in the graph
grl = function()
{
self$check
if (self$length == 0){
return(GRangesList(private$pgraph$gr[c()])[c()])
}
## Does not get both strands only 1 strand
nix = private$pgraph$queryLookup(private$pnode$snode.id)$index
out = split(private$pgraph$gr[nix], factor(private$pnode$walk.id, private$pmeta$walk.id))[as.character(1:self$length)]
values(out) = private$pmeta
return(out)
},
meta = function()
{
self$check
return(private$pmeta)
},
## object is stale if the recorded timestamp of the gGraph
## != timestamp the actual gGraph pointed to by pgraph
## suggesting that the indices are no longer valid
stale = function() if (!is.null(private$ptimestamp)) private$ptimestamp != private$pgraph$timestamp else FALSE,
## checks if object is stale i.e.
check = function() if (self$stale) stop('object is stale, underlying gGraph has changed. You will need to re-instantiate.'),
nodes = function()
{
self$check
return(private$pgraph$nodes[private$pnode$snode.id])
},
nodesdt = function()
{
private$pnode
},
edgesdt = function()
{
private$pedge
},
snode.id = function()
{
self$check
if (self$length==0)
return(list())
node.sum = data.table(
snode.id = rep(list(),self$length),
walk.id = private$pmeta$walk.id,
key = 'walk.id')
if (nrow(private$pmeta))
{
ix = private$pmeta$walk.id
node.sum = private$pnode[.(ix), .(snode.id = list(c(snode.id))), keyby = walk.id][.(ix),][, -1]
}
return(node.sum$snode.id)
},
sedge.id = function()
{
self$check
if (self$length==0)
return(list())
edge.sum = data.table(
sedge.id = rep(list(),self$length),
walk.id = private$pmeta$walk.id,
key = 'walk.id')
if (nrow(private$pmeta))
{
ix = unique(private$pmeta$walk.id)
edge.sum = private$pedge[.(ix), .(sedge.id = list(c(sedge.id))), keyby = walk.id][.(ix),][, -1]
}
return(lapply(edge.sum$sedge.id, function(x) x[!is.na(x)]))
},
copy = function() self$clone(),
## returns a length(self) logical vector specifying whether
## each walk is circular or not
circular = function()
{
self$check
return(private$pmeta$circular)
},
length = function() {
self$check
return(nrow(private$pmeta))
},
lengths = function() {
self$check
return(lengths(self$grl))
},
edges = function()
{
self$check
if (self$length == 0){
return(gEdge$new(graph = private$pgraph))
}
seid = private$pedge$sedge.id
seid = seid[!is.na(seid)]
return(private$pgraph$edges[seid])
},
#' @name nalt
#' @description
#' computes number of alt edges in each walk
nalt = function()
{
nalt = pmax(0, self$eval(edge = sum(class != 'REF')), na.rm = TRUE)
# private$pmeta[, nalt := nalt]
return(nalt)
},
footprint = function()
{
self$check
return(sort(reduce(gr.stripstrand(self$nodes$gr))))
},
gt = function()
{
self$check
return(self$gtrack())
},
graph = function()
{
self$check
return(private$pgraph)
},
dt = function() {
self$check
if (self$length==0){
return(data.table())
}
ix = 1:self$length
return(self$dts(ix))
}
)
)
#' @name c
#' @title c
#' @description
#'
#' Concatenates gWalk objects by id's
#'
#' @param gWalk objects
#'
#' @return a new concatenated gWalk Object
#' @export
`c.gWalk` = function(..., force = FALSE)
{
gWalk.list=list(...)
isg = sapply(gWalk.list, function(x) class(x)[1]=='gWalk')
if(any(!isg)){
stop('Error: All inputs must be of class gWalk.')
}
##Check to make sure they all come from the same graph
if (!force)
{
graphs = lapply(gWalk.list, function(x) x$graph)
if (length(graphs)>1)
{
gg = graphs[[1]]
for (i in 2:length(graphs))
if (graphs[[i]] != gg)
stop('All inputs must point to the same graph')
}
}
##Get all the pnode.id's to create new gNode
## snode.id = do.call('c', lapply(gWalk.list, function(x) {y = x$snode.id; names(y) = x$dt$name; y}))
sedge.id = do.call('c', lapply(gWalk.list, function(x) {y = x$sedge.id; names(y) = x$dt$name; y}))
snode.id = do.call('c', lapply(gWalk.list, function(x) {y = x$snode.id; names(y) = x$dt$name; y}))
circular = do.call('c', lapply(gWalk.list, function(x) x$circular))
metas = rbindlist(lapply(gWalk.list, function(x) x$meta), fill = TRUE)
return(gWalk$new(snode.id = snode.id, sedge.id = sedge.id, circular = circular, graph = gWalk.list[[1]]$graph, meta = metas))
}
#' @name gW
#' @title create gWalk
#' @description
#'
#' Wrapper that instantiates a gWalk object from a variety of different inputs. If a gGraph is provided
#' as input (in conjunction with sedge.id, snode.id, or grl input) then will check for existence of
#' the provided walks in the provided graph, and will error out if those walks do not exist).
#'
#'
#' @examples
#'
#' ## read in GRangesList of walks
#' walks.grl = readRDS(system.file('extdata', 'gw.grl.rds', package = "gGnome"))
#'
#' ## read in gGraph
#' gg = readRDS(system.file('extdata', 'walks.gg.rds', package="gGnome"))
#'
#' ## create gWalks from GRangesList - creates graph in which walks are disconnected, i.e. phased,
#' ## so there may be several intervals per reference region
#' gW(grl = walks.grl)
#'
#' ## disjoin graph from GRangesList, these refer to a single disjoint graph
#' ## where there is one interval per reference region
#' gW(grl = walks.grl, disjoin = TRUE)
#'
#' ## thread GRangesList onto existing graph
#' gW(grl = walks.grl, graph = gg)
#'
#' ## from lists of signed node ids
#' nid.list = list(c(-888, 322, 324, 325, 325), c(-708, 699, -706, 701, 702, 702))
#' gW(snode.id = nid.list, graph = gg)
#'
#' ## from lists of signed edge ids
#' eid.list = list(c(-1462, 1461), c(-133, -132, -134))
#' gW(sedge.id = eid.list, graph = gg)
#'
#' @export
#'
#' @param snode.id list of (possibly negative) integer signed node ids in a gGraph
#' @param sedge.id list of (possibly negative) integer signed edge ids in a gGraph,
#' @param grl GRangesList of walks in the genome, each walk is a (stranded) GRangesList, if graph is NULL will build a gGraph from the provided GRangesLists
#' @param graph an existing graph from which to build a graph (used in conjunction with snode.id, sedge.id, or grl), if used with grl will attempt to match up the provided grl's with edges in the graph, if the GRL walks non-existent edges, then will error out
#' @param disjoin relevant for grl input when graph is NULL. If TRUE (default FALSE) will create a graph taking the GRanges disjoin of the GRanges comprising the provided GRL
#' @param drop logical flag only relevant when snode.id is specified indicating whether to drop any walks that refer to non-existent edges
#' @return gWalk object of provided walks, with pointers back to the graph to which they refer.
gW = function(snode.id = NULL,
sedge.id = NULL,
grl = NULL,
graph = NULL,
meta = NULL, ## metadata with one row per walk, can include every metadata field except for $circular, $walk.id
circular = NULL, ## logical vector specifying which contigs are circular, i.e. have an implied edge from the final node to the first
drop = FALSE,
disjoin = FALSE ## flag whether to collapse ie disjoin the intervals
)
{
return(gWalk$new(snode.id = snode.id,
sedge.id = sedge.id,
grl = grl,
graph = graph,
meta = meta,
circular = circular,
drop = drop,
disjoin = disjoin))
}
#' @name [.gWalk
#' @title gWalk
#' @description
#'
#' Overloads subset operator for walks
#'
#' @param obj gWalk object This is the gWalk object to be subset
#' @param i integer, logical, or expression on gWalk metadata used to subset gWalk
#' @return A new gWalk object that contains only the given subset of walks
#' @export
'[.gWalk' = function(obj, i = NULL, with = TRUE, ...){
walks = obj$clone()
if (with)
{
inew = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(i)))), parent.frame()),obj$dts(makelists = FALSE), parent.frame(2)), error = function(e) NULL)
if (is.null(inew))
inew = i ## just give up
}
else
{
inew = i
}
walks$subset(inew)
return(walks)
}
edge.queries = function(x, y) {
if (is.character(y)){
y = parse.gr(y)
}
gr = unlist(x$gr, use.names = FALSE)
if (inherits(y, "gEdge")) {
overlaps=ra.overlaps(x$junctions$grl, y$junctions$grl)
good.ix=overlaps[, "ra1.ix"]
good.ix=unique(good.ix)
if (is.na(good.ix)){
return("No overlaps")
}
else{
return(x[good.ix])
}
}
else if (inherits(y, "Junction")) {
juncs=x$junctions$grl
overlaps=ra.overlaps(juncs, y$grl)
good.ix=overlaps[, "ra1.ix"]
good.ix=unique(good.ix)
if (is.na(good.ix)){
return("No overlaps")
}
else{
return(x[good.ix])
}
}
}
#' @name gdist
#' @title gdist
#' @description
#'
#' Compute "distance" between two gGraphs using node and edge features.
#'
#' Distance computation involves
#' (1) assigning a value to each node and edge via value / nvalue / evalue expression (default cn metadata field)
#' (2) assigning a weight to each disjoint interval and edge in the union of the two graphs (log10(width) to intervals and 1 to edges)
#' (3) for each disjoint interval computing a ndist = sum(weight*|value1-value2|)/sum(weight), where if the interval does not exist for one of the graphs we use the value "fill" in place
#' (4) for each (ALT) edge present in one or both graphs compute edist = sum(weight*|value1-value2|)/sum(weight), filling in missing edges as above
#' (5) combining node.coef*ndist + edge.coef*edist
#'
#' Variations on these distances include defining alternate weight and value expressions
#' for nodes and edges (based on their metadata), changing the padding to align ALT junctions that are not exactly identical.
#'
#' If graphs are non-haploid (ie have two more intervals for a given reference region)
#' then agg.fun (default min) is used to aggregate the several ndist that map to a
#' that given reference region into a single ndist value for that coordinate.
#'
#'
#' @param gg1 first graph (or list of graphs)
#' @param gg2 second graph (or list of graphs)
#' @param value expression on node and edge metadata compute node and edge value
#' @param weight expression on node and edge metadata to compute node and edge weights
#' @param nvalue expression on node metadata to compute node value
#' @param evalue expression on edge metadata to compute edge value
#' @param nweight expression on node metadata to compute node weight
#' @param eweight expression on edge metadata to compute edge weight
#' @param normalize logical flag specifying whether to divide distance by sum(weight) (if TRUE) or not
#' @param fill 0 value to fill value for missing nodes and edges
#' @param na.rm logical flag specifying whether to remove NA when summing distance (should not be any NA's if value and weight are well specified
gdist = function(gg1, gg2,
weight = quote(log10(width)),
value = quote(cn),
nvalue = NULL,
evalue = NULL,
nweight = NULL,
eweight = 1,
fill = 0,
normalize = TRUE,
na.rm = TRUE,
pad = 1000,
agg.fun = min,
node.coef = 1,
edge.coef = 1,
verbose = FALSE,
L = 1
)
{
if (L==1)
lfun = function(x, y) abs(x-y)
else if (L==0)
lfun = function(x, y) abs(sign(x-y))
else if (L==2)
lfun = function(x, y) (x-y)^2
else
stop('L specifies L norm and can be either 0, 1, ,2')
if (deparse(substitute(nvalue)) == "NULL")
nvalue = substitute(value)
if (deparse(substitute(evalue)) == "NULL")
evalue = substitute(value)
if (deparse(substitute(nweight)) == "NULL")
nweight = substitute(weight)
if (deparse(substitute(eweight)) == "NULL")
eweight = substitute(weight)
.broken = function(x) is.null(x) || (!is.numeric(x) & !is.integer(x) & !is.logical(x))
## compare intervals
## make disjoint nodes
grb = grbind(gg1$nodes$gr, gg2$nodes$gr)
if (length(grb)==0)
return(NA)
dnodes = gr.stripstrand(disjoin(grb))
values(dnodes) = values(grb)[gr.match(dnodes, grb), ]
## weight is a feature of the disjoint granges, if ambiguity will choose the first
nwt = tryCatch(
eval(eval(parse(text = substitute(deparse(nweight))),
parent.frame()), gr2dt(dnodes), parent.frame(2)), error = function(e) NULL)
if (.broken(nwt)) ## try another way
nwt = tryCatch(
eval(eval(parse(text = substitute(deparse(substitute(nweight)))),
parent.frame()), gr2dt(dnodes), parent.frame(2)), error = function(e) NULL)
if (.broken(nwt)) ## give up
{
warning('weight expression did not complete for nodes, defaulting to 1')
nwt = rep(1, length(dnodes))
}
## disjoin by dnodes
suppressWarnings(gg1$disjoin(dnodes))
suppressWarnings(gg2$disjoin(dnodes))
gr1 = gg1$nodes$gr
gr2 = gg2$nodes$gr
## painful acrobatics to allow expressions as inputs to value
nval1 = tryCatch(
eval(eval(parse(text = substitute(deparse(nvalue))),
parent.frame()), gr2dt(gr1), parent.frame(2)), error = function(e) NULL)
if (.broken(nval1)) ## try another way
{
nval1 = tryCatch(
eval(eval(parse(text = substitute(deparse(substitute(nvalue)))),
parent.frame()), gr2dt(gr1), parent.frame(2)), error = function(e) NULL)
}
nval2 = tryCatch(
eval(eval(parse(text = substitute(deparse(nvalue))),
parent.frame()), gr2dt(gr2), parent.frame(2)), error = function(e) NULL)
if (.broken(nval2)) ## try another way
nval2 = tryCatch(
eval(eval(parse(text = substitute(deparse(substitute(nvalue)))),
parent.frame()), gr2dt(gr2), parent.frame(2)), error = function(e) NULL)
if (.broken(nval1)) ## give up
{
warning(sprintf('value expression failed for nodes in graph 1, defaulting to 1'))
nval1 = rep(1, length(gr2))
}
if (.broken(nval2)) ## give up
{
warning(sprintf('value expression failed for nodes in graph 2, defaulting to 1'))
nval2 = rep(1, length(gr2))
}
gr1$value = nval1
gr2$value = nval2
## intersect and setdiff node pairs in gg1 and gg2
nodes.int = gr1[, "value"] %*% gr2[, "value"]
names(values(nodes.int))[ncol(values(nodes.int))] = 'value.1'
nodes.sd1 = gr1[!(gr1 %^% gr2)][, "value"]
nodes.sd2 = gr2[!(gr2 %^% gr1)][, "value"]
names(values(nodes.sd2)) = 'value.1'
## outer + inner join of nodes based on coordinate
mnodes = grbind(nodes.int, nodes.sd1, nodes.sd2)[, c("value", "value.1")]
names(values(mnodes)) = c('value', 'value.1')
mnodes$ix = match(mnodes, dnodes)
mnodes = gr2dt(mnodes)
## fill any NAs for field
mnodes[is.na(value), value := fill]
mnodes[is.na(value.1), value.1 := fill]
## node distance
if (normalize)
ndist = mnodes[, .(diff = agg.fun(nwt[ix]*lfun(value, value.1))), by = ix][, sum(diff, na.rm = na.rm)/sum(nwt[ix], na.rm = na.rm)]
else
ndist = mnodes[, .(diff = agg.fun(nwt[ix]*lfun(value, value.1))), by = ix][, sum(diff, na.rm = na.rm)]
jj1 = gg1$junctions[type == 'ALT']
jj2 = gg2$junctions[type == 'ALT']
eval1 = tryCatch(
eval(eval(parse(text = substitute(deparse(nvalue))),
parent.frame()), gr2dt(gr1), parent.frame(2)), error = function(e) NULL)
if (.broken(eval1)) ## try another way
eval1 = tryCatch(
eval(eval(parse(text = substitute(deparse(substitute(nvalue)))),
parent.frame()), gr2dt(gr1), parent.frame(2)), error = function(e) NULL)
eval2 = tryCatch(
eval(eval(parse(text = substitute(deparse(nvalue))),
parent.frame()), gr2dt(gr2), parent.frame(2)), error = function(e) NULL)
if (.broken(eval2)) ## try another way
eval2 = tryCatch(
eval(eval(parse(text = substitute(deparse(substitute(nvalue)))),
parent.frame()), gr2dt(gr2), parent.frame(2)), error = function(e) NULL)
if (.broken(eval1))
{
warning(sprintf('value expression failed for edges for graph 1, defaulting to 1'))
eval1 = rep(1, length(jj1))
}
if (.broken(eval2)) ## give up
{
warning(sprintf('value expression failed for edges for graph 2, defaulting to 1'))
eval2 = rep(1, length(jj2))
}
jjm = merge.Junction(jj1, jj2, pad = pad, cartesian = TRUE, all = TRUE)$dt
jdist = 0
if (nrow(jjm)>0)
{
jjm$value = eval1[jjm$query.id]
jjm$value.1 = eval1[jjm$subject.id]
## some more acrobatics to parse eweight as an expression
ewt = tryCatch(eval(eval(parse(text = substitute(deparse(eweight))),
parent.frame()), jjm, parent.frame(2)), error = function(e) NULL)
if (.broken(ewt)) ## try another way
ewt = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(eweight)))),
parent.frame()), jjm, parent.frame(2)), error = function(e) NULL)
if (.broken(ewt)) ## give up
{
warning(sprintf('weight expression failed for edges, defaulting to 1'))
ewt = rep(1, nrow(jjm))
}
jjm$weight = ewt
jjdat = jjm[, .(value, value.1, weight)]
jjdat[is.na(value), value := fill]
jjdat[is.na(value.1), value.1 := fill]
if (normalize) ## normalize by the weights
jdist = jjdat[, diff := weight*lfun(value, value.1)][, sum(diff, na.rm = na.rm)/sum(weight, na.rm = na.rm)]
else
jdist = jjdat[, diff := weight*lfun(value, value.1)][, sum(diff, na.rm = na.rm)]
}
dist = node.coef*ndist + edge.coef*jdist
if (L==2)
dist = sqrt(dist)
return(dist)
}
#' @name merge
#' @title merge for undefined number of Junction objects
#'
#' @export
merge = function(...) {
UseMethod("merge")
}
#' @name merge.Junction
#' @title merge junctions by overlaps with padding
#'
#' Merges a set of junctions and keeps "seen.by" metadata of junction origin
#' using the argument names to this function
#'
#' If cartesian = TRUE, can only merge a pair of junction objects but then the output contains the overlapping
#' junctions and metadata annotated with a $query.id (index into first argument) and $subject.id
#' (index to into second argument) with deduped outputs,
#'
#'
#' @examples
#'
#' ## wil output a Junction object with metadata seen.by.svaba etc.
#' ## will pad with 500 bases prior to merging
#'
#' svaba = jJ(system.file('extdata', "HCC1143.svaba.somatic.sv.vcf", package = "gGnome"))
#' delly = jJ(system.file('extdata', "delly.final.vcf.gz", package = "gGnome"))
#' novobreak = jJ(system.file('extdata', "novoBreak.pass.flt.vcf", package = "gGnome"))
#'
#' ## merge(svaba = svaba, delly = delly, caller3 = novobreak, pad = 500)
#'
#' @param ... GRangesList representing rearrangements to be merged
#' @param pad non-negative integer specifying padding
#' @param cartesian whether to do a pairwise merge of all junction pairs in two junction objects x and y, which can potentially result in more rows than the number of inputs, Note: only works when there are exactly two inputs x and y
#' @param all only applicable if cartesian = TRUE, logical flag specifying whether to keep the junctions and metadata for non-overlapping junction pairs from both x and y inputs, aka "outer join" + "inner join"
#' @param all.x only applicable if cartesian = TRUE, logical flag specifying whether to keep the junctions and metadata for non-overlapping junction pairs from j1 in the output aka "left join" + "inner join""
#' @param all.y only applicable if cartesian = TRUE, logical flag specifying whether to keep the junctions and metadata for non-overlapping junction pairs from j2 in the output aka "right join" + "inner join"
#' @param ind logical flag (default FALSE) specifying whether the "seen.by" fields should contain indices of inputs (rather than logical flags) and NA if the given junction is missing
#' @export merge.Junction
#' @export
"merge.Junction" = function(..., pad = 0, ind = FALSE, cartesian = FALSE, all = FALSE, all.x = all, all.y = all)
{
list.args = list(...)
if (cartesian)
{
if (length(list.args)!=2)
stop('cartesian mode requires exactly two arguments')
ra1 = list.args[[1]]
ra2 = list.args[[2]]
ov = ra.overlaps(ra1$grl, ra2$grl, pad = pad)
ov = ov[!is.na(ov[,1]), , drop = FALSE]
if (nrow(ov)==0)
out.grl = ra1[c()]$grl
else
{
out.grl = ra1$grl[ov[,1]]
if (ncol(ra2$dt))
values(out.grl) = cbind(values(out.grl),ra2$dt[ov[,2]])
values(out.grl)$query.id = ov[,1]
values(out.grl)$subject.id = ov[,2]
}
.cgrl = function(x, y)
{
tmp1 = x
values(tmp1) = NULL
tmp2 = y
values(tmp2) = NULL
out = c(tmp1, tmp2)
values(out) = rbind(as.data.table(values(x)), as.data.table(values(y)), fill = TRUE)
out
}
if (all.x && length(ra1) && length(ix <- setdiff(1:length(ra1), ov[,1])))
{
left.grl = ra1[ix]$grl
values(left.grl)$query.id = ix
out.grl = .cgrl(out.grl, left.grl)
}
if (all.y && length(ra2) && length(ix <- setdiff(1:length(ra2), ov[,2])))
{
right.grl = ra2[ix]$grl
values(right.grl)$subject.id = ix
out.grl = .cgrl(out.grl, right.grl)
}
names(values(out.grl)) = dedup(names(values(out.grl)), itemize.all = TRUE)
Junction$new(out.grl)
}
else
{
if (all(sapply(list.args, length)==0))
return(jJ())
Junction$new(do.call(gGnome::ra.merge, c(lapply(list.args, function(x) x$grl), list(pad = pad))))
}
}
registerS3method("merge", "Junction", merge.Junction, envir = globalenv())
setMethod("%&%", signature(x = 'gEdge'), edge.queries)
#' @name default.agg.fun.generator
#' @title default.agg.fun.generator)
#' @description
#'
#' Generates aggregation function for gGraph reduce and disjoin.
#'
#' Returns a function that takes x and sums (+/- na.rm) if non-character, pastes with given sep if numeric, and concatenates if list
#' (note this returns a function, it is not the actual functioon)
#'
#' @author Marcin Imielinski
#' @return a function with the given characteristics
#' @keywords internal
#' @noRd
default.agg.fun.generator = function(na.rm = TRUE, avg = FALSE, sep = ',')
{
function(x)
{
if (length(x) == 1){
out = x
}
else if (all(is.na(x))){
out = x[1]
}
else if (is.numeric(x))
{
if (avg){
out = mean(x, na.rm = na.rm)
}
else{
out = sum(x, na.rm = na.rm)
}
}
else if (is.integer(x))
{
if (avg){
out = as.integer(median(x, na.rm = na.rm))
}
else{
out = as.integer(sum(x, na.rm = na.rm))
}
}
else if (is.logical(x)){
out = as.logical(sum(x, na.rm = na.rm))
}
else if (is.character(x) | is.factor(x)){
out = paste(unique(x[!is.na(x)]), collapse = sep)
}
else if (is.list(x))
{
out = do.call(c, x)
}
else
{
stop('gGraph default aggregation failed for unknown meta data type (numeric, integer, logical, character, list)')
}
return(out)
}
}
setGeneric("lengths")
#' @name lengths
#' @title lengths
#' @description
#'
#' @param x a gWalk object
#'
#' @return the number of nodes in each gWalk
#' @exportMethod lengths
setMethod("lengths", c("gWalk"),
function(x) {
return(base::lengths(x$snode.id))
})
#' @name seqinfo
#' @title seqinfo
#' @description
#'
#' @param x a gGraph object
#'
#' @return the seqinfo of this graph
#' @exportMethod seqinfo
setMethod("seqinfo", c("gGraph"),
function(x) {
return(seqinfo(x$gr))
})
#' @name seqinfo
#' @title seqinfo
#' @description
#'
#' @param x a gNode object
#'
#' @return the seqinfo of this gNode
#' @exportMethod seqinfo
setMethod("seqinfo", c("gNode"),
function(x) {
return(seqinfo(x$gr))
})
#' @name seqinfo
#' @title seqinfo
#' @description
#'
#' @param x a gEdge object
#'
#' @exportMethod seqinfo
#' @return the seqinfo of this gEdge
#' @exportMethod seqinfo
setMethod("seqinfo", c("gEdge"),
function(x) {
return(seqinfo(x$grl))
})
#' @name seqinfo
#' @title seqinfo
#' @description
#'
#' @param x a Junction
#'
#' @exportMethod seqinfo
#' @return the seqinfo of this Junction
setMethod("seqinfo", c("Junction"),
function(x) {
return(seqinfo(x$grl))
})
#' @name seqinfo
#' @title seqinfo
#' @description
#'
#' @param x a gWalk
#'
#' @return the seqinfo of this gWalk
#' @exportMethod seqinfo
#' @export
setMethod("seqinfo", c("gWalk"),
function(x) {
return(seqinfo(x$grl))
})
#' @name seqlengths
#' @title seqlengths
#' @description
#'
#' @param x a gGraph object
#'
#' @return the seqlengths of this graph
#' @exportMethod seqlengths
#' @export
setMethod("seqlengths", c("gGraph"),
function(x) {
return(seqlengths(x$gr))
})
#' @name seqlevels
#' @title seqlevels
#' @description
#'
#' @param x a gGraph object
#'
#' @return the seqlevels of this graph
#' @exportMethod seqlevels
#' @export
setMethod("seqlevels", c("gGraph"),
function(x) {
return(seqlevels(x$gr))
})
#' @name seqlengths
#' @title seqlengths
#' @description
#'
#' @param x a gNode object
#'
#' @return the seqlengths of this gNode
#' @exportMethod seqlengths
#' @export
setMethod("seqlengths", c("gNode"),
function(x) {
return(seqlengths(x$gr))
})
#' @name seqlengths
#' @title seqlengths
#' @description
#'
#' @param x a gEdge object
#'
#' @return the seqlengths of this gEdge
#' @exportMethod seqlengths
#' @export
setMethod("seqlengths", c("gEdge"),
function(x) {
return(seqlengths(x$grl))
})
#' @name seqlengths
#' @title seqlengths
#' @description
#'
#' @param x a Junction
#'
#' @return the seqlengths of this Junction
#' @exportMethod seqlengths
#' @export
setMethod("seqlengths", c("Junction"),
function(x) {
return(seqlengths(x$grl))
})
#' @name seqlengths
#' @title seqlengths
#' @description
#'
#' @param x a gWalk
#'
#' @return the seqlengths of this gWalk
#' @exportMethod seqlengths
#' @export
setMethod("seqlengths", c("gWalk"),
function(x) {
return(seqlengths(x$graph))
})
#' @name width
#' @title width
#' @description
#'
#' @param x a gWalk
#'
#' @return the width of each walk in this gWalk
#' @exportMethod width
#' @export
setMethod("width", c("gWalk"),
function(x) {
return(x$dt$wid)
})
#' @name width
#' @title width
#' @description
#'
#' @param x a gNode
#'
#' @return the width of each gNode
#' @exportMethod width
#' @export
setMethod("width", c("gNode"),
function(x) {
return(width(x$gr))
})
#' @name refresh
#' @title refresh
#' @description
#' Updates gWalk object
#'
#' @param gWalk object
#'
#' @return gWalk object
#' @exportMethod refresh
#' @export
setMethod("refresh", "gWalk",
function(x) {
gW(snode.id = x$snode.id, sedge.id = x$sedge.id, circular = x$circular, graph = x$graph, meta = x$meta)
})
#' @name gGraph.subset
#' @title subset gGraph on overlaps
#' @description
#'
#' @param x a gGraph
#' @param y a GRanges or string coercible to one
#' @return the portion of the gGraph that overlaps y (with loose ends wherever nodes / edges are broken)
#' @export
setMethod("%&%", signature(x = 'gGraph'), function(x, y) {
ix = x$nodes$gr %^% y
return(x[ix])
})
#' @name gNode subset
#' @title subset gNode on overlaps
#' @description
#'
#' @param x a gNode
#'
#' @return the subset of gNodes overlapping with the query
#' @export
setMethod("%&%", signature(x = 'gNode'), function(x, y) {
if (inherits(y, 'gNode'))
y = y$gr
if (inherits(y, 'gEdge'))
y = unlist(y$grl)
if (inherits(y, 'GRangesList') | inherits(y, 'CompressedGrangesList'))
y = unlist(y)
if (is.character(y)){
y = parse.gr(y)
}
ix = gr.in(x$gr, y)
return(x[ix])
})
#' @name gNode over
#' @title subset gNode on overlaps
#' @description
#'
#' @param x a gNode
#'
#' @return logical vector of length(x) specifying overlap
#' @export
setMethod("%^%", signature(x = 'gNode'), function(x, y) {
if (inherits(y, 'gNode'))
y = y$gr
if (inherits(y, 'gEdge'))
y = unlist(y$grl)
if (inherits(y, 'GRangesList') | inherits(y, 'CompressedGrangesList'))
y = unlist(y)
if (is.character(y)){
y = parse.gr(y)
}
return(gr.in(x$gr, y))
})
#' @name gEdge.in
#' @title subset gEdge on overlaps
#' @description
#'
#' @param x a gEdge
#'
#' @return the subset of gEdges overlapping with the query
#' @export
setMethod("%&%", signature(x = 'gEdge'), function(x, y) {
if (inherits(y, 'Junction') | inherits(y, 'gEdge'))
{
y = y$grl
}
if (inherits(y, 'gNode'))
{
y = y$gr
}
if (inherits(y, 'GRangesList') | inherits(y, 'CompressedGRangesList'))
{
ix = setdiff(ra.overlaps(x$grl, y)[,1], NA)
return(x[ix])
}
if (is.character(y)){
y = parse.gr(y)
}
ix = grl.in(x$grl, y)
return(x[ix])
})
#' @name gEdge.over
#' @title gEdge over
#' @description
#'
#' @param x a gEdge
#'
#' @return logical vector of length(x) which is TRUE if an overlap exists
#' @export
setMethod("%^%", signature(x = 'gEdge'), function(x, y) {
if (inherits(y, 'Junction') | inherits(y, 'gEdge'))
{
y = y$grl
}
if (inherits(y, 'gNode'))
{
y = y$gr
}
if (inherits(y, 'GRangesList') | inherits(y, 'CompressedGRangesList'))
{
ix = setdiff(ra.overlaps(x$grl, y)[,1], NA)
return(1:length(x) %in% ix)
}
if (is.character(y)){
y = parse.gr(y)
}
return(grl.in(x$grl, y))
})
#' @name gWalk.subset
#' @title subset gWalk on overlaps
#' @description
#'
#' @param x a gWalk
#'
#' @return the subset of gWalks overlapping with the query
#' @export
setMethod("%&%", signature(x = 'gWalk'), function(x, y) {
if (inherits(y, 'Junction') | inherits(y, 'gEdge'))
{
y = unlist(y$grl)
}
if (inherits(y, 'gNode'))
{
y = y$gr
}
if (is.character(y)){
y = parse.gr(y)
}
ix = grl.in(x$grl, y)
return(x[ix])
})
#' @name gWalk.over
#' @title subset gWalk on overlaps
#' @description
#'
#' @param x a gWalk
#'
#' @return logical vector of length(x) which is TRUE if an overlap exists
#' @export
setMethod("%^%", signature(x = 'gWalk'), function(x, y) {
if (inherits(y, 'Junction') | inherits(y, 'gEdge'))
{
y = unlist(y$grl)
}
if (inherits(y, 'gNode'))
{
y = y$gr
}
if (is.character(y)){
y = parse.gr(y)
}
return(grl.in(x$grl, y))
})
#' @name Junction.subset
#' @title subset gEdge on overlaps
#' @description
#'
#' @param x a Junction
#' @param y a Junction, gEdge, GenomicRanges, string
#' @return the subset of gEdges overlapping with the query
#' @export
setMethod("%&%", signature(x = 'Junction'), function(x, y) {
if (inherits(y, 'Junction') | inherits(y, 'gEdge'))
{
y = y$grl
}
if (inherits(y, 'gNode'))
{
y = y$gr
}
if (inherits(y, 'GRangesList') | inherits(y, 'CompressedGRangesList'))
{
ix = setdiff(ra.overlaps(x$grl, y)[,1], NA)
return(x[ix])
}
if (is.character(y)){
y = parse.gr(y)
}
ix = grl.in(x$grl, y)
return(x[ix])
})
#' @name Junction.over
#' @title Junction over
#' @description
#'
#' @param x a Junction
#' @param y a Junction, gEdge, GenomicRanges, string
#' @return logical vector of length(x) which is TRUE if an overlap exists
#' @export
setMethod("%^%", signature(x = 'Junction'), function(x, y) {
if (inherits(y, 'Junction') | inherits(y, 'gEdge'))
{
y = y$grl
}
if (inherits(y, 'gNode'))
{
y = y$gr
}
if (inherits(y, 'GRangesList') | inherits(y, 'CompressedGRangesList'))
{
ix = setdiff(ra.overlaps(x$grl, y)[,1], NA)
return(1:length(x) %in% ix)
}
if (is.character(y)){
y = parse.gr(y)
}
return(grl.in(x$grl, y))
})
#' @name jJ
#' @title parse junctions from a variety of formats
#'
#' @description Parsing various formats of structural variation data into junctions.
#'
#' @usage jJ(rafile,
#' keep.features = T,
#' seqlengths = NULL,
#' chr.convert = T,
#' geno=NULL,
#' flipstrand = FALSE,
#' swap.header = NULL,
#' breakpointer = FALSE,
#' seqlevels = NULL,
#' force.bnd = FALSE,
#' skip = NA)
#'
#' @param rafile path to the junctions file. See details for the compatible formats.
#' @param keep.features \code{logical}, if TRUE preserve meta data from the input
#' @param seqlengths a named \code{numeric} vector containing reference contig lengths
#' @param chr.convert \code{logical}, if TRUE strip "chr" prefix from contig names
#' @param geno \code{logical}, whether to parse the 'geno' fields of VCF
#' @param hg \code{character}, human genome version
#' @param flipstrand \code{logical}, if TRUE will flip breakpoint strand
#' @param swap.header path to the alternative VCF header file
#' @param breakpointer \code{logical}, if TRUE will parse as breakpointer output
#' @param seqlevels vector for renaming the chromosomes
#' @param force.bnd if TRUE overwrite all junction "type" to "BND"
#' @param skip \code{numeric} lines to skip
#'
#' @details
#' A junction is a unordered pair of strand-specific genomic locations (breakpoints). Within a given
#' reference genome coordinate system, we call the direction in which coordinates increase "+". A breakpoint
#' is a width 1 (\code{start==end})genomic range with \code{strand} specified, and "+" means the side with larger
#' coordinate is fused with the other breakpoint in a junction.
#'
#' \code{rafile} must be one of the following formats:
#' 1) Some VCF (variant call format). We currently support the VCF output
#' from a number of structural variation detection methods, namely
#' SvABA (https://github.com/walaj/svaba),
#' DELLY (https://github.com/dellytools/delly),
#' LUMPY (https://github.com/arq5x/lumpy-sv),
#' novoBreak (https://sourceforge.net/projects/novobreak/). In theory,
#' VCF defined with BND style should be compatible but be cautious
#' when using the output from other methods since
#' no universal data definition is adopted by the community yet.
#' 2) BEDPE (http://bedtools.readthedocs.io/en/latest/content/general-usage.html#bedpe-format)
#' 3) Textual output from Breakpointer
#' (http://archive.broadinstitute.org/cancer/cga/breakpointer)
#' 4) R serialized object storing junctions (.rds)
#'
#' @section Warning:
#' We assume the orientation definition in the input is consistent with ours. Check with
#' the documentation of your respective method to make sure. If the contrary, use
#' \code{flipstrand=TRUE} to reconcile.
#'
#' @return a \code{Junction} object
#'
#' @export
#' @author Xiaotong Yao
#' @importFrom VariantAnnotation readVcf
#' @import data.table
jJ = function(rafile = NULL,
keep.features = T,
seqlengths = NULL,
chr.convert = T,
geno=NULL,
hg=NULL,
flipstrand = FALSE,
swap.header = NULL,
breakpointer = FALSE,
seqlevels = NULL,
force.bnd = FALSE,
skip = NA,
get.loose = FALSE){
return(Junction$new(
rafile,
keep.features = keep.features,
seqlengths = seqlengths,
chr.convert = chr.convert,
geno=geno,
hg=hg,
flipstrand = flipstrand,
swap.header = swap.header,
breakpointer = breakpointer,
seqlevels = seqlevels,
force.bnd = force.bnd,
skip = skip,
get.loose = get.loose)
)
}
##########
########## KH additional functions
##########
## #' @name eclusters2
## #' @title gGraph R6 public method eclusters2
## #'
## #' @description
## #' Marks ALT edges belonging (quasi) reciprocal cycles
## #'
## #'
## #' @param juncs GRangesList of junctions
## #' @param mc.cores parallel
## #' @param only_chains TRUE will only pair breakend to its nearest nearest neighbor IFF the nearest neighbor is reciprocal, see arguments to "strict" for 3 different matching heuristics
## #' @param max.small size below which simple dups and dels are excluded
## #' @param weak logical flag if TRUE will not differentiate between cycles and paths and will return all weakly connected clusters in the junction graph [FALSE]
## #' @param strict Only active if only_chains = TRUE. Can be one of "strict", "one_to_one", or "loose". \cr
## #' "strict": each breakend can only be "monogamously" matched to one other breakend and if the nearest breakend is of the wrong orientation, it is thrown out. \cr
## #' "one_to_one" breakends can only be coupled to a single "monogamous" match but without considering breakends of the wrong orientation. If breakend B is nearest to C, B will only be matched to C. If A's nearest breakend is B but is further away than C, A will not be matched to B. \cr
## #' "loose": the nearest breakend in the correct orientation under the threshold is considered. The same as one_to_one except A will be matched to B, while B will be matched to C. \cr
## #' @param ignore.isolated If TRUE, all simple duplications, duplications nested with only other duplications, and simple deletions without any breakends will be thrown out.
## #' @return gGraph object with edges marked with ecluster id and metadata in gEdge, gNode, and $meta
## #' @author Marcin Imielinski
## eclusters2 = function (thresh = 1000, weak = TRUE, paths = !weak,
## mc.cores = 1, verbose = FALSE, chunksize = 1e+30, method = "single",
## return_pairs = FALSE, ignore.small = TRUE,
## max.small = 1e4, ignore.isolated = TRUE,
## strict = c("strict", "one_to_one", "loose"),
## min.isolated = max.small,
## only_chains = FALSE) {
## if (!is.character(strict) || any(!strict %in% c("strict", "one_to_one", "loose"))) {
## stop("strict must be one of 'strict', 'one_to_one', or 'loose'")
## } else if (length(strict) > 1) {
## strict = "one_to_one"
## }
## self$edges$mark(ecluster = as.integer(NA))
## altedges = self$edges[type == "ALT", ]
## if (length(altedges) == 0) {
## if (verbose) {
## gmessage("No junction in this graph")
## }
## return(NULL)
## }
## if (ignore.small) {
## altedges = altedges[!((class == "DUP-like" | class == "DEL-like") & altedges$span <= max.small)]
## }
## if (length(altedges) == 0) {
## if (verbose) {
## gmessage("No junction in this graph")
## }
## return(NULL)
## }
## deldup = altedges[class %in% c("DUP-like", "DEL-like")]
## ## below removes non-nested events below size threshold
## ## and simple or nested duplications with no subsumed breakends
## if (length(deldup) > 0 && ignore.isolated) {
## altes = deldup$shadow
## ## altes$sedge.id = altedges[class %in% c("DUP-like", "DEL-like")]$dt[altes$id]$sedge.id
## bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix", "class", "sedge.id")]
## bp$sedge.id.y = bp$sedge.id; bp$sedge.id = NULL
## addon = deldup$dt[altes$id][, .(sedge.id, class)]
## altes$sedge.id = addon$sedge.id
## altes$class = addon$class
## altes$nbp = altes %N% bp # number of breakpoints of any SV that fall within segment
## numsum = altedges$shadow %>% gr.sum # using the shadows of all of the SVs not just dels and dups
## altes = altes %$% numsum
## iso = ((altes) %Q% (score == 1.0))$id
## ## rm.edges = unique(altes[iso] %Q% (width < thresh))$sedge.id ## old
## rm.edges = unique(altes[iso] %Q% (width < min.isolated))$sedge.id
## rm.dups = S4Vectors::with(altes, sedge.id[class == "DUP-like" & nbp <= 2])
## rm.dups = c(rm.dups, dedup.cols(gr2dt(altes %*% bp))[sedge.id != sedge.id.y][class == "DUP-like"][, .(all(1:2 %in% grl.iix), class.1 = class.1[1]), by = .(sedge.id, sedge.id.y)][, all(V1 == TRUE) & all(class.1 == "DUP-like"), by = sedge.id][V1 == TRUE]$sedge.id) # removing dups that have only other nested dups
## rm.edges = union(rm.edges, rm.dups)
## keepeid = setdiff(altedges$dt$sedge.id, rm.edges)
## altedges = altedges[as.character(keepeid)]
## } # ignoring isolated dup and del edges that are smaller than threshold
## if (verbose & weak)
## message("Computing weak eclusters")
## if (length(altedges) == 0) {
## if (verbose) {
## gmessage("No junction in this graph")
## }
## return(NULL)
## }
## bp = grl.unlist(altedges$grl)[, c("grl.ix", "grl.iix", "edge.id")]
## bp$m.ix = seq_along(bp)
## bp.dt = gr2dt(bp)
## ix = split(1:length(bp), ceiling(runif(length(bp)) * ceiling(length(bp)/chunksize)))
## ixu = unlist(ix)
## eps = 1e-09
## ## ij = do.call(rbind, split(1:length(bp), bp$grl.ix))
## xt.adj = xt.adj0 = Matrix::sparseMatrix(1, 1, x = 0, dims = rep(length(bp),
## 2))
## if (verbose) {
## message(sprintf("Computing junction graph across %s ALT edges with distance threshold %s",
## length(altedges), thresh))
## }
## if (!exists(".INF")) {
## .INF = pmax(sum(seqlengths(self)), 1e+09)
## }
## bp.pair = as.matrix(dcast(data.table(ix = bp$m.ix,
## grl.ix = bp$grl.ix,
## grl.iix = bp$grl.iix),
## grl.ix ~ grl.iix, value.var = "ix")[,2:3])
## bp.pair = rbind(bp.pair, cbind(bp.pair[,2], bp.pair[,1]))
## ifun = function(iix, ignore.strand = FALSE,
## verbose = FALSE, eps = 1e-9) {
## if (verbose > 1)
## cat(".")
## tmpm = gr.dist(bp[iix], gr.flipstrand(bp), ignore.strand = ignore.strand) +
## eps
## return(as(tmpm, "Matrix"))
## }
## xt.adj[ixu, ] = do.call(rbind,
## mclapply(ix, ifun, ignore.strand = FALSE,
## mc.cores = mc.cores))
## diag(xt.adj) = NA_real_
## ## only_chains = TRUE, enforcing that nearest breakpoints are only considered
## if (only_chains) {
## ## enforcing that no distances between breakends from the same junction are considered
## xt.adj[bp.pair] = NA_real_
## xt.adj0[ixu, ] = do.call(rbind,
## mclapply(ix, ifun, ignore.strand = TRUE,
## mc.cores = mc.cores)) # this is to find nearest breakends, regardless of orientation
## ## enforcing that self-to-self breakend distances are not considered
## diag(xt.adj0) = NA_real_
## ## enforcing that no distances between breakends from the same junction are considered
## xt.adj0[bp.pair] = NA_real_
## suppressWarnings({
## nearest_ix = dunlist(lapply(
## seq_len(nrow(xt.adj)),
## function(x) which(xt.adj[x,] == min(xt.adj[x,], na.rm = T)))) %>%
## as.matrix
## nearest0_ix = dunlist(lapply(
## seq_len(nrow(xt.adj0)),
## function(x) which(xt.adj0[x,] == min(xt.adj0[x,], na.rm = T)))) %>%
## as.matrix
## })
## if (nrow(nearest_ix) & nrow(nearest0_ix)) {
## if (strict == "strict") {
## nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
## nearest0_ix = cbind(rowMins(nearest0_ix), rowMaxs(nearest0_ix))
## nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
## nearest0_ix = nearest0_ix[duplicated(nearest0_ix),,drop = FALSE]
## nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
## nearest0_ix = rbind(nearest0_ix, cbind(nearest0_ix[,2], nearest0_ix[,1]))
## nearest_ix = as.matrix(merge(nearest_ix, nearest0_ix)) # if there is a breakend closer but in the wrong orientation, that distance will be thrown out downstream, i.e. there is a one to one match of breakend and any nearest breakend that is in the wrong orientation disqualifies the clustser
## } else if (strict == "one_to_one") {
## nearest_ix = cbind(rowMins(nearest_ix), rowMaxs(nearest_ix))
## nearest_ix = nearest_ix[duplicated(nearest_ix),,drop = FALSE]
## nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
## nearest_ix = as.matrix(nearest_ix) # only one-to-one breakends are considered
## } else if (strict == "loose") {
## nearest_ix = rbind(nearest_ix, cbind(nearest_ix[,2], nearest_ix[,1]))
## nearest_ix = nearest_ix[!duplicated(nearest_ix),,drop = FALSE]
## }
## }
## } else if (!only_chains) {
## xt.adj[bp.pair] = 1
## }
## ## rm(xt.adj0)
## adj = xt.adj
## xt.adj[which(is.na(as.matrix(xt.adj)))] = .INF + 1
## adj[which(is.na(as.matrix(adj)))] = 0
## adj[which(as.matrix(adj) > thresh)] = 0
## if (only_chains && nrow(nearest_ix)) {
## tmp = xt.adj[nearest_ix]
## xt.adj[] = .INF + 1
## adj[] = 0
## xt.adj[nearest_ix] = tmp
## adj[nearest_ix] = tmp
## }
## dt = Matrix::which(xt.adj < thresh, arr.ind = T)
## dt = unique(data.table(cbind(rowMins(dt), rowMaxs(dt))))
## dt[, bp.dist := xt.adj[dt[, cbind(V1, V2)]]]
## dt$sign = ifelse(strand(bp[dt$V1]) == "+",
## ifelse(gr.flipstrand(bp[dt$V1]) > bp[dt$V2], -1, 1),
## ifelse(gr.flipstrand(bp[dt$V1]) < bp[dt$V2], -1, 1))
## dt2 = dt[,idx := seq_len(.N)] %>% melt(measure.vars = c("V1", "V2"))
## meta = cbind(gr2dt(bp)[dt2$value], dt2)[order(idx)]
## meta = rbind(meta,
## gr2dt(bp)[paste(grl.ix, grl.iix) %nin% meta[, paste(grl.ix, grl.iix)]],
## fill = T)
## if (return_pairs) {
## return(meta)
## }
## hcl = stats::hclust(as.dist(xt.adj), method = "single")
## hcl.lbl = cutree(hcl, h = thresh)
## bp.dt$hcl = hcl.lbl
## bp.hcl = bp.dt[, .(hcl.1 = .SD[grl.iix == 1, hcl], hcl.2 = .SD[grl.iix ==
## 2, hcl]), keyby = grl.ix]
## altedges$mark(hcl.1 = bp.hcl[.(seq_along(altedges)), hcl.1])
## altedges$mark(hcl.2 = bp.hcl[.(seq_along(altedges)), hcl.2])
## hcl.ig = igraph::graph_from_edgelist(bp.hcl[, unique(cbind(hcl.1,
## hcl.2))], directed = FALSE)
## hcl.comp = components(hcl.ig)
## altedges$mark(ehcl = as.integer(hcl.comp$membership)[bp.hcl[,
## hcl.1]])
## adj[adj > thresh] = 0
## refg = self[, type == "REF"]
## bpp = Matrix::which(adj != 0, arr.ind = TRUE)
## dref = pdist(bp[bpp[, 1]], bp[bpp[, 2]])
## drefg = diag(refg$dist(bp[bpp[, 1]], bp[bpp[, 2]]))
## ix = which(drefg > dref)
## if (length(ix))
## adj[bpp[ix, , drop = FALSE]] = FALSE
## if (verbose > 1)
## cat("\n")
## adj = adj | t(adj)
## junpos = bp1 = bp$grl.iix == 1
## junneg = bp2 = bp$grl.iix == 2
## adj2 = adj & FALSE
## adj2[junpos, junpos] = adj[bp2, bp1]
## adj2[junpos, junneg] = adj[bp2, bp2]
## adj2[junneg, junpos] = adj[bp1, bp1]
## adj2[junneg, junneg] = adj[bp1, bp2]
## if (verbose)
## message(sprintf("Created basic junction graph using distance threshold of %s",
## thresh))
## cl = split(1:length(bp), igraph::clusters(graph.adjacency(adj2),
## ifelse(weak, "weak", "strong"))$membership)
## cl = cl[S4Vectors::elementNROWS(cl) > 1]
## cl = cl[order(S4Vectors::elementNROWS(cl))]
## ## browser()
## ## jcl = lapply(cl, function(x) unique(sort(bp$grl.ix[x])))
## jcl = lapply(cl, function(x) unique(sort(bp$edge.id[x])))
## jcls = sapply(jcl, paste, collapse = " ")
## jcl = jcl[!duplicated(jcls)]
## adj3 = adj2
## altedges$mark(ecycle = as.character(NA))
## if (length(jcl) > 0) {
## dcl = dunlist(unname(jcl))[, `:=`(listid, paste0(ifelse(weak,
## "", "c"), listid))]
## if (!weak)
## ## altedges[dcl$V1]$mark(ecycle = dcl$listid)
## altedges[as.character(dcl$V1)]$mark(ecycle = dcl$listid)
## ## altedges[dcl$V1]$mark(ecluster = dcl$listid)
## altedges[as.character(dcl$V1)]$mark(ecluster = dcl$listid)
## meta = merge(meta, altedges$dt[, .(edge.id, ecluster, hcl.1, hcl.2, ehcl, ecycle)], by = "edge.id")
## meta[!is.na(ecluster),
## `:=`(
## nclust = length(unique(edge.id)),
## all_positive = all(replace(sign, is.na(sign), 3e9) > 0),
## all_negative = all(replace(sign, is.na(sign), -3e9) < 0),
## mixed = {naom = na.omit(sign); any(naom > 0) & any(naom < 0)},
## bridge = anyNA(bp.dist)
## ),
## by = ecluster]
## meta[, `:=`(
## num_positive = sum(sign[!duplicated(idx)] > 0, na.rm = T),
## num_negative = sum(sign[!duplicated(idx)] < 0, na.rm = T)
## ),
## by = ecluster]
## self$set(recip_bp = meta)
## if (verbose) {
## message("Annotated weakly connected junction clusters and added ecluster pair metadata")
## }
## }
## if (verbose)
## message(sprintf("Annotated %s junction cycles in edge field $ecycle",
## length(jcl)))
## if (paths & !weak) {
## if (verbose)
## message("Analyzing paths")
## if (length(jcl) > 0) {
## adj3[unlist(jcl), unlist(jcl)] = FALSE
## }
## sinks = Matrix::which(Matrix::rowSums(adj3) == 0)
## sources = Matrix::which(Matrix::colSums(adj3) == 0)
## cl2 = split(1:length(bp), igraph::clusters(graph.adjacency(adj3),
## "weak")$membership)
## cl2 = cl2[S4Vectors::elementNROWS(cl2) > 1]
## if (any(ix <- S4Vectors::elementNROWS(cl2) > 2)) {
## cl3 = do.call(c, mclapply(cl2[ix], function(x) {
## tmp.adj = adj3[x, x]
## lapply(all.paths(tmp.adj, sources = sources,
## sinks = sinks, verbose = verbose)$paths, function(i) x[i])
## }, mc.cores = mc.cores))
## cl2 = c(cl2[!ix], cl3)
## }
## jcl2 = lapply(cl2, function(x) unique(sort(bp$grl.ix[x])))
## jcls2 = sapply(jcl2, paste, collapse = " ")
## jcl2 = jcl2[!duplicated(jcls2)]
## altedges$mark(epath = as.character(NA))
## if (length(jcl2) > 0) {
## dcl2 = dunlist(unname(jcl2))[, `:=`(listid, paste0("p",
## listid))]
## altedges[dcl2$V1]$mark(epath = dcl2$listid)
## self$edges$mark(ecluster =
## ifelse(is.na(self$edges$dt$ecycle) &
## is.na(self$edges$dt$epath),
## as.character(NA),
## paste0(ifelse(is.na(self$edges$dt$ecycle), "",
## self$edges$dt$ecycle),
## ifelse(is.na(self$edges$dt$epath),
## "", self$edges$dt$epath))))
## }
## if (verbose)
## message(sprintf("Annotated %s paths in edge field $epath",
## length(jcl2)))
## }
## return(invisible(self))
## }
## ## gGraph$public_methods$eclusters2 = tmpeclustpairs # if replacing a binding
## #' @description
## #' make eclusters
## gGraph$public_methods$eclusters2 = NULL; gGraph$set("public", "eclusters2", eclusters2)
#' @name copy3
#' @title make deep copy, recursively
#'
#' useful for dev
#' makes deep copy of R6 object, S4 object, or anything else really
#'
copy3 = function (x, recurse_list = TRUE) {
if (inherits(x, "R6")) {
x2 = rlang::duplicate(x$clone(deep = T))
for (name in intersect(names(x2$.__enclos_env__), c("private",
"public"))) for (nname in names(x2$.__enclos_env__[[name]])) tryCatch({
x2$.__enclos_env__[[name]][[nname]] = copy3(x2$.__enclos_env__[[name]][[nname]])
}, error = function(e) NULL)
return(x2)
} else if (isS4(x)) {
x2 = rlang::duplicate(x)
slns = slotNames(x2)
for (sln in slns) {
tryCatch({
slot(x2, sln) = copy3(slot(x2, sln))
}, error = function(e) NULL)
}
return(x2)
} else if (inherits(x, c("list"))) {
x2 = rlang::duplicate(x)
x2 = rapply(x2, copy3, how = "replace")
return(x2)
} else {
x2 = rlang::duplicate(x)
return(x2)
}
}
#' @name rep_len2
#' @title recycle vector along length OR nrow of object
#'
#'
#' @author Kevin Hadi
#' @param x data
#' @param objalong any object to recycle x along if uselen = TRUE, or an actual integer value if uselen = FALSE
#' @return vector
rep_len2 = function(x, objalong, uselen = TRUE) {
if (uselen)
rep(x, length.out = NROW(objalong))
else
rep(x, length.out = objalong)
}
#' @name seq_along2
#' @title seq along either row of table or length of vector
#'
#'
#' @author Kevin Hadi
#' @param x data
#' @return vector
seq_along2 = function(x) {
seq_len(NROW(x))
}
#' @name rleseq
#' @title numbers up within repeating elements of a vector
#'
#' @description
#' returns unique id within each unique element of a vector or set of provided vectors
#' and also a running id within each unique element
#'
#' @param ... Vector(s) to identify with unique id and a running id within each unique id
#' @param clump a logical specifying if duplicates are to be counted together
#' @param recurs a logical that is meant to only be set by the function when using clump = TRUE
#' @return a list of idx and seq
#' @author Kevin Hadi
rleseq = function (..., clump = TRUE, recurs = FALSE, na.clump = TRUE,
na.ignore = FALSE, sep = paste0(" ", rand.string(length = 6),
" "), use.data.table = TRUE)
{
force(sep)
out = if (use.data.table) {
tryCatch(
{
dt = data.table(...)
setnames(dt, make.names(rep("", ncol(dt)), unique = T))
## make.unique
cmd = sprintf("dt[, I := .I][, .(idx = .GRP, seq = seq_len(.N), lns = .N, I), by = %s]", mkst(colnames(dt), "list"))
dt = eval(parse(text = cmd))
setkey(dt, I)[, .(idx, seq, lns)]
}, error = function(e) structure("data table didn't work...", class = "err"))
}
if (!(is.null(out) || class(out)[1] == "err"))
return(as.list(out))
rand.string <- function(n = 1, length = 12) {
randomString <- c(1:n)
for (i in 1:n) {
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace = TRUE), collapse = "")
}
return(randomString)
}
if (isTRUE(na.clump))
paste = function(..., sep) base::paste(..., sep = sep)
else paste = function(..., sep) base::paste(stringr::str_c(...,
sep = sep))
lns = base::lengths(list(...))
if (!all(lns == lns[1]))
warning("not all vectors provided have same length")
fulllens = max(lns, na.rm = T)
vec = setNames(paste(..., sep = sep), seq_len(fulllens))
if (length(vec) == 0) {
out = list(idx = integer(0), seq = integer(0), lns = integer(0))
return(out)
}
if (na.ignore) {
isnotna = which(rowSums(as.data.frame(lapply(list(...),
is.na))) == 0)
out = list(idx = rep(NA, fulllens), seq = rep(NA, fulllens),
lns = rep(NA, fulllens))
if (length(isnotna))
vec = vec[isnotna]
tmpout = do.call(rleseq, c(alist(... = vec), alist(clump = clump,
recurs = recurs, na.clump = na.clump, na.ignore = FALSE, use.data.table = FALSE)))
for (i in seq_along(out)) out[[i]][isnotna] = tmpout[[i]]
return(out)
}
if (!isTRUE(clump)) {
rlev = rle(vec)
if (isTRUE(recurs)) {
return(unlist(unname(lapply(rlev$lengths, seq_len))))
}
else {
out = list(idx = rep(seq_along(rlev$lengths), times = rlev$lengths),
seq = unlist(unname(lapply(rlev$lengths, seq_len))))
out$lns = ave(out[[1]], out[[1]], FUN = length)
return(out)
}
}
else {
if (!isTRUE(na.clump)) {
vec = replace2(vec, which(x == "NA"), dedup(dg(x)[dg(x) ==
"NA"]))
}
vec = setNames(vec, seq_along(vec))
lst = split(vec, factor(vec, levels = unique(vec)))
ord = as.integer(names(unlist(unname(lst))))
idx = rep(seq_along(lst), times = base::lengths(lst))
out = list(idx = idx[order(ord)], seq = rleseq(idx, clump = FALSE,
recurs = TRUE, use.data.table = FALSE)[order(ord)])
out$lns = ave(out[[1]], out[[1]], FUN = length)
return(out)
}
}
#' @name match3
#' @title similar to setkey except a general use utility
#'
#' very slow version of keying a la data.table
#' but for general/interactive use
#' @author Kevin Hadi
match3 = function(x, table, nomatch = NA_integer_, old = TRUE, use.data.table = TRUE) {
out = if (use.data.table) {
tryCatch({
dx = data.table(x = x)[, id.x := seq_len(.N)]
dtb = data.table(table = table)[, id.tb := seq_len(.N)]
## setkey(dx, x)[list(dtb$table)]$id.x
setkey(dtb, table)[list(dx$x)]$id.tb
}, error = function(e) structure("err", class = "err"))
}
if (!is.null(out) && !inherits(out, "err")) return(out)
if (old) {
dx = within(data.frame(x = x), {id.x = seq_along(x)})
dtb = within(data.frame(table = table), {id.tb = seq_along(table)})
res = merge(dx, dtb, by.x = "x", by.y = "table", all.x = TRUE,
allow.cartesian = TRUE)
return(res$id.tb[order(res$id.x)])
} else {
m = match(table,x)
mat = cbind(m, seq_along(m))
mat = mat[!is.na(mat[,1]),,drop=FALSE]
mat = mat[order(mat[,1], na.last = FALSE),,drop = FALSE]
mat = cbind(mat, seq_len(dim(mat)[1]))
m2 = match(x,table)
ix = which(!duplicated(m2) & !is.na(m2))
mat_rix = unlist(rep(split(mat[,3], mat[,1]), base::tabulate(m2)[m2][ix]))
## mat_rix = unlist(rep(split(mat[,3], mat[,1]), base::tabulate(m2)[m2][ix]))
ix = rep(1, length.out = length(m2))
## original line
## ix[!is.na(m2)] = base::tabulate(m)[!is.na(m2)]
ix[!is.na(m2)] = base::tabulate(m)[m][m2][!is.na(m2)]
out = rep(m2, ix)
out[!is.na(out)] = mat[mat_rix,,drop=F][,2]
return(out)
## m = match(table, x)
## mat = cbind(m, seq_along(m))
## mat = mat[!is.na(mat[, 1]), , drop = FALSE]
## mat = mat[order(mat[, 1]), , drop = FALSE]
## mat = cbind(mat, seq_len(dim(mat)[1]))
## m2 = match(x, table)
## ix = which(!duplicated(m2))
## mat_rix = unlist(rep(split(mat[, 3], mat[, 1]), base::tabulate(m2)[m2][ix]))
## mat[mat_rix, , drop = F][, 2]
}
}
#' @name dedup.cols
#' @title applies dedup to colnames
#'
#' dedup the column names of a data.frame/data.table
#'
#' @return A data.table or data.frame
dedup.cols = function(tbl, remove = FALSE) {
if (remove) {
if (!inherits(tbl, "data.table"))
return(tbl[, match(unique(colnames(tbl)), colnames(tbl))])
else
return(tbl[, match(unique(colnames(tbl)), colnames(tbl)), with = FALSE])
} else {
colnames(tbl) = base::make.unique(colnames(tbl))
return(tbl)
}
}
#' @name rowMins
#' @title rowMins hack
#'
#'
#' @return A vector
rowMins = function(x) {
do.call(pmin, as.data.frame(x))
}
#' @name rowMaxs
#' @title rowMaxs hack
#'
#'
#' @return A vector
rowMaxs = function(x) {
do.call(pmax, as.data.frame(x))
}
#' @name gr.noval
#' @title get rid of mcols on GRanges/GRangesLists
#' @description
#'
#' remove all metadata from GRanges or GRangesList
#'
#' @return GRanges or GRangesList
#' @author Kevin Hadi
gr.noval = function(gr, keep.col = NULL, drop.col = NULL) {
if (is.null(keep.col) & is.null(drop.col)) {
select_col = NULL
} else {
all_col = colnames(gr@elementMetadata)
if (inherits(gr, "GRangesList")) {
all_col = c(all_col, colnames(gr@unlistData@elementMetadata))
}
if (!is.null(keep.col) & is.null(drop.col)) {
select_col = intersect(all_col, keep.col)
} else if (is.null(keep.col) & !is.null(drop.col)) {
select_col = setdiff(all_col, drop.col)
} else if (!is.null(keep.col) && !is.null(drop.col)) {
if (intersect(keep.col, drop.col) > 0) {
warning("drop.col and keep.col args have overlapping elements\nkeeping the columns that overlap")
select_col = intersect(setdiff(all_col, setdiff(drop.col, keep.col)), keep.col)
}
}
}
if (inherits(gr, "GRangesList")) {
tmp_query = intersect(select_col, colnames(gr@unlistData@elementMetadata))
gr@unlistData@elementMetadata = gr@unlistData@elementMetadata[,c(tmp_query), drop = FALSE]
}
tmp_query = intersect(select_col, colnames(gr@elementMetadata))
gr@elementMetadata = gr@elementMetadata[,c(tmp_query),drop = FALSE]
return(gr)
}
#' @name setcols
#' @title convenience function to set columns
#'
#' sets columns of an object
#'
#' @param dt data frame/table or matrix
#' @param old integer or character or logical vector corresponding to current colnames in dt
#' @param new character vector for new column names
#' @return colnamed object
#' @author Kevin Hadi
setcols = function(dt, old, new) {
if (inherits(dt, c("GRanges", "GRangesList"))) {
mcols(dt) = setcols(mcols(dt), old, new)
return(dt)
}
cnames = colnames2(dt)
if (missing(new) || missing(old)) {
if (missing(old)) {
old = new
}
if (is.character(old) && length(old) == length(cnames)) {
colnames(dt) = old
return(dt)
} else {
stop("names provided must be same length as ncol(dt)")
}
}
if (is.character(old)) {
out = merge(data.frame(cnames, seq_along(cnames)), data.frame(cnames = old, new = new),
allow.cartesian = T)
cnames[out[[2]]] = out[[3]]
colnames(dt) = cnames
return(dt)
}
if (is.logical(old)) {
if (! length(old) == length(cnames)) stop("logical vector must be same length as ncol(dt)")
old = which(old)
}
cnames[old] = new
colnames(dt) = cnames
return(dt)
}
#' @name colnames2
#' @title robust colnames
#'
#' gives back character vector same number of columns of input regardless whether named or not
#'
#' @param str a path string
#' @return a string with multiple parentheses replaced with a single parenthesis
#' @author Kevin Hadi
colnames2 = function(x) {
nm = colnames(x)
if (is.null(nm))
return(rep("", length.out = NCOL(x)))
else
return(nm)
}
#' @name isNA
#' @title is.na but also tests for "NA" character
#'
#' @description
#'
#' @author Kevin Hadi
isNA = function(x, na.char = c("NA", "NULL", "na", "null")) {
if (is.character(x)) {
return(is.na(x) | x %in% na.char)
} else {
return(is.na(x))
}
}
#' @name na2false
#' @title replace logical vector with NA to FALSE
#'
#' @description
#' A convenience function to set a logical vector with NAs to false
#'
#' @return A logical vector with NAs set to FALSE
#' @author Kevin Hadi
na2false = function(v)
{
v[isNA(v)] = FALSE
## mode(v) = "logical"
v
}
#' @name rand.string
#' @title make a random string
#'
#' @return random string
#' @author Someone from Stackoverflow
rand.string <- function(n=1, length=12)
{
randomString <- c(1:n) # initialize vector
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace=TRUE),
collapse="")
}
return(randomString)
}
#' @name gr.spreduce
#' @title reduce based on a field(s) to split by in elementMetadata of GRanges, or given vector
#' @description
#'
#' split and reduce GRanges by field(s)
#' if providing a variable not already within the GRanges,
#' may need to use dynget(variable_name)
#'
#' @return GRanges
#' @author Kevin Hadi
gr.spreduce = function(gr, ..., ignore.strand = FALSE, pad = 0, return.grl = FALSE, sep = paste0(" ", rand.string(length = 8), " ")) {
lst = as.list(match.call())[-1]
ix = which(!names(lst) %in% c("gr", "sep", "pad", "ignore.strand", "return.grl"))
vars = unlist(sapply(lst[ix], function(x) unlist(sapply(x, toString))))
if (length(vars) == 1) {
if (!vars %in% colnames(mcols(gr)))
vars = tryCatch(unlist(list(...)), error = function(e) vars)
}
if (!all(vars %in% colnames(mcols(gr))))
stop("Must specify valid metadata columns in gr")
tmpix = do.call(
function(...) paste(..., sep = sep),
as.list(mcols(gr)[,vars, drop = F]))
unix = which(!duplicated(tmpix))
tmpix = factor(tmpix, levels = tmpix[unix])
grl = unname(gr.noval(gr) %>% GenomicRanges::split(tmpix))
grl = GenomicRanges::reduce(grl + pad, ignore.strand = ignore.strand)
if (return.grl) {
mcols(grl) = mcols(gr)[unix,vars,drop = F]
return(grl)
} else {
out = unlist(grl)
mcols(out) = mcols(gr)[rep(unix, times = IRanges::width(grl@partitioning)),
vars,drop = F]
return(out)
}
}
#' @name duped
#' @title duped
#'
#'
#'
#' @param ... vectors to paste by
#' @author Kevin Hadi
duped = function(..., binder = "data.table") {
duplicated(tryCatch(et(sprintf("%s(...)", binder)),
## error = function(e) do.call(cbind, list(...))))
error = function(e) paste(...)))
}
#' @name merge.repl
#' @title merging data tables with collapsing columns with the same name
#'
#' Merge two data tables with various replacing strategies
#' for columns common between x and y that are not used to merge
#' (i.e. not specified in the "by" argument)
#'
#' @param replace_NA logical, only use values in dt.y, any dt.x not in dt.y is clobbered (NA)
#' @param force_y logical, should x and y common columns be merged?
#' @param overwrite_x logical, if force_y = TRUE, should NA values in y replace x?
#' @return A data.table
#' @author Kevin Hadi
merge.repl = function(dt.x,
dt.y,
sep = "_",
replace_NA = TRUE,
force_y = TRUE,
overwrite_x = FALSE,
keep_order = FALSE,
keep_colorder = TRUE,
keep_factor = TRUE,
...) {
arg_lst = as.list(match.call())
by.y = eval(arg_lst[['by.y']], parent.frame())
by.x = eval(arg_lst[['by.x']], parent.frame())
by = eval(arg_lst[['by']], parent.frame())
all.x = eval(arg_lst[['all.x']], parent.frame())
all.y = eval(arg_lst[['all.y']], parent.frame())
all = eval(arg_lst[['all']], parent.frame())
allow.cartesian = eval(arg_lst[['allow.cartesian']])
key_x = key(dt.x)
if (is.null(all.x)) {
all.x = TRUE
}
if (is.null(all.y)) {
all.y = FALSE
}
if (!is.null(all) && all) {
all.y = TRUE
all.x = TRUE
}
if (is.null(allow.cartesian)) {
allow.cartesian = FALSE
}
if (!inherits(dt.x, "data.table")) {
dt.x = as.data.table(dt.x)
}
if (!inherits(dt.y, "data.table")) {
dt.y = as.data.table(dt.y)
}
if (keep_order == TRUE) {
dt.x[['tmp.2345098712340987']] = seq_len(nrow(dt.x))
}
dt.x[['in.x.2345098712340987']] = rep(TRUE, length.out = nrow(dt.x))
dt.y[['in.y.2345098712340987']] = rep(TRUE, length.out = nrow(dt.y))
new_ddd_args = list(by = by, by.x = by.x, by.y = by.y, all.x = all.x, all.y = all.y, allow.cartesian = allow.cartesian)
if (is.null(by.y) & is.null(by.x) & is.null(by)) {
if (length(attributes(dt.x)[['sorted']]) > 0 &&
length(attributes(dt.y)[['sorted']]) > 0) {
k.x = key(dt.x)
k.y = key(dt.y)
} else {
k.y = k.x = intersect(names2(dt.x), names2(dt.y))
if (length(k.x) == 0)
stop("no common columns to merge by!")
message("intersecting by: ", paste(k.x, collapse = ", "))
new_ddd_args[['by']] = k.x
}
if (is.null(k.x) | is.null(k.y) || (k.x != k.y)) {
stop("neither by.x/by.y nor by are supplied, keys of dt.x and dt.y must be identical and non NULL")
}
x.cols = setdiff(names(dt.x), k.x)
y.cols = setdiff(names(dt.y), k.y)
} else if (!is.null(by.x) & !is.null(by.y)) {
x.cols = setdiff(names(dt.x), by.x)
y.cols = setdiff(names(dt.y), by.y)
new_ddd_args = new_ddd_args[setdiff(names(new_ddd_args), c("by"))]
} else if (!is.null(by)) {
x.cols = setdiff(names(dt.x), by)
y.cols = setdiff(names(dt.y), by)
if (! all(by %in% colnames(dt.x)) | ! all(by %in% colnames(dt.y))) {
stop("column ", by, " does not exist in one of the tables supplied \nCheck the column names")
}
new_ddd_args = new_ddd_args[setdiff(names(new_ddd_args), c("by.y", "by.x"))]
}
these_cols = intersect(x.cols, y.cols)
## if (replace_in_x) {
if (!replace_NA) {
dt.x.tmp = copy(dt.x)
for (this_col in these_cols) {
data.table::set(dt.x.tmp, i = NULL, j = this_col, value = NULL)
}
dt.repl = suppressWarnings(do.call("merge", args = c(list(x = dt.x.tmp, y = dt.y), new_ddd_args)))
## dt_na2false(dt.repl, c("in.x.2345098712340987", "in.y.2345098712340987"))
} else {
dt.repl = suppressWarnings(do.call("merge", args = c(list(x = dt.x, y = dt.y), new_ddd_args)))
dt_na2false(dt.repl, c("in.x.2345098712340987", "in.y.2345098712340987"))
in.x = which(dt.repl[["in.x.2345098712340987"]])
in.y = which(dt.repl[["in.y.2345098712340987"]])
this_env = environment()
for (this_col in these_cols) {
x_cname = paste0(this_col, ".x")
y_cname = paste0(this_col, ".y")
x_col = dt.repl[[x_cname]]
y_col = dt.repl[[y_cname]]
xf = inherits(x_col, "factor")
yf = inherits(y_col, "factor")
if ( {xf || yf} && keep_factor) {
if (!xf) { x_col = factor(x_col); xf = TRUE }
if (!yf) { y_col = factor(y_col); yf = TRUE }
}
if (xf && !keep_factor) { x_col = as.character(x_col); xf = FALSE }
if (yf && !keep_factor) { y_col = as.character(y_col); yf = FALSE }
if (force_y) {
if (!overwrite_x) {
## if (inherits(x_col, "factor") & inherits(y_col, "factor")) {
## new_col = factor(y_col, forcats::lvls_union(list(y_col, x_col)))
## new_col[is.na(new_col)] = x_col[is.na(new_col)]
## } else {
## new_col = ifelse(!is.na(y_col), y_col, x_col)
## }
if (xf || yf) {
new_col = factor(y_col, forcats::lvls_union(list(y_col, x_col)))
new_col[is.na(new_col)] = x_col[is.na(new_col)]
} else {
new_col = ifelse(!is.na(y_col), y_col, x_col)
}
} else {
## if (inherits(x_col, "factor") & inherits(y_col, "factor")) {
## new_col = factor(x_col, forcats::lvls_union(list(y_col, x_col)))
## } else {
## new_col = x_col
## }
## new_col[dt.repl[['in.y.2345098712340987']]] = y_col[dt.repl[['in.y.2345098712340987']]]
if (xf || yf) {
new_col = factor(x_col, forcats::lvls_union(list(y_col, x_col)))
} else {
new_col = x_col
}
new_col[in.y] = y_col[in.y]
}
} else {
## if (inherits(x_col, "factor") & inherits(y_col, "factor")) {
## new_col = factor(x_col, forcats::lvls_union(list(x_col, y_col)))
## new_col[is.na(new_col) & !is.na(y_col)] = y_col[is.na(new_col) & !is.na(y_col)]
## } else {
## new_col = ifelse(is.na(x_col) & !is.na(y_col), y_col, x_col)
## }
if (xf | yf) {
new_col = factor(x_col, forcats::lvls_union(list(x_col, y_col)))
new_col[is.na(new_col) & !is.na(y_col)] = y_col[is.na(new_col) & !is.na(y_col)]
} else {
new_col = ifelse(is.na(x_col) & !is.na(y_col), y_col, x_col)
}
}
data.table::set(dt.repl, j = c(x_cname, y_cname, this_col), value = list(NULL, NULL, this_env[["new_col"]]))
}
}
## } else if (!replace_in_x & !is.null(suffix)) {
## y.suff.cols = paste0(y.cols, sep, suffix)
## ## dt.y.tmp = copy(dt.y)[, eval(dc(y.suff.cols)) := eval(dl(y.cols))][, eval(dc(y.cols)) := NULL]
## dt.y.tmp = copy(dt.y)
## data.table::set(dt.y, j = y.suff.cols, value = dt.y[, y.cols, with = FALSE])
## data.table::set(dt.y, j = y.cols, value = NULL)
## ## dt.repl = merge(dt.x, dt.y.tmp, all.x = TRUE, ...)
## dt.repl = do.call("merge", args = c(list(x = dt.x, y = dt.y.tmp), new_ddd_args))
## }
if (keep_order == TRUE) {
data.table::setorderv(dt.repl, "tmp.2345098712340987")
dt.repl[['tmp.2345098712340987']] = NULL
}
data.table::set(dt.repl, j = c("in.y.2345098712340987", "in.x.2345098712340987"),
value = list(NULL, NULL))
if (keep_colorder) {
x_cols = colnames(dt.x)
## get the order of columns in dt.repl in order of X with
## additional columns tacked on end
setcolorder(dt.repl,
intersect(union(colnames(dt.x), colnames(dt.repl)),
colnames(dt.repl)))
}
return(dt.repl)
}
#' @name dt_na2false
#' @title convert columns with NA to false
#'
#' coerce NA in columns of class "logical" to FALSE
#'
#' @param dt data.table
#' @param these_cols NULL by default, will select columns of class logical, otherwise will be specified
#' @return A data.table
#' @author Kevin Hadi
dt_na2false = function(dt, these_cols = NULL) {
na2false = function(v)
{
## v = ifelse(is.na(v), v, FALSE)
v[is.na(v)] = FALSE
as.logical(v)
}
if (is.null(these_cols)) {
these_cols = which(sapply(dt, class) == "logical")
}
for (this_col in these_cols) {
## this_val = as.data.frame(dt[, this_col, with = FALSE])[,1]
this_val = dt[[this_col]]
data.table::set(dt, j = this_col, value = na2false(this_val))
}
return(dt)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.