## appease R CMD check vs data.table
sid=side1=side2=side_1=side_2=silent=snid=splice.variant=splicevar=str1=str2=strand_1=strand_2=subject.id=suffix=tag=threep.cc=threep.coord=threep.exon=threep.frame=threep.pc=threep.sc=threep.sc.frame=to=transcript.id.x=transcript.id.y=transcript_associated=transcript_id=twidth=tx.cc=tx.ec=tx_strand=tx_strand.x=tx_strand.y=txid=type=uid=uids=val=walk.id=walk.iid=walk.iid.x=walk.iid.y=wkid=NULL
#' @name duplicated.matrix
#' @title R-3.5.1 version of duplicated.matrix
#'
#' @description
#' R-3.6.1 base::duplicated.matrix returns an array rather than a logical vector, breaking convex.basis; this restores the R-3.5.1 behavior
#'
#' @return a logical
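#' @examples
#' ## minimal sketch: returns a plain logical vector rather than an array
#' duplicated.matrix(rbind(c(1, 2), c(1, 2), c(3, 4))) # FALSE TRUE FALSE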
duplicated.matrix = function(x, incomparables = FALSE, MARGIN = 1, fromLast = FALSE, ...) {
if (!isFALSE(incomparables))
.NotYetUsed("incomparables != FALSE")
dx <- dim(x)
ndim <- length(dx)
if (length(MARGIN) > ndim || any(MARGIN > ndim))
stop(gettextf("MARGIN = %d is invalid for dim = %d",
MARGIN, dx), domain = NA)
temp <- if ((ndim > 1L) && (prod(dx[-MARGIN]) > 1L))
apply(x, MARGIN, list)
else x
res <- duplicated.default(temp, fromLast = fromLast, ...)
dim(res) <- dim(temp)
dimnames(res) <- dimnames(temp)
res
}
#' @name convex.basis
#' @title convex.basis
#' @description
#'
#' Outputs a matrix K of the convex basis of matrix A
#'
#' i.e. each column x = K[,i] is a minimal solution (with respect to sparsity) to
#' Ax = 0, x>=0
#'
#' exclude.basis = 0-1 matrix of dimension k x ncol(A) specifying k sparsity patterns that we would
#' like to exclude from the convex.basis. This can speed up computation since any non-negative
#' combination of vectors that satisfies an exclusion property will also be excludable, and thus
#' we can remove such vectors as soon as we detect them.
#'
#' exclude.range = 0-1 matrix of dimension k x nrow(A) specifying k sparsity patterns that we would like to
#' exclude, but specified in terms of the range of abs(A), i.e. we want to exclude all
#' basis vectors v such that nz(exclude.range[i, ]) is a subset of nz(abs(A) %*% v) for some pattern i. Again,
#' any non-negative linear combination of an intermediate-basis vector that satisfies this property will also satisfy it,
#' as a result we can exclude these vectors as soon as we see them.
#'
#' @param A nxm signed incidence matrix of directed graph of n nodes and m edges
#' @param interval chunk size with which to do all vs all basis comparison
#' @param chunksize chunk size with which to proceed through higher level iteration
#' @param exclude.basis optional k x ncol(A) 0-1 matrix of sparsity patterns to exclude (see description)
#' @param exclude.range optional k x nrow(A) 0-1 matrix of sparsity patterns to exclude, specified in terms of the range of abs(A) (see description)
#' @param maxchunks maximum number of chunk comparisons to allow before aborting with an error
#' @param verbose logical flag whether to output progress
#' @return m x k matrix of elementary paths and cycles comprising the non-negative convex basis of the non-negative column span of A
#' @author Marcin Imielinski
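#' @examples
#' ## minimal sketch: signed incidence matrix of a directed 3-cycle
#' ## (rows = nodes, columns = edges); the lone basis vector is the cycle itself
#' A = cbind(c(-1, 1, 0), c(0, -1, 1), c(1, 0, -1))
#' convex.basis(A) # single column using every edge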
convex.basis = function(A,
interval = 80,
chunksize = 100,
exclude.basis = NULL,
exclude.range = NULL,
maxchunks = Inf,
verbose = F){
ZERO = 1e-8;
remaining = 1:nrow(A);
iter = 0;
i = 0;
# order = c()
numelmos = c()
# K_i = I = as(diag(rep(1, ncol(A))), 'sparseMatrix'); # sparse version, superseded by the dense initialization below
# A_i = as(A %*% K_i, 'sparseMatrix');
K_i = I = diag(rep(1, ncol(A)))
A_i = A %*% K_i
if (!is.null(exclude.basis)){
exclude.basis = sign(exclude.basis)
exclude.basis = exclude.basis[rowSums(exclude.basis)>0, ]
if (nrow(exclude.basis) == 0){
exclude.basis = NULL
}
}
if (!is.null(exclude.range)){
exclude.range = sign(exclude.range)
exclude.range = exclude.range[rowSums(exclude.range)>0, ]
if (nrow(exclude.range) == 0){
exclude.range = NULL
}
}
# vector to help rescale matrix (avoid numerical issues)
mp = apply(abs(A), 1, min); # minimum absolute value of each row
mp[mp < ZERO] = 1; # rows with (near) zero minimum get scale "1"
st = Sys.time()
# iterate through rows of A, "canceling" them out
while (length(remaining)>0){
## TODO figure out why we have to check this so many times
if (nrow(K_i)==0 | ncol(K_i)==0){
return(matrix())
}
iter = iter+1;
K_last = K_i;
if (verbose){
print(Sys.time() - st)
}
if (verbose){
cat('Iter ', iter, '(of', nrow(A_i), ') Num basis vectors: ', nrow(K_i), " Num active components: ", sum(Matrix::rowSums(K_i!=0)), "\n")
}
i = remaining[which.min(Matrix::rowSums(A_i[remaining,, drop = FALSE]>=ZERO)*Matrix::rowSums(A_i[remaining,, drop = FALSE]<=(-ZERO)))] # choose "cheapest" row, i.e. fewest positive x negative pairings
remaining = setdiff(remaining, i);
# order = c(order, i);
zero_elements = which(abs(A_i[i, ]) <= ZERO);
K_i1 = K_last[zero_elements, , drop = FALSE]; ## K_i1 = rows of K_last that are already orthogonal to row i of A
K_i2 = NULL; ## K_i2 will store positive combinations of K_last rows that are orthogonal to row i of A (computed below)
pos_elements = which(A_i[i, ]>ZERO)
neg_elements = which(A_i[i, ]<(-ZERO))
if (verbose){
cat('Iter ', iter, " Row ", i, ":", length(zero_elements), " zero elements ", length(pos_elements), " pos elements ", length(neg_elements), " neg elements \n")
}
if (length(pos_elements)>0 & length(neg_elements)>0)
for (m in seq(1, length(pos_elements), interval))
for (l in seq(1, length(neg_elements), interval)){
ind_pos = c(m:min(c(m+interval, length(pos_elements))))
ind_neg = c(l:min(c(l+interval, length(neg_elements))))
indpairs = cbind(rep(pos_elements[ind_pos], length(ind_neg)),
rep(neg_elements[ind_neg], each = length(ind_pos))); # cartesian product of ind_pos and ind_neg
pix = rep(1:nrow(indpairs), 2)
ix = c(indpairs[,1], indpairs[,2])
# coeff = c(-A_i[i, indpairs[,2]], A_i[i, indpairs[,1]]) ## dealing with Matrix ghost
coeff = c(-A_i[i, ][indpairs[,2]], A_i[i, ][indpairs[,1]]) ##
combs = Matrix::sparseMatrix(pix, ix, x = coeff, dims = c(nrow(indpairs), nrow(K_last)))
combs[cbind(pix, ix)] = coeff;
H = combs %*% K_last;
# remove duplicated rows in H (with respect to sparsity)
H = H[!duplicated(as.matrix(H)>ZERO), , drop = F];
# remove rows in H that have subsets in H (with respect to sparsity) ..
if ((as.numeric(nrow(H))*as.numeric(nrow(H)))>maxchunks){
print('Exceeding maximum number of chunks in convex.basis computation')
stop('Exceeding maximum number of chunks in convex.basis computation')
}
keep = which(Matrix::colSums(sparse_subset(abs(H)>ZERO, abs(H)>ZERO, chunksize = chunksize, quiet = !verbose))<=1) # <=1 since every H is its own subset
H = H[keep, , drop = FALSE]
# remove rows in H that have subsets in K_i2
if (!is.null(K_i2))
if (nrow(K_i2)>0){
if ((as.numeric(nrow(K_i2))*as.numeric(nrow(H)))>maxchunks){
print('Exceeding maximum number of chunks in convex.basis computation')
stop('Exceeding maximum number of chunks in convex.basis computation')
}
keep = which(Matrix::colSums(sparse_subset(abs(K_i2)>ZERO, abs(H)>ZERO, chunksize = chunksize, quiet = !verbose))==0)
H = H[keep, , drop = FALSE]
}
# remove rows in H that have subsets in K_i1
if (!is.null(K_i1))
if (nrow(K_i1)>0){
if ((as.numeric(nrow(K_i1))*as.numeric(nrow(H)))>maxchunks)
{
print('Exceeding maximum number of chunks in convex.basis computation')
stop('Exceeding maximum number of chunks in convex.basis computation')
}
keep = which(Matrix::colSums(sparse_subset(abs(K_i1)>ZERO, abs(H)>ZERO, chunksize = chunksize, quiet = !verbose))==0)
H = H[keep, , drop = FALSE]
}
# maintain numerical stability
if ((iter %% 10)==0){
H = diag(1/apply(abs(H), 1, max)) %*% H
# K_i2 = rBind(K_i2, H)
}
K_i2 = rbind(K_i2, as.matrix(H))
}
# K_i = rBind(K_i1, K_i2)
K_i = rbind(K_i1, K_i2) ## new basis set
if (nrow(K_i)==0){
return(matrix())
}
## only keep vectors that fail to intersect all vectors "exclude" in matrix
if (!is.null(exclude.basis)) {
if ((as.numeric(nrow(exclude.basis))*as.numeric(nrow(K_i)))>maxchunks){
print('Exceeding maximum number of chunks in convex.basis computation')
stop('Exceeding maximum number of chunks in convex.basis computation')
}
keep = Matrix::colSums(sparse_subset(exclude.basis>0, K_i>ZERO))==0
if (verbose){
cat('Applying basis exclusion and removing', sum(keep==0), 'basis vectors\n')
}
K_i = K_i[keep, , drop = F]
}
## only keep vectors that fail to intersect all vectors "exclude" in matrix
if (!is.null(exclude.range)){
A_i_abs = abs(A) %*% t(K_i)
if ((as.numeric(nrow(exclude.range))*as.numeric(ncol(A_i_abs)))>maxchunks){
print('Exceeding maximum number of chunks in convex.basis computation')
stop('Exceeding maximum number of chunks in convex.basis computation')
}
keep = Matrix::colSums(sparse_subset(exclude.range>0, t(A_i_abs), quiet = !verbose))==0
if (verbose){
cat('Applying range exclusion and removing', sum(keep==0), 'basis vectors\n')
}
K_i = K_i[keep, , drop = F]
}
A_i = A %*% t(K_i)
}
return(t(K_i))
}
#' @name all.paths
#' @title all.paths
#' @description
#'
#'
#' Low level function to enumerate all elementary paths and cycles through graph
#'
#' takes directed graph represented by n x n binary adjacency matrix A and outputs all cycles and paths between source.vertices, sink.vertices
#'
#'
#' @param A nxn adjacency matrix
#' @param all logical flag, if all = T, will include all sources (parentless vertices) and sinks (childless vertices) in path computation
#' @param ALL logical flag, if ALL = T, will also include vertices without outgoing and incoming edges in paths
#' @param sources graph indices to treat as sources (by default is empty)
#' @param sinks graph indices to treat as sinks (by default is empty)
#' @param verbose logical flag
#' @return list of integer vectors corresponding to indices in A (i.e. vertices)
#' $paths = paths indices
#' $cycles = cycle indices
#' @keywords internal
#' @author Marcin Imielinski
#' @noRd
all.paths = function(A, all = F, ALL = F, sources = c(), sinks = c(), source.vertices = sources, sink.vertices = sinks,
exclude = NULL, ## specifies illegal subpaths, all such paths / cycles and
## their supersets will be excluded, specified as k x nrow(A) matrix of vertex sets
verbose = FALSE,...)
{
blank.vertices = Matrix::which(Matrix::rowSums(A)==0 & Matrix::colSums(A)==0)
if (ALL)
all = T
if (all)
{
source.vertices = Matrix::which(Matrix::rowSums(A)>0 & Matrix::colSums(A)==0)
sink.vertices = Matrix::which(Matrix::colSums(A)>0 & Matrix::rowSums(A)==0)
}
out = list(cycles = NULL, paths = NULL)
node.ix = which(Matrix::rowSums(A!=0)>0 | Matrix::colSums(A!=0)>0)
if (length(node.ix)==0)
return(out)
A = A[node.ix, node.ix]
if (!is.null(exclude))
exclude = sign(abs(exclude[, node.ix]))
ij = Matrix::which(A!=0, arr.ind = T)
B = Matrix::sparseMatrix(c(ij[,1], ij[,2]), rep(1:nrow(ij), 2), x = rep(c(-1, 1), each = nrow(ij)), dims = c(nrow(A), nrow(ij)))
I = diag(rep(1, nrow(A)))
source.vertices = setdiff(match(source.vertices, node.ix), NA)
sink.vertices = setdiff(match(sink.vertices, node.ix), NA)
B2 = Matrix::cbind2(Matrix::cbind2(B, I[, source.vertices, drop = FALSE]), -I[, sink.vertices, drop = FALSE])
if (verbose)
cat(sprintf('Computing paths for %s vertices and %s edges\n', nrow(B2), ncol(B2)))
K = convex.basis(B2, verbose = verbose, exclude.range = exclude, ...)
if (all(is.na(K)))
return(out)
K = K[, Matrix::colSums(K[1:ncol(B), ,drop = FALSE])!=0, drop = FALSE] ## remove any pure source to sink paths
is.cyc = Matrix::colSums(B %*% K[1:ncol(B), ,drop = FALSE]!=0)==0
out$cycles = lapply(which(is.cyc),
function(i)
{
k = which(K[1:ncol(B), i]!=0)
v.all = unique(as.vector(ij[k, , drop = FALSE]))
sG = graph.edgelist(ij[k, , drop = FALSE])
tmp.v = v.all[c(1,length(v.all))]
p.fwd = get.shortest.paths(sG, tmp.v[1], tmp.v[2])
p.bwd = get.shortest.paths(sG, tmp.v[2], tmp.v[1])
return(node.ix[unique(unlist(c(p.fwd, p.bwd)))])
})
out$paths = lapply(which(!is.cyc),
function(i)
{
k = K[1:ncol(B), i]
eix = which(k!=0)
v.all = unique(as.vector(ij[eix, , drop = FALSE]))
sG = graph.edgelist(ij[eix, , drop = FALSE])
io = B %*% k
v.in = Matrix::which(io<0)[1]
v.out = Matrix::which(io>0)[1]
return(node.ix[unlist(get.shortest.paths(sG, v.in, v.out))])
})
if (length(out$cycles)>0)
{
tmp.cix = cbind(unlist(lapply(1:length(out$cycles), function(x) rep(x, length(out$cycles[[x]])))), unlist(out$cycles))
out$cycles = out$cycles[!duplicated(as.matrix(Matrix::sparseMatrix(tmp.cix[,1], tmp.cix[,2], x = 1)))]
}
if (length(out$paths)>0)
{
tmp.pix = cbind(unlist(lapply(1:length(out$paths), function(x) rep(x, length(out$paths[[x]])))), unlist(out$paths))
out$paths = out$paths[!duplicated(as.matrix(Matrix::sparseMatrix(tmp.pix[,1], tmp.pix[,2], x = 1)))]
}
if (ALL & length(blank.vertices)>0)
out$paths = c(out$paths, lapply(blank.vertices, identity))
return(out)
}
#' @name skrub
#' @title skrub
#' @description
#'
#' Converts data.table columns to standard types or replaces with NA and throws a warning
#'
#' Also converts factor columns to character columns in data.table, making
#' everyone's life easier.
#'
#' @author Marcin Imielinski
#' @param dt data.table or data.frame
#' @keywords internal
#' @noRd
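#' @examples
#' ## sketch: factor columns become character, integer columns become numeric
#' dt = data.table(f = factor(c('a', 'b')), i = 1:2)
#' str(skrub(dt))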
skrub = function(dt)
{
cl = lapply(names(dt), function(x) class(dt[[x]]))
names(cl) = names(dt)
for (nm in names(cl)[cl=='factor'])
dt[[nm]] = as.character(dt[[nm]])
for (nm in names(cl)[cl=='integer'])
dt[[nm]] = as.numeric(dt[[nm]])
## clean up any additional weird types before aggregating
ALLOWED.CLASSES = c('integer', 'numeric', 'logical', 'character', 'list')
if (any(tofix <- !sapply(dt, class) %in% ALLOWED.CLASSES))
{
warning(sprintf('found non-standard data types among one or more gEdge metadata columns (%s): converting to character before aggregating. Consider manually converting these columns to one of the standard types: %s',
paste(names(dt)[tofix], collapse = ', '),
paste(ALLOWED.CLASSES, collapse = ', ')))
for (fix in which(tofix))
{
replace = tryCatch(as.character(dt[[fix]]), error = function(e) NULL)
if (is.null(replace))
{
warning(sprintf('Conversion to character failed for column %s, replacing values with NA', names(dt)[fix]))
replace = NA
}
dt[[fix]] = replace
}
}
return(dt)
}
#' @name sparse_subset
#' @title sparse_subset
#' @description
#'
#' given k1 x n matrix A and k2 x n matrix B
#' returns k1 x k2 matrix C whose entries ij = 1 if the set of nonzero components of row i of A is
#' a (+/- strict) subset of the nonzero components of row j of B
#'
#' @param A k1 x n matrix
#' @param B k2 x n matrix
#' @param strict logical flag, if TRUE requires the subset relation to be strict
#' @param chunksize number of rows per block in the blockwise comparison
#' @param quiet logical flag suppressing progress output
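#' @examples
#' ## sketch: row 1 of A ({1}) is a subset of row 1 of B ({1,2}) but not of row 2 ({3})
#' A = rbind(c(1, 0, 0), c(1, 1, 0))
#' B = rbind(c(1, 1, 0), c(0, 0, 1))
#' sparse_subset(A, B, quiet = TRUE) # TRUE at entries (1,1) and (2,1)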
sparse_subset = function(A, B, strict = FALSE, chunksize = 100, quiet = FALSE)
{
if (is.null(dim(A)) | is.null(dim(B)))
return(NULL)
nz = Matrix::colSums(as.matrix(A)!=0)>0
C = Matrix::sparseMatrix(i = c(), j = c(), dims = c(nrow(A), nrow(B)))
for (i in seq(1, nrow(A), chunksize))
{
ixA = i:min(nrow(A), i+chunksize-1)
for (j in seq(1, nrow(B), chunksize))
{
ixB = j:min(nrow(B), j+chunksize-1)
if (length(ixA)>0 & length(ixB)>0 & !quiet)
cat(sprintf('\t interval A %s to %s (%d) \t interval B %d to %d (%d)\n', ixA[1], ixA[length(ixA)], nrow(A), ixB[1], ixB[length(ixB)], nrow(B)))
if (strict)
C[ixA, ixB] = (sign((A[ixA, , drop = FALSE]!=0)) %*% sign(t(B[ixB, , drop = FALSE]!=0))) * (sign((A[ixA, , drop = FALSE]==0)) %*% sign(t(B[ixB, , drop = FALSE]!=0))>0)
else
C[ixA, ixB] = (sign(A[ixA, nz, drop = FALSE]!=0) %*% sign(t(B[ixB, nz, drop = FALSE]==0)))==0
}
}
return(C)
}
#' @name label.runs
#' @title label.runs
#' @description
#'
#' For logical input, labels each contiguous run of TRUE with a unique label and labels everything else NA
#'
#' For non-logical (e.g. character) input, labels each contiguous run of the same value with a unique label
#' (note: even subsequent runs of an earlier used value in the vector will be given a new unique label)
#'
#'
#' @author Marcin Imielinski
#' @export
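#' @examples
#' ## sketch of both input modes
#' label.runs(c(TRUE, TRUE, FALSE, TRUE)) # 1 1 NA 2
#' label.runs(c('a', 'a', 'b', 'a'))      # 1 1 2 3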
label.runs = function(x)
{
if (!is.logical(x))
{
cumsum(abs(diff(as.numeric(c(0, as.integer(factor(x))))))>0)
}
else ## note will label all runs of FALSE with NA
{
as.integer(ifelse(x, cumsum(diff(as.numeric(c(FALSE, x)))>0), NA))
}
}
#' @name read_vcf
#' @title parses VCF into GRanges or data.table
#'
#' @description
#'
#' Wrapper around Bioconductor VariantAnnotation. Reads VCF into GRanges or data.table format
#'
#' @param fn string path to VCF file (sliced via bcftools when gr is provided)
#' @param gr GRanges input GRanges (default = NULL)
#' @param hg string Human reference genome (default = 'hg19')
#' @param geno boolean Flag whether to pull the genotype information in the GENO vcf fields (default = NULL)
#' @param swap.header string Path to another VCF file (in case of VCF with malformed header) (default = NULL)
#' @param verbose boolean Flag (default = FALSE)
#' @param add.path boolean Flag to add the path of the current VCF file to the output (default = FALSE)
#' @param tmp.dir string Path to directory for temporary files (default = '~/temp/.tmpvcf')
#' @param ... extra parameters
#' @author Marcin Imielinski
#' @keywords internal
#' @noRd
read_vcf = function(fn, gr = NULL, hg = 'hg19', geno = NULL, swap.header = NULL, verbose = FALSE, add.path = FALSE, tmp.dir = '~/temp/.tmpvcf', ...)
{
in.fn = fn
if (verbose){
cat('Loading', fn, '\n')
}
## check if default genome has been set
if (grepl("hg38", Sys.getenv("DEFAULT_GENOME"))) {
hg = "hg38"
}
if (!is.null(gr)){
tmp.slice.fn = paste(tmp.dir, '/vcf_tmp', gsub('0\\.', '', as.character(runif(1))), '.vcf', sep = '')
cmd = sprintf('bcftools view %s %s > %s', fn, paste(gr.string(gr.stripstrand(gr)), collapse = ' '), tmp.slice.fn)
if (verbose){
cat('Running', cmd, '\n')
}
system(cmd)
fn = tmp.slice.fn
}
if (!is.null(swap.header)){
if (!file.exists(swap.header)){
stop(sprintf('Error: Swap header file %s does not exist\n', swap.header))
}
system(paste('mkdir -p', tmp.dir))
tmp.name = paste(tmp.dir, '/vcf_tmp', gsub('0\\.', '', as.character(runif(1))), '.vcf', sep = '')
if (grepl('gz$', fn)){
system(sprintf("zcat %s | grep '^[^#]' > %s.body", fn, tmp.name))
}
else{
system(sprintf("grep '^[^#]' %s > %s.body", fn, tmp.name))
}
if (grepl('gz$', swap.header)){
system(sprintf("zcat %s | grep '^[#]' > %s.header", swap.header, tmp.name))
}
else{
system(sprintf("grep '^[#]' %s > %s.header", swap.header, tmp.name))
}
system(sprintf("cat %s.header %s.body > %s", tmp.name, tmp.name, tmp.name))
vcf = readVcf(tmp.name, hg, ...)
system(sprintf("rm %s %s.body %s.header", tmp.name, tmp.name, tmp.name))
}
else{
vcf = readVcf(fn, hg, ...)
}
out = granges(vcf)
if (!is.null(values(out))){
values(out) = cbind(values(out), info(vcf))
}
else{
values(out) = info(vcf)
}
if (!is.null(geno)){
if (!is.logical(geno)){
geno = TRUE
}
if (geno){
## load all GENO fields present in the VCF
geno = names(geno(vcf))
if (length(geno) > 0){
warning(sprintf('Loading all geno fields:\n\t%s', paste(geno, collapse = ',')))
}
}
else{
geno = character(0)
}
gt = NULL
if (length(geno) > 0){
for (g in geno){
m = as.data.frame(geno(vcf)[[g]])
names(m) = paste(g, names(m), sep = '_')
if (is.null(gt)){
gt = m
}
else{
gt = cbind(gt, m)
}
}
values(out) = cbind(values(out), as(gt, 'DataFrame'))
}
}
if (!is.null(gr)){
system(paste('rm', tmp.slice.fn))
}
if (add.path){
values(out)$path = in.fn
}
return(out)
}
#' @name file.url.exists
#' @title Check if a file or url exists
#' @param f File or url
#' @return TRUE or FALSE
#' @importFrom RCurl url.exists
#' @noRd
file.url.exists <- function(f) {
return(file.exists(f) || RCurl::url.exists(f))
}
#' @name read.rds.url
#' @title Checks if path is URL or file, then reads RDS
#' @param f File or url
#' @return data
#' @noRd
read.rds.url <- function(f) {
if (grepl("^http",f))
return(readRDS(gzcon(url(f))))
return(readRDS(f))
}
#' @name ra.overlaps
#' @title ra.overlaps
#' @description
#'
#' Determines overlaps between two piles of rearrangement junctions ra1 and ra2 (each GRangesLists of signed locus pairs)
#' against each other, returning a sparseMatrix that is T at entry ij if junction i overlaps junction j.
#'
#' if argument pad = 0 (default) then only perfect overlap will validate, otherwise if pad>0 is given, then
#' padded overlap is allowed
#'
#' strand matters, though we test overlap of both ra1[i] vs ra2[j] and gr.flipstrand(ra2[j])
#'
#' @param ra1 \code{GRangesList} with rearrangement set 1
#' @param ra2 \code{GRangesList} with rearrangement set 2
#' @param pad Amount to pad the overlaps by. Larger is more permissive. Default is exact (0)
#' @param arr.ind Default TRUE
#' @param ignore.strand Ignore rearrangement orientation when doing overlaps. Default FALSE
#' @param ... params to be sent to \code{\link{gr.findoverlaps}}
#' @keywords internal
#' @noRd
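#' @examples
#' ## sketch: a junction pile matched against itself hits on the diagonal
#' bp = GRanges(1, IRanges(c(100, 500), width = 1), strand = c('+', '-'))
#' ra = split(bp, c(1, 1)) # one junction = two signed breakpoints
#' ra.overlaps(ra, ra) # arr.ind matrix with the single match (1, 1)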
ra.overlaps = function(ra1, ra2, pad = 0, arr.ind = TRUE, ignore.strand=FALSE, ...)
{
bp1 = grl.unlist(ra1) + pad
bp2 = grl.unlist(ra2) + pad
ix = gr.findoverlaps(bp1, bp2, ignore.strand = ignore.strand, ...)
.make_matches = function(ix, bp1, bp2)
{
if (length(ix) == 0){
return(NULL)
}
tmp.match = cbind(bp1$grl.ix[ix$query.id], bp1$grl.iix[ix$query.id], bp2$grl.ix[ix$subject.id], bp2$grl.iix[ix$subject.id])
tmp.match.l = lapply(split(1:nrow(tmp.match), paste(tmp.match[,1], tmp.match[,3])), function(x) tmp.match[x, , drop = F])
## match only occurs if each range in a ra1 junction matches a different range in the ra2 junction
matched.l = sapply(tmp.match.l, function(x) all(c('11','22') %in% paste(x[,2], x[,4], sep = '')) | all(c('12','21') %in% paste(x[,2], x[,4], sep = '')))
return(do.call('rbind', lapply(tmp.match.l[matched.l], function(x) cbind(x[,1], x[,3])[!duplicated(paste(x[,1], x[,3])), , drop = F])))
}
tmp = .make_matches(ix, bp1, bp2)
if (is.null(tmp)){
if (arr.ind){
return(as.matrix(data.table(ra1.ix = as.numeric(NA), ra2.ix = as.numeric(NA))))
}
else{
return(Matrix::sparseMatrix(length(ra1), length(ra2), x = 0))
}
}
rownames(tmp) = NULL
colnames(tmp) = c('ra1.ix', 'ra2.ix')
if (arr.ind) {
ro = tmp[order(tmp[,1], tmp[,2]), , drop = FALSE]
if (inherits(ro, "integer")) {
ro <- matrix(ro, ncol=2, nrow=1, dimnames=list(c(), c('ra1.ix', 'ra2.ix')))
}
return(ro)
} else {
ro = Matrix::sparseMatrix(tmp[,1], tmp[,2], x = 1, dims = c(length(ra1), length(ra2)))
return(ro)
}
}
#' @name gt.gencode
#' @title gt.gencode
#' @description
#'
#' internal function to format transcript annotations for fusion output
#'
#' @param gencode GRanges output of rtracklayer::import of GENCODE gtf
#' @param bg.col character representing color to put in colormap
#' @param cds.col color for CDS
#' @param utr.col color for UTR
#' @param st.col scalar character representing color of CDS start
#' @param en.col scalar character representing color of CDS end
#' @keywords internal
#' @noRd
#' @return gTrack object of gencode output
gt.gencode = function(gencode, bg.col = alpha('blue', 0.1), cds.col = alpha('blue', 0.6), utr.col = alpha('purple', 0.4), st.col = 'green',
en.col = 'red')
{
if (length(gencode)==0)
return(gTrack())
tx = gencode[gencode$type =='transcript']
genes = gencode[gencode$type =='gene']
exons = gencode[gencode$type == 'exon']
utr = gencode[gencode$type == 'UTR']
## ut = unlist(utr$tag)
## utix = rep(1:length(utr), sapply(utr$tag, length))
## utr5 = utr[unique(utix[grep('5_UTR',ut)])]
## utr3 = utr[unique(utix[grep('3_UTR',ut)])]
## utr5$type = 'UTR5'
## utr3$type = 'UTR3'
startcodon = gencode[gencode$type == 'start_codon']
stopcodon = gencode[gencode$type == 'stop_codon']
OUT.COLS = c('gene_name', 'transcript_name', 'transcript_id', 'type', 'exon_number')
tmp = c(genes, tx, exons, utr, startcodon, stopcodon)[, OUT.COLS]
## compute tx ord of intervals
ord.ix = order(tmp$transcript_id, match(tmp$type, c('gene', 'transcript', 'exon', 'UTR', 'start_codon','stop_codon')))
tmp.rle = rle(tmp$transcript_id[ord.ix])
tmp$tx.ord[ord.ix] = unlist(lapply(tmp.rle$lengths, function(x) 1:x))
tmp = tmp[rev(order(match(tmp$type, c('gene', 'transcript', 'exon', 'UTR', 'start_codon','stop_codon'))))]
tmp.g = tmp[tmp$type != 'transcript']
cmap = list(type = c(gene = bg.col, transcript = bg.col, exon = cds.col, start_codon = st.col, stop_codon = en.col, UTR = utr.col))
tmp.g = gr.disjoin(gr.stripstrand(tmp.g))
return(gTrack(tmp.g[, c('type', 'gene_name')], colormaps = cmap))
}
#' @name alpha
#' @title alpha
#' @description
#' Give transparency value to colors
#'
#' Takes provided colors and gives them the specified alpha (ie transparency) value
#'
#' @author Marcin Imielinski
#' @param col RGB color
#' @param alpha numeric alpha (transparency) value between 0 and 1
#' @keywords internal
#' @noRd
alpha = function(col, alpha)
{
col.rgb = grDevices::col2rgb(col)
out = grDevices::rgb(red = col.rgb['red', ]/255, green = col.rgb['green', ]/255, blue = col.rgb['blue', ]/255, alpha = alpha)
names(out) = names(col)
return(out)
}
#' @name dunlist
#' @title dunlist
#'
#' @description
#' unlists a list of vectors into a data.table keyed by the list id
#' $listid
#'
#' @param x list of vectors (or values coercible to a single vector)
#' @return data.table of the concatenated input with additional field $listid specifying the list item from which the given row originated
#' @author Marcin Imielinski
#' @keywords internal
#' @noRd
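#' @examples
#' ## sketch: vector elements become rows keyed by the originating list id
#' dunlist(list(a = 1:2, b = 3)) # listid = a, a, b; V1 = 1, 2, 3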
dunlist = function(x)
{
listid = rep(1:length(x), elementNROWS(x))
if (!is.null(names(x))) ## slows things down
listid = names(x)[listid]
xu = unlist(x, use.names = FALSE)
if (is.null(xu))
{
return(as.data.table(list(listid = c(), V1 = c())))
}
if (!inherits(xu, 'data.frame')) ## data.table inherits from data.frame
xu = data.table(V1 = xu)
out = cbind(data.table(listid = listid), xu)
setkey(out, listid)
return(out)
}
#' @name pdist
#' @title pdist
#'
#' @description
#'
#' Given two GRanges gr1 and gr2 of the same length, returns the element-wise reference
#' distance between them; strand is only compared when ignore.strand = FALSE
#'
#' @param gr1 GRanges
#' @param gr2 GRanges
#' @return numeric vector of reference distances (Inf for ranges on different seqnames, or on different strands when strand is respected)
#' @author Marcin Imielinski
#' @keywords internal
#' @noRd
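#' @examples
#' ## sketch of the endpoint-distance convention
#' pdist(GRanges(1, IRanges(1, 10)), GRanges(1, IRanges(15, 20))) # 5
#' pdist(GRanges(1, IRanges(1, 10)), GRanges(2, IRanges(1, 10)))  # Inf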
pdist = function(gr1, gr2, ignore.strand = TRUE)
{
if (length(gr1) != length(gr2))
stop('arguments have to be the same length')
d = ifelse(as.logical(seqnames(gr1) != seqnames(gr2)), Inf,
pmin(abs(start(gr1)-start(gr2)),
abs(start(gr1)-end(gr2)),
abs(end(gr1)-start(gr2)),
abs(end(gr1)-end(gr2))))
if (!ignore.strand && any(ix <- strand(gr1) != strand(gr2)))
d[as.logical(ix)] = Inf
return(d)
}
#' @name ra.duplicated
#' @title ra.duplicated
#' @description
#'
#' Flags duplicated junctions
#'
#' Determines overlaps within a pile of rearrangement junctions (+/- padding) and returns a logical
#' vector that is TRUE for every junction that duplicates an earlier (lower-index) junction in the pile
#'
#' @author Xiaotong Yao
#' @param grl GRangesList representing rearrangements to be merged
#' @param pad non-negative integer specifying padding
#' @param ignore.strand whether to ignore strand (implies all strand information will be ignored, use at your own risk)
#' @return logical vector of length(grl) that is TRUE for junctions that duplicate an earlier junction
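#' @examples
#' ## sketch: the second junction repeats the first and is flagged
#' bp = GRanges(1, IRanges(c(1, 100, 1, 100), width = 1), strand = c('+', '-', '+', '-'))
#' ra = split(bp, rep(1:2, each = 2))
#' ra.duplicated(ra) # FALSE TRUE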
ra.duplicated = function(grl, pad=500, ignore.strand=FALSE){
if (!is(grl, "GRangesList")){
stop("Error: Input must be GRangesList!")
}
##if (any(elementNROWS(grl)!=2)){
## stop("Error: Each element must be length 2!")
##}
if (length(grl)==0){
return(logical(0))
}
if (length(grl)==1){
return(FALSE)
}
if (length(grl)>1){
ix.pair = as.data.table(ra.overlaps(grl, grl, pad=pad, ignore.strand = ignore.strand))[ra1.ix!=ra2.ix]
if (nrow(ix.pair)==0){
return(rep(FALSE, length(grl)))
}
else {
## dup.ix = unique(rowMax(as.matrix(ix.pair)))
dup.ix = unique(apply(as.matrix(ix.pair), 1, max))
return(seq_along(grl) %in% dup.ix)
}
}
}
#' Merges rearrangements represented by \code{GRangesList} objects
#'
#' Determines overlaps between two or more piles of rearrangement junctions (as named or numbered arguments) +/- padding
#' and will merge those that overlap into single junctions in the output, and then keep track for each output junction which
#' of the input junctions it was "seen in" using logical flag meta data fields prefixed by "seen.by." and then the argument name
#' (or "seen.by.ra" and the argument number)
#'
#' @param ... GRangesList representing rearrangements to be merged
#' @param pad non-negative integer specifying padding
#' @param ignore.strand whether to ignore strand (implies all strand information will be ignored, use at your own risk)
#' @return \code{GRangesList} of merged junctions with meta data fields specifying which of the inputs each outputted junction was "seen.by"
#' @name ra.merge
#' @export
#' @examples
#'
#' # generate some junctions
#' gr1 <- GRanges(1, IRanges(1:10, width = 1), strand = rep(c('+', '-'), 5))
#' gr2 <- GRanges(1, IRanges(4 + 1:10, width = 1), strand = rep(c('+', '-'), 5))
#' ra1 = split(gr1, rep(1:5, each = 2))
#' ra2 = split(gr2, rep(1:5, each = 2))
#'
#' ram = ra.merge(ra1, ra2)
#' values(ram) # shows the metadata with TRUE / FALSE flags
#'
#' ram2 = ra.merge(ra1, ra2, pad = 5) # more inexact matching results in more merging
#' values(ram2)
#'
ra.merge = function(..., pad = 0, ignore.strand = FALSE){
ra = list(...)
ra = ra[which(!sapply(ra, is.null))]
## figure out names
nm = names(ra)
if (is.null(nm)){
nm = paste('ra', 1:length(ra), sep = '')
}
names(ra) = nm
nml = structure(paste('seen.by', nm, sep = '.'), names = nm)
## combine and sort all bps from all input ra's, keeping track of grl.ix and listid
dtl = ra %>% lapply(function(x)
{
tmp = grl.unlist(x)
if (!length(tmp))
data.table()
else
as.data.table(tmp[, c('grl.ix', 'grl.iix')])
})
gr = lapply(
names(dtl),
function(x) {
out = dtl[[x]]; if (!nrow(out)) return(NULL) else out[, listid := x]
}) %>%
rbindlist(fill = TRUE) %>%
dt2gr %>%
sort ## sorting means first bp will be first below
## matching will allow us to match by padding
ugr = reduce(gr+pad)
gr$uix = gr.match(gr, ugr, ignore.strand = FALSE)
juncs = gr2dt(gr)[
, ":="(ubp1 = min(uix[1]), ubp2 = max(uix[2]), jid = paste(listid, grl.ix)),
by = .(listid, grl.ix)]
ubp = juncs[, unique(paste(ubp1, ubp2))]
juncs[, merged.ix := match(paste(ubp1, ubp2), ubp)]
juncs[, ":="(select = jid==min(jid)), by = merged.ix][(select)][merged.ix==1]
jmap = juncs[, .(listid, grl.ix), keyby = .(merged.ix, jid)][!duplicated(jid)]
## merging will cast all unique bp1 pairs and find the (first) junction in each input list that matches it
## merged = dcast.data.table(
## juncs, bp1 + bp2 ~ listid,
## value.var = 'grl.ix',
## fun.aggregate = function(x, na.rm = TRUE) x[1], fill = NA)
## ugr = gr.start(ugr - pad) ## don't do this, make junction wider
## ugr = ugr - pad
## out = grl.pivot(GRangesList(ugr[merged$bp1], ugr[merged$bp2]))
out = dt2gr(
juncs[(select),
.(seqnames, start, end, strand, merged.ix = merged.ix)]) %>%
split(.$merged.ix)
## values(out) = merged[, -(1:2)]
## add "seen.by" fields
## values(out) = cbind(values(out), do.call(cbind, structure(lapply(nm, function(x) !is.na(values(out)[[x]])), names = nml)))
seen.by = dcast.data.table(
jmap, merged.ix ~ listid, value.var = "grl.ix",
fun.aggregate = function(x) {
if (length(x)){
paste(x, collapse = ",")
} else {
NA_character_
}
})
seen.mat = seen.by[, setdiff(colnames(seen.by), "merged.ix"), with = FALSE] %>% as.matrix %>% is.na %>% `!`
colnames(seen.mat) = paste0("seen.by.", colnames(seen.mat))
seen.by = cbind(seen.by, seen.mat)
mc = copy(seen.by)
for (i in seq_along(nm)){
mc2 = as.data.table(mcols(ra[[nm[i]]]))
if (length(nm) > 1){
if (length(names(mc2)) > 0){
names(mc2) = paste0(names(mc2), '.', nm[i])
}
}
mc2[, tmp.ix := seq_len(.N)]
mc = merge(
copy(mc)[
, tmp.ix := as.numeric(gsub("^([0-9]+)((,[0-9]+)?)$", "\\1", mc[[nm[i]]]))
],
mc2
,
by = "tmp.ix", all.x = TRUE
)
}
mc = mc[order(merged.ix)]
## now merge in metadata from input out, using the appropriate id
## metal = lapply(
## 1:length(nm), function(i){
## as.data.table(values(ra[[nm[i]]]))[merged[[nm[i]]], ]
## }
## )
## metal = metal[sapply(metal, nrow)>0]
## if (length(metal))
## values(out) = cbind(values(out), do.call(cbind, metal))
values(out) = mc
return(out)
}
#' @name tstamp
#' @title tstamp
#' @description
#' Timestamp used to check for staleness of gGraph and other objects
#' @keywords internal
#' @noRd
tstamp = function()
{
return(paste(as.character(Sys.time()), runif(1)))
}
#' @name dodo.call
#' @title dodo.call
#' @description
#' do.call implemented using eval parse for those pesky (e.g. S4) case when do.call does not work
#' @keywords internal
#' @noRd
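#' @examples
#' ## sketch: equivalent to do.call in this simple case
#' dodo.call('paste', list('a', 'b')) # "a b"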
dodo.call = function(FUN, args)
{
if (!is.character(FUN))
FUN = substitute(FUN)
cmd = paste(FUN, '(', paste('args[[', 1:length(args), ']]', collapse = ','), ')', sep = '')
return(eval(parse(text = cmd)))
}
#' @name dedup
#' @title dedup
#'
#' @description
#' relabels duplicates in a character vector with .1, .2, .3
#' (where "." can be replaced by any user specified suffix)
#'
#' @param x input vector to dedup
#' @param suffix suffix separator to use before adding integer for dups in x
#' @param itemize.all by default the first item will not have a suffix. If itemize.all is set to TRUE then all items (including the first item) will have a suffix added to them
#' @return length(x) vector of input + suffix separator + integer for dups and no suffix for "originals" (unless itemize.all is set to TRUE and then all copies get a suffix)
#' @author Marcin Imielinski
#' @noRd
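#' @examples
#' ## sketch: later copies get suffixed, the first stays unsuffixed by default
#' dedup(c('a', 'b', 'a', 'a')) # "a" "b" "a.2" "a.3"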
dedup = function(x, suffix = '.', itemize.all = FALSE)
{
dup = duplicated(x);
udup = setdiff(unique(x[dup]), NA)
udup.ix = lapply(udup, function(y) which(x==y))
if (itemize.all){
udup.suffices = lapply(udup.ix, function(y) c(paste(suffix, 1:length(y), sep = '')))
} else {
udup.suffices = lapply(udup.ix, function(y) c('', paste(suffix, 2:length(y), sep = '')))
}
out = x;
out[unlist(udup.ix)] = paste(out[unlist(udup.ix)], unlist(udup.suffices), sep = '');
return(out)
}
#' @name spmelt
#' @title spmelt
#' @description
#' Melts sparse matrix into data.table
#'
#' @param A matrix or sparse Matrix to melt
#' @param baseval background value to skip (default 0); if NULL, all entries are returned
#' @return data.table of all non-baseval entries with fields $i, $j, $val
#' @author Marcin Imielinski
spmelt = function(A, baseval = 0) {
if (!is.null(baseval))
{
ij = Matrix::which(A!=baseval, arr.ind = TRUE)
}
else ## take everything
{
ij = as.matrix(expand.grid(1:nrow(A), 1:ncol(A)))
}
return(data.table(i = ij[,1], j = ij[,2], val = A[ij]))
}
#' @name gstat
#' @title gstat
#' @description
#' Computes summary statistics for a gGraph with copy number ("cn") annotations, including
#' junction counts and maximum copy number by class (DEL/DUP/INV/TRA-like and fold-back),
#' the junction copy number spectrum, diameter and circular-walk features of the ALT subgraph,
#' fused/unfused node statistics, and genomic footprint features
#' @param gg gGraph with "cn" fields on both nodes and edges
#' @param INF large finite value substituted for NA/infinite inter-footprint distances (default max(seqlengths(gg)) + 1)
#' @param thresh height (default 1e6) at which to cut the single-linkage clustering of footprint intervals
#' @export
gstat = function(gg,
INF = max(seqlengths(gg)) + 1,
thresh = 1e6){
if (!inherits(gg, "gGraph")){
stop("Input is not a gGraph object.")
}
if (length(gg$nodes)==0 |
length(gg$edges)==0){
return(NULL)
}
if (!is.element("cn", colnames(gg$nodes$dt)) |
!is.element("cn", colnames(gg$edges$dt)) |
!any(gg$edges$dt[, type=="ALT"])){
return(NULL)
}
if (is.na(INF)){
INF = 1e9
}
## needed "fb" in the edge table
if (!is.element("fb", colnames(gg$edges$dt))){
gg$annotate("fb",
data=gg$junctions$span<=1e4 & gg$junctions$sign==1,
id=gg$junctions$dt$edge.id,
class="edge")
}
## needed "term" field in the node table
if (!is.element("term", colnames(gg$nodes$dt)) |
!is.element("fused", colnames(gg$nodes$dt))){
n2e = rbind(
gg$edges$dt[, .(nid = n1,
type,
side = n1.side)],
gg$edges$dt[, .(nid = n2,
type,
side = n2.side)])
n2e[, term := length(unique(side))<2, by=nid]
gg$annotate("term",
n2e[, term],
n2e[, nid],
"node")
fused = gg$nodes$dt$node.id %in% n2e[type=="ALT", unique(nid)]
gg$nodes$mark(fused = fused)
}
edt = gg$edges$dt
ndt = gg$nodes$dt
## four traditional classes, plus fold back
n.del = edt[, sum(class=="DEL-like", na.rm = TRUE)]
n.dup = edt[, sum(class=="DUP-like", na.rm = TRUE)]
n.inv = edt[, sum(class=="INV-like", na.rm = TRUE)]
n.tra = edt[, sum(class=="TRA-like", na.rm = TRUE)]
n.fb = edt[, sum(fb==TRUE, na.rm = TRUE)]
## highest CN of a tra, dup, fb
max.cn.del = edt[class=="DEL-like", pmax(max(cn, na.rm=T), 0)]
max.cn.dup = edt[class=="DUP-like", pmax(max(cn, na.rm=T), 0)]
max.cn.inv = edt[class=="INV-like", pmax(max(cn, na.rm=T), 0)]
max.cn.tra = edt[class=="TRA-like", pmax(max(cn, na.rm=T), 0)]
max.cn.fb = edt[fb==TRUE, pmax(max(cn, na.rm=T), 0)]
## edge wise features
this.alt.sg = gg[, type=="ALT"]
diam = this.alt.sg$diameter
alt.on.diam = length(diam$edges)
## walk the whole ALT graph
alt.wks = this.alt.sg$walks()
circ.ix = alt.wks$dt[circular==TRUE, walk.id]
n.circ = length(circ.ix)
if (n.circ == 0){
max.len.circ = 0
max.cn.circ = 0
} else {
max.len.circ = alt.wks$dt[circular==TRUE, max(length)]
max.cn.circ = max(sapply(circ.ix,
function(y) {
alt.wks[y]$edges$dt[, min(cn)]
}),
na.rm=T)
}
## number of junctions
n.junc = length(gg$edges)
jcn.tab = table(edt[type=="ALT", cn])
## junction copy numbers
n.jcn1 = ifelse(is.na(jcn.tab['1']), 0, jcn.tab['1'])
n.jcn1.prop = n.jcn1/n.junc
n.jcn2 = ifelse(is.na(jcn.tab['2']), 0, jcn.tab['2'])
n.jcn2.prop = n.jcn2/n.junc
n.jcn3p = n.junc - n.jcn1 - n.jcn2
max.jcn = edt[type=="ALT", max(cn, na.rm=T)]
## diam cn1
if (n.jcn1>0){
this.alt.1.sg = this.alt.sg[, cn==1]
alt.1.diam = this.alt.1.sg$diameter
alt.1.on.diam = length(alt.1.diam$edges)
} else {
alt.1.on.diam = 0
}
## diam cn2
if (n.jcn2>0){
this.alt.2.sg = this.alt.sg[, cn==2]
alt.2.diam = this.alt.2.sg$diameter
alt.2.on.diam = length(alt.2.diam$edges)
} else {
alt.2.on.diam = 0
}
## node wise features
n.fused = ndt[, sum(fused==TRUE, na.rm=T)]
n.unfus = ndt[, sum(fused==FALSE, na.rm=T)]
## fused sized and unfused sizes, without terminal node
fused.size.med = median(ndt[term==FALSE & fused==TRUE, width])
fused.size.mean = mean(ndt[term==FALSE & fused==TRUE, width])
unfus.size.med = median(ndt[term==FALSE & fused==FALSE, width])
if (is.na(unfus.size.med)){unfus.size.med = 0}
unfus.size.mean = mean(ndt[term==FALSE & fused==FALSE, width])
if (is.na(unfus.size.mean)){unfus.size.mean = 0}
## foot print
n.chr = length(unique(as.character(seqnames(gg$gr))))
## excluding terminal nodes
footprint = gg[is.na(term) | term==FALSE]$footprint
if (length(footprint)>1){
footprint.ds = gr.dist(footprint, footprint)
footprint.ds[is.na(footprint.ds)] = INF
footprint.ds[is.infinite(footprint.ds)] = INF
footprint.cl = hclust(as.dist(footprint.ds), "single")
footprint.gp = cutree(footprint.cl, h = thresh)
n.fp.gp = length(unique(footprint.gp))
} else {
n.fp.gp = 1
}
## node CN
cn.min = ndt[, min(cn, na.rm=T)]
cn.max = ndt[, max(cn, na.rm=T)]
cn.tab = ndt[, table(cn)]
cn.mode = as.numeric(names(which.max(cn.tab)))
cn.mode.prop = ndt[, sum(cn==cn.mode)/.N]
cn.states = ndt[, length(unique(cn))]
## fused CN
cn.fused.mode = as.numeric(
names(which.max(ndt[fused==TRUE, table(cn)])))
cn.fused.mode.prop = ndt[, sum(fused==TRUE & cn==cn.fused.mode)/sum(fused)]
## unfused CN
cn.unfus.mode = as.numeric(
names(which.max(ndt[fused==FALSE, table(cn)]))
)
if (length(cn.unfus.mode)==0){
cn.unfus.mode = 0
cn.unfus.mode.prop = 0
} else {
cn.unfus.mode.prop = ndt[, sum(fused==FALSE & cn==cn.unfus.mode)/sum(!fused)]
}
out =
data.table(alt.all = n.junc,
n.del,
n.dup,
n.inv,
n.tra,
max.cn.del,
max.cn.dup,
max.cn.inv,
max.cn.tra,
max.cn.fb,
n.jcn1,
n.jcn2,
n.jcn3p,
n.jcn1.prop,
n.jcn2.prop,
alt.on.diam,
alt.on.diam.prop = alt.on.diam/n.junc,
alt.1.on.diam,
alt.1.on.diam.prop = ifelse(n.jcn1==0, 0, alt.1.on.diam/n.jcn1),
alt.2.on.diam,
alt.2.on.diam.prop = ifelse(n.jcn2==0, 0, alt.2.on.diam/n.jcn2),
n.fused, fused.size.med, fused.size.mean,
n.unfus, unfus.size.med, unfus.size.mean,
n.chr,
n.fp = length(footprint),
n.fp.gp,
cn.mode, cn.mode.prop,
cn.fused.mode, cn.fused.mode.prop,
cn.unfus.mode, cn.unfus.mode.prop,
cn.states,
cn.max, cn.min, max.jcn,
max.len.circ, max.cn.circ)
return(out)
}
#' @name readCov
#' @description
#' Reads coverage input, which must be either a GRanges, an RDS file containing a GRanges, or a txt/tsv/csv/bed/bw/wig file that can be parsed into a GRanges
#' @param x input coverage
#' @author Xiaotong Yao, Alon Shaiber
readCov = function(x){
## first x must be either a GRanges,
## a RDS file containing a GRanges,
## or a TXT file that can be read as a GRanges
if (inherits(x, "character")){
if (!file.exists(x)){
stop("Input file not found.")
}
fn = x
if (grepl("rds$", fn)){
x = readRDS(fn)
} else if (grepl("[(bed)|(bw)|(wig)]$", fn)){
x = rtracklayer::import(fn)
} else if (grepl("[(txt)|(tsv)|(csv)]$", fn)) {
x = dt2gr(fread(fn))
} else {
stop("Input file not in valid format: txt, tsv, csv, bed, bw, wig, rds")
}
}
if (!(inherits(x, 'GRanges'))){
stop('Invalid coverage input. Coverage input must be either a GRanges, RDS file containing a GRanges or a txt/tsv/bed/bw/wig that can be parsed into a GRanges object.')
}
return(x)
}
#' @name j.dist
#' @title j.dist
#' @description
#' Computes pairwise distances between junctions in one (or two) junction piles
#' @param j1 Junction object, or GRangesList coercible to one
#' @param j2 optional second Junction object or GRangesList (defaults to j1)
#' @export
j.dist = function(j1, j2 = NULL){
if (is.null(j2)){
j2 = j1
}
if (!inherits(j1, "Junction")){
j1 = Junction$new(grl = j1)
}
if (!inherits(j2, "Junction")){
j2 = Junction$new(grl = j2)
}
ij = data.table(expand.grid(
list(i = seq_along(j1),
j = seq_along(j2))))[i<j]
ij[, ":="(i1)]
}
#' @name draw.paths.y
#' @title draw.paths.y
#' @description
#' Determines the y axis elevation of segments in a walk
draw.paths.y = function(grl, path.stack.x.gap=0, path.stack.y.gap=1){
## if grl is not named
if (is.null(names(grl))){
names(grl) = seq_along(grl)
}
if (any(is.na(names(grl)))){
names(grl) = seq_along(grl)
}
grl.props = cbind(data.frame(group = names(grl), stringsAsFactors = F),
as.data.frame(values(grl)))
gr = tryCatch(grl.unlist(grl),
error = function(e){
gr = unlist(grl);
if (length(gr)>0)
{
tmpc = textConnection(names(gr));
gr$grl.ix = read.delim(tmpc, sep = '.', header = F)[,1];
gr$grl.iix = data.table::data.table(ix = gr$grl.ix)[
, iix := 1:length(ix), by = ix][, iix]
close(tmpc)
}
return(gr)
})
gr$group = grl.props$group[gr$grl.ix]
gr$group.ord = gr$grl.iix
gr$first = gr$grl.iix == 1
gr$last = iix = NULL ## initialize gr$last; iix = NULL appeases R CMD check NSE for the data.table call below
if (length(gr)>0){
gr$last = data.table::data.table(
iix = as.numeric(gr$grl.iix),
ix = gr$grl.ix)[
, last := iix == max(iix), by = ix][, last]
}
grl.props$group = as.character(grl.props$group)
S4Vectors::values(gr) =
cbind(as.data.frame(values(gr)),
grl.props[match(values(gr)$group, grl.props$group),
setdiff(colnames(grl.props),
c(colnames(values(gr)), 'group', 'labels')),
drop = FALSE])
seqlevels(gr) = seqlevels(gr)[seqlevels(gr) %in% as.character(seqnames(gr))]
windows = as(GenomicRanges::coverage(gr), 'GRanges');
windows = windows[values(windows)$score!=0]
windows = GenomicRanges::reduce(windows, min.gapwidth = 1);
win.gap = mean(width(windows))*0.2
## add 1 bp to end for visualization .. ranges avoids weird width < 0 error
if (length(gr)>0)
{
IRanges::ranges(gr) =
IRanges::IRanges(
start(gr),
pmax(end(gr),
## pmin(end(gr)+1,
pmin(end(gr), ## FIXED BY MARCIN, above was causing needless stacking
GenomeInfoDb::seqlengths(gr)[as.character(seqnames(gr))],
na.rm = T),
na.rm = T)) ## jeremiah commented
}
suppressWarnings(end(windows) <- end(windows) + 1) ## shift one needed bc gr.flatmap has continuous convention, we have categorical (e.g. 2 bases is width 2, not 1)
mapped = gr.flatmap(gr, windows, win.gap);
grl.segs = mapped$grl.segs
window.segs = mapped$window.seg
dt = data.table(grl.segs)
mx = dt[, max(c(as.numeric(pos1), as.numeric(pos2)))]
int.mx = as.double(.Machine$integer.max)
grl.segs$pos1 = round(as.double(grl.segs$pos1)/as.double(mx)*int.mx)
grl.segs$pos2 = round(as.double(grl.segs$pos2)/as.double(mx)*int.mx)
window.segs$start = round(as.double(window.segs$start)/as.double(mx)*int.mx)
window.segs$end = round(as.double(window.segs$end)/as.double(mx)*int.mx)
ix.l = lapply(split(1:nrow(grl.segs), grl.segs$group),
function(x) x[order(grl.segs$group.ord[x])])
grl.segs$y.relbin = NA
## we want to lay out paths so that we prevent collisions between different paths
grl.segs$y.relbin[unlist(ix.l)] = unlist(lapply(ix.l, function(ix)
{
if (length(ix)>1)
{
iix = 1:(length(ix)-1)
concordant = ((grl.segs$pos1[ix[iix+1]] >= grl.segs$pos2[ix[iix]]
& grl.segs$strand[ix[iix+1]] != '-' & grl.segs$strand[ix[iix]] != '-') |
(grl.segs$pos1[ix[iix+1]] <= grl.segs$pos2[ix[iix]]
& grl.segs$strand[ix[iix+1]] == '-' & grl.segs$strand[ix[iix]] == '-'))
return(c(0, cumsum(!concordant)))
}
else{
return(0)
}
}))
contig.lim = data.frame(
group = names(vaggregate(formula = y.relbin ~ group, data = grl.segs, FUN = max)),
pos1 = vaggregate(formula = pos1 ~ group, data = grl.segs, FUN = min),
pos2 = vaggregate(formula = pos2~ group, data = grl.segs, FUN = max),
height = vaggregate(formula = y.relbin ~ group, data = grl.segs, FUN = max)
);
contig.lim$width = contig.lim$pos2 - contig.lim$pos1
contig.lim$y.bin = 0;
contig.lim = contig.lim[order(-contig.lim$width), ]
if (nrow(contig.lim)>1){
for (i in 2:nrow(contig.lim))
{
ir1 = IRanges::IRanges(contig.lim[1:(i-1), 'pos1'], contig.lim[1:(i-1), 'pos2'])
ir2 = IRanges::IRanges(contig.lim[i, 'pos1'], contig.lim[i, 'pos2'])
clash = which(ir1 %over% (ir2 + path.stack.x.gap))
pick = clash[which.max(contig.lim$y.bin[clash] + contig.lim$height[clash])]
contig.lim$y.bin[i] = c(contig.lim$y.bin[pick] + contig.lim$height[pick] + path.stack.y.gap, 0)[1]
}
}
grl.segs$y.bin = contig.lim$y.bin[match(grl.segs$group, contig.lim$group)] + grl.segs$y.relbin + 1
m.y.bin = max(grl.segs$y.bin)
ylim = c(1, m.y.bin) + c(-0.5*m.y.bin, 0.5*m.y.bin)
## squeeze y coordinates into provided (or inferred) ylim
tmp.ylim = ylim
## provide bottom and top padding of y.bin
y.pad = 1/(m.y.bin+1)/2
y.pad = pmin(1/(m.y.bin+1)/2, 0.125)
tmp.ylim = tmp.ylim + c(1, -1)*y.pad*diff(tmp.ylim);
## make final y coordinates by squeezing y.bin into tmp.ylim
grl.segs$y = affine.map(grl.segs$y.bin, tmp.ylim)
## MARCIN EDIT: grl.segs are not in order of paths
## but in coordinate order and so the order of the ys will be misintepreted
## down the line as being aligned to the order of segs in each path
## which will cause a mixup in the graphics
grl.segs = grl.segs[order(grl.segs$group, grl.segs$group.ord), ]
return(split(grl.segs$y, grl.segs$group)[names(grl)])
}
affine.map = function(x, ylim = c(0,1), xlim = c(min(x), max(x)), cap = F, cap.min = cap, cap.max = cap, clip = T, clip.min = clip, clip.max = clip)
{
# xlim[2] = max(xlim);
# ylim[2] = max(ylim);
if (xlim[2]==xlim[1])
y = rep(mean(ylim), length(x))
else
y = (ylim[2]-ylim[1]) / (xlim[2]-xlim[1])*(x-xlim[1]) + ylim[1]
if (cap.min)
y[x<min(xlim)] = ylim[which.min(xlim)]
else if (clip.min)
y[x<min(xlim)] = NA;
if (cap.max)
y[x>max(xlim)] = ylim[which.max(xlim)]
else if (clip.max)
y[x>max(xlim)] = NA;
return(y)
}
gr.flatmap = function(gr, windows, gap = 0, strand.agnostic = TRUE, squeeze = FALSE, xlim = c(0, 1))
{
if (strand.agnostic)
GenomicRanges::strand(windows) = "*"
## now flatten "window" coordinates, so we first map gr to windows
## (replicating some gr if necessary)
# h = findOverlaps(gr, windows)
h = gr.findoverlaps(gr, windows);
window.segs = gr.flatten(windows, gap = gap)
grl.segs = BiocGenerics::as.data.frame(gr);
grl.segs = grl.segs[values(h)$query.id, ];
grl.segs$query.id = values(h)$query.id;
grl.segs$window = values(h)$subject.id
grl.segs$start = start(h);
grl.segs$end = end(h);
grl.segs$pos1 = pmax(window.segs[values(h)$subject.id, ]$start,
window.segs[values(h)$subject.id, ]$start + grl.segs$start - start(windows)[values(h)$subject.id])
grl.segs$pos2 = pmin(window.segs[values(h)$subject.id, ]$end,
window.segs[values(h)$subject.id, ]$start + grl.segs$end - start(windows)[values(h)$subject.id])
grl.segs$chr = grl.segs$seqnames
if (squeeze)
{
min.win = min(window.segs$start)
max.win = max(window.segs$end)
grl.segs$pos1 = affine.map(grl.segs$pos1, xlim = c(min.win, max.win), ylim = xlim)
grl.segs$pos2 = affine.map(grl.segs$pos2, xlim = c(min.win, max.win), ylim = xlim)
window.segs$start = affine.map(window.segs$start, xlim = c(min.win, max.win), ylim = xlim)
window.segs$end = affine.map(window.segs$end, xlim = c(min.win, max.win), ylim = xlim)
}
return(list(grl.segs = grl.segs, window.segs = window.segs))
}
#' rel2abs
#'
#' rescales CN values from relative to "absolute" (i.e. per cancer cell copy) scale given purity and ploidy
#'
#' takes in gr with signal field "field"
#'
#' @param gr GRanges input with meta data field corresponding to the mean relative copy number in that interval (see field)
#' @param purity purity of sample
#' @param ploidy ploidy of sample
#' @param gamma gamma fit of solution (over-rides purity and ploidy)
#' @param beta beta fit of solution (over-rides purity and ploidy)
#' @param field meta data field in "gr" variable from which to extract signal, default "ratio"
#' @param field.ncn meta data field in "gr" variable from which to extract germline integer copy number, default "ncn"; if it doesn't exist, germline copy number is assumed to be two
#' @return
#' numeric vector of rescaled (absolute) copy numbers
#' @export
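#' @examples
#' ## hedged sketch: toy intervals with relative signal in $ratio
#' ## (purity 0.8, ploidy 4 gives gamma = 0.5, beta = 3.6 for these widths)
#' gr = GRanges(1, IRanges(c(1, 1001), width = 1000), ratio = c(1, 1.5))
#' rel2abs(gr, purity = 0.8, ploidy = 4, field = 'ratio') # 3.1 4.9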
rel2abs = function(gr, purity = NA, ploidy = NA, gamma = NA, beta = NA, field = 'ratio', field.ncn = 'ncn')
{
mu = values(gr)[, field]
mu[is.infinite(mu)] = NA
w = as.numeric(width(gr))
w[is.na(mu)] = NA
sw = sum(w, na.rm = T)
mutl = sum(mu * w, na.rm = T)
ncn = rep(2, length(mu))
if (!is.null(field.ncn))
if (field.ncn %in% names(values(gr)))
ncn = values(gr)[, field.ncn]
ploidy_normal = sum(w * ncn, na.rm = T) / sw ## this will be = 2 if ncn is trivially 2
if (is.na(gamma))
gamma = 2*(1-purity)/purity
if (is.na(beta))
beta = ((1-purity)*ploidy_normal + purity*ploidy) * sw / (purity * mutl)
# beta = (2*(1-purity)*sw + purity*ploidy*sw) / (purity * mutl)
# return(beta * mu - gamma)
return(beta * mu - ncn * gamma / 2)
}
#' \code{stats::aggregate}, but returns vector
#'
#' @description
#' Same as \code{stats::aggregate} except returns named vector
#' with names as first column of output and values as second
#'
#' Note: there is no need to ever use aggregate or vaggregate, just switch to data.table
#'
#' @param ... arguments to aggregate
#' @return named vector indexed by levels of "by"
#' @author Marcin Imielinski
#' @keywords internal
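#' @examples
#' ## sketch: named vector keyed by levels of the grouping variable
#' vaggregate(mpg ~ cyl, data = mtcars, FUN = mean)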
vaggregate = function(...)
{
out = aggregate(...);
return(structure(out[,ncol(out)], names = do.call(paste, lapply(names(out)[1:(ncol(out)-1)], function(x) out[,x]))))
}
#' @name setxor
#' @title setxor
#'
#' @param A vector specifying set A
#' @param B vector specifying set B
#' @export
#' @author Marcin Imielinski
#' @return elements in A or B that are not in the intersection of A and B
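#' @examples
#' setxor(1:4, 3:6) # 1 2 5 6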
setxor = function(A, B)
{
return(setdiff(union(A,B), intersect(A,B)))
}
#' @name read_xmap
#' @title read_xmap
#'
#' @description
#'
#' Reads xmap as GRangesList
#' using
#' https://bionanogenomics.com/wp-content/uploads/2017/03/30040-XMAP-File-Format-Specification-Sheet.pdf
#' as guide. The outputted GRangesList has one GRangesList Items per molecule, each
#' containing an ordered set of GRanges, each cooresponding to a mapped location
#' of a fluorescent marker in each molecule
#'
#' If lift = TRUE (default) then will lift markers to genome using the
#' affine transformation defined by the xmap i.e. scaling and
#' offset of query and reference coordinates. This transformation is defined by
#' QryStartPos, QryEndPos, RefStartPos, RefEndPos fields in the xmap.
#'
#' @param path path to xmap file
#' @param win only import ranges overlapping a given interval
#' @param merge logical flag specifying whether to merge the xmap with the cmaps
#' @param lift logical flag whether to lift the original marks to reference via the map implied by the mapping (TRUE), if false will just use the reference mark annotations
#' @param grl logical flag whether to return a GRangesList representing each molecule as an ordered walk (ie where markers are ordered according to the SiteId in the query cmap)
#' @param seqlevels vectors of reference seqlevels which is indexed by the 1-based integer RefContigID and CMapId in xmap and reference cmap, respectively. NOTE: seqlevels may need to be provided in order to output a GRanges that is compatible with a standard genome reference (eg 1,..,22, X, Y)
#' @author Marcin Imielinski
#' @export
read_xmap = function(path, win = NULL, merge = TRUE, lift = TRUE, grl = TRUE,
verbose = FALSE, seqlevels = NULL)
{
lines = readLines(path)
if (verbose)
message('loaded file')
## header column starts with #h, so we find then strip
header = gsub('^\\#h\\s+', '', grep('^\\#h', lines, value = TRUE))
## data are hashless lines
data = grep('^\\#', lines, value = TRUE, invert = TRUE)
if (verbose)
message('found header')
lift = merge & lift
if (!length(data))
{
warning('No data found in: ', path)
if (grl)
return(GRangesList())
else
return(GRanges())
}
## now concatenate, paste collapse so can feed into fread
dat = fread(paste(c(header, data), collapse = '\n'))
dat$listid = 1:nrow(dat) %>% as.character
if (verbose)
message('finished fread')
## split gr cols vs grl cols
cols = setdiff(names(dat), c('Alignment', 'HitEnum'))
## merge the alignments which will expand dat for every mark
dat.marks = dat[, cols, with = FALSE]
if (!is.null(seqlevels))
dat.marks[, RefContigID := seqlevels[RefContigID]]
if (!is.null(win))
{
cid = dat.marks[GRanges(RefContigID, IRanges(RefStartPos, RefEndPos)) %^% win, QryContigID] %>% unique
setkey(dat.marks, QryContigID)
dat.marks = dat.marks[.(cid), ]
if (verbose)
message('subsetted to region of interest')
}
if (merge)
{
## dat has one row per "alignment" ie marker set
## process alignment string
al = dunlist(strsplit(gsub('^\\(', '', dat$Alignment), '[\\(\\)]+'))
al = cbind(al, reshape::colsplit(al$V1, split = ',', names = c('refsite', 'querysite')))
al[, listid := as.character(listid)]
datal = as.data.table(merge(dat.marks, al[, .(listid, refsite, querysite)], by = 'listid'))
## read query and reference cmaps to merge ]
if (verbose)
message('reading in query cmap')
qcmap = read_cmap(gsub('\\.xmap$', '_q.cmap', path), gr = FALSE, seqlevels = seqlevels)
if (verbose)
message('reading in reference cmap')
rcmap = read_cmap(gsub('\\.xmap$', '_r.cmap', path), gr = FALSE, seqlevels = seqlevels)
setkeyv(qcmap, c("CMapId", "SiteID"))
setkeyv(rcmap, c("CMapId", "SiteID"))
## merge in query and reference cmap data
datal = cbind(datal, qcmap[.(datal$QryContigID, datal$querysite), ][, .(qpos = start)])
if (verbose)
message('merged xmap with query cmap')
datal = cbind(datal, rcmap[.(datal$RefContigID, datal$refsite), ][, .(rpos = start)])
if (verbose)
message('merged xmap with reference cmap')
}
else
{
datal = dat.marks
datal[, qpos := pmin(QryStartPos, QryEndPos)]
}
datal[, seqnames := RefContigID]
datal[, strand := Orientation]
## adjust rpos
if (merge)
{
datal = unique(datal, by = c('QryContigID', 'querysite'))
if (lift)
{
## first compute scaling (stretching) factor between molecule and reference
datal[, scale := abs(RefEndPos-RefStartPos)/abs(QryEndPos-QryStartPos)]
datal[, lpos := round(scale*abs(qpos-QryStartPos)+RefStartPos)]
datal[, ":="(start = lpos, end = lpos)]
if (verbose)
message('calculated lifted marker coordinates')
}
else
{
datal[, ":="(start = rpos, end = rpos)]
}
}
else
{
datal[, ":="(start = RefStartPos, end = RefEndPos)]
}
gr.cols = c('refsite', 'querysite', 'qpos', 'rpos','lpos', 'XmapEntryID', 'QryContigID', 'RefContigID', 'QryStartPos', 'QryEndPos', 'RefStartPos', 'RefEndPos', 'Orientation', 'Confidence', 'HitEnum') %>% intersect(names(datal))
## need to order the contig alignments with respect to query coordinate
setkeyv(datal, c("QryContigID", "qpos"))
gr = gr.fix(dt2gr(datal))
if (!grl)
return(gr)
if (verbose)
message('splitting into GRangesList')
grl = split(gr[, gr.cols], gr$QryContigID)
values(grl) = data.frame(contig = names(grl))
return(grl)
}
#' @name read_cmap
#' @title read_cmap
#'
#' @description
#'
#' Reads cmap as GRanges
#' using
#' https://bionanogenomics.com/wp-content/uploads/2017/03/30039-CMAP-File-Format-Specification-Sheet.pdf
#'
#' @param path path to cmap file
#' @author Marcin Imielinski
#' @export
read_cmap = function(path, gr = TRUE, seqlevels = NULL)
{
lines = readLines(path)
## header column starts with #h, so we find then strip
header = gsub('^\\#h\\s+', '', grep('^\\#h', lines, value = TRUE))
## data are hashless lines
data = grep('^\\#', lines, value = TRUE, invert = TRUE)
if (!length(data))
{
warning('No data found in cmap: ', path)
if (gr)
return(GRanges())
else
return(data.table())
}
## now concatenate, paste collapse so can feed into fread
dat = fread(paste(c(header, data), collapse = '\n'))
dat[, seqnames := CMapId]
dat[, start := Position]
dat[, end := Position]
if (!is.null(seqlevels))
dat$seqnames = seqlevels[dat$seqnames]
if (gr)
return(dt2gr(dat))
return(dat)
}
#' @name gNode.loose
#' @title gNode.loose
#'
#' @description internal
#'
#' Internal utility function to get a GRanges with the right or left loose ends.
#' This function serves as the backend of gNode$loose
#'
#' @param gn gNode object whose loose ends to extract
#' @param orientation (character) either 'right' or 'left', returning only the loose ends that are to the right or left of nodes, accordingly
#' @author Alon Shaiber
gNode.loose = function(gn, orientation)
{
if (!(orientation %in% c('right', 'left'))){
stop('Bad orientation: "', orientation, '". orientation must be either "right", or "left"')
}
if (!inherits(gn, 'gNode')){
stop('Input must be a gNode, but "', class(gn), '", was provided')
}
gn$check
loose.fields = c('cn', paste0('loose.cn.', orientation), 'index', 'snode.id', 'node.id')
new.names = c('node.cn', 'cn', 'index', 'snode.id', 'node.id')
names(new.names) = loose.fields
if (orientation == 'left'){
node.ids = which(gn$gr$loose.left>0)
l = gr.start(gn$gr[node.ids])
} else {
node.ids = which(gn$gr$loose.right>0)
l = gr.flipstrand(gr.end(gn$gr[node.ids]))
}
loose.fields.keep = intersect(names(mcols(l)), loose.fields)
l = l[, loose.fields.keep]
names(mcols(l)) = new.names[loose.fields.keep]
if (length(l) > 0){
# mark the orientation
l$orientation = orientation
deg = gn[node.ids]$loose.degree(orientation = orientation)
l$degree = deg
l$terminal = deg == 0
}
return(l)
}