R/intervals.R

Defines functions giterator.intervals giterator.cartesian_grid gintervals.union gintervals.summary gintervals.update gintervals.save gintervals.rm gintervals.rbind gintervals.quantiles repair_names gintervals.neighbors gintervals.mapply gintervals.ls gintervals.load_chain gintervals.load gintervals.liftover gintervals.is.bigset gintervals.chrom_sizes gintervals.intersect gintervals.import_genes gintervals.force_range gintervals.exists gintervals.diff gintervals.canonic gintervals.2d.band_intersect gintervals.all gintervals.2d.all gintervals.2d gintervals .gintervals.is2d .gintervals.is1d .gintervals.big.save .gintervals.big.save_meta .gintervals.big.meta .gintervals.small2big .gintervals.big2small .gintervals.big.is2d .gintervals.big.is1d .gintervals.loadable .gintervals.needs_bigset .gintervals.is_bigset .gintervals.save_file .gintervals.load_file .gintervals.load .gintervals.load_ext .gintervals.check_new_set .gintervals.apply .gintervals

Documented in gintervals gintervals.2d gintervals.2d.all gintervals.2d.band_intersect gintervals.all gintervals.canonic gintervals.chrom_sizes gintervals.diff gintervals.exists gintervals.force_range gintervals.import_genes gintervals.intersect gintervals.is.bigset gintervals.liftover gintervals.load gintervals.load_chain gintervals.ls gintervals.mapply gintervals.neighbors gintervals.quantiles gintervals.rbind gintervals.rm gintervals.save gintervals.summary gintervals.union gintervals.update giterator.cartesian_grid giterator.intervals

.gintervals <- function(chroms, starts, ends, strands) {
    if (is.null(strands)) {
        intervals <- data.frame(chrom = .gchroms(chroms), start = starts, end = ends)
    } else {
        intervals <- data.frame(chrom = .gchroms(chroms), start = starts, end = ends, strand = strands)
    }

    numintervals <- nrow(intervals)

    maxends <- get("ALLGENOME", envir = .misha)[[1]]$end[match(as.character(intervals$chrom), get("ALLGENOME", envir = .misha)[[1]]$chrom)]
    maxidx <- intervals$end == -1
    intervals$end[maxidx] <- maxends[maxidx]

    err.intervs <- intervals[intervals$start < 0, ]
    if (nrow(err.intervs) > 0) {
        stop(sprintf("Invalid interval (%s, %g, %g): start coordinate is out of range", err.intervs$chrom[1], err.intervs$start[1], err.intervs$end[1]), call. = FALSE)
    }

    err.intervs <- intervals[intervals$end > maxends, ]
    if (nrow(err.intervs) > 0) {
        stop(sprintf("Invalid interval (%s, %g, %g): end coordinate exceeds chromosome boundaries", err.intervs$chrom[1], err.intervs$start[1], err.intervs$end[1]), call. = FALSE)
    }

    err.intervs <- intervals[intervals$start >= intervals$end, ]
    if (nrow(err.intervs) > 0) {
        stop(sprintf("Invalid interval (%s, %g, %g): start coordinate exceeds or equals to end coordinate", err.intervs$chrom[1], err.intervs$start[1], err.intervs$end[1]), call. = FALSE)
    }

    if (!is.null(strands)) {
        if (!is.numeric(intervals$strand)) {
            stop("Invalid strand values", call. = FALSE)
        }

        err.intervs <- intervals[intervals$strand != as.integer(intervals$strand) | intervals$strand < -1 | intervals$strand > 1, ]
        if (nrow(err.intervs) > 0) {
            stop(sprintf("Invalid strand value %g of interval (%s, %g, %g)", err.intervs$strand[1], err.intervs$chrom[1], err.intervs$start[1], err.intervs$end[1]))
        }
    }

    intervals
}

.gintervals.apply <- function(chroms, intervals, intervals.set.out, FUN, ...) {
    if (!is.null(intervals.set.out)) {
        fullpath <- .gintervals.check_new_set(intervals.set.out)
    }

    if (is.data.frame(intervals)) {
        intervals <- list(intervals)
    }

    # sort chroms
    chroms$size <- NULL
    if ("chrom" %in% colnames(chroms)) {
        chroms <- data.frame(chrom = chroms[with(chroms, order(chrom)), ])
    } else {
        chroms <- chroms[with(chroms, order(chrom1, chrom2)), ]
    }


    # let's assume that if any of the input intervals sets is big then intervals.set.out should be big as well
    if (any(unlist(lapply(intervals, function(intervals) {
        .gintervals.is_bigset(intervals) || .gintervals.needs_bigset(intervals)
    })))) {
        stats <- NULL
        zeroline <- NULL
        success <- FALSE
        t <- Sys.time()
        progress.percentage <- -1
        tryCatch(
            {
                # if any of the source intervals sets is big then create the output intervals set big too
                if (!is.null(intervals.set.out)) {
                    dir.create(fullpath, recursive = TRUE, mode = "0777")
                }

                if (.gintervals.is1d(intervals[[1]])) {
                    mapply(function(chrom) {
                        loaded_intervals <- lapply(intervals, function(intervals) {
                            .gintervals.load_ext(intervals, chrom = chrom)
                        })
                        res <- do.call(FUN, list(loaded_intervals, ...))
                        if (!is.null(intervals.set.out) && !is.null(res) && nrow(res) > 0) {
                            zeroline <<- res[0, ]
                            .gintervals.big.save(fullpath, res, chrom = chrom)
                            stat <- .gcall("gintervals_stats", res, .misha_env())
                            stats <<- rbind(stats, data.frame(chrom = chrom, stat))
                        }
                        if (as.integer(difftime(Sys.time(), t, units = "secs")) > 3) {
                            t <<- Sys.time()
                            percentage <- as.integer(100 * match(chrom, chroms$chrom) / nrow(chroms))
                            if (percentage < 100 && progress.percentage != percentage) {
                                message(sprintf("%d%%...", percentage), appendLF = FALSE)
                                progress.percentage <<- percentage
                            }
                        }
                    }, chroms$chrom)
                } else {
                    mapply(function(chrom1, chrom2) {
                        loaded_intervals <- lapply(intervals, function(intervals) {
                            .gintervals.load_ext(intervals, chrom1 = chrom1, chrom2 = chrom2)
                        })
                        res <- do.call(FUN, list(loaded_intervals, ...))
                        if (!is.null(intervals.set.out) && !is.null(res) && nrow(res) > 0) {
                            zeroline <<- res[0, ]
                            .gintervals.big.save(fullpath, res, chrom1 = chrom1, chrom2 = chrom2)
                            stat <- .gcall("gintervals_stats", res, .misha_env())
                            stats <<- rbind(stats, data.frame(chrom1 = chrom1, chrom2 = chrom2, stat))
                        }
                        if (as.integer(difftime(Sys.time(), t, units = "secs")) > 3) {
                            t <<- Sys.time()
                            percentage <- as.integer(100 * which(chroms$chrom1 == chrom1 & chroms$chrom2 == chrom2) / nrow(chroms))
                            if (percentage < 100 && progress.percentage != percentage) {
                                message(sprintf("%d%%...", percentage), appendLF = FALSE)
                                progress.percentage <<- percentage
                            }
                        }
                    }, chroms$chrom1, chroms$chrom2)
                }

                if (!is.null(intervals.set.out)) {
                    if (is.null(stats)) {
                        return(retv <- NULL)
                    }
                    .gintervals.big.save_meta(fullpath, stats, zeroline)
                }

                if (progress.percentage >= 0) {
                    message("100%")
                }

                success <- TRUE

                # check whether the output intervals set needs to remain in big format
                if (!is.null(intervals.set.out) && !.gintervals.needs_bigset(intervals.set.out)) {
                    .gintervals.big2small(intervals.set.out)
                }
            },
            finally = {
                if (!success && !is.null(intervals.set.out)) {
                    unlink(fullpath, recursive = TRUE)
                }
            }
        )
    } else {
        loaded_intervals <- lapply(intervals, .gintervals.load_ext)
        res <- do.call(FUN, list(loaded_intervals, ...))
        if (!is.null(intervals.set.out) && !is.null(res) && nrow(res) > 0) {
            if (.gintervals.is1d(res)) {
                res <- res[order(res$chrom), ]
            } else {
                res <- res[order(res$chrom1, res$chrom2), ]
            }
            if (.gintervals.needs_bigset(res)) {
                .gintervals.small2big(intervals.set.out, res)
            } else {
                .gintervals.save_file(fullpath, res)
            }
        } else {
            return(NULL)
        }
    }

    # refresh the list of GINTERVS, etc.
    if (!is.null(intervals.set.out)) {
        .gdb.add_intervals.set(intervals.set.out)
    }
}

.gintervals.check_new_set <- function(intervals.set) {
    if (!is.na(match(intervals.set, get("GINTERVS", envir = .misha)))) {
        stop(sprintf("Intervals set %s already exists", intervals.set), call. = FALSE)
    }

    if (!length(grep("^[A-Za-z][\\w.]*$", intervals.set, perl = TRUE))) {
        stop("Invalid interval name %s. Only alphanumeric characters and _ are allowed in the name.")
    }

    path <- gsub(".", "/", intervals.set, fixed = TRUE)
    path <- paste(path, ".interv", sep = "")
    fullpath <- paste(get("GWD", envir = .misha), path, sep = "/")
    dir <- dirname(path)
    fulldir <- paste(get("GWD", envir = .misha), dir, sep = "/")

    if (!file.exists(fulldir)) {
        stop(sprintf("Directory %s does not exist", dir), call. = FALSE)
    }

    if (file.exists(fullpath)) {
        stop(sprintf("File %s already exists", path), call. = FALSE)
    }

    if (!is.na(match(intervals.set, get("GTRACKS", envir = .misha)))) {
        stop(sprintf("Track %s already exists", intervals.set), call. = FALSE)
    }

    if (!is.na(match(intervals.set, gvtrack.ls()))) {
        stop(sprintf("Virtual track %s already exists", intervals.set), call. = FALSE)
    }

    if (.ggetOption(".gautocompletion", FALSE) && exists(intervals.set, envir = .misha)) {
        stop(sprintf("Variable \"%s\" shadows the name of the new intervals set.\nPlease remove this variable from the environment or switch off autocompletion mode.", intervals.set), call. = FALSE)
    }

    fullpath
}

.gintervals.load_ext <- function(intervals.set = NULL, chrom = NULL, chrom1 = NULL, chrom2 = NULL, progress = FALSE) {
    if (is.null(intervals.set)) {
        stop("Usage: gintervals.load(intervals.set, chrom = NULL, chrom1 = NULL, chrom2 = NULL)", call. = FALSE)
    }
    .gcheckroot()

    if (is.character(intervals.set) && length(intervals.set) == 1 && is.na(match(intervals.set, get("GINTERVS", envir = .misha))) && is.na(match(intervals.set, get("GTRACKS", envir = .misha)))) {
        stop(sprintf("Intervals set %s does not exist", intervals.set), call. = FALSE)
    }

    .gintervals.load(intervals.set, chrom, chrom1, chrom2, progress)
}

.gintervals.load <- function(intervals.set = NULL, chrom = NULL, chrom1 = NULL, chrom2 = NULL, progress = FALSE) {
    if (!is.null(chrom)) {
        chrom <- .gchroms(chrom)
        if (length(chrom) > 1) {
            stop("chrom parameter should mark only one chromosome")
        }
    }

    if (!is.null(chrom1)) {
        chrom1 <- .gchroms(chrom1)
        if (length(chrom1) > 1) {
            stop("chrom1 parameter should mark only one chromosome")
        }
    }

    if (!is.null(chrom2)) {
        chrom2 <- .gchroms(chrom2)
        if (length(chrom2) > 1) {
            stop("chrom2 parameter should mark only one chromosome")
        }
    }

    if (!is.null(chrom) && !is.null(chrom1)) {
        stop("Cannot use chrom and chrom1 parameters in the same call", call. = FALSE)
    }

    if (!is.null(chrom) && !is.null(chrom2)) {
        stop("Cannot use chrom and chrom2 parameters in the same call", call. = FALSE)
    }

    if (is.character(intervals.set) && length(intervals.set) != 1 || !is.character(intervals.set) && !.gintervals.is1d(intervals.set) && !.gintervals.is2d(intervals.set)) {
        stop("Invalid format of intervals", call. = FALSE)
    }

    res <- NULL
    if (.gintervals.is_bigset(intervals.set)) {
        meta <- .gintervals.big.meta(intervals.set)
        zeroline <- meta$zeroline
        t <- Sys.time()
        progress.percentage <- -1

        if (.gintervals.big.is1d(intervals.set)) {
            if (!is.null(chrom1) || !is.null(chrom2)) {
                stop(sprintf("%s is a 1D big intervals set.\nchrom1 or chrom2 parameters can be applied only to 2D intervals.", intervals.set), call. = FALSE)
            }

            if (!is.null(chrom)) {
                meta$stats <- meta$stats[meta$stats$chrom == chrom, ]
            }

            if (!.gintervals.loadable(intervals.set, chrom = chrom)) {
                if (is.null(chrom)) {
                    stop(sprintf(
                        "Cannot load a big intervals set %s: its size (%d) exceeds the limit (%d) controlled by gmax.data.size option.\nFor big intervals sets only one chromosome pair can be loaded at a time.",
                        intervals.set, sum(meta$stats$size), .ggetOption("gmax.data.size", 10000000)
                    ), call. = FALSE)
                } else {
                    stop(sprintf(
                        "Cannot load chromosome %s of an intervals set %s: its size (%d) exceeds the limit (%d) controlled by gmax.data.size option.",
                        chrom, intervals.set, sum(meta$stats$size), .ggetOption("gmax.data.size", 10000000)
                    ), call. = FALSE)
                }
            }

            if (nrow(meta$stats) > 1) {
                res <- list(zeroline)
                lapply(
                    meta$stats$chrom,
                    function(chrom) {
                        loaded_intervs <- .gintervals.load_file(intervals.set, chrom = chrom)
                        if (!identical(sapply(loaded_intervs, "class"), sapply(zeroline, "class"))) {
                            stop(sprintf("Intervals set %s, chrom %s: invalid columns definition", intervals.set, chrom), call. = FALSE)
                        }
                        res <<- c(res, list(loaded_intervs))
                        if (as.integer(difftime(Sys.time(), t, units = "secs")) > 3) {
                            t <<- Sys.time()
                            percentage <- as.integer(100 * match(chrom, meta$stats$chrom) / length(meta$stats$chrom))
                            if (progress && percentage < 100 && progress.percentage != percentage) {
                                message(sprintf("%d%%...", percentage), appendLF = FALSE)
                                progress.percentage <<- percentage
                            }
                        }
                    }
                )
                res <- do.call(.grbind, res) # much faster than calling rbind incrementally in mapply
            } else if (nrow(meta$stats) == 1) {
                res <- .gintervals.load_file(intervals.set, chrom = meta$stat$chrom[1])
                if (!identical(sapply(res, "class"), sapply(zeroline, "class"))) {
                    stop(sprintf("Intervals set %s, chrom %s: invalid columns definition", intervals.set, meta$stat$chrom[1]), call. = FALSE)
                }
            } else {
                res <- meta$zeroline
            }
        } else {
            if (!is.null(chrom)) {
                stop(sprintf("%s is a 2D big intervals set.\nchrom parameter can be applied only to 1D intervals.", intervals.set), call. = FALSE)
            }

            if (!is.null(chrom1)) {
                meta$stats <- meta$stats[meta$stats$chrom1 == chrom1, ]
            }
            if (!is.null(chrom2)) {
                meta$stats <- meta$stats[meta$stats$chrom2 == chrom2, ]
            }

            if (!.gintervals.loadable(intervals.set, chrom1 = chrom1, chrom2 = chrom2)) {
                if (!is.null(chrom1) && !is.null(chrom2)) {
                    stop(sprintf(
                        "Cannot load chromosome pair (%s, %s) of an intervals set %s: its size (%d) exceeds the limit (%d) controlled by gmax.data.size option.",
                        chrom1, chrom2, intervals.set, sum(meta$stats$size), .ggetOption("gmax.data.size", 10000000)
                    ), call. = FALSE)
                } else if (!is.null(chrom1)) {
                    stop(sprintf(
                        "Cannot load chromosome %s of an intervals set %s: its size (%d) exceeds the limit (%d) controlled by gmax.data.size option.",
                        chrom1, intervals.set, sum(meta$stats$size), .ggetOption("gmax.data.size", 10000000)
                    ), call. = FALSE)
                } else if (!is.null(chrom2)) {
                    stop(sprintf(
                        "Cannot load chromosome %s of an intervals set %s: its size (%d) exceeds the limit (%d) controlled by gmax.data.size option.",
                        chrom2, intervals.set, sum(meta$stats$size), .ggetOption("gmax.data.size", 10000000)
                    ), call. = FALSE)
                } else {
                    stop(sprintf(
                        "Cannot load a big intervals set %s: its size (%d) exceeds the limit (%d) controlled by gmax.data.size option.\nFor big intervals sets only one chromosome pair can be loaded at a time.",
                        intervals.set, sum(meta$stats$size), .ggetOption("gmax.data.size", 10000000)
                    ), call. = FALSE)
                }
            }

            if (nrow(meta$stats) > 1) {
                res <- list(zeroline)
                mapply(
                    function(chrom1, chrom2) {
                        loaded_intervs <- .gintervals.load_file(intervals.set, chrom1 = chrom1, chrom2 = chrom2)
                        if (!identical(sapply(loaded_intervs, "class"), sapply(zeroline, "class"))) {
                            stop(sprintf("Interval set %s, chrom1 %s, chrom2 %s: invalid columns definition", intervals.set, chrom1, chrom2), call. = FALSE)
                        }
                        res <<- c(res, list(loaded_intervs))
                        if (as.integer(difftime(Sys.time(), t, units = "secs")) > 3) {
                            t <<- Sys.time()
                            percentage <- as.integer(100 * which(meta$stats$chrom1 == chrom1 & meta$stats$chrom2 == chrom2) / nrow(meta$stats))
                            if (progress && percentage < 100 && progress.percentage != percentage) {
                                message(sprintf("%d%%...", percentage), appendLF = FALSE)
                                progress.percentage <<- percentage
                            }
                        }
                    },
                    meta$stats$chrom1, meta$stats$chrom2
                )
                res <- do.call(.grbind, res) # much faster than calling rbind incrementally in mapply
            } else if (nrow(meta$stats) == 1) {
                res <- .gintervals.load_file(intervals.set, chrom1 = meta$stat$chrom1[1], chrom2 = meta$stat$chrom2[1])
                if (!identical(sapply(res, "class"), sapply(zeroline, "class"))) {
                    stop(sprintf("Interval set %s, chrom1 %s, chrom2 %s: invalid columns definition", intervals.set, meta$stat$chrom1[1], meta$stat$chrom2[1]), call. = FALSE)
                }
            } else {
                res <- meta$zeroline
            }
        }

        if (progress.percentage >= 0) {
            message("100%")
        }
    } else {
        if (is.character(intervals.set) && length(intervals.set) == 1) {
            res <- .gintervals.load_file(intervals.set)
        } else {
            res <- intervals.set
        }
        if (!is.null(res)) {
            if (!.gintervals.is1d(res) && !is.null(chrom)) {
                stop("chrom parameter can be applied only to 1D intervals", call. = FALSE)
            }

            if (!.gintervals.is2d(res) && (!is.null(chrom1) || !is.null(chrom2))) {
                stop("chrom1 or chrom2 parameters can be applied only to 2D intervals", call. = FALSE)
            }

            if (nrow(res) > 0) {
                if (!is.null(chrom)) {
                    res <- res[res$chrom == chrom, ]
                    if (nrow(res)) {
                        rownames(res) <- 1:nrow(res)
                    }
                }
                if (!is.null(chrom1)) {
                    res <- res[res$chrom1 == chrom1, ]
                    if (nrow(res)) {
                        rownames(res) <- 1:nrow(res)
                    }
                }
                if (!is.null(chrom2)) {
                    res <- res[res$chrom2 == chrom2, ]
                    if (nrow(res)) {
                        rownames(res) <- 1:nrow(res)
                    }
                }
            }
        }
    }
    res
}

.gintervals.load_file <- function(intervals.set, chrom = NULL, chrom1 = NULL, chrom2 = NULL) {
    intervfname <- sprintf("%s.interv", paste(get("GWD", envir = .misha), gsub("\\.", "/", intervals.set), sep = "/"))
    if (!is.null(chrom)) {
        chrom <- .gchroms(chrom)
        intervfname <- sprintf("%s/%s", intervfname, chrom)
    } else if (!is.null(chrom1) && !is.null(chrom2)) {
        chrom1 <- .gchroms(chrom1)
        chrom2 <- .gchroms(chrom2)
        intervfname <- sprintf("%s/%s-%s", intervfname, chrom1, chrom2)
    }

    if (intervals.set %in% get("GTRACKS", envir = .misha)) {
        .gcall("gtrack_intervals_load", intervals.set, chrom, chrom1, chrom2, .misha_env())
    } else {
        if (file.exists(intervfname) || (is.null(chrom) && is.null(chrom1) && is.null(chrom2))) {
            f <- file(intervfname, "rb")
            intervals.set <- unserialize(f)
            close(f)
            intervals.set
        } else {
            if (.gintervals.is_bigset(intervals.set)) {
                .gintervals.big.meta(intervals.set)$zeroline
            } else {
                stop(sprintf("File %s does not exist", intervfname), call. = FALSE)
            }
        }
    }
}

.gintervals.save_file <- function(filename, intervs) {
    if (nrow(intervs)) {
        if (.gintervals.is1d(intervs)) {
            intervs$chrom <- as.factor(intervs$chrom)
            point.intervs <- intervs$start == intervs$end
            intervs[point.intervs, ]$end <- intervs[point.intervs, ]$end + 1
        } else {
            intervs$chrom1 <- as.factor(intervs$chrom1)
            intervs$chrom2 <- as.factor(intervs$chrom2)
            point.intervs <- intervs$start1 == intervs$end1
            intervs[point.intervs, ]$end1 <- intervs[point.intervs, ]$end1 + 1
            point.intervs <- intervs$start2 == intervs$end2
            intervs[point.intervs, ]$end2 <- intervs[point.intervs, ]$end2 + 1
        }
    }

    f <- file(filename, "wb")
    serialize(intervs, f)
    close(f)
    nrow(intervs)
}

.gintervals.is_bigset <- function(intervals.set, err_if_non_exist = TRUE) {
    if (is.character(intervals.set) & length(intervals.set) == 1) {
        if (intervals.set %in% get("GTRACKS", envir = .misha)) {
            intervfname <- sprintf("%s.track", paste(get("GWD", envir = .misha), gsub("\\.", "/", intervals.set), sep = "/"))
        } else {
            intervfname <- sprintf("%s.interv", paste(get("GWD", envir = .misha), gsub("\\.", "/", intervals.set), sep = "/"))
        }
        if (file.exists(intervfname)) {
            if (file.info(intervfname)$isdir) {
                return(TRUE)
            }
        } else if (err_if_non_exist) {
            stop(sprintf("Intervals set %s does not exist", intervals.set), call. = FALSE)
        }
    }
    FALSE
}

.gintervals.needs_bigset <- function(intervals = NULL, size = NULL) {
    if (!is.null(intervals)) {
        chromsizes <- gintervals.chrom_sizes(intervals)
        nrow(chromsizes) && sum(chromsizes$size) > min(.ggetOption("gmax.data.size", 10000000), .ggetOption("gbig.intervals.size", 1000000))
    } else {
        size > min(.ggetOption("gmax.data.size", 10000000), .ggetOption("gbig.intervals.size", 1000000))
    }
}

.gintervals.loadable <- function(intervals = NULL, size = NULL, chrom = NULL, chrom1 = NULL, chrom2 = NULL) {
    if (!is.null(intervals)) {
        chromsizes <- gintervals.chrom_sizes(intervals)
        if (nrow(chromsizes)) {
            if (!is.null(chrom)) {
                chromsizes <- chromsizes[chromsizes$chrom == chrom, ]
            }
            if (!is.null(chrom1)) {
                chromsizes <- chromsizes[chromsizes$chrom1 == chrom1, ]
            }
            if (!is.null(chrom2)) {
                chromsizes <- chromsizes[chromsizes$chrom2 == chrom2, ]
            }
        }
        !nrow(chromsizes) || sum(chromsizes$size) <= .ggetOption("gmax.data.size", 10000000)
    } else {
        size <= .ggetOption("gmax.data.size", 10000000)
    }
}

.gintervals.big.is1d <- function(intervals.set) {
    "chrom" %in% colnames(.gintervals.big.meta(intervals.set)$stats)
}

.gintervals.big.is2d <- function(intervals.set) {
    "chrom1" %in% colnames(.gintervals.big.meta(intervals.set)$stats)
}

.gintervals.big2small <- function(intervals.set) {
    # We assume that writing the intervals might be a lengthy process.
    # During this time the process might get interrupted leaving the intervals set in incomplete state.
    # Even though it's not fully transaction-safe, we prefer to create a temporary file and then move it hoping it's fast enough.
    path <- gsub(".", "/", intervals.set, fixed = TRUE)
    path <- paste(get("GWD", envir = .misha), path, sep = "/")
    path <- paste(path, ".interv", sep = "")

    intervals <- .gintervals.load(intervals.set)
    tmpfilename <- tempfile(".", dirname(path)) # tmpdir = the parent directory of intervals set, otherwise rename might not work
    # (tmpdir might be at another file system)
    file.rename(path, tmpfilename)
    success <- FALSE
    tryCatch(
        {
            .gintervals.save_file(path, intervals)
            success <- TRUE
        },
        finally = {
            if (!success) {
                unlink(path, recursive = TRUE)
                file.rename(tmpfilename, path)
            }
            unlink(tmpfilename, recursive = TRUE)
        }
    )
}

.gintervals.small2big <- function(intervals.set, intervals = NULL) {
    # We assume that writing the intervals might be a lengthy process.
    # During this time the process might get interrupted leaving the intervals set in incomplete state.
    # Even though it's not fully transaction-safe, we prefer to create a temporary file and then move it hoping it's fast enough.
    if (is.null(intervals)) {
        intervals <- .gintervals.load(intervals.set)
    }

    path <- gsub(".", "/", intervals.set, fixed = TRUE)
    path <- paste(get("GWD", envir = .misha), path, sep = "/")
    path <- paste(path, ".interv", sep = "")

    tmpfilename <- tempfile(".", dirname(path)) # tmpdir = the parent directory of intervals set, otherwise rename might not work
    # (tmpdir might be at another file system)
    file.rename(path, tmpfilename)
    gintervs <- get("GINTERVS", envir = .misha)
    assign("GINTERVS", gintervs[gintervs != intervals.set], envir = .misha)
    success <- FALSE
    tryCatch(
        {
            .gcall_noninteractive(gintervals.save, intervals.set, intervals)
            success <- TRUE
        },
        finally = {
            if (!success) {
                unlink(path, recursive = TRUE)
                file.rename(tmpfilename, path)
                gintervs <- c(gintervs, intervals.set)
                gintervs <- unique(gintervs)
                sort(gintervs)
                assign("GINTERVS", gintervs, envir = .misha)
            }
            unlink(tmpfilename, recursive = TRUE)
        }
    )
}

.gintervals.big.meta <- function(intervals.set) {
    metafname <- ""
    if (intervals.set %in% get("GTRACKS", envir = .misha)) {
        metafname <- sprintf("%s.track/.meta", paste(get("GWD", envir = .misha), gsub("\\.", "/", intervals.set), sep = "/"))
        if (!file.exists(metafname)) {
            .gcall("gtrack_create_meta", intervals.set, .misha_env())
        }
    } else {
        metafname <- sprintf("%s.interv/.meta", paste(get("GWD", envir = .misha), gsub("\\.", "/", intervals.set), sep = "/"))
    }
    f <- file(metafname, "rb")
    res <- unserialize(f)
    close(f)
    res
}

.gintervals.big.save_meta <- function(path, stats, zeroline) {
    f <- file(sprintf("%s/.meta", path), "wb")
    serialize(list(stats = stats, zeroline = zeroline), f)
    close(f)
}

.gintervals.big.save <- function(path, intervs, chrom = NULL, chrom1 = NULL, chrom2 = NULL) {
    if (!is.null(chrom)) {
        filename <- sprintf("%s/%s", path, chrom)
    } else {
        filename <- sprintf("%s/%s-%s", path, chrom1, chrom2)
    }

    if (is.null(intervs) || nrow(intervs) == 0) {
        unlink(filename, recursive = TRUE)
    } else {
        if (!is.null(chrom)) {
            intervs <- intervs[intervs$chrom == chrom, ]
        } else {
            intervs <- intervs[intervs$chrom1 == chrom1 & intervs$chrom2 == chrom2, ]
        }
        .gintervals.save_file(filename, intervs)
    }
}

.gintervals.is1d <- function(intervals) {
    if (is.character(intervals)) {
        if (.gintervals.is_bigset(intervals)) {
            return(.gintervals.big.is1d(intervals))
        }
        intervals <- .gintervals.load(intervals)
    }
    all(colnames(intervals)[1:3] == c("chrom", "start", "end"))
}

.gintervals.is2d <- function(intervals) {
    if (is.character(intervals)) {
        if (.gintervals.is_bigset(intervals)) {
            return(.gintervals.big.is2d(intervals))
        }
        intervals <- .gintervals.load(intervals)
    }
    all(colnames(intervals)[1:6] == c("chrom1", "start1", "end1", "chrom2", "start2", "end2"))
}



#' Creates a set of 1D intervals
#'
#' Creates a set of 1D intervals.
#'
#' This function returns a set of one-dimensional intervals. The returned value
#' can be used in all functions that accept 'intervals' argument.
#'
#' One-dimensional intervals is a data frame whose first three columns are
#' 'chrom', 'start' and 'end'. Each row of the data frame represents a genomic
#' interval of the specified chromosome in the range of [start, end).
#' Additional columns can be presented in 1D intervals object yet these columns
#' must be added after the three obligatory ones.
#'
#' If 'strands' argument is not 'NULL' an additional column "strand" is added
#' to the intervals. The possible values of a strand can be '1' (plus strand),
#' '-1' (minus strand) or '0' (unknown).
#'
#' @param chroms chromosomes - an array of strings with or without "chr"
#' prefixes or an array of integers (like: '1' for "chr1")
#' @param starts an array of start coordinates
#' @param ends an array of end coordinates. If '-1' chromosome size is assumed.
#' @param strands 'NULL' or an array consisting of '-1', '0' or '1' values
#' @return A data frame representing the intervals.
#' @seealso \code{\link{gintervals.2d}}, \code{\link{gintervals.force_range}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' ## the following 3 calls produce identical results
#' gintervals(1)
#' gintervals("1")
#' gintervals("chrX")
#'
#' gintervals(1, 1000)
#' gintervals(c("chr2", "chrX"), 10, c(3000, 5000))
#'
#' @export gintervals
gintervals <- function(chroms = NULL, starts = 0, ends = -1, strands = NULL) {
    if (is.null(chroms)) {
        stop("Usage: gintervals(chroms, starts = 0, ends = -1, strands = NULL)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- .gintervals(chroms, starts, ends, strands)
    .gcall("gintervsort", intervals, .misha_env())
}



#' Creates a set of 2D intervals
#'
#' Creates a set of 2D intervals.
#'
#' This function returns a set of two-dimensional intervals. The returned value
#' can be used in all functions that accept 'intervals' argument.
#'
#' Two-dimensional intervals is a data frame whose first six columns are
#' 'chrom1', 'start1', 'end1', 'chrom2', 'start2' and 'end2'. Each row of the
#' data frame represents two genomic intervals from two chromosomes in the
#' range of [start, end). Additional columns can be presented in 2D intervals
#' object yet these columns must be added after the six obligatory ones.
#'
#' @param chroms1 chromosomes1 - an array of strings with or without "chr"
#' prefixes or an array of integers (like: '1' for "chr1")
#' @param starts1 an array of start1 coordinates
#' @param ends1 an array of end1 coordinates. If '-1' chromosome size is
#' assumed.
#' @param chroms2 chromosomes2 - an array of strings with or without "chr"
#' prefixes or an array of integers (like: '1' for "chr1"). If 'NULL',
#' 'chroms2' is assumed to be equal to 'chroms1'.
#' @param starts2 an array of start2 coordinates
#' @param ends2 an array of end2 coordinates. If '-1' chromosome size is
#' assumed.
#' @return A data frame representing the intervals.
#' @seealso \code{\link{gintervals}}, \code{\link{gintervals.force_range}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' ## the following 3 calls produce identical results
#' gintervals.2d(1)
#' gintervals.2d("1")
#' gintervals.2d("chrX")
#'
#' gintervals.2d(1, 1000, 2000, "chrX", 400, 800)
#' gintervals.2d(c("chr2", "chrX"), 10, c(3000, 5000), 1)
#'
#' @export gintervals.2d
gintervals.2d <- function(chroms1 = NULL, starts1 = 0, ends1 = -1, chroms2 = NULL, starts2 = 0, ends2 = -1) {
    if (is.null(chroms1)) {
        stop("Usage: gintervals.2d(chroms1, starts1 = 0, ends1 = -1, chroms2 = NULL, starts2 = 0, ends2 = -1)", call. = FALSE)
    }
    .gcheckroot()

    if (is.null(chroms2)) {
        chroms2 <- chroms1
    }

    intervals1 <- .gintervals(chroms1, starts1, ends1, NULL)
    intervals2 <- .gintervals(chroms2, starts2, ends2, NULL)

    intervals <- data.frame(
        chrom1 = intervals1$chrom, start1 = intervals1$start, end1 = intervals1$end,
        chrom2 = intervals2$chrom, start2 = intervals2$start, end2 = intervals2$end
    )

    .gcall("gintervsort", intervals, .misha_env())
}



#' Returns 2D intervals that cover the whole genome
#'
#' Returns 2D intervals that cover the whole genome.
#'
#' This function returns a set of two-dimensional intervals that cover the
#' whole genome as it is defined by 'chrom_sizes.txt' file.
#'
#' @return A data frame representing the intervals.
#' @seealso \code{\link{gintervals.2d}}
#' @keywords ~genome ~chromosome ~chromosomes ~ALLGENOME
#' @export gintervals.2d.all
gintervals.2d.all <- function() {
    .gcheckroot()
    get("ALLGENOME", envir = .misha)[[2]]
}



#' Returns 1D intervals that cover the whole genome
#'
#' Returns 1D intervals that cover the whole genome.
#'
#' This function returns a set of one-dimensional intervals that cover the
#' whole genome as it is defined by 'chrom_sizes.txt' file.
#'
#' @return A data frame representing the intervals.
#' @seealso \code{\link{gintervals}}
#' @keywords ~genome ~chromosome ~chromosomes ~ALLGENOME
#' @export gintervals.all
gintervals.all <- function() {
    .gcheckroot()
    get("ALLGENOME", envir = .misha)[[1]]
}



#' Intersects two-dimensional intervals with a band
#'
#' Intersects two-dimensional intervals with a band.
#'
#' This function intersects each two-dimensional interval from 'intervals' with
#' 'band'. If the intersection is not empty, the interval is shrunk to the
#' minimal rectangle that contains the band and added to the return value.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param intervals two-dimensional intervals
#' @param band track expression band. If 'NULL' no band is used.
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame representing the
#' intervals.
#' @seealso \code{\link{gintervals.2d}}, \code{\link{gintervals.intersect}}
#' @keywords ~band ~intersect
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' gintervals.2d.band_intersect(gintervals.2d(1), c(10000, 20000))
#'
#' @export gintervals.2d.band_intersect
gintervals.2d.band_intersect <- function(intervals = NULL, band = NULL, intervals.set.out = NULL) {
    if (is.null(intervals)) {
        stop("Usage: gintervals.2d.band_intersect(intervals, band = NULL, intervals.set.out = NULL)", call. = FALSE)
    }

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (!is.null(intervals.set.out)) {
        fullpath <- .gintervals.check_new_set(intervals.set.out)
    }

    # intervals can be NULL if gextract is piped with gscreen and the latter returns NULL
    success <- FALSE
    res <- NULL
    tryCatch(
        {
            if (!is.null(intervals)) {
                res <- .gcall("ginterv_intersectband", intervals, band, intervals.set.out, .misha_env())
                if (!is.null(intervals.set.out) && .gintervals.is_bigset(intervals.set.out, FALSE) && !.gintervals.needs_bigset(intervals.set.out)) {
                    .gintervals.big2small(intervals.set.out)
                }
            }
            success <- TRUE
        },
        finally = {
            if (!success && !is.null(intervals.set.out)) {
                unlink(fullpath, recursive = TRUE)
            }
        }
    )

    # refresh the list of GINTERVS, etc.
    if (!is.null(intervals.set.out)) {
        .gdb.add_intervals.set(intervals.set.out)
        retv <- 0 # suppress return value
    } else {
        res
    }
}



#' Converts intervals to canonic form
#'
#' Converts intervals to canonic form.
#'
#' This function converts 'intervals' into a "canonic" form: properly sorted
#' with no overlaps. The result can be used later in the functions that require
#' the intervals to be in canonic form. Use 'unify_touching_intervals' to
#' control whether the intervals that touch each other (i.e. the end coordinate
#' of one equals to the start coordinate of the other) are unified.
#' 'unify_touching_intervals' is ignored if two-dimensional intervals are used.
#'
#' Since 'gintervals.canonic' unifies overlapping or touching intervals, the
#' number of the returned intervals might be less than the number of the
#' original intervals. To allow the user to find the origin of the new interval
#' 'mapping' attribute is attached to the result. It maps between the original
#' intervals and the resulted intervals. Use 'attr(retv_of_gintervals.canonic,
#' "mapping")' to retrieve the map.
#'
#' @param intervals intervals to be converted
#' @param unify_touching_intervals if 'TRUE', touching one-dimensional
#' intervals are unified
#' @return A data frame representing the canonic intervals and an attribute
#' 'mapping' that maps the original intervals to the resulted ones.
#' @seealso \code{\link{gintervals}}, \code{\link{gintervals.2d}}
#' @keywords ~intervals ~canonic
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' ## Create intervals manually by using 'data.frame'.
#' ## Note that we add an additional column 'data'.
#' ## Return value:
#' ##   chrom start   end data
#' ## 1  chr1 11000 12000   10
#' ## 2  chr1   100   200   20
#' ## 3  chr1 10000 13000   30
#' ## 4  chr1 10500 10600   40
#' intervs <- data.frame(
#'     chrom = "chr1",
#'     start = c(11000, 100, 10000, 10500),
#'     end = c(12000, 200, 13000, 10600),
#'     data = c(10, 20, 30, 40)
#' )
#'
#' ## Convert the intervals into the canonic form.
#' ## The function discards any columns besides chrom, start and end.
#' ## Return value:
#' ##  chrom start   end
#' ## 1  chr1   100   200
#' ## 2  chr1 10000 13000
#' res <- gintervals.canonic(intervs)
#'
#' ## By inspecting mapping attribute we can see how the new
#' ## intervals were created: "2 1 2 2" means that the first
#' ## interval in the result was created from the second interval in
#' ## the original set (we look for the indices in mapping where "1"
#' ## appears). Likewise the second interval in the result was
#' ## created from 3 intervals in the original set. Their indices are
#' ## 1, 3 and 4 (once again we look for the indices in mapping where
#' ## "2" appears).
#' ## Return value:
#' ## 2 1 2 2
#' attr(res, "mapping")
#'
#' ## Finally (and that is the most useful part of 'mapping'
#' ## attribute): we add a new column 'data' to our result which is
#' ## the mean value of the original data column. The trick is done
#' ## using 'tapply' on par with 'mapping' attribute. For example,
#' ## 20.00000 equals is a result of 'mean(intervs[2,]$data' while
#' ## 26.66667 is a result of 'mean(intervs[c(1,3,4),]$data)'.
#' ## 'res' after the following call:
#' ##  chrom start   end     data
#' ## 1  chr1   100   200 20.00000
#' ## 2  chr1 10000 13000 26.66667
#' res$data <- tapply(intervs$data, attr(res, "mapping"), mean)
#'
#' @export gintervals.canonic
gintervals.canonic <- function(intervals = NULL, unify_touching_intervals = TRUE) {
    if (is.null(intervals)) {
        stop("Usage: gintervals.canonic(intervals, unify_touching_intervals = TRUE)", call. = FALSE)
    }

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    res <- .gcall("gintervcanonic", intervals, unify_touching_intervals, .misha_env())
    res
}



#' Calculates difference of two intervals sets
#'
#' Returns difference of two sets of intervals.
#'
#' This function returns a genomic space that is covered by 'intervals1' but
#' not covered by 'intervals2'.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param intervals1,intervals2 set of one-dimensional intervals
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame representing the
#' intervals.
#' @seealso \code{\link{gintervals}}, \code{\link{gintervals.intersect}},
#' \code{\link{gintervals.union}}
#' @keywords ~diff
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' intervs1 <- gscreen("dense_track > 0.15")
#' intervs2 <- gscreen("dense_track < 0.2")
#'
#' ## 'res3' equals to 'res4'
#' res3 <- gintervals.diff(intervs1, intervs2)
#' res4 <- gscreen("dense_track >= 0.2")
#'
#' @export gintervals.diff
gintervals.diff <- function(intervals1 = NULL, intervals2 = NULL, intervals.set.out = NULL) {
    if (is.null(intervals1) || is.null(intervals2)) {
        stop("Usage: gintervals.diff(intervals1, intervals2, intervals.set.out = NULL)", call. = FALSE)
    }

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (.gintervals.is_bigset(intervals1) || .gintervals.is_bigset(intervals2) || !is.null(intervals.set.out)) {
        res <- NULL

        FUN <- function(intervals, intervals.set.out, envir) {
            intervals1 <- intervals[[1]]
            intervals2 <- intervals[[2]]
            chrom_res <- .gcall("gintervdiff", intervals1, intervals2, .misha_env())
            if (!is.null(chrom_res) && nrow(chrom_res) > 0) {
                if (is.null(intervals.set.out)) {
                    assign("res", c(get("res", envir = envir), list(chrom_res)), envir = envir)
                    .gverify_max_data_size(sum(unlist(lapply(get("res", envir), nrow))), arguments = "intervals.set.out")
                }
            }
            chrom_res
        }

        .gintervals.apply(gintervals.chrom_sizes(intervals1), list(intervals1, intervals2), intervals.set.out, FUN, intervals.set.out, environment())

        if (!is.null(res)) {
            res <- do.call(.grbind, res)
        } # much faster than calling rbind incrementally in FUN

        if (is.null(intervals.set.out)) {
            if (!is.null(res) && nrow(res)) {
                res
            } else {
                NULL
            }
        } else {
            retv <- 0
        } # suppress return value
    } else {
        res <- .gcall("gintervdiff", intervals1, intervals2, .misha_env())
        res
    }
}



#' Tests for a named intervals set existence
#'
#' Tests for a named intervals set existence.
#'
#' This function returns 'TRUE' if a named intervals set exists in Genomic
#' Database.
#'
#' @param intervals.set name of an intervals set
#' @return 'TRUE' if a named intervals set exists. Otherwise 'FALSE'.
#' @seealso \code{\link{gintervals.ls}}, \code{\link{gintervals.load}},
#' \code{\link{gintervals.rm}}, \code{\link{gintervals.save}},
#' \code{\link{gintervals}}, \code{\link{gintervals.2d}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' gintervals.exists("annotations")
#'
#' @export gintervals.exists
gintervals.exists <- function(intervals.set = NULL) {
    if (is.null(substitute(intervals.set))) {
        stop("Usage: gintervals.exists(intervals.set)", call. = FALSE)
    }
    .gcheckroot()

    intervals.set <- do.call(.gexpr2str, list(substitute(intervals.set)), envir = parent.frame())
    !is.na(match(intervals.set, get("GINTERVS", envir = .misha)))
}



#' Limits intervals to chromosomal range
#'
#' Limits intervals to chromosomal range.
#'
#' This function enforces the intervals to be within the chromosomal range [0,
#' chrom length) by altering the intervals' boundaries. Intervals that lay
#' entirely outside of the chromosomal range are eliminated. The new intervals
#' are returned.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param intervals intervals
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame representing the
#' intervals.
#' @seealso \code{\link{gintervals}}, \code{\link{gintervals.2d}},
#' \code{\link{gintervals.canonic}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' intervs <- data.frame(
#'     chrom = "chr1",
#'     start = c(11000, -100, 10000, 10500),
#'     end = c(12000, 200, 13000000, 10600)
#' )
#' gintervals.force_range(intervs)
#'
#' @export gintervals.force_range
gintervals.force_range <- function(intervals = NULL, intervals.set.out = NULL) {
    if (is.null(substitute(intervals))) {
        stop("Usage: gintervals.force_range(intervals, intervals.set.out = NULL)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    res <- NULL
    FUN <- function(intervals, intervals.set.out, envir) {
        intervals <- intervals[[1]]
        if (.gintervals.is1d(intervals)) {
            intervals$start <- pmax(intervals$start, 0)
            intervals$end <- pmin(intervals$end, gintervals.all()[match(intervals$chrom, gintervals.all()$chrom), ]$end)
            intervals <- intervals[!is.na(intervals$end) & intervals$start < intervals$end, ]
        } else {
            intervals$start1 <- pmax(intervals$start1, 0)
            intervals$end1 <- pmin(intervals$end1, gintervals.all()[match(intervals$chrom1, gintervals.all()$chrom), ]$end)
            intervals$start2 <- pmax(intervals$start2, 0)
            intervals$end2 <- pmin(intervals$end2, gintervals.all()[match(intervals$chrom2, gintervals.all()$chrom), ]$end)
            intervals <- intervals[!is.na(intervals$end1) & !is.na(intervals$end2) & intervals$start1 < intervals$end1 & intervals$start2 < intervals$end2, ]
        }
        if (is.null(intervals.set.out)) {
            assign("res", c(get("res", envir = envir), list(intervals)), envir = envir)
            .gverify_max_data_size(sum(unlist(lapply(get("res", envir), nrow))), arguments = "intervals.set.out")
        }
        intervals
    }

    .gintervals.apply(gintervals.chrom_sizes(intervals), intervals, intervals.set.out, FUN, intervals.set.out, environment())

    if (!is.null(res)) {
        res <- do.call(.grbind, res)
    } # much faster than calling rbind incrementally in FUN

    if (is.null(intervals.set.out)) {
        if (!is.null(res) && nrow(res)) {
            res
        } else {
            NULL
        }
    } else {
        retv <- 0
    } # suppress return value
}



#' Imports genes and annotations from files
#'
#' Imports genes and annotations from files.
#'
#' This function reads a definition of genes from 'genes.file' and returns four
#' sets of intervals: TSS, exons, 3utr and 5utr. In addition to the regular
#' intervals columns 'strand' column is added. It contains '1' values for '+'
#' strands and '-1' values for '-' strands.
#'
#' If annotation file 'annots.file' is given then annotations are attached too
#' to the intervals. The names of the annotations as they would appear in the
#' return value must be specified in 'annots.names' argument.
#'
#' Both 'genes.file' and 'annots.file' can be either a file path or URL in a
#' form of 'ftp://[address]/[file]'. Files that these arguments point to can be
#' zipped or unzipped.
#'
#' Examples of 'genes.file' and 'annots.file' can be found here:
#'
#' \code{ftp://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/knownGene.txt.gz}
#' \code{ftp://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/kgXref.txt.gz}
#'
#' If a few intervals overlap (for example: two TSS regions) they are all
#' unified to an interval that covers the whole overlapping region. 'strand'
#' value is set to '0' if two or more of the overlapping intervals have
#' different strands. The annotations of the overlapping intervals are
#' concatenated to a single character string separated by semicolons. Identical
#' values of overlapping intervals' annotation are eliminated.
#'
#' @param genes.file name or URL of file that contains genes
#' @param annots.file name of URL file that contains annotations. If 'NULL' no
#' annotations are imported
#' @param annots.names annotations names
#' @return A list of four intervals sets named 'tss', 'exons', 'utr3' and
#' 'utr5'. 'strand' column and annotations are attached to the intevals.
#' @seealso \code{\link{gintervals}}, \code{\link{gdb.create}}
#' @keywords ~intervals ~import ~genes
#' @export gintervals.import_genes
gintervals.import_genes <- function(genes.file = NULL, annots.file = NULL, annots.names = NULL) {
    if (is.null(genes.file)) {
        stop("Usage: gintervals.import_genes(genes.file, annots.file = NULL, annots.names = NULL)", call. = FALSE)
    }

    tmp.dirname <- tempfile(pattern = "", tmpdir = paste(get("GROOT", envir = .misha), "/downloads", sep = ""))
    if (!dir.create(tmp.dirname, recursive = TRUE, mode = "0777")) {
        stop(sprintf("Failed to create a directory %s", tmp.dirname), call. = FALSE)
    }

    files <- list(genes.file, annots.file)
    file.types <- c("genes.file", "annots.file")

    tryCatch(
        {
            for (i in 1:length(files)) {
                if (is.null(files[[i]])) {
                    next
                }

                protocol <- "ftp://"
                if (substr(files[[i]], 1, nchar(protocol)) == protocol) {
                    # ftp
                    f <- gwget(files[[i]], tmp.dirname)
                    if (length(f) != 1) {
                        stop(sprintf("More than one file matches %s argument", file.types[i]), call. = FALSE)
                    }
                    files[[i]] <- f
                }

                if (length(grep("^.+\\.gz$", files[[i]], perl = TRUE))) {
                    f.unzipped <- basename(gsub("^(.+)\\.gz$", "\\1", files[[i]], perl = TRUE))
                    f.unzipped <- paste(tmp.dirname, "/", f.unzipped, sep = "")
                    cmd <- paste("/bin/sh -c \"gunzip -q -c", files[[i]], ">", f.unzipped, "\"")
                    if (system(cmd)) {
                        stop(sprintf("Command failed: %s", cmd), call. = FALSE)
                    }
                    files[[i]] <- f.unzipped
                }
            }

            res <- .gcall("gintervals_import_genes", files[[1]], files[[2]], annots.names, .misha_env())
            res
        },
        finally = {
            unlink(tmp.dirname, recursive = TRUE)
        }
    )
}



#' Calculates an intersection of two sets of intervals
#'
#' Calculates an intersection of two sets of intervals.
#'
#' This function returns intervals that represent a genomic space which is
#' achieved by intersection of 'intervals1' and 'intervals2'.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param intervals1,intervals2 set of intervals
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame representing the
#' intersection of intervals.
#' @seealso \code{\link{gintervals.2d.band_intersect}},
#' \code{\link{gintervals.diff}}, \code{\link{gintervals.union}},
#' \code{\link{gintervals}}, \code{\link{gintervals.2d}}
#' @keywords ~intersect
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' intervs1 <- gscreen("dense_track > 0.15")
#' intervs2 <- gscreen("dense_track < 0.2")
#'
#' ## 'intervs3' and 'intervs4' are identical
#' intervs3 <- gintervals.intersect(intervs1, intervs2)
#' intervs4 <- gscreen("dense_track > 0.15 & dense_track < 0.2")
#'
#' @export gintervals.intersect
gintervals.intersect <- function(intervals1 = NULL, intervals2 = NULL, intervals.set.out = NULL) {
    if (is.null(intervals1) || is.null(intervals2)) {
        stop("Usage: gintervals.intersect(intervals1, intervals2, intervals.set.out = NULL)", call. = FALSE)
    }

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (.gintervals.is_bigset(intervals1) || .gintervals.is_bigset(intervals2) || !is.null(intervals.set.out)) {
        res <- NULL

        FUN <- function(intervals, intervals.set.out, envir) {
            intervals1 <- intervals[[1]]
            intervals2 <- intervals[[2]]
            chrom_res <- .gcall("gintervintersect", intervals1, intervals2, .misha_env())
            if (!is.null(chrom_res) && nrow(chrom_res) > 0) {
                if (is.null(intervals.set.out)) {
                    assign("res", c(get("res", envir = envir), list(chrom_res)), envir = envir)
                    .gverify_max_data_size(sum(unlist(lapply(get("res", envir), nrow))), arguments = "intervals.set.out")
                }
            }
            chrom_res
        }

        chroms1 <- gintervals.chrom_sizes(intervals1)
        chroms1$size <- NULL
        chroms2 <- gintervals.chrom_sizes(intervals2)
        chroms2$size <- NULL
        .gintervals.apply(merge(chroms1, chroms2), list(intervals1, intervals2), intervals.set.out, FUN, intervals.set.out, environment())

        if (!is.null(res)) {
            res <- do.call(.grbind, res)
        } # much faster than calling rbind incrementally in FUN

        if (is.null(intervals.set.out)) {
            if (!is.null(res) && nrow(res)) {
                res
            } else {
                NULL
            }
        } else {
            retv <- 0
        } # suppress return value
    } else {
        res <- .gcall("gintervintersect", intervals1, intervals2, .misha_env())
        res
    }
}



#' Returns number of intervals per chromosome
#'
#' Returns number of intervals per chromosome (or chromosome pair).
#'
#' This function returns number of intervals per chromosome (for 1D intervals)
#' or chromosome pair (for 2D intervals).
#'
#' @param intervals intervals set
#' @return Data frame representing number of intervals per chromosome (for 1D
#' intervals) or chromosome pair (for 2D intervals).
#' @seealso \code{\link{gintervals.load}}, \code{\link{gintervals.save}},
#' \code{\link{gintervals.exists}}, \code{\link{gintervals.ls}},
#' \code{\link{gintervals}}, \code{\link{gintervals.2d}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' gintervals.chrom_sizes("annotations")
#'
#' @export gintervals.chrom_sizes
gintervals.chrom_sizes <- function(intervals = NULL) {
    if (is.null(intervals)) {
        stop("Usage: gintervals.chrom_sizes(intervals)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    if (.gintervals.is_bigset(intervals)) {
        if (.gintervals.big.is1d(intervals)) {
            stats <- .gintervals.big.meta(intervals)$stats
            res <- stats[, match(c("chrom", "size"), colnames(stats))]
        } else {
            stats <- .gintervals.big.meta(intervals)$stats
            res <- stats[, match(c("chrom1", "chrom2", "size"), colnames(stats))]
        }
    } else {
        res <- .gcall("gintervals_chrom_sizes", .gintervals.load_ext(intervals), .misha_env())
    }

    if (nrow(res) > 1) {
        rownames(res) <- 1:nrow(res)
    }
    res
}



#' Tests for big intervals set
#'
#' Tests for big intervals set.
#'
#' This function tests whether 'intervals.set' is a big intervals set.
#' Intervals set is big if it is stored in big intervals set format and given
#' the current limits it cannot be fully loaded into memory.
#'
#' Memory limit is controlled by 'gmax.data.size' option (see:
#' 'getOption("gmax.data.size")').
#'
#' @param intervals.set name of an intervals set
#' @return 'TRUE' if intervals set is big, otherwise 'FALSE'.
#' @seealso \code{\link{gintervals.load}}, \code{\link{gintervals.save}},
#' \code{\link{gintervals.exists}}, \code{\link{gintervals.ls}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' gintervals.is.bigset("annotations")
#'
#' @export gintervals.is.bigset
gintervals.is.bigset <- function(intervals.set = NULL) {
    if (is.null(intervals.set)) {
        stop("Usage: gintervals.is.bigset(intervals.set)", call. = FALSE)
    }
    .gcheckroot()

    .gintervals.is_bigset(intervals.set) && !.gintervals.loadable(intervals.set)
}



#' Converts intervals from another assembly
#'
#' Converts intervals from another assembly to the current one.
#'
#' This function converts 'intervals' from another assembly to the current one.
#' Chain file instructs how the conversion of coordinates should be done. It
#' can be either a name of a chain file or a data frame in the same format as
#' returned by 'gintervals.load_chain' function.
#'
#' The converted intervals are returned. An additional column named
#' 'intervalID' is added to the resulted data frame. For each interval in the
#' resulted intervals it indicates the index of the original interval.
#'
#' @param intervals intervals from another assembly
#' @param chain name of chain file or data frame as returned by
#' 'gintervals.load_chain'
#' @return A data frame representing the converted intervals.
#' @seealso \code{\link{gintervals.load_chain}}, \code{\link{gtrack.liftover}},
#' \code{\link{gintervals}}
#' @keywords ~intervals ~liftover ~chain
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' chainfile <- paste(.misha$GROOT, "data/test.chain", sep = "/")
#' intervs <- data.frame(
#'     chrom = "chr25", start = c(0, 7000),
#'     end = c(6000, 20000)
#' )
#' gintervals.liftover(intervs, chainfile)
#'
#' @export gintervals.liftover
gintervals.liftover <- function(intervals = NULL, chain = NULL) {
    if (is.null(intervals) || is.null(chain)) {
        stop("Usage: gintervals.liftover(intervals, chain)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    if (is.character(chain)) {
        chain.intervs <- gintervals.load_chain(chain)
    } else {
        chain.intervs <- chain
    }

    .gcall("gintervs_liftover", intervals, chain.intervs, .misha_env())
}



#' Loads a named intervals set
#'
#' Loads a named intervals set.
#'
#' This function loads and returns intervals stored in a named intervals set.
#'
#' If intervals set contains 1D intervals and 'chrom' is not 'NULL' only the
#' intervals of the given chromosome are returned.
#'
#' Likewise if intervals set contains 2D intervals and 'chrom1', 'chrom2' are
#' not 'NULL' only the intervals of the given pair of chromosomes are returned.
#'
#' For big intervals sets 'chrom' parameter (1D case) / 'chrom1', 'chrom2'
#' parameters (2D case) must be specified. In other words: big intervals sets
#' can be loaded only by chromosome or chromosome pair.
#'
#' @param intervals.set name of an intervals set
#' @param chrom chromosome for 1D intervals set
#' @param chrom1 first chromosome for 2D intervals set
#' @param chrom2 second chromosome for 2D intervals set
#' @return A data frame representing the intervals.
#' @seealso \code{\link{gintervals.save}}, \code{\link{gintervals.is.bigset}},
#' \code{\link{gintervals.exists}}, \code{\link{gintervals.ls}},
#' \code{\link{gintervals}}, \code{\link{gintervals.2d}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' gintervals.load("annotations")
#'
#' @export gintervals.load
gintervals.load <- function(intervals.set = NULL, chrom = NULL, chrom1 = NULL, chrom2 = NULL) {
    .gintervals.load_ext(intervals.set, chrom, chrom1, chrom2, TRUE)
}



#' Loads assembly conversion table from a chain file
#'
#' Loads assembly conversion table from a chain file.
#'
#' This function reads a file in 'chain' format and returns assembly conversion
#' table that can be used in 'gtrack.liftover' and 'gintervals.liftover'.
#'
#' Note: chain file might map a few different source intervals into a single
#' target one. These ambiguous mappings are not presented in the data frame
#' returned by 'gintervals.load_chain'.
#'
#' @param file name of chain file
#' @return A data frame representing assembly conversion table.
#' @seealso \code{\link{gintervals.liftover}}, \code{\link{gtrack.liftover}}
#' @keywords ~intervals ~liftover ~chain
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' chainfile <- paste(.misha$GROOT, "data/test.chain", sep = "/")
#' gintervals.load_chain(chainfile)
#'
#' @export gintervals.load_chain
gintervals.load_chain <- function(file = NULL) {
    if (is.null(file)) {
        stop("Usage: gintervals.load_chain(file)", call. = FALSE)
    }
    .gcall("gchain2interv", file, .misha_env())
}



#' Returns a list of named intervals sets
#'
#' Returns a list of named intervals sets in Genomic Database.
#'
#' This function returns a list of named intervals sets that match the pattern
#' (see 'grep'). If called without any arguments all named intervals sets are
#' returned.
#'
#' @param pattern,ignore.case,perl,fixed,useBytes see 'grep'
#' @return An array that contains the names of intervals sets.
#' @seealso \code{\link{grep}}, \code{\link{gintervals.exists}},
#' \code{\link{gintervals.load}}, \code{\link{gintervals.save}},
#' \code{\link{gintervals.rm}}, \code{\link{gintervals}},
#' \code{\link{gintervals.2d}}
#' @keywords ~intervals ~ls
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' gintervals.ls()
#' gintervals.ls(pattern = "annot*")
#'
#' @export gintervals.ls
gintervals.ls <- function(pattern = "", ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) {
    .gcheckroot()
    grep(pattern, get("GINTERVS", envir = .misha), value = TRUE, ignore.case = ignore.case, perl = perl, fixed = fixed, useBytes = useBytes)
}



#' Applies a function to values of track expressions
#'
#' Applies a function to values of track expressions for each interval.
#'
#' This function evaluates track expressions for each interval from
#' 'intervals'. The resulted vectors are passed then as arguments to 'FUN'.
#'
#' If the intervals are one-dimensional and have an additional column named
#' 'strand' whose value is '-1', the values of the track expression are placed
#' to the vector in reverse order.
#'
#' The current interval index (1-based) is stored in 'GAPPLY.INTERVID' variable
#' that is available during the execution of 'gintervals.mapply'. There is no
#' guarantee about the order in which the intervals are processed. Do not rely
#' on any specific order and use 'GITERATOR.INTERVID' variable to detect the
#' current interval id.
#'
#' If 'enable.gapply.intervals' is 'TRUE', an additional variable
#' 'GAPPLY.INTERVALS' is defined during the execution of 'gintervals.mapply'.
#' This variable stores the current iterator intervals prior to track
#' expression evaluation. Please note that setting 'enable.gapply.intervals' to
#' 'TRUE' might severely affect the run-time of the function.
#'
#' Note: all the changes made in R environment by 'FUN' will be void if
#' multitasking mode is switched on. One should also refrain from performing
#' any other operations in 'FUN' that might be not "thread-safe" such as
#' updating files, etc. Please switch off multitasking ('options(gmultitasking
#' = FALSE)') if you wish to perform such operations.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param FUN function to apply, found via ‘match.fun’
#' @param ... track expressions whose values are used as arguments for 'FUN'
#' @param intervals intervals for which track expressions are calculated
#' @param enable.gapply.intervals if 'TRUE', then a variable 'GAPPLY.INTERVALS'
#' is available
#' @param iterator track expression iterator. If 'NULL' iterator is determined
#' implicitly based on track expressions.
#' @param band track expression band. If 'NULL' no band is used.
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame representing intervals
#' with an additional column that contains the return values of 'FUN'.
#' @seealso \code{\link{mapply}}
#' @keywords ~apply ~mapply
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' gintervals.mapply(
#'     max, "dense_track",
#'     gintervals(c(1, 2), 0, 10000)
#' )
#' gintervals.mapply(
#'     function(x, y) {
#'         max(x + y)
#'     }, "dense_track",
#'     "sparse_track", gintervals(c(1, 2), 0, 10000),
#'     iterator = "sparse_track"
#' )
#'
#' @export gintervals.mapply
gintervals.mapply <- function(FUN = NULL, ..., intervals = NULL, enable.gapply.intervals = FALSE, iterator = NULL, band = NULL, intervals.set.out = NULL) {
    assign("GINTERVID", -1, envir = .misha)
    args <- as.list(substitute(list(...)))[-1L]
    if (is.null(intervals) && length(args) < 2 || !is.null(intervals) && length(args) < 1) {
        stop("Usage: gintervals.mapply(FUN, [expr]+, intervals, enable.gapply.intervals = FALSE, iterator = NULL, intervals.set.out = NULL)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    if (is.null(intervals)) {
        intervals <- eval.parent(args[[length(args)]])
        args <- args[1:(length(args) - 1)]
    }

    tracks <- c()
    for (track in args) {
        tracks <- c(tracks, do.call(.gexpr2str, list(track), envir = parent.frame()))
    }

    .iterator <- do.call(.giterator, list(substitute(iterator)), envir = parent.frame())

    if (exists("GAPPLY.INTERVALS", envir = .misha)) {
        remove(list = "GAPPLY.INTERVALS", envir = .misha)
    }

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (.gintervals.is_bigset(intervals) || !is.null(intervals.set.out)) {
        res <- NULL

        INTERVALS_FUN <- function(intervals, intervals.set.out, envir) {
            intervals <- intervals[[1]]
            chrom_res <- .gcall("gmapply", intervals, FUN, tracks, enable.gapply.intervals, .iterator, band, FALSE, .misha_env())
            if (!is.null(chrom_res) && nrow(chrom_res) > 0) {
                if (is.null(intervals.set.out)) {
                    assign("res", c(get("res", envir = envir), list(chrom_res)), envir = envir)
                    .gverify_max_data_size(sum(unlist(lapply(get("res", envir), nrow))), arguments = "intervals.set.out")
                }
            }
            chrom_res
        }

        .gintervals.apply(gintervals.chrom_sizes(intervals), intervals, intervals.set.out, INTERVALS_FUN, intervals.set.out, environment())

        if (!is.null(res)) {
            res <- do.call(.grbind, res)
        } # much faster than calling rbind incrementally in FUN

        if (is.null(intervals.set.out)) {
            if (!is.null(res) && nrow(res)) {
                res
            } else {
                NULL
            }
        } else {
            retv <- 0
        } # suppress return value
    } else {
        if (.ggetOption("gmultitasking")) {
            .gcall("gmapply_multitask", intervals, FUN, tracks, enable.gapply.intervals, .iterator, band, TRUE, .misha_env())
        } else {
            .gcall("gmapply", intervals, FUN, tracks, enable.gapply.intervals, .iterator, band, TRUE, .misha_env())
        }
    }
}



#' Finds neighbors between two sets of intervals
#'
#' Finds neighbors between two sets of intervals.
#'
#' This function finds for each interval in 'intervals1' the closest
#' 'maxneighbors' intervals from 'intervals2'.
#'
#' For 1D intervals the distance must fall in the range of ['mindist',
#' 'maxdist']. If 'intervals2' contains a 'strand' column the distance can be
#' positive or negative depending on the 'strand' value and the position of
#' interval2 relatively to interval1. If 'strand' column is missing the
#' distance is always positive.
#'
#' For 2D intervals two distances are calculated and returned for each axis.
#' The distances must fall in the range of ['mindist1', 'maxdist1'] for axis 1
#' and ['mindist2', 'maxdist2'] for axis 2. For selecting the closest
#' 'maxneighbors' intervals Manhattan distance is used (i.e. dist1+dist2).
#'
#' The names of the returned columns are made unique using
#' \code{make.unique(colnames(df), sep = "")}, assuming 'df' is the result.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param intervals1,intervals2 intervals
#' @param maxneighbors maximal number of neighbors
#' @param mindist,maxdist distance range for 1D intervals
#' @param mindist1,maxdist1,mindist2,maxdist2 distance range for 2D intervals
#' @param na.if.notfound if 'TRUE' return 'NA' interval if no matching
#' neighbors were found, otherwise omit the interval in the answer
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame containing the pairs
#' of intervals from 'intervals1', intervals from 'intervals2' and an
#' additional column named 'dist' ('dist1' and 'dist2' for 2D intervals)
#' representing the distance between the corresponding intervals. The intervals
#' from intervals2 would be changed to 'chrom1', 'start1', and 'end1' and for
#' 2D intervals chrom11, start11, end11 and chrom22, start22, end22. If
#' 'na.if.notfound' is 'TRUE', the data frame contains all the intervals from
#' 'intervals1' including those for which no matching neighbor was found. For
#' the latter intervals an 'NA' neighboring interval is stated. If
#' 'na.if.notfound' is 'FALSE', the data frame contains only intervals from
#' 'intervals1' for which matching neighbor(s) was found.
#' @seealso \code{\link{gintervals}},
#' @keywords ~intervals ~annotate ~nearest ~neighbor ~neighbors
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' intervs1 <- giterator.intervals("dense_track",
#'     gintervals(1, 0, 4000),
#'     iterator = 233
#' )
#' intervs2 <- giterator.intervals(
#'     "sparse_track",
#'     gintervals(1, 0, 2000)
#' )
#' gintervals.neighbors(intervs1, intervs2, 10,
#'     mindist = -300,
#'     maxdist = 500
#' )
#' intervs2$strand <- c(1, 1, -1, 1)
#' gintervals.neighbors(intervs1, intervs2, 10,
#'     mindist = -300,
#'     maxdist = 500
#' )
#'
#' @export gintervals.neighbors
gintervals.neighbors <- function(intervals1 = NULL, intervals2 = NULL, maxneighbors = 1, mindist = -1e+09, maxdist = 1e+09,
                                 mindist1 = -1e+09, maxdist1 = 1e+09, mindist2 = -1e+09, maxdist2 = 1e+09,
                                 na.if.notfound = FALSE, intervals.set.out = NULL) {
    if (is.null(intervals1) || is.null(intervals2)) {
        stop(paste("Usage: gintervals.neighbors(intervals1, intervals2, maxneighbors = 1, mindist = -1e+09, maxdist = 1e+09, ",
            "mindist1 = -1e+09, maxdist1 = 1e+09, mindist2 = -1e+09, maxdist2 = 1e+09, na.if.notfound = FALSE, intervals.set.out = NULL)",
            sep = ""
        ), call. = FALSE)
    }

    if (is.null(colnames)) {
        intervals1name <- deparse(substitute(intervals1), width.cutoff = 500)[1]
        intervals2name <- deparse(substitute(intervals2), width.cutoff = 500)[1]
    }

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (.gintervals.is_bigset(intervals1) || .gintervals.is_bigset(intervals2) || !is.null(intervals.set.out)) {
        res <- NULL

        FUN <- function(intervals, intervals.set.out, envir) {
            intervals1 <- intervals[[1]]
            intervals2 <- intervals[[2]]
            chrom_res <- .gcall("gfind_neighbors", intervals1, intervals2, maxneighbors, mindist, maxdist, mindist1, maxdist1, mindist2, maxdist2, na.if.notfound, FALSE, .misha_env())
            if (!is.null(chrom_res) && is.null(intervals.set.out)) {
                assign("res", c(get("res", envir = envir), list(chrom_res)), envir = envir)
                .gverify_max_data_size(sum(unlist(lapply(get("res", envir), nrow))), arguments = "intervals.set.out")
            }
            chrom_res
        }

        if (na.if.notfound) {
            .gintervals.apply(gintervals.chrom_sizes(intervals1), list(intervals1, intervals2), intervals.set.out, FUN, intervals.set.out, environment())
        } else {
            chroms1 <- gintervals.chrom_sizes(intervals1)
            chroms1$size <- NULL
            chroms2 <- gintervals.chrom_sizes(intervals2)
            chroms2$size <- NULL
            .gintervals.apply(merge(chroms1, chroms2), list(intervals1, intervals2), intervals.set.out, FUN, intervals.set.out, environment())
        }

        if (!is.null(res)) {
            res <- do.call(.grbind, res)
        } # much faster than calling rbind incrementally in FUN

        if (is.null(intervals.set.out)) {
            if (!is.null(res) && nrow(res)) {
                repair_names(res)
            } else {
                NULL
            }
        } else {
            retv <- 0
        } # suppress return value
    } else {
        intervals1 <- .gintervals.load_ext(intervals1)
        intervals2 <- .gintervals.load_ext(intervals2)
        res <- .gcall("gfind_neighbors", intervals1, intervals2, maxneighbors, mindist, maxdist, mindist1, maxdist1, mindist2, maxdist2, na.if.notfound, TRUE, .misha_env())
        repair_names(res)
    }
}

repair_names <- function(dataframe) {
    # if there are duplicate names - add a numeric suffix
    if (length(unique(names(dataframe))) < length(names(dataframe))) {
        names(dataframe) <- make.unique(names(dataframe), sep = "")
    }
    return(dataframe)
}



#' Calculates quantiles of a track expression for intervals
#'
#' Calculates quantiles of a track expression for intervals.
#'
#' This function calculates quantiles of 'expr' for each interval in
#' 'intervals'.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param expr track expression for which quantiles are calculated
#' @param percentiles an array of percentiles of quantiles in [0, 1] range
#' @param intervals set of intervals
#' @param iterator track expression iterator. If 'NULL' iterator is determined
#' implicitly based on track expressions.
#' @param band track expression band. If 'NULL' no band is used.
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a set of intervals with additional
#' columns representing quantiles for each percentile.
#' @seealso \code{\link{gquantiles}}, \code{\link{gbins.quantiles}}
#' @keywords ~quantiles ~percentiles
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' intervs <- gintervals(c(1, 2), 0, 5000)
#' gintervals.quantiles("dense_track",
#'     percentiles = c(0.5, 0.3, 0.9), intervs
#' )
#'
#' @export gintervals.quantiles
gintervals.quantiles <- function(expr = NULL, percentiles = 0.5, intervals = NULL, iterator = NULL, band = NULL, intervals.set.out = NULL) {
    if (is.null(substitute(expr)) || is.null(intervals)) {
        stop("Usage: gintervals.quantiles(expr, percentiles = 0.5, intervals, iterator = NULL, band = NULL, intervals.set.out = NULL)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    exprstr <- do.call(.gexpr2str, list(substitute(expr)), envir = parent.frame())
    .iterator <- do.call(.giterator, list(substitute(iterator)), envir = parent.frame())
    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (!is.null(intervals.set.out)) {
        fullpath <- .gintervals.check_new_set(intervals.set.out)
    }

    success <- FALSE
    res <- NULL
    tryCatch(
        {
            if (.ggetOption("gmultitasking")) {
                res <- .gcall("gintervals_quantiles_multitask", intervals, exprstr, percentiles, .iterator, band, intervals.set.out, .misha_env())
            } else {
                res <- .gcall("gintervals_quantiles", intervals, exprstr, percentiles, .iterator, band, intervals.set.out, .misha_env())
            }

            if (!is.null(intervals.set.out) && .gintervals.is_bigset(intervals.set.out, FALSE) && !.gintervals.needs_bigset(intervals.set.out)) {
                .gintervals.big2small(intervals.set.out)
            }

            success <- TRUE
        },
        finally = {
            if (!success && !is.null(intervals.set.out)) {
                unlink(fullpath, recursive = TRUE)
            }
        }
    )

    # refresh the list of GINTERVS, etc.
    if (is.null(intervals.set.out)) {
        res
    } else {
        .gdb.add_intervals.set(intervals.set.out)
        retv <- 0 # suppress return value
    }
}



#' Combines several sets of intervals
#'
#' Combines several sets of intervals into one set.
#'
#' This function combines several intervals sets into one set. It works in a
#' similar manner as 'rbind' yet it is faster. Also it supports intervals sets
#' that are stored in files including the big intervals sets.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. If the format of the output intervals is set to be "big" (determined
#' implicitly based on the result size and options), the order of the resulted
#' intervals is altered as they are sorted by chromosome (or chromosomes pair -
#' for 2D).
#'
#' @param ... intervals sets to combine
#' @param intervals intervals set
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame combining intervals
#' sets.
#' @seealso \code{\link{gintervals}}, \code{\link{gintervals.2d}},
#' \code{\link{gintervals.canonic}}
#' @keywords ~rbind
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' intervs1 <- gextract("sparse_track", gintervals(c(1, 2), 1000, 4000))
#' intervs2 <- gextract("sparse_track", gintervals(c(2, "X"), 2000, 5000))
#' gintervals.save("testintervs", intervs2)
#' gintervals.rbind(intervs1, "testintervs")
#' gintervals.rm("testintervs", force = TRUE)
#'
#' @export gintervals.rbind
gintervals.rbind <- function(..., intervals.set.out = NULL) {
    intervals <- list(...)
    if (!length(intervals)) {
        stop("Usage: gintervals.rbind([intervals]+, intervals.set.out = NULL)", call. = FALSE)
    }
    .gcheckroot()

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    res <- NULL
    if (any(unlist(lapply(intervals, function(intervals) {
        .gintervals.is_bigset(intervals)
    }))) || !is.null(intervals.set.out)) {
        if (is.null(intervals.set.out)) {
            FUN <- function(intervals, intervals.set.out, envir) {
                assign("res", c(get("res", envir = envir), intervals), envir = envir)
                .gverify_max_data_size(sum(unlist(lapply(get("res", envir), nrow))), arguments = "intervals.set.out")
                intervals[[1]]
            }

            # preserve the order of intervals inside the answer
            lapply(intervals, f <- function(intervals) {
                .gintervals.apply(gintervals.chrom_sizes(intervals), intervals, NULL, FUN, NULL, parent.frame(2))
            })
            if (!is.null(res)) {
                res <- do.call(.grbind, res)
            } # much faster than calling rbind incrementally in FUN
        } else {
            FUN <- function(intervals, intervals.set.out, envir) {
                intervals <- do.call(.grbind, intervals)
                intervals
            }

            # use for .gintervals.apply chromosomes from all intervals
            chroms <- NULL
            chroms <- lapply(intervals, gintervals.chrom_sizes)
            chroms <- do.call(rbind, chroms)
            if (.gintervals.is1d(intervals[[1]])) {
                chroms <- factor(chroms$chrom, levels(chroms$chrom))
                chroms <- unique(chroms)
                chroms <- sort(chroms)
                chroms <- data.frame(chrom = chroms)
            } else {
                chroms <- data.frame(chrom1 = chroms$chrom1, chrom2 = chroms$chrom2)
                chroms <- unique(chroms)
                chroms <- chroms[with(chroms, order(chrom1, chrom2)), ]
            }

            .gintervals.apply(chroms, intervals, intervals.set.out, FUN, intervals.set.out, environment())
        }
    } else {
        intervals <- lapply(intervals, .gintervals.load_ext)
        res <- do.call(.grbind, intervals) # much faster than calling rbind incrementally in FUN
    }

    if (is.null(intervals.set.out)) {
        if (!is.null(res) && nrow(res)) {
            res
        } else {
            NULL
        }
    } else {
        retv <- 0
    } # suppress return value
}



#' Deletes a named intervals set
#'
#' Deletes a named intervals set.
#'
#' This function deletes a named intervals set from the Genomic Database. By
#' default 'gintervals.rm' requires the user to interactively confirm the
#' deletion. Set 'force' to 'TRUE' to suppress the user prompt.
#'
#' @param intervals.set name of an intervals set
#' @param force if 'TRUE', suppresses user confirmation of a named intervals set
#' removal
#' @return None.
#' @seealso \code{\link{gintervals.save}}, \code{\link{gintervals.exists}},
#' \code{\link{gintervals.ls}}, \code{\link{gintervals}},
#' \code{\link{gintervals.2d}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' intervs <- gintervals(c(1, 2))
#' gintervals.save("testintervs", intervs)
#' gintervals.ls()
#' gintervals.rm("testintervs", force = TRUE)
#' gintervals.ls()
#'
#' @export gintervals.rm
gintervals.rm <- function(intervals.set = NULL, force = FALSE) {
    if (is.null(substitute(intervals.set))) {
        stop("Usage: gintervals.rm(intervals.set, force = FALSE)", call. = FALSE)
    }
    .gcheckroot()

    intervals.set <- do.call(.gexpr2str, list(substitute(intervals.set)), envir = parent.frame())

    # check whether intervals.set appears among GINTERVS
    if (is.na(match(intervals.set, get("GINTERVS", envir = .misha)))) {
        if (force) {
            return(invisible())
        }
        stop(sprintf("Intervals set %s does not exist", intervals.set), call. = FALSE)
    }

    answer <- "N"
    if (force) {
        answer <- "Y"
    } else {
        str <- sprintf("Are you sure you want to delete intervals set %s (Y/N)? ", intervals.set)
        message(str)
        answer <- toupper(readLines(n = 1))
    }

    if (answer == "Y" || answer == "YES") {
        fname <- sprintf("%s.interv", paste(get("GWD", envir = .misha), gsub("\\.", "/", intervals.set), sep = "/"))

        # remove the intervals set
        unlink(fname, recursive = TRUE)

        if (file.exists(fname)) {
            message(sprintf("Failed to delete intervals set %s", intervals.set))
        } else {
            # refresh the list of GINTERVS, etc.
            .gdb.rm_intervals.set(intervals.set)
        }
    }
}



#' Creates a named intervals set
#'
#' Saves intervals to a named intervals set.
#'
#' This function saves 'intervals' as a named intervals set.
#'
#' @param intervals.set.out name of the new intervals set
#' @param intervals intervals to save
#' @return None.
#' @seealso \code{\link{gintervals.rm}}, \code{\link{gintervals.load}},
#' \code{\link{gintervals.exists}}, \code{\link{gintervals.ls}},
#' \code{\link{gintervals}}, \code{\link{gintervals.2d}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' intervs <- gintervals(c(1, 2))
#' gintervals.save("testintervs", intervs)
#' gintervals.ls()
#' gintervals.rm("testintervs", force = TRUE)
#'
#' @export gintervals.save
gintervals.save <- function(intervals.set.out = NULL, intervals = NULL) {
    if (is.null(substitute(intervals.set.out)) || is.null(intervals)) {
        stop("Usage: gintervals.save(intervals.set.out, intervals)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())
    .gintervals.apply(gintervals.chrom_sizes(intervals), intervals, intervals.set.out, function(intervs, ...) {
        intervs[[1]]
    })
    retv <- NULL
}



#' Updates a named intervals set
#'
#' Updates a named intervals set.
#'
#' This function replaces all intervals of given chromosome (or chromosome
#' pair) within 'intervals.set' with 'intervals'. Chromosome is specified by
#' 'chrom' for 1D intervals set or 'chrom1', 'chrom2' for 2D intervals set.
#'
#' If 'intervals' is 'NULL' all intervals of given chromosome are removed from
#' 'intervals.set'.
#'
#' @param intervals.set name of an intervals set
#' @param intervals intervals or 'NULL'
#' @param chrom chromosome for 1D intervals set
#' @param chrom1 first chromosome for 2D intervals set
#' @param chrom2 second chromosome for 2D intervals set
#' @return None.
#' @seealso \code{\link{gintervals.save}}, \code{\link{gintervals.load}},
#' \code{\link{gintervals.exists}}, \code{\link{gintervals.ls}}
#' @keywords ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' intervs <- gscreen(
#'     "sparse_track > 0.2",
#'     gintervals(c(1, 2), 0, 10000)
#' )
#' gintervals.save("testintervs", intervs)
#' gintervals.load("testintervs")
#' gintervals.update("testintervs", intervs[intervs$chrom == "chr2", ][1:5, ], chrom = 2)
#' gintervals.load("testintervs")
#' gintervals.update("testintervs", NULL, chrom = 2)
#' gintervals.load("testintervs")
#' gintervals.rm("testintervs", force = TRUE)
#'
#' @export gintervals.update
gintervals.update <- function(intervals.set = NULL, intervals = "", chrom = NULL, chrom1 = NULL, chrom2 = NULL) {
    if (is.null(substitute(intervals.set)) || identical(intervals, "")) {
        stop("Usage: gintervals.update(intervals.set, intervals, chrom = NULL, chrom1 = NULL, chrom2 = NULL)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    if (identical(intervals.set, intervals)) {
        return(retv <- NULL)
    }

    if (is.null(chrom) && is.null(chrom1) && is.null(chrom2)) {
        stop("Chromosome must be specified in chrom (for 2D intervals: chrom1, chrom2) parameter", call. = FALSE)
    }

    if (!is.null(chrom)) {
        chrom <- .gchroms(chrom)
        if (length(chrom) > 1) {
            stop("chrom parameter should mark only one chromosome")
        }
    }

    if (!is.null(chrom1)) {
        chrom1 <- .gchroms(chrom1)
        if (length(chrom1) > 1) {
            stop("chrom1 parameter should mark only one chromosome")
        }
    }

    if (!is.null(chrom2)) {
        chrom2 <- .gchroms(chrom2)
        if (length(chrom2) > 1) {
            stop("chrom2 parameter should mark only one chromosome")
        }
    }

    if (!is.null(chrom) && !is.null(chrom1)) {
        stop("Cannot use chrom and chrom1 parameters in the same call", call. = FALSE)
    }

    if (!is.null(chrom) && !is.null(chrom2)) {
        stop("Cannot use chrom and chrom2 parameters in the same call", call. = FALSE)
    }

    if (!is.character(intervals.set) || length(intervals.set) != 1) {
        stop("Invalid format of intervals.set parameter", call. = FALSE)
    }

    if (is.na(match(intervals.set, get("GINTERVS", envir = .misha)))) {
        stop(sprintf("Intervals set %s does not exist", intervals.set), call. = FALSE)
    }

    path <- gsub(".", "/", intervals.set, fixed = TRUE)
    path <- paste(path, ".interv", sep = "")
    fullpath <- paste(get("GWD", envir = .misha), path, sep = "/")

    if (!is.null(intervals)) {
        if (!is.null(chrom)) {
            intervals <- .gintervals.load_ext(intervals, chrom = chrom)
        } else {
            intervals <- .gintervals.load_ext(intervals, chrom1 = chrom1, chrom2 = chrom2)
        }
    }

    # big: update stats (including delete), save chrom (or delete), convert to small if needed
    if (.gintervals.is_bigset(intervals.set)) {
        is1d <- .gintervals.big.is1d(intervals.set)
        meta <- .gintervals.big.meta(intervals.set)
        stats <- meta$stats
        zeroline <- meta$zeroline

        if (!is.null(intervals) && !identical(sapply(intervals, "class"), sapply(zeroline, "class"))) {
            stop(sprintf("Cannot update intervals set %s: columns differ", intervals.set), call. = FALSE)
        }

        if (is1d) {
            if (is.null(chrom)) {
                stop("chrom parameter is not specified", call. = FALSE)
            }
            idx <- which(stats$chrom == chrom)
            if (length(idx) > 0) {
                stats <- stats[-idx, ]
            }
            if (!is.null(intervals) && nrow(intervals)) {
                stat <- .gcall("gintervals_stats", intervals, .misha_env())
                stats <- rbind(stats, data.frame(chrom = chrom, stat))
                stats <- stats[order(stats$chrom), ]
            }
            .gintervals.big.save(fullpath, intervals, chrom = chrom)
        } else {
            if (is.null(chrom1) || is.null(chrom2)) {
                stop("chrom1 and chrom2 parameters must be specified", call. = FALSE)
            }
            idx <- which(stats$chrom1 == chrom1 & stats$chrom2 == chrom2)
            if (length(idx) > 0) {
                stats <- stats[-idx, ]
            }
            if (!is.null(intervals) && nrow(intervals)) {
                stat <- .gcall("gintervals_stats", intervals, .misha_env())
                stats <- rbind(stats, data.frame(chrom1 = chrom1, chrom2 = chrom2, stat))
                stats <- stats[order(stats$chrom1, stats$chrom2), ]
            }
            .gintervals.big.save(fullpath, intervals, chrom1 = chrom1, chrom2 = chrom2)
        }

        if (nrow(stats) > 1) {
            rownames(stats) <- 1:nrow(stats)
        }
        .gintervals.big.save_meta(fullpath, stats, zeroline)

        if (!.gintervals.needs_bigset(intervals.set)) {
            .gintervals.big2small(intervals.set)
        }
    }

    # small: load all, update in place (including delete), save back, convert to big if needed
    else {
        tgt.intervals <- .gintervals.load_ext(intervals.set)
        is1d <- .gintervals.is1d(intervals.set)

        if (!is.null(intervals) && !identical(sapply(intervals, "class"), sapply(tgt.intervals, "class"))) {
            stop(sprintf("Cannot update intervals set %s: columns differ", intervals.set), call. = FALSE)
        }

        if (is1d) {
            if (is.null(chrom)) {
                stop("chrom parameter is not specified", call. = FALSE)
            }
            idx <- which(tgt.intervals$chrom == chrom)
            if (length(idx) > 0) {
                tgt.intervals <- tgt.intervals[-idx, ]
            }
            if (!is.null(intervals) && nrow(intervals)) {
                tgt.intervals <- .grbind(tgt.intervals, intervals)
                tgt.intervals <- tgt.intervals[order(tgt.intervals$chrom), ]
            }
        } else {
            if (is.null(chrom1) || is.null(chrom2)) {
                stop("chrom1 and chrom2 parameters must be specified", call. = FALSE)
            }
            idx <- which(tgt.intervals$chrom1 == chrom1 & tgt.intervals$chrom2 == chrom2)
            if (length(idx) > 0) {
                tgt.intervals <- tgt.intervals[-idx, ]
            }
            if (!is.null(intervals) && nrow(intervals)) {
                tgt.intervals <- .grbind(tgt.intervals, intervals)
                tgt.intervals <- tgt.intervals[order(tgt.intervals$chrom1, tgt.intervals$chrom2), ]
            }
        }
        if (.gintervals.needs_bigset(tgt.intervals)) {
            .gintervals.small2big(intervals.set, tgt.intervals)
        } else {
            .gintervals.save_file(fullpath, tgt.intervals)
        }
    }

    retv <- 0 # suppress return value
}



#' Calculates summary statistics of track expression for intervals
#'
#' Calculates summary statistics of track expression for intervals.
#'
#' This function returns summary statistics of a track expression for each
#' interval 'intervals': total number of bins, total number of bins whose value
#' is NaN, min, max, sum, mean and standard deviation of the values.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param expr track expression
#' @param intervals set of intervals
#' @param iterator track expression iterator. If 'NULL' iterator is determined
#' implicitly based on track expression.
#' @param band track expression band. If 'NULL' no band is used.
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a set of intervals with additional
#' columns representing summary statistics for each percentile and interval.
#' @seealso \code{\link{gsummary}}, \code{\link{gbins.summary}}
#' @keywords ~summary ~statistics
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#' intervs <- gintervals(c(1, 2), 0, 5000)
#' gintervals.summary("dense_track", intervs)
#'
#' @export gintervals.summary
gintervals.summary <- function(expr = NULL, intervals = NULL, iterator = NULL, band = NULL, intervals.set.out = NULL) {
    if (is.null(substitute(expr)) || is.null(intervals)) {
        stop("Usage: gintervals.summary(expr, intervals, iterator = NULL, band = NULL, intervals.set.out = NULL)", call. = FALSE)
    }
    .gcheckroot()

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    exprstr <- do.call(.gexpr2str, list(substitute(expr)), envir = parent.frame())
    .iterator <- do.call(.giterator, list(substitute(iterator)), envir = parent.frame())
    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (!is.null(intervals.set.out)) {
        fullpath <- .gintervals.check_new_set(intervals.set.out)
    }

    # intervals can be NULL if gextract is piped with gscreen and the latter returns NULL
    success <- FALSE
    res <- NULL
    tryCatch(
        {
            if (!is.null(intervals)) {
                res <- .gcall("gintervals_summary", exprstr, intervals, .iterator, band, intervals.set.out, .misha_env())
                if (!is.null(intervals.set.out) && .gintervals.is_bigset(intervals.set.out, FALSE) && !.gintervals.needs_bigset(intervals.set.out)) {
                    .gintervals.big2small(intervals.set.out)
                }
            }
            success <- TRUE
        },
        finally = {
            if (!success && !is.null(intervals.set.out)) {
                unlink(fullpath, recursive = TRUE)
            }
        }
    )

    # refresh the list of GINTERVS, etc.
    if (!is.null(intervals.set.out)) {
        .gdb.add_intervals.set(intervals.set.out)
        retv <- 0 # suppress return value
    } else {
        res
    }
}



#' Calculates a union of two sets of intervals
#'
#' Calculates a union of two sets of intervals.
#'
#' This function returns intervals that represent a genomic space covered by
#' either 'intervals1' or 'intervals2'.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param intervals1,intervals2 set of one-dimensional intervals
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame representing the union
#' of intervals.
#' @seealso \code{\link{gintervals.intersect}}, \code{\link{gintervals.diff}},
#' \code{\link{gintervals}}, \code{\link{gintervals.2d}}
#' @keywords ~union
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' intervs1 <- gscreen("dense_track > 0.15 & dense_track < 0.18")
#' intervs2 <- gscreen("dense_track >= 0.18 & dense_track < 0.2")
#'
#' ## 'intervs3' and 'intervs4' are identical
#' intervs3 <- gintervals.union(intervs1, intervs2)
#' intervs4 <- gscreen("dense_track > 0.15 & dense_track < 0.2")
#'
#' @export gintervals.union
gintervals.union <- function(intervals1 = NULL, intervals2 = NULL, intervals.set.out = NULL) {
    if (is.null(intervals1) || is.null(intervals2)) {
        stop("Usage: gintervals.union(intervals1, intervals2, intervals.set.out = NULL)", call. = FALSE)
    }

    if (.gintervals.is_bigset(intervals1) || .gintervals.is_bigset(intervals2) || !is.null(intervals.set.out)) {
        res <- NULL

        FUN <- function(intervals, intervals.set.out, envir) {
            intervals1 <- intervals[[1]]
            intervals2 <- intervals[[2]]
            chrom_res <- .gcall("gintervunion", intervals1, intervals2, .misha_env())
            if (!is.null(chrom_res) && nrow(chrom_res) > 0) {
                if (is.null(intervals.set.out)) {
                    assign("res", c(get("res", envir = envir), list(chrom_res)), envir = envir)
                    .gverify_max_data_size(sum(unlist(lapply(get("res", envir), nrow))), arguments = "intervals.set.out")
                }
            }
            chrom_res
        }

        # use for .gintervals.apply chromosomes from both intervals1 and intervals2
        chroms <- NULL
        if (.gintervals.is1d(intervals1)) {
            chroms <- rbind(gintervals.chrom_sizes(intervals1), gintervals.chrom_sizes(intervals2))
            chroms <- factor(chroms$chrom, levels(chroms$chrom))
            chroms <- unique(chroms)
            chroms <- sort(chroms)
            chroms <- data.frame(chrom = chroms)
        } else {
            chroms <- rbind(gintervals.chrom_sizes(intervals1), gintervals.chrom_sizes(intervals2))
            chroms <- data.frame(chrom1 = chroms$chrom1, chrom2 = chroms$chrom2)
            chroms <- unique(chroms)
            chroms <- chroms[with(chroms, order(chrom1, chrom2)), ]
        }
        .gintervals.apply(chroms, list(intervals1, intervals2), intervals.set.out, FUN, intervals.set.out, environment())

        if (!is.null(res)) {
            res <- do.call(.grbind, res)
        } # much faster than calling rbind incrementally in FUN

        if (is.null(intervals.set.out)) {
            if (!is.null(res) && nrow(res)) {
                res
            } else {
                NULL
            }
        } else {
            retv <- 0
        } # suppress return value
    } else {
        res <- .gcall("gintervunion", intervals1, intervals2, .misha_env())
        res
    }
}




#' Creates a cartesian-grid iterator
#'
#' Creates a cartesian grid two-dimensional iterator that can be used by any
#' function that accepts an iterator argument.
#'
#' This function creates and returns a cartesian grid two-dimensional iterator
#' that can be used by any function that accepts an iterator argument.
#'
#' Assume 'centers1' and 'centers2' to be the central points of each interval
#' from 'intervals1' and 'intervals2', and 'C1', 'C2' to be two points from
#' 'centers1', 'centers2' accordingly. Assume also that the values in
#' 'expansion1' and 'expansion2' are unique and sorted.
#'
#' 'giterator.cartesian_grid' creates a set of all possible unique and
#' non-overlapping two-dimensional intervals of form: '(chrom1, start1, end1,
#' chrom2, start2, end2)'. Each '(chrom1, start1, end1)' is created by taking a
#' point 'C1' - '(chrom1, coord1)' and converting it to 'start1' and 'end1'
#' such that 'start1 == coord1+E1[i]', 'end1 == coord1+E1[i+1]', where 'E1[i]'
#' is one of the sorted 'expansion1' values. Overlaps between rectangles or
#' expansion beyond the limits of chromosome are avoided.
#'
#' 'min.band.idx' and 'max.band.idx' parameters control whether a pair of 'C1'
#' and 'C2' is skipped or not. If both of these parameters are not 'NULL' AND
#' if both 'C1' and 'C2' share the same chromosome AND the delta of indices of
#' 'C1' and 'C2' ('C1 index - C2 index') lays within '[min.band.idx,
#' max.band.idx]' range - only then the pair will be used to create the
#' intervals. Otherwise 'C1-C2' pair is filtered out. Note: if 'min.band.idx'
#' and 'max.band.idx' are not 'NULL', i.e. band indices filtering is applied,
#' then 'intervals2' parameter must be set to 'NULL'.
#'
#' @param intervals1 one-dimensional intervals
#' @param expansion1 an array of integers that define expansion around
#' intervals1 centers
#' @param intervals2 one-dimensional intervals. If 'NULL' then 'intervals2' is
#' considered to be equal to 'intervals1'
#' @param expansion2 an array of integers that define expansion around
#' intervals2 centers. If 'NULL' then 'expansion2' is considered to be equal to
#' 'expansion1'
#' @param min.band.idx,max.band.idx integers that limit iterator intervals to
#' band
#' @return A list containing the definition of cartesian iterator.
#' @seealso \code{\link{giterator.intervals}}
#' @keywords ~iterator ~cartesian
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' intervs1 <- gintervals(
#'     c(1, 1, 2), c(100, 300, 200),
#'     c(300, 500, 300)
#' )
#' intervs2 <- gintervals(
#'     c(1, 2, 2), c(400, 1000, 3000),
#'     c(800, 2000, 4000)
#' )
#' itr <- giterator.cartesian_grid(
#'     intervs1, c(-20, 100), intervs2,
#'     c(-40, -10, 50)
#' )
#' giterator.intervals(iterator = itr)
#'
#' itr <- giterator.cartesian_grid(intervs1, c(-20, 50, 100))
#' giterator.intervals(iterator = itr)
#'
#' itr <- giterator.cartesian_grid(intervs1, c(-20, 50, 100),
#'     min.band.idx = -1,
#'     max.band.idx = 0
#' )
#' giterator.intervals(iterator = itr)
#'
#' @export giterator.cartesian_grid
giterator.cartesian_grid <- function(intervals1 = NULL, expansion1 = NULL, intervals2 = NULL, expansion2 = NULL, min.band.idx = NULL, max.band.idx = NULL) {
    if (is.null(intervals1) || is.null(expansion1)) {
        stop("Usage: giterator.cartesian_grid(intervals1, expansion1, intervals2 = NULL, expansion2 = NULL, min.band.idx = NULL, max.band.idx = NULL)", call. = FALSE)
    }

    use.band.idx.limit <- !is.null(min.band.idx) && !is.null(max.band.idx)
    if (use.band.idx.limit) {
        if (min.band.idx > max.band.idx) {
            stop("min.band.idx exceeds max.band.idx", call. = FALSE)
        }

        if (!is.null(intervals2)) {
            stop("band.idx limit can only be used when intervals2 is set to NULL", call. = FALSE)
        }
    } else {
        min.band.idx <- 0
        max.band.idx <- 0
    }

    r <- list(
        intervals1 = intervals1, intervals2 = intervals2, expansion1 = expansion1, expansion2 = expansion2,
        band.idx = c(min.band.idx, max.band.idx, use.band.idx.limit)
    )
    class(r) <- "cartesian.grid"
    .gcall("gcheck_iterator", r, .misha_env())
    r
}



#' Returns iterator intervals
#'
#' Returns iterator intervals given track expression, scope, iterator and band.
#'
#' This function returns a set of intervals used by the iterator intervals for
#' the given track expression, genomic scope, iterator and band. Some functions
#' accept an iterator without accepting a track expression (like
#' 'gtrack.create_pwm_energy'). These functions generate the values for each
#' iterator interval by themselves. Use set 'expr' to 'NULL' to simulate the
#' work of these functions.
#'
#' If 'intervals.set.out' is not 'NULL' the result is saved as an intervals
#' set. Use this parameter if the result size exceeds the limits of the
#' physical memory.
#'
#' @param expr track expression
#' @param intervals genomic scope
#' @param iterator track expression iterator. If 'NULL' iterator is determined
#' implicitly based on track expression.
#' @param band track expression band. If 'NULL' no band is used.
#' @param intervals.set.out intervals set name where the function result is
#' optionally outputted
#' @return If 'intervals.set.out' is 'NULL' a data frame representing iterator
#' intervals.
#' @seealso \code{\link{giterator.cartesian_grid}}
#' @keywords ~iterator ~intervals
#' @examples
#' \dontshow{
#' options(gmax.processes = 2)
#' }
#'
#' gdb.init_examples()
#'
#' ## iterator is set implicitly to bin size of 'dense' track
#' giterator.intervals("dense_track", gintervals(1, 0, 200))
#'
#' ## iterator = 30
#' giterator.intervals("dense_track", gintervals(1, 0, 200), 30)
#'
#' ## iterator is an intervals set named 'annotations'
#' giterator.intervals("dense_track", .misha$ALLGENOME, "annotations")
#'
#' ## iterator is set implicitly to intervals of 'array_track' track
#' giterator.intervals("array_track", gintervals(1, 0, 200))
#'
#' ## iterator is a rectangle 100000 by 50000
#' giterator.intervals(
#'     "rects_track",
#'     gintervals.2d(chroms1 = 1, chroms2 = "chrX"),
#'     c(100000, 50000)
#' )
#'
#' @export giterator.intervals
giterator.intervals <- function(expr = NULL, intervals = .misha$ALLGENOME, iterator = NULL, band = NULL, intervals.set.out = NULL) {
    if (is.null(substitute(expr)) && is.null(substitute(iterator))) {
        stop("Usage: giterator.intervals(expr = NULL, intervals = .misha$ALLGENOME, iterator = NULL, band = NULL, intervals.set.out = NULL)", call. = FALSE)
    }

    intervals <- rescue_ALLGENOME(intervals, as.character(substitute(intervals)))

    if (is.null(substitute(expr))) {
        exprstr <- "0"
    } else {
        exprstr <- do.call(.gexpr2str, list(substitute(expr)), envir = parent.frame())
    }

    .iterator <- do.call(.giterator, list(substitute(iterator)), envir = parent.frame())
    intervals.set.out <- do.call(.gexpr2str, list(substitute(intervals.set.out)), envir = parent.frame())

    if (!is.null(intervals.set.out)) {
        fullpath <- .gintervals.check_new_set(intervals.set.out)
    }

    # intervals can be NULL if gextract is piped with gscreen and the latter returns NULL
    success <- FALSE
    res <- NULL
    tryCatch(
        {
            if (!is.null(intervals)) {
                res <- .gcall("giterator_intervals", exprstr, intervals, .iterator, band, intervals.set.out, .misha_env())

                if (!is.null(intervals.set.out) && .gintervals.is_bigset(intervals.set.out, FALSE) && !.gintervals.needs_bigset(intervals.set.out)) {
                    .gintervals.big2small(intervals.set.out)
                }
            }

            success <- TRUE
        },
        finally = {
            if (!success && !is.null(intervals.set.out)) {
                unlink(fullpath, recursive = TRUE)
            }
        }
    )

    # refresh the list of GINTERVS, etc.
    if (!is.null(intervals.set.out)) {
        .gdb.add_intervals.set(intervals.set.out)
        retv <- 0 # suppress return value
    } else {
        res
    }
}

Try the misha package in your browser

Any scripts or data that you put into this service are public.

misha documentation built on Sept. 14, 2023, 5:08 p.m.