R/read.Mask.R
In IRanges: Foundation of integer range manipulation in Bioconductor

Documented in read.agpMask read.gapMask read.liftMask read.rmMask read.trfMask

### =========================================================================
### Read a mask from a file
### -----------------------
###
### From an NCBI "agp" file (for chrY in hs b36v3):
###   library(BSgenome.Hsapiens.NCBI.b36v3)
###   file1 <- system.file("extdata", "hs_b36v3_chrY.agp", package="IRanges")
###   mask1 <- read.agpMask(file1, seqname="chrY", mask.width=length(Hsapiens$chrY))
###
### From an UCSC "gap" file (for chrY in hg18):
###   library(BSgenome.Hsapiens.UCSC.hg18)
###   file2 <- system.file("extdata", "chrY_gap.txt", package="IRanges")
###   mask2 <- read.gapMask(file2, seqname="chrY", mask.width=length(Hsapiens$chrY))
###
### From an UCSC "lift" file (for hg18):
###   file3 <- system.file("extdata", "hg18liftAll.lft", package="IRanges")
###   mask3 <- read.liftMask(file3, seqname="chr1")
###
### From a RepeatMasker .out file (for chrM in ce2):
###   library(BSgenome.Celegans.UCSC.ce2)
###   file4 <- system.file("extdata", "ce2chrM.fa.out", package="IRanges")
###   mask4 <- read.rmMask(file4, seqname="chrM", mask.width=length(Celegans$chrM)) 
###
### From a Tandem Repeats Finder .bed file (for chrM in ce2):
###   file5 <- system.file("extdata", "ce2chrM.bed", package="IRanges")
###   mask5 <- read.trfMask(file5, seqname="chrM", mask.width=length(Celegans$chrM)) 
###
### -------------------------------------------------------------------------


.showDistinctSeqnamesAndStop <- function(seqnames)
{
    distinct_seqnames <- paste("\"", unique(seqnames), "\"", sep="")
    distinct_seqnames <- paste(distinct_seqnames, collapse=", ")
    stop(length(distinct_seqnames), " distinct seqnames found in this file: ", distinct_seqnames)
}

.newEmptyMask <- function(seqname, mask.width, mask.name, mask.desc, nofound_what="information")
{
    msg <- paste("No ", nofound_what, " found for sequence \"",
                 seqname, "\" in this file. ", sep="")
    if (is.na(mask.width))
        stop(msg, "Please use the\n",
             "  'mask.width' argument to specify the width of the empty mask to\n",
             "  return (i.e. the length of the sequence this mask will be put on).")
    warning(msg, "returning empty mask")
    ans <- Mask(mask.width)  # empty mask
    names(ans) <- mask.name
    desc(ans) <- paste(mask.desc, "(empty)")
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### read.agpMask() and read.gapMask() extract the AGAPS mask from an NCBI
### "agp" file or a UCSC "gap" file, respectively.
###

.guessGapFileCOL2CLASS <- function(file)
{
    ## UCSC "gap" files generally have the 9 columns below except for some
    ## organisms like Rhesus that have only 8 columns (no 'bin' column).
    COL2CLASS <- c(
        `bin`="integer",
        `chrom`="character",
        `chr_start`="integer",
        `chr_stop`="integer",
        `part_no`="integer",
        `part_type`="character",
        `gap_len`="integer",
        `gap_type`="character",
        `bridge`="character"
    )
    line1 <- try(read.table(file, sep="\t",
                            col.names=names(COL2CLASS),
                            colClasses=COL2CLASS,
                            nrows=1L,
                            check.names=FALSE),
                 silent=TRUE)
    if (!inherits(line1, "try-error"))
        return(COL2CLASS)
    COL2CLASS <- COL2CLASS[-1L]
    line1 <- try(read.table(file, sep="\t",
                            col.names=names(COL2CLASS),
                            colClasses=COL2CLASS,
                            nrows=1L,
                            check.names=FALSE),
                 silent=TRUE)
    if (!inherits(line1, "try-error"))
        return(COL2CLASS)
    stop("unable to guess the column names in \"gap\" file '", file, "', sorry")
}

.read.agpORgapFile <- function(agp_or_gap, file)
{
    if (agp_or_gap == "agp") {
        COL2CLASS <- c(
            `chrom`="character",
            `chr_start`="integer",
            `chr_stop`="integer",
            `part_no`="integer",
            `part_type`="character",
            `gap_len`="character",
            `gap_type`="character",
            `linkage`="character",
            `empty`="character"
        )
    } else if (agp_or_gap == "gap") {
        COL2CLASS <- .guessGapFileCOL2CLASS(file)
    } else {
        stop("read.Mask internal error: please report")
    }
    COLS <- c(
        "chrom",
        "chr_start",
        "chr_stop",
        "part_type",
        "gap_len",
        "gap_type"
    )
    COL2CLASS[!(names(COL2CLASS) %in% COLS)] <- "NULL"
    data <- read.table(file,
                       sep="\t",
                       col.names=names(COL2CLASS),
                       colClasses=COL2CLASS,
                       check.names=FALSE,
                       fill=TRUE)
}

.read.agpORgapMask <- function(agp_or_gap, file, seqname, mask.width,
                               gap.types, use.gap.types)
{
    if (!isSingleString(seqname))
        stop("'seqname' must be a single string")
    if (!isSingleNumberOrNA(mask.width))
        stop("'mask.width' must be a single integer or 'NA'")
    if (!is.integer(mask.width))
        mask.width <- as.integer(mask.width)
    if (!is.null(gap.types) && (!is.character(gap.types)
                                || S4Vectors:::anyMissing(gap.types)
                                || anyDuplicated(gap.types)))
        stop("'gap.types' must be 'NULL' or a character vector ",
             "with no NAs and no duplicated")
    if (!isTRUEorFALSE(use.gap.types))
        stop("'use.gap.types' must be TRUE or FALSE")
    data <- .read.agpORgapFile(agp_or_gap, file)
    if (seqname == "?")
        .showDistinctSeqnamesAndStop(data$chrom)
    data <- data[data$chrom == seqname, ]
    ii <- data$part_type == "N"
    if (agp_or_gap == "agp") {
        data <- data[ii, ]
    } else if (!all(ii)) {
        warning("gap file contains gaps with a part_type that is not N")
    }
    if (length(gap.types) == 1 && gap.types == "?") {
        found_types <- paste("\"", unique(data$gap_type), "\"", sep="")
        found_types <- paste(found_types, collapse=", ")
        stop("gap types found in this file for sequence \"", seqname, "\": ", found_types)
    }
    mask.name <- "AGAPS"
    mask.desc <- "assembly gaps"
    if (!is.null(gap.types)) {
        data <- data[data$gap_type %in% gap.types, ]
        mask.desc <- paste(mask.desc, " [type=", paste(gap.types, collapse="|"), "]", sep="")
    }
    if (nrow(data) == 0)
        return(.newEmptyMask(seqname, mask.width, mask.name, mask.desc, mask.desc))
    if (agp_or_gap == "agp")
        ranges_start <- data$chr_start
    else
        ranges_start <- data$chr_start + 1L
    ranges <- IRanges(start=ranges_start, width=as.integer(data$gap_len))
    ## Sanity check
    if (!identical(end(ranges), data$chr_stop))
        stop("broken \"", agp_or_gap, "\" file: contains inconsistent ",
             "chr_start/chr_stop/gap_len values ",
             "for assembly gaps in sequence \"", seqname, "\"")
    if (use.gap.types) {
        names(ranges) <- data$gap_type
        if (S4Vectors:::isNotStrictlySorted(start(ranges)))
            ranges <- ranges[base::order(start(ranges))]
        if (!isNormal(ranges))
            stop("cannot use the gap types when some gaps are adjacent or overlap")
        nir1 <- asNormalIRanges(ranges, force=FALSE)
    } else {
        nir1 <- asNormalIRanges(ranges, force=TRUE)
    }
    ## Don't use new2(): the validity of the new mask needs to be checked!
    new2("MaskCollection", nir_list=list(nir1), width=mask.width, active=TRUE,
                           NAMES=mask.name, desc=mask.desc, check=FALSE)
}

read.agpMask <- function(file, seqname="?", mask.width=NA, gap.types=NULL, use.gap.types=FALSE)
    .read.agpORgapMask("agp", file, seqname, mask.width, gap.types, use.gap.types)

read.gapMask <- function(file, seqname="?", mask.width=NA, gap.types=NULL, use.gap.types=FALSE)
    .read.agpORgapMask("gap", file, seqname, mask.width, gap.types, use.gap.types)


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### read.liftMask() extracts the AGAPS mask from a UCSC "lift" file.
###

.read.liftFile <- function(file)
{
    COL2CLASS <- c(
        `offset`="integer",
        `xxxx`="NULL",  # not sure how to call this
        `width`="integer",
        `seqname`="character",
        `seqlen`="integer"
    )
    read.table(file,
               col.names=names(COL2CLASS),
               colClasses=COL2CLASS,
               check.names=FALSE)
}

read.liftMask <- function(file, seqname="?", mask.width=NA)
{
    if (!isSingleString(seqname))
        stop("'seqname' must be a single string")
    if (!isSingleNumberOrNA(mask.width))
        stop("'mask.width' must be a single integer or 'NA'")
    if (!is.integer(mask.width))
        mask.width <- as.integer(mask.width)
    data <- .read.liftFile(file)
    if (seqname == "?")
        .showDistinctSeqnamesAndStop(data$seqname)
    data <- data[data$seqname == seqname, ]
    if (nrow(data) == 0)
        return(.newEmptyMask(seqname, mask.width, "AGAPS", "assembly gaps"))
    ## Sanity checks
    seqlen0 <- unique(data$seqlen)
    if (length(seqlen0) != 1)
        stop("broken \"lift\" file: contains different lengths ",
             "for sequence \"", seqname, "\"")
    if (!is.na(mask.width) && mask.width != seqlen0)
        stop("when supplied, 'mask.width' must match the length found ",
             "in the file for sequence \"", seqname, "\"")
    contigs0 <- IRanges(start=data$offset+1, width=data$width)
    contigs1 <- asNormalIRanges(contigs0, force=TRUE)
    if (length(contigs1) != length(contigs0))
        warning("some contigs are adjacent or overlapping")
    contigs <- Mask(seqlen0, start=start(contigs1), width=width(contigs1))
    ans <- gaps(contigs)
    names(ans) <- "AGAPS"
    desc(ans) <- "assembly gaps"
    ans
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### read.rmMask() extracts the RM mask from a RepeatMasker .out file.
###
### See http://www.repeatmasker.org/webrepeatmaskerhelp.html for a
### description of the RepeatMasker output format.
###

.read.rmFile <- function(file)
{
    COL2CLASS <- c(
        `SW_score`="integer",
        `perc_div`="numeric",
        `perc_del`="numeric",
        `perc_ins`="numeric",
        `query_sequence`="character",
        `begin_in_query`="integer",
        `end_in_query`="integer",
        `left_in_query`="character",
        `C`="character", 
        `matching_repeat`="character",
        `repeat_class_or_family`="character",
        `begin_in_repeat`="integer",
        `end_in_repeat`="integer",
        `left_in_repeat`="character",
        `ID`="character"
    )
    COLS <- c("query_sequence", "begin_in_query", "end_in_query", "ID")
    COL2CLASS[!(names(COL2CLASS) %in% COLS)] <- "NULL"
    read.table(file,
               col.names=names(COL2CLASS),
               colClasses=COL2CLASS,
               skip=3,
               check.names=FALSE)
}

read.rmMask <- function(file, seqname="?", mask.width=NA, use.IDs=FALSE)
{
    if (!isSingleString(seqname))
        stop("'seqname' must be a single string")
    if (!isSingleNumberOrNA(mask.width))
        stop("'mask.width' must be a single integer or 'NA'")
    if (!is.integer(mask.width))
        mask.width <- as.integer(mask.width)
    if (!isTRUEorFALSE(use.IDs))
        stop("'use.IDs' must be TRUE or FALSE")
    data <- .read.rmFile(file)
    if (seqname == "?")
        .showDistinctSeqnamesAndStop(data$query_sequence)
    data <- data[data$query_sequence == seqname, ]
    if (nrow(data) == 0)
        return(.newEmptyMask(seqname, mask.width, "RM", "RepeatMasker"))
    ranges <- IRanges(start=data$begin_in_query, end=data$end_in_query)
    if (use.IDs) {
        names(ranges) <- data$ID
        if (S4Vectors:::isNotStrictlySorted(start(ranges)))
            ranges <- ranges[base::order(start(ranges))]
        if (!isNormal(ranges))
            stop("cannot use the repeat IDs when some repeats are adjacent or overlap")
        nir1 <- asNormalIRanges(ranges, force=FALSE)
    } else {
        nir1 <- asNormalIRanges(ranges, force=TRUE)
    }
    ## Don't use new2(): the validity of the new mask needs to be checked!
    new2("MaskCollection", nir_list=list(nir1), width=mask.width, active=TRUE,
                           NAMES="RM", desc="RepeatMasker", check=FALSE)
}


### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### read.trfMask() extracts the TRF mask from a Tandem Repeats Finder .bed
### file.
###

.read.trfFile <- function(file)
{
    COL2CLASS <- c(
        `chrom`="character",
        `chromStart`="integer",
        `chromEnd`="integer",
        `name`="character",
        `period`="integer",
        `copyNum`="numeric",
        `consensusSize`="integer",
        `perMatch`="integer",
        `perIndel`="integer",
        `score`="integer",
        `A`="integer",
        `C`="integer",
        `G`="integer",
        `T`="integer",
        `entropy`="numeric",
        `sequence`="character"
    )
    COLS <- c("chrom", "chromStart", "chromEnd")
    COL2CLASS[!(names(COL2CLASS) %in% COLS)] <- "NULL"
    read.table(file,
               col.names=names(COL2CLASS),
               colClasses=COL2CLASS,
               check.names=FALSE)
}

read.trfMask <- function(file, seqname="?", mask.width=NA)
{
    if (!isSingleString(seqname))
        stop("'seqname' must be a single string")
    if (!isSingleNumberOrNA(mask.width))
        stop("'mask.width' must be a single integer or 'NA'")
    if (!is.integer(mask.width))
        mask.width <- as.integer(mask.width)
    data <- .read.trfFile(file)
    if (seqname == "?")
        .showDistinctSeqnamesAndStop(data$chrom)
    data <- data[data$chrom == seqname, ]
    if (nrow(data) == 0)
        return(.newEmptyMask(seqname, mask.width, "TRF", "Tandem Repeats Finder"))
    ranges <- IRanges(start=data$chromStart+1, end=data$chromEnd)
    nir1 <- asNormalIRanges(ranges, force=TRUE)
    ## Don't use new2(): the validity of the new mask needs to be checked!
    new2("MaskCollection", nir_list=list(nir1), width=mask.width, active=TRUE,
                           NAMES="TRF", desc="Tandem Repeats Finder", check=FALSE)
}

Any scripts or data that you put into this service are public.

IRanges documentation built on Dec. 14, 2020, 2 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

IRanges
Foundation of integer range manipulation in Bioconductor

R/read.Mask.R
In IRanges: Foundation of integer range manipulation in Bioconductor

Defines functions read.trfMask .read.trfFile read.rmMask .read.rmFile read.liftMask .read.liftFile read.gapMask read.agpMask .read.agpORgapMask .read.agpORgapFile .guessGapFileCOL2CLASS .newEmptyMask .showDistinctSeqnamesAndStop

Documented in read.agpMask read.gapMask read.liftMask read.rmMask read.trfMask

Try the IRanges package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

IRanges Foundation of integer range manipulation in Bioconductor

R/read.Mask.R In IRanges: Foundation of integer range manipulation in Bioconductor

Defines functions read.trfMask .read.trfFile read.rmMask .read.rmFile read.liftMask .read.liftFile read.gapMask read.agpMask .read.agpORgapMask .read.agpORgapFile .guessGapFileCOL2CLASS .newEmptyMask .showDistinctSeqnamesAndStop

Documented in read.agpMask read.gapMask read.liftMask read.rmMask read.trfMask

Try the IRanges package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

IRanges
Foundation of integer range manipulation in Bioconductor

R/read.Mask.R
In IRanges: Foundation of integer range manipulation in Bioconductor