##' methods-repeatmasker
##'
##' Description Methods for dealing with RepeatMasker output
##'
##' Details
##'
##' @name RepeatMasker methods
##'
NULL
.header <- c("score", "divergence", "deletions",
"insertions", "query_name", "query_start",
"query_end", "query_bases", "complement",
"matching_repeat", "repeat_class", "subject_start",
"subject_end", "subject_bases", "linkage_id")
.aln_header <- c("score", "divergence", "deletions",
"insertions", "query_name", "query_start",
"query_end", "query_bases", "complement",
"repeat_label", "subject_start",
"subject_end", "subject_bases", "linkage_id")
.numeric_columns <- c("score", "divergence", "deletions", "insertions", "query_start",
"query_end", "subject_start", "subject_end")
## Scan a annotation line or alignment header line. Note that they are
## slightly different in that the summary file contains an extra
## column for repeat class/family, which often is excluded in
## alignments
.scan_line <- function(line, summary = TRUE) {
d <- as.list(unlist(strsplit(line, "\\s+")))
if (summary) {
## Remove trailing asterix if present
if (length(d) == 16)
d[16] <- NULL
## Linkage ID is missing
if (length(d) == 14) d[[15]] <- NA
## Now assume 15 columns
names(d) <- .header
d$repeat_label <- paste(d$matching_repeat, d$repeat_class, sep="#")
} else {
## matching_repeat is always(?) missing; linkage_id is always
## present
if (length(d) == 13) {
## 13: complement missing
names(d) <- .aln_header[-9]
d$complement = "+"
} else if (length(d) == 14) {
## 14: complement included
names(d) <- .aln_header
} else if (length(d) == 15) {
## Assume last column is irrelevant
d <- d[1:14]
names(d) <- .aln_header
}
d$matching_repeat <- unlist(strsplit(d$repeat_label, "#"))[1]
d$repeat_class <- unlist(strsplit(d$repeat_label, "#"))[2]
}
if (d$complement == "C") {
tmp <- d$subject_bases
d$subject_bases <- d$subject_start
d$subject_start <- tmp
}
unlist(d)
}
## Process header
.process_header <- function(data) {
data[.numeric_columns] <- sapply(data[.numeric_columns], function(x) {as.numeric(as.character(x))})
data$query_bases <- as.integer(gsub("(\\(|\\))", "", as.character(data$query_bases)))
data$subject_bases <- as.integer(gsub("(\\(|\\))", "", as.character(data$subject_bases)))
data$query_length <- data$query_end + data$query_bases
data$subject_length <- data$subject_end + data$subject_bases
data
}
.create_query_subject <- function(data, qinfo=NULL) {
if (is.null(qinfo)) {
message("manually inferring seqinfo for query")
qinfo <- unique(data.frame(names = as.character(data$query_name),
lengths = data$query_length))
qinfo$names <- as.character(qinfo$names)
if (any(duplicated(qinfo$names)))
qinfo <- qinfo[!duplicated(qinfo$names),]
qinfo <- GenomeInfoDb::Seqinfo(qinfo$names, qinfo$lengths)
}
stopifnot(inherits(qinfo, "Seqinfo"))
strand <- rep("+", length(data$complement))
strand[data$complement == "C"] <- "-"
strand <- Rle(strand)
query_seq <- NULL
subject_seq <- NULL
if ("query_seq" %in% colnames(data))
query_seq <- BStringSet(data$query_seq)
if ("subject_seq" %in% colnames(data))
subject_seq <- BStringSet(data$subject_seq)
i.na <- is.na(data$query_start) | is.na(data$query_end) | is.na(data$subject_start) | is.na(data$subject_end)
if (any(i.na))
sprintf("%s ", which(i.na))
query <- AlignmentItem(
seqnames = S4Vectors::Rle(data$query_name, rep(1, length(data$query_name))),
ranges = IRanges::IRanges(start = data$query_start,
end = data$query_end),
strand = strand,
bases = data$query_bases,
sequence = query_seq,
seqinfo = qinfo)
subject <- RepeatAlignmentItem(
seqnames = S4Vectors::Rle(data$matching_repeat, rep(1, length(data$matching_repeat))),
ranges = IRanges::IRanges(start = data$subject_start,
end = data$subject_end),
strand = strand,
bases = data$subject_bases,
sequence = subject_seq,
seqinfo = NULL,
repeat_class = data$repeat_class
)
list(query = query, subject = subject)
}
##' Read RepeatMasker annotation output
##'
##' @description Parse RepeatMasker annotation output. The
##' RepeatMasker annotation output is a simple tabular format
##' where each line consists of a query-subject pair and
##' additional statistics such as score and percentage divergence.
##' \code{readRepeatMaskerAnnotation} converts each to an entry in
##' an \code{\link[ripr]{AlignmentPairs}} object.
##' \code{AlignmentPairs} has slot names \code{score},
##' \code{divergence}, \code{deletions} and \code{insertions}
##' corresponding to the first four columns in the tabular output,
##' and a slot \code{linkage_id} that maps to the last column.
##' There are also two slots \code{query} and \code{subject} that
##' correspond to the query and subject columns. These slots are
##' populated by \code{GenomicRanges::GRanges}-derived objects
##' \code{\link[ripr]{AlignmentItem}} and
##' \code{\link[ripr]{RepeatAlignmentItem}}.
##'
##' A \code{GenomeInfoDb::Seqinfo} can be passed as an option to
##' setup sequence information for the query (genome) sequence.
##' Otherwise, sequence information is inferred from the alignment
##' coordinates.
##'
##' No attempt is made to infer the sequence information for the
##' subject (repeat) for two reasons:
##'
##' 1. low-complexity repeats (e.g. polyA-sequences) have no
##' well-defined sequence length
##'
##' 2. RepeatMasker makes adjustments to alignments which are
##' recorded in the annotation output. Therefore the
##' coordinates are sometimes modified which can result in
##' inferred alignment ranges being longer than the repeat
##' sequence itself
##'
##'
##'
##' @param filename File to parse
##' @param seqinfo Seqinfo object for query (genome) sequence
##' @param ... Arguments to pass to data access functions
##'
##' @export
##' @rdname readRepeatMaskerAnnotation
##'
setGeneric("readRepeatMaskerAnnotation", function(filename, seqinfo=NULL, ...)
standardGeneric("readRepeatMaskerAnnotation"))
##'
##' @rdname readRepeatMaskerAnnotation
##' @export
##'
##' @examples
##' fn <- system.file("extdata", "repeatmasker_summary.txt",
##' package="ripr")
##' ap <- readRepeatMaskerAnnotation(fn)
##'
setMethod("readRepeatMaskerAnnotation", signature = c("character"),
definition = function (filename, seqinfo=NULL, ...) {
start_time <- Sys.time()
con <- file(filename, "r")
on.exit(close(con))
message("Reading file ", filename)
## Discard header
readLines(con, n = 3)
## Read rest of file
lines <- lapply(readLines(con), trim)
data <- as.data.frame(do.call(rbind, lapply(lines, .scan_line)))
data <- .process_header(data)
obj <- .create_query_subject(data, qinfo=seqinfo)
message("Processed ", length(lines), " lines in ",
format(Sys.time() - start_time, digits = 2))
AlignmentPairs(obj$query, obj$subject, score = data$score, divergence = data$divergence,
deletions = data$deletions, insertions = data$insertions,
linkage_id = as.character(data$linkage_id))
})
## Get next alignment chunk
.nextAlignmentChunk <- function(con, buf, chunksize = 10, regex = "^\\d+",
comment.char = "^#") {
while (TRUE) {
## Read more lines if there is no alignment in buffer
if (sum(grepl(regex, buf)) <= 1)
lines <- readLines(con, n = chunksize)
buf <- c(buf, lines)
pos <- grep(regex, buf)
eof <- FALSE
start <- pos[1]
## If no more lines were read we are at end
if (length(lines) == 0) {
end <- grep(comment.char, buf)[1] - 1
if (is.na(end))
end <- length(buf)
eof <- TRUE
} else {
end <- pos[2] - 1
}
if (!(is.na(start) | is.na(end))) {
aln <- buf[start:end]
buf[start:end] <- NULL
break
}
}
res <- list(aln, buf, eof)
names(res) <- c("aln", "buf", "eof")
res
}
.repeatMaskerAlignment <- function(aln) {
data <- as.data.frame(t(.scan_line(aln[[1]], summary = FALSE)))
data[.numeric_columns] <- sapply(data[.numeric_columns], function(x) {as.numeric(as.character(x))})
data <- .process_header(data)
## Simpler: just match alignment lines and assume odd are query,
## even are subject
aln_regex <- "^C?\\s+([^ ]+)\\s+(\\d+)\\s+([^ ]+)\\s+(\\d+).*$"
seqs <- gsub(aln_regex, "\\3", aln[grepl(aln_regex, aln)])
if (length(seqs) == 0) {
warning("No sequences in alignment chunk! Skipping!")
message(aln)
return (NULL)
}
data$query_seq <- paste0(seqs[seq(1, length(seqs), 2)], collapse = "")
data$subject_seq <- paste0(seqs[seq(2, length(seqs), 2)], collapse = "")
if (data$complement == "C")
data$subject_seq <- paste0(
rev(unlist(strsplit(data$subject_seq, ""))),
collapse = "")
data
}
##' Read repeatmasker alignment output
##'
##' @param filename File to parse
##' @param ... Arguments to pass to data access functions
##'
##' @export
##' @rdname readRepeatMaskerAlignment
##' @importFrom GenomeInfoDb Seqinfo
##'
setGeneric("readRepeatMaskerAlignment", function(filename, ...)
standardGeneric("readRepeatMaskerAlignment"))
##'
##' @rdname readRepeatMaskerAlignment
##' @export
##'
##' @param num_aln number of alignments to read
##' @param update_freq print progress message every update_freq alignment
##'
setMethod("readRepeatMaskerAlignment", signature = "character",
definition = function (filename, num_aln = NULL,
update_freq = NULL, ...) {
start_time <- Sys.time()
old_time <- start_time
con <- file(filename, "r")
on.exit(close(con))
message("Reading repeatmasker alignment file ", filename)
buf <- list()
nlines <- 0
localenv <- new.env()
localenv$counter <- 0
localenv$size <- 1
data <- data.frame()
localenv$res <- list(NULL)
## https://stackoverflow.com/questions/17046336/here-we-go-again-append-an-element-to-a-list-in-r
AddItemDoubling <- function(item) {
if (localenv$counter == localenv$size )
{
length (localenv$res) <- localenv$size <- localenv$size * 2
}
localenv$counter <- localenv$counter + 1
localenv$res[[localenv$counter]] <- item
}
while (TRUE) {
chunk <- .nextAlignmentChunk(con, buf)
nlines <- nlines + length(chunk$aln)
## Process alignment
if (!is.null(chunk$aln)) {
AddItemDoubling(chunk$aln)
if (!is.null(update_freq)) {
if (localenv$counter %% update_freq == 0) {
now_time <- Sys.time()
message("Read ", localenv$counter, " alignments in ",
format(now_time - old_time, digits = 2))
old_time <- now_time
}
}
if (!is.null(num_aln))
if (localenv$counter %% num_aln == 0)
break
}
if (chunk$eof) break
buf <- chunk$buf
}
localenv$res <- compact(localenv$res)
message("Converting alignments to data frame objects...")
data <- do.call("rbind", compact(lapply(localenv$res, .repeatMaskerAlignment)))
obj <- .create_query_subject(data)
message("Processed ", nlines, " lines in ",
format(Sys.time() - start_time, digits = 2))
AlignmentPairs(
obj$query, obj$subject, score = data$score,
divergence = data$divergence, deletions = data$deletions,
insertions = data$insertions,
linkage_id = as.character(data$linkage_id)
)
})
##' asRepeatMaskerAnnotation
##'
##' Convert AlignmentPairs or AlignmentPairsList into RepeatMasker Annotation format
##'
##'
##' @param obj AlignmentPairs or AlignmentPairsList object
##'
##' @return data frame with columns corresponding to RepeatMasker Annotation format
##'
##' @export
##' @rdname asRepeatMaskerAnnotation
##'
asRepeatMaskerAnnotation <- function(obj) {
cols <- c("score", "divergence", "deletions", "insertions",
"query.seqnames", "query.start", "query.end",
"query.bases", "query.strand", "subject.seqnames",
"subject.repeat_class", "subject.start", "subject.end",
"subject.bases", "linkage_id")
x <- as.data.frame(obj)
y <- x[, cols]
## Add parentheses to query.bases and subject.bases
y$query.bases <- paste("(", y$query.bases, ")", sep = "")
y$subject.bases <- paste("(", y$subject.bases, ")", sep = "")
## shift subject columns for inverse hits
i <- y$query.strand == "-"
tmp <- y[i, ]$subject.start
y[i, ]$subject.start <- y[i, ]$subject.bases
y[i, ]$subject.bases <- tmp
y$query.strand <- as.character(y$query.strand)
y[i, ]$query.strand <- "C"
y
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.