#' Extract per-sample read counts from one LUMPY sample column
#'
#' Parses the column of `df` at position `index`, whose entries are expected
#' to look like `"<c>/<c>:<total>:<discordant>:<split>"` (any single character
#' on each side of the slash, followed by three colon-separated runs of
#' digits), and returns the three numbers as integer columns.
#'
#' @param df A data frame containing the column to parse.
#' @param index Position of the column to parse.
#' @return A data frame with one row per row of `df` and three integer
#'   columns named `<column name>-Total`, `<column name>-Discordant` and
#'   `<column name>-Split`. Rows whose entry does not match the expected
#'   layout are `NA` in all three columns.
#' @noRd
extract_lumpy_read_count <- function(df, index) {
  pattern <- "^./.:([[:digit:]]+):([[:digit:]]+):([[:digit:]]+)$"
  col <- as.character(df[, index, drop = TRUE])
  prefix <- colnames(df)[index]
  matches <- regmatches(col, regexec(pattern, col))

  # A successful match holds the full match plus the three capture groups;
  # anything else means the entry did not match and must not be indexed.
  matched <- lengths(matches) == 4L

  capture <- function(group) {
    values <- rep(NA_integer_, length(matches))
    values[matched] <- as.integer(
      vapply(matches[matched], function(m) m[[group]], character(1))
    )
    values
  }

  out <- data.frame(capture(2L), capture(3L), capture(4L))
  colnames(out) <- paste0(prefix, c("-Total", "-Discordant", "-Split"))
  out
}
#' Returns a dataframe from a hydra-multi run
#'
#' Skips leading lines that start with `#` or `%`, counts the tab-separated
#' fields of the first remaining line, and treats every field beyond the
#' 24th as a per-sample integer column. The file is then read with
#' `readr::read_tsv()`.
#'
#' @param filename Path to the file. Names ending in `.gz` are opened with
#'   `gzfile()` for the preliminary scan.
#' @param header Unused; kept so existing calls keep working.
#' @return A data frame with columns `Lower.Location`, `Lower.Start`,
#'   `Lower.End`, `Upper.Location`, `Upper.Start`, `Upper.End`,
#'   `Lower.Strand`, `Upper.Strand` (input fields 1-6, 9 and 10) followed by
#'   `Sample1` ... `SampleN`.
#' @export
load_hydra <- function(filename, header = FALSE) {
  if (!file.exists(filename)) {
    stop("File not found")
  }

  # 0. Preliminary file exam - see how many samples we've got
  conn <- if (endsWith(filename, ".gz")) gzfile(filename) else file(filename)
  # Always release the connection, including when an error is raised below.
  on.exit(close(conn), add = TRUE)
  open(conn)

  line <- readLines(conn, 1) # read first line
  headskipcount <- 0
  # readLines() returns character(0) at end of file; stop scanning there.
  while (length(line) > 0 && grepl("^#|^%", line)) {
    line <- readLines(conn, 1)
    headskipcount <- headskipcount + 1
  }
  if (length(line) == 0) {
    stop("No data lines found in file")
  }

  nFields <- length(strsplit(line, "\t")[[1]])
  # Never negative, so a file without sample columns adds no extra columns.
  nsamples <- max(nFields - 24, 0)

  fixed_types <- c(
    list(
      readr::col_character(),         # field 1  -> Lower.Location
      readr::col_integer(),           # field 2  -> Lower.Start
      readr::col_integer(),           # field 3  -> Lower.End
      readr::col_character(),         # field 4  -> Upper.Location
      readr::col_integer(),           # field 5  -> Upper.Start
      readr::col_integer(),           # field 6  -> Upper.End
      readr::col_skip(),              # field 7
      readr::col_skip(),              # field 8
      readr::col_factor(c("+", "-")), # field 9  -> Lower.Strand
      readr::col_factor(c("+", "-"))  # field 10 -> Upper.Strand
    ),
    rep(list(readr::col_skip()), 14)  # fields 11-24
  )
  sample_types <- rep(list(readr::col_integer()), nsamples)
  coltypes <- do.call(readr::cols, c(fixed_types, sample_types))

  # NOTE(review): col_names lists only the retained columns while col_types
  # also covers the skipped ones - confirm against the installed readr.
  colnames <- c("Lower.Location",
                "Lower.Start",
                "Lower.End",
                "Upper.Location",
                "Upper.Start",
                "Upper.End",
                "Lower.Strand",
                "Upper.Strand",
                sprintf("Sample%d", seq_len(nsamples)))

  data <- readr::read_tsv(filename,
                          col_names = colnames,
                          col_types = coltypes,
                          skip = headskipcount,
                          progress = TRUE)
  as.data.frame(data)
}
#' Returns a data frame from a LUMPY bedpe output file
#'
#' Skips leading lines that start with `##`, counts the tab-separated fields
#' of the first remaining line (used by `readr::read_tsv()` as the header
#' row), and treats every field beyond the 21st as a per-sample column. Each
#' sample column is replaced by three integer read-count columns produced by
#' `extract_lumpy_read_count()`.
#'
#' @param filename Path to the file. Names ending in `.gz` are opened with
#'   `gzfile()` for the preliminary scan.
#' @return A data frame whose first eight columns are `Lower.Location`,
#'   `Lower.Strand`, `Lower.Start`, `Lower.End`, `Upper.Location`,
#'   `Upper.Strand`, `Upper.Start`, `Upper.End`, followed by the TYPE, INFO_A
#'   and INFO_B columns and then `-Total` / `-Discordant` / `-Split` columns
#'   for every sample.
#' @export
load_lumpy <- function(filename) {
  if (!file.exists(filename)) {
    stop("File not found")
  }

  # 0. Preliminary file exam - see how many samples we've got
  conn <- if (endsWith(filename, ".gz")) gzfile(filename) else file(filename)
  # Always release the connection, including when an error is raised below.
  on.exit(close(conn), add = TRUE)
  open(conn)

  line <- readLines(conn, 1) # read first line
  headskipcount <- 0
  # readLines() returns character(0) at end of file; stop scanning there.
  while (length(line) > 0 && grepl("^##", line)) {
    line <- readLines(conn, 1)
    headskipcount <- headskipcount + 1
  }
  if (length(line) == 0) {
    stop("No data lines found in file")
  }

  nFields <- length(strsplit(line, "\t")[[1]])
  # Never negative, so a file without sample columns adds no extra columns.
  nSamples <- max(nFields - 21, 0)

  ## Use what we've learnt to prepare the readr load_tsv function
  fixed_types <- list(
    readr::col_character(),                        # CHROM_A
    readr::col_integer(),                          # START_A
    readr::col_integer(),                          # END_A
    readr::col_character(),                        # CHROM_B
    readr::col_integer(),                          # START_B
    readr::col_integer(),                          # END_B
    readr::col_skip(),                             # ID
    readr::col_skip(),                             # QUAL
    readr::col_factor(c("+", "-")),                # STRAND_A
    readr::col_factor(c("+", "-")),                # STRAND_B
    readr::col_factor(c("DUP", "DEL", "BND", "INV")), # TYPE
    readr::col_skip(),                             # FILTER
    readr::col_skip(),                             # NAME_A
    readr::col_skip(),                             # REF_A
    readr::col_skip(),                             # ALT_A
    readr::col_skip(),                             # NAME_B
    readr::col_skip(),                             # REF_B
    readr::col_skip(),                             # ALT_B
    readr::col_character(),                        # INFO_A
    readr::col_character(),                        # INFO_B
    readr::col_skip()                              # FORMAT
  )
  # The 21 fixed fields plus the sample fields cover every field counted
  # above, so no trailing skip columns are needed.
  sample_types <- rep(list(readr::col_character()), nSamples)
  coltypes <- do.call(readr::cols, c(fixed_types, sample_types))

  data <- readr::read_tsv(filename,
                          col_types = coltypes,
                          skip = headskipcount,
                          progress = TRUE)
  data <- as.data.frame(data)

  colnames(data)[1:8] <- c("Lower.Location",
                           "Lower.Start",
                           "Lower.End",
                           "Upper.Location",
                           "Upper.Start",
                           "Upper.End",
                           "Lower.Strand",
                           "Upper.Strand")
  # Move each strand column to sit right after its location column.
  front <- c(1, 7, 2, 3, 4, 8, 5, 6)
  rest <- seq_len(ncol(data))[-(1:8)]
  data <- data[, c(front, rest), drop = FALSE]

  # Extract read counts: the retained columns are 8 positional, TYPE, INFO_A
  # and INFO_B, so the raw sample columns start at position 12.
  sample_idx <- 11 + seq_len(nSamples)
  if (length(sample_idx) > 0) {
    counts <- lapply(sample_idx,
                     function(i) extract_lumpy_read_count(data, i))
    data <- cbind(data[, -sample_idx, drop = FALSE], do.call(cbind, counts))
  }
  data
}
#' Returns a data frame from a brass bedpe output file
#'
#' Filters out commented lines and read ID columns. Leading lines starting
#' with `#` or `%` are skipped. The number of sample columns is inferred from
#' the first remaining line: if its last field does not parse as an integer,
#' the fields after the first eight are taken to be `nSamples` count columns
#' followed by `nSamples + 1` columns that are skipped; otherwise every field
#' after the eighth is a sample count. For uncompressed files the end of the
#' file is also scanned for trailing comment lines, which are removed from
#' the result.
#'
#' @param filename Path to the file. Names ending in `.gz` are opened with
#'   `gzfile()` for the preliminary scan.
#' @param header Unused; kept so existing calls keep working.
#' @return A data frame with columns `Lower.Location`, `Lower.Strand`,
#'   `Lower.Start`, `Lower.End`, `Upper.Location`, `Upper.Strand`,
#'   `Upper.Start`, `Upper.End`, `Sample1` ... `SampleN`, restricted to rows
#'   without missing values.
#' @export
load_brass <- function(filename, header = FALSE) {
  if (!file.exists(filename)) {
    stop("File not found")
  }

  # 0. Preliminary file exam - see how many samples we've got
  GZIPPED <- endsWith(filename, ".gz")
  conn <- if (GZIPPED) gzfile(filename) else file(filename)
  # Always release the connection, including when an error is raised below.
  on.exit(close(conn), add = TRUE)
  open(conn)

  line <- readLines(conn, 1) # read first line
  headskipcount <- 0
  # readLines() returns character(0) at end of file; stop scanning there.
  while (length(line) > 0 && grepl("^#|^%", line)) {
    line <- readLines(conn, 1)
    headskipcount <- headskipcount + 1
  }
  if (length(line) == 0) {
    stop("No data lines found in file")
  }

  split <- strsplit(line, "\t")[[1]]
  nFields <- length(split)
  if (suppressWarnings(is.na(as.integer(split[nFields])))) {
    nSamples <- (nFields - 9) / 2
  } else {
    nSamples <- nFields - 8
  }
  # Whole and never negative, so "no samples" adds no columns at all.
  nSamples <- max(floor(nSamples), 0)

  tailskipcount <- 0
  # gzfile doesn't allow seeking (gzip is a stream compressor,
  # so need to decompress everything prior to seeked-for point)
  if (!GZIPPED) {
    seek(conn, where = -5000, origin = "end")
    line <- readLines(conn, 1)
    # Advance to the first comment line found from here on, if any ...
    while (!identical(line, character(0))) {
      if (grepl("^#|^%", line)) {
        break
      }
      line <- readLines(conn, 1)
    }
    # ... then count it and every line after it up to end of file.
    while (!identical(line, character(0))) {
      tailskipcount <- tailskipcount + 1
      line <- readLines(conn, 1)
    }
  }

  ## Use what we've learnt to prepare the readr load_tsv function
  fixed_types <- list(
    readr::col_character(),
    readr::col_factor(c("+", "-")),
    readr::col_integer(),
    readr::col_integer(),
    readr::col_character(),
    readr::col_factor(c("+", "-")),
    readr::col_integer(),
    readr::col_integer()
  )
  sample_types <- rep(list(readr::col_integer()), nSamples)
  # Any field not accounted for above is skipped.
  remaining <- max(nFields - length(fixed_types) - nSamples, 0)
  skip_types <- rep(list(readr::col_skip()), remaining)
  coltypes <- do.call(readr::cols,
                      c(fixed_types, sample_types, skip_types))

  colnames <- c("Lower.Location",
                "Lower.Strand",
                "Lower.Start",
                "Lower.End",
                "Upper.Location",
                "Upper.Strand",
                "Upper.Start",
                "Upper.End",
                sprintf("Sample%d", seq_len(nSamples)))

  data <- readr::read_tsv(filename,
                          col_names = colnames,
                          col_types = coltypes,
                          skip = headskipcount,
                          progress = TRUE)

  # seq_len() keeps this an empty selection when nothing is left to keep,
  # instead of counting backwards.
  keep <- seq_len(max(nrow(data) - tailskipcount, 0))
  res <- as.data.frame(data[keep, ])
  res[complete.cases(res), ]
}
#' Load a filtered data frame
#'
#' Reads a tab-separated file with `readr::read_tsv()` using a column
#' specification derived from the number of fields on the first line: the
#' first eight columns are character, `+`/`-` factor, integer, integer,
#' character, `+`/`-` factor, integer, integer; the last four are number,
#' number, character, character; everything in between is integer.
#'
#' @param filename Path to the file to read.
#' @return A data frame holding only the rows without missing values.
#' @export
load_filtered <- function(filename) {
  # NOTE(review): load_filtered is defined a second time later in this file;
  # when both definitions are evaluated in order, the later one (which does
  # not drop incomplete rows) replaces this one.
  if (!file.exists(filename)) {
    stop("File not found")
  }

  # Peek at the first line only, to learn how many columns there are.
  peek <- file(filename)
  open(peek)
  first_line <- readLines(peek, 1)
  close(peek)
  total_cols <- length(strsplit(first_line, "\t")[[1]])

  spec <- readr::cols(
    readr::col_character(),
    readr::col_factor(c("+", "-")),
    readr::col_integer(),
    readr::col_integer(),
    readr::col_character(),
    readr::col_factor(c("+", "-")),
    readr::col_integer(),
    readr::col_integer()
  )

  # Middle block: integer columns from position 9 up to the last four.
  for (pos in 9:(total_cols - 4)) {
    spec[[1]][[pos]] <- readr::col_integer()
  }

  # Final four columns, assigned in order of position.
  trailing <- list(
    readr::col_number(),
    readr::col_number(),
    readr::col_character(),
    readr::col_character()
  )
  for (k in seq_along(trailing)) {
    spec[[1]][[total_cols - 4 + k]] <- trailing[[k]]
  }

  parsed <- as.data.frame(readr::read_tsv(filename, col_types = spec))
  is_complete <- complete.cases(parsed)
  parsed[is_complete, ]
}
#' Load a filtered data frame
#'
#' Reads a tab-separated file with `readr::read_tsv()` using a column
#' specification derived from the number of fields on the first line: the
#' first eight columns are character, `+`/`-` factor, integer, integer,
#' character, `+`/`-` factor, integer, integer; the last four are number,
#' number, character, character; everything in between is integer.
#'
#' @param filename Path to the file to read.
#' @return A data frame with every row that was read.
#' @export
load_filtered <- function(filename) {
  # NOTE(review): an earlier definition of load_filtered in this file also
  # drops rows with missing values; this later definition is the one left
  # bound when both are evaluated in order.
  if (!file.exists(filename)) {
    stop("File not found")
  }

  # Peek at the first line only, to learn how many columns there are.
  peek <- file(filename)
  open(peek)
  first_line <- readLines(peek, 1)
  close(peek)
  total_cols <- length(strsplit(first_line, "\t")[[1]])

  spec <- readr::cols(
    readr::col_character(),
    readr::col_factor(c("+", "-")),
    readr::col_integer(),
    readr::col_integer(),
    readr::col_character(),
    readr::col_factor(c("+", "-")),
    readr::col_integer(),
    readr::col_integer()
  )

  # Middle block: integer columns from position 9 up to the last four.
  for (pos in 9:(total_cols - 4)) {
    spec[[1]][[pos]] <- readr::col_integer()
  }

  # Final four columns, assigned in order of position.
  trailing <- list(
    readr::col_number(),
    readr::col_number(),
    readr::col_character(),
    readr::col_character()
  )
  for (k in seq_along(trailing)) {
    spec[[1]][[total_cols - 4 + k]] <- trailing[[k]]
  }

  parsed <- readr::read_tsv(filename, col_types = spec)
  as.data.frame(parsed)
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.