#' Extract the relevant columns from the files
#'
#' @param files The path to tsv files for methylation status
#' @param chr_idx The index of column for chromosome
#' @param pos_idx The index of column for position
#' @param met_idx The index of column for the number of methylated reads
#' @param id The id to each file
#' @param coverage_idx The index of column containing coverage; optional
#' @param unmet_idx The index of column for the number of unmethylated reads
#' @param strand_idx The index of column containing strand; optional
#' @param deduplicate Whether to sum the reads for rows with the same genomic location
#' @param header Whether to there is a header
#' @param sep delimiter for methylation status file
#' @importFrom S4Vectors Rle DataFrame SimpleList
#' @import dplyr
#' @export
read_meth <- function(files, chr_idx, pos_idx, met_idx, id, coverage_idx = NULL, strand_idx = NULL,
unmet_idx = NULL, deduplicate = T, header = T, sep = '\t') {
meth_list <- SimpleList()
p <- dplyr::progress_estimated(length(files))
for (i in seq_along(files)) {
data <- read.delim(files[i], sep = sep, header = header, stringsAsFactors = F)
if (!is.null(strand_idx)) {
strand <- data[, strand_idx]
} else {
strand <- rep("*", nrow(data))
}
if (!is.null(coverage_idx)) coverage <- data[, coverage_idx]
if (!is.null(unmet_idx)) coverage <- data[, met_idx] + data[, unmet_idx]
data <- data.frame(chr = gsub("chr", "", data[, chr_idx]),
position = data[, pos_idx],
strand = strand,
met_reads = data[, met_idx],
coverage = coverage)
if (deduplicate)
data <- dplyr::distinct(data) %>%
group_by(chr, position, strand) %>%
slice(which.max(coverage))
meth <- new("methCall",
data = with(data,
DataFrame(
chr = Rle(chr),
position = Rle(position),
strand = Rle(strand),
met_reads = Rle(met_reads),
coverage = Rle(coverage))))
meth_list[[id[i]]] <- meth
p$pause(0.1)$tick()$print()
}
return(meth_list)
}
#' Join a list of methCall object by site
#'
#' @param meth_list A named list of methCall object (from read_meth)
#' @param all.x Whether to include all rows in x
#' @param all.y Whether to include all rows in y
#' @return A DataFrame that is a inner join of all the methCall
#' @export
join_meth_list <- function(meth_list, all.x = FALSE, all.y = FALSE) {
joined <- meth_list[[1]]@data
colnames(joined)[4:5] <- paste0(colnames(joined)[4:5], ".", names(meth_list)[1])
for (i in seq_along(meth_list)[-1]) {
df <- meth_list[[i]]@data
colnames(df)[4:5] <- paste0(colnames(df)[4:5], ".", names(meth_list)[i])
joined <- SparkR::merge(joined, df, by = c("chr", "position", "strand"),
all.x = all.x, all.y = all.y)
}
return(joined)
}
truncate_outlier <- function(x, a = 3) {
iqr <- IQR(x)
median <- median(x)
x[x > median + a * iqr] <- median + a * iqr
x[x < median - a * iqr] <- median - a * iqr
x
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.