# Tidy CpGs Functions ------------------------------------------------
#' Get tidy cpgs from a track
#'
#' @param track name of track
#' @param intervals intervals
#' @param only_tcpgs use also tracks that have only tidy_cpgs (without cov,meth,unmeth tracks)
#'
#' @return
#' @export
#'
#' @examples
gpatterns.get_tidy_cpgs <- function(track,
intervals = NULL,
only_tcpgs = TRUE){
if (!gpatterns.track_exists(track) & !only_tcpgs){
stop('track does not exists')
}
files <- .gpatterns.tidy_cpgs_files(track)
if (length(files) == 0){
stop('no tidy cpgs found')
}
.get_tcpgs <- function(files){
res <- files %>%
map_df(function(f) fread(
qq('gzip -d -c @{f}'),
colClasses = c(
read_id = 'character',
chrom = 'character',
start = 'numeric',
end = 'character',
strand = 'character',
umi1 = 'character',
umi2 = 'character',
insert_len = 'numeric',
# num = 'numeric',
cg_pos = 'numeric',
meth = 'character',
qual = 'numeric')) %>%
as_tibble %>%
mutate(cg_pos = cg_pos, meth = ifelse(meth == 'Z', 1, 0)))
if (0 == nrow(res)){
return(NULL)
}
return(res)
}
.intervals2files <- function(intervals, files) {
.gpatterns.intervals2files(intervals=intervals, files=files, dir = paste0(.gpatterns.base_dir(track), '/tidy_cpgs/'))
}
if (!is.null(intervals)){
tidy_intervals <- .gpatterns.get_tidy_cpgs_intervals(track)
if (!is.character(intervals)){
if (all(
paste(intervals$chrom, intervals$start, intervals$end) %in%
paste(tidy_intervals$chrom, tidy_intervals$start, tidy_intervals$end))
){
return(.intervals2files(intervals, files) %>% .get_tcpgs)
}
}
tcpgs <- tidy_intervals %>%
gintervals.filter(intervals) %>%
.intervals2files(files) %>%
.get_tcpgs()
f <- tcpgs %>%
select(chrom, start=cg_pos) %>%
mutate(end = start+1) %>%
gintervals.neighbors1(intervals) %>%
mutate(f = dist == 0) %>% .$f
return(tcpgs %>% filter(f))
}
return(.get_tcpgs(files))
}
#' Apply a function on tidy cpgs (internally separated by coordinates)
#'
#' @param track name of track
#' @param f function to apply (needs to return a data frame)
#' @param split_by_bin apply on each genomic bin separately
#' @param parallel execute parallely
#' @param use_sge use sge cluster for parallelization
#' @param max_jobs maximal number of jobs for sge
#' @param verbose print jobs status
#' @param ... additonal parameters for gcluster.run2
#'
#' @return a data frame with \code{f} applied to every genomic bin of tidy_cpgs
#' of \code{track}
#'
#' @export
#'
#' @examples
gpatterns.apply_tidy_cpgs <- function(track,
f,
split_by_bin = TRUE,
parallel = getOption("gpatterns.parallel"),
use_sge = FALSE,
max_jobs = 300,
verbose = FALSE,
...){
# make sure that empty intervals would return NULL
f_null <- function(track, intervals=NULL){
tcpgs <- gpatterns.get_tidy_cpgs(track, intervals=intervals)
if (is.null(tcpgs) || nrow(tcpgs) == 0){
return(NULL)
}
return(f(tcpgs))
}
if (!split_by_bin){
return(f_null(track))
}
coords <- .gpatterns.get_tidy_cpgs_intervals(track)
if (use_sge){
commands <- plyr::alply(coords, 1, function(cr)
qq('f_null(track,
intervals = gintervals("@{cr$chrom}", @{cr$start}, @{cr$end}))') %>%
gsub('\n', ' ', .))
res <- gcluster.run2(command_list = commands,
max.jobs = max_jobs,
debug = verbose,
packages = c('stringr', 'gpatterns'),
jobs_title = 'gpatterns.apply_tidy_cpgs',
collapse_results = FALSE,
...)
res <- map(res, function(x) x$retv)
} else {
res <- plyr::alply(coords, 1, function(cr) {
f_null(track, intervals=cr)
}, .parallel=parallel,
.progress='text')
}
tryCatch(res <- res %>% compact %>% map_df(~ .),
error=function(e) {
message('wasn\'t able to collate_results, returning raw output')
})
return(res)
}
#' @export
.gpatterns.get_tidy_cpgs_intervals <- function(track=NULL, path=NULL){
stopifnot(!is.null(track) || !is.null(path))
if (!is.null(track)){
files <- .gpatterns.tidy_cpgs_files(track)
}
if (!is.null(path)){
files <- list.files(path, full.names=TRUE, pattern='tcpgs.gz')
}
files %>%
basename %>%
gsub('\\.tcpgs\\.gz$', '', .) %>%
strsplit('_') %>%
map_df(function(x) tibble(chrom=x[1], start=as.numeric(x[2]), end=as.numeric(x[3])))
}
# Pattern generation Functions ------------------------------------------------
#' Calculate pattern coverage for each CpG
#'
#' @param track name of track
#' @param pat_len pattern length
#' @param max_span maximum span to look for CpGs
#' @param by_coord_bin execute separatly for each coordinates bin
#' (for tracks that do not fit into memory)
#' @param parallel execute parallely
#'
#' @return
#' @export
#'
#' @examples
gpatterns.get_pat_cov <- function(track,
pat_len,
max_span=500,
by_coord_bin=TRUE,
parallel = getOption("gpatterns.parallel")){
if (by_coord_bin){
return(track %>%
gpatterns.apply_tidy_cpgs(
function(x) .gpatterns.tidy_cpgs_2_pat_cov(x,
pat_len=pat_len,
max_span=max_span),
parallel=FALSE))
} else {
return(gpatterns.get_tidy_cpgs(track) %>%
.gpatterns.tidy_cpgs_2_pat_cov(pat_len=pat_len,
max_span=max_span))
}
}
#' Extracts patterns given pattern space
#'
#' @param track name of track
#' @param pattern_space pattern space data frame
#' (output of gpatterns.tracks_to_pat_space or gpatterns.intervs_to_pat_space)
#' needs to have the following fields: chrom,start,end,fid
#' @param pattern_space_file file that holds the pattern space
#' @param max_missing maximal number of missing data ('*') in pattern
#' @param min_cov minimal pattern coverage
#' @param by_coord_bin execute separatly for each coordinates bin
#' (for tracks that do not fit into memory)
#' @param ... additional parameters to gpatterns.apply_tidy_cpgs
#' @return
#' @export
#'
#' @examples
gpatterns.tidy_cpgs_to_pats <- function(track,
pattern_space = NULL,
pattern_space_file = NULL,
max_missing = Inf,
min_cov = 1,
by_coord_bin=TRUE,
parallel = getOption("gpatterns.parallel"),
...){
if (!is.null(pattern_space_file)){
pattern_space <- fread(pattern_space_file)
} else if (is.null(pattern_space)){
stop('Please provide either pattern_space or pattern_space_file')
}
pat_space <- pattern_space %>%
group_by(fid) %>%
mutate(pat_pos=1:n()) %>%
select(fid, chrom, cg_pos=start, pat_pos) %>%
ungroup
max_pat_len <- max(pat_space$pat_pos)
.cpgs_to_pats <- function(cpgs){
cpgs <- cpgs %>% select(read_id, chrom, meth, cg_pos)
pats <- cpgs %>%
inner_join(pat_space, by=c('chrom', 'cg_pos'))
if (nrow(pats) == 0){
return(tibble(read_id=character(), fid=character(), pat=character()))
}
pats <- pats %>%
reshape2::dcast(read_id + fid ~ pat_pos, value.var='meth', fill='*') %>%
as_tibble
for (i in paste(1:max_pat_len)){
if(!(i %in% colnames(pats))){
pats[[i]] <- '*'
}
}
pats <- pats %>%
unite_('pat', paste(1:max_pat_len), sep='') %>%
mutate(n_na = stringr::str_count(pat, '\\*')) %>%
filter(n_na <= max_missing) %>%
select(-n_na)
return(pats)
}
if (by_coord_bin){
pats <- track %>% gpatterns.apply_tidy_cpgs(function(x) .cpgs_to_pats(x),
parallel=parallel, ...)
} else {
pats <- track %>% gpatterns.get_tidy_cpgs() %>% .cpgs_to_pats()
}
pats <- pats %>%
group_by(fid) %>%
filter(n() >= min_cov) %>%
select(fid, pattern=pat, read_id) %>%
ungroup %>%
arrange(fid, pattern, read_id)
return(pats)
}
#' Get pattern space of a track
#'
#' @param track track name
#'
#' @return pattern space of a track
#'
#' @export
gpatterns.track_pat_space <- function(track) {
intervs <- .gpatterns.pat_space_intervs_name(track)
if (gintervals.exists(intervs)){
return(gintervals.load(intervs))
} else {
stop("pattern space doesn't exist. Please use gpatterns.tracks_to_pat_space or gpatterns.intervs_to_pat_space to create it")
}
}
#' Generate pattern space
#'
#' @param tracks tracks
#' @param pat_len pattern length
#' @param max_span maximum span to look for CpGs
#' @param exclude_intervals intervals to exclude
#' @param exclude_cpgs CpGs to exclude
#' @param min_cov minimal coverage per CpG
#'
#' @return
#' @export
#'
#' @examples
gpatterns.tracks_to_pat_space <- function(tracks,
pat_len, max_span=NULL,
exclude_intervals=NULL,
exclude_cpgs=NULL,
min_cov=1){
if (all(gtrack.exists(.gpatterns.pat_cov_track_name(tracks, pat_len)))){
cov_tracks <- .gpatterns.pat_cov_track_name(tracks, pat_len)
} else {
cov_tracks <- .gpatterns.cov_track_name(tracks)
}
covered_cpgs <- gscreen(
paste(sprintf("!is.na(%s)", cov_tracks), collapse=' | '),
intervals = .gpatterns.genome_cpgs_intervals,
iterator = .gpatterns.genome_cpgs_intervals)
expr <- sprintf("sum_ignore_na_all(%s)", paste(cov_tracks, collapse=','))
covs <- gextract(expr,
intervals = covered_cpgs,
iterator = covered_cpgs,
colnames = 'cov') %>%
select(-intervalID) %>%
filter(cov >= min_cov)
if (!is.null(exclude_cpgs)){
covs <- covs %>% anti_join(exclude_cpgs, by=c('chrom', 'start', 'end'))
}
if (!is.null(exclude_intervals)){
covs <- covs %>%
ungroup %>%
gintervals.neighbors1(exclude_intervals) %>%
filter(dist != 0) %>%
select(-(chrom1:dist))
}
if (!is.null(max_span)){
covs <- covs %>% filter(abs(end - lead(start, 1)) <= max_span)
}
space <- covs %>%
arrange(chrom, start, end) %>%
group_by(chrom) %>%
filter(n() >= pat_len) %>%
slice(1:(n() - n() %% pat_len))
space <- space %>% ungroup %>% arrange(chrom, start, end) %>%
mutate(fid = map(1:(nrow(space)/pat_len), ~ rep(., pat_len)) %>% combine)
return(space %>% ungroup %>% select(chrom, start, end, fid))
}
.gpatterns.max_pat_cov_intervs <- function(expr, intervals, iterator=.gpatterns.genome_cpgs_intervals, pat_len=5, min_cov = 1, mode='pat_cov', adjacent = TRUE, nchunks= getOption('gpatterns.parallel.thread_num'), parallel=getOption('gpatterns.parallel'), verbose=FALSE){
old_opt <- options()
options(gmultitasking=FALSE)
get_max_cov <- function(covs){
covs <- covs %>%
group_by(chrom1, start1, end1) %>%
filter(n() >= pat_len)
if (adjacent){
if ('start.orig' %in% colnames(covs)){
# make sure that we are covering the original interval
covs <- covs %>%
arrange(chrom1, start1, end1, chrom, start, end) %>%
mutate(cg_num=1:n()) %>%
filter(sum(chrom == chrom1 & start == start.orig & end == end.orig) > 0) %>%
mutate(pos_cg_num=which(chrom == chrom1 & start == start.orig & end == end.orig)) %>%
filter(abs(cg_num - pos_cg_num) < pat_len) %>%
select(-cg_num, -pos_cg_num)
}
space <- covs %>%
group_by(chrom1, start1, end1) %>%
arrange(chrom1, start1, end1, chrom, start, end)
if (mode == 'cov'){
# calculate the total coverage of adjacent CpGs
space <- space %>%
mutate(m = zoo::rollmean(cov, pat_len, align='left', fill=NA))
} else {
# coverage of adjacent CpGs was already calculated in 'pat_cov' tracks
space <- space %>%
mutate(m = cov)
}
space <- space %>%
filter(max(m, na.rm=T) >= min_cov) %>%
mutate(cg_num=1:n(), chosen_start=which.max(m)) %>%
filter(cg_num %in% seq(chosen_start[1], chosen_start[1] + pat_len - 1)) %>%
select(chrom, start, end, chrom1, start1, end1)
} else {
space <- covs %>%
group_by(chrom1, start1, end1) %>%
filter(max(cov) >= min_cov) %>%
top_n(pat_len, cov) %>%
slice(1:pat_len) %>% #remove extra rows if there are ties
ungroup
}
return(space)
}
res <- gdply(get_max_cov, expr, intervals = intervals, iterator = iterator, nchunks=nchunks, parallel=parallel, gfunc = gextract.left_join, colnames='cov', verbose=verbose)
options(old_opt)
return(res)
}
#' Generate pattern space for specific genomic loci
#'
#' @param tracks tracks
#' @param intervals genomic loci to cover (chrom,start,end), with an optional field
#' 'fid'.
#' @param pat_len pattern length
#' @param adjacent generate patterns that do not skip a CpG
#' @param add_rest add rest of the genome (using gpatterns.tracks_to_pat_space)
#' @param min_cov if adjacent: minimal pattern coverage for interval. else - minimal CpG coverage
#' @param max_span maximum span to look for CpGs
#' @param expand_intervals expand intervals by max_span when intervals are single CpG loci
#' @param parallel override global gpatterns.parallel definition
#' @param nchunks override global gpatterns.parallel.thread_num definition
#' @param verbose print additional progress messages
#'
#' @return
#' @export
#'
#' @examples
gpatterns.intervs_to_pat_space <- function(tracks,
intervals,
pat_len,
adjacent=TRUE,
add_rest=FALSE,
min_cov=1,
max_span=500,
expand_intervals = FALSE,
nchunks= getOption('gpatterns.parallel.thread_num'),
parallel=getOption('gpatterns.parallel'),
verbose = FALSE){
if (class(intervals) == 'character'){
intervals <- gintervals.load(intervals) %>% as_tibble
}
if (expand_intervals){
exp_intervs <- intervals %>%
mutate(start.orig=start, end.orig=end)
exp_intervs <- exp_intervs %>% gintervals.expand(max_span)
} else {
exp_intervs <- intervals
}
if (adjacent){
cov_tracks <- .gpatterns.pat_cov_track_name(tracks, pat_len)
cov_tracks_exist <- gtrack.exists(cov_tracks)
# if (any(!cov_tracks_exist)){
# warning(sprintf('pat_cov track do not exist for %s.\n using ', cov_tracks[!cov_tracks_exist]))
# }
if (all(cov_tracks_exist)){
mode <- 'pat_cov'
} else {
mode <- 'cov'
}
} else {
mode <- 'cov'
}
if (mode == 'cov'){
cov_tracks <- .gpatterns.cov_track_name(tracks)
}
expr <- sprintf('sum_ignore_na_all(%s)', paste(cov_tracks, collapse=','))
message('finding most covered adjacent CpGs...')
space <- .gpatterns.max_pat_cov_intervs(expr, exp_intervs, iterator=.gpatterns.genome_cpgs_intervals, pat_len=pat_len, min_cov = min_cov, verbose=verbose, mode = mode, adjacent=adjacent, parallel=parallel, nchunks=nchunks)
if ('fid' %in% colnames(exp_intervs) || 'FID' %in% colnames(exp_intervs)){
fid_col <- which(tolower(colnames(exp_intervs)) == 'fid')
colnames(exp_intervs)[fid_col] <- colnames(exp_intervs)[fid_col] %>% tolower()
space <- space %>%
select(chrom, start, end) %>%
gintervals.neighbors1(exp_intervs %>% select(chrom, start, end, fid)) %>%
filter(dist == 0) %>%
select(chrom, start, end, fid)
} else {
space <- space %>%
ungroup %>%
arrange(chrom, start, end) %>%
mutate(fid = map(1:(nrow(space)/pat_len), ~ rep(., pat_len)) %>% combine) %>%
ungroup %>%
select(chrom, start, end, fid)
}
if (add_rest){
fid_intervs <- space %>%
group_by(fid) %>%
summarise(chrom=first(chrom), start = min(start), end=max(end)) %>%
select(-fid) %>%
ungroup
space_rest <- gpatterns.tracks_to_pat_space(tracks,
pat_len,
max_span,
exclude_intervals=fid_intervs,
exclude_cpgs=space %>%
ungroup %>%
select(chrom, start, end)
)
space_rest <- space_rest %>% mutate(fid = paste0(fid, 'R'))
space <- space %>%
mutate(fid = paste(fid)) %>%
bind_rows(space_rest)
space <- space %>%
arrange(chrom, start, end) %>%
mutate(fid = map(1:(nrow(space)/pat_len), ~ rep(., pat_len)) %>% combine)
}
return(space %>% ungroup)
}
.gpatterns.tidy_cpgs_2_pat_cov <- function(calls, pat_len, max_span){
calls <- calls %>%
mutate(start = cg_pos, end = start+1) %>%
select(read_id, chrom, start, end)
cgs <- gextract.left_join(.gpatterns.genome_cpgs_track,
intervals= calls %>%
distinct(chrom, start, end) %>%
mutate(start.orig=start, end.orig=end, end=end+max_span) %>%
as.data.frame %>%
gintervals.force_range(),
iterator=.gpatterns.genome_cpgs_intervals,
colnames='CG') %>% select(-CG)
cgs <- cgs %>%
group_by(chrom1, start1, end1) %>%
slice(1:pat_len) %>%
mutate(pid=paste0(min(start), '_', max(end)))
cgs <- cgs %>% left_join(calls, by=c('chrom', 'start', 'end'))
valid_cg_ids <- cgs %>% filter(start == start.orig, end == end.orig) %>% ungroup %>% select(pid, read_id)
res <- cgs %>%
inner_join(valid_cg_ids, by=c('pid', 'read_id')) %>%
filter(!(start1 == start.orig & end == end.orig)) %>%
summarise(pat_cov = n()) %>%
rename(chrom=chrom1, start=start1, end=end1) %>% ungroup %>%
mutate(end = end - max_span)
res <- calls %>% distinct(chrom, start, end) %>% left_join(res, by=c('chrom', 'start', 'end')) %>% replace_na(list(pat_cov=0))
return(res)
}
#' Generate CpG pattern frequency from tidy_cpgs
#'
#' @param calls tidy_cpgs
#' @param pat_length length of the pattern, e.g. for pat_lenth == 2 patterns would
#' be 00,01,10,11
#' @param min_cov minimal coverage
#' @param tidy tidy
#'
#' @return
#' @export
#'
#' @examples
gpatterns.tidy_cpgs_2_pat_freq <- function(calls, pat_length = 2, min_cov = 1, tidy=TRUE){
gen_pats <- function(calls, min_cov, pat_length=2){
add_pat <- function(calls, cls, l, min_cov=1){
#for each position get the l'th position after it in the read
cls <- cls %>%
left_join(calls %>%
mutate(next_pos = lead(start, l),
next_meth = lead(meth, l)) %>%
select(chrom, start, end, read_id, next_pos, next_meth),
by=c('chrom', 'start', 'end', 'read_id') )
#get the genomic CG after the last position
cls <- cls %>%
bind_cols(cls %>%
select(chrom, start=lastpos) %>%
mutate(end = start + 1) %>%
gintervals.neighbors1(.gpatterns.genome_next_cpg_intervals) %>%
select(dist, nextcg)) %>%
filter(dist == 0)
#add another CG only if the next cg in the read is the next in the genome
cls <- cls %>%
filter(nextcg == next_pos) %>%
unite('meth', ends_with('meth'), sep='') %>%
mutate(lastpos = nextcg) %>%
select(chrom, start, end, read_id, meth, lastpos)
return(cls)
}
# message(qq('cpg num: 1'))
cls <- calls %>% mutate(lastpos = start)
for (i in 1:(pat_length-1)){
# message(qq('cpg num: @{i+1}'))
if (nrow(calls) > 0 && nrow(cls) > 0){
cls <- add_pat(calls, cls, i, min_cov=min_cov)
}
}
cls <- cls %>% select(chrom, start, end, pat=meth)
pats_dist <- cls %>%
group_by(chrom, start, end, pat) %>%
summarise(n_pat = n()) %>%
group_by(chrom, start, end) %>%
filter(sum(n_pat) >= min_cov) %>%
mutate(p_pat = n_pat / sum(n_pat)) %>%
ungroup %>%
mutate(pat = factor(pat))
return(pats_dist)
}
fill_columns <- function(pats_dist, pat_length){
possible_pats <- apply(expand.grid(rep(list(0:1), pat_length)), 1,
function(x) paste0(x, collapse='')) %>%
paste0('pat_', .)
for (column in possible_pats){
if (!(column %in% colnames(pats_dist))){
if (nrow(pats_dist) == 0){
pats_dist[, column] <- numeric()
} else {
pats_dist[, column] <- 0
}
}
}
return(pats_dist[, c('chrom', 'start', 'end', possible_pats)])
}
calls <- calls %>%
arrange(read_id, cg_pos)
calls <- calls %>%
mutate(start = cg_pos, end = start + 1) %>%
select(chrom, start, end, read_id, meth)
# message('generating pats...')
pats_dist <- gen_pats(calls, min_cov=min_cov, pat_length=pat_length)
if (nrow(pats_dist) == 0){
return(fill_columns(pats_dist[, c('chrom', 'start', 'end')], pat_length))
}
if (tidy){
pats_dist <- pats_dist %>%
tidyr::complete(nesting(chrom, start, end), pat, fill=list(p_pat=0, n_pat=0))
} else {
pats_dist <- pats_dist %>%
select(-p_pat) %>%
mutate(pat = paste0('pat_', pat)) %>% spread('pat', 'n_pat')
pats_dist[is.na(pats_dist)] <- 0
pats_dist <- fill_columns(pats_dist, pat_length)
}
return(pats_dist)
}
#' Generate pileup form tidy_cpgs
#'
#' @param calls tidy_cpgs
#' @param dsn downsampling n
#'
#' @return data frame with chrom,start,end,meth,unmeth,cov,avg
#' @export
#'
#' @examples
gpatterns.tidy_cpgs_2_pileup <- function(calls, dsn = NULL){
pileup <- calls %>%
select(chrom, start=cg_pos, call=meth) %>%
mutate(call = ifelse(call == 1, 'meth', 'unmeth'))
if (!is.null(dsn)) {
pileup <- pileup %>%
group_by(chrom, start) %>%
filter(n() >= dsn) %>%
sample_n(dsn)
}
pileup <- pileup %>%
group_by(chrom, start, call) %>%
summarise(n = n()) %>%
mutate(end = start + 1) %>%
select(chrom, start, end, call, n) %>%
spread('call', 'n', fill=0)
if (!('meth' %in% colnames(pileup))){
pileup$meth <- 0
}
if (!('unmeth' %in% colnames(pileup))){
pileup$unmeth <- 0
}
pileup <- pileup %>%
ungroup() %>%
mutate(cov = meth + unmeth, avg = meth / cov) %>%
select(chrom, start, end, meth, unmeth, cov, avg)
return(pileup)
}
# Plotting Functions ------------------------------------------------
.gpatterns.get_cg_cor_adjacent <- function(track, dist_breaks, intervals=gintervals.all()){
a <- gtrack.array.extract(paste0(track, '.pat2'),
slice = NULL,
intervals = intervals) %>%
select(-intervalID) %>%
as_tibble %>%
gintervals.neighbors1(.gpatterns.genome_next_cpg_intervals) %>%
select(-(chrom1:end1), dist) %>%
mutate(dist = abs(nextcg - start))
a <- a %>%
mutate(dist = cut(dist, breaks=dist_breaks, include.lowest=T)) %>%
group_by(dist) %>%
summarise_at(vars(starts_with('pat_')), sum) %>%
mutate(N = pat_00 + pat_10 + pat_01 + pat_11,
Ex = (pat_10 + pat_11) / N,
Ey = (pat_01 + pat_11) / N,
Exy = pat_11 / N,
sdx = (Ex - Ex^2)^0.5,
sdy = (Ey - Ey^2)^0.5,
cr = (Exy - Ex *Ey) / (sdx * sdy))
return(a %>% mutate(track=track) %>% select(track, dist, cr))
}
.gpatterns.get_cg_cor <- function(track, dist_breaks, intervals, parallel = getOption('gpatterns.parallel'), ...){
get_cor <- function(tcpgs){
tcpgs <- tcpgs %>% group_by(read_id) %>% mutate(n = n())
max_cgs <- max(tcpgs$n)
crs <- 1:(max_cgs-1) %>% map_df(function(i) {
tcpgs %>%
select(read_id, n, cg_pos, meth) %>%
filter(n > i) %>%
mutate(meth2 = lead(meth, i),
pos2 = lead(cg_pos, i),
dist = abs(pos2 - cg_pos)) %>%
filter(!is.na(dist)) %>%
mutate(dist = cut(dist, breaks=dist_breaks, include.lowest=T)) %>%
group_by(dist, meth1=meth, meth2) %>%
summarise(n = n())
})
crs <- crs %>% group_by(dist, meth1, meth2) %>% summarise(n = sum(n)) %>% ungroup
return(crs)
}
cors <- gpatterns.apply_tidy_cpgs(track, function(x) get_cor(x), intervals=intervals, parallel=parallel, ...)
cors <- cors %>%
group_by(dist, meth1, meth2) %>%
summarise(n = sum(n)) %>%
ungroup %>%
mutate(pat = paste0('pat_', meth1, meth2)) %>%
select(dist, pat, n) %>%
group_by(dist) %>%
spread(pat, n) %>%
mutate(N = pat_00 + pat_10 + pat_01 + pat_11,
Ex = (pat_10 + pat_11) / N,
Ey = (pat_01 + pat_11) / N,
Exy = pat_11 / N,
sdx = (Ex - Ex^2)^0.5,
sdy = (Ey - Ey^2)^0.5,
cr = (Exy - Ex *Ey) / (sdx * sdy))
return(cors %>% mutate(track = track) %>% select(track, dist, cr))
}
#' Plot CpG correlation (pearson) as a function of distance between CpGs
#'
#' @param tracks tracks to plot
#' @param dist_breaks distance breaks
#' @param intervals genomic scope for which the function is applied
#' @param adjacent plot correlation only between adjacent CpGs (faster)
#' @param names alternative names for the track
#' @param width plot width (if fig_fn is not NULL)
#' @param height plot height (if fig_fn is not NULL)
#' @param fig_fn output filename for the figure (if NULL, figure would be returned)
#' @param xlab label for the x axis
#' @param ylab label for the y axis
#' @param colors custom colors
#' @param parallel parallel
#' @param ...
#'
#' @return list with trend data frame (under 'trend') and the plot (under 'p')
#' @export
#'
#' @examples
gpatterns.plot_cg_cor <- function(tracks,
dist_breaks,
intervals = gintervals.all(),
adjacent = FALSE,
names = NULL,
width = 500,
height = 260,
fig_fn = NULL,
xlab = 'Distance between CpGs',
ylab = 'CpG correlation',
colors = NULL,
parallel = getOption('gpatterns.parallel'),
...){
names <- names %||% tracks
intervals <- .gpatterns.get_intervals(intervals)
if (adjacent){
cors <- tracks %>% plyr::adply(1, function(track)
.gpatterns.get_cg_cor_adjacent(track, dist_breaks, intervals),
.parallel=parallel)
} else {
cors <- tracks %>% map_df(function(track) {
message(qq('calculating for @{track}'))
.gpatterns.get_cg_cor(track, dist_breaks=dist_breaks, intervals=intervals, parallel=parallel, ...)
})
}
cors <- cors %>% mutate(dist_numeric = dist)
breaks_numeric <- zoo::rollmean(dist_breaks, k=2)
levels(cors$dist_numeric) <- breaks_numeric
cors <- cors %>%
mutate(dist_numeric = as.numeric(as.character(dist_numeric))) %>%
left_join(tibble(track=tracks, samp=names), by='track') %>%
select(samp, dist, dist_numeric, cr) %>%
as_tibble
if (length(tracks) == 1){
p <- cors %>% ggplot(aes(x=dist_numeric, y=cr, group=1)) + geom_line() + xlab(xlab) + ylab(ylab)
} else {
p <- cors %>% ggplot(aes(x=dist_numeric, y=cr, color=samp, group=samp)) + geom_line() + xlab(xlab) + ylab(ylab) + scale_color_discrete(name='')
}
if (!is.null(colors)){
p <- p + scale_color_manual(values=colors)
}
if (!is.null(fig_fn)){
png(fig_fn, width = width, height = height)
print(p)
dev.off()
}
return(list(cors=cors, p=p))
}
# Pattern utility Functions ------------------------------------------------
#' Downsample patterns table
#'
#' @param patterns patterns table
#' @param dsn downsampling n
#'
#' @return
#' @export
#'
#' @examples
gpatterns.downsample_patterns <- function(patterns, dsn){
patterns %>%
group_by(fid) %>%
filter(n() >= dsn) %>%
sample_n(dsn) %>%
ungroup
}
#' transforms patterns to summary statistics: n,n0,n1,nx,nc,meth and epipoly
#'
#' @param patterns_tab patterns table (needs to have 'fid' and 'pattern' fields)
#' @param noise_threshold thershold of a pattern to be considered 'noise'
#'
#' @return
#' @export
#'
#' @examples
gpatterns.frag_stats <- function(patterns_tab, noise_threshold=0.2){
patterns_tab %>%
mutate(cpgs = nchar(pattern), ones = cpgs-nchar(gsub('1', '', pattern))) %>%
group_by(fid) %>%
summarize(ncpg = min(cpgs),
n = n(),
n0 = sum(ones == 0),
n1 = sum(ones == cpgs),
nx = .gpatterns.count_noise(pattern, cpgs, ones, noise_threshold),
nc = n - n0 - n1 - nx,
pat_meth = sum(ones) / sum(cpgs),
epipoly = gpatterns.calc_epipoly(pattern)) %>%
ungroup()
}
#' Extract patterns of a track
#'
#' @param track name of track
#' @param fids extract for specific fragments
#' @param tidy if TRUE: returns tidy format, else - a list
#' @param dsn extract downsampled patterns
#'
#' @return
#' @export
#'
#' @examples
gpatterns.extract_patterns <- function(track, fids = NULL, tidy = TRUE, dsn = NULL)
{
if (!is.null(dsn)){
track <- .gpatterns.downsampled_track_name(track, dsn)
}
if (!.gpatterns.patterns_exist(track)){
stop(sprintf('no patterns available for %s', track))
}
patterns_tab <- .gpatterns.load_patterns_tab(track)
if(!is.null(fids)){
patterns_tab <- patterns_tab %>% filter(fid %in% fids)
}
if (nrow(patterns_tab) == 0){
warning('no patterns in fid')
}
if (tidy){
return(patterns_tab %>% as_tibble)
}
patterns_tab <- patterns_tab %>%
group_by(fid) %>%
do(patterns=.$pattern) %>%
ungroup()
patterns_list <- patterns_tab$patterns
names(patterns_list) <- paste0('fid', patterns_tab$fid)
return(patterns_list)
}
#' Extract pattern data from tracks without specifying coordinates
#'
#' @param ... tracks (comma separated)
#' @param samples name for each track
#' @param elements elements to extract
#' @param add_bipolar_stats add bipolar stats
#' @param extract downsampled data
#' @param na.rm remove rows with missing data
#'
#' @return
#' @export
#'
#' @examples
gpatterns.extract_all <- function(...,
samples=NULL,
elements = c('fid', 'ncpg', 'n', 'n0', 'n1', 'nx', 'pat_meth', 'epipoly'),
add_bipolar_stats = FALSE,
dsn = NULL,
na.rm = FALSE){
if (!('fid' %in% elements)){
elements <- c('fid', elements)
}
if (add_bipolar_stats){
elements = c(elements, .gpatterns.bipolar_model_stats)
}
tracks <- sapply(list(...), as.character)
samples <- samples %||% tracks
if (!is.null(dsn)){
tracks <- .gpatterns.downsampled_track_name(tracks, dsn)
}
tab <- map2_df(
tracks,
samples,
function(track, name)
.gpatterns.load_table(
saved_name = .gpatterns.fids_tab_name(track),
file = .gpatterns.fids_file_name(track))
%>% mutate(track = name))
if (any(elements %in% .gpatterns.bipolar_model_stats)){
mix_tab <- map2_df(
tracks,
samples,
function(track, name)
.gpatterns.load_table(
saved_name = .gpatterns.bipolar_model_tab_name(track),
file = .gpatterns.bipolar_model_file_name(track))
%>% mutate(track=name))
tab <- tab %>%
left_join(mix_tab, by='fid')
}
tab <- tab %>% select_(.dots=c('chrom', 'start', 'end', 'track', elements))
if (na.rm){
tab <- tab %>% na.omit()
}
return(tab)
}
#' Extract pattern data
#'
#' @param ... tracks to extract
#' @param intervals genomic scope to extract from intervals
#' @param colnames names for the tracks
#' @param elements elements to extract. default: fid,ncpg,n,n0,n1,nx,nc,pat_meth
#' @param add_bipolar_stats add stats from bipolar model (if computed)
#' @param add_patterns add patterns
#' @param add_cpg_content add centered cpg content
#' @param cpg_nhood scope for centered cpg content (in bp)
#' @param add_coordinates add chrom,start,end for fragments
#' @param tidy return tidy output
#' @param dsn extract downsampled data
#' @param na.rm remove rows with missing data
#'
#' @return
#' @export
#'
#' @examples
gpatterns.extract <- function(...,
intervals,
colnames=NULL,
elements = c('fid', 'ncpg', 'n', 'n0', 'n1', 'nx', 'nc', 'pat_meth'),
add_bipolar_stats = FALSE,
add_patterns = FALSE,
add_cpg_content = FALSE,
cpg_nhood=500,
add_coordinates = TRUE,
tidy = TRUE,
dsn = NULL,
na.rm = TRUE)
{
if (add_coordinates && any(!(c('chrom', 'start', 'end') %in% elements))){
elements <- c('chrom', 'start', 'end', elements)
}
if (!('fid' %in% elements)){
elements <- c('fid', elements)
}
if (add_bipolar_stats){
elements = c(elements, .gpatterns.bipolar_model_stats)
}
if (add_patterns){
elements = c(elements, 'pattern')
}
if (add_cpg_content){
elements = c(elements, 'cpg_content')
}
tracks <- sapply(list(...), as.character)
colnames <- colnames %||% tracks
if (!is.null(dsn)){
tracks <- .gpatterns.downsampled_track_name(tracks, dsn)
}
add_cpg_content <- function(fids_tab, cpg_nhood){
cg_cont <- fids_tab %>%
distinct(chrom, start, end) %>%
mutate(cpg_content = .gpatterns.centered_cpg_content(., cpg_nhood = cpg_nhood))
return(fids_tab %>% left_join(cg_cont, by=c('chrom', 'start', 'end')))
}
tab <- map2_df(tracks, colnames, function(track, track_name){
fids_scope <- gextract(.gpatterns.fid_track_name(track),
intervals = intervals, colnames = 'fid') %>%
select(fid, intervalID)
fids_tab <- .gpatterns.load_fids_tab(track) %>%
inner_join(fids_scope, by = 'fid')
if ('cpg_content' %in% elements){
fids_tab <- fids_tab %>% add_cpg_content(cpg_nhood)
}
if ('pattern' %in% elements) {
patterns_tab <- gpatterns.extract_patterns(track, fids=fids_tab$fid, tidy = TRUE)
fids_tab <- fids_tab %>% left_join(patterns_tab, by = 'fid')
}
if (any(elements %in% .gpatterns.bipolar_model_stats)){
bipolar_tab <- .gpatterns.load_bipolar_tab(track)
fids_tab <- fids_tab %>% left_join(bipolar_tab, by = 'fid')
}
fids_tab <- fids_tab %>%
mutate(samp = track_name) %>%
select_(.dots=c('samp', elements, 'intervalID'))
})
if (na.rm){
tab <- tab %>% na.omit()
}
if (tidy){
return(tab)
}
spread_elements <- elements[which(!(elements %in% c('chrom', 'start', 'end', 'fid')))]
untidy_tab <- tab %>%
select(one_of(c('chrom', 'start', 'end', 'fid', 'samp', spread_elements[1]))) %>%
spread_('samp', spread_elements[1])
add_element_suffix <- function(tb, element){
column_nums <- which(colnames(tb) %in% colnames)
colnames(tb)[column_nums] <- paste0(colnames(tb)[column_nums], '.', element)
return(tb)
}
untidy_tab <- untidy_tab %>% add_element_suffix(untidy_tab, spread_elements[1])
for (element in spread_elements[-1]){
untidy_tab1 <- tab %>%
select(one_of(c('chrom', 'start', 'end', 'fid', 'samp', element))) %>%
spread_('samp', element)
untidy_tab1 <- untidy_tab %>% add_element_suffix(untidy_tab1, element)
untidy_tab <- suppressMessages(untidy_tab %>% full_join(untidy_tab1))
}
return(untidy_tab)
}
.gpatterns.save_table <- function(x, saved_name, file)
{
assign(saved_name, data.table::as.data.table(x))
save(list=saved_name, file=file)
}
.gpatterns.load_table <- function(saved_name, file)
{
load(file)
return(get(saved_name) %>% as_tibble())
}
.gpatterns.load_fids_tab <- function(track){
.gpatterns.load_table(saved_name=.gpatterns.fids_tab_name(track),
file=.gpatterns.fids_file_name(track))
}
.gpatterns.load_patterns_tab <- function(track){
.gpatterns.load_table(saved_name=.gpatterns.patterns_tab_name(track),
file=.gpatterns.patterns_file_name(track))
}
.gpatterns.load_bipolar_tab <- function(track){
.gpatterns.load_table(saved_name=.gpatterns.bipolar_model_tab_name(track),
file=.gpatterns.bipolar_model_file_name(track))
}
.gpatterns.patterns_exist <- function(tracks){
map_lgl(tracks, function(track) file.exists(.gpatterns.patterns_file_name(track)))
}
.gpatterns.count_noise <- function(patterns, cpgs, ones, noise_threshold)
{
noise_threshold <- noise_threshold * length(patterns)
patterns <- patterns[(ones != 0) & (ones != cpgs)]
tab <- tabulate(as.factor(patterns))
return(sum(tab[tab < noise_threshold]))
}
.gpatterns.remove.tracks <- function(tracks){
for (tr in tracks){
if(gtrack.exists(tr)){
message(sprintf("removing %s", tr))
gtrack.rm(tr, force=TRUE)
}
}
}
.gpatterns.centered_cpg_content <- function(intervals, cpg_nhood)
{
intervals <- intervals %>%
select(chrom, start, end) %>%
mutate(start = floor((start+end-1)/2), end = start+1) %>%
mutate(start= start - cpg_nhood, end = end + cpg_nhood) %>%
as.data.frame %>%
gintervals.force_range()
intervals <- gvextract(.gpatterns.genome_cpgs_track,
intervals = intervals,
iterator = intervals,
colnames='cpg_content',
func = 'sum')
return(intervals$cpg_content / (2*cpg_nhood + 1))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.