#' Curation of gene intervals
#' @export
curateGenes <- function(genes,
extend = 0)
{
message ('Curating gene intervals')
# delete bio from column names for consistency between 10x and gencode gtf files
colnames(mcols(genes)) <- gsub('bio', '', colnames(mcols(genes)))
# modify duplicated gene names
genes_dup <-
genes %>%
as_tibble %>%
select(gene_id, gene_name) %>%
distinct %>%
count(gene_name) %>%
filter(n > 1) %>%
pull(gene_name)
genes <-
genes %>%
mutate(gene_name = ifelse(gene_name %in% genes_dup, paste(gene_name, gene_id, sep = '_'), gene_name))
exons <- genes[genes$type == 'exon']
# extend intervals at 5'
exons <- stretch(anchor_3p(exons), extend)
# reduce and disjoin exon intervals per gene
exons_uniq <-
exons %>%
group_by(gene_name) %>%
reduce_ranges_directed %>%
disjoin_ranges_directed %>%
mutate(id_unique = 1:length(.))
# find annotation for new intervals
anno <-
join_overlap_inner_directed(exons_uniq, exons) %>%
group_by(id_unique) %>%
summarise(name = paste(unique(gene_name), collapse = '|'),
gene_id = paste(unique(gene_id), collapse = '|'),
class = paste(unique(gene_type), collapse = '|'))
# combine intervals and anno
exons_uniq_anno <-
merge(as.data.table(exons_uniq),
as.data.table(anno),
all=FALSE,
by = 'id_unique')[, !'id_unique'
][, id_unique := paste(name, 1:.N, sep = '|'), by = 'name']
# add position start/end column (similar to TEs) to map reads on model (corresponds to distance of first exon pos to most upstream TSS)
exons_uniq_anno <-
exons_uniq_anno[which(strand == '+'), ':=' (position_start = as.integer(cumsum(width) - width + 1), position_end = as.integer(cumsum(width))), by = 'name'
][which(strand == '-'), ':=' (position_start = as.integer(rev(cumsum(rev(width))))), by = 'name'][which(strand == '-'), position_end := as.integer(position_start - width + 1)]
# add external annotation(TSS, UTR, etc.)
exons_uniq_anno[, ':=' (tss = NA, poly = NA, tes = NA)]
# tss_nested <- as.data.table(join_overlap_left_directed(getTSS(genes), as_granges(exons_uniq_anno)) %>% select(id_unique))[, list(tss = list(.SD)), by = 'id_unique']
# exons_uniq_anno <- merge(exons_uniq_anno, tss_nested, all.x = TRUE, by = 'id_unique')
res <- as_granges(exons_uniq_anno)
return(res)
}
#' Curation of TE intervals
#' @export
curateTEs <- function(tes,
genes = NULL, # should be curated
extend_3p = 0,
filt_exonic = FALSE,
max_e_value = 0.05,
min_size = 25)
{
message ('Curating TE intervals')
if (class(tes) != 'GRanges') { tes <- as_granges(tes) }
if (!is.null(tes$e_value))
{
# filter based on e_value threshold
tes <-
tes %>%
filter(e_value < max_e_value) %>%
select(-bits, -e_value, -bias, -sq_len, -kimura_div, -family_acc)
}
message ('Resolving overlaps')
# throw overlapping/nested repeats
tes_unnest <-
tes %>%
disjoin_ranges_directed %>%
mutate(n_overl = countOverlaps(., tes)) %>%
filter(n_overl == 1) %>%
join_overlap_left_directed(., tes) %>%
select(-n_overl)
# filter short intervals
tes_unnest_filt <-
tes_unnest %>%
filter(width >= min_size)
message ('Labeling modified intervals')
# add column to indicate if interval was modified or not
tes_unnest_filt <-
tes_unnest_filt %>%
mutate(orig = 1:length(.) %in% findOverlaps(tes_unnest_filt, tes, type = 'equal')@from,
position_start = ifelse(orig, position_start, NA),
position_end = ifelse(orig, position_end, NA))
# add new unique id (alphabetical prefix for locus chunks)
tes_unnest_filt_df <- as.data.table(tes_unnest_filt)
id_conv_df <-
tes_unnest_filt_df[, fragmented := duplicated(id_unique)
][which(id_unique %in% id_unique[fragmented]),
][, .(id_unique2 = paste0(letters[1:.N], id_unique)), by = 'id_unique']
tes_unnest_filt[tes_unnest_filt$id_unique %in% id_conv_df$id_unique]$id_unique <- id_conv_df$id_unique2
if (length(unique(tes_unnest_filt$id_unique)) != length(tes_unnest_filt)) { stop ('new id_unique is not unique') }
if (!identical(tes_unnest_filt_df$start, start(tes_unnest_filt))) { stop ('data.table conversion changed ordering of rows') }
# mark exonic TEs and add overlapping genes as concatenated string
if (!is.null(genes))
{
message ('Labeling gene overlaps')
hits <- findOverlaps(tes_unnest_filt, genes)
tes_unnest_filt <- tes_unnest_filt %>% mutate(exonic = 1:length(.) %in% from(hits))
gene_hits <-
hits %>%
as_tibble %>%
mutate(subjectHits = genes[subjectHits]$name) %>%
group_by(queryHits) %>%
summarize(name = paste(unique(subjectHits), collapse = '_'))
tes_unnest_filt$gene <- NA
tes_unnest_filt[gene_hits$queryHits]$gene <- gene_hits$name
# update id unique
tes_unnest_filt <- tes_unnest_filt %>% mutate(id_unique = ifelse(exonic, paste(id_unique, gene, sep = '|'), id_unique))
} else {
tes_unnest_filt$genes <- NA
tes_unnest_filt$exonic <- NA
}
if (filt_exonic)
{
tes_unnest_filt <- tes_unnest_filt %>% filter(!exonic)
}
# throw unwanted columns
#res <- tes_unnest_filt %>% select(which(!colnames(mcols(tes_unnest_filt)) %in% c('hmm_st', 'hmm_en', 'orig', 'position_start', 'position_end', 'left_rep')))
res <- tes_unnest_filt
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.