R/HCsite.R

Defines functions HCsite

Documented in HCsite

#' Extract high confident transcript start or end site
#' @param site_stats A DataFrame of TSS or TES statistics generated by \code{SummaryTranSite}.
#' @param cut_off A numeric value of BH adjsuted p-value (fdr), p-value or probablity cut-off.
#' @param cut_at The \code{cut_off} is applied to "fdr" (adjustd p-value), "pval" (p-value) or "prob" (probability).
#' @param site_region An integer of enriched site region. TSSs/TESs locate within enriched
#' site+/-site_region are all treated as high confident sites.
#' @param min_read The minimum reads in a bin to support high confident sites.
#' @param type "TSS" for transcript start site and "TES" for transcript end site.
#' @return A DataFrame of enriched TSS or TES sites.
#'
HCsite <- function(site_stats,
                   cut_off=0.05,
                   cut_at=c('fdr','pval','prob'),
                   site_region=50,
                   min_read=2,
                   type=c('TSS','TES')){
  type <- match.arg(arg = type,choices = c('TSS','TES'))
  cut_at <- match.arg(arg = cut_at,choices = c('fdr','pval','prob'))

  if(!(type %in% colnames(site_stats)))
    stop('Please provide valid type: "TSS" or "TES"')

  #############################################################
  ###---> high abundance
  site_stats_enriched <- site_stats[site_stats[,cut_at] < cut_off &
                                      site_stats$read > site_stats$gene_read_mean,]
  site_stats_enriched$label2 <- paste0(site_stats_enriched$seqnames,';',site_stats_enriched$strand)
  site_stats_enriched_split <- GenomicRanges::split(site_stats_enriched,site_stats_enriched$label2)

  label_enriched <- lapply(names(site_stats_enriched_split), function(i){
    # print(i)
    site <- site_stats_enriched_split[[i]][,type]
    n <- rep((2*site_region+1),length(site))
    insite_region <- sequence(n,from = pmax(rep(0,length(site)),site-site_region))
    # insite_region <- insite_region[insite_region %in% site_stats[,type]]
    label <- paste0(i,';',insite_region)
    label <- label[which(label %in% site_stats$label)]
    label
  })
  label_enriched <- unique(unlist(label_enriched))
  site_high <- site_stats[site_stats$label %in% label_enriched,]

  #############################################################
  ###---> low abundance

  genes <- setdiff(site_stats$gene_id,site_high$gene_id)
  site_low <- site_stats[site_stats$gene_id %in% genes,]
  site_low <- site_low[site_low$read_in_bin >= min_read,]
  
  site_high$enriched <-'Yes'
  site_low$enriched <- 'No'
  
  result <- rbind(site_high,site_low)
  result <- sort(result,by=as.formula(paste0('~seqnames+',type)))
  result
}
wyguo/RTDBox documentation built on Jan. 31, 2023, 1:19 a.m.