inst/scripts/htsmAF.R

message('Loading R packages ... ')
suppressMessages(library("optparse"))
suppressMessages(library("GenomicRanges"))
suppressMessages(library("rtracklayer"))
suppressMessages(library("readr"))
suppressMessages(library("tidyr"))
suppressMessages(library("HTSM"))

option_list <- list(
  make_option(opt_str = c('-i','--input'),default = NULL,
              help = 'Input directory to search for the relevant files.'),
  make_option(opt_str = c('-o','--output'),default = NULL,
              help = 'Output directory to save the results. If not specified, the
              results will be saved to the input direcotry'),
  make_option(opt_str = c('-r','--minread'),default = 2,
              help = 'A transcript is filtered if its read counts < r. Default is 2.'),
  make_option(opt_str = c('-f','--fraction'),default = 0.005,
              help = 'A transcript is filtered if its fraction of read counts < f to the
              total gene level read counts. Default is 0.005.'),
  make_option(opt_str = c('-n','--nread'),default = 2,
              help = 'A transcript is kept if it has >= n reads even if it has 
              low fraction. Default is 2.')
)

opt_parser <- OptionParser(option_list=option_list)
# print_help(opt_parser)
opt <- parse_args(opt_parser)

if(is.null(opt$input)){
  input_dir <- getwd()
} else {
  input_dir <- opt$input
}

if(is.null(opt$output)){
  output_dir <- getwd()
} else {
  output_dir <- opt$output
}

min_read <- opt$minread
min_ps <- opt$fraction
n_read <- opt$nread

#################################################################
## get read counts for the transcripts
if(!file.exists(file.path(output_dir,'read_counts.RData'))){
  read_counts <- countReads(input_dir = input_dir)
  save(read_counts,file=file.path(output_dir,'read_counts.RData'))
} else {
  load(file.path(output_dir,'read_counts.RData'))
}

cat('Abundance filter parameter\n',
    'Input: ',input_dir,'\n',
    'Output: ',output_dir,'\n',
    'Minimum reads: ',min_read,'\n',
    'Minimum fraction: ',min_ps,'\n',
    'Transcripts are all kept if reads >=',n_read,'\n',
    file = file.path(output_dir,'Abundance filter parameter.txt'))

idx <- which((read_counts$counts>=min_read & read_counts$ps >= min_ps) | 
               read_counts$counts >= n_read)
trans <- read_counts$transcript_id[idx]

#################################################################
### Get bed file after tama merge redundant
file2read <- list.files(path = input_dir,
                        pattern = 'htsm_rtd.bed',
                        recursive = T,full.names = T)
if(length(file2read)==0)
  stop('The file "htsm_rtd.bed" does not exist in the data_dir')

rtd_bed <- import(file2read)
rtd_bed$trans <- gsub('.*;','',rtd_bed$name)
rtd_bed_hexp <- rtd_bed[rtd_bed$trans %in% trans]
rtd_bed_hexp$trans <- NULL
rtd_bed_hexp <- sort(rtd_bed_hexp,by=~seqnames + start + end)

rtd_bed_lexp <- rtd_bed[!(rtd_bed$trans %in% trans)]
rtd_bed_lexp$trans <- NULL
rtd_bed_lexp <- sort(rtd_bed_lexp,by=~seqnames + start + end)

######################
rtd_gtf_hexp <- unlist(blocks(rtd_bed_hexp))
rtd_gtf_hexp$gene_id <- gsub(';.*','',names(rtd_gtf_hexp))
rtd_gtf_hexp$transcript_id <- gsub('.*;','',names(rtd_gtf_hexp))
rtd_gtf_hexp$type <- 'exon'
names(rtd_gtf_hexp) <- NULL
rtd_gtf_hexp <- sort(rtd_gtf_hexp,by=~seqnames + start + end)

rtd_gtf_lexp <- unlist(blocks(rtd_bed_lexp))
rtd_gtf_lexp$gene_id <- gsub(';.*','',names(rtd_gtf_lexp))
rtd_gtf_lexp$transcript_id <- gsub('.*;','',names(rtd_gtf_lexp))
rtd_gtf_lexp$type <- 'exon'
names(rtd_gtf_lexp) <- NULL
rtd_gtf_lexp <- sort(rtd_gtf_lexp,by=~seqnames + start + end)

export(rtd_bed_hexp,file.path(output_dir,'htsm_rtd_hexp.bed'))
export(rtd_gtf_hexp,file.path(output_dir,'htsm_rtd_hexp.gtf'))
export(rtd_bed_lexp,file.path(output_dir,'htsm_rtd_lexp.bed'))
export(rtd_gtf_lexp,file.path(output_dir,'htsm_rtd_lexp.gtf'))

stat_lexp_filtered <- data.frame(Category=c('Low expressed'),
                                 Genes=length(unique(rtd_gtf_lexp$gene_id)),
                                 Transcripts=length(unique(rtd_gtf_lexp$transcript_id)))
write.csv(stat_lexp_filtered,
          file=file.path(output_dir,'stat_lexp_filtered.csv'),
          row.names = F)
##################################################
###---> summary RTD
s <- rtdSummary(rtd_gtf_hexp)
write.csv(s,file = file.path(output_dir,'htsm_rtd_hexp_summary.csv'))
wyguo/RTDBox documentation built on Jan. 31, 2023, 1:19 a.m.