R/rtdMerge.R

Defines functions rtdMerge

Documented in rtdMerge

#' Integrate the transcripts in the inference RTD to the reference RTD.
#' @param inf_file The gtf file of a inference RTD,e.g. a RTD assembled from short reads
#' @param ref_file The gtf file of a high confident reference RTD, e.g. a RTD assembled from long reads.
#' The transcripts of the
#' reference RTD will be all kept in the final results.
#' @param prefix_ref A prefix attached to the gene and transcript ids to distinguish their origin of "ref" RTD.
#' Default: "ref"
#' @param prefix_inf A prefix attached to the gene and transcript ids to distinguish their origin of "inf" RTD.
#' Default: "inf"
#' @param chimeric_tolerance Chimeric gene overlap percent, default: 0.05.
#' If the overlap of two gene models < x%, they are treated as separated gene models.
#' @param data_dir The directory to save the results
#' @return A merged RTD saved in the \code{data_di}
#' @example
#' data_dir <- 'data'
#' inf_file <- 'inf_file.gtf'
#' ref_file <- 'ref_file.gtf'
#' prefix_ref <- 'isoseq'
#' prefix_inf <- 'rnaseq'
#' chimeric_tolerance <- 0.05
#' rtdMerge(inf_file = inf_file,ref_file = ref_file,
#'          prefix_ref = 'isoseq',prefix_inf = 'rnaseq',
#'          chimeric_tolerance = 10,data_dir = data_dir)

rtdMerge <- function(inf_file,
                     ref_file,
                     prefix_ref='ref',
                     prefix_inf='inf',
                     chimeric_tolerance = 0.05,
                     data_dir=NULL){
  start.time <- Sys.time()
  if(is.null(data_dir))
    data_dir <- getwd()

  if(!file.exists(data_dir))
    dir.create(data_dir,recursive = T)

  message('Step 1: Load gtf files')
  ## load gtf files
  ref <- import(ref_file)
  inf <- import(inf_file)

  if(grepl(pattern = '[.]bed',inf_file)){
    inf <- unlist(blocks(inf))
    inf$gene_id <- gsub(';.*','',names(inf))
    inf$transcript_id <- gsub('.*;','',names(inf))
    inf$type <- 'exon'
    names(inf) <- NULL
    inf <- sort(inf,by=~seqnames + start + end)
  }

  if(grepl(pattern = '[.]bed',ref_file)){
    ref <- unlist(blocks(ref))
    ref$gene_id <- gsub(';.*','',names(ref))
    ref$transcript_id <- gsub('.*;','',names(ref))
    ref$type <- 'exon'
    names(ref) <- NULL
    ref <- sort(ref,by=~seqnames + start + end)
  }

  ref <- cleanMcols(ref)
  inf <- cleanMcols(inf)

  ref <- ref[ref$type=='exon',]
  inf <- inf[inf$type=='exon',]

  ## filter gene with multiple strands in inf RTD
  strand_idx <- elementNROWS(range(GenomicRanges::split(inf,inf$gene_id)))
  strand_idx <- names(strand_idx)[strand_idx>1]
  inf <- inf[!(inf$gene_id %in% strand_idx),]

  ## add prefix to gene ids
  ref$gene_id <- paste0(prefix_ref,'_',ref$gene_id)
  ref$transcript_id <- paste0(prefix_ref,'_',ref$transcript_id)
  ref$source <- prefix_ref

  inf$gene_id <- paste0(prefix_inf,'_',inf$gene_id)
  inf$transcript_id <- paste0(prefix_inf,'_',inf$transcript_id)
  inf$source <- prefix_inf

  message('Step 2: Identify novel transcript in the inference RTD')
  inf_trans_list <- list()

  message('  -> Process multi-exon transcripts')
  ####################################################
  ###---> Multi-exon transcripts

  ## generate unique labels of SJ chain
  intron_ref <- exon2intron(ref)
  intron_ref <- sort(intron_ref,by = ~ seqnames + start + end)
  intron_ref_sj <- intron_ref$label
  intron_ref_trans <- base::split(intron_ref_sj,intron_ref$transcript_id)
  intron_ref_trans <- sapply(intron_ref_trans, function(x) paste(x,collapse = ';'))

  intron_inf <- exon2intron(inf)
  intron_inf <- sort(intron_inf,by = ~ seqnames + start + end)
  intron_inf_sj <- intron_inf$label
  intron_inf_trans <- base::split(intron_inf_sj,intron_inf$transcript_id)
  intron_inf_trans <- sapply(intron_inf_trans, function(x) paste(x,collapse = ';'))

  ## add source the ref rtd
  trans_add_s <- names(intron_ref_trans)[intron_ref_trans %in% intron_inf_trans]
  ref$source[ref$transcript_id %in% trans_add_s] <- paste0('SJ:',prefix_ref,'&',prefix_inf)

  ## get the transcript names with novel SJ labels
  idx <- which(!(intron_inf_sj %in% intron_ref_sj))
  trans_multiexon_novel <- unique(intron_inf[idx]$transcript_id)

  ## multi-exon transcripts in inf which are not novel
  trans_multiexon_filtered <- setdiff(names(intron_inf_sj),trans_multiexon_novel)
  inf_trans_list <- c(inf_trans_list,trans_multiexon_filtered=list(trans_multiexon_filtered))
  inf_trans_list <- c(inf_trans_list,trans_multiexon_novel=list(trans_multiexon_novel))

  message('  -> Process mono-exon transcripts')
  ## get mono exon transcripts
  ref_mono <- getMonoExonTrans(ref)
  inf_mono <- getMonoExonTrans(inf)

  ## check whether inf mono exon trans has overlap of exons of ref, introic is excluded.
  trans_mono <- unique(inf_mono$transcript_id)
  hits <- suppressWarnings(findOverlaps(query = inf_mono,
                                        subject = ref,ignore.strand=FALSE))

  ## get novel mono exon trans with no overlap
  trans_overlap <- inf_mono[queryHits(hits),]$transcript_id
  trans_monoexon_novel <- setdiff(trans_mono,trans_overlap)
  trans_monoexon_filtered <- setdiff(trans_mono,trans_monoexon_novel)

  inf_trans_list <- c(inf_trans_list,trans_monoexon_novel=list(trans_monoexon_novel))
  inf_trans_list <- c(inf_trans_list,trans_monoexon_filtered=list(trans_monoexon_filtered))

  ## novel transcripts in inf rtd, multi-exon + mono-exon
  trans_novel<- union(trans_multiexon_novel,trans_monoexon_novel)
  inf2keep <- inf[inf$transcript_id %in% trans_novel]

  message('Step 3: Merge inf to ref RTD and rename transcript and gene ids')
  rtd <- suppressWarnings(c(ref,inf2keep))
  rtd <- sort(rtd,by=~seqnames+start+end)
  rtd$gene_id_old <- rtd$gene_id
  rtd$transcript_id_old <- rtd$transcript_id

  ### Gene initial rename, overlap between transcripts and gene collapsed region
  gene_overlap <- getOverlapRange(rtd)
  trans_range <- getTransRange(rtd)
  hits <- findOverlaps(query = trans_range,subject = gene_overlap)

  ### Get the initial gene-transcript mapping table
  mapping <- data.frame(seqnames=seqnames(trans_range[queryHits(hits)]),
                        transcript_id_old=trans_range[queryHits(hits)]$transcript_id,
                        gene_id_group=gene_overlap[subjectHits(hits)]$gene_id)
  mapping <- mapping[!duplicated(mapping),]
  mapping <- mapping[order(mapping$gene_id_group,decreasing = F),]
  mapping$gene_i <- rep(1:length(unique(mapping$gene_id_group)),
                        rle(mapping$gene_id_group)$lengths)
  mapping$gene_id <- paste0(mapping$seqnames,'G',mapping$gene_i)
  mapping$transcript_i <- sequence(rle(mapping$gene_id)$lengths)
  mapping$transcript_id <- paste0(mapping$gene_id,'.',mapping$transcript_i)
  rownames(mapping) <- mapping$transcript_id_old

  ## rename
  rtd$transcript_id <- mapping[rtd$transcript_id_old,'transcript_id']
  rtd$gene_id <- mapping[rtd$transcript_id_old,'gene_id']

  ##### check intronic
  # overlap between trans_range and intron disjoin
  message('  -> Process intronic genes')
  trans_range <- getTransRange(rtd)
  introns <- exon2intron(exons = rtd)
  introns_s <- GenomicRanges::split(introns,introns$gene_id)
  introns_d <- unlist(disjoin(introns_s))
  introns_d$gene_id <- names(introns_d)

  hits <- findOverlaps(trans_range,introns_d,type = 'within')
  if(NROW(hits)>0){
    intronic <- trans_range[queryHits(hits)]
    gene_overlap_intronic <- getOverlapRange(intronic)
    trans_range_intronic <- getTransRange(intronic)
    hits <- findOverlaps(query = trans_range_intronic,subject = gene_overlap_intronic)
    mapping_intronic  <- data.frame(
      seqnames=seqnames(trans_range_intronic[queryHits(hits)]),
      transcript_id_old=trans_range_intronic[queryHits(hits)]$transcript_id,
      gene_id_group=gene_overlap_intronic[subjectHits(hits)]$gene_id)

    mapping_intronic <- mapping_intronic[order(mapping_intronic$gene_id_group,
                                               decreasing = F),]
    mapping_intronic$gene_i <- rep(1:length(unique(mapping_intronic$gene_id_group)),
                                   rle(mapping_intronic$gene_id_group)$lengths)
    n <- max(mapping_intronic$gene_i)
    mapping_intronic$gene_i <- mapping_intronic$gene_i/10^ceiling(log10(n)+1)
    rownames(mapping_intronic) <- mapping_intronic$transcript_id_old

    rownames(mapping) <- mapping$transcript_id
    idx <- mapping_intronic$transcript_id_old
    mapping[idx,'gene_i'] <- mapping[idx,'gene_i']+mapping_intronic[idx,'gene_i']
    mapping$gene_i <- as.integer(factor(mapping$gene_i))
    mapping$gene_id <- paste0(mapping$seqnames,'G',mapping$gene_i)
    mapping <- mapping[order(mapping$gene_id,decreasing = F),]
    mapping$transcript_i <- sequence(rle(mapping$gene_id)$lengths)
    mapping$transcript_id <- paste0(mapping$gene_id,'.',mapping$transcript_i)
    rownames(mapping) <- mapping$transcript_id_old
    rtd$transcript_id <- mapping[rtd$transcript_id_old,'transcript_id']
    rtd$gene_id <- mapping[rtd$transcript_id_old,'gene_id']

    trans_range <- getTransRange(rtd)
  }
  # trans_range$gene_id <- gsub('[.].*','',trans_range$transcript_id)
  trans_range$gene_id <- sub('\\..[^\\.]*$', '', trans_range$transcript_id)

  message('  -> Process chimeric check')
  ###---> find overlap between transcript ranges and transcript ranges
  hits <- findOverlaps(query = trans_range,subject = trans_range)
  idx <- which(queryHits(hits)!=subjectHits(hits))
  hits <- hits[idx,]

  ## get the overlaps < chimeric_tolerance
  gr1 <- trans_range[queryHits(hits)]
  gr2 <- trans_range[subjectHits(hits)]
  overlaps <- pintersect(gr1,gr2)
  p1 <- width(overlaps)/width(gr1)
  p2 <- width(overlaps)/width(gr2)
  idx <- which(p1 < chimeric_tolerance & p2 < chimeric_tolerance)
  overlaps_pass <- overlaps[idx]
  overlaps_pass$from <- gr1$transcript_id[idx]
  overlaps_pass$to <- gr2$transcript_id[idx]
  overlaps_pass$transcript_id <- NULL
  overlaps_pass$hit <- NULL
  overlaps_pass <- unique(overlaps_pass)

  ## do second check, if the overlaps inside in other transcritps
  trans <- union(gr1$transcript_id[idx],gr2$transcript_id[idx])
  genes <- unique(sub('\\..[^\\.]*$', '', trans))
  trans_range_pass <- trans_range[trans_range$gene_id %in% genes]
  trans_range_pass_remain <-
    trans_range_pass[!(trans_range_pass$transcript_id %in% trans)]
  hits_pass <- findOverlaps(query = overlaps_pass,trans_range_pass_remain,
                            type = 'within')

  idx <- unique(queryHits(hits_pass))
  overlaps_pass <- overlaps_pass[-idx,]
  
  overlaps_pass_split <- GenomicRanges::split(overlaps_pass,overlaps_pass$gene_id)
  ## if multiple overlap, get range and check range length agaist transcript length
  if(any(elementNROWS(overlaps_pass_split)>1)){
    overlaps_pass_split <- unlist(range(overlaps_pass_split))
    overlaps_pass_split$gene_id <- names(overlaps_pass_split)
    # names(overlaps_pass_split) <- NULL
    overlaps_pass_split <- overlaps_pass_split[overlaps_pass$gene_id]
    overlaps_pass_split$from <- overlaps_pass$from
    overlaps_pass_split$to <- overlaps_pass$to
    
    #check again if the region is more than > tolorance
    trans_l <- width(trans_range)
    names(trans_l) <- trans_range$transcript_id
    overlaps_pass_split$from_overlap <- width(overlaps_pass_split)/trans_l[overlaps_pass_split$from]
    overlaps_pass_split$to_overlap <- width(overlaps_pass_split)/trans_l[overlaps_pass_split$to]
    
    idx_filter <- which(overlaps_pass_split$to_overlap >= chimeric_tolerance |
                          overlaps_pass_split$from_overlap >= chimeric_tolerance)
    if(length(idx_filter)>0){
      genes2filer <- unique(overlaps_pass_split$gene_id[idx_filter])
      overlaps_pass <- overlaps_pass[!(overlaps_pass$gene_id %in% genes2filer)]
    } else {
      overlaps_pass <- overlaps_pass
    }
  }
  
  trans_range_pass <- trans_range_pass[trans_range_pass$gene_id %in% overlaps_pass$gene_id]
  
  ## repeat the overlap to match the overlap to all the transcripts in the gene
  overlaps_pass <- GenomicRanges::split(overlaps_pass,overlaps_pass$gene_id)
  overlaps_pass <- reduce(overlaps_pass)
  overlaps_pass <- unlist(overlaps_pass)
  overlaps_pass <- overlaps_pass[trans_range_pass$gene_id]
  overlaps_pass$gene_id <- trans_range_pass$gene_id
  overlaps_pass$transcript_id <- trans_range_pass$transcript_id

  ## take away overlap from all the transcripts
  trans_range_pass_cut <- psetdiff(trans_range_pass,overlaps_pass)
  mcols(trans_range_pass_cut) <- mcols(trans_range_pass)

  gene_overlap_chemric <- getOverlapRange(trans_range_pass_cut)
  hits <- findOverlaps(trans_range_pass_cut,gene_overlap_chemric)

  if(NROW(hits)>0){
    mapping_chemric  <- data.frame(
      seqnames=seqnames(trans_range_pass_cut[queryHits(hits)]),
      transcript_id_old=trans_range_pass_cut[queryHits(hits)]$transcript_id,
      gene_id_group=gene_overlap_chemric[subjectHits(hits)]$gene_id)

    mapping_chemric <- mapping_chemric[order(mapping_chemric$gene_id_group,
                                             decreasing = F),]
    mapping_chemric$gene_i <- rep(1:length(unique(mapping_chemric$gene_id_group)),
                                  rle(mapping_chemric$gene_id_group)$lengths)
    n <- max(mapping_chemric$gene_i)
    mapping_chemric$gene_i <- mapping_chemric$gene_i/10^ceiling(log10(n)+1)
    rownames(mapping_chemric) <- mapping_chemric$transcript_id_old

    rownames(mapping) <- mapping$transcript_id
    idx <- mapping_chemric$transcript_id_old
    mapping[idx,'gene_i'] <- mapping[idx,'gene_i']+mapping_chemric[idx,'gene_i']
    mapping$gene_i <- as.integer(factor(mapping$gene_i))
    mapping$gene_id <- paste0(mapping$seqnames,'G',mapping$gene_i)
    mapping$transcript_i <- sequence(rle(mapping$gene_id)$lengths)
    mapping$transcript_id <- paste0(mapping$gene_id,'.',mapping$transcript_i)
    rownames(mapping) <- mapping$transcript_id_old
    rtd$transcript_id <- mapping[rtd$transcript_id_old,'transcript_id']
    rtd$gene_id <- mapping[rtd$transcript_id_old,'gene_id']
  }

  rtd$gene_id_old <- NULL
  rtd$transcript_id_old <- NULL

  message('Step 4: Summary the merged RTD')

  ###---> transcript per gene
  n <- as.character(c(1:20,'>20'))
  diversity <- lapply(list(ref=ref,inf=inf,merged=rtd), function(gr){
    trans2gene <- data.frame(TXNAME=gr$transcript_id,GENEID=gr$gene_id)
    trans2gene <- trans2gene[!duplicated(trans2gene),]
    idx <- table(table(trans2gene$GENEID))
    if(max(as.numeric(names(idx))) > 20){
      idx1 <- idx[as.numeric(names(idx))<=20]
      idx2 <- sum(idx[as.numeric(names(idx))>20])
      names(idx2) <- '>20'
      idx <- c(idx1,idx2)
    }
    num <- rep(0,length(n))
    names(num) <- n
    num[names(idx)] <- idx
    num
  })

  transpergene <- do.call(cbind,diversity)
  transpergene <- data.frame(TransPerGene=rownames(transpergene),transpergene,row.names = NULL)
  colnames(transpergene) <- c('TransPerGene',prefix_ref,prefix_inf,'merged')
  write.csv(transpergene,
            file=file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,
                                           '_transcript_per_gene_number.csv')),
            row.names = F)

  data2plot <- transpergene[,-1]
  rownames(data2plot) <- transpergene[,1]
  data2plot <- t(data2plot)

  png(file.path(data_dir,
                paste0('htsm_rtd_',prefix_ref,'_',prefix_inf,'_transcript_per_gene_number.png')),
      width = 10,height = 5,res = 300,units = 'in')
  xx <- barplot(data2plot,beside = TRUE,
                col =  c("#999999", "#E69F00", "#56B4E9"),
                ylim=c(0,1.2*max(data2plot)),
                xlab = 'Transcripts in a gene',
                ylab = 'Genen number',
                main='Transcript number per gene',
                space=c(0,0.5))
  ## Add text at top of bars
  text(x = xx, y = data2plot, label = data2plot, cex = 0.7, col = "red",
       srt=-90,adj=c(1.1,0.5))
  legend("topright",
         legend = rownames(data2plot),
         fill = c("#999999", "#E69F00", "#56B4E9"))
  dev.off()


  pdf(file.path(data_dir,
                paste0('htsm_rtd_',prefix_ref,'_',prefix_inf,'_transcript_per_gene_number.pdf')),
      width = 10,height = 5)
  xx <- barplot(data2plot,beside = TRUE,
                col =  c("#999999", "#E69F00", "#56B4E9"),
                ylim=c(0,1.2*max(data2plot)),
                xlab = 'Transcripts in a gene',
                ylab = 'Genen number',
                main='Transcript number per gene',
                space=c(0,0.5))

  ## Add text at top of bars
  text(x = xx, y = data2plot, label = data2plot, cex = 0.7, col = "red",
       srt=-90,adj=c(1.1,0.5))
  legend("topright",
         legend = rownames(data2plot),
         fill = c("#999999", "#E69F00", "#56B4E9"))
  dev.off()

  ###---> basic statistics
  stat_merged <- rtdSummary(gr = rtd)
  stat_ref <- rtdSummary(gr = ref)
  stat_inf <- rtdSummary(gr = inf)
  stat <- cbind(stat_ref,stat_inf,stat_merged)
  colnames(stat) <- c(prefix_ref,prefix_inf,'merged')

  write.csv(stat,file=file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged_summary.csv')))
  message('Step 5: Save the results')

  export_gtf(gr = rtd,
             file2save = file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged.gtf')))
  save(rtd,file = file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged.RData')))
  save(inf_trans_list,file=file.path(data_dir,paste0(prefix_inf,'_trans_summary.RData')))

  #######################
  ###---> save to bed file
  exon <- rtd
  mapping <- data.frame(gene_id = exon$gene_id,transcript_id = exon$transcript_id)
  mapping <- mapping[!duplicated(mapping),]
  rownames(mapping) <- mapping$transcript_id

  exon <- GenomicRanges::split(exon,exon$transcript_id)
  bed <- rtracklayer::asBED(exon)

  blocks <- bed$blocks
  bed$blocks <- NULL
  strand_info <- as.character(strand(bed))
  strand_info[strand_info == "*"] <- NA

  df <- data.frame(seqnames = seqnames(bed),
                   start = start(bed) - 1,
                   end = end(bed),
                   names = paste0(mapping[bed$name,'gene_id'],';',bed$name),
                   score = 0,
                   strand = strand_info,
                   thickStart = start(bed) - 1,
                   thickEnd = end(bed),
                   itemRgb = 1,
                   blockCount = elementNROWS(blocks),
                   blockSizes = unlist(lapply(width(blocks), paste, collapse = ","), use.names = FALSE),
                   blockStarts = unlist(lapply(start(blocks) - 1, paste, collapse = ","), use.names = FALSE))
  rownames(df) <- NULL

  message('Export bed file ...')
  write.table(df, file = file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged.bed')),
              sep = "\t", col.names = FALSE,
              row.names = FALSE, quote = FALSE, na = ".")

  message('Done: ',Sys.time())
  end.time <- Sys.time()
  time.taken <- end.time - start.time
  message(paste('Time for analysis:',round(time.taken,3),attributes(time.taken)$units),'\n')
}
wyguo/RTDBox documentation built on Jan. 31, 2023, 1:19 a.m.