#' Integrate the transcripts in the inference RTD to the reference RTD.
#' @param inf_file The gtf file of a inference RTD,e.g. a RTD assembled from short reads
#' @param ref_file The gtf file of a high confident reference RTD, e.g. a RTD assembled from long reads.
#' The transcripts of the
#' reference RTD will be all kept in the final results.
#' @param prefix_ref A prefix attached to the gene and transcript ids to distinguish their origin of "ref" RTD.
#' Default: "ref"
#' @param prefix_inf A prefix attached to the gene and transcript ids to distinguish their origin of "inf" RTD.
#' Default: "inf"
#' @param chimeric_tolerance Chimeric gene overlap percent, default: 0.05.
#' If the overlap of two gene models < x%, they are treated as separated gene models.
#' @param data_dir The directory to save the results
#' @return A merged RTD saved in the \code{data_di}
#' @example
#' data_dir <- 'data'
#' inf_file <- 'inf_file.gtf'
#' ref_file <- 'ref_file.gtf'
#' prefix_ref <- 'isoseq'
#' prefix_inf <- 'rnaseq'
#' chimeric_tolerance <- 0.05
#' rtdMerge(inf_file = inf_file,ref_file = ref_file,
#' prefix_ref = 'isoseq',prefix_inf = 'rnaseq',
#' chimeric_tolerance = 10,data_dir = data_dir)
rtdMerge <- function(inf_file,
ref_file,
prefix_ref='ref',
prefix_inf='inf',
chimeric_tolerance = 0.05,
data_dir=NULL){
start.time <- Sys.time()
if(is.null(data_dir))
data_dir <- getwd()
if(!file.exists(data_dir))
dir.create(data_dir,recursive = T)
message('Step 1: Load gtf files')
## load gtf files
ref <- import(ref_file)
inf <- import(inf_file)
if(grepl(pattern = '[.]bed',inf_file)){
inf <- unlist(blocks(inf))
inf$gene_id <- gsub(';.*','',names(inf))
inf$transcript_id <- gsub('.*;','',names(inf))
inf$type <- 'exon'
names(inf) <- NULL
inf <- sort(inf,by=~seqnames + start + end)
}
if(grepl(pattern = '[.]bed',ref_file)){
ref <- unlist(blocks(ref))
ref$gene_id <- gsub(';.*','',names(ref))
ref$transcript_id <- gsub('.*;','',names(ref))
ref$type <- 'exon'
names(ref) <- NULL
ref <- sort(ref,by=~seqnames + start + end)
}
ref <- cleanMcols(ref)
inf <- cleanMcols(inf)
ref <- ref[ref$type=='exon',]
inf <- inf[inf$type=='exon',]
## filter gene with multiple strands in inf RTD
strand_idx <- elementNROWS(range(GenomicRanges::split(inf,inf$gene_id)))
strand_idx <- names(strand_idx)[strand_idx>1]
inf <- inf[!(inf$gene_id %in% strand_idx),]
## add prefix to gene ids
ref$gene_id <- paste0(prefix_ref,'_',ref$gene_id)
ref$transcript_id <- paste0(prefix_ref,'_',ref$transcript_id)
ref$source <- prefix_ref
inf$gene_id <- paste0(prefix_inf,'_',inf$gene_id)
inf$transcript_id <- paste0(prefix_inf,'_',inf$transcript_id)
inf$source <- prefix_inf
message('Step 2: Identify novel transcript in the inference RTD')
inf_trans_list <- list()
message(' -> Process multi-exon transcripts')
####################################################
###---> Multi-exon transcripts
## generate unique labels of SJ chain
intron_ref <- exon2intron(ref)
intron_ref <- sort(intron_ref,by = ~ seqnames + start + end)
intron_ref_sj <- intron_ref$label
intron_ref_trans <- base::split(intron_ref_sj,intron_ref$transcript_id)
intron_ref_trans <- sapply(intron_ref_trans, function(x) paste(x,collapse = ';'))
intron_inf <- exon2intron(inf)
intron_inf <- sort(intron_inf,by = ~ seqnames + start + end)
intron_inf_sj <- intron_inf$label
intron_inf_trans <- base::split(intron_inf_sj,intron_inf$transcript_id)
intron_inf_trans <- sapply(intron_inf_trans, function(x) paste(x,collapse = ';'))
## add source the ref rtd
trans_add_s <- names(intron_ref_trans)[intron_ref_trans %in% intron_inf_trans]
ref$source[ref$transcript_id %in% trans_add_s] <- paste0('SJ:',prefix_ref,'&',prefix_inf)
## get the transcript names with novel SJ labels
idx <- which(!(intron_inf_sj %in% intron_ref_sj))
trans_multiexon_novel <- unique(intron_inf[idx]$transcript_id)
## multi-exon transcripts in inf which are not novel
trans_multiexon_filtered <- setdiff(names(intron_inf_sj),trans_multiexon_novel)
inf_trans_list <- c(inf_trans_list,trans_multiexon_filtered=list(trans_multiexon_filtered))
inf_trans_list <- c(inf_trans_list,trans_multiexon_novel=list(trans_multiexon_novel))
message(' -> Process mono-exon transcripts')
## get mono exon transcripts
ref_mono <- getMonoExonTrans(ref)
inf_mono <- getMonoExonTrans(inf)
## check whether inf mono exon trans has overlap of exons of ref, introic is excluded.
trans_mono <- unique(inf_mono$transcript_id)
hits <- suppressWarnings(findOverlaps(query = inf_mono,
subject = ref,ignore.strand=FALSE))
## get novel mono exon trans with no overlap
trans_overlap <- inf_mono[queryHits(hits),]$transcript_id
trans_monoexon_novel <- setdiff(trans_mono,trans_overlap)
trans_monoexon_filtered <- setdiff(trans_mono,trans_monoexon_novel)
inf_trans_list <- c(inf_trans_list,trans_monoexon_novel=list(trans_monoexon_novel))
inf_trans_list <- c(inf_trans_list,trans_monoexon_filtered=list(trans_monoexon_filtered))
## novel transcripts in inf rtd, multi-exon + mono-exon
trans_novel<- union(trans_multiexon_novel,trans_monoexon_novel)
inf2keep <- inf[inf$transcript_id %in% trans_novel]
message('Step 3: Merge inf to ref RTD and rename transcript and gene ids')
rtd <- suppressWarnings(c(ref,inf2keep))
rtd <- sort(rtd,by=~seqnames+start+end)
rtd$gene_id_old <- rtd$gene_id
rtd$transcript_id_old <- rtd$transcript_id
### Gene initial rename, overlap between transcripts and gene collapsed region
gene_overlap <- getOverlapRange(rtd)
trans_range <- getTransRange(rtd)
hits <- findOverlaps(query = trans_range,subject = gene_overlap)
### Get the initial gene-transcript mapping table
mapping <- data.frame(seqnames=seqnames(trans_range[queryHits(hits)]),
transcript_id_old=trans_range[queryHits(hits)]$transcript_id,
gene_id_group=gene_overlap[subjectHits(hits)]$gene_id)
mapping <- mapping[!duplicated(mapping),]
mapping <- mapping[order(mapping$gene_id_group,decreasing = F),]
mapping$gene_i <- rep(1:length(unique(mapping$gene_id_group)),
rle(mapping$gene_id_group)$lengths)
mapping$gene_id <- paste0(mapping$seqnames,'G',mapping$gene_i)
mapping$transcript_i <- sequence(rle(mapping$gene_id)$lengths)
mapping$transcript_id <- paste0(mapping$gene_id,'.',mapping$transcript_i)
rownames(mapping) <- mapping$transcript_id_old
## rename
rtd$transcript_id <- mapping[rtd$transcript_id_old,'transcript_id']
rtd$gene_id <- mapping[rtd$transcript_id_old,'gene_id']
##### check intronic
# overlap between trans_range and intron disjoin
message(' -> Process intronic genes')
trans_range <- getTransRange(rtd)
introns <- exon2intron(exons = rtd)
introns_s <- GenomicRanges::split(introns,introns$gene_id)
introns_d <- unlist(disjoin(introns_s))
introns_d$gene_id <- names(introns_d)
hits <- findOverlaps(trans_range,introns_d,type = 'within')
if(NROW(hits)>0){
intronic <- trans_range[queryHits(hits)]
gene_overlap_intronic <- getOverlapRange(intronic)
trans_range_intronic <- getTransRange(intronic)
hits <- findOverlaps(query = trans_range_intronic,subject = gene_overlap_intronic)
mapping_intronic <- data.frame(
seqnames=seqnames(trans_range_intronic[queryHits(hits)]),
transcript_id_old=trans_range_intronic[queryHits(hits)]$transcript_id,
gene_id_group=gene_overlap_intronic[subjectHits(hits)]$gene_id)
mapping_intronic <- mapping_intronic[order(mapping_intronic$gene_id_group,
decreasing = F),]
mapping_intronic$gene_i <- rep(1:length(unique(mapping_intronic$gene_id_group)),
rle(mapping_intronic$gene_id_group)$lengths)
n <- max(mapping_intronic$gene_i)
mapping_intronic$gene_i <- mapping_intronic$gene_i/10^ceiling(log10(n)+1)
rownames(mapping_intronic) <- mapping_intronic$transcript_id_old
rownames(mapping) <- mapping$transcript_id
idx <- mapping_intronic$transcript_id_old
mapping[idx,'gene_i'] <- mapping[idx,'gene_i']+mapping_intronic[idx,'gene_i']
mapping$gene_i <- as.integer(factor(mapping$gene_i))
mapping$gene_id <- paste0(mapping$seqnames,'G',mapping$gene_i)
mapping <- mapping[order(mapping$gene_id,decreasing = F),]
mapping$transcript_i <- sequence(rle(mapping$gene_id)$lengths)
mapping$transcript_id <- paste0(mapping$gene_id,'.',mapping$transcript_i)
rownames(mapping) <- mapping$transcript_id_old
rtd$transcript_id <- mapping[rtd$transcript_id_old,'transcript_id']
rtd$gene_id <- mapping[rtd$transcript_id_old,'gene_id']
trans_range <- getTransRange(rtd)
}
# trans_range$gene_id <- gsub('[.].*','',trans_range$transcript_id)
trans_range$gene_id <- sub('\\..[^\\.]*$', '', trans_range$transcript_id)
message(' -> Process chimeric check')
###---> find overlap between transcript ranges and transcript ranges
hits <- findOverlaps(query = trans_range,subject = trans_range)
idx <- which(queryHits(hits)!=subjectHits(hits))
hits <- hits[idx,]
## get the overlaps < chimeric_tolerance
gr1 <- trans_range[queryHits(hits)]
gr2 <- trans_range[subjectHits(hits)]
overlaps <- pintersect(gr1,gr2)
p1 <- width(overlaps)/width(gr1)
p2 <- width(overlaps)/width(gr2)
idx <- which(p1 < chimeric_tolerance & p2 < chimeric_tolerance)
overlaps_pass <- overlaps[idx]
overlaps_pass$from <- gr1$transcript_id[idx]
overlaps_pass$to <- gr2$transcript_id[idx]
overlaps_pass$transcript_id <- NULL
overlaps_pass$hit <- NULL
overlaps_pass <- unique(overlaps_pass)
## do second check, if the overlaps inside in other transcritps
trans <- union(gr1$transcript_id[idx],gr2$transcript_id[idx])
genes <- unique(sub('\\..[^\\.]*$', '', trans))
trans_range_pass <- trans_range[trans_range$gene_id %in% genes]
trans_range_pass_remain <-
trans_range_pass[!(trans_range_pass$transcript_id %in% trans)]
hits_pass <- findOverlaps(query = overlaps_pass,trans_range_pass_remain,
type = 'within')
idx <- unique(queryHits(hits_pass))
overlaps_pass <- overlaps_pass[-idx,]
overlaps_pass_split <- GenomicRanges::split(overlaps_pass,overlaps_pass$gene_id)
## if multiple overlap, get range and check range length agaist transcript length
if(any(elementNROWS(overlaps_pass_split)>1)){
overlaps_pass_split <- unlist(range(overlaps_pass_split))
overlaps_pass_split$gene_id <- names(overlaps_pass_split)
# names(overlaps_pass_split) <- NULL
overlaps_pass_split <- overlaps_pass_split[overlaps_pass$gene_id]
overlaps_pass_split$from <- overlaps_pass$from
overlaps_pass_split$to <- overlaps_pass$to
#check again if the region is more than > tolorance
trans_l <- width(trans_range)
names(trans_l) <- trans_range$transcript_id
overlaps_pass_split$from_overlap <- width(overlaps_pass_split)/trans_l[overlaps_pass_split$from]
overlaps_pass_split$to_overlap <- width(overlaps_pass_split)/trans_l[overlaps_pass_split$to]
idx_filter <- which(overlaps_pass_split$to_overlap >= chimeric_tolerance |
overlaps_pass_split$from_overlap >= chimeric_tolerance)
if(length(idx_filter)>0){
genes2filer <- unique(overlaps_pass_split$gene_id[idx_filter])
overlaps_pass <- overlaps_pass[!(overlaps_pass$gene_id %in% genes2filer)]
} else {
overlaps_pass <- overlaps_pass
}
}
trans_range_pass <- trans_range_pass[trans_range_pass$gene_id %in% overlaps_pass$gene_id]
## repeat the overlap to match the overlap to all the transcripts in the gene
overlaps_pass <- GenomicRanges::split(overlaps_pass,overlaps_pass$gene_id)
overlaps_pass <- reduce(overlaps_pass)
overlaps_pass <- unlist(overlaps_pass)
overlaps_pass <- overlaps_pass[trans_range_pass$gene_id]
overlaps_pass$gene_id <- trans_range_pass$gene_id
overlaps_pass$transcript_id <- trans_range_pass$transcript_id
## take away overlap from all the transcripts
trans_range_pass_cut <- psetdiff(trans_range_pass,overlaps_pass)
mcols(trans_range_pass_cut) <- mcols(trans_range_pass)
gene_overlap_chemric <- getOverlapRange(trans_range_pass_cut)
hits <- findOverlaps(trans_range_pass_cut,gene_overlap_chemric)
if(NROW(hits)>0){
mapping_chemric <- data.frame(
seqnames=seqnames(trans_range_pass_cut[queryHits(hits)]),
transcript_id_old=trans_range_pass_cut[queryHits(hits)]$transcript_id,
gene_id_group=gene_overlap_chemric[subjectHits(hits)]$gene_id)
mapping_chemric <- mapping_chemric[order(mapping_chemric$gene_id_group,
decreasing = F),]
mapping_chemric$gene_i <- rep(1:length(unique(mapping_chemric$gene_id_group)),
rle(mapping_chemric$gene_id_group)$lengths)
n <- max(mapping_chemric$gene_i)
mapping_chemric$gene_i <- mapping_chemric$gene_i/10^ceiling(log10(n)+1)
rownames(mapping_chemric) <- mapping_chemric$transcript_id_old
rownames(mapping) <- mapping$transcript_id
idx <- mapping_chemric$transcript_id_old
mapping[idx,'gene_i'] <- mapping[idx,'gene_i']+mapping_chemric[idx,'gene_i']
mapping$gene_i <- as.integer(factor(mapping$gene_i))
mapping$gene_id <- paste0(mapping$seqnames,'G',mapping$gene_i)
mapping$transcript_i <- sequence(rle(mapping$gene_id)$lengths)
mapping$transcript_id <- paste0(mapping$gene_id,'.',mapping$transcript_i)
rownames(mapping) <- mapping$transcript_id_old
rtd$transcript_id <- mapping[rtd$transcript_id_old,'transcript_id']
rtd$gene_id <- mapping[rtd$transcript_id_old,'gene_id']
}
rtd$gene_id_old <- NULL
rtd$transcript_id_old <- NULL
message('Step 4: Summary the merged RTD')
###---> transcript per gene
n <- as.character(c(1:20,'>20'))
diversity <- lapply(list(ref=ref,inf=inf,merged=rtd), function(gr){
trans2gene <- data.frame(TXNAME=gr$transcript_id,GENEID=gr$gene_id)
trans2gene <- trans2gene[!duplicated(trans2gene),]
idx <- table(table(trans2gene$GENEID))
if(max(as.numeric(names(idx))) > 20){
idx1 <- idx[as.numeric(names(idx))<=20]
idx2 <- sum(idx[as.numeric(names(idx))>20])
names(idx2) <- '>20'
idx <- c(idx1,idx2)
}
num <- rep(0,length(n))
names(num) <- n
num[names(idx)] <- idx
num
})
transpergene <- do.call(cbind,diversity)
transpergene <- data.frame(TransPerGene=rownames(transpergene),transpergene,row.names = NULL)
colnames(transpergene) <- c('TransPerGene',prefix_ref,prefix_inf,'merged')
write.csv(transpergene,
file=file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,
'_transcript_per_gene_number.csv')),
row.names = F)
data2plot <- transpergene[,-1]
rownames(data2plot) <- transpergene[,1]
data2plot <- t(data2plot)
png(file.path(data_dir,
paste0('htsm_rtd_',prefix_ref,'_',prefix_inf,'_transcript_per_gene_number.png')),
width = 10,height = 5,res = 300,units = 'in')
xx <- barplot(data2plot,beside = TRUE,
col = c("#999999", "#E69F00", "#56B4E9"),
ylim=c(0,1.2*max(data2plot)),
xlab = 'Transcripts in a gene',
ylab = 'Genen number',
main='Transcript number per gene',
space=c(0,0.5))
## Add text at top of bars
text(x = xx, y = data2plot, label = data2plot, cex = 0.7, col = "red",
srt=-90,adj=c(1.1,0.5))
legend("topright",
legend = rownames(data2plot),
fill = c("#999999", "#E69F00", "#56B4E9"))
dev.off()
pdf(file.path(data_dir,
paste0('htsm_rtd_',prefix_ref,'_',prefix_inf,'_transcript_per_gene_number.pdf')),
width = 10,height = 5)
xx <- barplot(data2plot,beside = TRUE,
col = c("#999999", "#E69F00", "#56B4E9"),
ylim=c(0,1.2*max(data2plot)),
xlab = 'Transcripts in a gene',
ylab = 'Genen number',
main='Transcript number per gene',
space=c(0,0.5))
## Add text at top of bars
text(x = xx, y = data2plot, label = data2plot, cex = 0.7, col = "red",
srt=-90,adj=c(1.1,0.5))
legend("topright",
legend = rownames(data2plot),
fill = c("#999999", "#E69F00", "#56B4E9"))
dev.off()
###---> basic statistics
stat_merged <- rtdSummary(gr = rtd)
stat_ref <- rtdSummary(gr = ref)
stat_inf <- rtdSummary(gr = inf)
stat <- cbind(stat_ref,stat_inf,stat_merged)
colnames(stat) <- c(prefix_ref,prefix_inf,'merged')
write.csv(stat,file=file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged_summary.csv')))
message('Step 5: Save the results')
export_gtf(gr = rtd,
file2save = file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged.gtf')))
save(rtd,file = file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged.RData')))
save(inf_trans_list,file=file.path(data_dir,paste0(prefix_inf,'_trans_summary.RData')))
#######################
###---> save to bed file
exon <- rtd
mapping <- data.frame(gene_id = exon$gene_id,transcript_id = exon$transcript_id)
mapping <- mapping[!duplicated(mapping),]
rownames(mapping) <- mapping$transcript_id
exon <- GenomicRanges::split(exon,exon$transcript_id)
bed <- rtracklayer::asBED(exon)
blocks <- bed$blocks
bed$blocks <- NULL
strand_info <- as.character(strand(bed))
strand_info[strand_info == "*"] <- NA
df <- data.frame(seqnames = seqnames(bed),
start = start(bed) - 1,
end = end(bed),
names = paste0(mapping[bed$name,'gene_id'],';',bed$name),
score = 0,
strand = strand_info,
thickStart = start(bed) - 1,
thickEnd = end(bed),
itemRgb = 1,
blockCount = elementNROWS(blocks),
blockSizes = unlist(lapply(width(blocks), paste, collapse = ","), use.names = FALSE),
blockStarts = unlist(lapply(start(blocks) - 1, paste, collapse = ","), use.names = FALSE))
rownames(df) <- NULL
message('Export bed file ...')
write.table(df, file = file.path(data_dir,paste0('htsm_rtd_',prefix_ref,'_', prefix_inf,'_merged.bed')),
sep = "\t", col.names = FALSE,
row.names = FALSE, quote = FALSE, na = ".")
message('Done: ',Sys.time())
end.time <- Sys.time()
time.taken <- end.time - start.time
message(paste('Time for analysis:',round(time.taken,3),attributes(time.taken)$units),'\n')
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.