R/inference_V2_new.R

Defines functions mcalrate calrate inferfunction plotpairs calsig transpoint expandratio binratio smooth_fun mixscan dirscan getreadslist parsereadsfile makeutrlist getlastexon makegenelist getfpkm geneexonfreq calexonfreq genegccont geneintervalscreen removeoverlap unregister_dopar

Documented in calrate makegenelist mcalrate

#rm(list = ls())

#gc()

#wkdir <- 'C:/Users/yuabr/Desktop/Transfer/codetransfer/proRate/proRate_V2/'

#setwd(wkdir)

#Import data####

#library(proRate)

#wt0file <- system.file('extdata', 'wt0.bam', package = 'proRate')
#wt15file <- system.file('extdata', 'wt15.bam', package = 'proRate')
#ko0file <- system.file('extdata', 'ko0.bam', package = 'proRate')
#ko15file <- system.file('extdata', 'ko15.bam', package = 'proRate')

#targetfile <- system.file('extdata', 'targetgenes.txt', package = 'proRate')

#fp0file <- './testdatasets/fp_00_bamAligned.sortedByCoord.out.qc.chr19.bam'
#fp12file <- './testdatasets/fp_12_bamAligned.sortedByCoord.out.qc.chr19.bam'


#Functions####

#Some parallel computing going on in the background that is not getting cleaned up
#fully between runs can cause Error in summary.connection(connection) : invalid
#connection. The following function is needed to be called to fix this error.
unregister_dopar <- function(){
  
  env <- foreach:::.foreachGlobals
  rm(list=ls(name=env), pos=env)
  
}

removeoverlap <- function(coords = genecoords, ignorestrand = TRUE){
  
  dis <- GenomicRanges::distanceToNearest(coords, ignore.strand = ignorestrand)
  disvec <- dis@elementMetadata$distance
  
  if(sum(disvec == 0)  == 0){
    return(coords)
  }else{
    newgenecoords <- coords[disvec != 0]
    removeoverlap(coords = newgenecoords, ignorestrand = ignorestrand)
  }
  
}

geneintervalscreen <- function(origenes = genes, ignorestrand = TRUE){
  
  #library(GenomicRanges)
  
  if(ignorestrand == TRUE){
    ups <- GenomicRanges::follow(origenes, BiocGenerics::unstrand(origenes))
    dns <- GenomicRanges::precede(origenes, BiocGenerics::unstrand(origenes))
  }else{
    ups <- GenomicRanges::follow(origenes, origenes)
    dns <- GenomicRanges::precede(origenes, origenes)
  }
  #follow returns the nearest upstream region of the query and considers the strandness of the query.
  #If the query is '+' stranded, its upstream region returned will before its start (also on '+' strand,
  #or on either '+' or '-' strand, if the strand information of the subject is removed with unstrand())
  #If the query is '-' stranded, its upstream region returned will after its end (also on '-' strand,
  #or on either '-' or '+' strand, if the strand information of the subject is removed with unstrand())
  #precede returns the nearest downstream region of the query and considers the strandness of the query.
  #If the query is '+' stranded, its downstream region returned will after its end
  #If the query is '-' stranded, its downstream region returned will before its start
  
  strandups <- ups
  stranddns <- dns
  
  strandupdis <- GenomicFeatures::distance(x = origenes[!is.na(strandups)], y = origenes[strandups[!is.na(strandups)]],
                                           ignore.strand = ignorestrand)
  stranddndis <- GenomicFeatures::distance(x = origenes[!is.na(stranddns)], y = origenes[stranddns[!is.na(stranddns)]],
                                           ignore.strand = ignorestrand)
  #distance returns the distance between the closest 2 points of the upstream and the downstream regions
  #(the 2 points are not included in the distance), no matter both of them are on '+' strand or '-' strand.
  #However, if x and y are on different strands, it will return NA, unless to set ignore.strand = TRUE,
  #and it will treat them as on the same strand
  
  
  
  strandupdises <- rep(NA, length(origenes))
  stranddndises <- rep(NA, length(origenes))
  strandupdises[!is.na(strandups)] <- strandupdis
  stranddndises[!is.na(stranddns)] <- stranddndis
  origenes$updis <- strandupdises
  origenes$dndis <- stranddndises
  
  
  
  otherstrandupdis <- origenes[is.na(strandups)]
  otherstranddndis <- origenes[is.na(stranddns)]
  
  outlierdis <- function(targets = otherstrandupdis, dir = 'up'){
    
    chrlens <- targets@seqinfo@seqlengths
    names(chrlens) <- targets@seqinfo@seqnames
    
    targetchrlens <- chrlens[as.character(targets@seqnames)]
    disstart <- targets@ranges@start
    disend <- targetchrlens - (targets@ranges@start + targets@ranges@width - 1)
    disend <- as.vector(disend)
    
    dises <- matrix(c(disstart, disend), nrow = 2, byrow = TRUE)
    dises <- as.vector(dises)
    
    if(dir == 'up'){
      dises <- dises[rep(c('+', '-'), length(targets)) == rep(as.character(targets@strand), each = 2)]
      targets$updis <- dises
    }else{
      dises <- dises[rep(c('-', '+'), length(targets)) == rep(as.character(targets@strand), each = 2)]
      targets$dndis <- dises
    }
    
    return(targets)
    
  }
  
  otherstrandupdis <- outlierdis(targets = otherstrandupdis, dir = 'up')
  otherstranddndis <- outlierdis(targets = otherstranddndis, dir = 'dn')
  
  
  origenes$updis[is.na(origenes$updis)] <- otherstrandupdis$updis
  origenes$dndis[is.na(origenes$dndis)] <- otherstranddndis$dndis
  
  return(origenes)
  
}

genegccont <- function(origenes = genes, genomename = 'mm10'){
  
  #library(GenomicRanges)
  #library(Biostrings)
  
  if(genomename == 'mm10'){
    #library("BSgenome.Mmusculus.UCSC.mm10")
    seqs <- Biostrings::getSeq(BSgenome.Mmusculus.UCSC.mm10::Mmusculus, origenes)
  }else if(genomename == 'hg38'){
    #library("BSgenome.Hsapiens.UCSC.hg38")
    seqs <- Biostrings::getSeq(BSgenome.Hsapiens.UCSC.hg38::Hsapiens, origenes)
  }
  
  #counts_genes <- oligonucleotideFrequency(seqs, width = 6, step = 1)
  #counts_genes <- as.data.frame(counts_genes)
  #freq_genes <- colSums(counts_genes)
  #freq_genes <- freq_genes[order(-freq_genes)]
  
  seqlist <- unlist(strsplit(toString(seqs), split = ', '))
  seqframe <- data.frame(genesym = origenes$gene_id,
                         seq = seqlist, stringsAsFactors = FALSE)
  
  #library(seqinr)
  calGC <- function(line){
    seqstring <- as.vector(unlist(line[2]))
    seqvector <- strsplit(seqstring, split = '')[[1]]
    gccontent <- seqinr::GC(seqvector)
    return(gccontent)
  }
  
  genegc <- apply(seqframe, 1, calGC)
  genegc <- data.frame(genesym = seqframe$genesym, seqgc = genegc, stringsAsFactors = FALSE)
  
  origenes$GC <- genegc$seqgc
  
  return(origenes)
  
}

calexonfreq <- function(genelineidx, origenes, exoncoords){
  
  geneline <- origenes[genelineidx]
  geneexons <- GenomicRanges::intersect(exoncoords, geneline)
  
  if(length(geneexons) == 0){
    return(0)
  }else{
    
    geneexons <- GenomicRanges::reduce(geneexons)
    geneexons$gene_id <- geneline$gene_id
    exonlens <- sum(geneexons@ranges@width)
    genelen <- geneline@ranges@width
    exonfreq <- exonlens/genelen
    
    return(exonfreq)
    
  }
  
}

geneexonfreq <- function(origenes = genes, genomename = 'mm10'){
  
  #origenes <- get(paste0(genomename, '.packagegenes'))[1:100]
  
  
  #library(AnnotationDbi)
  
  if(genomename == 'hg38'){
    
    #library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    #library(org.Hs.eg.db)
    
    exoncoords <- GenomicFeatures::exons(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene)
    
  }else if(genomename == 'mm10'){
    
    #library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    #library(org.Mm.eg.db)
    
    exoncoords <- GenomicFeatures::exons(TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene)
    
  }
  
  geneexonfreqs <- sapply(X = 1:length(origenes), FUN = calexonfreq, 
                          origenes = origenes, exoncoords = exoncoords)
  
  origenes$exon <- geneexonfreqs
  
  return(origenes)
  
}

getfpkm <- function(features = genes, 
                    reads = reads1,
                    readsdepth = reads1depth, 
                    singleend = FALSE){
  
  
  fcounts <- GenomicAlignments::summarizeOverlaps(features = features, reads = reads,
                                                  mode = 'IntersectionNotEmpty',
                                                  singleEnd = singleend, 
                                                  ignore.strand = FALSE)
  #Add singleEnd = FALSE for pair-end
  #Add ignore.strand = FALSE for strand specific
  
  fcounts <- as.data.frame(SummarizedExperiment::assays(fcounts)$counts)
  #assays(x, ...)
  #The SummarizedExperiment class is a matrix-like container where rows represent 
  #features of interest (e.g. genes, transcripts, exons, etc...) and columns represent 
  #samples (with sample data summarized as a DataFrame). A SummarizedExperiment object 
  #contains one or more assays, each represented by a matrix-like object of numeric
  #or other mode.
  #x A SummarizedExperiment object
  
  featurewidth <- features@ranges@width
  
  factorM <- readsdepth/10^6
  factorK <- featurewidth/10^3
  
  fpkms <- fcounts/factorK/factorM
  features$fpkm <- fpkms$reads
  
  return(features)
  
}

#'Generate a GRanges object of UCSC known genes in specific genome
#'
#'Generate a GRanges object of UCSC known genes in specific genome.
#'
#'@param genomename The name of the genome, can be "mm10" for mouse or "hg38" 
#'  for human.
#'@param save Whether save the result GRanges as a file in work directory. 
#'  Default is TRUE.
#'@param calgenegccont Whether need to calculate the GC content for each gene 
#'  and include it into the result GRanges object. Default is TRUE.
#'@param calgeneexonfreq Whether calculate the exon frequency for each gene 
#'  and include it into the result GRanges object. Default is TRUE.
#'@return A GRanges object of UCSC known genes, with several gene information 
#'  provided, including seqnames, ranges, strands, gene symbols, distance to 
#'  the nearest upstream gene, distance to the nearest downstream gene, etc. 
#'  Genes overlapping with each other will not be included.
#'@export
makegenelist <- function(genomename = 'mm10', 
                         save = TRUE, 
                         
                         calgenegccont = TRUE, 
                         calgeneexonfreq = TRUE){
  
  #library(AnnotationDbi)
  
  if(genomename == 'hg38'){
    
    #library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    #library(org.Hs.eg.db)
    
    genecoords <- GenomicFeatures::genes(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene)
    #Select genes with unique gene symbols
    gene_ids <- genecoords$gene_id
    genesyms <- AnnotationDbi::select(x = org.Hs.eg.db::org.Hs.eg.db, keys = gene_ids,
                                      columns = 'SYMBOL', keytype = 'ENTREZID')
    
    targetchrs <- paste0('chr', c(seq(1, 22, 1), 'X', 'Y', 'M'))
    
  }else if(genomename == 'mm10'){
    
    #library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    #library(org.Mm.eg.db)
    
    genecoords <- GenomicFeatures::genes(TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene)
    #Select genes with unique gene symbols
    gene_ids <- genecoords$gene_id
    genesyms <- AnnotationDbi::select(x = org.Mm.eg.db::org.Mm.eg.db, keys = gene_ids,
                                      columns = 'SYMBOL', keytype = 'ENTREZID')
    
    targetchrs <- paste0('chr', c(seq(1, 19, 1), 'X', 'Y', 'M'))
    
  }
  
  
  genesyms <- genesyms[complete.cases(genesyms), , drop = FALSE]
  names(genesyms) <- c('entrez', 'gene')
  dupgenes <- unique(genesyms$gene[duplicated(genesyms$gene)])
  genesyms <- subset(genesyms, !(gene %in% dupgenes))
  
  genecoords <- genecoords[gene_ids %in% genesyms$entrez]
  genecoords$gene_id <- genesyms$gene
  
  
  #Select genes only on chr1 to chrM
  genecoords <- GenomeInfoDb::keepSeqlevels(x = genecoords, value = targetchrs, pruning.mode = 'coarse')
  
  #Remove overlapped genes, no matter the strands are different or not###
  genecoords <- removeoverlap(coords = genecoords, ignorestrand = TRUE)
  
  #Add the information of the upstream distance and downstream distance to the nearest genes,
  #no matter the strands are different or not###
  genecoords <- geneintervalscreen(origenes = genecoords, ignorestrand = TRUE)
  
  
  genecoords <- GenomeInfoDb::keepSeqlevels(x = genecoords, value = unique(as.character(genecoords@seqnames)),
                                            pruning.mode = 'coarse')
  
  if(calgenegccont == TRUE){
    genecoords <- genegccont(origenes = genecoords, genomename = genomename)
  }
  
  if(calgeneexonfreq == TRUE){
    genecoords <- geneexonfreq(origenes = genecoords, genomename = genomename)
  }
  
  
  genecoords <- GenomicAlignments::sort(genecoords)
  genes <- genecoords
  names(genes) <- 1:length(genes)
  
  if(save == TRUE){
    saveRDS(genes, paste0(genomename, '.packagegenes.rds'))
  }
  
  return(genes)
  
}

#mm10genes <- makegenelist(genomename = 'mm10')

#hg38genes <- makegenelist(genomename = 'hg38')



getlastexon <- function(genelineidx = 1, origenes, exoncoords){
  
  geneline <- origenes[genelineidx]
  geneexons <- GenomicRanges::intersect(exoncoords, geneline)
  
  if(length(geneexons) == 0){
    
    lastexon <- NULL
    
  }else{
    
    if(as.vector(geneline@strand) == '+'){
      
      lastexon <- geneexons[length(geneexons)]
      lastexon$gene_id <- geneline$gene_id
      
    }else if(as.vector(geneline@strand) == '-'){
      
      lastexon <- geneexons[1]
      lastexon$gene_id <- geneline$gene_id
      
    }else{
      
      lastexon <- NULL
      
    }
    
  }
  
  return(lastexon)
  
}

makeutrlist <- function(genes, 
                        genomename = 'mm10'){
  
  #library(AnnotationDbi)
  
  if(genomename == 'hg38'){
    
    #library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    #library(org.Hs.eg.db)
    
    exoncoords <- GenomicFeatures::exons(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene)
    
  }else if(genomename == 'mm10'){
    
    #library(TxDb.Mmusculus.UCSC.mm10.knownGene)
    #library(org.Mm.eg.db)
    
    exoncoords <- GenomicFeatures::exons(TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene)
    
  }
  
  lastexons <- sapply(X = 1:length(genes), FUN = getlastexon, origenes = genes, exoncoords = exoncoords)
  
  lastexons <- do.call(c, lastexons)
  
  return(lastexons)
  
}



parsereadsfile <- function(readsfile = time1file, strandmethod = 0){
  
  #library(GenomicAlignments)
  
  if(is.character(readsfile)){
    if(file.exists(readsfile)){
      
      if(strandmethod %in% c(0, 1, 2)){
        
        flags <- Rsamtools::scanBamFlag(isUnmappedQuery = FALSE, 
                                        isDuplicate = FALSE, 
                                        isSecondaryAlignment = FALSE, 
                                        
                                        isProperPair = TRUE)
        
        params <- Rsamtools::ScanBamParam(flag = flags)
        
        reads_ori <- GenomicAlignments::readGAlignmentPairs(readsfile, 
                                                            strandMode = strandmethod, 
                                                            
                                                            param = params)
        
      }else{
        
        flags <- Rsamtools::scanBamFlag(isUnmappedQuery = FALSE, 
                                        isDuplicate = FALSE, 
                                        isSecondaryAlignment = FALSE)
        
        params <- Rsamtools::ScanBamParam(flag = flags)
        
        reads_ori <- GenomicAlignments::readGAlignments(readsfile, 
                                                        
                                                        param = params)
      }
      reads <- as(reads_ori, 'GRanges')
      
    }else{
      
      reads <- GenomicRanges::GRanges()
      
      return(reads)
      
    }
    
  }else if(class(readsfile)[1] == 'GAlignmentPairs'){
    
    reads <- as(readsfile, 'GRanges')
    
  }else if(class(readsfile)[1] == 'GAlignments'){
    
    reads <- as(readsfile, 'GRanges')
    
  }else if(class(readsfile)[1] == 'GRanges'){
    
    reads <- readsfile
    
  }
  
  if(!('UCSC' %in% GenomeInfoDb::seqlevelsStyle(reads))){
    
    GenomeInfoDb::seqlevelsStyle(reads) <- 'UCSC'
    
  }
  
  if(length(reads) == 0){
    
    return(reads)
    
  }
  
  reads <- reads[grepl(pattern = 'chr[0-9].*|chrX|chrY', 
                       x = GenomeInfoDb::seqnames(reads))]
  
  #readscutoff <- max(301, min(quantile(reads@ranges@width, 0.95), 1000))
  
  #reads <- reads[reads@ranges@width < readscutoff]
  
  return(reads)
  
}



getreadslist <- function(timefiles = time1files, 
                         mergefiles = mergerefs, 
                         strandmode = strandmethod){
  
  readses <- list()
  
  i <- 1
  for(i in 1:length(timefiles)){
    timefile <- timefiles[i]
    
    if(mergefiles == TRUE){
      
      reads <- parsereadsfile(timefile, strandmethod = strandmode)
      
    }else{
      
      reads <- timefile
      
    }
    
    readses[[i]] <- reads
    
  }
  
  if(mergefiles == TRUE){
    readses <- do.call(c, readses)
    readses <- list(readses)
    
  }
  
  
  return(readses)
  
  
  
}



dirscan <- function(i = 8699, 
                    ext, 
                    
                    reads1, 
                    reads2, 
                    reads1depth, 
                    reads2depth){
  
  ext <- ext[i]
  
  chr <- GenomicAlignments::seqlevelsInUse(ext)
  GenomeInfoDb::seqlevels(ext) <- chr
  
  strandsign <- as.character(ext@strand@values)
  
  if(strandsign == '+'){
    
    start <- ext@ranges@start
    end <- start + ext@ranges@width - 1
    
    extreads1 <- IRanges::subsetByOverlaps(x = reads1, 
                                           ranges = ext)
    
    extreads2 <- IRanges::subsetByOverlaps(x = reads2, 
                                           ranges = ext)
    
    GenomeInfoDb::seqlevels(extreads1) <- chr
    GenomeInfoDb::seqlevels(extreads2) <- chr
    
    extreadscounts1 <- GenomicAlignments::coverage(extreads1, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    
    extreadscounts2 <- GenomicAlignments::coverage(extreads2, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    
    extreadscounts1 <- extreadscounts1[[1]]
    extreadscounts2 <- extreadscounts2[[1]]
    
  }else{
    
    start <- ext@ranges@start
    end <- start + ext@ranges@width - 1
    
    extreads1 <- IRanges::subsetByOverlaps(x = reads1, 
                                           ranges = ext)
    
    extreads2 <- IRanges::subsetByOverlaps(x = reads2, 
                                           ranges = ext)
    
    GenomeInfoDb::seqlevels(extreads1) <- chr
    GenomeInfoDb::seqlevels(extreads2) <- chr
    
    extreadscounts1 <- GenomicAlignments::coverage(extreads1, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    #Calculate the coverage per base, like wig file, per chrom
    
    extreadscounts2 <- GenomicAlignments::coverage(extreads2, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    
    extreadscounts1[[1]] <- rev(extreadscounts1[[1]])
    extreadscounts2[[1]] <- rev(extreadscounts2[[1]])
    
    extreadscounts1 <- extreadscounts1[[1]]
    extreadscounts2 <- extreadscounts2[[1]]
    
  }
  
  extdepth1 <- sum(extreadscounts1)
  extdepth2 <- sum(extreadscounts2)
  
  if(extdepth1 == 0 | extdepth2 == 0){
    return(NULL)
  }
  
  extreadscounts1_adj <- extreadscounts1/(reads1depth/10^6)
  extreadscounts2_adj <- extreadscounts2/(reads2depth/10^6)
  
  extdepth1_adj <- sum(extreadscounts1_adj)
  extdepth2_adj <- sum(extreadscounts2_adj)
  
  weight1 <- extdepth1_adj/mean(c(extdepth1_adj, extdepth2_adj))
  weight2 <- extdepth2_adj/mean(c(extdepth1_adj, extdepth2_adj))
  
  extreadscounts1_adj <- extreadscounts1_adj/weight1
  extreadscounts2_adj <- extreadscounts2_adj/weight2
  
  #Add pseudo count!!!
  #extreadscounts1_adj <- extreadscounts1_adj + 10^-5/weight1
  #extreadscounts2_adj <- extreadscounts2_adj + 10^-5/weight2
  
  #Add pseudo count 1!!!
  extreadscounts1_adj <- extreadscounts1_adj + 1/weight1
  extreadscounts2_adj <- extreadscounts2_adj + 1/weight2
  
  extreadscounts_adj <- extreadscounts1_adj + extreadscounts2_adj
  
  scanextreads <- as.vector(extreadscounts_adj)
  
  windowsizes <- seq(from = 50, 
                     to = ceiling(length(scanextreads)/2/50)*50, 
                     by = 50)
  
  windowdiffs <- c()
  
  windowmeanses <- list()
  
  for(m in 1:length(windowsizes)){
    
    windowsize <- windowsizes[m]
    
    windowmeans <- c()
    
    for(n in 1:length(scanextreads)){
      
      windowmean <- mean(scanextreads[n:min((windowsize + n - 1), 
                                            length(scanextreads))])
      windowmeans <- c(windowmeans, windowmean)
      
      cat(paste0('m = ', m, '; n = ', n, '\n'))
      
    }
    
    windowmeanses[[m]]  <- windowmeans
    
    windowdiff <- windowmeans[1] - min(windowmeans)
    
    windowdiffs <- c(windowdiffs, windowdiff)
    
  }
  
  windowidx <- which.max(windowdiffs)
  
  endpoint <- which.min(windowmeanses[[windowidx]]) - 1
  
  if(endpoint == 0){
    
    return(NULL)
    
  }
  
  if(strandsign == '+'){
    
    endpoint <- start + endpoint - 1
    
    endext <- GenomicRanges::GRanges(seqnames = chr, 
                                     ranges = IRanges::IRanges(start = start, 
                                                               end = endpoint), 
                                     strand = strandsign)
    
    endext@elementMetadata <- ext@elementMetadata
    
  }else{
    
    endpoint <- end - endpoint + 1
    
    endext <- GenomicRanges::GRanges(seqnames = chr, 
                                     ranges = IRanges::IRanges(start = endpoint, 
                                                               end = end), 
                                     strand = strandsign)
    
    endext@elementMetadata <- ext@elementMetadata
    
  }
  
  
  windowsize <- windowsizes[windowidx]
  
  res <- endext
  
  return(res)
  
}

mixscan <- function(i = 7302, 
                    mixext, 
                    extpair, 
                    
                    reads1, 
                    reads2, 
                    reads1depth, 
                    reads2depth){
  
  ext1 <- mixext[i]
  extpair <- subset(extpair, mixexts1 == ext1$gene_id)
  ext2 <- mixext[mixext$gene_id == extpair$mixexts2]
  
  chr <- GenomicAlignments::seqlevelsInUse(ext1)
  GenomeInfoDb::seqlevels(ext1) <- chr
  GenomeInfoDb::seqlevels(ext2) <- chr
  
  strandsign1 <- as.character(ext1@strand@values)
  strandsign2 <- as.character(ext2@strand@values)
  
  strandsigns <- c(strandsign1, strandsign2)
  exts <- c(ext1, ext2)
  names(exts) <- c(1, 2)
  
  mergedext <- GenomicRanges::reduce(exts, ignore.strand = TRUE)
  
  if(strandsign1 == '+'){
    
    start <- mergedext@ranges@start
    end <- start + mergedext@ranges@width - 1
    
    extreads1 <- IRanges::subsetByOverlaps(x = reads1, 
                                           ranges = mergedext)
    
    extreads2 <- IRanges::subsetByOverlaps(x = reads2, 
                                           ranges = mergedext)
    
    GenomeInfoDb::seqlevels(extreads1) <- chr
    GenomeInfoDb::seqlevels(extreads2) <- chr
    
    extreadscounts1 <- GenomicAlignments::coverage(extreads1, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    
    extreadscounts2 <- GenomicAlignments::coverage(extreads2, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    
    extreadscounts1 <- extreadscounts1[[1]]
    extreadscounts2 <- extreadscounts2[[1]]
    
  }else{
    
    start <- mergedext@ranges@start
    end <- start + mergedext@ranges@width - 1
    
    extreads1 <- IRanges::subsetByOverlaps(x = reads1, 
                                           ranges = mergedext)
    
    extreads2 <- IRanges::subsetByOverlaps(x = reads2, 
                                           ranges = mergedext)
    
    GenomeInfoDb::seqlevels(extreads1) <- chr
    GenomeInfoDb::seqlevels(extreads2) <- chr
    
    extreadscounts1 <- GenomicAlignments::coverage(extreads1, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    #Calculate the coverage per base, like wig file, per chrom
    
    extreadscounts2 <- GenomicAlignments::coverage(extreads2, 
                                                   shift = -(start - 1), 
                                                   width = (end - start + 1))
    
    extreadscounts1[[1]] <- rev(extreadscounts1[[1]])
    extreadscounts2[[1]] <- rev(extreadscounts2[[1]])
    
    extreadscounts1 <- extreadscounts1[[1]]
    extreadscounts2 <- extreadscounts2[[1]]
    
  }
  
  extdepth1 <- sum(extreadscounts1)
  extdepth2 <- sum(extreadscounts2)
  
  if(extdepth1 == 0 | extdepth2 == 0){
    
    newextgenes <- exts
    
    scanextreadses <- list()
    
    res <- list()
    res$extgenes <- newextgenes
    
    return(res)
    
  }
  
  extreadscounts1_adj <- extreadscounts1/(reads1depth/10^6)
  extreadscounts2_adj <- extreadscounts2/(reads2depth/10^6)
  
  extdepth1_adj <- sum(extreadscounts1_adj)
  extdepth2_adj <- sum(extreadscounts2_adj)
  
  weight1 <- extdepth1_adj/mean(c(extdepth1_adj, extdepth2_adj))
  weight2 <- extdepth2_adj/mean(c(extdepth1_adj, extdepth2_adj))
  
  extreadscounts1_adj <- extreadscounts1_adj/weight1
  extreadscounts2_adj <- extreadscounts2_adj/weight2
  
  #Add pseudo count!!!
  #extreadscounts1_adj <- extreadscounts1_adj + 10^-5/weight1
  #extreadscounts2_adj <- extreadscounts2_adj + 10^-5/weight2
  
  #Add pseudo count 1!!!
  extreadscounts1_adj <- extreadscounts1_adj + 1/weight1
  extreadscounts2_adj <- extreadscounts2_adj + 1/weight2
  
  extreadscounts_adj <- extreadscounts1_adj + extreadscounts2_adj
  
  
  
  extreadscounts_adj <- extreadscounts_adj/2
  
  scanextreads <- as.vector(extreadscounts_adj)
  
  windowsizes <- seq(from = 50, 
                     to = ceiling(length(scanextreads)/2/50)*50, 
                     by = 50)
  
  windowdiffs <- c()
  
  windowmeanses <- list()
  
  for(m in 1:length(windowsizes)){
    
    windowsize <- windowsizes[m]
    
    windowmeans1 <- c()
    windowmeans2 <- c()
    
    for(n in 1:length(scanextreads)){
      
      windowmean1 <- mean(scanextreads[n:min((windowsize + n - 1), 
                                             length(scanextreads))])
      windowmean2 <- mean(rev(scanextreads)[n:min((windowsize + n - 1), 
                                                  length(scanextreads))])
      
      windowmeans1 <- c(windowmeans1, windowmean1)
      windowmeans2 <- c(windowmeans2, windowmean2)
      
      
    }
    
    windowmeans <- (windowmeans1 + rev(windowmeans2))/2
    
    windowmeanses[[m]]  <- windowmeans
    
    windowdiff <- windowmeans[1] + windowmeans[length(windowmeans)] - 
      min(windowmeans)
    
    windowdiffs <- c(windowdiffs, windowdiff)
    
  }
  
  windowidxes <- order(-windowdiffs)
  
  defined <- FALSE
  
  for(windowidx in windowidxes){
    
    endpoint1 <- which.min(windowmeanses[[windowidx]]) - 1
    
    endpoint2 <- which.min(rev(windowmeanses[[windowidx]])) - 1
    
    if(endpoint1 <= ext1@ranges@width & endpoint2 <= ext2@ranges@width & 
       endpoint1 > 0 & endpoint2 > 0){
      
      defined <- TRUE
      
      break()
      
    }
    
  }
  
  if(defined == FALSE){
    
    windowidx <- windowidxes[1]
    
    endpoint1 <- which.min(windowmeanses[[windowidx]]) - 1
    
    endpoint2 <- which.min(rev(windowmeanses[[windowidx]])) - 1
    
    endpoint1 <- min(endpoint1, ext1@ranges@width)
    
    endpoint2 <- min(endpoint2, ext2@ranges@width)
    
  }
  
  endpoints <- c(endpoint1, endpoint2)
  
  res <- list()
  
  res$mixgenes <- c()
  
  if(sum(endpoints == 0) == 1){
    
    if(endpoints[1] == 0){
      
      return(NULL)
      
    }
    
    endpoint <- endpoints[endpoints != 0]
    strandsign <- strandsigns[endpoints != 0]
    ext <- exts[endpoints != 0]
    
    start <- ext@ranges@start
    end <- start + ext@ranges@width - 1
    
    if(strandsign == '+'){
      
      endpoint <- start + endpoint - 1
      
      endext <- GenomicRanges::GRanges(seqnames = chr, 
                                       ranges = IRanges::IRanges(start = start, 
                                                                 end = endpoint), 
                                       strand = strandsign)
      
      endext@elementMetadata <- ext@elementMetadata
      
    }else{
      
      endpoint <- end - endpoint + 1
      
      endext <- GenomicRanges::GRanges(seqnames = chr, 
                                       ranges = IRanges::IRanges(start = endpoint, 
                                                                 end = end), 
                                       strand = strandsign)
      
      endext@elementMetadata <- ext@elementMetadata
      
    }
    
    res$mixgenes <- endext
    
  }else{
    
    strandsign <- strandsigns[1]
    endpoint <- endpoints[1]
    ext <- exts[1]
    
    start <- ext@ranges@start
    end <- start + ext@ranges@width - 1
    
    if(strandsign == '+'){
      
      endpoint <- start + endpoint - 1
      
      endext <- GenomicRanges::GRanges(seqnames = chr, 
                                       ranges = IRanges::IRanges(start = start, 
                                                                 end = endpoint), 
                                       strand = strandsign)
      
      endext@elementMetadata <- ext@elementMetadata
      
    }else{
      
      endpoint <- end - endpoint + 1
      
      endext <- GenomicRanges::GRanges(seqnames = chr, 
                                       ranges = IRanges::IRanges(start = endpoint, 
                                                                 end = end), 
                                       strand = strandsign)
      
      endext@elementMetadata <- ext@elementMetadata
      
    }
    
    res$mixgenes <- c(res$mixgenes, endext)
    
    
  }
  
  if(length(res) == 0){
    
    res <- NULL
    
  }
  
  return(res)
  
}

smooth_fun <- function(x){
  
  datatype <- class(x)[1]
  
  if(datatype == 'Rle'){
    
    x <- as.vector(x)
    
  }
  
  res <- predict(
    
    locfit::locfit(x ~ locfit::lp(seq_along(x), nn = 0.1, h = 0.8)), 
    seq_along(x)
    
  )
  
  res[res < 0] <- 0
  
  if(datatype == 'Rle'){
    
    res <- S4Vectors::Rle(res)
    
  }
  
  return(res)
  
}

binratio <- function(i = 1, 
                     datalist,
                     window_num,
                     startshorten, 
                     endshorten,
                     reads1depth, 
                     reads2depth){
  
  gene <- datalist$genes[i]
  
  #return(gene)
  
  chr <- GenomicAlignments::seqlevelsInUse(gene)
  GenomeInfoDb::seqlevels(gene) <- chr
  #seqlevels contains all the chrs in a factor
  #seqlevelsInUse only contains the chrs present in x
  
  strandsign <- as.character(gene@strand@values)
  
  if(strandsign == '+'){
    
    tss <- gene@ranges@start
    tts <- tss + gene@ranges@width - 1
    
    inferranges <- GenomicRanges::GRanges(seqnames = chr, 
                                          ranges = IRanges::IRanges(start = tss, 
                                                                    end = tts), 
                                          strand = strandsign)
    
    genereads1 <- IRanges::subsetByOverlaps(x = datalist$subreads1, 
                                            ranges = inferranges)
    
    genereads2 <- IRanges::subsetByOverlaps(x = datalist$subreads2, 
                                            ranges = inferranges)
    
    GenomeInfoDb::seqlevels(genereads1) <- chr
    GenomeInfoDb::seqlevels(genereads2) <- chr
    
    fullreadscounts1 <- GenomicAlignments::coverage(genereads1, 
                                                    shift = -(tss - 1), 
                                                    width = (tts - tss + 1))
    #Calculate the coverage per base, like wig file, per chrom
    
    fullreadscounts2 <- GenomicAlignments::coverage(genereads2, 
                                                    shift = -(tss - 1), 
                                                    width = (tts - tss + 1))
    
    fullreadscounts1 <- fullreadscounts1[[1]]
    fullreadscounts2 <- fullreadscounts2[[1]]
    
    
    
    start <- gene@ranges@start + startshorten
    size <- floor((gene@ranges@width - startshorten - endshorten)/window_num)
    to <- start + size*window_num - 1
    #Important!
    #In R, the rule on range ends is different from HTSeq and most bed, gtf 
    #files. In these files, the start of a sequence is 0 and the end value on 
    #their record is actually not included in the range. 
    #However, in IRange and GRange, the sequence starts from 1 and the end value 
    #is included in the real sequence! So when import bed file into R 
    #session, it is necessary to adjust the coordinates to follow this R rule 
    #manually. R itself will not adjust it automatically.
    
    tss <- gene@ranges@start
    tts <- tss + gene@ranges@width - 1
    endshorten_adj <- tts - to
    
    genereadscounts1 <- fullreadscounts1[startshorten:(to - tss)]
    genereadscounts2 <- fullreadscounts2[startshorten:(to - tss)]
    
  }else{
    
    tss <- gene@ranges@start
    tts <- tss + gene@ranges@width - 1
    
    inferranges <- GenomicRanges::GRanges(seqnames = chr, 
                                          ranges = IRanges::IRanges(start = tss, 
                                                                    end = tts), 
                                          strand = strandsign)
    
    genereads1 <- IRanges::subsetByOverlaps(x = datalist$subreads1, 
                                            ranges = inferranges)
    
    genereads2 <- IRanges::subsetByOverlaps(x = datalist$subreads2, 
                                            ranges = inferranges)
    
    GenomeInfoDb::seqlevels(genereads1) <- chr
    GenomeInfoDb::seqlevels(genereads2) <- chr
    
    fullreadscounts1 <- GenomicAlignments::coverage(genereads1, 
                                                    shift = -(tss - 1), 
                                                    width = (tts - tss + 1))
    #Calculate the coverage per base, like wig file, per chrom
    
    fullreadscounts2 <- GenomicAlignments::coverage(genereads2, 
                                                    shift = -(tss - 1), 
                                                    width = (tts - tss + 1))
    
    fullreadscounts1[[1]] <- rev(fullreadscounts1[[1]])
    fullreadscounts2[[1]] <- rev(fullreadscounts2[[1]])
    
    fullreadscounts1 <- fullreadscounts1[[1]]
    fullreadscounts2 <- fullreadscounts2[[1]]
    
    
    
    to <- gene@ranges@start + gene@ranges@width - 1 - startshorten
    size <- floor((gene@ranges@width - startshorten - endshorten)/window_num)
    start <- to - size*window_num + 1
    #Important!
    #In R, the rule on range ends is different from HTSeq and most bed, gtf 
    #files. In these files, the start of a sequence is 0 and the end value on 
    #their record is actually not included in the range. 
    #However, in IRange and GRange, the sequence starts from 1 and the end value 
    #is included in the real sequence! So when import bed file into R 
    #session, it is necessary to adjust the coordinates to follow this R rule 
    #manually. R itself will not adjust it automatically.
    
    tss <- gene@ranges@start
    tts <- tss + gene@ranges@width - 1
    endshorten_adj <- start - tss
    
    genereadscounts1 <- fullreadscounts1[startshorten:(tts - start)]
    genereadscounts2 <- fullreadscounts2[startshorten:(tts - start)]
    
    
  }
  
  genedepth1 <- sum(genereadscounts1)
  genedepth2 <- sum(genereadscounts2)
  
  if(genedepth1 == 0 | genedepth2 == 0){
    return(NULL)
  }
  
  genereadscounts1_adj <- genereadscounts1/(reads1depth/10^6)
  genereadscounts2_adj <- genereadscounts2/(reads2depth/10^6)
  
  fullreadscounts1 <- fullreadscounts1/(reads1depth/10^6)
  fullreadscounts2 <- fullreadscounts2/(reads2depth/10^6)
  
  genedepth1_adj <- sum(genereadscounts1_adj)
  genedepth2_adj <- sum(genereadscounts2_adj)
  
  weight1 <- genedepth1_adj/mean(c(genedepth1_adj, genedepth2_adj))
  weight2 <- genedepth2_adj/mean(c(genedepth1_adj, genedepth2_adj))
  
  genereadscounts1_adj <- genereadscounts1_adj/weight1
  genereadscounts2_adj <- genereadscounts2_adj/weight2
  
  
  
  #Add pseudo count 1!!!
  #genereadscounts1_adj <- genereadscounts1_adj + 10^-5/weight1
  #genereadscounts2_adj <- genereadscounts2_adj + 10^-5/weight2
  
  #Add pseudo count!!!
  genereadscounts1_adj <- genereadscounts1_adj + 1/weight1
  genereadscounts2_adj <- genereadscounts2_adj + 1/weight2
  
  #Divide the genebody into 40 bins and calculate the read count of each bin
  
  starts <- seq(1, size*window_num, size)
  
  vi1 <- IRanges::Views(genereadscounts1_adj, start = starts, width = size)
  vi2 <- IRanges::Views(genereadscounts2_adj, start = starts, width = size)
  
  windows1 <- IRanges::viewSums(vi1)
  #viewSums calculates sums of the views in a Views or ViewsList object.
  #Here the total read num of each window can be calculated
  
  windows2 <- IRanges::viewSums(vi2)
  #viewSums calculates sums of the views in a Views or ViewsList object. 
  #Here the total read num of each window can be calculated
  
  #Generate the bin ratios between time points
  gene_ratio_adj <- windows2/windows1
  
  
  
  #gene_ratio_adj <- smooth_fun(x = gene_ratio_adj)
  
  
  
  if(sum(is.finite(gene_ratio_adj)) <= length(gene_ratio_adj)/2){
    
    return(NULL)
    
  }
  
  
  
  gene_ratio_adj <- data.frame(ratio_adj = gene_ratio_adj, 
                               stringsAsFactors = FALSE)
  
  res <- list(ratio_adj = gene_ratio_adj, 
              windowsize = size, 
              emis1_adj = genereadscounts1_adj, 
              emis2_adj = genereadscounts2_adj,
              emis1_complete = fullreadscounts1, 
              emis2_complete = fullreadscounts2, 
              endshorten_adj = endshorten_adj, 
              weight1 = weight1, 
              weight2 = weight2)
  
  return(res)
  
}



expandratio <- function(expand1 = binres$emis1_adj[startcoord:endcoord],
                        expand2 = binres$emis2_adj[startcoord:endcoord], 
                        
                        
                        
                        weight1 = weight1, 
                        weight2 = weight2){
  
  expand_ratio <- expand2/expand1
  
  #expand_ratio <- smooth_fun(x = expand_ratio)
  
  if(sum(is.finite(expand_ratio)) < length(expand_ratio)){
    
    #Add pseudo count!!!
    #expand_ratio <- (expand2 + 10^-5/weight2)/(expand1 + 10^-5/weight1)
    
    #Add pseudo count!!!
    expand_ratio <- (expand2 + 1/weight2)/(expand1 + 1/weight1)
    
  }
  
  expand_ratio <- data.frame(ratio_adj = expand_ratio, stringsAsFactors = FALSE)
  
  return(expand_ratio)
  
}

transpoint <- function(ratios, 
                       method = 'LSS', 
                       pythonpath = NULL, 
                       hmmseed = 2023){
  
  if(method == 'LSS'){
    
    expand_ratio <- ratios$ratio_adj
    
    startcoord <- 1
    endcoord <- length(expand_ratio)
    square_list <- c()
    
    for(k in startcoord:(endcoord - 1)){
      leftidx <- startcoord:k
      rightidx <- (k+1):endcoord
      #rightidx start from k+1, so the final transition point obtained from this function belong to the 1st state
      leftratio <- expand_ratio[leftidx - startcoord + 1]
      rightratio <- expand_ratio[rightidx - startcoord + 1]
      
      leftratio_clean <- leftratio#[(!is.na(leftratio)) & is.finite(leftratio)]
      rightratio_clean <- rightratio#[(!is.na(rightratio)) & is.finite(rightratio)]
      
      leftratio_var <- sum((leftratio_clean - mean(leftratio_clean))^2)
      rightratio_var <- sum((rightratio_clean - mean(rightratio_clean))^2)
      square <- leftratio_var + rightratio_var
      
      square_list <- c(square_list, square)
      
      
    }
    
    names(square_list) <- 1:length(square_list)
    minsquare <- min(square_list[(!is.na(square_list)) & is.finite(square_list)])
    
    if(is.infinite(minsquare)){
      
      return(NULL)
      
    }else{
      
      final_point <- as.numeric(names(square_list)[match(minsquare, as.vector(square_list))])
      
      fronts <- expand_ratio[1:final_point]
      latters <- expand_ratio[(final_point + 1):length(expand_ratio)]
      
      wilp <- wilcox.test(fronts, latters)$p.value
      
      frontmean <- mean(fronts)
      lattermean <- mean(latters)
      
      res <- list(point = final_point, front = frontmean, latter = lattermean, wilp = wilp)
      
      return(res)
      
    }
    
  }else{
      
    #hmmpyfile <- system.file("python", "hmm_r.py", package = "proRate")
    #hmmpyfile <- 
    #  'C:/Users/yuabr/Desktop/Transfer/codetransfer/proRate/proRate_V2/R/proRate_V2/hmm_r.py'
    
    hmmpyfile <- '/rsrch5/scratch/lym_myl/yliu46/tmp/rate/codes/proRate_V2/hmm_r.py'
    
    if(!is.null(pythonpath)){
      
      Sys.setenv(RETICULATE_PYTHON = pythonpath)
      
    }
    
    #reticulate::use_python(pydir)
    
    reticulate::py_config()
    
    reticulate::source_python(hmmpyfile)
    
    hmmres <- hmm_r(ratios = ratios, 
                    hmmseed = hmmseed)
    
    if(is.null(hmmres)){
      
      return(NULL)
      
    }
    
    final_point <- hmmres$point
    
    fronts <- hmmres$fronts
    
    latters <- hmmres$latters
    
    wilp <- wilcox.test(fronts, latters)$p.value
    
    frontmean <- mean(fronts)
    lattermean <- mean(latters)
    
    res <- list(point = final_point, front = frontmean, latter = lattermean, wilp = wilp)
    
    return(res)
    
    
  }
  
}

calsig <- function(emis1_adj = binres$emis1_adj, emis2_adj = binres$emis2_adj, point = distance){
  
  ratio_adj <- emis2_adj/emis1_adj
  
  activepart <- ratio_adj[1:point]
  inhibitpart <- ratio_adj[(point + 1):length(ratio_adj)]
  
  activepart <- activepart[is.finite(activepart)]
  inhibitpart <- inhibitpart[is.finite(inhibitpart)]
  
  wilp <- tryCatch({
    wilcox.test(activepart, inhibitpart)$p.value
  }, error = function(err){
    NA
  })
  
  activemean <- mean(activepart)
  inhibitmean <- mean(inhibitpart)
  
  res <- list(activeratiomean = activemean, inhibitratiomean = inhibitmean, wilp = wilp)
  return(res)
}



#reads1 = binres$ratio_adj
#reads2 = NULL
#endtype = endtypestr
#distance = point_idx$point
#genesym = gene$gene_id
#genelen = nrow(binres$ratio_adj)
#timediff = time
#saveplot = TRUE

#textsize = textsize
#titlesize = titlesize
#face = face

plotpairs <- function(reads1 = as.numeric(genefpm1), 
                      reads2 = as.numeric(genefpm2),
                      endtype = c('TSS', 'TTS'), 
                      distance, 
                      genesym = geneinfo$gene_id, 
                      genelen = geneinfo$width, 
                      timediff = time,
                      saveplot = FALSE, 
                      
                      textsize = 13,
                      titlesize = 15,
                      face = 'bold'){
  
  if(is.vector(reads1)){
    reads1 <- data.frame(reads = reads1, stringsAsFactors = FALSE)
  }
  
  row.names(reads1) <- 1:nrow(reads1)
  
  if(!is.null(reads2)){
    
    if(is.vector(reads2)){
      reads2 <- data.frame(reads = reads2, stringsAsFactors = FALSE)
    }
    row.names(reads2) <- 1:nrow(reads2)
  }
  
  start <- 1
  end <- nrow(reads1)
  distance <- distance
  
  points <- c(start, distance, end)
  pointlabels <- c(endtype[1], 'Transition', endtype[2])
  
  if(!is.null(time)){
    
    reads1$condition <- 'time2/time1'
    
  }else{
    
    reads1$condition <- 'condition/control'
    
  }
  
  reads1$xcoord <- as.numeric(row.names(reads1))
  reads_totals <- reads1
  ylabel <- 'Ratio'
  
  if(!is.null(reads2)){
    
    if(!is.null(time)){
      
      reads1$condition <- 'time1'
      reads2$condition <- 'time2'
      reads2$xcoord <- as.numeric(row.names(reads2))
      reads_totals <- rbind(reads1, reads2)
      ylabel <- 'FPM'
      reads_totals$condition <- factor(reads_totals$condition, levels = c('time2', 'time1'),
                                       ordered = TRUE)
      
    }else{
      
      reads1$condition <- 'control'
      reads2$condition <- 'condition'
      reads2$xcoord <- as.numeric(row.names(reads2))
      reads_totals <- rbind(reads1, reads2)
      ylabel <- 'FPM'
      reads_totals$condition <- factor(reads_totals$condition, levels = c('condition', 'control'),
                                       ordered = TRUE)
      
    }
    
    
  }
  
  names(reads_totals)[1] <- 'reads'
    
  if(!is.null(timediff)){
    
    plottitle <- paste0('(Distance = ', distance, ', Length = ', genelen, 
                        ', Time difference = ', timediff, 'min)')
    
  }else{
    
    plottitle <- paste0('(Distance = ', distance, ', Length = ', genelen, ')')
    
  }
  
  plottitle <- paste0(genesym, ' ', plottitle)
  
  #library(ggplot2)
  #library(scales)
  #library(grid)
  
  p <- ggplot2::ggplot(reads_totals, mapping = ggplot2::aes(x = xcoord, y = reads))
  
  if(!is.null(reads2)){
    p <- p + ggplot2::geom_line(size = 1, color = scales::hue_pal()(1))
  }else{
    p <- p + ggplot2::geom_point(size = 2, color = scales::hue_pal()(1))
  }
  
  p <- p + ggplot2::scale_x_continuous(breaks = points,
                                       labels = pointlabels) +
    ggplot2::xlab('') + ggplot2::ylab(ylabel) +
    ggplot2::geom_vline(xintercept = points[2], linetype = 2, color = 'red', size = 1) +
    ggplot2::facet_grid(condition~., scales = 'free_y') +
    ggplot2::ggtitle(plottitle) +
    ggplot2::theme_bw() +
    ggplot2::theme(panel.grid = ggplot2::element_blank()) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(size = 10)) +
    ggplot2::theme(axis.text.x = ggplot2::element_text(hjust=1, angle = 90)) +
    ggplot2::theme(panel.spacing = grid::unit(0, 'lines')) + 
    
    
    
    ggplot2::theme(plot.title = ggplot2::element_text(size = titlesize, face = face),
                   #plot.subtitle = ggplot2::element_text(size = textsize, face = face),
                   axis.title = ggplot2::element_text(size = titlesize, face = face),
                   axis.text = ggplot2::element_text(size = textsize, face = face),
                   strip.text = ggplot2::element_text(size = textsize, face = face))
    
  
  if(saveplot == FALSE){
    print(p)
  }
  
  if(saveplot == TRUE){
    return(p)
  }
  
  
  
}



inferfunction <- function(i = 1, 
                          datalist, 
                          window_num, 
                          startshorten, 
                          endshorten, 
                          reads1depth, 
                          reads2depth, 
                          time, 
                          
                          method = 'LSS', 
                          pythonpath = NULL, 
                          hmmseed = 2023, 
                          
                          textsize = 13,
                          titlesize = 15,
                          face = 'bold'){
  
  binres <- binratio(i = i, 
                     datalist = datalist, 
                     window_num = window_num, 
                     startshorten = startshorten, 
                     endshorten = endshorten, 
                     reads1depth = reads1depth, 
                     reads2depth = reads2depth)
  
  weight1 <- binres$weight1
  weight2 <- binres$weight2
  
  if(is.null(binres)){
    #next()
    return(NULL)
  }
  
  endshorten <- binres$endshorten_adj
  
  
  
  ###################
  #Start 1st infer
  
  #LSS method
  point_idx <- transpoint(ratios = binres$ratio_adj, 
                          method = method, 
                          pythonpath = pythonpath, 
                          hmmseed = hmmseed)
  
  
  
  if(is.null(point_idx)){
    #next()
    return(NULL)
  }
  
  gene <- datalist$genes[i]
  
  if(!is.null(time)){
    
    endtypestr <- c(paste0('TSS+', startshorten, 'bp'), 
                    paste0('TTS-', endshorten, 'bp'))
    
  }else{
    
    endtypestr <- c(paste0('ExonStart+', startshorten, 'bp'), 
                    paste0('DistPolyA-', endshorten, 'bp'))
    
  }
  
  binresplot <- plotpairs(reads1 = binres$ratio_adj, 
                          reads2 = NULL,
                          endtype = endtypestr,
                          distance = point_idx$point, 
                          genesym = gene$gene_id,
                          genelen = nrow(binres$ratio_adj), 
                          timediff = time, 
                          saveplot = TRUE, 
                          
                          textsize = textsize,
                          titlesize = titlesize,
                          face = face)
  
  ###################
  forward <- round(binres$windowsize * (point_idx$point - 1))
  backward <- round(binres$windowsize * (point_idx$point + 1))
  
  #focus on the single window with transition point
  
  startcoord <- 1 + forward
  endcoord <- backward
  
  expand_ratio <- expandratio(expand1 = binres$emis1_adj[startcoord:endcoord],
                              expand2 = binres$emis2_adj[startcoord:endcoord], 
                              weight1 = weight1, 
                              weight2 = weight2)
  
  
  
  if(is.null(expand_ratio)){
    
    final_point <- list(point = binres$windowsize, front = NA, latter = NA, wilp = NA)
    
  }else{
    
    
    
    #Start 2nd infer
    
    #LSS method
    final_point <- transpoint(ratios = expand_ratio, 
                              method = method, 
                              pythonpath = pythonpath, 
                              hmmseed = hmmseed)
    
    if(is.null(final_point)){
      
      final_point <- list(point = binres$windowsize, front = NA, latter = NA, wilp = NA)
      
    }
    
    
    
  }
  
  if(!is.null(time)){
    
    endtypestr <- c(paste0('TSS+', forward + startshorten, 'bp'), 
                    paste0('TSS+', (backward - 1) + startshorten, 'bp'))
    
  }else{
    
    endtypestr <- c(paste0('ExonStart+', forward + startshorten, 'bp'), 
                    paste0('ExonStart+', (backward - 1) + startshorten, 'bp'))
    
  }
  
  expandplot <- plotpairs(reads1 = expand_ratio$ratio_adj, 
                          reads2 = NULL,
                          endtype = endtypestr,
                          distance = final_point$point, 
                          genesym = gene$gene_id,
                          genelen = length(expand_ratio$ratio_adj), 
                          timediff = time,
                          saveplot = TRUE, 
                          
                          textsize = textsize,
                          titlesize = titlesize,
                          face = face)
  
  res <- list()
  
  res$gene_id <- gene$gene_id
  
  res$distance <- startshorten + forward + final_point$point
  
  res$wilp1 <- point_idx$wilp
  res$wilp2 <- final_point$wilp
  
  res$activemean1 <- point_idx$front
  res$inhibitmean1 <- point_idx$latter
  
  res$activemean2 <- final_point$front
  res$inhibitmean2 <- final_point$latter
  
  res$singlegeneread1 <- binres$emis1_complete
  res$singlegeneread2 <- binres$emis2_complete
  res$binresplot <- binresplot
  res$expandplot <- expandplot
  
  
  subreport <- data.frame(gene_id = res$gene_id,
                          distance = res$distance,
                          frontbinratio = res$activemean1,
                          latterbinratio = res$inhibitmean1,
                          diffbinratio = res$inhibitmean1 - res$activemean1,
                          binpval = res$wilp1,
                          frontextendratio = res$activemean2,
                          latterextendratio = res$inhibitmean2,
                          diffextendratio = res$inhibitmean2 - res$activemean2,
                          extendpval = res$wilp2,
                          stringsAsFactors = FALSE)
  
  reslist <- list(subreport = subreport,
                  singlegenereads1 = res$singlegeneread1,
                  singlegenereads2 = res$singlegeneread2,
                  binresplotlist = res$binresplot,
                  expandplotlist = res$expandplot)
  
  
  return(reslist)
  
  
}

#time1file = wt0file
#time2file = wt15file
#targetfile = NULL
#genomename = 'mm10'
#time = 15
#strandmethod = 1
#threads = 1

#lencutoff = 40000
#fpkmcutoff = 1

#startshorten = 1000
#endshorten = 1000
#window_num = 40
#method = 'LSS'
#pythonpath = 'C:/Users/yuabr/anaconda3/python.exe'
#difftype = 1
#utr = FALSE
#utrexts = NULL

#textsize = 13
#titlesize = 15
#face = 'bold'

#'Calculate gene elongation rate from a pair of Pro-seq or Gro-seq data
#'
#'Calculate gene elongation rate from a pair of Pro-seq or Gro-seq data, using 
#'the LSS (least sum of squares) or HMM (hidden Markov model) method. 
#'
#'@param time1file The reference Pro-seq/Gro-seq bam file, corresponding to 
#'  the experimental condition of no transcriptional inhibitor treatment. Can 
#'  be a string indicating the directory of the file, or a GAlignmentPairs 
#'  object, a GAlignments object, or a GRanges object from the original bam 
#'  file.
#'@param time2file The treatment Pro-seq/Gro-seq bam file, corresponding to 
#'  the experimental condition of transcriptional inhibitor treatment for a 
#'  specific time (e.g. DRB treatment for 15 min). Can be a string indicating 
#'  the directory of the bam file, or a GAlignmentPairs object, a GAlignments 
#'  object, or a GRanges object from the original file.
#'@param targetfile A txt file with the genes whose transcriptional rates need 
#'  to be calculated. Should contain columns named as chr, start, end, strand, 
#'  and gene_id. It can also be NULL, so that the genes in the genome set by 
#'  the parameter \code{genomename} will be analyzed. However, in any case, 
#'  the genes should have a length longer than the one set by the parameter 
#'  \code{lencutoff}, and also longer than the one of 2*(\code{startshorten} + 
#'  \code{endshorten}), which is set by the parameters \code{startshorten} and 
#'  \code{endshorten}.
#'@param gene_ids A vector with gene symbols indicating the ones need to be 
#'  analyzed. In addition to \code{targetefile} and \code{genomename}, this 
#'  parameter also indicates the genes to be analyzed. The final ones should 
#'  belong to the intersection of these parameters, and they also need to have 
#'  a length longer than the one set by the parameter \code{lencutoff}, and 
#'  also longer than the one of 2*(\code{startshorten} + \code{endshorten}), 
#'  which is set by the parameters \code{startshorten} and \code{endshorten}. 
#'  Can also be NULL, so that no restriction will be added from it. 
#'@param genomename Specify the genome of the genes to be analyzed, when the 
#'  parameter \code{targetfile} is NULL. Can be "mm10" for mouse or "hg38" for 
#'  human.
#'@param time An integer indicating the inhibitor treatment time difference 
#'  between the \code{time1file} and the \code{time2file}, using min as its 
#'  unit.
#'@param strandmethod Indicate the strand specific method used when preparing 
#'  the sequencing library, can be 1 for the directional ligation method, 2 
#'  for the dUTP method, and 0 for a non-strand specific library. In addition, 
#'  if the sample is sequenced using a single strand method, set it as 3.
#'@param threads Number of threads to perform parallelization. Default is 1.
#'@param lencutoff The cutoff on gene length (bp). Only genes longer than this 
#'  cutoff can be considered for analysis. Default is 70000.
#'@param fpkmcutoff The cutoff value on gene FPKM. Only genes with an FPKM 
#'  value greater than the cutoff in \code{time1file} can be considered for 
#'  analysis. Default is 1.
#'@param startshorten Before inferring a gene's transcription rate, its first 
#'  1000 bp (or other length) and last 1000 bp (or other length) regions will 
#'  be discarded to avoid the unstable reads at the transcription starting and 
#'  ending stages. However, these regions' lengths can be changed by setting 
#'  this parameter \code{startshorten} and the other \code{endshorten}. This 
#'  one is used to set the length of the transcription starting region. Its 
#'  default value is 1000, so that the first 1000 bp region will be discarded.
#'@param endshorten Before inferring a gene's transcription rate, its first 
#'  1000 bp (or other length) and last 1000 bp (or other length) regions will 
#'  be discarded to avoid the unstable reads at the transcription starting and 
#'  ending stages. However, these regions' lengths can be changed by setting 
#'  this parameter \code{endshorten} and the other \code{startshorten}. This 
#'  one is used to set the length of the transcription ending region. Default 
#'  is 1000, so that the last 1000 bp region will be discarded.
#'@param window_num Before inferring a gene's transcription rate, the function 
#'  will divide this gene into 40 bins (or other bin number). For each bin, 
#'  the normalized read count ratio between the treatment and the reference 
#'  files will be calculated, so a vector with 40 ratios (or other bin number) 
#'  will be generated. Then, the LSS or HMM method will be used to find the 
#'  transition bin between the gene's transcription inhibited region and the 
#'  normal reads region. After that, this identified transition bin and its 
#'  downstream neighbor will be merged and expanded to the single-base-pair 
#'  level, and the LSS or HMM method will be further used on them to find the 
#'  transition base pair in this region. The parameter \code{window_num} here 
#'  is used to set the bin number to be divided for each gene. Default value 
#'  is 40.
#'@param method The method to be used for transcription rate inference. The 
#'  default value is "LSS", so that the least sum of squares method will be 
#'  used. Can also be "HMM", so that the hidden Markov model will be used. 
#'@param pythonpath The HMM method is base on \code{Python}, so the directory 
#'  of the \code{Python} interpreter you want to use should be transferred to 
#'  the function via this parameter, and two \code{Python} modules should be 
#'  installed in your \code{Python} environment, including \code{numpy} and 
#'  \code{hmmlearn}.
#'@param hmmseed The HMM method involves random processes, so a random seed 
#'  should be set via this parameter to repeat the results. Default value is 
#'  1234, can also be other integers, such as 2023.
#'@param difftype In most cases, the treatment and reference Pro-seq/Gro-seq 
#'  files are from experiments treating cells with transcription inhibitors, 
#'  such as DRB (5,6-dichloro-1-beta-d-ribofuranosylbenzimidazole), so that 
#'  the normal transcription will be repressed for a specific time, generating 
#'  a reads-depleted region upstream of the normal transcription region. For 
#'  such inhibitor-based experiments, this parameter \code{difftype} should be 
#'  set as 1. However, in some cases, the treatment and reference Pro-seq/Gro- 
#'  seq files can also come from experiments treating cells with transcription 
#'  activators, e.g., treating MCF-7 human breast cancer cells with E2 (17- 
#'  beta-estradiol), making the reads-depleted region downstream, rather than 
#'  upstream, of the normal transcription region, which is in contrast to the 
#'  DRB (inhibitor) experiments. For such activator-based experiments, this 
#'  parameter should be set as 2. In addition, this function \code{calrate} 
#'  can also infer proximal polyA alternative sites for genes, and in this 
#'  case, the parameter \code{time1file} should be an RNA-seq file with genes 
#'  using distal polyA sites; the parameter \code{time2file} needs an RNA-seq 
#'  file with genes using proximal polyA sites; another parameter \code{utr} 
#'  should be set as TRUE; and the parameter \code{difftype} here should be 
#'  set as 2. The default value of \code{difftype} is 1.
#'@param utr In addition to inferring transcription rates from Pro-seq/Gro-seq 
#'  data, \code{calrate} can also infer proximal polyA alternative sites for 
#'  genes. In this case, the parameter \code{time1file} should be an RNA-seq 
#'  file with genes using distal polyA sites; the parameter \code{time2file} 
#'  needs an RNA-seq file with genes using proximal polyA sites; the parameter 
#'  \code{difftype} should be set as 2; and the current parameter \code{utr} 
#'  should be set as TRUE. The default value of \code{utr} is FALSE, so the 
#'  function will perform inference on transcription rates, not on proximal 
#'  polyA sites.
#'@param utrexts When the former parameter \code{utr} is set as TRUE to infer 
#'  proximal polyA sites for genes, this parameter can be used to provide a 
#'  txt file with the genes' last exons whose proximal polyA sites need to be 
#'  identified. Should contain columns named as chr, start, end, strand, and 
#'  gene_id. It can also be set as NULL, so that the genes' last exons in the 
#'  genome set by the parameter \code{genomename} will be analyzed. However, 
#'  in the latter case, the original last exons from the genome will be first 
#'  adjusted so that for the ones with a length > 10000 bp, the proximal polyA 
#'  sites will be inferred directly in them, but for the ones with a length <= 
#'  10000 bp, their lengths will be extended to 10000 bp first, and then the 
#'  proximal sites will be identified within the extended exons. On the other 
#'  hand, if the last exons are provided with this parameter \code{utrexts}, 
#'  they will never be extended to 10000 bp, and the proximal polyA inference 
#'  will be performed directly on them. In addition to the proximal sites, the 
#'  distal ones will also be defined by the function from the \code{time1file} 
#'  and \code{time2file}'s reads. It is performed with a sliding window method 
#'  on the last exon regions, and this step is before the proximal polyA sites 
#'  inference but after the exon extension step. It should also be noted that 
#'  the last exons should have an FPKM value in the \code{time1file} greater 
#'  than the cutoff set by \code{fpkmcutoff}.
#'@param textsize In addition to returning a data frame to show the inference 
#'  results, this function will also generate several plots to show them, and 
#'  the font size for the plot texts is set by this parameter. Default is 13.
#'@param titlesize The font size for the plot titles. Default is 15.
#'@param face The font face for the plot texts. Default is "bold".
#'@return A list including a slot named "report", which is a data frame with 
#'  the inferred transcription elongation rates, or genes' proximal and distal 
#'  polyA sites, as well as other information, such as the genes' coordinates, 
#'  the results' significance, etc. In addition, the result list also contains 
#'  other slots, such as "binplots" and "expandplots", which contains the data 
#'  that can be used to plot the inference results.
#'
#'
#'
#'@examples
#'library(proRate)
#'
#'wt0file <- system.file("extdata", "wt0.bam", package = "proRate")
#'wt15file <- system.file("extdata", "wt15.bam", package = "proRate")
#'
#'wtrates <- calrate(time1file = wt0file, 
#'                   time2file = wt15file, 
#'                   time = 15, 
#'                   strandmethod = 1, 
#'                   
#'                   genomename = "mm10", 
#'                   lencutoff = 40000, 
#'                   fpkmcutoff = 1, 
#'                   
#'                   threads = 4, 
#'                   
#'                   startshorten = 1000, 
#'                   endshorten = 1000, 
#'                   window_num = 40, 
#'                   
#'                   method = "LSS", 
#'                   pythonpath = NULL, 
#'                   
#'                   difftype = 1)
#'
#'
#'
#'@export
calrate <- function(time1file, 
                    time2file,
                    targetfile = NULL, 
                    gene_ids = NULL, 
                    genomename = 'mm10',
                    time,
                    strandmethod = 1, 
                    threads = 1,
                    
                    lencutoff = 70000,
                    fpkmcutoff = 1, 
                    
                    startshorten = 1000, 
                    endshorten = 1000, 
                    window_num = 40, 
                    method = 'LSS', 
                    pythonpath = NULL, 
                    hmmseed = 1234,  
                    #hmmseed = 2023,  
                    difftype = 1, 
                    utr = FALSE, 
                    utrexts = NULL, 
                    
                    textsize = 13,
                    titlesize = 15,
                    face = 'bold'){
  
  if(strandmethod %in% c(0, 1, 2)){
    singleend <- FALSE
  }else{
    singleend <- TRUE
    strandmethod <- 4
  }
  
  if(utr == TRUE){
    
    startshorten <- 0
    endshorten <- 0
    difftype <- 2
    time <- NULL
    
  }
  
  ##########
  #Libraries required
  #library(GenomicAlignments)
  
  if(!is.null(targetfile) & utr == FALSE){
      
    genes <- read.table(targetfile, sep = '\t', header = TRUE,
                        stringsAsFactors = FALSE, quote = '',
                        check.names = FALSE)
    
    genes <- genes[(abs(genes$end - genes$start) + 1) >= 
                     max(2*(startshorten + endshorten), lencutoff), , 
                   drop = FALSE]
    
    if(nrow(genes) == 0){
      
      return(NULL)
      
    }
    
    genes <- genes[c('chr', 'start', 'end', 'strand', 'gene_id')]
    names(genes) <- c('seqnames', 'start', 'end', 'strand', 'gene_id')
    
    genes <- GenomicRanges::GRanges(seqnames = genes$seqnames,
                                    ranges = IRanges::IRanges(start = genes$start, end = genes$end),
                                    strand = genes$strand,
                                    gene_id = genes$gene_id)
    genes <- GenomicAlignments::sort(genes)
    
  }else{
    
    genes <- get(paste0(genomename, '.packagegenes'))
    #genes <- readRDS(paste0(genomename, '.packagegenes.rds'))
    
    genes <- GenomicRanges::GRanges(genes)
    
    if(utr == FALSE){
      
      genes <- genes[genes@ranges@width >= max(2*(startshorten + endshorten), lencutoff)]
      
    }
    
    if(length(genes) == 0){
      
      return(NULL)
      
    }
    
    genes <- GenomicAlignments::sort(genes)
    names(genes) <- 1:length(genes)
    
    genes <- GenomeInfoDb::keepSeqlevels(x = genes, 
                                         value = unique(as.character(genes@seqnames)),
                                         pruning.mode = 'coarse')
    #Keeps only the seqlevels in `value` and removes all others.
    #x Any object having a Seqinfo class in which the seqlevels will be kept.
    #value A named or unnamed character vector.
    
    if(utr == FALSE){
      
      genes$updis <- NULL
      genes$dndis <- NULL
      
    }
    
  }
  
  if(!is.null(targetfile) & utr == FALSE){
    
    genesdis <- GenomicRanges::distanceToNearest(genes, ignore.strand = TRUE)
    
    mixgenes <- genesdis[genesdis@elementMetadata$distance == 0]
    
    mixgenecoords <- NULL
    
    if(length(mixgenes) > 0){
      
      mixgenes1 <- genes$gene_id[mixgenes@from]
      mixgenes2 <- genes$gene_id[mixgenes@to]
      
      mixgenes1strands <- as.vector(genes@strand)[mixgenes@from]
      mixgenes2strands <- as.vector(genes@strand)[mixgenes@to]
      
      mixgenes <- data.frame(mixgenes1 = mixgenes1, 
                             mixgenes2 = mixgenes2, 
                             stringsAsFactors = FALSE)
      
      mixgenes <- mixgenes[mixgenes1strands != mixgenes2strands, , drop = FALSE]
      
      if(nrow(mixgenes) > 0){
        
        row.names(mixgenes) <- 1:nrow(mixgenes)
        
      }
      
    }
    
    genecoords <- genes$gene_id
    
    if(sum(nrow(mixgenes) > 0)){
      
      mixgenecoords <- mixgenes
      mixgenes <- mixgenecoords
      
      mixgenecoords <- unique(c(mixgenecoords$mixgenes1, mixgenecoords$mixgenes2))
      
      genecoords <- setdiff(genecoords, mixgenecoords)
      
    }
    
    genecoords <- genes[genes$gene_id %in% genecoords]
    
    if(length(genecoords) == 0){
      
      return(NULL)
      
    }else{
      
      genecoords <- GenomicAlignments::sort(genecoords)
      
      names(genecoords) <- 1:length(genecoords)
      
      genes <- genecoords
      
    }
    
    
  }
  
  if(!is.null(gene_ids)){
    
    genes <- genes[genes$gene_id %in% gene_ids]
    
    names(genes) <- 1:length(genes)
    
  }
  
  ###################
  
  #Read in the strand-specific paired-end data
  
  reads1 <- parsereadsfile(readsfile = time1file, strandmethod = strandmethod)
  reads2 <- parsereadsfile(readsfile = time2file, strandmethod = strandmethod)
  
  reads1depth <- length(reads1)
  reads2depth <- length(reads2)
  
  
  #genes <- genes[genes$gene_id %in% 
  #                 c('Cpt1a', 'Kmt5b', 'Fxn', 'Csf2ra', 'Tesmin', 'Gal')]
  #names(genes) <- 1:length(genes)
  
  
  if(utr == TRUE){
    
    threads <- max(1, round(threads))
    
    #Infer distal polyA site
    if(is.null(utrexts)){
      
      #lastexons <- makeutrlist(genes = genes, genomename = genomename)
      
      lastexons <- get(paste0('lastexons.', genomename))
      
      #lastexons <- readRDS(paste0('lastexons.', genomename, '.rds'))
      
      nullgenes <- setdiff(genes$gene_id, lastexons$gene_id)
      
      nullgenes <- genes[genes$gene_id %in% nullgenes]
      
      genes <- genes[!(genes$gene_id %in% nullgenes$gene_id)]
      
      lastexons <- lastexons[match(genes$gene_id, lastexons$gene_id)]
      
      trimeddndis <- 10000 - lastexons@ranges@width
      trimeddndis[trimeddndis < 0] <- 0
      trimeddndis[trimeddndis > genes$dndis] <- genes$dndis[trimeddndis > genes$dndis]
      
      
      pluslimits <- genes@ranges@start + genes@ranges@width - 1 + trimeddndis
      minuslimits <- genes@ranges@start - trimeddndis
      
      plusends <- pluslimits
      minusends <- minuslimits
      
      extends <- plusends
      extends[as.vector(lastexons@strand) == '-'] <- 
        minusends[as.vector(lastexons@strand) == '-']
      
      plusstarts <- lastexons@ranges@start
      minusstarts <- lastexons@ranges@start + lastexons@ranges@width - 1
      
      extstarts <- plusstarts
      extstarts[as.vector(lastexons@strand) == '-'] <- 
        minusstarts[as.vector(lastexons@strand) == '-']
      
      genes$extstarts <- extstarts
      genes$extends <- extends
      
      genes$extstarts[as.vector(genes@strand) == '-'] <- 
        extends[as.vector(genes@strand) == '-']
      genes$extends[as.vector(genes@strand == '-')] <- 
        extstarts[as.vector(genes@strand) == '-']
      
      exts <- GenomicRanges::GRanges(seqnames = genes@seqnames, 
                                     ranges = IRanges::IRanges(start = genes$extstarts, 
                                                               end = genes$extends), 
                                     strand = genes@strand, 
                                     gene_id = genes$gene_id)
      
      names(exts) <- 1:length(exts)
      
      extsdis <- GenomicRanges::distanceToNearest(exts, ignore.strand = TRUE)
      
      mixexts <- extsdis[extsdis@elementMetadata$distance == 0]
      
      mixgenes <- NULL
      
      if(length(mixexts) > 0){
        
        mixexts1 <- exts$gene_id[mixexts@from]
        mixexts2 <- exts$gene_id[mixexts@to]
        
        mixexts1strands <- as.vector(exts@strand)[mixexts@from]
        mixexts2strands <- as.vector(exts@strand)[mixexts@to]
        
        mixexts <- data.frame(mixexts1 = mixexts1, 
                              mixexts2 = mixexts2, 
                              stringsAsFactors = FALSE)
        
        mixexts <- mixexts[mixexts1strands != mixexts2strands, , drop = FALSE]
        
        if(nrow(mixexts) > 0){
          
          row.names(mixexts) <- 1:nrow(mixexts)
          
        }
        
      }
      
      if(!is.null(targetfile)){
          
        genes <- read.table(targetfile, sep = '\t', header = TRUE,
                            stringsAsFactors = FALSE, quote = '',
                            check.names = FALSE)
        
        genes <- genes[c('chr', 'start', 'end', 'strand', 'gene_id')]
        names(genes) <- c('seqnames', 'start', 'end', 'strand', 'gene_id')
        
        extgenes <- exts$gene_id[exts$gene_id %in% genes$gene_id]
        
        if(sum(nrow(mixexts) > 0)){
          
          mixgenes <- mixexts[mixexts$mixexts1 %in% genes$gene_id, , drop = FALSE]
          mixexts <- mixgenes
          
          mixgenes <- unique(c(mixgenes$mixexts1, mixgenes$mixexts2))
          
          extgenes <- setdiff(extgenes, mixgenes)
          
        }
        
        
        
      }else{
        
        extgenes <- exts$gene_id
        
        if(sum(nrow(mixexts) > 0)){
          
          mixgenes <- mixexts
          mixexts <- mixgenes
          
          mixgenes <- unique(c(mixgenes$mixexts1, mixgenes$mixexts2))
          
          extgenes <- setdiff(extgenes, mixgenes)
          
        }
        
      }
      
      extgenes <- exts[exts$gene_id %in% extgenes]
      
      mixgenes <- exts[exts$gene_id %in% mixgenes]
      
      endexts <- GenomicRanges::GRanges(seqnames = character(),
                                        ranges = IRanges::IRanges(start = numeric(), 
                                                                  end = numeric()),
                                        strand = character(),
                                        gene_id = character())
      
      if(length(mixgenes) > 0){
        
        mixgenes <- IRanges::subsetByOverlaps(x = mixgenes, 
                                              ranges = range(c(reads1, reads2)))
        
      }
      
      if(length(mixgenes) > 0){
        
        mixgenes <- GenomicAlignments::sort(mixgenes)
        
        names(mixgenes) <- 1:length(mixgenes)
        
        nseq <- seq(1, length(mixgenes), 1)
        
        if(threads == 1){
          
          mixreslist <- list()
          n <- 1
          for(n in nseq){
            
            mixres <- mixscan(i = n, 
                              mixext = mixgenes, 
                              extpair = mixexts, 
                              
                              reads1 = reads1, 
                              reads2 = reads2, 
                              reads1depth = reads1depth, 
                              reads2depth = reads2depth)
            
            mixreslist[[n]] <- mixres
            
            cat(paste0('n = ', n, '\n'))
            
          }
          
          
        }else{
          
          #library(doParallel)
          #threads <- 8
          
          cores <- parallel::detectCores()
          cl <- parallel::makeCluster(min(min(threads, length(mixgenes)), cores))
          
          doParallel::registerDoParallel(cl)
          
          date()
          `%dopar%` <- foreach::`%dopar%`
          
          mixreslist <- foreach::foreach(i = nseq, 
                                         .export = ls(name = globalenv()), .packages = c('GenomicRanges')) %dopar% {
                                           #.export = NULL) %dopar% {
                                           mixscan(i = i, 
                                                   mixext = mixgenes, 
                                                   extpair = mixexts, 
                                                   
                                                   reads1 = reads1, 
                                                   reads2 = reads2, 
                                                   reads1depth = reads1depth, 
                                                   reads2depth = reads2depth)
                                         }
          date()
          
          parallel::stopCluster(cl)
          
          unregister_dopar()
          
          
        }
        
        for(n in 1:length(mixreslist)){
          
          if(!is.null(names(mixreslist[[n]]))){
            
            if(names(mixreslist[[n]]) == 'extgenes'){
              
              extgenes <- c(extgenes, mixreslist[[n]]$extgenes)
              
            }else{
              
              endexts <- c(endexts, mixreslist[[n]]$mixgenes)
              
            }
            
          }
          
        }
        
        if(length(extgenes) > 0){
          extgenes <- unique(extgenes)
          names(extgenes) <- 1:length(extgenes)
        }
        
        if(length(endexts) > 0){
          endexts <- unique(endexts)
          names(endexts) <- 1:length(endexts)
        }
        
      }
      
      if(length(extgenes) > 0){
        
        extgenes <- IRanges::subsetByOverlaps(x = extgenes, 
                                              ranges = range(c(reads1, reads2)))
        
      }
      
      if(length(extgenes) > 0){
        
        extgenes <- GenomicAlignments::sort(extgenes)
        
        names(extgenes) <- 1:length(extgenes)
        
        nseq <- seq(1, length(extgenes), 1)
        
        if(threads == 1){
          
          endextlist <- list()
          n <- 1
          for(n in nseq){
            
            endext <- dirscan(i = n, 
                              ext = extgenes, 
                              reads1 = reads1, 
                              reads2 = reads2, 
                              reads1depth = reads1depth, 
                              reads2depth = reads2depth)
            
            endextlist[[n]] <- endext
            
            cat(paste0('n = ', n, '\n'))
            
          }
          
          
        }else{
          
          #library(doParallel)
          #threads <- 8
          
          cores <- parallel::detectCores()
          cl <- parallel::makeCluster(min(min(threads, length(extgenes)), cores))
          
          doParallel::registerDoParallel(cl)
          
          date()
          `%dopar%` <- foreach::`%dopar%`
          
          endextlist <- foreach::foreach(i = nseq, 
                                         .export = ls(name = globalenv()), .packages = c('GenomicRanges')) %dopar% {
                                           #.export = NULL) %dopar% {
                                           dirscan(i = i, 
                                                   ext = extgenes, 
                                                   reads1 = reads1, 
                                                   reads2 = reads2, 
                                                   reads1depth = reads1depth, 
                                                   reads2depth = reads2depth)
                                           }
          date()
          
          parallel::stopCluster(cl)
          
          unregister_dopar()
          
          
        }
        
        endextlist <- Filter(Negate(is.null), endextlist)
        
        endextlist <- unlist(endextlist)
        
        if(is.list(endextlist)){
          endextlist <- do.call(c, endextlist)
          
          if(length(endextlist) > 0){
            names(endextlist) <- 1:length(endextlist)
          }
          
        }else{
          endextlist <- NULL
        }
        
        
        endexts <- c(endexts, endextlist)
        
      }
      
      if(length(endexts) == 0){
        
        return(NULL)
        
      }else{
        
        endexts <- endexts[endexts$gene_id %in% exts$gene_id]
        
        if(length(endexts) == 0){
          
          return(NULL)
          
        }
        
      }
      
      exts <- endexts
      
      exts <- GenomicAlignments::sort(exts)
      
      names(exts) <- 1:length(exts)
      
    }else{
        
      exts <- read.table(utrexts, sep = '\t', header = TRUE,
                         stringsAsFactors = FALSE, quote = '',
                         check.names = FALSE)
      
      exts <- exts[c('chr', 'start', 'end', 'strand', 'gene_id')]
      names(exts) <- c('seqnames', 'start', 'end', 'strand', 'gene_id')
      
      exts <- exts[exts$gene_id %in% genes$gene_id]
      
      exts <- GenomicRanges::GRanges(seqnames = exts$seqnames,
                                     ranges = IRanges::IRanges(start = exts$start, 
                                                               end = exts$end),
                                     strand = exts$strand,
                                     gene_id = exts$gene_id)
      exts <- GenomicAlignments::sort(exts)
      
      names(exts) <- 1:length(exts)
      
      extsdis <- GenomicRanges::distanceToNearest(exts, ignore.strand = TRUE)
      
      mixexts <- extsdis[extsdis@elementMetadata$distance == 0]
      
      mixgenes <- NULL
      
      if(length(mixexts) > 0){
        
        mixexts1 <- exts$gene_id[mixexts@from]
        mixexts2 <- exts$gene_id[mixexts@to]
        
        mixexts1strands <- as.vector(exts@strand)[mixexts@from]
        mixexts2strands <- as.vector(exts@strand)[mixexts@to]
        
        mixexts <- data.frame(mixexts1 = mixexts1, 
                              mixexts2 = mixexts2, 
                              stringsAsFactors = FALSE)
        
        mixexts <- mixexts[mixexts1strands != mixexts2strands, , drop = FALSE]
        
        if(nrow(mixexts) > 0){
          
          row.names(mixexts) <- 1:nrow(mixexts)
          
        }
        
      }
      
      extgenes <- exts$gene_id
      
      if(sum(nrow(mixexts) > 0)){
        
        mixgenes <- mixexts
        mixexts <- mixgenes
        
        mixgenes <- unique(c(mixgenes$mixexts1, mixgenes$mixexts2))
        
        extgenes <- setdiff(extgenes, mixgenes)
        
      }
      
      extgenes <- exts[exts$gene_id %in% extgenes]
      
      mixgenes <- exts[exts$gene_id %in% mixgenes]
      
      endexts <- GenomicRanges::GRanges(seqnames = character(),
                                        ranges = IRanges::IRanges(start = numeric(), 
                                                                  end = numeric()),
                                        strand = character(),
                                        gene_id = character())
      
      
      if(length(mixgenes) > 0){
        
        mixgenes <- IRanges::subsetByOverlaps(x = mixgenes, 
                                              ranges = range(c(reads1, reads2)))
        
      }
      
      if(length(mixgenes) > 0){
        
        mixgenes <- GenomicAlignments::sort(mixgenes)
        
        names(mixgenes) <- 1:length(mixgenes)
        
        nseq <- seq(1, length(mixgenes), 1)
        
        if(threads == 1){
          
          mixreslist <- list()
          n <- 1
          for(n in nseq){
            
            mixres <- mixscan(i = n, 
                              mixext = mixgenes, 
                              extpair = mixexts, 
                              
                              reads1 = reads1, 
                              reads2 = reads2, 
                              reads1depth = reads1depth, 
                              reads2depth = reads2depth)
            
            mixreslist[[n]] <- mixres
            
            cat(paste0('n = ', n, '\n'))
            
          }
          
          
        }else{
          
          #library(doParallel)
          #threads <- 8
          
          cores <- parallel::detectCores()
          cl <- parallel::makeCluster(min(min(threads, length(mixgenes)), cores))
          
          doParallel::registerDoParallel(cl)
          
          date()
          `%dopar%` <- foreach::`%dopar%`
          
          mixreslist <- foreach::foreach(i = nseq, 
                                         .export = ls(name = globalenv()), .packages = c('GenomicRanges')) %dopar% {
                                           #.export = NULL) %dopar% {
                                           mixscan(i = i, 
                                                   mixext = mixgenes, 
                                                   extpair = mixexts, 
                                                   
                                                   reads1 = reads1, 
                                                   reads2 = reads2, 
                                                   reads1depth = reads1depth, 
                                                   reads2depth = reads2depth)
                                         }
          
          date()
          
          parallel::stopCluster(cl)
          
          unregister_dopar()
          
          
        }
        
        for(n in 1:length(mixreslist)){
          
          if(!is.null(names(mixreslist[[n]]))){
            
            if(names(mixreslist[[n]]) == 'extgenes'){
              
              extgenes <- c(extgenes, mixreslist[[n]]$extgenes)
              
            }else{
              
              endexts <- c(endexts, mixreslist[[n]]$mixgenes)
              
            }
            
          }
          
        }
        
        
        if(length(extgenes) > 0){
          extgenes <- unique(extgenes)
          names(extgenes) <- 1:length(extgenes)
        }
        
        if(length(endexts) > 0){
          endexts <- unique(endexts)
          names(endexts) <- 1:length(endexts)
        }
        
      }
      
      if(length(extgenes) > 0){
        
        extgenes <- IRanges::subsetByOverlaps(x = extgenes, 
                                              ranges = range(c(reads1, reads2)))
        
      }
      
      if(length(extgenes) > 0){
        
        extgenes <- GenomicAlignments::sort(extgenes)
        
        names(extgenes) <- 1:length(extgenes)
        
        endexts <- c(endexts, extgenes)
        
      }
      
      if(length(endexts) == 0){
        
        return(NULL)
        
      }else{
        
        endexts <- endexts[endexts$gene_id %in% exts$gene_id]
        
        if(length(endexts) == 0){
          
          return(NULL)
          
        }
        
      }
      
      exts <- endexts
      
      exts <- GenomicAlignments::sort(exts)
      
      names(exts) <- 1:length(exts)
      
    }
    
    #FPKM screen###
    exts <- getfpkm(features = exts, reads = reads1, readsdepth = reads1depth, 
                    singleend = singleend)
    exts <- exts[!is.na(exts$fpkm)]
    exts <- exts[exts$fpkm != 0]
    if(length(exts) == 0){
      return(NULL)
    }
    exts <- GenomicAlignments::sort(exts)
    names(exts) <- 1:length(exts)
    
    if(!is.null(fpkmcutoff)){
      
      exts <- exts[exts$fpkm > fpkmcutoff]
      if(length(exts) == 0){
        return(NULL)
      }
      exts <- GenomicAlignments::sort(exts)
      names(exts) <- 1:length(exts)
      
    }
    
    #Prepare for parallel######
    
    if(threads > length(exts)){
      threads <- length(exts)
    }
    
    
    extranges <- range(exts)
    
    extreads1 <- IRanges::subsetByOverlaps(x = reads1, ranges = extranges)
    
    extreads2 <- IRanges::subsetByOverlaps(x = reads2, ranges = extranges)
    
    extreads <- c(extreads1, extreads2)
    extreadsranges <- range(extreads)
    exts <- IRanges::subsetByOverlaps(x = exts, ranges = extreadsranges)
    
    exts <- GenomicAlignments::sort(exts)
    extreads1 <- GenomicAlignments::sort(extreads1)
    extreads2 <- GenomicAlignments::sort(extreads2)
    
    GenomeInfoDb::seqlevels(extreads1) <- 
      GenomicAlignments::seqlevelsInUse(extreads1)
    
    GenomeInfoDb::seqlevels(extreads2) <- 
      GenomicAlignments::seqlevelsInUse(extreads2)
    
    if(length(exts) == 0 | length(extreads1) == 0 | length(extreads2) == 0){
      return(NULL)
    }
    
    names(exts) <- 1:length(exts)
    
    datalist <- list(genes = exts, subreads1 = extreads1, subreads2 = extreads2)
    
    
    
  }else{
    
    #FPKM screen###
    genes <- getfpkm(features = genes, reads = reads1, readsdepth = reads1depth, 
                     singleend = singleend)
    genes <- genes[!is.na(genes$fpkm)]
    genes <- genes[genes$fpkm != 0]
    if(length(genes) == 0){
      return(NULL)
    }
    genes <- GenomicAlignments::sort(genes)
    names(genes) <- 1:length(genes)
    
    if(!is.null(fpkmcutoff)){
      
      genes <- genes[genes$fpkm > fpkmcutoff]
      if(length(genes) == 0){
        return(NULL)
      }
      genes <- GenomicAlignments::sort(genes)
      names(genes) <- 1:length(genes)
      
    }
    
    #Prepare for parallel######
    
    threads <- max(1, round(threads))
    
    if(threads > length(genes)){
      threads <- length(genes)
    }
    
    generanges <- range(genes)
    
    subreads1 <- IRanges::subsetByOverlaps(x = reads1, ranges = generanges)
    #subsetByOverlaps returns the subset of `x` that has an overlap hit with a range 
    #in `ranges` using the specified `findOverlaps` parameters.
    
    subreads2 <- IRanges::subsetByOverlaps(x = reads2, ranges = generanges)
    #subsetByOverlaps returns the subset of `x` that has an overlap hit with a range 
    #in `ranges` using the specified `findOverlaps` parameters.
    
    subreads <- c(subreads1, subreads2)
    subreadsranges <- range(subreads)
    genes <- IRanges::subsetByOverlaps(x = genes, ranges = subreadsranges)
    #subsetByOverlaps returns the subset of `x` that has an overlap hit with a range 
    #in `ranges` using the specified `findOverlaps` parameters.
    
    if(length(genes) == 0){
      
      return(NULL)
      
    }
    
    rm(subreads)
    
    genes <- GenomicAlignments::sort(genes)
    subreads1 <- GenomicAlignments::sort(subreads1)
    subreads2 <- GenomicAlignments::sort(subreads2)
    
    GenomeInfoDb::seqlevels(subreads1) <- 
      GenomicAlignments::seqlevelsInUse(subreads1)
    #seqlevels contains all the chrs in a factor
    #seqlevelsInUse only contains the chrs present in x
    
    GenomeInfoDb::seqlevels(subreads2) <- 
      GenomicAlignments::seqlevelsInUse(subreads2)
    #seqlevels contains all the chrs in a factor
    #seqlevelsInUse only contains the chrs present in x
    
    if(length(genes) == 0 | length(subreads1) == 0 | length(subreads2) == 0){
      return(NULL)
    }
    
    names(genes) <- 1:length(genes)
    
    datalist <- list(genes = genes, subreads1 = subreads1, subreads2 = subreads2)
    
    
  }
  
  
  geneframe <- as.data.frame(datalist$genes)
  
  for(i in 1:ncol(geneframe)){
    
    if(is.factor(geneframe[,i])){
      geneframe[,i] <- as.character(geneframe[,i])
    }
    
  }
  
  row.names(geneframe) <- 1:nrow(geneframe)
  
  
  iseq <- seq(1, length(datalist$genes), 1)
  
  gc()
  
  if(threads == 1){
    
    reslists <- list()
    i <- 22
    for(i in iseq){
      
      reslist <- inferfunction(i = i, 
                               datalist = datalist, 
                               window_num = window_num, 
                               startshorten = startshorten, 
                               endshorten = endshorten, 
                               reads1depth = reads1depth, 
                               reads2depth = reads2depth, 
                               time = time, 
                               
                               method = method, 
                               pythonpath = pythonpath, 
                               hmmseed = hmmseed, 
                               
                               textsize = textsize,
                               titlesize = titlesize,
                               face = face)
      
      reslists[[i]] <- reslist
      
    }
    
    
  }else{
    
    #library(doParallel)
    #threads <- 8
    
    cores <- parallel::detectCores()
    cl <- parallel::makeCluster(min(threads, cores))
    
    doParallel::registerDoParallel(cl)
    
    date()
    `%dopar%` <- foreach::`%dopar%`
    reslists <- foreach::foreach(i = iseq, 
                                 .export = ls(name = globalenv()), .packages = c('GenomicRanges')) %dopar% {
                                 #.export = NULL, .packages = c('GenomicRanges')) %dopar% {
                                 inferfunction(i = i, 
                                               datalist = datalist, 
                                               window_num = window_num, 
                                               startshorten = startshorten, 
                                               endshorten = endshorten, 
                                               reads1depth = reads1depth, 
                                               reads2depth = reads2depth, 
                                               time = time, 
                                               
                                               method = method, 
                                               pythonpath = pythonpath, 
                                               hmmseed = hmmseed, 
                                               
                                               textsize = textsize,
                                               titlesize = titlesize,
                                               face = face)
                                   }
    #.export character vector of variables to export. This can be useful when 
    #accessing a variable that isn't defined in the current environment. The 
    #default valule is NULL
    
    #.packages charactor vector of packages that the tasks depend on. If the 
    #expression, ex, requires an R package to be loaded, this option can be used 
    #to load that package on each of the works.
    
    date()
    
    parallel::stopCluster(cl)
    
    unregister_dopar()
    
    
  }
  
  report <- NULL
  genereads1 <- list()
  genereads2 <- list()
  binplots <- list()
  expandplots <- list()
  
  for(i in 1:length(reslists)){
    
      report <- rbind(report, reslists[[i]]$subreport)
      genereads1[[i]] <- reslists[[i]]$singlegenereads1
      genereads2[[i]] <- reslists[[i]]$singlegenereads2
      binplots[[i]] <- reslists[[i]]$binresplotlist
      expandplots[[i]] <- reslists[[i]]$expandplotlist
      
  }
  
  
  
  if(is.null(report) & length(genereads1) == 0 & length(genereads2) == 0 & 
     length(binplots) == 0 & length(expandplots) == 0){
    
    res <- list()
    res[['report']] <- report
    res[['genereads1']] <- genereads1
    res[['genereads2']] <- genereads2
    
    res[['binplots']] <- binplots
    res[['expandplots']] <- expandplots
    
    return(res)
  }
  
  
  
  genereads1 <- Filter(Negate(is.null), genereads1)
  genereads2 <- Filter(Negate(is.null), genereads2)
  binplots <- Filter(Negate(is.null), binplots)
  expandplots <- Filter(Negate(is.null), expandplots)
  
  names(genereads1) <- names(genereads2) <- 
    names(binplots) <- names(expandplots) <- report$gene_id
  
  
  if(utr == FALSE){
    
    report$time <- time
    report$rate <- report$distance/report$time
    
  }
  
  report$binpadj <- p.adjust(p = report$binpval, method = 'BH')
  report$extendpadj <- p.adjust(p = report$extendpval, method = 'BH')
  
  
  report$significance <- 'nonsignificant'
  
  if(difftype == 1){
    
    report$significance[report$diffbinratio > 0 & report$binpadj < 0.05] <- 'significant'
    
  }else if(difftype == 2){
    
    report$significance[report$diffbinratio < 0 & report$binpadj < 0.05] <- 'significant'
    
  }else{
    
    report$significance[report$binpadj < 0.05] <- 'significant'
    
  }
  
  
  geneframe <- geneframe[c('gene_id', 'seqnames', 'start', 'end', 'strand',
                           setdiff(names(geneframe), c('gene_id', 'seqnames', 'start', 'end', 'strand')))]
  
  names(report)[1] <- 'gene_id'
  
  
  sigreport <- subset(report, significance == 'significant')
  nonsigreport <- subset(report, significance == 'nonsignificant')
  
  sigreport <- sigreport[order(sigreport$binpadj, sigreport$binpval,
                               sigreport$extendpadj, sigreport$extendpval,
                               -abs(sigreport$diffbinratio), -abs(sigreport$diffextendratio),
                               -sigreport$distance),]
  nonsigreport <- nonsigreport[order(nonsigreport$binpadj, nonsigreport$binpval,
                                     nonsigreport$extendpadj, nonsigreport$extendpval,
                                     -abs(nonsigreport$diffextendratio), -abs(nonsigreport$diffextendratio),
                                     -nonsigreport$distance),]
  report <- rbind(sigreport, nonsigreport)
  row.names(report) <- 1:nrow(report)
  
  geneframe <- subset(geneframe, gene_id %in% report$gene_id)
  
  geneframe <- geneframe[match(report$gene_id, geneframe$gene_id),]
  row.names(geneframe) <- 1:nrow(geneframe)
  report <- cbind(geneframe, report[-1])
  
  names(report)[2] <- 'chr'
  
  if(utr == FALSE){
    
    maincols <- c('gene_id',
                  'distance', 'time', 'rate',
                  'significance', 'binpadj', 'binpval',
                  'frontbinratio', 'latterbinratio', 'diffbinratio',
                  'chr', 'start', 'end', 'strand',
                  'extendpadj', 'extendpval',
                  'frontextendratio', 'latterextendratio', 'diffextendratio')
    
  }else{
    
    report$ProxPolyA <- report$start + report$distance - 1
    report$ProxPolyA[report$strand == '-'] <- 
      (report$end - report$distance + 1)[report$strand == '-']
    report$ProxPolyA <- paste0(report$chr, ':', report$ProxPolyA)
    
    report$DistPolyA <- report$end
    report$DistPolyA[report$strand == '-'] <- report$start[report$strand == '-']
    report$DistPolyA <- paste0(report$chr, ':', report$DistPolyA)
    
    maincols <- c('gene_id',
                  'distance', 'ProxPolyA', 'DistPolyA',
                  'significance', 'binpadj', 'binpval',
                  'frontbinratio', 'latterbinratio', 'diffbinratio',
                  'chr', 'start', 'end', 'strand',
                  'extendpadj', 'extendpval',
                  'frontextendratio', 'latterextendratio', 'diffextendratio')
    
  }
  
  othercols <- setdiff(names(report), maincols)
  allcols <- c(maincols, othercols)
  report <- report[allcols]
  row.names(report) <- 1:nrow(report)
  
  names(report)[names(report) == 'width'] <- 'genewidth'
  
  if(utr == TRUE){
    
    names(report)[names(report) == 'start'] <- 'ExtStart'
    names(report)[names(report) == 'end'] <- 'ExtEnd'
    names(report)[names(report) == 'genewidth'] <- 'ExtWidth'
    
  }
  
  res <- list()
  res[['report']] <- report
  res[['genereads1']] <- genereads1
  res[['genereads2']] <- genereads2
  
  res[['binplots']] <- binplots
  res[['expandplots']] <- expandplots
  
  return(res)
  
}



#lssres <- calrate(time1file = wt0file, 
#                  time2file = wt15file, 
#                  time = 15, 
#                  strandmethod = 1, 
#                  targetfile = NULL, 
#                  genomename = 'mm10', 
#                  lencutoff = 40000, 
#                  fpkmcutoff = 1, 
#                
#                  threads = 6, 
#                  
#                  startshorten = 1000, 
#                  endshorten = 1000, 
#                  window_num = 40, 
#                  
#                  method = 'LSS', 
#                  pythonpath = 'C:/Users/yuabr/anaconda3/python.exe', 
#                  difftype = 1)



#lssres <- calrate(time1file = fp0file, 
#                  time2file = fp12file, 
#                  time = 12, 
#                  strandmethod = 4, 
#                  targetfile = NULL, 
#                  genomename = 'mm10', 
#                  lencutoff = 40000, 
#                  fpkmcutoff = 1, 
#                  
#                  threads = 6, 
#                  
#                  startshorten = 1000, 
#                  endshorten = 1000, 
#                  window_num = 40, 
#                  
#                  method = 'LSS', 
#                  pythonpath = 'C:/Users/yuabr/anaconda3/python.exe', 
#                  difftype = 1, 
#                  utrexts = NULL, 
#                  
#                  textsize = 13, 
#                  titlesize = 15, 
#                  face = 'bold')



#hmmres <- calrate(time1file = fp0file, 
#                  time2file = fp12file, 
#                  time = 12, 
#                  strandmethod = 4, 
#                  targetfile = NULL, 
#                  genomename = 'mm10', 
#                  lencutoff = 40000, 
#                  fpkmcutoff = 1, 
#                  
#                  threads = 1, 
#                  
#                  startshorten = 1000, 
#                  endshorten = 1000, 
#                  window_num = 40, 
#                  
#                  method = 'HMM', 
#                  pythonpath = 'C:/Users/yuabr/anaconda3/python.exe', 
#                  difftype = 1, 
#                  utrexts = NULL, 
#                  
#                  textsize = 13, 
#                  titlesize = 15, 
#                  face = 'bold')





#'Calculate gene elongation rate for multiple pairs of Pro-seq or Gro-seq data 
#'
#'Calculate gene elongation rate for multiple pairs of Pro-seq or Gro-seq data 
#'with the LSS (least sum of squares) or HMM (hidden Markov model) method. 
#'
#'@param time1files The reference Pro-seq/Gro-seq bam files, corresponding to 
#'  the experimental condition of no transcriptional inhibitor treatment. Can 
#'  be a vector with elements as strings indicating the directories of the bam 
#'  files.
#'@param time2files The treatment Pro-seq/Gro-seq bam files, corresponding to 
#'  the treatment of transcriptional inhibitor for specific times (e.g. DRB 
#'  treatment for 15 min, 30 min, etc). Should be a vector with elements as 
#'  strings indicating the directories of the bam files.
#'@param targetfile A txt file with the genes whose transcriptional rates need 
#'  to be calculated. Should contain columns named as chr, start, end, strand, 
#'  and gene_id. It can also be NULL, so that the genes in the genome set by 
#'  the parameter \code{genomename} will be analyzed. However, in any case, 
#'  the genes should have a length longer than the one set by the parameter 
#'  \code{lencutoff}, and also longer than the one of 2*(\code{startshorten} + 
#'  \code{endshorten}), which is set by the parameters \code{startshorten} and 
#'  \code{endshorten}.
#'@param gene_ids A vector with gene symbols indicating the ones need to be 
#'  analyzed. In addition to \code{targetefile} and \code{genomename}, this 
#'  parameter also indicates the genes to be analyzed. The final ones should 
#'  belong to the intersection of these parameters, and they also need to have 
#'  a length longer than the one set by the parameter \code{lencutoff}, and 
#'  also longer than the one of 2*(\code{startshorten} + \code{endshorten}), 
#'  which is set by the parameters \code{startshorten} and \code{endshorten}. 
#'  Can also be NULL, so that no restriction will be added from it. 
#'@param genomename Specify the genome of the genes to be analyzed, when the 
#'  parameter \code{targetfile} is NULL. Can be "mm10" for mouse or "hg38" for 
#'  human.
#'@param times The treatment time differences between the \code{time1files} 
#'  and the \code{time2files}, using min as their units. Should be a vector 
#'  with each element as the time difference between the matched elements in 
#'  the \code{time1files} vector and the \code{time2files} vector.
#'@param strandmethod Indicate the strand specific method used when preparing 
#'  the sequencing libraries, can be 1 for the directional ligation method, 2 
#'  for the dUTP method, and 0 for non-strand specific librares. In addition, 
#'  if the samples are sequenced using a single strand method, set it as 3.
#'@param threads Number of threads to do the parallelization. Default is 1.
#'@param mergerefs Whether to merge all the reference data contained in the 
#'  \code{time1files} vector to one, and then use it as a unified reference 
#'  for all the \code{time2files}. Default is TRUE.
#'@param mergecases Whether to merge all the treatment data contained in the 
#'  \code{time2files} vector to one. Default is FALSE.
#'@param lencutoff The cutoff on gene length (bp). Only genes longer than this 
#'  cutoff can be considered for analysis. Default is 70000.
#'@param fpkmcutoff The cutoff value on gene FPKM. Only genes with an FPKM 
#'  value greater than the cutoff in the reference data can be considered for 
#'  analysis. Default is 1.
#'@param startshorten Before inferring a gene's transcription rate, its first 
#'  1000 bp (or other length) and last 1000 bp (or other length) regions will 
#'  be discarded to avoid the unstable reads at the transcription starting and 
#'  ending stages. However, these regions' lengths can be changed by setting 
#'  this parameter \code{startshorten} and the other \code{endshorten}. This 
#'  one is used to set the length of the transcription starting region. Its 
#'  default value is 1000, so that the first 1000 bp region will be discarded.
#'@param endshorten Before inferring a gene's transcription rate, its first 
#'  1000 bp (or other length) and last 1000 bp (or other length) regions will 
#'  be discarded to avoid the unstable reads at the transcription starting and 
#'  ending stages. However, these regions' lengths can be changed by setting 
#'  this parameter \code{endshorten} and the other \code{startshorten}. This 
#'  one is used to set the length of the transcription ending region. Default 
#'  is 1000, so that the last 1000 bp region will be discarded.
#'@param window_num Before inferring a gene's transcription rate, the function 
#'  will divide this gene into 40 bins (or other bin number). For each bin, 
#'  the normalized read count ratio between the treatment and the reference 
#'  files will be calculated, so a vector with 40 ratios (or other bin number) 
#'  will be generated. Then, the LSS or HMM method will be used to find the 
#'  transition bin between the gene's transcription inhibited region and the 
#'  normal reads region. After that, this identified transition bin and its 
#'  downstream neighbor will be merged and expanded to the single-base-pair 
#'  level, and the LSS or HMM method will be further used on them to find the 
#'  transition base pair in this region. The parameter \code{window_num} here 
#'  is used to set the bin number to be divided for each gene. Default value 
#'  is 40.
#'@param method The method to be used for transcription rate inference. The 
#'  default value is "LSS", so that the least sum of squares method will be 
#'  used. Can also be "HMM", so that the hidden Markov model will be used. 
#'@param pythonpath The HMM method is base on \code{Python}, so the directory 
#'  of the \code{Python} interpreter you want to use should be transferred to 
#'  the function via this parameter, and two \code{Python} modules should be 
#'  installed to your \code{Python} environment, including \code{numpy} and 
#'  \code{hmmlearn}.
#'@param hmmseed The HMM method involves random processes, so a random seed 
#'  should be set via this parameter to repeat the results. Default value is 
#'  1234, can also be other integers, such as 2023.
#'@param difftype In most cases, the treatment and reference Pro-seq/Gro-seq 
#'  files are from experiments treating cells with transcription inhibitors, 
#'  such as DRB (5,6-dichloro-1-beta-d-ribofuranosylbenzimidazole), so that 
#'  the normal transcription will be repressed for a specific time, generating 
#'  a reads-depleted region upstream of the normal transcription region. For 
#'  such inhibitor-based experiments, this parameter \code{difftype} should be 
#'  set as 1. However, in some cases, the treatment and reference Pro-seq/Gro- 
#'  seq files can also come from experiments treating cells with transcription 
#'  activators, e.g., treating MCF-7 human breast cancer cells with E2 (17- 
#'  beta-estradiol), making the reads-depleted region downstream, rather than 
#'  upstream, of the normal transcription region, which is in contrast to the 
#'  DRB (inhibitor) experiments. For such activator-based experiments, this 
#'  parameter should be set as 2. In addition, this function \code{mcalrate} 
#'  can also infer proximal polyA alternative sites for genes, and to perform 
#'  this analysis, the parameter \code{time1files} needs to contain RNA-seq 
#'  files with genes using distal polyA sites; the parameter \code{time2files} 
#'  should have RNA-seq files with genes using proximal polyA sites; another 
#'  parameter \code{utr} needs to be TRUE; and the parameter \code{difftype} 
#'  here should be set as 2. The default value of \code{difftype} is 1.
#'@param utr In addition to inferring transcription rates from Pro-seq/Gro-seq 
#'  data, \code{mcalrate} can also infer proximal polyA alternative sites for 
#'  genes. In this case, the parameter \code{time1files} should be an RNA-seq 
#'  files with genes using distal polyA sites; the parameter \code{time2files} 
#'  needs RNA-seq files with genes using proximal polyA sites; the parameter 
#'  \code{difftype} should be set as 2; and the current parameter \code{utr} 
#'  should be set as TRUE. The default value of \code{utr} is FALSE, so the 
#'  function will perform inference on transcription rates, not on proximal 
#'  polyA sites.
#'@param utrexts When the former parameter \code{utr} is set as TRUE to infer 
#'  proximal polyA sites for genes, this parameter can be used to provide a 
#'  txt file with the genes' last exons whose proximal polyA sites need to be 
#'  identified. Should contain columns named as chr, start, end, strand, and 
#'  gene_id. It can also be set as NULL, so that the genes' last exons in the 
#'  genome set by the parameter \code{genomename} will be analyzed. However, 
#'  in the latter case, the original last exons from the genome will be first 
#'  adjusted so that for the ones with a length > 10000 bp, the proximal polyA 
#'  sites will be inferred directly in them, but for the ones with a length <= 
#'  10000 bp, their lengths will be extended to 10000 bp first, and then the 
#'  proximal sites will be identified within the extended exons. On the other 
#'  hand, if the last exons are provided with this parameter \code{utrexts}, 
#'  they will never be extended to 10000 bp, and the proximal polyA inference 
#'  will be performed directly on them. In addition to the proximal sites, the 
#'  distal polyA sites will also be defined by the function from the Pro-seq/
#'  Gro-seq pairs defined by \code{time1files} and \code{time2files}. It is 
#'  performed with a sliding window method on the last exons, and it is before 
#'  the proximal polyA sites inference but after the exon extension step. It 
#'  should be noted that for a specific Pro-seq/Gro-seq file included in the 
#'  parameter \code{time1files}, it also set a cutoff for the last exons to be 
#'  analyzed with the polyA sites inference, i.e., their FPKM values in this 
#'  file should be greater than \code{fpkmcutoff}. 
#'@param textsize In addition to returning a data frame to show the inference 
#'  results, this function will also generate several plots to show them, and 
#'  the font size for the plot texts is set by this parameter. Default is 13.
#'@param titlesize The font size for the plot titles. Default is 15.
#'@param face The font face for the plot texts. Default is "bold".
#'@return A list with several sub-lists and each of them includes a slot named 
#'  "report", which is a data frame with the inferred transcription rates, or 
#'  genes' proximal and distal polyA sites, as well as other information, such 
#'  as the genes' coordinates, the results' significance, etc. A sub-list also 
#'  contains other slots, such as "binplots" and "expandplots", which contains 
#'  the data that can be used to plot the inference results.
#'@export
mcalrate <- function(time1files,
                     time2files,
                     targetfile = NULL, 
                     gene_ids = NULL, 
                     genomename = 'mm10',
                     times,
                     strandmethod = 0, 
                     threads = 1,
                     mergerefs = TRUE, 
                     mergecases = FALSE,
                     
                     lencutoff = 70000,
                     fpkmcutoff = 1, 
                     
                     startshorten = 1000, 
                     endshorten = 1000, 
                     window_num = 40, 
                     method = 'LSS', 
                     pythonpath = NULL, 
                     hmmseed = 1234,  
                     #hmmseed = 2023,  
                     difftype = 1, 
                     
                     utr = FALSE, 
                     utrexts = NULL, 
                     
                     textsize = 13,
                     titlesize = 15,
                     face = 'bold'){
  
  time1files <- getreadslist(timefiles = time1files, mergefiles = mergerefs, strandmode = strandmethod)
  time2files <- getreadslist(timefiles = time2files, mergefiles = mergecases, strandmode = strandmethod)
  
  timefilelength <- length(time2files)
  
  reslist <- list()
  i <- 1
  for(i in 1:timefilelength){
    
    time2file <- time2files[[i]]
    time1file <- time1files[[min(length(time1files), i)]]
    
    if(utr == TRUE){
      
      time <- NULL
      
    }else{
      
      time <- times[i]
      
    }
    
    res <- calrate(time1file = time1file, 
                   time2file = time2file,
                   targetfile = targetfile, 
                   gene_ids = gene_ids, 
                   genomename = genomename,
                   time = time,
                   strandmethod = strandmethod, 
                   threads = threads,
                   
                   lencutoff = lencutoff,
                   fpkmcutoff = fpkmcutoff, 
                   
                   startshorten = startshorten, 
                   endshorten = endshorten, 
                   window_num = window_num, 
                   method = method, 
                   pythonpath = pythonpath, 
                   hmmseed = hmmseed, 
                   difftype = difftype, 
                   
                   utr = utr, 
                   utrexts = utrexts, 
                   
                   textsize = textsize,
                   titlesize = titlesize,
                   face = face)
    
    reslist[[i]] <- res
    
  }
  
  return(reslist)
  
}
yuabrahamliu/proRate documentation built on Nov. 3, 2024, 10:14 a.m.