R/cycler_fuctions.R


## Copyright (C) 2020-2022 Stefan Stefanov and Irmtraud M. Meyer, Max Delbrueck Centre and Freie Universitaet, Berlin, Germany
## Contact information: irmtraud.meyer@cantab.net
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.




##' Parse BSJ files from CIRI, CIRCexplorer2 or a TSV file
##'
##' This processes BSJ prediction files and prepares them for the next step of the pipeline.
##' \code{input_type} is essential for the correct parsing of the files. 
##' 
##' @title Parse BSJ input
##' @param file_list list with file names
##' @param file_path string object with file path, could be an empty string
##' @param input_type \code{CIRI} for CIRI2 input, \code{CE} for CIRCexplorer2 input and \code{tsv} 
##'  for TSV formatted input
##' @return \code{Tibble} object with combined BSJ coordinates and the number of junction spanning 
##'  reads across samples
##' @keywords parse
##' @author Stefan Stefanov
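##' @examples
##' ## A hypothetical usage sketch: the CIRI2 output file names and path below
##' ## are placeholders, not shipped data.
##' \dontrun{
##' ciri_files <- c("circ_sampleA", "circ_sampleB")
##' bsj_table <- parse_files(file_list = ciri_files,
##'                          file_path = "ciri_results/",
##'                          input_type = "CIRI")
##' }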
##' @export

parse_files<-function(file_list, file_path, input_type){
  if(input_type=="CIRI"){
    func=read_CIRI
  }else if(input_type=="CE"){
    func=read_CE
  }else if(input_type=="tsv"){
    func=read_tsv_file
  }else stop("input_type must be one of: CIRI, CE or tsv")
  Reduce(function(x,y) {full_join(x,y, by="circ_id")},map(paste0(file_path,file_list),func))
  
}

read_CIRI<-function(file_name){
  sample_name<-tail(str_split(file_name,"_", simplify = T)[1,], n=1)
  ciri<-readr::read_tsv(file_name, comment = "", col_names = T, col_types = cols(.default = "c" ))
  dplyr::select(ciri,(chr:`#junction_reads`),strand)%>%unite(circ_id,chr,circRNA_start,circRNA_end,strand)%>%
    dplyr::rename(!!sample_name:=`#junction_reads`)%>%mutate_at(2,as.double)
}
read_CE<-function(file_name){
  sample_name<-tail(str_split(file_name,"_", simplify = T)[1,], n=1)
  ce<-read_tsv(file_name, comment = "", col_names = F,col_types = cols(.default = "c" ))
  dplyr::select(ce,(X1:X3),X6,X13)%>%mutate(X2=as.numeric(X2)+1)%>%unite(circ_id,X1,X2,X3,X6)%>%
    dplyr::rename(!!sample_name:=X13)%>%mutate_at(2,as.double)
}
read_tsv_file<-function(file_name){
  sample_name<-tail(str_split(file_name,"_", simplify = T)[1,], n=1)
  df<-read_tsv(file_name, comment = "", col_names = F,col_types = cols(.default = "c" ))
  dplyr::unite(df,circ_id,X1,X2,X3,X4)%>%dplyr::rename(!!sample_name:=X5)%>%mutate_at(2,as.double)
}

##' Process the BSJ table and select high-confidence BSJs
##'
##' Filters BSJs by a hard read-count threshold in the enriched samples and by comparing 
##'  the average CPM values between enriched and control samples 
##' 
##' @title Process BSJs
##' @param cdf tibble produced by \code{parse_files}
##' @param sample_table sample table formatted according to the manual,
##'   Must contain \dQuote{sample_name} \dQuote{treatment} \dQuote{file_bam} \dQuote{lib_size} 
##'   \dQuote{read_len}; NB the values in column \dQuote{treatment} can only be \dQuote{control} and 
##'   \dQuote{enriched}
##' @param bsj_th hard threshold on number of BSJ spanning reads in the enriched samples
##' @return \code{Tibble} object with the filtered BSJ coordinates and number of junction spanning 
##'  reads across samples. 
##' @keywords filter BSJ
##' @author Stefan Stefanov
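##' @examples
##' ## A hedged sketch, assuming `bsj_table` comes from parse_files() and
##' ## `sample_table` follows the format described in the manual.
##' \dontrun{
##' filtered_bsjs <- process_BSJs(cdf = bsj_table,
##'                               sample_table = sample_table,
##'                               bsj_th = 1)
##' }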
##' @export
process_BSJs<-function(cdf,sample_table,bsj_th=1){
  #stands for circular data frame
  cdf[is.na(cdf)] <- 0
  #cdf<-column_to_rownames(cdf,var = "circ_id")
  sample_index<-c(sample_table$sample_name[sample_table$treatment=="enriched"])
  cdf2<-cdf[,sample_index]
  cdf<-cdf[rowSums(cdf2>bsj_th)>0,]
  cdf[,sample_table$sample_name]<-(cdf[,sample_table$sample_name]/sample_table$lib_size)*10e6
  #cdf<-rownames_to_column(cdf,var = "circ_id")
  cdf$meanc<-rowMeans(cdf[,c(sample_table$sample_name[sample_table$treatment=="control"])])
  cdf$meanRR<-rowMeans(cdf[,c(sample_table$sample_name[sample_table$treatment=="enriched"])])
  # rownames(cdf[cdf$meanc<cdf$meanr,])
  cdf[cdf$meanc<cdf$meanRR,]
}


##' Combine 2 BSJ tables
##'
##' Combines two BSJ tables to make sure we have a complete set of BSJs. 
##'  The argument names do not actually matter, since all the tables have the same formatting.
##' 
##' @title combine BSJs
##' @param ce_bsjs BSJ table 1
##' @param ciri_bsjs BSJ table 2
##' @return \code{Tibble} object with the combined, filtered BSJ coordinates and number of junction spanning 
##'  reads across samples. 
##' @keywords combine BSJ
##' @author Stefan Stefanov
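##' @examples
##' ## Sketch assuming two filtered BSJ tables produced by process_BSJs().
##' \dontrun{
##' table_circ <- combine_two_BSJ_tables(ce_bsjs = ce_filtered,
##'                                      ciri_bsjs = ciri_filtered)
##' }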
##' @export
combine_two_BSJ_tables<-function(ce_bsjs,ciri_bsjs){
  #colnames(ciri_bsjs)<-colnames(ce_bsjs)
  #ciri_bsjs$circ_id<-rownames(ciri_bsjs)
  #ce_bsjs$circ_id<-rownames(ce_bsjs)
  full_table<-bind_rows(ciri_bsjs,ce_bsjs)
  full_table[is.na(full_table)]<-0
  table_circ<-full_table%>%group_by(circ_id)%>%summarize_all(max)
  #table_circ<-table_circ%>%separate(circ_id, into = paste("V", 1:4, sep = ""),sep = "_")
  table_circ<-table_circ%>%separate(circ_id, into = c("chr","start","end","strand"),sep = "_")
  table_circ
}

##' Convert BSJ strings to a GRanges object
##'
##' Convert BSJ strings to a \code{GRanges} object
##' 
##' @title Convert BSJ strings to a GRanges object
##' @param BSJ_set a character vector of BSJ ID records produced by \code{process_BSJs} or \code{combine_two_BSJ_tables}
##' @return \code{GRanges} object indicating BSJ loci 
##' @keywords GRanges BSJ
##' @author Stefan Stefanov
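##' @examples
##' ## A small self-contained sketch with two hypothetical BSJ ID strings.
##' \dontrun{
##' BSJ_gr <- make_BSJ_gr(c("1_1000_2000_+", "2_5000_7000_-"))
##' }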
##' @export
make_BSJ_gr<-function(BSJ_set){
  BSJ_BED<-as.data.frame(str_split(BSJ_set,"_",simplify = T),stringsAsFactors=F)
  BSJ_BED$V2<-as.numeric(BSJ_BED$V2)
  BSJ_BED$V3<-as.numeric(BSJ_BED$V3)
  BSJ_gr <- GRanges(BSJ_BED$V1, IRanges(start=BSJ_BED$V2,end=BSJ_BED$V3),
                    strand=BSJ_BED$V4, seqlengths=NULL)
  BSJ_gr@elementMetadata@listData[["gene_id"]]<-paste0("circ",seq_along(BSJ_gr))
  BSJ_gr
}

##' Plots GRanges objects 
##'
##' ggplot of multiple GRanges objects. Every object is automatically assigned a colour from a colourblind-friendly scheme
##' 
##' @title Plot ranges
##' @param ... one or more \code{GRanges} objects to plot
##' @return ggplot of multiple GRanges objects 
##' @keywords GRanges plot
##' @author Stefan Stefanov
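##' @examples
##' ## Illustrative sketch with two toy GRanges objects.
##' \dontrun{
##' gr1 <- GRanges("1", IRanges(c(100, 400), c(300, 600)))
##' gr2 <- GRanges("1", IRanges(200, 500))
##' plotRanges2(gr1, gr2)
##' }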
##' @export
plotRanges2 <- function(...) {
  arg_v <- c(as.list(environment()), list(...)) #argument values 
  arg_n <- as.list(match.call()) #argument names 
  t<-paste(arg_n)
  t[1]="c"
  x<-eval(as.call( parse(text=t))) # it needs to be so complicated to save the order of the ranges 
  x_l<-lengths(arg_v)
  col=rep(t[-1],x_l)
  bins <- disjointBins(IRanges(start(x), end(x) + 1))
  dat <- cbind(as.data.frame(x), bin = bins, col=col)
  cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
  ggplot(dat) + 
    geom_rect(aes(xmin = start, xmax = end,
                  ymin = bin, ymax = bin + 0.9, fill=col)) +
    theme_bw()+scale_fill_manual(values=cbPalette)+theme(axis.text.y=element_blank(),
                                                         axis.ticks.y=element_blank())
}

##' Creates a disjoint set of exons based on an \code{SGSeq} object and a BSJ \code{GRanges} object
##'
##' Creates a disjoint set of exons based on an \code{SGSeq} object and a BSJ \code{GRanges} object. 
##'  The function keeps the \code{SGSeq} metadata
##' 
##' @title Overlap of BSJ and a splice graph 
##' @param BSJ_gr a \code{GRanges} object of BSJ coordinates
##' @param sgfc_pred \code{SGSeq} prediction object
##' @param sg_annot \code{SGSeq} prediction object with annotation info
##' @return \code{SGSeq} with disjoint exon bins
##' @keywords GRanges overlap
##' @author Stefan Stefanov
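##' @examples
##' ## Sketch assuming `sgfc_pred` and `sgf_annotated` come from the SGSeq
##' ## feature prediction step and `BSJ_gr` from make_BSJ_gr().
##' \dontrun{
##' full_sg <- overlap_SG_BSJ(sgfc_pred = sgfc_pred,
##'                           BSJ_gr = BSJ_gr,
##'                           sg_annot = sgf_annotated)
##' }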
##' 
##' @export
overlap_SG_BSJ<-function(sgfc_pred,BSJ_gr,sg_annot){
  sg_gr<-rowRanges(sgfc_pred) #first we process the predicted features
  sg_gr_e<-sg_gr[sg_gr@type=="E"]
  
  BSJ_sg<-SGFeatures(BSJ_gr,type=rep("E",length(BSJ_gr)),splice5p =rep(F,length(BSJ_gr)),
                     splice3p = rep(F,length(BSJ_gr)), featureID =rep(1L,length(BSJ_gr)),
                     geneID = rep(1L,length(BSJ_gr)), txName = mcols(BSJ_gr)$geneID,
                     geneName = mcols(BSJ_gr)$geneID)
  
  sg_gr_e2<-sg_gr_e[sg_gr_e%over%BSJ_sg]  #this is to remove some mapping artifacts
  sg_gr_e<-sg_gr_e[sg_gr_e@geneID%in%sg_gr_e2@geneID] #and keep only the genes that overlap with BSJ regions
  #BSJ_sg2<-GenomicRanges::reduce(BSJ_sg)
  sg_an<-sg_annot #the same for annotated features 
  sg_an_e<-sg_an[sg_an@type=="E"]
  sg_an_e2<-sg_an_e[sg_an_e%over%BSJ_sg]  #this is to remove some mapping artifacts
  sg_an_e<-sg_an_e[sg_an_e@geneID%in%sg_an_e2@geneID] #and keep only the genes that overlap with BSJ region
  combined<-c(BSJ_sg,sg_gr_e,sg_an_e)
  #combined2<-c(BSJ_sg2,sg_gr_e)
  disjoint_ranges<-GenomicRanges::disjoin(combined, with.revmap=T)
  #disjoint_ranges2<-GenomicRanges::disjoin(combined2, with.revmap=T)
  disjoint_ranges<-disjoint_ranges[disjoint_ranges%over%sg_gr_e] #this is needed to remove the intronic artefacts from the split
  disjoined_sg<-SGFeatures(disjoint_ranges,type=rep("E",length(disjoint_ranges)),splice5p =rep(F,length(disjoint_ranges)),
                           splice3p = rep(F,length(disjoint_ranges)), featureID = rep(1L,length(disjoint_ranges)),
                           geneID = rep(1L,length(disjoint_ranges)), txName = mcols(disjoint_ranges)$geneID,
                           geneName = mcols(disjoint_ranges)$geneID)
  disjoined_sg@geneID<-sg_gr_e@geneID[queryHits(findOverlaps(sg_gr_e,disjoined_sg))] #this makes sure we keep geneID info in the new bins
  disjoined_sg@geneName<-sg_gr_e@geneName[queryHits(findOverlaps(sg_gr_e,disjoined_sg))] 
  disjoined_sg@txName<-sg_gr_e@txName[queryHits(findOverlaps(sg_gr_e,disjoined_sg))] 
  #disjoined_sg@featureID<-sg_gr_e@featureID[queryHits(findOverlaps(sg_gr_e,disjoined_sg))] 
  #disjoined_sg@featureID<-seq(1:length(disjoined_sg))
  disjoined_sg@featureID<-seq(max(sg_gr_e@featureID),(max(sg_gr_e@featureID)+length(disjoined_sg)-1))
  disjoined_sg
}
##' Selection of the exons based on the BSJ set 
##'
##' Selection of the exons based on the BSJ set
##' 
##' @title Preparation of the BSJ-specific splice graphs  
##' @param BSJ_gr a GRange of BSJ coordinates
##' @param circ_sg \code{SGSeq} prediction object
##' @return \code{SGSeq} containing exons belonging to BSJ loci
##' @keywords GRanges overlap
##' @author Stefan Stefanov
##' 
##' @export
make_BSJ_sg<-function(circ_sg,BSJ_gr){
  BSJ_sg<-SGFeatures(BSJ_gr,type=rep("E",length(BSJ_gr)),splice5p =rep(F,length(BSJ_gr)),
                     splice3p = rep(F,length(BSJ_gr)), featureID =rep(1L,length(BSJ_gr)),
                     geneID = rep(1L,length(BSJ_gr)), txName = mcols(BSJ_gr)$geneID,
                     geneName = mcols(BSJ_gr)$geneID)
  BSJ_sg@geneID<-circ_sg@geneID[queryHits(findOverlaps(circ_sg,BSJ_sg,type = c("start")))] 
  BSJ_sg
}
##' A wrapper function that uses samtools to trim the BAM files
##'
##' This function removes the BAM file reads that do not overlap with the BSJ loci.
##'  This significantly speeds up the feature detection and lowers the virtual memory requirements
##' 
##' @title BAM file filter  
##' @param BSJ_gr a \code{GRanges} object of BSJ coordinates
##' @param sample_table sample table formatted according to the manual,
##'   Must contain \dQuote{sample_name} \dQuote{treatment} \dQuote{file_bam} \dQuote{lib_size} 
##'   \dQuote{read_len}; NB the values in column \dQuote{treatment} can only be \dQuote{control} and 
##'   \dQuote{enriched}
##' @param samtools_prefix a string that corresponds to user's samtools run prefix 
##' @return \code{BAMFileList} object with info on the trimmed files 
##' @keywords Bam Filter
##' @author Stefan Stefanov
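##' @examples
##' ## Hedged sketch; note that `bam_file_prefix` (the output directory) is
##' ## read from the calling environment rather than passed as an argument.
##' \dontrun{
##' bam_file_prefix <- "trimmed_bams"
##' trimmed_bams <- filter_bam(BSJ_gr = BSJ_gr,
##'                            sample_table = sample_table,
##'                            samtools_prefix = "")
##' }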
##' 
##' @export
filter_bam<-function(BSJ_gr,sample_table,samtools_prefix){
  #gn_circ<-gene_ranges[gene_ranges%over%BSJ_gr]
  #unannot<-BSJ_gr[BSJ_gr%outside%gene_ranges]
  #filter_ranges<-c(gn_circ,unannot)
  bed_file<-paste0(bam_file_prefix,"/BSJ.bed")
  #export.bed(con = bed_file,object =  filter_ranges)
  rtracklayer::export.bed(con = bed_file,object =  BSJ_gr)
  #now we need to use the bed file to filter the bam files; i suggest doing it directly in the command line 
  list_file<-paste0(bam_file_prefix,"list")
  write(sample_table$sample_name,list_file)
  print("Files are trimming")
  trim_command<-paste(paste0(samtools_prefix,"samtools view -b -L ",bed_file," ", sample_table$file_bam," > ",bam_file_prefix,"/",sample_table$sample_name,"_sorted_trimmed.bam"), collapse = "; ")
  system(trim_command)
  print("Files are indexing")
  index_command<-paste(paste0(samtools_prefix,"samtools index ",bam_file_prefix,"/",sample_table$sample_name,"_sorted_trimmed.bam"), collapse = "; ")
  system(index_command)
  trimmed_bams<-BamFileList(paste0(bam_file_prefix,"/",sample_table$sample_name,"_sorted_trimmed.bam"), asMates = F)
  trimmed_bams
}
##' A wrapper function for Rsubread
##'
##' This function performs requantification of the exon bins with specifically selected parameters
##' 
##' @title Re-count of the reads per exon bin
##' @param full_sg a \code{SGSeq} object of exon bins
##' @param sample_table sample table formatted according to the manual,
##'   Must contain \dQuote{sample_name} \dQuote{treatment} \dQuote{file_bam} \dQuote{lib_size} 
##'   \dQuote{read_len}; NB the values in column \dQuote{treatment} can only be \dQuote{control} and 
##'   \dQuote{enriched}
##' @param paired_end a logical indicating whether the reads are paired-end
##' @return count matrix with the number of reads per exon bin
##' @keywords counting
##' @author Stefan Stefanov
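##' @examples
##' ## Sketch assuming `full_sg` is the disjoint exon-bin object returned by
##' ## overlap_SG_BSJ().
##' \dontrun{
##' full_fc <- recount_features(full_sg = full_sg,
##'                             sample_table = sample_table,
##'                             paired_end = TRUE)
##' }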
##' 
##' @export
recount_features<-function(full_sg,sample_table,paired_end=T){
  feature_info<-as.data.frame(full_sg,stringsAsFactors = F)
  # we make a feature reference for counting
  saf<-feature_info[feature_info$type=="E",c("featureID","seqnames","start","end","strand")]
  colnames(saf)<- c("GeneID",		"Chr",	"Start",	"End",	"Strand")
  #saf$GeneID<-c(1,2,3,4)
  saf.counts<-Rsubread::featureCounts(sample_table$file_bam, annot.ext=saf, isPairedEnd=paired_end,  countChimericFragments= T, 
                                      countMultiMappingReads =T ,juncCounts=T,countReadPairs=F,requireBothEndsMapped=F,allowMultiOverlap=T)
  saf.fc<-saf.counts$counts
  saf.fc
}
##' A wrapper function for BSgenome sequence extraction
##'
##' Extracts sequences based on an \code{SGSeq} object and a \code{BSgenome} object
##' 
##' @title Extract sequence per exon bin
##' @param full_sg a \code{SGSeq} object of exon bins
##' @param bs_genome a \code{BSgenome} object
##' @return character vector of sequences 
##' @keywords seqs
##' @author Stefan Stefanov
##' 
##' @export
get_seqs<-function (full_sg, bs_genome) {   
  temp_seqs<-gsub("chr","",full_sg@seqnames)
  if(grepl("chr",bs_genome@seqinfo@seqnames[1])){
    seqs <- unname(BSgenome::getSeq(bs_genome, paste0("chr",temp_seqs),
                                    start = start(full_sg), end = end(full_sg), 
                                    strand = strand(full_sg), as.character = T))
  }else{
    seqs <- unname(BSgenome::getSeq(bs_genome, temp_seqs, 
                                    start = start(full_sg), end = end(full_sg), 
                                    strand = strand(full_sg), as.character = T))  
  }
  seqs
}
get_seqs_corrected<-function(full_sg, bs_genome){
  seqs<-get_seqs(full_sg,bs_genome)
  short_ranges<-full_sg[width(full_sg)<35]
  long_ranges<-full_sg[!width(full_sg)<35]
  pair_ranges<-long_ranges[nearest(short_ranges,long_ranges)]
  corrected.seqs<-paste0(get_seqs(short_ranges,bs_genome),get_seqs(pair_ranges,bs_genome))
  seqs[width(full_sg)<35]<-corrected.seqs
  seqs
}
get_seqs_positive<-function(full_sg, bs_genome){
  strand(full_sg)<-"+"
  get_seqs(full_sg,bs_genome)
}
#this one needs work to add the true/false if-s and test it; also figure out how to save the model 
##' RPKM calculation for the genomic features
##'
##' This function performs RPKM calculations for the exonic features. The RPKM calculation is
##'  performed based on the exact sequences for the exons. For junctions, the sequences are selected based 
##'  on the exons, flanking the junction. The function takes into account the needed effective 
##'  length corrections.
##' 
##' @title RPKM calculation for the genomic features 
##' @param count_matrix count matrix corresponding to the features 
##' @param sg \code{SGSeq} object supplying feature info
##' @param bsj_granges \code{GRanges} object of BSJ coordinates
##' @param bs_genome a \code{BSGenome} object used for extracting the sequences 
##' @param sample_table sample table formatted according to the manual,
##'   Must contain \dQuote{sample_name} \dQuote{treatment} \dQuote{file_bam} \dQuote{lib_size} 
##'   \dQuote{read_len}; NB the values in column \dQuote{treatment} can only be \dQuote{control} and 
##'   \dQuote{enriched}
##' @param feature_type either \dQuote{e} for exons or \dQuote{j} for junctions
##' @param fsj_overhang the FSJ overhang used in the mapping, a.k.a. anchor
##' @param bsj_overhang the BSJ overhang used in the chimeric detection
##' @param eff_length_correction whether or not to apply effective length correction
##' @param gc_correction whether or not to apply GC-content correction; requires further testing
##' @return matrix of scaled (RPKM-like) counts per feature
##' @keywords RPKM
##' @author Stefan Stefanov
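##' @examples
##' ## Sketch assuming a BSgenome object (e.g. `Hsapiens`) is attached and the
##' ## exon counts come from recount_features().
##' \dontrun{
##' exon_rpkm <- RPKM_calc(count_matrix = full_fc, sg = full_sg,
##'                        bsj_granges = BSJ_gr, bs_genome = Hsapiens,
##'                        sample_table = sample_table, feature_type = "e",
##'                        fsj_overhang = 3, bsj_overhang = 15)
##' }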
##' 
##' @export
RPKM_calc<-function(count_matrix, sg ,bsj_granges, bs_genome, sample_table,feature_type, fsj_overhang=3, 
                    bsj_overhang=15, eff_length_correction=T, gc_correction=F){
  bsj_overhang_adj=bsj_overhang-fsj_overhang
  eff_lengths<- width(sg)
  #the effective length is capped at the read length, because if something maps, 
  #it covers at least one read length 
  eff_length_limit<-max(sample_table$read_len[sample_table$treatment=="enriched"])
  eff_lengths[eff_lengths<eff_length_limit]<-eff_length_limit
  if(feature_type=="j") {eff_lengths<-rep(1,length(sg))}
  if(feature_type=="e") {
    if(eff_length_correction){
      #adjustment
      #min required eff. length adjustment is based on the overhang and then we build up upon that 
      #FSJ fix 
      eff_lengths<-eff_lengths-(2*fsj_overhang)
      #BSJ fix
      eff_lengths[start(sg)%in%start(bsj_granges)]<-eff_lengths[start(sg)%in%start(bsj_granges)]-bsj_overhang_adj
      eff_lengths[end(sg)%in%end(bsj_granges)]<-eff_lengths[end(sg)%in%end(bsj_granges)]-bsj_overhang_adj
      
    }
  }  
  ############################# GC-content adjustment 
  #calculating GC content
  #seqs<-unname(BSgenome::getSeq(bs_genome, paste0("chr",full_sg@seqnames), start = start(full_sg), end = end(full_sg), strand=strand(full_sg),as.character=T))
  #fix that later
  #full_gr<-GRanges(full_sg)
  #seqlevelsStyle(full_gr) <- "UCSC"
  #seqs<-BSgenome::getSeq(bs_genome, full_gr)
  if(gc_correction){
    seqs<-get_seqs_corrected(sg,bs_genome)
    GC_content <- (str_count(seqs, "G") + str_count(seqs, "C")) / str_length(seqs) * 100 
    #again a rough fix; the GC content is capped within this range; the bounds need further testing 
    GC_content[GC_content<35]<-35
    GC_content[GC_content>75]<-75
    # adjusting scaled counts for CG content
    #this section needs to be reworked; maybe save the model as .rda in the package folder 
    data("loessfit7")
    fit.df<-as.data.frame(loessfit7)
    loess_mod <- loess(y~x, data = fit.df, control=loess.control(surface="direct"), span=0.3)
    shifts<-predict(loess_mod, GC_content/100)
    shifts<-2^shifts
    eff_lengths_gc<-eff_lengths*shifts
  }
  else{eff_lengths_gc<-eff_lengths}
  #here is with shifts based on the log2
  #shifts <- predict(loess_mod,count_per_feature_exons_circ$GC_content/100)
  colnames(count_matrix)<-sample_table$sample_name
  full_fc_adj<-sweep(count_matrix,2,sample_table$lib_size/10^9, FUN="/")
  full_fc_adj<-sweep(full_fc_adj,1,eff_lengths_gc, FUN="/")
  #full_fc_adj<-sweep(full_fc_adj,1,shifts, FUN="/")
  full_fc_adj<-apply (full_fc_adj, c (1, 2), function (x) {
    (as.integer(x))
  })
  full_fc_adj
}
##' CircRNA feature selection 
##'
##' This function works in two ways: a direct comparison of average quantities or as a wrapper around DEXSeq.
##' For datasets with replicates, the suggested approach is the DEXSeq statistical test.
##' 
##' @title CircRNA feature selection 
##' @param circ_fc_adj count matrix corresponding to the circRNA features 
##' @param circ_sg \code{SGSeq} object supplying feature info
##' @param sample_table sample table formatted according to the manual,
##'   Must contain \dQuote{sample_name} \dQuote{treatment} \dQuote{file_bam} \dQuote{lib_size} 
##'   \dQuote{read_len}; NB the values in column \dQuote{treatment} can only be \dQuote{control} and 
##'   \dQuote{enriched}
##' @param test either \dQuote{DEX} for DEXSeq-based feature selection or \dQuote{comparison} for a simple
##'   comparison of averages
##' @return vector of featureID
##' @keywords depleted
##' @author Stefan Stefanov
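##' @examples
##' ## Sketch assuming `circ_rpkm` and `circ_sg` were prepared as in
##' ## prep_circular_sg(); with fewer than four samples the test falls back
##' ## to the simple comparison automatically.
##' \dontrun{
##' depleted <- find_depleted_features(circ_fc_adj = circ_rpkm,
##'                                    sample_table = sample_table,
##'                                    circ_sg = circ_sg, test = "DEX")
##' }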
##' 
##' @export
find_depleted_features<-function(circ_fc_adj,sample_table,circ_sg, test="DEX"){#may need to add feature type
  if(length(sample_table$sample_name)<4) {test="comparison" 
  print("test strategy has been switched to comparison")}
  if(test=="comparison"){
    cdf<-as.data.frame(circ_fc_adj)
    cdf$meanc<-rowMeans(as.data.frame(cdf[,c(sample_table$sample_name[sample_table$treatment=="control"])]))
    cdf$meanRR<-rowMeans(as.data.frame(cdf[,c(sample_table$sample_name[sample_table$treatment=="enriched"])]))
    #keep only the features depleted in the enriched samples and return their feature IDs
    cdf<-cdf[cdf$meanc>cdf$meanRR,]
    rownames(cdf)
  }
  else{
    feature_dex<- DEXSeqDataSet(circ_fc_adj, sample_table, 
                                design= ~ sample + exon + treatment:exon, 
                                as.character(circ_sg@featureID),  as.character(circ_sg@geneID), 
                                featureRanges=GRanges(circ_sg), 
                                transcripts=circ_sg@txName, 
                                alternativeCountData=NULL)
    sizeFactors(feature_dex)<-1
    #if(feature_type=="e") {sizeFactors(feature_dex)<-1}
    #if(feature_type=="j") {feature_dex = estimateSizeFactors(feature_dex)}
    feature_dex = estimateDispersions( feature_dex, fitType="local", quiet=T) # due to the small number of features; we use the local fit type 
    feature_dex = testForDEU( feature_dex)
    feature_dex = estimateExonFoldChanges( feature_dex,fitExpToVar="treatment")
    feature_dex_res = as.data.frame(DEXSeqResults( feature_dex ))
    #filtering of the enriched features
    feature_dex_res_depleted<-feature_dex_res$featureID[feature_dex_res$log2fold_enriched_control<(-1)&feature_dex_res$padj<=0.005]
    #feature_dex_res_depleted<-feature_dex_res$featureID[feature_dex_res$log2fold_enriched_control<(-0.5)&feature_dex_res$padj<=0.05]
    #feature_dex_res_depleted_extra<-feature_dex_res$featureID[feature_dex_res$log2fold_enriched_control<(-1)]
    #feature_dex<-union(feature_dex_res_depleted,feature_dex_res_depleted_extra)
    feature_dex_res_depleted<-feature_dex_res_depleted[!is.na(feature_dex_res_depleted)]
    feature_dex_res_depleted #exports the feature id; not to be confused with the index 
  }
}
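#internal helper: turns per-sample transcript feature strings into the flat CYCLeR results table
#NB: relies on `bs_genome` being available in the calling environment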
prep_output<-function(transcript_features,circ_exons){
  transcript_features<-rev(transcript_features)
  transcript_features<-transcript_features[!transcript_features==""]
  csp.results.col<-c("circID","chr","start","end","strand","gene","exon_starts","exon_ends","seq")
  csp.results<-data.frame(matrix(0,nrow=length(transcript_features),ncol=length(csp.results.col)))
  colnames(csp.results)<-csp.results.col
  seqs<-get_seqs_positive(circ_exons,bs_genome = bs_genome)
  for(i in 1:length(transcript_features)){
    #str_split(transcript_features[i],"_",simplify = T)
    temp.features<-unlist(strsplit(transcript_features[i],"_"))
    temp.feature.table<-as.data.frame(circ_exons[featureID(circ_exons)%in%temp.features])
    #the reasonable output part
    csp.results[i,2]<-temp.feature.table$seqnames[1]
    csp.results[i,3]<-temp.feature.table$start[1]
    csp.results[i,4]<-tail(temp.feature.table$end, n=1)
    csp.results[i,5]<-as.character(temp.feature.table$strand[1])
    csp.results[i,6]<-ifelse(length(unlist(temp.feature.table$geneName[1]))==0,temp.feature.table$geneID[1],unlist(temp.feature.table$geneName[1]))
    csp.results[i,7]<-paste(temp.feature.table$start,collapse = ",")
    csp.results[i,8]<-paste(temp.feature.table$end,collapse = ",")
    csp.results[i,1]<-paste(c(i,csp.results[i,2],csp.results[i,3],csp.results[i,4]),collapse ="_")
    csp.results[i,9]<-paste(seqs[featureID(circ_exons)%in%temp.features],collapse = "")
    if(temp.feature.table$strand[1]=="-"){ csp.results[i,9]<-as.character(reverseComplement(DNAString(csp.results[i,9])))}
  }
  csp.results$chr<-circ_exons@seqnames@values[csp.results$chr]
  csp.results
}
##' Creation of GTF based on CYCLeR results 
##'
##' This function takes the CYCLeR transcript assembly results and converts them into a GTF-like table.
##' 
##' @title Creation of GTF based on CYCLeR results 
##' @param qics_out CYCLeR \code{data.frame} of intermediate results
##' @param sgfc \code{SummarizedExperiment} object with info on the circular splice graph features
##' @param annot_list a named vector/list mapping gene IDs to gene names (e.g. derived from an OrgDb annotation package); soon to be expanded 
##' @return GTF-like table
##' @keywords GTF
##' @author Stefan Stefanov
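##' @examples
##' ## Sketch assuming `qics_merged` is a merged assembly from merge_qics() and
##' ## `sgfc_circ` the object returned by prep_circular_sg(); the output file
##' ## name is a placeholder.
##' \dontrun{
##' gtf_table <- prep_output_gtf(qics_out = qics_merged, sgfc = sgfc_circ)
##' write.table(gtf_table, "cycler_transcripts.gtf", sep = "\t",
##'             quote = FALSE, row.names = FALSE, col.names = FALSE)
##' }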
##' 
##' @export
prep_output_gtf<-function(qics_out,sgfc,annot_list=NULL){
  #check if annotation is given 
  if(is.null(annot_list)){
    mapped_ids<-unique(qics_out$gene)
    names(mapped_ids)<-mapped_ids
  }else{mapped_ids<-annot_list}
  csp.results<-qics_out
  circ_exons<-rowRanges(sgfc)
  circ_exons<-circ_exons[circ_exons@type=="E"]
  
  #the unreasonable, weird gtf part
  temp.gene.id<-""
  gtf.results.col<-c("seqname","source","feature","start","end","score","strand","frame","attribute")
  gtf.results<-data.frame(matrix(ncol=length(gtf.results.col)))
  colnames(gtf.results)<-gtf.results.col
  for(i in 1:length(csp.results$circID)){
    #a<-lapply(1:length(csp.results$circID),function(i){
    #gene
    geneid<-geneID(circ_exons)[start(circ_exons)==csp.results[i,3]]
    temp.gene.name<-ifelse(is.na(mapped_ids[csp.results[i,6]]),paste0("NA_",geneid),mapped_ids[csp.results[i,6]])
    if(temp.gene.id!=geneid){
      temp.gene.id<-csp.results[i,1]
      temp.gtf.gene<-data.frame(matrix(0,nrow=1,ncol=length(gtf.results.col)))
      colnames(temp.gtf.gene)<-gtf.results.col
      temp.gtf.gene$seqname<-csp.results[i,2]
      temp.gtf.gene$feature<-"gene"
      temp.gtf.gene$start<-min(start(circ_exons[geneID(circ_exons)==geneid]))
      temp.gtf.gene$end<-max(end(circ_exons[geneID(circ_exons)==geneid]))
      temp.gtf.gene$strand<-csp.results[i,5]
      temp.gtf.gene$attribute<-paste0('gene_id "circ',csp.results[i,6],'"; gene_name "',temp.gene.name,'"; gene_source "CSP"; gene_biotype "circRNA";')
      gtf.results<-rbind(gtf.results,temp.gtf.gene)   
    } 
    #transcript
    temp.gtf.trascript<-data.frame(matrix(0,nrow=1,ncol=length(gtf.results.col)))
    colnames(temp.gtf.trascript)<-gtf.results.col
    temp.gtf.trascript$feature<-"transcript"
    temp.gtf.trascript$seqname<-csp.results[i,2]
    temp.gtf.trascript$start<-min(str_split(csp.results[i,7],",",simplify =T))
    temp.gtf.trascript$end<-max(str_split(csp.results[i,8],",",simplify =T))
    temp.gtf.trascript$strand<-csp.results[i,5]
    temp.gtf.trascript$attribute<-paste0('gene_id "circ',csp.results[i,6],'"; transcript_id "',csp.results[i,1],'"; gene_name "',temp.gene.name,'"; gene_source "CSP"; gene_biotype "circRNA"; transcript_name "',csp.results[i,1],'"; transcript_source "CSP"; transcript_biotype "circRNA";')
    gtf.results<-rbind(gtf.results,temp.gtf.trascript)
    #exons
    temp.gtf.exons<-data.frame(matrix(0,nrow=length(str_split(csp.results[i,7],",",simplify =F)[[1]]),ncol=length(gtf.results.col)))
    colnames(temp.gtf.exons)<-gtf.results.col
    temp.gtf.exons$feature<-"exon"
    temp.gtf.exons$seqname<-csp.results[i,2]
    temp.gtf.exons$start<-str_split(csp.results[i,7],",",simplify =F)[[1]]
    temp.gtf.exons$end<-str_split(csp.results[i,8],",",simplify =F)[[1]]
    temp.gtf.exons$strand<-csp.results[i,5]
    if(temp.gtf.exons$strand[1]=="-") {temp.gtf.exons<-arrange(temp.gtf.exons, rev(rownames(temp.gtf.exons)))}
    temp.gtf.exons$attribute<-paste0('gene_id "circ',csp.results[i,6],'"; transcript_id "',csp.results[i,1],'"; exon_number "',seq(1:length(str_split(csp.results[i,7],",",simplify =F)[[1]])),'"; gene_name "',temp.gene.name,'"; gene_source "CSP"; gene_biotype "circRNA"; transcript_name "',csp.results[i,1],'"; transcript_source "CSP"; transcript_biotype "circRNA"; exon_id "',csp.results[i,1],'-E',seq(1:length(str_split(csp.results[i,7],",",simplify =F)[[1]])),'"')
    gtf.results<-rbind(gtf.results,temp.gtf.exons)
  }
  gtf.results<-gtf.results[-1,]
  gtf.results$source<-"QICS"
  gtf.results$score<-"."
  gtf.results$frame<-"."  
  gtf.results
}

assemble_transcripts_per_sample<-function(circ_exons,circ_exons_counts,circ_junc,circ_junc_counts,BSJ_gr,i){
  
  #the junction counts do not scale well to the exon counts; so the rpkm of the junctions is scaled to the rpkm of the exons; we use median ratios scaling 
  #first we calculate a pseudo-count for a junction out of the 2 surrounding exons
  pseudo_ref<-(circ_exons_counts[match(start(circ_junc),end(circ_exons)),i]+circ_exons_counts[match(end(circ_junc),start(circ_exons)),i])/2
  #there are NAs introduced from junctions connecting depleted exons; we can remove them or set them to 0; setting them to 0 would shift the median 
  #pseudo_ref<-pseudo_ref[complete.cases(pseudo_ref)]
  ratios<-circ_junc_counts[,i]/pseudo_ref
  ratios<-ratios[complete.cases(ratios)]
  ratios<-ratios[ratios>1]
  #ratios[is.infinite(ratios)]<-0
  #now we need to calculate the size factors 
  size_factor<-median(ratios)
  #we apply the size factor 
  circ_junc_counts[,i]<-round(circ_junc_counts[,i]/size_factor)
  #print(circ_junc_counts[1,i])
  #########################################
  #start reconstruction here
  #we need to create the splice graph 
  transcript_features<-c()
  #it is OK to use for loops here since there should not be more than 10000 iterations 
  for(j in unique(circ_exons@geneID)){
    #j=11 #for now
    #print(j)
    sg_exons_df<-cbind(as.data.frame(circ_exons[circ_exons@geneID==j]), circ_exons_counts[circ_exons@geneID==j,i])
    colnames(sg_exons_df)[13]<-"counts"
    sg_junc_df<-cbind(as.data.frame(circ_junc[circ_junc@geneID==j],row.names = NULL), circ_junc_counts[circ_junc@geneID==j,i])
    colnames(sg_junc_df)[13]<-"counts"
    #this is needed for the removal of the linear residual counts
    ###avg_weight_exons_lin<-round(mean(lin_rpkm[lin_sg@geneID==j,i]))
    #for now avg_weight_exons_lin=0 #for now 
    #this is needed for the calculation of the threshold to cut an exon together with another 
    avg_weight_exons_circ<-mean(sg_exons_df$counts)
    ###min_weight_exon_circ<-min(sg_exons_df$counts)
    #getting the retained introns 
    #sg_exons_ri<-sg_exons_df[sg_exons_df$start%in%(sg_exons_df$end+1),]
    #sg_exons_ri<-sg_exons_ri[sg_exons_ri$end%in%(sg_exons_ri$start-1),]
    #feature_id_ri<-sg_exons_ri$featureID
    #feature_id_ri<-""
    #sg_exons_df$counts[!sg_exons_df$featureID%in%feature_id_ri]<-(sg_exons_df$counts-avg_weight_exons_lin)[!sg_exons_df$featureID%in%feature_id_ri]
    #creation of the graph
    sg_exons_df<-sg_exons_df[,c("featureID","start","end","counts")]
    sg_junc_df<-sg_junc_df[,c("featureID","start","end","counts")]
    #sg_junc_df$counts[sg_junc_df$counts<min(sg_exons_df$counts)]<-round(min(sg_exons_df$counts)+0.1*avg_weight_exons_circ)
    #sg_exons_df$counts[sg_exons_df$featureID%in%edge_features]<-round(min(sg_exons_df$counts)+0.1*avg_weight_exons_circ)
    
    #sg_junc_df$counts<-rep(23031,length(sg_junc_df$counts))
    sg_df<-rbind(sg_exons_df,sg_junc_df)
    #adjusting the counts; avoiding retained introns 
    ###sg_df$counts[!sg_df$featureID%in%feature_id_ri]<-(sg_df$counts-avg_weight_exons_lin)[!sg_df$featureID%in%feature_id_ri]
    ###sg_df$counts[sg_df$counts<min_weight_exon_circ]<-min_weight_exon_circ
    sg_df2 <- sg_df[order(sg_df$start),]
    
    #----------------------------------
    # fix of retained introns and 5'/3' splicing
    # we make a data frame of junctions to connect the hanging pieces in the graph 
    fix_df<-NULL
    fix_ends<-sg_exons_df[sg_exons_df$end%in%sg_exons_df$start-1,"start"]
    if(length(fix_ends)>0){
      fix_df<-data.frame(fix_ends-1,fix_ends,rep(max(sg_df2$counts),length(fix_ends)))
      colnames(fix_df)<-c("start","end","weight")
      rownames(fix_df)<-paste0(seq(max(sg_df2$featureID),max(sg_df2$featureID)+length(fix_ends)-1),".1")
    }
    #-----------------------------
    
    rownames(sg_df2)<-sg_df2$featureID
    
    sg_df2<-sg_df2[,c("start","end","counts")]
    colnames(sg_df2)<-c("start","end","weight")
    sg_df2<-rbind(sg_df2,fix_df)
    sg_df2 <- sg_df2[order(sg_df2$start),] 
    #sg_df2<-sg_df2[sg_df2$weight>0,]
    sg<-graph.data.frame(sg_df2)
    #initiation of the graph reconstruction supporting parameters
    edge_features<-union(circ_exons@featureID[start(circ_exons)%in%start(BSJ_gr)],circ_exons@featureID[end(circ_exons)%in%end(BSJ_gr)])
    curr_edge_exons<-intersect(edge_features,rownames(sg_df2))
    #curr_circ<-as.data.frame(BSJ_sg[BSJ_sg@geneID==j])
    #curr_circ<-unname(curr_circ)
    curr_circ<-as.data.frame(table_circ[table_circ$start%in%sg_exons_df$start&table_circ$end%in%sg_exons_df$end,])
    if(length(curr_circ$start)==0){next}
    curr_circ$used<-0
    tracking_circ<-curr_circ
    #transcript_features<-c()
    min.feature<-min(sg_df2$weight)
    #starts here 
    repeat{
      sg<-graph.data.frame(sg_df2)
      #print(sg_df2)
      #plot(sg, layout=layout.circle, edge.label=paste0(rownames(sg_df2),": ",sg_df2$weight ),edge.label.cex=0.8)
      #check if the circles can be reconstructed, based on availability of edge exons; save the circles that can use the exons as current 
      curr_circ<-curr_circ[curr_circ$end%in%sg_df2$end,]
      curr_circ<-curr_circ[curr_circ$start%in%sg_df2$start,]
      #the lowest coverage exon
      curr_sg_exons<-sg_df2[as.character(intersect(rownames(sg_df2),sg_exons_df$featureID)),]
      #sg_df2_plot<-sg_df2[rownames(curr_sg_exons),]
      #sg_df2_plot[rownames(sg_df2_plot)%in%rownames(sg_df2),3]<-sg_df2[rownames(sg_df2)%in%rownames(sg_df2_plot),3]
      #sg_df2_plot[!rownames(sg_df2_plot)%in%rownames(sg_df2),3]<-0
      #barplot(sg_df2_plot$weight, ylim=c(0,300000),col="darkblue", names.arg = rownames(sg_df2_plot))
      min_exon<-curr_sg_exons[curr_sg_exons$weight==min(curr_sg_exons$weight),][1,]
      min_exon_weight<-min_exon$weight
      #the circles containing that exon 
      work_circ<-curr_circ[curr_circ$start<=min_exon$start&curr_circ$end>=min_exon$end,]
      #quick check if the circle hasn't been depleted 
      if (length(work_circ[,1])==0) {
        sg_df2<-sg_df2[rownames(sg_df2)!=rownames(min_exon),]
      }else{
        #the minimum quantity circle containing that exon 
        work_circ<-work_circ[work_circ$count==min(work_circ$count),]
        work_circ<-work_circ[1,] #just in case there are circles with the same number of BSJ reads
        #in case of very low levels of a circle, it can get depleted before the actual use of the BSJ; this is a rough fix for that case; it continues at the end of the loop 
        #increasing a tracker counter when a circle is used 
        tracking_circ[tracking_circ$chr==work_circ$chr&tracking_circ$start==work_circ$start&tracking_circ$end==work_circ$end,"used"]<-tracking_circ[tracking_circ$chr==work_circ$chr&tracking_circ$start==work_circ$start&tracking_circ$end==work_circ$end,"used"]+1
        curr_exons<-sg_exons_df[sg_exons_df$start>=work_circ$start&sg_exons_df$end<=work_circ$end,"featureID"]
        edge_exons<-c(head(curr_exons,n=1),tail(curr_exons,n=1))
        if (length(curr_exons)<3) {
          transcript_features<-c(paste(curr_exons,collapse = "_"),transcript_features)
          sg_df2$weight[rownames(sg_df2)%in%curr_exons]<-(sg_df2$weight-as.numeric(min_exon_weight))[rownames(sg_df2)%in%curr_exons]
        }else{  
          #max flow through the min exon; separated into 2 max flows 
          a<-max_flow(sg,as.character(work_circ[1,2]),as.character(min_exon[1,1]),capacity = sg_df2$weight)
          b<-max_flow(sg,as.character(min_exon[1,2]),as.character(work_circ[1,3]),capacity = sg_df2$weight)
          #print(rownames(sg_df2)[as.logical(a$flow)])
          #print(rownames(min_exon))
          #print(rownames(sg_df2)[as.logical(b$flow)])
          #if the path is impossible, reverse the increase in the tracker 
          if((all(b$flow==0)|all(a$flow==0))&!is.element(rownames(min_exon),edge_exons)) {
            tracking_circ[tracking_circ$chr==work_circ$chr&tracking_circ$start==work_circ$start&tracking_circ$end==work_circ$end,"used"]<-tracking_circ[tracking_circ$chr==work_circ$chr&tracking_circ$start==work_circ$start&tracking_circ$end==work_circ$end,"used"]-1
            sg_df2<-sg_df2[rownames(sg_df2)!=rownames(min_exon),]
            next
          }
          curr_flow<-c(rownames(sg_df2)[as.logical(a$flow)],rownames(min_exon),rownames(sg_df2)[as.logical(b$flow)])
          
          # the list of reconstructed transcripts as feature IDs
          #the if is a sanity check for lack of edge exons; if the path is impossible the length will be less than 3 
          if(length(setdiff(curr_flow,rownames(fix_df)))>=3) {transcript_features<-c(paste(intersect(curr_flow,sg_exons_df$featureID),collapse = "_"),transcript_features)}
          
          #depletion of the coverage 
          sg_df2<-sg_df2[rownames(sg_df2)!=rownames(min_exon),]
          #sg_df2$weight[rownames(sg_df2)%in%curr_exons]<-(sg_df2$weight-as.numeric(min_exon_weight))[rownames(sg_df2)%in%curr_exons]
          #this switches on and off the depletion of the junctions
          #curr_flow<-intersect(curr_flow,curr_exons)
          sg_df2$weight[rownames(sg_df2)%in%curr_flow]<-(sg_df2$weight-as.numeric(min_exon_weight))[rownames(sg_df2)%in%curr_flow]
          #sg_exons_df<-sg_exons_df[sg_exons_df$weight>0.2*avg_weight_exons_circ]
          #sg_df2<-sg_df2[sg_df2$weight>0.05*avg_weight_exons_circ,]
        }
      }
      #print(transcript_features)
      sg_df2<-sg_df2[sg_df2$weight>0.001*avg_weight_exons_circ,]
      #sg_df2<-sg_df2[sg_df2$weight>0.05*min.feature,]
      #sg_df2$weight[sg_df2$weight>0.1*avg_weight_exons_circ&rownames(sg_df2)%in%curr_flow]<-0
      #sg_df2<-sg_df2[sg_df2$weight>0,]
      #print(sg_df2)
      if (length(intersect(rownames(sg_df2),curr_edge_exons))==0) {break}
    }
    # here is the continuation of the fix from above  for the prematurely depleted circles 
    tracking_circ<-tracking_circ[tracking_circ$used==0,]
    if (length(tracking_circ$used)>0) {  
      #print("enter tracker")
      for(k in 1:length(tracking_circ$used)){
        work_circ<-tracking_circ[k,]
        curr_exons<-sg_exons_df[sg_exons_df$start>=work_circ$start&sg_exons_df$end<=work_circ$end,]
        #curr_exons<-setdiff(curr_exons,feature_id_ri)
        #curr_exons<-curr_exons[curr_exons$counts>0.002*avg_weight_exons_circ,"featureID"]
        curr_exons<-curr_exons[curr_exons$counts>0.05*avg_weight_exons_circ,"featureID"]
        transcript_features<-c(paste(curr_exons,collapse = "_"),transcript_features)
      }
    }
  }
  # just in case 
  transcript_features<-unique(transcript_features)
  transcript_features<-transcript_features[transcript_features!=""]
  transcript_features
}
##' Transcript assembly per sample 
##' Transcript assembly per sample based on sample name in the \dQuote{sample_table}
##' @title Transcript assembly
##' @param sgfc \code{SummarizedExperiment} object with info on the circular splice graph features
##' @param BSJ_gr  \code{GRange} object of BSJ coordinates
##' @param sample_name name of the sample as indicated in the sample table
##' @return \code{data.frame} of transcript information in flat format 
##' @keywords assembly
##' @author Stefan Stefanov
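##' @examples
##' ## Sketch assuming `sgfc_circ` comes from prep_circular_sg() and the sample
##' ## name matches one of the sample names in the sample table. Note that
##' ## `table_circ` and `bs_genome` are read from the calling environment.
##' \dontrun{
##' qics_sampleA <- transcripts_per_sample(sgfc = sgfc_circ,
##'                                        BSJ_gr = BSJ_gr,
##'                                        sample_name = "sampleA")
##' }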
##' 
##' @export
transcripts_per_sample<-function(sgfc,BSJ_gr,sample_name){
  temp_count_matrix<-assays(sgfc)$counts
  circ_junc<-rowRanges(sgfc)
  circ_junc<-circ_junc[circ_junc@type=="J"]
  circ_exons<-rowRanges(sgfc)
  circ_exons<-circ_exons[circ_exons@type=="E"]
  circ_exons_counts<-temp_count_matrix[1:length(circ_exons),]
  circ_junc_counts<-temp_count_matrix[length(circ_exons)+1:length(circ_junc),]
  transcript_features<-assemble_transcripts_per_sample(circ_exons=circ_exons,
                                                        circ_exons_counts=circ_exons_counts,
                                                        circ_junc=circ_junc,
                                                        circ_junc_counts=circ_junc_counts,
                                                        BSJ_gr=BSJ_gr,
                                                        i=sample_name)
  qics_out<-prep_output(transcript_features,circ_exons)
  qics_out
}
##' Pair-wise merging 2 assemblies
##' Pair-wise merging 2 assemblies
##' @title Merging 2 assemblies 
##' @param qics1 assembly 1
##' @param qics2 assembly 2
##' @param sgfc_pred \code{SGFC} from the SGSeq feature detection 
##' @return \code{data.frame} of transcript information in flat format 
##' @keywords assembly
##' @author Stefan Stefanov
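##' @examples
##' ## Sketch assuming two per-sample assemblies from transcripts_per_sample().
##' \dontrun{
##' qics_merged <- merge_qics(qics1 = qics_sampleA, qics2 = qics_sampleB,
##'                           sgfc_pred = sgfc_pred)
##' }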
##' 
##' @export
merge_qics<-function(qics1,qics2,sgfc_pred){
  qics_out_merged<-rbind(qics1,qics2)
  qics_out_merged$circID<-1
  qics_out_merged<-unique(qics_out_merged)
  b<-qics_out_merged[qics_out_merged$seq%in%qics_out_merged$seq[duplicated(qics_out_merged$seq)],]
  a<-qics_out_merged[!qics_out_merged$seq%in%qics_out_merged$seq[duplicated(qics_out_merged$seq)],]
  b<-b[order(b$exon_starts),]
  c<-b[!duplicated(b$seq),]
  qics_out_merged<-rbind(a,c)
  qics_out_merged<-qics_out_merged[order(qics_out_merged$gene),]
  #qics_out_merged$chr<-sgfc_pred@rowRanges@seqnames@values[qics_out_merged$chr]
  qics_out_merged$circID<-paste(1:length(qics_out_merged$circID),qics_out_merged$chr,qics_out_merged$start,qics_out_merged$end, sep = "_")
  qics_out_merged
}
##' Merging 2 DNAStringSets
##' Merging 2 DNAStringSets
##' @title Merging 2 \code{DNAStringSets} 
##' @param qics_fa \code{DNAStringSet} 1
##' @param known_fa \code{DNAStringSet} 2
##' @return merged \code{DNAStringSet} file to be used for quantification 
##' @keywords FASTA merge
##' @author Stefan Stefanov
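##' @examples
##' ## Sketch assuming two DNAStringSet objects; the output file name is a
##' ## placeholder.
##' \dontrun{
##' combined_fa <- merge_fasta(qics_fa = circ_seqs, known_fa = linear_seqs)
##' Biostrings::writeXStringSet(combined_fa, "transcripts_for_quant.fa")
##' }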
##' 
##' @export
merge_fasta<-function(qics_fa,known_fa){
  work_fa<-union(known_fa,qics_fa)
  work_fa
}
##' Preparing circular splice graph features
##' Preparing circular splice graph features
##' @title Preparing circular splice graph features
##' @param full_sg \code{SGFeatures} object supplying feature info
##' @param full_fc count matrix corresponding to the features
##' @param sgfc_pred \code{SGFC} from the SGSeq feature detection
##' @param bs_genome \code{BSGenome} object used for extracting the sequences
##' @param BSJ_gr  \code{GRange} object of BSJ coordinates
##' @param th minimum number of reads limiting exon selection
##' @return \code{SummarizedExperiment} object with info on the circular splice graph features
##' @keywords preparation splice graph
##' @author Stefan Stefanov
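##' @examples
##' ## Sketch assuming the upstream objects exist; note that `sample_table` is
##' ## read from the calling environment rather than passed as an argument.
##' \dontrun{
##' sgfc_circ <- prep_circular_sg(full_sg = full_sg, full_fc = full_fc,
##'                               sgfc_pred = sgfc_pred, bs_genome = Hsapiens,
##'                               BSJ_gr = BSJ_gr, th = 15)
##' }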
##' 
##' @export
prep_circular_sg <- function(full_sg, full_fc,sgfc_pred, bs_genome, BSJ_gr, th=15)
{
  
  #removing super low coverage features
  full_sg<-full_sg[rowSums(as.data.frame(full_fc[,sample_table$treatment=="enriched"]))>th]
  full_fc<-full_fc[rowSums(as.data.frame(full_fc[,sample_table$treatment=="enriched"]))>th,]
  circ_sg<-full_sg[full_sg%over%BSJ_gr] #includes features within BSJ enclosed region
  lin_sg<-full_sg[full_sg%outside%BSJ_gr] #includes features outside of BSJ enclosed region
  #annotate the BSJ with the corresponding geneIDs
  BSJ_sg<-make_BSJ_sg(circ_sg,BSJ_gr)
  #full_fc<-count_matrix[full_sg@featureID,]
  
  #RPKM calculation for exons
  #seqs<-get_seqs(full_sg,bs_genome)
  #bs_genome=Hsapiens
  #full_sg@seqnames<-Rle(paste0("chr",full_sg@seqnames))
  seqs<-get_seqs(full_sg,bs_genome)
  message("preparing exons")
  full_rpkm<-RPKM_calc(full_fc, full_sg, BSJ_gr, bs_genome=bs_genome , sample_table=sample_table, feature_type ="e", gc_correction = F)
  lin_rpkm<-full_rpkm[full_sg%outside%BSJ_gr,]
  #extracting circ specific counts
  circ_fc_adj<-full_rpkm[full_sg%over%BSJ_gr,]
  depleted_exons<-find_depleted_features(circ_fc_adj,sample_table,circ_sg)
  #making sure that the circ edge exons remain in the mix; they could be depleted in case of very low levels of the circle
  edge_features<-union(full_sg@featureID[start(full_sg)%in%start(BSJ_gr)],full_sg@featureID[end(full_sg)%in%end(BSJ_gr)])
  depleted_exons<-setdiff(depleted_exons,edge_features)
  circ_exons<-circ_sg[!circ_sg@featureID%in%depleted_exons]# the final set of circRNA exons
  circ_exons_counts<-circ_fc_adj[!circ_sg@featureID%in%depleted_exons,]
  #########################################################################
  #now for junctions
  #we need to normalize the junction read counts to the exon counts
  count_matrix<-as.data.frame(counts(sgfc_pred))
  count_matrix <- apply (count_matrix, c (1, 2), function (x) {(as.integer(x))})
  #row.names(count_matrix) <- 1:length(sgfc_pred)
  ############################################
  message("preparing junctions")
  sg_gr<-rowRanges(sgfc_pred)
  sg_gr_j<-sg_gr[sg_gr@type=="J"]
  #circ_sg_j<-sg_gr_j[sg_gr_j%over%BSJ_gr]
  circ_sg_j<-sg_gr_j[unique(queryHits(findOverlaps(sg_gr_j,BSJ_gr,type = "within")))]
  count_matrix_j<-count_matrix[circ_sg_j@featureID,]
  #get the relative sequences of around a junction
  seqs_j<-paste0(seqs[match(start(circ_sg_j),end(full_sg))],seqs[match(end(circ_sg_j),start(full_sg))])
  #calculate the scaled read count for junction
  junc_rpkm<-RPKM_calc(count_matrix=count_matrix_j, sg=circ_sg_j, bsj_granges = BSJ_gr, sample_table = sample_table, feature_type = "j")
  depleted_j<-find_depleted_features(junc_rpkm,sample_table,circ_sg_j)
  circ_junc<-circ_sg_j[!circ_sg_j@featureID%in%depleted_j]
  circ_junc_counts<-junc_rpkm[!circ_sg_j@featureID%in%depleted_j,]
  circ_junc_counts[circ_junc_counts==0]<-1
  colnames(circ_junc_counts)<-sample_table$sample_name
  
  
  test_sg<-c(circ_exons,circ_junc)
  test_counts<-rbind(circ_exons_counts,circ_junc_counts)
  row.names(test_counts)<-1:length(test_sg)
  
  colData <- DataFrame(sample_table$treatment, row.names = sample_table$sample_name)
  colnames(test_counts) <- sample_table$sample_name
  x <- SummarizedExperiment(
    assays = list(counts = test_counts),
    rowRanges = test_sg,
    colData = colData)
  return(x)
  
}