R/ClusteringFunctions.R
In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

Documented in TCdataframe2granges TCgranges2dataframe

#' @include CTSS.R

####################################################################################
# Implementation of simple distance-based clustering of data attached to sequences
#

#' @name distclu-functions
#' 
#' @title Private functions for distance clustering.
#' 
#' @param max.dist See [clusterCTSS()].
#' @param useMulticore,nrCores See clusterCTSS.
#' 
#' @description The flow of data is that a [CTSS] object of CTSSes is
#' progressively deconstructed, and data to form the clusters is progressively
#' integrated in a [data.table] object, which is finally converted to [GRanges]
#' at the end.  Doing the whole clustering with `GRanges` is more elegant, but
#' looping on a `GRangesList` was just too slow.  Maybe the operation on the
#' `data.table` is more efficient because it is vectorised.
#' 
#' @examples
#' # Get example data
#' library(IRanges)
#' library(GenomicRanges)
NULL

#' @name .cluster.ctss.strand
#' @rdname distclu-functions
#' 
#' @description `.cluster.ctss.strand` does the strandless distance clustering
#' of strandless CTSS positions from a single chromosome.  Input does not need
#' to be sorted, but _pay attention that the output is sorted_.
#' 
#' @param ctss.ipos.chr A IPos object.
#' 
#' @return `.cluster.ctss.strand` returns an [data.table] object containing
#' arbitrary cluster IDs (as integers) for each CTSS.
#'  
#' @importFrom data.table data.table
#' @importFrom IRanges IPos
#' @importFrom S4Vectors queryHits
#' @importFrom S4Vectors runLength
#' @importFrom S4Vectors runValue
#' @importFrom S4Vectors subjectHits
#' 
#' @examples
#' 
#' #.cluster.ctss.strand
#' ctss.ipos.chr <- IPos(c(1,3,4,12,14,25,28))
#' CAGEr:::.cluster.ctss.strand(ctss.ipos.chr, 5)
#' 
#' ctss.chr <- CTSScoordinatesGR(exampleCAGEexp)
#' ctss.chr <- ctss.chr[strand(ctss.chr) == "+"]
#' ctss.ipos.chr <- ranges(ctss.chr)
#' # Same result if not sorted
#' identical(
#'   CAGEr:::.cluster.ctss.strand(ctss.ipos.chr, 20),
#'   CAGEr:::.cluster.ctss.strand(ctss.ipos.chr[sample(seq_along(ctss.ipos.chr))], 20)
#' )
#' # Returns an emtpy data.table object if given an empty IRanges object.
#' identical(CAGEr:::.cluster.ctss.strand(IPos(), 20), data.table::data.table())

setGeneric(".cluster.ctss.strand", function(ctss.ipos.chr, max.dist) standardGeneric(".cluster.ctss.strand"))

setMethod(".cluster.ctss.strand", "IPos", function(ctss.ipos.chr, max.dist) {
  if (identical(ctss.ipos.chr, IPos())) return(data.table())
  v <- Rle(rep(TRUE, max(start(ctss.ipos.chr)) + max.dist))
  v[start(ctss.ipos.chr) + max.dist] <- FALSE
  longRuns <- which(runLength(v) >= max.dist & runValue(v))
  c.starts <- cumsum(runLength(v))[longRuns] - max.dist
  c.ends <- c(cumsum(runLength(v))[(longRuns - 1)[-1]], length(v)) - max.dist
  clusters <- IRanges(start = c.starts, end = c.ends)
  o <- findOverlaps(clusters, ctss.ipos.chr)
  data.table(id = queryHits(o))
})


#' @name .cluster.ctss.chr
#' @rdname distclu-functions
#' 
#' @description \code{.cluster.ctss.chr} does the stranded distance clustering of CTSS on a
#' single chromosome, by dispatching both strands to \code{.cluster.ctss.strand} and merging
#' the results, taking care keep the cluster IDs unique.  Be careful that this function does
#' not look at the score.
#' 
#' @param ctss.chr A CTSS.chr object.
#' 
#' @return \code{.cluster.ctss.chr} returns a \code{\link{data.table}} object representing the
#' chromosome coordinates (\code{chr}, \code{pos}, \code{strand}) of each CTSS, with their
#' cluster ID (\code{id}).
#' 
#' @importFrom data.table data.table
#' @importFrom S4Vectors decode
#' 
#' @examples 
#' 
#' #.cluster.ctss.chr
#' ctss.chr <- as(CTSScoordinatesGR(exampleCAGEexp), "CTSS.chr")
#' CAGEr:::.cluster.ctss.chr(ctss.chr, 20)

setGeneric(".cluster.ctss.chr", function(ctss.chr, max.dist) standardGeneric(".cluster.ctss.chr"))

setMethod(".cluster.ctss.chr", "CTSS.chr", function(ctss.chr, max.dist) {
  clusters.p <- .cluster.ctss.strand(ranges(ctss.chr[strand(ctss.chr) == "+"]), max.dist)
  clusters.m <- .cluster.ctss.strand(ranges(ctss.chr[strand(ctss.chr) == "-"]), max.dist)
  maxIdP <- if (identical(clusters.p, data.table())) 0 else max(clusters.p)
  clusters.m <- clusters.m + maxIdP
  data.table( rbind(clusters.p, clusters.m)
            , chr    = as.character(seqnames(ctss.chr))
            , pos    = start(ctss.chr)
            , strand = decode(strand(ctss.chr)))
})


#' @name .ctss2clusters
#' @rdname distclu-functions
#' 
#' @description \code{.ctss2clusters} does the stranded distance clustering of CTSS.
#' 
#' @param ctss A CTSS object with a score column.
#' 
#' @return \code{.ctss2clusters} returns a \code{\link{data.table}} object representing the
#' cluster ID (\code{id}), chromosome coordinates (\code{chr}, \code{pos}, \code{strand}) and
#' the \code{score} of each CTSS.
#' 
#' @importFrom data.table data.table
#' @importFrom S4Vectors decode
#' 
#' @examples 
#' 
#' # .ctss2clusters
#' ctss <- CTSScoordinatesGR(exampleCAGEexp)
#' score(ctss) <- CTSSnormalizedTpmDF(exampleCAGEexp)[[1]]
#' seqnames(ctss)[rep(c(TRUE,FALSE), length(ctss) / 2)] <- "chr16"
#' ctss
#' clusters <- CAGEr:::.ctss2clusters(ctss, 20)
#' clusters

setGeneric(".ctss2clusters", function(ctss, max.dist = 20, useMulticore = FALSE, nrCores = NULL) standardGeneric(".ctss2clusters"))

setMethod(".ctss2clusters", "CTSS", function(ctss, max.dist, useMulticore, nrCores) {
  ctss <- sort(ctss)
  ctss <- ctss[score(ctss) != 0]
  ctss.list <- split(ctss, droplevels(seqnames(ctss)))
  ctss.list <- lapply(ctss.list, as, "CTSS.chr")
  ctss.list <- bplapply( ctss.list, .cluster.ctss.chr, max.dist = max.dist
                       , BPPARAM = CAGEr_Multicore(useMulticore, nrCores))
	max.clust <- sapply(ctss.list, function(x) {max(x$id)})
	max.clust <- cumsum(c(0, max.clust[-length(max.clust)]))
	
  for (x in seq_along(max.clust))
    ctss.list[[x]]$id <- ctss.list[[x]]$id + max.clust[x]
	
	dt <- do.call(rbind, ctss.list)
	dt$tpm <- decode(score(ctss))
	dt
})


#' @name .summarize.clusters
#' @rdname distclu-functions
#' @description \code{.summarize.clusters} calculates the number of CTSS, and
#' the position and score of a main peak, for each cluster.
#' 
#' @param ctss.clustered A \code{\link{data.table}} object representing the cluster ID
#'        (\code{id}), chromosome coordinates (\code{chr}, \code{pos}, \code{strand}) and
#'        the \code{score} of each CTSS.
#' @param removeSingletons Remove \dQuote{singleton} clusters that span only a single
#'        nucleotide ? (default = FALSE).
#' @param keepSingletonsAbove Even if \code{removeSingletons = TRUE}, keep singletons when
#'        their score is aboove threshold (default = \code{Inf}).
#' 
#' @return \code{.summarize.clusters} returns GRanges describing the clusters.
#' 
#' @importFrom data.table setnames
#' 
#' @examples 
#' 
#' # .summarize.clusters
#' CAGEr:::.summarize.clusters(clusters)
#' CAGEr:::.summarize.clusters(clusters, removeSingletons = TRUE)
#' CAGEr:::.summarize.clusters(clusters, removeSingletons = TRUE, keepSingletonsAbove = 5)

setGeneric(".summarize.clusters", function(ctss.clustered, removeSingletons = FALSE, keepSingletonsAbove = Inf) standardGeneric(".summarize.clusters"))

setMethod(".summarize.clusters", "data.table", function(ctss.clustered, removeSingletons, keepSingletonsAbove) {

  find.dominant.idx <- function (x) {
    w <- which(x == max(x))
    w[ceiling(length(w)/2)]
  }
	
  chr <- pos <- tpm <- cluster <- id <- NULL  # To keep R CMD check happy.
  clusters <- ctss.clustered[ , list( chr[1]
                                    , min(pos)
                                    , max(pos)
                                    , strand[1]
                                    , length(pos)
                                    , pos[find.dominant.idx(tpm)]
                                    , sum(tpm)
                                    , max(tpm))
                              , by = id]
  setnames(clusters, c( "cluster"
                      , "chr", "start", "end", "strand"
                      , "nr_ctss", "dominant_ctss", "tpm", "tpm.dominant_ctss"))
  
	if(removeSingletons)
		clusters <- subset(clusters, clusters$nr_ctss > 1 | clusters$tpm >= keepSingletonsAbove)
  
  gr <- GRanges( seqnames = Rle(factor(clusters$chr))
               , ranges   = IRanges(clusters$start, clusters$end)
               , strand   = clusters$strand
               , score    = Rle(clusters$tpm)
               , nr_ctss  = clusters$nr_ctss
               , dominant_ctss = clusters$dominant_ctss
               , tpm.dominant_ctss = Rle(clusters$tpm.dominant_ctss)
  )
  names(gr) <- seq_along(gr)
  sort(gr)
})


#' @name .distclu
#' @rdname distclu-functions
#' @description  \code{.distclu} receives the data from the main \code{clusterCTSS} and
#' dispatches each for (possibly parallel) processing.
#' 
#' @param se A \code{\link{SummarizedExperiment}} object representing the CTSSes and
#'        their expression in each sample.
#'         
#' @return \code{.distclu} returns GRanges describing the clusters.
#' 
#' @importFrom GenomicRanges GRangesList
#' @importFrom SummarizedExperiment rowRanges
#' 
#' @examples 
#' 
#' # .distclu
#' CAGEr:::.distclu(CTSStagCountSE(exampleCAGEexp))
#' \dontrun{
#' CAGEr:::.distclu(CTSStagCountSE(exampleCAGEexp), useMulticore = TRUE)
#' }
#' 
#' CAGEr:::.distclu(CTSStagCountSE(exampleCAGEset))

setGeneric(".distclu", function(se, max.dist = 20, removeSingletons = FALSE, keepSingletonsAbove = Inf, useMulticore = FALSE, nrCores = NULL) standardGeneric(".distclu"))

setMethod(".distclu", "SummarizedExperiment", function(se, max.dist, removeSingletons, keepSingletonsAbove, useMulticore, nrCores) {
	
  ctss.cluster.list <- list()
  for(s in colnames(se)) {
    message("\t-> ", s)
    d <- as(rowRanges(se), "CTSS")
    score(d) <- assays(se)[["normalizedTpmMatrix"]][[s]]
    d <- subset(d, score(d) > 0)
    clusters <- .ctss2clusters(ctss = d, max.dist = max.dist, useMulticore = useMulticore, nrCores = nrCores)
    ctss.cluster.list[[s]] <- clusters
  }
  ctss.cluster.list <- bplapply( ctss.cluster.list, .summarize.clusters
                               , removeSingletons = removeSingletons
                               , keepSingletonsAbove = keepSingletonsAbove
                               , BPPARAM = CAGEr_Multicore(useMulticore, nrCores))
  GRangesList(ctss.cluster.list)
})


#################################################################################################################
# Implementation of Paraclu - parametric clustering of data attached to sequences (http://www.cbrc.jp/paraclu/) 
# Reference: A code for transcription initiation in mammalian genomes, 
# MC Frith, E Valen, A Krogh, Y Hayashizaki, P Carninci, A Sandelin, Genome Research 2008 18(1):1-12)
#

#' @importFrom utils tail

.paraclu1 <- function(ctss) {
	
	sit <- nrow(ctss)
	tot <- sum(ctss$tpm)
	if(sit == 1) {
		min_density <- Inf
		br <- NA
	}else{
		densities_forward <- cumsum(ctss$tpm)[-sit]/(ctss$pos[2:sit] - ctss$pos[1])
		densities_reverse <- cumsum(rev(ctss$tpm))[-sit]/(ctss$pos[sit] - ctss$pos[(sit-1):1])
		min_densities = c(min(densities_forward), min(densities_reverse))
		breaks <- c(which(densities_forward == min_densities[1])[1] + 1, sit + 1 - which(densities_reverse == min_densities[2])[1])
		min_density <- min(min_densities)
		br <- breaks[tail(which(min_densities == min_density),1)]
	}
	
	return(as.list(c(br, min_density, tot, sit)))
}


.paraclu2 <- function(ctss, min_density = -Inf, clusters.df = data.frame()) {
	
	if(nrow(ctss)>0){
		
		ctss <- ctss[order(ctss$pos),]
		params <- .paraclu1(ctss)
		br <- params[[1]]
		max_density <- params[[2]]
		tot <- params[[3]]
		sit <- params[[4]]
		
		if(!(max_density == Inf)){
			new_min <- max(min_density, max_density)
			clusters.df <- rbind(.paraclu2(ctss = ctss[1:(br-1),], min_density = new_min, clusters.df = clusters.df), .paraclu2(ctss = ctss[br:nrow(ctss),], min_density = new_min, clusters.df = clusters.df))
		}
		
		return(rbind(clusters.df, data.frame(chr = ctss$chr[1], start = min(ctss$pos), end = max(ctss$pos), strand = ctss$strand[1], nr_ctss = sit, dominant_ctss = ctss$pos[which(ctss$tpm == max(ctss$tpm))[ceiling(length(which(ctss$tpm == max(ctss$tpm)))/2)]], tpm = tot, tpm.dominant_ctss = ctss$tpm[which(ctss$tpm == max(ctss$tpm))[ceiling(length(which(ctss$tpm == max(ctss$tpm)))/2)]], min_d = min_density, max_d= max_density)))
		
	}else{
		return(clusters.df)
	}
	
}


.paraclu3 <- function(ctss.df, minStability = 1, maxLength = 500, removeSingletons = FALSE, keepSingletonsAbove = Inf, reduceToNonoverlapping = TRUE, useMulticore = FALSE, nrCores = NULL){
	
	ctss.df.plus.list <-lapply(as.list(unique(ctss.df$chr)), function(x) {subset(ctss.df, ctss.df$chr == x & strand == "+")})
	ctss.df.minus.list <-lapply(as.list(unique(ctss.df$chr)), function(x) {subset(ctss.df, ctss.df$chr == x & strand == "-")})
	ctss.df.list <- append(ctss.df.plus.list, ctss.df.minus.list)
	clusters.list <- bplapply( ctss.df.list, .paraclu2
	                         , BPPARAM = CAGEr_Multicore(useMulticore, nrCores))
	n <- length(clusters.list)/2
	clusters.list <- lapply(as.list(c(1:n)), function(x) {rbind(clusters.list[[x]], clusters.list[[x+n]])})
	
	clusters <- do.call(rbind, clusters.list)
	
	clusters <- subset(clusters, (clusters$max_d >= (minStability * clusters$min_d)) & ((end - start + 1) <= maxLength))
	
	if(removeSingletons == TRUE){
		clusters <- subset(clusters, !(clusters$start == clusters$end & clusters$tpm < keepSingletonsAbove))
	}
	
	if(reduceToNonoverlapping == TRUE){
		clusters.gr <- GRanges(seqnames = clusters$chr, ranges = IRanges(start = clusters$start, end = clusters$end), strand = clusters$strand, elementMetadata = clusters)
		o <- findOverlaps(clusters.gr, drop.self = TRUE, type = "within")
		clusters.gr <- clusters.gr[-queryHits(o)]
		clusters <- subset(clusters, paste(clusters$chr, clusters$strand, clusters$start, clusters$end, sep = ".") %in% paste(seqnames(clusters.gr), strand(clusters.gr), start(clusters.gr), end(clusters.gr), sep = "."))
	}
	clusters <- cbind(cluster = c(1:nrow(clusters)), clusters)
	clusters$start <- clusters$start - 1
	rownames(clusters) <- c(1:nrow(clusters))
	colnames(clusters)[which(colnames(clusters) == "min_d")] <- "min_density"
	colnames(clusters)[which(colnames(clusters) == "max_d")] <- "max_density"
	
	return(clusters)
	
}


.paraclu <- function(data, sample.labels, minStability = 1, maxLength = 500, removeSingletons = FALSE, keepSingletonsAbove = Inf, reduceToNonoverlapping = TRUE, useMulticore = FALSE, nrCores = NULL) {
	
	ctss.cluster.list <- list()
	for(s in sample.labels) {
		
		message("\t-> ", s)
		d <- data[,c("chr", "pos", "strand", s)]
		colnames(d) <- c("chr", "pos", "strand", "tpm")
		d <- d[d$tpm>0,]
		ctss.cluster.df <- .paraclu3(ctss.df = d, minStability = minStability, maxLength = maxLength, removeSingletons = removeSingletons, keepSingletonsAbove = keepSingletonsAbove, reduceToNonoverlapping = reduceToNonoverlapping, useMulticore = useMulticore, nrCores = nrCores)
		ctss.cluster.list[[s]] <- ctss.cluster.df
		
	}
	
	return(ctss.cluster.list)
	
}


#########################################################################
# Imposing predefined clusters to segment the data attached to sequences


.cluster.ctss.strand.predef <- function(ctss.df, custom.clusters) {
	
	clusters <- data.frame(cluster = c(1:length(custom.clusters$start)), chr = custom.clusters$chr, start = custom.clusters$start, end = custom.clusters$end, strand = custom.clusters$strand)
	clusters.ir <- IRanges(start = clusters$start+1, end = clusters$end)
	ctss.df <- ctss.df[order(ctss.df$pos),]
	ctss.df.ir <- IRanges(start = ctss.df$pos, end = ctss.df$pos)
	o <- findOverlaps(clusters.ir, ctss.df.ir)
	ctss.df.1 <- cbind(ctss.df[subjectHits(o),], cluster = clusters$cluster[queryHits(o)], start = clusters$start[queryHits(o)], end = clusters$end[queryHits(o)])
	ctss.df.2 <- data.frame(chr = clusters$chr[!(clusters$cluster %in% clusters$cluster[queryHits(o)])], pos = rep(NA, length(clusters$cluster[!(clusters$cluster %in% clusters$cluster[queryHits(o)])])), strand = clusters$strand[!(clusters$cluster %in% clusters$cluster[queryHits(o)])], tpm = rep(0, length(clusters$cluster[!(clusters$cluster %in% clusters$cluster[queryHits(o)])])), cluster = clusters$cluster[!(clusters$cluster %in% clusters$cluster[queryHits(o)])], start = clusters$start[!(clusters$cluster %in% clusters$cluster[queryHits(o)])], end = clusters$end[!(clusters$cluster %in% clusters$cluster[queryHits(o)])])
	ctss.df <- rbind(ctss.df.1, ctss.df.2)
	ctss.df <- ctss.df[order(ctss.df$cluster),]
	invisible(gc())
	return(ctss.df)
	
} 

.cluster.ctss.chr.predef <- function(ctss.df, custom.clusters) {
	
	ctss.df.p <- subset(ctss.df, strand == "+")
	custom.clusters.p <- subset(custom.clusters, strand == "+")
	if(nrow(custom.clusters.p) > 0) {
		ctss.df.plus <- .cluster.ctss.strand.predef(ctss.df = ctss.df.p, custom.clusters = custom.clusters.p)
		last.plus.cluster <- max(ctss.df.plus$cluster)
	}else{
		ctss.df.plus <- data.frame()
		last.plus.cluster <- 0
	}
	invisible(gc())
	ctss.df.m <- subset(ctss.df, strand == "-")
	custom.clusters.m <- subset(custom.clusters, strand == "-")
	if(nrow(custom.clusters.m) > 0) {			
		ctss.df.minus <- .cluster.ctss.strand.predef(ctss.df = ctss.df.m, custom.clusters = custom.clusters.m)
		ctss.df.minus$cluster <- last.plus.cluster + ctss.df.minus$cluster
	}else{
		ctss.df.minus <- data.frame()
	}
	invisible(gc())
	return(rbind(ctss.df.plus, ctss.df.minus))
	
}

.ctss2clusters.predef <- function(ctss.df, custom.clusters, useMulticore = FALSE, nrCores = NULL) {
	ctss.cluster.list <- bplapply(as.list(unique(custom.clusters$chr)), function(x) {
		ctss.df.chr <- subset(ctss.df, ctss.df$chr == x)
		custom.clusters <- subset(custom.clusters, custom.clusters$chr == x)
		if(nrow(custom.clusters)>0){
			ctss.cluster.chr.df <- .cluster.ctss.chr.predef(ctss.df = ctss.df.chr, custom.clusters = custom.clusters)
		}
		}, BPPARAM = CAGEr_Multicore(useMulticore, nrCores))
	max.clust <- unlist(lapply(ctss.cluster.list, function(x) {max(x$cluster)}))
	max.clust <- cumsum(c(0, max.clust[-length(max.clust)]))
	invisible(gc())
	
	ctss.cluster.list <- lapply(as.list(1:length(max.clust)), function(x) {
								
								a = ctss.cluster.list[[x]]
								a$cluster = a$cluster + max.clust[x]
								return(a)
								
								}
								)
	
	ctss.cluster.df <- do.call(rbind, ctss.cluster.list)
	invisible(gc())
	return(ctss.cluster.df)
	
}	

.summarize.clusters.predef <- function(ctss.cluster.df) {
	
	ctss.cluster <- data.table(ctss.cluster.df)
	chr <- pos <- tpm <- cluster <- NULL  # To keep R CMD check happy.
	ctss.cluster <- ctss.cluster[, list(chr[1], start[1], end[1], strand[1], length(pos), pos[which(tpm == max(tpm))[ceiling(length(which(tpm == max(tpm)))/2)]], sum(tpm), max(tpm)), by = cluster]
	setnames(ctss.cluster, c("cluster", "chr", "start", "end", "strand", "nr_ctss", "dominant_ctss", "tpm", "tpm.dominant_ctss")) 
	ctss.cluster <- data.frame(ctss.cluster)
	ctss.cluster$nr_ctss[ctss.cluster$tpm == 0] <- 0
	
	invisible(gc())
	return(ctss.cluster)
	
}

.predefined.clusters <- function(data, sample.labels, custom.clusters, useMulticore = FALSE, nrCores = NULL){

	ctss.cluster.list <- list()
	for(s in sample.labels) {
		
		message("\t->  ", s)
		d <- data[,c("chr", "pos", "strand", s)]
		colnames(d) <- c("chr", "pos", "strand", "tpm")
		d <- d[d$tpm > 0,]
		ctss.cluster.df <- .ctss2clusters.predef(ctss.df = d, custom.clusters = custom.clusters, useMulticore = useMulticore, nrCores = nrCores)
		ctss.cluster.list[[s]] <- ctss.cluster.df
		invisible(gc())
		
	}
	
	ctss.cluster.list <- bplapply( ctss.cluster.list, .summarize.clusters.predef
	                             , BPPARAM = CAGEr_Multicore(useMulticore, nrCores))
	invisible(gc())
	return(ctss.cluster.list)
	
}

#' @name tagClusterConvertors
#' 
#' @title Private functions to convert TC formats
#' 
#' @description Interconvert tag clusters (TC) formats used in classes CAGEset
#' (\code{data.frame}) and CAGEexp (\code{GRanges}).
#' 
#' @details 
#' The original format used in \code{\link{CAGEset}} objects follows BED
#' ("0-based") conventsion for the start and end coordinates.  On the other
#' hand, the GRanges objects used in \code{\link{CAGEexp}} objects follow
#' the "1-based" convention.  Therefore a value of 1 has to be added or
#' subtracted to the start positions when converting between both formats.
#' 
#' @family df2granges converters
#' 
#' @examples 
#' df <- tagClusters(exampleCAGEset, 1)
#' head(df)
#' gr <- CAGEr:::TCdataframe2granges(df)
#' gr
#' head(CAGEr:::TCgranges2dataframe(gr))
#' # No exact round-trip because start and end were not integer in df.
#' identical(df, CAGEr:::TCgranges2dataframe(gr))
#' if (! all(df == CAGEr:::TCgranges2dataframe(gr)))
#'   stop("No round-trip between TCdataframe2granges and TCgranges2dataframe")
#' 
#' tagClustersGR(exampleCAGEexp)
#' head(CAGEr:::TCgranges2dataframe(CAGEr:::tagClustersGR(exampleCAGEexp, 1)))
NULL

#' @name TCgranges2dataframe
#' 
#' @rdname tagClusterConvertors
#' 
#' @param gr Consensus clusters in \code{GRanges} format.

TCgranges2dataframe <- function(gr) {
  if (!is.null(gr$cluster)) {
    df <- data.frame(cluster = gr$cluster)
    gr$cluster <- NULL
  } else {
    df <- data.frame(cluster = as.integer(names(gr)))  # Make sure it does not sort lexically!
  }
  df <- cbind(df , data.frame( chr     = decode(seqnames(gr))
                             , start   = start(gr) -1
                             , end     = end(gr)
                             , strand  = decode(droplevels(strand(gr)))))
  if(is.null(gr$tpm))
    df$tpm <- score(gr)
  score(gr) <- NULL
  df <- cbind(df, mcols(gr))
  as.data.frame(df)
}

#' @name TCdataframe2granges
#' @rdname tagClusterConvertors
#' 
#' @param df Consensus clusters in \code{data.frame} format.

TCdataframe2granges <- function(df) {
	gr <- GRanges( seqnames           = df$chr
	             , ranges             = IRanges(df$start + 1, df$end)
 	             , score              = df$tpm
               , strand             = df$strand)
	if(is.null(df[["cluster"]]))
	  gr$cluster <- seq_along(gr)
	mcols(gr) <- cbind( mcols(gr)
	                  , df[,setdiff(colnames(df), c("chr", "start", "end", "strand")), drop = FALSE])
	names(gr) <- rownames(df)
	gr
}
Any scripts or data that you put into this service are public.
CAGEr documentation built on Jan. 17, 2021, 2 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
CAGEr
Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

R/ClusteringFunctions.R
In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

Defines functions TCdataframe2granges TCgranges2dataframe .predefined.clusters .summarize.clusters.predef .ctss2clusters.predef .cluster.ctss.chr.predef .cluster.ctss.strand.predef .paraclu .paraclu3 .paraclu2 .paraclu1

Documented in TCdataframe2granges TCgranges2dataframe

Try the CAGEr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEr Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

R/ClusteringFunctions.R In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

Defines functions TCdataframe2granges TCgranges2dataframe .predefined.clusters .summarize.clusters.predef .ctss2clusters.predef .cluster.ctss.chr.predef .cluster.ctss.strand.predef .paraclu .paraclu3 .paraclu2 .paraclu1

Documented in TCdataframe2granges TCgranges2dataframe

Try the CAGEr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEr
Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

R/ClusteringFunctions.R
In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining