R/AllClasses.R
In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

#' @name CAGEset-class
#' @docType class
#' @noRd
#' @export
#'
# @import data.table
# @import Rsamtools
# @import S4Vectors
# @import GenomicRanges
# @import IRanges
# Import whole packages above, following earlier versions of CAGEr.  It may be worth
# trying to replace them by ad-hoc importFrom directives.  This would also remove the
# character(0) warning messages when running Roxygen2.
# https://stackoverflow.com/questions/26104401/character0-warnings-when-running-devtoolsload-all-in-rstudio

setClass(Class = "CAGEset",

	representation(
		genomeName = "character",
		inputFiles = "character",
		inputFilesType = "character",
		sampleLabels = "character",
		librarySizes = "integer",
		CTSScoordinates = "data.frame",
		tagCountMatrix = "data.frame",
		normalizedTpmMatrix = "data.frame",
		CTSSexpressionClusteringMethod = "character",
		CTSSexpressionClasses = "character",
		clusteringMethod = "character",
		filteredCTSSidx = "logical",
		tagClusters = "list",
		CTSScumulativesTagClusters = "list",
		tagClustersQuantileLow = "list",
		tagClustersQuantileUp = "list",
		consensusClusters = "data.frame",
		consensusClustersTpmMatrix = "matrix",
		consensusClustersExpressionClusteringMethod = "character",
		consensusClustersExpressionClasses = "character",
		CTSScumulativesConsensusClusters = "list",
		consensusClustersQuantileLow = "list",
		consensusClustersQuantileUp = "list",
		shiftingGroupX = "character",
		shiftingGroupY = "character",
		consensusClustersShiftingScores = "data.frame"		
	),

	prototype(
		genomeName = character(),
		inputFiles = character(),
		inputFilesType = character(),
		sampleLabels = character(),
		librarySizes = integer(),
		CTSScoordinates = data.frame(),
		tagCountMatrix = data.frame(),
		normalizedTpmMatrix = data.frame(),
		CTSSexpressionClusteringMethod = character(),
		CTSSexpressionClasses = character(),
		clusteringMethod = character(),
		filteredCTSSidx = logical(),
		tagClusters = list(),
		CTSScumulativesTagClusters = list(),
		tagClustersQuantileLow = list(),
		tagClustersQuantileUp = list(),
		consensusClusters = data.frame(),
		consensusClustersTpmMatrix = matrix(NA, nrow = 0, ncol = 0),
		consensusClustersExpressionClusteringMethod = character(),
		consensusClustersExpressionClasses = character(),
		CTSScumulativesConsensusClusters = list(),
		consensusClustersQuantileLow = list(),
		consensusClustersQuantileUp = list(),
		shiftingGroupX = character(),
		shiftingGroupY = character(),
		consensusClustersShiftingScores = data.frame()
	),

	validity = function(object) {
#		if(!(object@genomeName %in% suppressWarnings(suppressMessages(BSgenome::installed.genomes()))))
#		return("'genomeName' must be a name of one of the genome packages available in BSgenome! See 'BSgenome::installed.genomes()'")
#		if(object@genomeName %in% rownames(installed.packages()) == FALSE)
#		return("Requested genome is not installed! Please install required BSgenome package before running CAGEr.")
	  supportedTypes <- c("bam", "bamPairedEnd", "bed", "bedmolecule", "ctss", "CTSStable", "FANTOM5", "ENCODE", "FANTOM3and4", "ZebrafishDevelopment")
		if(!(object@inputFilesType %in% supportedTypes))
		return(paste(sQuote("inputFilesType"), "must be one of supported input file types:",
		             paste(sQuote(supportedTypes), collapse = ", "), "."))
		if(!(object@inputFilesType == "CTSStable") & (length(object@sampleLabels) != length(object@inputFiles))) {
			return("Number of provided sample labels must match the number of input files unless inputFilesType = \"CTSStable\"!")
		}
		if(!(all(nzchar(object@sampleLabels))) | !(all(substr(object@sampleLabels, start = 1, stop = 1) %in% c(letters, LETTERS)))){
			stop("All sample labels must be a non-empty strings beginning with a letter!")
		}
		if(length(unique(object@sampleLabels)) != length(object@sampleLabels)) {
			stop("Duplicated sample labels are not allowed!")
		}
		
	}

)

###############################################################
# Function for displaying CAGEset object in user friendly way

#' @name show
#' @noRd
#' @exportMethod show

setMethod("show", 

	signature(object = "CAGEset"), 
	function(object) {
	cat("\nS4 Object of class CAGEset\n\n")
	cat("=======================================\n")
	cat("Input data information\n")
	cat("=======================================\n")
	cat("Reference genome (organism): ", genomeName(object), "\n", sep = "")
	cat("Input file type: ", inputFilesType(object), "\n", sep = "")
	cat("Input file names: ", paste(inputFiles(object), collapse = ", "), "\n", sep = "")
	cat("Sample labels: ", paste(sampleLabels(object), collapse = ", "), "\n", sep = "")
	cat("=======================================\n")
	cat("CTSS information\n")
	cat("=======================================\n")
	if(nrow(CTSScoordinates(object))>0){
	max_out = min(3, nrow(CTSScoordinates(object)))
		cat("CTSS chromosome: ", paste(paste(CTSScoordinates(object)$chr[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
		cat("CTSS position: ", paste(paste(CTSScoordinates(object)$pos[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
		cat("CTSS strand: ", paste(paste(CTSScoordinates(object)$strand[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
	}else{
		cat("CTSS chromosome:\n")	
		cat("CTSS position:\n")	
		cat("CTSS strand:\n")	
	}
	cat("Tag count:\n")
	if(nrow(object@tagCountMatrix) > 0){
		cat(sapply(c(1:length(sampleLabels(object))), function(x) {paste("\t-> ", sampleLabels(object)[x], ": ", paste(paste(object@tagCountMatrix[1:max_out,x], collapse = ", "), ", ...\n", sep = ""), sep = "")}))
	}
	cat("Normalized tpm:\n")
	if(nrow(object@normalizedTpmMatrix)>0){	
		cat(sapply(c(1:length(sampleLabels(object))), function(x) {paste("\t-> ", sampleLabels(object)[x], ": ", paste(paste(formatC(object@normalizedTpmMatrix[1:max_out,x], format = "f", digits = 3), collapse = ", "), ", ...\n", sep = ""), sep = "")}))
	}
	cat("=======================================\n")
	cat("Tag cluster (TC) information\n")
	cat("=======================================\n")
	cat("CTSS clustering method: ", object@clusteringMethod, "\n", sep = "")
	cat("Number of TCs per sample:\n")
	if(length(object@tagClusters) > 0){
		cat(sapply(sampleLabels(object), function(x) {paste("\t-> ", x, ": ", nrow(tagClusters(object, samples = x)), "\n", sep = "")}))
	}
	cat("=======================================\n")
	cat("Consensus cluster information\n")
	cat("=======================================\n")
	if(nrow(consensusClusters(object))>0){
		max_out = min(3, nrow(consensusClusters(object)))
		cat("Number of consensus clusters: ", nrow(consensusClusters(object)), "\n", sep = "")
		cat("Consensus cluster chromosome: ", paste(paste(consensusClusters(object)$chr[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
		cat("Consensus cluster start: ", paste(paste(consensusClusters(object)$start[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
		cat("Consensus cluster end: ", paste(paste(consensusClusters(object)$end[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
		cat("Consensus cluster strand: ", paste(paste(consensusClusters(object)$strand[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
		cat("Normalized tpm:\n")
		cat(sapply(c(1:length(sampleLabels(object))), function(x) {paste("\t-> ", sampleLabels(object)[x], ": ", paste(paste(formatC(object@consensusClustersTpmMatrix[1:max_out,x], format = "f", digits = 3), collapse = ", "), ", ...\n", sep = ""), sep = "")}))
	}else{
		cat("Number of consensus clusters:\n")
		cat("Consensus cluster chromosome:\n")
		cat("Consensus cluster start:\n")
		cat("Consensus cluster end:\n")
		cat("Consensus cluster strand:\n")
		cat("Normalized tpm:\n")		
	}
	cat("=======================================\n")
	cat("Expression profiling\n")
	cat("=======================================\n")
	cat("Expression clustering method: ", object@consensusClustersExpressionClusteringMethod, "\n", sep = "")
	if(length(object@consensusClustersExpressionClasses) > 0){
		cat("Expression clusters for consensus clusters: ", paste(paste(object@consensusClustersExpressionClasses[1:max_out], collapse = ", "), ", ...", sep = ""), "\n", sep = "")
	}else{
		cat("Expression clusters for consensus clusters:\n")
	}
	cat("=======================================\n")
	cat("Promoter shifting\n")
	cat("=======================================\n")
	if(length(object@shiftingGroupX)>0){
		max_out = min(3, nrow(object@consensusClustersShiftingScores))
		cat("GroupX: ", paste(object@shiftingGroupX, collapse = ", "), "\n", sep = "")
		cat("GroupY: ", paste(object@shiftingGroupY, collapse = ", "), "\n", sep = "")
		cat("Shifting scores: ", paste(formatC(object@consensusClustersShiftingScores$shifting.score[1:max_out], format = "f", digits = 3), collapse = ", "), "\n", sep = "")
		cat("KS p-values (FDR adjusted): ", paste(formatC(object@consensusClustersShiftingScores$fdr.KS[1:max_out], format = "e", digits = 2), collapse = ", "), "\n", sep = "")
	}else{
		cat("GroupX:\n")
		cat("GroupY:\n")
		cat("Shifting scores:\n")
		cat("KS p-values (FDR adjusted):\n")
	}	
	cat("\n")
		
})

############
# Coercion

#' @name coerce
#' @noRd
#' @exportMethod coerce

setAs("data.frame", "CAGEset",
    function(from){
        
        if(!(ncol(from) >= 3) | !(all(colnames(from)[1:3] == c("chr", "pos", "strand")))){
            stop("First three columns of the input data.frame must contain chromosome name, genomic position and strand of individual TSSs, and must be named 'chr', 'pos' and 'strand', respectively!")
        }
        if(!(ncol(from))>=4){
            stop("Input data.frame needs to contain at least one column with CAGE tag counts, in addition to first three columns specifying chromosome name, genomic position and strand of individual TSSs!")
        }
        if(!(all(nzchar(colnames(from)))) | !(all(substr(colnames(from),
        start = 1, stop = 1) %in% c(letters, LETTERS)))){
            stop("Names of the columns specifying CAGE tag counts in the input data.frame must be non-empty strings beginning with a letter, as they will be used as sample labels in the resulting CAGEset!")
        }
        if(!(is.integer(from[,"pos"]))){
            stop("The 'pos' column in the input data.frame can contain only non-zero integers as these are interpreted as 1-based genomic coordinates of TSSs! Make sure the 'pos' column is of class 'integer'!")
        }else if(any(from[,"pos"] <= 0)){
            stop("The 'pos' column in the input data.frame can contain only non-zero integers as these are interpreted as 1-based genomic coordinates of TSSs!")
        }
        if(!(all(from[,"strand"] %in% c("+", "-")))){
            stop("The 'strand' column in the input data.frame can contain only '+' or '-'!")
        }
        if(!(all(apply(from[,4:ncol(from),drop=FALSE], 2, is.integer)))){
            stop("The columns specifying CAGE tag counts must be non-negative integers! Make sure these columns are of class 'integer'!")
        }else if(any(apply(from[,4:ncol(from),drop=FALSE], 2, function(x) {any(x < 0)}))){
            stop("The columns specifying CAGE tag counts must be non-negative integers!")
        }
        
        ctss.coordinates <- from[, c("chr", "pos", "strand")]
        ctss.coordinates$chr <- as.character(ctss.coordinates$chr)
        ctss.coordinates$pos <- as.integer(ctss.coordinates$pos)
        ctss.coordinates$strand <- as.character(ctss.coordinates$strand)
        sample.labels <- colnames(from)[4:ncol(from)]
        
        myCAGEset <- new("CAGEset", genomeName = "", inputFiles = "data.frame", inputFilesType = "CTSStable", sampleLabels = sample.labels)
        myCAGEset@librarySizes <- as.integer(colSums(from[,4:ncol(from),drop=FALSE]))
        myCAGEset@CTSScoordinates <- ctss.coordinates
        myCAGEset@tagCountMatrix <- from[,4:ncol(from),drop=FALSE]

        return(myCAGEset)

    }
)


######################
# Merge two CAGEsets

#' Merge two CAGEr objects into one
#' 
#' Merges two [`CAGEr`] objects into one by combining the CTSS genomic
#' coordinates and raw tag counts.  The resulting object will contain a union
#' of TSS positions present in the two input objects and raw tag counts for
#' those TSSs in all samples from both input objects.
#' 
#' @param cs1 A `CAGEr` object
#' @param cs2 A `CAGEr` object
#' 
#' @return Note that merging discards all other information present in the
#' two `CAGEr` objects, that is, the merged object will not contain any
#' normalised tag counts, CTSS clusters, quantile positions, etc., so these
#' have to be calculated again by calling the appropriate functions on the
#' merged object.  Also, it is only possible to merge two objects that contain
#' TSS information for the same reference genome and do not share any sample
#' names.
#' 
#' @return Returns a `CAGEset` or `CAGEexp` object, which contains a union of
#' TSS positions present in the two input objects and raw tag counts for those
#' TSSs in all samples from both input objects.
#' 
#' @author Vanja Haberle
#' 
#' @seealso [`CAGEset`], [`CAGEexp`]
#' 
#' @examples 
#' library(BSgenome.Drerio.UCSC.danRer7)
#' 
#' pathsToInputFiles <- system.file("extdata", c("Zf.unfertilized.egg.chr17.ctss",
#'   "Zf.30p.dome.chr17.ctss", "Zf.prim6.rep1.chr17.ctss"), package="CAGEr")
#'   
#' myCAGEset1 <- new("CAGEset", genomeName = "BSgenome.Drerio.UCSC.danRer7",
#' inputFiles = pathsToInputFiles[1:2], inputFilesType = "ctss", sampleLabels =
#' c("sample1", "sample2"))
#' getCTSS(myCAGEset1)
#' 
#' myCAGEset2 <- new("CAGEset", genomeName = "BSgenome.Drerio.UCSC.danRer7",
#' inputFiles = pathsToInputFiles[3], inputFilesType = "ctss", sampleLabels =
#' "sample3")
#' 
#' getCTSS(myCAGEset2)
#' 
#' myCAGEset <- mergeCAGEsets(myCAGEset1, myCAGEset2)
#' 
#' 
#' ce1 <- CAGEexp(genomeName = "BSgenome.Drerio.UCSC.danRer7",
#' inputFiles = pathsToInputFiles[1:2], inputFilesType = "ctss", sampleLabels =
#' c("sample1", "sample2"))
#' getCTSS(ce1)
#' 
#' ce2 <- CAGEexp(genomeName = "BSgenome.Drerio.UCSC.danRer7",
#' inputFiles = pathsToInputFiles[3], inputFilesType = "ctss", sampleLabels =
#' "sample3")
#' 
#' getCTSS(ce2)
#' 
#' ce <- mergeCAGEsets(ce1, ce2)
#' 
#' @export

setGeneric( "mergeCAGEsets"
          , function(cs1, cs2) {
            
  if (genomeName(cs1) != genomeName(cs2))
    stop("Cannot merge two CAGEsets with data from different genomes!")
  
  if(any(sampleLabels(cs1) %in% sampleLabels(cs2)))
    stop("Cannot merge two CAGEsets that share same sample labels!")
  
  standardGeneric("mergeCAGEsets")
})

#' @rdname mergeCAGEsets

setMethod("mergeCAGEsets", 
signature(cs1 = "CAGEset", cs2 = "CAGEset"),
function (cs1, cs2){
    
    ctss1 <- CTSStagCount(cs1)
    ctss2 <- CTSStagCount(cs2)
    
    ctss <- merge(ctss1, ctss2, by.x = c("chr", "pos", "strand"), by.y = c("chr", "pos", "strand"), all.x = T, all.y = T)
    ctss[is.na(ctss)] <- 0
    ctss <- ctss[order(ctss$chr, ctss$pos),]
    for(i in 4:ncol(ctss)){
        ctss[,i] <- as.integer(ctss[,i])
    }
    
    sample.labels <- c(cs1@sampleLabels, cs2@sampleLabels)
    names(sample.labels) <- rainbow(n = length(sample.labels))
    library.sizes <- as.integer(colSums(ctss[,4:ncol(ctss),drop=FALSE]))
    names(library.sizes) <- sample.labels
    myCAGEset <- new("CAGEset", genomeName = cs1@genomeName, inputFiles = "merged CAGEset", inputFilesType = "CTSStable", sampleLabels = sample.labels)
    myCAGEset@librarySizes <- library.sizes
    myCAGEset@CTSScoordinates <- ctss[,c("chr", "pos", "strand")]
    myCAGEset@tagCountMatrix <- ctss[,4:ncol(ctss),drop=FALSE]

    myCAGEset
})

#' @rdname mergeCAGEsets

setMethod( "mergeCAGEsets"
         , signature(cs1 = "CAGEexp", cs2 = "CAGEexp")
         , function (cs1, cs2) {
           
  # First, make a new CTSS SummarizedExperiment.
    
  getListsOfCTSS <- function(object) {
    lapply(sampleList(object), function(name) {
      ctss <- CTSScoordinatesGR(object)
      mcols(ctss) <- NULL
      score(ctss) <- CTSStagCountDF(object)[[name]]
      ctss[score(ctss) != 0]
    })}
  
  l <- GRangesList(c(getListsOfCTSS(cs1), getListsOfCTSS(cs2)))
  
  # Code duplicated from getCTSS
    
  rowRanges <- sort(unique(unlist(l)))
  mcols(rowRanges) <- NULL

  assay <- DataFrame(V1 = Rle(rep(0L, length(rowRanges))))
  
  expandRange <- function(global, local) {
    x <- Rle(rep(0L, length(global)))
    x[global %in% local] <- score(local)
    x
  }
  
  for (i in seq_along(l))
    assay[,i] <- expandRange(rowRanges, l[[i]])
  
  colnames(assay) <- names(l)
  
  se <- SummarizedExperiment( rowRanges = rowRanges
                            , assays    = SimpleList(counts = assay))

  # Second, merge column metadata
  
  df1 <- colData(cs1)
  df2 <- colData(cs2)
  
  commonCols <- intersect(colnames(df1), colnames(df2))
  df <- rbind(df1[,commonCols], df2[,commonCols])
  
  # Then construct a new CAGEexp object
  
  ce <- CAGEexp(colData = df)
  CTSStagCountSE(ce) <- se
  ce
})
Any scripts or data that you put into this service are public.
CAGEr documentation built on Jan. 17, 2021, 2 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
CAGEr
Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

R/AllClasses.R
In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

Try the CAGEr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEr Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

R/AllClasses.R In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

Try the CAGEr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

CAGEr
Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining

R/AllClasses.R
In CAGEr: Analysis of CAGE (Cap Analysis of Gene Expression) sequencing data for precise mapping of transcription start sites and promoterome mining