R/class_dmSQTLdata.R
In DRIMSeq: Differential transcript usage and tuQTL analyses with Dirichlet-multinomial model in RNA-seq

Documented in dmSQTLdata

#' @include class_MatrixList.R
NULL

###############################################################################
### dmSQTLdata class
###############################################################################

#' dmSQTLdata object
#' 
#' dmSQTLdata contains genomic feature expression (counts), genotypes and sample
#' information needed for the transcript/exon usage QTL analysis. It can be 
#' created with function \code{\link{dmSQTLdata}}.
#' 
#' @return
#' 
#' \itemize{ \item \code{names(x)}: Get the gene names. \item \code{length(x)}: 
#' Get the number of genes. \item \code{x[i, j]}: Get a subset of dmDSdata 
#' object that consists of counts, genotypes and blocks corresponding to genes i
#' and samples j. }
#' 
#' @param x,object dmSQTLdata object.
#' @param i,j Parameters used for subsetting.
#'   
#' @slot counts \code{\linkS4class{MatrixList}} of expression, in counts, of 
#'   genomic features. Rows correspond to genomic features, such as exons or 
#'   transcripts. Columns correspond to samples. MatrixList is partitioned in a 
#'   way that each of the matrices in a list contains counts for a single gene.
#' @slot genotypes MatrixList of unique genotypes. Rows correspond to blocks, 
#'   columns to samples. Each matrix in this list is a collection of unique 
#'   genotypes that are matched with a given gene.
#' @slot blocks MatrixList with two columns \code{block_id} and \code{snp_id}. 
#'   For each gene, it identifies SNPs with identical genotypes across the 
#'   samples and assigns them to blocks.
#' @slot samples Data frame with information about samples. It must contain 
#'   variable \code{sample_id} with unique sample names.
#'   
#' @examples 
#' # --------------------------------------------------------------------------
#' # Create dmSQTLdata object
#' # --------------------------------------------------------------------------
#' # Use subsets of data defined in the GeuvadisTranscriptExpr package
#' 
#' library(GeuvadisTranscriptExpr)
#' \donttest{
#' geuv_counts <- GeuvadisTranscriptExpr::counts
#' geuv_genotypes <- GeuvadisTranscriptExpr::genotypes
#' geuv_gene_ranges <- GeuvadisTranscriptExpr::gene_ranges
#' geuv_snp_ranges <- GeuvadisTranscriptExpr::snp_ranges
#' 
#' colnames(geuv_counts)[c(1,2)] <- c("feature_id", "gene_id")
#' colnames(geuv_genotypes)[4] <- "snp_id"
#' geuv_samples <- data.frame(sample_id = colnames(geuv_counts)[-c(1,2)])
#' 
#' d <- dmSQTLdata(counts = geuv_counts, gene_ranges = geuv_gene_ranges,  
#'   genotypes = geuv_genotypes, snp_ranges = geuv_snp_ranges, 
#'   samples = geuv_samples, window = 5e3)
#' }
#' @author Malgorzata Nowicka
#' @seealso \code{\linkS4class{dmSQTLprecision}}, 
#'   \code{\linkS4class{dmSQTLfit}}, \code{\linkS4class{dmSQTLtest}}
setClass("dmSQTLdata", 
  representation(counts = "MatrixList", 
    genotypes = "MatrixList", 
    blocks = "MatrixList",
    samples = "data.frame"))


###################################


setValidity("dmSQTLdata", function(object){
  ### Has to return TRUE when valid object!
  
  if(!ncol(object@counts) == ncol(object@genotypes))
    return(paste0("Unequal number of samples in 'counts' and 'genotypes' ", 
      ncol(object@counts), " and ", ncol(object@genotypes)))
  
  ### Mystery: This does not pass
  # if(!all(colnames(object@blocks) %in% c("block_id", "snp_id")))
  #   return(paste0("'blocks' must contain 'block_id' and 'snp_id' variables"))
  
  if(!all(names(object@counts) == names(object@genotypes)))
    return("'genotypes' and 'counts' do not contain the same genes")
  
  if(!all(names(object@blocks) == names(object@genotypes)))
    return("'genotypes' and 'blocks' do not contain the same genes or SNPs")
  
  return(TRUE)
  
})


###############################################################################
### show, accessing and subsetting methods
###############################################################################

#' @rdname dmSQTLdata-class
#' @export
setMethod("counts", "dmSQTLdata", function(object){
  
  data.frame(gene_id = rep(names(object@counts), elementNROWS(object@counts)), 
    feature_id = rownames(object@counts), 
    object@counts@unlistData, stringsAsFactors = FALSE, 
    row.names = NULL)
  
})


#' @rdname dmSQTLdata-class
#' @export
setMethod("samples", "dmSQTLdata", function(x) x@samples )


################################


setMethod("show", "dmSQTLdata", function(object){
  
  cat("An object of class", class(object), "\n")
  
  cat("with", length(object), "genes and", ncol(object@counts), "samples\n")
  
  cat("* data accessors: counts(), samples()\n")
  
})


################################


#' @rdname dmSQTLdata-class
#' @export
setMethod("names", "dmSQTLdata", function(x) names(x@counts) )

#' @rdname dmSQTLdata-class
#' @export
setMethod("length", "dmSQTLdata", function(x) length(x@counts) )


#' @aliases [,dmSQTLdata-method [,dmSQTLdata,ANY-method
#' @rdname dmSQTLdata-class
#' @export
setMethod("[", "dmSQTLdata", function(x, i, j){
  
  if(missing(j)){
    
    counts <- x@counts[i, , drop = FALSE]
    genotypes <- x@genotypes[i, , drop = FALSE]
    blocks <- x@blocks[i, , drop = FALSE]
    samples <- x@samples
    
  }else{
    
    if(missing(i)){
      counts <- x@counts[, j, drop = FALSE]
      genotypes <- x@genotypes[, j, drop = FALSE]
    }else{
      counts <- x@counts[i, j, drop = FALSE]
      genotypes <- x@genotypes[i, j, drop = FALSE]
      blocks <- x@blocks[i, , drop = FALSE]
    }
    
    samples <- x@samples
    rownames(samples) <- samples$sample_id
    samples <- samples[j, , drop = FALSE]
    samples$sample_id <- factor(samples$sample_id)
    rownames(samples) <- NULL
    
  }
  
  return(new("dmSQTLdata", counts = counts, genotypes = genotypes, 
    blocks = blocks, samples = samples))
  
})

###############################################################################
### dmSQTLdata
###############################################################################

blocks_per_gene <- function(g, genotypes){
  # g = 1
  
  genotypes_df <- data.frame(t(genotypes[[g]]))
  matching_snps <- match(genotypes_df, genotypes_df)
  oo <- order(matching_snps, decreasing = FALSE)
  block_id <- paste0("block_", as.numeric(factor(matching_snps)))
  snp_id <- colnames(genotypes_df)
  blocks_tmp <- cbind(block_id, snp_id)
  
  return(blocks_tmp[oo, , drop = FALSE])
  
}

#' Create dmSQTLdata object
#' 
#' Constructor functions for a \code{\linkS4class{dmSQTLdata}} object. 
#' dmSQTLdata assignes to a gene all the SNPs that are located in a given
#' surrounding (\code{window}) of this gene.
#' 
#' It is quite common that sample grouping defined by some of the SNPs is 
#' identical. Compare \code{dim(genotypes)} and \code{dim(unique(genotypes))}. 
#' In our QTL analysis, we do not repeat tests for the SNPs that define the 
#' same grouping of samples. Each grouping is tested only once. SNPs that define
#' such unique groupings are aggregated into blocks. P-values and adjusted 
#' p-values are estimated at the block level, but the returned results are 
#' extended to a SNP level by repeating the block statistics for each SNP that 
#' belongs to a given block.
#' 
#' @inheritParams dmDSdata
#' @param genotypes Data frame with genotypes. Rows correspond to SNPs. This 
#'   data frame has to contain a \code{snp_id} column with SNP IDs and columns 
#'   with genotypes for each sample. Column names corresponding to sample IDs 
#'   must be the same as in the \code{sample} data frame. The genotype of each 
#'   sample is coded in the following way: 0 for ref/ref, 1 for ref/not ref, 2 
#'   for not ref/not ref, -1 or \code{NA} for missing value.
#' @param gene_ranges \code{\linkS4class{GRanges}} object with gene location. It
#'   must contain gene names when calling names().
#' @param snp_ranges \code{\linkS4class{GRanges}} object with SNP location. It 
#'   must contain SNP names when calling names().
#' @param window Size of a down and up stream window, which is defining the 
#'   surrounding for a gene. Only SNPs that are located within a gene or its 
#'   surrounding are considered in the sQTL analysis.
#' @param samples Data frame with column \code{sample_id} corresponding to 
#'   unique sample IDs
#' @param BPPARAM Parallelization method used by 
#'   \code{\link[BiocParallel]{bplapply}}.
#'   
#' @return Returns a \code{\linkS4class{dmSQTLdata}} object.
#'   
#' @examples 
#' # --------------------------------------------------------------------------
#' # Create dmSQTLdata object
#' # --------------------------------------------------------------------------
#' # Use subsets of data defined in the GeuvadisTranscriptExpr package
#' 
#' library(GeuvadisTranscriptExpr)
#' \donttest{
#' geuv_counts <- GeuvadisTranscriptExpr::counts
#' geuv_genotypes <- GeuvadisTranscriptExpr::genotypes
#' geuv_gene_ranges <- GeuvadisTranscriptExpr::gene_ranges
#' geuv_snp_ranges <- GeuvadisTranscriptExpr::snp_ranges
#' 
#' colnames(geuv_counts)[c(1,2)] <- c("feature_id", "gene_id")
#' colnames(geuv_genotypes)[4] <- "snp_id"
#' geuv_samples <- data.frame(sample_id = colnames(geuv_counts)[-c(1,2)])
#' 
#' d <- dmSQTLdata(counts = geuv_counts, gene_ranges = geuv_gene_ranges,  
#'   genotypes = geuv_genotypes, snp_ranges = geuv_snp_ranges, 
#'   samples = geuv_samples, window = 5e3)
#' }
#' @seealso \code{\link{plotData}}
#' @author Malgorzata Nowicka
#' @export
#' @importFrom IRanges width
#' @importFrom S4Vectors queryHits subjectHits
dmSQTLdata <- function(counts, gene_ranges, genotypes, snp_ranges, samples, 
  window = 5e3, BPPARAM = BiocParallel::SerialParam()){
  
  stopifnot(is.numeric(window))
  stopifnot(window >= 0)
  
  ### Check on samples
  stopifnot(class(samples) == "data.frame")
  stopifnot("sample_id" %in% colnames(samples))
  stopifnot(sum(duplicated(samples$sample_id)) == 0)
  
  ### Check on counts
  stopifnot(class(counts) == "data.frame")
  stopifnot(all(c("gene_id", "feature_id") %in% colnames(counts)))
  stopifnot(all(samples$sample_id %in% colnames(counts)))
  
  ### Check on genotypes
  stopifnot(class(genotypes) == "data.frame")
  stopifnot("snp_id" %in% colnames(genotypes))
  stopifnot(all(samples$sample_id %in% colnames(counts)))
  
  sample_id <- samples$sample_id
  gene_id <- counts$gene_id
  feature_id <- counts$feature_id
  snp_id <- genotypes$snp_id
  
  stopifnot( class( gene_id ) %in% c("character", "factor"))
  stopifnot( class( feature_id ) %in% c("character", "factor"))
  stopifnot( class( sample_id ) %in% c("character", "factor"))
  stopifnot( class( snp_id ) %in% c("character", "factor"))
  
  stopifnot(all(!is.na(gene_id)))
  stopifnot(all(!is.na(feature_id)))
  stopifnot(all(!is.na(sample_id)))
  stopifnot(all(!is.na(snp_id)))
  
  counts <- counts[, as.character(sample_id), drop = FALSE]
  counts <- as.matrix(counts)
  stopifnot(mode(counts) %in% "numeric")
  
  genotypes <- genotypes[, as.character(sample_id), drop = FALSE]
  genotypes <- as.matrix(genotypes)
  stopifnot(mode(genotypes) %in% "numeric")
  stopifnot(all(genotypes %in% c(-1, 0, 1, 2, NA)))
  rownames(genotypes) <- snp_id
  
  stopifnot(class(gene_ranges) == "GRanges")
  stopifnot(class(snp_ranges) == "GRanges")
  stopifnot(!is.null(names(snp_ranges)))
  stopifnot(!is.null(names(gene_ranges)))
  
  ### Keep genes that are in counts and in gene_ranges
  genes_overlap <- intersect(names(gene_ranges), gene_id)
  
  genes2keep <- gene_id %in% genes_overlap
  counts <- counts[genes2keep, , drop = FALSE]
  gene_id <- gene_id[genes2keep]
  feature_id <- feature_id[genes2keep]
  
  gene_ranges <- gene_ranges[genes_overlap, ]
  
  ### Keep SNPs that are in genotypes and in snp_ranges 
  ### and make them in the same order
  snps_overlap <- intersect(names(snp_ranges), snp_id)
  
  genotypes <- genotypes[snps_overlap, , drop = FALSE]
  snp_ranges <- snp_ranges[snps_overlap, ]
  
  gene_ranges <- GenomicRanges::resize(gene_ranges, 
    width(gene_ranges) + 2 * window, fix = "center")
  
  ## Match genes and SNPs
  variantMatch <- GenomicRanges::findOverlaps(gene_ranges, snp_ranges, 
    select = "all")
  
  q <- queryHits(variantMatch)
  s <- subjectHits(variantMatch)
  
  genotypes <- genotypes[s, ]
  snp_id <- snp_id[s]
  gene_id_genotypes <- names(gene_ranges)[q]
  
  
  ### keep genes that are in counts and in genotypes
  genes2keep <- gene_id %in% gene_id_genotypes
  counts <- counts[genes2keep, , drop = FALSE]
  gene_id <- gene_id[genes2keep]
  feature_id <- feature_id[genes2keep]
  
  genes2keep <- gene_id_genotypes %in% gene_id
  genotypes <- genotypes[genes2keep, , drop = FALSE]
  gene_id_genotypes <- gene_id_genotypes[genes2keep]
  snp_id <- snp_id[genes2keep]
  
  ### order genes in counts and in genotypes
  if(class(gene_id) == "character")
    gene_id <- factor(gene_id, levels = unique(gene_id))
  
  order_counts <- order(gene_id)
  counts <- counts[order_counts, , drop = FALSE]
  gene_id <- gene_id[order_counts]
  feature_id <- feature_id[order_counts]
  
  gene_id_genotypes <- factor(gene_id_genotypes, levels = levels(gene_id))
  order_genotypes <- order(gene_id_genotypes)
  genotypes <- genotypes[order_genotypes, , drop = FALSE]
  gene_id_genotypes <- gene_id_genotypes[order_genotypes]
  snp_id <- snp_id[order_genotypes]
  
  colnames(counts) <- sample_id
  rownames(counts) <- feature_id
  
  colnames(genotypes) <- sample_id
  rownames(genotypes) <- snp_id
  
  inds_counts <- 1:length(gene_id)
  names(inds_counts) <- feature_id
  partitioning_counts <- split(inds_counts, gene_id)
  
  inds_genotypes <- 1:length(gene_id_genotypes)
  names(inds_genotypes) <- snp_id
  partitioning_genotypes <- split(inds_genotypes, gene_id_genotypes)
  
  counts <- new( "MatrixList", unlistData = counts, 
    partitioning = partitioning_counts)
  genotypes <- new( "MatrixList", unlistData = genotypes, 
    partitioning = partitioning_genotypes)
  
  ### Keep unique genotypes and create info about blocs
  inds <- 1:length(genotypes)
  
  blocks <- MatrixList(BiocParallel::bplapply(inds, blocks_per_gene, 
    genotypes = genotypes, BPPARAM = BPPARAM))
  
  names(blocks) <- names(genotypes)
  
  genotypes_u <- MatrixList(lapply(inds, function(g){
    # g = 1
    
    genotypes_tmp <- unique(genotypes[[g]])
    rownames(genotypes_tmp) <- paste0("block_", 1:nrow(genotypes_tmp))
    return(genotypes_tmp)
    
  }))
  
  names(genotypes_u) <- names(genotypes)
  samples <- data.frame(sample_id = sample_id)
  data <- new("dmSQTLdata", counts = counts, genotypes = genotypes_u, 
    blocks = blocks, samples = samples)
  
  return(data)
  
}


################################################################################
### dmFilter
################################################################################


#' @param minor_allele_freq Minimal number of samples where each of the 
#'   genotypes has to be present.
#' @param BPPARAM Parallelization method used by 
#'   \code{\link[BiocParallel]{bplapply}}.
#' @details
#' 
#' In QTL analysis, usually, we deal with data that has many more replicates 
#' than data from a standard differential usage assay. Our example data set 
#' consists of 91 samples. Requiring that genes are expressed in all samples may
#' be too stringent, especially since there may be missing values in the data 
#' and for some genes you may not observe counts in all 91 samples. Slightly 
#' lower threshold ensures that we do not eliminate such genes. For example, if 
#' \code{min_samps_gene_expr = 70} and \code{min_gene_expr = 10}, only genes 
#' with expression of at least 10 in at least 70 samples are kept. Samples with 
#' expression lower than 10 have \code{NA}s assigned and are skipped in the 
#' analysis of this gene. \code{minor_allele_freq} indicates the minimal number 
#' of samples for the minor allele presence. Usually, it is equal to roughly 5\%
#' of total samples.
#' 
#' @examples 
#' # --------------------------------------------------------------------------
#' # Create dmSQTLdata object
#' # --------------------------------------------------------------------------
#' # Use subsets of data defined in the GeuvadisTranscriptExpr package
#' 
#' library(GeuvadisTranscriptExpr)
#' \donttest{
#' geuv_counts <- GeuvadisTranscriptExpr::counts
#' geuv_genotypes <- GeuvadisTranscriptExpr::genotypes
#' geuv_gene_ranges <- GeuvadisTranscriptExpr::gene_ranges
#' geuv_snp_ranges <- GeuvadisTranscriptExpr::snp_ranges
#' 
#' colnames(geuv_counts)[c(1,2)] <- c("feature_id", "gene_id")
#' colnames(geuv_genotypes)[4] <- "snp_id"
#' geuv_samples <- data.frame(sample_id = colnames(geuv_counts)[-c(1,2)])
#' 
#' d <- dmSQTLdata(counts = geuv_counts, gene_ranges = geuv_gene_ranges,  
#'   genotypes = geuv_genotypes, snp_ranges = geuv_snp_ranges, 
#'   samples = geuv_samples, window = 5e3)
#' 
#' # --------------------------------------------------------------------------
#' # sQTL analysis - simple group comparison
#' # --------------------------------------------------------------------------
#' 
#' ## Filtering
#' d <- dmFilter(d, min_samps_gene_expr = 70, min_samps_feature_expr = 5,
#'   minor_allele_freq = 5, min_gene_expr = 10, min_feature_expr = 10)
#'   
#' plotData(d)
#' }
#' @rdname dmFilter
#' @export
setMethod("dmFilter", "dmSQTLdata", function(x, min_samps_gene_expr = 0, 
  min_samps_feature_expr = 0, min_samps_feature_prop = 0, 
  minor_allele_freq = 0.05 * nrow(samples(x)), 
  min_gene_expr = 0, min_feature_expr = 0, min_feature_prop = 0, 
  BPPARAM = BiocParallel::SerialParam()){
  
  stopifnot(min_samps_gene_expr >= 0 && 
      min_samps_gene_expr <= ncol(x@counts))
  stopifnot(min_gene_expr >= 0)
  stopifnot(min_samps_feature_expr >= 0 && 
      min_samps_feature_expr <= ncol(x@counts))
  stopifnot(min_feature_expr >= 0)
  stopifnot(min_samps_feature_prop >= 0 && 
      min_samps_feature_prop <= ncol(x@counts))
  stopifnot(min_feature_prop >= 0 && min_feature_prop <= 1)
  stopifnot(minor_allele_freq >= 1 && 
      minor_allele_freq <= floor(nrow(samples(x))/2))
  
  data_filtered <- dmSQTL_filter(counts = x@counts, genotypes = x@genotypes, 
    blocks = x@blocks, samples = x@samples, 
    min_samps_gene_expr = min_samps_gene_expr, 
    min_gene_expr = min_gene_expr, 
    min_samps_feature_expr = min_samps_feature_expr, 
    min_feature_expr = min_feature_expr, 
    min_samps_feature_prop = min_samps_feature_prop, 
    min_feature_prop = min_feature_prop,
    minor_allele_freq = minor_allele_freq, BPPARAM = BPPARAM)
  
  return(data_filtered)
  
})


###############################################################################
### plotData
###############################################################################


#' @param plot_type Character specifying which type of histogram to plot. Possible
#'   values \code{"features"}, \code{"snps"} or \code{"blocks"}.
#' @examples 
#' # --------------------------------------------------------------------------
#' # Create dmSQTLdata object
#' # --------------------------------------------------------------------------
#' # Use subsets of data defined in the GeuvadisTranscriptExpr package
#' 
#' library(GeuvadisTranscriptExpr)
#' \donttest{
#' geuv_counts <- GeuvadisTranscriptExpr::counts
#' geuv_genotypes <- GeuvadisTranscriptExpr::genotypes
#' geuv_gene_ranges <- GeuvadisTranscriptExpr::gene_ranges
#' geuv_snp_ranges <- GeuvadisTranscriptExpr::snp_ranges
#' 
#' colnames(geuv_counts)[c(1,2)] <- c("feature_id", "gene_id")
#' colnames(geuv_genotypes)[4] <- "snp_id"
#' geuv_samples <- data.frame(sample_id = colnames(geuv_counts)[-c(1,2)])
#' 
#' d <- dmSQTLdata(counts = geuv_counts, gene_ranges = geuv_gene_ranges,  
#'   genotypes = geuv_genotypes, snp_ranges = geuv_snp_ranges, 
#'   samples = geuv_samples, window = 5e3)
#' 
#' # --------------------------------------------------------------------------
#' # sQTL analysis - simple group comparison
#' # --------------------------------------------------------------------------
#' 
#' ## Filtering
#' d <- dmFilter(d, min_samps_gene_expr = 70, min_samps_feature_expr = 5,
#'   minor_allele_freq = 5, min_gene_expr = 10, min_feature_expr = 10)
#'   
#' plotData(d)
#' }
#' @rdname plotData
#' @export
setMethod("plotData", "dmSQTLdata", function(x, plot_type = "features"){
  
  stopifnot(length(plot_type) == 1)
  stopifnot(plot_type %in% c("features", "snps", "blocks"))
  
  switch(plot_type, 
    
    features = {
      
      tt <- elementNROWS(x@counts)
      ggp <- dm_plotDataFeatures(tt)
      
    },
    
    snps = {
      
      tt <- elementNROWS(x@blocks)
      ggp <- dm_plotDataSnps(tt)
      
    },
    
    blocks = {
      
      tt <- elementNROWS(x@genotypes)
      ggp <- dm_plotDataBlocks(tt)
      
    }
  )
  
  return(ggp)
  
})
Any scripts or data that you put into this service are public.
DRIMSeq documentation built on Nov. 8, 2020, 8:25 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
DRIMSeq
Differential transcript usage and tuQTL analyses with Dirichlet-multinomial model in RNA-seq

R/class_dmSQTLdata.R
In DRIMSeq: Differential transcript usage and tuQTL analyses with Dirichlet-multinomial model in RNA-seq

Defines functions dmSQTLdata blocks_per_gene

Documented in dmSQTLdata

Try the DRIMSeq package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DRIMSeq Differential transcript usage and tuQTL analyses with Dirichlet-multinomial model in RNA-seq

R/class_dmSQTLdata.R In DRIMSeq: Differential transcript usage and tuQTL analyses with Dirichlet-multinomial model in RNA-seq

Defines functions dmSQTLdata blocks_per_gene

Documented in dmSQTLdata

Try the DRIMSeq package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

DRIMSeq
Differential transcript usage and tuQTL analyses with Dirichlet-multinomial model in RNA-seq

R/class_dmSQTLdata.R
In DRIMSeq: Differential transcript usage and tuQTL analyses with Dirichlet-multinomial model in RNA-seq