In hbc/hbcABC: Rmarkdown templates for bioinformatic analysis

check_install = function(packages) {
   not_installed = setdiff(packages, rownames(installed.packages()))
   if (length(not_installed) > 0) {
      write(paste("The libraries", not_installed, "are not installed, aborting.",
                   sep = " "), stdout())
      stop()
   }
}

packages = c("NanoStringQCPro","ggplot2","pheatmap","dplyr","cowplot","scales",
             "DESeq2","DEGreport","tidyverse", "knitr")
check_install(packages)
installed = lapply(packages, library, character.only = TRUE)

pca_loadings = function(object, ntop=700) {
  rv <- matrixStats::rowVars(as.matrix(object))
  select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
      length(rv)))]
  pca <- prcomp(t(as.matrix(object)[select, ]))
  percentVar <- pca$sdev^2/sum(pca$sdev^2)
  names(percentVar) = colnames(pca$x)
  pca$percentVar = percentVar
  return(pca)}

# Colors for heatmaps: Generate same colors as ggplot
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

# Fields of View plot

plotFOV = function(eset) {
  pdat = pData(eset) %>%
    tibble::rownames_to_column() 
  pdat$pcounted = pdat$FovCounted/pdat$FovCount * 100
  ggplot(pdat, aes(rowname, pcounted)) + geom_point() +
      theme(axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            strip.text.x = element_text(size = 8)) +
    scale_y_continuous(expand = c(0,0)) +
    expand_limits(y = c(0,1.05 * max(pdat$pcounted))) +
    ylab("percentage of FOV counted") + xlab("sample") +
    geom_hline(yintercept = 75, color = "red")
}

# Binding density plot
plotBD = function(eset) {
  pdat = pData(eset) %>%
    tibble::rownames_to_column() 
  ggplot(pdat, aes(rowname, BindingDensity)) + geom_point() +
      theme(axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            strip.text.x = element_text(size = 8)) +
    scale_y_continuous(expand = c(0,0)) +
    expand_limits(y = c(0,1.05 * max(pdat$BindingDensity))) +
    ylab("Binding density") + xlab("sample") +
    geom_hline(yintercept = 0.05, color = "red") +
    geom_hline(yintercept = 2.25, color = "red")
}


# Predicates. Mainly usde by extract_pred

is_positive = function(column) {
  return(grepl("Pos", column))
}
is_negative = function(column) {
  return(grepl("Neg", column))
}
is_spikein = function(column) {
  return(grepl("Spike", column))
}
is_ligation = function(column) {
  return(grepl("Ligati", column))
}
is_housekeeping = function(column) {
  return(grepl("Housekee", column))
}

extract_pred = function(eset, predicate, counts=FALSE) {
  if (!counts) {
    counts = data.frame(Biobase::exprs(eset))
  } else {
    counts = eset
    }
  toplot = counts[predicate(rownames(counts)),] %>%
    tibble::rownames_to_column() %>%
    tidyr::gather("sample", "count", -rowname)
  colnames(toplot) = c("spot", "sample", "count")
  toplot = toplot 
  return(toplot)
}

boxplot_expr <- function(eset,predicate) {
    DF <- extract_pred(eset,predicate) %>%
        tidyr::separate(spot,c("a","b","c","d"),sep = "_",remove = F) %>%
        tidyr::unite(gene,b,c,sep = "_") %>%
        dplyr::select(gene,count)
    ggplot(DF,
        aes(x = gene,y = count)) + geom_boxplot(colour = "black", fill = "#56B4E9") +
        scale_y_continuous(trans = log2_trans()) +
        theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5)) +
        ylab("counts")
}

'%!in%' <- function(x,y)!('%in%'(x,y))


# Customized PCA plot

pca_plot_custom = function(pc,comps, nc1, nc2,colorby,shapeby=NULL,
                              highlight=NULL,textby=NULL,axisLimits=NULL, size = 3) {
  require(ggplot2)
  require(ggrepel)
  c1str = paste0("PC", nc1)
  c2str = paste0("PC", nc2)

  if (!is.null(shapeby)){
    scatter<-ggplot(comps, aes_string(c1str, c2str)) +
    geom_point(aes_string(c1str, c2str,color=colorby,shape=shapeby),size= size) + theme_bw() +
    xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), "% variance")) +
    ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 100), "% variance"))+
    theme(legend.text=element_text(size=12))+theme(legend.title=element_blank())
  }else{
    scatter<-ggplot(comps, aes_string(c1str, c2str)) +
    geom_point(aes_string(c1str, c2str,color=colorby),size= size) + theme_bw() +
    xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), "% variance")) +
    ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 100), "% variance")) +
    theme(legend.text=element_text(size=12))+theme(legend.title=element_blank())
  }
  if (!is.null(highlight)){
    scatter<-scatter+geom_label_repel(data=highlight, aes_string(c1str, c2str,label=textby), size=size, show.legend = FALSE,
                                      fontface = 'bold',
                                      box.padding = unit(0.25,"lines"),
                                      point.padding = unit(0.5,"lines"))
  }

  if(!is.null(axisLimits)){
    scatter<-scatter+xlim(axisLimits[[1]])+ylim(axisLimits[[2]])
  }
  scatter
}

# Load files and create RccSet object

rccFiles = list.files(params$rcc_folder,pattern = "*.RCC",full.names = TRUE)
eset = newRccSet(rccFiles = rccFiles, extraPdata = params$metadata_file)

data_dir <- file.path(params$output_dir,"data",Sys.Date(),"ncounts")
dir.create(data_dir,recursive = T)

Quality Control {.tabset}

FOV

The nCounter Digital Analyzer images each lane in discrete units, called fields of view (FOV). Optical issues, such as an inability to focus due to bubbles or insufficient oiling of the cartridge, can prevent successful imaging of a FOV. The Digital Analyzer reports the number of FOVs successfully imaged as FOV Counted. Significant discrepancy between the number of FOV for which imaging was attempted (FOV Count) and for which imaging was successful (FOV Counted) may be indicative of an issue with imaging performance. extract from

plotFOV(eset)

BD

The Binding Density is a measure of the number of optical features per square micron. It is useful for determining whether or not data collection has been compromised due to image saturation. Typically, the range for binding density will be between 0.05 and 2.25. extract from here

plotBD(eset)

Positive controls

Expression of the positive control genes.

boxplot_expr(eset,is_positive)

Negative controls

Expression of the negative control genes.

boxplot_expr(eset,is_negative)

Noise Threshold- Limit Of Detection (LOD).

We establish a noise threshold. This threshold is based on the mean and standard deviation of counts of the negative control genes and represents the background noise. We define it as the mean expression of the negative genes counts + 2 times the standard deviation.

lodcounts = extract_pred(eset, is_negative)
lod = mean(lodcounts$count) + 2 * sd(lodcounts$count)

Housekeeping genes

Expression of each housekeeping genes in all samples. The red line represents the noise threshold.

Expression

X <- boxplot_expr(eset,is_housekeeping)
X + geom_hline(yintercept = (lod),colour = "red")

Expression of all the housekeeping genes in each sample.

We plot the mean expression of all the housekeeping genes in each sample.

counts = Biobase::exprs(eset)
hk = counts[grepl("Housekeeping", rownames(counts)),]
hkDF <- as.data.frame(hk)
tidyHK <- tidyr::gather(hkDF)
colnames(tidyHK) <- c("sample","count")
ggplot(tidyHK, aes(sample, count)) + geom_boxplot(colour = "black", fill = "#56B4E9",outlier.size = 0.5) +
     scale_y_continuous(trans = log2_trans()) + 
     xlab("") + ggtitle("") + theme(axis.text.x = element_text(angle = 90, 
     hjust = 1,vjust = 0.5,size = 5)) +
    geom_smooth(se = T, aes(group = 1)) +
    geom_hline(yintercept = (lod),colour = "red") +
    ylab("counts")

General expression

Expression of all genes (Endogenous + Housekeeping).

counts = Biobase::exprs(eset)
endG = counts[grepl("Endogenous", rownames(counts)),]
hk = counts[grepl("Housekeeping", rownames(counts)),]
all <- rbind(endG,hk)
allDF <- as.data.frame(all)
tidyallDF <- tidyr::gather(allDF)
colnames(tidyallDF) <- c("sample","count")
ggplot(tidyallDF, aes(sample, count)) + geom_boxplot(colour = "black", fill = "#56B4E9",outlier.size = 0.5) +
     scale_y_continuous(trans = log2_trans()) + 
     xlab("") + ggtitle("") + theme(axis.text.x = element_text(angle = 90, 
     hjust = 1,vjust = 0.5,size = 5)) +
    geom_smooth(se = T, aes(group = 1)) +
    geom_hline(yintercept = lod,colour = "red")

Normalization

Positive Normalization {.tabset}

We perform a normalization using the expression of the positive genes. This attempts to normalize for technical noise across the samples.

Pre-norm

geo_pos = function(eset) {
     counts = Biobase::exprs(eset)
     hk = counts[grepl("Positive", rownames(counts)), ]
     geoMeans = apply(hk, 2, function(col) exp(mean(log(col[col != 0]))))
     return(geoMeans)
 }
posFactor = function(eset) {
     geoMeans = geo_pos(eset)
     nf = mean(geoMeans)/geoMeans
     return(nf)
}
pData(eset)$pos_nf = posFactor(eset)
counts = Biobase::exprs(eset)

counts = counts[!grepl("Positive", rownames(counts)),]
counts = counts[!grepl("Negative", rownames(counts)),]

prenorm = counts %>% data.frame() %>% tidyr::gather("sample", "count")
prenorm$sample <- gsub("X","",prenorm$sample)
ggplot(prenorm, aes(sample, count)) + geom_boxplot(colour = "black", fill = "#56B4E9",outlier.size = 0.5) +
     scale_y_continuous(trans = log2_trans()) + 
     xlab("") + ggtitle("pre-normalization") + theme(axis.text.x = element_text(angle = 90, 
     hjust = 1,vjust = 0.5,size = 5)) +    
    geom_smooth(se = T, aes(group = 1)) +
    geom_hline(yintercept = lod,colour = "red")

Post-norm

ncounts = counts %*% diag(pData(eset)$pos_nf)
colnames(ncounts) = colnames(counts)
postnorm = ncounts %>%
  data.frame() %>%
  tidyr::gather("sample", "count")
postnorm$sample <- gsub("X","",postnorm$sample)
ggplot(postnorm, aes(sample, count)) +
  geom_boxplot(colour = "black", fill = "#56B4E9",outlier.size = 0.5) +
  scale_y_continuous(trans = log2_trans()) +
  xlab("") + ggtitle("post-normalization") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5,size = 5)) +
    geom_smooth(se = T, aes(group = 1)) +
    geom_hline(yintercept = lod,colour = "red")

Housekeeping normalization {.tabset}

hk = ncounts[grepl("Housekeeping", rownames(ncounts)),]
abovenoise = rowSums(hk > (lod)) >= (ncol(hk))
hk_abovenoise = hk[abovenoise,]
aboveMean = (apply(hk_abovenoise,1,mean)) >= 200
hk_sel = hk_abovenoise[aboveMean,]
hk_norm = rownames(hk_sel)

We select thouse housekeeping genes that have expression values greater than the noise threshold and a mean value of expression of at least r params$hk_min_counts counts. From the r dim(hk)[1] original genes, r dim(hk_sel)[1] pass these criteria.

Pre-norm

ggplot(postnorm, aes(sample, count)) +
  geom_boxplot(colour = "black", fill = "#56B4E9",outlier.size = 0.5) +
  scale_y_continuous(trans = log2_trans()) +
  xlab("") + ggtitle("pre-normalization") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1,vjust = 0.5,size = 5)) +
    geom_smooth(se = T, aes(group = 1)) +
    geom_hline(yintercept = lod,colour = "red")

Post-norm

hk_pos = function(counts) {
  hk = counts[hk_norm,]
  geoMeans = apply(hk, 2, function(col) exp(mean(log(col[col != 0]))))
  return(geoMeans)}
hkFactor = function(counts) {
  geoMeans = hk_pos(counts)
  nf = mean(geoMeans) / geoMeans
  return(nf)}
pData(eset)$hk_nf = hkFactor(ncounts)
ncounts = ncounts %*% diag(pData(eset)$hk_nf)
colnames(ncounts) = colnames(counts)
postnorm = ncounts %>%
  data.frame() %>%
  tidyr::gather("sample", "count")
postnorm$sample <- gsub("X","",postnorm$sample)

ggplot(postnorm, aes(sample, count)) + geom_boxplot(colour = "black", fill = "#56B4E9",outlier.size = 0.5) +
     scale_y_continuous(trans = log2_trans()) + 
     xlab("") + ggtitle("") + theme(axis.text.x = element_text(angle = 90, 
     hjust = 1,vjust = 0.5,size = 5)) +
    geom_smooth(se = T, aes(group = 1)) +
    geom_hline(yintercept = lod,colour = "red")

Drop genes

We’ll drop the genes that are below the LOD in all the samples:

allNames <- rownames(ncounts)

ncounts = ncounts[((rowSums(ncounts <= lod)) < (ncol(ncounts))),]

filteredNames <- rownames(ncounts)
filterOutGenes <- allNames[allNames %!in% filteredNames]

From the original r length(allNames) genes, r length(filteredNames) passed this filter.

ncounts_df <- ncounts %>% as.data.frame() %>% tibble::rownames_to_column(var = "geneID")
rio::export(ncounts_df,file = file.path(data_dir,"normalized_counts.csv"))

Data structure

Once the data is normalized and filtered, we proceed to analyze how the expression distributes the samples.

PCA Plot{.tabset}

A PCA (Principal Component Analysis) performs a transformation over the data in order to obtain orthogonal vectors in such a way that the first principal component has the largest possible variance (that is, accounts for as much of the variability in the data as possible), and each succeeding component in turn has the highest variance possible under the constraint that it is orthogonal to the preceding components.

Plotting the samples in this transformed system show which samples are more similar.

ncounts <- ncounts %>% round()

dds = DESeqDataSetFromMatrix(countData = ncounts, colData = pData(eset), design = ~1)
sizeFactors(dds) = rep(1, ncol(ncounts))
dds = DESeq(dds)
rld <- rlog(dds)
rlogMat <- assay(rld)

pc = pca_loadings(rlogMat)
comps = data.frame(pc$x)
comps$sample = rownames(comps)
comps <- left_join(comps,pData(eset),by = c("sample" = "FileName"))

colorby = "sample"
shapeby = NULL

PC1 vs PC2

pca_plot_custom(pc,comps,1,2,colorby,shapeby)

PC3 vs PC4

pca_plot_custom(pc,comps,3,4,colorby,shapeby)

PC5 vs PC6

pca_plot_custom(pc,comps,5,6,colorby,shapeby)

Variance explained by component

ggplot(data.frame(component = reorder(names(pc$percentVar), -pc$percentVar),
                  percent_var = pc$percentVar),
       aes(component, percent_var)) + geom_bar(stat = 'identity') +
  ylab("percent of total variation") + xlab("") + theme_bw()

Covariates correlation with PCs

When multiple factors may influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. We adapted the method described by Daily et al. where they integrated a method to correlate covariates with principal components values to determine the importance of each factor.

DEGreport::degCovariates(rlogMat,pData(eset))

Correlation analysis between metadata elements.

We analyze if we observe any correlation between the metadata information.

DEGreport::degCorCov(pData(eset), show_heatmap_legend = FALSE)

PCA metadata {.tabset}

shapeby = "sample"
metadata_values <- names(comps)[!grepl("PC", names(comps))]

for (metadata_value in metadata_values) {
    colorby = metadata_value
    cat("\n###",metadata_value ,"\n")
    p <- customPlots::pca_plot_custom(pc,comps,1,2,colorby,shapeby)
    print(p)
    cat("\n")
}

Conclusions

R session info

sessionInfo()

hbc/hbcABC documentation built on Sept. 5, 2019, 9:51 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

hbc/hbcABC
Rmarkdown templates for bioinformatic analysis

In hbc/hbcABC: Rmarkdown templates for bioinformatic analysis

Quality Control {.tabset}

FOV

BD

Positive controls

Negative controls

Noise Threshold- Limit Of Detection (LOD).

Housekeeping genes

Expression

Expression of all the housekeeping genes in each sample.

General expression

Normalization

Positive Normalization {.tabset}

Pre-norm

Post-norm

Housekeeping normalization {.tabset}

Pre-norm

Post-norm

Drop genes

Data structure

PCA Plot{.tabset}

PC1 vs PC2

PC3 vs PC4

PC5 vs PC6

Variance explained by component

Covariates correlation with PCs

Correlation analysis between metadata elements.

PCA metadata {.tabset}

Conclusions

R session info

R Package Documentation

Browse R Packages

We want your feedback!

hbc/hbcABC Rmarkdown templates for bioinformatic analysis

In hbc/hbcABC: Rmarkdown templates for bioinformatic analysis

Quality Control {.tabset}

FOV

BD

Positive controls

Negative controls

Noise Threshold- Limit Of Detection (LOD).

Housekeeping genes

Expression

Expression of all the housekeeping genes in each sample.

General expression

Normalization

Positive Normalization {.tabset}

Pre-norm

Post-norm

Housekeeping normalization {.tabset}

Pre-norm

Post-norm

Drop genes

Data structure

PCA Plot{.tabset}

PC1 vs PC2

PC3 vs PC4

PC5 vs PC6

Variance explained by component

Covariates correlation with PCs

Correlation analysis between metadata elements.

PCA metadata {.tabset}

Conclusions

R session info

R Package Documentation

Browse R Packages

We want your feedback!

hbc/hbcABC
Rmarkdown templates for bioinformatic analysis