Nothing
#' @title Noise Filtering
#' @description Given a matrix or data frame of count data, this function
#' estimates the size factors as follows: Each column is divided by the
#' geometric means of the rows. The median (or, if requested, another location
#' estimator) of these ratios (skipping the genes with a # geometric mean of
#' zero) is used as the size factor for this column. Source: DESeq package.
#' @param object \code{DISCBIO} class object.
#' @param percentile A numeric value of the percentile. It is used to validate
#' the ERCC spik-ins. Default is 0.8.
#' @param CV A numeric value of the coefficient of variation. It is used to
#' validate the ERCC spik-ins. Default is 0.5.
#' @param geneCol Color of the genes that did not pass the filtration.
#' @param FgeneCol Color of the genes that passt the filtration.
#' @param erccCol Color of the ERCC spik-ins.
#' @param Val A logical vector that allows plotting only the validated ERCC
#' spike-ins. Default is TRUE. If Val=FALSE will plot all the ERCC spike-ins.
#' @param plot A logical vector that allows plotting the technical noise.
#' Default is TRUE.
#' @param export A logical vector that allows writing the final gene list in
#' excel file. Default is TRUE.
#' @param quiet if `TRUE`, suppresses printed output
#' @param filename Name of the exported file (if `export=TRUE`)
#' @importFrom stats quantile var fitted.values pchisq p.adjust median aggregate
#' @importFrom graphics plot axis abline points lines
#' @importFrom statmod glmgam.fit
#' @note This function should be used only if the dataset has ERCC.
#' @return The DISCBIO-class object input with the noiseF slot filled.
#' @examples
#' sc <- DISCBIO(valuesG1msTest) # changes signature of data
#' sd_filtered <- NoiseFiltering(sc, export = FALSE)
#' str(sd_filtered)
#'
setGeneric(
name = "NoiseFiltering",
def = function(object, percentile = 0.8, CV = 0.3, geneCol = "yellow",
FgeneCol = "black", erccCol = "blue", Val = TRUE, plot = TRUE,
export = FALSE, quiet = FALSE,
filename = "Noise_filtering_genes_test") {
standardGeneric("NoiseFiltering")
}
)
#' @export
#' @rdname NoiseFiltering
setMethod(
f = "NoiseFiltering",
signature = "DISCBIO",
definition = function(
object, percentile, CV, geneCol, FgeneCol, erccCol, Val, plot, export,
quiet, filename
) {
if (!is.numeric(percentile)) {
stop("percentile has to be a positive number")
} else if (percentile <= 0) {
stop("percentile has to be a positive number")
}
if (!is.numeric(CV)) {
stop("CV has to be a positive number")
} else if (CV <= 0) {
stop("CV has to be a positive number")
}
# Split data into sub tables based on the factor data geneTypes
GeneList <- rownames(object@expdata)
data <- object@expdataAll
shortNames <- substr(rownames(data), 1, 4)
geneTypes <-
factor(c(ENSG = "ENSG", ERCC = "ERCC")[shortNames])
# calculate normalisation for counts\n",
countsG1ms <- data[which(geneTypes == "ENSG"), ]
countsERCC <- data[which(geneTypes == "ERCC"), ]
estimateSizeFactorsForMatrix <- function(counts, locfunc = median) {
loggeomeans <- rowMeans(log(counts))
apply(counts, 2, function(cnts) {
exp(locfunc((
log(cnts) - loggeomeans
)[is.finite(loggeomeans)]))
})
}
sfERCC <- estimateSizeFactorsForMatrix(countsERCC)
sfG1ms <- estimateSizeFactorsForMatrix(countsG1ms)
# Divide columns by size factors to normalize counts
nCountsERCC <- t(t(countsERCC) / sfERCC)
nCountsG1ms <- t(t(countsG1ms) / sfG1ms)
# perform fit, define sample moments per gene
meansG1ms <- rowMeans(nCountsG1ms)
varsG1ms <- apply(nCountsG1ms, 1, var)
cv2G1ms <- varsG1ms / meansG1ms^2
meansERCC <- rowMeans(nCountsERCC)
varsERCC <- apply(nCountsERCC, 1, var)
cv2ERCC <- varsERCC / meansERCC^2
minMeanForFit <- unname(
quantile(meansERCC[which(cv2ERCC > CV)], percentile)
)
if (!quiet) {
message(
"Cut-off value for the ERCCs= ",
round(minMeanForFit, digits = 2),
"\n"
)
}
# Perform the fit of technical noise strength on average count.
# We regress cv2HeLa on 1/meansForHeLa. We use the glmgam.fit function
# from the statmod package to perform the regression as a GLM fit of
# the gamma family with log link. The 'cbind' construct serves to
# produce a model matrix with an intercept.
useForFit <- meansERCC >= minMeanForFit
fit <- glmgam.fit(
cbind(a0 = 1, a1tilde = 1 / meansERCC[useForFit]),
cv2ERCC[useForFit]
)
if (!quiet) {
message("Coefficients of the fit:")
print(fit$coefficients)
}
# To get the actual noise coefficients, we need to subtract Xi
xi <- mean(1 / sfERCC)
a0 <- unname(fit$coefficients["a0"])
a1 <- unname(fit$coefficients["a1tilde"] - xi)
# how much variance does the fit explain?
residual <-
var(log(fitted.values(fit)) - log(cv2ERCC[useForFit]))
total <- var(log(cv2ERCC[useForFit]))
if (!quiet) {
message(
"Explained variances of log CV^2 values= ",
c(round(1 - residual / total, digits = 2)),
"\n"
)
}
## Pick out genes above noise line
# test which entries are above the line
idx_test <- cv2G1ms > (xi + a1) / meansG1ms + a0
# pick out genes that fulfill statement
genes_test <- GeneList[idx_test] # pick out genes
genes_test <- genes_test[!is.na(genes_test)] # remove na entries
meansG1ms_test <-
meansG1ms[idx_test] # take out mean values for fulfilled genes
meansG1ms_test <-
meansG1ms_test[!is.na(meansG1ms_test)] # remove na entries
cv2G1ms_test <-
cv2G1ms[idx_test] # take out cv2 values for fulfilled genes
cv2G1ms_test <-
cv2G1ms_test[!is.na(cv2G1ms_test)] # remove na entries
genes_test <- genes_test[which(!vapply(genes_test, is.null, FALSE))]
genes_test <- vapply(genes_test, paste0, collapse = "", "")
if (!quiet) {
message(
"Number of genes that passed the filtering = ",
length(genes_test),
"\n"
)
}
if (export) {
write.csv(genes_test, file = paste0(filename, ".csv"))
if (!quiet) {
message(
"The filtered gene list was saved as ",
paste0(filename, ".csv")
)
}
}
if (plot) {
plot(
NULL,
xaxt = "n",
yaxt = "n",
log = "xy",
xlim = c(1e-1, 3e5),
ylim = c(.005, 100),
main = "Gene filtration by accounting for technical noise",
xlab = "Average normalized read count",
ylab = "Squared coefficient of variation (CV^2)"
)
axis(
1,
10^(-1:5),
c(
"0.1",
"1",
"10",
"100",
"1000",
expression(10^4),
expression(10^5)
)
)
axis(2, 10^(-2:2), c("0.01", "0.1", "1", "10", "100"), las = 2)
abline(
h = 10^(-2:1),
v = 10^(-1:5),
col = "#D0D0D0",
lwd = 2
)
# Plot the genes, use a different color if they are highly variable
points(
meansG1ms,
cv2G1ms,
pch = 20,
cex = .2,
col = geneCol
)
# highlight gene list from test
points(
meansG1ms_test,
cv2G1ms_test,
pch = 20,
cex = .22,
col = FgeneCol
)
# Add the technical noise fit, as before
xg <- 10^seq(-2, 6, length.out = 1000)
lines(xg, (xi + a1) / xg + a0, col = "red", lwd = 5)
# Add the normalised ERCC points
if (Val) {
points(
meansERCC[useForFit],
cv2ERCC[useForFit],
pch = 20,
cex = 1.5,
col = erccCol
) # Showing only the valied ERCCs
} else {
points(
meansERCC,
cv2ERCC,
pch = 20,
cex = 2,
col = erccCol
) # Showing all the valied ERCCs
}
add_legend(
"topleft",
legend = c(
"Noise curve",
"ERCC spike-ins",
"Genes above the noise line"
),
pch = c(15, 20, 20),
col = c("red", erccCol, FgeneCol),
horiz = TRUE,
bty = "n",
cex = 0.85
)
}
object@noiseF <- genes_test
return(object)
}
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.