#' Test Illumina CpG Probes for potential problems
#'
#' Test Illumina CpG probes for potential problems, useful for removing hits that may be unlikely to be reproduced
#'
#' @aliases testProbes methylumi, testProbes methylumiSet, testProbes minfi, testProbes RGChannelSet, testProbes RGChannelSetExtended
#'
#' @param betas A matrix contain beta values, or a RGChannelSet(Extended), or a methylumiSet
#' @param manifest Specify which manifest to use
#' @param beadcounts Default: NULL, provide with a matrix containing beadcounts. Bead counts values will be used in calculation of nb for each probe.
#' @param detection Default: NULL, provide with a matrix containing detection P values.Detection p-values will be used in calculation of np for each probe.
#' @param nb A value between 0 and 1, the percentage of dataset a probe has failed in bead count. Probes failed in more than 20\% of datasets in terms of bead count will be filtered out. However, all probes can be tested by nb=0.
#' @param np A value between 0 and 1, the percentage of dataset a probe has failed in detection p-value. Probes failed in more than 20\% of datasets in terms of detection p-values will be filtered out. However, all probes can be tested by np=0.
#' @param nvar A value between 0 and 1, sample variance is used to filter out low variance having probes. Nvar parameter is adjusted to 0.5 which means the probe showing a low variation for more than 50\% of the time will be failed in test. The last two messages clarify how many type I and type II probes were determined within the 5\% in terms of variation.
#' @param ot a character vector representing the types of probes as I or II.
#' @param nbCount refers to physical bead counts to test the probes determined as lower than 3.
#' @param nbTresh a proportion of the sample. The TestProbes function filters probes when the nbCount is less than 3 (<3) and nbThresh > 5\% of the samples.
#' @param pvCount the detection p-value to test probes. Default is 0.05 (5\%).
#' @param pvThresh the detection p-value threshold, corresponding to > 0.01 value. Probes are failed when they show > 0.05 detection p-value in > 1\% of samples.
#' @param nvarThresh the density of variation a probe can be in.
#'
#' @return
#' For minfi package & beta-values: A data.frame of nrow(betas) x 3 columns specifying which probes have failed which tests.
#'
#' For methylumi package: MethylumiSet object. Analysis is stored under betas@featureData@data$Variation.
#' @examples
#' data("melon") #Sample dataset from wateRmelon package
#' betas <- betas(melon) #extracting beta values from melon object
#' test_betas <- testProbes(betas, ot = ot)
#'
#' library(minfiData)
#' data("RGsetEx")
#' test_RG <- testProbes(RGsetEx)
#'
#' data("melon")
#' test_melon <- testProbes(melon)
#' @author Tyler Gorrie-Stone \email{tgorri@essex.ac.uk}
#' @export
testProbes <- function(betas, manifest = c('450k', 'EPIC'), beadcounts = NULL, detection = NULL,
nb = .2, np = .2, nvar =.5, ot,
nbCount = 3, nbThresh = 0.05, pvCount = 0.05, pvThresh = 0.01, nvarThresh = 0.05){
manifest <- match.arg(manifest)
switch(manifest,
'450k' = {
data(melonProbes450k)
manifest <- melonProbes450k
},
'EPIC' = {
message('Potentially bad probes only identified for 450k data, only these will be tested')
data(melonProbesEpic)
manifest <- melonProbesEpic
}
)
manifest <- manifest[intersect(rownames(manifest), rownames(betas)),]
out <- list()
if(!is.null(beadcounts)){
# Test beadcounts
badbc <- manifest$PercBadBc >= nb
badbcRelative <- (rowSums(beadcounts[badbc,] <= nbCount, na.rm = TRUE) / ncol(beadcounts)) > 0.05
message(paste0(sum(badbcRelative), ' of ', sum(badbc), ' have a beadcount <= ', nbCount, ' in more than ', nbThresh*100 , '% of samples.'))
out[['BeadCounts']] = rownames(betas) %in% names(which(badbcRelative))
}
if(!is.null(detection)){
# Test p-values
badpv <- manifest$PercBadPv >= np
badpvRelative <- (rowSums(detection[badpv,] > pvCount, na.rm = TRUE) / ncol(detection)) > pvThresh
message(paste0(sum(badpvRelative), ' of ', sum(badpv), ' have a detection p > ', pvCount, ' in more than ', pvThresh*100 , '% of samples.'))
out[['DetectionP']] = rownames(betas) %in% names(which(badpvRelative))
}
# Test variance
# betas to test...
badvar <- manifest$PercLowVariation >= nvar
sds <- rowSds(betas, na.rm=T)
I <- ot == 'I'
II <- ot == 'II'
tofill <- vector('numeric', length(sds))
tofill[I] <- trunc(rank(sds[I]))/length(sds[I])
tofill[II] <- trunc(rank(sds[II]))/length(sds[II])
low <- tofill < nvarThresh
bad_lowi <- badvar & I & low
bad_lowii <- badvar & II & low
message(paste0(sum(bad_lowi), ' of ', sum(badvar & I), ' are ranked in the bottom ', nvarThresh*100, 'th percentile of Type I probes.'))
message(paste0(sum(bad_lowi), ' of ', sum(badvar & II), ' are ranked in the bottom ', nvarThresh*100, 'th percentile of Type II probes.'))
out[['Variation']] = low
# Test HW Snps
# Currently missing D:
df <- data.frame(out)
rownames(df) <- rownames(betas)
return(df)
}
setMethod(f= "testProbes",
signature(betas="RGChannelSetExtended"),
definition=function(betas,manifest = c('450k', 'EPIC'), beadcounts = NULL,
detection = NULL, nb = .2, np = .2, nvar = .5, ot,
nbCount = 3, nbThresh = 0.05, pvCount = 0.05,
pvThresh = 0.01, nvarThresh = 0.05)
{ betasP <- preprocessRaw(betas)
#betas of object
betRG <- getBeta(betasP)
#ot
ot <- got(betasP)
#beadcount
beadcount <- beadcount(betas)
#detectionP
dp <- detectionP(betas)
out <- testProbes(betas=betRG,manifest = c('450k', 'EPIC'),
beadcounts = beadcount, detection = dp, nb = nb, np = np,
nvar = nvar, ot = ot,nbCount = nbCount, nbThresh = nbThresh,
pvCount = pvCount, pvThresh = pvThresh,
nvarThresh = nvarThresh)
return(out)
}
)
setMethod( f="testProbes",
signature(betas="MethyLumiSet"),
definition = function(betas,manifest = c('450k', 'EPIC'), beadcounts = NULL,
detection = NULL, nb = .2, np = .2, nvar = .5, ot,
nbCount = 3, nbThresh = 0.05, pvCount = 0.05,
pvThresh = 0.01, nvarThresh = 0.05)
{
#betas of MethylumiSet object
bet <- betas(betas)
#probe types column name
ds <- fot(betas)
ot <- betas@featureData@data[,ds]
#beadcounts
if(exists("NBeads", assayData(betas))){
bc <- assayData(betas)$NBeads
bc[bc<3] <- NA
} else {
bc <- NULL
message("No beadCounts available")}
#detection
pv <- betas@assayData$pval
out <- testProbes(betas=bet,manifest = c('450k', 'EPIC'), beadcounts = bc,
detection = pv, nb = nb, np = np, nvar = nvar, ot=ot,
nbCount = nbCount, nbThresh = nbThresh, pvCount = pvCount,
pvThresh = pvThresh, nvarThresh = nvarThresh)
history.submitted <- as.character(Sys.time())
history.finished <-as.character(Sys.time())
history.command <- "Filtered by testProbes method (melonProbes)"
betas@history <- rbind(betas@history, data.frame(
submitted = history.submitted,
finished = history.finished,
command = history.command))
fData(betas) <- cbind(fData(betas),out)
return(betas)
})
#' Dataframe containing potentially bad probes for 450K microarray data
#'
#' Dataframe containing potentially bad probes for 450K microarray data
#'
#' @format A data frame with 800,000 rows and 9 variables:
#' \describe{
#' \item{Zhou_HG19}{Logical Vector of CpGs that are marked TRUE in Zhou's General Masking column for HG19}
#' \item{Zhou_HG38}{Logical Vector of CpGs that are marked TRUE in Zhou's General Masking column for HG38}
#' \item{PercLowVariation}{Numeric Vector describing the number of times a CpG is ranked in bottom 5th percentile of variation}
#' \item{PercBadBc}{Numeric Vector describing the number of times a CpG has a beadcount 4 in 5 of samples.}
#' \item{PercBadPv}{Numeric Vector describing the number of times a CpG has a detection p value 0.05 in 1 of samples.}
#' }
#'
#' @usage data(melonProbes450k)
#'
#' @examples
#' data(melonProbes450k)
#'
#' @source
#' \url{http://zwdzwd.github.io/InfiniumAnnotation}
#'
#' Pidsley, R., Zotenko, E., Peters, T.J. et al. Critical evaluation of the Illumina MethylationEPIC BeadChip microarray for whole-genome DNA methylation profiling. Genome Biol 17, 208 (2016) doi:10.1186\/s13059-016-1066-1
"melonProbes450k"
#' Dataframe containing potentially bad probes for 450K microarray data
#'
#' Dataframe containing potentially bad probes for EPIC microarray data
#'
#' @format A data frame with 800,000 rows and 9 variables:
#' \describe{
#' \item{Zhou_HG19}{Logical Vector of CpGs that are marked TRUE in Zhou's General Masking column for HG19}
#' \item{Zhou_HG38}{Logical Vector of CpGs that are marked TRUE in Zhou's General Masking column for HG38}
#' \item{Pidsley_CrossReactive}{Logical Vector of CpGs that are marked TRUE if identified to be cross-reactive by Pidsley et al.}
#' \item{Pidsley_GeneVariant_Body}{Logical Vector of CpGs that are marked TRUE if identified to have a SNP variant within the body of the probe by Pidsley et al.}
#' \item{Pidsley_Overlap_Gene}{Logical Vector of CpGs that are marked TRUE if identified to have multiple genes mapping to the same CpG by Pidsley et al.}
#' \item{Pidsley_GeneVariant_Extension}{Logical Vector of CpGs that are marked TRUE if identified to have a SNP in the probe single base extension identified by Pidsley et al.}
#' \item{PercLowVariation}{Numeric Vector describing the number of times a CpG is ranked in bottom 5th percentile of variation}
#' \item{PercBadBc}{Numeric Vector describing the number of times a CpG has a beadcount 4 in 5 of samples.}
#' \item{PercBadPv}{Numeric Vector describing the number of times a CpG has a detection p value 0.05 in 1 of samples.}
#' }
#'
#' @usage data(melonProbesEpic)
#'
#' @examples
#' data(melonProbesEpic)
#'
#' @source
#' \url{http://zwdzwd.github.io/InfiniumAnnotation}
#'
#' Pidsley, R., Zotenko, E., Peters, T.J. et al. Critical evaluation of the Illumina MethylationEPIC BeadChip microarray for whole-genome DNA methylation profiling. Genome Biol 17, 208 (2016) doi:10.1186/s13059-016-1066-1
"melonProbesEpic"
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.