###### -- comprehensive and extensible rejection / retention function ---------
RejectionBy <- function(input,
criteria = list("fdr" = 1e-5,
"centroidthreshold" = list("globalpid" = 0.3),
"glmforbiddencolumns" = c("alitype"),
"lmforbiddencolumns" = c("response",
"alitype"),
"kargs" = list("max" = 15,
"scalar" = 4,
"unitnorm" = TRUE)),
rankby = "rawscore",
method = "direct",
supportedcolumns = c("consensus",
"kmerdist",
"featurediff",
"localpid",
"globalpid",
"matchcoverage",
"localscore",
"deltabackground",
"rawscore",
"response"),
dropinappropriate = FALSE) {
# print(criteria$fdr)
# current overhead checking
# function built for internal use, so no printing or progress bars
# return an integer vector identifying rows in the data to retain
if (!is(object = input,
class2 = "data.frame")) {
stop ("only data.frame inputs are supported")
}
# if (!all(colnames(input) %in% supportedcolumns)) {
# stop ("a column that is present is not supported")
# }
if (!(method %in% c("glm",
"kmeans",
"lm",
"direct",
"all"))) {
stop ("method is not supported")
}
# print(criteria$fdr)
check_this <- names(criteria)[!(names(criteria) == "fdr")]
# criteria has changed a bit and this isn't the right check to use anymore
# print(check_this)
# if (!all(check_this %in% colnames(input))) {
# stop ("all supplied criteria must be present in the input data")
# }
aliindex <- ifelse(test = input$alitype == "AA",
yes = "AA",
no = "other")
input <- input[, supportedcolumns]
if (method == "glm") {
# perform a knockout based rejection scheme with the spiked in negatives
# response column must exist, and must contain true/false values
# run a glm, rank predictions on the fitted value (probability of being true)
# and return all values above the fdr indicated in the arguments while dropping
# any false positives that are ranked above the fdr
w1 <- which(!input$response)
if (length(w1) < 1) {
stop ("spiked in negatives must be present for ranked rejection")
}
if (!is.null(criteria$glmforbiddencolumns)) {
# do nothing
} else {
# check whether the indicated columns are present
check_this <- which(colnames(input) %in% criteria$forbiddenglmcolumns)
# if *any* are, remove them
if (length(check_this) > 0) {
input <- input[, -check_this]
}
}
g <- glm(formula = response ~ .,
family = "quasibinomial",
data = input)
FDR <- criteria$fdr
# total number of positives
N <- sum(input$response)
# ordered values
ranking <- order(g$fitted.values,
decreasing = TRUE)
# return(ranking)
# take the ordered values up to the point where the FDR climbs past the tolerance
# then drop known falses from the set
rate <- cumsum(!input$response[ranking]) / seq(nrow(input))
# return(rate)
w2 <- min(which(rate >= FDR))
retained <- ranking[seq(nrow(input)) < w2]
retained <- retained[!(retained %in% w1)]
result <- sort(retained,
decreasing = FALSE)
} else if (method == "kmeans") {
# for now we're just going to do aa vs nt alignments
input$rawscore <- NormVec(input$rawscore)
# return(list("res" = input,
# "index" = aliindex))
input <- tapply(X = input,
INDEX = aliindex,
FUN = function(x) {
x
},
simplify = FALSE)
keylist <- vector(mode = "list",
length = length(input))
attrlist <- vector(mode = "list",
length = length(input))
# at this point we have a LIST
# of data.frames that whose rownames reference the original row order
# we have some overhead things to do and then we can just loop through these
# and return hits that pass
# response column must be removed if it exists,
# spiked in negatives are not necessary here and this must be tested for performance
# with both their inlcusion and exclusion
# a named criteria value must exist and be present
# run a kmeans routine analogous to the previous setups, with the addition of splitting
# nt and aa alignments apart,
# retain clusters with centroids above non-fdr criterias included
thresholdcolumn <- names(criteria$centroidthreshold)[1L]
thresholdval <- criteria$centroidthreshold[[1]]
# inside the running loop, we check whether the current input data.frame fits
# check the size of the objects and use the drop logical to choose whether to
# retain the entire set, or reject the entire set in cases where there are too few candidates
# for the unlabeled clustering
# return(input)
# this loops because i was trying to be forward compatible with giving alignment
# classes that describe the concordance or discordance between the assignments of
# features as coding or not, and their alignment in coding space or not...
# unsupervised clustering in this manner needs to run checks within those class groups
# as opposed to combining them ... that is the purpose
for (m1 in seq_along(input)) {
w1_drop <- which(colnames(input[[m1]]) %in% c("response"))
if (length(w1_drop) > 0) {
w1_logical <- !input[[m1]]$response
w1_integer <- which(w1_logical)
input[[m1]] <- input[[m1]][, -w1_drop]
} else {
# assume there are no negative spiked observations
# create empty values for later comparisons
w1_logical <- rep(FALSE, nrow(input[[m1]]))
w1_integer <- vector(mode = "integer",
length = 0L)
}
# kmeans needs to check for *unique* data points
rowcheck <- nrow(unique(input[[m1]])) <= criteria$kargs$max
if (dropinappropriate & rowcheck) {
# just drop everything
keylist[[m1]] <- vector(mode = "integer",
length = 0L)
} else if (!dropinappropriate & rowcheck) {
# just retain everything
keylist[[m1]] <- seq(nrow(input[[m1]]))
} else {
nclust <- seq(from = 2,
by = 1,
to = criteria$kargs$max)
kmc <- vector(mode = "list",
length = length(nclust))
for (m2 in seq_along(kmc)) {
kmc[[m2]] <- suppressWarnings(kmeans(x = input[[m1]],
centers = nclust[m2],
iter.max = 25L,
nstart = 25L))
# print(m2)
# print(nclust[m2])
}
wss <- vapply(X = kmc,
FUN = function(x) {
x$tot.withinss
},
FUN.VALUE = vector(mode = "numeric",
length = 1L))
# dat2 doesn't seem to behave correctly as a data.frame, so ensure that
# it's a matrix
dat1 <- cbind("n" = nclust,
"wss" = wss)
dat2 <- cbind("n" = nclust - 2,
"wss" = abs(wss - wss[1]))
fita <- nls(dat2[, 2L]~OneSite(X = dat2[, 1L],
Bmax,
Kd),
start = list(Bmax = max(dat2[, 2L]),
Kd = unname(quantile(dat2[, 1L], .25))))
fitasum <- summary(fita)
# fit is offset by -2L to plot and fit correctly, re-offset by +1 to select the correct
# list position
# because we're taking the ceiling here, we technically don't need to re-offset
evalclust <- ceiling((fitasum$coefficients["Kd", "Estimate"] + 1L) * criteria$kargs$scalar)
# print(evalclust)
# print(length(kmc))
# print(criteria$kargs$max)
if (evalclust >= criteria$kargs$max) {
warning("Evaluated clusters may be insufficient for this task.")
evalclust <- criteria$kargs$max - 1L
}
if (evalclust < 1L) {
warning("scalar selection requested a number of clusters less than 2, defaulting to 2 clusterings.")
evalclust <- 1L
}
# return(list("a" = fitasum,
# "b" = evalclust,
# "c" = kmc,
# "d" = thresholdcolumn,
# "e" = thresholdval))
kmcselect <- kmc[[evalclust]]
kmcidentifiers <- kmcselect$cluster
kmcretainclust <- which(kmcselect$centers[, thresholdcolumn] > thresholdval)
kmcretainobs <- which((kmcselect$cluster %in% kmcretainclust) & !w1_logical)
keylist[[m1]] <- rownames(input[[m1]])[kmcretainobs]
}
}
result <- as.integer(unlist(keylist))
} else if (method == "lm") {
# perform a knockout based rejection scheme with the spiked in negatives
# response column must be removed it if exists
# will calculate the fdr as above and nuke everything below the cutoff
# flip the logical for illogical reasons
FDR <- criteria$fdr
w1_logical <- !input$response
w1_integer <- which(w1_logical)
if (length(w1_integer) < 1) {
stop ("spiked in negatives must be present for ranked rejection")
}
if (is.null(criteria$lmforbiddencolumns)) {
# do nothing
} else {
# check whether the indicated columns are present
# we're automatically removing response here from the input, but it's been assigned
# as w1 above, so that's ok for now
check_this <- which(colnames(input) %in% criteria$lmforbiddencolumns)
# if *any* are, remove them
if (length(check_this) > 0) {
input <- input[, -check_this]
}
}
g <- lm(formula = rawscore ~ .,
data = input)
ranking <- order(g$fitted.values,
decreasing = TRUE)
rate <- cumsum(w1_logical[ranking]) / seq(nrow(input))
w2 <- min(which(rate >= FDR))
retained <- ranking[seq(nrow(input)) < w2]
retained <- retained[!(retained %in% w1_integer)]
result <- sort(retained)
} else if (method == "all") {
# run all methods and combine them
stop("not supported yet")
} else if (method == "direct") {
# simplest method, just rank the column the user asks for and drop based on the FDR
FDR <- criteria$fdr
ranking <- order(input[, rankby],
decreasing = TRUE)
rate <- cumsum(!input$response[ranking]) / seq(nrow(input))
w2 <- min(which(rate >= FDR))
retained <- ranking[seq(nrow(input)) < w2]
retained <- retained[!(retained %in% which(!input$response))]
result <- sort(retained,
decreasing = FALSE)
}
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.