This is not necessarily great.
library(opticut) library(dclone) library(rjags) source("~/repos/opticut/extras/calibrate.R") set.seed(234) n <- 50*4 K <- 4 g <- sample(1:K, n, replace=TRUE) x <- rnorm(n, 0, 1) z <- ifelse(g == 1, 1, 0) b0 <- 1 b1 <- 1.5 a <- 0.2 mu <- b0 + z*b1 + x*a #p <- plogis(mu) #y <- rbinom(n, 1, p) lam <- exp(mu) y <- rpois(n, lam) table(y, g) boxplot(lam ~ g) df <- data.frame(y=y, x=x, g=g) df <- df[-(1:5),] #o <- opticut(y ~ x, data=df, strata=g, dist="binomial") o <- opticut(y ~ x, data=df, strata=g, dist="poisson") summary(o) co <- calibrate(o, cbind("Sp 1"=(y[1:5])), model.matrix(~x, data.frame(x=x[1:5]))) y[1:5] g[1:5] co$gnew round(co$pi, 2)
We get g=1 cases correctly classified (this is where Sp1 is to be found), But the rest is just all equally probable. All what we see is that it is not g=1. All makes sense, but not very useful. Luckily for us, we usually have >1 species.
This should work much better in principle.
set.seed(234) n <- 50*4 K <- 4 g <- sample(1:K, n, replace=TRUE) g[1:K] <- 1:K x <- rnorm(n, 0, 1) z <- ifelse(g %in% 1:2, 1, 0) b0 <- 1 b1 <- 1.5 a <- 0.2 mu <- b0 + z*b1 + x*a lam <- exp(mu) y1 <- rpois(n, lam) z <- ifelse(g %in% 2:3, 1, 0) b0 <- 1 b1 <- 1.5 a <- 0.2 mu <- b0 + z*b1 + x*a lam <- exp(mu) y2 <- rpois(n, lam) y <- cbind(y1=y1, y2=y2) df <- data.frame(x=x, g=g) df <- df[-(1:5),] yy <- y[-(1:5),] o <- opticut(yy ~ x, data=df, strata=g, dist="poisson") summary(o) co <- calibrate(o, y[1:5,], df[1:5,]) y[1:5,] g[1:5] co$gnew round(co$pi, 2) v <- rnorm(n) xx <- lapply(1:ncol(yy), function(i) glm(y[,i] ~ as.factor(g) + v, family=poisson)) xnew <- model.matrix( ~v, data.frame(v=rnorm(5))) cxx <- ipredict.default(xx, y[1:5,], xnew, K=4)
data(dolina) dolina$samp$stratum <- as.integer(dolina$samp$stratum) Y <- dolina$xtab#[dolina$samp$method=="Q",] X <- dolina$samp#[dolina$samp$method=="Q",] inew <- 1:nrow(Y) %in% sample(nrow(Y), 10) Y <- Y[,colSums(Y[!inew,] > 0) >= 20] #Y <- ifelse(Y > 0, 1, 0) x <- X[!inew,] xnew <- dolina$samp[inew,] y <- Y[!inew,] ynew <- Y[inew,] o <- opticut(y ~ stratum + lmoist + method, data=x, strata=mhab, dist="poisson") #o <- opticut(y ~ 1, data=x, strata=mhab, dist="poisson") ## binary calibration cal <- calibrate(o, ynew, xnew) #cal <- calibrate(o, ynew) round(cal$pi,2) dolina$samp[inew,"mhab"] round(cor(cal$pi),2) data.frame(predicted=cal$gnew, truth=dolina$samp[inew,"mhab"], OK=cal$gnew == dolina$samp[inew,"mhab"]) cm <- table(pred=cal$gnew, true=X[inew,"mhab"]) cm <- cm[rownames(cm),rownames(cm)] cm sum(diag(cm)) / sum(cm) ## multinomial calibration using calibrate.default oo <- lapply(1:ncol(y), function(i) { glm(y[,i] ~ mhab + stratum + lmoist + method, x, family=poisson) }) xnew <- model.matrix(~ stratum + lmoist + method, xnew) cal <- calibrate(oo, ynew, xnew, K=4) round(cal$pi,2) round(cor(cal$pi),2) data.frame(predicted=levels(x$mhab)[cal$gnew], truth=dolina$samp[inew,"mhab"], OK=levels(x$mhab)[cal$gnew] == dolina$samp[inew,"mhab"]) cm <- table(pred=levels(x$mhab)[cal$gnew], true=X[inew,"mhab"]) cm <- cm[rownames(cm),rownames(cm)] cm sum(diag(cm)) / sum(cm)
library(optpart) data(shoshsite) data(shoshveg) #elev <- cut(shoshsite$elevation, breaks=c(0, 7200, 8000, 9000, 20000)) #levels(elev) <- c("low", "mid1", "mid2", "high") elev <- cut(shoshsite$elevation, breaks=c(0, 7800, 8900, 20000)) levels(elev) <- c("low", "mid", "high") sveg <- as.matrix(shoshveg) sveg[sveg > 0] <- 1 set.seed(1) ss <- seq_len(nrow(sveg)) %in% sample.int(nrow(sveg), floor(nrow(sveg) * 0.9)) sveg1 <- sveg[ss,] sveg1 <- sveg1[,colSums(sveg1) > 5] elev1 <- elev[ss] dim(sveg) dim(sveg1) sveg2 <- sveg[!ss, colnames(sveg1)] elev2 <- elev[!ss] o <- opticut(sveg1 ~ 1, strata=elev1, dist="binomial") plot(o, sort=1) co <- calibrate(o, sveg2) data.frame(predicted=co$gnew, truth=elev2, OK=co$gnew == elev2, round(co$pi,2)) sum(co$gnew == elev2) / nrow(sveg2)
library(rioja) data(SWAP) data(RLGH) spec <- SWAP$spec pH <- SWAP$pH core <- RLGH$spec age <- RLGH$depths$Age ph <- cut(pH, c(0, 5, 6, 14)) levels(ph) <- c("low", "mid", "high") sp <- as.matrix(ifelse(spec > 0, 1, 0)) set.seed(1) ss <- seq_len(nrow(sp)) %in% sample.int(nrow(sp), floor(nrow(sp) * 0.9)) sp1 <- sp[ss,] sp1 <- sp1[,colSums(sp1) > 0] ph1 <- ph[ss] dim(sp) dim(sp1) sp2 <- sp[!ss, colnames(sp1)] ph2 <- ph[!ss] o <- opticut(sp1 ~ 1, strata=ph1, dist="binomial") plot(o, sort=1) co <- calibrate(o, sp2) data.frame(predicted=co$gnew, truth=ph2, OK=co$gnew == ph2, round(co$pi,2)) sum(co$gnew == ph2) / nrow(sp2)
Inverse prediction might be a good metric of overall accuracy and scaling species contribution (i.e. the real indicator power).
Implement binary-multinomial-continuous comparison.
See how non-linear (unimodal, multimodal) responses affect inverse prediction
Somehow show how # species affects accuracy.
x is data class, y is classification
## simple binary confusion matrix cmat <- function(x, y, drop=TRUE) { out <- c( tp=sum(x * y), fp=sum((1-x) * y), fn=sum(x * (1-y)), tn=sum((1-x) * (1-y))) if (drop) out else array(out, c(2L, 2L), list(data=c("t", "f"), class=c("p", "n"))) } cmat(c(1,0,0,1,1,1,0,0,0,0), c(1,1,1,0,0,0,0,0,0,0)) cmat(c(1,0,0,1,1,1,0,0,0,0), c(1,1,1,0,0,0,0,0,0,0), drop=FALSE) x <- sample(LETTERS[1:4], 20, replace=TRUE) y <- x for (i in 1:length(x)) if (runif(1) > 0.1) y[i] <- sample(LETTERS[1:4], 1) mcm <- function(x, y, beta=1) { levs <- sort(unique(x)) N <- length(x) if (length(y) != N) stop("x and y lengths must match") if (length(setdiff(y, x)) > 0) warning("y has more levels than x") cm <- sapply(levs, function(z) { xx <- ifelse(x == z, 1, 0) yy <- ifelse(y == z, 1, 0) cmat(xx, yy) }) Stat <- c( Acc = mean((cm["tp",] + cm["fn",]) / N), Err = mean((cm["fp",] + cm["fn",]) / N), Prc_m = sum(cm["tp",]) / sum(cm["tp",] + cm["fp",]), Prc_M = mean(cm["tp",] / (cm["tp",] + cm["fp",])), Rec_m = sum(cm["tp",]) / sum(cm["tp",] + cm["fn",]), Rec_M = mean(cm["tp",] / (cm["tp",] + cm["fn",]))) Stat <- c(Stat, F_m = (beta^2 + 1) * Stat["Prc_m"] * Stat["Rec_m"] / (beta^2 * Stat["Prc_m"] + Stat["Rec_m"]), F_M = (beta^2 + 1) * Stat["Prc_M"] * Stat["Rec_M"] / (beta^2 * Stat["Prc_M"] + Stat["Rec_M"])) list(cmat=cm, stats=Stat, p=cm["tp",] / (cm["tp",] + cm["fp",]), r=cm["tp",] / (cm["tp",] + cm["fn",]), beta=beta) } mcm(x, y)
Might have to check valid levels (x, y, union)
Dolina LOO
data(dolina) dolina$samp$stratum <- as.integer(dolina$samp$stratum) Y <- dolina$xtab[dolina$samp$method=="Q",] X <- dolina$samp[dolina$samp$method=="Q",] Y <- Y[,colSums(Y > 0) >= 20] gnew <- character(0) pm <- matrix(NA, 0, 4) for (i in 1:nrow(Y)) { if (interactive()) { cat("\nRun", i, "of", nrow(Y), "\n") } ii <- seq_len(nrow(Y)) != i y_trn <- Y[ii,,drop=FALSE] y_new <- Y[!ii,,drop=FALSE] x_trn <- X[ii,,drop=FALSE] #x_new <- X[!ii,,drop=FALSE] o <- opticut(y_trn ~ 1, strata=x_trn$mhab, dist="poisson") cal <- calibrate(o, y_new) gnew <- c(gnew, cal$gnew) pm <- rbind(pm, cal$pi) } mcm(X$mhab, factor(gnew, levels(X$mhab))) tt <- table(X$mhab, factor(gnew, levels(X$mhab))) sum(diag(tt))/sum(tt)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.