Calibration with single species

This is not necessarily great.

library(opticut)
library(dclone)
library(rjags)
source("~/repos/opticut/extras/calibrate.R")

set.seed(234)
n <- 50*4
K <- 4
g <- sample(1:K, n, replace=TRUE)
x <- rnorm(n, 0, 1)
z <- ifelse(g == 1, 1, 0)
b0 <- 1
b1 <- 1.5
a <- 0.2
mu <- b0 + z*b1 + x*a
#p <- plogis(mu)
#y <- rbinom(n, 1, p)
lam <- exp(mu)
y <- rpois(n, lam)
table(y, g)
boxplot(lam ~ g)

df <- data.frame(y=y, x=x, g=g)
df <- df[-(1:5),]
#o <- opticut(y ~ x, data=df, strata=g, dist="binomial")
o <- opticut(y ~ x, data=df, strata=g, dist="poisson")
summary(o)

co <- calibrate(o, cbind("Sp 1"=(y[1:5])), model.matrix(~x, data.frame(x=x[1:5])))

y[1:5]
g[1:5]
co$gnew
round(co$pi, 2)

We get g=1 cases correctly classified (this is where Sp1 is to be found), But the rest is just all equally probable. All what we see is that it is not g=1. All makes sense, but not very useful. Luckily for us, we usually have >1 species.

Calibration with multiple species

This should work much better in principle.

set.seed(234)
n <- 50*4
K <- 4
g <- sample(1:K, n, replace=TRUE)
g[1:K] <- 1:K
x <- rnorm(n, 0, 1)
z <- ifelse(g %in% 1:2, 1, 0)
b0 <- 1
b1 <- 1.5
a <- 0.2
mu <- b0 + z*b1 + x*a
lam <- exp(mu)
y1 <- rpois(n, lam)

z <- ifelse(g %in% 2:3, 1, 0)
b0 <- 1
b1 <- 1.5
a <- 0.2
mu <- b0 + z*b1 + x*a
lam <- exp(mu)
y2 <- rpois(n, lam)
y <- cbind(y1=y1, y2=y2)

df <- data.frame(x=x, g=g)
df <- df[-(1:5),]
yy <- y[-(1:5),]
o <- opticut(yy ~ x, data=df, strata=g, dist="poisson")
summary(o)

co <- calibrate(o, y[1:5,], df[1:5,])

y[1:5,]
g[1:5]
co$gnew
round(co$pi, 2)

v <- rnorm(n)
xx <- lapply(1:ncol(yy), function(i) glm(y[,i] ~ as.factor(g) + v, family=poisson))
xnew <- model.matrix( ~v, data.frame(v=rnorm(5)))
cxx <- ipredict.default(xx, y[1:5,], xnew, K=4)

Dolina and calibration

data(dolina)
dolina$samp$stratum <- as.integer(dolina$samp$stratum)
Y <- dolina$xtab#[dolina$samp$method=="Q",]
X <- dolina$samp#[dolina$samp$method=="Q",]
inew <- 1:nrow(Y) %in% sample(nrow(Y), 10)
Y <- Y[,colSums(Y[!inew,] > 0) >= 20]
#Y <- ifelse(Y > 0, 1, 0)
x <- X[!inew,]
xnew <- dolina$samp[inew,]
y <- Y[!inew,]
ynew <- Y[inew,]
o <- opticut(y ~ stratum + lmoist + method, data=x, strata=mhab, dist="poisson")
#o <- opticut(y ~ 1, data=x, strata=mhab, dist="poisson")

## binary calibration
cal <- calibrate(o, ynew, xnew)
#cal <- calibrate(o, ynew)
round(cal$pi,2)
dolina$samp[inew,"mhab"]
round(cor(cal$pi),2)
data.frame(predicted=cal$gnew,
    truth=dolina$samp[inew,"mhab"],
    OK=cal$gnew == dolina$samp[inew,"mhab"])
cm <- table(pred=cal$gnew, true=X[inew,"mhab"])
cm <- cm[rownames(cm),rownames(cm)]
cm
sum(diag(cm)) / sum(cm)

## multinomial calibration using calibrate.default
oo <- lapply(1:ncol(y), function(i) {
    glm(y[,i] ~ mhab + stratum + lmoist + method, x, family=poisson)
})
xnew <- model.matrix(~ stratum + lmoist + method, xnew)
cal <- calibrate(oo, ynew, xnew, K=4)
round(cal$pi,2)
round(cor(cal$pi),2)
data.frame(predicted=levels(x$mhab)[cal$gnew],
    truth=dolina$samp[inew,"mhab"],
    OK=levels(x$mhab)[cal$gnew] == dolina$samp[inew,"mhab"])
cm <- table(pred=levels(x$mhab)[cal$gnew], true=X[inew,"mhab"])
cm <- cm[rownames(cm),rownames(cm)]
cm
sum(diag(cm)) / sum(cm)
library(optpart)

data(shoshsite)
data(shoshveg)

#elev <- cut(shoshsite$elevation, breaks=c(0, 7200, 8000, 9000, 20000))
#levels(elev) <- c("low", "mid1", "mid2", "high")
elev <- cut(shoshsite$elevation, breaks=c(0, 7800, 8900, 20000))
levels(elev) <- c("low", "mid", "high")

sveg <- as.matrix(shoshveg)
sveg[sveg > 0] <- 1

set.seed(1)
ss <- seq_len(nrow(sveg)) %in% sample.int(nrow(sveg), floor(nrow(sveg) * 0.9))
sveg1 <- sveg[ss,]
sveg1 <- sveg1[,colSums(sveg1) > 5]
elev1 <- elev[ss]
dim(sveg)
dim(sveg1)
sveg2 <- sveg[!ss, colnames(sveg1)]
elev2 <- elev[!ss]


o <- opticut(sveg1 ~ 1, strata=elev1, dist="binomial")
plot(o, sort=1)
co <- calibrate(o, sveg2)


data.frame(predicted=co$gnew,
    truth=elev2,
    OK=co$gnew == elev2,
    round(co$pi,2))
sum(co$gnew == elev2) / nrow(sveg2)

Diatom and pH calibration

library(rioja)
data(SWAP)
data(RLGH)
spec <- SWAP$spec
pH <- SWAP$pH
core <- RLGH$spec
age <- RLGH$depths$Age

ph <- cut(pH, c(0, 5, 6, 14))
levels(ph) <- c("low", "mid", "high")
sp <- as.matrix(ifelse(spec > 0, 1, 0))

set.seed(1)
ss <- seq_len(nrow(sp)) %in% sample.int(nrow(sp), floor(nrow(sp) * 0.9))
sp1 <- sp[ss,]
sp1 <- sp1[,colSums(sp1) > 0]
ph1 <- ph[ss]
dim(sp)
dim(sp1)
sp2 <- sp[!ss, colnames(sp1)]
ph2 <- ph[!ss]


o <- opticut(sp1 ~ 1, strata=ph1, dist="binomial")
plot(o, sort=1)
co <- calibrate(o, sp2)

data.frame(predicted=co$gnew,
    truth=ph2,
    OK=co$gnew == ph2,
    round(co$pi,2))
sum(co$gnew == ph2) / nrow(sp2)

Conclusions

Todo

Inverse prediction might be a good metric of overall accuracy and scaling species contribution (i.e. the real indicator power).

Implement binary-multinomial-continuous comparison.

See how non-linear (unimodal, multimodal) responses affect inverse prediction

Somehow show how # species affects accuracy.

Multiclass classification measures

x is data class, y is classification

## simple binary confusion matrix
cmat <- function(x, y, drop=TRUE) {
    out <- c(
        tp=sum(x * y),
        fp=sum((1-x) * y),
        fn=sum(x * (1-y)),
        tn=sum((1-x) * (1-y)))
    if (drop)
        out else array(out, c(2L, 2L), 
            list(data=c("t", "f"), class=c("p", "n")))
}
cmat(c(1,0,0,1,1,1,0,0,0,0), c(1,1,1,0,0,0,0,0,0,0))
cmat(c(1,0,0,1,1,1,0,0,0,0), c(1,1,1,0,0,0,0,0,0,0), drop=FALSE)

x <- sample(LETTERS[1:4], 20, replace=TRUE)
y <- x
for (i in 1:length(x))
    if (runif(1) > 0.1)
        y[i] <- sample(LETTERS[1:4], 1)
mcm <- function(x, y, beta=1) {
    levs <- sort(unique(x))
    N <- length(x)
    if (length(y) != N)
        stop("x and y lengths must match")
    if (length(setdiff(y, x)) > 0)
        warning("y has more levels than x")
    cm <- sapply(levs, function(z) {
        xx <- ifelse(x == z, 1, 0)
        yy <- ifelse(y == z, 1, 0)
        cmat(xx, yy)
    })
    Stat <- c(
        Acc = mean((cm["tp",] + cm["fn",]) / N),
        Err = mean((cm["fp",] + cm["fn",]) / N),
        Prc_m = sum(cm["tp",]) / sum(cm["tp",] + cm["fp",]),
        Prc_M = mean(cm["tp",] / (cm["tp",] + cm["fp",])),
        Rec_m = sum(cm["tp",]) / sum(cm["tp",] + cm["fn",]),
        Rec_M = mean(cm["tp",] / (cm["tp",] + cm["fn",])))
    Stat <- c(Stat, 
        F_m = (beta^2 + 1) * Stat["Prc_m"] * Stat["Rec_m"] / 
            (beta^2 * Stat["Prc_m"] + Stat["Rec_m"]),
        F_M = (beta^2 + 1) * Stat["Prc_M"] * Stat["Rec_M"] / 
            (beta^2 * Stat["Prc_M"] + Stat["Rec_M"]))
    list(cmat=cm, stats=Stat, 
        p=cm["tp",] / (cm["tp",] + cm["fp",]),
        r=cm["tp",] / (cm["tp",] + cm["fn",]),
        beta=beta)
}

mcm(x, y)

Might have to check valid levels (x, y, union)

Dolina LOO

data(dolina)
dolina$samp$stratum <- as.integer(dolina$samp$stratum)
Y <- dolina$xtab[dolina$samp$method=="Q",]
X <- dolina$samp[dolina$samp$method=="Q",]
Y <- Y[,colSums(Y > 0) >= 20]

gnew <- character(0)
pm <- matrix(NA, 0, 4)
for (i in 1:nrow(Y)) {
    if (interactive()) {
        cat("\nRun", i, "of", nrow(Y), "\n")
    }
    ii <- seq_len(nrow(Y)) != i
    y_trn <- Y[ii,,drop=FALSE]
    y_new <- Y[!ii,,drop=FALSE]
    x_trn <- X[ii,,drop=FALSE]
    #x_new <- X[!ii,,drop=FALSE]
    o <- opticut(y_trn ~ 1, strata=x_trn$mhab, dist="poisson")
    cal <- calibrate(o, y_new)
    gnew <- c(gnew, cal$gnew)
    pm <- rbind(pm, cal$pi)
}

mcm(X$mhab, factor(gnew, levels(X$mhab)))
tt <- table(X$mhab, factor(gnew, levels(X$mhab)))
sum(diag(tt))/sum(tt)


psolymos/opticut documentation built on Nov. 27, 2022, 11:29 a.m.