In psolymos/opticut: Likelihood Based Optimal Partitioning and Indicator Species Analysis

Calibration with single species

This is not necessarily great.

library(opticut)
library(dclone)
library(rjags)
source("~/repos/opticut/extras/calibrate.R")

set.seed(234)
n <- 50*4
K <- 4
g <- sample(1:K, n, replace=TRUE)
x <- rnorm(n, 0, 1)
z <- ifelse(g == 1, 1, 0)
b0 <- 1
b1 <- 1.5
a <- 0.2
mu <- b0 + z*b1 + x*a
#p <- plogis(mu)
#y <- rbinom(n, 1, p)
lam <- exp(mu)
y <- rpois(n, lam)
table(y, g)
boxplot(lam ~ g)

df <- data.frame(y=y, x=x, g=g)
df <- df[-(1:5),]
#o <- opticut(y ~ x, data=df, strata=g, dist="binomial")
o <- opticut(y ~ x, data=df, strata=g, dist="poisson")
summary(o)

co <- calibrate(o, cbind("Sp 1"=(y[1:5])), model.matrix(~x, data.frame(x=x[1:5])))

y[1:5]
g[1:5]
co$gnew
round(co$pi, 2)

We get g=1 cases correctly classified (this is where Sp1 is to be found), But the rest is just all equally probable. All what we see is that it is not g=1. All makes sense, but not very useful. Luckily for us, we usually have >1 species.

Calibration with multiple species

This should work much better in principle.

set.seed(234)
n <- 50*4
K <- 4
g <- sample(1:K, n, replace=TRUE)
g[1:K] <- 1:K
x <- rnorm(n, 0, 1)
z <- ifelse(g %in% 1:2, 1, 0)
b0 <- 1
b1 <- 1.5
a <- 0.2
mu <- b0 + z*b1 + x*a
lam <- exp(mu)
y1 <- rpois(n, lam)

z <- ifelse(g %in% 2:3, 1, 0)
b0 <- 1
b1 <- 1.5
a <- 0.2
mu <- b0 + z*b1 + x*a
lam <- exp(mu)
y2 <- rpois(n, lam)
y <- cbind(y1=y1, y2=y2)

df <- data.frame(x=x, g=g)
df <- df[-(1:5),]
yy <- y[-(1:5),]
o <- opticut(yy ~ x, data=df, strata=g, dist="poisson")
summary(o)

co <- calibrate(o, y[1:5,], df[1:5,])

y[1:5,]
g[1:5]
co$gnew
round(co$pi, 2)

v <- rnorm(n)
xx <- lapply(1:ncol(yy), function(i) glm(y[,i] ~ as.factor(g) + v, family=poisson))
xnew <- model.matrix( ~v, data.frame(v=rnorm(5)))
cxx <- ipredict.default(xx, y[1:5,], xnew, K=4)

Dolina and calibration

data(dolina)
dolina$samp$stratum <- as.integer(dolina$samp$stratum)
Y <- dolina$xtab#[dolina$samp$method=="Q",]
X <- dolina$samp#[dolina$samp$method=="Q",]
inew <- 1:nrow(Y) %in% sample(nrow(Y), 10)
Y <- Y[,colSums(Y[!inew,] > 0) >= 20]
#Y <- ifelse(Y > 0, 1, 0)
x <- X[!inew,]
xnew <- dolina$samp[inew,]
y <- Y[!inew,]
ynew <- Y[inew,]
o <- opticut(y ~ stratum + lmoist + method, data=x, strata=mhab, dist="poisson")
#o <- opticut(y ~ 1, data=x, strata=mhab, dist="poisson")

## binary calibration
cal <- calibrate(o, ynew, xnew)
#cal <- calibrate(o, ynew)
round(cal$pi,2)
dolina$samp[inew,"mhab"]
round(cor(cal$pi),2)
data.frame(predicted=cal$gnew,
    truth=dolina$samp[inew,"mhab"],
    OK=cal$gnew == dolina$samp[inew,"mhab"])
cm <- table(pred=cal$gnew, true=X[inew,"mhab"])
cm <- cm[rownames(cm),rownames(cm)]
cm
sum(diag(cm)) / sum(cm)

## multinomial calibration using calibrate.default
oo <- lapply(1:ncol(y), function(i) {
    glm(y[,i] ~ mhab + stratum + lmoist + method, x, family=poisson)
})
xnew <- model.matrix(~ stratum + lmoist + method, xnew)
cal <- calibrate(oo, ynew, xnew, K=4)
round(cal$pi,2)
round(cor(cal$pi),2)
data.frame(predicted=levels(x$mhab)[cal$gnew],
    truth=dolina$samp[inew,"mhab"],
    OK=levels(x$mhab)[cal$gnew] == dolina$samp[inew,"mhab"])
cm <- table(pred=levels(x$mhab)[cal$gnew], true=X[inew,"mhab"])
cm <- cm[rownames(cm),rownames(cm)]
cm
sum(diag(cm)) / sum(cm)

library(optpart)

data(shoshsite)
data(shoshveg)

#elev <- cut(shoshsite$elevation, breaks=c(0, 7200, 8000, 9000, 20000))
#levels(elev) <- c("low", "mid1", "mid2", "high")
elev <- cut(shoshsite$elevation, breaks=c(0, 7800, 8900, 20000))
levels(elev) <- c("low", "mid", "high")

sveg <- as.matrix(shoshveg)
sveg[sveg > 0] <- 1

set.seed(1)
ss <- seq_len(nrow(sveg)) %in% sample.int(nrow(sveg), floor(nrow(sveg) * 0.9))
sveg1 <- sveg[ss,]
sveg1 <- sveg1[,colSums(sveg1) > 5]
elev1 <- elev[ss]
dim(sveg)
dim(sveg1)
sveg2 <- sveg[!ss, colnames(sveg1)]
elev2 <- elev[!ss]


o <- opticut(sveg1 ~ 1, strata=elev1, dist="binomial")
plot(o, sort=1)
co <- calibrate(o, sveg2)


data.frame(predicted=co$gnew,
    truth=elev2,
    OK=co$gnew == elev2,
    round(co$pi,2))
sum(co$gnew == elev2) / nrow(sveg2)

Diatom and pH calibration

library(rioja)
data(SWAP)
data(RLGH)
spec <- SWAP$spec
pH <- SWAP$pH
core <- RLGH$spec
age <- RLGH$depths$Age

ph <- cut(pH, c(0, 5, 6, 14))
levels(ph) <- c("low", "mid", "high")
sp <- as.matrix(ifelse(spec > 0, 1, 0))

set.seed(1)
ss <- seq_len(nrow(sp)) %in% sample.int(nrow(sp), floor(nrow(sp) * 0.9))
sp1 <- sp[ss,]
sp1 <- sp1[,colSums(sp1) > 0]
ph1 <- ph[ss]
dim(sp)
dim(sp1)
sp2 <- sp[!ss, colnames(sp1)]
ph2 <- ph[!ss]


o <- opticut(sp1 ~ 1, strata=ph1, dist="binomial")
plot(o, sort=1)
co <- calibrate(o, sp2)

data.frame(predicted=co$gnew,
    truth=ph2,
    OK=co$gnew == ph2,
    round(co$pi,2))
sum(co$gnew == ph2) / nrow(sp2)

Conclusions

More species, merrier results (see simple simulation)
Binary partitions might not be the best, especially for count models
Multinomial models (K levels in g) are more eficient (see dolina example)
Many species in binomial context work quite well, if K is not unreasonably high.

Todo

OK use notation: j=1...p & r=1...S
OK implement test cases for lm/glm
OK write internal function (.calibrate)
OK implement user interface: calibrate, calibrate.opticut(object, ynew, xnew, ...)
add NB, beta, ZI cases (ZI: inits!) [drop all the 1-species models!]
gaussian needs sig^2

Inverse prediction might be a good metric of overall accuracy and scaling species contribution (i.e. the real indicator power).

Implement binary-multinomial-continuous comparison.

See how non-linear (unimodal, multimodal) responses affect inverse prediction

Somehow show how # species affects accuracy.

Multiclass classification measures

x is data class, y is classification

## simple binary confusion matrix
cmat <- function(x, y, drop=TRUE) {
    out <- c(
        tp=sum(x * y),
        fp=sum((1-x) * y),
        fn=sum(x * (1-y)),
        tn=sum((1-x) * (1-y)))
    if (drop)
        out else array(out, c(2L, 2L), 
            list(data=c("t", "f"), class=c("p", "n")))
}
cmat(c(1,0,0,1,1,1,0,0,0,0), c(1,1,1,0,0,0,0,0,0,0))
cmat(c(1,0,0,1,1,1,0,0,0,0), c(1,1,1,0,0,0,0,0,0,0), drop=FALSE)

x <- sample(LETTERS[1:4], 20, replace=TRUE)
y <- x
for (i in 1:length(x))
    if (runif(1) > 0.1)
        y[i] <- sample(LETTERS[1:4], 1)
mcm <- function(x, y, beta=1) {
    levs <- sort(unique(x))
    N <- length(x)
    if (length(y) != N)
        stop("x and y lengths must match")
    if (length(setdiff(y, x)) > 0)
        warning("y has more levels than x")
    cm <- sapply(levs, function(z) {
        xx <- ifelse(x == z, 1, 0)
        yy <- ifelse(y == z, 1, 0)
        cmat(xx, yy)
    })
    Stat <- c(
        Acc = mean((cm["tp",] + cm["fn",]) / N),
        Err = mean((cm["fp",] + cm["fn",]) / N),
        Prc_m = sum(cm["tp",]) / sum(cm["tp",] + cm["fp",]),
        Prc_M = mean(cm["tp",] / (cm["tp",] + cm["fp",])),
        Rec_m = sum(cm["tp",]) / sum(cm["tp",] + cm["fn",]),
        Rec_M = mean(cm["tp",] / (cm["tp",] + cm["fn",])))
    Stat <- c(Stat, 
        F_m = (beta^2 + 1) * Stat["Prc_m"] * Stat["Rec_m"] / 
            (beta^2 * Stat["Prc_m"] + Stat["Rec_m"]),
        F_M = (beta^2 + 1) * Stat["Prc_M"] * Stat["Rec_M"] / 
            (beta^2 * Stat["Prc_M"] + Stat["Rec_M"]))
    list(cmat=cm, stats=Stat, 
        p=cm["tp",] / (cm["tp",] + cm["fp",]),
        r=cm["tp",] / (cm["tp",] + cm["fn",]),
        beta=beta)
}

mcm(x, y)

Might have to check valid levels (x, y, union)

Dolina LOO

data(dolina)
dolina$samp$stratum <- as.integer(dolina$samp$stratum)
Y <- dolina$xtab[dolina$samp$method=="Q",]
X <- dolina$samp[dolina$samp$method=="Q",]
Y <- Y[,colSums(Y > 0) >= 20]

gnew <- character(0)
pm <- matrix(NA, 0, 4)
for (i in 1:nrow(Y)) {
    if (interactive()) {
        cat("\nRun", i, "of", nrow(Y), "\n")
    }
    ii <- seq_len(nrow(Y)) != i
    y_trn <- Y[ii,,drop=FALSE]
    y_new <- Y[!ii,,drop=FALSE]
    x_trn <- X[ii,,drop=FALSE]
    #x_new <- X[!ii,,drop=FALSE]
    o <- opticut(y_trn ~ 1, strata=x_trn$mhab, dist="poisson")
    cal <- calibrate(o, y_new)
    gnew <- c(gnew, cal$gnew)
    pm <- rbind(pm, cal$pi)
}

mcm(X$mhab, factor(gnew, levels(X$mhab)))
tt <- table(X$mhab, factor(gnew, levels(X$mhab)))
sum(diag(tt))/sum(tt)

psolymos/opticut documentation built on Nov. 27, 2022, 11:29 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

psolymos/opticut
Likelihood Based Optimal Partitioning and Indicator Species Analysis

In psolymos/opticut: Likelihood Based Optimal Partitioning and Indicator Species Analysis

Calibration with single species

Calibration with multiple species

Dolina and calibration

Diatom and pH calibration

Conclusions

Todo

Multiclass classification measures

R Package Documentation

Browse R Packages

We want your feedback!

psolymos/opticut Likelihood Based Optimal Partitioning and Indicator Species Analysis

In psolymos/opticut: Likelihood Based Optimal Partitioning and Indicator Species Analysis

Calibration with single species

Calibration with multiple species

Dolina and calibration

Diatom and pH calibration

Conclusions

Todo

Multiclass classification measures

R Package Documentation

Browse R Packages

We want your feedback!

psolymos/opticut
Likelihood Based Optimal Partitioning and Indicator Species Analysis