load_all("~/Projects/ProgenyArray") library(rmarkdown) library(Rcpp) library(parallel) library(caret) library(reshape2) options(mc.cores=4) set.seed(42)
sf <- sibFamily(10, 1000)
How do these separate in principal component space?
p <- prcomp(t(sf$progeny)) d <- data.frame(x=p$x[,1], y=p$x[,2], mom=as.factor(sf$metadata$mgamete), type=sf$metadata$type) ggplot(d) + geom_point(aes(x=x, y=y, color=mom, shape=type)) + xlab("PC 1") + ylab("PC 2")
Let's use k-means to look at accuracy. K-means is non-deterministic (random initial positions), so we will run 1000 replicates. Cluster labels are arbitrary, so we swap them when the alternate label is closer.
switchLabels <- function(inferred, real) { switch <- which.max(table(inferred, real)[1, ]) != 1 if (switch) return(ifelse(inferred == 1, 2, 1)) inferred } kmeansAccuracy <- function(geno, real) { cm <- confusionMatrix(switchLabels(kmeans(geno, 2)$cluster, real), real) cm$overall['Accuracy'] } famsize <- rep(5:70, each=3) acc <- sapply(famsize, function(n) { sf <- sibFamily(n, 1000) kmeansAccuracy(t(sf$progeny), sf$metadata$mgamete) }) qplot(x=famsize, y=acc) + ylab("accuracy") + xlab("family size") + geom_smooth(se=FALSE)
What is being misclassified? These are all heterozygous selfed progeny, where the true cluster is either.
incor <- sapply(famsize, function(n) { sf <- sibFamily(n, 1000) l <- switchLabels(kmeans(t(sf$progeny), 2)$cluster, sf$metadata$mgamete) nwrong <- sum(l != sf$metadata$mgamete) # how many cases are wrong, but heterzygous selfs x <- with(sf$metadata, l != mgamete & type == "selfed" & mother != father) sum(x)/nwrong })
haploImputeWindow <- function(geno, parent_geno, fathers, freqs, fun=kmeans, ehet=0.8, ehom=0.1) { geno_na <- na.omit(geno) if (nrow(geno_na) < 1) { warning("cannot run k-means on this window: all loci contian NA values") return(NULL) } nremoved <- length(na.action(geno_na)) #if (nremoved > 1) warning(sprintf("%d ind") km <- fun(t(geno_na), 2) hap_geno <- list(geno[, km$cluster == 1], geno[, km$cluster == 2]) out <- lapply(1:2, function(which_hap) { # iterate haplotype imputation for each cluster hap_geno <- geno[, km$cluster == which_hap, drop=FALSE] hap_fathers <- fathers[km$cluster == which_hap] ll <- lapply(1:nrow(hap_geno), function(i) { allele_ll(hap_geno[i, ], ehet, ehom, freqs[i], parent_geno[cbind(i, hap_fathers)]) }) do.call(rbind, ll) }) } # example run: ll <- haploImputeWindow(sf$progeny, sf$geno, sf$metadata$father, sf$freqs, ehet=0.1, ehom=0.1) mom <- sf$haplos[[1]] h1 <- apply(ll[[1]], 1, which.max) - 1L h2 <- apply(ll[[2]], 1, which.max) - 1L
Variable family size.
famsizes <- c(5, seq(10, 80, 10)) hamming <- function(x, y) sum(x != y) hapConfusion <- function(inferred, real) { if (hamming(inferred[, 1], real[, 1]) < hamming(inferred[, 1], real[, 2])) { out <- list(confusionMatrix(inferred[, 1], real[, 1]), confusionMatrix(inferred[, 2], real[, 2])) } else { out <- list(confusionMatrix(inferred[, 1], real[, 2]), confusionMatrix(inferred[, 2], real[, 1])) } return(out) } accs <- lapply(famsizes, function(fs) { replicate(10, { sf <- sibFamily(fs, 1000) ll <- haploImputeWindow(sf$progeny, sf$geno, sf$metadata$father, sf$freqs, ehet=0.1, ehom=0.1) mom <- sf$haplos[[1]] h1 <- apply(ll[[1]], 1, which.max) - 1L h2 <- apply(ll[[2]], 1, which.max) - 1L cm <- hapConfusion(cbind(h1, h2), mom) sapply(cm, function(x) x$overall['Accuracy']) }) }) # each list element is a matrix, with two rows, each row both haplotypes' # accuracies. So we munge: reshapeReplicatesSimResults <- function(results, var, varname) { tmp <- do.call(rbind, results) d <- as.data.frame(tmp, row.names=1:nrow(tmp)) d[, varname] <- rep(var, each=2) melt(d, id=varname)[, c(varname, 'value')] } d <- reshapeReplicatesSimResults(accs, famsizes, "famsize") p <- ggplot(d, aes(x=famsize, y=value)) + geom_point() + geom_smooth(se=FALSE) p <- p + xlab("family size") + ylab("imputation accuracy") print(p)
or using boxplot:
p <- ggplot(d) + geom_boxplot(aes(x=as.factor(famsize), y=value)) p <- p + xlab("family size") + ylab("imputation accuracy") p <- p + scale_y_continuous(limits=c(0, 1)) print(p)
From the cclust
package.
famsizes <- c(5, seq(10, 80, 10)) hapConfusion <- function(inferred, real) { if (hamming(inferred[, 1], real[, 1]) < hamming(inferred[, 1], real[, 2])) { out <- list(confusionMatrix(inferred[, 1], real[, 1]), confusionMatrix(inferred[, 2], real[, 2])) } else { out <- list(confusionMatrix(inferred[, 1], real[, 2]), confusionMatrix(inferred[, 2], real[, 1])) } return(out) } accs <- lapply(famsizes, function(fs) { replicate(10, { sf <- sibFamily(fs, 1000) llkm <- haploImputeWindow(sf$progeny, sf$geno, sf$metadata$father, sf$freqs, ehet=0.1, ehom=0.1) llcc <- haploImputeWindow(sf$progeny, sf$geno, sf$metadata$father, sf$freqs, ehet=0.1, ehom=0.1) mom <- sf$haplos[[1]] h1km <- apply(llkm[[1]], 1, which.max) - 1L h2km <- apply(llkm[[2]], 1, which.max) - 1L h1cc <- apply(llcc[[1]], 1, which.max) - 1L h2cc <- apply(llcc[[2]], 1, which.max) - 1L cmkm <- hapConfusion(cbind(h1km, h2km), mom) cmcc <- hapConfusion(cbind(h1cc, h2cc), mom) setNames(sapply(c(cmkm, cmcc), function(x) x$overall['Accuracy']), c("km", "km", "cc", "cc")) }) }) # munge data together tmp <- do.call(rbind, accs) d <- as.data.frame(tmp, row.names=1:nrow(tmp)) d$method <- rep(c("kmeans", "kmeans", "cclust", "cclust"), rep=nrow(tmp)/4) # FIX d$famsize <- rep(famsizes, each=4) dm <- melt(d, id=c("method", "famsize"))[, c("famsize", "method", "value")] ggplot(dm, aes(x=famsize, y=value, color=method)) + geom_point() + geom_smooth(se=FALSE) }
Therefore, not much of a difference - we use cclust because it's faster and has better prediction abilities:
# on large sib family 100,000 loci system.time(kmeans(t(sf$genos), 2)) user system elapsed 0.189 0.012 0.201 system.time(cclust(t(sf$genos), 2)) user system elapsed 0.041 0.003 0.043
missing <- seq(0.01, 0.15, by=0.01) na_accs <- lapply(missing, function(m) { replicate(10, { sf <- sibFamily(30, 1000, na_rate=m) ll <- haploImputeWindow(sf$progeny, sf$geno, sf$metadata$father, sf$freqs, ehet=0.1, ehom=0.1) mom <- sf$haplos[[1]] h1 <- apply(ll[[1]], 1, which.max) - 1L h2 <- apply(ll[[2]], 1, which.max) - 1L cm <- hapConfusion(cbind(h1, h2), mom) sapply(cm, function(x) x$overall['Accuracy']) }) }) # each list element is a matrix, with two rows, each row both haplotypes' # accuracies. So we munge: d <- reshapeReplicatesSimResults(na_accs, missing, "missing") p <- ggplot(d, aes(x=missing, y=value)) + geom_point() + geom_smooth(se=FALSE, color="red") p <- p + xlab("proportion missing at random") + ylab("imputation accuracy") + ggtitle("haplotype imputation accuracy (famsize=30)") print(p)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.