##############
### snpzip ###
##############
snpzip <- function(snps, y, plot=TRUE, xval.plot=FALSE, loading.plot=FALSE,
method=c("complete","single","average","centroid",
"mcquitty","median","ward"), ...) {
## dapc input prompts only SNP selection function
if(inherits(y, "dapc")){
dapc1 <- y
phen <- 0
if(xval.plot==TRUE){
warning("cross-validation not performed when x is a dapc object; xval.plot will not be shown")
xval.plot=FALSE
}
}
## snps, phen input prompts cross-validation, DAPC, and SNP selection functions
else{
if(missing(y)){
stop("phen argument needed")
}
phen <- y
}
if(!inherits(y, "dapc")){
################################################
######## Stratified Cross-Validation ###########
################################################
xvalDapc <- function(x, grp, n.pca.max = 200, n.da = NULL,
training.set = 0.9, result = "groupMean",
center = TRUE, scale = FALSE, n.pca = NULL, n.rep = 30, ...){
## CHECKS ##
grp <- factor(grp)
n.pca <- n.pca[n.pca>0]
n.da <- length(levels(grp))-1
if(missing(training.set)){
training.set <- 0.9}
else{
training.set <- training.set}
if(missing(n.rep)){
n.rep<-30}
else{
n.rep<-n.rep}
## GET TRAINING SET SIZE ##
N <- nrow(x)
groups <- levels(grp)
if(all(lapply(groups, function(e) sum(as.vector(unclass(grp==e))))>=10)==TRUE){
N.training <- round(N*training.set)}
else{
groups1 <- (levels(grp))[(as.vector(which.min((lapply(groups,
function(e) sum(as.vector(unclass(grp==e))))))))]
popmin <- length(which(grp%in%groups1))
training.set2 <- ((popmin - 1)/popmin)
N.training <- round(N*training.set2)}
## GET FULL PCA ##
if(missing(n.pca.max)) n.pca.max <- min(dim(x))
pcaX <- dudi.pca(x, nf=n.pca.max, scannf=FALSE, center=center, scale=scale)
n.pca.max <- min(n.pca.max, pcaX$rank, N.training-1)
## DETERMINE N.PCA IF NEEDED ##
if(n.pca.max < 10){
runs <- n.pca.max}
else{
runs<- 10}
if(is.null(n.pca)){
n.pca <- round(pretty(1:n.pca.max, runs))
}
n.pca <- n.pca[n.pca>0 & n.pca<(N.training-1)]
## FUNCTION GETTING THE % OF ACCURATE PREDICTION FOR ONE NUMBER OF PCA PCs ##
## n.pca is a number of retained PCA PCs
get.prop.pred <- function(n.pca){
f1 <- function(){
if(all(lapply(groups, function(e) sum(as.vector(unclass(grp==e))))>=10)==TRUE){
toKeep <- unlist(lapply(groups, function(e) sample(which(grp==e),
size=(round(training.set*sum(as.vector(unclass(grp==e))))))))}
else{
toKeep <- unlist(lapply(groups, function(e) sample(which(grp==e),
size=(round(training.set2*sum(as.vector(unclass(grp==e))))))))}
temp.pca <- pcaX
temp.pca$li <- temp.pca$li[toKeep,,drop=FALSE]
temp.dapc <- suppressWarnings(dapc(x[toKeep,,drop=FALSE], grp[toKeep],
n.pca=n.pca, n.da=n.da, dudi=temp.pca))
temp.pred <- predict.dapc(temp.dapc, newdata=x[-toKeep,,drop=FALSE])
if(result=="overall"){
out <- mean(temp.pred$assign==grp[-toKeep])
}
if(result=="groupMean"){
out <- mean(tapply(temp.pred$assign==grp[-toKeep], grp[-toKeep], mean), na.rm=TRUE)
}
return(out)
}
return(replicate(n.rep, f1()))
}
## GET %SUCCESSFUL OF ACCURATE PREDICTION FOR ALL VALUES ##
res.all <- unlist(lapply(n.pca, get.prop.pred))
xval <- data.frame(n.pca=rep(n.pca, each=n.rep), success=res.all)
n.pcaF <- as.factor(xval$n.pca)
successV <- as.vector(xval$success)
pca.success <- tapply(successV, n.pcaF, mean)
n.opt <- which.max(tapply(successV, n.pcaF, mean))
################################################
##### MSE Calculation and n.pca Selection #####
################################################
temp <- seq(from=1, to=length(xval$n.pca), by=n.rep)
orary <-c(temp+(n.rep-1))
index <-c(1:length(temp))
lins <-sapply(index, function(e) seq(from=temp[e], to=orary[e]))
lin <-c(1:ncol(lins))
col <-successV
cait<-sapply(lin, function(e) ((col[lins[,e]])-1)^2)
FTW <-sapply(lin, function(e) sum(cait[,e])/n.rep)
RMSE <- sqrt(FTW)
names(RMSE) <- xval$n.pca[temp]
best.n.pca <- names(which.min(RMSE))
################################################
################# DAPC #######################
################################################
n.pcaF <- as.factor(xval$n.pca)
successV <- as.vector(xval$success)
pca.success <- tapply(successV, n.pcaF, mean)
n.opt <- which.max(tapply(successV, n.pcaF, mean))
n.pca <- as.integer(best.n.pca)
n.da <- nlevels(grp)-1
dapc1 <- dapc(x, grp, n.pca=n.pca, n.da=n.da)
answerme <- list(n.da, n.pca, dapc1, xval, successV, RMSE, best.n.pca, pca.success, n.opt)
return(answerme)
}
x <- snps
grp <- phen
XVAL <- xvalDapc(x, grp, n.da=NULL, training.set=0.9, result="groupMean",
center=TRUE, scale=FALSE, n.pca=NULL, ...)
n.da <- XVAL[[1]]
n.pca <- XVAL[[2]]
dapc1 <- XVAL[[3]]
xval <- XVAL[[4]]
successV <- XVAL[[5]]
RMSE <- XVAL[[6]]
best.n.pca <- XVAL[[7]]
pca.success <- XVAL[[8]]
n.opt <- XVAL[[9]]
################################################
######## Show Cross-Validation Results #########
################################################
snps <- x
phen <- grp
if(xval.plot==TRUE){
par(ask=TRUE)
random <- replicate(300, mean(tapply(sample(phen)==phen, phen, mean)))
q.phen <- quantile(random, c(0.025,0.5,0.975))
smoothScatter(xval$n.pca, successV, nrpoints=Inf, pch=20, col=transp("black"),
ylim=c(0,1), xlab="Number of PCA axes retained",
ylab="Proportion of successful outcome prediction",
main="DAPC Cross-Validation")
abline(h=q.phen,lty=c(2,1,2))
xvalResults <- list(xval, q.phen, pca.success, (names(n.opt)), RMSE, best.n.pca)
names(xvalResults)[[1]] <- "Cross-Validation Results"
names(xvalResults)[[2]] <- "Median and Confidence Interval for Random Chance"
names(xvalResults)[[3]] <- "Mean Successful Assignment by Number of PCs of PCA"
names(xvalResults)[[4]] <- "Number of PCs Achieving Highest Mean Success"
names(xvalResults)[[5]] <- "Root Mean Squared Error by Number of PCs of PCA"
names(xvalResults)[[6]] <- "Number of PCs Achieving Lowest MSE"
print(xvalResults)
}
} # end of snps, phen section
################################################
############# Plot DAPC Results #############
################################################
if(plot==TRUE){
myCol <- colorRampPalette(c("blue", "gold", "red"))
scatter(dapc1, bg="white", scree.da=FALSE, scree.pca=TRUE, posi.pca="topright",
col=myCol((dapc1$n.da)+1), legend=TRUE, posi.leg="topleft")
title("DAPC")}
################################################
###### Select Cluster of Structural SNPS #######
################################################
if(missing(method)){
method <- "ward"
}
else{
method <- method}
selector <- function(dapc1, dimension){
z <- dapc1$var.contr[,dimension]
xTotal <- dapc1$var.contr[,dimension]
toto <- which(xTotal%in%tail(sort(xTotal), 2000))
z <- sapply(toto, function(e) xTotal[e])
D <- dist(z)
clust <- hclust(D,method)
pop <- factor(cutree(clust,k=2,h=NULL))
m <- which.max(tapply(z,pop,mean))
maximus <- which(pop==m)
maximus <- as.vector(unlist(sapply(maximus, function(e) toto[e])))
popvect <- as.vector(unclass(pop))
n.snp.selected <- sum(popvect==m)
sel.snps <- snps[,maximus]
selection <- c((ncol(snps)-ncol(snps[,-maximus])), ncol(snps[,-maximus]))
resultat <- list(selection, maximus, dimnames(sel.snps)[[2]], dapc1$var.contr[maximus, dimension])
names(resultat)[[1]] <- "Number of selected vs. unselected alleles"
names(resultat)[[2]] <- "List of selected alleles"
names(resultat)[[3]] <- "Names of selected alleles"
names(resultat)[[4]] <- "Contributions of selected alleles to discriminant axis"
gc()
return(resultat)
}
if(dapc1$n.da==1){
features <- selector(dapc1, dimension=1)
}
else{
dimensions <- c(1:dapc1$n.da)
features <- lapply(dimensions, function(e)
selector(dapc1, dimension=e))
}
################################################
#### Calculate success in discrimination ####
################################################
# overall
grp <- dapc1$grp
ass <- dapc1$assign
dapc.success.overall <- length(which(ass==grp)) / length(grp)
# by group
# give grp and ass numbered factor levels
GRP <- factor(grp, levels=levels(grp), labels=c(0:(nlevels(grp)-1)))
ASS <- factor(ass, levels=levels(ass), labels=c(0:(nlevels(ass)-1)))
# make each level of those factors into one element of a list
index <- c(0:(nlevels(grp) - 1))
ASSIGN <- sapply(index, function(e) which(ASS==e))
GROUP <- sapply(index, function(e) which(GRP==e))
index2 <- c(1:(nlevels(grp)))
dapc.success.byGroup <- sum(sapply(index2, function(e)
(length(which(ASSIGN[[e]]%in%GROUP[[e]]))) /
length(GROUP[[e]]))) / length(index2)
dapc.success <- c(dapc.success.overall, dapc.success.byGroup)
################################################
#### Loading Plot Delineating SNP Clusters ####
################################################
if(loading.plot==TRUE){
if(dapc1$n.da==1){
par(ask=TRUE)
maximus <- features[[2]]
decimus <- abs(dapc1$var.contr[maximus][(which.min(dapc1$var.contr[maximus]))])-0.000001
meridius <- loadingplot(dapc1$var.contr[,1], threshold=c(decimus))
}
else{
par(ask=TRUE)
# specify that you want to run the following lines for all DA (ie. from DA=1 to DA=(k-1))
DA <- c(1:dapc1$n.da)
# generate separate loading plots for each DA
for(i in DA){
title <- paste("Loading Plot for DA", i, sep=" ")
maximus <- features[[i]][[2]]
decimus <- abs(dapc1$var.contr[maximus,i][(which.min(dapc1$var.contr[maximus,i]))])-0.000001
meridius <- loadingplot(dapc1$var.contr[, i], threshold=decimus, main=title)
}
}
}
################################################
########## Return snpzip Results ##############
################################################
if(inherits(y, "dapc")){
answer <- list(dapc1$n.pca, features)
names(answer)[[1]] <- "Number of PCs of PCA retained"
names(answer)[[2]] <- "FS"
return(answer)
}
else{
answer <- list(best.n.pca, features, dapc.success, dapc1)
names(answer)[[1]] <- "Number of PCs of PCA retained"
names(answer)[[2]] <- "FS"
names(answer)[[3]] <- "Discrimination success overall & by group"
names(answer)[[4]] <- "DAPC"
return(answer)
}
par(ask=FALSE)
} # end snpzip
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.