#' Make a membership probability plot using results from K-fold cross-validation (ggplot2 style)
#'
#' This function allows you to make a membership probability plot (stacked-bar plot) using results estimated from K-fold cross-validation.
#' @param dir A character string to specify the folder that has your K-fold cross-validation assignment results. A slash should be entered at the end.
#' @param style An option for output style. If style=1, it creates the plot which individuals on the x-axis are in random order. If style=2, individuals are sorted by probabilities within each population. If style=3, individuals of different folds are in seperate plots. If style=4, individuals are separated by fold and sorted by probability.
#' @param non.genetic A logical variable to specify if data are non-genetic. Set it TRUE if you're analyzing non-genetic alone.
#' @param plot.k A number to specify which K of the data set should be plotted. If not given, it will prompt the question.
#' @param plot.loci The proportion of training loci used in your K-fold cross-validation analysis. Specify one of the numbers here to skip question prompt.
#' @return This function returns a stacked-bar plot using the ggplot2 library. Users can modified the plot (e.g., change color, text, etc.) using functions provided by ggplot2 library.
#' @import ggplot2
#' @importFrom reshape2 melt
#' @importFrom stats reorder
#' @export
membership.plot <- function(dir=NULL, style=NULL, non.genetic=FALSE, plot.k=NULL, plot.loci=NULL){
Ind.ID <- NULL; value <- NULL; variable <- NULL #some NULL variable to handle R CMD check
#Read all "Out_*" file names in a specified directory
fileName_vec <- list.files(path=dir, pattern="Out_*")
fileName_vec <- sort(fileName_vec)
noFiles <- length(fileName_vec)#count number of files
#Read one of files and get pop names
result01 <- read.table(paste0(dir,fileName_vec[1]), header=T, stringsAsFactors=T)
pops <- names(result01)[4:length(names(result01))] #read column name and get the pop names between 4th to last column
noPops <- length(pops)#Number of pops
#create vectors for saving data
k_fold_vec <- NULL
train_loci_vec <- NULL
#Read through file name and collect k.fold and train.loci
for(i in 1:noFiles){
oneFileName <- unlist(strsplit(fileName_vec[i], split="_")) #split file name to 4 elements (e.g.,"Out" "0.1" "K3" "1.txt");"0.1"is train.loci,"K3"is k fold,"1"is fold
train_loci_vec[i] <- oneFileName[2]
k_fold_vec[i] <- gsub("K","",oneFileName[3])#Remove "k" character in the "K3" string
}
k.fold <- unique(k_fold_vec)#identify unique levels of K
train.loci <- unique(train_loci_vec)#identify unique levels of training loci
#Check if k.fold only has one level or multiple levels
if(length(k.fold) == 1){
ans_k <- k.fold
#when plot.k argument is given
}else if(is.numeric(plot.k) & length(plot.k) == 1){
plot.k <- as.integer(plot.k)
ans_k <- plot.k
if(!ans_k %in% k.fold){
stop("Your plot.k entry does not exist.")
}
}else{
cat("\n K = ");cat(paste0(k.fold," "));cat(" are found.") #print out detected "K" on console
ans_k <- readline(" Please enter one of the K numbers: ") #ask user to enter one of the K numbers
ans_k <- str_trim(ans_k, side="both") #clean any space
if(!ans_k %in% k.fold){
stop("Your entry is not correct.")
}
}
#Check training loci proportions (levels) or if it is non-genetic data only
if(length(train.loci) > 1){
if(is.numeric(plot.loci) & length(plot.loci) == 1){
ans_t <- plot.loci
if(!ans_t %in% train.loci){
stop("Your plot.loci entry is not correct.")
}
}else{
cat(paste0("\n ", length(train.loci)," proportions of training loci are found."))
cat("\n Levels[train.loci]: ");cat(paste0(train.loci," "))
ans_t <- readline(" Please enter one of the proportions: ")
ans_t <- str_trim(ans_t, side="both")
if(!ans_t %in% train.loci){
stop("Your entry is not correct.")
}
}
pltext <- paste0(" , training locus proportion = ",ans_t)
#If there is only one proportion of training loci or it is non-genetic data only
}else if(length(train.loci)==1){
if(non.genetic){
ans_t <- "N"
}else{
cat(paste0("\n Only one proportion of training loci is found."))
ans_t <- readline(" Do data include genetic loci? (enter Y/N): ")
ans_t <- str_trim(ans_t, side="both")
if(!toupper(ans_t) %in% c("N","Y","NO","YES")){
stop("Your entry is not correct.")
}
}
if(grepl(pattern="Y",toupper(ans_t))){
pltext <- paste0(" , training locus proportion = ",train.loci)
}else if(grepl(pattern="N",toupper(ans_t))){
pltext <- " " #If it is non-genetic dataset, assign nothing to plot title
}
}
#Read selected files
df_mas <- data.frame(matrix(ncol=0,nrow=0))
for(i in 1:ans_k){
if(length(train.loci) > 1){
oneFile <- read.table(paste0(dir,"Out_",ans_t,"_K",ans_k,"_",i,".txt"), header=T, stringsAsFactors=T)
}else if(length(train.loci)==1){
oneFile <- read.table(paste0(dir,"Out_",train.loci,"_K",ans_k,"_",i,".txt"), header=T, stringsAsFactors=T)
}
sampleSize <- nrow(oneFile)
fold_n <- rep(paste0("fold_",i),sampleSize)
oneFile <- cbind(oneFile, fold_n)
df_mas <- rbind(df_mas, oneFile)
}
if(is.null(style)){
cat("\n Finally, select one of the output styles.")
cat("\n [1] Random order (Individuals on x-axis are in random order)")
cat("\n [2] Sorted by probability (Individuals are sorted by probabilities within each group)")
cat("\n [3] Separated by fold (Individuals of different folds are in separate plots)")
cat("\n [4] Separated and Sorted (Individuals are separated by fold and sorted by probability)")
style <- readline(" Please enter 1, 2, 3, or 4: ")
style <- str_trim(style, side="both")
if(!style %in% c(1,2,3,4)){
stop("Your entry is not correct.")
}
}
if(style==2){ # Individuals are sorted based on the probability of their own populations
#Separate inds among pops and sort
df_mas_2 <- data.frame(matrix(ncol=0,nrow=0))
for(p in pops){
df_pop <- df_mas[which(df_mas$origin.pop==p),]#Subset samples for each pop based on the origin.pop variable
df_pop$Ind.ID <- as.factor(df_pop$Ind.ID) #convert ind id charater to factor data
df_pop$Ind.ID <- droplevels(df_pop$Ind.ID) #Drop unexisting ind id levels
df_pop$Ind.ID <- reorder(df_pop$Ind.ID, -df_pop[ ,p])#Reorder ind id levels based on the prob. of its pop
df_mas_2 <- rbind(df_mas_2, df_pop)
}
ndf <- melt(df_mas_2, id.vars=c("Ind.ID","origin.pop","pred.pop","fold_n"))#Reshape the data, making probabilities in one single column (var name="value")
stackplot <- ggplot(ndf, aes(x=Ind.ID, y=value, fill=variable))+
geom_bar(stat="identity", width=1)+ # width=1 allows no space between bars
#scale_fill_grey()+ # Make the bar color in grey scale
facet_grid( . ~ origin.pop, scales="free_x", space="free_x")+ #scales="free" allows each facet includes the data that exist; space="free" allows facet size being proportionally adjusted
ylab("Probability")+
labs(title=paste0("K = ",ans_k," ",pltext))+
coord_cartesian(ylim=c(0, 1.005))+ #add 0.005 on y to give tiny space between panel and facet strip
guides(fill=guide_legend(title=NULL))+ #Hiding title of legend
theme_bw()+
theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank(),#hiding grid of the panel
strip.background = element_rect(colour="black", fill="white", linetype="solid"),#change facet title background color
plot.title = element_text(hjust=0.5),
axis.title.x = element_blank(), axis.text.x = element_text(angle = 90))
#plot.title = element_text(size=16, vjust=0.8),
#legend.text = element_text(size=14),
#strip.text.x = element_text(size=16),
#axis.title.y = element_text(size=16), axis.text.y = element_text(size=14, colour="black"),
#axis.title.x = element_blank(), axis.text.x = element_text(angle=90, size=7) )
return(stackplot)
}else if(style==3){ #Individuals are separated by each fold
ndf <- melt(df_mas, id.vars=c("Ind.ID","origin.pop","pred.pop","fold_n"))#Reshape the data, making probabilities in one single column (var name="value")
stackplot <- ggplot(ndf, aes(x=Ind.ID, y=value, fill=variable))+
geom_bar(stat="identity", width=1)+ # width=1 allows no space between bars
#scale_fill_grey()+ # Make the bar color in grey scale
facet_grid( fold_n ~ origin.pop, scales="free_x", space="free_x")+ #scales="free" allows each facet includes the data that exist; space="free" allows facet size being proportionally adjusted
ylab("Probability")+
labs(title=paste0("K = ",ans_k," ",pltext))+
coord_cartesian(ylim=c(0, 1.005))+ #add 0.005 on y to give tiny space between panel and facet strip
guides(fill=guide_legend(title=NULL))+ #Hiding title of legend
theme_bw()+
theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank(),#hiding grid of the panel
strip.background = element_rect(colour="black", fill="white", linetype="solid"),#change facet title background color
plot.title = element_text(hjust=0.5),
axis.title.x = element_blank(), axis.text.x = element_text(angle = 90))
#plot.title = element_text(size=16, vjust=0.8),
#legend.text = element_text(size=14),
#strip.text.x = element_text(size=16),
#axis.title.y = element_text(size=16), axis.text.y = element_text(size=14, colour="black"),
#axis.title.x = element_blank(), axis.text.x = element_text(angle=90, size=7) )
return(stackplot)
}else if(style==4){ #Individuals are separated by fold and sorted by probability
df_mas_4 <- data.frame(matrix(ncol=0,nrow=0))
for(p in pops){
df_pop <- df_mas[which(df_mas$origin.pop==p),]#Subset samples for each pop based on the origin.pop variable
df_pop$Ind.ID <- as.factor(df_pop$Ind.ID) #convert ind id charater to factor data
df_pop$Ind.ID <- droplevels(df_pop$Ind.ID) #Drop unexisting ind id levels
df_pop$Ind.ID <- factor(df_pop$Ind.ID, levels=df_pop$Ind.ID[order(df_pop$fold_n, -df_pop[,p])], ordered=T)##Reorder samples by fold.k and then probability values of that pop
df_mas_4 <- rbind(df_mas_4, df_pop)
}
ndf <- melt(df_mas_4, id.vars=c("Ind.ID","origin.pop","pred.pop","fold_n"))#Reshape the data, making probabilities in one single column (var name="value")
stackplot <- ggplot(ndf, aes(x=Ind.ID, y=value, fill=variable))+
geom_bar(stat="identity", width=1)+ # width=1 allows no space between bars
#scale_fill_grey()+ # Make the bar color in grey scale
facet_grid( fold_n ~ origin.pop, scales="free_x", space="free_x")+ #scales="free" allows each facet includes the data that exist; space="free" allows facet size being proportionally adjusted
ylab("Probability")+
labs(title=paste0("K = ",ans_k," ",pltext))+
coord_cartesian(ylim=c(0, 1.005))+ #add 0.005 on y to give tiny space between panel and facet strip
guides(fill=guide_legend(title=NULL))+ #Hiding title of legend
theme_bw()+
theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank(),#hiding grid of the panel
strip.background = element_rect(colour="black", fill="white", linetype="solid"),#change facet title background color
plot.title = element_text(hjust=0.5),
axis.title.x = element_blank(), axis.text.x = element_text(angle = 90))
#plot.title = element_text(size=16, vjust=0.8),
#legend.text = element_text(size=14),
#strip.text.x = element_text(size=16),
#axis.title.y = element_text(size=16), axis.text.y = element_text(size=14, colour="black"),
#axis.title.x = element_blank(), axis.text.x = element_text(angle=90, size=7) )
return(stackplot)
}else {
ndf <- melt(df_mas, id.vars=c("Ind.ID","origin.pop","pred.pop","fold_n"))#Reshape the data, making probabilities in one single column (var name="value")
stackplot <- ggplot(ndf, aes(x=Ind.ID, y=value, fill=variable))+
geom_bar(stat="identity", width=1)+ # width=1 allows no space between bars
#scale_fill_grey()+ # Make the bar color in grey scale
facet_grid( . ~ origin.pop, scales="free_x", space="free_x")+ #scales="free" allows each facet includes the data that exist; space="free" allows facet size being proportionally adjusted
ylab("Probability")+
labs(title=paste0("K = ",ans_k," ",pltext))+
coord_cartesian(ylim=c(0, 1.005))+ #add 0.005 on y to give tiny space between panel and facet strip
guides(fill=guide_legend(title=NULL))+ #Hiding title of legend
theme_bw()+
theme(panel.grid.major = element_blank(), panel.grid.minor=element_blank(),#hiding grid of the panel
strip.background = element_rect(colour="black", fill="white", linetype="solid"),#change facet title background color
plot.title = element_text(hjust=0.5),
axis.title.x = element_blank(), axis.text.x = element_text(angle = 90))
#legend.text = element_text(size=14),
#strip.text.x = element_text(size=16),
#axis.title.y = element_text(size=16), axis.text.y = element_text(size=14, colour="black"),
#axis.title.x = element_blank(), axis.text.x = element_text(angle=90, size=7) )
return(stackplot)
}
} #End
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.