knitr::opts_chunk$set(echo = TRUE) ## load necessary libraries library(NILC) invisible( library(tidyverse) ) library(readxl) library(corrplot) library(RColorBrewer) library(viridis) library(gridExtra) library(car) #library(cowplot)
First how do we define and/or measure performance ?
What are some of the variable that we think may influence performance?
####################### ## Read in the data ####################### n = excel_sheets("data/SupplementaryMaterial2.xlsx") ## mydata = sapply(1:length(n), function(x){ read_excel("data/SupplementaryMaterial2.xlsx", sheet = x) }) names(mydata) = n ####################### ## What data is available ## in each sheet ? ####################### lapply(mydata, names)
-- Add average fragment length from Table S1 to Table S2 --
m = match( unlist( mydata[[2]][,1] ), unlist( mydata[[1]][,1] ) ) mydata[[2]] = as_tibble( cbind( mydata[[2]] , mydata[[1]][m,8] ) )
-- Add the data from Table S2 to Table S4 to produce the full data set together --
-- Create a Pool variable --
m = match( unlist( mydata[[4]][,1] ), unlist( mydata[[2]][,1] ) ) mydata[[4]] = as_tibble( cbind( mydata[[4]][,1], mydata[[2]][m, -1], mydata[[4]][, -1] ) ) ## convert characters to factors # mydata[[4]] %>% mutate_if(is.character, as.factor) %>% str() mydata[[4]] = mydata[[4]] %>% mutate_if(is.character, as.factor) ## Set the working data frame to wdata wdata = mydata[[4]] Pool = substring(wdata$`Capture Pool`, 1,2) test = cbind(wdata[, 1:9], Pool, wdata[, 10:31]) wdata = as_tibble(test)
-- Add the data from Table S2 to Table Downsampled to produce the Down-Sampled data set together --
-- Create a Pool variable --
m = match( unlist( mydata[[6]][,1] ), unlist( mydata[[2]][,1] ) ) mydata[[6]] = as_tibble( cbind( mydata[[6]][,1], mydata[[2]][m, -1], mydata[[6]][, -1] ) ) ## convert characters to factors # mydata[[4]] %>% mutate_if(is.character, as.factor) %>% str() mydata[[6]] = mydata[[6]] %>% mutate_if(is.character, as.factor) ## Set the working data frame to wdata downdata = mydata[[6]] Pool = substring(downdata$`Capture Pool`, 1,2) test = cbind(downdata[, 1:9], Pool, downdata[, 10:32]) downdata = as_tibble(test)
-- Using the funtions in the NILC R package, estimate a correlation matrix among all variables in the study --
-- The correaltion estiamtes are based on (1) spearman's rho (N-on-N), (2) Carmer's V (F-on-F), and (3) univariate linear model (F-on-N) sqrt(R2) | sqrt(etasq)
-- Correlation matrix for the the complete data set --
CorMat = Test_DF_Correlations( wdata ) rownames(CorMat[[1]]) = colnames(CorMat[[1]]) = names( wdata ) rownames(CorMat[[2]]) = colnames(CorMat[[2]]) = names( wdata ) write.table(CorMat[[1]], file = "summary_tables/FullData_Correlations.txt", row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)
pdf("pdf_figures/CorrelationPlot_FullDataSet.pdf", width = 15, height = 10) corrplot(CorMat[[1]], order = "hclust", addrect = 4, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "cir", p.mat = CorMat[[2]], insig = "label_sig", sig.level = c(.001, .01, .05), #sig.level = 0.05, pch.cex = 1.0, pch.col = "purple") dev.off() corrplot(CorMat[[1]], order = "hclust", addrect = 4, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "cir", p.mat = CorMat[[2]], insig = "label_sig", sig.level = c(.001, .01, .05), #sig.level = 0.05, pch.cex = 1.0, pch.col = "purple")
-- Correlation matrix for the the downsampled data set --
## exclude those samples that do not have 1.5M reads r = which(downdata$`Production Reads` != 1500000) ## ## remove production reads and production bases from correlation matrix as the data has been downsampled to that 1.5M DownCorMat = Test_DF_Correlations( downdata[-r,-c(1,13:14)] ) ## rownames(DownCorMat[[1]]) = colnames(DownCorMat[[1]]) = names( downdata )[-c(1,13:14)] rownames(DownCorMat[[2]]) = colnames(DownCorMat[[2]]) = names( downdata )[-c(1,13:14)] write.table(DownCorMat[[1]], file = "summary_tables/DownSampleData_1.5M_Correlations.txt", row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)
pdf("pdf_figures/CorrelationPlot_DownSampled_DataSet.pdf", width = 15, height = 10) corrplot(DownCorMat[[1]], order = "hclust", addrect = 6, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "cir", p.mat = DownCorMat[[2]], insig = "label_sig", sig.level = c(.001, .01, .05), #sig.level = 0.05, pch.cex = 1.0, pch.col = "purple") dev.off() corrplot(DownCorMat[[1]], order = "hclust", addrect = 6, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "cir", p.mat = DownCorMat[[2]], insig = "label_sig", sig.level = c(.001, .01, .05), #sig.level = 0.05, pch.cex = 1.0, pch.col = "purple")
x = DownCorMat[[1]][, "DP1" ] x = sort(x) x = data.frame(CS = x) pdf("pdf_figures/CS_@DP1_CorrelationBarPlot.pdf", width = 15, height = 6) par(mar = c(15, 10,3,3)) moose_barplot(data = x, column_to_plot = "CS", labels_vec = rownames(x) , rot_angle = 45, pcol = "blue", pmain = "Down Sampled Data (1.5M ProductionReads)\nCorrelation Coef for Capture Sensitivity @DP1") dev.off() par(mar = c(15, 10,3,3)) moose_barplot(data = x, column_to_plot = "CS", labels_vec = rownames(x) , rot_angle = 45, pcol = "blue", pmain = "Down Sampled Data (1.5M ProductionReads)\nCorrelation Coef for Capture Sensitivity @DP1")
-- Distribution of summary statistics --
-- Are the distributions normal?? --
par(mfrow = c(2,4)) invisible( sapply( c(17,26:32), function(i){ d = na.omit( unlist( wdata[, i] ) ) Wstat = signif( shapiro.test( d )$statistic , d = 3) plot( density( d) , lwd = 6, col = "limegreen", main = paste0( colnames(wdata)[i], "; shaprio W = ", Wstat) ) }) )
-- Analysis of the Complete data set --
-- as influenced by site, DNA [concentration], %eDNA, fragment size, pool, amount of DNA in hybridaztion, hybridization, Sequencing run, production reads
## using wdata data frame as input dep_cols = 15:32 ind_cols = c(1:3, 5:13) UnivarMat = lapply(dep_cols, function(dep){ out = sapply(ind_cols, function(ind){ df = data.frame( dep = unlist(wdata[, dep]), ind = unlist(wdata[, ind]) ) ## Rank Normal transformation of the data df$dep = rntransform( df$dep ) fit = lm( dep ~ ind, data = df) s = summary(fit) ##### o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1]) names(o) = c("Rsq","Adj_Rsq","Fstat") ##### a = anova(fit) eta = a[1,2]/sum(a[,2]) Fstat = a[1, 4] pval = a[1, 5] o = c(o, eta, Fstat, pval) names(o) = c("Rsq","Adj_Rsq","Fstat", "EtaSq", "Fstat_","pval") ##### return(o) }) out = t(out) rownames(out) = colnames(wdata)[ind_cols] ## return(out[, c(1:3,6)]) }) names(UnivarMat) = colnames(wdata)[dep_cols] for( i in 1:length(UnivarMat) ){ x = UnivarMat[[i]] n = names(UnivarMat)[i] n = gsub(" ","", n) n = paste0("summary_tables/FullData_Univariate_", n, "_VarExp.txt") write.table(x, file = n, row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE) } #### Kendall's tau dep_cols = 15:32 ktau = t( sapply(dep_cols, function(x){ o = cor.test( unlist(wdata[, x]) , unlist(wdata[,"Production Reads"]), method = "k") out = c(o$estimate, o$p.value) return(out) }) ) rownames(ktau) = colnames(wdata)[dep_cols]
testmat = sapply(1:length(UnivarMat), function(x){ return( UnivarMat[[x]][,1] ) }) pmat = sapply(1:length(UnivarMat), function(x){ return( UnivarMat[[x]][,4] ) }) #### colnames(testmat) = names(UnivarMat) pdf("pdf_figures/FullData_UnivariateVarExp.pdf", width = 15, height = 10) corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "cir", p.mat = pmat, insig = "label_sig", sig.level = c(.001, .01, .05), #sig.level = 0.05, pch.cex = 1.0, pch.col = "purple") dev.off() # corrplot(testmat, # tl.col = "red", # tl.cex = 0.95, # tl.srt = 45, method = "cir", # p.mat = pmat, # insig = "label_sig", # sig.level = c(.001, .01, .05), # #sig.level = 0.05, # pch.cex = 1.0, # pch.col = "purple") pdf("pdf_figures/FullData_UnivariateVarExp_v2.pdf", width = 15, height = 10) col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") ) corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "color", col = col(200), addCoef.col = "grey25", pch.cex = 1.0, pch.col = "purple") dev.off() ######## col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") ) corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "color", col = col(200), addCoef.col = "grey25", pch.cex = 1.0, pch.col = "purple")
-- Analysis of the Down-sampled data set ( 1.5M Production reads samples only )--
## which downsampled data did not have 1.5M reads ?? w = which(downdata$`Production Reads` < 1500000) ## 274 samples left ## using wdata data frame as input dep_cols = 15:33 ind_cols = c(1:3, 5:12) #ind_cols = c(2, 5:23) UnivarMat_DOWNSAM = lapply(dep_cols, function(dep){ out = sapply(ind_cols, function(ind){ df = data.frame( dep = unlist(downdata[-w, dep]), ind = unlist(downdata[-w, ind]) ) df$dep = rntransform( df$dep ) fit = lm( dep ~ ind, data = df) s = summary(fit) ##### o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1]) names(o) = c("Rsq","Adj_Rsq","Fstat") ##### a = anova(fit) eta = a[1,2]/sum(a[,2]) Fstat = a[1, 4] pval = a[1, 5] o = c(o, eta, Fstat, pval) names(o) = c("Rsq","Adj_Rsq","Fstat", "EtaSq", "Fstat_","pval") ##### return(o) }) out = t(out) rownames(out) = colnames(downdata)[ind_cols] ## return(out) }) names(UnivarMat_DOWNSAM) = colnames(downdata)[dep_cols] for( i in 1:length(UnivarMat_DOWNSAM) ){ x = UnivarMat_DOWNSAM[[i]] n = names(UnivarMat_DOWNSAM)[i] n = gsub(" ","", n) n = paste0("summary_tables/DownSampleData_1.5M_Univariate_", n, "_VarExp.txt") write.table(x, file = n, row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE) }
testmat = sapply(1:length(UnivarMat_DOWNSAM), function(x){ return( UnivarMat_DOWNSAM[[x]][,1] ) }) pmat = sapply(1:length(UnivarMat_DOWNSAM), function(x){ return( UnivarMat_DOWNSAM[[x]][,6] ) }) #### colnames(testmat) = names(UnivarMat_DOWNSAM) pdf("pdf_figures/DownSampledData_1.5Monly_UnivariateVarExp.pdf", width = 15, height = 10) corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "cir", p.mat = pmat, insig = "label_sig", sig.level = c(.001, .01, .05), #sig.level = 0.05, pch.cex = 1.0, pch.col = "purple") dev.off() ####### col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") ) pdf("pdf_figures/DownSampledData_1.5Monly_UnivariateVarExp_v2.pdf", width = 15, height = 10) corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "color", col = col(200), addCoef.col = "grey25", pch.cex = 1.0, pch.col = "purple") dev.off() corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "color", col = col(200), addCoef.col = "grey25", pch.cex = 1.0, pch.col = "purple")
-- Down Sampled Data --
-- model : CS@DP1 ~ subspecies + site + [tDNA] + [eDNA] + %eDNA + fragmentsize + pool + amount_DNA-in-hyb + hybridization + seqbatch + error
w = which(downdata$`Production Reads` < 1500000) ## 274 samples left ##### d = rntransform( unlist( downdata[-w, "DP1"] ) ) ## sub setted model # fit0 = lm( d ~ `Subspecies` + `Site` + # `Total DNA Concentration (ng/ul)` + # #`Endogenous DNA (qPCR - pg/ul)` + # `% Endogenous DNA` + # `Average Fragment Size` + # `Pool` + # `Starting DNA (ug)` + # `Capture Pool` + # `Sequencing Batch` , data = downdata[-w,]) # ## full model !!! fit = lm( d ~ `Subspecies` + `Site` + `Total DNA Concentration (ng/ul)` + `Endogenous DNA (qPCR - pg/ul)` + `% Endogenous DNA` + `Average Fragment Size` + `Pool` + `Starting DNA (ug)` + `Capture Pool` + `Sequencing Batch` , data = downdata[-w,]) # anova(fit0, fit) ################### a = anova(fit) eta2 = a[, 2]/sum(a[,2]); names(eta2) = rownames(a) eta2 = data.frame(labels = rownames(a), eta2 = eta2) ### par(mar = c(6.5, 5,3,3)) moose_barplot(eta2, "eta2", eta2$labels, 20, pylim = c(0,0.45))
-- how is subspecies, site, tDNA, eDNA, %eDNA, frag-length partitioned among --> pools, hybridizations, and seq batches?
x = RColorBrewer::brewer.pal(3, "Set1") pcol = c( rep(x[1], 4), rep(x[2], 6), rep(x[3], 8) ) boxplot( unlist(downdata$`% Endogenous DNA`) ~ unlist(downdata$`Capture Pool`), col = pcol, ylab = "% eDNA", xlab = "Hybridization Pools", main = "%eDNA among hybridizations") legend("topright", legend = c("Pool1","Pool2","Pool3"), col = x, pch = 19, pt.cex = 2)
## subspecies among pools DownCorMat[[2]][2, 9] ## it is random ## sites among pools DownCorMat[[2]][1, 9] ## NOT RANDOM !!! ~ 20% of the variance in pools is explained by site (rho = 0.44; r2 = 0.19, p = 7.546206e-12 ) ## tDNA DownCorMat[[1]][7, 9] ## total DNA is random, eDNA is not random (p = 0.007), %eDNA IS NOT RANDOM (rho = 0.88; r2 = 0.78; p = 5.0e-90)
-- Pool (n = 10, 20, 30) on Capture Sensitivity --
## varaince explained w = which(downdata$`Production Reads` < 1500000) ### pcol = RColorBrewer::brewer.pal(3, "Set1") par(mfrow = c(1,3)) ### fit = lm( unlist( wdata$`Capture Sensitivity (CS) DP1`) ~ unlist(wdata$Pool) ) a = anova(fit) etasq = signif( a[1,2]/sum(a[,2]) , d = 4) pval = signif( a[1,5] , d = 4 ) boxplot( unlist( wdata$`Capture Sensitivity (CS) DP1`) ~ unlist(wdata$Pool), col = pcol, ylab = "CS @ DP1", xlab = "Pool", main = paste0( "Complete Data Set; Etasq = ", etasq, "; P = ", pval ) ) ##### fit = lm( unlist( downdata$`DP1`[-w]) ~ unlist(downdata$Pool[-w]) ) a = anova(fit) etasq = signif( a[1,2]/sum(a[,2]) , d = 4) pval = signif( a[1,5] , d = 4 ) boxplot( unlist( downdata$`DP1`[-w]) ~ unlist(downdata$Pool[-w]), col = pcol, ylab = "CS @ DP1", xlab = "Pool", main = paste0( "DownSampled Data Set; Etasq = ", etasq, "; P = ", pval )) ### dp = rntransform( downdata$DP1[-w] ) fit0 = lm( dp ~ Site + `% Endogenous DNA` + `Average Fragment Size` + Pool, data = downdata[-w, ] ) ### res0 = residuals( lm( dp ~ Site + `% Endogenous DNA` + `Average Fragment Size` , data = downdata[-w, ] ) ) #res1 = residuals( lm( dp ~ Site + `% Endogenous DNA` + `Average Fragment Size` + `Starting DNA (ug)`, data = downdata[-w, ] ) ) #fit = lm( res0 ~ Pool, data = downdata[-w, ] ) a = anova(fit0) etasq = signif( a[4,2]/sum(a[,2]) , d = 4) pval = signif( a[4,5] , d = 4 ) boxplot( res0 ~ unlist(downdata$Pool[-w]), col = pcol, ylab = "CS @ DP1", xlab = "Pool", main = paste0( "Res. DownSample Data Set; Etasq = ", etasq, "; P = ", pval ))
-- of the complete data set --
## using wdata data frame as input dep_cols = 15:32 MultivarMat = t( sapply(dep_cols, function(i){ dep = rntransform( unlist( wdata[ ,i] ) ) ########### fit0 = lm( dep ~ `Subspecies` + `Site` + `Total DNA Concentration (ng/ul)` + `Endogenous DNA (qPCR - pg/ul)` + `% Endogenous DNA` + `Average Fragment Size` + `Pool` + `Starting DNA (ug)` + `Capture Pool` + `Sequencing Batch` + `Production Reads` , data = wdata) ############### fit = lm( dep ~ `Production Reads` + `Sequencing Batch` + `Capture Pool` + `Starting DNA (ug)` + `Pool` + `Average Fragment Size` + `% Endogenous DNA` + `Endogenous DNA (qPCR - pg/ul)` + `Total DNA Concentration (ng/ul)` + `Site` + `Subspecies`, data = wdata) s = summary(fit) ##### o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1]) names(o) = c("Rsq","Adj_Rsq","Fstat") ##### a = anova(fit) n = gsub(" ","",rownames(a)); n = gsub("`","", n ) eta = a[,2]/sum(a[,2]); names(eta) = paste0("etasq_fit_", n ) pval = a[, 5]; names(pval) = paste0("pval_fit_", n ) ##### a = anova(fit0) n = gsub(" ","",rownames(a)); n = gsub("`","", n ) eta0 = a[,2]/sum(a[,2]); names(eta0) = paste0("etasq_fit0_", n ) pval0 = a[,5 ]; names(pval0) = paste0("pval_fit0_", n ) #### o = c(o, eta0, pval0, eta, pval) ##################### ## TYPE II ANOVA #################### a = Anova(fit, type = "II") eta_type2 = a[,1]/sum(a[,1], na.rm = TRUE) n = rownames(a); n = gsub(" ","", n); n = gsub("`","", n) names(eta_type2) = paste0("eta_fit_T2_", n) pval_type2 = a[,4]; names(pval_type2) = paste0("pval_fit_T2", n) ##### out = c(o, eta_type2, pval_type2) ##### return(out) }) ) rownames(MultivarMat) = colnames(wdata)[dep_cols] n = "summary_tables/FullData_Multivariate_Models.txt" write.table(MultivarMat, file = n, row.names = TRUE, col.names = TRUE, quote = FALSE, sep = "\t")
-- of the Downsampled data set limited to only those samples with 1.5M production reads --
w = which(downdata$`Production Reads` < 1500000) ## 274 samples left ## using wdata data frame as input dep_cols = 15:33 MultivarMat_DOWNSAM = t( sapply(dep_cols, function(i){ dep = rntransform( unlist( downdata[ -w,i] ) ) ########### fit0 = lm( dep ~ `Subspecies` + `Site` + `Total DNA Concentration (ng/ul)` + `Endogenous DNA (qPCR - pg/ul)` + `% Endogenous DNA` + `Average Fragment Size` + `Pool` + `Starting DNA (ug)` + `Capture Pool` + `Sequencing Batch` , data = downdata[-w, ]) ############### fit = lm( dep ~ `Sequencing Batch` + `Capture Pool` + `Starting DNA (ug)` + `Pool` + `Average Fragment Size` + `% Endogenous DNA` + `Endogenous DNA (qPCR - pg/ul)` + `Total DNA Concentration (ng/ul)` + `Site` + `Subspecies`, data = downdata[-w, ]) s = summary(fit) ##### o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1]) names(o) = c("Rsq","Adj_Rsq","Fstat") ##### a = anova(fit) n = gsub(" ","",rownames(a)); n = gsub("`","", n ) eta = a[,2]/sum(a[,2]); names(eta) = paste0("etasq_fit_", n ) pval = a[, 5]; names(pval) = paste0("pval_fit_", n ) ##### a = anova(fit0) n = gsub(" ","",rownames(a)); n = gsub("`","", n ) eta0 = a[,2]/sum(a[,2]); names(eta0) = paste0("etasq_fit0_", n ) pval0 = a[,5 ]; names(pval0) = paste0("pval_fit0_", n ) #### o = c(o, eta0, pval0, eta, pval) ##################### ## TYPE II ANOVA #################### a = Anova(fit, type = "II") eta_type2 = a[,1]/sum(a[,1], na.rm = TRUE) n = rownames(a); n = gsub(" ","", n); n = gsub("`","", n) names(eta_type2) = paste0("eta_fit_T2_", n) pval_type2 = a[,4]; names(pval_type2) = paste0("pval_fit_T2_", n) ##### out = c(o, eta_type2, pval_type2) ##### return(out) }) ) rownames(MultivarMat_DOWNSAM) = colnames(downdata)[dep_cols] n = "summary_tables/DownSampleData_1.5M_Multivariate_Models.txt" write.table(MultivarMat_DOWNSAM, file = n, row.names = TRUE, col.names = TRUE, quote = FALSE, sep = "\t")
--correlation plot the multivariate fit data --
testmat = MultivarMat_DOWNSAM[, 4:12] pmat = MultivarMat_DOWNSAM[, 14:22] #### colnames(testmat) = colnames(MultivarMat_DOWNSAM)[4:12] testmat = t(testmat) pmat = t(pmat) ####### col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") ) pdf("pdf_figures/DownSampledData_1.5Monly_MultivariateVarExp_v2.pdf", width = 15, height = 10) corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "color", col = col(200), addCoef.col = "grey25", pch.cex = 1.0, pch.col = "purple") dev.off() corrplot(testmat, tl.col = "red", tl.cex = 0.95, tl.srt = 45, method = "color", col = col(200), addCoef.col = "grey25", pch.cex = 1.0, pch.col = "purple")
sort( apply(testmat, 1, mean) , decreasing = TRUE)
-- barplots of the DownSampled Variance Explained for CS@DP1 --
pcols = RColorBrewer::brewer.pal(3, "Dark2") ### par(mfrow = c(3, 1), mar = c(8,3,3,1)) #### d = as.data.frame(UnivarMat_DOWNSAM$DP1) rownames(d) = gsub(" ","",rownames(d)) moose_barplot( d, 'Rsq', row.names(d), 25, pcol = pcols[1], pmain = "Capture Sensitivity (DP1) Univariate ANOVAs", pylim = c(0,1)) ##### d = data.frame( d = MultivarMat_DOWNSAM["DP1", c(4:12)] ) rownames(d) = gsub("etasq_fit0_","",rownames(d)) moose_barplot( d , 'd', row.names(d) , 25, pcol = pcols[2], pmain = "Capture Sensitivity (DP1) Type I Multivariate ANOVAs", pylim = c(0,0.35) ) ##### d = data.frame( d = MultivarMat_DOWNSAM["DP1", c(24:31) ] ) rownames(d) = gsub("etasq_fit_","",rownames(d)) moose_barplot( d , 'd', row.names(d) , 25, pcol = pcols[3], pmain = "Capture Sensitivity (DP1) Type I Multivariate ANOVAs", pylim = c(0,0.35) )
## OK THIS IS SOME DAMN UGLY CODE !!! ## univariate etasq u = UnivarMat_DOWNSAM$DP1[, 1] u = u[c(3,2,1,4:7,11, 9:10, 8)] ## multvariate Etasq from fit0 m0 = MultivarMat_DOWNSAM["DP1", c(4:12) ] m0 = c( m0[1:2], NA, m0[3:6], m0[8], m0[7], m0[9], NA) ## multvariate Etasq from fit m = MultivarMat_DOWNSAM["DP1", c(24:31) ] m = c( NA, m[7], NA, m[6:3], NA, NA, m[2:1]) ## reorder u = u[c(1:7,9,8,10:11)] m0 = m0[c(1:7,9,8,10:11)] m = m[c(1:7,9,8,10:11)] ## varexp = c(u, m0, m) n = names(u) l = c(n , n, n) mod = c( rep("univar", length(u)) , rep("fit0", length(u)) , rep("fit", length(u)) ) ### d = data.frame(labels = l, varexp = varexp, model = mod) ### turn data frame into tibble d = as_tibble(d) d$labels <- factor(d$labels, levels = d$labels[11:1]) ## plot p = d %>% ggplot(aes(x = varexp, y = labels)) + geom_point( aes(color = model, shape = model), size = 7, alpha = 0.85) + scale_color_manual(values = RColorBrewer::brewer.pal(3, "Set1") ) + labs(title = paste0("Variance explained in capture sensitivity \n derived from univariate and type I multivariate ANOVAs"), caption = paste0( "fit0 = CS@DP1 ~ `Subspecies` + `Site` + `Total DNA Concentration` + `% Endogenous DNA` + `Average Fragment Size` + `Pool` + `Starting DNA` +`Capture Pool` + `Sequencing Batch`\n\nmodel `fit` has the order of explanatory variables reversed") ) pdf("pdf_figures/DownSampled_CS@DP1_Uni-n-Multi_ANOVA_etasq.pdf", width = 10, height = 8) p dev.off() p
w = which(is.na(wdata$`Production Reads`)) pcol = brewer.pal(9, "Blues")[-1] ## a ramp of colors pcol = colorRampPalette( pcol )(19) p = wdata[-w,] %>% ggplot( aes(x = `Unique Reads`, y = `Capture Sensitivity (CS) DP1`)) + #geom_point(aes(fill = `Capture Pool`, shape = `Sequencing Batch`, size = `% Endogenous DNA`), alpha = 0.8 ) + geom_point(aes(fill = `Sequencing Batch`, shape = `Pool`, size = `Library Complexity (LC)`), alpha = 0.8 ) + #geom_smooth(method = "loess", aes(color = `Sequencing Batch`)) + scale_shape_manual(values=c(21, 22, 23)) + scale_fill_brewer(palette = "Set1") + scale_color_manual(values = RColorBrewer::brewer.pal(3,"Set1") ) + #scale_fill_manual(values = pcol) + guides(fill = guide_legend(override.aes = list(size = 5, color = RColorBrewer::brewer.pal(3,"Set1") ) ) ) + guides(shape = guide_legend(override.aes = list(size = 5))) + labs(title = "Associations between unique reads and capture sensitiviy @DP1", subtitle =" ... as structured by sequencing batch and library complexity") ###################### p2 = wdata[-w,] %>% ggplot( aes(x = `Unique Reads`, y = `Library Complexity (LC)`)) + #geom_point(aes(fill = `Capture Pool`, shape = `Sequencing Batch`, size = `% Endogenous DNA`), alpha = 0.8 ) + geom_point(aes(fill = `Sequencing Batch`, shape = `Pool`, size = `Production Reads`), alpha = 0.8 ) + geom_smooth(method = "loess", aes(color = `Sequencing Batch`)) + scale_shape_manual(values=c(21, 22, 23)) + scale_fill_brewer(palette = "Set1") + scale_color_manual(values = RColorBrewer::brewer.pal(3,"Set1") ) + #scale_fill_manual(values = pcol) + guides(fill = guide_legend(override.aes = list(size = 5) ) ) + guides(shape = guide_legend(override.aes = list(size = 5))) + labs(title = "Associations beteween unique reads and library complexity", subtitle =" ... as structured by sequencing batch and production reads") ######################## pdf("pdf_figures/CS_vs_UniqueReads.pdf", width = 15, height = 5) grid.arrange( p, p2 , nrow = 1) dev.off() grid.arrange( p, p2 , nrow = 1)
w = which(downdata$`Production Reads` < 1500000 ) ###### pcol = c( rep("blue",2), rep("darkblue",2) , rep("green",4) , rep("darkgreen", 2), rep("red",6), rep("darkred",2)) par(mfrow = c(4,1)) boxplot( log10(downdata$`OnTarget Reliable Reads`[-w]) ~ downdata$`Capture Pool`[-w], col = pcol, ylab = "log10(on target RR)", main = "OnTarget Reliable Reads") boxplot(downdata$Enrichment[-w] ~ downdata$`Capture Pool`[-w], col = pcol, main = "Enrichment") boxplot(downdata$LC[-w] ~ downdata$`Capture Pool`[-w], col = pcol, main = "Library Complexity") boxplot(downdata$DP1[-w] ~ downdata$`Capture Pool`[-w], col = pcol, main = "Capture Sensitivity @DP1")
What can we learn from these observations? 1. Increase the total DNA concentration in hybridization reactions (Perry paper did this) 2. Do not sequence to deeply.
cp = sort( na.omit( as.character( unique(wdata$`Capture Pool`) ) ) ) x = c() for(i in 1:length(cp)){ w = which(wdata$`Capture Pool` == cp[i]) a = cor.test(wdata$`% Endogenous DNA`[w], wdata$`Production Reads`[w]) #a = cor.test(wdata$`% Endogenous DNA`[w], wdata$`OnTarget Reliable Reads`[w]) out = c(a$estimate, a$p.value) x = rbind(x, out) } rownames(x) = cp colnames(x) = c("rho","pval") x
-- Down Sampled Data --
-- model : CS@DP1 ~ subspecies + site + [tDNA] + [eDNA] + %eDNA + fragmentsize + pool + amount_DNA-in-hyb + hybridization + seqbatch + error
#w = which(downdata$`Production Reads` < 1500000 ) ## 274 samples left w = which(downdata$`Production Reads` < 1500000 | downdata$Pool == "P2" ) ## 200 samples left ##### d = rntransform( unlist( downdata[-w, "DP1"] ) ) ## sub setted model # fit0 = lm( d ~ `Subspecies` + `Site` + # `Total DNA Concentration (ng/ul)` + # #`Endogenous DNA (qPCR - pg/ul)` + # `% Endogenous DNA` + # `Average Fragment Size` + # `Pool` + # `Starting DNA (ug)` + # `Capture Pool` + # `Sequencing Batch` , data = downdata[-w,]) # ## full model !!! fit = lm( d ~ `Subspecies` + `Site` + `Total DNA Concentration (ng/ul)` + `Endogenous DNA (qPCR - pg/ul)` + `% Endogenous DNA` + `Average Fragment Size` + `Pool` + `Starting DNA (ug)` + `Capture Pool` + `Sequencing Batch` , data = downdata[-w,]) # anova(fit0, fit) ################### a = anova(fit) eta2 = a[, 2]/sum(a[,2]); names(eta2) = rownames(a) eta2 = data.frame(labels = rownames(a), eta2 = eta2) ### par(mar = c(6.5, 5,3,3)) moose_barplot(eta2, "eta2", eta2$labels, 20, pylim = c(0,0.30))
w = which(downdata$`Production Reads` < 1500000) pcol = brewer.pal(9, "Blues")[-1] ## a ramp of colors pcol = colorRampPalette( pcol )(19) ################################ ## DP1 ################################ p = downdata[-w,] %>% ggplot( aes(x = `DP1`, y = `LC`)) + geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = `Production Reads`), alpha = 0.8 ) + geom_smooth(method = "loess", color = "black") + scale_shape_manual(values=c(21, 22, 23)) + scale_fill_brewer(palette = "Set1") + #scale_fill_manual(values = pcol) + guides(fill = guide_legend(override.aes = list(size = 5) ) ) + labs(title = "LC and Capture sensitivity at DP1", subtitle =" ... as structured by sequencing batch", x = "CS @ DP1") ################## p1 = p + downdata[w,] %>% geom_point(mapping = aes(x = `DP1`, y = `LC`, fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = `Production Reads`), alpha = 0.8 ) + theme(legend.position = "none") ################################ ## DP4 ################################ p = downdata[-w,] %>% ggplot( aes(x = `DP4`, y = `LC`)) + geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = `Production Reads`), alpha = 0.8 ) + geom_smooth(method = "loess", color = "black") + scale_shape_manual(values=c(21, 22, 23)) + scale_fill_brewer(palette = "Set1") + #scale_fill_manual(values = pcol) + guides(fill = guide_legend(override.aes = list(size = 5) ) ) + labs(title = "LC and Capture sensitivity at DP4", subtitle =" ... as structured by sequencing batch", x = "CS @ DP4") ################## p4 = p + downdata[w,] %>% geom_point(mapping = aes(x = `DP4`, y = `LC`, fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = `Production Reads`), alpha = 0.8 ) + theme(legend.position = "none") ################################ ## DP10 ################################ p = downdata[-w,] %>% ggplot( aes(x = `DP10`, y = `LC`)) + geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = `Production Reads`), alpha = 0.8 ) + geom_smooth(method = "loess", color = "black") + scale_shape_manual(values=c(21, 22, 23)) + scale_fill_brewer(palette = "Set1") + #scale_fill_manual(values = pcol) + guides(fill = guide_legend(override.aes = list(size = 5) ) ) + labs(title = "LC and Capture sensitivity at DP10", subtitle =" ... as structured by sequencing batch", x = "CS @ DP10") ################## p10 = p + downdata[w,] %>% geom_point(mapping = aes(x = `DP10`, y = `LC`, fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = `Production Reads`), alpha = 0.8 ) l = cowplot::get_legend(p) p10 = p10 + theme(legend.position = "none") grid.arrange(p1, p4, p10, l, nrow = 1, widths = c(4,4,4,1))
It would be useful to identify a single summary statistic that summarizes what a good "performing" target capture seqeuencing experiment is. I think what we be most useful is to count the number of (population-wide) variable position that we were able to genotype (at whatever criteria uniformly executed across all samples). In the absence of this data it would seem that the best summary statistic that would predict this number is Capture Sensitivity (# of target regions covered by 1 read / total number of target regions), as having a base covered by a read provides a chance for genotyping.
Now depth in coverage gives us accuracy in genotyping heterozygosity and there is most certainly going to be some bias in capturing specific alleles, but we have some good evidence that just making hemizygous calls is informative for the gross|macro level population genetics we would like to do. However, this should eventually be quantified.
#x = sort( CorMat[[1]][-26,26] ); barplot(x) df = tibble( ID = colnames( CorMat[[1]] ) , CS_cor = unlist( CorMat[[1]][,"Capture Sensitivity (CS) DP1"]), CS_pval = unlist( CorMat[[2]][,"Capture Sensitivity (CS) DP1"])) ## change order to increasing values o = order(df$CS_cor) df = df[o,] df = df[1:25, ] ### set ploting order with factor df$ID <- factor(df$ID, levels = df$ID) ### plot ( p <- df %>% ggplot( aes( x = ID, y = CS_cor ) ) + geom_bar(stat="identity", aes(fill = log10(CS_pval) )) + labs(title = "Capture Sensitivity correlation") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) )
What can we learn from this analysis?
#x = sort( CorMat[[1]][-26,26] ); barplot(x) df = tibble( ID = colnames( DownCorMat[[1]] ) , CS_cor = unlist( DownCorMat[[1]][,"DP1"]), CS_pval = unlist( DownCorMat[[2]][,"DP1"])) df = df[-c(1,2,4,27:31), ] ## change order to increasing values o = order(df$CS_cor) df = df[o,] #df = df[1:25, ] ### set ploting order with factor df$ID <- factor(df$ID, levels = df$ID) ### plot ( p <- df %>% ggplot( aes( x = ID, y = CS_cor ) ) + geom_bar(stat="identity", aes(fill = log10(CS_pval) )) + labs(title = "Capture Sensitivity correlation") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) )
#library(network) library(igraph) ############################# ## Make Adjecency Matrix ############################# # x = CorMat[[2]][-c(1:4), -c(1:4)] # adjMat = x # adjMat[ x > 0.00001] = 0 # adjMat[ x <= 0.00001] = 1 # diag(adjMat) = 0 x = CorMat[[1]][-c(1:4), -c(1:4)] adjMat = abs( x ) adjMat[adjMat < 0.5] = 0 ############################# ## Categorize the Nodes ############################# n = colnames(adjMat) nodecats = c( rep("Sample", 3), rep("Methodological", 4), rep("Outcome",11), rep("SumStat", 7 ), "Methodological" ) pcol_o = brewer.pal(nlevels(as.factor(nodecats)), "Set1") pcol <- pcol_o[as.numeric(as.factor(nodecats))] ############################# ## Generate network ############################# network <- graph_from_adjacency_matrix(adjMat, weighted=T, mode="undirected", diag=F) ## estimate degree for each node deg <- degree(network, mode="all") ############################# ## Make Plot ############################# par(bg="grey33", mar=c(0,0,0,0)) plot(network, #layout=layout.sphere, #layout=layout.circle, layout=layout.fruchterman.reingold, vertex.color = pcol, # Node color vertex.label.color="white", vertex.frame.color = "white", # Node border color vertex.shape= "circle", # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere” vertex.size=deg+4, # Size of the node (default is 15) #vertex.size2=NA, edge.color = "grey50", #edge.arrow.size=0 ) legend("bottomleft", legend=levels(as.factor(nodecats)), col = pcol_o, bty = "n", pch=20 , pt.cex = 3, cex = 1.5, text.col=pcol_o, horiz = FALSE, inset = c(0.1, 0.1))
############################# ## Data Reductions ############################# ### to do computationally -- LATER ############################# ## Make Adjecency Matrix ############################# # x = CorMat[[2]][-c(1:4), -c(1:4)] # adjMat = x # adjMat[ x > 0.00001] = 0 # adjMat[ x <= 0.00001] = 1 # diag(adjMat) = 0 x = CorMat[[1]][-c(1:4,11, 16:19, 21, 27:29), -c(1:4,11, 16:19, 21, 27:29)] adjMat = abs( x ) adjMat[adjMat < 0.5] = 0 ############################# ## Categorize the Nodes ############################# n = colnames(adjMat) nodecats = c( rep("Sample", 3), rep("Methodological", 3), rep("Outcome",6), rep("SumStat", 4 ), "Methodological" ) pcol_o = brewer.pal(nlevels(as.factor(nodecats)), "Set1") pcol <- pcol_o[as.numeric(as.factor(nodecats))] ############################# ## Generate network ############################# network <- graph_from_adjacency_matrix(adjMat, weighted=T, mode="undirected", diag=F) ## estimate degree for each node deg <- degree(network, mode="all") ############################# ## Make Plot ############################# par(bg="grey33", mar=c(0,0,0,0)) plot(network, #layout=layout.sphere, #layout=layout.circle, layout=layout.fruchterman.reingold, vertex.color = pcol, # Node color vertex.label.color="white", vertex.frame.color = "white", # Node border color vertex.shape= "circle", # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere” vertex.size=deg+4, # Size of the node (default is 15) #vertex.size2=NA, edge.color = "grey50", #edge.arrow.size=0 ) legend("topleft", legend=levels(as.factor(nodecats)), col = pcol_o, bty = "n", pch=20 , pt.cex = 3, cex = 1.5, text.col=pcol_o, horiz = FALSE, inset = c(0.1, 0.1))
############################# ## Data Reductions ############################# Dmat = as.dist( na.omit( 1 - abs(DownCorMat[[1]]) ) ) tree = hclust(Dmat, method = "complete") # plot(tree, hang = -1) ############################# ## Make Adjecency Matrix ############################# # x = CorMat[[2]][-c(1:4), -c(1:4)] # adjMat = x # adjMat[ x > 0.00001] = 0 # adjMat[ x <= 0.00001] = 1 # diag(adjMat) = 0 r = c(1:4, 12, 14, 16, 17,20, 21, 22, 28:31) x = DownCorMat[[1]][-r, -r] adjMat = abs( x ) adjMat[adjMat < 0.5] = 0 ############################# ## Categorize the Nodes ############################# n = colnames(adjMat) nodecats = c( rep("Sample", 3), rep("Methodological", 3), rep("Outcome",6), rep("SumStat", 4 ), "Methodological" ) pcol_o = brewer.pal(nlevels(as.factor(nodecats)), "Set1") pcol <- pcol_o[as.numeric(as.factor(nodecats))] ############################# ## Generate network ############################# network <- graph_from_adjacency_matrix(adjMat, weighted=T, mode="undirected", diag=F) ## estimate degree for each node deg <- degree(network, mode="all") ############################# ## Make Plot ############################# par(bg="grey33", mar=c(0,0,0,0)) plot(network, #layout=layout.sphere, #layout=layout.circle, layout=layout.fruchterman.reingold, vertex.color = pcol, # Node color vertex.label.color="white", vertex.frame.color = "white", # Node border color vertex.shape= "circle", # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere” vertex.size=deg+4, # Size of the node (default is 15) #vertex.size2=NA, edge.color = "grey50", #edge.arrow.size=0 ) legend("topleft", legend=levels(as.factor(nodecats)), col = pcol_o, bty = "n", pch=20 , pt.cex = 3, cex = 1.5, text.col=pcol_o, horiz = FALSE, inset = c(0.1, 0.1))
w = which(is.na(wdata$`Production Reads`)) pcol = brewer.pal(9, "Blues")[-1] ## a ramp of colors pcol = colorRampPalette( pcol )(19) wdata[-w,] %>% mutate(NCS = `Capture Sensitivity (CS) DP1` / `Production Reads`) %>% ggplot( aes(x = `Endogenous DNA (qPCR - pg/ul)`, y = `Capture Sensitivity (CS) DP1`)) + #geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = `% Endogenous DNA`), alpha = 0.5 ) + #geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`, size = log10(`Production Reads`)), alpha = 0.5 ) + geom_point(aes(fill = `Capture Pool`, shape = `Sequencing Batch`, size = log10(`Production Reads`)), alpha = 0.5 ) + geom_smooth( method = "loess") + scale_shape_manual(values=c(21, 22, 23)) + #scale_fill_brewer(palette = "Set1") + scale_fill_manual(values = rainbow(nlevels(wdata$`Capture Pool`)) ) + guides(fill = guide_legend(override.aes = list(size = 5, color = rainbow(nlevels(wdata$`Capture Pool`))) ) ) + labs(title = "Associations beteween Endogenous DNA concentration and capture Sensitivity", subtitle =" as structured by sequencing batch")
wdata[-w,] %>% ggplot( aes(y = `Endogenous DNA (qPCR - pg/ul)`, x = `Capture Pool`)) + geom_boxplot(fill = gray.colors(nlevels(wdata$`Capture Pool`) ), outlier.colour="red", outlier.shape=16, outlier.size=2, notch=FALSE) + geom_jitter(shape=16, position=position_jitter(0.2), color = "blue") + labs(title = "eDNA concentration by capture pool")
cols2test = c(2:3, 4:23, 25, 30) UnivarateANOVA = matrix(NA, length(cols2test), 2) for(i in 1:length(cols2test) ){ x = unlist(wdata[,cols2test[i] ]) test = class( x ) ### fit = lm(wdata$`Capture Sensitivity (CS) DP1` ~ x) ### a = anova(fit) eta = a[1,2]/sum(a[,2]) pval = a[1, 5] out = c(eta, pval) ## UnivarateANOVA[i, ] = out } rownames(UnivarateANOVA) = colnames(wdata)[cols2test] colnames(UnivarateANOVA) = c("eta","pval") ## order o = order(UnivarateANOVA[,1], decreasing = TRUE) UnivarateANOVA = UnivarateANOVA[o,]
df = tibble(ID = rownames(UnivarateANOVA), eta = UnivarateANOVA[,1], pvals = UnivarateANOVA[,2]) ### maintain model order for the plot df$ID <- factor(df$ID, levels = df$ID) ### plot ( p <- df %>% ggplot( aes( x = ID, y = eta ) ) + geom_bar(stat="identity", aes(fill = log10(pvals) )) + labs(title = "Univariate ANOVA for capture sensitivity")+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) )
library(car) ############################## ## fit a simple linear model ############################## fit = lm( `Capture Sensitivity (CS) DP1` ~ `Total DNA Concentration (ng/ul)` + `Endogenous DNA (qPCR - pg/ul)` + `% Endogenous DNA` + `Capture Pool` + `Starting DNA (ug)` + `Sequencing Batch` + `Production Reads` + `Unique Reads` , data = wdata ) fit = lm( `Capture Sensitivity (CS) DP1` ~ `Total DNA Concentration (ng/ul)` + `Endogenous DNA (qPCR - pg/ul)` + `% Endogenous DNA` + `Starting DNA (ug)` + `Production Reads` + `Unique Reads` , data = wdata ) ############################## ## are model residuals normal ? ############################## W = shapiro.test(residuals(fit)) ############################# ## estiamte SS and VarExp ## assuming an TypeI hierarchical ## ANOVA ############################# (a = anova(fit) ) eta = a[, 2]/ sum(a[,2]) names(eta) = rownames(a) summary(fit)
df = tibble(ID = names(eta) , eta = eta, pvals = a[, 5]) ### maintain model order for the plot df$ID <- factor(df$ID, levels = df$ID) ### plot ( p <- df %>% ggplot( aes( x = ID, y = eta ) ) + geom_bar(stat="identity", aes(fill = log10(pvals) )) + labs(title = "Type I ANOVA for capture sensitivity", subtitle = paste0( " Shapiro's W-stat for residuals of fitted model = ", signif(W$statistic, d = 4) )) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) )
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.