knitr::opts_chunk$set(echo = TRUE)

## load necessary libraries
library(NILC)
invisible( library(tidyverse) )
library(readxl)
library(corrplot)
library(RColorBrewer)
library(viridis)
library(gridExtra)
library(car)
#library(cowplot)

What are the variables that are influencing the performance of our experiments??

  1. First how do we define and/or measure performance ?

    • the number of reliable reads: reads retained after QC and mapping filtering (secondary alignments and mapping quality score < 30)
    • enrichment factor (EF): (reliable-on-target reads / production reads ) / (target space/genomic space)
    • library complexity (LC): # reliable reads / total number of mapped reads, including duplicates
    • capture sensitivity (CS): # of target regions covered by 1 read / total number of target regions
    • capture specificity (CSp): reliable-on-target / reliable reads
  2. What are some of the variable that we think may influence performance?

    • subspecies -- a proxy for environmental condition and sample quality
    • geogrpahic sampling site
    • sample
    • total DNA concentration
    • endogenous DNA concentration
    • % endogenous DNA
    • DNA fragment size
    • the sample pool it belongs to
    • the hybridization
    • production or sequencing reads acquired for a sample/hybridization
    • pipeting volume used to make a library (????) -- this could be something to evaluate, but perhaps that is explained by [tDNA]
    • pipeting volume used to make the pool (????)

Step1) Read in the data and report the variables available in each table

#######################
## Read in the data
#######################
n = excel_sheets("data/SupplementaryMaterial2.xlsx")
##
mydata = sapply(1:length(n), function(x){
  read_excel("data/SupplementaryMaterial2.xlsx", sheet = x)
  })
names(mydata) = n

#######################
## What data is available 
## in each sheet ?
#######################
lapply(mydata, names)

Step 2) Bring the necessary data together

-- Add average fragment length from Table S1 to Table S2 --

m = match( unlist( mydata[[2]][,1] ), unlist( mydata[[1]][,1]  ) )

mydata[[2]] =  as_tibble( cbind( mydata[[2]] , mydata[[1]][m,8] ) )

-- Add the data from Table S2 to Table S4 to produce the full data set together --

-- Create a Pool variable --

m = match( unlist( mydata[[4]][,1] ), unlist( mydata[[2]][,1]  ) )

mydata[[4]] =  as_tibble( cbind( mydata[[4]][,1], mydata[[2]][m, -1], mydata[[4]][, -1] ) )

## convert characters to factors
# mydata[[4]] %>% mutate_if(is.character, as.factor) %>% str()

mydata[[4]] = mydata[[4]] %>% mutate_if(is.character, as.factor)

## Set the working data frame to wdata
wdata = mydata[[4]]
Pool = substring(wdata$`Capture Pool`, 1,2)
test = cbind(wdata[, 1:9], Pool, wdata[, 10:31])
wdata = as_tibble(test)

-- Add the data from Table S2 to Table Downsampled to produce the Down-Sampled data set together --

-- Create a Pool variable --

m = match( unlist( mydata[[6]][,1] ), unlist( mydata[[2]][,1]  ) )

mydata[[6]] =  as_tibble( cbind( mydata[[6]][,1], mydata[[2]][m, -1], mydata[[6]][, -1] ) )

## convert characters to factors
# mydata[[4]] %>% mutate_if(is.character, as.factor) %>% str()

mydata[[6]] = mydata[[6]] %>% mutate_if(is.character, as.factor)

## Set the working data frame to wdata
downdata = mydata[[6]]

Pool = substring(downdata$`Capture Pool`, 1,2)
test = cbind(downdata[, 1:9], Pool, downdata[, 10:32])
downdata = as_tibble(test)

Step 3) Correlation Analysis : everything on everything

-- Using the funtions in the NILC R package, estimate a correlation matrix among all variables in the study --

-- The correaltion estiamtes are based on (1) spearman's rho (N-on-N), (2) Carmer's V (F-on-F), and (3) univariate linear model (F-on-N) sqrt(R2) | sqrt(etasq)

-- Correlation matrix for the the complete data set --

CorMat = Test_DF_Correlations( wdata ) 
rownames(CorMat[[1]]) = colnames(CorMat[[1]]) = names( wdata )
rownames(CorMat[[2]]) = colnames(CorMat[[2]]) = names( wdata )

write.table(CorMat[[1]], file = "summary_tables/FullData_Correlations.txt",
            row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)
pdf("pdf_figures/CorrelationPlot_FullDataSet.pdf", width = 15, height = 10)

corrplot(CorMat[[1]], 
         order = "hclust",
         addrect = 4,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "cir",
         p.mat = CorMat[[2]],
         insig = "label_sig",
         sig.level = c(.001, .01, .05), 
         #sig.level = 0.05,
         pch.cex = 1.0, 
         pch.col = "purple")

dev.off()


corrplot(CorMat[[1]], 
         order = "hclust",
         addrect = 4,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "cir",
         p.mat = CorMat[[2]],
         insig = "label_sig",
         sig.level = c(.001, .01, .05), 
         #sig.level = 0.05,
         pch.cex = 1.0, 
         pch.col = "purple")

-- Correlation matrix for the the downsampled data set --

## exclude those samples that do not have 1.5M reads
r = which(downdata$`Production Reads` != 1500000)
##

## remove production reads and production bases from correlation matrix as the data has been downsampled to that 1.5M
DownCorMat = Test_DF_Correlations( downdata[-r,-c(1,13:14)] ) 

##
rownames(DownCorMat[[1]]) = colnames(DownCorMat[[1]]) = names( downdata )[-c(1,13:14)]
rownames(DownCorMat[[2]]) = colnames(DownCorMat[[2]]) = names( downdata )[-c(1,13:14)]

write.table(DownCorMat[[1]], file = "summary_tables/DownSampleData_1.5M_Correlations.txt",
            row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)
pdf("pdf_figures/CorrelationPlot_DownSampled_DataSet.pdf", width = 15, height = 10)
corrplot(DownCorMat[[1]], 
         order = "hclust",
         addrect = 6,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "cir",
         p.mat = DownCorMat[[2]],
         insig = "label_sig",
         sig.level = c(.001, .01, .05), 
         #sig.level = 0.05,
         pch.cex = 1.0, 
         pch.col = "purple")
dev.off()

corrplot(DownCorMat[[1]], 
         order = "hclust",
         addrect = 6,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "cir",
         p.mat = DownCorMat[[2]],
         insig = "label_sig",
         sig.level = c(.001, .01, .05), 
         #sig.level = 0.05,
         pch.cex = 1.0, 
         pch.col = "purple")

-- A closer look at univaraite capture sensitivity @DP1 rho (correlation coefficents) --

x = DownCorMat[[1]][, "DP1" ]
x = sort(x)
x = data.frame(CS = x)

pdf("pdf_figures/CS_@DP1_CorrelationBarPlot.pdf", width = 15, height = 6)
par(mar = c(15, 10,3,3))
moose_barplot(data = x, column_to_plot = "CS", labels_vec = rownames(x) , rot_angle = 45, pcol = "blue", pmain = "Down Sampled Data (1.5M ProductionReads)\nCorrelation Coef for Capture Sensitivity @DP1")
dev.off()

par(mar = c(15, 10,3,3))
moose_barplot(data = x, column_to_plot = "CS", labels_vec = rownames(x) , rot_angle = 45, pcol = "blue",  pmain = "Down Sampled Data (1.5M ProductionReads)\nCorrelation Coef for Capture Sensitivity @DP1")

Step 4) Explicity Univariate linear modeling

-- Distribution of summary statistics --

-- Are the distributions normal?? --

par(mfrow = c(2,4))
invisible( 
  sapply( c(17,26:32), function(i){
  d = na.omit( unlist( wdata[, i] ) )
  Wstat = signif( shapiro.test( d )$statistic , d = 3)
  plot( density( d) , lwd = 6, col = "limegreen", 
       main = paste0( colnames(wdata)[i], ";  shaprio W = ", Wstat)  )
  })
)

-- Analysis of the Complete data set --

-- as influenced by site, DNA [concentration], %eDNA, fragment size, pool, amount of DNA in hybridaztion, hybridization, Sequencing run, production reads

## using wdata data frame as input
dep_cols = 15:32
ind_cols = c(1:3, 5:13)

UnivarMat = lapply(dep_cols, function(dep){
  out = sapply(ind_cols, function(ind){
    df = data.frame( dep = unlist(wdata[, dep]), ind = unlist(wdata[, ind]) )
    ## Rank Normal transformation of the data
    df$dep = rntransform( df$dep )
    fit = lm( dep ~ ind, data = df)
    s = summary(fit)
    #####
    o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1])
    names(o) = c("Rsq","Adj_Rsq","Fstat")
    #####
    a = anova(fit)
    eta = a[1,2]/sum(a[,2])
    Fstat = a[1, 4]
    pval = a[1, 5]
    o = c(o, eta, Fstat, pval)
    names(o) = c("Rsq","Adj_Rsq","Fstat", "EtaSq", "Fstat_","pval")
    #####
    return(o)
    })
  out = t(out)
  rownames(out) = colnames(wdata)[ind_cols]
  ##
  return(out[, c(1:3,6)])
  })

names(UnivarMat) = colnames(wdata)[dep_cols]

for( i in 1:length(UnivarMat) ){
  x = UnivarMat[[i]]
  n = names(UnivarMat)[i]
  n = gsub(" ","", n)
  n = paste0("summary_tables/FullData_Univariate_", n, "_VarExp.txt")
  write.table(x, file = n, row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)
}

####  Kendall's tau
dep_cols = 15:32
ktau = t( sapply(dep_cols, function(x){
  o = cor.test( unlist(wdata[, x]) , unlist(wdata[,"Production Reads"]), method = "k")
  out = c(o$estimate, o$p.value)
  return(out)
  }) )
rownames(ktau) = colnames(wdata)[dep_cols]
testmat = sapply(1:length(UnivarMat), function(x){ return( UnivarMat[[x]][,1] ) })
pmat = sapply(1:length(UnivarMat), function(x){ return( UnivarMat[[x]][,4] ) })
####
colnames(testmat) = names(UnivarMat)

pdf("pdf_figures/FullData_UnivariateVarExp.pdf", width = 15, height = 10)
corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "cir",
         p.mat = pmat,
         insig = "label_sig",
         sig.level = c(.001, .01, .05), 
         #sig.level = 0.05,
         pch.cex = 1.0, 
         pch.col = "purple")
dev.off()

# corrplot(testmat,
#          tl.col = "red", 
#          tl.cex = 0.95,  
#          tl.srt = 45, method = "cir",
#          p.mat = pmat,
#          insig = "label_sig",
#          sig.level = c(.001, .01, .05), 
#          #sig.level = 0.05,
#          pch.cex = 1.0, 
#          pch.col = "purple")

pdf("pdf_figures/FullData_UnivariateVarExp_v2.pdf", width = 15, height = 10)
col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") )
corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "color",
         col = col(200),
         addCoef.col = "grey25",
         pch.cex = 1.0, 
         pch.col = "purple")
dev.off()

########
col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") )
corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "color",
         col = col(200),
         addCoef.col = "grey25",
         pch.cex = 1.0, 
         pch.col = "purple")

-- Analysis of the Down-sampled data set ( 1.5M Production reads samples only )--

## which downsampled data did not have 1.5M reads ??
w = which(downdata$`Production Reads` < 1500000) ## 274 samples left
## using wdata data frame as input
dep_cols = 15:33
ind_cols = c(1:3, 5:12)
#ind_cols = c(2, 5:23)

UnivarMat_DOWNSAM = lapply(dep_cols, function(dep){
  out = sapply(ind_cols, function(ind){
    df = data.frame( dep = unlist(downdata[-w, dep]), ind = unlist(downdata[-w, ind]) )
    df$dep = rntransform( df$dep )
    fit = lm( dep ~ ind, data = df)
    s = summary(fit)
    #####
    o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1])
    names(o) = c("Rsq","Adj_Rsq","Fstat")
    #####
    a = anova(fit)
    eta = a[1,2]/sum(a[,2])
    Fstat = a[1, 4]
    pval = a[1, 5]
    o = c(o, eta, Fstat, pval)
    names(o) = c("Rsq","Adj_Rsq","Fstat", "EtaSq", "Fstat_","pval")
    #####
    return(o)
    })
  out = t(out)
  rownames(out) = colnames(downdata)[ind_cols]
  ##
  return(out)
  })

names(UnivarMat_DOWNSAM) = colnames(downdata)[dep_cols]


for( i in 1:length(UnivarMat_DOWNSAM) ){
  x = UnivarMat_DOWNSAM[[i]]
  n = names(UnivarMat_DOWNSAM)[i]
  n = gsub(" ","", n)
  n = paste0("summary_tables/DownSampleData_1.5M_Univariate_", n, "_VarExp.txt")
  write.table(x, file = n, row.names = TRUE, col.names = TRUE, sep = "\t", quote = FALSE)
  }
testmat = sapply(1:length(UnivarMat_DOWNSAM), function(x){ return( UnivarMat_DOWNSAM[[x]][,1] ) })
pmat = sapply(1:length(UnivarMat_DOWNSAM), function(x){ return( UnivarMat_DOWNSAM[[x]][,6] ) })
####
colnames(testmat) = names(UnivarMat_DOWNSAM)

pdf("pdf_figures/DownSampledData_1.5Monly_UnivariateVarExp.pdf", width = 15, height = 10)
corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "cir",
         p.mat = pmat,
         insig = "label_sig",
         sig.level = c(.001, .01, .05), 
         #sig.level = 0.05,
         pch.cex = 1.0, 
         pch.col = "purple")
dev.off()
#######
col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") )
pdf("pdf_figures/DownSampledData_1.5Monly_UnivariateVarExp_v2.pdf", width = 15, height = 10)
corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "color",
         col = col(200),
         addCoef.col = "grey25",
         pch.cex = 1.0, 
         pch.col = "purple")
dev.off()
corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "color",
         col = col(200),
         addCoef.col = "grey25",
         pch.cex = 1.0, 
         pch.col = "purple")

Step 5) Multivariate Modeling of CS @ DP1

-- Down Sampled Data --

-- model : CS@DP1 ~ subspecies + site + [tDNA] + [eDNA] + %eDNA + fragmentsize + pool + amount_DNA-in-hyb + hybridization + seqbatch + error

w = which(downdata$`Production Reads` < 1500000) ## 274 samples left
#####
d = rntransform( unlist( downdata[-w, "DP1"] ) )
## sub setted model
# fit0 = lm( d ~ `Subspecies` + `Site` +
#              `Total DNA Concentration (ng/ul)` + 
#              #`Endogenous DNA (qPCR - pg/ul)` +
#              `% Endogenous DNA` +
#              `Average Fragment Size` +
#              `Pool` + 
#              `Starting DNA (ug)` +
#              `Capture Pool` +
#              `Sequencing Batch` , data = downdata[-w,])
# 

## full model !!!
fit = lm( d ~ `Subspecies` + `Site` +
             `Total DNA Concentration (ng/ul)` + 
             `Endogenous DNA (qPCR - pg/ul)` +
             `% Endogenous DNA` +
             `Average Fragment Size` +
             `Pool` + 
            `Starting DNA (ug)` +
             `Capture Pool` +
             `Sequencing Batch` , data = downdata[-w,])
# anova(fit0, fit)
###################

a = anova(fit) 
eta2 = a[, 2]/sum(a[,2]); names(eta2) = rownames(a)
eta2 = data.frame(labels = rownames(a), eta2 = eta2)
###
par(mar = c(6.5, 5,3,3))
moose_barplot(eta2, "eta2", eta2$labels, 20, pylim = c(0,0.45))

-- how is subspecies, site, tDNA, eDNA, %eDNA, frag-length partitioned among --> pools, hybridizations, and seq batches?

x = RColorBrewer::brewer.pal(3, "Set1")
pcol = c( rep(x[1], 4), rep(x[2], 6), rep(x[3], 8) )
boxplot(  unlist(downdata$`% Endogenous DNA`) ~ unlist(downdata$`Capture Pool`), col = pcol,
          ylab = "% eDNA", xlab = "Hybridization Pools", main = "%eDNA among hybridizations")
legend("topright", legend = c("Pool1","Pool2","Pool3"), col = x, pch = 19, pt.cex = 2)
## subspecies among pools
DownCorMat[[2]][2, 9] ## it is random

## sites among pools
DownCorMat[[2]][1, 9]  ## NOT RANDOM !!! ~ 20% of the variance in pools is explained by site (rho = 0.44; r2 = 0.19, p = 7.546206e-12 )

## tDNA
DownCorMat[[1]][7, 9] ## total DNA is random, eDNA is not random (p = 0.007), %eDNA IS NOT RANDOM (rho = 0.88; r2 = 0.78; p = 5.0e-90)

-- Pool (n = 10, 20, 30) on Capture Sensitivity --

## varaince explained
w = which(downdata$`Production Reads` < 1500000)


###
pcol = RColorBrewer::brewer.pal(3, "Set1")
par(mfrow = c(1,3))
###
fit = lm( unlist( wdata$`Capture Sensitivity (CS) DP1`) ~ unlist(wdata$Pool) )
a = anova(fit)
etasq = signif( a[1,2]/sum(a[,2]) , d = 4)
pval = signif( a[1,5] , d = 4 )
boxplot( unlist( wdata$`Capture Sensitivity (CS) DP1`) ~ unlist(wdata$Pool), col = pcol,
          ylab = "CS @ DP1", xlab = "Pool",
          main = paste0( "Complete Data Set;  Etasq = ", etasq, ";  P = ", pval  ) )
#####
fit =  lm( unlist( downdata$`DP1`[-w]) ~ unlist(downdata$Pool[-w]) )
a = anova(fit)
etasq = signif( a[1,2]/sum(a[,2]) , d = 4)
pval = signif( a[1,5] , d = 4 )
boxplot( unlist( downdata$`DP1`[-w]) ~ unlist(downdata$Pool[-w]), col = pcol,
         ylab = "CS @ DP1", xlab = "Pool",
          main = paste0( "DownSampled Data Set;  Etasq = ", etasq, ";  P = ", pval  ))

###
dp = rntransform( downdata$DP1[-w] )
fit0 =  lm( dp ~  Site + `% Endogenous DNA` + `Average Fragment Size` + Pool, data = downdata[-w, ] )
###
res0 =  residuals( lm( dp ~  Site + `% Endogenous DNA` + `Average Fragment Size` , data = downdata[-w, ] ) )
#res1 =  residuals( lm( dp ~ Site + `% Endogenous DNA` + `Average Fragment Size` + `Starting DNA (ug)`, data = downdata[-w, ] ) )
#fit =  lm( res0 ~ Pool, data = downdata[-w, ] )
a = anova(fit0)
etasq = signif( a[4,2]/sum(a[,2]) , d = 4)
pval = signif( a[4,5] , d = 4 )
boxplot( res0 ~ unlist(downdata$Pool[-w]), col = pcol,
         ylab = "CS @ DP1", xlab = "Pool",
          main = paste0( "Res. DownSample Data Set;  Etasq = ", etasq, ";  P = ", pval  ))

Step 5) Multivariate Modeling

-- of the complete data set --

## using wdata data frame as input
dep_cols = 15:32

MultivarMat = t( sapply(dep_cols, function(i){
    dep = rntransform( unlist( wdata[ ,i] ) )
    ###########
    fit0 = lm( dep ~ `Subspecies` + `Site` +
                `Total DNA Concentration (ng/ul)` +
                 `Endogenous DNA (qPCR - pg/ul)` +
                `% Endogenous DNA` +
                 `Average Fragment Size` +
                 `Pool` +
                 `Starting DNA  (ug)` +
                `Capture Pool` +
                `Sequencing Batch` +
                `Production Reads` , data = wdata)

    ###############

    fit = lm( dep ~ `Production Reads` + 
                `Sequencing Batch` +
                `Capture Pool` +
                `Starting DNA  (ug)` +
                `Pool` +
                `Average Fragment Size` +
                `% Endogenous DNA` + 
                `Endogenous DNA (qPCR - pg/ul)` +
                `Total DNA Concentration (ng/ul)` + 
                `Site` + `Subspecies`, data = wdata)

    s = summary(fit)
    #####
    o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1])
    names(o) = c("Rsq","Adj_Rsq","Fstat")
    #####
    a = anova(fit)
    n = gsub(" ","",rownames(a)); n = gsub("`","", n )
    eta = a[,2]/sum(a[,2]); names(eta) = paste0("etasq_fit_", n )
    pval = a[, 5]; names(pval) = paste0("pval_fit_", n )
    #####
    a = anova(fit0)
    n = gsub(" ","",rownames(a)); n = gsub("`","", n )
    eta0 = a[,2]/sum(a[,2]); names(eta0) = paste0("etasq_fit0_", n )
    pval0 = a[,5 ]; names(pval0) = paste0("pval_fit0_", n )
    ####
    o = c(o, eta0, pval0, eta, pval)
    #####################
    ## TYPE II ANOVA
    ####################
    a = Anova(fit, type = "II")
    eta_type2 = a[,1]/sum(a[,1], na.rm = TRUE) 
    n = rownames(a); n = gsub(" ","", n); n = gsub("`","", n)
    names(eta_type2) = paste0("eta_fit_T2_", n)

    pval_type2 = a[,4]; 
    names(pval_type2) = paste0("pval_fit_T2", n)

    #####
    out = c(o, eta_type2, pval_type2)
    #####
    return(out)
    })
)
rownames(MultivarMat) = colnames(wdata)[dep_cols]

n = "summary_tables/FullData_Multivariate_Models.txt"
write.table(MultivarMat, file = n, row.names = TRUE, col.names = TRUE, quote = FALSE, sep = "\t")

-- of the Downsampled data set limited to only those samples with 1.5M production reads --

w = which(downdata$`Production Reads` < 1500000) ## 274 samples left


## using wdata data frame as input
dep_cols = 15:33

MultivarMat_DOWNSAM = t( sapply(dep_cols, function(i){
    dep = rntransform( unlist( downdata[ -w,i] ) )
    ###########
    fit0 = lm( dep ~ `Subspecies` + `Site` +
                `Total DNA Concentration (ng/ul)` +
                 `Endogenous DNA (qPCR - pg/ul)` +
                `% Endogenous DNA` +
                 `Average Fragment Size` +
                 `Pool` +
                 `Starting DNA (ug)` +
                `Capture Pool` +
                `Sequencing Batch`  , data = downdata[-w, ])

    ###############

    fit = lm( dep ~ `Sequencing Batch` +
                `Capture Pool` +
                `Starting DNA (ug)` +
                `Pool` +
                `Average Fragment Size` +
                `% Endogenous DNA` + 
                `Endogenous DNA (qPCR - pg/ul)` +
                `Total DNA Concentration (ng/ul)` + 
                `Site` + `Subspecies`, data = downdata[-w, ])

    s = summary(fit)
    #####
    o = c(s$r.squared, s$adj.r.squared, s$fstatistic[1])
    names(o) = c("Rsq","Adj_Rsq","Fstat")
    #####
    a = anova(fit)
    n = gsub(" ","",rownames(a)); n = gsub("`","", n )
    eta = a[,2]/sum(a[,2]); names(eta) = paste0("etasq_fit_", n )
    pval = a[, 5]; names(pval) = paste0("pval_fit_", n )
    #####
    a = anova(fit0)
    n = gsub(" ","",rownames(a)); n = gsub("`","", n )
    eta0 = a[,2]/sum(a[,2]); names(eta0) = paste0("etasq_fit0_", n )
    pval0 = a[,5 ]; names(pval0) = paste0("pval_fit0_", n )
    ####
    o = c(o, eta0, pval0, eta, pval)
    #####################
    ## TYPE II ANOVA
    ####################
    a = Anova(fit, type = "II")
    eta_type2 = a[,1]/sum(a[,1], na.rm = TRUE) 
    n = rownames(a); n = gsub(" ","", n); n = gsub("`","", n)
    names(eta_type2) = paste0("eta_fit_T2_", n)

    pval_type2 = a[,4]; 
    names(pval_type2) = paste0("pval_fit_T2_", n)

    #####
    out = c(o, eta_type2, pval_type2)
    #####
    return(out)
    })
)
rownames(MultivarMat_DOWNSAM) = colnames(downdata)[dep_cols]


n = "summary_tables/DownSampleData_1.5M_Multivariate_Models.txt"
write.table(MultivarMat_DOWNSAM, file = n, row.names = TRUE, col.names = TRUE, quote = FALSE, sep = "\t")

--correlation plot the multivariate fit data --

testmat = MultivarMat_DOWNSAM[, 4:12]
pmat = MultivarMat_DOWNSAM[, 14:22]
####
colnames(testmat) = colnames(MultivarMat_DOWNSAM)[4:12]

testmat = t(testmat)
pmat = t(pmat)

#######
col <- colorRampPalette(RColorBrewer::brewer.pal(9, "RdBu") )
pdf("pdf_figures/DownSampledData_1.5Monly_MultivariateVarExp_v2.pdf", width = 15, height = 10)
corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "color",
         col = col(200),
         addCoef.col = "grey25",
         pch.cex = 1.0, 
         pch.col = "purple")
dev.off()


corrplot(testmat,
         tl.col = "red", 
         tl.cex = 0.95,  
         tl.srt = 45, method = "color",
         col = col(200),
         addCoef.col = "grey25",
         pch.cex = 1.0, 
         pch.col = "purple")
sort( apply(testmat, 1, mean) , decreasing = TRUE)

-- barplots of the DownSampled Variance Explained for CS@DP1 --

pcols = RColorBrewer::brewer.pal(3, "Dark2")

###
par(mfrow = c(3, 1), mar = c(8,3,3,1))
####
d = as.data.frame(UnivarMat_DOWNSAM$DP1)
rownames(d) = gsub(" ","",rownames(d))
moose_barplot( d, 'Rsq', row.names(d), 25, pcol = pcols[1], 
          pmain = "Capture Sensitivity (DP1) Univariate ANOVAs", pylim = c(0,1))

#####
d = data.frame( d = MultivarMat_DOWNSAM["DP1",  c(4:12)] )
rownames(d) = gsub("etasq_fit0_","",rownames(d))
moose_barplot( d , 'd', row.names(d) , 25, pcol =  pcols[2], 
          pmain = "Capture Sensitivity (DP1) Type I Multivariate ANOVAs", pylim = c(0,0.35) )

#####
d = data.frame( d = MultivarMat_DOWNSAM["DP1", c(24:31)  ] )
rownames(d) = gsub("etasq_fit_","",rownames(d))
moose_barplot( d , 'd', row.names(d) , 25, pcol =  pcols[3], 
          pmain = "Capture Sensitivity (DP1) Type I Multivariate ANOVAs", pylim = c(0,0.35) )
## OK THIS IS SOME DAMN UGLY CODE !!!

## univariate etasq
u = UnivarMat_DOWNSAM$DP1[, 1]
u = u[c(3,2,1,4:7,11, 9:10, 8)]
## multvariate Etasq from fit0
m0 = MultivarMat_DOWNSAM["DP1", c(4:12) ]
m0 = c( m0[1:2], NA, m0[3:6], m0[8], m0[7], m0[9], NA)
## multvariate Etasq from fit
m =  MultivarMat_DOWNSAM["DP1", c(24:31) ] 
m = c( NA, m[7], NA, m[6:3], NA, NA, m[2:1])

## reorder
u = u[c(1:7,9,8,10:11)]
m0 = m0[c(1:7,9,8,10:11)]
m = m[c(1:7,9,8,10:11)]
##
varexp = c(u, m0, m)
n = names(u)
l = c(n , n, n)
mod = c( rep("univar", length(u))  , rep("fit0", length(u)) , rep("fit", length(u)) )
###

d = data.frame(labels = l, varexp = varexp, model = mod)

### turn data frame into tibble
d = as_tibble(d)
d$labels <- factor(d$labels, levels = d$labels[11:1])

## plot
p = d %>% ggplot(aes(x = varexp, y = labels)) + 
  geom_point( aes(color = model, shape = model), size = 7, alpha = 0.85) +
  scale_color_manual(values = RColorBrewer::brewer.pal(3, "Set1") ) +
  labs(title = paste0("Variance explained in capture sensitivity \n derived from univariate and type I multivariate ANOVAs"),
       caption = paste0( "fit0 = CS@DP1 ~ `Subspecies` + `Site` + `Total DNA Concentration` + `% Endogenous DNA` + 
       `Average Fragment Size` + `Pool` + `Starting DNA` +`Capture Pool` + `Sequencing Batch`\n\nmodel `fit` has the order of explanatory variables reversed") )

pdf("pdf_figures/DownSampled_CS@DP1_Uni-n-Multi_ANOVA_etasq.pdf", width = 10, height = 8)
p
dev.off()

p

Library Complexity and CS @DP1

w = which(is.na(wdata$`Production Reads`))

pcol = brewer.pal(9, "Blues")[-1] 
## a ramp of colors 
pcol = colorRampPalette( pcol )(19)

p = wdata[-w,] %>% ggplot( aes(x = `Unique Reads`, y = `Capture Sensitivity (CS) DP1`)) +
  #geom_point(aes(fill = `Capture Pool`, shape = `Sequencing Batch`,  size = `% Endogenous DNA`), alpha = 0.8 ) +
  geom_point(aes(fill = `Sequencing Batch`, shape = `Pool`,  size = `Library Complexity (LC)`), alpha = 0.8 ) +
  #geom_smooth(method = "loess", aes(color = `Sequencing Batch`)) +
  scale_shape_manual(values=c(21, 22, 23)) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_manual(values = RColorBrewer::brewer.pal(3,"Set1") ) +
  #scale_fill_manual(values = pcol) +
  guides(fill = guide_legend(override.aes = list(size = 5, color = RColorBrewer::brewer.pal(3,"Set1") ) ) ) +
  guides(shape = guide_legend(override.aes = list(size = 5))) +
  labs(title = "Associations between unique reads and capture sensitiviy @DP1", 
       subtitle ="   ...  as structured by sequencing batch and library complexity") 

######################
p2 = wdata[-w,] %>% ggplot( aes(x = `Unique Reads`, y = `Library Complexity (LC)`)) +
  #geom_point(aes(fill = `Capture Pool`, shape = `Sequencing Batch`,  size = `% Endogenous DNA`), alpha = 0.8 ) +
  geom_point(aes(fill = `Sequencing Batch`, shape = `Pool`,  size = `Production Reads`), alpha = 0.8 ) +
  geom_smooth(method = "loess", aes(color = `Sequencing Batch`)) +
  scale_shape_manual(values=c(21, 22, 23)) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_manual(values = RColorBrewer::brewer.pal(3,"Set1") ) +
  #scale_fill_manual(values = pcol) +
  guides(fill = guide_legend(override.aes = list(size = 5) ) ) +
  guides(shape = guide_legend(override.aes = list(size = 5))) +
  labs(title = "Associations beteween unique reads and library complexity", 
       subtitle ="   ...  as structured by sequencing batch and production reads") 

########################
pdf("pdf_figures/CS_vs_UniqueReads.pdf", width = 15, height = 5)
grid.arrange( p, p2 , nrow = 1)
dev.off()

grid.arrange( p, p2 , nrow = 1)

Unique Reads by hybridization

w = which(downdata$`Production Reads` < 1500000 )
######
pcol = c( rep("blue",2), rep("darkblue",2) , rep("green",4) , rep("darkgreen", 2), rep("red",6), rep("darkred",2))
par(mfrow = c(4,1))
boxplot( log10(downdata$`OnTarget Reliable Reads`[-w]) ~ downdata$`Capture Pool`[-w], col = pcol, ylab = "log10(on target RR)", main = "OnTarget Reliable Reads")
boxplot(downdata$Enrichment[-w] ~ downdata$`Capture Pool`[-w], col = pcol, main = "Enrichment")
boxplot(downdata$LC[-w] ~ downdata$`Capture Pool`[-w], col = pcol, main = "Library Complexity")
boxplot(downdata$DP1[-w] ~ downdata$`Capture Pool`[-w], col = pcol, main = "Capture Sensitivity @DP1")
  1. Samples in SeqBatch 3 all had 2ug of in the hybridization, hence the large bump in LC.
  2. Drowning the data in useless sequencing also appears to have had a negative effect on LC.

What can we learn from these observations? 1. Increase the total DNA concentration in hybridization reactions (Perry paper did this) 2. Do not sequence to deeply.

does %eDNA correlate with production reads ?

cp = sort( na.omit( as.character( unique(wdata$`Capture Pool`) ) ) )
x = c()
for(i in 1:length(cp)){
  w = which(wdata$`Capture Pool` == cp[i])
  a = cor.test(wdata$`% Endogenous DNA`[w], wdata$`Production Reads`[w])
  #a = cor.test(wdata$`% Endogenous DNA`[w], wdata$`OnTarget Reliable Reads`[w])
  out = c(a$estimate, a$p.value)
  x = rbind(x, out)
}
rownames(x) = cp
colnames(x) = c("rho","pval")
x

Exclude pool 2 -- Multivariate Modeling of CS @ DP1

-- Down Sampled Data --

-- model : CS@DP1 ~ subspecies + site + [tDNA] + [eDNA] + %eDNA + fragmentsize + pool + amount_DNA-in-hyb + hybridization + seqbatch + error

#w = which(downdata$`Production Reads` < 1500000  ) ## 274 samples left
w = which(downdata$`Production Reads` < 1500000 | downdata$Pool == "P2" ) ## 200 samples left
#####
d = rntransform( unlist( downdata[-w, "DP1"] ) )
## sub setted model
# fit0 = lm( d ~ `Subspecies` + `Site` +
#              `Total DNA Concentration (ng/ul)` + 
#              #`Endogenous DNA (qPCR - pg/ul)` +
#              `% Endogenous DNA` +
#              `Average Fragment Size` +
#              `Pool` + 
#              `Starting DNA (ug)` +
#              `Capture Pool` +
#              `Sequencing Batch` , data = downdata[-w,])
# 

## full model !!!
fit = lm( d ~ `Subspecies` + `Site` +
             `Total DNA Concentration (ng/ul)` + 
             `Endogenous DNA (qPCR - pg/ul)` +
             `% Endogenous DNA` +
             `Average Fragment Size` +
             `Pool` + 
            `Starting DNA (ug)` +
             `Capture Pool` +
             `Sequencing Batch` , data = downdata[-w,])
# anova(fit0, fit)
###################

a = anova(fit) 
eta2 = a[, 2]/sum(a[,2]); names(eta2) = rownames(a)
eta2 = data.frame(labels = rownames(a), eta2 = eta2)
###
par(mar = c(6.5, 5,3,3))
moose_barplot(eta2, "eta2", eta2$labels, 20, pylim = c(0,0.30))

How are production reads influencing library complexity in the down sampled data ?

w = which(downdata$`Production Reads` < 1500000)

pcol = brewer.pal(9, "Blues")[-1] 
## a ramp of colors 
pcol = colorRampPalette( pcol )(19)

################################
##  DP1
################################

p = downdata[-w,] %>% ggplot( aes(x = `DP1`, y = `LC`)) +
  geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`,  size = `Production Reads`), alpha = 0.8 ) +
  geom_smooth(method = "loess", color = "black") +
  scale_shape_manual(values=c(21, 22, 23)) +
  scale_fill_brewer(palette = "Set1") +
  #scale_fill_manual(values = pcol) +
  guides(fill = guide_legend(override.aes = list(size = 5) ) ) +
  labs(title = "LC and Capture sensitivity at DP1", 
       subtitle ="   ...  as structured by sequencing batch",
       x = "CS @ DP1") 

##################



p1 = p + downdata[w,] %>%
  geom_point(mapping = aes(x = `DP1`, y = `LC`, 
                           fill = `Sequencing Batch`, 
                           shape = `Sequencing Batch`,  
                           size = `Production Reads`), alpha = 0.8 ) + 
  theme(legend.position = "none")

################################
##  DP4
################################
p = downdata[-w,] %>% ggplot( aes(x = `DP4`, y = `LC`)) +
  geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`,  size = `Production Reads`), alpha = 0.8 ) +
  geom_smooth(method = "loess", color = "black") +
  scale_shape_manual(values=c(21, 22, 23)) +
  scale_fill_brewer(palette = "Set1") +
  #scale_fill_manual(values = pcol) +
  guides(fill = guide_legend(override.aes = list(size = 5) ) ) +
  labs(title = "LC and Capture sensitivity at DP4", 
       subtitle ="   ...  as structured by sequencing batch",
       x = "CS @ DP4") 

##################

p4 = p + downdata[w,] %>%
  geom_point(mapping = aes(x = `DP4`, y = `LC`, 
                           fill = `Sequencing Batch`, 
                           shape = `Sequencing Batch`,  
                           size = `Production Reads`), alpha = 0.8 ) + 
  theme(legend.position = "none")

################################
##  DP10
################################
p = downdata[-w,] %>% ggplot( aes(x = `DP10`, y = `LC`)) +
  geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`,  size = `Production Reads`), alpha = 0.8 ) +
  geom_smooth(method = "loess", color = "black") +
  scale_shape_manual(values=c(21, 22, 23)) +
  scale_fill_brewer(palette = "Set1") +
  #scale_fill_manual(values = pcol) +
  guides(fill = guide_legend(override.aes = list(size = 5) ) ) +
  labs(title = "LC and Capture sensitivity at DP10", 
       subtitle ="   ...  as structured by sequencing batch",
       x = "CS @ DP10") 

##################

p10 = p + downdata[w,] %>%
  geom_point(mapping = aes(x = `DP10`, y = `LC`, 
                           fill = `Sequencing Batch`, 
                           shape = `Sequencing Batch`,  
                           size = `Production Reads`), alpha = 0.8 ) 

l = cowplot::get_legend(p)

p10 = p10 + theme(legend.position = "none")


grid.arrange(p1, p4, p10, l, nrow = 1, widths = c(4,4,4,1))

Capture Sensitivity

It would be useful to identify a single summary statistic that summarizes what a good "performing" target capture seqeuencing experiment is. I think what we be most useful is to count the number of (population-wide) variable position that we were able to genotype (at whatever criteria uniformly executed across all samples). In the absence of this data it would seem that the best summary statistic that would predict this number is Capture Sensitivity (# of target regions covered by 1 read / total number of target regions), as having a base covered by a read provides a chance for genotyping.

Now depth in coverage gives us accuracy in genotyping heterozygosity and there is most certainly going to be some bias in capturing specific alleles, but we have some good evidence that just making hemizygous calls is informative for the gross|macro level population genetics we would like to do. However, this should eventually be quantified.

Below we are asking in a univariate fashion how each of our variables, and sumstat, correlate with CAPTURE SENSITIVITY.

#x = sort( CorMat[[1]][-26,26] ); barplot(x)

df = tibble( ID = colnames( CorMat[[1]] ) , 
             CS_cor = unlist( CorMat[[1]][,"Capture Sensitivity (CS) DP1"]),
             CS_pval = unlist( CorMat[[2]][,"Capture Sensitivity (CS) DP1"]))

## change order to increasing values
o = order(df$CS_cor)
df = df[o,]
df = df[1:25, ]
### set ploting order with factor 
df$ID <- factor(df$ID, levels = df$ID)

### plot
(
  p <- df %>% ggplot( aes( x = ID, y = CS_cor ) ) +
    geom_bar(stat="identity", aes(fill = log10(CS_pval) )) +
    labs(title = "Capture Sensitivity correlation") +
    theme(axis.text.x = element_text(angle = 45,  hjust = 1))
)

What can we learn from this analysis?

  1. Want to increase capture sensitivity aquire more unique reads !
    • kind of obvious but great to observe and demonstrate.
  2. For technical or methodological choices it would appear that
    • samples with more Endogenous DNA, (note that this is NOT %DNA), it is higher DNA [concentrations] perform better
    • captures with more DNA in the hybridization perform better

Below we are asking in a univariate fashion how each of our variables, and sumstat, correlate with CAPTURE SENSITIVITY at a uniform production

#x = sort( CorMat[[1]][-26,26] ); barplot(x)

df = tibble( ID = colnames( DownCorMat[[1]] ) , 
             CS_cor = unlist( DownCorMat[[1]][,"DP1"]),
             CS_pval = unlist( DownCorMat[[2]][,"DP1"]))
df = df[-c(1,2,4,27:31), ]

## change order to increasing values
o = order(df$CS_cor)
df = df[o,]
#df = df[1:25, ]
### set ploting order with factor 
df$ID <- factor(df$ID, levels = df$ID)

### plot
(
  p <- df %>% ggplot( aes( x = ID, y = CS_cor ) ) +
    geom_bar(stat="identity", aes(fill = log10(CS_pval) )) +
    labs(title = "Capture Sensitivity correlation") +
    theme(axis.text.x = element_text(angle = 45,  hjust = 1))
)

Build a network of relationships based on correaltion estimates

#library(network)
library(igraph)

#############################
## Make Adjecency Matrix
#############################

# x = CorMat[[2]][-c(1:4), -c(1:4)]
# adjMat = x
# adjMat[ x > 0.00001] = 0
# adjMat[ x <= 0.00001] = 1
# diag(adjMat) = 0

x = CorMat[[1]][-c(1:4), -c(1:4)]
adjMat = abs( x )
adjMat[adjMat < 0.5] = 0

#############################
## Categorize the Nodes
#############################
n = colnames(adjMat)
nodecats = c( rep("Sample", 3), rep("Methodological", 4), rep("Outcome",11), rep("SumStat", 7 ), "Methodological" )
pcol_o = brewer.pal(nlevels(as.factor(nodecats)), "Set1")
pcol <- pcol_o[as.numeric(as.factor(nodecats))]

#############################
## Generate network
#############################
network <- graph_from_adjacency_matrix(adjMat, weighted=T, mode="undirected", diag=F)

## estimate degree for each node
deg <- degree(network, mode="all")

#############################
## Make Plot
#############################
par(bg="grey33", mar=c(0,0,0,0))
plot(network, 
    #layout=layout.sphere,
    #layout=layout.circle,
    layout=layout.fruchterman.reingold,
    vertex.color = pcol,          # Node color
    vertex.label.color="white",
    vertex.frame.color = "white",   # Node border color
    vertex.shape= "circle",         # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere”
    vertex.size=deg+4,                  # Size of the node (default is 15)
    #vertex.size2=NA,
    edge.color = "grey50",
    #edge.arrow.size=0
    )
legend("bottomleft", 
       legend=levels(as.factor(nodecats)), 
       col = pcol_o, 
       bty = "n", 
       pch=20 , 
       pt.cex = 3, 
       cex = 1.5, 
       text.col=pcol_o, 
       horiz = FALSE, 
       inset = c(0.1, 0.1))

Remove redundancies in Network

#############################
## Data Reductions
#############################

### to do computationally -- LATER

#############################
## Make Adjecency Matrix
#############################

# x = CorMat[[2]][-c(1:4), -c(1:4)]
# adjMat = x
# adjMat[ x > 0.00001] = 0
# adjMat[ x <= 0.00001] = 1
# diag(adjMat) = 0

x = CorMat[[1]][-c(1:4,11, 16:19, 21, 27:29), -c(1:4,11, 16:19, 21, 27:29)]
adjMat = abs( x )
adjMat[adjMat < 0.5] = 0

#############################
## Categorize the Nodes
#############################
n = colnames(adjMat)
nodecats = c( rep("Sample", 3), rep("Methodological", 3), rep("Outcome",6), rep("SumStat", 4 ), "Methodological" )
pcol_o = brewer.pal(nlevels(as.factor(nodecats)), "Set1")
pcol <- pcol_o[as.numeric(as.factor(nodecats))]

#############################
## Generate network
#############################
network <- graph_from_adjacency_matrix(adjMat, weighted=T, mode="undirected", diag=F)

## estimate degree for each node
deg <- degree(network, mode="all")

#############################
## Make Plot
#############################
par(bg="grey33", mar=c(0,0,0,0))
plot(network, 
    #layout=layout.sphere,
    #layout=layout.circle,
    layout=layout.fruchterman.reingold,
    vertex.color = pcol,          # Node color
    vertex.label.color="white",
    vertex.frame.color = "white",   # Node border color
    vertex.shape= "circle",         # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere”
    vertex.size=deg+4,                  # Size of the node (default is 15)
    #vertex.size2=NA,
    edge.color = "grey50",
    #edge.arrow.size=0
    )
legend("topleft", 
       legend=levels(as.factor(nodecats)), 
       col = pcol_o, 
       bty = "n", 
       pch=20 , 
       pt.cex = 3, 
       cex = 1.5, 
       text.col=pcol_o, 
       horiz = FALSE, 
       inset = c(0.1, 0.1))

Remove redundancies in Network and construct with down sampled data

#############################
## Data Reductions
#############################

Dmat = as.dist( na.omit( 1 - abs(DownCorMat[[1]]) ) )
tree = hclust(Dmat, method = "complete")
# plot(tree, hang = -1)

#############################
## Make Adjecency Matrix
#############################

# x = CorMat[[2]][-c(1:4), -c(1:4)]
# adjMat = x
# adjMat[ x > 0.00001] = 0
# adjMat[ x <= 0.00001] = 1
# diag(adjMat) = 0
r = c(1:4, 12, 14, 16, 17,20, 21, 22, 28:31)
x = DownCorMat[[1]][-r, -r]
adjMat = abs( x )
adjMat[adjMat < 0.5] = 0

#############################
## Categorize the Nodes
#############################
n = colnames(adjMat)
nodecats = c( rep("Sample", 3), rep("Methodological", 3), rep("Outcome",6), rep("SumStat", 4 ), "Methodological" )
pcol_o = brewer.pal(nlevels(as.factor(nodecats)), "Set1")
pcol <- pcol_o[as.numeric(as.factor(nodecats))]

#############################
## Generate network
#############################
network <- graph_from_adjacency_matrix(adjMat, weighted=T, mode="undirected", diag=F)

## estimate degree for each node
deg <- degree(network, mode="all")

#############################
## Make Plot
#############################
par(bg="grey33", mar=c(0,0,0,0))
plot(network, 
    #layout=layout.sphere,
    #layout=layout.circle,
    layout=layout.fruchterman.reingold,
    vertex.color = pcol,          # Node color
    vertex.label.color="white",
    vertex.frame.color = "white",   # Node border color
    vertex.shape= "circle",         # One of “none”, “circle”, “square”, “csquare”, “rectangle” “crectangle”, “vrectangle”, “pie”, “raster”, or “sphere”
    vertex.size=deg+4,                  # Size of the node (default is 15)
    #vertex.size2=NA,
    edge.color = "grey50",
    #edge.arrow.size=0
    )
legend("topleft", 
       legend=levels(as.factor(nodecats)), 
       col = pcol_o, 
       bty = "n", 
       pch=20 , 
       pt.cex = 3, 
       cex = 1.5, 
       text.col=pcol_o, 
       horiz = FALSE, 
       inset = c(0.1, 0.1))

How is the concentration of a sample influencing capture sensitivity ?

w = which(is.na(wdata$`Production Reads`))
pcol = brewer.pal(9, "Blues")[-1] 
## a ramp of colors 
pcol = colorRampPalette( pcol )(19)


wdata[-w,] %>% mutate(NCS = `Capture Sensitivity (CS) DP1` / `Production Reads`) %>%  ggplot( aes(x = `Endogenous DNA (qPCR - pg/ul)`, y = `Capture Sensitivity (CS) DP1`)) +
  #geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`,  size = `% Endogenous DNA`), alpha = 0.5 ) +
  #geom_point(aes(fill = `Sequencing Batch`, shape = `Sequencing Batch`,  size = log10(`Production Reads`)), alpha = 0.5 ) +
  geom_point(aes(fill = `Capture Pool`, shape = `Sequencing Batch`,  size = log10(`Production Reads`)), alpha = 0.5 ) +
  geom_smooth(  method = "loess") +
  scale_shape_manual(values=c(21, 22, 23)) +
  #scale_fill_brewer(palette = "Set1") +
  scale_fill_manual(values = rainbow(nlevels(wdata$`Capture Pool`)) ) +
  guides(fill = guide_legend(override.aes = list(size = 5, color = rainbow(nlevels(wdata$`Capture Pool`))) ) ) +
  labs(title = "Associations beteween Endogenous DNA concentration and capture Sensitivity", 
       subtitle ="     as structured by sequencing batch") 
wdata[-w,] %>% ggplot( aes(y = `Endogenous DNA (qPCR - pg/ul)`, x = `Capture Pool`)) +
  geom_boxplot(fill = gray.colors(nlevels(wdata$`Capture Pool`) ), outlier.colour="red", outlier.shape=16, outlier.size=2, notch=FALSE) +
  geom_jitter(shape=16, position=position_jitter(0.2), color = "blue") +
  labs(title = "eDNA concentration by capture pool") 

Univariate ANOVA on Summary Statistics

cols2test = c(2:3, 4:23, 25, 30)
UnivarateANOVA = matrix(NA, length(cols2test), 2)
for(i in 1:length(cols2test) ){
  x = unlist(wdata[,cols2test[i] ])
  test = class( x )
  ###

    fit = lm(wdata$`Capture Sensitivity (CS) DP1` ~  x)

  ###
  a = anova(fit)
  eta = a[1,2]/sum(a[,2])
  pval = a[1, 5]
  out = c(eta, pval)
  ##
  UnivarateANOVA[i, ] = out
  }

rownames(UnivarateANOVA) = colnames(wdata)[cols2test]
colnames(UnivarateANOVA) = c("eta","pval")

## order
o = order(UnivarateANOVA[,1], decreasing = TRUE)
UnivarateANOVA = UnivarateANOVA[o,]
df = tibble(ID = rownames(UnivarateANOVA), eta = UnivarateANOVA[,1], pvals = UnivarateANOVA[,2])
### maintain model order for the plot 
df$ID <- factor(df$ID, levels = df$ID)

### plot
(
  p <- df %>% ggplot( aes( x = ID, y = eta ) ) +
    geom_bar(stat="identity", aes(fill = log10(pvals) )) +
    labs(title = "Univariate ANOVA for capture sensitivity")+
    theme(axis.text.x = element_text(angle = 45,  hjust = 1))
)

A multivariate model to explain how sample quality, and methodological choice influences Capture Sensitivity

library(car)
##############################
## fit a simple linear model
##############################
fit = lm( `Capture Sensitivity (CS) DP1` ~ `Total DNA Concentration (ng/ul)` + 
            `Endogenous DNA (qPCR - pg/ul)` +
            `% Endogenous DNA` +
            `Capture Pool` + 
            `Starting DNA  (ug)` + 
            `Sequencing Batch` + 
            `Production Reads` +
            `Unique Reads`
            , data = wdata )

fit = lm( `Capture Sensitivity (CS) DP1` ~ `Total DNA Concentration (ng/ul)` + 
            `Endogenous DNA (qPCR - pg/ul)` +
            `% Endogenous DNA` +
            `Starting DNA  (ug)` + 
            `Production Reads` +
            `Unique Reads`
            , data = wdata )


##############################
## are model residuals normal ?
##############################
W = shapiro.test(residuals(fit))

#############################
##  estiamte SS and VarExp
##  assuming an TypeI hierarchical
##  ANOVA
#############################

(a = anova(fit) )
eta = a[, 2]/ sum(a[,2])
names(eta) = rownames(a)


summary(fit)
df = tibble(ID = names(eta) , eta = eta, pvals = a[, 5])
### maintain model order for the plot 
df$ID <- factor(df$ID, levels = df$ID)

### plot
(
  p <- df %>% ggplot( aes( x = ID, y = eta ) ) +
    geom_bar(stat="identity", aes(fill = log10(pvals) )) +
    labs(title = "Type I ANOVA for capture sensitivity", 
         subtitle = paste0( "       Shapiro's W-stat for residuals of fitted model = ", signif(W$statistic, d = 4) )) +
    theme(axis.text.x = element_text(angle = 45,  hjust = 1))
)


hughesevoanth/NILC documentation built on Dec. 7, 2020, 4:19 p.m.