knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(beemixtox1) library(tidyverse) options(knitr.kable.NA = '')
The following filtering criteria was used to extract the relevant data form the EPA-Ecotox database. In this extraction, the dataset contains 576 CAS numbers.
#Define and set paths Dir_Data = "~/Projects/beemixtox_public/data-raw/" Data.SSD = read.csv(paste0(Dir_Data,"data/Copy of etc4373-sup-0002-supmat.csv")) ## usethis::use_data(Data.SSD) ## The common-name of the EPA Ecotox DB. #Load data (USEPA internal not for publication) #Load data Data = read.csv(paste0(Dir_Data,"data/TerrestrialReport.csv")) ## usethis::use_data("Data") length(unique(Data$CAS.Number)) #578 CAS
#Subset dataset for relevant parameters Data.1 = Data Data.1 = Data.1[Data.1$Organism.Lifestage%in%c("Adult"),] #Subset for relevant exposure times Data.1$Observed.Duration..Days.[Data.1$Observed.Duration..Days.=="NR"] = NA Data.1$Observed.Duration..Days. = as.numeric(as.character(Data.1$Observed.Duration..Days.)) # Restrict to 24 h & 48 h end-points Data.1 = Data.1[Data.1$Observed.Duration..Days.>=1 & Data.1$Observed.Duration..Days.<=2,] #Restrict to definitive LDx Data.1 = Data.1[Data.1$Observed.Response.Mean.Op%in%c(""),] Data.1 = droplevels(Data.1) length(unique(Data.1$CAS.Number)) # How many end-points per CAS number? NPerCAS = aggregate(Observed.Response.Mean ~ CAS.Number, data =Data.1, length) NPerCAS = NPerCAS[order(NPerCAS$Observed.Response.Mean, decreasing = T),] # Restrict to more than 5 observations NPerCAS = NPerCAS[NPerCAS$Observed.Response.Mean>=3,] #29 compunts # Restrict Data.1 for these 29 compounds Data.1 = Data.1[Data.1$CAS.Number%in%c(NPerCAS$CAS.Number),]; Data.1 = droplevels(Data.1) # Homogenize units table(Data.1$Observed.Response.Units) ## UNITS Data.1$Observed.Response.Mean = as.numeric(as.character(Data.1$Observed.Response.Mean)) # Retain only useful units Data.1 = Data.1[Data.1$Observed.Response.Units%in%c("AI mg/org","AI ng/org","AI ug/org","mg/bee","mg/org","ng/org","ug/bee","ug/org"),]; Data.1 = droplevels(Data.1) #Transform mg to microgram Data.1$Observed.Response.Mean[Data.1$Observed.Response.Units%in%c("AI mg/org","mg/bee","mg/org")] = Data.1$Observed.Response.Mean[Data.1$Observed.Response.Units%in%c("AI mg/org","mg/bee","mg/org")]*1000 #to ug/bee Data.1$Observed.Response.Units[Data.1$Observed.Response.Units%in%c("AI mg/org","mg/bee","mg/org")] = "ug/bee" #ng to microgram Data.1$Observed.Response.Mean[Data.1$Observed.Response.Units%in%c("AI ng/org","ng/org")] = Data.1$Observed.Response.Mean[Data.1$Observed.Response.Units%in%c("AI ng/org","ng/org")]/1000 #to ug/bee Data.1$Observed.Response.Units[Data.1$Observed.Response.Units%in%c("AI ng/org","ng/org")] = "ug/bee" # Homogenize ng/bee Data.1$Observed.Response.Units[Data.1$Observed.Response.Units%in%c("AI ug/org","ug/bee","ug/org")] = "ug/bee" table(Data.1$Observed.Response.Units) Data.1 = droplevels(Data.1) # Exposure media table(Data.1$Exposure.Type) Data.1 = Data.1[!Data.1$Exposure.Type%in%c("Oral via capsule","Spray"),]; Data.1 = droplevels(Data.1)
# Approximate variability Bee.var.N = aggregate(Observed.Response.Mean ~ CAS.Number, data =Data.1, length) Bee.var.Ave = aggregate(Observed.Response.Mean ~ CAS.Number, data =Data.1, mean) Bee.var.sd = aggregate(Observed.Response.Mean ~ CAS.Number, data =Data.1, sd) # Merge Bee.var = merge(Bee.var.N,Bee.var.Ave, by="CAS.Number") Bee.var = merge(Bee.var,Bee.var.sd, by="CAS.Number") names(Bee.var) [c(2:4)] = c("N", "Mean", "sd") # Restrict to at least 5 cases per CAS Bee.var = Bee.var[Bee.var$N>3,] # 16 chemicals Bee.var$CV.Perc = 100*(Bee.var$Mean/Bee.var$sd) Bee.var = merge(Bee.var, Data.SSD[,c(1,3)], by.x = "CAS.Number", by.y = "CAS.", all.x = T) Bee.var = Bee.var[order(Bee.var$N, decreasing = T),] # Summary of CV% summary(Bee.var) # Check for independency of N and Mean plot(Bee.var$N,Bee.var$CV.Perc) plot(log10(Bee.var$N),Bee.var$CV.Perc) plot(log10(Bee.var$Mean), Bee.var$CV.Perc) # Which chemicals do we have represented?
#restrict Data.1 to the selected CAS.N Data.2 = Data.1 Data.2 = Data.2[Data.2$CAS.Number%in%Bee.var$CAS.Number,]; Data.2 = droplevels(Data.2) Data.2$CAS.Number = as.character(Data.2$CAS.Number)
# Check consistency again table(Data.2$Chemical.Grade) %>% knitr::kable(.,caption ="Chemical Grade")%>% kableExtra::kable_classic_2() # Chemical grade is not reported in most cases table(Data.2$Organism.Lifestage)%>% knitr::kable(.,caption ="Organism Life Stage")%>% kableExtra::kable_classic_2() table(Data.2$Exposure.Type)%>% knitr::kable(.,caption ="Exposure Type")%>% kableExtra::kable_classic_2() table(Data.2$Number.of.Doses)%>% knitr::kable(.,caption ="Number of Doses")%>% kableExtra::kable_classic_2() table(Data.2$Observed.Duration..Days.)%>% knitr::kable(.,caption ="Duration in Days")%>% kableExtra::kable_classic_2()
aov.1 = aov(Observed.Response.Mean ~ CAS.Number + Conc.1.Type..Author. + Exposure.Type + Observed.Duration..Days., data = Data.2) summary(aov.1) # Save reference list only Data.2.Ref = unique(Data.2[,c(70,71,72,73,74)]) ## usethis::use_data(Data.2) ## write.csv(Data.2.Ref, file = "Data.2.Ref_ECOTOX_BeeLD50_Curated.csv", row.names = F)
In this section we we use EDA and ANOVA to check possible correlations among the variables.
Data.2 <- Data.2 %>% mutate(logTox = log(Observed.Response.Mean)) mod = lm(log(Observed.Response.Mean) ~ CAS.Number + Conc.1.Type..Author. + Exposure.Type + Observed.Duration..Days., data = Data.2) summary(mod) ## car::Anova(mod) %>% pander::pander(.)
car::Anova(mod) %>% knitr::kable(.,digits = 3) %>% kableExtra::kable_classic()
As shown in the ANOVA Table, none of the covariate variables, except for CAS, has a significant effect on the observed response mean.
From the EDA, we see the formulation has in general has higher observed response mean values as hown in Figure 1 and Figure 3. However, if we break it down to each CAS, the relationship does not hold for most of the CAS. Figure 4 confirmed our finding in the simple ANOVA analysis, the variance is mainly explained by the CAS difference, not by Conc types.
library(GGally) ggpairs(Data.2[,c("logTox","Conc.1.Type..Author.","Exposure.Type","Observed.Duration..Days.")]) ggplot(data=Data.2,aes(x=Observed.Duration..Days.,y=log(Observed.Response.Mean)))+geom_point()+geom_boxplot(aes(group=Observed.Duration..Days.)) ldat <- Data.2 %>% dplyr::select(c(CAS.Number , Conc.1.Type..Author., Exposure.Type,logTox)) %>% pivot_longer(!logTox,names_to="variables",values_to="value") ggplot(ldat,aes(x=value,y=logTox))+geom_boxplot()+geom_point()+facet_wrap(~variables,scales = "free")+ theme(axis.text.x = element_text(angle = 90))
ggplot(Data.2,aes(x=Conc.1.Type..Author.,y=logTox))+geom_boxplot()+geom_point()+facet_wrap(~CAS.Number,scales = "free_y")+ theme(axis.text.x = element_text(angle = 90))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.