#' Harmonize the compound names on sim_data_xl to make finding the correct rows
#' easier later.
#'
#' Rewrites the first column (\code{...1}) of \code{sim_data_xl} so that
#' compound names and Simulator compound codes are replaced with standardized
#' tokens such as SUBSTRATE, PRIMET1, or PERPETRATOR1INHIB and so that the
#' various ways of flagging an interaction all read WITHINTERACTION.
#'
#' INTERNAL USE ONLY.
#'
#' @param sim_data_xl data.frame whose first column, \code{...1}, holds the
#'   row labels to harmonize
#' @param compoundToExtract character vector of compound IDs to harmonize,
#'   e.g., "substrate" or "inhibitor 1"
#' @param AllCompoundsPresent named object whose names are compound IDs and
#'   whose values are the compound names as listed in \code{sim_data_xl}
#' @param tissue tissue; when this does not match "plasma|blood" and
#'   \code{PerpPresent} is FALSE, the compound names are used as the compound
#'   codes
#' @param PerpPresent TRUE or FALSE
#' @param Deets simulation details; indexed here with
#'   \code{AllCompounds$DetailNames} to check that a compound is present
#' @param ADAM TRUE or FALSE
#' @param AdvBrainModel TRUE or FALSE
#'
#' @return a data.frame of sim_data_xl with compound names and compound codes
#'   switched to, e.g., SUBSTRATE
#'
#' @examples
#' # none
eCT_harmonize <- function(sim_data_xl,
                          compoundToExtract,
                          AllCompoundsPresent,
                          tissue,
                          PerpPresent,
                          Deets,
                          ADAM,
                          AdvBrainModel){

  # Renaming compounds -------------------------------------------------------

  if(!any(ADAM, AdvBrainModel)){

    # If "interaction" or "Csys" or other similar strings are part of the name
    # of any of the compounds, that messes up the regex. Plus, the Simulator
    # has a variety of inconsistent ways that it names compounds that are hard
    # to figure out sometimes. Substituting to standardize the compound names.
    # Also need to consider the possibility that user may have had to hack
    # things and may have the same compound in multiple positions.

    # Scenario 1: Systemic tissue, no perpetrator
    # CSys will be listed for all concs. Each compound ID will be on its own
    # tab.

    # Scenario 2: Systemic tissue, + perpetrator
    # Substrate: CSys for substrate alone, CSys + interaction for substrate +
    # perp, Each metabolite will be on its own tab. All concs labeled as CSys.
    # Perpetrators: ISys for inhibitor 1 concs, ISys 2 for inhibitor 2 concs,
    # ISys and some other number for inhibitor metabolite concs. Not clear how
    # they pick the number for the metabolite concs.

    # REMINDER TO SELF: As far as I can tell, only substrate and inhibitor
    # concentrations are available for solid tissues.

    # Scenario 3: Solid tissue except sometimes liver, no perpetrator
    # Many concs possible.
    # Adipose, Bone, Brain (except when AdvBrainModel), Gut, Heart, Kidney,
    # Lung, Muscle, Skin, Spleen, Pancreas: CTissue

    # Scenario 4: Solid tissue except sometimes liver, + perpetrator
    # Adipose, Bone, Brain (except when AdvBrainModel), Gut, Heart, Kidney,
    # Lung, Muscle, Skin, Spleen, Pancreas: CTissue or CTissue + Interaction
    # for substrate, ITissue(Inh X) for inhibitor. This will be 1 for
    # inhibitor 1 but need to check on other perp compound IDs.

    # Scenario 5: Liver sometimes
    # Can have sub, PM1, PM2, maybe sec met?, inhib1, inhib2, maybe inhib1
    # met?. Example: "mdz-met1-met2-inhib1-inhib2-md-alltissues-v22.xlsx"

    NApos <- which(is.na(sim_data_xl$...1))

    # Looking for all possible compounds. If there is an inhibitor, this will
    # include substrate alone as well as substrate + interaction.
    CmpdMatches1 <- sim_data_xl$...1[(NApos[1] + 1):(NApos[2]-1)]
    CmpdMatches1 <- CmpdMatches1[!str_detect(CmpdMatches1, "Trial")]

    # For some pediatric simulations, subjects are binned by age, and there
    # will be at least 1 extra row for the age bin. This should not be
    # included for CmpdMatches1, I'm pretty sure. I have only encountered one
    # example of this so far. If there is a "bin", then any row that has
    # "mean" and also "bin" should have the "mean" deleted or we end up with
    # two "mean" rows and that messes everything up.
    CmpdMatches1 <- CmpdMatches1[!str_detect(CmpdMatches1, " \\(Bin [0-9]")]
    BinRows <- which(str_detect(sim_data_xl$...1, " Bin [0-9]| \\(Bin [0-9]"))
    sim_data_xl$...1[BinRows] <- sub("[Mm]ean", "", sim_data_xl$...1[BinRows])

    # # If the compound is not on the same tab as the substrate, then removing
    # # all the "Trial" rows removes all the rows with the compound name.
    # # Adjusting for that. Also adjusting for the weird situation with some
    # # liver concentrations where the metabolite concentrations are on the same
    # # tab as the substrate and inhibitors.
    # if(any(compoundToExtract %in% c("primary metabolite 1", "primary metabolite 2",
    # "secondary metabolite")) &
    # !(tissue == "liver" &
    # as.numeric(str_extract(Deets$SimulatorVersion, "[0-9]{2}")) >= 22)){
    # CmpdMatches1 <- rep(AllCompoundsPresent[[compoundToExtract]], length(CmpdMatches1))
    # }

    # Next, need to figure out which combination of CSys and ISys 1 or ISys 3
    # or whatever number belongs to which actual compound. Looking for what
    # compounds were listed under "Population Statistics" b/c that's where
    # they use that kind of coding.
    StartRow <- which(str_detect(sim_data_xl$...1, "Population Statistics"))[1]
    if(is.na(StartRow)){
      # Need to deal w/animal sim data, which will not have any aggregate
      # data. Instead, there will be a row titled "Statistics" for plasma
      # data. It's different for brain tissue, fyi.
      StartRow <- which(str_detect(sim_data_xl$...1,
                                   "^(Individual )?Statistics"))[1]
    }

    EndRow <- which(str_detect(sim_data_xl$...1, "Individual Statistics"))[1]-1
    if(is.na(EndRow)){
      EndRow <- nrow(sim_data_xl)
    } else {
      # Last non-missing row before the individual statistics start
      EndRow <- max(which(complete.cases(sim_data_xl$...1[1:EndRow])))
    }

    # Regex for the Simulator's coded compound labels; used both to find the
    # rows that contain a code and to pull the code out of those rows.
    CodeRegex <- "^(P)?[CM](2)?(II)?(Sys|liver|pv|Tissue| lumen free|Peripheral)(.*[iI]nteraction)?|I(Sys|liver|pv| lumen free|Peripheral) [1-9]?|ITissue.Inh|InhM"

    CmpdMatches2 <- sim_data_xl$...1[StartRow:EndRow]
    CmpdMatches2 <- CmpdMatches2[which(str_detect(CmpdMatches2, CodeRegex))]
    CmpdMatches2 <- str_trim(str_extract(CmpdMatches2, CodeRegex))
    CmpdMatches2 <- CmpdMatches2[complete.cases(CmpdMatches2)]

    # Standardizing how an interaction is flagged in the code, e.g.,
    # "CSys + interaction" becomes "CSys interaction".
    HasInteraction <- str_detect(CmpdMatches2, "\\+( )?[iI]nteraction")
    CmpdMatches2[HasInteraction] <-
      paste(str_extract(CmpdMatches2[HasInteraction],
                        "(P)?[CIM](2)?(II)?(Sys|liver|pv|Tissue| lumen free|Peripheral)"),
            "interaction")

    # For some tissues, regex above will result in inhibitor 1 being labeled
    # as "ITissue(Inh", and that's not optimal for subsequent steps. Remove
    # the "(Inh" bit.
    CmpdMatches2 <- sub("\\(Inh", "", CmpdMatches2)

    # Last step: Find the unique versions of the coding.
    CmpdMatches2 <- unique(CmpdMatches2)

    if(PerpPresent == FALSE & str_detect(tissue, "plasma|blood") == FALSE){
      CmpdMatches2 <- CmpdMatches1
    }

    if(length(CmpdMatches1) != length(CmpdMatches2)){
      warning("PLEASE TELL LAURA SHIREMAN YOU SAW AN ERROR CALLED `COMPOUNDCODE` WHEN TRYING TO EXTRACT CONCENTRATION TIME DATA")
    }

    # Inverting the lookup: values are compound IDs, names are compound names
    AllCompoundsInv <- names(AllCompoundsPresent)
    names(AllCompoundsInv) <- AllCompoundsPresent

    # This works fine as long as there are no duplicate compounds, e.g., Drug
    # X is both the primary metabolite 1 AND inhibitor 1, which CAN happen
    # when we need to hack things in the Simulator. Need to filter to retain
    # ONLY compounds in compoundsToExtract or the function glitches farther
    # down.
    AllCompoundsInv <- AllCompoundsInv[AllCompoundsInv %in% compoundToExtract]

    # Admittedly, this step here where we say that CmpdMatches1, which is the
    # actual compound names, is going to be in the same order as CmpdMatches2,
    # which lists the coded versions of the compounds, makes me nervous just
    # b/c it's coding by index and the two items aren't perfectly matched --
    # we had to remove a bunch of excess junk in between them. So far, though,
    # I haven't found an example of this failing. The order that compounds are
    # listed -- whether by their actual names or by their coded names -- seems
    # to be the same always.
    CmpdMatches <- data.frame(NamesInExcel = CmpdMatches1,
                              CompoundCode = CmpdMatches2) %>%
      mutate(CompoundName = sub("( )?\\+( )?[Ii]nteraction", "", NamesInExcel),
             CompoundID = AllCompoundsInv[CompoundName],
             CompoundID = ifelse(str_detect(CompoundCode, "I(Sys|liver|pv|Tissue)") &
                                   CompoundID %in% c("substrate",
                                                     "primary metabolite 1",
                                                     "primary metabolite 2",
                                                     "secondary metabolite"),
                                 AllCompoundsInv[str_detect(AllCompoundsInv, "inhibitor")][CompoundName],
                                 CompoundID),
             # gsub rather than sub: EVERY regex-special character in the
             # name must be swapped for ".", not only the first one, or a
             # name such as "Drug (free)" leaves an unmatched ")" in the
             # pattern built from this column below.
             CompoundNameForRegex = gsub("\\+|\\(|\\)|\\%", ".", CompoundName),
             Interaction = str_detect(CompoundCode, "interaction"))

    # Output sometimes doesn't list the compound name. Not sure what the
    # pattern is for when this happens. They only list, e.g., "CPlasma" or
    # "ITissue".
    CompoundThatShouldBePresent <- compoundToExtract
    ShouldBeButNotInhib <- CompoundThatShouldBePresent[!str_detect(CompoundThatShouldBePresent, "inhibitor")]
    if(length(ShouldBeButNotInhib) == 0){
      ShouldBeButNotInhib <- NA
    }

    if(any(is.na(CmpdMatches$CompoundID))){
      CmpdMatches <- CmpdMatches %>%
        mutate(CompoundID = case_when(
          is.na(CompoundID) & CompoundCode %in% c("CPlasma",
                                                  "CPlasma interaction",
                                                  "CSys",
                                                  "CSys interaction",
                                                  "CTissue",
                                                  "CTissue Sub",
                                                  "CTissue interaction") ~ ShouldBeButNotInhib,
          is.na(CompoundID) & CompoundCode == "ITissue" ~ "inhibitor 1",
          is.na(CompoundID) & CompoundCode == "IPlasma" ~ "inhibitor 1",
          TRUE ~ CompoundID))
    }

    rm(CmpdMatches1, CmpdMatches2, NApos, StartRow, EndRow, AllCompoundsInv,
       CodeRegex, HasInteraction)

    for(cmpd in compoundToExtract){

      if(complete.cases(Deets[AllCompounds$DetailNames[AllCompounds$CompoundID == cmpd]]) &
         cmpd %in% CmpdMatches$CompoundID){

        # NB: I made inhibitor 1 be "PERPETRATOR1INHIB" rather than just
        # "PERPETRATOR1" for ease of regex. If it's just "PERPETRATOR1"
        # then, later, the regex will match both "PERPETRATOR1" and
        # "PERPETRATOR1METABOLITE".

        # Challenging scenario: Sometimes people have to hack the simulator
        # and have the same compound in multiple positions in the Simulator,
        # making it extremely challenging to make sure to match the right
        # compound IDs with the right compounds b/c one compound will have
        # multiple matches. For that reason, only doing regex ONE COMPOUNDID
        # AT A TIME and only for the subset of rows that apply to that
        # compound ID.

        # NOTE(review): the pattern here is however many non-interaction
        # codes matched this compound ID; this presumably expects exactly
        # one -- confirm.
        CmpdRows <- which(str_detect(sim_data_xl$...1,
                                     CmpdMatches %>%
                                       filter(CompoundID == cmpd &
                                                Interaction == FALSE) %>%
                                       pull(CompoundCode)))

        sim_data_xl$...1[CmpdRows] <-
          sub(pattern =
                str_c(c(
                  CmpdMatches$CompoundCode[
                    which(CmpdMatches$CompoundID == cmpd)],
                  CmpdMatches$CompoundNameForRegex[
                    which(CmpdMatches$CompoundID == cmpd)]),
                  collapse = "|"),
              replacement =
                switch(cmpd,
                       "substrate" = "SUBSTRATE",
                       "primary metabolite 1" = "PRIMET1",
                       "primary metabolite 2" = "PRIMET2",
                       "secondary metabolite" = "SECMET",
                       "inhibitor 1" = "PERPETRATOR1INHIB",
                       "inhibitor 2" = "PERPETRATOR2",
                       "inhibitor 1 metabolite" = "PERPETRATOR1MET"),
              x =
                sim_data_xl$...1[CmpdRows])

      } else if(cmpd %in% compoundToExtract){
        # NB: Only sim_data_xl is returned and compoundToExtract is not read
        # again after this loop, so this reassignment does not change the
        # output of this function.
        compoundToExtract <- setdiff(compoundToExtract, cmpd)
      }
    }

  } else if(ADAM){

    # ADAM and AdvBrainModel data have different requirements and generally
    # only have a more limited set of compounds available.

    # Release fraction doesn't include that verbiage in the aggregated data
    # section. Fixing that. Cumulative release has its own tab, but I'm
    # checking that so that we don't inadvertently reassign some aggregated
    # data to release fraction incorrectly.
    if(any(str_detect(sim_data_xl$...1, "Release Fraction"), na.rm = TRUE) &&
       !any(str_detect(sim_data_xl$...1, "^Ms|^Dissolution Rate Solid State|^C Lumen Free|^C Lumen Total|^Heff|^Absorption Rate|^Mur|^Md|^Inh Md|^Luminal CLint|CTissue|dissolved|absorbed|^C Enterocyte|CIntracranial|CBrainI[CS]F|CCSF(Spinal|Cranial)|Kpuu_I[CS]F|Kpuu_BrainMass|CTotalBrain"),
            na.rm = TRUE)){

      # Rows whose entire label is one of the aggregate statistics
      AggRows <- tolower(sim_data_xl$...1) %in% c("mean",
                                                  "95th percentile",
                                                  "5th percentile",
                                                  "95 percentile",
                                                  "95th ptile")

      sim_data_xl$...1[AggRows] <-
        paste("Release Fraction", sim_data_xl$...1[AggRows])
    }
  }

  # Renaming interaction text -------------------------------------------------

  # Multiple ways to specify that the data are the substrate or metabolite data
  # in the presence of the perpetrator. Harmonizing those, too.
  sim_data_xl$...1 <- sub("After Inh|[iI]nteraction", "WITHINTERACTION",
                          sim_data_xl$...1)

  # When it's lumen free, for some reason, it's labeled differently. Dealing
  # with that.
  sim_data_xl$...1 <- sub("Inh C Lumen Free", "C Lumen Free WITHINTERACTION",
                          sim_data_xl$...1)

  return(sim_data_xl)

}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.