knitr::opts_chunk$set( echo = TRUE, fig.width = 6, fig.asp = 0.618, out.width = "70%", fig.align = "center", root.dir = '../' ) library(tidyverse) library(forcats) library(painBiomarkR)
We have two spreadsheets - one for the papers, and one for the biomarkers.
papers <- readxl::read_excel("2018_12_10-papers.xlsx") biomarkers <- readxl::read_excel("2018_12_10-biomarkers.xlsx")
map_biomarker <- readxl::read_excel("2018_12_12-mappers.xlsx", sheet = 1) map_strain <- readxl::read_excel("2018_12_12-mappers.xlsx", sheet = 2) map_tissue <- readxl::read_excel("2018_12_12-mappers.xlsx", sheet = 3)
Check that no missing
biomarkers %>% filter(is.na(`STUDY (PMID)`))
Export the unique biomarkers so that can set up as google sheet. The rest can then check and map for cleaning.
# biomarker_mapper <- data_frame(from = unique(biomarkers$BIOMARKER)) # biomarker_mapper %>% write_excel_csv("mapper_biomarker.csv")
We can now use the mapper to convert the biomarkers.
biomarkers <- biomarkers %>% mutate( SYSTEM = map_var(BIOMARKER, from = map_biomarker$from, to = map_biomarker$System), CLASS = map_var(BIOMARKER, from = map_biomarker$from, to = map_biomarker$Class))
Check
biomarkers %>% count(BIOMARKER, SYSTEM) %>% ggplot(aes(BIOMARKER, SYSTEM)) + geom_point() biomarkers %>% count(BIOMARKER, CLASS) %>% ggplot(aes(BIOMARKER, CLASS)) + geom_point()
Same as biomarker, set up google sheet for mapping.
# tissue_mapper <- data_frame(from = unique(biomarkers$TISSUE)) # tissue_mapper %>% write_excel_csv("mapper_tissue.csv")
Use mapper to add new columns
biomarkers <- biomarkers %>% mutate(TISSUE2 = map_var(TISSUE, from = map_tissue$from, to = map_tissue$to), ASSAYABLE = map_var(TISSUE, from = map_tissue$from, to = map_tissue$Assayable))
Check mapping
biomarkers %>% count(TISSUE, TISSUE2) %>% ggplot(aes(TISSUE, TISSUE2)) + geom_point() biomarkers %>% count(TISSUE, ASSAYABLE) %>% ggplot(aes(TISSUE, ASSAYABLE)) + geom_point()
Check that only yes and no.
biomarkers %>% count(mRNA)
Check only yes and no.
biomarkers %>% count(PROTEIN)
For response, the only levels should be decrease, increase, and no change.
biomarkers %>% count(RESPONSE)
Looks fine.
papers
Not needed in final dataset so remove.
papers <- papers %>% select(-Identifier)
Should be a single row for each PMID, so check.
papers %>% count(`Study PMID`) %>% filter(nn > 1)
All good.
As PMID is ID move to first place
papers <- papers %>% select(`Study PMID`, everything())
We use the RISmed
package to download dates from Pubmed, these are then added to the papers data frame.
papers # dates <- get_PM_dates(papers$`Study PMID`) # write_csv(dates, "dates.csv") dates <- read_csv("dates.csv") dates
Add the dates to the papers data frame
papers <- papers %>% left_join(dates, by = c("Study PMID" = "PMID")) papers
Check missing
papers %>% filter(is.na(Year))
These are all text fields, so not much general checking that can be done.
papers$Title[1] papers$Description[1] papers$Details[1]
The allowable pain measures are (See 2018_12_10_pain_measures.jpg).
1.8, 1.9, 2.7, 6, 7, 9, 13, 14, 18, 19, 20, 21, 27, 30, 300
papers$`Pain Measures` %>% str_split(., ",") %>% unlist() %>% unique()
Not sure if these are right, will need to check, in the meantime, we can remove spaces to ensure consistency.
papers <- papers %>% mutate(`Pain Measures` = str_remove_all(`Pain Measures`, " ")) papers$`Pain Measures` %>% str_split(., ",") %>% unlist() %>% unique()
Also this entry has no pain measure:
papers %>% filter(str_detect(`Pain Measures`, "NA"))
So set to NA
papers$`Pain Measures`[which(papers$`Pain Measures` == "NA")] <- NA papers$`Pain Measures` %>% str_split(., ",") %>% unlist() %>% unique()
Get list of papers with bad pain measures.
get_bad_pain_measures <- function(x){ allowed_pain_measures <- c( 1.8, 1.9, 2.7, 6, 7, 9, 13, 14, 18, 19, 20, 21, 27, 30, 300 ) if(any(!x %in% allowed_pain_measures)){ return(FALSE) } else { return(TRUE) } } bad_papers <- which( papers$`Pain Measures` %>% str_split(., ",") %>% map_lgl(get_bad_pain_measures) == FALSE) papers$`Study PMID`[bad_papers]
papers$`Measured pain type` %>% str_split(., ",") %>% unlist() %>% unique()
Will get rid of spaces, and convert ?? to ?.
papers <- papers %>% mutate( `Measured pain type` = str_remove_all(`Measured pain type`, " "), `Measured pain type` = str_replace_all(`Measured pain type`,"\\?\\?", "?") ) papers$`Measured pain type` %>% str_split(., ",") %>% unlist() %>% unique()
Set "NA" to NA
.
papers$`Measured pain type`[which(papers$`Measured pain type` == "NA")] <- NA papers$`Measured pain type` %>% str_split(., ",") %>% unlist() %>% unique()
Get data frame of model initiators. Note that diabetes did not have category code so I set to "L".
model_initiators <- readxl::read_excel("2018_12_10-initiators.xlsx") model_initiators
Some rows are just headers, the rest of the codes, so split
headers <- which(!is.na(model_initiators$Category)) codes <- which(is.na(model_initiators$Category))
Set Categories for the codes
model_initiators <- model_initiators %>% fill(Category) model_initiators
Strip out headers
headers <- model_initiators[headers,] headers model_initiators <- model_initiators[codes,] model_initiators
Add back category names
model_initiators <- model_initiators %>% left_join(headers, by = "Category") %>% rename(Subcategory = `Sub Category.x`, Code = Code.x, Category_name = `Intervention type.y`, Intervention_type = `Intervention type.x`) %>% select(-`Sub Category.y`, -Code.y) %>% mutate(Category_name = str_remove_all(Category_name, "\\d+")) model_initiators
Note that some of the codes have lots of decimal places. I think this is excel storing a larger number. Looking at the pattern all the rest are to two decimal places so we will round these.
Also 14.34 had a backtick so removed in the excel.
model_initiators$Code model_initiators <- model_initiators %>% mutate(Code = round(Code, 2))
Now we can check the papers dataset for correct model initiators.
papers$`Model Initiator` %>% str_split(., ",") %>% unlist() %>% unique()
Lot of stuff, first get rid of spaces
papers <- papers %>% mutate(`Model Initiator` = str_replace_all(`Model Initiator`, ", ", ",")) papers$`Model Initiator` %>% str_split(., ",") %>% unlist() %>% unique()
Next we have the rounding problem, so we will round
clean_model_initiator <- function(x){ y <- round(as.numeric(x), 2) is_na <- which(is.na(y)) y[is_na] <- x[is_na] y <- str_c(y, collapse = ",") return(y) } papers$model_initiator <- papers$`Model Initiator` %>% str_split(., ",") %>% map_chr(clean_model_initiator) papers %>% select(`Model Initiator`, model_initiator)
Now find levels that are not in model initiator dataframe.
get_bad_model_initiators <- function(x, allowed_MI){ if(any(!x %in% allowed_MI)){ return(TRUE) } else { return(FALSE) } } bad_papers <- papers$model_initiator %>% str_split(., ",") %>% map_lgl(get_bad_model_initiators, allowed_MI = model_initiators$Code) papers %>% filter(bad_papers) %>% select(`Study PMID`, model_initiator)
obs_MI <- papers$model_initiator %>% str_split(., ",") %>% unlist() %>% unique() obs_MI[!obs_MI %in% model_initiators$Code]
19937403 has space in front of 18.04, and 24398148 has period rather than dot - these are fixed by me in the excel.
bad_papers <- papers$model_initiator %>% str_split(., ",") %>% map_lgl(get_bad_model_initiators, allowed_MI = model_initiators$Code) papers %>% filter(bad_papers) %>% select(`Study PMID`, model_initiator)
papers$`Nerve/immune model` %>% str_split(., ",") %>% unlist() %>% unique()
Remove spaces, set to one stress, set NAs, and set N2I/I2N to both
papers <- papers %>% mutate( `Nerve/immune model` = str_remove_all(`Nerve/immune model`, " "), `Nerve/immune model` = str_replace(`Nerve/immune model`, "stress", "Stress"), `Nerve/immune model` = str_replace(`Nerve/immune model`, "N2I/I2N", "both") ) papers$`Nerve/immune model`[which(papers$`Nerve/immune model` == "NA")] <- NA papers$`Nerve/immune model` %>% str_split(., ",") %>% unlist() %>% unique()
Need to check if "both" and "N2I/I2N" are the same.
unique(papers$blinded)
Need to check if "notstateddesigned" can be converted to "not stated".
Get "notstateddesigned" for SH
papers %>% filter(blinded == "not stateddesigned")
papers$Species %>% str_split(., ",") %>% unlist() %>% unique()
Get rid of spaces, and convert "mouse and rat" to "mouse,rat"
papers <- papers %>% mutate( Species = str_replace(Species, "mouse and rat", "mouse,rat"), Species = str_remove(Species, " "), Species = str_to_title(Species) ) papers$Species %>% str_split(., ",") %>% unlist() %>% unique()
papers$Strain %>% str_split(., ",") %>% unlist() %>% unique()
Some spaces to remove
papers <- papers %>% mutate(Strain = str_replace_all(Strain, ", ", ",")) papers$Strain %>% str_split(., ",") %>% unlist() %>% unique() %>% sort()
May need a mapper.
strain_mapper <- data_frame( from = papers$Strain %>% str_split(., ",") %>% unlist() %>% unique() %>% sort() ) strain_mapper %>% write_excel_csv("mapper_strain.csv")
Using the mapper
papers <- papers %>% mutate(strain_species = map_var( Strain, from = map_strain$from, to = map_strain$to))
papers %>% filter(Species != strain_species) %>% select(`Study PMID`, Strain, Species, strain_species) %>% write_excel_csv("../data/species_mismatch.csv")
unique(papers$Sex)
So convert "Female" to "female"; and "no stated" to "not stated". Finally "NA" to NA.
papers <- papers %>% mutate( Sex = str_replace(Sex, "no stated", "not stated"), Sex = str_replace(Sex, "Female", "female") ) papers$Sex[which(papers$Sex == "NA")] <- NA unique(papers$Sex)
unique(papers$Neuter)
All fine
This is a text file so leave as is.
papers$Comment[1]
Last column is nothing of note so remove
table(papers$X__1, useNA = "always") papers <- papers %>% select(-X__1)
paper_biomarker <- left_join(papers, biomarkers, by = c("Study PMID" = "STUDY (PMID)"))
table(papers$`Nerve/immune model`)
devtools::use_data(biomarkers, overwrite = TRUE) devtools::use_data(papers, overwrite = TRUE) devtools::use_data(paper_biomarker, overwrite = TRUE) devtools::use_data(model_initiators, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.