knitr::opts_chunk$set(
  echo = TRUE, 
  fig.width = 6, 
  fig.asp = 0.618, 
  out.width = "70%",
  fig.align = "center", 
  root.dir = '../'
)
library(tidyverse)
library(forcats)
library(painBiomarkR)

Read in the excel spreadsheets.

We have two spreadsheets - one for the papers, and one for the biomarkers.

papers  <- readxl::read_excel("2018_12_10-papers.xlsx")
biomarkers  <- readxl::read_excel("2018_12_10-biomarkers.xlsx")

Clean biomarkers

Read in mappers

map_biomarker  <- readxl::read_excel("2018_12_12-mappers.xlsx", sheet = 1)
map_strain  <- readxl::read_excel("2018_12_12-mappers.xlsx", sheet = 2)
map_tissue  <- readxl::read_excel("2018_12_12-mappers.xlsx", sheet = 3)

PMID

Check that no missing

biomarkers %>% filter(is.na(`STUDY (PMID)`))

BIOMARKER

Export the unique biomarkers so that can set up as google sheet. The rest can then check and map for cleaning.

# biomarker_mapper  <- data_frame(from = unique(biomarkers$BIOMARKER))
# biomarker_mapper %>% write_excel_csv("mapper_biomarker.csv")

We can now use the mapper to convert the biomarkers.

biomarkers  <- 
  biomarkers %>% 
  mutate(
    SYSTEM = map_var(BIOMARKER, 
                     from = map_biomarker$from, 
                     to = map_biomarker$System), 
    CLASS = map_var(BIOMARKER, 
                    from = map_biomarker$from,
                    to = map_biomarker$Class))

Check

biomarkers %>% 
  count(BIOMARKER, SYSTEM) %>% 
  ggplot(aes(BIOMARKER, SYSTEM)) + geom_point()
biomarkers %>% 
  count(BIOMARKER, CLASS) %>% 
  ggplot(aes(BIOMARKER, CLASS)) + geom_point()

TISSUE

Same as biomarker, set up google sheet for mapping.

# tissue_mapper  <- data_frame(from = unique(biomarkers$TISSUE))
# tissue_mapper %>% write_excel_csv("mapper_tissue.csv")

Use mapper to add new columns

biomarkers <-   
biomarkers %>% 
  mutate(TISSUE2 = map_var(TISSUE, 
                           from = map_tissue$from, 
                           to = map_tissue$to), 
         ASSAYABLE = map_var(TISSUE, 
                             from = map_tissue$from, 
                             to = map_tissue$Assayable))

Check mapping

biomarkers %>% 
  count(TISSUE, TISSUE2) %>% 
  ggplot(aes(TISSUE, TISSUE2)) + geom_point()
biomarkers %>% 
  count(TISSUE, ASSAYABLE) %>% 
  ggplot(aes(TISSUE, ASSAYABLE)) + geom_point()

mRNA

Check that only yes and no.

biomarkers %>% count(mRNA)

PROTEIN

Check only yes and no.

biomarkers %>% count(PROTEIN)

RESPONSE

For response, the only levels should be decrease, increase, and no change.

biomarkers %>% count(RESPONSE)

Looks fine.

Clean papers file

papers

Identifier

Not needed in final dataset so remove.

papers  <- papers %>% select(-Identifier)

PMID

Should be a single row for each PMID, so check.

papers %>% count(`Study PMID`) %>% 
  filter(nn > 1)

All good.

As PMID is ID move to first place

papers  <- 
  papers %>% 
  select(`Study PMID`, everything())

Publication date

We use the RISmed package to download dates from Pubmed, these are then added to the papers data frame.

papers
# dates  <- get_PM_dates(papers$`Study PMID`)
# write_csv(dates, "dates.csv")
dates  <- read_csv("dates.csv")
dates

Add the dates to the papers data frame

papers  <- 
  papers %>% left_join(dates, by = c("Study PMID" = "PMID"))
papers

Check missing

papers %>% 
  filter(is.na(Year))

Title, Description, and Details

These are all text fields, so not much general checking that can be done.

papers$Title[1]
papers$Description[1]
papers$Details[1]

Pain measures

The allowable pain measures are (See 2018_12_10_pain_measures.jpg).

1.8, 1.9, 2.7, 6, 7, 9, 13, 14, 18, 19, 20, 21, 27, 30, 300

papers$`Pain Measures` %>% str_split(., ",") %>% unlist() %>% unique()

Not sure if these are right, will need to check, in the meantime, we can remove spaces to ensure consistency.

papers  <- 
  papers %>% 
  mutate(`Pain Measures` = str_remove_all(`Pain Measures`, " "))
papers$`Pain Measures` %>% str_split(., ",") %>% unlist() %>% unique()

Also this entry has no pain measure:

papers %>% 
  filter(str_detect(`Pain Measures`, "NA"))

So set to NA

papers$`Pain Measures`[which(papers$`Pain Measures` == "NA")]  <- NA
papers$`Pain Measures` %>% str_split(., ",") %>% unlist() %>% unique()

Get list of papers with bad pain measures.

get_bad_pain_measures  <- function(x){
  allowed_pain_measures  <- c(
    1.8, 1.9, 2.7, 6, 7, 9, 13, 14, 18, 19, 20, 21, 27, 30, 300
  )
  if(any(!x %in% allowed_pain_measures)){
    return(FALSE)
  } else {
    return(TRUE)
  }
}


bad_papers  <- which(
papers$`Pain Measures` %>% 
str_split(., ",") %>% 
map_lgl(get_bad_pain_measures) == FALSE)
papers$`Study PMID`[bad_papers]

Measured pain

papers$`Measured pain type` %>% str_split(., ",") %>% unlist() %>% unique()

Will get rid of spaces, and convert ?? to ?.

papers  <- 
  papers %>% 
  mutate(
    `Measured pain type` = str_remove_all(`Measured pain type`, " "), 
    `Measured pain type` = str_replace_all(`Measured pain type`,"\\?\\?", "?")
    )
papers$`Measured pain type` %>% str_split(., ",") %>% unlist() %>% unique()

Set "NA" to NA.

papers$`Measured pain type`[which(papers$`Measured pain type` == "NA")]  <- NA
papers$`Measured pain type` %>% str_split(., ",") %>% unlist() %>% unique()

Model initiator

Get data frame of model initiators. Note that diabetes did not have category code so I set to "L".

model_initiators  <- readxl::read_excel("2018_12_10-initiators.xlsx")
model_initiators

Some rows are just headers, the rest of the codes, so split

headers  <- which(!is.na(model_initiators$Category))
codes  <- which(is.na(model_initiators$Category))

Set Categories for the codes

model_initiators  <- 
  model_initiators %>% 
  fill(Category)
model_initiators

Strip out headers

headers  <- model_initiators[headers,]
headers

model_initiators  <- model_initiators[codes,]
model_initiators

Add back category names

model_initiators  <- 
  model_initiators %>% 
  left_join(headers, by = "Category") %>% 
  rename(Subcategory = `Sub Category.x`, 
         Code = Code.x, 
         Category_name = `Intervention type.y`, 
         Intervention_type = `Intervention type.x`) %>% 
  select(-`Sub Category.y`, -Code.y) %>% 
  mutate(Category_name = str_remove_all(Category_name, "\\d+"))
model_initiators

Note that some of the codes have lots of decimal places. I think this is excel storing a larger number. Looking at the pattern all the rest are to two decimal places so we will round these.

Also 14.34 had a backtick so removed in the excel.

model_initiators$Code
model_initiators <- 
  model_initiators %>% 
  mutate(Code = round(Code, 2))

Now we can check the papers dataset for correct model initiators.

papers$`Model Initiator` %>% str_split(., ",") %>% unlist() %>% unique()

Lot of stuff, first get rid of spaces

papers  <- 
  papers %>% 
  mutate(`Model Initiator` = str_replace_all(`Model Initiator`, ", ", ","))
papers$`Model Initiator` %>% str_split(., ",") %>% unlist() %>% unique()

Next we have the rounding problem, so we will round

clean_model_initiator  <- function(x){
  y  <- round(as.numeric(x), 2)
  is_na  <- which(is.na(y))
  y[is_na]  <- x[is_na]
  y  <- str_c(y, collapse = ",")
  return(y)
}
papers$model_initiator  <- 
  papers$`Model Initiator` %>% 
  str_split(., ",") %>% 
  map_chr(clean_model_initiator)
papers %>% 
  select(`Model Initiator`, model_initiator)

Now find levels that are not in model initiator dataframe.

get_bad_model_initiators  <- function(x, allowed_MI){
  if(any(!x %in% allowed_MI)){
    return(TRUE)
  } else {
    return(FALSE)
  }
}
bad_papers  <- 
  papers$model_initiator %>% 
  str_split(., ",") %>% 
  map_lgl(get_bad_model_initiators, allowed_MI = model_initiators$Code)
papers %>% filter(bad_papers) %>% select(`Study PMID`, model_initiator)
obs_MI  <- papers$model_initiator %>% str_split(., ",") %>% 
  unlist() %>% 
  unique()
obs_MI[!obs_MI %in% model_initiators$Code]

19937403 has space in front of 18.04, and 24398148 has period rather than dot - these are fixed by me in the excel.

bad_papers  <- 
  papers$model_initiator %>% 
  str_split(., ",") %>% 
  map_lgl(get_bad_model_initiators, allowed_MI = model_initiators$Code)
papers %>% filter(bad_papers) %>% select(`Study PMID`, model_initiator) 

Nerve / Immune

papers$`Nerve/immune model` %>% str_split(., ",") %>% unlist() %>% unique()

Remove spaces, set to one stress, set NAs, and set N2I/I2N to both

papers  <- 
  papers %>% 
  mutate(
    `Nerve/immune model` = str_remove_all(`Nerve/immune model`, " "), 
    `Nerve/immune model` = str_replace(`Nerve/immune model`, "stress", "Stress"),
    `Nerve/immune model` = str_replace(`Nerve/immune model`, "N2I/I2N", "both")
    )
papers$`Nerve/immune model`[which(papers$`Nerve/immune model` == "NA")]  <- NA
papers$`Nerve/immune model` %>% str_split(., ",") %>% unlist() %>% unique()

Need to check if "both" and "N2I/I2N" are the same.

Blinded

unique(papers$blinded)

Need to check if "notstateddesigned" can be converted to "not stated".

Get "notstateddesigned" for SH

papers %>% 
  filter(blinded == "not stateddesigned")

Species

papers$Species %>% str_split(., ",") %>% unlist() %>% unique()

Get rid of spaces, and convert "mouse and rat" to "mouse,rat"

papers  <- 
  papers %>% 
  mutate(
    Species = str_replace(Species, "mouse and rat", "mouse,rat"),
    Species = str_remove(Species, " "), 
    Species = str_to_title(Species)
  )
papers$Species %>% str_split(., ",") %>% unlist() %>% unique()

Strain

papers$Strain %>% str_split(., ",") %>% unlist() %>% unique()

Some spaces to remove

papers  <- 
  papers %>% 
  mutate(Strain = str_replace_all(Strain, ", ", ","))
papers$Strain %>% str_split(., ",") %>% unlist() %>% unique() %>% sort()

May need a mapper.

strain_mapper  <- data_frame(
  from = papers$Strain %>% str_split(., ",") %>% unlist() %>% unique() %>% sort()
)
strain_mapper %>% write_excel_csv("mapper_strain.csv")

Using the mapper

papers  <- 
  papers %>% 
  mutate(strain_species = map_var(
    Strain, 
    from = map_strain$from, 
    to = map_strain$to))
papers %>% 
  filter(Species != strain_species) %>% 
  select(`Study PMID`, Strain, Species, strain_species) %>% 
  write_excel_csv("../data/species_mismatch.csv")

Check sex

unique(papers$Sex)

So convert "Female" to "female"; and "no stated" to "not stated". Finally "NA" to NA.

papers  <- 
  papers %>% 
  mutate(
    Sex = str_replace(Sex, "no stated", "not stated"),
    Sex = str_replace(Sex, "Female", "female")
  )
papers$Sex[which(papers$Sex == "NA")]  <- NA
unique(papers$Sex)

Neuter

unique(papers$Neuter)

All fine

Comment

This is a text file so leave as is.

papers$Comment[1]

Remove last column

Last column is nothing of note so remove

table(papers$X__1, useNA = "always")
papers  <- 
  papers %>%
  select(-X__1)

merge datasets

paper_biomarker  <- left_join(papers, biomarkers, by = c("Study PMID" = "STUDY (PMID)"))
table(papers$`Nerve/immune model`)

Save data to package

devtools::use_data(biomarkers, overwrite = TRUE)
devtools::use_data(papers, overwrite = TRUE)
devtools::use_data(paper_biomarker, overwrite = TRUE)
devtools::use_data(model_initiators, overwrite = TRUE)


jonotuke/painBiomarkR documentation built on May 13, 2019, 3:01 a.m.