knitr::opts_chunk$set(echo = TRUE, fig.width = 5, fig.height = 3)
## defines the root of the project for later use
require("rprojroot") || utils::install.packages("rprojroot")
library(rprojroot)
root <- find_root_file(criterion = is_rstudio_project)

Lesson Aim

path_to_readxl_sticker <- file.path(root, "images", "readxl_sticker.png")
 knitr::include_graphics(path = path_to_readxl_sticker, dpi = 250)

The Excel Enigma

Reading .xls and .xlsx with {readxl}

Real-life example

path_to_messy_excel <- file.path(root, "images", "messy_excel.png")
knitr::include_graphics(path_to_messy_excel, dpi = 110)

Let's read this messy Excel file into R

path_to_messy_excel_data <- file.path(
  root, 
  "data-raw",
  "D020",
  "messy_excel.xlsx")
messy <- readxl::read_excel(path = path_to_messy_excel_data, sheet = 1 )
head(messy, 8)

So that didn't work, now what

Variables in the dataset:

How do we extract these and make the data tidy?

Splitting the set into two:

EC50 and IL-1alpha_2x are side-by-side

#?read_excel

library(readxl)
EC50 <- readxl::read_excel(path = path_to_messy_excel_data, 
                           sheet = 1,
                           range = cell_cols("C:F"),
                           na = c("not tested"),
                           col_names = c("hu", "disfeb",                                                "vumc", "basf")
)

head(EC50, 8)

IL-1 alpha

IL1alpha <- readxl::read_excel(path = path_to_messy_excel_data, 
                           sheet = 1,
                           range = cell_cols("I:L"),
                           na = c("not tested", "not reached"),
                           col_names = c("hu", "disfeb",                                                "vumc", "basf")
)
head(IL1alpha)

Cleaning up the data further

You will notice that there are a number of strange records in the data. Probably due to merging of cells?

Extract the actual data with indexing

EC50 <- EC50[c(3:15), ]
IL1alpha <- IL1alpha[c(3:15), ]

Adding an extra variable with compound names

chemical_names <- readxl::read_excel(path_to_messy_excel_data,
                                     range = cell_cols("A"))

chemical_names <- chemical_names[c(4:16), ]
chemical_names <- chemical_names$`Correlation LLNA-EC3 data` 


EC50$chemical_names <- chemical_names
IL1alpha$chemical_names <- chemical_names

Adding a new variable on the read_out variable

EC50$read_out <- c("ec50")
IL1alpha$read_out <- c("il_1_alpha")

Combining the two data

We can combine the data now.

library(dplyr)
tidy <- bind_rows(EC50, IL1alpha)

Complete Cases

tidy <- tidy[complete.cases(tidy), ]

Gather all the nummeric variables into one

The variables called hu to basf are actually no variables We can gather() these variables into onw called laboratory and put the numeric value in result

library(tidyr)
tidy <- tidy %>%
  gather(hu:basf, key = laboratory, value = result)

Checking variable class

str(tidy)
tidy$chemical_names <- as.factor(tidy$chemical_names)
tidy$read_out <- as.factor(tidy$read_out)
tidy$laboratory <- as.factor(tidy$laboratory)
tidy$result <- as.numeric(tidy$result)

Rounding digits

tidy$result <- round(tidy$result, 2)

Final result

tidy

Plotting the result

We will make two plot to start the Exploratory Data Analysis. Which chemical do you think is the most potent?

What do you think of the correlation between the EC3 and the IL1-alpha values?

EC3 values

library(ggplot2)
## read EC3
EC3 <- readxl::read_excel(path_to_messy_excel_data,
                       range = cell_cols("A:B"))

EC3 <- EC3[c(4:16), ]
names(EC3) <- c("chemical_names", "result")
EC3$result <- as.numeric(EC3$result)
EC3$chemical_names <- as.factor(EC3$chemical_names)

Potency

## plot
potency <- tidy %>% 
ggplot(aes(x = reorder(chemical_names, 
                       result, FUN= mean), 
           y = result)) +
  geom_point() + 
  geom_smooth() +
  facet_grid(read_out ~ laboratory) + 
  coord_flip() +
  ylab("Result (mg/ml") +
  xlab("Chemical names") +
  ggtitle("Per lab and per chemical") 

Plot potency

potency

Correlation

names(EC3)[2] <- "ec3"
combined <- left_join(tidy, EC3, by = "chemical_names")

only_hu <- combined %>% 
  filter(read_out == "il_1_alpha", laboratory == "hu")

cor <- cor.test(only_hu$result, only_hu$ec3)

correlation <- only_hu %>% 
ggplot(aes(x = result, 
           y = ec3)) +
  geom_point() +
  geom_smooth(method = "lm") +
  xlab("IL-1a") +
  ylab("EC3") +
  ggtitle("Relation between EC50 and EC3 for HU") +
  geom_text(aes(label = chemical_names, hjust = 0.2, vjust = -.4)) +
  geom_jitter(position = "jitter")

Plots

correlation

Write to disk

readr::write_csv(
  combined,
  here::here(
  "data",
  "D020",
  "tidy_excel.csv"
  )
)

## md5 sums
tools::md5sum(files = here::here(
  "data",
  "D020",
  "tidy_excel.csv"
  )) %>% readr::write_lines(
  path = 
    here::here(
      "data",
      "D020",
      paste("tidy_excel.csv.md5", sep = "_")
      ))


DataScienceILC/tlsc-dsfb26v-20_workflows documentation built on July 4, 2025, 5:49 a.m.