In AshirBorah/cp_bootcamp_r_tutorials:

library(learnr)
library(data.table)
library(here)
library(useful)
library(tidyverse)
library(gapminder)
library(broom)
library(ggrepel)
library(knitr)
library(kableExtra)

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

#beautify table printout
pretty_table <- function(table, max_rows = 6, max_cols = 6) {
  if (nrow(table) > max_rows) {
    table <- table %>% head(max_rows)
  } 
  if (ncol(table) > max_cols) {
    table <- table[, 1:max_cols]
  }
  kable(table) %>%
  kable_styling(c('striped', 'hover'), full_width = F, position = 'left')
}

sample_info <- read_csv(here::here('data', 'sample_info.csv')) %>% 
  select(DepMap_ID, lineage)
RNAseq_mat <- fread(here('data', 'CCLE_expression_subset.csv')) %>% 
  as_tibble() %>% 
  column_to_rownames(var = 'V1')

These exercises are intended to help synthesize and reinforce the pieces we've covered previously, and get practice with a few new concepts, most notably:

for loops
conditional statements (if/else)
string manipulation
basic statistical modeling

Programming practice

For loops

Recall the general pattern of for loops

output <- **PREPARE OUTPUT**
for (element in vector) {
  ** DO STUFF WITH element**
}

Write a for loop that computes the sum of squared elements in a vector, and apply it to the test_vector

test_vector <- 1:10

result <- 0 #initialize
for (num in test_vector) {

}

Looping over columns

We can also loop over the columns of a dataframe. Technically, dataframes are lists of columns so you can use seq_along(df) to create a vector from 1 to the number of columns to loop over, and the syntax df[[i]] will pull out the ith column as a vector.

Write a loop that computes the average value for each column in the dataframe iris_num, and stores the result in col_means.

iris[, 1:4] %>% head

iris_num <- iris[, 1:4]

col_means <- vector('double', ncol(iris_num))

col_means <- vector('double', ncol(iris_num))
for (i in seq_along(iris_num)) {
  col_means[i] <- mean(iris_num[[i]])
}

If / else statements

Recall the general pattern of for if / else statements

if (condition_1) {
    *STUFF*
  else if (condition_2) {
    *OTHER_STUFF*
  } else {
    *OR THIS*
  }
}

Use a loop to create a new vector result whose values are equal to those in test_vec if they are odd, and are squared for even values.

NOTE: the %% operator 'modulus' is useful here, as x %% 2 is 1 for odd numbers and 0 for even numbers

test_vec <- 1:10

result <- vector('double', length(test_vec))
for (i in seq_along(test_vec)) {

}

result <- vector('double', length(test_vec))
for (i in seq_along(test_vec)) {
  if (test_vec[i] %% 2 == 0) {

    } else {

    }
}

If/else in a function

Let's write a function that will adjust input strings to a target length (call it K).

If the string is the target length leave it as is.
If the string is too long, keep only the first K characters
If the string is too short, add enough periods at the end to make it length K

Complete the template and verify with the test cases. NOTE: str_length(string) will return the length of the string

norm_string <- function(input_string, target_length) {
  if ( ) {
    out_string <- str_sub()
  } else if ( ) {
    out_string <- str_pad()
  } else {
    out_string <- 
  }
  return(out_string)
}
norm_string('test', 4)
norm_string('test', 3)
norm_string('test', 5)

#truncate a string to a target length
out_string <- str_sub(input_string, start = 1, end = target_length)

#expand a string to a target length by padding it on the end with periods
out_string <- str_pad(input_string, target_length, side = 'right', pad = '.')

norm_string <- function(input_string, target_length) {
  if (str_length(input_string) > target_length) {
    out_string <- str_sub(input_string, start = 1, end = target_length)
  } else if (str_length(input_string) < target_length) {
    out_string <- str_pad(input_string, target_length, side = 'right', pad = '.')
  } else {
    out_string <- input_string
  }
  return(out_string)
}

String manipulation practice

str_c

Use the str_c function to do the following with the test_vec of strings:

add an 's' to each string
collapse the vector into a single string, where elements are separated by a comma and space (i.e. "dog, horse, pig")

test_vec <- c('dog', 'horse', 'pig')

str_c(test_vec, 's')
str_c(test_vec, collapse = ', ')

Let's practice using str_c with a table. Use string functions to add a new column to the table called animal_string so that it looks like this:

plural <- c('many', 'some')
animal_df <- tibble(animal = c('dog', 'horse', 'pig', 'cow', 'chicken'), 
       amount = c('a', 'many', 'some', 'one', 'one'))
animal_df %>% 
  mutate(animal_string = str_c(amount, animal, sep = ' '),
         animal_string = ifelse(amount %in% plural, str_c(animal_string, 's'), animal_string),
         animal_string = ifelse(str_length(animal) > 3, str_to_title(animal_string), animal_string))

You'll want to use the following string functions: str_c, str_length and str_to_title

plural <- c('many', 'some') #set of plural amounts
animal_df <- tibble(animal = c('dog', 'horse', 'pig', 'cow', 'chicken'), 
       amount = c('a', 'many', 'some', 'one', 'one'))

mutate(animal_string = str_c(amount, animal, sep = ' '))

mutate(animal_string = ifelse(amount %in% plural, str_c(animal_string, 's'), animal_string))

plural <- c('many', 'some')
animal_df <- tibble(animal = c('dog', 'horse', 'pig', 'cow', 'chicken'), 
       amount = c('a', 'many', 'some', 'one', 'one'))
animal_df <- animal_df %>% 
  mutate(animal_string = str_c(amount, animal, sep = ' '),
         animal_string = ifelse(amount %in% plural, str_c(animal_string, 's'), animal_string),
         animal_string = ifelse(str_length(animal) > 3, str_to_title(animal_string), animal_string))

Pattern matching

We won't have time to get into pattern matching, but it's very helpful. See the strings section of R4ds, or other regular expression tutorials.

Practice around a bit with pattern matching. You can use the str_view(string_vec, pattern) function to visualize where there are matches in the string_vec to a target `pattern'

A few tips upfront: you can create sets with '[]', so '[Df3]' will match one of the characters D, f, or 3. '[abc][def]' would match one of 'a', 'b', or 'c' followed by one of 'd', 'e', or 'f', etc. The special symbols ^ and $ represent the start and end of the string respectively.

There are many more tricks like this, but see if you can match strings that end with a vowel. Then try using str_subset to extract those matching strings

test_vec <- c('cow', 'horse', 'pig', 'dog', 'goat', 'sheep', 'jellyfish', 'squid', 'giraffe', 'elephant')

str_subset(test_vec, '[aeiou]$')

Basic linear modeling

Recall the general linear model fitting pattern: mod <- lm(dep_var ~ indep_var, data = df), and that summary(mod) will print a summary of the linear model results

Using the gapminder table again, test if life expectancy significantly varies with population for Asian countries

summary(mod) #prints summary stats from model

Now test if life expectancy varies significantly with population when accounting for gdpPercap, again for Asian countries

mod <- lm(lifeExp ~ pop + gdpPercap, data = gapminder %>% filter(continent == 'Asia'))

Write a loop that computes the linear regression slope of average lifeExp as a function of year, separately for each continent.

unique_continents <- unique(gapminder$continent)

cont_slopes <- vector('double', length(unique_continents))
for (i in seq_along(unique_continents)) {

}

cont_slopes <- vector('double', length(unique_continents))
for (i in seq_along(unique_continents)) {
      cont_by_year <- gapminder %>% 
      filter(continent == unique_continents[i]) %>% 
      group_by(year) %>% 
      summarise(mean_lifeExp = mean(lifeExp))
  mod <- lm()
}

unique_continents <- unique(gapminder$continent)
cont_slopes <- vector('double', length(unique_continents))
for (i in seq_along(unique_continents)) {
    cont_by_year <- gapminder %>% 
      filter(continent == unique_continents[i]) %>% 
      group_by(year) %>% 
      summarise(mean_lifeExp = mean(lifeExp))
    mod <- lm(mean_lifeExp ~ year, data = cont_by_year)
    cont_slopes[i] <- mod$coefficients[2]
}

ANOVA

Fit a linear model with lm with lifeExpectancy as the dependent variable and continent as the independent variable. Let's verify that the model coefficients represent group means computed relative to the first group in the list (which gets assigned to the 'Intercept' term).

Compute a table of per-continent average lifeExp. Then compare these to the model coefficients with plot(x,y). You can access the model coefficients as a vector using mod$coefficients.

mod <- lm(lifeExp ~ continent, data = gapminder)

cont_avgs <- gapminder %>% 
  group_by(continent) %>% 
  summarise(mean_lifeExp = mean(lifeExp))

#this will add the first model coefficient to all the remaining values to convert back to 'absolute' group averages (rather than relative to the first)
adj_coefs <- mod$coefficients
adj_coefs[-1] <-  mod$coefficients[-1] + mod$coefficients[1] #[-1] subsets to all except the first element
plot(adj_coefs, cont_avgs$mean_lifeExp)

Challenge

Now let's make a plot showing the significance of across-countinent differences in lifeExp as a function of year.

Use a for loop to iterate over the unique years in gapminder
for each year, fit an ANOVA model with aov of lifeExp as a function of continent
use the tidy function (already loaded from the broom package) to convert the model object into a tidy dataframe of stats (take a look at what it looks like for an example model fit)
pull out the p.value
use ggplot to plot the -log10(p-value) of the ANOVA vs years.

p_vals[i] <- tidy(cur_aov) %>% 
  filter(term == 'continent') %>% 
  pull(p.value)

unique_years <- unique(gapminder$year)
p_vals <- vector('double', length(unique_years))
for (i in seq_along(unique_years)) {
  cur_dat <- gapminder %>% 
    filter(year == unique_years[i])
  cur_aov <- aov(lifeExp ~ continent, data = cur_dat)
  p_vals[i] <- tidy(cur_aov) %>% 
    filter(term == 'continent') %>% 
    pull(p.value)
}
tibble(year = unique_years, p_val = p_vals) %>% 
  ggplot(aes(year, -log10(p_val))) + 
  geom_line() + 
  geom_point()

Your plot should look like this:

unique_years <- unique(gapminder$year)
p_vals <- vector('double', length(unique_years))
for (i in seq_along(unique_years)) {
  cur_dat <- gapminder %>% 
    filter(year == unique_years[i])
  cur_aov <- aov(lifeExp ~ continent, data = cur_dat)
  p_vals[i] <- broom::tidy(cur_aov) %>% 
    filter(term == 'continent') %>% 
    pull(p.value)
}
tibble(year = unique_years, p_val = p_vals) %>% 
  ggplot(aes(year, -log10(p_val))) + 
  geom_line() + 
  geom_point()

More practice!

Let's first load an example matrix of gene expression values from the DepMap. Row names are cell line IDs, column names are gene IDs. Note this is just a small subset of the data with a few hundred genes

corner(RNAseq_mat)

Finding gene identifiers

The gene names in our expression matrix are formatted with symbols and numeric IDs, according to the pattern "HGNC_SYMBOL (ENTREZ_ID)". Let's make a function that takes as input an HGNC symbol (like "CREBBP") and returns the full identifier used in RNAseq_mat (e.g. "CREBBP (1387)").

Fill in the blank below to make this function and test you get the right result with CREBBP. Use the str_subset(string, pattern) function which will look through a vector of strings and return any values that match the target pattern we're searching for.

Careful, in our case the pattern you want to use isn't quite just the gene name (e.g. if you searched for 'TP53' you might get 'TP53' and 'TP53RK' as hits). As a hint, the function str_c can add additional characters to a string.

Note also that you can use RNAseq_mat inside your function even though we're not providing it as one of the arguments. Here we're treating it as 'globally fixed' for the purposes of these analysis.

get_gene_id <- function(gene_symbol) {

}
get_gene_id('CREBBP')
get_gene_id('AR')

str_c(gene_symbol, ' ') #add a space after the target gene symbol

str_subset(colnames(RNAseq_mat), str_c(gene_symbol, ' '))

Get expression range

Write a function which, given a gene ID (as it appears in the RNAseq_mat column names), returns the difference between the largest and smallest gene expression values in RNAseq_mat

Use 'if / else' syntax to test whether the smallest expression value for the gene is equal to 0. If it is 0, return 'NA' instead of the expression range.

get_gene_id <- function(gene_symbol) {
  return(str_subset(colnames(RNAseq_mat), paste0(gene_symbol, ' ')))
}

get_gene_range <- function(gene_id) {

}

## tests (remove comments when ready to test)
#get_gene_range('CREBBP (1387)') #
#get_gene_range('CD79B (974)')

min_val <- min(RNAseq_mat[gene_id,])

if (minval == 0) {
  exp_diff <- NA
} else {
  exp_diff <- max_val - min_val
}

get_gene_range <- function(gene_id) {
    max_val <- max(RNAseq_mat[, gene_id])
    min_val <- min(RNAseq_mat[, gene_id])
    if (min_val == 0) {
      exp_diff <- NA
    } else {
      exp_diff <- max_val - min_val
    }
    return(exp_diff)
}

Your function should work like this:

get_gene_range('CREBBP (1387)')
get_gene_range('CD79B (974)')

Now use your get_gene_range function to finish writing this 'for loop' to iterate over genes and compute the range of expression levels for each gene, storing the results in gene_ranges_loop

gene_id_list <- colnames(RNAseq_mat)
gene_ranges_loop <- vector('numeric', length(gene_id_list)) #let's initialize an empty numeric vector to hold the medians for the genes
names(gene_ranges_loop) <- gene_id_list #lets make it a named vector

#fill in the rest
for (cur_gene in gene_id_list) {

}
head(gene_ranges_loop)

gene_ranges_loop[cur_gene] <- get_gene_range(cur_gene)

You should get something that looks like this

gene_id_list <- colnames(RNAseq_mat)
gene_ranges_loop <- vector('numeric', length(gene_id_list)) #let's initialize an empty numeric vector to hold the medians for the genes
names(gene_ranges_loop) <- gene_id_list #lets make it a named vector

#fill in the rest
for (cur_gene in gene_id_list) {
  gene_ranges_loop[cur_gene] <- get_gene_range(cur_gene)
}
head(gene_ranges_loop) #look at the beginning of the resulting vector

Now use map_dbl() to do the same thing (more efficiently).

This function takes as input a list and a function, and applies the function to each element in the list. the "_dbl" part in the function name means that it will return a 'double' or 'numerical' vector as the result. (Note for functions that take multiple inputs, the values being looped over will be provided as the first input to the function by default)

gene_ranges_map <- map_dbl()

map_dbl(colnames(RNAseq_mat), get_gene_range)

get_gene_range <- function(gene_id) {
    max_val <- max(RNAseq_mat[, gene_id])
    min_val <- min(RNAseq_mat[, gene_id])
    if (min_val == 0) {
      exp_diff <- NA
    } else {
      exp_diff <- max_val - min_val
    }
    return(exp_diff)
}
gene_ranges_map <- map_dbl(colnames(RNAseq_mat), get_gene_range)