library(learnr)
library(tidyverse)
library(gapminder)
library(useful)
library(data.table)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
tutorial_options(exercise.timelimit = 60, exercise.blanks = "___+")

gapminder <- gapminder %>% 
  mutate(country = as.character(country),
         continent = as.character(continent))

Overview

Basic programming in R

For loop {.smaller}

When you want to do some operation repeatedly

for (element in my_vector) {
  ** DO STUFF WITH element**
}
my_vector <- c(2,5,3)
for (element in my_vector) {
    print(element)
}

For loop

Using a loop to sum the values in a vector

my_vector <- c(2,5,3,9,8,11,6)
count <- 0 #initialize a counter
for (val in my_vector) {
  count <- count + val
}
print(count)

We've been using them (implicitly)

my_vector <- c(2,5,3,9,8,11,6)
sum(my_vector)

Looping over indices of a vector

my_vector <- c(2,5,3,9,8,11,6)

mod_vector <- vector('double', length = length(my_vector))
for (i in seq_along(my_vector)) {
  mod_vector[i] <- my_vector[i] + 1
}
mod_vector
my_vector <- c(2,5,3,9,8,11,6)
mod_vector <- my_vector + 1

Loops to repeat something N times

The function seq is helpful for generating a vector to loop over

my_vector <- c()
for (i in seq(from = 1, to = 5)) {
  my_vector <- c(my_vector, 'ha')
}
print(my_vector)

Components of a for loop

While loop

Rarely used in analysis, but good to be aware of

while (condition) {
  # body
}

If (conditional) statement

Do some stuff only if a given condition holds

if (logical_statement) {
  **DO SOMETHING**
}

Often useful inside of loops

If / else

if (logical_statement) {
  **DO SOMETHING**
} else {
  **OTHER STUFF**
}
test_num <- function(x) {
  if (x < 0) {
    print("Negative number")
  } else if (x > 0) {
    print("Positive number")
  } else {
    print("Zero")
  }
}
test_num(-1)

If example

Count the number of even numbers in a vector

x <- c(2,5,3,9,8,11,6)
count <- 0
for (val in x) {
  if (val %% 2 == 0) {
    count = count+1
  }
}
print(count)

Example loop {.smaller}

Let's compute the median life expectancy for each country in gapminder

unique_countries <- unique(gapminder$country) #list to iterate over
med_lifeExp <- vector('double', length(unique_countries)) #initialize a list to store results
for (i in seq_along(unique_countries)) {
   gap_cur_country <- gapminder %>% 
      filter(country == unique_countries[i]) 
   med_lifeExp[i] <- median(gap_cur_country$lifeExp)
}
head(med_lifeExp)

Often a better way {.smaller}

There are typically multiple ways to accomplish the same goal, and when it comes to loops there are often more efficient ways

gapminder %>% 
  group_by(country) %>% 
  summarise(median = median(lifeExp))

Loops the tidyverse way {.smaller}

purrr package has a family of map functions that can make 'for loops' simpler

result <- map(my_vector, my_function)

Simpler replacement for this (also less prone to errors)

result <- vector('double', length(my_vector))
for (ii in seq_along(my_vector)) {
  result[ii] <- my_function(my_vector[ii])
}

Loops the tidyverse way {.smaller}

Tools for working with text

String manipulation

Imagine your genomic dataset has genes given like this:

gene_name_IDs <- c('EML1 (2009)', 'ATG2B (55102)', 'STAG3 (10734)')

String manipulation

GO <- read_tsv("data/gprofiler_results_Mov10oe.csv") %>% select(term.id, term.name, p.value)

Or what if we have a list of pathway enrichment results

GO %>% select(term.name, p.value) %>% head(5)

How could we find results for pathways involving certain key terms, like 'interferon'?

Working with strings {.smaller}

Read more here

str_length

str_vec <- c("why", "video", "cross", "extra", "deal", "authority")

str_length(str_vec)
str_sub(str_vec, 2)

str_c

collapse vector of strings into a single string

str_c('a', 'dog', 'walked', 'into', 'a', 'bar')
address <- c('Main St.', 'Ames St.', 'Brookline Ave.')
nums <- c(415, 75, 450)
str_c(nums, address, sep = ' ')

str_replace

str_vec <- c("why", "video", "cross", "extra", "deal", "authority")
str_replace(str_vec, pattern = "y", replacement = "?")

str_extract

Pull out the matching sequences from each string

str_vec <- c("why", "floss", "cross", "extra", "boss")
str_extract(str_vec, 'oss')

String manipulation with dplyr

db <- c('GO', 'KEGG', 'GO', 'GO', 'KEGG')
gset <- c('interferon signaling', 'interferon signaling', 'NFKB signaling','ER-nucleus signaling', 'TP53 pathway')
pathways <- tibble(db, gset)
pathways

String manipulation with dplyr

Make a new column that combines multiple string columns (opposite of separate)

pathways %>%
  mutate(db_gset = str_c(db, gset, sep = ' '))

String manipulation with dplyr

Modify text in a column

pathways %>%
  mutate(db_gset = str_replace(gset, 'signaling', 'sig.'))

String manipulation with dplyr

pathways <- pathways %>% 
  mutate(db_gset = str_replace(gset, 'signaling', 'sig.'))

Filter based on properties of string column (e.g. its length)

pathways %>%
  filter(str_length(db) > 2)

String manipulation with dplyr

Filter to rows matching a certain text pattern

pathways %>%
  filter(str_detect(gset, 'interferon'))

regular expressions

An example putting things together

head(GO, 5)

Let's find just the GO terms with 'bolic' in their name

An example putting things together

GO %>%
  mutate(term.name = tolower(term.name)) %>%
  filter(str_detect(term.name, 'bolic')) %>%
  mutate(type_of_bolic = str_extract(term.name, '[:alnum:]+bolic')) %>%
  head(5)

Additional string refs

Basic stats in R

Usesful statistical tests in R

R has a TON of useful stats functions, e.g.:


Many of these (e.g. t.test and cor.test are pretty easy to use.

Fitting linear models to data is extremely useful, and a key building block to more complex analyses

Linear models in R

gap_77 <- gapminder %>% dplyr::filter(year == 1977)
mod <- lm(lifeExp ~ gdpPercap, data = gap_77) 
print(mod)

Linear models in R {.smaller}

Use the summary function to print more details

summary(mod) 

Linear models in R {.smaller}

mod <- lm(lifeExp ~ gdpPercap + pop, data = gap_77) 
summary(mod) 

Linear models in R {.smaller}

mod <- lm(lifeExp ~ gdpPercap + continent, data = gap_77) 
summary(mod)

Linear models in R {.smaller}

mod <- lm(lifeExp ~ gdpPercap * continent, data = gap_77) 
summary(mod)

Always good to inspect relationships visually! {.smaller}

ggplot(gap_77, aes(x = gdpPercap, y = lifeExp)) + 
  geom_point() + 
  geom_smooth(method = 'lm')

Relationship is not linear!

Always good to inspect relationships visually! {.smaller}

ggplot(gap_77, aes(x = gdpPercap, y = lifeExp)) + 
  geom_point() + 
  geom_smooth(method = 'lm') +
  scale_x_log10()

Always good to inspect relationships visually! {.smaller}

Life expectancy is linearly related to log gdpPercap

mod <- lm(lifeExp ~ log10(gdpPercap), data = gap_77) 
print(mod)

ANOVA {.smaller}

Testing for different life expectancies across continents

unique(gap_77$continent)

Recall what we get with lm

mod <- lm(lifeExp ~ continent, data = gap_77)
summary(mod)$coefficients
gap_77 %>% 
  group_by(continent) %>% 
  summarise(mean_lifeExp = mean(lifeExp))

ANOVA {.smaller}

Compare to the group means?

gap_77 %>% 
  group_by(continent) %>% 
  summarise(mean_lifeExp = mean(lifeExp))
summary(mod)$coefficients

ANOVA {.smaller}

mod <- aov(lifeExp ~ continent, data = gap_77)
summary(mod)

ANOVA (post-hoc comparing group means) {.smaller}

Often interested in pairwise comparisons between group-means.

Several ways of doing this that account for 'multiple comparisons', e.g.:

TukeyHSD(mod)

Additional material

apply

result <- apply(matrix, MARGIN, function, extra_arguments) 

apply example {.smaller}

RNAseq_mat <- fread('data/CCLE_expression_subset.csv') %>% 
  as_tibble() %>% 
  column_to_rownames(var = 'V1') %>% 
  as.matrix()
corner(RNAseq_mat)
gene_medians <- apply(RNAseq_mat, MARGIN = 2, FUN = median, na.rm=TRUE)
head(gene_medians, 5)

apply with custom functions

Functions must take a vector and return a number

get_squared_median <- function(vec) {
  return(median(vec)^2)
}
apply(RNAseq_mat, 2, get_squared_median) %>% head(5)

scale (column-wise normalization) {.smaller}

The scale function is very useful for normalizing matrices

new_mat <- scale(mat, center, scale)

scale example

norm_RNAseq <- scale(RNAseq_mat, center = TRUE, scale = FALSE)
gene_medians <- apply(RNAseq_mat, 2, median, na.rm=TRUE)
norm_RNAseq <- scale(RNAseq_mat, center = gene_medians, scale = FALSE)

str_to_upper str_to_lower

str_vec <- c("Why", "Video", "Cross", "Extra", "Deal", "Authority")
str_to_upper(str_vec)
str_to_lower(str_vec)

str_sub

str_vec <- c("why", "video", "cross", "extra", "deal", "authority")
str_sub(str_vec, start = 2, end = 3)

str_subset {.smaller}

Restrict list to strings that contain a pattern match

str_vec <- c("why", "video", "cross", "extra", "deal", "adapt")
str_subset(str_vec, pattern = 'd')

str_detect returns a TRUE/FALSE vector (useful with filter())

regexp (position) {.smaller}

strings <- c("abcd", "cdab", "cabd", "c abd")
str_subset(strings, 'ab')
str_subset(strings, '^ab')

regexp (quantifiers) {.smaller}

strings <- c("a", "ab", "acb", "accb", "acccb", "accccb")
str_subset(strings, "ac*b")
str_subset(strings, "ac+b")

regexp (operators) {.smaller}

strings <- c("ab", "abc", "adc", "abd", "abe", "ab 12")
str_subset(strings, 'a.c')
str_subset(strings, 'ab[cde]')
# str_subset(strings, 'ab[c-e]') #same using a range

regexp (character classes) {.smaller}

strings <- c("ab", "abc", "adc", "abd", "abe", "ab 12")
str_subset(strings, 'ab[:alnum:]')

regexp (escaping) {.smaller}

strings <- c('dog', 'cat', 'th*s')

This gives an error

str_subset(strings, "*")

Need to 'escape' the interpretation of '*' by preceeding it with \\

str_subset(strings, "\\*")
gapminder %>%
  filter(str_detect(country, '^New '))

Thanks!

More cheat-sheet refs:

Rstudio2019

BaseR



AshirBorah/cp_bootcamp_r_tutorials documentation built on May 16, 2024, 3:24 p.m.