library(learnr) library(tidyverse) library(gapminder) library(useful) library(data.table) knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) tutorial_options(exercise.timelimit = 60, exercise.blanks = "___+") gapminder <- gapminder %>% mutate(country = as.character(country), continent = as.character(continent))
Key programming concepts
Loops
Conditionals
Working with Strings
Basic stats in R
When you want to do some operation repeatedly
for (element in my_vector) { ** DO STUFF WITH element** }
my_vector <- c(2,5,3) for (element in my_vector) { print(element) }
Using a loop to sum the values in a vector
my_vector <- c(2,5,3,9,8,11,6) count <- 0 #initialize a counter for (val in my_vector) { count <- count + val } print(count)
We've been using them (implicitly)
my_vector <- c(2,5,3,9,8,11,6) sum(my_vector)
my_vector <- c(2,5,3,9,8,11,6) mod_vector <- vector('double', length = length(my_vector)) for (i in seq_along(my_vector)) { mod_vector[i] <- my_vector[i] + 1 } mod_vector
my_vector <- c(2,5,3,9,8,11,6) mod_vector <- my_vector + 1
The function seq
is helpful for generating a vector to loop over
my_vector <- c() for (i in seq(from = 1, to = 5)) { my_vector <- c(my_vector, 'ha') } print(my_vector)
Output: how are you storing the output? output <- vector("double", length(x))
Sequence: What is being looped over. e.g. i in seq_along(df)
body: What is being done in each iteration: e.g. mod_vector[i] = my_vector[i]+1
Rarely used in analysis, but good to be aware of
while (condition) { # body }
Do some stuff only if a given condition holds
if (logical_statement) { **DO SOMETHING** }
Often useful inside of loops
if (logical_statement) { **DO SOMETHING** } else { **OTHER STUFF** }
test_num <- function(x) { if (x < 0) { print("Negative number") } else if (x > 0) { print("Positive number") } else { print("Zero") } } test_num(-1)
Count the number of even numbers in a vector
x <- c(2,5,3,9,8,11,6) count <- 0 for (val in x) { if (val %% 2 == 0) { count = count+1 } } print(count)
Let's compute the median life expectancy for each country in gapminder
unique_countries <- unique(gapminder$country) #list to iterate over med_lifeExp <- vector('double', length(unique_countries)) #initialize a list to store results for (i in seq_along(unique_countries)) { gap_cur_country <- gapminder %>% filter(country == unique_countries[i]) med_lifeExp[i] <- median(gap_cur_country$lifeExp) } head(med_lifeExp)
There are typically multiple ways to accomplish the same goal, and when it comes to loops there are often more efficient ways
gapminder %>% group_by(country) %>% summarise(median = median(lifeExp))
purrr
package has a family of map
functions that can make 'for loops' simpler
result <- map(my_vector, my_function)
Simpler replacement for this (also less prone to errors)
result <- vector('double', length(my_vector)) for (ii in seq_along(my_vector)) { result[ii] <- my_function(my_vector[ii]) }
map()
: returns a list
map_chr()
, map_int()
, map_dbl()
and map_lgl()
: returns a vector of this type (or die trying)
map_dfr()
and map_dfc()
combine results into a tibble (by row or column)
Lots more on purrr here
Imagine your genomic dataset has genes given like this:
gene_name_IDs <- c('EML1 (2009)', 'ATG2B (55102)', 'STAG3 (10734)')
How would you extract just the HUGO symbol?
What about the Entrez ID?
GO <- read_tsv("data/gprofiler_results_Mov10oe.csv") %>% select(term.id, term.name, p.value)
Or what if we have a list of pathway enrichment results
GO %>% select(term.name, p.value) %>% head(5)
How could we find results for pathways involving certain key terms, like 'interferon'?
Manipulating strings
Get length of strings: str_length
Combine strings, collapse: str_c
Chop/subset: str_sub
, str_trunc
Mutate/sort: str_to_upper
, str_sort
Pattern matching
Replace one pattern of text with another: str_replace
Extract/detect strings that match a pattern: str_extract
, str_detect
Read more here
str_vec <- c("why", "video", "cross", "extra", "deal", "authority") str_length(str_vec) str_sub(str_vec, 2)
collapse vector of strings into a single string
str_c('a', 'dog', 'walked', 'into', 'a', 'bar')
address <- c('Main St.', 'Ames St.', 'Brookline Ave.') nums <- c(415, 75, 450) str_c(nums, address, sep = ' ')
str_vec <- c("why", "video", "cross", "extra", "deal", "authority") str_replace(str_vec, pattern = "y", replacement = "?")
Pull out the matching sequences from each string
str_vec <- c("why", "floss", "cross", "extra", "boss") str_extract(str_vec, 'oss')
db <- c('GO', 'KEGG', 'GO', 'GO', 'KEGG') gset <- c('interferon signaling', 'interferon signaling', 'NFKB signaling','ER-nucleus signaling', 'TP53 pathway') pathways <- tibble(db, gset)
pathways
Make a new column that combines multiple string columns (opposite of separate
)
pathways %>% mutate(db_gset = str_c(db, gset, sep = ' '))
Modify text in a column
pathways %>% mutate(db_gset = str_replace(gset, 'signaling', 'sig.'))
pathways <- pathways %>% mutate(db_gset = str_replace(gset, 'signaling', 'sig.'))
Filter based on properties of string column (e.g. its length)
pathways %>% filter(str_length(db) > 2)
Filter to rows matching a certain text pattern
pathways %>% filter(str_detect(gset, 'interferon'))
Rules for extracting patterns in text using special syntax
Specify set of characters to seek out, possibly with info on repeats and location within the string, character sets, 'wildcards' etc.
head(GO, 5)
Let's find just the GO terms with 'bolic' in their name
GO %>% mutate(term.name = tolower(term.name)) %>% filter(str_detect(term.name, 'bolic')) %>% mutate(type_of_bolic = str_extract(term.name, '[:alnum:]+bolic')) %>% head(5)
https://stringr.tidyverse.org/
R has a TON of useful stats functions, e.g.:
wilcox.test()
Wilcoxon rank sum test
t.test()
t-test (one or two-sample)
cor.test()
test whether correlation is significant (Pearson or Spearman)
lm()
linear model
aov()
, anova()
ANOVA (analysis of variance)
Many of these (e.g. t.test
and cor.test
are pretty easy to use.
Fitting linear models to data is extremely useful, and a key building block to more complex analyses
Recall the basic linear model: dep_var = slope * ind_var + intercept
Use funky 'formula' notation: dep_var ~ ind_var
Read as "model dep_var
as a function of ind_var
"
gap_77 <- gapminder %>% dplyr::filter(year == 1977)
mod <- lm(lifeExp ~ gdpPercap, data = gap_77) print(mod)
Use the summary
function to print more details
summary(mod)
+
mod <- lm(lifeExp ~ gdpPercap + pop, data = gap_77) summary(mod)
mod <- lm(lifeExp ~ gdpPercap + continent, data = gap_77) summary(mod)
mod <- lm(lifeExp ~ gdpPercap * continent, data = gap_77) summary(mod)
ggplot(gap_77, aes(x = gdpPercap, y = lifeExp)) + geom_point() + geom_smooth(method = 'lm')
Relationship is not linear!
ggplot(gap_77, aes(x = gdpPercap, y = lifeExp)) + geom_point() + geom_smooth(method = 'lm') + scale_x_log10()
Life expectancy is linearly related to log gdpPercap
mod <- lm(lifeExp ~ log10(gdpPercap), data = gap_77) print(mod)
Testing for different life expectancies across continents
unique(gap_77$continent)
Recall what we get with lm
mod <- lm(lifeExp ~ continent, data = gap_77) summary(mod)$coefficients
gap_77 %>% group_by(continent) %>% summarise(mean_lifeExp = mean(lifeExp))
Compare to the group means?
gap_77 %>% group_by(continent) %>% summarise(mean_lifeExp = mean(lifeExp))
summary(mod)$coefficients
The aov
function is helpful for traditional analysis of variance (ANOVA)
Testing for significant differences across group means of categorical variable
mod <- aov(lifeExp ~ continent, data = gap_77) summary(mod)
Often interested in pairwise comparisons between group-means.
Several ways of doing this that account for 'multiple comparisons', e.g.:
TukeyHSD(mod)
What 'loops' for matrices?
How to apply a function to each row or column?
result <- apply(matrix, MARGIN, function, extra_arguments)
RNAseq_mat <- fread('data/CCLE_expression_subset.csv') %>% as_tibble() %>% column_to_rownames(var = 'V1') %>% as.matrix()
corner(RNAseq_mat)
gene_medians <- apply(RNAseq_mat, MARGIN = 2, FUN = median, na.rm=TRUE) head(gene_medians, 5)
na.rm=TRUE
is an additional argument to median()
Functions must take a vector and return a number
get_squared_median <- function(vec) { return(median(vec)^2) } apply(RNAseq_mat, 2, get_squared_median) %>% head(5)
The scale
function is very useful for normalizing matrices
Subtract off center
of each column (default = mean)
Divide each column by scale
(default is std. dev.)
new_mat <- scale(mat, center, scale)
center
and scale
can be either TRUE/FALSE, or a vector to be 'subtracted' or 'scaled' bynorm_RNAseq <- scale(RNAseq_mat, center = TRUE, scale = FALSE)
gene_medians <- apply(RNAseq_mat, 2, median, na.rm=TRUE) norm_RNAseq <- scale(RNAseq_mat, center = gene_medians, scale = FALSE)
str_vec <- c("Why", "Video", "Cross", "Extra", "Deal", "Authority") str_to_upper(str_vec) str_to_lower(str_vec)
str_vec <- c("why", "video", "cross", "extra", "deal", "authority") str_sub(str_vec, start = 2, end = 3)
Restrict list to strings that contain a pattern match
str_vec <- c("why", "video", "cross", "extra", "deal", "adapt") str_subset(str_vec, pattern = 'd')
str_detect
returns a TRUE/FALSE vector (useful with filter()
)
^
: matches the start of the string.
$
: matches the end of the string.
strings <- c("abcd", "cdab", "cabd", "c abd")
str_subset(strings, 'ab')
str_subset(strings, '^ab')
*
: matches at least 0 times.
+
: matches at least 1 times.
strings <- c("a", "ab", "acb", "accb", "acccb", "accccb")
str_subset(strings, "ac*b")
str_subset(strings, "ac+b")
.
: (wildcard) matches any single character
[...]
: matches any character in a set
|
: matches either pattern on either side
strings <- c("ab", "abc", "adc", "abd", "abe", "ab 12")
str_subset(strings, 'a.c')
str_subset(strings, 'ab[cde]') # str_subset(strings, 'ab[c-e]') #same using a range
[:digit:]
same as [0-9]
[:alpha:]
all alphabetic characters (either case)[:alnum:]
all alphanumeric characters[:space:]
any whitespace characterstrings <- c("ab", "abc", "adc", "abd", "abe", "ab 12")
str_subset(strings, 'ab[:alnum:]')
strings <- c('dog', 'cat', 'th*s')
This gives an error
str_subset(strings, "*")
Need to 'escape' the interpretation of '*' by preceeding it with \\
str_subset(strings, "\\*")
gapminder %>% filter(str_detect(country, '^New '))
More cheat-sheet refs:
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.