knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(learnr)
library(tidyverse)
library(useful)
tutorial_options(exercise.timelimit = 60, exercise.blanks = "___+")

Outline

R and RStudio

Different ways to work with R

RStudio: Live demo

-->

Make your code readable!

Comments

It is also good to include comments in your code for specifics or lightweight explanations

# Illustrative snippet: old_data, offset and my_pipeline_function are
# placeholders that are not defined in this tutorial.

# Add a fixed offset to avoid negative values
new_data <- old_data + offset 

# Now normalize so the max value is 1
new_data <- new_data/max(new_data)
new_data <- my_pipeline_function(new_data, outliers = 'ignore') # drop 6-sigma outliers

Create project directory

Loading data

Basics of data formats

Comma-separated values (.csv)

name,age,member_since
bob,20,2015
frank,40,2015
tammy,15,2016

Tab-separated values (.tsv or .txt)

name  age  member_since
bob  20  2015
frank  40  2015
tammy  15  2016

A zoo of data loading functions

| Data type               | Extension | Function     | Package            |
|:------------------------|:----------|:-------------|:-------------------|
| Comma separated values  | csv       | read.csv()   | utils (default)    |
|                         |           | read_csv()   | readr (tidyverse)  |
| Tab separated values    | tsv       | read_tsv()   | readr              |
| Other delimited formats | txt       | read.table() | utils              |
|                         |           | read_table() | readr              |
|                         |           | read_delim() | readr              |
| Excel                   | xlsx, xls | read_excel() | readxl (tidyverse) |

Practical advice on data loading

my_table <- read_delim('my_data_file.csv')

Specifying where a file 'is'

File locations ('paths') can be specified in absolute or relative terms

# Absolute path: the full location of the file, starting from the filesystem
# root. The path is assembled with file.path() instead of wrapping one string
# literal across two lines: a wrapped literal keeps the line break and the
# indentation inside the string, and readr treats any string that contains a
# newline as literal data rather than as a path to a file.
imported_data <- read_csv(
  file.path("/Users/jmmcfarl/BootCamp/cp_r_bootcamp", "data", "data_file.csv")
)

Relative paths

This will look for the file inside your 'working directory'

imported_data <- read_csv('data_file.csv')

What's the working directory and how does it get set?

To avoid having to worry about this

library(here)
here('data', 'my_data_file.csv')

You can use it like this:

# The same reader is used for the csv and the tsv file; here() assembles
# each file path from its components.
imported_csv <- read_delim(here("data", "data_file.csv"))
imported_tsv <- read_delim(here("data", "tab_separated_table.tsv"))

Loading from Excel sheets

Load the second sheet from an xlsx file

# Read the second worksheet of the workbook. read_excel() is called through
# the readxl namespace because library(tidyverse) does not attach readxl.
my_table <- readxl::read_excel(
  here("data", "my_metadata_file.xlsx"),
  sheet = 2
)

This uses the readxl package, which is installed as part of the tidyverse but has to be loaded separately with library(readxl)

Loading matrices

For matrices:

Loading matrices

library(data.table)

# Read the counts table with fread() and convert the result to a tibble
counts_matrix <- as_tibble(fread(here("data", "counts_rpkm.csv")))

# Show a corner of the table instead of printing all of it
corner(counts_matrix)

Then we want to turn one of the columns into the row names and create our new data matrix

# Move the Gene column into the row names, then formally convert the result
# to a matrix (the as.matrix() step is not really necessary)
counts_matrix <- as.matrix(column_to_rownames(counts_matrix, var = "Gene"))
corner(counts_matrix)

Inspecting data {.smaller}

Backup plan

Working with data in R

3 ways to access data

Selecting by position

# Example vector of six ages
age <- c(15, 22, 45, 52, 73, 81)

# A single position returns a single element
age[5]

# A vector of positions returns all of the elements of interest at once
idx <- c(3, 5, 6)
age[idx]

Selecting elements with logical statements {.smaller}

| Operator | Description              |
|:--------:|:-------------------------|
| >        | greater than             |
| >=       | greater than or equal to |
| <        | less than                |
| <=       | less than or equal to    |
| ==       | equal to                 |
| !=       | not equal to             |
| &        | and                      |
| \|       | or                       |


# Comparing a vector with a value gives one TRUE/FALSE per element
age > 50
# Store the logical vector and use it as an index
log_idx <- age > 50
age[log_idx]
# same as age[age > 50]

# Equality and inequality comparisons can be used as an index the same way
age == 52
age[age == 52]
age[age != 52]

Subsetting using names

Can assign names to each element in a vector

# Names can be supplied for each element when the vector is created
age <- c(
  Allice = 15,
  Bob = 22,
  Charlie = 45,
  Dan = 52
)
age
names(age) # retrieve just the names

Can also set names on a given vector

# Start from an unnamed vector, then attach the names afterwards
age <- c(15, 22, 45, 52)
names(age) <- c("Allice", "Bob", "Charlie", "Dan")

Subsetting using names

age

Selecting elements by name is generally unambiguous

age[c('Bob', 'Charlie')]

Brief aside on NULL

R has another special value NULL which represents 'the absence of a value'.

For example

# A vector created without names ...
my_vec <- c(1, 2, 3)
# ... has nothing to return here, so names() gives NULL
names(my_vec)

Note: Subtly different from NA, but don't worry about that now.

Reordering data

x <- c(4, 2, 3, 5, 1)
sort(x) # default order: smallest to largest
sort(x, decreasing = TRUE) # largest to smallest

Reordering using indexing

Indexing can also be used to reorder data

teaching_team <- c("Mary", "Meeta", "Radhika")

# Index with the positions in the desired order and save the result
reorder_teach <- teaching_team[c(3, 1, 2)]
reorder_teach

Subsetting matrices

matrix[row_set, column_set]
matrix[row_set, ] #keeps all columns

matrix[, column_set] #keeps all rows

Matrix subsetting examples

# Load the counts table and convert it to a matrix that uses the Gene column
# as its row names
counts_mat <- fread(here("data", "counts_rpkm.csv")) %>%
  as_tibble() %>%
  column_to_rownames(var = "Gene") %>%
  as.matrix()
useful::corner(counts_mat)

# Third row, all columns
counts_mat[3, ]

# Third column, all rows
counts_mat[, 3]

# Rows 1 to 4 of columns 2 and 3
counts_mat[1:4, 2:3]

Using rownames and column names

# Two rows selected by row name, keeping every column
counts_mat[c("ENSMUSG00000000028", "ENSMUSG00000000037"), ]

# One row and two columns, all selected by name
counts_mat["ENSMUSG00000000001", c("sample7", "sample5")]

The %in% operator

A <- c(1, 2, 3, 4)
B <- c(3, 4, 5, 6)

# One TRUE/FALSE per element of A: is that element found anywhere in B?
A %in% B

Applications of %in%

Useful for restricting to the intersection of elements in two vectors

A <- c(1, 2, 3, 4)
B <- c(3, 4, 5, 6)

# Keep only the elements of A that also occur in B
A[A %in% B]

Or checking whether any/all elements of A are contained in B

any(A %in% B) # TRUE if at least one element of A is in B
all(A %in% B) # TRUE only if every element of A is in B

Other 'set' functions

Inspecting data visually

# Histogram of the values in one row of the counts matrix
x <- counts_mat["ENSMUSG00000081010", ]
hist(x)

# Scatter plot of one row against another
x <- counts_mat["ENSMUSG00000081010", ]
y <- counts_mat["ENSMUSG00000000037", ]
plot(x, y)

Subsetting lists {.smaller}

A list of lists

# One named element per person. Allice, Bob and Charlie are lists with age,
# height and school; Frank is a named numeric vector with age and height only.
people <- list(
  Allice = list(age = 20, height = 50, school = "MIT"),
  Bob = list(age = 10, height = 30, school = "Harvard"),
  Charlie = list(age = 40, height = 60, school = "BU"),
  Frank = c(age = 10, height = 2)
)

# Extract the second element (Bob) by position
people[[2]]

Subsetting lists {.smaller}

With lists it's especially useful to access elements by name

people[['Bob']]

Another (equivalent) way is to use the $ symbol. This is nice because it works with 'tab-complete'

people$Bob

Nested indexing of lists

# Chain [[ ]] or $ to reach an element of an inner list
people[['Allice']][['school']]
people$Allice$height

Creating dataframes

# Two vectors of equal length, one per future column
species <- c("ecoli", "human", "corn")
glengths <- c(4.6, 3000, 50000)
# Unnamed arguments: the columns take the names of the variables
df <- tibble(species, glengths)
df

You can name the columns

df <- tibble(animal_species = species, genome_lengths = glengths)
df

Quick note on new/old ways

# Same table built with data.frame() instead of tibble()
df <- data.frame(species = species, glengths = glengths)
df

Extracting a single column

df <- tibble(species, glengths)

You can access a column from a tibble as if it were a list of vectors (it is):

df$species
df[['species']]

Subsetting dataframes

You can access rows and columns of dataframes like matrix indexing

# Second row, every column
df[2, ]

# Rows 2 and 3 of the glengths column
df[2:3, "glengths"]

Saving data to a file

# Write the data frame to my_dataframe.csv in the results folder
write_csv(
  df,
  here("results", "my_dataframe.csv")
)

Key concepts recap



AshirBorah/cp_bootcamp_r_tutorials documentation built on May 16, 2024, 3:24 p.m.