RStudio

# Global knitr chunk options, applied to every code chunk in this document.
knitr::opts_chunk$set(
  collapse = TRUE, # show source and its output together in one block
  message = FALSE, # do not show messages in the rendered output
  warning = FALSE, # do not show warnings in the rendered output
  comment = "#>", # prefix put in front of each printed output line
  fig.align = "center" # center figures
)

Before we get started: update pbda

#devtools::install_github("rnabioco/practical-data-analysis")
#pbda::update_pbda() # probably say no to updating other packages
library(pbda)
library(dplyr)

Classes to come:

basic programming concepts (functions, conditional statements, for loops, etc)
RNAseq
gene lists, GO term enrichment
other tips and tricks

atomic (single-value) data types

character
integer
numeric
logical

look at classes

# class() reports the type of a value
class(0L) # "integer": the L suffix marks an integer literal
class(0.0) # "numeric"
class("0") # "character"

is.numeric(0L) # test for certain type/class; TRUE, integers also count as numeric
is.integer(0L) # TRUE

coerce to other classes

# as.<type>() converts a value to another type
class(as.character(0)) # number to text: "character"
class(as.numeric("0")) # text to number: "numeric"

integer vs numeric

1:5 # the colon operator builds a sequence
class(1:5) # the sequence is stored as integer
object.size(1:1000) # memory used by the integer version
object.size(as.numeric(1:1000)) # integer saves space

be careful with integer coercion

as.integer(1.8) # note that as.integer isn't rounding: it truncates, giving 1
round(1.8) # 2

as.integer(-1.8) # be very careful: truncation goes toward zero, giving -1
round(-1.8) # -2

numbers to logical

as.logical(0) # 0 == FALSE
as.logical(0.1) # anything not 0 is TRUE
as.logical("0.1") # can't coerce: a string that is not a TRUE/FALSE spelling gives NA

data structures

1. atomic vector, a combination of values

indexing

v1 <- c("geneA", "geneB", "geneC")
length(v1) # number of elements

v1[2] # access element by index
v1[c(3, 1)] # use index to change order
v1[c(TRUE, TRUE, FALSE)] # use logical vector as index: keeps the positions that are TRUE

# pretend that v2 contains expression values for v1; they can be filtered like this:
v2 <- c(5, 10, 0)
v2 >= 2 # result is logical vector
v1[v2 >= 2] # genes whose value is at least 2

combining and coercion

c(v2, 4) # combine values
c(4, v2) # note order
c(v1, v2) # vectors only contain same type of data, so the numbers become character

class(v2)
as.character(v2) # coercion over entire vector
as.character(v2) %>% class() # pipe the coerced vector into class()

useful functions

unique(c(1, 2, 3, 2)) # drop duplicated values

sort(c(2, 4, 3)) # increasing order by default
sort(c("geneB", "geneA", "geneC")) # character vectors can be sorted too
sort(c("geneB", "geneA", "geneC"), decreasing = TRUE)

intersect(c(1, 2, 3), c(2, 3, 4)) # values present in both vectors

setdiff(c(1, 2, 3), c(2, 3, 4)) # values in the first vector that are not in the second
setdiff(c(2, 3, 4), c(1, 2, 3)) # note the difference that argument order makes

2. data.frame, combination of multiple vectors

df vs tbl

mtcars # a base data.frame
class(mtcars)

# tibble usually drops rownames; rownames = "name" keeps them as a column called "name"
mtcars_tbl <- as_tibble(mtcars, rownames = "name")
mtcars_tbl # only prints first 10 by default
class(mtcars_tbl) # still a data.frame, but more: the class vector has several entries
# programming without considering potentially different result structure is dangerous:
# the comparison below returns one TRUE/FALSE per class entry instead of a single value
class(mtcars_tbl) == "data.frame"
is.data.frame(mtcars_tbl) # a single TRUE/FALSE answer

exploring data.frame

mtcars$mpg # a vector
mtcars[["mpg"]] # also a vector
dim(mtcars) # number of rows, then number of columns
ncol(mtcars)
nrow(mtcars)
colnames(mtcars)
rownames(mtcars)

mtcars_tbl_hp <- mtcars_tbl %>% dplyr::select(name, hp) # keep only the name and hp columns
colnames(mtcars_tbl_hp)
colnames(mtcars_tbl_hp) <- c("car", "horsepower") # assign new column names
mtcars_tbl_hp
colnames(mtcars_tbl_hp)[1] <- "carname"  # assign new column name by index
mtcars_tbl_hp

data.frame indexing

# Square-bracket indexing on a data.frame is always [row, column].
mtcars[1, 1] # single cell: first row, first column
mtcars[1, ] # empty column slot keeps the whole row, as a data.frame
mtcars[, 1] # empty row slot returns the whole column, as a vector
mtcars[, -(1:5)] # negative indices drop columns 1 through 5
mtcars[1:2, "hp"] # numeric row index combined with a column name

3. matrix, combination of multiple vectors of the same type

matrix vs df

mtcars_mat <- as.matrix(mtcars) # a matrix holds a single type for all cells
mtcars_mat 
mtcars_tbl_mat <- as.matrix(mtcars_tbl) # mtcars_tbl also has the character "name" column
mtcars_tbl_mat # all coerced to character

object.size(mtcars)
object.size(mtcars_mat) # smaller and faster with certain calculations

very similar function calls to df

# mtcars_mat$mpg # can't do this for matrix
# mtcars_mat[["mpg"]] # can't do this for matrix

dim(mtcars_mat) # number of rows, then number of columns
ncol(mtcars_mat)
nrow(mtcars_mat)
colnames(mtcars_mat)
rownames(mtcars_mat)

length(mtcars) # number of cols
length(mtcars_mat) # number of cells; length() means different things here, so prefer ncol()/nrow()

colnames(mtcars_mat)[1] <- "milespergallon" # assign a new name to the first column
mtcars_mat

matrix indexing

# Matrix indexing uses the same [row, column] form as a data.frame.
mtcars_mat[1, ] # one row comes back as a named vector
mtcars_mat[, 1] # one column comes back as a vector
mtcars_mat[1, 1] # single cell: first row, first column
mtcars_mat[, -(1:5)] # negative indices drop columns 1 through 5
mtcars_mat[1:2, "hp"] # numeric row index combined with a column name

t(mtcars_mat) # transpose: rows become columns

mathematical operations

log2(mtcars_mat) # applied to every cell; cells equal to 0 give -Inf
rowSums(mtcars_mat) # one value per row
rowMeans(mtcars_mat)
colSums(mtcars_mat) # one value per column
colMeans(mtcars_mat)

mtcars_mat - 5 # -5 on every numeric value
# vector recycling, note sequence: the short vector is repeated cell by cell
# going down each column in turn. The matrix has 32 x 11 = 352 cells, which is
# not a multiple of 3, so R also raises a warning (hidden in the rendered
# output because the chunk options set warning = FALSE).
mtcars_mat * c(1, 0, -1)
# each element in the same row is subtracted by the corresponding vector
# element, ie normalize by the 1st column. This works because the vector has
# one value per row and recycling runs down the columns.
mtcars_mat - mtcars_mat[, 1]

matrix practice question -

How would you center the data (subtract the mean for each variable/column)?

# hint: start with the transposed matrix, in which each variable is a row
mtcars_mat_t <- t(mtcars_mat)

4. list, collection of objects (vectors, matrices, data.frames, etc.)

lists store all types of data

# an unnamed list holding a number, a character vector and a data.frame
l1 <- list(1, 
           c("what", "ever"), 
           mtcars)
l1

l2 <- list(n = 1, 
           c = c("what", "ever"), 
           df = mtcars, 
           l = l1) # with names, also can even include lists
l2

pbda::cc.genes # example of list

exploring list

cc.genes[[1]] # element extracted by position
cc.genes$s.genes # element extracted by name

length(cc.genes) # number of list elements
length(cc.genes[[1]]) # number of values inside the first element
names(cc.genes)

unlist(cc.genes) # named vector
unlist(cc.genes, use.names = FALSE) # same values without the names

c(cc.genes, "geneA") # combine into list: "geneA" becomes an extra element; result is only printed
cc.genes[[1]] <- c(cc.genes[[1]], "geneA") # combine into first list element, and store the change

list practice question -

From cc.genes, how many markers are shared for both S phase and G2/M?

list practice question 2 -

Are ggplot objects lists?

library(ggplot2)
# scatter plot of mpg against hp, colored by cyl; the plot is stored in g instead of being drawn
g <- ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) + geom_point()

5. S3/S4, more complex objects, similar to lists but accessed with $ and @. See RNAseq class.

6. factors, grouping variables for the data

plotting use case

library(ggplot2)

# Toy data: three months and a count for each.
months_tbl <- data.frame(
  month = c("Jan", "Feb", "Mar"),
  labmeetings = c(0, 3, 9)
)

# With month as plain text the bars are ordered alphabetically, not ideal.
ggplot(months_tbl, aes(x = month, y = labmeetings)) +
  geom_col() +
  cowplot::theme_cowplot()

# Turn month into a factor whose levels spell out the wanted order.
months_tbl_factor <- months_tbl %>%
  mutate(month = factor(month, levels = c("Jan", "Feb", "Mar")))
months_tbl_factor$month

# The bars now follow the order described by the levels.
ggplot(months_tbl_factor, aes(x = month, y = labmeetings)) +
  geom_col() +
  cowplot::theme_cowplot()

for other uses of factors, please visit https://forcats.tidyverse.org/

I/O, reading and writing files

readr package

faster and more flexible than base R functions for large files

library(readr) 
path <- system.file("extdata", "gene_tibble.csv", package = 'pbda') # data included in package
# call `less` from the terminal, paste in path as well
gene_tbl <- read_csv(path)
write_csv(gene_tbl, "gene_tbl.csv")
write_csv(gene_tbl, "gene_tbl.csv.gz") # will auto zip if indicated
getwd() # saved here if full path is not given

path2 <- system.file("extdata", "hg19_genes.bed.gz", package = 'pbda') # will auto unzip
bed_tbl <- read_tsv(path2) # use col_names = FALSE or give vector of names
# NOTE(review): read_tsv() takes the first line as column names by default. The
# hint above suggests this BED file has no header line, in which case the first
# record is consumed as names unless col_names is set - confirm against the file.
# use the terminal and zless to look at it briefly

david_tbl <- read_tsv("https://raw.githubusercontent.com/IDPT7810/practical-data-analysis/master/inst/extdata/david.txt") # link directly

write_lines(cc.genes$s.genes, "Sgenes.txt") # write vector into file, each element on a line

?read_delim # worth looking through the options

we recommend using readr functions (the ones named with "_", e.g. read_csv), but beware that they leave out row names

write_csv(mtcars, "mtcars.txt") # readr writer
read_csv("mtcars.txt") # row names are gone!

# before saving, use one of these two options to move the row names into a column
mtcars %>% as_tibble(rownames = "rowname")
mtcars %>% tibble::rownames_to_column("rowname")

write.csv(mtcars, "mtcars2.txt") # base R writer, for comparison
r1 <- read_csv("mtcars2.txt") 
r2 <- read.csv("mtcars2.txt") # note the different default column name assignment
r3 <- read.csv("mtcars2.txt", row.names = 1) # gets back row names

readr practice question -

Look at this variant call format (VCF) file and read it with readr. Rename the columns to c("chromosome", "position", "variantID", "ref_allele", "alt_allele", "quality", "filter", "info"), count the number of variants at each reported position, and sort the counts in descending order.

# download.file("https://raw.githubusercontent.com/IDPT7810/practical-data-analysis/master/inst/extdata/clinvar_2000.vcf",
#              "clinvar_2000.vcf")

readxl for xlsx files

library(readxl)
# readxl_example() supplies the path to an example workbook that comes with readxl
read_excel(readxl_example("datasets.xlsx"))
?read_excel # worth looking through the options

saving objects as .rdata (.rda is the same)

RStudio asks whether to save, or by default saves, the entire environment into an .RData file

this causes slow start up and other conflicts, we recommend turning the option off

but do use rdata when needed

save(mtcars_mat, mtcars_mat_t, file = "mat.RData") # several objects can go into one file
rm(mtcars_mat, mtcars_mat_t) # a way to remove from environment
load("mat.RData") # load them back, under the names they were saved with

.rds, saving one object, but allows for name reassignment

saveRDS(mtcars_mat, "mtcars_mat.rds") # default compress = TRUE
mat2 <- readRDS("mtcars_mat.rds") # the object can be assigned to any name
identical(mat2, mtcars_mat) # check that the round trip left the object unchanged

IDPT7810/practical-data-analysis documentation built on July 8, 2022, 9:32 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

IDPT7810/practical-data-analysis Practical Biological Data Analysis in R/RStudio