# Global knitr chunk options applied to every chunk in this document.
knitr::opts_chunk$set(
  collapse = TRUE,
  message = FALSE,
  warning = FALSE,
  comment = "#>",
  fig.align = "center"
)
# One-time setup (left commented out so it is not re-run on every knit):
# devtools::install_github("rnabioco/practical-data-analysis")
# pbda::update_pbda() # probably say no to updating other packages

# Packages used throughout this document.
library(pbda)
library(dplyr)
basic programming concepts (functions, conditional statements, for loops, etc)
RNAseq
gene lists, GO term enrichment
other tips and tricks
character
integer
numeric
logical
# Inspect the class of three literals that all "look like" zero.
class(0L)
class(0.0)
class("0")

# test for certain type/class
is.numeric(0L)
is.integer(0L)
# Explicit coercion between types with the as.*() family.
class(as.character(0))
class(as.numeric("0"))
# The `:` operator builds an integer sequence.
1:5
class(1:5)

# Compare the reported size of the same values stored as integer vs. numeric.
object.size(1:1000)
object.size(as.numeric(1:1000)) # integer saves space
# as.integer() truncates toward zero; round() rounds to the nearest integer.
as.integer(1.8) # note that as.integer isn't rounding
round(1.8)
as.integer(-1.8) # be very careful
round(-1.8)
# Coercing numbers and strings to logical.
as.logical(0) # 0 == FALSE
as.logical(0.1) # anything not 0 is TRUE
as.logical("0.1") # can't coerce, result is NA
# Build a character vector and index into it in several ways.
v1 <- c("geneA", "geneB", "geneC")
length(v1)
v1[2] # access element by index
v1[c(3, 1)] # use index to change order
v1[c(TRUE, TRUE, FALSE)] # use logical vector as index

# pretend that v2 contains expression values for v1,
# they can be filtered like this:
v2 <- c(5, 10, 0)
v2 >= 2 # result is logical vector
v1[v2 >= 2]
# Combining vectors with c(); uses v1 and v2 created in the indexing example.
c(v2, 4) # combine values
c(4, v2) # note order
c(v1, v2) # vectors only contain same type of data

# Coercion applies to every element of the vector.
class(v2)
as.character(v2) # coercion over entire vector
as.character(v2) %>%
  class()
# De-duplicate and sort.
unique(c(1, 2, 3, 2))
sort(c(2, 4, 3))
sort(c("geneB", "geneA", "geneC"))
sort(c("geneB", "geneA", "geneC"), decreasing = TRUE)

# Set operations between two vectors.
intersect(c(1, 2, 3), c(2, 3, 4))
setdiff(c(1, 2, 3), c(2, 3, 4))
setdiff(c(2, 3, 4), c(1, 2, 3)) # note difference order makes
# A plain data.frame ...
mtcars
class(mtcars)

# ... and the same data as a tibble, keeping the row names as a column.
mtcars_tbl <- as_tibble(mtcars, rownames = "name") # tibble usually drops rownames
mtcars_tbl # only prints first 10 by default
class(mtcars_tbl) # still a data.frame, but more

# programming without considering potentially different result structure is
# dangerous: class() returns several values here, so `==` yields a vector.
class(mtcars_tbl) == "data.frame"
is.data.frame(mtcars_tbl)
# Pull a single column out of a data.frame.
mtcars$mpg # a vector
mtcars[["mpg"]] # also a vector

# Dimensions and names.
dim(mtcars)
ncol(mtcars)
nrow(mtcars)
colnames(mtcars)
rownames(mtcars)

# Select two columns from the tibble, then rename them.
mtcars_tbl_hp <- mtcars_tbl %>%
  dplyr::select(name, hp)
colnames(mtcars_tbl_hp)
colnames(mtcars_tbl_hp) <- c("car", "horsepower") # assign new column names
mtcars_tbl_hp
colnames(mtcars_tbl_hp)[1] <- "carname" # assign new column name by index
mtcars_tbl_hp
# Index a data.frame with [row, column].
mtcars[1, 1] # value of one cell, order is row then column
mtcars[1, ] # row to new data.frame
mtcars[, 1] # column to vector
mtcars[, -c(1:5)] # negative selection
mtcars[c(1:2), "hp"] # combination of number index and names
# Convert the all-numeric data.frame to a matrix.
mtcars_mat <- as.matrix(mtcars)
mtcars_mat

# The tibble has a character `name` column, so the whole matrix becomes
# character.
mtcars_tbl_mat <- as.matrix(mtcars_tbl)
mtcars_tbl_mat # all coerced to character

# Compare the memory footprint of the two representations.
object.size(mtcars)
object.size(mtcars_mat) # smaller and faster with certain calculations
# mtcars_mat$mpg # can't do this for matrix
# mtcars_mat[["mpg"]] # can't do this for matrix

# Dimensions and names work the same way as for a data.frame.
dim(mtcars_mat)
ncol(mtcars_mat)
nrow(mtcars_mat)
colnames(mtcars_mat)
rownames(mtcars_mat)

# length() means different things for the two structures.
length(mtcars) # number of cols
length(mtcars_mat) # number of cells, probably want to avoid using them

colnames(mtcars_mat)[1] <- "milespergallon" # assign new column names
mtcars_mat
# Index a matrix with [row, column].
mtcars_mat[1, ] # row to vector, named vector
mtcars_mat[, 1] # column to vector
mtcars_mat[1, 1] # value of one cell
mtcars_mat[, -c(1:5)] # negative selection
mtcars_mat[c(1:2), "hp"] # combination of number index and names

# Transpose: rows become columns and vice versa.
t(mtcars_mat)
# Element-wise transformation and row/column summaries.
log2(mtcars_mat)
rowSums(mtcars_mat)
rowMeans(mtcars_mat)
colSums(mtcars_mat)
colMeans(mtcars_mat)

# Arithmetic with a scalar or a vector.
mtcars_mat - 5 # -5 on every numeric value
mtcars_mat * c(1, 0, -1) # vector recycling, note sequence

# each element in the same row is subtracted by the corresponding vector
# element, ie normalize by the 1st column
mtcars_mat - mtcars_mat[, 1]
How would you center the data (subtract the mean for each variable/column)?
# hint: start with the transposed matrix, so that each variable is a row
mtcars_mat_t <- t(mtcars_mat)
# A list can hold elements of different types and sizes.
l1 <- list(1, c("what", "ever"), mtcars)
l1

# with names, also can even include lists
l2 <- list(n = 1, c = c("what", "ever"), df = mtcars, l = l1)
l2

pbda::cc.genes # example of list
# Extract list elements by position or by name.
cc.genes[[1]]
cc.genes$s.genes

# Sizes and names.
length(cc.genes)
length(cc.genes[[1]])
names(cc.genes)

# Flatten the list into a single vector.
unlist(cc.genes) # named vector
unlist(cc.genes, use.names = FALSE)

# Appending to the list vs. appending to one of its elements.
c(cc.genes, "geneA") # combine into list
cc.genes[[1]] <- c(cc.genes[[1]], "geneA") # combine into first list element
From `cc.genes`, how many markers are shared between S phase and G2/M?
Are ggplot objects lists?
library(ggplot2)

# Store the plot in a variable so its structure can be inspected.
g <- ggplot(mtcars, aes(x = hp, y = mpg, color = factor(cyl))) +
  geom_point()
library(ggplot2)

# Month names stored as plain character strings.
months_tbl <- data.frame(
  month = c("Jan", "Feb", "Mar"),
  labmeetings = c(0, 3, 9)
)
ggplot(months_tbl, aes(x = month, y = labmeetings)) +
  geom_col() +
  cowplot::theme_cowplot() # <- ordered alphabetical, not ideal

# Convert to a factor with explicit levels to control the axis order.
months_tbl_factor <- months_tbl %>%
  mutate(month = factor(month, levels = c("Jan", "Feb", "Mar")))
months_tbl_factor$month
ggplot(months_tbl_factor, aes(x = month, y = labmeetings)) +
  geom_col() +
  cowplot::theme_cowplot() # <- ordered as described by levels
library(readr)

# Read a CSV that ships with the pbda package.
path <- system.file("extdata", "gene_tibble.csv", package = "pbda") # data included in package
# call `less` from the terminal, paste in path as well
gene_tbl <- read_csv(path)

# Write it back out, plain and compressed.
write_csv(gene_tbl, "gene_tbl.csv")
write_csv(gene_tbl, "gene_tbl.csv.gz") # will auto zip if indicated
getwd() # saved here if full path is not given

# Read a gzipped, tab-separated BED file.
path2 <- system.file("extdata", "hg19_genes.bed.gz", package = "pbda")
# will auto unzip
bed_tbl <- read_tsv(path2) # use col_names = FALSE or give vector of names
# use the terminal and zless to look at it briefly

# Read straight from a URL.
david_tbl <- read_tsv("https://raw.githubusercontent.com/IDPT7810/practical-data-analysis/master/inst/extdata/david.txt") # link directly

write_lines(cc.genes$s.genes, "Sgenes.txt") # write vector into file, each element on a line

?read_delim # worth looking through the options
# readr does not write row names.
write_csv(mtcars, "mtcars.txt")
read_csv("mtcars.txt") # row names are gone!

# before saving, use one of these two options
mtcars %>%
  as_tibble(rownames = "rowname")
mtcars %>%
  tibble::rownames_to_column("rowname")

# Base write.csv() keeps row names; compare how each reader brings them back.
write.csv(mtcars, "mtcars2.txt")
r1 <- read_csv("mtcars2.txt")
r2 <- read.csv("mtcars2.txt") # note the different default column name assignment
r3 <- read.csv("mtcars2.txt", row.names = 1) # gets back row names
Look at this variant call format (VCF) file and read it with readr. Rename the columns to c("chromosome", "position", "variantID", "ref_allele", "alt_allele", "quality", "filter", "info"). Then count the number of variants at each reported position and sort in descending order.
# Optional: uncomment to download the example VCF file into the working
# directory.
# download.file(
#   "https://raw.githubusercontent.com/IDPT7810/practical-data-analysis/master/inst/extdata/clinvar_2000.vcf",
#   "clinvar_2000.vcf"
# )
library(readxl)

# Read an example workbook bundled with readxl.
read_excel(readxl_example("datasets.xlsx"))

?read_excel
RStudio asks to save, or by default saves, the entire environment into an .RData file.
This causes slow startup and other conflicts, so we recommend turning the option off.
But do use .RData files when needed.
# Save several named objects into one .RData file.
# NOTE(review): mtcars_mat_t is only mentioned in the centering exercise hint
# (`mtcars_mat_t <- t(mtcars_mat)`); confirm it exists before running this.
save(mtcars_mat, mtcars_mat_t, file = "mat.RData")
rm(mtcars_mat, mtcars_mat_t) # a way to remove from environment
load("mat.RData") # load them back
# Save a single object to an .rds file and read it back under a new name.
saveRDS(mtcars_mat, "mtcars_mat.rds") # default compress = TRUE
mat2 <- readRDS("mtcars_mat.rds")
identical(mat2, mtcars_mat)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.