In cancerit/RCRISPR: R Functions for CRISPR-Related Analyses

# Chunk defaults for this vignette: merge each chunk's source and output into
# one block and prefix printed output lines with "#>".
knitr::opts_chunk$set(comment = "#>", collapse = TRUE)

Library setup and working directory

Load libraries for data import and processing.

library(rcrispr)
library(tidyverse)
library(readxl)

Set the location where the data will be stored. NOTE: this will need to be updated depending on where you want to prepare and store the data.

# Directory that every download and every generated file below is written to.
# It is used as-is by file.path(); edit it to suit your own setup.
dirpath <- '~/cancer/Rpackages/rcrispr/inst/extdata'

Colic et al 2019 drug-gene interaction screens

The following library and datasets are detailed in:

Colic, M., Wang, G., Zimmermann, M. et al.
Identifying chemogenetic interactions from CRISPR screens with drugZ.
Genome Med 11, 52 (2019).
https://doi.org/10.1186/s13073-019-0665-3

Zimmermann, M., Murina, O., Reijns, M.A.M. et al.
CRISPR screens identify genomic ribonucleotides as a source of PARP-trapping lesions.
Nature 559, 285–289 (2018).
https://doi.org/10.1038/s41586-018-0291-z

Download and parse Toronto KnockOut (TKO) CRISPR Library v1

This will download a gzipped TSV of the base Toronto KnockOut (TKO) CRISPR Library v1 (TKOv1) from http://tko.ccbr.utoronto.ca.

# Download TKOv1 (gzipped, tab-separated, no header row)
TKOv1_gz <- file.path(dirpath, 'TKOv1-base-90k-library-91320_sequences.gz')
download.file(
  url = "http://tko.ccbr.utoronto.ca/Data/TKOv1-base-90k-library-91320_sequences.gz",
  destfile = TKOv1_gz,
  mode = "wb" # binary transfer so the gzip archive is never altered
)

# Hand read.delim() an *unopened* gzfile connection: it then opens the
# connection itself and closes it again when it returns. A connection that is
# already open when passed in is left open, which leaks it and can stop the
# file from being deleted later on some platforms.
TKOv1 <- read.delim(
  gzfile(TKOv1_gz),
  sep = "\t",
  header = FALSE,
  col.names = c('SEQ', 'LOCUS', 'TARGET')
)

# Build the annotated library from the raw TKOv1 table:
#   1. Split LOCUS on "_" into COORDS, GENE and STRAND (LOCUS itself is kept,
#      any extra pieces are dropped)
#   2. Split COORDS on ":" or "-" into CHR, START and END (COORDS is removed)
#   3. For control guides (TARGET listed in CTRL_guides) set CHR, START, END
#      and STRAND to NA
#   4. For chr10Promiscuous and chr10Rand set GENE to chr10
#   5. Add the sgRNA ID as <GENE>_<SEQ>, keeping GENE and SEQ
CTRL_guides <- c('chr10Promiscuous', 'chr10Rand', 'LacZ', 'EGFP', 'luciferase')
TKOv1_ann <- TKOv1 %>%
  separate(LOCUS, c('COORDS', 'GENE', 'STRAND'),
           sep = "_", remove = FALSE, extra = 'drop') %>%
  separate(COORDS, c('CHR', 'START', 'END'),
           sep = "[\\:\\-]", remove = TRUE, extra = 'drop') %>%
  mutate(
    across(c(CHR, START, END, STRAND),
           ~ ifelse(TARGET %in% CTRL_guides, NA, .x)),
    GENE = ifelse(TARGET == 'chr10Promiscuous' | TARGET == 'chr10Rand', 'chr10', GENE)
  ) %>%
  unite(sgRNA, GENE, SEQ, sep = "_", remove = FALSE)

# Write annotated TKO library to a tab-separated file (no row names, no quoting)
write.table(TKOv1_ann, file = file.path(dirpath, 'TKOv1.tsv'), row.names = FALSE, sep = "\t", quote = FALSE)

# Remove the downloaded gzipped TKOv1 TSV now that the annotated copy exists
file.remove(TKOv1_gz)

# Show first 10 sgRNAs (head() alone would only show 6)
head(TKOv1_ann, n = 10)

Download Colic et al raw counts from FigShare

# Download Colic et al raw counts
# mode = "wb" is required here: the URL has no file extension, so on Windows
# download.file() would otherwise transfer in text mode and corrupt the zip.
colic_raw_counts_zip <- file.path(dirpath, 'readcounts-drugZ-updated_May2019.zip')
download.file(
  url = "https://ndownloader.figshare.com/files/16170896",
  destfile = colic_raw_counts_zip,
  mode = "wb"
)

# Read the HeLa raw read count matrix straight out of the zip archive
HeLa_read_counts <- read.delim(
  unz(colic_raw_counts_zip, 'readcounts-drugZ-updated_May2019/readcounts-HeLa_ola.txt'),
  stringsAsFactors = FALSE
)

# Loop over the sample columns (columns 3 to 9) and write one count file per
# sample containing sgRNA, GENE and that sample's counts.
# If sample is not T0 (*_T0), compress the count file.
sample_count_dir <- file.path(dirpath, 'HeLa_raw_sample_counts')
# showWarnings = FALSE keeps a re-run quiet when the directory already exists
dir.create(sample_count_dir, showWarnings = FALSE)
for (i in 3:9) {
  sample_column <- colnames(HeLa_read_counts)[i]
  sample_count_filename <- file.path(sample_count_dir, paste0(sample_column, '.tsv'))
  sample_counts <- HeLa_read_counts %>% select(sgRNA, GENE, all_of(sample_column))
  write.table(sample_counts, sample_count_filename, sep = "\t", row.names = FALSE, quote = FALSE)
  if (!grepl('_T0', sample_column)) {
    # gzip() replaces the .tsv with a .tsv.gz; overwrite = TRUE lets the
    # script be re-run when a previous .tsv.gz is already present
    R.utils::gzip(sample_count_filename, overwrite = TRUE)
  }
}

# Remove raw counts zip file
file.remove(colic_raw_counts_zip)

# Show first 10 sgRNAs (head() alone would only show 6)
head(HeLa_read_counts, n = 10)

Prepare sample metadata tables from count matrix

The c-sar sample metadata consists of a table with one row per sample and requires the following columns:

filename - name of file containing counts (one per sample)
sample_name/label - sample label for plots (does not need to match the sample name in the count file)
plasmid - whether the sample is a plasmid sample (0 = no, 1 = yes)
control - whether the sample is a control sample (0 = no, 1 = yes)
treatment - whether the sample is a treatment sample (0 = no, 1 = yes)

In addition, there are several optional columns used by c-sar for plotting:

group - group label
reads - number of sequencing reads (used to calculate mapping statistics/plots)

We don't need to worry about the specific names given to the columns as c-sar uses the column index instead of column names.

# Prepare sample mapping from read count column names (samples).
# Columns 3 to 9 of the count matrix are flagged, in order, as
# 1 plasmid sample, 3 control samples and 3 treatment samples.
HeLa_sample_mapping <- data.frame('sample_name' = colnames(HeLa_read_counts)[3:9],
                                  'plasmid' = c(1, rep(0, 6)),
                                  'control' = c(0, rep(1, 3), rep(0, 3)),
                                  'treatment' = c(rep(0, 4), rep(1, 3)))

# Add filename (<sample_name>.tsv or <sample_name>.tsv.gz)
#   The '.gz' suffix follows the same rule used when the per-sample count
#   files were written: every sample whose name does not contain '_T0' was
#   gzipped, so control AND treatment samples need the '.gz' suffix.
# Add group ('T0', 'untreated-control', 'olaparib-treated')
HeLa_sample_mapping <- HeLa_sample_mapping %>%
  mutate(filename = paste0(sample_name, '.tsv'),
         filename = ifelse(grepl('_T0', sample_name), filename, paste0(filename, '.gz'))) %>%
  mutate(treatment_group = case_when(plasmid == 1 ~ 'T0',
                                     control == 1 ~ 'untreated-control',
                                     treatment == 1 ~ 'olaparib-treated',
                                     TRUE ~ 'unknown'))
# Write HeLa sample metadata to a tab-separated file (no row names, no quoting)
write.table(HeLa_sample_mapping, file = file.path(dirpath, 'HeLa_olaparib_sample_metadata.tsv'), row.names = FALSE, sep = "\t", quote = FALSE)