inst/doc/website.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----getPackage, eval=FALSE---------------------------------------------------
#  if (!requireNamespace("BiocManager", quietly = TRUE))
#      install.packages("BiocManager")
#   BiocManager::install("cicero")

## ---- eval = FALSE------------------------------------------------------------
#  BiocManager::install(cole-trapnell-lab/cicero)

## ----Load, message=FALSE------------------------------------------------------
 library(cicero)

## -----------------------------------------------------------------------------
data(cicero_data)

## ---- eval=TRUE---------------------------------------------------------------
input_cds <- make_atac_cds(cicero_data, binarize = TRUE)

## ---- eval=TRUE---------------------------------------------------------------
set.seed(2017)
input_cds <- detectGenes(input_cds)
input_cds <- estimateSizeFactors(input_cds)
input_cds <- reduceDimension(input_cds, max_components = 2, num_dim=6,
                        reduction_method = 'tSNE', norm_method = "none")

## ---- eval=TRUE---------------------------------------------------------------
tsne_coords <- t(reducedDimA(input_cds))
row.names(tsne_coords) <- row.names(pData(input_cds))
cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = tsne_coords)

## ---- eval=TRUE---------------------------------------------------------------
data("human.hg19.genome")
sample_genome <- subset(human.hg19.genome, V1 == "chr18")
sample_genome$V2[1] <- 10000000
conns <- run_cicero(cicero_cds, sample_genome, sample_num = 2) # Takes a few minutes to run
head(conns)

## ---- fig.width = 7, fig.height = 4, fig.align='center', eval=TRUE------------
data(gene_annotation_sample)
plot_connections(conns, "chr18", 8575097, 8839855, 
                 gene_model = gene_annotation_sample, 
                 coaccess_cutoff = .25, 
                 connection_width = .5, 
                 collapseTranscripts = "longest" )

## ---- eval=TRUE---------------------------------------------------------------
chia_conns <-  data.frame(Peak1 = c("chr18_10000_10200", "chr18_10000_10200", 
                                    "chr18_49500_49600"), 
                          Peak2 = c("chr18_10600_10700", "chr18_111700_111800", 
                                    "chr18_10600_10700"))

conns$in_chia <- compare_connections(conns, chia_conns)

## ---- eval=TRUE---------------------------------------------------------------
conns$in_chia_100 <- compare_connections(conns, chia_conns, maxgap=100)

head(conns)

## ---- eval=TRUE---------------------------------------------------------------
# Add a column of 1s called "coaccess"
chia_conns <-  data.frame(Peak1 = c("chr18_10000_10200", "chr18_10000_10200", 
                                    "chr18_49500_49600"), 
                          Peak2 = c("chr18_10600_10700", "chr18_111700_111800", 
                                    "chr18_10600_10700"),
                          coaccess = c(1, 1, 1))

plot_connections(conns, "chr18", 10000, 112367, 
                 gene_model = gene_annotation_sample, 
                 coaccess_cutoff = 0,
                 connection_width = .5,
                 comparison_track = chia_conns,
                 include_axis_track = FALSE,
                 collapseTranscripts = "longest") 

## ---- eval=TRUE---------------------------------------------------------------
CCAN_assigns <- generate_ccans(conns)

head(CCAN_assigns)

## ---- eval=TRUE---------------------------------------------------------------

#### Add a column for the pData table indicating the gene if a peak is a promoter ####
# Create a gene annotation set that only marks the transcription start sites of 
# the genes. We use this as a proxy for promoters.
# To do this we need the first exon of each transcript
pos <- subset(gene_annotation_sample, strand == "+")
pos <- pos[order(pos$start),] 
pos <- pos[!duplicated(pos$transcript),] # remove all but the first exons per transcript
pos$end <- pos$start + 1 # make a 1 base pair marker of the TSS

neg <- subset(gene_annotation_sample, strand == "-")
neg <- neg[order(neg$start, decreasing = TRUE),] 
neg <- neg[!duplicated(neg$transcript),] # remove all but the first exons per transcript
neg$start <- neg$end - 1

gene_annotation_sub <- rbind(pos, neg)

# Make a subset of the TSS annotation columns containing just the coordinates 
# and the gene name
gene_annotation_sub <- gene_annotation_sub[,c(1:3, 8)]

# Rename the gene symbol column to "gene"
names(gene_annotation_sub)[4] <- "gene"

input_cds <- annotate_cds_by_site(input_cds, gene_annotation_sub)

head(fData(input_cds))


#### Generate gene activity scores ####
# generate unnormalized gene activity matrix
unnorm_ga <- build_gene_activity_matrix(input_cds, conns)

# remove any rows/columns with all zeroes
unnorm_ga <- unnorm_ga[!Matrix::rowSums(unnorm_ga) == 0, !Matrix::colSums(unnorm_ga) == 0]

# make a list of num_genes_expressed
num_genes <- pData(input_cds)$num_genes_expressed
names(num_genes) <- row.names(pData(input_cds))

# normalize
cicero_gene_activities <- normalize_gene_activities(unnorm_ga, num_genes)

# if you had two datasets to normalize, you would pass both:
# num_genes should then include all cells from both sets
unnorm_ga2 <- unnorm_ga
cicero_gene_activities <- normalize_gene_activities(list(unnorm_ga, unnorm_ga2), num_genes)


## ---- eval=TRUE---------------------------------------------------------------
data("cicero_data")
input_cds <- make_atac_cds(cicero_data)

# Add some cell meta-data
data("cell_data")
pData(input_cds) <- cbind(pData(input_cds), cell_data[row.names(pData(input_cds)),])
pData(input_cds)$cell <- NULL

agg_cds <- aggregate_nearby_peaks(input_cds, distance = 10000)
agg_cds <- detectGenes(agg_cds)
agg_cds <- estimateSizeFactors(agg_cds)
agg_cds <- estimateDispersions(agg_cds)

## ---- eval=TRUE---------------------------------------------------------------
# This takes a few minutes to run
diff_timepoint <- differentialGeneTest(agg_cds,
                      fullModelFormulaStr="~timepoint + num_genes_expressed")

# We chose a very high q-value cutoff because there are
# so few sites in the sample dataset, in general a q-value
# cutoff in the range of 0.01 to 0.1 would be appropriate
ordering_sites <- row.names(subset(diff_timepoint, qval < .5))
length(ordering_sites)

## ---- fig.show='hold', eval=FALSE---------------------------------------------
#  plot_pc_variance_explained(agg_cds, return_all = FALSE) #Choose 2 PCs
#  agg_cds <- reduceDimension(agg_cds,
#                                max_components = 2,
#                                norm_method = 'log',
#                                num_dim = 2,
#                                reduction_method = 'tSNE',
#                                verbose = TRUE)
#  
#  agg_cds <- clusterCells(agg_cds, verbose = FALSE)
#  
#  plot_cell_clusters(agg_cds, color_by = 'as.factor(Cluster)') + theme(text = element_text(size=8))
#  clustering_DA_sites <- differentialGeneTest(agg_cds[1:10,], #Subset for example only
#                                               fullModelFormulaStr = '~Cluster')
#  
#  # Not run because using Option 1 to continue
#  # ordering_sites <-
#  #  row.names(clustering_DA_sites)[order(clustering_DA_sites$qval)][1:1000]
#  

## ---- eval=TRUE---------------------------------------------------------------
agg_cds <- setOrderingFilter(agg_cds, ordering_sites)

## ---- fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------
agg_cds <- reduceDimension(agg_cds, max_components = 2,
          residualModelFormulaStr="~num_genes_expressed",
          reduction_method = 'DDRTree')
agg_cds <- orderCells(agg_cds)

plot_cell_trajectory(agg_cds, color_by = "timepoint")

## ---- fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------
plot_cell_trajectory(agg_cds, color_by = "State")

## ---- fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------
agg_cds <- orderCells(agg_cds, root_state = 4)
plot_cell_trajectory(agg_cds, color_by = "Pseudotime")

## ---- fig.align='center', fig.height=4, fig.width=4, eval=TRUE----------------
pData(input_cds)$Pseudotime <- pData(agg_cds)[colnames(input_cds),]$Pseudotime
pData(input_cds)$State <- pData(agg_cds)[colnames(input_cds),]$State

## ---- fig.width = 3, fig.height = 4, fig.align='center', eval=TRUE------------
input_cds_lin <- input_cds[,row.names(subset(pData(input_cds), State  != 5))]

plot_accessibility_in_pseudotime(input_cds_lin[c("chr18_38156577_38158261", 
                                                 "chr18_48373358_48374180", 
                                                 "chr18_60457956_60459080")])

## ---- eval=TRUE---------------------------------------------------------------
pData(input_cds_lin)$cell_subtype <- cut(pData(input_cds_lin)$Pseudotime, 10)
binned_input_lin <- aggregate_by_cell_bin(input_cds_lin, "cell_subtype")

## ---- eval=TRUE---------------------------------------------------------------
diff_test_res <- differentialGeneTest(binned_input_lin[1:10,], #Subset for example only
    fullModelFormulaStr="~sm.ns(Pseudotime, df=3) + sm.ns(num_genes_expressed, df=3)",
    reducedModelFormulaStr="~sm.ns(num_genes_expressed, df=3)", cores=1)

head(diff_test_res)

## -----------------------------------------------------------------------------
citation("cicero")

## -----------------------------------------------------------------------------
sessionInfo()

Try the cicero package in your browser

Any scripts or data that you put into this service are public.

cicero documentation built on Dec. 10, 2020, 2 a.m.