Baseline-Vignette.R
In shazam: Immunoglobulin Somatic Hypermutation Analysis

## ----eval=TRUE, warning=FALSE, message=FALSE----------------------------------
# Import required packages
library(alakazam)
library(shazam)

# Load and subset example data (for faster demonstration)
data(ExampleDb, package="alakazam")
ExampleDb <- subset(ExampleDb, c_call %in% c("IGHA", "IGHG"))

## ----eval=TRUE, warning=FALSE, results="hide"---------------------------------
# Collapse clonal groups into single sequences
clones <- collapseClones(ExampleDb, cloneColumn="clone_id", 
                         sequenceColumn="sequence_alignment", 
                         germlineColumn="germline_alignment_d_mask", 
                         regionDefinition=IMGT_V, 
                         method="thresholdedFreq", minimumFrequency=0.6,
                         includeAmbiguous=FALSE, breakTiesStochastic=FALSE, 
                         nproc=1)

## ----eval=F, warning=F, results="hide"----------------------------------------
# # Subset to sequences with clone_id=3170
# db_3170 <- subset(ExampleDb, clone_id == 3170)
# dim(db_3170)
# colnames(db_3170)
# 
# # Generate a ChangeoClone object for lineage construction
# clone_3170 <- makeChangeoClone(db_3170, seq="sequence_alignment", germ="germline_alignment")
# 
# # Run the lineage reconstruction
# dnapars_exec <- "/usr/local/bin/dnapars"
# graph_3170 <- buildPhylipLineage(clone_3170, dnapars_exec, rm_temp=TRUE)
# 
# # Generating a data.frame from the lineage tree graph object,
# # and merge it with clone data.frame
# graph_3170_df <- makeGraphDf(graph_3170, clone_3170)
# dim(graph_3170_df)
# colnames(graph_3170_df)

## ----eval=TRUE, warning=FALSE, results="hide"---------------------------------
# Count observed mutations and append mu_count columns to the output
observed <- observedMutations(clones, 
                              sequenceColumn="clonal_sequence",
                              germlineColumn="clonal_germline",
                              regionDefinition=IMGT_V, nproc=1)
# Count expected mutations and append mu_expected columns to the output
expected <- expectedMutations(observed, 
                              sequenceColumn="clonal_sequence",
                              germlineColumn="clonal_germline",
                              targetingModel=HH_S5F,
                              regionDefinition=IMGT_V, nproc=1)

## ----eval=TRUE, warning=FALSE, results="hide"---------------------------------
# Calculate selection scores using the output from expectedMutations
baseline <- calcBaseline(expected, testStatistic="focused", 
                         regionDefinition=IMGT_V, nproc=1)

## ----eval=FALSE, warning=FALSE, results="hide"--------------------------------
# # Calculate selection scores from scratch
# baseline <- calcBaseline(clones, testStatistic="focused",
#                          regionDefinition=IMGT_V, nproc=1)

## ----eval=FALSE, warning=FALSE, results="hide"--------------------------------
# # Calculate selection on charge class with the mouse 5-mer model
# baseline_mk_rs5nf <- calcBaseline(clones, testStatistic="focused",
#                          regionDefinition=IMGT_V,
#                          targetingModel=MK_RS5NF,
#                          mutationDefinition=CHARGE_MUTATIONS,
#                          nproc=1)

## ----eval=TRUE, warning=FALSE, results="hide"---------------------------------
# Combine selection scores by time-point
grouped_1 <- groupBaseline(baseline, groupBy="sample_id")

## ----eval=TRUE, warning=FALSE, results="hide"---------------------------------
# Subset the original data to switched isotypes
db_sub <- subset(ExampleDb, c_call %in% c("IGHM", "IGHG"))

# Collapse clonal groups into single sequence
clones_sub <- collapseClones(db_sub, cloneColumn="clone_id",
                             sequenceColumn="sequence_alignment",
                             germlineColumn="germline_alignment_d_mask",
                             regionDefinition=IMGT_V, 
                             method="thresholdedFreq", minimumFrequency=0.6,
                             includeAmbiguous=FALSE, breakTiesStochastic=FALSE, 
                             nproc=1)

# Calculate selection scores from scratch
baseline_sub <- calcBaseline(clones_sub, testStatistic="focused", 
                             regionDefinition=IMGT_V, nproc=1)

# Combine selection scores by time-point and isotype
grouped_2 <- groupBaseline(baseline_sub, groupBy=c("sample_id", "c_call"))

## ----eval=FALSE, warning=FALSE, results="hide"--------------------------------
# # First group by subject and status
# subject_grouped <- groupBaseline(baseline, groupBy=c("status", "subject"))
# 
# # Then group the output by status
# status_grouped <- groupBaseline(subject_grouped, groupBy="status")

## ----eval=TRUE----------------------------------------------------------------
testBaseline(grouped_1, groupBy="sample_id")

## ----eval=TRUE, warning=FALSE-------------------------------------------------
# Set sample and isotype colors
sample_colors <- c("-1h"="seagreen", "+7d"="steelblue")
isotype_colors <- c("IGHM"="darkorchid", "IGHD"="firebrick", 
                    "IGHG"="seagreen", "IGHA"="steelblue")

# Plot mean and confidence interval by time-point
plotBaselineSummary(grouped_1, "sample_id")

# Plot selection scores by time-point and isotype for only CDR
plotBaselineSummary(grouped_2, "sample_id", "c_call", groupColors=isotype_colors,
                    subsetRegions="cdr")

# Group by CDR/FWR and facet by isotype
plotBaselineSummary(grouped_2, "sample_id", "c_call", facetBy="group")

## ----eval=TRUE, warning=FALSE-------------------------------------------------
# Plot selection PDFs for a subset of the data
plotBaselineDensity(grouped_2, "c_call", groupColumn="sample_id", colorElement="group", 
                    colorValues=sample_colors, sigmaLimits=c(-1, 1))

## ----eval=FALSE, warning=FALSE, results="hide"--------------------------------
# # Get indices of rows corresponding to IGHA in the field "db"
# # These are the same indices also in the matrices in the fields "numbOfSeqs",
# # "binomK", "binomN", "binomP", and "pdfs"
# # In this example, there is one row of IGHA for each sample
# dbIgMIndex <- which(grouped_2@db[["c_call"]] == "IGHG")
# 
# grouped_2 <- editBaseline(grouped_2, "db", grouped_2@db[-dbIgMIndex, ])
# grouped_2 <- editBaseline(grouped_2, "numbOfSeqs", grouped_2@numbOfSeqs[-dbIgMIndex, ])
# grouped_2 <- editBaseline(grouped_2, "binomK", grouped_2@binomK[-dbIgMIndex, ])
# grouped_2 <- editBaseline(grouped_2, "binomN", grouped_2@binomN[-dbIgMIndex, ])
# grouped_2 <- editBaseline(grouped_2, "binomP", grouped_2@binomP[-dbIgMIndex, ])
# grouped_2 <- editBaseline(grouped_2, "pdfs",
#                           lapply(grouped_2@pdfs, function(pdfs) {pdfs[-dbIgMIndex, ]} ))
# 
# # The indices corresponding to IGHA are slightly different in the field "stats"
# # In this example, there is one row of IGHA for each sample and for each region
# grouped_2 <- editBaseline(grouped_2, "stats",
#                           grouped_2@stats[grouped_2@stats[["c_call"]] != "IGHA", ])