# <style>
# body {
# text-align: justify}
# </style>
BiocStyle::markdown()
knitr::opts_chunk$set(
    collapse = TRUE #,
    # comment = "#>"
)
options(width=120)

This document gives an overview of the DNABarcodeCompatibility R package with a brief description of the set of tools that it contains. The package includes six main functions that are briefly described below with examples. These functions allow one to load a list of DNA barcodes (such as the Illumina TruSeq small RNA kits), to filter these barcodes according to distance and nucleotide content criteria, to generate sets of compatible barcode combinations out of the filtered barcode list, and finally to generate an optimized selection of barcode combinations for multiplex sequencing experiments. In particular, the package provides an optimizer function to favour the selection of compatible barcode combinations with least heterogeneity in the frequencies of DNA barcodes, and allows one to keep barcodes that are robust against substitution and insertion/deletion errors, thereby facilitating the demultiplexing step.

The DNABarcodeCompatibility package also contains:

The package deals with the three existing sequencing-by-synthesis chemistries from Illumina:

Load the package

library("DNABarcodeCompatibility")

Define a helper function to save the raw dataset as a temporary text file

# This function is created for the purpose of the documentation 
export_dataset_to_file = 
    function(dataset = DNABarcodeCompatibility::IlluminaIndexesRaw) {
        if ("data.frame" %in% is(dataset)) {
            write.table(dataset,
                        textfile <- tempfile(),
                        row.names = FALSE, col.names = FALSE, quote=FALSE)
            return(textfile)
        } else print(paste("The input dataset isn't a data.frame:",
                            "NOT exported into file"))
    }

Design an experiment

The function experiment_design() uses a Shannon-entropy maximization approach to identify a set of compatible barcode combinations in which the frequencies of occurrences of the various DNA barcodes are as uniform as possible. The optimization can be performed in the contexts of single and dual barcoding. It performs either an exhaustive or a random search of compatible DNA-barcode combinations, depending on the size of the DNA-barcode set used, and on the number of samples to be multiplexed.

Examples for single indexing

txtfile <- export_dataset_to_file (
    dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(file1=txtfile,
                    sample_number=12,
                    mplex_level=3,
                    platform=4)
txtfile <- export_dataset_to_file (
    dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(file1=txtfile,
                    sample_number=12,
                    mplex_level=3,
                    platform=2)
txtfile <- export_dataset_to_file (
    dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(file1=txtfile,
                    sample_number=12,
                    mplex_level=3,
                    platform=1)
txtfile <- export_dataset_to_file (
    dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
)
experiment_design(file1=txtfile,
                sample_number=12,
                mplex_level=3,
                platform=4,
                metric = "hamming",
                d = 3)

Examples for dual indexing

# Select the first half of barcodes from the dataset
txtfile1 <- export_dataset_to_file (
    DNABarcodeCompatibility::IlluminaIndexesRaw[1:24,]
)

# Select the second half of barcodes from the dataset
txtfile2 <- export_dataset_to_file (
    DNABarcodeCompatibility::IlluminaIndexesRaw[25:48,]
)

# Get compatibles combinations of least redundant barcodes
experiment_design(file1=txtfile1,
                sample_number=12,
                mplex_level=3,
                platform=4,
                file2=txtfile2)
# Select the first half of barcodes from the dataset
txtfile1 <- export_dataset_to_file (
    DNABarcodeCompatibility::IlluminaIndexesRaw[1:24,]
)

# Select the second half of barcodes from the dataset
txtfile2 <- export_dataset_to_file (
    DNABarcodeCompatibility::IlluminaIndexesRaw[25:48,]
)

# Get compatibles combinations of least redundant barcodes
experiment_design(file1=txtfile1, sample_number=12, mplex_level=3, platform=4,
                    file2=txtfile2, metric="hamming", d=3)

Build your own workflow

This section guides you through the detailed API of the package with the aim to help you build your own workflow. The package is designed to be flexible and should be easily adaptable to most experimental contexts, using the experiment_design() function as a template, or building your own workflow from scratch.

Load and check a dataset of barcodes

The file_loading_and_checking() function loads the file containing the DNA barcodes set and analyzes its content. In particular, it checks that each barcode in the set is unique and uniquely identified (removing any repetition that occurs). It also checks the homogeneity of size of the barcodes, calculates their GC content and detects the presence of homopolymers of length >= 3.

file_loading_and_checking(
    file = export_dataset_to_file(
        dataset = DNABarcodeCompatibility::IlluminaIndexesRaw
    )
)

Examples of an exhaustive search of compatible barcode combinations

The total number of combinations depends on the number of available barcodes and of the multiplex level. For 48 barcodes and a multiplex level of 3, the total number of combinations (compatible or not) can be calculated using choose(48,3), which gives r format(choose(48,3)) combinations. In many cases the total number of combinations can become much larger (even gigantic), and one cannot perform an exhaustive search (see get_random_combinations() below).

# Total number of combinations
choose(48,2)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for an exhaustive search
system.time(m <- get_all_combinations(index_df = barcodes,
                                    mplex_level = 2,
                                    platform = 4))

# Each line represents a compatible combination of barcodes
head(m)
# Total number of combinations
choose(48,3)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for an exhaustive search
system.time(m <- get_all_combinations(index_df = barcodes,
                                    mplex_level = 3,
                                    platform = 4))

# Each line represents a compatible combination of barcodes
head(m)

Examples of a random search of compatible barcode combinations

When the total number of combinations is too high, it is recommended to pick combinations at random and then select those that are compatible.

# Total number of combinations
choose(48,3)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for a random search
system.time(m <- get_random_combinations(index_df = barcodes,
                                        mplex_level = 2,
                                        platform = 4))

# Each line represents a compatible combination of barcodes
head(m)
# Total number of combinations
choose(48,4)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for a random search
system.time(m <- get_random_combinations(index_df = barcodes,
                                        mplex_level = 4,
                                        platform = 4))

# Each line represents a compatible combination of barcodes
head(m)
# Total number of combinations
choose(48,6)

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Time for a random search
system.time(m <- get_random_combinations(index_df = barcodes,
                                        mplex_level = 6,
                                        platform = 4))

# Each line represents a compatible combination of barcodes
head(m)

Constrain barcodes to be robust against one substitution error

# Load barcodes
barcodes <- DNABarcodeCompatibility::IlluminaIndexes

# Perform a random search of compatible combinations
m <- get_random_combinations(index_df = barcodes,
                            mplex_level = 3,
                            platform = 4)

# Keep barcodes that are robust against one substitution error
filtered_m <- distance_filter(index_df = barcodes,
                            combinations_m = m,
                            metric = "hamming",
                            d = 3)

# Each line represents a compatible combination of barcodes
head(filtered_m)

Optimize the set of compatible combinations to reduce barcode redundancy

# Keep set of compatible barcodes that are robust against one substitution
# error
filtered_m <- distance_filter(
    index_df = DNABarcodeCompatibility::IlluminaIndexes,
    combinations_m = get_random_combinations(index_df = barcodes,
                                            mplex_level = 3,
                                            platform = 4),
    metric = "hamming", d = 3)

# Use a Shannon-entropy maximization approach to reduce barcode redundancy
df <- optimize_combinations(combination_m = filtered_m,
                            nb_lane = 12,
                            index_number = 48)

# Each line represents a compatible combination of barcodes and each row a lane
# of the flow cell
df

The optimized result isn't an optimum when filtering out too many barcodes

# Keep set of compatible barcodes that are robust against multiple substitution
# and insertion/deletion errors
filtered_m <- distance_filter(
    index_df = DNABarcodeCompatibility::IlluminaIndexes,
    combinations_m = get_random_combinations(index_df = barcodes,
                                            mplex_level = 3,
                                            platform = 4),
    metric = "seqlev", d = 4)

# Use a Shannon-entropy maximization approach to reduce barcode redundancy
df <- optimize_combinations(combination_m = filtered_m,
                            nb_lane = 12,
                            index_number = 48)

# Each line represents a compatible combination of barcodes and each row a
# lane of the flow cell
df


comoto-pasteur-fr/DNABarcodeCompatibility documentation built on June 21, 2019, 9:21 a.m.