ProteoDisco: Generation of customized protein variant databases from genomic variants, splice-junctions and manual sequences

## ----knitr_init, echo = FALSE, cache = FALSE, results = 'hide', warning=FALSE, message=FALSE----
suppressPackageStartupMessages(library(knitr))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(AnnotationDbi))
suppressPackageStartupMessages(library(ProteoDisco))

## ---- label = 'Minimal Workflow', eval = TRUE, results = 'hide', message = FALSE----

# Generate the ProteoDiscography (containing the genome-sequences and annotations used throughout the incorporation of genomic variants)
# In this case, we supply an existing reference genome with known annotations (GRCh37 with GENCODE annotations).
ProteoDiscography.hg19 <- ProteoDisco::generateProteoDiscography(
TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene, 
genomeSeqs = BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
)

# Supply the ProteoDiscography with genomic variants to incorporate in downstream analysis. This can be one or multiple VCF / MAF files.
# Additional manual sequences and exon-exon mapping (i.e., splice junctions) can also be given as shown in the sections below.
ProteoDiscography.hg19 <- ProteoDisco::importGenomicVariants(
ProteoDiscography = ProteoDiscography.hg19,
# Provide the VCF / MAF files.
files = system.file('extdata', 'validationSet_hg19.vcf', package = 'ProteoDisco'), 
# We can replace the original samples within the VCF with nicer names.
samplenames = 'Validation Set (GRCh37)',
# Number of threads used for parallelization.
# We run samples sequentially and parallelize within (variant-wise multi-threading).
threads = 1, 
# To increase import-speed for this example, do not check for validity of the reference anchor with the given reference sequences.
performAnchorCheck = FALSE
)

# Incorporate the given genomic variants into their respective overlapping coding-sequences (i.e. transcripts).
# This can be done in a sample-specific or aggregated cohort-wide manner and can be performed per exon or transcript separately.
ProteoDiscography.hg19 <- ProteoDisco::incorporateGenomicVariants(
ProteoDiscography = ProteoDiscography.hg19,
# Do not aggregate samples and generate mutant transcripts from the mutations per sample.
aggregateSamples = FALSE, 
# If there are multiple mutations within the same exon (CDS), place them on the same mutant CDS sequence.
aggregateWithinExon = TRUE, 
# Aggregate multiple mutant exons (CDS) within the same transcripts instead of incorporating one at a time.
aggregateWithinTranscript = TRUE, 
# If there are overlapping mutations on the same coding position, retain only the first of the overlapping mutations.
# If set to FALSE, throw an error and specify which CDS had overlapping mutations.
ignoreOverlappingMutations = TRUE, 
# Number of threads.
threads = 1
)

## ---- label = 'Export FASTA', eval = FALSE, results = 'hide'------------------
#  # Output custom protein database (FASTA) containing the generated protein variant sequences.
#  ProteoDisco::exportProteoDiscography(
#    ProteoDiscography = ProteoDiscography.hg19,
#    outFile = 'example.fasta',
#    aggregateSamples = TRUE
#  )

## ---- label = 'Logging options', eval = TRUE, results = 'hide'----------------
# Display tracing messages:
ParallelLogger::clearLoggers()
ParallelLogger::registerLogger(ParallelLogger::createLogger(threshold = 'TRACE', appenders =list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))))

# Or only display general information messages:
ParallelLogger::clearLoggers()
ParallelLogger::registerLogger(ParallelLogger::createLogger(threshold = 'INFO', appenders =list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))))

## ---- label = 'TxDb from FASTA and GTF', eval = FALSE-------------------------
#  
#  # Download the latest annotation.
#  URL.annotations <- 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/gencode.v37.annotation.gff3.gz'
#  utils::download.file(URL.annotations, basename(URL.annotations))
#  
#  # Download the respective reference genome (GRCh38.p12)
#  URL.genome <- 'ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_37/GRCh38.p12.genome.fa.gz'
#  utils::download.file(URL.genome, basename(URL.genome))
#  
#  # Use the latest annotations to generate a TxDb.
#  TxDb <- GenomicFeatures::makeTxDbFromGFF(
#    file = basename(URL.annotations),
#    dataSource = 'GENCODE (v37)',
#    organism = 'Homo sapiens', taxonomyId = 9606
#  )
#  
#  # Import the genome-sequences.
#  genomeSeqs <- Biostrings::readDNAStringSet(filepath = basename(URL.genome), format = 'fasta')
#  
#  # Clean the chromosomal names and use the chr-prefix.
#  names(genomeSeqs) <- gsub(' .*', '', names(genomeSeqs))
#  
#  # Generate the ProteoDiscography.
#  ProteoDiscography <- ProteoDisco::generateProteoDiscography(
#    TxDb = TxDb,
#    genomeSeqs = genomeSeqs,
#    geneticCode = 'Standard'
#  )

## ---- label = 'Generate ProteoDiscrography from GRCh37 resources', eval = TRUE, results = 'hide'----
# Attach the required packages.
require(BSgenome.Hsapiens.UCSC.hg19)
require(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Generate the ProteoDiscography.
ProteoDiscography.hg19 <- ProteoDisco::generateProteoDiscography(
TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene, 
genomeSeqs = BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
)

## ---- label = 'Inspection', eval = TRUE, results = 'hide'---------------------
print(ProteoDiscography.hg19)
summary(ProteoDiscography.hg19)

## ---- label = 'Generate ProteoDiscrography (GRCh37) for subsequent testing', eval = TRUE, results = 'hide'----
# Lets produce a ProteoDiscography using pre-generated (BioConductor) resources for hg19 with UCSC annotations.
require(BSgenome.Hsapiens.UCSC.hg19)
require(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Initialize the ProteoDiscography.
ProteoDiscography.hg19 <- ProteoDisco::generateProteoDiscography(
TxDb = TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene, 
genomeSeqs = BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
)

# Select one or more VCF or MAF files.
# In this case, we make use of the provided GRCh37 test file.
files.muts <- c(
system.file('extdata', 'validationSet_hg19.vcf', package = 'ProteoDisco')
)

# Import the genomic (somatic) variants.
ProteoDiscography.hg19 <- ProteoDisco::importGenomicVariants(
ProteoDiscography = ProteoDiscography.hg19,
# Provide the VCF / MAF files.
files = files.muts, 
# We can replace the original samples within the VCF with nicer names.
samplenames = 'Validation Set (GRCh37)',
# For illustration purposes, do not check the validity of the reference anchor.
performAnchorCheck = FALSE
)


## ---- label = 'Inspect example ProteoDiscrography (GRCh37)', eval = TRUE, results = 'hide'----
library(ggplot2)

# Print an overview of supplied variants, their respective mutational types and an overview per patients.
summary(ProteoDiscography.hg19, verbose = FALSE)

# We can also capture the summary output and display / handle only certain parts. 
# The 'verbose' parameters toggles whether additional information is printed to the console.
z <- summary(ProteoDiscography.hg19, verbose = FALSE)

# Generate a quick overview of the mutational categories (SNV, MNV and InDels) per imported sample.
reshape2::melt(z$overviewMutations, id.vars = 'sample') %>% 
dplyr::filter(!is.na(value)) %>% 
ggplot2::ggplot(ggplot2::aes(x = sample, y = value, fill = variable)) + 
ggplot2::geom_bar(stat = 'identity', width = .8, color = 'black', position = ggplot2::position_dodge(preserve = 'single')) + 
ggplot2::scale_y_continuous(trans = scales::pseudo_log_trans(), breaks = c(0, 10, 100, 1000, 10000)) +
ggplot2::labs(x = 'Provided samples', y = 'Nr. of variants (log10)') +
ggplot2::scale_fill_brewer(name = 'Category', palette = 'Greens') +
ggplot2::theme_minimal()

## ---- label = 'Add manual transcript sequences', eval = TRUE, results = 'hide', cache = TRUE----

# TMPRSS2-ERG sequence from ENA.
testSeq1 <- Biostrings::DNAString('ATGACCGCGTCCTCCTCCAGCGACTATGGACAGACTTCCAAGATGAGCCCACGCGTCCCTCAGCAGGATTGGCTGTCTCAACCCCCAGCCAGGGTCACCATCAAAATGGAATGTAACCCTAGCCAGGTGAATGGCTCAAGGAACTCTCCTGATGAATGCAGTGTGGCCAAAGGCGGGAAGATGGTGGGCAGCCCAGACACCGTTGGGATGAACTACGGCAGCTACATGGAGGAGAAGCACATGCCACCCCCAAACATGACCACGAACGAGCGCAGAGTTATCGTGCCAGCAGATCCTACGCTATGGAGTACAGACCATGTGCGGCAGTGGCTGGAGTGGGCGGTGAAAGAATATGGCCTTCCAGACGTCAACATCTTGTTATTCCAGAACATCGATGGGAAGGAACTGTGCAAGATGACCAAGGACGACTTCCAGAGGCTCACCCCCAGCTACAACGCCGACATCCTTCTCTCACATCTCCACTACCTCAGAGAGACTCCTCTTCCACATTTGACTTCAGATGATGTTGATAAAGCCTTACAAAACTCTCCACGGTTAATGCATGCTAGAAACACAGGGGGTGCAGCTTTTATTTTCCCAAATACTTCAGTATATCCTGAAGCTACGCAAAGAATTACAACTAGGCCAGTCTCTTACAGATAA')

# Partial CDS of BCR-ABL from ENA.
testSeq2 <- Biostrings::DNAString('ATGATGAGTCTCCGGGGCTCTATGGGTTTCTGAATGTCATCGTCCACTCAGCCACTGGATTTAAGCAGAGTTCAAAAGCCCTTCAGCGGCCAGTAGCATCTGACTTTGAGCCTCAGGGTCTGAGTGAAGCCGCTCGTTGGAACTCCAAGGAAAACCTTCTCGCTGGACCCAGTGAAAATGACCCCAACCTTTTCGTTGCACTGTATGATTTTGTGGCCAGTGGAGATAACACTCTAAGCATAACTAAAGGTGAAAAGCTCCGGGTCTTAGGCTATAATCACAATGGGGAATGGTTTGAAGCCCAAACCAAAAATGGCCAAGGCTGGGTCCCAAGCAACTACATCACGCCAGTCAACAGTCTGGAGAAACACTCCTGGTACCATGGGCCTGTGTCCCGCAATGCCGCTGAGTATCTGCTGAGCAGCGGGATCAAT')

# Generate DataFrame containing the mutant sequences and metadata and add these to our test sample.
manualSeq <- S4Vectors::DataFrame(
  sample = rep('Validation Set (GRCh37)', 2),
  identifier = c('TMPRSS2-ERG prostate cancer specific isoform 1', 'Homo sapiens (human) partial bcr/c-abl oncogene protein'),
  gene = c('TMPRSS2-ERG', 'BCR-ABL1'),
  Tx.SequenceMut = Biostrings::DNAStringSet(base::list(testSeq1, testSeq2))
)

# Add to ProteoDiscography.
ProteoDiscography.hg19 <- ProteoDisco::importTranscriptSequences(
  ProteoDiscography.hg19, 
  transcripts = manualSeq,
  removeExisting = TRUE
)

## ---- label = 'View imported ProteoDiscography records', eval = TRUE, results = 'hide'----

# Retrieve / view the records imported into the ProteoDiscography.
getDiscography(ProteoDiscography.hg19)

## ---- label = 'Incorporate genomic variants into ProteoDiscrography (GRCh37)', eval = TRUE, results = 'hide', message = FALSE----

ProteoDiscography.hg19 <- ProteoDisco::incorporateGenomicVariants(
  ProteoDiscography = ProteoDiscography.hg19,
  # Do not aggregate samples and generate mutant transcripts from the mutations per sample.
  aggregateSamples = FALSE, 
  # If there are multiple mutations within the same exon (CDS), place them on the same mutant CDS sequence.
  aggregateWithinExon = TRUE, 
  # Aggregate multiple mutant exons (CDS) within the same transcripts instead of incorporating one at a time.
  aggregateWithinTranscript = TRUE, 
  # If there are overlapping mutations on the same coding position, retain only the first of the overlapping mutations.
  # If set to FALSE, throw an error and specify which CDS had overlapping mutations.
  ignoreOverlappingMutations = TRUE, 
  # Number of threads.
  threads = 4
)


## ---- label = 'Inspection post-incorporation', eval = TRUE, results = 'hide'----

# Simply print the ProteoDiscography and it will state the number of mutant transcripts currently generated.
summary(ProteoDiscography.hg19)

# Retrieve the mutant transcript sequences.
# This will retrieve a list of all the generated (and imported) transcript sequences per input type and the respective amino-acid sequence.
x <- ProteoDisco::mutantTranscripts(ProteoDiscography.hg19)

# Visualization of the number of incorporated genomic mutations per transcript.
barplot(table(x$genomicVariants$numberOfMutationsInTx))


## ---- label = 'Subsetting', eval = TRUE, results = 'hide'---------------------
x <- ProteoDisco::mutantTranscripts(ProteoDiscography.hg19)
x <- x$genomicVariants
x

# Only keep the first 10 mutant transcripts of the incorporated genomic variants.
ProteoDiscography.hg19.subset <- ProteoDisco::setMutantTranscripts(ProteoDiscography.hg19, x[1:10,], slotType = 'genomicVariants')

y <- ProteoDisco::mutantTranscripts(ProteoDiscography.hg19.subset)
y <- y$genomicVariants
y


## ---- label = 'Import splice-junctions', eval = TRUE, results = 'hide', message = FALSE----
# Import splice-junctions from BED files.
files.Splicing <- c(
  system.file('extdata', 'spliceJunctions_pyQUILTS_chr22.bed', package = 'ProteoDisco')
)

# Import splice-junctions from BED or SJ,out.tab files into our ProteoDiscography.
ProteoDiscography.hg19 <- ProteoDisco::importSpliceJunctions(
  ProteoDiscography = ProteoDiscography.hg19, 
  inputSpliceJunctions = system.file('extdata', 'spliceJunctions_pyQUILTS_chr22.bed', package = 'ProteoDisco'), 
  # (Optional) Rename samples.
  samples = c('pyQUILTS'), 
  # Specify that the given BED files are obtained from TopHat.
  # Chromosomal coordinates from TopHat require additional formatting.
  isTopHat = TRUE,
  aggregateSamples = FALSE,
  removeExisting = TRUE
)

# Or, import splice-junctions (even spanning different chromosomes) based on our format.
testSJ <- readr::read_tsv(system.file('extdata', 'validationSetSJ_hg19.txt', package = 'ProteoDisco')) %>% 
  dplyr::select(sample, identifier, junctionA, junctionB)

# Add custom SJ to ProteoDiscography.
ProteoDiscography.hg19 <- ProteoDisco::importSpliceJunctions(
  ProteoDiscography = ProteoDiscography.hg19, 
  inputSpliceJunctions = testSJ, 
  # Remove existing SJ-input.
  removeExisting = TRUE
)

# Generate junction-models from non-canonical splice-junctions.
ProteoDiscography.hg19 <- ProteoDisco::generateJunctionModels(
  ProteoDiscography = ProteoDiscography.hg19, 
  # Max. distance from a known exon-boundary before introducing a novel exon.
  # If an adjacent exon is found within this distance, it will shorten or elongate that exon towards the SJ.
  maxDistance = 150, 
  # Should we skip known exon-exon junctions (in which both the acceptor and donor are located on known adjacent exons within the same transcript)
  skipCanonical = TRUE,
  # Perform on multiple threads (optional)
  threads = 4
)

## ---- label = 'Check proteotypic fragments.', eval = FALSE, results = 'hide'----
#  # Load additional peptide sequences against which proteotypic fragments are checked.
#  # In this case, load the first 100 human Swiss-Prot/Uniprot database entries.
#  peptides.SwissProt <- base::textConnection(RCurl::getURL('https://www.uniprot.org/uniprot/?query=*&format=fasta&limit=100&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes')) %>%
#    seqinr::read.fasta(., seqtype = 'AA', seqonly = FALSE, as.string = TRUE) %>%
#    Biostrings::AAStringSetList() %>%
#    unlist()
#  
#  # Check proteotypic fragments.
#  ProteoDiscography.hg19 <- ProteoDisco::checkProteotypicFragments(
#    ProteoDiscography.hg19,
#    additionalPeptides = peptides.SwissProt,
#    # Protease used in the experiment (see cleaver package)
#    enzymUsed = 'trypsin',
#    # Number of potentially-missed cleavages (see cleaver package)
#    missedCleavages = 1,
#    # Should proteotypic fragments also be unique among the generated variant-sequences.
#    checkWithinMutantSeqs = FALSE
#  )
#  
#  # We can now appreciate that additional information is added to the mutant transcript denoting how many, and which fragments are unique to the mutant transcripts.
#  ProteoDisco::mutantTranscripts(ProteoDiscography.hg19)
#  

## ---- label = 'Converting gene symbols', eval = TRUE, results = 'hide'--------
require(org.Hs.eg.db)

# Retrieve the all imported mutant transcripts.
x <- ProteoDisco::mutantTranscripts(ProteoDiscography.hg19)

# Convert ENTREZ identifiers to gene symbols.
geneSymbols <- unique(AnnotationDbi::select(
  org.Hs.eg.db, keys = x$genomicVariants$geneID, 
  keytype = 'ENTREZID', columns = 'SYMBOL')
)

# Add the gene symbols back to the mutant transcript.
x$genomicVariants <- merge(
  x$genomicVariants, geneSymbols, 
  by.x = 'geneID', by.y = 'ENTREZID', all.x = TRUE
)

# Add the mutant transcripts (with symbols) back into the ProteoDiscography.
ProteoDiscography.hg19 <- ProteoDisco::setMutantTranscripts(
  ProteoDiscography.hg19, 
  transcripts = x$genomicVariants, 
  slotType = 'genomicVariants'
)

# We now have the SYMBOL column in our mutant transcripts DataFrame.
head(ProteoDisco::mutantTranscripts(ProteoDiscography.hg19)$genomicVariants)

## ---- label = 'Export mutant proteins to FASTA', eval = TRUE, results = 'hide'----

ProteoDisco::exportProteoDiscography(
  ProteoDiscography = ProteoDiscography.hg19, 
  outFile = 'example.fasta', 
  aggregateSamples = TRUE
)


## ---- label = 'Session Information', eval = TRUE------------------------------
utils::sessionInfo()
ErasmusMC-CCBC/ProteoDisco documentation built on Dec. 9, 2022, 8:41 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ErasmusMC-CCBC/ProteoDisco
Generation of customized protein variant databases from genomic variants, splice-junctions and manual sequences

inst/doc/Overview_ProteoDisco.R
In ErasmusMC-CCBC/ProteoDisco: Generation of customized protein variant databases from genomic variants, splice-junctions and manual sequences

R Package Documentation

Browse R Packages

We want your feedback!

ErasmusMC-CCBC/ProteoDisco Generation of customized protein variant databases from genomic variants, splice-junctions and manual sequences

inst/doc/Overview_ProteoDisco.R In ErasmusMC-CCBC/ProteoDisco: Generation of customized protein variant databases from genomic variants, splice-junctions and manual sequences

R Package Documentation

Browse R Packages

We want your feedback!

ErasmusMC-CCBC/ProteoDisco
Generation of customized protein variant databases from genomic variants, splice-junctions and manual sequences

inst/doc/Overview_ProteoDisco.R
In ErasmusMC-CCBC/ProteoDisco: Generation of customized protein variant databases from genomic variants, splice-junctions and manual sequences