Nothing
#' Read tumor genomes from a VCF file (Variant Call Format).
#'
#' `readGenomesFromVCF()` reads somatic mutations of a single tumor genome
#' (sample) or a set of genomes from a VCF file (Variant Call Format) and
#' determines the mutation frequencies according to a specific model of
#' mutational signatures (Alexandrov or Shiraishi).
#'
#' @usage readGenomesFromVCF(file, numBases=5, type="Shiraishi", trDir=TRUE,
#' enforceUniqueTrDir=TRUE,
#' refGenome=BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,
#' transcriptAnno=
#' TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene,
#' verbose=TRUE)
#' @param file (Mandatory) The name of the VCF file (can be compressed with
#' \code{gzip}).
#' @param numBases (Mandatory) Total number of bases (mutated base and
#' flanking bases) to be used for sequence patterns. Must be odd. Default: 5
#' @param type (Mandatory) Signature model or type (\code{"Alexandrov"} or
#' \code{"Shiraishi"}). Default: \code{"Shiraishi"}
#' @param trDir (Mandatory) Specifies whether the transcription direction is
#' taken into account in the signature model. If so, only mutations within
#' genomic regions with a defined transcription direction can be considered.
#' Default: \code{TRUE}
#' @param enforceUniqueTrDir (Optional) Used only if \code{trDir} is
#' \code{TRUE}. If \code{enforceUniqueTrDir} is TRUE (default), then mutations
#' which map to a region with multiple overlapping genes with opposing
#' transcription directions will be excluded from the analysis. If \code{FALSE},
#' the transcript direction encountered first in the transcript database (see
#' \code{transcriptAnno}) is assigned to the mutation. The latter was the
#' behavior until version 1.3.5 of \code{decompTumor2Sig} and is also the
#' behavior of \code{pmsignature}. However, it is preferable to exclude
#' these mutations from the count (default) because from mutation data alone
#' it cannot be inferred which of the two genes has the higher transcriptional
#' activity which might potentially be linked to the occurrence of the mutation.
#' (If you are unsure, use the default setting; this option exists mostly for
#' backward compatibility with older versions.)
#' @param refGenome (Mandatory) The reference genome (\code{BSgenome}) needed
#' to extract sequence patterns. Default: \code{BSgenome} object for hg19.
#' @param transcriptAnno (Optional) Transcript annotation (\code{TxDb} object)
#' used to determine the transcription direction. This is required only if
#' \code{trDir} is \code{TRUE}. Default: \code{TxDb} object for hg19.
#' @param verbose (Optional) Print information about reading and processing the
#' mutation data. Default: \code{TRUE}
#' @return A list containing the genomes in terms of frequencies of the mutated
#' sequence patterns. This list of genomes can be used for
#' \code{decomposeTumorGenomes}.
#' @author Rosario M. Piro\cr Politecnico di Milano\cr Maintainer: Rosario
#' M. Piro\cr E-Mail: <rmpiro@@gmail.com> or <rosariomichael.piro@@polimi.it>
#' @references \url{http://rmpiro.net/decompTumor2Sig/}\cr
#' Krueger, Piro (2019) decompTumor2Sig: Identification of mutational
#' signatures active in individual tumors. BMC Bioinformatics
#' 20(Suppl 4):152.\cr
#' @seealso \code{\link{decompTumor2Sig}}\cr
#' \code{\link{decomposeTumorGenomes}}\cr
#' \code{\link{readGenomesFromMPF}}\cr
#' \code{\link{getGenomesFromMutFeatData}}
#' @examples
#'
#' ### load reference genome and transcript annotation (if direction is needed)
#' refGenome <- BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19
#' transcriptAnno <-
#' TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene
#'
#' ### read breast cancer genomes from Nik-Zainal et al (PMID: 22608084)
#' gfile <- system.file("extdata",
#' "Nik-Zainal_PMID_22608084-VCF-convertedfromMPF.vcf.gz",
#' package="decompTumor2Sig")
#' genomes <- readGenomesFromVCF(gfile, numBases=5, type="Shiraishi",
#' trDir=TRUE, enforceUniqueTrDir=TRUE, refGenome=refGenome,
#' transcriptAnno=transcriptAnno, verbose=FALSE)
#'
#' @importFrom VariantAnnotation readVcf expand alt ref geno
#' @importFrom GenomicRanges seqnames start
#' @importFrom BSgenome.Hsapiens.UCSC.hg19 BSgenome.Hsapiens.UCSC.hg19
#' @importFrom TxDb.Hsapiens.UCSC.hg19.knownGene
#' TxDb.Hsapiens.UCSC.hg19.knownGene
#' @export readGenomesFromVCF
readGenomesFromVCF <- function(file, numBases=5, type="Shiraishi", trDir=TRUE,
enforceUniqueTrDir=TRUE,
refGenome=BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19,
transcriptAnno=
TxDb.Hsapiens.UCSC.hg19.knownGene::TxDb.Hsapiens.UCSC.hg19.knownGene,
verbose=TRUE) {
# read mutation data
if(verbose) {
cat("Reading mutations and genotype information from VCF file:\n")
}
# read VCF file (using VariantAnnotation)
vcf <- tryCatch(readVcf(file),
error=function(e) {
stop(paste0(e, "Cannot read VCF file; is this a VCF ",
"file containing one or more individual ",
"samples?"))
})
if (is(vcf, "CollapsedVCF")) {
vcf <- expand(vcf) # split multiallelic entries!
}
# get only SNVs (one REF base and one ALT base)
# (indels can be specified as, e.g. deletion, "AG > A" or as "G > -")
snvRows <-
(nchar(as.character(ref(vcf))) == 1) &
(nchar(as.character(alt(vcf))) == 1) &
(as.character(ref(vcf)) != "-") & (as.character(alt(vcf)) != "-") &
(as.character(ref(vcf)) != "N") & (as.character(alt(vcf)) != "N")
# basic variant information (chr, pos, ref, alt)
snvs <- cbind(as.character(seqnames(vcf)),
as.character(start(vcf)),
as.character(ref(vcf)),
as.character(alt(vcf)))
snvs <- snvs[snvRows,]
colnames(snvs) <- c("CHROM","POS","REF","ALT")
# get genotypes (if FORMAT and genotypes are present in the VCF)
gtypes <- tryCatch(geno(vcf)$GT[snvRows,],
error=function(e) { return(NULL) })
gtypes[gtypes == "."] <- NA
# if this was only one sample: have a vector, need a matrix
if (!is.matrix(gtypes)) {
gtypes <- as.matrix(gtypes)
}
# make sure we keep the correct sample names
colnames(gtypes) <- colnames(geno(vcf)$GT)
# genotype information
if (!is.null(gtypes)) {
# if we do have genotype information, add it!
snvs <- cbind(snvs, # CHR, POS, REF, ALT
"FORMAT"=rep("GT", sum(snvRows)), # FORMAT
gtypes) # genotype info for samples
} else {
# we don't have genotype information from the VCF file
# take all variants as originating from the same sample/genome,
# create dummy genotype
snvs <- cbind(snvs, matrix(c("GT", "1/1"),
nrow=length(which(snvRows)),
ncol=2, byrow=TRUE)
)
colnames(snvs)[seq((ncol(snvs)-1),ncol(snvs))] =
c("FORMAT", "variants_without_genotype_info")
}
## we have now (example):
#> snvs[1:10,]
# CHROM POS REF ALT FORMAT sample1 sample2
#[1,] "2" "947" "C" "T" "GT:PL:GQ:AD:DP" "1/1:84,6,0:6:0,2:2" NA
#[2,] "2" "992" "G" "A" "GT:PL:GQ:AD:DP" "0/1:123,0,33:33:1,3:4" "0/0:..."
genomes <- buildGenomesFromMutationData(snvs=snvs, numBases=numBases,
type=type, trDir=trDir,
uniqueTrDir=enforceUniqueTrDir,
refGenome=refGenome,
transcriptAnno=transcriptAnno,
verbose=verbose)
if(verbose && !is.null(genomes)) {
cat("Done reading genomes.\n")
}
return(genomes)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.