#' Read a VCF file into R session
#'
#' @param vcf_file character(1), path to VCF file to read into R session as a
#' \code{\link[VariantAnnotation]{CollapsedVCF}} object
#' @param genome character(1), string indicating the genome build used in the
#' VCF file(s) (default: "GRCh37")
#' @param seq_levels_style character(1), string passed to
#' \code{\link[GenomeInfoDb]{seqlevelsStyle}} the style to use for
#' chromosome/contig names (default: "Ensembl")
#' @param verbose logical(1), should messages be printed as function runs?
#'
#' @return a vcf object
#' @examples
#' vcf <- read_vcf(system.file("extdata", "cells.donorid.vcf.gz",
#' package = "cardelino"))
#'
#' @importFrom VariantAnnotation readVcf isSNV geno ref alt
#' genotypeToSnpMatrix
#' @importFrom GenomeInfoDb seqlevelsStyle seqlengths
#' @importFrom methods as is
#'
#' @export
#'
read_vcf <- function(vcf_file, genome = "GRCh37",
seq_levels_style = "Ensembl", verbose = TRUE) {
## Read in VCF from this sample
if (verbose) {
message("Reading VCF.")
}
vcf_sample <- VariantAnnotation::readVcf(vcf_file, genome)
vcf_sample <- vcf_sample[VariantAnnotation::isSNV(vcf_sample)]
if (length(vcf_sample) > 0) {
GenomeInfoDb::seqlevelsStyle(vcf_sample) <- seq_levels_style
new_snp_names <- gsub(
"chr", "",
gsub(
":", "_",
gsub(
"_[ATCG]/[ATCG]", "",
names(vcf_sample)
)
)
)
names(vcf_sample) <- new_snp_names
} else {
stop("No SNVs present in VCF file.")
}
if (verbose) {
message(
"Read ", dim(vcf_sample)[1], " variants for ",
dim(vcf_sample)[2], " samples."
)
}
vcf_sample
}
#' Get SNP data matrices from VCF object(s)
#'
#' @param vcf_cell a \code{\link[VariantAnnotation]{CollapsedVCF}} object
#' containing variant data for cells
#' @param vcf_donor an optional \code{\link[VariantAnnotation]{CollapsedVCF}}
#' object containing genotype data for donors
#' @param verbose logical(1), should the function output verbose information as
#' it runs?
#' @param donors optional character vector providing a set of donors to use, by
#' subsetting the donors present in the \code{donor_vcf_file}; if \code{NULL}
#' (default) then all donors present in VCF will be used.
#'
#' @return a list containing
#' \code{A}, a matrix of integers. Number of alteration reads in SNP i cell j.
#' \code{D}, a matrix of integers. Number of reads depth in SNP i cell j.
#' \code{R}, a matrix of integers. Number of reference reads in SNP i cell j.
#' \code{GT_cells}, a matrix of integers for genotypes. The cell-SNP
#' configuration.
#' \code{GT_donors}, a matrix of integers for genotypes. The donor-SNP
#' configuration.
#'
#' @examples
#' vcf_cell <- read_vcf(system.file("extdata", "cells.donorid.vcf.gz",
#' package = "cardelino"))
#' vcf_donor <- read_vcf(system.file("extdata", "donors.donorid.vcf.gz",
#' package = "cardelino"))
#' snp_data <- get_snp_matrices(vcf_cell, vcf_donor)
#'
#' @import snpStats
#' @importFrom GenomicRanges findOverlaps
#' @importFrom S4Vectors queryHits subjectHits
#'
#' @export
#'
get_snp_matrices <- function(vcf_cell, vcf_donor = NULL, verbose = TRUE,
donors = NULL) {
if (!methods::is(vcf_cell, "CollapsedVCF")) {
stop(
"vcf_cell must be a CollapsedVCF object from the ",
"VariantAnnotation package."
)
}
vcf_cell <- GenomeInfoDb::sortSeqlevels(vcf_cell, X.is.sexchrom = TRUE)
slengths_sample <- GenomeInfoDb::seqlengths(vcf_cell)
if (!is.null(vcf_donor)) {
if (!methods::is(vcf_donor, "CollapsedVCF")) {
stop(
"vcf_donor must be a CollapsedVCF object from the ",
"VariantAnnotation package."
)
}
## filter sample VCF to those variants found in donor VCF
if (!is.null(donors)) {
if (sum(colnames(vcf_donor) %in% donors) < 1) {
stop(
"No donors in vcf_donor are found in the donors",
"argument supplied."
)
}
vcf_donor <- vcf_donor[, colnames(vcf_donor) %in% donors]
}
vcf_donor <- GenomeInfoDb::sortSeqlevels(vcf_donor,
X.is.sexchrom = TRUE
)
GenomeInfoDb::seqlengths(vcf_donor) <-
slengths_sample[GenomeInfoDb::seqlevels(vcf_donor)]
ovlap <- GenomicRanges::findOverlaps(vcf_cell, vcf_donor)
if (length(ovlap) < 1L) {
stop("No variants overlapping in cell VCF and Donor VCF.")
} else {
if (verbose) {
message("Filtering cell VCF.")
}
vcf_cell <- vcf_cell[S4Vectors::queryHits(ovlap)]
if (verbose) {
message("Filtering donor VCF.")
}
vcf_donor <- vcf_donor[S4Vectors::subjectHits(ovlap)]
match_alleles <- unlist(
VariantAnnotation::ref(vcf_cell) ==
VariantAnnotation::ref(vcf_donor) &
VariantAnnotation::alt(vcf_cell) ==
VariantAnnotation::alt(vcf_donor)
)
if (sum(match_alleles) < 1L) {
stop(
"No variants with matching alleles in sample and ",
"donor VCFs"
)
}
vcf_cell <- vcf_cell[match_alleles]
vcf_donor <- vcf_donor[match_alleles]
if (verbose) {
message("Extracting donor SNP genotype matrix.")
}
sm_donor <- VariantAnnotation::genotypeToSnpMatrix(
VariantAnnotation::geno(vcf_donor, "GT"),
ref = VariantAnnotation::ref(vcf_donor),
alt = VariantAnnotation::alt(vcf_donor)
)
}
} else {
sm_donor <- NULL
}
## get snp matrices
if (verbose) {
message("Extracting cell SNP matrices.")
}
sm_sample <- VariantAnnotation::genotypeToSnpMatrix(
VariantAnnotation::geno(vcf_cell, "GT"),
ref = VariantAnnotation::ref(vcf_cell),
alt = VariantAnnotation::alt(vcf_cell)
)
sm_sample_REF <- matrix(vapply(
VariantAnnotation::geno(vcf_cell, "AD"),
function(x) x[[1]],
numeric(1)
), ncol = ncol(vcf_cell))
sm_sample_ALT <- matrix(vapply(
VariantAnnotation::geno(vcf_cell, "AD"),
function(x) x[[2]],
numeric(1)
), ncol = ncol(vcf_cell))
sm_sample_ALT[is.na(sm_sample_ALT) & !is.na(sm_sample_REF)] <- 0
sm_sample_DEP <- sm_sample_REF + sm_sample_ALT
sm_sample_DEP[sm_sample_DEP == 0] <- NA
sm_sample_REF[is.na(sm_sample_DEP)] <- NA
sm_sample_ALT[is.na(sm_sample_DEP)] <- NA
rownames(sm_sample_REF) <- rownames(vcf_cell)
rownames(sm_sample_ALT) <- rownames(vcf_cell)
rownames(sm_sample_DEP) <- rownames(vcf_cell)
colnames(sm_sample_REF) <- colnames(vcf_cell)
colnames(sm_sample_ALT) <- colnames(vcf_cell)
colnames(sm_sample_DEP) <- colnames(vcf_cell)
na_sample <- is.na(sm_sample_DEP)
if (sum(!na_sample) < 1L) {
stop("No variants with non-missing genotypes cells VCF and Donor VCF\n")
}
out <- list(
A = sm_sample_ALT, D = sm_sample_DEP, R = sm_sample_REF,
GT_cells = t(methods::as(sm_sample$genotypes, "numeric"))
)
if (!is.null(vcf_donor)) {
out[["GT_donors"]] <- t(methods::as(sm_donor$genotypes, "numeric"))
} else {
out[["GT_donors"]] <- NULL
}
out
}
#' Load sparse matrices A and D from cellSNP VCF file with filtering SNPs
#'
#' @param vcf_file character(1), path to VCF file generated from cellSNP
#' @param max_other_allele maximum ratio of other alleles comparing to REF and
#' ALT alleles; for cellSNP vcf, we recommend 0.05
#' @param min_count minimum count across all cells, e.g., 20
#' @param min_MAF minimum minor allele fraction, e.g., 0.1
#' @param rowname_format the format of rowname: NULL is the default from vcfR,
#' short is CHROM_POS, and full is CHROM_POS_REF_ALT
#' @param keep_GL logical(1), if TRUE, check if GL (genotype probability) exists
#' it will be returned
#'
#' @return A list with elements the matrices A and D and GL, the genotype
#' probability. If keep_GL is false the GL element will be an empty list.
#'
#' @export
#'
#' @examples
#' vcf_file <- system.file("extdata", "cellSNP.cells.vcf.gz",
#' package = "cardelino"
#' )
#' input_data <- load_cellSNP_vcf(vcf_file)
load_cellSNP_vcf <- function(vcf_file, min_count = 0, min_MAF = 0,
max_other_allele = NULL, rowname_format = "full",
keep_GL = FALSE) {
vcf_temp <- vcfR::read.vcfR(vcf_file)
dp_full <- vcfR::extract.gt(vcf_temp, element = "DP", as.numeric = TRUE)
ad_full <- vcfR::extract.gt(vcf_temp, element = "AD", as.numeric = TRUE)
idx <- (rowSums(dp_full, na.rm = TRUE) >= min_count &
((rowSums(ad_full, na.rm = TRUE) /
rowSums(dp_full, na.rm = TRUE)) >= min_MAF))
idx <- idx & (!is.na(idx))
if (!is.null(max_other_allele)) {
dp_sum <- vcfR::extract.info(vcf_temp,
element = "DP",
as.numeric = TRUE
)
oth_sum <- vcfR::extract.info(vcf_temp,
element = "OTH",
as.numeric = TRUE
)
idx <- idx & (oth_sum / dp_sum < max_other_allele)
}
print(paste(sum(idx), "out of", length(idx), "SNPs passed."))
A <- ad_full[idx, ]
D <- dp_full[idx, ]
A[is.na(A)] <- 0
D[is.na(D)] <- 0
if (!is.null(rowname_format)) {
fix_val <- vcfR::getFIX(vcf_temp)
if (rowname_format == "short") {
var_ids <- paste0(fix_val[, 1], "_", fix_val[, 2])
} else {
var_ids <- paste0(
fix_val[, 1], "_", fix_val[, 2],
"_", fix_val[, 4], "_", fix_val[, 5]
)
}
}
row.names(A) <- row.names(D) <- var_ids[idx]
GL_list <- list()
if (keep_GL && sum(strsplit(vcf_temp@gt[1, 1], ":")$FORMAT == "GL") > 0) {
GL_full <- vcfR::extract.gt(vcf_temp,
element = "GL",
as.numeric = FALSE
)
for (ii in seq_len(length(strsplit(GL_full[1, 1], ",")[[1]]))) {
GL_tmp <- vcfR::masplit(GL_full, delim = ",", record = ii, sort = 0)
GL_tmp[is.na(GL_tmp)] <- 0
row.names(GL_tmp) <- var_ids
GL_list[[ii]] <- Matrix::Matrix(GL_tmp[idx, ], sparse = TRUE)
}
}
list(
"A" = Matrix::Matrix(A, sparse = TRUE),
"D" = Matrix::Matrix(D, sparse = TRUE),
"GL" = GL_list
)
}
#' Load genotype VCF into numeric values: 0, 1, or 2
#'
#' Note, the genotype VCF can be very big for whole genome. It would be more
#' efficient to only keep the wanted variants and samples. bcftools does such
#' jobs nicely.
#'
#' @param vcf_file character(1), path to VCF file for donor genotypes
#' @param rowname_format the format of rowname: NULL is the default from vcfR,
#' short is CHROM_POS, and full is CHROM_POS_REF_ALT
#' @param na.rm logical(1), if TRUE, remove the variants with NA values
#' @param keep_GP logical(1), if TRUE, check if GP (genotype probability) exists
#' it will be returned
#'
#' @return A list representing the loaded genotype information with two
#' components: GT, the usual numeric representation of genotype and GP the
#' genotype probabilities. Note that if \code{keep_GP} is false the GP
#' component will be NULL.
#'
#' @export
#'
#' @examples
#' vcf_file <- system.file("extdata", "cellSNP.cells.vcf.gz",
#' package = "cardelino"
#' )
#' GT_dat <- load_GT_vcf(vcf_file, na.rm = FALSE)
load_GT_vcf <- function(vcf_file, rowname_format = "full", na.rm = TRUE,
keep_GP = TRUE) {
GT_vcf <- vcfR::read.vcfR(vcf_file)
## variant ids
if (!is.null(rowname_format)) {
fix_val <- vcfR::getFIX(GT_vcf)
if (rowname_format == "short") {
var_ids <- paste0(fix_val[, 1], "_", fix_val[, 2])
} else {
var_ids <- paste0(
fix_val[, 1], "_", fix_val[, 2],
"_", fix_val[, 4], "_", fix_val[, 5]
)
}
}
## GT values
GT <- vcfR::extract.gt(GT_vcf, element = "GT")
GT_num <- matrix(NA, nrow = nrow(GT), ncol = ncol(GT))
colnames(GT_num) <- colnames(GT)
rownames(GT_num) <- var_ids
idx0 <- which(GT == "0/0" | GT == "0|0")
idx2 <- which(GT == "1/1" | GT == "1|1")
idx1 <- which(GT == "0/1" | GT == "1/0" |
GT == "0|1" | GT == "1|0")
GT_num[idx0] <- 0
GT_num[idx1] <- 1
GT_num[idx2] <- 2
if (na.rm) {
idx <- rowSums(is.na(GT_num)) == 0
} else {
idx <- seq_len(nrow(GT_num))
}
GT_num <- GT_num[idx, ]
## Genotype probability
if (keep_GP && sum(strsplit(GT_vcf@gt[1, 1], ":")$FORMAT == "GP") > 0) {
GP_val <- matrix(NA, nrow = length(GT_num), ncol = 3)
rownames(GP_val) <- paste0(
rep(colnames(GT_num), each = ncol(GT_num)), ":",
rep(rownames(GT_num), times = ncol(GT_num))
)
colnames(GP_val) <- c("GT=0", "GT=1", "GT=2")
GP <- vcfR::extract.gt(GT_vcf, element = "GP", as.numeric = FALSE)
for (ii in seq_len(3)) {
GP_val[, ii] <- vcfR::masplit(GP,
delim = ",", record = ii,
sort = 0
)[idx]
}
} else {
GP_val <- NULL
}
list("GT" = GT_num, "GP" = GP_val)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.