# Start of internal.R
# get_col_indices
#' Get column indices of object.
#'
#' @param x An object with columns.
#' @param requested A character vector of column names, a logical vector of
#' length equal to the number of object columns, or a numeric vector of column
#' indices of the input object. If this is specified, only the requested column
#' indices are returned; otherwise, this function returns all column indices.
#' @param strict Option indicating that \code{requested}, if specified, must
#' be in the same order as the corresponding columns of the input object.
#'
#' @return Integer vector of column indices.
#'
#' @keywords internal
#' @rdname get_col_indices
get_col_indices <- function(x, requested=NULL, strict=FALSE) {
num.cols <- ncol(x)
if ( ! is_single_nonnegative_whole_number(num.cols) ) {
stop("cannot get column indices - object does not have columns")
}
available <- if ( num.cols > 0 ) { 1:num.cols } else { integer() }
names(available) <- colnames(x)
resolved <- get_indices(available, requested=requested, strict=strict)
indices <- unname(available[resolved])
return(indices)
}
# get_indices
#' Get indices of object elements.
#'
#' Get the indices of \code{x}, as constrained by the \code{requested} parameter. If using this
#' function without \code{requested} constraints, consider using the faster primitive R function
#' \code{seq_along} instead.
#'
#' @param x An object with elements that are accessible by an index.
#' @param requested A character vector of names, a logical vector of the same length as \code{x}, or
#' a numeric vector containing indices of \code{x}. If this parameter is not specified, all indices
#' are returned.
#' @param strict Option indicating that \code{requested}, if specified, must request indices that
#' are unique and in the same order as the corresponding elements of \code{x}.
#'
#' @return Integer vector containing indices of \code{x}.
#'
#' @keywords internal
#' @rdname get_indices
get_indices <- function(x, requested=NULL, strict=FALSE) {
stopifnot( isTRUE(strict) || identical(FALSE, strict) )
object.length <- length(x)
if ( ! is_single_nonnegative_whole_number(object.length) ) {
stop("cannot get object indices - object does not have length")
}
indices <- seq_along(x)
if ( ! is.null(requested) ) {
if ( is.numeric(requested) ) {
nonintegers <- requested[ ! is_whole_number(requested) ]
if ( length(nonintegers) > 0L ) {
stop("requested indices are not integers - '", toString(nonintegers), "'")
}
exrange <- requested[ ! requested %in% indices ]
if ( length(exrange) > 0L ) {
stop("requested indices out of range - '", toString(exrange), "'")
}
indices <- indices[requested]
} else if ( is.logical(requested) ) {
if ( length(requested) != object.length ) {
stop("cannot resolve indices by logical vector - length mismatch")
}
indices <- unname(which(requested))
} else if ( is.character(requested) ) {
object.names <- names(x)
if ( anyNA(requested) ) {
stop("cannot resolve indices by name - requested names are incomplete")
}
if ( is.null(object.names) ) {
stop("cannot resolve indices by name - no object names found")
}
if ( anyDuplicated(object.names) ) {
stop("cannot resolve indices by name - duplicate object names found")
}
unfound <- requested[ ! requested %in% object.names ]
if ( length(unfound) > 0L ) {
stop("requested names not found - '", toString(unfound), "'")
}
indices <- match(requested, object.names)
} else {
stop("requested indices must be specified by index, logical mask, or name")
}
if (strict) { # NB: also ensures no duplicates
if ( is.unsorted(indices, strictly=TRUE) ) {
stop("requested indices not specified in strictly increasing order")
}
}
}
return(indices)
}
# get_lod_col_index
#' Get LOD column index.
#'
#' @param x An \pkg{R/qtl} \code{scanone}, \code{scanoneperm}, or \code{summary.scanoneperm} object.
#' @param lodcolumn This parameter indicates for which LOD column an index should be returned. This
#' must be either a LOD column name or an index \emph{with respect to the set of LOD columns}. If no
#' LOD column is specified and one such column is found, the index of that column is returned by
#' default; otherwise a LOD column must be specified.
#'
#' @return LOD column index.
#'
#' @keywords internal
#' @rdname get_lod_col_index
get_lod_col_index <- function(x, lodcolumn=NULL) {
lodcol.indices <- get_lod_col_indices(x, lodcolumns=lodcolumn)
if ( length(lodcol.indices) > 1L ) {
stop("object has multiple LOD columns - please choose one")
} else if ( length(lodcol.indices) == 0L ) {
stop("no LOD column found")
}
return(lodcol.indices[1L])
}
# get_lod_col_indices
#' Get LOD column indices.
#'
#' @param x An \pkg{R/qtl} \code{scanone}, \code{scanoneperm}, or \code{summary.scanoneperm} object.
#' @param lodcolumns This parameter indicates for which LOD columns indices should be returned. The
#' specified LOD columns must be a character vector of LOD column names, a logical vector with
#' length equal to the number of LOD columns, or a numeric vector of indices \emph{with respect
#' to the set of LOD columns}. If no LOD columns are specified, all LOD column indices are returned.
#' @param strict Option indicating that \code{lodcolumns}, if specified, must request LOD column
#' indices that are unique and in the same order as the LOD columns of \code{x}.
#'
#' @return Vector of LOD column indices.
#'
#' @keywords internal
#' @rdname get_lod_col_indices
get_lod_col_indices <- function(x, lodcolumns=NULL, strict=FALSE) {
UseMethod('get_lod_col_indices', x)
}
# get_lod_col_indices.scanone
#' @method get_lod_col_indices scanone
#' @rdname get_lod_col_indices
get_lod_col_indices.scanone <- function(x, lodcolumns=NULL, strict=FALSE) {
stopifnot( identical(colnames(x)[1:2], c('chr', 'pos')) )
available <- seq_len(ncol(x))[-(1:2)]
names(available) <- colnames(x)[available]
resolved <- get_indices(available, requested=lodcolumns, strict=strict)
indices <- unname(available[resolved])
return(indices)
}
# get_lod_col_indices.scanoneperm
#' @method get_lod_col_indices scanoneperm
#' @rdname get_lod_col_indices
get_lod_col_indices.scanoneperm <- function(x, lodcolumns=NULL, strict=FALSE) {
stopifnot( ncol(x) >= 1L )
available <- seq_len(ncol(x))
names(available) <- colnames(x)
resolved <- get_indices(available, requested=lodcolumns, strict=strict)
indices <- unname(available[resolved])
return(indices)
}
# get_lod_col_indices.summary.scanoneperm
#' @export
#' @method get_lod_col_indices summary.scanoneperm
#' @rdname get_lod_col_indices
get_lod_col_indices.summary.scanoneperm <- function(x, lodcolumns=NULL, strict=FALSE) {
stopifnot( ncol(x) >= 1L )
available <- seq_len(ncol(x))
names(available) <- colnames(x)
resolved <- get_indices(available, requested=lodcolumns, strict=strict)
indices <- unname(available[resolved])
return(indices)
}
# get_run_index_list
#' Get index list of successive runs in a vector.
#'
#' @param x A vector.
#'
#' @return List of integer vectors, each containing indices for a run of repeated values in
#' \code{x}. Each list element takes its name from the corresponding repeated value. Returns
#' an empty list if \code{x} is of length zero.
#'
#' @keywords internal
#' @rdname get_run_index_list
get_run_index_list <- function(x, na.rm=FALSE) {
stopifnot( is.vector(x) )
stopifnot( isTRUE(na.rm) || identical(FALSE, na.rm) )
if ( length(x) > 0L ) {
# Get run-length encoding of vector.
runs <- rle(x)
# Set run names from RLE values.
run.names <- runs$values
# Get number of runs in RLE.
num.runs <- unique(lengths(runs))
# Get last index of each run.
J <- cumsum(runs$lengths)
# Get first index of each run.
if ( num.runs > 1L ) {
I <- c(1L, sapply(J[1:(length(J)-1)], function(j) j + 1))
} else {
I <- 1L
}
# Remove NA values, if specified.
if (na.rm) {
mask <- ! is.na(runs$values)
run.names <- run.names[mask]
I <- I[mask]
J <- J[mask]
}
# Set index list from run index ranges.
index.list <- mapply(function(i, j) i:j, I, J, SIMPLIFY=FALSE)
# Set names of index list from run values.
names(index.list) <- run.names
} else {
index.list <- list()
}
return(index.list)
}
# infer_map_precision
#' Infer precision of map positions.
#'
#' @param x An \pkg{R/qtl} \code{scanone} or \code{map} object.
#' @param tol Tolerance for map position equality.
#'
#' @return Inferred precision of map in \code{x}. Map precision is taken from the map step size, if
#' the majority of the map steps in \code{x} are the same. Otherwise, map precision is set from the
#' smallest distance between any two adjacent loci.
#'
#' @keywords internal
#' @rdname infer_map_precision
infer_map_precision <- function(x, tol=.Machine$double.eps^0.5) {
stopifnot( any( c('map', 'scanone') %in% class(x) ) )
stopifnot( is_single_nonnegative_number(tol) )
if ( 'scanone' %in% class(x) ) {
map.steps <- unlist(lapply(unique(x$chr), function(map.seq)
diff(x[x$chr == map.seq, 'pos'])))
} else { # 'map' %in% class(x)
map.steps <- unlist(unname(lapply(x, function(map.pos) diff(as.numeric(map.pos)))))
}
# Get frequency table of map steps.
step.freqs <- table(map.steps)
# Get numeric step sizes.
step.sizes <- as.numeric(names(step.freqs))
# Get differences between step sizes.
step.diffs <- diff(step.sizes)
# Group step-size values that are very similar.
size.groups <- vector('list')
i <- 1L
while ( i <= length(step.freqs) ) {
j <- i
while ( j < length(step.freqs) && step.diffs[j] < tol ) {
j <- j + 1L
}
size.groups <- append(size.groups, list(i:j))
i <- j + 1L
}
# Merge similar step values.
merged.freqs <- integer(length=length(size.groups))
for ( i in seq_along(size.groups) ) {
g <- unlist(size.groups[i])
merged.freqs[i] <- sum(step.freqs[g])
names(merged.freqs)[i] <- names(sort(step.freqs[g], decreasing=TRUE))[1L]
}
# Sort steps by decreasing frequency.
sorted.freqs <- sort(merged.freqs, decreasing=TRUE)
# If the most frequent step is more frequent than all others combined, set as map precision..
if ( length(sorted.freqs) == 1L || (length(sorted.freqs) > 1L &&
sorted.freqs[1L] >= sum(sorted.freqs[2:length(sorted.freqs)])) ) {
map.precision <- as.numeric(names(sorted.freqs)[1L])
# ..otherwise set smallest step as map precision.
} else {
map.precision <- min(map.steps, na.rm=TRUE)
}
return(map.precision)
}
# is_single_char
#' Test for a single character.
#'
#' @param x Test object.
#'
#' @return \code{TRUE} if \code{x} is a single character; \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname is_single_char
is_single_char <- function(x) {
return( is.character(x) && length(x) == 1L && ! is.na(x) && nchar(x) == 1L )
}
# is_single_nonnegative_number
#' Test for a single non-negative number.
#'
#' @param n Test object.
#'
#' @return \code{TRUE} if \code{x} is a single non-negative number; \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname is_single_nonnegative_number
is_single_nonnegative_number <- function(n) {
return( is.numeric(n) && length(n) == 1L && is.finite(n) && n >= 0.0 )
}
# is_single_nonnegative_whole_number
#' Test for a single non-negative whole number.
#'
#' @param n Test object.
#' @param tol Numeric tolerance.
#'
#' @return \code{TRUE} if \code{x} is a single non-negative whole number; \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname is_single_nonnegative_whole_number
is_single_nonnegative_whole_number <- function(n, tol=.Machine$double.eps^0.5) {
return( is.numeric(n) && length(n) == 1L && is.finite(n) && n >= 0.0 &&
abs(n - round(n)) < abs(tol) )
}
# is_single_positive_whole_number
#' Test for a single positive whole number.
#'
#' @param n Test object.
#' @param tol Numeric tolerance.
#'
#' @return \code{TRUE} if the object is a single positive whole number;
#' \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname is_single_positive_whole_number
is_single_positive_whole_number <- function(n, tol=.Machine$double.eps^0.5) {
return( length(n) == 1 && is.numeric(n) && is.finite(n) &&
n > tol && abs(n - round(n)) < abs(tol) )
}
# is_single_probability
#' Test for a single valid probability.
#'
#' @param n Test object.
#'
#' @return \code{TRUE} if \code{x} is a single valid probability; \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname is_single_probability
is_single_probability <- function(n) {
return( is.numeric(n) && length(n) == 1L && is.finite(n) && n >= 0.0 && n <= 1.0 )
}
# is_single_string
#' Test for a single character string.
#'
#' @param x Test object.
#'
#' @return \code{TRUE} if \code{x} is a single string; \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname is_single_string
is_single_string <- function(x) {
return( is.character(x) && length(x) == 1L && ! is.na(x) )
}
# is_valid_id
#' Test identifier validity.
#'
#' An identifier is considered valid if it is a string
#' containing at least one non-whitespace character.
#'
#' @param x Test vector.
#'
#' @return Logical vector indicating which elements are valid identifiers.
#'
#' @keywords internal
#' @rdname is_valid_id
is_valid_id <- function(x) {
return( is.character(x) & nzchar(gsub('[[:space:]]', '', x)) )
}
# is_whole_number
#' Test for whole numbers.
#'
#' @param n Test vector.
#' @param tol Numeric tolerance.
#'
#' @return Logical vector indicating which elements of \code{n} are whole numbers.
#'
#' @keywords internal
#' @rdname is_whole_number
is_whole_number <- function(n, tol=.Machine$double.eps^0.5) {
return( is.numeric(n) & is.finite(n) & abs(n - round(n)) < abs(tol) )
}
# isBOOL
#' Test for a single logical value.
#'
#' @param x Test object.
#'
#' @return \code{TRUE} if object is a single logical value;
#' \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname isBOOL
isBOOL <- function(x) {
return( isTRUE(x) | isFALSE(x) )
}
# isFALSE
#' Test for a single \code{FALSE} value.
#'
#' @param x Test object.
#'
#' @return \code{TRUE} if object is a single \code{FALSE} value;
#' \code{FALSE} otherwise.
#'
#' @keywords internal
#' @rdname isFALSE
isFALSE <- function(x) {
return( identical(FALSE, x) )
}
# make_geno_matrix
#' Make genotype matrix from sample and founder genotype data.
#'
#' Given input genotype data for cross samples and their founder strains, this
#' function assigns a genotype symbol to each locus according to the inferred
#' combination of founder alleles.
#'
#' @param x Raw sample genotype \code{matrix}.
#' @param y Raw founder genotype \code{matrix}.
#' @param alleles Mapping of founder IDs to founder allele symbols.
#'
#' @return A founder genotype matrix, with genotypes encoded as integers and
#' their corresponding genotype symbols in the attribute \code{'genotypes'}.
#'
#' @keywords internal
#' @rdname make_geno_matrix
make_geno_matrix <- function(x, y, alleles=NULL) {
founder.allele.charset <- c(LETTERS, letters)
x.samples <- rownames(x)
x.snps <- colnames(x)
y.samples <- rownames(y)
y.snps <- colnames(y)
# Check upper limit of founder sample count.
# NB: absolute upper limit is length of founder.allele.charset
if ( length(y.samples) != 2 ) {
stop("unsupported number of founders - '", length(y.samples), "'")
}
if ( ! is.null(alleles) ) {
stopifnot( is_mapping(alleles) )
founder.samples <- mapping_keys(alleles) # NB: ensures unique sample IDs
founder.symbols <- as.character( mapping_values(alleles) )
unknown <- founder.samples[ ! founder.samples %in% y.samples ]
if ( length(unknown) > 0 ) {
stop("allele symbols specified for unknown founder samples - '",
toString(unknown), "'")
}
unfound <- y.samples[ ! y.samples %in% founder.samples ]
if ( length(unfound) > 0 ) {
stop("allele symbols not specified for founder samples - '",
toString(unfound), "'")
}
dup.alleles <- founder.symbols[ duplicated(founder.symbols) ]
if ( length(dup.alleles) > 0 ) {
stop("duplicate founder alleles - '", toString(dup.alleles), "'")
}
err.alleles <- founder.symbols[ ! founder.symbols %in% founder.allele.charset ]
if ( length(err.alleles) > 0 ) {
stop("invalid founder alleles - '", toString(err.alleles), "'")
}
names(founder.symbols) <- founder.samples
allele.symbols <- sort( founder.symbols )
} else {
allele.symbols <- founder.allele.charset[ seq_along(y.samples) ]
names(allele.symbols) <- y.samples
}
# Keep only loci common to samples and founders.
common.snps <- intersect(x.snps, y.snps)
x <- x[, common.snps, drop=FALSE]
y <- y[, common.snps, drop=FALSE]
# Init genotype matrix.
geno.mat <- matrix(NA_character_, nrow=nrow(x), ncol=ncol(x),
dimnames=dimnames(x))
for ( j in get_col_indices(geno.mat) ) {
# Get sample symbols and genotypes for this
# locus, skip if effectively monomorphic.
x.symbols <- x[, j]
uniq.x.symbols <- unique(x.symbols)
uniq.x.geno <- uniq.x.symbols[ ! is.na(uniq.x.symbols) ]
if ( length(uniq.x.geno) < 2 ) {
next
}
# Get founder genotypes for this locus,
# skip if any are missing.
y.genotypes <- y[, j]
if ( anyNA(y.genotypes) ) {
next
}
# Get founder alleles, skip if monomorphic or heterozygous.
y.alleles <- lapply(y.genotypes, function(gt)
unique(strsplit(gt, '')[[1]]))
if ( anyDuplicated(y.alleles) || any( lengths(y.alleles) > 1 ) ) {
next
}
y.alleles <- unlist(y.alleles)
# Assign locus genotypes from matching founder.
x.calls <- lapply(x.symbols, function(gt) {
x.alleles <- strsplit(gt, '')[[1]]
x.indices <- match(x.alleles, y.alleles)
if ( anyNA(x.indices) ) {
x.genotype <- NA_character_
} else {
x.fdr.names <- names(y.alleles)[x.indices]
# Sort genotype alleles, since 'BA'
# is considered equivalent to 'AB'.
x.fdr.geno <- sort(allele.symbols[x.fdr.names])
x.genotype <- paste0(x.fdr.geno, collapse='')
}
})
# Skip if monomorphic after matching to founder alleles.
uniq.x.calls <- unique(x.calls[ ! is.na(x.calls) ])
if ( length(uniq.x.calls) < 2 ) {
next
}
geno.mat[, j] <- unlist(x.calls)
}
# Remove null loci.
geno.mat <- remove_na_cols(geno.mat)
if ( ncol(geno.mat) == 0 ) {
stop("cannot make genotype matrix - no suitable loci found")
}
return(geno.mat)
}
# make_snp_marker_ids
#' Make SNP marker IDs for loci.
#'
#' @description This function creates SNP marker
#' IDs from locus \code{data.frame} \code{'loc'}.
#'
#' @param loc Locus \code{data.frame}, with columns \code{'chr'} and
#' \code{'pos'}, specifying physical map positions in base-pair units.
#' @param max.seqlength Maximum expected sequence length. This determines
#' the width to which SNP positions are zero-padded. By default, this is
#' the maximum position in the input \code{data.frame}.
#'
#' @return Character vector of SNP marker IDs.
#'
#' @author Thomas A. Walsh
#' @author Yue Hu
#'
#' @keywords internal
#' @rdname make_snp_marker_ids
make_snp_marker_ids <- function(loc, max.seqlength=NULL) {
stopifnot( is.data.frame(loc) )
stopifnot( nrow(loc) > 0 )
loc.pos <- loc$pos
max.pos <- max(loc.pos, na.rm=TRUE)
stopifnot( is_single_positive_whole_number(max.pos) )
max.pos.width <- floor(log10(max.pos)) + 1
pad.width <- NULL
if ( ! is.null(max.seqlength) ) {
stopifnot( is_single_positive_whole_number(max.seqlength) )
if ( max.pos <= max.seqlength ) {
pad.width <- floor(log10(max.seqlength)) + 1
} else {
warning("input position (", format(max.pos, scientific=FALSE), "",
") exceeds specified maximum sequence length (",
format(max.seqlength, scientific=FALSE), "), padding to ",
max.pos.width, " digits")
}
}
if ( is.null(pad.width) ) {
pad.width <- max.pos.width
}
pad.fmt.str <- paste0('%0', pad.width, 'd')
loc.pos <- sprintf(pad.fmt.str, loc.pos)
return( paste0(loc$chr, ':', loc.pos) )
}
# parse_snp_marker_ids
#' Parse SNP marker IDs.
#'
#' This function parses an input vector of SNP marker IDs, and returns a \code{data.frame} with
#' the locus in each row derived from the corresponding marker ID in the input vector. An error
#' is raised if any of the input values cannot be parsed as a SNP marker ID.
#'
#' @param ids Vector of SNP marker IDs.
#'
#' @return A \code{data.frame} with loci
#' corresponding to the SNP marker IDs.
#'
#' @keywords internal
#' @rdname parse_snp_marker_ids
parse_snp_marker_ids <- function(ids) {
snp_marker_id_patt = '^([^:]+):([[:digit:]]+)$'
m <- regexec(snp_marker_id_patt, ids)
regmatch.list <- regmatches(ids, m)
valid.snp.marker.ids <- lengths(regmatch.list) > 0
stopifnot( all(valid.snp.marker.ids) )
snp.seqs <- sapply(regmatch.list, getElement, 2)
snp.pos <- sapply(regmatch.list, function(x) as.numeric(x[3]))
return( data.frame(chr=snp.seqs, pos=snp.pos, row.names=ids, stringsAsFactors=FALSE) )
}
# read_snps_from_vcf
#' Read SNP genotypes from VCF files.
#'
#' This function reads SNP genotype data from one or more VCF files, and returns
#' these as a \code{matrix} of SNP genotypes - effectively variant base calls.
#'
#' @param infiles Input VCF file paths.
#' @param samples Optional vector of samples for which SNP genotypes should be obtained.
#' If not specified, genotypes are returned for all available samples.
#' @param max.seqlength Optional parameter to indicate the maximum reference sequence length, which
#' is used to determine the zero-padded width of genomic positions in SNP marker IDs. Without this
#' information, SNP marker IDs may be formatted inconsistently in different datasets.
#' @param require.all Remove variants that are not completely genotyped
#' with respect to the given samples.
#' @param require.any Remove variants that do not have at least one genotype
#' call among the given samples.
#' @param require.polymorphic Remove variants that do not have at least two
#' different genotype calls among the given samples.
#'
#' @return A \code{matrix} object containing SNP genotype data.
#'
#' @keywords internal
#' @rdname read_snps_from_vcf
read_snps_from_vcf <- function(infiles, samples=NULL, max.seqlength=NULL,
require.all=FALSE, require.any=FALSE,
require.polymorphic=FALSE) {
stopifnot( length(infiles) > 0 )
stopifnot( isBOOL(require.all) )
stopifnot( isBOOL(require.any) )
stopifnot( isBOOL(require.polymorphic) )
# Get samples in each input VCF file.
sample.list <- lapply(infiles, read_samples_from_vcf)
# If samples specified, filter sample list, keep only those specified.
if ( ! is.null(samples) ) {
stopifnot( is.character(samples) )
stopifnot( length(samples) > 0 )
for ( i in seq_along(infiles) ) {
sample.list[[i]] <- sample.list[[i]][ sample.list[[i]] %in% samples ]
}
unfound <- samples[ ! samples %in% unlist(sample.list) ]
if ( length(unfound) > 0 ) {
stop("samples not found - '", toString(unfound), "'")
}
}
samples <- unlist(sample.list)
if ( length(samples) == 0 ) {
stop("no samples found")
}
dup.samples <- samples[ duplicated(samples) ]
if ( length(dup.samples) > 0 ) {
stop("duplicate samples - '", toString(dup.samples), "'")
}
invalid.ids <- samples[ ! is_valid_id(samples) ]
if ( length(invalid.ids) > 0 ) {
stop("invalid sample IDs - '", toString(invalid.ids), "'")
}
# Keep only files relevant for the given samples.
relevant <- lengths(sample.list) > 0
sample.list <- sample.list[relevant]
infiles <- infiles[relevant]
# Init list of SNP data.
snps <- vector('list', length(infiles))
# Init list for mapping genotype strings to allele indices.
geno.memo <- list()
# Read SNP genotypes from each input VCF file.
for ( i in seq_along(infiles) ) {
file.samples <- sample.list[[i]]
infile <- infiles[[i]]
# Set VCF parameters.
param <- VariantAnnotation::ScanVcfParam(fixed=c('ALT', 'QUAL'),
geno='GT', samples=file.samples)
# Read VCF.
vcf <- VariantAnnotation::readVcf(infile, param=param)
stopifnot( length(vcf) > 0 )
# Get variant data from VCF.
var.seqs <- as.character( GenomeInfoDb::seqnames(vcf) ) # reference sequence
var.pos <- BiocGenerics::start( IRanges::ranges(vcf) ) # position
var.ref <- as.character( VariantAnnotation::ref(vcf) ) # reference allele
var.alt <- lapply( IRanges::CharacterList( # alternative alleles
VariantAnnotation::alt(vcf) ), as.character)
var.geno <- VariantAnnotation::geno(vcf) # genotype info
geno.mat <- t( var.geno$GT ) # genotype calls
# Set indices of SNP variants.
snp.indices <- which( var.ref %in% Biostrings::DNA_BASES &
sapply(var.alt, function(x) all(x %in% Biostrings::DNA_BASES)) )
num.snps <- length(snp.indices)
stopifnot( num.snps > 0 )
# Filter genotype data to retain only SNP variants.
geno.mat <- geno.mat[, snp.indices, drop=FALSE]
# Combine REF and ALT alleles for each SNP.
var.alleles <- lapply(snp.indices, function(j)
c(var.ref[j], var.alt[[j]]))
# Create dataframe with variant locus info.
snp.loc <- data.frame(chr=var.seqs[snp.indices],
pos=var.pos[snp.indices])
# If maximum reference sequence length not
# specified, try to get that info from VCF.
if ( is.null(max.seqlength) ) {
seqinfo <- VariantAnnotation::seqinfo(vcf)
obs.seqnames <- unique(var.seqs)
known.seqnames <- GenomeInfoDb::seqnames(seqinfo)
unknown.seqnames <- setdiff(obs.seqnames, known.seqnames)
if ( ! anyNA(obs.seqnames) && length(unknown.seqnames) == 0 ) {
known.seqlengths <- stats::setNames(GenomeInfoDb::seqlengths(seqinfo),
known.seqnames)
obs.seqlengths <- known.seqlengths[obs.seqnames]
max.obs.seqlength <- max(obs.seqlengths, na.rm=TRUE)
if ( is_single_positive_whole_number(max.obs.seqlength) ) {
max.seqlength <- max.obs.seqlength
}
}
}
if ( is.null(max.seqlength) ) {
warning("maximum reference sequence length is unknown - ",
"marker IDs may be inconsistent between datasets")
}
# Get marker IDs for SNPs in this file.
file.snps <- make_snp_marker_ids(snp.loc, max.seqlength=max.seqlength)
# Check for multiple variants coinciding at same locus.
# TODO: handle coinciding variants
if ( anyDuplicated(file.snps) ) {
stop("coinciding variants in file - '", infile, "'")
}
colnames(geno.mat) <- file.snps
# Resolve raw genotypes for each variant as concatenated base calls.
for ( j in 1:num.snps ) {
# Get vector of alleles for this variant record.
rec.alleles <- var.alleles[[j]]
# Get genotype strings for each sample in this variant record.
geno.data <- unname( geno.mat[, j] )
# Resolve previously unseen genotype strings
# to their corresponding allele indices.
for ( unique.call in unique(geno.data) ) {
if ( ! unique.call %in% names(geno.memo) ) {
# Convert allele indices from 0-offset strings to
# 1-offset integers, VCF missing '.' to NA value.
indicesC <- unlist( strsplit(unique.call, '[/|]') )
indices0 <- suppressWarnings( as.integer(indicesC) )
indices1 <- indices0 + 1
geno.memo[[unique.call]] <- indices1
}
}
# Resolve genotype calls for this variant record.
geno.calls <- lapply(geno.data, function(geno.str) {
alel.idxs <- geno.memo[[geno.str]]
alel.calls <- rec.alleles[alel.idxs]
if ( anyNA(alel.calls) ) {
geno.call <- NA_character_
} else {
geno.call <- paste0(alel.calls, collapse='')
}
})
geno.mat[, j] <- unlist(geno.calls)
}
# Set file variant data, sorting columns
# to facilitate subsequent merging.
snps[[i]] <- geno.mat[, order(colnames(geno.mat)), drop=FALSE]
}
# If multiple relevant files, combine SNP genotypes,
# taking the union of all variants in input files..
if ( length(infiles) > 1 ) {
# Prep combined variant data.
combined.samples <- sort( unique( unlist( lapply(snps, rownames) ) ) )
combined.snps <- sort( unique( unlist( lapply(snps, colnames) ) ) )
combined.names <- list(combined.samples, combined.snps)
# Set combined variant data from each input file.
# NB: we previously checked for duplicate samples and coinciding
# variants, so we can assume that there will be no conflicts.
result <- matrix(NA_character_, nrow=length(combined.samples),
ncol=length(combined.snps), dimnames=combined.names)
for ( i in seq_along(snps) ) {
dest.idxs <- match(colnames(snps[[i]]), colnames(result))
for ( sample.id in rownames(snps[[i]]) ) {
result[sample.id, dest.idxs] <- snps[[i]][sample.id, ]
}
}
} else { # ..otherwise take single set of SNP genotypes.
result <- snps[[1]]
}
# Get combined number of SNP variants.
num.snps <- ncol(result)
stopifnot( num.snps > 0 )
# If filters specified, filter SNP variants.
if ( require.all || require.any || require.polymorphic ) {
mask <- rep(TRUE, num.snps)
if (require.all) { # Remove incomplete variants.
mask <- mask & sapply(1:num.snps, function(i)
! anyNA(result[, i])
)
}
if (require.any) { # Remove variants without genotypes.
mask <- mask & sapply(1:num.snps, function(i)
! all( is.na(result[, i]) )
)
}
if (require.polymorphic) { # Remove monomorphic variants.
mask <- mask & sapply(1:num.snps, function(i) {
geno.data <- result[, i]
g.symbols <- unique(geno.data)
genotypes <- g.symbols[ ! is.na(g.symbols) ]
return( length(genotypes) > 1 )
})
}
# Apply filter mask.
result <- result[, mask, drop=FALSE]
}
return(result)
}
# remove_na_cols
#' Remove columns containing only \code{NA} values.
#'
#' @param x A \code{data.frame} or \code{matrix}.
#'
#' @return Input object without those columns that contain only \code{NA} values.
#'
#' @keywords internal
#' @rdname remove_na_cols
remove_na_cols <- function(x) {
stopifnot( is.data.frame(x) || is.matrix(x) )
stopifnot( nrow(x) > 0 )
mask <- ! apply( x, 2, function(column) all( is.na(as.vector(column)) ) )
x <- x[, mask, drop=FALSE]
return(x)
}
# End of internal.R
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.