#' @title Import genotypes of core-genome SNPs
#'
#' @description Read the SNP table usually produced by the parseSNPtable.py of RedDog (https://github.com/katholt/RedDog).
#' The function also compresses the SNP matrix into a pattern matrix. Notice it is the user's responsibility to ensure the
#' input SNP table only contains 100% conserved SNPs. This function does not check for the conservation status.
#'
#' @param snps either a path to the SNP table or a list generated by this function previously
#' @param snps.delim a single character for the delimiter in the SNP table to be imported
#' @param pos.col either a name or an index of the column for SNP positions
#' @param ref.col (optional) A string specifying the column for SNPs of the reference genome.
#' @param replace.ref Replace the column name specified by ref.col with this argument when ref.col is found amonst column names.
#' Such a column is seen in the SNP table created using RedDog. A user may want to substitute
#' it with a genuine strain name to match the reference isolate in the SNP matrix with that in the genetic and allelic matrices.
#' @param min.mac the minimal number of times that each minor allele occurs in the population (with outlier isolates excluded)
#' min.mac = 1: no filter of SNPs by the minor allele count (MAC) is applied.
#' min.mac = 2: the filter is applied to remove any isolate-specific SNPs.
#' min.mac > 2: the filter is applied for other purposes.
#' @param ingroup a character vector of isolate names to be included in the SNP matrix
#' @param outliers a character vector of names for outlier isolates (to be excluded from the SNP matrix)
#' @param G.file file path of a bimbam-formatted genotype matrix (G). Keep it an empty character ("" by default) to prevent the function from writing this genotype matrix to the hard disk.
#' @param annots.file file path of bimbam-formatted SNP annotations. Keep it "" to prevent the function from writing SNP annotations to the hard disk.
#' @param skip whether to skip overwriting existing output files.
#'
#' @author Yu Wan (\email{wanyuac@@126.com})
#' @export
#
# Copyright 2017 Yu Wan <wanyuac@126.com>
# Licensed under the Apache License, Version 2.0
# Last edition: 21 May 2017
importCoreGenomeSNPs <- function(snps, snps.delim = ",", pos.col = "Pos", ref.col = "Ref",
replace.ref = NULL, min.mac = 1, ingroup = NULL,
outliers = NULL, G.file = "", annots.file = "",
skip = TRUE) {
snpData.class <- class(snps)
if (snpData.class == "character") { # when a file name is provided
require(data.table)
print(paste0("Reading the SNP table from the file ", snps))
snps.core <- fread(input = snps, sep = snps.delim, header = TRUE,
data.table = FALSE, stringsAsFactors = FALSE) # returns a large data frame
print("Processing core-genome SNPs.")
# get the column name of SNP positions when pos.col is an integer
if (class(pos.col) == "integer") {
pos.col <- names(snps.core)[pos.col] # get the column name using an index
}
# substitute the column name specified by ref.column with its genuine strain name when the reference column is present and the substitution is specified
if (is.character(replace.ref)) {
if (ref.col != "") {
i <- which(names(snps.core) == ref.col) # length(i) = 0 when replace.ref = "" or ref.col is not present
if (replace.ref != "" && length(i) == 1) { # length(i) actually equals one when the ref.col column is found because R does not allow duplicated column names.
print(paste0("Replacing the column name ", ref.col, " with ", replace.ref, "."))
names(snps.core)[i] <- replace.ref
} else {
print("Warning: no column name is replaced as the value of replace.ref equals '' or the column name for the reference genotypes is absent in the SNP matrix.")
}
} else {
print("Warning: no column name is replaced for the reference genotypes because the corresponding column is absent in the SNP matrix.")
}
} else { # default
print("Warning: no column name is replaced for the reference strain because the argument for replace.ref is not set or invalid.")
}
# construct a data frame for SNP annotations
# Here, the maximum of the SNP id reveals the number of SNPs in the original SNP table.
# Later, rows of this data frame will be selected for allele counts, etc.
n.snps <- nrow(snps.core)
# Arbitarily assign a chromosome ID, which is essentially trivial.
# However, as the same as the BUGWAS package does, I hacked the index of the human Y chromosome here,
# which is 24 by convention, to emphasise the haploidy of bacterial genomes.
snp.annots <- data.frame(id = paste0("rs", 1 : n.snps),
pos = snps.core[, pos.col],
chr = rep(24, times = n.snps),
stringsAsFactors = FALSE) # rs1, rs2, ..., rsN
# filter isolates as specified, but keep the row (SNP) number intact
# Since we define core-genome SNPs as those preserved in all strains,
# the remaining SNPs after strain removal are still core-genome SNPs.
if (!is.null(outliers)) {
snps.core <- snps.core[, !(names(snps.core) %in% outliers)] # exclude columns corresponding to outlier isolates (including the outgroup isolate)
}
if (!is.null(ingroup)) {
snps.core <- snps.core[, c(pos.col, ingroup)] # only keep isolates on the "ingroup" list
}
print(paste("There are", ncol(snps.core) - 1, "isolates in the SNP matrix.", sep = " "))
# drop the position column from the data frame as we no longer need it
# The "which" statement always holds even if SNP positions are not stored in the first column (Its index may change when some columns are excluded in this case).
snps.core <- snps.core[, -which(names(snps.core) == pos.col)] # Now all column names are isolate names.
# prepare the genotype matrix of biallelic SNPs
snps.core <- t(as.matrix(snps.core)) # convert the data frame into a matrix and transpose it to make columns SNP names
colnames(snps.core) <- snp.annots$id # Now columns (SNPs) are treated as variables. Row names are isolate names.
snps.var <- apply(snps.core, 2, function(x) length(unique(x))) # returns a named integer vector about the number of alleles that every SNP has
snps.bi <- snps.core[, as.integer(snps.var) == 2] # extract biallelic SNPs
snp.alleles <- .getAllelePairs(snps.bi) # get major and minor alleles of biallelic SNPs
G <- .encodeAlleles(snps.bi, snp.alleles) # encode alleles of core-genome biallelic SNPs and return the un-centred genotype matrix G
# remove any SNP whose minor allele does not occur at a minimal number of times. Default: off
# This step does not affect the sample size. Hence nrow(snps.core) = nrow(G) = sample size at this stage.
mac <- apply(G, 2, sum) # get minor-allele counts (Major alleles are coded as 0)
print(paste("There are", length(mac), "SNPs in the SNP matrix G.", sep = " "))
if (min.mac > 1) {
print(paste("Filtering out SNPs whose minor alleles occur less than", min.mac, "times.", sep = " "))
G <- G[, as.integer(mac) >= min.mac] # drop SNPs that do not occur at a minimum of min.mac times. It excludes isolate-specific SNPs when min.mac = 2.
print(paste0(ncol(G), " SNPs passed this filter."))
}
# match SNP annotations with G because the latter only covers biallelic SNPs
snp.annots <- subset(snp.annots, id %in% colnames(G))
snp.alleles <- snp.alleles[, snp.annots$id] # Only biallelic SNPs in G will be retained in this matrix as well.
# zero-centring of the SNP matrix
S <- scale(G, center = TRUE, scale = FALSE) # the centred genotype matrix of biallelic SNPs
# generate a BIMBAM-compatible SNP-genotype file for GEMMA ===============
G.bimbam <- as.data.frame(t(G), stringsAsFactors = FALSE) # no centring of SNP genotypes here as GEMMA does this
G.bimbam <- cbind.data.frame(id = rownames(G.bimbam), minor = snp.alleles["minor", ],
major = snp.alleles["major", ], G.bimbam, stringsAsFactors = FALSE)
rownames(G.bimbam) <- NULL # already copied to G.bimbam$id
if (G.file != "") {
.writeData(x = G.bimbam, output = G.file, message = "[SNP genotype matrix G]", skip = skip) # the BIMBAM-compatible SNP genotype file (tab-delimited)
}
# save a BIMBAM-formatted text file of SNP annotations for GEMMA ===============
if (annots.file != "") {
.writeData(x = snp.annots, output = annots.file, message = "[SNP annotations]",
skip = skip) # the BIMBAM-compatible SNP annotation file
}
# finally, wrap results into a single list
snps <- list(G = G, S = S, G.bimbam = G.bimbam,
annots = snp.annots, snp.alleles = snp.alleles, mac = mac,
core = snps.core, bi = snps.bi, var = snps.var)
} else if (snpData.class == "list") { # Alternatively, the complete results must be provided.
if (sum(names(snps) %in% c("G", "S", "G.bimbam", "annots", "snp.alleles", "mac",
"core", "bi", "var")) == 9) {
print("Skip importing core-genome SNPs because the curated SNP data are already provided.")
if (G.file != "") {
.writeData(x = snps[["G.bimbam"]], output = G.file, message = "[SNP genotype matrix G]",
skip = skip) # the BIMBAM-compatible SNP genotype file (tab-delimited)
}
if (annots.file != "") {
.writeData(x = snps[["annots"]], output = annots.file, message = "[SNP annotations]",
skip = skip)
}
} else {
stop("Input error of snps: it is a list but does not have all elements.")
}
} else {
stop("Input error: requiring either a list or a file path to your SNP table.")
}
return(snps)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.