R/func__importCoreGenomeSNPs.R

Defines functions importCoreGenomeSNPs

Documented in importCoreGenomeSNPs

#' @title Import genotypes of core-genome SNPs
#'
#' @description Read the SNP table usually produced by the parseSNPtable.py of RedDog (https://github.com/katholt/RedDog).
#' The function also compresses the SNP matrix into a pattern matrix. Notice it is the user's responsibility to ensure the
#' input SNP table only contains 100% conserved SNPs. This function does not check for the conservation status.
#'
#' @param snps either a path to the SNP table or a list generated by this function previously
#' @param snps.delim a single character for the delimiter in the SNP table to be imported
#' @param pos.col either a name or an index of the column for SNP positions
#' @param ref.col (optional) A string specifying the column for SNPs of the reference genome.
#' @param replace.ref Replace the column name specified by ref.col with this argument when ref.col is found amonst column names.
#'     Such a column is seen in the SNP table created using RedDog. A user may want to substitute
#'     it with a genuine strain name to match the reference isolate in the SNP matrix with that in the genetic and allelic matrices.
#' @param min.mac the minimal number of times that each minor allele occurs in the population (with outlier isolates excluded)
#'     min.mac = 1: no filter of SNPs by the minor allele count (MAC) is applied.
#'     min.mac = 2: the filter is applied to remove any isolate-specific SNPs.
#'     min.mac > 2: the filter is applied for other purposes.
#' @param ingroup a character vector of isolate names to be included in the SNP matrix
#' @param outliers a character vector of names for outlier isolates (to be excluded from the SNP matrix)
#' @param G.file file path of a bimbam-formatted genotype matrix (G). Keep it an empty character ("" by default) to prevent the function from writing this genotype matrix to the hard disk.
#' @param annots.file file path of bimbam-formatted SNP annotations. Keep it "" to prevent the function from writing SNP annotations to the hard disk.
#' @param skip whether to skip overwriting existing output files.
#'
#' @author Yu Wan (\email{wanyuac@@126.com})
#' @export
#
# Copyright 2017 Yu Wan <wanyuac@126.com>
# Licensed under the Apache License, Version 2.0
# Last edition: 21 May 2017

importCoreGenomeSNPs <- function(snps, snps.delim = ",", pos.col = "Pos", ref.col = "Ref",
                                 replace.ref = NULL, min.mac = 1, ingroup = NULL,
                                 outliers = NULL, G.file = "", annots.file = "",
                                 skip = TRUE) {
    snpData.class <- class(snps)
    if (snpData.class == "character") {  # when a file name is provided
        require(data.table)

        print(paste0("Reading the SNP table from the file ", snps))
        snps.core <- fread(input = snps, sep = snps.delim, header = TRUE,
                           data.table = FALSE, stringsAsFactors = FALSE)  # returns a large data frame

        print("Processing core-genome SNPs.")

        # get the column name of SNP positions when pos.col is an integer
        if (class(pos.col) == "integer") {
            pos.col <- names(snps.core)[pos.col]  # get the column name using an index
        }

        # substitute the column name specified by ref.column with its genuine strain name when the reference column is present and the substitution is specified
        if (is.character(replace.ref)) {
            if (ref.col != "") {
                i <- which(names(snps.core) == ref.col)  # length(i) = 0 when replace.ref = "" or ref.col is not present
                if (replace.ref != "" && length(i) == 1) {  # length(i) actually equals one when the ref.col column is found because R does not allow duplicated column names.
                    print(paste0("Replacing the column name ", ref.col, " with ", replace.ref, "."))
                    names(snps.core)[i] <- replace.ref
                } else {
                    print("Warning: no column name is replaced as the value of replace.ref equals '' or the column name for the reference genotypes is absent in the SNP matrix.")
                }
            } else {
                print("Warning: no column name is replaced for the reference genotypes because the corresponding column is absent in the SNP matrix.")
            }
        } else {  # default
            print("Warning: no column name is replaced for the reference strain because the argument for replace.ref is not set or invalid.")
        }

        # construct a data frame for SNP annotations
        # Here, the maximum of the SNP id reveals the number of SNPs in the original SNP table.
        # Later, rows of this data frame will be selected for allele counts, etc.
        n.snps <- nrow(snps.core)
        # Arbitarily assign a chromosome ID, which is essentially trivial.
        # However, as the same as the BUGWAS package does, I hacked the index of the human Y chromosome here,
        # which is 24 by convention, to emphasise the haploidy of bacterial genomes.
        snp.annots <- data.frame(id = paste0("rs", 1 : n.snps),
                                 pos = snps.core[, pos.col],
                                 chr = rep(24, times = n.snps),
                                 stringsAsFactors = FALSE)  # rs1, rs2, ..., rsN

        # filter isolates as specified, but keep the row (SNP) number intact
        # Since we define core-genome SNPs as those preserved in all strains,
        # the remaining SNPs after strain removal are still core-genome SNPs.
        if (!is.null(outliers)) {
            snps.core <- snps.core[, !(names(snps.core) %in% outliers)]  # exclude columns corresponding to outlier isolates (including the outgroup isolate)
        }
        if (!is.null(ingroup)) {
            snps.core <- snps.core[, c(pos.col, ingroup)]  # only keep isolates on the "ingroup" list
        }
        print(paste("There are", ncol(snps.core) - 1, "isolates in the SNP matrix.", sep = " "))

        # drop the position column from the data frame as we no longer need it
        # The "which" statement always holds even if SNP positions are not stored in the first column (Its index may change when some columns are excluded in this case).
        snps.core <- snps.core[, -which(names(snps.core) == pos.col)]  # Now all column names are isolate names.

        # prepare the genotype matrix of biallelic SNPs
        snps.core <- t(as.matrix(snps.core))  # convert the data frame into a matrix and transpose it to make columns SNP names
        colnames(snps.core) <- snp.annots$id  # Now columns (SNPs) are treated as variables. Row names are isolate names.
        snps.var <- apply(snps.core, 2, function(x) length(unique(x)))  # returns a named integer vector about the number of alleles that every SNP has
        snps.bi <- snps.core[, as.integer(snps.var) == 2]  # extract biallelic SNPs
        snp.alleles <- .getAllelePairs(snps.bi)  # get major and minor alleles of biallelic SNPs
        G <- .encodeAlleles(snps.bi, snp.alleles)  # encode alleles of core-genome biallelic SNPs and return the un-centred genotype matrix G

        # remove any SNP whose minor allele does not occur at a minimal number of times. Default: off
        # This step does not affect the sample size. Hence nrow(snps.core) = nrow(G) = sample size at this stage.
        mac <- apply(G, 2, sum)  # get minor-allele counts (Major alleles are coded as 0)
		print(paste("There are", length(mac), "SNPs in the SNP matrix G.", sep = " "))
        if (min.mac > 1) {
			print(paste("Filtering out SNPs whose minor alleles occur less than", min.mac, "times.", sep = " "))
            G <- G[, as.integer(mac) >= min.mac]  # drop SNPs that do not occur at a minimum of min.mac times. It excludes isolate-specific SNPs when min.mac = 2.
			print(paste0(ncol(G), " SNPs passed this filter."))
        }

        # match SNP annotations with G because the latter only covers biallelic SNPs
        snp.annots <- subset(snp.annots, id %in% colnames(G))
        snp.alleles <- snp.alleles[, snp.annots$id]  # Only biallelic SNPs in G will be retained in this matrix as well.

        # zero-centring of the SNP matrix
        S <- scale(G, center = TRUE, scale = FALSE)  # the centred genotype matrix of biallelic SNPs

        # generate a BIMBAM-compatible SNP-genotype file for GEMMA ===============
        G.bimbam <- as.data.frame(t(G), stringsAsFactors = FALSE)  # no centring of SNP genotypes here as GEMMA does this
        G.bimbam <- cbind.data.frame(id = rownames(G.bimbam), minor = snp.alleles["minor", ],
                                     major = snp.alleles["major", ], G.bimbam, stringsAsFactors = FALSE)
        rownames(G.bimbam) <- NULL  # already copied to G.bimbam$id
        if (G.file != "") {
            .writeData(x = G.bimbam, output = G.file, message = "[SNP genotype matrix G]", skip = skip)  # the BIMBAM-compatible SNP genotype file (tab-delimited)
        }

        # save a BIMBAM-formatted text file of SNP annotations for GEMMA ===============
        if (annots.file != "") {
            .writeData(x = snp.annots, output = annots.file, message = "[SNP annotations]",
                       skip = skip)  # the BIMBAM-compatible SNP annotation file
        }

        # finally, wrap results into a single list
        snps <- list(G = G, S = S, G.bimbam = G.bimbam,
                     annots = snp.annots, snp.alleles = snp.alleles, mac = mac,
                     core = snps.core, bi = snps.bi, var = snps.var)
    } else if (snpData.class == "list") {  # Alternatively, the complete results must be provided.
        if (sum(names(snps) %in% c("G", "S", "G.bimbam", "annots", "snp.alleles", "mac",
                                   "core", "bi", "var")) == 9) {
            print("Skip importing core-genome SNPs because the curated SNP data are already provided.")
            if (G.file != "") {
                .writeData(x = snps[["G.bimbam"]], output = G.file, message = "[SNP genotype matrix G]",
                           skip = skip)  # the BIMBAM-compatible SNP genotype file (tab-delimited)
            }
            if (annots.file != "") {
                .writeData(x = snps[["annots"]], output = annots.file, message = "[SNP annotations]",
                           skip = skip)
            }
        } else {
            stop("Input error of snps: it is a list but does not have all elements.")
        }
    } else {
        stop("Input error: requiring either a list or a file path to your SNP table.")
    }

    return(snps)
}
wanyuac/GeneMates documentation built on Aug. 12, 2022, 7:37 a.m.