#' Check genotype object
#'
#' This function checks the genotype object before passing the SNPs to `scoreInvHap`. The
#' function removes SNPs with different alleles or different allele frequencies. Nonetheless,
#' it is possible that these SNPs could be recovered after an examination of the results. Be
#' aware that testing of allele frequencies might fail for small datasets.
#'
#' @export
#' @param SNPobj List with SNPs data from plink or \code{VCF-class}.
#' @param checkAlleleFreqs Should allele frequencies be check (Default: TRUE)
#' @return List containing the SNPs prepared for \code{scoreInvHap}
#' \itemize{
#' \item{genos: Object with genotype data ready for scoreInvHap}
#' \item{wrongAlleles: Character vector with the SNPs discarded due to having alleles different to reference}
#' \item{wrongFreqs: Character vector with the SNPs discarded due to having allele frequencies different to reference}
#' }
#' @examples
#'
#' ## Run method
#' if(require(VariantAnnotation)){
#' vcf <- readVcf(system.file("extdata", "example.vcf", package = "scoreInvHap"), "hg19")
#' resList <- checkSNPs(vcf)
#' resList
#' }
checkSNPs <- function(SNPobj, checkAlleleFreqs = TRUE){
info <- scoreInvHap::info
## Check VCF
if (is(SNPobj, "VCF")){
dupSNPs <- rownames(SNPobj)[duplicated(rownames(SNPobj))]
SNPobj <- SNPobj[!rownames(SNPobj) %in% dupSNPs, ]
## Remove variants with length != 1
SNPobj <- SNPobj[width(SNPobj) == 1, ]
## Remove variants with more than one alternative allele
SNPobj <- SNPobj[lengths(rowRanges(SNPobj)$ALT) == 1, ]
# Filter SNPs with bad imputation quality (R2 < 0.3)
if ("R2" %in% colnames(VariantAnnotation::info(SNPobj))){
SNPobj <- SNPobj[VariantAnnotation::info(SNPobj)$R2 > 0.3, ]
}
ranges <- SummarizedExperiment::rowRanges(SNPobj)
rownames(SNPobj) <- paste(seqnames(ranges), start(ranges), sep = ":")
## Select SNPs in scoreInvHap
SNPobj <- SNPobj[rownames(SNPobj) %in% rownames(info), ]
## Check alleles
ranges <- SummarizedExperiment::rowRanges(SNPobj)
ref <- as.character(ranges$REF)
alt <- as.character(unlist(ranges$ALT))
info <- info[rownames(SNPobj),]
badMask <- (ref != info$Ref & ref != info$Alt) | (alt != info$Ref & alt != info$Alt)
badSNPs <- NULL
## Remove SNPs with wrong alleles
if (sum(badMask) > 0){
badSNPs <- rownames(SNPobj)[badMask]
SNPobj <- SNPobj[!badMask, ]
info <- info[!badMask, , drop = FALSE]
}
badFreq <- NULL
## Check allele Freqs
if(checkAlleleFreqs & nrow(SNPobj) > 1){
ranges <- SummarizedExperiment::rowRanges(SNPobj)
ref <- as.character(ranges$REF)
stats <- VariantAnnotation::snpSummary(SNPobj)
freq <- ifelse(ref == info$Ref, stats$a1Freq, stats$a2Freq)
freqMask <- abs(freq - info$Freq) > 0.2
## Remove variants without allele frequency
freqMask[is.na(freqMask)] <- TRUE
if (sum(freqMask) > 0){
badFreq <- rownames(SNPobj)[freqMask]
SNPobj <- SNPobj[!freqMask, ]
info <- info[!freqMask, , drop = FALSE]
}
}
## Change ids
rownames(SNPobj) <- info[rownames(SNPobj), "id"]
return(list(genos = SNPobj, wrongAlleles = badSNPs, wrongFreqs = badFreq))
}
## Check plink
if (!all(c("map", "genotypes") %in% names(SNPobj))){
stop("SNPobj must contain map and genotypes elements")
}
map <- SNPobj$map
geno <- SNPobj$geno
if (!all(c("allele.1", "allele.2") %in% colnames(map))){
stop("map of SNPobj must contain columns allele.1 and allele.2")
}
geno <- geno[, !is.na(map$allele.1)]
map <- map[!is.na(map$allele.1), ]
## Change name to position
if (sum(duplicated(paste(map$chromosome, map$position, sep = ":")))){
stop("There are different SNPs located in the same position. Please, include only one variant per site before proceeding.")
}
colnames(geno) <- rownames(map) <- paste(map$chromosome, map$position, sep = ":")
## Select SNPs in scoreInvHap
selSNPs <- rownames(map)[rownames(map) %in% rownames(info)]
map <- map[selSNPs, , drop = FALSE]
geno <- geno[, selSNPs, drop = FALSE]
## Check alleles
info <- info[rownames(map),]
badMask <- (map$allele.1 != info$Ref & map$allele.1 != info$Alt) | (map$allele.2 != info$Ref & map$allele.2 != info$Alt)
badSNPs <- NULL
## Remove SNPs with wrong alleles
if (sum(badMask) > 0){
badSNPs <- rownames(map)[badMask]
map <- map[!badMask, , drop = FALSE]
geno <- geno[, !badMask, drop = FALSE]
info <- info[!badMask, , drop = FALSE]
}
badFreq <- NULL
## Check allele Freqs
if(checkAlleleFreqs & nrow(map) > 0){
stats <- snpStats::col.summary(geno)
freq <- ifelse(map$allele.1 == info$Ref, stats$RAF, 1 - stats$RAF)
freqMask <- abs(freq - info$Freq) > 0.2
## Remove variants without allele frequency
freqMask[is.na(freqMask)] <- TRUE
if (sum(freqMask) > 0){
badFreq <- rownames(map)[freqMask]
map <- map[!freqMask, , drop = FALSE]
geno <- geno[, !freqMask, drop = FALSE]
info <- info[!freqMask, , drop = FALSE]
}
}
## Change ids
colnames(geno) <- rownames(map) <- info[rownames(map), "id"]
SNPobj$map <- map
SNPobj$genotypes <- geno
return(list(genos = SNPobj, wrongAlleles = badSNPs, wrongFreqs = badFreq))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.