R/SNPstats.R

Defines functions PlotSnpStats OHperSNP SnpStats

Documented in OHperSNP PlotSnpStats SnpStats

#' @title SNP Summary Statistics
#'
#' @description Estimate allele frequency (AF), missingness and Mendelian
#' errors per SNP.
#'
#' @details Calculation of these summary statistics can be done in PLINK, and
#'   SNPs with low minor allele frequency or high missingness should be filtered
#'   out prior to pedigree reconstruction. This function is provided as an aid
#'   to inspect the relationship between AF, missingness and genotyping error to
#'   find a suitable combination of SNP filtering thresholds to use.
#'
#'   For pedigree reconstruction, SNPs with zero or one copies of the alternate
#'   allele in the dataset (MAF \eqn{\le 1/2N}) are considered fixed, and
#'   excluded.
#'
#' @param GenoM  genotype matrix, in sequoia's format: 1 column per SNP, 1 row
#'   per individual, genotypes coded as 0/1/2/-9, and row names giving individual
#'   IDs.
#' @param Pedigree  dataframe with 3 columns: ID - parent1 - parent2.
#'   Additional columns and non-genotyped individuals are ignored. Used to count
#'   Mendelian errors per SNP and (poorly) estimate the error rate.
#' @param Duplicates  dataframe with pairs of duplicated samples
#' @param Plot  show histograms of the results?
#' @param ErrFlavour  DEPRECATED AND IGNORED. Was used to estimate
#'   \code{Err.hat}; see \code{\link{EstEr}}.
#'
#' @return A matrix with a number of rows equal to the number of SNPs
#'  (=number of columns of GenoM), and when no Pedigree is provided 2 columns:
#' \item{AF}{Allele frequency of the 'second allele' (the one for which the
#'   homozygote is coded 2)}
#' \item{Mis}{Proportion of missing calls}
#' \item{HWE.p}{p-value from chi-square test for Hardy-Weinberg equilibrium}
#'
#' When a Pedigree is provided, there are 8 additional columns:
#' \item{n.dam, n.sire, n.pair}{Number of dams, sires, parent-pairs successfully
#'   genotyped for the SNP}
#' \item{OHdam, OHsire}{Count of number of opposing homozygous cases}
#' \item{MEpair}{Count of Mendelian errors, includes opposing homozygous cases
#'   when only one parent is genotyped}
#' \item{n.dups, n.diff}{Number of duplicate pairs successfully genotyped for
#'   the SNP; number of differences. The latter does not count cases where one
#'   duplicate is not successfully genotyped at the SNP}
#'
#' @seealso  \code{\link{GenoConvert}} to convert from various data formats;
#'   \code{\link{CheckGeno}} to check the data is in valid format for sequoia
#'   and exclude monomorphic SNPs etc., \code{\link{CalcOHLLR}} to calculate OH
#'   & ME per individual; \code{\link{EstEr}} to estimate genotyping error rate.
#'
#' @examples
#' Genotypes <- SimGeno(Ped_HSg5, nSnp=100, CallRate = runif(100, 0.5, 0.8),
#'                      SnpError = 0.05)
#' SnpStats(Genotypes)   # only plots; data is returned invisibly
#' SNPstats <- SnpStats(Genotypes, Pedigree=Ped_HSg5)
#'
#' @importFrom graphics plot points legend
#'
#' @export

SnpStats <- function(GenoM,
                     Pedigree = NULL,
                     Duplicates = NULL,
                     Plot = TRUE,
                     ErrFlavour)
{
  # missingness
  Mis <- apply(GenoM, 2, function(x) sum(x < 0))/nrow(GenoM)
  # allele frequency
  AF <- apply(GenoM, 2, function(x) sum(x[x>=0])/(2*sum(x>=0)))
  # Hardy-weinberg equilibrium
  counts.o <- apply(GenoM, 2, function(x) table(factor(x, levels=c(0,1,2))))  # observed genotype counts
  freq.e <- rbind('0' = (1-AF)^2,
                  '1' = 2*AF*(1-AF),
                  '2' = AF^2)
  freq.e[, Mis==1] <- 0.0
  calc.HWE.p <- function(i) {
    if (all(counts.o[,i]==0)) {
      NA  # all missing
    } else {
      suppressWarnings(stats::chisq.test(x = counts.o[,i],
                                  p = freq.e[,i])$p.value)
    }
  }
  HWE.p <- sapply(1:ncol(GenoM), calc.HWE.p)

  OUT <- cbind(AF, Mis, HWE.p)

  if (is.logical(Pedigree)) {
    Plot <- Pedigree
    Pedigree <- NULL
  }

  if (!is.null(Pedigree)) {
    Par <- PedPolish(Pedigree, gID = rownames(GenoM), DropNonSNPd=TRUE,
                     KeepAllColumns=FALSE)
    Par <- Par[!(is.na(Par[,'dam']) & is.na(Par[,'sire'])), ]
    if (nrow(Par)==0)  warning('Cannot count OH, because pedigree ',
                            'does not have any genotyped parent-offspring pairs')
  }

  if (!is.null(Pedigree) | !is.null(Duplicates)) {
    OHcounts <- OHperSNP(GenoM, Par, Duplicates)
    OUT <- cbind(OUT, OHcounts)
  }

  if (Plot) {
    img <- tryCatch(
      {
        suppressWarnings( PlotSnpStats( OUT ) )
      },
      error = function(e) {
        message("Plotting area too small for SnpStats() plot (or other plotting problem)")
      })
  }

  rownames(OUT) <- paste0("SNP", formatC(1:nrow(OUT),
                                         width=ifelse(nrow(OUT)<1000, 3, 4),
                                         flag="0"))
  invisible( OUT )
}



#==============================================================================
#' @title Estimate Genotyping Error Rate
#'
#' @description Estimate genotyping error rate from Mendelian errors per SNP.
#'
#' @param GenoM  genotype matrix, in sequoia's format: 1 column per SNP, 1 row
#'   per individual, genotypes coded as 0/1/2/-9, and rownames giving individual
#'   IDs.
#' @param Par  pedigree dataframe, only genotyped parents are used.
#' @param Dups  pairs of duplicates
#'
#' @return A dataframe with columns:
#' \item{n.dam, n.sire, n.pair}{Number of dams, sires, parent-pairs successfully
#'   genotyped for the SNP}
#' \item{OHdam, OHsire}{Count of number of opposing homozygous cases}
#' \item{MEpair}{Count of Mendelian errors, includes opposing homozygous cases}
#' \item{n.dups, n.diff}{Number of duplicate pairs successfully genotyped for
#'   the SNP; number of differences}
#'
#' @seealso \code{\link{SnpStats}}.
#'
#' @keywords internal

OHperSNP <- function(GenoM, Par, Dups=NULL)
{
  GenoMx <- GenoM
  GenoMx[GenoMx<0] <- 3
  GenoMx <- rbind(GenoMx, "NA" = 3)  # add fake row for parent='NA'
  GenoMx <- GenoMx +1
  gID <- rownames(GenoM)

  # names --> rownumbers in GenoM
  PedN <- PedToNum(Par, gID, DoDummies = "no")$PedPar
  PedN[PedN==0] <- nrow(GenoMx)

  # offspring-dam-sire trio genotype combo's  ====
  Obs.OO.all <- array(0, dim=c(nrow(PedN), 3, ncol(GenoMx)))
  for (i in 1:nrow(PedN)) {
    if (PedN[i,'dam']==0 & PedN[i,'sire']==0)  next
    Obs.OO.all[i,,] <- rbind(GenoMx[i, ],
                             GenoMx[PedN[i,1], ],
                             GenoMx[PedN[i,2], ])
  }
  tbl.trio <- plyr::aaply(Obs.OO.all, 3, function(M) table(factor(M[,1], levels=1:4),
                                                          factor(M[,2], levels=1:4),
                                                          factor(M[,3], levels=1:4)))

  # count mendelian errors ===
  MER <- array(0, dim=c(4,4,4))  # offspr - mother - father
  MER[1:3,,1] <- matrix(c(0,1,2, 0,0,1, 1,0,1, 0,0,1), 3,4)  # 0/1/2/NA
  MER[1:3,,2] <- matrix(c(0,0,1, 0,0,0, 1,0,0, 0,0,0), 3,4)
  MER[1:3,,3] <- matrix(c(1,0,1, 1,0,0, 2,1,0, 1,0,0), 3,4)
  MER[1:3,,4] <- matrix(c(0,0,1, 0,0,0, 1,0,0, 0,0,0), 3,4)

  tmpdam <- apply(tbl.trio, c(1,2,3), sum)  # sum over sire genotypes (incl. missing)
  tmpsire <- apply(tbl.trio, c(1,2,4), sum)
  Counts <- data.frame(n.dam = apply(tmpdam, 1, function(M) sum(M[1:3, 1:3])),
                       n.sire = apply(tmpsire, 1, function(M) sum(M[1:3, 1:3])),
                       n.pair = apply(tbl.trio, 1, function(A) sum(A[1:3, 1:3, 1:3])),
                       OHdam = apply(tmpdam, 1, function(M) M[1,3] + M[3,1]),
                       OHsire = apply(tmpsire, 1, function(M) M[1,3] + M[3,1]),
                       MEpair = sapply(1:ncol(GenoM), function(l) sum(MER * tbl.trio[l,,,])))

  # duplicates ===
  if (!is.null(Dups)) {
    GenoNums <- setNames(seq_along(gID), gID)
    NumDups <- cbind(id1 = GenoNums[Dups[,1]],
                     id2 = GenoNums[Dups[,2]])

    OO.dup <- array(dim=c(nrow(Dups), 2, ncol(GenoMx)))
    for (i in 1:nrow(Dups)) {
      OO.dup[i,,] <- rbind(GenoMx[NumDups[i,1], ],
                           GenoMx[NumDups[i,2], ])
    }
    tbl.dup <- plyr::aaply(OO.dup, 3, function(M) table(factor(M[,1], levels=1:3),
                                                        factor(M[,2], levels=1:3)))
    Counts$n.dups <- apply(tbl.dup, 1, sum)
    Counts$n.diff <- apply(tbl.dup, 1, function(M) sum(M) - sum(diag(M)))
  }

  # out ===
  return( Counts )
}



#==============================================================================
#' @title plot SnpStats results
#'
#' @description scatter plots and histograms of allele frequency, missingness,
#'   and estimated genotyping error, across SNPs
#'
#' @param OUT  output from \code{SnpStats}
#'
#' @return plots
#'
#' @keywords internal


PlotSnpStats <- function(OUT)
{

  oldpar <- par(no.readonly = TRUE)
  oldpar <- oldpar[!names(oldpar) %in% c("pin", "fig")]
  par(mfrow=c(3,3), mai=c(.9,.8,.2,.1), xpd=NA)
  graphics::plot.new()

  OUT <- cbind(OUT,
               'MAF' = ifelse(OUT[,"AF"] <= 0.5, OUT[,"AF"], 1-OUT[,"AF"]),
               'HWE' = -log10(OUT[,"HWE.p"]))

  # top 5% worst SNPs by each metric (coloured red in scatter plots)
  q95 <- list('MAF' = OUT[,'MAF'] < stats::quantile(OUT[,'MAF'], prob=0.05, na.rm=TRUE),
              'Mis' = OUT[,"Mis"] > stats::quantile(OUT[,"Mis"], prob=0.95, na.rm=TRUE),
              'HWE' = OUT[,"HWE"] > stats::quantile(OUT[,"HWE"], prob=0.95, na.rm=TRUE))
  vars <- c('Minor allele frequency' = 'MAF',
            'Missingness' = 'Mis',
            'HWE test: -log10(p)' = 'HWE')

  for (i in seq_along(vars)) {
    for (j in seq_along(vars)) {

      par(mfg = c(i,j))  # where to put next plot

      if (i == j) {  # diagonal: histogram
        tryPlot(hist,
                OUT[,vars[i]], breaks=nrow(OUT)/5, col="grey", main="",
                xlab=names(vars[i]), cex.lab=1.3, ylab="",
                # ErrMsg = "SnpStats: Plotting area too small",
                oldpar = oldpar)

      } else if (i > j) {  # below diagonal: scatter plot
        plot(OUT[,vars[j]], OUT[,vars[i]], pch=16, cex=1.2,
             xlab=names(vars[j]), ylab=names(vars[i]), cex.lab=1.3)
        h <- max(setdiff(seq_along(vars), c(i,j)))   # 'the other one'
        points(OUT[q95[[h]], vars[j]], OUT[q95[[h]], vars[i]], pch=16, col="red")
        legend(par('usr')[1], par('usr')[4], yjust=0, # above
               legend = paste("5% worst", vars[h]), pch=16, col="red", text.col='red', box.col='red')

      } else {
        # above diagonal: empty, do nothing
      }
    }
  }

  par(oldpar)  # restore old par settings
}

Try the sequoia package in your browser

Any scripts or data that you put into this service are public.

sequoia documentation built on Sept. 8, 2023, 5:29 p.m.