or relatedness

# GenotypeData methods for SeqVarTools generics
setMethod("chromWithPAR",
          "GenotypeData",
          function(gdsobj, ...) {
              getChromosome(gdsobj, char=TRUE)
          })

setMethod("validateSex",
          "GenotypeData",
          function(x) {
              sex <- suppressWarnings(getSex(x))
              if (!is.null(sex)) {
                  if (all(sex %in% c(1,2,NA))) {
                      sex <- c("M", "F")[sex]
                  }
                  if (!all(sex %in% c("M", "F", NA))) {
                      sex <- NULL
                  }
              }
              sex
          })

setMethod("variantInfo",
          "GenotypeData",
          function(gdsobj, alleles=FALSE) {
              data.frame(variant.id=getSnpID(gdsobj),
                         chr=getChromosome(gdsobj, char=TRUE),
                         pos=getPosition(gdsobj),
                         stringsAsFactors=FALSE)
          })

setMethod("variantFilter",
          "GenotypeIterator",
          function(x) {
              snpFilter(x)
          })


# check for all genotypes identical (including all hets)
# allow a tolerance in case we have imputed dosage values rather than integer
# return an index for subsetting rather than modifying geno
# (so we can subset variant info also)
# we have to compute count and freq anyway, so pass as arguments
.filterMonomorphic <- function(geno, count, freq, imputed=FALSE) {
    #count <- colSums(!is.na(geno))
    if (!imputed) {
        #freq <- 0.5*colMeans(geno, na.rm=TRUE)
        isref <- freq == 0
        isalt <- freq == 1
        ishet <- colSums(geno == 1, na.rm=TRUE) == count
    } else {
        tol <- .Machine$double.eps ^ 0.5
        isref <- colSums(abs(geno) < tol, na.rm=TRUE) == count
        isalt <- colSums(abs(geno - 1) < tol, na.rm=TRUE) == count
        ishet <- colSums(abs(geno - 2) < tol, na.rm=TRUE) == count
    }
    !(isref | isalt | ishet)
}


# index is in case we had to subset geno so it no longer matches the variant filter
# (in the case of allele matching)
.alleleFreq <- function(gdsobj, geno, variant.index=NULL, sample.index=NULL, male.diploid=TRUE, genome.build=c("hg19", "hg38")) {

    # check sex
    sex <- validateSex(gdsobj)
    if (!is.null(sample.index)) sex <- sex[sample.index]
    if (is.null(sex)) {
        #freq <- 0.5*colMeans(geno, na.rm=TRUE)
        count <- colSums(geno, na.rm=TRUE)
        # nsamp <- colSums(!is.na(geno))
        nsamp <- .countNonMissing(geno, MARGIN = 2)
        freq <- count/(2*nsamp)
        mac <- round(pmin(count, 2*nsamp - count))
        return(data.frame(freq=freq, MAC=mac))
    }

    # check chromosome
    chr <- chromWithPAR(gdsobj, genome.build=genome.build)
    if (!is.null(variant.index)) chr <- chr[variant.index]
    X <- chr %in% "X"
    Y <- chr %in% "Y"
    auto <- !X & !Y

    # allele count vectors
    #freq <- rep(NA, ncol(geno))
    count <- rep(NA, ncol(geno))
    possible <- rep(NA, ncol(geno))

    # autosomes
    if (any(auto)) {
        #freq[auto] <- 0.5*colMeans(geno[, auto, drop=FALSE], na.rm=TRUE)
        count[auto] <- colSums(geno[, auto, drop=FALSE], na.rm=TRUE)
        # possible[auto] <- 2 * colSums(!is.na(geno[, auto, drop=FALSE]))
        possible[auto] <- 2 * .countNonMissing(geno[, auto, drop=FALSE], MARGIN = 2)
    }

    # X chrom
    if (any(X)) {
        female <- sex %in% "F"
        male <- sex %in% "M"
        F.count <- colSums(geno[female, X, drop=FALSE], na.rm=TRUE)
        # F.nsamp <- colSums(!is.na(geno[female, X, drop=FALSE]))
        F.nsamp <- .countNonMissing(geno[female, X, drop=FALSE], MARGIN = 2)
        M.count <- colSums(geno[male, X, drop=FALSE], na.rm=TRUE)
        if (male.diploid) {
            M.count <- 0.5*M.count
        }
        # M.nsamp <- colSums(!is.na(geno[male, X, drop=FALSE]))
        M.nsamp <- .countNonMissing(geno[male, X, drop=FALSE], MARGIN = 2)
        #freq[X] <- (F.count + M.count)/(2*F.nsamp + M.nsamp)
        count[X] <- (F.count + M.count)
        possible[X] <- (2*F.nsamp + M.nsamp)
    }

    # Y chrom
    if (any(Y)) {
        male <- sex %in% "M"
        #freq[Y] <- colMeans(geno[male, Y, drop=FALSE], na.rm=TRUE)
        count[Y] <- colSums(geno[male, Y, drop=FALSE], na.rm=TRUE)
        if (male.diploid) {
            #freq[Y] <- 0.5*freq[Y]
            count[Y] <- 0.5*count[Y]
        }
        # possible[Y] <- colSums(!is.na(geno[male, Y, drop=FALSE]))
        possible[Y] <- .countNonMissing(geno[male, Y, drop=FALSE], MARGIN = 2)
    }

    freq <- count / possible
    mac <- round(pmin(count, possible - count))
    data.frame(freq=freq, MAC=mac)
}


.meanImpute <- function(geno, freq) {
    miss.idx <- which(is.na(geno))
    miss.var.idx <- ceiling(miss.idx/nrow(geno))
    geno[miss.idx] <- 2*freq[miss.var.idx]
    geno
}

.weightFromFreq <- function(freq, weight.beta) {
    freq <- pmin(freq, 1-freq)
    dbeta(freq, weight.beta[1], weight.beta[2])
}

# set a sample filter, and return the index to put filtered samples
# in the same order as the null model
.setFilterNullModel <- function(gdsobj, null.model, verbose=TRUE) {
    if (!is.null(null.model$sample.id)) {
        seqSetFilter(gdsobj, sample.id=null.model$sample.id, verbose=verbose)
        sample.index <- match(null.model$sample.id, seqGetData(gdsobj, "sample.id"))
    } else {
        sample.index <- seq_along(seqGetData(gdsobj, "sample.id"))
    }
    sample.index
}

# return sample.index to match GenotypeData object to a null model
.sampleIndexNullModel <- function(gdsobj, null.model) {
    if (!is.null(null.model$sample.id)) {
        sample.index <- match(null.model$sample.id, getScanID(gdsobj))
    } else {
        sample.index <- match(rownames(null.model$model.matrix),
                              sampleNames(getScanAnnotation(gdsobj)))
    }
    sample.index
}

.modelMatrixColumns <- function(null.model, col.name) {
    cols <- unlist(lapply(col.name, function(x) grep(paste0("^", x), colnames(null.model$model.matrix))))
    null.model$model.matrix[,cols,drop=FALSE]
}

.matchAlleles <- function(gdsobj, var.info) {
    seqnames <- NULL
    if (nrow(var.info) == 0) return(integer(0))
    var.info$n <- 1:nrow(var.info)
    var.sel <- as.data.table(currentRanges(gdsobj))
    var.sel <- var.sel[,`:=`(seqnames=as.character(seqnames))]
    setnames(var.sel, c("seqnames", "start"), c("chr", "pos"))
    match.cols <- c("chr", "pos")
    if ("ref" %in% names(var.sel)) match.cols <- c(match.cols, "ref")
    if ("alt" %in% names(var.sel)) match.cols <- c(match.cols, "alt")
    var.match <- merge(as.data.table(var.info), var.sel, by=match.cols)
    unique(var.match$n)
}


.addNames <- function(iterator, x) {
    gr <- variantRanges(iterator)
    rownames(x$results) <- names(gr)
    names(x$variantInfo) <- names(gr)
    x
}

.addWindows <- function(iterator, x) {
    windows <- variantRanges(iterator)
    win.df <- data.frame(chr=as.character(GenomicRanges::seqnames(windows)),
                         start=BiocGenerics::start(windows),
                         end=BiocGenerics::end(windows),
                         stringsAsFactors=FALSE)
    x$results <- cbind(win.df, x$results)
    x
}

setGeneric(".annotateAssoc", function(gdsobj, x) standardGeneric(".annotateAssoc"))

setMethod(".annotateAssoc",
          "SeqVarWindowIterator",
          function(gdsobj, x) {
              .addWindows(gdsobj, x)
          })

setMethod(".annotateAssoc",
          "SeqVarRangeIterator",
          function(gdsobj, x) {
              .addNames(gdsobj, x)
          })

setMethod(".annotateAssoc",
          "SeqVarListIterator",
          function(gdsobj, x) {
              .addNames(gdsobj, x)
          })

setMethod(".annotateAssoc",
          "SeqVarBlockIterator",
          function(gdsobj, x) {
              x
          })

setMethod(".annotateAssoc",
          "GenotypeIterator",
          function(gdsobj, x) {
              filt.names <- names(snpFilter(gdsobj))
              if (!is.null(filt.names)) {
                  rownames(x$results) <- filt.names
                  names(x$variantInfo) <- filt.names
              }
              x
          })

.listIdentical <- function(x) {
    all(sapply(x[-1], FUN = identical, x[[1]]))
}

Any scripts or data that you put into this service are public.

GENESIS documentation built on Jan. 30, 2021, 2:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GENESIS
GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/utils.R
In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

Defines functions .listIdentical .addWindows .addNames .matchAlleles .modelMatrixColumns .sampleIndexNullModel .setFilterNullModel .weightFromFreq .meanImpute .alleleFreq .filterMonomorphic

Try the GENESIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GENESIS GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/utils.R In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

Defines functions .listIdentical .addWindows .addNames .matchAlleles .modelMatrixColumns .sampleIndexNullModel .setFilterNullModel .weightFromFreq .meanImpute .alleleFreq .filterMonomorphic

Try the GENESIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GENESIS
GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/utils.R
In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness