R/subset.haplohh.R
In rehh: Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests

Documented in subset.haplohh

#'Subsets object of \code{\link{haplohh-class}}
#'@description Subsets the data of an object of class \code{\link{haplohh-class}},
#'meeting certain conditions.
#'@param x object of class \code{\link{haplohh-class}} to be subset.
#'@param select.hap expression, indicating haplotypes to select.
#'@param select.mrk expression, indicating markers to select.
#'@param min_perc_geno.hap threshold on percentage of missing data for haplotypes
#'(haplotypes with less than \code{min_perc_geno.hap} percent of markers genotyped are discarded). Default is \code{NA},
#'hence no constraint.
#'@param min_perc_geno.mrk threshold on percentage of missing data for markers (markers genotyped on less than
#'\code{min_perc_geno.mrk} percent of haplotypes are discarded). By default, \code{min_perc_geno.mrk=100},
#'hence only fully genotyped markers are retained.
#'This value cannot be set to \code{NA} or zero.
#'@param min_maf threshold on the Minor Allele Frequency. Markers having a MAF lower than or equal to minmaf are discarded.
#'In case of multi-allelic markers the second-most frequent allele is referred to as minor allele.
#'Setting this value to zero eliminates monomorphic sites. Default is \code{NA},
#'hence no constraint.
#'@param max_alleles threshold for the maximum number of different alleles at a site. Default is \code{NA},
#'hence no restriction. In order to retain only bi-allelic markers, set this parameter to 2.
#'@param verbose logical. If \code{TRUE} (default), report verbose progress.
#'@param ... further arguments are ignored.
#'@seealso \code{\link{haplohh-class}}, \code{\link{data2haplohh}}
#'@examples #example haplohh object (280 haplotypes, 1424 SNPs)
#'#see ?haplohh_cgu_bta12 for details
#'data(haplohh_cgu_bta12)
#'#select subset of first 10 hyplotypes and first 5 markers
#'subset(haplohh_cgu_bta12, select.hap = 1:10, select.mrk = 1:5)
#'@export
subset.haplohh <-
  function(x,
           select.hap = NULL,
           select.mrk = NULL,
           min_perc_geno.hap = NA,
           min_perc_geno.mrk = 100,
           min_maf = NA,
           max_alleles = NA,
           verbose = TRUE,
           ...) {
    # check parameters
    ### empty haplotype makes no sense, but is of no harm
    if (!is.na(min_perc_geno.hap) & (min_perc_geno.hap < 0 |
                                     min_perc_geno.hap > 100)) {
      stop("min_perc_geno.hap should lie in the interval [0,100].",
           call. = FALSE)
    }
    ### empty marker will cause trouble -> forbid NA or zero !!
    if (is.na(min_perc_geno.mrk) | min_perc_geno.mrk <= 0 |
        min_perc_geno.mrk > 100) {
      stop("min_perc_geno.mrk should lie in the interval (0,100].",
           call. = FALSE)
    }
    ### minor frequency can be maximal 0.5
    if (!is.na(min_maf)) {
      if (!is.numeric(min_maf) | min_maf < 0 |
          min_maf > 0.5) {
        stop("min_maf should lie in the interval [0,0.5].", call. = FALSE)
      }
    }
    ### max_alleles must be at least 2
    if (!is.na(max_alleles)) {
      if (!is.numeric(max_alleles) | max_alleles < 2) {
        stop("max_alleles should be at least 2.", call. = FALSE)
      }
    }
    ### check if object is valid haplohh
    if (!is.haplohh(x)) {
      stop("Data is not a valid object of class haplohh.", call. = FALSE)
    }
    ### check if object is empty
    if (min(dim(haplo(x))) == 0) {
      stop("Empty data object.", call. = FALSE)
    }
    
    # filtering
    
    if (!is.null(select.hap)) {
      if (verbose)
        cat("* Subset haplotypes *\n")
      # drop = FALSE ensures that haplo is a matrix
      haplo <- x@haplo[select.hap, , drop = FALSE]
      
      if (nrow(haplo) == nhap(x)) {
        if (verbose)
          cat("No haplotype discarded.\n")
      } else{
        if (verbose)
          cat(nhap(x) - nrow(haplo), "haplotypes discarded.\n")
        x@haplo <- haplo
        if (verbose)
          cat(nhap(x), "haplotypes remaining.\n")
      }
      if (nhap(x) == 0) {
        warning(
          "No haplotype left after filtering on 'select.hap'.\n",
          call. = FALSE,
          immediate. = TRUE
        )
        return(x)
      }
    }
    
    if (!is.null(select.mrk)) {
      if (verbose)
        cat("* Subset markers *\n")
      # drop = FALSE ensures that haplo is a matrix
      haplo <- x@haplo[, select.mrk, drop = FALSE]
      
      if (ncol(haplo) == nmrk(x)) {
        if (verbose)
          cat("No marker discarded.\n")
      } else{
        if (verbose)
          cat(nmrk(x) - ncol(haplo), "markers discarded.\n")
        # positions are unnamed in haplohh
        positions <- positions(x)
        # name them to subset by marker name
        names(positions) <- mrk.names(x)
        # update slots in object
        x@haplo <- haplo
        x@positions <- unname(positions[select.mrk])
        if (verbose)
          cat(nmrk(x), "markers remaining.\n")
        if (nmrk(x) == 0) {
          warning(
            "No marker left after filtering on 'select.mrk'.\n",
            call. = FALSE,
            immediate. = TRUE
          )
          return(x)
        }
      }
    }
    
    if (verbose)
      cat("* Filtering data *\n")
    
    if (!is.na(min_perc_geno.hap)) {
      if (verbose)
        cat("Discard haplotypes with less than",
            min_perc_geno.hap,
            "% of genotyped markers.\n")
      
      hap_sel <-
        (100 * rowMeans(!is.na(x@haplo))) >= min_perc_geno.hap
      
      if (sum(hap_sel) == nhap(x)) {
        if (verbose)
          cat("No haplotype discarded.\n")
      } else{
        if (verbose)
          cat(nhap(x) - sum(hap_sel), "haplotypes discarded.\n")
        x@haplo <- x@haplo[hap_sel, , drop = FALSE]
        if (verbose)
          cat(nhap(x), "haplotypes remaining.\n")
        
        if (nhap(x) == 0) {
          warning(
            "No haplotype left after filtering of missing data.\n",
            "If applicable, reduce min_perc_geno.hap to allow for more missing data.\n",
            call. = FALSE,
            immediate. = TRUE
          )
          return(x)
        }
      }
    }
    #selection des markers d'apres les donnees manquantes
    if (verbose)
      cat("Discard markers genotyped on less than",
          min_perc_geno.mrk,
          "% of haplotypes.\n")
    mrk_sel <-
      (100 * colMeans(!is.na(x@haplo))) >= min_perc_geno.mrk
    if (sum(mrk_sel) == nmrk(x)) {
      if (verbose)
        cat("No marker discarded.\n")
    } else{
      if (verbose)
        cat(nmrk(x) - sum(mrk_sel), "markers discarded.\n")
      x@haplo <- x@haplo[, mrk_sel, drop = FALSE]
      x@positions <- x@positions[mrk_sel]
      if (verbose)
        cat(nmrk(x), "markers remaining.\n")
      
      if (nmrk(x) == 0) {
        warning(
          "No marker left after filtering of missing data.\n",
          "If applicable, reduce min_perc_geno.mrk to allow for more missing data.\n",
          call. = FALSE,
          immediate. = TRUE
        )
        return(x)
      }
    }
    #selection des mrks sur MAF
    if (!is.na(min_maf)) {
      if (verbose)
        cat("Discard markers with Minor Allele Frequency equal to or below",
            min_maf,
            ".\n")
      mrk_sel <- apply(x@haplo, 2, function(x) {
        t <- tabulate(x + 1L)
        if (length(t) == 1) {
          ## only one (ancestral) allele
          return(FALSE)
        } else{
          ## at least one (derived) allele
          alleles <- order(t, decreasing = TRUE)[1:2]
          #check frequency of most and second-most common allele
          return(min(t[alleles]) / sum(t) > min_maf)
        }
      })
      if (sum(mrk_sel) == nmrk(x)) {
        if (verbose)
          cat("No marker discarded.\n")
      } else{
        if (verbose)
          cat(nmrk(x) - sum(mrk_sel), "markers discarded.\n")
        x@haplo <- x@haplo[, mrk_sel, drop = FALSE]
        x@positions <- x@positions[mrk_sel]
        if (verbose)
          cat(nmrk(x), "markers remaining.\n")
        
        if (nmrk(x) == 0) {
          warning(
            "No marker left after filtering on Minor Allele Frequency.\n",
            "If applicable, reduce min_maf to allow for less frequent minor alleles.",
            call. = FALSE,
            immediate. = TRUE
          )
          return(x)
        }
      }
    }
    
    if (!is.na(max_alleles)) {
      if (verbose)
        cat("Discard markers with more than",
            max_alleles,
            "different alleles.\n")
      mrk_sel <-
        apply(x@haplo, 2, function(x) {
          length(unique(na.omit(x))) <= max_alleles
        })
      if (sum(mrk_sel) == nmrk(x)) {
        if (verbose)
          cat("No marker discarded.\n")
      } else{
        if (verbose)
          cat(nmrk(x) - sum(mrk_sel), "markers discarded.\n")
        x@haplo <- x@haplo[, mrk_sel, drop = FALSE]
        x@positions <- x@positions[mrk_sel]
        if (verbose)
          cat(nmrk(x), "markers remaining.\n")
        
        if (nmrk(x) == 0) {
          warning(
            "No marker left after filtering on maximal number of different alleles.\n",
            call. = FALSE,
            immediate. = TRUE
          )
          return(x)
        }
      }
    }
    
    if (verbose)
      cat("Data consists of",
          nhap(x),
          "haplotypes and",
          nmrk(x),
          "markers.\n")
    
    multicity <- tabulate(apply(x@haplo, 2, function(x) {
      sum(tabulate(x + 1) != 0)
    }))
    names(multicity) <- seq_along(multicity)
    
    if (verbose) {
      cat("Number of mono-, bi-, multi-allelic markers:\n")
      cat(names(multicity), "\n")
      cat(multicity, "\n")
    }
    
    return(x)
  }

Any scripts or data that you put into this service are public.

rehh documentation built on Sept. 15, 2021, 5:06 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

rehh
Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests

R/subset.haplohh.R
In rehh: Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests

Defines functions subset.haplohh

Documented in subset.haplohh

Try the rehh package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rehh Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests

R/subset.haplohh.R In rehh: Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests

Defines functions subset.haplohh

Documented in subset.haplohh

Try the rehh package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

rehh
Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests

R/subset.haplohh.R
In rehh: Searching for Footprints of Selection using 'Extended Haplotype Homozygosity' Based Tests