R/compute.ALD.R
In asymLD: Asymmetric Linkage Disequilibrium (ALD) for Polymorphic Genetic Data

#' Compute ALD.
#'
#' A function to compute asymmetric Linkage Disequilibrium measures (ALD) for polymorphic genetic 
#' data. These measures are identical to the correlation measure (r) for bi-allelic data.
#' 
#' @param dat	A data.frame with 5 required variables (having the names listed below):
#'      \tabular{ll}{
#'           \code{haplo.freq} \tab A numeric vector of haplotype frequencies.\cr
#'            \code{locus1} \tab A character vector indentifying the first locus.\cr
#'            \code{locus2} \tab A character vector indentifying the second locus.\cr
#'            \code{allele1} \tab A character vector indentifying the allele at locus 1.\cr
#'            \code{allele2} \tab A character vector indentifying the allele at locus 2.\cr
#'            }
#' @param tolerance A threshold for the sum of the haplotype frequencies.
#'    If the sum of the haplotype frequencies is greater than 1+tolerance or less
#'    than 1-tolerance an error is returned. The default is 0.01.
#'
#' @return The return value is a dataframe with the following components:
#'  \tabular{ll}{
#'  \code{locus1}	\tab The name of the first locus.\cr
#'  \code{locus2}	\tab The name of the second locus.\cr
#'  \code{F.1}	\tab Homozygosity (expected under HWP) for locus 1.\cr
#'  \code{F.1.2}	\tab Conditional homozygosity* for locus1 given locus2.\cr
#'  \code{F.2}	\tab Homozygosity (expected under HWP) for locus 2.\cr
#'  \code{F.2.1}	\tab Conditional homozygosity* for locus2 given locus1.\cr
#'  \code{ALD.1.2} \tab Asymmetric LD for locus1 given locus2.\cr
#'  \code{ALD.2.1} \tab  Asymmetric LD for locus2 given locus1.\cr
#'  }
#'  *Overall weighted haplotype-specific homozygosity for the first locus given the second locus.
#'
#' @section Details:
#' A warning message is given if the sum of the haplotype frequencies is greater than 1.01 or less
#' than 0.99 (regardless of the \code{tolerance} setting). The haplotype frequencies that are passed
#' to the function are normalized within the function to sum to 1.0 by dividing each frequency by 
#' the sum of the passed frequencies.
#'
#' @examples
#' library(asymLD)
#' 
#' # An example using haplotype frequencies from Wilson(2010)
#' data(hla.freqs)
#' hla.a_b <- hla.freqs[hla.freqs$locus1=="A" & hla.freqs$locus2=="B",]
#' compute.ALD(hla.a_b)
#' hla.freqs$locus <- paste(hla.freqs$locus1, hla.freqs$locus2, sep="-")
#' compute.ALD(hla.freqs[hla.freqs$locus=="C-B",])
#' # Note: additonal columns on the input dataframe (e.g., "locus" above) are allowed, but 
#' # ignored by the function.
#' 
#' # An example using genotype data from the haplo.stats package
#' require(haplo.stats)
#' data(hla.demo)
#' geno <- hla.demo[,5:8]  #DPB-DPA 
#' label <- unique(gsub(".a(1|2)", "", colnames(geno)))
#' label <- paste("HLA*",label,sep="")
#' keep <- !apply(is.na(geno) | geno==0, 1, any)
#' em.keep  <- haplo.em(geno=geno[keep,], locus.label=label)
#' hapfreqs.df <- cbind(em.keep$haplotype, em.keep$hap.prob) 
#' #format dataframe for ALD function
#' names(hapfreqs.df)[dim(hapfreqs.df)[2]] <- "haplo.freq"
#' names(hapfreqs.df)[1] <- "allele1"
#' names(hapfreqs.df)[2] <- "allele2"
#' hapfreqs.df$locus1 <- label[1]
#' hapfreqs.df$locus2 <- label[2]
#' head(hapfreqs.df)
#' compute.ALD(hapfreqs.df)
#' # Note that there is substantially less variablity (higher ALD) for HLA*DPA1 
#' # conditional on HLA*DPB1 than for HLA*DPB1 conditional on HLA*DPA1, indicating 
#' # that the overall variation for DPA1 is relatively low given specific DPB1 alleles
#' 
#' # An example using SNP data where results are symmetric and equal to the ordinary 
#' # correlation measure (r)
#' data(snp.freqs)
#' snps <- c("rs1548306", "rs6923504", "rs4434496", "rs7766854")
#' compute.ALD(snp.freqs[snp.freqs$locus1==snps[2] & snp.freqs$locus2==snps[3],])
#' 
#' snp.freqs$locus <- paste(snp.freqs$locus1, snp.freqs$locus2, sep="-")
#' by(snp.freqs,list(snp.freqs$locus),compute.ALD)
#' 
#' # SNP1 & SNP2 : the r correlation & ALD measures are equivalent due to symmetry for 
#' # bi-allelic SNPs
#' p.AB <- snp.freqs$haplo.freq[1]
#' p.Ab <- snp.freqs$haplo.freq[2]
#' p.aB <- snp.freqs$haplo.freq[3]
#' p.ab <- snp.freqs$haplo.freq[4]
#' p.A <- p.AB + p.Ab
#' p.B <- p.AB + p.aB 
#' r.squared <- (p.AB - p.A*p.B)^2 / (p.A*(1-p.A)*p.B*(1-p.B))
#' sqrt(r.squared) #the r correlation measure
#' compute.ALD(snp.freqs[snp.freqs$locus1==snps[1] & snp.freqs$locus2==snps[2],])
#' 
#' @export
#' @importFrom stats aggregate

compute.ALD <- function(dat, tolerance = 0.01) {
  names.req <- c("locus1", "locus2", "allele1", "allele2", "haplo.freq")
  check.names <- names.req %in% names(dat)
  if (sum(!check.names) > 0)
    stop("The following required variables are missing from the dataset: ",
      paste(names.req[!check.names], collapse = " "))
  if (sum(dat$haplo.freq) > 1 + tolerance)
    stop("Sum of haplo freqs > 1; sum=", sum(dat$haplo.freq))
  if (sum(dat$haplo.freq) < 1 - tolerance)
    stop("Sum of haplo freqs < 1; sum=", sum(dat$haplo.freq))
  if (abs(1 - sum(dat$haplo.freq)) > 0.01)
    warning("Sum of haplo freqs is not 1; sum=", sum(dat$haplo.freq))
  locus1 <- unique(dat$locus1)
  locus2 <- unique(dat$locus2)
  
  # normalize haplotype freqs to sum to 1.0
  dat$haplo.freq <- dat$haplo.freq/sum(dat$haplo.freq)
  
  # generate allele freqs based on haplo freqs
  by.vars1 <- list(dat$allele1)
  by.vars2 <- list(dat$allele2)
  names(by.vars1) <- c("allele1")
  names(by.vars2) <- c("allele2")
  af1 <- aggregate(dat$haplo.freq, by = by.vars1, FUN = sum)
  af2 <- aggregate(dat$haplo.freq, by = by.vars2, FUN = sum)
  names(af1)[length(names(af1))] <- "allele.freq1"
  names(af2)[length(names(af2))] <- "allele.freq2"
  mrg1 <- merge(dat, af1, by.x = c("allele1"), by.y = c("allele1"), all.x = TRUE, all.y = FALSE)
  mrg2 <- merge(mrg1, af2, by.x = c("allele2"), by.y = c("allele2"), all.x = TRUE, all.y = FALSE)
  dat <- mrg2
  
  F.1 <- 0
  F.2.1 <- 0
  F.2 <- 0
  F.1.2 <- 0
  for (a1 in unique(dat$allele1)) {
    af.1 <- unique(dat$allele.freq1[dat$allele1 == a1])
    F.2.1 <- sum(F.2.1, sum(dat$haplo.freq[dat$allele1 == a1]^2)/af.1)
    F.1 <- F.1 + af.1^2
  }
  for (a2 in unique(dat$allele2)) {
    af.2 <- unique(dat$allele.freq2[dat$allele2 == a2])
    F.1.2 <- sum(F.1.2, sum(dat$haplo.freq[dat$allele2 == a2]^2)/af.2)
    F.2 <- F.2 + af.2^2
  }
  if (F.2 == 1) {
    F.2.1.prime <- NA
    ALD.2.1 <- NA
  } else {
    F.2.1.prime <- (F.2.1 - F.2)/(1 - F.2)
    ALD.2.1 <- sqrt(F.2.1.prime)
  }
  if (F.1 == 1) {
    F.1.2.prime <- NA
    ALD.1.2 <- NA
  } else {
    F.1.2.prime <- (F.1.2 - F.1)/(1 - F.1)
    ALD.1.2 <- sqrt(F.1.2.prime)
  }
  ALD <- data.frame(locus1, locus2, F.1, F.1.2, F.2, F.2.1, ALD.1.2, ALD.2.1)
  cat("   ALD.x.y is asymmetric LD for locus x conditioned on locus y\n\n")
  return(ALD)
}

Any scripts or data that you put into this service are public.

asymLD documentation built on May 2, 2019, 6:14 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

asymLD
Asymmetric Linkage Disequilibrium (ALD) for Polymorphic Genetic Data

R/compute.ALD.R
In asymLD: Asymmetric Linkage Disequilibrium (ALD) for Polymorphic Genetic Data

Try the asymLD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

asymLD Asymmetric Linkage Disequilibrium (ALD) for Polymorphic Genetic Data

R/compute.ALD.R In asymLD: Asymmetric Linkage Disequilibrium (ALD) for Polymorphic Genetic Data

Try the asymLD package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

asymLD
Asymmetric Linkage Disequilibrium (ALD) for Polymorphic Genetic Data

R/compute.ALD.R
In asymLD: Asymmetric Linkage Disequilibrium (ALD) for Polymorphic Genetic Data