R/compute.ALD.R

#' Compute ALD.
#'
#' A function to compute asymmetric Linkage Disequilibrium measures (ALD) for polymorphic genetic 
#' data. These measures are identical to the correlation measure (r) for bi-allelic data.
#' 
#' @param dat	A data.frame with 5 required variables (having the names listed below):
#'      \tabular{ll}{
#'           \code{haplo.freq} \tab A numeric vector of haplotype frequencies.\cr
#'            \code{locus1} \tab A character vector indentifying the first locus.\cr
#'            \code{locus2} \tab A character vector indentifying the second locus.\cr
#'            \code{allele1} \tab A character vector indentifying the allele at locus 1.\cr
#'            \code{allele2} \tab A character vector indentifying the allele at locus 2.\cr
#'            }
#' @param tolerance A threshold for the sum of the haplotype frequencies.
#'    If the sum of the haplotype frequencies is greater than 1+tolerance or less
#'    than 1-tolerance an error is returned. The default is 0.01.
#'
#' @param symm An indicator for whether to compute symmetric measures Dprime & Wn (default=FALSE)
#'
#' @return The return value is a dataframe with the following components:
#'  \tabular{ll}{
#'  \code{locus1}	\tab The name of the first locus.\cr
#'  \code{locus2}	\tab The name of the second locus.\cr
#'  \code{F.1}	\tab Homozygosity (expected under HWP) for locus 1.\cr
#'  \code{F.1.2}	\tab Conditional homozygosity* for locus1 given locus2.\cr
#'  \code{F.2}	\tab Homozygosity (expected under HWP) for locus 2.\cr
#'  \code{F.2.1}	\tab Conditional homozygosity* for locus2 given locus1.\cr
#'  \code{ALD.1.2} \tab Asymmetric LD for locus1 given locus2.\cr
#'  \code{ALD.2.1} \tab  Asymmetric LD for locus2 given locus1.\cr
#'  }
#'  *Overall weighted haplotype-specific homozygosity for the first locus given the second locus.
#'
#' @section Details:
#' A warning message is given if the sum of the haplotype frequencies is greater than 1.01 or less
#' than 0.99 (regardless of the \code{tolerance} setting). The haplotype frequencies that are passed
#' to the function are normalized within the function to sum to 1.0 by dividing each frequency by 
#' the sum of the passed frequencies.
#'
#' @examples
#' library(asymLD)
#' 
#' # An example using haplotype frequencies from Wilson(2010)
#' data(hla.freqs)
#' hla.a_b <- hla.freqs[hla.freqs$locus1=="A" & hla.freqs$locus2=="B",]
#' compute.ALD(hla.a_b)
#' hla.freqs$locus <- paste(hla.freqs$locus1, hla.freqs$locus2, sep="-")
#' compute.ALD(hla.freqs[hla.freqs$locus=="C-B",])
#' # Note: additonal columns on the input dataframe (e.g., "locus" above) are allowed, but 
#' # ignored by the function.
#' 
#' # An example using genotype data from the haplo.stats package
#' require(haplo.stats)
#' data(hla.demo)
#' geno <- hla.demo[,5:8]  #DPB-DPA 
#' label <- unique(gsub(".a(1|2)", "", colnames(geno)))
#' label <- paste("HLA*",label,sep="")
#' keep <- !apply(is.na(geno) | geno==0, 1, any)
#' em.keep  <- haplo.em(geno=geno[keep,], locus.label=label)
#' hapfreqs.df <- cbind(em.keep$haplotype, em.keep$hap.prob) 
#' #format dataframe for ALD function
#' names(hapfreqs.df)[dim(hapfreqs.df)[2]] <- "haplo.freq"
#' names(hapfreqs.df)[1] <- "allele1"
#' names(hapfreqs.df)[2] <- "allele2"
#' hapfreqs.df$locus1 <- label[1]
#' hapfreqs.df$locus2 <- label[2]
#' head(hapfreqs.df)
#' compute.ALD(hapfreqs.df)
#' # Note that there is substantially less variablity (higher ALD) for HLA*DPA1 
#' # conditional on HLA*DPB1 than for HLA*DPB1 conditional on HLA*DPA1, indicating 
#' # that the overall variation for DPA1 is relatively low given specific DPB1 alleles
#' 
#' # An example using SNP data where results are symmetric and equal to the ordinary 
#' # correlation measure (r)
#' data(snp.freqs)
#' snps <- c("rs1548306", "rs6923504", "rs4434496", "rs7766854")
#' compute.ALD(snp.freqs[snp.freqs$locus1==snps[2] & snp.freqs$locus2==snps[3],])
#' 
#' snp.freqs$locus <- paste(snp.freqs$locus1, snp.freqs$locus2, sep="-")
#' by(snp.freqs,list(snp.freqs$locus),compute.ALD)
#' 
#' # SNP1 & SNP2 : the r correlation & ALD measures are equivalent due to symmetry for 
#' # bi-allelic SNPs
#' p.AB <- snp.freqs$haplo.freq[1]
#' p.Ab <- snp.freqs$haplo.freq[2]
#' p.aB <- snp.freqs$haplo.freq[3]
#' p.ab <- snp.freqs$haplo.freq[4]
#' p.A <- p.AB + p.Ab
#' p.B <- p.AB + p.aB 
#' r.squared <- (p.AB - p.A*p.B)^2 / (p.A*(1-p.A)*p.B*(1-p.B))
#' sqrt(r.squared) #the r correlation measure
#' compute.ALD(snp.freqs[snp.freqs$locus1==snps[1] & snp.freqs$locus2==snps[2],])
#' 
#' @export
#' @importFrom stats aggregate

compute.ALD <- function(dat, tolerance = 0.01, symm=FALSE) {
  names.req <- c("locus1", "locus2", "allele1", "allele2", "haplo.freq")
  check.names <- names.req %in% names(dat)
  if (sum(!check.names) > 0)
    stop("The following required variables are missing from the dataset: ",
      paste(names.req[!check.names], collapse = " "))
  if (sum(dat$haplo.freq) > 1 + tolerance)
    stop("Sum of haplo freqs > 1; sum=", sum(dat$haplo.freq))
  if (sum(dat$haplo.freq) < 1 - tolerance)
    stop("Sum of haplo freqs < 1; sum=", sum(dat$haplo.freq))
  if (abs(1 - sum(dat$haplo.freq)) > 0.01)
    warning("Sum of haplo freqs is not 1; sum=", sum(dat$haplo.freq))
  if (sum( (names(dat) %in% c("allele.freq1", "allele.freq2")) )>0) {
    dat$allele.freq1 <- NULL
    dat$allele.freq2 <- NULL
  }  
  locus1 <- unique(dat$locus1)
  locus2 <- unique(dat$locus2)
  
  # normalize haplotype freqs to sum to 1.0
  dat$haplo.freq <- dat$haplo.freq/sum(dat$haplo.freq)
  
  # generate allele freqs based on haplo freqs
  by.vars1 <- list(dat$allele1)
  by.vars2 <- list(dat$allele2)
  names(by.vars1) <- c("allele1")
  names(by.vars2) <- c("allele2")
  af1 <- aggregate(dat$haplo.freq, by = by.vars1, FUN = sum)
  af2 <- aggregate(dat$haplo.freq, by = by.vars2, FUN = sum)
  names(af1)[length(names(af1))] <- "allele.freq1"
  names(af2)[length(names(af2))] <- "allele.freq2"
  mrg1 <- merge(dat, af1, by.x = c("allele1"), by.y = c("allele1"), all.x = TRUE, all.y = FALSE)
  mrg2 <- merge(mrg1, af2, by.x = c("allele2"), by.y = c("allele2"), all.x = TRUE, all.y = FALSE)
  dat <- mrg2
  
  F.1 <- 0
  F.2.1 <- 0
  F.2 <- 0
  F.1.2 <- 0
  for (a1 in unique(dat$allele1)) {
    af.1 <- unique(dat$allele.freq1[dat$allele1 == a1])
    F.2.1 <- sum(F.2.1, sum(dat$haplo.freq[dat$allele1 == a1]^2)/af.1)
    F.1 <- F.1 + af.1^2
  }
  for (a2 in unique(dat$allele2)) {
    af.2 <- unique(dat$allele.freq2[dat$allele2 == a2])
    F.1.2 <- sum(F.1.2, sum(dat$haplo.freq[dat$allele2 == a2]^2)/af.2)
    F.2 <- F.2 + af.2^2
  }
  if (F.2 == 1) {
    F.2.1.prime <- NA
    ALD.2.1 <- NA
  } else {
    F.2.1.prime <- (F.2.1 - F.2)/(1 - F.2)
    ALD.2.1 <- sqrt(F.2.1.prime)
  }
  if (F.1 == 1) {
    F.1.2.prime <- NA
    ALD.1.2 <- NA
  } else {
    F.1.2.prime <- (F.1.2 - F.1)/(1 - F.1)
    ALD.1.2 <- sqrt(F.1.2.prime)
  }

  if (symm==TRUE) {
    # add zero frequency haplotypes
    a1.list <- sort(unique(as.character(dat$allele1)))
    a2.list <- sort(unique(as.character(dat$allele2)))
    tmp <- expand.grid(a1.list,a2.list) #create all possible combos of allele1 & allele2
    names(tmp)[1] <- "allele1"
    names(tmp)[2] <- "allele2"
    dat1 <- unique( dat[dat$allele1 %in% unique(dat$allele1),c("allele1","allele.freq1")] )
    dat2 <- unique( dat[dat$allele2 %in% unique(dat$allele2),c("allele2","allele.freq2")] )
    tmp1 <- merge(tmp, dat1, by="allele1", all.x=T, all.y=T)
    tmp  <- merge(tmp1, dat2, by="allele2", all.x=T, all.y=T)
    dat$allele.freq1 <- NULL
    dat$allele.freq2 <- NULL
    tmp1 <- merge(tmp, dat, by=c("allele1","allele2"), all.x=T, all.y=T)
    tmp1$haplo.freq[is.na(tmp1$haplo.freq)] <- 0
    # compute Dprime & Wn
    tmp1$d <- tmp1$haplo.freq - tmp1$allele.freq1*tmp1$allele.freq2
    tmp1$dprime.den <- rep(0,dim(tmp1)[1])
    tmp1$den.lt0 <- pmin( tmp1$allele.freq1*tmp1$allele.freq2, (1-tmp1$allele.freq1)*(1-tmp1$allele.freq2) )
    tmp1$den.ge0 <- pmin( (1-tmp1$allele.freq1)*tmp1$allele.freq2, tmp1$allele.freq1*(1-tmp1$allele.freq2) )
    tmp1$dprime.den[tmp1$d < 0] <- tmp1$den.lt0[tmp1$d < 0]
    tmp1$dprime.den[tmp1$d >=0] <- tmp1$den.ge0[tmp1$d >=0]
    tmp1$dprime <- tmp1$d/tmp1$dprime.den
    Dprime <- sum(abs(tmp1$dprime)*tmp1$allele.freq1*tmp1$allele.freq2)
    w  <- sum(tmp1$d^2/(tmp1$allele.freq1*tmp1$allele.freq2))
    Wn <- sqrt( w / (min( length(unique(tmp1$allele1)), length(unique(tmp1$allele2)) ) - 1) )  
    
    ALD <- data.frame(locus1, locus2, F.1, F.1.2, F.2, F.2.1, ALD.1.2, ALD.2.1, Dprime, Wn)
    cat("   ALD.x.y is asymmetric LD for locus x conditioned on locus y\n\n")
    return(ALD)
  } else {
    ALD <- data.frame(locus1, locus2, F.1, F.1.2, F.2, F.2.1, ALD.1.2, ALD.2.1)
    cat("   ALD.x.y is asymmetric LD for locus x conditioned on locus y\n\n")
    return(ALD)
  }
}
rsingle/asymLD_CRAN documentation built on May 28, 2019, 3:32 a.m.