zoom.plot: Modular zoom plots for genomic data

require(parallel)


# Reverse the coding of the major and minor alleles
complement <- function( marker ) {
  return ( 2 - marker )
}

calc.ld.matrix <- function(X, mc.cores, ld.fun=linkage.disequilibrium.D.prime) {
  
  if ( is.null(ld.fun) ) ld.fun <- linkage.disequilibrium.D.prime
  
  if ( min(X, na.rm=TRUE) < 0 && max(X, na.rm=TRUE) < 2 ) {
    warn('Converting -1/0/1 coded genotypes to 0/1/2')
    X <- X+1
  }
  
  # X need to be coded as 0, 1, 2
  stopifnot(min(X, na.rm=TRUE) >= 0, max(X, na.rm=TRUE) <= 2)
  
  # Check for columns with NA values
  na.indices <- lapply(1:ncol(X), function (j) { which(is.na(X[, j])) })
  
  # Complement all neccessary columns 
  for (j in 1:ncol(X)) {
    if ( length(na.indices[[j]]) == 0) {
      p.A <- sum(X[, j])/ (2*nrow(X))
    } else {  
      p.A <- sum(X[-na.indices[[j]], j])/ (2*nrow(X))
    }			
    
    if ( p.A < 0.5 ) {
      X[, j] <- complement( X[, j] )
    }
  }
  
  n.geno <- ncol(X)
  # Make a list of all of all possible pairs of markers
  idx.pairs <- data.frame('a'=rep(1:n.geno, n.geno), 'b'=rep(1:n.geno, each=n.geno))
  # Keep only the pairs where a <= b
  idx.pairs <- idx.pairs[idx.pairs$a <= idx.pairs$b, ]
  
  # Remove NA values and calls linkage.disequilibrium.D.prime() for pair i
  do.pair <- function(i) {
    a <- idx.pairs$a[i]
    b <- idx.pairs$b[i]
    # exclude indices that are NA in either marker a or b
    exclude <- union(na.indices[[a]], na.indices[[b]])
    
    if ( length(exclude) == nrow(X))
      return ( 0 )
    
    if ( length(exclude) == 0) {			
      marker1 <- X[, a]
      marker2 <- X[, b]
    } else {
      marker1 <- X[-exclude, a]
      marker2 <- X[-exclude, b]
    }
    return ( ld.fun(marker1, marker2) )
  }
  
  # use MC.CORES to calculate D' for every pair
  D.prime.values <- mclapply(1:nrow(idx.pairs), do.pair, mc.cores=mc.cores)
  
  # Convert list into a matrix
  D.prime <- matrix(-1, n.geno, n.geno)
  for ( i in 1:nrow(idx.pairs) ) {
    D.prime[idx.pairs$a[i], idx.pairs$b[i]] <- D.prime.values[[i]]
    D.prime[idx.pairs$b[i], idx.pairs$a[i]] <- D.prime.values[[i]]
  }
  
  return ( D.prime ) 
}

# The log likelihood function for pAB for calculating D' 
# adaptived from LD function in library(genetics)
log.L.linkage.disequilibrium <- function(pAB, p.A, p.B, counts) {
  pAb <- p.A - pAB
  paB <- p.B - pAB
  pab <- 1 - p.A - p.B + pAB
  
  return (counts[1] * log(pab) +
            counts[2] * log(paB) +
            counts[3] * log(pAb) +
            counts[4] * log(pAB) +
            counts[5] * log(pAB*pab + pAb*paB))
}

# calculates D' much faster than the LD function in library(genetics)
# code is adapted from library(genetics)
#' @export
linkage.disequilibrium.D.prime <- function(X1, X2) {
  
  if ( all(X1==X2) ) {
    return ( 1 )
  }
  
  p.A <- sum(X1) / (2*length(X1))
  p.B <- sum(X2) / (2*length(X2))
  
  if ( p.A < 0.5 ) {
    p.A <- 1 - p.A
    X1 <- complement(X1)
  }
  
  if ( p.B < 0.5 ) {
    p.B <- 1 - p.B
    X2 <- complement(X2)
  }
  
  p.a <- 1 - p.A
  p.b <- 1 - p.B
  
  # Make a contingency table for makers X1 and X2
  N <- table(factor(X1, levels=0:2), factor(X2, levels=0:2))
  
  counts <- c()
  counts[1] <- 2*N[1, 1] + N[1, 2] + N[2, 1] # p(ab)
  counts[2] <- 2*N[1, 3] + N[1, 2] + N[2, 3] # p(aB)
  counts[3] <- 2*N[3, 1] + N[2, 1] + N[3, 2] # p(Ab)
  counts[4] <- 2*N[3, 3] + N[3, 2] + N[2, 3] # p(AB)
  counts[5] <- N[2, 2] # p(AB)p(ab) + p(Ab)*p(aB)
  
  Dmin <- max(-p.A*p.B, -p.a*p.b)
  Dmax <- min(p.A*p.b, p.B*p.a);
  
  pmin <- p.A*p.B + Dmin;
  pmax <- p.A*p.B + Dmax;
  
  # Find p(AB) by maximizing log-likelihood function
  solution <- tryCatch(optimize(log.L.linkage.disequilibrium,
                                lower=pmin + .Machine$double.eps,
                                upper=pmax - .Machine$double.eps,
                                maximum=TRUE,
                                p.A=p.A, p.B=p.B, counts=counts),
                       error=function(e) { return ( NA ) })
  
  if ( any(is.na(solution)) ) {
    return ( NA )
  }
  
  pAB <- solution$maximum
  
  D <- pAB - p.A*p.B
  
  if ( D > 0 ) {
    D.prime <- D / Dmax
  } else {
    D.prime <- D / Dmin
  }
  
  return ( D.prime )
}

#' @export
linkage.disequilibrium.r.squared <- function(X1, X2) {
  if ( all(X1==X2) ) {
    return ( 1 )
  }
  
  p.A <- sum(X1) / (2*length(X1))
  p.B <- sum(X2) / (2*length(X2))
  
  if ( p.A < 0.5 ) {
    p.A <- 1 - p.A
    X1 <- complement(X1)
  }
  
  if ( p.B < 0.5 ) {
    p.B <- 1 - p.B
    X2 <- complement(X2)
  }
  
  p.a <- 1 - p.A
  p.b <- 1 - p.B
  
  # Make a contingency table for makers X1 and X2
  N <- table(factor(X1, levels=0:2), factor(X2, levels=0:2))
  
  counts <- c()
  counts[1] <- 2*N[1, 1] + N[1, 2] + N[2, 1] # p(ab)
  counts[2] <- 2*N[1, 3] + N[1, 2] + N[2, 3] # p(aB)
  counts[3] <- 2*N[3, 1] + N[2, 1] + N[3, 2] # p(Ab)
  counts[4] <- 2*N[3, 3] + N[3, 2] + N[2, 3] # p(AB)
  counts[5] <- N[2, 2] # p(AB)p(ab) + p(Ab)*p(aB)
  
  Dmin <- max(-p.A*p.B, -p.a*p.b)
  Dmax <- min(p.A*p.b, p.B*p.a);
  
  pmin <- p.A*p.B + Dmin;
  pmax <- p.A*p.B + Dmax;
  
  # Find p(AB) by maximizing log-likelihood function
  solution <- tryCatch(optimize(log.L.linkage.disequilibrium,
                                lower=pmin + .Machine$double.eps,
                                upper=pmax - .Machine$double.eps,
                                maximum=TRUE,
                                p.A=p.A, p.B=p.B, counts=counts),
                       error=function(e) { return ( NA ) })
  
  if ( any(is.na(solution)) ) {
    return ( NA )
  }
  
  pAB <- solution$maximum
  
  D <- pAB - p.A*p.B
  
  r.squared <- D^2 / (p.A * p.a * p.B * p.b)
  
  return ( r.squared )
}

#' @export
ld.matrix <- function(genotypes, mc.cores, ld.fun=NULL, prune=FALSE, prune.window=50, prune.threshold=0.5, prune.ld.fun=NULL) {
  require('parallel')
  stopifnot(max(genotypes, na.rm=TRUE) <= 2,
            min(genotypes, na.rm=TRUE) >= 0)
  
  if ( missing(mc.cores) ) {
    mc.cores <- detectCores()
  }
  
  if ( prune ) {
    
    cat('Pruning SNPs for LD matrix...\n')
    if ( is.null(prune.ld.fun) )
      prune.ld.fun <- linkage.disequilibrium.r.squared
    window <- prune.window
    threshold <- prune.threshold
    
    n.snps <- ncol(genotypes)
    
    get.ld.sibs <- function(i) {
      min.idx <- max(1, i-window)
      max.idx <- min(n.snps, i+window)
      
      up.pairs <- down.pairs <- NULL
      if ( min.idx < i )
        down.pairs <- data.frame(a=min.idx:(i-1), b=i)
      if ( max.idx > i )
        up.pairs <- data.frame(a=(i+1):max.idx, b=i)
      
      pairs <- rbind(down.pairs, up.pairs)
      ld.vals <- sapply(1:nrow(pairs), function (x) with(pairs[x, , drop=FALSE],
                                                         prune.ld.fun(genotypes[, a], genotypes[, b])))
      pairs$a[ld.vals>threshold]
    }
    
    all.ld.sibs <- mclapply(1:n.snps, get.ld.sibs, mc.cores=detectCores())
    n.sibs <- sapply(all.ld.sibs, length)
    
    check.snps <- order(n.sibs, decreasing=TRUE)[1:sum(n.sibs>0)]
    sub.snp <- 1:n.snps
    sel.snp <- rep(0, n.snps)
    
    for ( i in check.snps ) {
      if ( sel.snp[i] != 0 ) next
      sel.snp[i] <- 1
      sibs <- all.ld.sibs[[i]]
      sib.sel <- sel.snp[sibs]
      sibs <- sibs[sib.sel < 1]
      sub.snp[sibs] <- i
      sel.snp[sibs] <- -1
    }
    
    selected <- which(sel.snp>=0)
    sel.sub.snp <- match(sub.snp, selected)
    
    cat('Calculating LD for', length(selected), 'of', ncol(genotypes), 'genotypes using', mc.cores, 'cores...\n')
    LDM.pruned <- calc.ld.matrix(genotypes[, selected], mc.cores, ld.fun=ld.fun)
    LDM <- (LDM.pruned[, sel.sub.snp])[sel.sub.snp, ]
    
  } else {
    cat('Calculating LD for', ncol(genotypes), 'genotypes using', mc.cores, 'cores...\n')
    LDM <- calc.ld.matrix(genotypes, mc.cores, ld.fun=ld.fun)
  }
  
  return ( LDM )
}

#' @export
ld.plot <- function(ld.matrix, snp.positions, start.pos, end.pos, ld.colors, drawscale=TRUE, scale.cex=0.6) {
  
  if ( missing(ld.colors) ) {
    ld.colors <- rgb(1, 99:0/99, 99:0/99)
  }
  stopifnot(ncol(ld.matrix)==length(snp.positions),
            nrow(ld.matrix)==length(snp.positions))
  
  if ( is.unsorted(snp.positions) ) warning('SNPs positions are not in order!')
  
  plot(0, type='n', ylim=c(min(snp.positions)-max(snp.positions), 0), 
       xlim=range(snp.positions),
       axes=FALSE, bty='n', xlab='', ylab='', yaxs='i')
  
  # Calculate positions for SNPs
  num.snps <- length(snp.positions)
  x.pos <- snp.positions
  x.pos.mids <- (x.pos[-1] + x.pos[-num.snps])/2
  x.pos.left <- c(x.pos[1], x.pos.mids)
  x.pos.right <- c(x.pos.mids, x.pos[num.snps])
  
  # Loop through each position in the LDM array and draw polygon of appropriate shade
  for ( i in 1:num.snps ) {
    for ( j in 1:i ) {
      fill.color <- ld.colors[round(abs(ld.matrix[i, j]) * (length(ld.colors) - 1)) + 1]
      # Draw a diamond shape 
      polygon(c(x.pos.left[i]+x.pos.left[j],
                x.pos.right[i]+x.pos.left[j],
                x.pos.right[i]+x.pos.right[j],
                x.pos.left[i]+x.pos.right[j])/2,
              -c(x.pos.left[i]-x.pos.left[j],
                 x.pos.right[i]-x.pos.left[j],
                 x.pos.right[i]-x.pos.right[j],
                 x.pos.left[i]-x.pos.right[j]),
              col=fill.color, border=fill.color)
      # NOTE: not drawing polygon borders (i.e. border=NA) causes jagged,
      # pixelated edges. So we draw a border in the same color as the fill
    }
  }
  
  if ( drawscale ) {
    draw.scale(ld.colors, c(0, 1), pos='bottomleft')
  }
}