binner: Read fsa fragment files from an ABI Genetic Analzyer

Documented in aflpCombine binSet

##' @rdname binSet
##' @title Apply bins to peak tables, creating arrays of AFLP data
##'
##' @description Functions for converting peak tables to AFLP data arrays,
##' and combining multiple arrays for analysis.
##' 
##' @details \code{binSet} takes a peak table, applies the
##' given bins, and returns a 3D aflp data array.
##'
##' \code{aflpCombine} takes two aflp data arrays, as produced from
##' \code{binSet}, and combines them into a single array
##'
##' @param pt A peak table, as generated by \code{fsa2PeakTab}
##' 
##' @param bins A bin table, as produced by \code{fsaRGbin}
##' 
##' @param pref A prefix to append to bin names, to allow for later
##' combining data from different primer pairs
##' 
##' @param hthresh The height threshold to score peaks as 'present'. You
##' can revisit this decision later in the process. 
##'
##' @param aflp1 aflp data arrays, as produced by \code{binSet}
##' @param aflp2 aflp data arrays, as produced by \code{binSet}
##' 
##' @return \code{binSet} returns a 3D array. The rows are
##' samples, the columns are fragments, and the third dimension
##' splits the array into allele calls ("alleles"), peak heights
##' ("heights") and fragment size in basepairs ("sizes"). Thus,
##' to retrieve the presence/absence matrix of allele calls, use:
##' \code{aflp[ , , "alleles"]}, where \code{aflp} is the object
##' returned by \code{binSet}.
##'
##' Other data summaries are readily available by creative slicing and
##' dicing of the 3D array.
##'
##' @author Tyler Smith
##'
##' @export
##' 
binSet <- function(pt, bins, pref, hthresh = 100){
  pt <- pt[order(pt$bp), ]
  bins <- bins[order(bins[,1]), ]
  sizes <- pt$bp
  samples <- levels(pt$sample.name)
  size.mat <- matrix(NA, ncol = nrow(bins), nrow = length(samples))
  height.mat <- size.mat
  row.names(size.mat) <- samples
  row.names(height.mat) <- samples
  colnames(size.mat) <- bins[,1]
  colnames(height.mat) <- bins[,1]
  
  bin <- 1
  peak <- 1
  while(bin <= nrow(bins) & peak <= length(sizes)) {
    if(sizes[peak] < bins[bin, 1]){     # peak is too small, move to next peak
      peak <- peak + 1
    }
    else if (sizes[peak] >= bins[bin, 2]){ # peak is bigger than bin, move to next bin
      bin <- bin + 1
    }
    else if (sizes[peak] >= bins[bin, 1] & sizes[peak] < bins[bin, 2]) { # count it! 
      name <- pt$sample.name[peak]
      height <- pt$height[peak]
      size <- pt$bp[peak]
      size.mat[name, bin] <- size
      height.mat[name, bin] <- height
      peak <- peak + 1
    }
  }

  res <- array(dim = c(dim(height.mat), 3),
               dimnames = list(sample = rownames(height.mat),
                 fragment = colnames(height.mat),
                 values = c("alleles", "heights", "sizes"))) 
  res[,,2] <- height.mat
  res[,,3] <- size.mat

  alleles <- res[, , "heights"]
  alleles[is.na(alleles)] <- 0
  alleles[alleles >= hthresh] <- 1
  alleles[alleles > 1] <- 0
  res[, , 1] <- alleles
  ##print(paste(pref, round(colMeans(res[,,"sizes"], na.rm = TRUE), 1), sep = "."))
  colnames(res) <- paste(pref, round(colMeans(res[,,"sizes"], na.rm = TRUE), 1), sep = ".")
  return(res)
}

#' @rdname binSet
#' @export
#' 
aflpCombine <- function(aflp1, aflp2) {
  common <- intersect(rownames(a), rownames(b))
  return(abind::abind(a[common,,], b[common,,], along = 2))
}


## unbin <- function(gm){
##   levs <- levels(as.factor(row.names(gm)))

##   res <- data.frame(sample.name = factor(levels = levs),
##                     height = numeric(), Size = numeric(), bin = numeric())
  
##   for (i in 1:ncol(gm)) {
##     targets <- complete.cases(gm[ , i, "sizes"])
##     samples <- row.names(gm)[targets]
##     heights <- gm[targets, i, "heights"]
##     sizes <- gm[targets, i, "sizes"]
##     bin <- rep(i, sum(targets))
##     tmpdf <- data.frame(sample.name = factor(samples, levels = levs), height = heights,
##                         Size = sizes, bin = bin) 
##     tmpdf <- tmpdf[order(tmpdf$bp), ]
##     res <- rbind(res, tmpdf)
##   }

##   rownames(res) <- NULL
  
##   return(res)
## }