msProcess: Protein Mass Spectra Processing

################################################
## S+Proteome Peak alignment functions
##
##  msAlign
##    msAlignCluster
##    msAlignGap
##    msAlignMRD
##    msAlignVote
##  
################################################

###
# msAlign
###

"msAlign" <- function(x, FUN="cluster",	mz.precision=0.003, 
  snr.thresh=10, ...)
{
  # extract peak.list
  peak.list <- x$peak.list
  if (is.null(peak.list)) {
  	if (is.null(x$peak.class)) {
      stop("The input ", deparse(substitute(x)), " has no peak.list attribute,",
        "which can be generated by running msPeak() with use.mean=FALSE.")
  	}
    else { # if peak.class has been obtained from the mean spectrum
      warning("The input ", deparse(substitute(x)), " has peak.class attribute,",
        "no peak alignment is needed any more. Returning the original object.")
      return(x)    	
    }
  }

	# check the number of spectra with peak detected
  n.peak.list <- length(peak.list)
  nopeak <- (sapply(peak.list, NROW) == 0)
  n.nopeak <- sum(nopeak)
  
  if (n.nopeak==n.peak.list) {
  	stop("no peak to align")
  } else if (n.nopeak != 0) {
  	peak.list <- peak.list[!nopeak]
  }

  # extract locations of peaks with large snr from the peak.list
  peak.all <- unlist(lapply(peak.list,
    function(x, snr.thresh) x[x$snr>=snr.thresh, "tick.loc"],
      snr.thresh=snr.thresh))
  if (length(peak.all)==0) stop("no strong peak to align, reduce snr.thresh")
  peak.count  <- table(peak.all)
  peak.unique <- as.integer(names(peak.count))
	
  # compute mz precision threshold in log scale
  # mass(1+mz.precision) -> log(mass) + log(1+mz.precision)
  mz    <- x$mz
  logmz <- log(mz)
  logmz.precision <- log(1 + mz.precision)
    	
  # group peaks based on their locations
  peak.group <- switch(match.arg(FUN, choices=c("cluster", "gap", "vote", "mrd")),
    cluster = msAlignCluster(logmz, logmz.precision, peak.unique),
    gap 	= msAlignGap(logmz, logmz.precision, peak.unique),
    vote 	= msAlignVote(logmz, logmz.precision, peak.unique, peak.count),
    mrd   = msAlignMRD(peak.unique, peak.count, 
      level=ifelse1(is.null(x$mrd), NULL, max(x$mrd$levels)), ...))

  # compute some summary statistics for each peak class
  tick.loc = tapply(peak.unique, peak.group, median)
if (is.R()) {
  tick.range = cbind(tapply(peak.unique, peak.group, min), tapply(peak.unique, peak.group, max))	
} else {
  tick.range = t(data.frame(tapply(peak.unique, peak.group, range)))
}

  peak.class <- cbind(
    tick.loc   = tick.loc,
    tick.left  = tick.range[,1],
    tick.right = tick.range[,2],
    tick.span  = tick.range[,2] - tick.range[,1] + 1,
    mass.loc   = mz[tick.loc],
    mass.left  = mz[tick.range[,1]],
    mass.right = mz[tick.range[,2]],
    mass.span  = mz[tick.range[,2]] - mz[tick.range[,1]])

  ord        <- order(peak.class[, "tick.loc"])
  peak.class <- peak.class[ord, ]
  dimnames(peak.class)[[1]] <- seq(nrow(peak.class))

  # add peak.class to the msSet object
  x <- msSet(x, peak.class=peak.class)

  # append history event
  eventHistory(x, "Peak Alignment"=
    list(process=FUN, mz.precision=mz.precision, snr.thresh=snr.thresh))
}

###
# msAlignCluster
###
	
# align peaks using 1-D clustering
"msAlignCluster" <- function(logmz, logmz.precision, peak.unique)
{
if (is.R()) {
  clust.tree <- hclust(dist(logmz[peak.unique], method="euclidean"),
    method="complete")
} else {
  clust.tree <- hclust(dist(logmz[peak.unique], method="euclidean"),
    method="compact")
}
  # peak.group
  cutree(clust.tree, h=2*logmz.precision) # +/- logmz.precision
}

###
# msAlignGap
###

# align peaks using between-peak distance
"msAlignGap" <- function(logmz, logmz.precision, peak.unique)
{ 
  diff.logmz <- diff(logmz[peak.unique])
  1 + c(0, cumsum(diff.logmz > logmz.precision))
}

###
# msAlignMRD
###

"msAlignMRD" <- function(peak.unique, peak.count, level=NULL, 
  wavelet="s8", reflect=TRUE, ...)
{
  # check input arguments
  max.level <- floor(logb(diff(range(peak.unique)),2))
  if (is.null(level))
    level <- 3
  checkRange(level, c(3,max.level))
  checkScalarType(wavelet,"character")
  checkVectorType(peak.unique,"integer")
  checkVectorType(peak.count,"integer")

	# form histogram of peak locations
	tick.max <- max(peak.unique)
	peak.hist <- rep(0, tick.max)
	peak.hist[peak.unique] <- peak.count
	
	# smooth the histogram ala an MRD
	peak.hist.smoothed <- wavMRDSum(peak.hist, wavelet=wavelet, levels=level-1,
	  keep.smooth=TRUE, keep.details=FALSE, xform="modwt", reflect=reflect)

  # calculate first derivative approximation of smoothed histogram
  crystal <- paste("d", level-2, sep="")
  dpeak.hist.smoothed <- wavShift(wavMODWT(peak.hist.smoothed,
    wavelet="haar", n.levels=level-2))[[crystal]]

  # find local minima
  inadirs <- zeroCross(dpeak.hist.smoothed, "positive")
  
  # form binary bin boundary vector: 1 indicates the
  # index location of the peak histogram bin boundary
  bin.marks <- rep(0, tick.max)
  bin.marks[inadirs] <- 1

  # return peak grouping
  cumsum(bin.marks)[peak.unique]
}

###
# msAlignVote
###
	
# align peaks using voting
"msAlignVote" <- function(logmz, logmz.precision, peak.unique, peak.count)
{
  n.peak <- length(peak.unique)
  logmzp = logmz[peak.unique]

  # find the left and right bounds of each peak in peak.unique
  left <- right <- rep(NA, n.peak)
		
  for (i in 1:n.peak) {
    logmz.current <- logmzp[i]
			
    index    <- 1:i
    left[i]  <- index[logmz.current-logmzp[index]<=2*logmz.precision][1]
	
    index    <- i:n.peak
    right[i] <- rev(index[logmzp[index]-logmz.current<=2*logmz.precision])[1]
  }

#  # NOTE: could get rid of the above for loop using outer() but
#  # 1. it could get too big if there are more than five thousand unique peaks
#  # 2. the computation is not necessarily faster	
#  diff.matrix <- outer(logmzp, logmzp, "-")
#  left <- apply(diff.matrix<=2*logmz.precision, 1, 
#    function(x) which(x)[1])
#  right <- apply(diff.matrix>=-2*logmz.precision, 1, 
#    function(x) rev(which(x))[1])  
	
  # identify peak groups iteratively by voting
  peak.group <- rep(NA, n.peak)
	
  peak.remain  <- 1:n.peak
  count.remain <- sum(peak.count)
  while (count.remain>0){
    my.count <- apply(matrix(peak.remain), 1,
      function(i, peak.count, left, right) sum(peak.count[left[i]:right[i]]),
        peak.count=peak.count, left=left, right=right)
          
    my.count.max <- max(my.count)
    index <- peak.remain[my.count==my.count.max][1] # one at a time
    peak.group[left[index]:right[index]] <- index
    peak.count[left[index]:right[index]] <- 0
    count.remain <- count.remain - my.count.max
    peak.remain  <- setdiff(peak.remain, left[index]:right[index])
  }
	
  peak.group
}