flowClean: flowClean

Documented in clean

require(flowCore)
require(sfsmisc)
require(changepoint)
require(bit)
require(grid)
Sys.setlocale('LC_ALL','C') ## Occasional multibyte string issue

geo.mean <- function(vec){
  return(exp(mean(log(vec))))
}

make_pops <- function(dF, cutoff, params, markers){
  dF <- dF[,params]
  cnames <- colnames(dF) 
  if (cutoff == "median"){ cutoff <- apply(dF, 2, function(x){ quantile(x, 0.5) })}
  else if (cutoff < 1){
    cutoff <- apply(dF, 2, function(x, v) { quantile(x, v) }, v=cutoff)
  }
  else { cutoff <- rep(cutoff, length(params)) }
  d2 <- do.call(cbind, lapply(1:ncol(dF), function(i, a, b){
    x <- a[,i]
    cutoff <- b[i]
    if ( (length(x[which(x > cutoff)])/length(x)) > 0.9){
      cutoff <- median(x)
    }
    x[which(x <= cutoff)] <- 0
    x[which(x != 0)] <- 1
    return(x)
  }, a=dF, b=cutoff))

  dfbit <- apply(d2, 1, as.bit)
  aphbit <- unique(dfbit)
  idx <- lapply(c(1:length(aphbit)), function(x, y, i){
    return(which(x == y[[i]]))
  }, x=dfbit, y=aphbit)

  lengths <- lapply(idx, length)
  return(list("aphbit"=aphbit, "lengths"=lengths, "idx"=idx))
}

name_maker <- function(aphbit, markers, full=TRUE){
  v <- lapply(aphbit, function(x, d){
    if (x == 0) { return(rep.int(0, times=d)) }
    else {
      dig <- t(digitsBase(x, base=2))[1:length(markers)]
      if (length(dig) == length(markers)){ return(dig) }
      else {
        dig <- c(rep.int(0, times=(length(markers) - length(dig))), dig)
        return(dig)
      }
     } },
    d=length(aphbit))

  newv <- lapply(v, function(w, x, full){
    ids <- !is.na(x)
    x[which(x == 1)] <- "+"
    x[which(x == 0)] <- "-"
    x <- lapply(seq_along(x), function(i, y, z){
      return(paste(z[i], y[i], sep=""))
    }, y=x, z=w)
    if (full == FALSE) { x <- x[ids] }
    x <- paste(x, collapse="")
    return(x)
  }, w=markers, full=full)
  return(newv)
}

get_pops <- function(dF, cutoff, params, bins, nCellCutoff, markers, nstable){
  poplist <- make_pops(dF, cutoff, params, markers)
  curr <- length(which(poplist$lengths >= nCellCutoff))
  ## try to find as many pops as possible
  if (length(which(poplist$lengths >= nCellCutoff)) < nstable){
    cut_grid <- rev(seq(0.05, 0.5, by=0.05))
    while (curr < 5 & length(cut_grid) >= 1){
      poplist <- make_pops(dF, cut_grid[1], params, markers)
      curr <- length(which(poplist$lengths >= nCellCutoff))
      cut_grid <- cut_grid[-1]
    }
    ## logic for seemingly redundant code:
    ## if the conditional evaluates and is true there are few pops
    ## if the conditional doesn't evaluate there are definitely few pops
    tryCatch({
        if(cut_grid == 0.05 & curr < nstable){
            warning(paste0("Less than ", nstable," populations identified by flowClean"))
        }
    }, error = function(e){
        warning(paste0("Less than ", nstable," populations identified by flowClean"))
    })
  }
  aphbit <- poplist$aphbit
  lengths <- poplist$lengths
  idx <- poplist$idx
  pops <- name_maker(aphbit, markers)

  counts <- lapply(1:length(idx), function(i, x, y){
    cc <- lapply(1:length(bins), function(j, w, v) {
      return(length(which(w %in% v[[j]])))
    }, w=x[[i]], v=bins)
    return(cc)
  }, x=idx, y=bins)

  popdf <- do.call(rbind, counts)
  perdf <- apply(popdf, 2, function(x){ N <- unlist(x); return(N/sum(N)) })
  perdf <- cbind(perdf, as.double(lengths))
  rownames(perdf) <- pops
  good.idx <- which(lengths >= nCellCutoff)
  perdf.trim <- perdf[good.idx,1:length(bins)]
  return(list("full"=perdf, "trim"=perdf.trim, "pops"=popdf))
}

clean <- function(fF, vectMarkers, filePrefixWithDir, ext, binSize=0.01, nCellCutoff=500,
                  announce=TRUE, cutoff="median", diagnostic=FALSE, fcMax=1.3,
                  returnVector=FALSE, nstable=5){

  if (nrow(fF) < 30000){
      if (announce){
          print(paste("flowClean detected too few cells in ",
                      keyword(fF, "FILENAME")$FILENAME, ".", sep=""))
      }
      warning("Too few cells in FCS for flowClean.")
      GoodVsBad <- rep.int(0, times=nrow(fF))
      if (returnVector == TRUE){ return(GoodVsBad) }
      outFCS <- makeFCS(fF, GoodVsBad, filePrefixWithDir, 0, nCellCutoff, ext,
                        stablePops=NULL)
      return(outFCS)
  }

  markers <- parameters(fF)$name
  markers <- as.vector(markers)

  numbins <- ceiling(1/binSize)
  time.id <- grep("time", colnames(fF), ignore.case = TRUE)
  if (length(time.id) > 0) {
      time <- exprs(fF)[, time.id]
      ## Have to deal with this here
      ## if (mean(time) == time[1]) {
      ##     time <- 1:nrow(fF)
      ## }
  }
  else {
      if (announce){
          print(paste("flowClean detected no Time parameter in ",
                      keyword(fF, "FILENAME")$FILENAME, ".", sep=""))
      }
      warning("No Time Parameter Detected")
      GoodVsBad <- rep.int(0, times=nrow(fF))
      if (returnVector == TRUE){ return(GoodVsBad) }
      outFCS <- makeFCS(fF, GoodVsBad, filePrefixWithDir, 0, nCellCutoff, ext,
                        stablePops=NULL)
      return(outFCS)
  }
  # make sure time starts at 0
  if (min(time) > 0){ time <- time - min(time) }
  numOfEvents <- length(time)
  lTime <- time[numOfEvents]
  stepB <- lTime * binSize
  bins <- lapply(c(1:numbins), function(i, x, y, z){
    vec <- which(x >= ((i - 1) * y) & (x < i * y))
    if (i == z){ vec <- c(vec, which(x >= (i * y))) }
    return(vec)
  }, x=time, y=stepB, z=numbins )
  binVector <- unlist(lapply(c(1:numbins), function(i, x){
    rep(i, length(unlist(x[[i]]))) }, x=bins))
  ### NOTE: implicitly assuming vectMarkers is numeric
  out <- get_pops(exprs(fF), cutoff, params=vectMarkers, bins=bins,
                  nCellCutoff, markers[vectMarkers], nstable)
  full <- out$full
  pops <- out$pops
  out <- out$trim
  dxVector <- binVector
  ## missing cells really screw everything up
  pops <- apply(pops, c(1, 2), as.numeric)
  sums <- apply(pops, 2, sum)
  weirds <- lapply(c(37, 42, 51), function(xx){
      weird <- getUnlikely(sums, xx)
  })
  weird <- sort(unique(unlist(weirds)))
  if (length(weird) > 0){
      out <- out[,-weird]
  }
  out <- cen.log.ratio(out)
  norms <- lp(out)
  ## was previously penalty=AIC, but with recent updates this works
  ## better/does what AIC used to
  ## what is the indexing here?  
  pts <- cpt.mean(norms, method="PELT", penalty="Manual", pen.value=1)
  bad <- getBad(pts, fcMax)

  ## what kind of bad do we report?
  if (!is.null(bad) | length(weird) > 0){
    if (length(weird) > 0){
        bad <- unique(c(unlist(fix.weird(bad, weird, numbins)), weird))
    }
    else if (bad[length(bad)] != numbins){
        ## changepoint library frequently off by 1
        bad <- c(min(bad), unique(bad+1))
    }
    bad <- sort(bad)
    dxVector[which(dxVector %in% bad)] <- runif(length(which(dxVector %in% bad)),
                                                min=10000, max=20000)
    
    if (announce){
      print(paste("flowClean has identified problems in ",
                  keyword(fF, "FILENAME")$FILENAME, " with ", toString(bad),  ".", sep=""))
    }
    if (diagnostic){
      png(paste(filePrefixWithDir,sep=".", numbins, nCellCutoff,
                "clr_percent_plot", "png"), type="cairo",
          height=1000, width=1000)
      diagnosticPlot(out,"CLR", bad)
      dev.off()
    }
    GoodVsBad <- as.numeric(dxVector)
    if (returnVector){ return(GoodVsBad) }
    

    outFCS <- makeFCS(fF, GoodVsBad, filePrefixWithDir, numbins, nCellCutoff,
                      ext, stablePops=out)

    return(outFCS)
  }
  else{
    if (announce){
        print(paste("flowClean detected no problems in ",
                    keyword(fF, "FILENAME")$FILENAME, ".", sep=""))
    }
    if (diagnostic){
      png(paste(filePrefixWithDir,sep=".", numbins, nCellCutoff,
                "clr_percent_plot", "png"), type="cairo",
          height=1000, width=1000)
      diagnosticPlot(out,"CLR", bad)
      dev.off()
   }

    GoodVsBad <- as.numeric(dxVector)
    if (returnVector){ return(GoodVsBad) }
    outFCS <- makeFCS(fF, GoodVsBad, filePrefixWithDir, numbins, nCellCutoff,
                      ext, stablePops=out)
    return(outFCS)
  } 
}

makeSeries <- function(vec){
    out <- list()
    out[[1]] <-  holder <- vec[1]
    counter <- 1
    devnull <- lapply(diff(vec), function(xx){
        holder <<- holder + xx
        if (xx != 1){
            counter <<- counter + 1
            out[[counter]] <<- holder
        }
        else {
            out[[counter]] <<- c(out[[counter]], holder)
        }
    })
    out
}

fix.weird <- function(bad, weird, maxBin){
    
    if(is.null(bad)){ return(weird) }
    bad <- c(min(bad)-1, bad)
    ser.b <- makeSeries(bad)
    ser.w <- makeSeries(weird)
    ## fix persistent off-by-one error
    ## need to include minimum bad
    ser.b <- lapply(ser.b, function(xx){
        ## with gaps in the data, we also miss the start of problems
        if (min(xx) > 1){ xx <- c(min(xx) - 1, xx) }
        if (max(xx) < maxBin){ return(xx + 1) }
        else{ xx }
    })
    ## let's find the max per weird series and place it before
    ## a bad series if possible
    ## creates a vector whose indices are those of
    ## 'weird series' and whose entries index 'bad series'
    shifts <- sapply(sapply(ser.w, max), function(xx){
        which(sapply(ser.b, min) > xx)[1]
    })
    ## occasionally the weird bin is the last bin
    if (any(!is.na(shifts))) {
      devnull <- sapply(1:length(shifts[!is.na(shifts)]), function(ii){
        bad.id <- shifts[ii]
        bad.offset <- length(ser.w[[ii]])
        ser.b[[bad.id]] <<- ser.b[[bad.id]] + bad.offset
      })
    }
    ## problem is not here!
    ser.b
}

loglik <- function(data, l){
    n <- length(data)
    sum(data, na.rm=TRUE) * log(l) - n*l
}

getUnlikely <- function(sums, seed){
   set.seed(seed)
   idx <- sample(1:length(sums))
   names(sums) <- paste0("V", 1:length(sums))
   sums <- sums[idx]
   cll <- sapply(1:length(sums), function(ii){
       if (ii == 1){return(loglik(sums[ii], sums[ii]))}
       l <- loglik(sums[1:(ii-1)], mean(sums[1:(ii-1)]))
       lnew <- loglik(sums[1:ii], mean(sums[1:ii]))
       return(lnew - l)
   })
   weird <- which(cll <= 0)
   weird <- as.numeric(gsub("V", "", names(sums)[weird]))
   weird
}


makeFCS <- function(fF, GoodVsBad, filePrefixWithDir, numbins, nCellCutoff, ext, stablePops){
  ex <- exprs(fF)
  rs <- attr(exprs(fF), "ranges")
  rs <- c(rs, rs[1])
  ex <- cbind(ex, GoodVsBad)
  attr(ex, "ranges") <- rs
  NN <- as.numeric(description(fF)["$PAR"]) + 1
  names(dimnames(ex)[[2]]) <- sprintf("$P%sN", 1:NN)
  pnr <- paste0("$P", NN, "R")
  pnb <- paste0("$P", NN, "B")
  pne <- paste0("$P", NN, "E")
  pnn <- paste0("$P", NN, "N")
  pns <- paste0("$P", NN, "S")
  flowCorePnRmax <- paste0("flowCore_$P", NN, "Rmax")
  flowCorePnRmin <- paste0("flowCore_$P", NN, "Rmin")

  o <- parameters(fF)@data
  o[length(o[,1]) + 1,] <- c("GoodVsBad", "GoodVsBad", as.numeric(description(fF)$`$P1R`), 0, as.numeric(description(fF)$`$P1R`) - 1)
  outFCS <- new("flowFrame", exprs=ex, parameters=new("AnnotatedDataFrame",o), description=description(fF))
  description(outFCS)$FILENAME <- paste(filePrefixWithDir,sep=".", numbins, nCellCutoff, "revised", ext)
  description(outFCS)[pnr] <- max(20000, description(outFCS)$`$P1R`)
  description(outFCS)[pnb] <- description(outFCS)$`$P1B`
  description(outFCS)[pne] <- "0,0"
  description(outFCS)[pnn] <- "GoodVsBad"
  description(outFCS)[pns] <- "GoodVsBad"
  description(outFCS)$`$PAR` <- NN
  description(outFCS)$`StablePops` <- nrow(stablePops)
  description(outFCS)$`nBins` <- numbins
  description(outFCS)$`nCellCutoff` <- nCellCutoff
  description(outFCS)[flowCorePnRmax] <- max(20000, description(outFCS)$`flowCore_$P1Rmax`)
  description(outFCS)[flowCorePnRmin] <- 0
  parameters(outFCS)@data$range <- as.numeric(parameters(outFCS)@data$range)
  parameters(outFCS)@data$minRange <- as.numeric(parameters(outFCS)@data$minRange)
  parameters(outFCS)@data$maxRange <- as.numeric(parameters(outFCS)@data$maxRange)
  outFCS
}  

diagnosticPlot <- function(dF, ylab, bad){
  bins <- ncol(dF)
  pops <- nrow(dF)
  xx <- 1:bins
  plot(xx, dF[1,],ylim=c(min(dF), max(dF)),xlab="Time Bins", ylab=ylab, col=ifelse(xx %in% bad, 2, 1), pch=20)
  for (i in 2:pops){
    points(1:bins,dF[i,], col=ifelse(xx %in% bad, 2, 1), pch=20)
  }
  abline(h=0, col=4)
}

cen.log.ratio <- function(dF, minim=1e-7){
  #element-wise statistic
  sums <- unlist(apply(dF, 1, sum))
  ta <- minim/max(sums)
  n <- nrow(dF)

  #element-wise statistic
  m <- max(unlist(apply(dF, 1, function(x){return(length(x[which(x == 0)]))})))
  if (n == m) { n <- n+1 }   
  d <- (n^2*ta)/((m+1)*(n-m))
  Ts <- (d*m*(m+1))/(n^2)

  #modified Aitchison of Fry et al
  rev_df <- apply(dF, c(1,2), function(x){
    if (x == 0){ x <- ta }
    else { x <- x - (x*Ts) }; return(x)
  })
  rev_df <- apply(rev_df, 2, function(x) {return(log(x/geo.mean(x)))})

  return(rev_df)
}

getCPTs <- function(cpt){
    methodFound <- slotFound <- FALSE
    out <- NULL
    if (length(cpts(cpt)) > 0) {
        methodFound <- TRUE
        out <- cpts(cpt)
    }
    else if (length(cpt@cpts)){
        slotFound <- TRUE
        out <- cpt@cpts
    }
    if (methodFound | slotFound){ return(out) }
    else { return(NULL) }
}

getBad <- function(cpt, k=1.3){
    pts <- getCPTs(cpt)
    if (length(pts) == 0) {
        bad <- NULL
        return(bad)
    }
    ps <- ps2 <- pts
    ms <- param.est(cpt)$mean
    len <- length(data.set(cpt))

    binList <- cl.iter(pts, len)

    if (length(binList) != length(ms)){
        ms <- lapply(binList, function(xx){ return( mean(data.set(cpt)[unlist(xx)]) ) })
    }

    lens <- sapply(binList, length)
    big <- ms[which(lens == max(lens))]
    FCs <- unlist(ms)/as.numeric(big)
    bad <- binList[which(FCs > k)]
    unlist(bad)
}

bin.unlist <- function(bins){
   cls <- lapply(bins, is.list)
   bins2 <- list()
   for (ii in 1:length(cls)){
       if (cls[[ii]]){
           vv <- lapply(bins[[ii]], unlist)
           bins2 <- c(bins2, vv)
       }
       else { bins2 <- c(bins2, bins[ii]) }
   }
   bins2
}

posList <- function(bins){
    ids <- sapply(bins, function(xx){
        if (length(xx) == 1 && xx == -1){ return(FALSE) }
        else { return(TRUE) }
    })
    return(bins[ids])
}

cl.iter <- function(vectr, max.x){
   bins <- lapply(vectr, cl, vectr=vectr, max.x=max.x)
#   browser()
   bins <- posList(bin.unlist(bins))
   bins
}
 
cl <- function(x, vectr, max.x){
  id <- which(x == vectr)
  if (x == 1){
    return(x)
  }
  ## if the first CPT is not 1, and the 2nd CPT is CPT.1 + 1, return 2 entries.
  if (x == vectr[1] & vectr[1] != 1 & length(vectr) != 1){
    if ((vectr[2] - x) == 1){ return(c(1:(x - 1),x)) }
    else { return(1:x) }
  }
  if (x == vectr[length(vectr)]){
    if (x != max.x){ return((x+1):max.x) }
    else if (x == max.x & length(vectr) > 1){ return((vectr[id-1] + 1):x) }
    else if (x == max.x & length(vectr) == 1){ return(c(unlist(1:(x-1)), unlist(x))) }
    else { return(x) }
  }
  else{
    if (((vectr[id+1] - x) == 1) & ((vectr[id+1] - vectr[id-1]) ==  2)){ return(x) }
    else if (((vectr[id+1] - x) == 1) & ((vectr[id+1] - vectr[id-1]) >  2)){
      return(-1) }
    else if (((vectr[id+1] - x) > 1) && ((vectr[id+1] - vectr[id-1]) >  2) &&
             ((id - 2) > 0) && ((x - vectr[id-1]) > 1)  &&
             ((x - vectr[id-2]) >  2)){
        return((x+1):(vectr[id+1])) }
    else { return(x:(vectr[id+1])) }
  }
}


lp <- function(dF, p=NULL){
  xnorms <- apply(dF, 2, function(x){
    pos <- x[which(x > 0)]
    if (is.null(p)){
      p = length(pos)
    }
    lnorm <- sum(abs(pos)^p)^(1/p)
  })
  return(xnorms)
}