R/cytosee_preprocess.R

#' Initialize and setup the iflow obj
#'
#' Initializes the cytosee object and some optional filtering
#' @param fcs.data Raw input data
#' @param project Project name (string)
#' @param transform_method Method for cell normalization. Default is no normalization.
#' @return Returns a cytosee object with the raw data stored in object@@fcs.data.
#' @importFrom utils packageVersion
#' @import plyr
#' @import reshape2
#' @import scales
#' @export
#'
#' @examples
#' iflowobj <- initiflow(fcs.data = fcsname)
#' iflowobj
#'
#'
initiflow <- function(
  fcs.data,
  projectname = "cytoseeProject"
) {
  cytosee.version <- packageVersion("cytosee")
  object <- new(
    Class = "cytosee",
    fcs.data = fcs.data,
    projectname = projectname,
    version = cytosee.version
  )
  object.fcs.data <- object@fcs.data
  return(object)
}


#' @importFrom flowCore exprs
#' @export
para_setting <- function(object,cores=1, channel.use = 0, transform_method ="log10"){
    object@cores <- cores
    object@transform_method<-transform_method
    if(channel.use == 0){
      channel.use <- 1:length(object@fcs.data@exprs[1,])
    }
    object@channel.use <- channel.use
    return(object)

}


#' @importFrom flowCore exprs





############################################### flowAI function integration ##########################################################


##### flowAI function #####

# Guess which channel captures time in a exprs, flowFrame or flowset
findTimeChannel <- function(xx) {
  time <- grep("^Time$", colnames(xx), value = TRUE, ignore.case = TRUE)[1]
  if (is.na(time)) {
    if (is(xx, "flowSet") || is(xx, "ncdfFlowList"))
      xx <- exprs(xx[[1]]) else if (is(xx, "flowFrame"))
        xx <- exprs(xx)
      cont <- apply(xx, 2, function(y) all(sign(diff(y)) >= 0))
      time <- names(which(cont))
  }
  if (!length(time) || length(time) > 1)
    time <- NULL
  return(time)
}

# Check if the Fcs file is ordered according to time otherwise it order it.
ord_fcs_time <- function(x, timeCh= "Time"){
  xord <- order(exprs(x)[, timeCh])

  if( !identical(xord, 1:nrow(x)) ){
    warning(paste0("Expression data in the file ", basename(keyword(x)$FILENAME),
                   " were not originally ordered by time."))
    params <- parameters(x)
    keyval <- keyword(x)
    sub_exprs <- exprs(x)[xord, ]
    newx <- flowFrame(exprs = sub_exprs, parameters = params,
                      description = keyval)
    return(newx)
  }else{
    return(x)
  }
}


flow_rate_bin <- function(x, second_fraction = 0.1, timeCh = "Time", timestep = 0.1){

  xx <- exprs(x)[, timeCh]
  idx <- c(1:nrow(x))
  lenx <- length(xx)                                                  # num of time ticks

  endsec <- ceiling(timestep * max(xx))                 # total seconds of the experiment
  tbins <- seq(0, endsec/timestep, by = second_fraction/timestep)             # time bins
  secbin <- seq(0, endsec, by = second_fraction)               # bin expressed in seconds
  minbin <- round(secbin/60, 3)                                # bin expressed in minutes
  tbCounts <- c(0, hist(xx, tbins, plot = FALSE)$counts)  # number of events per time bin

  nrBins <- length(tbins) - 1
  expEv <- lenx/(nrBins)           # median(tbCounts) # expected number of events per bin
  binID <- do.call(c, mapply(rep, x = 1:length(tbCounts), times = tbCounts,
                             SIMPLIFY = FALSE))

  if (length(idx) != length(binID))
    stop("length of cell ID not equal length of bin ID")

  flowRateData <- list(frequencies = cbind(tbins, minbin, secbin, tbCounts),
                       cellBinID = data.frame(cellID = idx, binID = binID),
                       info = data.frame(second_fraction = second_fraction,
                                         expFrequency = expEv, bins = nrBins))
  return(flowRateData)
}


flow_rate_plot <- function(flowRateData, lowerRateThres, upperRateThres,
                           lowerTimeCut, UpperTimeCut) {
  if(is.null(lowerRateThres)){
    return()
  }

  frequencies <- as.data.frame(flowRateData$frequencies)
  second_fraction <- flowRateData$info$second_fraction
  short_period <- quantile(frequencies$secbin, seq(0,1, length.out = 4))
  long_period <- quantile(frequencies$secbin, seq(0,1, length.out = 3*10 + 1))

  ## flow rate graph(frg)
  frg <- ggplot(frequencies, aes_string(x="secbin", y="tbCounts")) + geom_line(colour="red") +
    theme_bw() + theme(panel.grid.major = element_blank(),
                       panel.grid.minor = element_blank(),
                       text=element_text(size = 14))
  frg <- frg + geom_vline(xintercept=short_period, color="gray60") +
    geom_vline(xintercept=long_period, linetype = 2, color="gray60")

  frg <- frg + geom_hline(yintercept=c(lowerRateThres, upperRateThres), color="blue",
                          linetype = "longdash", size = 1.2, show.legend = TRUE)
  frg <- frg + geom_vline(xintercept=c(lowerTimeCut, UpperTimeCut), color="blue",
                          linetype = "longdash", size = 1.2, show.legend = TRUE)

  frg <- frg + labs(x= "Seconds", y= paste0("Number of cells per 1/", 1/second_fraction, " second"),
                    title= "Flow Rate Plot")

  return(frg)
}

flow_rate_check <- function(flowRateData, lowerRateThres, upperRateThres,
                            lowerTimeCut, UpperTimeCut) {

  frequencies <- as.data.frame(flowRateData$frequencies)
  cellBinID <- flowRateData$cellBinID

  goodcell_x <- which(frequencies$secbin < UpperTimeCut & frequencies$secbin > lowerTimeCut)
  goodcell_y <- which(frequencies$tbCounts < upperRateThres & frequencies$tbCounts > lowerRateThres)

  flowRateQC <- cellBinID$cellID[ cellBinID$binID %in% intersect(goodcell_x, goodcell_y) ]
  cat("flow rate high Q events: ", length(flowRateQC), "\n")
  return(flowRateQC)
}



flow_signal_bin <- function(x, channels = NULL, binSize=500, timeCh="Time",
                            timestep=0.1, TimeChCheck = NULL) {

  if (is.null(channels) || missing(channels) || is.na(channels)) {
    parms <- setdiff(colnames(x), timeCh)
  } else {
    if (!all(channels %in% colnames(x)))
      stop("Invalid channel(s)")
    parms <- channels
  }

  if (missing(binSize) || is.null(binSize) || is.na(binSize))
    binSize <- 500

  ### Retriving time and expression info
  exp <- exprs(x)
  if (!is.null(TimeChCheck)) {
    timex <- seq(from = 0, length.out = nrow(x), by = 0.1)
  }else{
    timex <- exp[, timeCh]
  }
  yy <- exp[, parms]  # channels data
  idx <- c(1:nrow(x))
  seconds <- timex * timestep
  lenSec <- length(seconds)  # num of time ticks
  uniSeconds <- unique(seconds)  # num of unique time tick
  nrBins <- floor(lenSec/binSize)  # num of bins

  if (length(uniSeconds) < nrBins || lenSec < binSize)
    stop("Improper bin size")

  cf <- c(rep(1:nrBins, each = binSize), rep(nrBins + 1, lenSec - nrBins * binSize))  # id bins
  stopifnot(length(cf) == lenSec)
  tmpx <- split(seconds, cf)
  xx2 <- sapply(tmpx, mean)       # mean of each time bin  (x axis)
  yy2 <- as.matrix(ddply(as.data.frame(yy), .(cf), colwise(median)))[, -1]

  return(list(exprsBin = cbind(timeSec = xx2, yy2), cellBinID = data.frame(cellID = idx, binID = cf),
              bins = length(unique(cf)), binSize = binSize))
}


flow_signal_plot <- function(flowSignalData, lowerBinThres, upperBinThres) {
  if(is.null(flowSignalData)){
    return()
  }
  exprsBin <- flowSignalData$exprsBin
  binID <- 1:nrow(exprsBin)
  teCh <- grep("Time|time|Event|event", colnames(exprsBin), value = T)
  parms <- setdiff(colnames(exprsBin), teCh)
  dataORIG <- exprsBin[, parms]     # first channel is time
  data <- as.data.frame(dataORIG)
  data$binID <- binID

  longdata <- melt(data, id.vars = "binID", variable.name = "marker", value.name = "value")
  FS_graph <- ggplot(longdata, aes(x = binID, y = value, col = marker), environment = environment()) +
    geom_line() + facet_grid(marker ~ ., scales = "free") +
    labs(x = "Bin ID", y = "Median Intensity value") + theme_bw() +
    theme(strip.text.y = element_text(angle = 0, hjust = 1), axis.text = element_text(size = 10),
          axis.title = element_text(size = 15), legend.position = "none") +
    scale_x_continuous(breaks= pretty_breaks(n = 10)) +
    scale_y_continuous(breaks= pretty_breaks(n = 3)) +
    geom_rect(aes(xmin = lowerBinThres, xmax = upperBinThres, ymin = -Inf, ymax = Inf), fill = "orange", linetype = 0, alpha = 0.005)

  return(FS_graph)
}


flow_signal_check <- function(flowSignalData, lowerBinThres, upperBinThres) {

  exprsBin <- flowSignalData$exprsBin
  cellBinID <- flowSignalData$cellBinID

  goodBins <- cellBinID$binID < upperBinThres & cellBinID$binID > lowerBinThres
  FlowSignalQC <- cellBinID$cellID[goodBins]

  cat("flow signal check: ", length(FlowSignalQC), "\n")
  return(FlowSignalQC)
}


flow_margin_check <- function(x,  margin_channels = NULL, side = "both") {

  if (is.null(margin_channels)) {
    teCh <- grep("Time|time|Event|event", colnames(x), value = T)
    parms <- setdiff(colnames(x), teCh)
  } else {
    if (!all(margin_channels %in% colnames(x)))
      stop("Invalid channel(s)")
    parms <- margin_channels
  }
  scatter_parms <- grep("FSC|SSC", parms, value = T)

  xx <- c(1:nrow(x))
  yy <- x@exprs[, parms]
  range <- range(x)
  lenx <- length(xx)

  ## lower check
  if (side == "lower" || side == "both") {

    out_neg_range <- apply(yy, 2, function(x) {
      neg <- which(x < 0)
      # Zscores <- (0.6745*(x[neg] + median(x[neg])))/mad(x[neg]) ## it
      # calculates the Zscore outneg <- neg[which(Zscores < -3.5)]
      min_value <- (-3.5 * mad(x[neg]) + (0.6745 * median(x[neg])))/0.6745  # -3.5 is the default threshold
      if (is.na(min_value)) {
        min(x) - 1
      } else {
        min_value
      }
    })
  }

  # n. bad cells for each channel
  if (side == "lower" || side == "both") {
    neg_bad_len <- sapply(parms, function(x) length(xx[yy[, x] <= out_neg_range[x]]))
  }
  if (side == "upper" || side == "both") {
    pos_bad_len <- sapply(parms, function(x) length(xx[yy[, x] >= range[2,
                                                                        x]]))
  }

  # badcellIDs
  if (side == "lower" || side == "both") {
    lowID <- do.call(c, lapply(parms, function(ch) {
      xx[yy[, ch] <= out_neg_range[ch]]
    }))
    if(length(scatter_parms) != 0){   ### check for values less than 0 in scatter parameters
      minSc <- apply(yy[,scatter_parms], 1, function(x){
        min(x)
      })
      low_scatter_ID <- which(minSc < 0)
      lowID <- unique(c(lowID, low_scatter_ID))
    }
  }
  if (side == "upper" || side == "both") {
    upID <- do.call(c, lapply(parms, function(ch) {
      xx[yy[, ch] >= range[2, ch]]
    }))
  }

  if (side == "lower") {
    summary_bad_cells <- data.frame(lower_range = c(neg_bad_len,
                                                    total_SUM = length(lowID), total_UNIQUE = length(unique(lowID))))
    bad_lowerIDs <- unique(lowID)
    bad_upperIDs <- NULL
    badCellIDs <- unique(lowID)
  } else if (side == "upper") {
    summary_bad_cells <- data.frame(upper_range = c(pos_bad_len,
                                                    total_SUM = length(upID), total_UNIQUE = length(unique(upID))))
    bad_lowerIDs <- NULL
    bad_upperIDs <- unique(upID)
    badCellIDs <- unique(upID)
  } else {
    summary_bad_cells <- data.frame(lower_range = c(neg_bad_len,
                                                    total_SUM = length(lowID), total_UNIQUE = length(unique(lowID))),
                                    upper_range = c(pos_bad_len,
                                                    total_SUM = length(upID), total_UNIQUE = length(unique(upID))))
    bad_lowerIDs <- unique(lowID)
    bad_upperIDs <- unique(upID)
    badCellIDs <- unique(c(lowID,upID))
  }

  goodCellIDs <- setdiff(xx, badCellIDs)

  cat("margin check:", length(goodCellIDs), "\n")

  return(list(goodCellIDs = goodCellIDs, bad_lowerIDs = bad_lowerIDs,
              bad_upperIDs = bad_upperIDs, events = lenx))
}







###  graph showing where the anomalies mostly happened
flow_margin_plot <- function(FlowMarginData, binSize = 500) {

  tot_events <- FlowMarginData$events
  bad_lowerIDs <- FlowMarginData$bad_lowerIDs
  bad_upperIDs <- FlowMarginData$bad_upperIDs

  if (missing(binSize) || is.null(binSize) || is.na(binSize))
    binSize <- 500
  nrBins <- floor(tot_events/binSize)

  cf <- c(rep(1:nrBins, each = binSize), rep(nrBins + 1, tot_events - nrBins * binSize))
  tmpx <- split(1:tot_events, cf)

  if(length(bad_lowerIDs) != 0 && length(bad_upperIDs) != 0){
    lowline <- sapply(tmpx, function(x){
      length(which(bad_lowerIDs %in% x))
    })
    upline <- sapply(tmpx, function(x){
      length(which(bad_upperIDs %in% x))
    })
    ymax <- max(lowline, upline)
    plot(lowline, type ="l", col = "blue", bty ="n",
         ylim = c(0, ymax), xlab = "segment ID",
         ylab = "Number of cells removed" )
    lines(upline, col = "red")
    legend("top", c("Negative Outliers", "Upper Margine Events"), lty = 1,bty = "n", cex = 0.7,
           col = c("blue", "red"))
  }else if( length(bad_lowerIDs) != 0 && length(bad_upperIDs) == 0){
    lowline <- sapply(tmpx, function(x){
      length(which(bad_lowerIDs %in% x))
    })
    plot(lowline, type ="l", col = "blue", bty ="n", xlab = "segment ID",
         ylab = "Number of cells removed" )
    legend("top", c("Negative Outliers"), lty = 1,bty = "n", cex = 0.7,
           col = "blue")
  }else if( length(bad_lowerIDs) == 0 && length(bad_upperIDs) != 0){
    upline <- sapply(tmpx, function(x){
      length(which(bad_upperIDs %in% x))
    })
    plot(upline, type ="l", col = "red", bty ="n", xlab = "segment ID",
         ylab = "Number of cells removed" )
    legend("top", c("Upper Margine Events"), lty = 1,bty = "n", cex = 0.7,
           col = "red")
  }
}


## create new flowFrame with the parameter indicating good and bad cells
addQC <- function(QCvector, sub_exprs, params, keyval){

  rs <- attr(sub_exprs, "ranges")
  rs <- c(rs, rs[1])
  sub_exprs <- cbind(sub_exprs, QCvector)
  attr(sub_exprs, "ranges") <- rs
  NN <- as.numeric(keyval["$PAR"]) + 1
  names(dimnames(sub_exprs)[[2]]) <- sprintf("$P%sN", 1:NN)
  pnr <- paste0("$P", NN, "R")
  pnb <- paste0("$P", NN, "B")
  pne <- paste0("$P", NN, "E")
  pnn <- paste0("$P", NN, "N")
  pns <- paste0("$P", NN, "S")
  flowCorePnRmax <- paste0("flowCore_$P", NN, "Rmax")
  flowCorePnRmin <- paste0("flowCore_$P", NN, "Rmin")
  o <- params@data
  o[length(o[,1]) + 1,] <- c("QC", "bad > 10,000", as.numeric(keyval$`$P1R`), 0, 20000)
  rownames(o)[length(o[,1])] <- paste("$P", NN, sep = "")

  outFCS <- new("flowFrame", exprs=sub_exprs, parameters=new("AnnotatedDataFrame",o), description=keyval)
  description(outFCS)[pnr] <- max(20000, description(outFCS)$`$P1R`)
  description(outFCS)[pnb] <- description(outFCS)$`$P1B`
  description(outFCS)[pne] <- "0,0"
  description(outFCS)[pnn] <- "QC"
  description(outFCS)[pns] <- "bad > 10,000"
  description(outFCS)$`$PAR` <- NN
  description(outFCS)[flowCorePnRmax] <- 20000
  description(outFCS)[flowCorePnRmin] <- 0
  outFCS
}


#############################################################  end of flowAI ###########################################################################
poipou/CytoSEE documentation built on May 13, 2019, 6:15 p.m.