R/peakSeparator.R


## This is the main function to perform backgating. 'data' is expected to be a
## flowSet, 'xy' is a character vector of length 2 giving the two dimensions of
## interest, and 'channels' are the additional channels used for backgating.
## 'fres' is an optional list of filterResults of the initial curv1Filter for
## each parameter in 'channels' and can be supplied in order to speed things up.
## The output of the function depends on the value of 'incidence'. If this is
## TRUE, it will be a list of incidence matrices, one for each sample in the
## set, where each matrix indicates whether a particular event was assigned to
## a given peak. If 'incidence' is FALSE, a hook function can be supplied which
## directly uses the subset of events under a peak (as a flowFrame in the first
## argument); the value of 'xy' is passed as the second argument. The hook
## function is expected to return a data frame, potentially of landmark
## coordinates and further annotation. It is called for each frame in the set
## and for each peak in each channel, and the respective results are
## concatenated using rbind, with the peak, channel and sample information
## added automatically. In this case the final output of backGating is the
## concatenated data frame.
backGating <- function(data, xy, channels=setdiff(colnames(data), c(xy, "time", "Time")),
                                                  fres, hookFunction="curvRegs", incidence=FALSE)
{
    if(missing(fres))
    {
        fres <- lapply(channels, function(x) filter(data, curv1Filter(x)))
        names(fres) <- channels
    }
    allres <- NULL
    subsets <- vector(mode="list", length=length(data))
    names(subsets) <- sampleNames(data)
    dataColl <- as(data, "flowFrame")
    for(i in seq_along(channels))
    {
        fresColl <- filter(dataColl, curv1Filter(channels[i]) %subset% sampleFilter(50000))
        np <- length(fresColl)-1
        lm <- landmarkMatrix(data, fres, channels[i], indices=TRUE,
                             peakNr=np)
        if(!all(lm==0))
        {
            for(j in seq_len(ncol(lm)))
            {
                for(k in sampleNames(data))
                {
                    cat("\r                                                ")
                    cat("\rProcessing channel", channels[i], "peak", j, "sample", k)
                    subset <- split(data[[k]], fres[[channels[i]]][[k]])[[lm[k,j]+1]]
                    if(!is.null(subset))
                    {
                        tfr <-  fres[[channels[i]]][[k]][[lm[k,j]+1]]
                        subsets[[k]] <- cbind(subsets[[k]], tfr@subSet)
                        colnames(subsets[[k]])[ncol(subsets[[k]])] <-
                            paste(channels[i], j, sep="_")
                        if(!incidence)
                        {
                            res <- do.call(hookFunction, args=list(subset, xy))
                            if(!is.null(res))
                                allres <- rbind(allres, cbind(res, sample=k, peak=j,
                                                              channel=channels[i]))
                        }
                    }
                }
            }
        }
    }
    cat("\n")
    return(if(incidence) subsets else allres)  
}
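
## A minimal usage sketch (hypothetical, not run at load): assumes a flowSet
## 'fs' from flowCore with the channel names shown.
## bg <- backGating(fs, xy=c("FSC-H", "SSC-H"), channels=c("FL1-H", "FL3-H"))
## head(bg)  ## hook output plus sample, peak and channel annotation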







## Compute high-density regions using a curv2Filter, try to combine
## neighbouring areas if they are within a certain threshold, and return
## the polygon vertices as well as their centroids.
curvRegs <- function(dat, p)
{
    r <- range(dat)[, p]
    c2 <- curv2Filter(p, bwFac=1.8)
    c2Res <- filter(dat, c2)
    pols <- attr(c2Res@subSet, "polygons")
    polM <- lapply(pols, function(x) sapply(x[2:3], c))
    if(length(polM)>1)
        polM <- pruneRegions(polM, max(apply(r, 2, diff))/5,
                             prod(apply(r, 2, diff))/250)
    res <- NULL
    for(i in seq_along(polM))
    {
        tmp <- polM[[i]]
        colnames(tmp) <- p
        cent <- apply(tmp, 2, mean)
        ntmp <- data.frame(x=cent[1], y=cent[2], population=i,
                           type="centroid")
        colnames(ntmp)[seq_along(p)] <- p
        res <- rbind(res,
                     rbind(data.frame(tmp, population=i,
                                      type="polygon"),
                           ntmp))
    }
    return(res)
}
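
## Usage sketch (hypothetical): for a single flowFrame 'fr', the result stacks
## the polygon vertices and one centroid row per merged region.
## regs <- curvRegs(fr, p=c("FSC-H", "SSC-H"))
## subset(regs, type == "centroid")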






## Some helpers below here...


## Find and score separators between peaks of a flowSet. The scoring
## is based on the overlap of normal approximations for the underlying
## data of neighbouring peaks.

peakSeparator <- function(data, parm, fres)
{
    if(missing(fres))
    {
        fres <- lapply(parm, function(x) filter(data, curv1Filter(x)))
        names(fres) <- parm
    }
    lm <- landmarkMatrix(data, fres, parm, indices=TRUE)
    ## Initially we only take peaks that are present in all samples
    ## lm <- lm[,!apply(lm, 2, function(x) any(is.na(x))), drop=FALSE]
    ##regions <- getRegions(data, parm, fres, lm)
    subs <- getSubsets(data, parm, fres, lm)
    props <- getProportions(data, parm, fres, lm)
    norm <- lapply(subs, normApprox)
    ol <- computeOverlap(norm)
    plotting(data, norm, parm, props, ol)
    ## return the overlap scores invisibly (one vector per sample)
    invisible(ol)
}
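
## Usage sketch (hypothetical): plots one density per sample of flowSet 'fs',
## overlaying the scaled normal approximations and the overlap scores.
## ol <- peakSeparator(fs, parm="FL1-H")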


## For each sample, split the events on the curv1Filter result and keep the
## populations under the peaks indexed in 'mat' (shifted by one to index into
## the split result).
getSubsets <- function(data, parm, fres, mat)
{
    res <- list()
    for(i in sampleNames(data))
        res[[i]] <- split(data[[i]][,parm], fres[[parm]][[i]])[mat[i,]+1]
    return(res)
}



## For each sample, look up the peak region boundaries reported by curvPeaks
## for the peaks indexed in 'mat'.
getRegions <- function(data, parm, fres, mat)
{
    res <- list()
    for(i in sampleNames(data))
        res[[i]] <- curvPeaks(fres[[parm]][[i]], exprs(data[[i]])[,parm])$regions[mat[i,],]
    return(res)
}


## For each sample, fetch the proportions of events in the selected peaks
## from the filter summary.
getProportions <- function(data, parm, fres, mat)
{
    res <- list()
    for(i in sampleNames(data))
        res[[i]] <- summary(fres[[parm]][[i]], toTable=TRUE)$p[mat[i,]+1]
    return(res)
}

                      
## Robust normal approximation for each peak subset: the median estimates mu
## and the mad estimates sd; entries for NULL subsets remain NA.
normApprox <- function(x)
{
    res <- matrix(ncol=2, nrow=length(x))
    colnames(res) <- c("mu", "sd")
    for(i in seq_along(x))
    {
        if(!is.null(x[[i]]))
           res[i,] <- c(median(exprs(x[[i]]), na.rm=TRUE), mad(exprs(x[[i]]), na.rm=TRUE))
        
    }
    return(res)
}
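
## The median/mad pair is a robust stand-in for the mean/sd of a normal fit;
## for clean Gaussian data both agree closely, e.g.:
## x <- rnorm(1e4, mean=2, sd=0.5)
## c(median(x), mad(x))  ## approximately c(2, 0.5)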



## Diagnostic plots: for each sample, draw the marginal density, overlay the
## scaled normal approximation of each peak, and annotate the overlap scores.
plotting <- function(data, norm, parm, props, ol)
{
    opar <- par(ask=TRUE)
    on.exit(par(opar))
    for(i in sampleNames(data))
    {
        plot(density(exprs(data[[i]][,parm])), main=parm)
        r <- range(exprs(data[[i]]))[,parm]
        for(j in 1:nrow(norm[[i]]))
        {
            sq <- seq(r[1], r[2], len=500)
            tt <- dnorm(sq, mean=norm[[i]][j,1], sd=norm[[i]][j,2])*props[[i]][j]
            lines(sq, tt, col=j+1)
            if(!is.na(ol[[i]][j]))
            {
                xl <- if(nrow(norm[[i]]) >= j+1) mean(c(norm[[i]][j,1], norm[[i]][j+1,1]))
                else norm[[i]][j,1]
                text(xl, par("usr")[4]-(par("usr")[4]/5), signif(ol[[i]][j],2))
            }
        }
    }
    
}



## Compute the overlap between the normal approximations of neighbouring
## peaks, evaluated on a common grid spanning 10 sd around all means.
computeOverlap <- function(norm)
{
    res <- list()
    for(n in names(norm))
    {
        nn <- norm[[n]]
        lr <- min(nn[,1] - 10*nn[,2], na.rm=TRUE)
        rr <- max(nn[,1] + 10*nn[,2], na.rm=TRUE)
        tmp <- NULL
        if(nrow(nn)>1)
        {
            for(j in 2:nrow(nn))
            {
                sq <- seq(lr, rr, len=500)
                d1 <- dnorm(sq, mean=nn[j-1,1], sd=nn[j-1,2])
                d2 <- dnorm(sq, mean=nn[j,1], sd=nn[j,2])
                ## score as the overlap coefficient: the numeric integral of
                ## the pointwise minimum of the two densities
                ## (0 = disjoint, 1 = identical)
                tmp <- c(tmp, sum(pmin(d1, d2)) * diff(sq[1:2]))
            }
        }
        res[[n]] <- tmp
    }
    return(res)
}
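
## For two unit-variance normals the overlap coefficient has the closed form
## 2 * pnorm(-abs(mu1 - mu2) / 2); e.g., for N(0,1) vs. N(2,1):
## 2 * pnorm(-1)  ## ~0.317, matching the numeric integral above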





## Compute the area of a polygon from its vertex matrix using the
## shoelace (surveyor's) formula.
area.in.polygon <- function(x)
{
    n <- nrow(x)
    D <- diff(x[c(1:n, 1),  ])
    A <- x[, 1] * D[, 2] - x[, 2] * D[, 1]
    abs(sum(A)/2)
}
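
## Worked example: the unit square has area 1.
## area.in.polygon(cbind(c(0, 1, 1, 0), c(0, 0, 1, 1)))  ## 1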


## A simple Hausdorff distance algorithm for two sets of polygon
## vertices. The proper solution would be to compute distances between
## edges; however, this is only defined for convex polygons (which
## could be enforced by first calling chull).
## Note that this makes sense only if the two dimensions are on a
## similar scale, e.g., FSC vs. SSC.
simpleHd <- function (A, B) 
{
    hd <- 0
    n1 <- nrow(A)
    n2 <- nrow(B)
    d1 <- c()
    d2 <- c()
    if (n1 * n2 == 0) 
        hd <- -1
    else {
        stopifnot(is.numeric(A))
        stopifnot(is.numeric(B))
        for (i in 1:n1) d1[i] <- min(sqrt((B[,1]-A[i,1])^2 + (B[,2]-A[i,2])^2))
        for (i in 1:n2) d2[i] <- min(sqrt((A[,1]-B[i,1])^2 + (A[,2]-B[i,2])^2))
        hd <- max(max(d1), max(d2))
    }
    hd
}
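
## Worked example: two unit squares offset by 3 along x are at Hausdorff
## distance 3 (each vertex is 3 away from its nearest counterpart).
## sq1 <- cbind(c(0, 1, 1, 0), c(0, 0, 1, 1))
## simpleHd(sq1, sq1 + cbind(rep(3, 4), 0))  ## 3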


## The pairwise distance matrix for a list of polygons, based on the
## Hausdorff distance metric.
hdMat <- function(pols)
{
    mat <- matrix(ncol=length(pols), nrow=length(pols))
    for(i in seq_along(pols))
        for(j in seq_along(pols))
            mat[i,j] <- simpleHd(pols[[i]], pols[[j]])
    return(mat)
}


## Collapse two polygons to their common convex hull.
combinePols <- function(p1, p2)
{
    pc <- rbind(p1, p2)
    pc[chull(pc),]
}
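
## E.g., two unit squares sharing an edge collapse to their 2x1 bounding
## rectangle (the convex hull of the pooled vertex set).
## p1 <- cbind(c(0, 1, 1, 0), c(0, 0, 1, 1))
## combinePols(p1, p1 + cbind(rep(1, 4), 0))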


## Take a list of polygons and collapse all areas that are closer than
## 'dt' in the Hausdorff distance metric. Also drop all polygons with
## areas smaller than 'st'.
pruneRegions <- function(pols, dt, st)
{
    ssel <- sapply(pols, area.in.polygon) > st
    if(any(ssel))
       pols <- pols[ssel]
    else
       return(pols)
    d <- hdMat(pols)
    sel <- which(d < dt, arr.ind=TRUE)
    ## keep off-diagonal pairs only; drop=FALSE so a single match
    ## stays a matrix and the while loop below keeps working
    sel <- sel[apply(sel, 1, diff) != 0, , drop=FALSE]
    while(length(sel))
    {
        pols[[sel[1,1]]] <- combinePols(pols[[sel[1,1]]],
                                        pols[[sel[1,2]]])
        pols <- pols[-sel[1,2]]
        d <- hdMat(pols)
        sel <- which(d < dt, arr.ind=TRUE)
        sel <- sel[apply(sel, 1, diff) != 0, , drop=FALSE]
    }
    return(pols)
}
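
## Usage sketch (hypothetical): for a list of polygon vertex matrices 'polys',
## merge regions closer than dt and drop regions with area below st.
## pruned <- pruneRegions(polys, dt=0.5, st=0.01)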