R/ecc.gdsn.R

Defines functions estimateCellCounts.gds impose getquantiles normalizeQuantiles2

Documented in estimateCellCounts.gds getquantiles impose normalizeQuantiles2

normalizeQuantiles2 <- function(A, ties=TRUE) {
    # Abridged function of limma::normalizeQuantiles, cuts function
    # before sort - will be removed when EPIC reference set is introduced.
	n <- dim(A)
	if(is.null(n)) return(A)
	if(n[2]==1) return(A)
	O <- S <- array(,n)
	nobs <- rep(n[1],n[2])
	i <- (0:(n[1]-1))/(n[1]-1)
	for (j in 1:n[2]) {
		Si <- sort(A[,j], method="quick", index.return=TRUE)
		nobsj <- length(Si$x)
		if(nobsj < n[1]) {
			nobs[j] <- nobsj
			isna <- is.na(A[,j])
			S[,j] <- approx((0:(nobsj-1))/(nobsj-1), Si$x, i, ties="ordered")$y
			O[!isna,j] <- ((1:n[1])[!isna])[Si$ix]
		} else {
			S[,j] <- Si$x
			O[,j] <- Si$ix
		}
	}
m <- rowMeans(S)
output <- list(m, i)
return(output)
}

# Get quantiles of (pre/un)normalized node.
getquantiles <- function(gds, node, perc, onetwo, rank = FALSE, new.node = NULL){
    x <- index.gdsn(gds, node)
    ranked <- get.attr.gdsn(x)[['ranked']]
    if(!is.null(ranked)){
        out <- list(quantiles = get.attr.gdsn(x)[['quantiles']],
                    inter = get.attr.gdsn(x)[['inter']],
                    onetwo = get.attr.gdsn(x)[['onetwo']])
        return(out)
    } else {
       out <- getquantilesandranks(gds = gds, node = node, onetwo = onetwo, perc = perc, rank.node = NULL)
       return(out)
    }
}

impose <- function(matrix, quan){
    ot <- quan[['onetwo']]
    quantiles <- quan[['quantiles']]
    names(ot) <- quan[['rn']]
    inter <- quan[['inter']]
    blank <- matrix(NA, length(ot), ncol(matrix))
    rownames(blank) <- names(ot)
    blank[rownames(matrix), ] <- matrix
    for(z in 1:ncol(matrix)){
        isna <- is.na(blank[,z])
        r <- rep(0, length(ot))
        r[ot=='I'] <- rank(blank[ot=='I', z])
        r[ot=='II'] <- rank(blank[ot=='II', z])
        blank[ot == 'I' & (!isna), z] <-  approx(inter[ot == 'I'],
            quantiles[ot == 'I'],
            (r[ot=='I'&(!isna)] - 1) /(sum(ot == 'I')-1), ties = "ordered")$y

        blank[ot == 'II' & (!isna),z] <- approx(inter[ot == 'II'],
            quantiles[ot == 'II'],
            (r[ot=='II'&(!isna)] - 1)/(sum(ot == 'II')-1), ties = "ordered")$y
    }
    b2 <- na.omit(blank)
    b2<- b2[rownames(matrix),]
    return(b2)
}

# EstimateCellCounts.gds
estimateCellCounts.gds <- function(
    gds,
    gdPlatform = c("450k", "EPIC", "27k"),
    mn = NULL,
    un = NULL,
    bn = NULL,
    perc = 0.25,
    compositeCellType = "Blood",
#    processMethod = "auto",
    probeSelect = "auto",
    cellTypes = c("CD8T","CD4T","NK","Bcell","Mono","Gran"),
    referencePlatform = c("IlluminaHumanMethylation450k",
        "IlluminaHumanMethylationEPIC",
        "IlluminaHumanMethylation27k"),
    returnAll = FALSE,
    meanPlot = FALSE,
    verbose = TRUE,
    ...) {
    # My shameless /scopy/s implmentation of minfi::estimateCellCounts
    # For those who do not want use minfi::read.metharray
    referencePlatform <- match.arg(referencePlatform)
    rgPlatform <- match.arg(gdPlatform)
    if(rgPlatform == 'EPIC') rgPlatform <- '450k'
    # No method in bigmelon to derive annotation from object, therefore specify.
    # Sanity Checking from minfi...
    if((compositeCellType == "CordBlood") && (!"nRBC" %in% cellTypes)){
        message("[estimateCellCounts] Consider including 'nRBC' in argument 'cellTypes' for cord blood estimation.\n")
    }
    referencePkg <- sprintf("FlowSorted.%s.%s", compositeCellType, rgPlatform)
    if(!require(referencePkg, character.only = TRUE)){
        stop(sprintf("Could not find reference data package for compositeCellType '%s' and referencePlatform '%s' (inferred package name is '%s')",
                     compositeCellType, rgPlatform, referencePkg))
    }
    data(list = referencePkg)
    referenceRGset <- get(referencePkg)
    if(! "CellType" %in% names(colData(referenceRGset)))
        stop(sprintf("the reference sorted dataset (in this case '%s') needs to have a phenoData column called 'CellType'"),
             names(referencePkg))
    if(sum(colnames(gds) %in% colnames(referenceRGset)) > 0)
        stop("the sample/column names in the user set must not be in the reference data ")
    if(!all(cellTypes %in% referenceRGset$CellType))
        stop(sprintf("all elements of argument 'cellTypes' needs to be part of the reference phenoData columns 'CellType' (containg the following elements: '%s')",
                     paste(unique(referenceRGset$cellType), collapse = "', '")))
    if(length(unique(cellTypes)) < 2)
        stop("At least 2 cell types must be provided.")
    # Here is where things get different.
    # First assumption: Data is prenormalized - we will assume dasen is used.
    # Bigmelon has a wrapper that enables the quantification of quantiles for both M and U without having to rerank and re-sort data.
    # Therefore, bigmelon will check if quantiles have been computed already.
    if(is.null(mn) & !'mnsrank'%in% ls.gdsn(gds)) mn <- 'methylated'
    if(is.null(mn) & 'mnsrank' %in% ls.gdsn(gds)) mn <- 'mnsrank'
    if(is.null(un) & !'unsrank'%in% ls.gdsn(gds)) un <- 'unmethylated'
    if(is.null(un) & 'unsrank' %in% ls.gdsn(gds)) un <- 'unsrank'
    if(is.null(bn)) bn <- 'betas'
    referencePd <- colData(referenceRGset)
    referenceMset <- preprocessRaw(referenceRGset)
    if(gdPlatform == 'EPIC'){
    message('No Reference Set of EPIC, converting array to 450k...')
    message('This method is NOT memory efficient!')
    M <- gds[rownames(referenceMset), , node = mn]
    U <- gds[rownames(referenceMset), , node = un]
    rownames(M) <- rownames(U) <- rownames(referenceMset)
    ot <- getProbeType(referenceMset)
    sMI <- normalizeQuantiles2(M[ot=='I',])
    sMII <- normalizeQuantiles2(M[ot=='II',])
    sUI <- normalizeQuantiles2(U[ot=='I',])
    sUII <- normalizeQuantiles2(U[ot=='II',])
    mquan <- list(quantiles = rep(0, nrow(M)),
                    inter = rep(0, nrow(M)),
                    onetwo = ot
                )
    mquan[['quantiles']][ot == 'I'] <- sMI[[1]]
    mquan[['inter']][ot == 'I'] <- sMI[[2]]
    mquan[['quantiles']][ot == 'II'] <- sMII[[1]]
    mquan[['inter']][ot == 'II'] <- sMII[[2]]
    uquan <- list(quantiles = rep(0, nrow(M)),
                    inter = rep(0, nrow(M)),
                    onetwo = ot)
    uquan[['quantiles']][ot == 'I'] <- sUI[[1]]
    uquan[['inter']][ot == 'I'] <- sUI[[2]]
    uquan[['quantiles']][ot == 'II'] <- sUII[[1]]
    uquan[['inter']][ot == 'II'] <- sUII[[2]]
    mquan[['rn']] <- uquan[['rn']] <- rownames(M)

    } else {
    ot <- fot(gds)
    mquan <- getquantiles(gds = gds, node = mn, onetwo = ot, perc = perc)
    # rownames not working(?)
    mquan[['rn']] <- read.gdsn(index.gdsn(gds,read.gdsn(index.gdsn(gds, "paths"))[1]))
    uquan <- getquantiles(gds = gds, node = un, onetwo = ot, perc = perc)
    # rownames not working(?)
    uquan[['rn']] <- read.gdsn(index.gdsn(gds,read.gdsn(index.gdsn(gds, "paths"))[1]))
    # We can skip combining data-sets.
    # Instead preprocessRaw reference to replace data with quantiles.
    }
    nmet <- impose(getMeth(referenceMset), mquan)
    nume <- impose(getUnmeth(referenceMset), uquan)
    rm(referenceRGset)

    # Everything else continues as normal.
    referenceMset <- minfi::MethylSet(Meth=na.omit(nmet), Unmeth=na.omit(nume), colData=referencePd, annotation(referenceMset))

    if(verbose) message("[estimateCellCounts] Picking probes for composition estimation.\n")
    compData <- minfi:::pickCompProbes(referenceMset, cellTypes = cellTypes, compositeCellType = compositeCellType, probeSelect = probeSelect)
    coefs <- compData$coefEsts
    rm(referenceMset)

    if(verbose) message("[estimateCellCounts] Estimating composition.\n")
    coefdat <- gds[rownames(coefs),,node=bn]
    rownames(coefdat) <- rownames(coefs)
    counts <- minfi:::projectCellType(coefdat, coefs)
    # counts <- minfi:::projectCellType(gds[rownames(coefs), , node = bn ], coefs)
    if (meanPlot) {
        smeans <- compData$sampleMeans
        smeans <- smeans[order(names(smeans))]
        sampleMeans <- c(colMeans(gds[rownames(coefs),, node = bn]), smeans)
        sampleColors <- c(rep(1, ncol(coefdat)), 1 + as.numeric(factor(names(smeans))))
        plot(sampleMeans, pch = 21, bg = sampleColors)
        legend("bottomleft", c("blood", levels(factor(names(smeans)))),
               col = 1:7, pch = 15)
    }
    if(returnAll) {
        list(counts = counts, compTable = compData$compTable)
    } else {
        counts
    }
}
Bioconductor-mirror/bigmelon documentation built on June 1, 2017, 4:26 a.m.