R/snprma-functions.R

Defines functions processCEL snprma2 fitAffySnpMixture56 snprma

Documented in snprma snprma2

snprma <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
                   eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
  if (missing(sns)) sns <- basename(filenames)
  if (missing(cdfName))
    cdfName <- read.celfile.header(filenames[1])$cdfName
  pkgname <- getCrlmmAnnotationName(cdfName)

  if(!require(pkgname, character.only=TRUE, quietly=!verbose)){
    suggCall <- paste("library(", pkgname,
                      ", lib.loc='/Altern/Lib/Loc')",
                      sep="")
    msg <- paste("If", pkgname,
                 "is installed on an alternative location,",
                 "please load it manually by using", suggCall)
    message(strwrap(msg))
    stop("Package ", pkgname, " could not be found.")
  }

  if(verbose) message("Loading annotations and mixture model parameters.")
  loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
  loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
  loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
  autosomeIndex <- getVarInEnv("autosomeIndex")
  pnsa <- getVarInEnv("pnsa")
  pnsb <- getVarInEnv("pnsb")
  fid <- getVarInEnv("fid")
  reference <- getVarInEnv("reference")
  aIndex <- getVarInEnv("aIndex")
  bIndex <- getVarInEnv("bIndex")
  SMEDIAN <- getVarInEnv("SMEDIAN")
  theKnots <- getVarInEnv("theKnots")
  gns <- getVarInEnv("gns")

  ##We will read each cel file, summarize, and run EM one by one
  ##We will save parameters of EM to use later
  mixtureParams <- matrix(0, 4, length(filenames))
  SNR <- vector("numeric", length(filenames))
  SKW <- vector("numeric", length(filenames))
##   mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames))
##   SNR <- initializeBigVector("crlmmSNR-", length(filenames), "numeric")
##   SKW <- initializeBigVector("crlmmSKW-", length(filenames), "numeric")

  ## This is the sample for the fitting of splines
  ## BC: I like better the idea of the user passing the seed,
  ##     because this might intefere with other analyses
  ##     (like what happened to GCRMA)
  set.seed(seed)
  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
  ##S will hold (A+B)/2 and M will hold A-B
  ##NOTE: We actually dont need to save S. Only for pics etc...
  ##f is the correction. we save to avoid recomputing
  A <- matrix(as.integer(0), length(pnsa), length(filenames))
  B <- matrix(as.integer(0), length(pnsb), length(filenames))

  if(verbose){
    message("Processing ", length(filenames), " files.")
    pb <- txtProgressBar(min=0, max=length(filenames), style=3)
  }

  ##for skewness. no need to do everything
  idx2 <- sample(length(fid), 10^5)

  ##We start looping throug cel files
  for(i in seq(along=filenames)){
    y <- as.matrix(read.celfile(filenames[i], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
    x <- log2(y[idx2])
    SKW[i] <- mean((x-mean(x))^3)/(sd(x)^3)
    rm(x)
    y <- normalize.quantiles.use.target(y, target=reference)
    A[, i] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
    B[, i] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
    ##Now to fit the EM
    if(fitMixture){
      S <- (log2(A[idx, i])+log2(B[idx, i]))/2 - SMEDIAN
      M <- log2(A[idx, i])-log2(B[idx, i])

      ##we need to test the choice of eps.. it is not the max diff between funcs
      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)

      mixtureParams[, i] <- tmp[["coef"]]
      SNR[i] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
    }
    if (verbose) setTxtProgressBar(pb, i)
  }
  if (verbose) close(pb)
  if (!fitMixture) SNR <- mixtureParams <- NA
  ## gns comes from preprocStuff.rda
  list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
}

fitAffySnpMixture56 <- function(S, M, knots, probs=rep(1/3, 3), eps=.01, maxit=10, verbose=FALSE){
  ##56 stands for 5 and 6 arrays but will also work for Illumina
  ##Note the unfortunate choice of numbering:
  ##1 is BB, 2 AB, and 3 AA. Opposite to everything else!
  ##this is legacy code I decided not to change.
  ## this why at the end we report -coefs: F1 is the negative f
  mus <- append(quantile(M, c(1, 5)/6, names=FALSE), 0, 1)
  sigmas <- rep(mad(c(M[M<mus[1]]-mus[1], M[M>mus[3]]-mus[3])), 3)
  sigmas[2] <- sigmas[2]/2

  weights <- apply(cbind(mus, sigmas), 1, function(p) dnorm(M, p[1], p[2]))
  previousF1 <- -Inf
  change <- eps+1
  it <- 0

  if(verbose) message("Max change must be under ", eps, ".")
  matS <- stupidSplineBasis(S, knots)
  while (change > eps & it < maxit){
    it <- it+1
    ## E
    z <- sweep(weights, 2, probs, "*")
    LogLik <- rowSums(z)
    z <- sweep(z, 1, LogLik, "/")
    probs <- colMeans(z)

    ## M
    fit1 <- crossprod(chol2inv(chol(crossprod(sweep(matS, 1, z[, 1], FUN="*"), matS))), crossprod(matS, z[, 1]*M))

    fit2 <- sum(z[, 2]*M)/sum(z[, 2])
    F1 <- matS%*%fit1
    sigmas[c(1, 3)] <- sqrt(sum(z[, 1]*(M-F1)^2)/sum(z[, 1]))
    sigmas[2] <- sqrt(sum(z[, 2]*(M-fit2)^2)/sum(z[, 2]))

    weights[, 1] <- dnorm(M, F1, sigmas[1])
    weights[, 2] <- dnorm(M, fit2, sigmas[2])
    weights[, 3] <- dnorm(M, -F1, sigmas[3])

    change <- max(abs(F1-previousF1))
    previousF1 <- F1
    if(verbose) message("Iter ", it, ": ", change, ".")
  }
  medF1 <- median(-F1)
 return(list(coef= -fit1, medF1=medF1, sigma1=sigmas[1], sigma2=sigmas[2]))
}

snprma2 <- function(filenames, mixtureSampleSize=10^5, fitMixture=FALSE,
                    eps=0.1, verbose=TRUE, seed=1, cdfName, sns){
  if (missing(sns)) sns <- basename(filenames)
  if (missing(cdfName))
    cdfName <- read.celfile.header(filenames[1])[["cdfName"]]
  pkgname <- getCrlmmAnnotationName(cdfName)
  stopifnot(require(pkgname, character.only=TRUE, quietly=!verbose))

  if(verbose) message("Loading annotations and mixture model parameters.")
  obj <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
  pnsa <- getVarInEnv("pnsa")
  pnsb <- getVarInEnv("pnsb")
  gns <- getVarInEnv("gns")
  rm(list=obj, envir=.crlmmPkgEnv)
  rm(obj)

  ##We will read each cel file, summarize, and run EM one by one
  ##We will save parameters of EM to use later
  if(verbose) message("Initializing objects.")
  mixtureParams <- initializeBigMatrix("crlmmMixt-", 4, length(filenames), "double")
  SNR <- initializeBigVector("crlmmSNR-", length(filenames), "double")
  SKW <- initializeBigVector("crlmmSKW-", length(filenames), "double")

  ## This is the sample for the fitting of splines
  ## BC: I like better the idea of the user passing the seed,
  ##     because this might intefere with other analyses
  ##     (like what happened to GCRMA)
  ##S will hold (A+B)/2 and M will hold A-B
  ##NOTE: We actually dont need to save S. Only for pics etc...
  ##f is the correction. we save to avoid recomputing
  A <- initializeBigMatrix("crlmmA-", length(pnsa), length(filenames), "integer")
  B <- initializeBigMatrix("crlmmB-", length(pnsb), length(filenames), "integer")

  sampleBatches <- splitIndicesByNode(seq(along=filenames))

  if(verbose) message("Processing ", length(filenames), " files.")

  ocLapply(sampleBatches, processCEL, filenames=filenames,
           fitMixture=fitMixture, A=A, B=B, SKW=SKW, SNR=SNR,
           mixtureParams=mixtureParams, eps=eps, seed=seed,
           mixtureSampleSize=mixtureSampleSize, pkgname=pkgname,
           neededPkgs=c("crlmm", pkgname))
  close(mixtureParams)
  close(SNR)
  close(SKW)
  close(A)
  close(B)

  list(A=A, B=B, sns=sns, gns=gns, SNR=SNR, SKW=SKW, mixtureParams=mixtureParams, cdfName=cdfName)
}


processCEL <- function(i, filenames, fitMixture, A, B, SKW, SNR,
                       mixtureParams, eps, seed, mixtureSampleSize,
                       pkgname){
  obj1 <- loader("preprocStuff.rda", .crlmmPkgEnv, pkgname)
  obj2 <- loader("genotypeStuff.rda", .crlmmPkgEnv, pkgname)
  obj3 <- loader("mixtureStuff.rda", .crlmmPkgEnv, pkgname)
  autosomeIndex <- getVarInEnv("autosomeIndex")
  pnsa <- getVarInEnv("pnsa")
  pnsb <- getVarInEnv("pnsb")
  fid <- getVarInEnv("fid")
  reference <- getVarInEnv("reference")
  aIndex <- getVarInEnv("aIndex")
  bIndex <- getVarInEnv("bIndex")
  SMEDIAN <- getVarInEnv("SMEDIAN")
  theKnots <- getVarInEnv("theKnots")
  gns <- getVarInEnv("gns")
  rm(list=c(obj1, obj2, obj3), envir=.crlmmPkgEnv)
  rm(obj1, obj2, obj3)

  ## for mixture
  set.seed(seed)
  idx <- sort(sample(autosomeIndex, mixtureSampleSize))
  ##for skewness. no need to do everything
  idx2 <- sample(length(fid), 10^5)

  open(A)
  open(B)
  open(SKW)
  open(mixtureParams)
  open(SNR)

  for (k in i){
    y <- as.matrix(read.celfile(filenames[k], intensity.means.only=TRUE)[["INTENSITY"]][["MEAN"]][fid])
    x <- log2(y[idx2])
    SKW[k] <- mean((x-mean(x))^3)/(sd(x)^3)
    rm(x)
    y <- normalize.quantiles.use.target(y, target=reference)
    A[, k] <- intMedianSummaries(y[aIndex, 1, drop=FALSE], pnsa)
    B[, k] <- intMedianSummaries(y[bIndex, 1, drop=FALSE], pnsb)
    rm(y)

    if(fitMixture){
      S <- (log2(A[idx,k])+log2(B[idx, k]))/2 - SMEDIAN
      M <- log2(A[idx, k])-log2(B[idx, k])
      tmp <- fitAffySnpMixture56(S, M, theKnots, eps=eps)
      rm(S, M)
      mixtureParams[, k] <- tmp[["coef"]]
      SNR[k] <- tmp[["medF1"]]^2/(tmp[["sigma1"]]^2+tmp[["sigma2"]]^2)
      rm(tmp)
    } else {
      mixtureParams[, k] <- NA
      SNR[k] <- NA
    }
  }
  close(A)
  close(B)
  close(SKW)
  close(mixtureParams)
  close(SNR)
  rm(list=ls())
  gc(verbose=FALSE)
  TRUE
}

Try the crlmm package in your browser

Any scripts or data that you put into this service are public.

crlmm documentation built on Nov. 8, 2020, 4:55 p.m.