tests/normalizeFragmentLength-ex2.R

library("aroma.light")

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Example 2: Two-enzyme fragment-length normalization of 6 arrays
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
set.seed(0xbeef)

# Number samples
I <- 5

# Number of loci
J <- 3000

# Fragment lengths (two enzymes)
fl <- matrix(0, nrow=J, ncol=2)
fl[,1] <- seq(from=100, to=1000, length.out=J)
fl[,2] <- seq(from=1000, to=100, length.out=J)

# Let 1/2 of the units be on both enzymes
fl[seq(from=1, to=J, by=4),1] <- NA_real_
fl[seq(from=2, to=J, by=4),2] <- NA_real_

# Let some have unknown fragment lengths
hasUnknownFL <- seq(from=1, to=J, by=15)
fl[hasUnknownFL,] <- NA_real_

# Sty/Nsp mixing proportions:
rho <- rep(1, I)
rho[1] <- 1/3;  # Less Sty in 1st sample
rho[3] <- 3/2;  # More Sty in 3rd sample


# Simulate data
z <- array(0, dim=c(J,2,I))
maxLog2Theta <- 12
for (ii in 1:I) {
  # Common effect for both enzymes
  mu <- function(fl) {
    k <- runif(n=1, min=3, max=5)
    mu <- rep(maxLog2Theta, length(fl))
    ok <- is.finite(fl)
    mu[ok] <- mu[ok] - fl[ok]^{1/k}
    mu
  }

  # Calculate the effect for each data point
  for (ee in 1:2) {
    z[,ee,ii] <- mu(fl[,ee])
  }

  # Update the Sty/Nsp mixing proportions
  ee <- 2
  z[,ee,ii] <- rho[ii]*z[,ee,ii]

  # Add random errors
  for (ee in 1:2) {
    eps <- rnorm(J, mean=0, sd=1/sqrt(2))
    z[,ee,ii] <- z[,ee,ii] + eps
  }
}


hasFl <- is.finite(fl)

unitSets <- list(
  nsp  = which( hasFl[,1] & !hasFl[,2]),
  sty  = which(!hasFl[,1] &  hasFl[,2]),
  both = which( hasFl[,1] &  hasFl[,2]),
  none = which(!hasFl[,1] & !hasFl[,2])
)

# The observed data is a mix of two enzymes
theta <- matrix(NA_real_, nrow=J, ncol=I)

# Single-enzyme units
for (ee in 1:2) {
  uu <- unitSets[[ee]]
  theta[uu,] <- 2^z[uu,ee,]
}

# Both-enzyme units (sum on intensity scale)
uu <- unitSets$both
theta[uu,] <- (2^z[uu,1,]+2^z[uu,2,])/2

# Missing units (sample from the others)
uu <- unitSets$none
theta[uu,] <- apply(theta, MARGIN=2, sample, size=length(uu))

# Calculate target array
thetaT <- rowMeans(theta, na.rm=TRUE)
targetFcns <- list()
for (ee in 1:2) {
  uu <- unitSets[[ee]]
  fit <- lowess(fl[uu,ee], log2(thetaT[uu]))
  class(fit) <- "lowess"
  targetFcns[[ee]] <- function(fl, ...) {
    predict(fit, newdata=fl)
  }
}


# Fit model only to a subset of the data
subsetToFit <- setdiff(1:J, seq(from=1, to=J, by=10))

# Normalize data (to a target baseline)
thetaN <- matrix(NA_real_, nrow=J, ncol=I)
fits <- vector("list", I)
for (ii in 1:I) {
  lthetaNi <- normalizeFragmentLength(log2(theta[,ii]), targetFcns=targetFcns,
                     fragmentLengths=fl, onMissing="median",
                     subsetToFit=subsetToFit, .returnFit=TRUE)
  fits[[ii]] <- attr(lthetaNi, "modelFit")
  thetaN[,ii] <- 2^lthetaNi
}


# Plot raw data
xlim <- c(0, max(fl, na.rm=TRUE))
ylim <- c(0, max(log2(theta), na.rm=TRUE))
Mlim <- c(-1,1)*4
xlab <- "Fragment length"
ylab <- expression(log2(theta))
Mlab <- expression(M==log[2](theta/theta[R]))

layout(matrix(1:(3*I), ncol=I, byrow=TRUE))
for (ii in 1:I) {
  plot(NA, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main="raw")

  # Single-enzyme units
  for (ee in 1:2) {
    # The raw data
    uu <- unitSets[[ee]]
    points(fl[uu,ee], log2(theta[uu,ii]), col=ee+1)
  }

  # Both-enzyme units (use fragment-length for enzyme #1)
  uu <- unitSets$both
  points(fl[uu,1], log2(theta[uu,ii]), col=3+1)

  for (ee in 1:2) {
    # The true effects
    uu <- unitSets[[ee]]
    lines(lowess(fl[uu,ee], log2(theta[uu,ii])), col="black", lwd=4, lty=3)

    # The estimated effects
    fit <- fits[[ii]][[ee]]$fit
    lines(fit, col="orange", lwd=3)

    muT <- targetFcns[[ee]](fl[uu,ee])
    lines(fl[uu,ee], muT, col="cyan", lwd=1)
  }
}

# Calculate log-ratios
thetaR <- rowMeans(thetaN, na.rm=TRUE)
M <- log2(thetaN/thetaR)

# Plot normalized data
for (ii in 1:I) {
  plot(NA, xlim=xlim, ylim=Mlim, xlab=xlab, ylab=Mlab, main="normalized")
  # Single-enzyme units
  for (ee in 1:2) {
    # The normalized data
    uu <- unitSets[[ee]]
    points(fl[uu,ee], M[uu,ii], col=ee+1)
  }
  # Both-enzyme units (use fragment-length for enzyme #1)
  uu <- unitSets$both
  points(fl[uu,1], M[uu,ii], col=3+1)
}

ylim <- c(0,1.5)
for (ii in 1:I) {
  data <- list()
  for (ee in 1:2) {
    # The normalized data
    uu <- unitSets[[ee]]
    data[[ee]] <- M[uu,ii]
  }
  uu <- unitSets$both
  if (length(uu) > 0)
    data[[3]] <- M[uu,ii]

  uu <- unitSets$none
  if (length(uu) > 0)
    data[[4]] <- M[uu,ii]

  cols <- seq_along(data)+1
  plotDensity(data, col=cols, xlim=Mlim, xlab=Mlab, main="normalized")

  abline(v=0, lty=2)
}
HenrikBengtsson/aroma.light documentation built on July 3, 2023, 1:57 a.m.