R/pcfPlain.r

Defines functions pcfPlain

Documented in pcfPlain

####################################################################
## Author: Gro Nilsen, Knut Liestøl and Ole Christian Lingjærde.
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the copynumber package
## Reference: Nilsen and Liestøl et al. (2012), BMC Genomics
####################################################################


## Required by:


## Requires:
### findNN
### getMad
### pcf(not the main function, but the rest of the help functions found in the same document)
### handleMissing
### pullOutContent


## Main function for pcf-analysis to be called by the user

pcfPlain <- function(pos.data, kmin = 5, gamma = 40, normalize = TRUE, fast = TRUE, digits = 4, return.est = FALSE, verbose = TRUE) {
  # Input could come from winsorize and thus be a list; check and possibly retrieve data frame wins.data
  pos.data <- pullOutContent(pos.data, what = "wins.data")

  # Make sure all data columns are numeric:
  if (any(!sapply(pos.data, is.numeric))) {
    stop("All input in pos.data must be numeric", call. = FALSE)
  }

  # Check data input:
  stopifnot(ncol(pos.data) >= 2) # something is missing in input data

  # Extract information from pos.data:
  position <- pos.data[, 1]
  cn.data <- as.matrix(pos.data[, -1])
  nSample <- ncol(pos.data) - 1
  sampleid <- colnames(pos.data)[-1]
  nProbe <- length(position)


  # save user's gamma
  gamma0 <- gamma

  sd <- rep(1, nSample) # sd is only used if normalize=TRUE, and then these values are replaced by MAD-sd
  # If number of probes in entire data set is less than 100K, the MAD sd-estimate is calculated using all obs for every sample
  # Only required if normalize=T
  if (nProbe < 100000 && normalize) {
    # calculate MAD-sd for each sample:
    for (j in 1:nSample) {
      sample.data <- pos.data[, j + 1]
      sd[j] <- getMad(sample.data[!is.na(sample.data)], k = 25) # Take out missing values before calculating mad
    }
  } # endif


  # Initialize
  pcf.names <- c("pos", sampleid)
  seg.names <- c("sampleID", "start.pos", "end.pos", "n.probes", "mean")

  segments <- data.frame(matrix(nrow = 0, ncol = 5))
  colnames(segments) <- seg.names
  if (return.est) {
    pcf.est <- matrix(nrow = 0, ncol = nSample)
  }


  # Run PCF separately for each sample:
  for (i in 1:nSample) {
    if (return.est) {
      # Initialize:
      yhat <- rep(NA, length(nProbe))
    }

    sample.data <- cn.data[, i]

    # Remove probes with missing obs; Only run pcf on non-missing values
    obs <- !is.na(sample.data)
    obs.data <- sample.data[obs]


    if (length(obs.data) > 0) { ## Make sure there are observations for this sample! If not, estimates are left NA as well

      # If number of probes in entire data set is >= 100K, the MAD sd-estimate is calculated using obs in this arm for this sample.
      # Only required if normalize=T
      if (nProbe >= 100000 && normalize) {
        sd[i] <- getMad(obs.data, k = 25)
      }

      # Scale gamma by variance if normalize is TRUE
      use.gamma <- gamma0
      if (normalize) {
        use.gamma <- gamma0 * (sd[i])^2
      }

      # Must check that sd!=0 and sd!!=NA -> use.gamma=0/NA. If not, simply calculate mean of observations
      if (use.gamma == 0 || is.na(use.gamma)) {
        if (return.est) {
          res <- list(Lengde = length(obs.data), sta = 1, mean = mean(obs.data), nIntervals = 1, yhat = rep(mean(obs.data)))
        } else {
          res <- list(Lengde = length(obs.data), sta = 1, mean = mean(obs.data), nIntervals = 1)
        }
      } else {
        # Compute piecewise constant fit
        # run fast approximate PCF if fast=TRUE and number of probes>400, or exact PCF otherwise
        if (!fast || length(obs.data) < 400) {
          # Exact PCF:
          res <- exactPcf(y = obs.data, kmin = kmin, gamma = use.gamma, yest = return.est)
        } else {
          # Run fast PCF:
          res <- selectFastPcf(x = obs.data, kmin = kmin, gamma = use.gamma, yest = return.est)
        } # endif
      } # endif

      # Retrieve segment info from results:
      seg.start <- res$sta
      seg.stop <- c(seg.start[-1] - 1, length(obs.data))
      seg.npos <- res$Lengde
      seg.mean <- res$mean
      nSeg <- res$nIntervals
      if (return.est) {
        yhat[obs] <- res$yhat
      }
      # Find genomic positions for start and stop of each segment:
      pos.start <- position[seg.start]
      pos.stop <- position[seg.stop]

      # Handle missing values:
      if (any(!obs)) {
        # first find nearest non-missing neighbour for missing probes:
        nn <- findNN(pos = position, obs = obs)

        # Include probes with missing values in segments where their nearest neighbour probes are located
        new.res <- handleMissing(nn = nn, pos = position, obs = obs, pos.start = pos.start, pos.stop = pos.stop, seg.npos = seg.npos)
        pos.start <- new.res$pos.start
        pos.stop <- new.res$pos.stop
        seg.npos <- new.res$seg.npos

        if (return.est) {
          yhat[!obs] <- yhat[nn]
        }
      }
    } else {
      warning(paste("pcf is not run for sample ", i, " because all observations are missing. NA is returned.", sep = ""), immediate. = TRUE, call. = FALSE)
      seg.start <- 1
      seg.stop <- nProbe
      pos.start <- position[seg.start]
      pos.stop <- position[seg.stop]
      nSeg <- 1
      seg.mean <- NA
      seg.npos <- nProbe
    }


    # Round:
    seg.mean <- round(seg.mean, digits = digits)

    # Add results for this sample to results for other samples in data frame:
    seg <- data.frame(rep(sampleid[i], nSeg), pos.start, pos.stop, seg.npos, seg.mean, stringsAsFactors = FALSE)
    colnames(seg) <- seg.names
    segments <- rbind(segments, seg)

    if (return.est) {
      # Rounding:
      yhat <- round(yhat, digits = digits)
      pcf.est <- cbind(pcf.est, yhat)
    }
  } # endfor



  if (verbose) {
    cat(paste("pcf finished for sample ", i, sep = ""), "\n")
  }


  if (return.est) {
    pcf.est <- data.frame(position, pcf.est, stringsAsFactors = FALSE)
    colnames(pcf.est) <- pcf.names
    return(list(estimates = pcf.est, segments = segments))
  } else {
    return(segments)
  }
} # endfunction
igordot/copynumber documentation built on Feb. 23, 2025, 1:47 a.m.