R/multipcf.r

Defines functions sawMarkM expandMulti findMarksMulti markMultiPotts multiPCFcompact compactMulti runMultiPcfSubset runFastMultiPCF selectFastMultiPcf doMultiPCF multipcf

Documented in multipcf

####################################################################
## Author: Gro Nilsen, Knut Liest?l and Ole Christian Lingj?rde.
## Maintainer: Gro Nilsen <gronilse@ifi.uio.no>
## License: Artistic 2.0
## Part of the copynumber package
## Reference: Nilsen and Liest?l et al. (2012), BMC Genomics
####################################################################

## Required by:
### none


## Requires:
### getMad
### getArms
### numericArms
### numericChrom
### pullOutContent

## Main function for multipcf-analysis to be called by the user

multipcf <- function(data, pos.unit = "bp", arms = NULL, Y = NULL, gamma = 40, normalize = TRUE, w = 1, fast = TRUE, assembly = "hg19", digits = 4, return.est = FALSE, save.res = FALSE, file.names = NULL, verbose = TRUE) {
  # Check pos.unit input:
  if (!pos.unit %in% c("bp", "kbp", "mbp")) {
    stop("'pos.unit' must be one of bp, kbp and mbp", call. = FALSE)
  }

  # Check assembly input:
  if (!assembly %in% c("hg19", "hg18", "hg17", "hg16", "mm7", "mm8", "mm9", "hg38", "mm10")) {
    stop("assembly must be one of hg19, hg18, hg17 or hg16", call. = FALSE)
  }

  # Is data a file:
  isfile.data <- class(data) == "character"

  # Check data input:
  if (!isfile.data) {
    # Input could come from winsorize and thus be a list; check and possibly retrieve data frame wins.data
    data <- pullOutContent(data, what = "wins.data")
    stopifnot(ncol(data) >= 4) # something is missing in input data
    # Extract information from data:
    chrom <- data[, 1]
    position <- data[, 2]
    nSample <- ncol(data) - 2
    sampleid <- colnames(data)[-c(1:2)]
  } else {
    # data is a datafile which contains data
    f <- file(data, "r") # open file connection
    head <- scan(f, nlines = 1, what = "character", quiet = TRUE, sep = "\t") # Read header
    if (length(head) < 3) {
      stop("Data in file must have at least 3 columns", call. = FALSE)
    }
    sampleid <- head[-c(1:2)]
    nSample <- length(sampleid)

    # Read just the two first columns to get chrom and pos
    chrom.pos <- read.table(file = data, sep = "\t", header = TRUE, colClasses = c(rep(NA, 2), rep("NULL", nSample)), as.is = TRUE)
    chrom <- chrom.pos[, 1]
    position <- chrom.pos[, 2]
  }

  # Make sure chrom is not factor:
  if (is.factor(chrom)) {
    # If chrom is factor; convert to character
    chrom <- as.character(chrom)
  }
  # Make sure chromosomes are numeric (replace X and Y by 23 and 24)
  num.chrom <- numericChrom(chrom)
  nProbe <- length(num.chrom)

  # Make sure position is numeric:
  if (!is.numeric(position)) {
    stop("input in data column 2 (posistions) must be numeric", call. = FALSE)
  }

  # Get character arms:
  if (is.null(arms)) {
    arms <- getArms(num.chrom, position, pos.unit, get(assembly))
  } else {
    stopifnot(length(arms) == nProbe)
  }
  # Translate to numeric arms:
  num.arms <- numericArms(num.chrom, arms)

  # Unique arms:
  arm.list <- unique(num.arms)
  nArm <- length(arm.list)

  # Check that w is same length as number of samples:
  if (length(w) == 1) {
    w <- rep(w, nSample)
  } else if (length(w) != nSample) {
    stop("'w' must be a single number or a vector of same length as the number of samples in 'data'", call. = FALSE)
  }

  # Check Y input:
  if (!is.null(Y)) {
    stopifnot(class(Y) %in% c("matrix", "data.frame", "character"))
    isfile.Y <- class(Y) == "character"

    if (!isfile.Y) {
      ncol.Y <- ncol(Y)
      nrow.Y <- nrow(Y)
    } else {
      f.y <- file(Y, "r")
      ncol.Y <- length(scan(f.y, nlines = 1, what = "character", quiet = TRUE, sep = "\t"))
      nrow.Y <- nrow(read.table(file = Y, sep = "\t", header = TRUE, colClasses = c(NA, rep("NULL", ncol.Y - 1)), as.is = TRUE))
    }
    if (nrow.Y != nProbe || ncol.Y != nSample + 2) {
      stop("Input Y does not represent the same number of probes and samples as found in input data", call. = FALSE)
    }
  }

  # Create folders in working directory where results are saved:
  # Initialize
  seg.names <- c("chrom", "arm", "start.pos", "end.pos", "n.probes", sampleid)
  mpcf.names <- c("chrom", "pos", sampleid)
  segments <- data.frame(matrix(nrow = 0, ncol = nSample + 5))
  colnames(segments) <- seg.names
  if (return.est) {
    mpcf.est <- matrix(nrow = 0, ncol = nSample)
  }
  if (save.res) {
    if (is.null(file.names)) {
      # Create directory where results are to be saved
      dir.res <- "multipcf_results"
      if (!dir.res %in% dir()) {
        dir.create(dir.res)
      }
      file.names <- c(paste(dir.res, "/", "estimates.txt", sep = ""), paste(dir.res, "/", "segments.txt", sep = ""))
    } else {
      # Check that file.names is the correct length
      if (length(file.names) < 2) {
        stop("'file.names' must be of length 2", call. = FALSE)
      }
    }
  }

  # estimates must be returned from routines if return.est or save.res
  yest <- any(return.est, save.res)

  # If normalize=T, we will scale by the sample residual standard error. If the number of probes in the data set < 100K, the MAD sd-estimate is calculated using all obs for the sample:
  sd <- rep(1, nSample) # to have a value to check in if-test later; not used otherwise. sd is only used if normalize=TRUE and then these values are replaced by MAD-sd
  if (normalize && nProbe < 100000) {
    for (j in 1:nSample) {
      if (!isfile.data) {
        sample.data <- data[, j + 2]
      } else {
        cc <- rep("NULL", nSample + 2)
        cc[j + 2] <- "numeric"
        # only read data for the j'th sample
        sample.data <- read.table(file = data, sep = "\t", header = TRUE, colClasses = cc)[, 1]
      }
      sd[j] <- getMad(sample.data[!is.na(sample.data)], k = 25) # Take out missing values before calculating mad
    }
  }

  # Scale gamma according to the number of samples:
  gamma <- gamma * nSample

  # run multiPCF separately on each chromosomearm:
  for (c in 1:nArm) {
    probe.c <- which(num.arms == arm.list[c])
    pos.c <- position[probe.c]
    nProbe.c <- length(probe.c)

    # get data for this arm
    if (!isfile.data) {
      arm.data <- data[probe.c, -c(1:2), drop = FALSE]
    } else {
      # Read data for this arm from file; since f is a opened connection, the reading will start on the next line which has not already been read
      # two first columns skipped
      arm.data <- read.table(f, nrows = nProbe.c, sep = "\t", colClasses = c(rep("NULL", 2), rep("numeric", nSample)))
    }

    # Check that there are no missing values:
    if (any(is.na(arm.data))) {
      stop("multiPCF cannot be run because there are missing data values, see 'imputeMissing' for imputation of missing values")
    }

    # Make sure data is numeric:
    if (any(!sapply(arm.data, is.numeric))) {
      stop("input in data columns 3 and onwards (copy numbers) must be numeric", call. = FALSE)
    }

    # If normalize=T and nProbe>=100K, we calculate the MAD sd-estimate for each sample using only obs in this arm
    if (normalize && nProbe >= 100000) {
      sd <- apply(arm.data, 2, getMad)
    }

    # Check sd; cannot normalize if sd=0 or if sd=NA:
    if (any(sd == 0) || any(is.na(sd))) {
      # not run multipcf, return mean for each sample:
      m <- apply(arm.data, 2, mean)
      dim(m) <- c(length(m), 1)
      if (yest) {
        yhat <- sapply(m, rep, nrow(arm.data))
        mpcf <- list(pcf = t(yhat), nIntervals = 1, start0 = 1, length = nrow(arm.data), mean = m)
      } else {
        mpcf <- list(nIntervals = 1, start0 = 1, length = nrow(arm.data), mean = m)
      }
    } else {
      # normalize data data (sd=1 if normalize=FALSE)
      arm.data <- sweep(arm.data, 2, sd, "/")

      # weight data (default weights is 1)
      arm.data <- sweep(arm.data, 2, w, "*")

      # Run multipcf:
      if (!fast || nrow(arm.data) < 400) {
        mpcf <- doMultiPCF(as.matrix(t(arm.data)), gamma = gamma, yest = yest) # requires samples in rows, probes in columns
        # note: returns samples in rows, estimates in columns.
      } else {
        mpcf <- selectFastMultiPcf(as.matrix(arm.data), gamma = gamma, L = 15, yest = yest) # requires samples in columns, probes in rows
      }

      # "Unweight" estimates:
      mpcf$mean <- sweep(mpcf$mean, 1, w, "/")
      if (yest) {
        mpcf$pcf <- sweep(mpcf$pcf, 1, w, "/")
      }
      # "Un-normalize" estimates:
      mpcf$mean <- sweep(mpcf$mean, 1, sd, "*")
      if (yest) {
        mpcf$pcf <- sweep(mpcf$pcf, 1, sd, "*")
      }
    }

    # Information about segments:
    nSeg <- mpcf$nIntervals
    start0 <- mpcf$start0
    n.pos <- mpcf$length
    seg.mean <- t(mpcf$mean) # get samples in columns
    posStart <- pos.c[start0]
    posEnd <- c(pos.c[start0 - 1], pos.c[nProbe.c])

    # Chromosome number and character arm id:
    chr <- unique(chrom[probe.c])
    a <- unique(arms[probe.c])
    chrid <- rep(chr, times = nSeg)
    armid <- rep(a, times = nSeg)

    # May use mean of input data or the observed data specified in Y:
    if (!is.null(Y)) {
      # get Y for this arm
      if (!isfile.Y) {
        arm.Y <- Y[probe.c, -c(1:2), drop = FALSE]
      } else {
        arm.Y <- read.table(f.y, nrows = length(probe.c), sep = "\t", colClasses = c(rep("NULL", 2), rep("numeric", nSample)))
      }
      # Make sure Y is numeric:
      if (any(!sapply(arm.Y, is.numeric))) {
        stop("input in Y columns 3 and onwards (copy numbers) must be numeric", call. = FALSE)
      }
      # Use observed data to calculate segment mean (recommended)
      seg.mean <- matrix(NA, nrow = nSeg, ncol = nSample)
      for (s in 1:nSeg) {
        seg.Y <- as.matrix(arm.Y[start0[s]:(start0[s] + n.pos[s] - 1), ])
        seg.mean[s, ] <- apply(seg.Y, 2, mean, na.rm = TRUE)
      }
    }

    # Round
    if (yest) {
      yhat <- round(mpcf$pcf, digits = digits)
    }
    seg.mean <- round(seg.mean, digits = digits)

    # Data frame:
    segments.c <- data.frame(chrid, armid, posStart, posEnd, n.pos, seg.mean, stringsAsFactors = FALSE)
    colnames(segments.c) <- seg.names

    # Should results be written to files or returned to user:
    if (save.res) {
      if (c == 1) {
        # open connection for writing to file
        w1 <- file(file.names[1], "w")
        w2 <- file(file.names[2], "w")
      }

      # Write segments to file for this arm
      write.table(segments.c, file = w2, col.names = if (c == 1) seg.names else FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
      # Write estimated multiPCF-values file for this arm:
      write.table(data.frame(chrom[probe.c], pos.c, t(yhat), stringsAsFactors = FALSE), file = w1, col.names = if (c == 1) mpcf.names else FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
    }

    # Append results for this arm:
    segments <- rbind(segments, segments.c)
    if (return.est) {
      mpcf.est <- rbind(mpcf.est, t(yhat))
    }

    if (verbose) {
      cat(paste("multipcf finished for chromosome arm ", chr, a, sep = ""), "\n")
    }
  } # endfor

  # Close connections
  if (isfile.data) {
    close(f)
  }
  if (!is.null(Y)) {
    if (isfile.Y) {
      close(f.y)
    }
  }

  if (save.res) {
    cat(paste("multipcf-estimates were saved in file", file.names[1]), sep = "\n")
    close(w1)
    cat(paste("segments were saved in file", file.names[2]), sep = "\n")
    close(w2)
  }

  # return results:
  if (return.est) {
    mpcf.est <- data.frame(chrom, position, mpcf.est, stringsAsFactors = FALSE)
    colnames(mpcf.est) <- mpcf.names
    return(list(estimates = mpcf.est, segments = segments))
  } else {
    return(segments)
  }
} # endfunction




# Run exact multipcf algorithm, to be called by multipcf (main function)
doMultiPCF <- function(y, gamma, yest) {
  ## y: input matrix of copy number estimates, samples in rows
  ## gamma: penalty for discontinuities
  ## yest: logical, should estimates be returned
  N <- length(y)
  nSamples <- nrow(y)
  nProbes <- ncol(y)
  ## initialisations
  if (yest) {
    yhat <- rep(0, N)
    dim(yhat) <- c(nSamples, nProbes)
  }
  bestCost <- rep(0, nProbes)
  bestSplit <- rep(0, nProbes + 1)
  bestAver <- rep(0, N)
  dim(bestAver) <- c(nSamples, nProbes)
  Sum <- rep(0, N)
  dim(Sum) <- c(nSamples, nProbes)
  Aver <- rep(0, N)
  dim(Aver) <- c(nSamples, nProbes)
  Nevner <- rep(0, N)
  dim(Nevner) <- c(nProbes, nSamples)
  Nevner <- t(Nevner + (nProbes:1))
  eachCost <- rep(0, N)
  dim(eachCost) <- c(nSamples, nProbes)
  Cost <- rep(0, nProbes)
  ## Filling of first elements
  y1 <- y[, 1]
  Sum[, 1] <- y1
  Aver[, 1] <- y1
  bestCost[1] <- 0
  bestSplit[1] <- 0
  bestAver[, 1] <- y1
  helper <- rep(1, nSamples)
  ## Solving for gradually longer arrays. Sum accumulates
  ## values for errors for righthand plateau downward from n;
  ## this error added to gamma and the stored cost in bestCost
  ## give the total cost. Cost stores the total cost for breaks
  ## at any position below n, and which.min finds the position
  ## with lowest cost (best split). Aver is the average of the
  ## righthand plateau.
  for (n in 2:nProbes) {
    Sum[, 1:n] <- Sum[, 1:n] + y[, n]
    Aver[, 1:n] <- Sum[, 1:n] / Nevner[, (nProbes - n + 1):nProbes]
    eachCost[, 1:n] <- -(Sum[, 1:n] * Aver[, 1:n])
    Cost[1:n] <- helper %*% eachCost[, 1:n]
    Cost[2:n] <- Cost[2:n] + bestCost[1:(n - 1)] + gamma
    Pos <- which.min(Cost[1:n])
    bestCost[n] <- Cost[Pos]
    bestAver[, n] <- Aver[, Pos]
    bestSplit[n] <- Pos - 1
  }
  ## The final solution is found iteratively from the sequence
  ## of split positions stored in bestSplit and the averages
  ## for each plateau stored in bestAver
  n <- nProbes
  antInt <- 0
  while (n > 0) {
    if (yest) {
      yhat[, (bestSplit[n] + 1):n] <- bestAver[, n]
    }
    n <- bestSplit[n]
    antInt <- antInt + 1
  }
  n <- nProbes
  lengde <- rep(0, antInt)
  start0 <- rep(0, antInt)
  verdi <- rep(0, antInt * nSamples)
  dim(verdi) <- c(nSamples, antInt)
  oldSplit <- n
  antall <- antInt
  while (n > 0) {
    start0[antall] <- bestSplit[n] + 1
    lengde[antall] <- oldSplit - bestSplit[n]
    verdi[, antall] <- bestAver[, n]
    n <- bestSplit[n]
    oldSplit <- n
    antall <- antall - 1
  }
  if (yest) {
    return(list(pcf = yhat, length = lengde, start0 = start0, mean = verdi, nIntervals = antInt))
  } else {
    return(list(length = lengde, start0 = start0, mean = verdi, nIntervals = antInt))
  }
}


## Choose fast multipcf version, called by multipcf (main function)
selectFastMultiPcf <- function(x, gamma, L, yest) {
  xLength <- nrow(x)
  if (xLength < 1000) {
    result <- runFastMultiPCF(x, gamma, L, 0.15, 0.15, yest)
  } else {
    if (xLength < 3000) {
      result <- runFastMultiPCF(x, gamma, L, 0.12, 0.10, yest)
    } else {
      if (xLength < 15000) {
        result <- runFastMultiPCF(x, gamma, L, 0.12, 0.05, yest)
      } else {
        result <- runMultiPcfSubset(x, gamma, L, 0.12, 0.05, yest)
      }
    }
  }
  return(result)
}


# Fast version 1, for moderately long sequences, called by selectFastMultiPcf
runFastMultiPCF <- function(x, gamma, L, frac1, frac2, yest) {
  mark <- rep(0, nrow(x))
  mark <- sawMarkM(x, L, frac1, frac2)
  dense <- compactMulti(t(x), mark)
  compPotts <- multiPCFcompact(dense$Nr, dense$Sum, gamma)
  if (yest) {
    potts <- expandMulti(nrow(x), ncol(x), compPotts$Lengde, compPotts$mean)
    return(list(
      pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta,
      mean = compPotts$mean, nIntervals = compPotts$nIntervals
    ))
  } else {
    return(list(
      length = compPotts$Lengde, start0 = compPotts$sta,
      mean = compPotts$mean, nIntervals = compPotts$nIntervals
    ))
  }
}

# Fast version 2, for very long sequences, called by selectFastMultiPcf
runMultiPcfSubset <- function(x, gamma, L, frac1, frac2, yest) {
  SUBSIZE <- 5000 # length of subsets
  antGen <- nrow(x)
  mark <- sawMarkM(x, L, frac1, frac2)
  markInit <- c(mark[1:(SUBSIZE - 1)], TRUE)
  compX <- compactMulti(t(x[1:SUBSIZE, ]), markInit)
  mark2 <- rep(FALSE, antGen)
  mark2[1:SUBSIZE] <- markMultiPotts(compX$Nr, compX$Sum, gamma, SUBSIZE)
  mark2[4 * SUBSIZE / 5] <- TRUE
  start0 <- 4 * SUBSIZE / 5 + 1
  while (start0 + SUBSIZE < antGen) {
    slutt <- start0 + SUBSIZE - 1
    markSub <- c(mark2[1:(start0 - 1)], mark[start0:slutt])
    markSub[slutt] <- TRUE
    compX <- compactMulti(t(x[1:slutt, ]), markSub)
    mark2[1:slutt] <- markMultiPotts(compX$Nr, compX$Sum, gamma, slutt)
    start0 <- start0 + 4 * SUBSIZE / 5
    mark2[start0 - 1] <- TRUE
  }
  markSub <- c(mark2[1:(start0 - 1)], mark[start0:antGen])
  compX <- compactMulti(t(x), markSub)
  compPotts <- multiPCFcompact(compX$Nr, compX$Sum, gamma)
  if (yest) {
    potts <- expandMulti(nrow(x), ncol(x), compPotts$Lengde, compPotts$mean)
    return(list(
      pcf = potts, length = compPotts$Lengde, start0 = compPotts$sta,
      mean = compPotts$mean, nIntervals = compPotts$nIntervals
    ))
  } else {
    return(list(
      length = compPotts$Lengde, start0 = compPotts$sta,
      mean = compPotts$mean, nIntervals = compPotts$nIntervals
    ))
  }
}

# function that accumulates numbers of observations and sums between potential breakpoints
compactMulti <- function(y, mark) {
  antGen <- ncol(y)
  antSample <- nrow(y)
  antMark <- sum(mark)
  ant <- rep(0, antMark)
  sum <- rep(0, antMark * antSample)
  dim(sum) <- c(antSample, antMark)
  pos <- 1
  oldPos <- 0
  count <- 1
  delSum <- rep(0, antSample)
  while (pos <= antGen) {
    delSum <- 0
    while (mark[pos] < 1) {
      delSum <- delSum + y[, pos]
      pos <- pos + 1
    }
    ant[count] <- pos - oldPos
    sum[, count] <- delSum + y[, pos]
    oldPos <- pos
    pos <- pos + 1
    count <- count + 1
  }
  list(Nr = ant, Sum = sum)
}


# main calculations for fast multipcf-versions
multiPCFcompact <- function(nr, sum, gamma) {
  ## nr,sum : numbers and sums for one analysis unit,
  ## typically one chromosomal arm. Samples assumed to be in rows.
  ## gamma: penalty for discontinuities
  N <- length(nr)
  nSamples <- nrow(sum)
  ## initialisations
  yhat <- rep(0, N * nSamples)
  dim(yhat) <- c(nSamples, N)
  bestCost <- rep(0, N)
  bestSplit <- rep(0, N + 1)
  bestAver <- rep(0, N * nSamples)
  dim(bestAver) <- c(nSamples, N)
  Sum <- rep(0, N * nSamples)
  dim(Sum) <- c(nSamples, N)
  Nevner <- rep(0, N * nSamples)
  dim(Nevner) <- c(nSamples, N)
  eachCost <- rep(0, N * nSamples)
  dim(eachCost) <- c(nSamples, N)
  Cost <- rep(0, N)
  ## Filling of first elements
  Sum[, 1] <- sum[, 1]
  Nevner[, 1] <- nr[1]
  bestSplit[1] <- 0
  bestAver[, 1] <- sum[, 1] / nr[1]
  helper <- rep(1, nSamples)
  bestCost[1] <- helper %*% (-Sum[, 1] * bestAver[, 1])
  lengde <- rep(0, N)

  ## Solving for gradually longer arrays. Sum accumulates
  ## error values for righthand plateau downward from n;
  ## this error added to gamma and the stored cost in bestCost
  ## give the total cost. Cost stores the total cost for breaks
  ## at any position below n, and which.min finds the position
  ## with lowest cost (best split). Aver is the average of the
  ## righthand plateau.
  for (n in 2:N) {
    Sum[, 1:n] <- Sum[, 1:n] + sum[, n]
    Nevner[, 1:n] <- Nevner[, 1:n] + nr[n]
    eachCost[, 1:n] <- -(Sum[, 1:n]^2) / Nevner[, 1:n]
    Cost[1:n] <- helper %*% eachCost[, 1:n]
    Cost[2:n] <- Cost[2:n] + bestCost[1:(n - 1)] + gamma
    Pos <- which.min(Cost[1:n])
    cost <- Cost[Pos]
    aver <- Sum[, Pos] / Nevner[, Pos]
    bestCost[n] <- cost
    bestAver[, n] <- aver
    bestSplit[n] <- Pos - 1
  }

  ## The final solution is found iteratively from the sequence
  ## of split positions stored in bestSplit and the averages
  ## for each plateau stored in bestAver

  n <- N
  antInt <- 0
  while (n > 0) {
    yhat[, (bestSplit[n] + 1):n] <- bestAver[, n]
    antInt <- antInt + 1
    lengde[antInt] <- sum(nr[(bestSplit[n] + 1):n])
    n <- bestSplit[n]
  }
  lengdeRev <- lengde[antInt:1]
  init <- rep(0, antInt)
  init[1] <- 1
  if (antInt >= 2) {
    for (k in 2:antInt) {
      init[k] <- init[k - 1] + lengdeRev[k - 1]
    }
  }

  n <- N
  verdi <- rep(0, antInt * nSamples)
  dim(verdi) <- c(nSamples, antInt)
  bestSplit[n + 1] <- n
  antall <- antInt
  while (n > 0) {
    verdi[, antall] <- bestAver[, n]
    n <- bestSplit[n]
    antall <- antall - 1
  }

  list(Lengde = lengdeRev, sta = init, mean = verdi, nIntervals = antInt)
}

# helper function for fast version 2
markMultiPotts <- function(nr, sum, gamma, subsize) {
  ## nr,sum: numbers and sums for one analysis unit,
  ##  typically one chromosomal arm. Samples assumed to be in rows.
  ## gamma: penalty for discontinuities
  N <- length(nr)
  nSamples <- nrow(sum)
  markSub <- rep(FALSE, N)
  bestCost <- rep(0, N)
  bestSplit <- rep(0, N + 1)
  bestAver <- rep(0, N * nSamples)
  dim(bestAver) <- c(nSamples, N)
  Sum <- rep(0, N * nSamples)
  dim(Sum) <- c(nSamples, N)
  Nevner <- rep(0, N * nSamples)
  dim(Nevner) <- c(nSamples, N)
  eachCost <- rep(0, N * nSamples)
  dim(eachCost) <- c(nSamples, N)
  Cost <- rep(0, N)
  ## Filling of first elements
  Sum[, 1] <- sum[, 1]
  Nevner[, 1] <- nr[1]
  bestSplit[1] <- 0
  bestAver[, 1] <- sum[, 1] / nr[1]
  helper <- rep(1, nSamples)
  bestCost[1] <- helper %*% (-Sum[, 1] * bestAver[, 1])
  lengde <- rep(0, N)
  for (n in 2:N) {
    Sum[, 1:n] <- Sum[, 1:n] + sum[, n]
    Nevner[, 1:n] <- Nevner[, 1:n] + nr[n]
    eachCost[, 1:n] <- -(Sum[, 1:n]^2) / Nevner[, 1:n]
    Cost[1:n] <- helper %*% eachCost[, 1:n]
    Cost[2:n] <- Cost[2:n] + bestCost[1:(n - 1)] + gamma
    Pos <- which.min(Cost[1:n])
    cost <- Cost[Pos]
    aver <- Sum[, Pos] / Nevner[, Pos]
    bestCost[n] <- cost
    bestAver[, n] <- aver
    bestSplit[n] <- Pos - 1
    markSub[Pos - 1] <- TRUE
  }
  help <- findMarksMulti(markSub, nr, subsize)
  return(help)
}


# Function which finds potential breakpoints
findMarksMulti <- function(markSub, Nr, subsize) {
  ## markSub: marks in compressed scale
  ## NR: number of observations between potenstial breakpoints
  mark <- rep(FALSE, subsize) ## marks in original scale
  if (sum(markSub) < 1) {
    return(mark)
  } else {
    N <- length(markSub)
    ant <- seq(1:N)
    help <- ant[markSub]
    lengdeHelp <- length(help)
    help0 <- c(0, help[1:(lengdeHelp - 1)])
    lengde <- help - help0
    start0 <- 1
    oldStart <- 1
    startOrig <- 1
    for (i in 1:lengdeHelp) {
      start0 <- start0 + lengde[i]
      lengdeOrig <- sum(Nr[oldStart:(start0 - 1)])
      startOrig <- startOrig + lengdeOrig
      mark[startOrig - 1] <- TRUE
      oldStart <- start0
    }
    return(mark)
  }
}


## expand compact solution
expandMulti <- function(nProbes, nSamples, lengthInt, mean) {
  ## input: nr of probes, length of intervals,
  ## value in intervals; returns the expansion

  Potts <- rep(0, nProbes * nSamples)
  dim(Potts) <- c(nSamples, nProbes)
  lengthCompArr <- length(lengthInt)
  k <- 1
  for (i in 1:lengthCompArr) {
    for (j in 1:lengthInt[i]) {
      Potts[, k] <- mean[, i]
      k <- k + 1
    }
  }
  return(Potts)
}

## sawtooth-filter for multiPCF - marks potential breakpoints. Uses two
## sawtoothfilters, one lang (length L) and one short (fixed length 6)
sawMarkM <- function(x, L, frac1, frac2) {
  nrProbes <- nrow(x)
  nrSample <- ncol(x)
  mark <- rep(0, nrProbes)
  sawValue <- rep(0, nrProbes)
  filter <- rep(0, 2 * L)
  sawValue2 <- rep(0, nrProbes)
  filter2 <- rep(0, 6)
  for (k in 1:L) {
    filter[k] <- k / L
    filter[2 * L + 1 - k] <- -k / L
  }

  for (k in 1:3) {
    filter2[k] <- k / 3
    filter2[7 - k] <- -k / 3
  }

  for (l in 1:(nrProbes - 2 * L + 1)) {
    for (m in 1:nrSample) {
      diff <- crossprod(filter, x[l:(l + 2 * L - 1), m])
      sawValue[l + L - 1] <- sawValue[l + L - 1] + abs(diff)
    }
  }
  limit <- quantile(sawValue, (1 - frac1))
  for (l in 1:(nrProbes - 2 * L)) {
    if (sawValue[l + L - 1] > limit) {
      mark[l + L - 1] <- 1
    }
  }
  for (l in (L - 1):(nrProbes - L - 2)) {
    for (m in 1:nrSample) {
      diff2 <- crossprod(filter2, x[l:(l + 5), m])
      sawValue2[l + 2] <- sawValue2[l + 2] + abs(diff2)
    }
  }
  limit2 <- quantile(sawValue2, (1 - frac2))
  for (l in (L - 1):(nrProbes - L - 2)) {
    if (sawValue2[l + 2] > limit2) {
      mark[l + 2] <- 1
    }
  }
  for (l in 1:L) {
    mark[l] <- 1
    mark[nrProbes + 1 - l] <- 1
  }

  return(mark)
}
igordot/copynumber documentation built on Feb. 23, 2025, 1:47 a.m.