synapter: Label-free data analysis pipeline for optimal identification and quantitation

filterFunction <- function(x) {
  ## keep only function 1 in Pep3DAMRT
  ## was Function column before (see issue #67)
  x[x$matchedEMRTs == 1,]
}

filterPeptideMatchType <- function(x) {
  ## keep peptide.matchType PepFrag1 or pepFrag2
  x[x$peptide.matchType %in% c("PepFrag1", "PepFrag2"),]
}


## pepScore filtering
filterPepScore <- function(dfr,
                           fdr = 0.01,
                           method = c("BH", "qval", "Bonferroni")) {
  method <- match.arg(method)
  ## Assumes that peptide data has been
  ##
  sel <- vector("logical", length=nrow(dfr))
  sel1 <- dfr$peptide.matchType == "PepFrag1"
  sel2 <- dfr$peptide.matchType == "PepFrag2"
  ## signif1 <- (dfr[sel1, "qval"] <= fdr)
  ## signif2 <- (dfr[sel2, "qval"] <= fdr)
  ## v 0.7.7 - switching
  signif1 <- (dfr[sel1, method] <= fdr)
  signif2 <- (dfr[sel2, method] <= fdr)
  if (anyNA(signif1) | anyNA(signif2)) {
    stop("Filtering NA qvalues out.")
    ## signif1[is.na(signif1)] <- FALSE
    ## signif2[is.na(signif2)] <- FALSE
  }
  sel[sel1] <- signif1
  sel[sel2] <- signif2
  dfr[sel,]
}


filterProtFpr <- function(pepdata, fpr) {
  pepdata[pepdata$protein.falsePositiveRate < fpr, ]
}

error.ppm <- function(obs, theo) {
  ## Estimating MS mass accuracy
  ## Error(ppm) = 1e6 * (observed m/z - exact m/z) / (exact m/z)
  ## with observed m/z = xx$precursor.mhp.[hd]mse
  ##      thoeretical m/z = xx$peptide.mhp.[hd]mse -- calculated from sequence
  ((obs - theo)/theo) * 1e6
}

filter.error.ppm <- function(x, colname, ppm = 2) {
  ## t is the +/- ppm error threshold
  ## this will also be a percentage of points
  ## we want to keep and will be estimated from
  ## calculated mean +/- sd
  ## using ... ?
  if (length(colname) != 1)
    stop("Require exactly 1 column name.")
  if (!colname %in% names(x))
    stop(paste(colname, "not found!"))
  sel <- abs(x[,colname]) < ppm
  x[sel,]
}

loessModel <- function(x, y, span) {
  loess(y ~ x, span = span, degree = 1, family = "symmetric", iterations = 4, surface = "direct")
}

loessModelPredSd <- function(x, y, span) {
  lo <- loessModel(x, y, span)
  o <- order(x)
  pp <- predict(lo, x, se=TRUE)
  sd <- pp$se.fit * sqrt(lo$n) ## get sd from se
  stopifnot(all.equal(pp$fit, fitted(lo), check.attributes = FALSE))
  list(lo = lo,
       o = o,
       preds = pp,
       sd = sd)
}

modelRetTime <- function(retT, deltaRt, span) {
  ## delta = hdmse.rt - mse.rt
  ## hdmse' = mse.rt + delta
  ##        = mse.rt + hdmse.rt - mse.rt
  ##        = hdmse.rt
  ## mse' = hdmse - delta
  ##      = hdmse - (hdmse.rt - mse.rt)
  ##      = hdmse - hdmse.rt + mse.rt
  ##      = mse
  loessModelPredSd(retT, deltaRt, span)
}

.modelIntensity <- function(retT, intenRatio, span) {
  loessModelPredSd(retT, intenRatio, span)
}

predictIntensities <- function(emrts, model) {
  2L^(predict(model$lo, emrts$precursor.retT))
}

doHDMSePredictions <- function(identpep, model, nsd) {
  ## changes in v 0.4.6 - if available, ans is retrieved
  ## from the input data.frame, else it is computed using
  ## the model.
  if (all(c("predictedRt", "sdRt") %in% names(identpep))) {
    ## get from identpep dataframe
    .fitted <- NA
    .predicted <- identpep$predictedRt
    .sd <- identpep$sdRt
  } else {
    ## compute from model
    .o <- order(identpep$precursor.retT)
    .allpreds <- predict(model$lo, identpep$precursor.retT, se=TRUE)
    .sd <- .allpreds$se.fit[.o] * sqrt(model$lo$n) ## get sd from se
    .predicted <- identpep$precursor.retT - .allpreds$fit[.o]
    .fitted <- .allpreds$fit[.o]
  }
  if (!missing(nsd)) {
    .lower <- .predicted - (nsd * .sd)
    .upper <- .predicted + (nsd * .sd)
    stopifnot(all(.lower <= .upper))
  } else {
    .lower <- .upper <- NA
  }
  stopifnot(length(.predicted) == length(.sd))
  list(fitted = .fitted,
       predicted = .predicted,
       lower = .lower,
       upper = .upper,
       mass = identpep$peptide.mhp,
       sd = .sd)
}

findRtIndices <- function(sortedPep3d, lowerHDMSeRt, upperHDMSeRt) {
  stopifnot(all(lowerHDMSeRt <= upperHDMSeRt))
  stopifnot(length(lowerHDMSeRt) == length(upperHDMSeRt))

  lowerIdx <- findInterval(lowerHDMSeRt, sortedPep3d$rt_min)+1
  upperIdx <- findInterval(upperHDMSeRt, sortedPep3d$rt_min)

  ## just to be sure
  lowerIdx <- pmin(lowerIdx, upperIdx)

  ## create matrix with boundaries (col 1 and 2)
  cbind(lower=lowerIdx, upper=upperIdx)
}

findMzIndices <- function(pep3dMz, hdmseMz, rtIndices, ppmthreshold) {
  stopifnot(nrow(rtIndices) == length(hdmseMz))
  l <- vector(mode="list", length=nrow(rtIndices))
  for (i in seq(along=l)) {
    j <- rtIndices[i, 1L]:rtIndices[i, 2L]
    l[[i]] <- j[abs(error.ppm(obs=pep3dMz[j], theo=hdmseMz[i])) < ppmthreshold]
  }
  l
}

findImIndices <- function(pep3dIm, hdmseIm, mzIndices, imthreshold) {
  if (imthreshold == Inf) {
    return(mzIndices)
  }
  stopifnot(length(mzIndices) == length(hdmseIm))
  l <- vector(mode="list", length=length(mzIndices))
  for (i in seq(along=l)) {
    l[[i]] <-
      mzIndices[[i]][abs(pep3dIm[mzIndices[[i]]] - hdmseIm[i]) < imthreshold]
  }
  l
}

calculateGridPerformance <- function(identpep, sortedPep3d, mergedpep, matches) {
  ## Those that match *1* spectrumIDs will be transferred
  ## BUT there is no guarantee that with *1* unique match,
  ##     we get the correct one, even for those that were
  ##     part of the matched high confidence ident+quant
  ##     identified subset!

  ## #############################################################
  n <- length(matches)
  k <- lengths(matches)
  k1 <- which(k == 1L)
  k2 <- which(k > 1L)

  idx <- match(mergedpep$precursor.leID.ident, identpep$precursor.leID)
  precursor.leID.quant <- rep(NA_real_, n)
  precursor.leID.quant[idx] <- mergedpep$precursor.leID.quant

  spectrumID <- rep(NA_real_, n)
  spectrumID[k1] <- sortedPep3d$spectrumID[unlist(matches[k1])]

  ## grd1: number of unique matches divided by total number of matches
  ## => sum(k==1)/length(k) == mean(k==1)
  grd1 <- mean(k == 1L)

  notNaIdx <- which(!is.na(precursor.leID.quant))

  ## grd2: number of correct matched
  ## (MSe peptide's leID == MSe Pep3D's spectrumID)
  ## divided by number of features used in model
  ## non-NAs are those used for modelling
  grd2 <- sum(precursor.leID.quant == spectrumID, na.rm=TRUE)/length(notNaIdx)

  details <- integer(n)
  details[k1] <- -1L
  details[k1][spectrumID[k1] == precursor.leID.quant[k1]] <- 1L
  details[k2] <- -2L
  details[k2][unlist(lapply(k2, function(i) {
    precursor.leID.quant[i] %in% matches[[i]]}))] <- 2L

  ## exclude all values where precursor.leID.quant == NA
  details <- details[notNaIdx]
  details <- .table(details)

  ## make sure that we have alway 5 categories
  grddetails <- c("-2"=0, "-1"=0, "0"=0, "1"=0, "2"=0)
  grddetails[names(details)] <- details

  list(grd1=grd1, grd2=grd2, details=grddetails)
}

findMSeEMRTs <- function(identpep,
                         pep3d,
                         mergedpep,
                         nsd,
                         ppmthreshold,
                         imdiff,
                         model) {
  sortedPep3d <- pep3d
  sortedPep3d <- sortedPep3d[order(pep3d$rt_min),]

  hdmseData <- doHDMSePredictions(identpep, model, nsd)
  rtIdx <- findRtIndices(sortedPep3d, hdmseData$lower, hdmseData$upper)
  mzIdx <- findMzIndices(sortedPep3d$mwHPlus, hdmseData$mass, rtIdx,
                         ppmthreshold)
  res <- findImIndices(sortedPep3d$clust_drift, identpep$precursor.Mobility,
                       mzIdx, imdiff)

  k <- lengths(res)

  ## Those that match *1* spectrumIDs will be transferred
  ## BUT there is no guarantee that with *1* unique match,
  ##     we get the correct one, even for those that were
  ##     part of the matched high confidence ident+quant
  ##     identified subset!

  ## #############################################################

  n <- length(k)
  m <- ncol(pep3d)
  ## to initialise the new pep3d2 with with n rows
  ## and same nb of columns than pep3d
  pep3d2 <- matrix(NA_real_, nrow=n, ncol=m, dimnames=list(c(), colnames(pep3d)))
  pep3d2[, 1] <- k
  k1 <- which(k == 1)
  ## convert to data.frame first to avoid conversion of matrix to character
  ## matrix (because of the new isotopicDistr column) that results in a
  ## data.frame full of factors
  pep3d2 <- as.data.frame(pep3d2, stringsAsFactors = FALSE)
  pep3d2[k1, ] <- sortedPep3d[unlist(res[k1]), ]

  ## #############################################################

  ans <- cbind(identpep, pep3d2)

  ans$matched.quant.spectrumIDs <- MSnbase:::utils.list2ssv(res, sep=";")

  ans$precursor.leID.quant <- NA_real_
  idx <- match(mergedpep$precursor.leID.ident, ans$precursor.leID)

  ans$precursor.leID.quant[idx] <- mergedpep$precursor.leID.quant
  i <- grep("precursor.leID$", names(ans))
  names(ans)[i] <- "precursor.leID.ident" ## to avoid any confusion

  ans$idSource <- "transfer"

  ## since v 0.5.0 - removing multiply matched EMRTs
  ## dupIDs <- ans$spectrumID[ans$Function == 1 & duplicated(ans$spectrumID)]
  ## dupRows <- ans$spectrumID %in% dupIDs
  ## ans[dupRows, (ncol(identpep)+1) : ncol(ans)] <- -1

  ans
}


estimate.mass.range <- function(Mhdmse, ppmt) {
  mass.ranges <- sapply(Mhdmse, function(m)
                        c(( (ppmt * m) / 1e6) + m,
                          ((-ppmt * m) / 1e6) + m))
  t(mass.ranges)
}


gridSearch3 <- function(model,
                        identpep,
                        pep3d,
                        mergedPeptides,
                        ppms,
                        nsds,
                        imdiffs,
                        verbose = interactive()) {
  ## As initial gridSearch, but now returns a list
  ## with two grids; first one as before, percent of
  ## uniquely matched features; second is percent of
  ## correctly assigned features (based on merged features
  ## used to model rt).
  ##

  ## set imdiffs=Inf to disable 3D grid search

  n <- length(nsds)
  m <- length(ppms)
  o <- length(imdiffs)
  grd1 <- grd2 <- array(NA_integer_, dim=c(n, m, o),
                        dimnames=list(nsds, ppms, imdiffs))
  N <- n*m*o
  details <- vector("list", length=N)

  ._k <- 0
  if (verbose) {
    pb <- txtProgressBar(min=0, max=N, style=3)
  }

  sortedPep3d <- pep3d[order(pep3d$rt_min),]

  for (i in 1:n) {
    hdmseData <- doHDMSePredictions(identpep, model, nsds[i])
    rtIdx <- findRtIndices(sortedPep3d, hdmseData$lower, hdmseData$upper)

    for (j in 1:m) {
      mzIdx <- findMzIndices(sortedPep3d$mwHPlus, hdmseData$mass, rtIdx,
                             ppms[j])
      for (k in 1:o) {
        ._k <- ._k + 1L

        matches <- findImIndices(sortedPep3d$clust_drift,
                                 identpep$precursor.Mobility, mzIdx,
                                 imdiffs[k])
        gridDetails <- calculateGridPerformance(identpep, sortedPep3d,
                                                mergedPeptides, matches)
        grd1[i, j, k] <- gridDetails$grd1
        grd2[i, j, k] <- gridDetails$grd2
        details[[._k]] <- gridDetails$details
      }

      if (verbose) {
        setTxtProgressBar(pb, ._k)
      }
    }
  }
  if (verbose) {
    close(pb)
  }
  nms <- paste(rep(nsds, each = m*o) ,
               replicate(n, rep(ppms, each = o)),
               rep(imdiffs, m*n),
               sep=":")
  names(details) <- nms
  list(prcntTotal = grd1,
       prcntModel = grd2,
       details = details)
}

makeFigurePath <- function(dirname, filename) {
  full <- paste(dirname, "/", filename, sep="")
  rel <- filename
  list(full=full, relative=rel)
}

getQs <- function(x, qtls) {
    xi <- length(x) * qtls
    yi <- quantile(sort(abs(x)), qtls)
    list(x=xi, y=yi)
}

keepUniqueSpectrumIds <- function(pep3d) {
  ## The EMRTs spectra are duplicated
  ## for different charge states, isotopes, ...
  ## This functions removes the duplicated
  ## spectrum ids and keeps the first instances
  ## only
  pep3d[!duplicated(pep3d$spectrumID),]
}

score2pval <- function(xx) {
  srnd <- xx[xx$protein.dataBaseType == "Random", "peptide.score"]
  sreg <- xx[xx$protein.dataBaseType == "Regular", "peptide.score"]
  nrnd <- length(srnd)
  sapply(sreg, function(x) sum(srnd >= x)/nrnd)
}

getIdStats <- function(pepdata) {
  dbtypes <- sort(unique(pepdata$protein.dataBaseType))
  if (any(dbtypes != c("Random", "Regular")))
    stop("[Regular and Random] not found.")
  ## new in v 0.8.1 -- keeping only unique (as in unique())
  ## peptide entries to calculate statistics, to avoid to biasing
  ## towards repeated identical peptides (from different proteins
  ## for instance).
  ## fixed in v 0.8.3
  pepseqs <- as.character(pepdata$peptide.seq)
  res <- pepdata[, c("peptide.seq",
                     "peptide.score",
                     "peptide.matchType",
                     "protein.dataBaseType")]
  rownames(res) <- make.unique(pepseqs)
  res$pval <- res$qval <- res$Bonferroni <- res$BH <- NA
  ## name subsets
  pf1 <- rownames(res)[res$peptide.matchType == "PepFrag1"]
  pf2 <- rownames(res)[res$peptide.matchType == "PepFrag2"]
  pf1reg <- rownames(res)[res$peptide.matchType == "PepFrag1" &
                          res$protein.dataBaseType == "Regular"]
  pf2reg <- rownames(res)[res$peptide.matchType == "PepFrag2" &
                          res$protein.dataBaseType == "Regular"]
  ## PepFrag1
  .pv1 <- score2pval(res[pf1, ])
  res[pf1reg, "pval"] <- .pv1
  res[pf1reg, "qval"] <- qvalue(.pv1)$qvalues
  .adj1 <- mt.rawp2adjp(.pv1)
  res[pf1reg, "BH"] <- .adj1$adjp[order(.adj1$index), "BH"]
  res[pf1reg, "Bonferroni"] <- .adj1$adjp[order(.adj1$index), "Bonferroni"]
  ## PepFrag2
  .pv2 <- score2pval(res[pf2, ])
  res[pf2reg, "pval"] <- .pv2
  res[pf2reg, "qval"] <- qvalue(.pv2)$qvalues
  .adj2 <- mt.rawp2adjp(.pv2)
  res[pf2reg, "BH"] <- .adj2$adjp[order(.adj2$index), "BH"]
  res[pf2reg, "Bonferroni"] <- .adj2$adjp[order(.adj2$index), "Bonferroni"]
  ## checking
  testsel <- res$peptide.matchType %in% c("PepFrag1", "PepFrag2") & res$protein.dataBaseType == "Regular"
  if (anyNA(res[testsel, "pval"]))
    stop("NA p-values generated")
  ##
  ans <- res[, c("pval", "qval", "BH", "Bonferroni")]
  rownames(ans) <- rownames(pepdata)
  ans
}

## closes #42
isCorrespondingPep3DataFile <- function(quant, pep3d) {
  idx <- match(quant$precursor.leID, pep3d$spectrumID)

  ## We disable the comparison of the intensity values for the file check
  ## temporarly because some files exist that have entries with different
  ## intensity values.
  ## See https://github.com/lgatto/synapter/issues/42 for details
  !anyNA(idx)
#  return(!anyNA(idx) && all(quant$precursor.inten == pep3d$Counts[idx]))
}

## duplicate rows if the have multiple matched.quant.spectrumIDs
## adds a columns "gridSearchResult" that could be "unique-true",
## "unique-false", "non-unique-true", "non-unique-false"
## please note that it would change the order of the data.frame
flatMatchedEMRTs <- function(emrts, pep3d, na.rm=TRUE, verbose=interactive()) {
  if (verbose) {
    message("create flat EMRT data.frame (one matched EMRT per row)")
  }

  if (na.rm) {
    emrts <- emrts[!is.na(emrts$precursor.leID.quant), ]
  }

  ## unique matches
  k1 <- which(emrts$matchedEMRTs == 1)
  emrts$matched.quant.spectrumIDs[k1] <- emrts$spectrumID[k1]

  ## non-unique matches
  k2 <- which(emrts$matchedEMRTs > 1)
  mIds <- matched.quant.spectrumIDs2numeric(emrts$matched.quant.spectrumIDs)

  flatEmrts <- lapply(k2, function(j) {
    ## duplicated current row
    curRow <- emrts[rep(j, length(mIds[[j]])), ]
    ## update matched.quant.spectrumIDs (contain only a single entry now)
    curRow$matched.quant.spectrumIDs <- mIds[[j]]
    ## update spectrumID
    curRow$spectrumID <- curRow$matched.quant.spectrumIDs

    curRow
  })

  ## build final df
  flatEmrts <- do.call(rbind, flatEmrts)
  flatEmrts <- rbind(emrts[k1, ], flatEmrts)
  flatEmrts$matched.quant.spectrumIDs <-
    as.numeric(flatEmrts$matched.quant.spectrumIDs)
  rownames(flatEmrts) <- NULL

  ## refill pep3d information
  rows <- flatEmrts$matched.quant.spectrumIDs

  ## find corresponding columns (but exclude spectrumID and Function
  cols <- .commonColnames(flatEmrts, pep3d,
                          exclude = c("matchedEMRTs", "spectrumID"))

  flatEmrts[, cols] <- pep3d[rows, cols]

  flatEmrts$gridSearchResult <- "no-quant-id"

  isCorrectMatch <- as.numeric(flatEmrts$precursor.leID.quant) == flatEmrts$matched.quant.spectrumIDs

  flatEmrts$gridSearchResult[flatEmrts$matchedEMRTs == 1 & isCorrectMatch] <- "unique-true"
  flatEmrts$gridSearchResult[flatEmrts$matchedEMRTs == 1 & !isCorrectMatch] <- "unique-false"
  flatEmrts$gridSearchResult[flatEmrts$matchedEMRTs > 1 & isCorrectMatch] <- "non-unique-true"
  flatEmrts$gridSearchResult[flatEmrts$matchedEMRTs > 1 & !isCorrectMatch] <- "non-unique-false"

  flatEmrts
}

## see issue 73 for details
## https://github.com/lgatto/synapter/issues/73
.findSynapterPlgsAgreement <- function(emrts) {
  ## single match
  k1 <- emrts$matchedEMRTs == 1

  k1Idx <- which(k1)

  agreement <- rep.int("no_synapter_transfer", nrow(emrts))

  ## agree (same EMRT by database search and synapter)
  ## disagree (different EMRT by database search and synapter)
  mqsid <- lapply(matched.quant.spectrumIDs2numeric(emrts$matched.quant.spectrumIDs[k1Idx]), "[", 1L)
  agreement[k1Idx] <- ifelse(mqsid == as.numeric(emrts$precursor.leID.quant[k1Idx]), "agree", "disagree")

  ## no_plgs_id (transferred by synapter not ided by PLGS)
  agreement[is.na(emrts$precursor.leID.quant)] <- "no_plgs_id"
  ## no_id_or_transfer (not transferred by synapter not ided by PLGS)
  agreement[(is.na(emrts$precursor.leID.quant) & is.na(emrts$matched.quant.spectrumIDs)) |
            (!k1 & is.na(emrts$precursor.leID.quant))] <- "no_id_or_transfer"

  ## multiple ident matches
  if (!is.null(emrts$matchedMultipleIdentEMRTs)) {
    agreement[emrts$matchedMultipleIdentEMRTs] <- "multiple_ident_matches"
  }

  agreement
}

.appendFragmentMatchingColumn <- function(emrts, fm) {
  idx <- match(sort(unique(fm$precursor.leID.ident)),
               emrts$precursor.leID.ident)

  emrts[idx, "FragmentMatching"] <- MSnbase:::utils.list2ssv(
    split(fm[, "FragmentMatching"], fm$precursor.leID.ident))

  emrts
}

matched.quant.spectrumIDs2numeric <- function(x) {
  lapply(MSnbase:::utils.ssv2list(x), as.numeric)
}

diagnosticErrors <- function(x) {
  tp <- x[, "tp"]
  fp <- x[, "fp"]
  tn <- x[, "tn"]
  fn <- x[, "fn"]

  acc <- (tp+tn)/(tp+fp+tn+fn)
  pre <- tp/(tp+fp)
  rec <- tp/(tp+fn)
  f1 <- 2*pre*rec/(pre+rec)
  cbind(accuracy=acc, precision=pre, recall=rec, fdr=1-pre, f1=f1)
}

# concatenate isotope distribution information
# @param df data.frame (e.g. pep3d data)
# @param idcol id column (e.g. "spectrumID")
# @param zcol charge column name (e.g. "ion_z")
# @param isocol isotope column name (e.g. "ion_iso")
# @param intcol intensity column name (e.g. "ion_counts")
# @return a character vector of length == nrow(df)
# @noRd
.catIsotopeDistr <- function(df,
                             idcol = "spectrumID",
                             zcol = "ion_z",
                             isocol = "ion_iso",
                             intcol = "ion_counts") {
  ## cat each row
  r <- paste0(df[, zcol], "_", df[, isocol], ":",  df[, intcol])
  ## combine rows with equal ids
  ave(r, df[, idcol], FUN=function(x)paste0(x, collapse=";"))
}

# convert isotope distribution information to matrix
# @param x vector of isotopicDistr strings, generated by .catIsotopeDistr
# @return a matrix of isotopic counts; rownames: runs;
#   colnames: isotopic names
# @noRd
.isotopicDistr2matrix <- function(x) {
  if (is.data.frame(x)) {
    x <- unlist(x)
  }
  nm <- names(x)
  n <- length(x)

  x <- strsplit(x, ":|;")
  ne <- lengths(x)

  ## if input contains NA we remove it manually
  ne1 <- which(ne == 1L)
  x[ne1] <- NULL
  ne[ne1] <- 0L
  nall <- sum(ne)
  x <- unlist(x, use.names=FALSE)

  if (nall) {
    cn <- x[seq(from=1L, to=nall, by=2L)]
  } else {
    cn <- "1_0"
  }
  ucn <- sort.int(unique(cn))

  m <- matrix(NA_real_, nrow=n, ncol=length(ucn),
              dimnames=list(nm, ucn))

  rows <- rep(1:n, ne/2L)
  cols <- match(cn, ucn)
  i <- rows + (cols - 1L) * n

  if (nall) {
    m[i] <- as.double(x[seq(from=2L, to=nall, by=2L)])
  }
  m
}

.rescueEMRTs <- function(matchedEMRTs, mergedFeatures, quantPep3d,
                         method=c("rescue", "copy")) {
  method <- match.arg(method)

  if (method == "rescue") {
    ## these are those that were in the merged data set but that
    ## did NOT get transferred because they did NOT uniquely matched
    ## a pep3D EMRT
    i <- matchedEMRTs$matchedEMRTs != 1 &
         matchedEMRTs$precursor.leID.ident %in%
           mergedFeatures$precursor.leID.ident
  } else {
    i <- matchedEMRTs$precursor.leID.ident %in%
          mergedFeatures$precursor.leID.ident
  }
  quantIds <- mergedFeatures$precursor.leID.quant[
                              match(matchedEMRTs$precursor.leID.ident[i],
                                    mergedFeatures$precursor.leID.ident)]
  ## find corresponding columns (but exclude spectrumID and matchedEMRTs
  cols <- .commonColnames(matchedEMRTs, quantPep3d,
                          exclude=c("matchedEMRTs", "spectrumID"))
  matchedEMRTs[i, cols] <- quantPep3d[quantIds, cols]
  matchedEMRTs$idSource[i] <- method
  matchedEMRTs
}

#' find common columns
#' @param x data.frame
#' @param y data.frame
#' @param exclude character, exclude some of them
#' @return character, common column names
#' @noRd
.commonColnames <- function(x, y, exclude=character()) {
  setdiff(intersect(colnames(x), colnames(y)), exclude)
}

#' similar to duplicated but marks also the first occurence as duplicated
#' @param x vector
#' @return logical
#' @noRd
.duplicated2 <- function(x) {
  duplicated(x) | duplicated(x, fromLast=TRUE)
}

#' filter nonunique ident matches (e.g. two idents vs one quant; fragment
#' matching handles the other case: one ident vs multiple quants)
#' see https://github.com/lgatto/synapter/issues/111 for details
#' @param emrts MatchedEMRTs data.frame
#' @return filtered matched emrts data.frame with an additional column
#' `matchedMultipleIdentEMRTs`
#' @noRd
.filterNonUniqueIdentMatches <- function(emrts) {
  k1 <- which(emrts$matchedEMRTs == 1)
  dup <- which(.duplicated2(emrts$spectrumID[k1]) & !is.na(emrts$spectrumID[k1]))
  emrts$matchedEMRTs[k1[dup]] <- -2
  emrts$Counts[k1[dup]] <- NA
  emrts$matchedMultipleIdentEMRTs <- FALSE
  emrts$matchedMultipleIdentEMRTs[k1[dup]] <- TRUE
  emrts
}

#' .table is similar to table but because it is limited to numeric it is faster
#' (but less general)
#' @param x numeric vector
#' @return named vector with numbers of occurence
#' @noRd
.table <- function(x) {
  stopifnot(is.numeric(x))
  ux <- sort(unique(x))
  i <- match(x, ux)
  setNames(tabulate(i, nbins=length(ux)), ux)
}
lgatto/synapter documentation built on Sept. 28, 2022, 6:53 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
lgatto/synapter
Label-free data analysis pipeline for optimal identification and quantitation

R/utils.R
In lgatto/synapter: Label-free data analysis pipeline for optimal identification and quantitation

R Package Documentation

Browse R Packages

We want your feedback!

lgatto/synapter Label-free data analysis pipeline for optimal identification and quantitation

R/utils.R In lgatto/synapter: Label-free data analysis pipeline for optimal identification and quantitation

R Package Documentation

Browse R Packages

We want your feedback!

lgatto/synapter
Label-free data analysis pipeline for optimal identification and quantitation

R/utils.R
In lgatto/synapter: Label-free data analysis pipeline for optimal identification and quantitation