R/merge-proteins.R

Defines functions merge_proteins sum_slides extract_ints

Documented in extract_ints merge_proteins sum_slides

# merge data from several proteins together into one combined protein

#' extract_ints
#'
#' Cut every slice into tiny subslices.
#'
#' @param ints Vector of intensities for every slice.
#' @param mass_fit_params Parameter estimates from the polynomial fit.
#' @param cut_size Number of subslices per slice (default is 100)
#' @examples
#' mass_fit_params <- c(2.90230588635708,-0.0990513683718742,0.00309367801794186,-4.05829976900451e-05)
#' ints <- runif(42,0,1)
#' extr_ints <- extract_ints(ints, mass_fit_params)
#' @export
extract_ints <- function(ints, mass_fit_params, cut_size = 100){

  fit_func <- function(x) { mass_fit_params[1] + mass_fit_params[2] * x + mass_fit_params[3] * x^2 + mass_fit_params[4] * x^3 }

  my_limits <- head(seq(from=0.5, to=length(ints)+0.5, by=1), -1)

  d <- as.numeric(ints)

  cutted_ints <- lapply(1:length(my_limits), function(i){
    l <- my_limits[i]
    my_cuts <- head(seq(from=l, to=l+1, by=1/(cut_size)), -1)
    #my.ints <- rep(d[i]/cut.size, cut.size)
    my_ints <- rep(d[i], cut_size)
    my_weights <- unlist(lapply(my_cuts, fit_func))
    data.frame(cuts=my_cuts, ints=my_ints, mol_mass=my_weights)
  })

  do.call("rbind", cutted_ints)
}


#' sum_slides
#'
#' Sum up slides which were generated by extract_ints
#'
#' @param extracted_data A list of data extracted by extract_ints
#' @export
sum_slides <- function(extracted_data){
  nr_samples <- length(extracted_data)

  # we take the first as reference
  ref <- extracted_data[[1]]

  # loop over the mol.mass from the reference and add the other ones to it
  ref_length <- nrow(ref)
  summed_ints <- lapply(1:ref_length, function(i){
    ref_mass <- ref$mol_mass[i]

    # define the borders of the current slice
    dist_left <- if(i > 1) ref$mol_mass[i-1] - ref_mass else ref_mass - ref$mol_mass[i+1]
    dist_right <- if(i < ref_length) ref_mass - ref$mol_mass[i+1] else dist_left #ref$mol_mass[i+1] - ref_mass
    limit_left <- ref_mass - dist_left/2
    limit_right <- ref_mass + dist_right/2

    res <- ref$ints[i]

    if(nr_samples > 1){
      # look for entries within this border in the other samples
      sample_ints <- lapply(2:nr_samples, function(idx){
        sample_data <- extracted_data[[idx]]
        sample_slices_ok <- which(sample_data$mol_mass >= limit_left & sample_data$mol_mass < limit_right)
        if(length(sample_slices_ok)) {
          sample_data$ints[sample_slices_ok]
        }else{
          0
        }
      })

      # give back the summed ints
      res <- sum(unlist(sample_ints)) + res
    }

    res
  })

  # construct the data.frame with the results
  data.frame(mol_mass=ref$mol_mass, ints=unlist(summed_ints)/nr_samples)
}


#' merge_proteins
#'
#' Cut every slice into tiny subslices.
#'
#' @param data Named list containing mass_fit_params and ints for every
#'   replicate.
#' @param cut_size Number of subslices per slice (default is 100)
#' @param loess_span The span parameter used by the loess curve fitting
#'   function.
#' @examples
#' data <- list()
#' data[[1]] <- list()
#' data[[1]][['mass_fit_params']] <- c(2.92541101792906,-0.0856835305957989,
#' 0.00229499350478187,-2.85888515674011e-05)
#' data[[1]][['ints']] <- c(1930800,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#' 0,0,0,0,0,0,0,0,0,0,0,3145700,4036800,6036000,6696500000,8679700000,
#' 63454000,0,0,0,0,0,0,0,0,0)
#' data[[2]] <- list()
#' data[[2]][['mass_fit_params']] <- c(2.93157752998003,-0.0911120501100777,
#' 0.00261614574546816,-3.38080473782906e-05)
#' data[[2]][['ints']] <- c(0,0,0,424060,0,0,26561000,0,0,0,8257400,0,0,0,0,
#' 0,0,0,0,4896700,3668500,0,0,0,0,52009000,0,0,0,0,0,0,5325700,2393800,2973300,
#' 4061200000,3175600000,35474000,25662000,18546000,0,0,7599400,0,0,5936300,0)
#' data[[3]] <- list()
#' data[[3]][['mass_fit_params']] <- c(2.86255232915833,-0.0860407095427795,
#' 0.00249869975354482,-3.31808222647113e-05)
#' data[[3]][['ints']] <- c(1053800,0,0,0,0,0,0,0,3159900,1061300,0,4672100,
#' 0,0,2268100,0,0,0,0,0,0,0,0,0,3289300,0,0,0,5302800,4653300,0,7113600,
#' 24130000,6285400,460980000,1.3405e+10,3747900000,17037000,15226000,0,0,
#' 17088000,3872700,0,14897000,8817200)
#' merged_protein <- merge_proteins(data, cut_size=100, loess_span=0.05)
#' @export
#12345678901234567890123456789012345678901234567890123456789012345678901234567890
merge_proteins <- function(data, cut_size = 100, loess_span=0.05){

  extracted_data <- lapply(data, function(d){
    extract_ints(d[['ints']], d[['mass_fit_params']], cut_size)
  })

  summed_ints <- sum_slides(extracted_data)

  # make a loess fit
  loess_fit <- stats::loess(ints ~ mol_mass, summed_ints, span=loess_span)
  j <- order(summed_ints$mol_mass)
  loess_fit_y <- loess_fit$fitted[j]
  loess_fit_y[loess_fit_y < 0] <- 0
  data.frame(x=summed_ints$mol_mass[j], y=loess_fit_y)
}
UNIL-PAF/pumbaR documentation built on June 9, 2022, 6:31 p.m.