R/simulate_LFQ_data.R

Defines functions sim_lfq_data_2Factor_config sim_lfq_data_protein_config sim_lfq_data_peptide_config which_missing sim_lfq_data .generate_random_string

Documented in sim_lfq_data sim_lfq_data_2Factor_config sim_lfq_data_peptide_config sim_lfq_data_protein_config which_missing

.generate_random_string <- function(N, str_length) {
  random_string <- vector(mode = "character", length = N)
  digits <- 0:9
  for (i in seq_len(N)) {
    random_string[i] <- paste0(sample(digits, str_length, replace = TRUE), collapse = "")
  }
  return(random_string)
}


#' simulate protein level data with two groups
#' @export
#' @param Nprot number of porteins
#' @param N group size
#' @param fc D - down regulation N - matrix,  U -  regulation
#' @param prop proportion of down (D), up (U) and not regulated (N)
#' @examples
#'
#' res <- sim_lfq_data(Nprot = 10)
#' res <- sim_lfq_data(Nprot = 10, PEPTIDE = TRUE)
#'

sim_lfq_data <- function(
    Nprot = 20,
    N = 4,
    fc = list(A = c(D = -2,  U = 2, N = 0), B = c(D = 1, U = -4)),
    prop = list(A = c(D = 10, U = 10), B = c(D = 5, U = 20)),
    mean_prot = 20,
    sdlog = log(1.2),
    probability_of_success = 0.3,
    PEPTIDE = FALSE
) {


  proteins <- stringi::stri_rand_strings(Nprot, 6)
  idtype2 <- .generate_random_string(Nprot, 4)
  # simulate number of peptides per protein
  nrpeptides <- rgeom(n = Nprot, probability_of_success) + 1


  prot <- data.frame(
    proteinID = proteins,
    idtype2 = idtype2,
    nr_peptides = nrpeptides,
    average_prot_abundance = rlnorm(Nprot,log(20),sdlog = sdlog),
    mean_Ctrl = 0,
    N_Ctrl = N,
    sd = 1
  )

  for (i in seq_along(fc)) {
    name <- names(fc)[i]
    fcx <- fc[[i]]
    propx <- prop[[i]]
    if (!"N" %in% names(fcx)) {
      fcx["N"] <- 0
    }
    if (!"N" %in% names(propx)) {
      propx["N"] <- 100 - sum(propx)
    }

    FC = rep(fcx, ceiling(propx / 100 * Nprot)) |> head(n = Nprot)

    # add regulation to group A.
    groupMean <- paste0("mean_", name)
    groupSize <- paste0("N_", name)
    prot <- prot |> dplyr::mutate(!!groupMean := FC, !!groupSize := N)
  }

  if (PEPTIDE) {

    # add row for each protein
    peptide_df <- prot |> tidyr::uncount( nr_peptides )
    # create peptide ids
    peptide_df$peptideID <- stringi::stri_rand_strings(sum(prot$nr_peptides), 8)
  } else {
    peptide_df <- prot
  }


  # transform into long format
  peptide_df2 <- peptide_df |> tidyr::pivot_longer(cols = tidyselect::starts_with(c("mean", "N_")),
                                                   names_to = "group" , values_to = "mean")
  peptide_df2 <-  peptide_df2 |> tidyr::separate(group, c("what", "group"))
  peptide_df2 <- peptide_df2 |> tidyr::pivot_wider(names_from = "what", values_from = mean)

  sample_from_normal <- function(mean, sd, n) {
    rnorm(n, mean, sd)
  }
  nrpep <- nrow(peptide_df2)
  sampled_data <- matrix(nrow = nrpep, ncol = N)
  colnames(sampled_data) <- paste0("V", 1:ncol(sampled_data))

  peptide_df2$average_prot_abundance <- peptide_df2$average_prot_abundance - peptide_df2$mean

  if (PEPTIDE) {
    peptide_df2$avg_peptide_abd <-
      with(peptide_df2,
           rlnorm(nrow(peptide_df2),
                  meanlog = log(average_prot_abundance),
                  sdlog = sdlog))
    for (i in seq_len(nrpep)) {
      sampled_data[i,] <- sample_from_normal(peptide_df2$avg_peptide_abd[i], peptide_df2$sd[1], peptide_df2$N[i])
    }

  } else {
    for (i in seq_len(nrpep)) {
      sampled_data[i,] <- sample_from_normal(peptide_df2$average_prot_abundance[i], peptide_df2$sd[1], peptide_df2$N[i])
    }
  }

  x <- dplyr::bind_cols(peptide_df2,sampled_data)

  peptideAbudances <- x |>
    tidyr::pivot_longer(
      tidyselect::starts_with("V"),
      names_to = "Replicate",
      values_to = "abundance")
  peptideAbundances <- peptideAbudances |>
    tidyr::unite("sample", group, Replicate, remove =  FALSE)
  return(peptideAbundances)
}


#' add missing values to x vector based on the values of x
#' @export
#' @param x vector of intensities
#' @param weight_missing greater weight more missing
#'
which_missing <- function(x, weight_missing = 0.2){
  missing_prop <- pnorm(x, mean = mean(x), sd = sd(x))
  # sample TRUE or FALSE with propability in missing_prop
  samplemiss <- function(missing_prop) {
    mp <- c((1 - missing_prop)*weight_missing, missing_prop*3)
    mp <- mp / sum(mp)
    sample(c(TRUE, FALSE), size = 1, replace = TRUE, prob = mp)
  }

  missing_values <- sapply(missing_prop, samplemiss)
  # Introduce missing values into the vector x
  #x[missing_values] <- NA
  return(missing_values)
}


#' Simulate data, protein and peptide, with config
#' @param description Nprot number of proteins
#' @param with_missing add missing values, default TRUE
#' @param seed seed for reproducibility, if NULL no seed is set.
#' @export
#' @examples
#' undebug(sim_lfq_data_peptide_config)
#' x <- sim_lfq_data_peptide_config()
#' stopifnot("data.frame" %in% class(x$data))
#' stopifnot("AnalysisConfiguration" %in% class(x$config))
sim_lfq_data_peptide_config <- function(Nprot = 10, with_missing = TRUE, weight_missing = 0.2, seed = 1234){
  if (!is.null(seed)) {
    set.seed(seed)
  }
  data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = TRUE)
  if (with_missing) {
    not_missing <- !which_missing(data$abundance, weight_missing = weight_missing)
    data <- data[not_missing,]
  }
  data$isotopeLabel <- "light"
  data$qValue <- 0

  atable <- AnalysisTableAnnotation$new()
  atable$fileName = "sample"

  atable$factors["group_"] = "group"
  atable$hierarchy[["protein_Id"]] = c("proteinID", "idtype2")
  atable$hierarchy[["peptide_Id"]] = "peptideID"
  atable$set_response("abundance")

  config <- AnalysisConfiguration$new(atable)
  adata <- setup_analysis(data, config)
  return(list(data = adata, config = config))
}
#' Simulate data, protein, with config
#' @param description Nprot number of proteins
#' @param with_missing add missing values, default TRUE
#' @param seed seed for reproducibility, if NULL no seed is set.
#' @export
#' @examples
#' x <- sim_lfq_data_protein_config()
#' stopifnot("data.frame" %in% class(x$data))
#' stopifnot("AnalysisConfiguration" %in% class(x$config))
#'
sim_lfq_data_protein_config <- function(Nprot = 10, with_missing = TRUE, weight_missing = 0.2, seed = 1234){
  if (!is.null(seed)) {
    set.seed(seed)
  }
  data <- sim_lfq_data(Nprot = Nprot, PEPTIDE = FALSE)
  if (with_missing) {
    data <- data[!which_missing(data$abundance,weight_missing = weight_missing),]
  }
  data$isotopeLabel <- "light"
  data$qValue <- 0

  atable <- AnalysisTableAnnotation$new()
  atable$fileName = "sample"
  atable$nr_children = "nr_peptides"
  atable$factors["group_"] = "group"
  atable$hierarchy[["protein_Id"]] = c("proteinID", "idtype2")
  atable$set_response("abundance")

  config <- AnalysisConfiguration$new(atable)
  adata <- setup_analysis(data, config)
  return(list(data = adata, config = config))
}


#' Simulate data, protein, with config with 2 factros Treatment and Background
#' @param description Nprot number of proteins
#' @param with_missing add missing values, default TRUE
#' @param seed seed for reproducibility, if NULL no seed is set.
#' @export
#' @examples
#' x <- sim_lfq_data_2Factor_config(PEPTIDE= FALSE)
#' dim(x$data)
#' stopifnot("data.frame" %in% class(x$data))
#' stopifnot("AnalysisConfiguration" %in% class(x$config))
#' x <- sim_lfq_data_2Factor_config(PEPTIDE = TRUE)
#' dim(x$data)
sim_lfq_data_2Factor_config <- function(Nprot = 10,
                                                with_missing = TRUE,
                                                weight_missing = 0.2,
                                                PEPTIDE = FALSE,
                                                seed = 1234
                                                ){
  if (!is.null(seed)) {
    set.seed(seed)
  }
  res <- sim_lfq_data(Nprot = Nprot, PEPTIDE = PEPTIDE,
                      fc = list(A = c(D = -2,  U = 2, N = 0), B = c(D = 1, U = -4), C = c(D = -1, U = -4)),
                      prop = list(A = c(D = 10, U = 10), B = c(D = 5, U = 20), C = c(D = 15, U = 25)))
  res <- res |> mutate(Treatment = case_when(group %in% c("Ctrl", "A") ~ "A", TRUE ~ "B"))
  data <- res |> mutate(Background = case_when(group %in% c("Ctrl", "C") ~ "Z", TRUE ~ "X"))
  if (with_missing) {
    data <- data[!which_missing(data$abundance,weight_missing = weight_missing),]
  }
  data$isotopeLabel <- "light"
  data$qValue <- 0

  atable <- AnalysisTableAnnotation$new()
  atable$fileName = "sample"
  atable$nr_children = "nr_peptides"
  atable$factors["Treatment"] = "Treatment"
  atable$factors["Background"] = "Background"
  atable$factorDepth <- 2

  atable$hierarchy[["protein_Id"]] = c("proteinID", "idtype2")
  if (PEPTIDE) {
    atable$hierarchy[["peptpide_Id"]] = c("peptideID")
  }
  atable$set_response("abundance")

  config <- AnalysisConfiguration$new(atable)
  adata <- setup_analysis(data, config)
  return(list(data = adata, config = config))
}
wolski/prolfqua documentation built on May 12, 2024, 10:16 p.m.