R/assign_phenotypes.R

Defines functions assign_GWAX_phenotype assign_ltfh_phenotype

Documented in assign_GWAX_phenotype assign_ltfh_phenotype

#' @title Calculate posterior mean genetic liabilities
#'
#' @description Determines the configuration for each individuals based on the
#' family history and assigns a posterior mean genetic liability to each
#' individual.
#' 
#' @details
#' The posterior mean genetic liabilities (PMGL) are estimated using Gibbs
#' sampling. For further explanation on the process, as well as parameters used
#' in the algorithm, see \code{vignette("gibbs")}.\cr
#' Calculation of PMGL is based on which configuration an individual belongs
#' to. Configurations are determined by phenotype status in the family. Two
#' different configurations can be equivalent from a theoretical standpoint,
#' and will belong to the same configuration class, denoted by the `conf_class`
#' column. Equivalent configurations are assigned the same PMGL.\cr
#' \emph{Note:} The `sibs` argument can be used for limiting how many siblings
#' are taken into consideration when creating configurations. This allows for
#' different analyses with the same simulation.
#'
#' @param pheno_file path to file with information on liabilities, i.e.
#' the file "phenotypes.txt" generated by simulation functions.
#' @param output_file path of output file, including file extension ".txt".
#' @param alpha significance level used for deciding phenotype status based on
#' liabilities.
#' @param sibs \emph{optional:} number of siblings in the families to be used
#' for calculating posterior mean genetic liabilities. Defaults to using all
#' of an individual's siblings.
#'
#' @return Does not return anything, but writes an \code{output_file} to disk,
#' where configuration, configuration class and posterior mean genetic
#' liability is appended to the end of \code{pheno_file}.\cr
#' For a more detailed guide of how to read the configuration and configuration
#' class strings, see \code{vignette("liability-distribution")}.
#'
#' @import utils
#'
#' @export
assign_ltfh_phenotype <- function(pheno_file,
                                  output_file = pheno_file,
                                  alpha,
                                  sibs) {

  stopifnot("pheno_file needs to be a valid file" = file.exists(pheno_file),
            "output_file needs to be a valid file path ending with '.txt'" =
              (tools::file_ext(output_file) == "txt"),
            "alpha needs to be a number between 0 and 1" =
              (is.numeric(alpha) && length(alpha) == 1 
               && 0 < alpha && alpha < 1),
            "sibs needs to be a non-negative integer" =
              (missing(sibs) || (is.numeric(sibs) && length(sibs) == 1 
                                 && sibs == round(sibs) && 0 <= sibs)),
            "sibs needs to be at most the number of siblings in pheno_file" =
              (missing(sibs) || sibs <= n_sibs(load_phenotypes(pheno_file))))

  # import phenotypes.
  pheno <- load_phenotypes(pheno_file)
  
  # if sibs isn't specified we just assign the max number of siblings in data
  if (missing(sibs)) {
    sibs <- n_sibs(pheno)
  }

  crit <- qnorm(1 - alpha)

  # helper function for determining phenotype status based on liabilities
  onetwo <- function(x) {
    dplyr::if_else(x != -9, dplyr::if_else(x >= crit, "2", "1"), "")
  }

  # helper function for finding equivalent configurations
  create_conf_class <- function(LTFH_key) {
    key_split <- strsplit(LTFH_key, "")
    child <- key_split[[1]][1]
    parent_sum <- sum(as.numeric(key_split[[1]][2:3]))
    n_sibs <- nchar(LTFH_key) - 3
    sibs_sum <- sum(as.numeric(tail(key_split[[1]], -3)))
    conf_class <- paste(child, parent_sum, n_sibs, sibs_sum, sep = "x")
    return(conf_class)
  }

  # Liability for child, par1 and par2
  liability_columns <- c(5, 8, 11)

  # Liability for siblings
  if (sibs > 0) {
    liability_columns <- c(liability_columns, 11 + (1:sibs * 3))
  }


  # Create configurations (LTFH_key)
  pheno <- dplyr::bind_cols(pheno,
                            tidyr::unite(data = tibble::as_tibble(
                              apply(pheno[, liability_columns], 2, onetwo)),
                              col = LTFH_key, sep = "", remove = F)[, 1])


  # Table of configurations and which configuration class they belong to
  table_1 <- dplyr::distinct(pheno, LTFH_key) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(conf_class = create_conf_class(LTFH_key))


  # Table of configuration classes and their posterior mean genetic liability
  table_2 <- dplyr::distinct(table_1, conf_class, .keep_all = TRUE) %>%
    dplyr::mutate(LTFH_pheno = generate_pmgl(conf = LTFH_key,
                                     burn_in = 200,
                                     start_value = 3,
                                     alpha = alpha)) %>%
    dplyr::select(conf_class, LTFH_pheno)

  # Joining table_1 and table_2 by conf_class
  table_3 <- dplyr::left_join(table_1, table_2, by = "conf_class")

  # Joining such that new columns are appended to the tibble imported from
  # phenofile
  pheno <- dplyr::left_join(pheno, table_3, by = "LTFH_key")

  #write to file
  data.table::fwrite(x = pheno, file = output_file,
                     quote = F,
                     sep = " ",
                     col.names = T,
                     append = F)
}

#' @title Create phenotype status by proxy
#'
#' @description Assigns a new phenotype status to individuals by looking at
#' family members, for use in Genome Wide Association Studies by proxy (GWAX).
#'
#' @details
#' An individual, who is registered as a control, will be assigned case, if any
#' of its family members are cases. Otherwise the phenotype status is the
#' same.\cr
#' \emph{Note:} The `sibs` argument can be used for limiting how many siblings
#' are taken into consideration when creating the new phenotype This allows for
#' different analyses with the same simulation, see for instance
#' \code{vignette("geneference")}.
#'
#' @param pheno_file path to file with information on phenotype status, i.e.
#' the file "phenotypes.txt" generated by simulation functions.
#' @param output_file path of output file, including file extension ".txt".
#' @param sibs \emph{optional:} specify number of siblings to use for proxy.
#' Defaults to using all of an individual's siblings.
#'
#' @return Does not return anything, but writes \code{output_file} to disk,
#' where the GWAX phenotype column is appended to the end of \code{pheno_file}. 
#'
#' @export
assign_GWAX_phenotype <- function(pheno_file,
                                  output_file = pheno_file,
                                  sibs) {

  stopifnot("pheno_file needs to be a valid file" = file.exists(pheno_file),
            "output_file needs to be a valid file path ending with '.txt'" =
              (tools::file_ext(output_file) == "txt"),
            "sibs needs to be a non-negative integer" =
              (missing(sibs) || (is.numeric(sibs) && length(sibs) == 1 
                                 && sibs == round(sibs) && 0 <= sibs)),
            "sibs needs to be at most the number of siblings in pheno_file" =
              (missing(sibs) || sibs <= n_sibs(load_phenotypes(pheno_file))))

  # Read the file
  pheno <- load_phenotypes(pheno_file)

  # if sibs isn't specified we just assign the max number of siblings in data
  if (missing(sibs)) {
    sibs <- n_sibs(pheno)
  }
  
  # Columns with phenotypes for child, parent 1 and parent 2
  pheno_cols <- seq(3, 10, 3)

  # Columns with phenotypes for siblings
  if (sibs > 0) {
    pheno_cols <- c(pheno_cols, 9 + (1:sibs * 3))
  }

  # Set GWAX phenotype to maximum of the phenotypes in the family
  pheno[, "GWAX_pheno"] <- apply(pheno[, pheno_cols], 1, max, na.rm = TRUE)

  # Write to file
  data.table::fwrite(x = pheno, file = output_file,
                     quote = F,
                     sep = " ",
                     col.names = T,
                     append = F)
}
FireGutter/geneference documentation built on Dec. 17, 2021, 8:27 p.m.