#' @title Import phenotype data
#'
#' @description
#' Import all phenotype-related data generated in simulations.
#'
#' @details
#' Note that this function only loads data about the liabilities and phenotype
#' status of individuals and their family.
#'
#' @param pheno_file path of file with phenotypes, including file extension.
#' Simulation functions save this file as "phenotypes.txt".
#'
#' @return Returns a tibble with the data from \code{pheno_file}.
#'
#' @export
load_phenotypes <- function(pheno_file) {
stopifnot("pheno_file needs to a valid file with extension '.txt'" =
(file.exists(pheno_file) && file_ext(pheno_file) == "txt"))
pheno_ds <- tibble::tibble(data.table::fread(file = pheno_file,
header = TRUE))
return(pheno_ds)
}
#' @title Import and merge results
#'
#' @description
#' Merge results from analysis performed in PLINK with the true values used by
#' the simulation.
#'
#' @details
#' The function only takes a directory as input and then prompts the user to
#' decide which files in the directory should be loaded. Therefore, simulation
#' data and results from \code{analysis_association()} should reside in the
#' same directory in order to be merged.\cr
#' The names of the columns are created by adding the name of the file as
#' suffix. E.g., the P-values from an analysis named \code{GWAS.assoc} will be
#' in the column named "P_GWAS".\cr
#' True effect sizes are stored in the column "beta". True MAFs of SNPs are
#' stored in the column "MAFs".\cr
#' Note that this function is not supposed to load phenotype information, but
#' rather loads data relating directly to the SNPs. See
#' \code{load_phenotypes()} for reading phenotype related data.\cr
#'
#' @param path path to the folder containing the data, including "/" or "\\\\"
#' at the end. The function can not take relative paths, i.e. the user always
#' needs to specify the full path to the directory.
#'
#' @return A data.table containing the different results from the files.
#'
#' @export
load_results <- function(path) {
stopifnot("path needs to be a valid path ending with '/' or '\\\\'"
= (dir.exists(path)
&& (substr(path, nchar(path), nchar(path)) == "/" ||
substr(path, nchar(path), nchar(path)) == "\\")))
vars <- c("P", "BETA", "CHISQ", "SE")
re <- list.files(path)
len <- length(re)
pick <- numeric(len)
i <- 1
j <- 1
for(val in re){
if (any(c(grepl("\\.assoc", val, ignore.case = T),
grepl("\\.qassoc", val, ignore.case = T),
grepl("\\.txt", val, ignore.case = T))) &
!grepl("pheno", val, ignore.case = T)) {
pick[i] <- re[j]
i <- i + 1
}
j <- j + 1
}
if(i == 1){
stop("No analysis files from PLINK found at path.")
}
pick <- pick[1:(i - 1)]
while(TRUE) {
cat("Which files would you like to load?: \n")
nr <- 1
for(val in pick){
cat(nr, val)
cat("\n")
nr <- nr + 1
}
cat(nr, "all \n")
cat(nr + 1, "abort \n")
ind = readline('Enter a number or numbers seperated by " ": ')
vals <- c(strsplit(ind, " ")[[1]])
if(length(vals) == 1){
if(vals %in% 1:(nr + 1)){
break
}
else{
cat("Invalid input. \n\n")
}
}
else {
for(k in vals){
if(!(k %in% 1:(nr - 1))){
flag <- FALSE
break
} else {
flag <- TRUE
}
}
if(!flag) {
cat("Invalid input. \n\n")
}
else if(any(duplicated(vals) |
duplicated(vals, fromLast = TRUE))){
cat("The same number can not be repeated. \n")
}
else {
break
}
}
}
if(length(vals) == 1 & vals[1] != nr){
if(vals == nr + 1){
stop("This process has been aborted.")
}
if (grepl("\\.txt", pick[strtoi(vals)], ignore.case = T)) {
name <- sub("\\.txt.*", "", pick[strtoi(vals)])
data <- data.table::fread(paste0(path, pick[strtoi(vals)]),
col.names = c(name))
data <- data %>% tibble::rowid_to_column("SNP")
}
else {
data <- data.table::fread(paste0(path, pick[strtoi(vals)]))
}
}
else{
if(vals[1] == nr){
vals <- 1:(strtoi(vals) - 1)
}
else{
vals <- strtoi(vals)
}
choices <- numeric(length(vals))
for(i in seq_len(length(vals))) {
choices[i] <- pick[vals[i]]
}
assoc <- grepl("\\.assoc", choices, ignore.case = T)
qassoc <- grepl("\\.qassoc", choices, ignore.case = T)
txt <- grepl("\\.txt", choices, ignore.case = T)
assoc_val <- choices[assoc]
qassoc_val <- choices[qassoc]
txt_val <- choices[txt]
if(any(assoc)) {
for(i in seq_len(sum(assoc))) {
if(i == 1) {
tmp <- data.table::fread(paste0(path, assoc_val[i]))
name <- sub("\\.assoc.*", "", assoc_val[i])
data <- data.table::data.table("SNP" = tmp$SNP, "A1" = tmp$A1)
for(v in vars){
if(v %in% names(tmp)){
strin <- paste0(v, "_", name)
stup <- tmp %>% dplyr::select(dplyr::all_of(c("SNP", v)))
data.table::setnames(stup, v, strin)
data <- dplyr::inner_join(data, stup, by = "SNP")
}
}
}
else {
tmp <- data.table::fread(paste0(path, assoc_val[i]))
name <- sub("\\.assoc.*", "", assoc_val[i])
for(v in vars){
if(v %in% names(tmp)){
strin <- paste0(v, "_", name)
stup <- tmp %>% dplyr::select(dplyr::all_of(c("SNP", v)))
data.table::setnames(stup, v, strin)
data <- dplyr::inner_join(data, stup, by = "SNP")
}
}
}
}
}
else if(any(qassoc)) {
for(i in seq_len(sum(qassoc))) {
if(i == 1) {
tmp <- data.table::fread(paste0(path, qassoc_val[i]))
name <- sub("\\.qassoc.*", "", qassoc_val[i])
data <- data.table::data.table("SNP" = tmp$SNP)
for(v in vars){
if(v %in% names(tmp)){
strin <- paste0(v, "_", name)
stup <- tmp %>% dplyr::select(dplyr::all_of(c("SNP", v)))
data.table::setnames(stup, v, strin)
data <- dplyr::inner_join(data, stup, by = "SNP")
}
}
}
else {
tmp <- data.table::fread(paste0(path, qassoc_val[i]))
name <- sub("\\.qassoc.*", "", qassoc_val[i])
for(v in vars){
if(v %in% names(tmp)){
strin <- paste0(v, "_", name)
stup <- tmp %>% dplyr::select(dplyr::all_of(c("SNP", v)))
data.table::setnames(stup, v, strin)
data <- dplyr::inner_join(data, stup, by = "SNP")
}
}
}
}
}
else {
for(i in seq_len(sum(txt))) {
if(i == 1) {
name <- sub("\\.txt.*", "", txt_val[i])
tmp <- data.table::fread(paste0(path, txt_val[i]),
col.names = c(name))
data <- tmp %>% tibble::rowid_to_column("SNP")
}
else {
name <- sub("\\.txt.*", "", txt_val[i])
tmp <- data.table::fread(paste0(path, txt_val[i]),
col.names = c(name))
tmp <- tmp %>% tibble::rowid_to_column("SNP")
data <- dplyr::inner_join(data, tmp, by = "SNP")
}
}
}
if(all(c(any(assoc), any(qassoc), any(txt)))) {
for(i in seq_len(sum(qassoc))) {
tmp <- data.table::fread(paste0(path, qassoc_val[i]))
name <- sub("\\.qassoc.*", "", qassoc_val[i])
for(v in vars){
if(v %in% names(tmp)){
strin <- paste0(v, "_", name)
stup <- tmp %>% dplyr::select(dplyr::all_of(c("SNP", v)))
data.table::setnames(stup, v, strin)
data <- dplyr::inner_join(data, stup, by = "SNP")
}
}
}
for(i in seq_len(sum(txt))) {
name <- sub("\\.txt.*", "", txt_val[i])
tmp <- data.table::fread(paste0(path, txt_val[i]),
col.names = c(name))
tmp <- tmp %>% tibble::rowid_to_column("SNP")
data <- dplyr::inner_join(data, tmp, by = "SNP")
}
}
else if (all(c(any(assoc), any(txt)))) {
for(i in seq_len(sum(txt))) {
name <- sub("\\.txt.*", "", txt_val[i])
tmp <- data.table::fread(paste0(path, txt_val[i]),
col.names = c(name))
tmp <- tmp %>% tibble::rowid_to_column("SNP")
data <- dplyr::inner_join(data, tmp, by = "SNP")
}
}
else if (all(c(any(assoc), any(qassoc)))) {
for(i in seq_len(sum(qassoc))) {
tmp <- data.table::fread(paste0(path, qassoc_val[i]))
name <- sub("\\.qassoc.*", "", qassoc_val[i])
for(v in vars){
if(v %in% names(tmp)){
strin <- paste0(v, "_", name)
stup <- tmp %>% dplyr::select(dplyr::all_of(c("SNP", v)))
data.table::setnames(stup, v, strin)
data <- dplyr::inner_join(data, stup, by = "SNP")
}
}
}
}
else if(all(c(any(qassoc), any(txt)))) {
for(i in seq_len(sum(txt))) {
name <- sub("\\.txt.*", "", txt_val[i])
tmp <- data.table::fread(paste0(path, txt_val[i]),
col.names = c(name))
tmp <- tmp %>% tibble::rowid_to_column("SNP")
data <- dplyr::inner_join(data, tmp, by = "SNP")
}
}
}
return(data)
}
#' @title Augment results for plotting
#'
#' @description
#' Takes data from \code{load_results()} and appends
#' columns used for plotting results.
#'
#' @details
#' This function requires that this data has been loaded with
#' \code{load_results()}. \cr
#' It generates new columns for all columns that represent p-values generated by
#' \code{analysis_association()}. One of the columns, having "_significant" as
#' its suffix, denotes whether or not the p-value is significant at the specified
#' alpha level. The other column, having "_bonferroni" as suffix, denotes whether the p-value is significant
#' with Bonferroni-corrected alpha level.\cr
#' Furthermore, the user is prompted to decide if they want to create a column
#' named "causal", which denotes whether or not a SNP is truly causal for the
#' phenotype status of an individual. The user needs to have loaded "beta.txt"
#' with \code{load_results()} in order to create this column.\cr
#' Lastly, the user is prompted to decide if they want to create a column
#' named "LTFH_transformed", which is the estimated effect sizes of a
#' regression using the LT-FH phenotype transformed to the liability scale.\cr
#' Transforming the effect sizes requires that the user has loaded results from
#' \code{analysis_association()} with both \code{pheno_name = "line_pheno"} and
#' \code{pheno_name = "LTFH_pheno"}. These need to be laoded with \code{load_results()}
#' along with \code{MAFs.txt}.
#'
#' @param data data.table from \code{load_results()}.
#' @param alpha significance level used for hypothesis tests.
#'
#' @return A data.table containing the new appended results.
#'
#' @export
augment_results <- function(data, alpha) {
plotter <- function(){
j <- 1
for(name in names(data)){
cat(j, name)
cat("\n")
j <- j + 1
}
}
nr_of_names <- length(names(data))
while(TRUE){
cat("Do you want to create a causal column from the true effect sizes (by default named beta)")
ind = readline('Enter 0 for "NO" or 1 for "YES": ')
if(ind %in% c(0, 1)){
if(ind == 1){
while(TRUE) {
cat("Which column is the beta column?\n")
plotter()
cat(nr_of_names + 1, "That column does not exist\n")
val <- readline('Enter a number: ')
if(val %in% 1:nr_of_names){
val <- strtoi(val)
beta_vals <- dplyr::pull(data, names(data)[val])
data <- data %>% dplyr::mutate("causal" = beta_vals != 0)
break
}
else if(val == (nr_of_names + 1)){
cat("Since the beta column does not exist, the causal column can't be created\n")
ind <- 0
break
}
else {
cat("Invalid input.\n")
}
}
break
}
break
}
cat("Invalid input.\n")
}
m <- nrow(data)
for(val in names(data)) {
if (grepl("^P_", val, ignore.case = T)) {
strin1 <- paste0(val, "_significant")
strin2 <- paste0(val, "_bonferroni")
numbers <- data %>% dplyr::select(dplyr::all_of(c(val)))
data <- data %>% dplyr::mutate("strin1v1" = numbers < alpha,
"strin2v2" = numbers < alpha/m)
data.table::setnames(data, "strin1v1", strin1)
data.table::setnames(data, "strin2v2", strin2)
}
}
while (TRUE){
cat("Do you also want to make a transformed LTFH_beta column?")
ind <- readline('Enter 0 for "NO" or 1 for "YES": ')
if(ind %in% c(0, 1)){
if(ind == 1){
cat("Do you have standard names from the load_results() function?\n")
sec <- readline('Enter 0 for "NO" or 1 for "YES": ')
if(sec %in% c(0, 1)){
break
}
}
else {
break
}
}
else {
cat("Invalid input.\n\n")
}
}
nr_of_names <- length(names(data))
if(ind == 0) {
return(data)
}
else if (ind == 1 & sec == 0) {
i <- 1
choices <- numeric(6)
while(TRUE){
if(i == 1){
cat("Which column contain P-values form LTFH?\n")
plotter()
cat(nr_of_names + 1, "None of the columns contain P-values from LTFH.\n")
ind <- readline('Enter a number: ')
if(ind %in% 1:nr_of_names){
ind <- strtoi(ind)
choices[i] <- names(data)[ind]
i <- i + 1
LTFH_P <- dplyr::pull(data, names(data)[ind])
}
else if(ind == nr_of_names + 1){
ind <- 0
break
}
else {
cat("Invalid input.\n")
}
}
else if(i == 2){
cat("Which column contain P-values from file 'linear'?\n")
plotter()
cat(nr_of_names + 1, "None of the columns contain P-values from file 'linear'.\n")
ind <- readline('Enter a number: ')
if(ind %in% 1:nr_of_names){
ind <- strtoi(ind)
choices[i] <- names(data)[ind]
i <- i + 1
LINE_P <- dplyr::pull(data, names(data)[ind])
}
else if(ind == nr_of_names + 1){
ind <- 0
break
}
else {
cat("Invalid input.\n")
}
}
else if(i == 3){
cat("Which column contain beta-values form LTFH?\n")
plotter()
cat(nr_of_names + 1, "None of the columns contain beta-values from LTFH.\n")
ind <- readline('Enter a number: ')
if(ind %in% 1:nr_of_names){
ind <- strtoi(ind)
choices[i] <- names(data)[ind]
i <- i + 1
LTFH_BETA <- dplyr::pull(data, names(data)[ind])
}
else if(ind == nr_of_names + 1){
ind <- 0
break
}
else {
cat("Invalid input.\n")
}
}
else if(i == 4){
cat("Which column contain standard-errors (SE) values from LTFH?\n")
plotter()
cat(nr_of_names + 1, "None of the columns contain standard-errors from LTFH.\n")
ind <- readline('Enter a number": ')
if(ind %in% 1:nr_of_names){
ind <- strtoi(ind)
choices[i] <- names(data)[ind]
i <- i + 1
LTFH_SE <- dplyr::pull(data, names(data)[ind])
}
else if(ind == nr_of_names + 1){
ind <- 0
break
}
else {
cat("Invalid input.\n")
}
}
else if(i == 5){
cat("Which column contain the MAF values\n")
plotter()
cat(nr_of_names + 1, "None of the columns contain the MAF values.\n")
ind <- readline('Enter a number: ')
if(ind %in% 1:nr_of_names){
ind <- strtoi(ind)
choices[i] <- names(data)[ind]
i <- i + 1
MAFS <- dplyr::pull(data, names(data)[ind])
}
else if(ind == nr_of_names + 1){
ind <- 0
break
}
else {
cat("Invalid input.\n")
}
}
else if(i == 6) {
ind <- readline('Choose the prevalence parameter "k": ')
options(digits = 5)
k <- as.double(ind)
choices[i] <- ind
i <- i + 1
}
else if(i == 7) {
while(TRUE) {
cat("The following data has been entered:\n")
cat("1: P values from LTFH:", choices[1])
cat("\n")
cat("2: P values from file 'linear':", choices[2])
cat("\n")
cat("3: BETA values from LTFH:", choices[3])
cat("\n")
cat("4: SE from LTFH:", choices[4])
cat("\n")
cat("5: MAFs:", choices[5])
cat("\n")
cat("6: k (prevalence):", choices[6])
cat("\n")
ind = readline('Is this correct? Enter 7 if "YES". Enter any of the above numbers if no: ')
if(ind %in% 1:7) {
if(ind == "7"){
i <- 8
break
}
else{
i <- strtoi(ind)
break
}
}
else {
cat("Invalid input! \n")
}
}
}
else {
break
}
}
}
else {
stopifnot("data must have columns 'P_LTFH', 'P_linear', 'BETA_LTFH', 'SE_LTFH' and 'MAFs'" =
all(c('P_LTFH', 'P_linear', 'BETA_LTFH', 'SE_LTFH', 'MAFs') %in% colnames(data)))
ind <- readline('Choose the prevalence parameter "k": ')
options(digits = 5)
k <- as.double(ind)
sq <- sqrt(m * median(qchisq(data$P_LTFH, 1, lower.tail = T))/median(qchisq(data$P_linear, 1, lower.tail = T)))
obs <- (data$BETA_LTFH / (data$SE_LTFH * sq)) * sqrt(k * (1 - k) / (2 * data$MAFs * (1 - data$MAFs)))
data <- data %>% dplyr::mutate("LTFH_transformed" = obs)
return(data)
}
sq <- sqrt(m * median(qchisq(LTFH_P, 1, lower.tail = T))/median(qchisq(LINE_P, 1, lower.tail = T)))
obs <- (LTFH_BETA / (LTFH_SE * sq)) * sqrt(k * (1 - k) / (2 * MAFS * (1 - MAFS)))
data <- data %>% dplyr::mutate("LTFH_transformed" = obs)
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.