# {VALIDATED}
#' Load beta data
#'
#' Using sesame and sesame, this function loads beta data using IDAT files
#' stored in a given directory
#'
#' @param IDAT_dir directory location of where given IDAT file is located
#'
#' @return beta matrix
#' @examples
#' betas <- read_IDAT('~/20191212_GEO_datasets/GSE110554')
#' ... some visualization
#' @export
read_IDAT <- function(IDAT_dir) {
library(sesame)
library(tidyverse)
pfx = searchIDATprefixes(IDAT_dir)
betas = openSesame(pfx)
return(betas)
}
#' Remove probes on sex chromosomes {VALIDATED}
#'
#' This function removes probes located on sex chromosomes because these
#' probes have a disproportionate global methylation status
#'
#' @param reference reference file
#' @param betas matrix (rows are probe names, columns are cell names)
#'
#' @return beta matrix without probes located on sex chromosomes.
#' @examples
#' betas <- remove_sex_chr(reference, betas)
#' ... some visualization
#' @export
remove_sex_chr <- function(reference, betas) {
sex_probes_remove = rownames(reference)[reference[,"seqnames"] == "chrX" | reference[,"seqnames"] == "chrY" | reference[,"seqnames"] == "*"]
betas = betas[,!(colnames(betas) %in% sex_probes_remove)]
return(betas)
}
#' Returns reference file {VALIDATED}
#'
#' This function returns a reference file given location. Any preprocessing
#' steps for the reference will also be added in this function.
#'
#' @param reference_file directory location of where given IDAT file is located
#'
#' @return beta matrix without probes located on sex chromosomes.
#' @examples
#' reference <- get_reference('~/references/hg38/annotation/R/
#' EPIC.hg38.manifest.rds')
#' ... some visualization
#' @export
get_reference <- function(reference_file) {
reference <- readRDS(reference_file)
probe_names <- names(reference)
reference <- data.frame(reference)
rownames(reference) <- probe_names
return(reference)
}
#' Remove specific cell types from betas matrix {VALIDATED}
#'
#' This function removes specified group names. Common names include MIX or
#' are typically heterogeneous mixtures of cells.
#'
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param remove_names list of cell names in betas
#'
#' @return beta matrix without specified cell names
#' @examples
#' betas <- remove_cell_types(betas, c('A.mix', 'B.mix', ...))
#' ... some visualization
#' @export
remove_cell_types <- function(betas, remove_names) {
betas <- betas[!(rownames(betas) %in% remove_names),]
return(betas)
}
#' Generate bimodal distribution {VALIDATED}
#'
#' This function generates a bimodal distribution on the betas matrix data
#' within cells
#'
#' @param betas matrix (rows are cell names, columns are probe names)
#'
#' @return matrix specifying mu0, mu1, sigma0, sigma1 across all cell types
#' @examples
#' bi <- generate_bimodal_distribution(betas)
#' ... some visualization
#' @export
generate_bimodal_distribution <- function(betas) {
library(BimodalIndex)
b_data <- betas[, apply(betas, 2, function(x) !any(is.na(x)))]
bi <- bimodalIndex(b_data, verbose=FALSE)
return(bi)
}
#' Categorize betas based on bimodal distribution {VALIDATED}
#'
#' This function uses a given bimodal distribution generated within cells
#' in order to categorize betas as either methylated (1), unmethylated (0), or
#' ambiguous (-1).
#'
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param bi matrix specifying mu0, mu1, sigma0, sigma1 across all cell types
#'
#' @return betas categorized based on bimodal distribution
#' @examples
#' betas <- fit_betas_to_distribution(betas, bi)
#' ... some visualization
#' @export
fit_betas_to_distribution <- function(betas, bi) {
for(i in 1:nrow(betas)) {
for(k in 1:ncol(betas)) {
if (is.na(betas[i,k])) {
} else if (betas[i,k] <= (bi[i,1] + 2*bi[i,3])) {
betas[i,k] = 0
} else if (betas[i,k] >= (bi[i,2] - 2*bi[i,3])) {
betas[i,k] = 1
} else {
betas[i,k] = -1
}
}
}
return(betas)
}
#' Create consensus vector of probes for each group {VALIDATED}
#'
#' This function creates consensus vector for each group using majority ruling:
#' across all probes, whichever category (methylated/unmethylated) is
#' represented in more than 2/3's of the sample will saved. Otherwise, -1 for
#' ambiguous is saved. A probabilistic model will supplement this in the future.
#'
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param group_names list of cell names in betas
#'
#' @return consensus vector for each cell in group names
#' @examples
#' consensus_vector <- create_consensus_vector(betas, c('NK', "Monocytes",
#' ...))
#' ... some visualization
#' @export
create_consensus_vector <- function(betas, group_names) {
consensus_vector <- c()
for (i in 1:length(group_names)) {
name = group_names[i]
select_group = betas[grepl(name, rownames(betas)), ]
n = nrow(select_group)
S = select_group[1,]
S[colSums(select_group == 1) > (n - n / 3)] = 1
S[colSums(select_group == 0) > (n - n / 3)] = 0
S[colSums(select_group == 0) <= (n - n / 3) & colSums(select_group == 1) <= (n - n / 3)] = -1
consensus_vector[[name]] = S
}
return(data.frame(consensus_vector))
}
#' Converts betas matrix to consensus vectors of probes for each group
#' {VALIDATED}
#'
#' This function creates a consensus vector for each group using the betas
#' matrix
#'
#' @param reference_file directory location of where given IDAT file is located
#' @param series_matrix_file directory location of where given series matrix
#' file is located
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param group_names list of cell names in betas
#' @param remove_names list of cell names in betas
#'
#' @return consensus vector data frame for each cell in group names
#' @examples
#' consensus_vector <- betas_to_consensus_vector('~/references/
#' EPIC.hg38.manifest.rds', 'GSE110554/samples_GEOLoadSeriesMatrix.rds',
#' betas, c('NK', "Monocytes", ...), c('A.mix', 'B.mix', ...))
#' ... some visualization
#' @export
betas_to_consensus_vector <- function(reference_file, series_matrix_file, betas, group_names, remove_names) {
betas <- remove_sex_chr(reference_file, betas)
save_data(betas, 'betas_raw')
series_matrix <- readRDS(series_matrix_file)
betas = t(betas)
rownames(betas) <- series_matrix$title
betas <- remove_cell_types(betas, remove_names)
save_data(betas, 'betas_selected_cells')
bi <- generate_bimodal_distribution(betas)
save_data(bi, 'bi')
betas <- fit_betas_to_distribution(betas, bi)
save_data(betas, 'betas_fit_to_distribution')
consensus_vector <- create_consensus_vector(betas, group_names)
names(consensus_vector) <- c('NK.cells', 'Monocytes', 'Tc.cells', 'Neutrophils','Th.cells', 'B.cells')
save_data(consensus_vector, 'consensus_vector')
consensus_vector <- remove_unapplicable_probes(consensus_vector)
return(consensus_vector)
}
#' Removes NA probes {VALIDATED}
#'
#' Selects and removes NA probes from consensus vectors
#'
#' @param consensus_vector consensus vector data frame for each cell in group
#' names
#'
#' @return consensus vector without NA probes
#' @examples
#' consensus_vector <- remove_NA_probes(consensus_vector)
#' ... some visualization
#' @export
remove_NA_probes <- function(consensus_vector) {
NA_remove <- rownames(consensus_vector)[is.na(rowSums(consensus_vector))]
print(sprintf("Number of NAs: %d", length(NA_remove)))
consensus_vector = consensus_vector[!(rownames(consensus_vector) %in% NA_remove), ]
return(consensus_vector)
}
#' Removes intermediate probes {VALIDATED}
#'
#' Selects and removes intermediate probes from consensus vectors
#'
#' @param consensus_vector consensus vector data frame for each cell in group
#' names
#'
#' @return consensus vector without intermediate probes
#' @examples
#' consensus_vector <- remove_intermediate_probes(consensus_vector)
#' ... some visualization
#' @export
remove_intermediate_probes <- function(consensus_vector) {
nprobes = nrow(consensus_vector)
intermediate_remove = unique(which(consensus_vector == -1) %% nprobes)
intermediate_remove[intermediate_remove == 0] = nprobes
intermediate_remove <- rownames(consensus_vector)[intermediate_remove]
print(sprintf("Number of intermediates: %d", length(intermediate_remove)))
consensus_vector <- consensus_vector[!(rownames(consensus_vector) %in% intermediate_remove), ]
return(consensus_vector)
}
#' Removes zero parsimony probes {VALIDATED}
#'
#' Selects and removes zero parsimony probes from consensus vectors
#'
#' @param consensus_vector consensus vector data frame for each cell in group
#' names
#'
#' @return consensus vector without zero parsimony probes
#' @examples
#' consensus_vector <- remove_zero_parsimony_probes(consensus_vector)
#' ... some visualization
#' @export
remove_zero_parsimony_probes <- function(consensus_vector) {
ngroups = ncol(consensus_vector)
zero_parsimony_remove = rownames(consensus_vector)[rowSums(consensus_vector) == ngroups | rowSums(consensus_vector) == 0]
print(sprintf("Number of zero parsimony: %d", length(zero_parsimony_remove)))
consensus_vector <- consensus_vector[!(rownames(consensus_vector) %in% zero_parsimony_remove), ]
return(consensus_vector)
}
#' Removes all inapplicable probes {VALIDATED}
#'
#' Selects and removes NA, intermediate, and zero parsimony probes from
#' consensus vectors
#'
#' @param consensus_vector consensus vector data frame for each cell in group
#' names
#'
#' @return consensus vector without inapplicable probes
#' @examples
#' consensus_vector <- remove_unapplicable_probes(consensus_vector)
#' ... some visualization
#' @export
remove_unapplicable_probes <- function(consensus_vector) {
consensus_vector <- remove_NA_probes(consensus_vector)
consensus_vector <- remove_intermediate_probes(consensus_vector)
consensus_vector <- remove_zero_parsimony_probes(consensus_vector)
return(consensus_vector)
}
#' Saves specified data with variable name to working directory {VALIDATED}
#'
#' Given data, a variable name, and a location for that data to be stored,
#' this function simply saves data.
#'
#' @param data data structure
#' @param variable_name name of variable/file
#' @param dir directory to which the data should be saved
#'
#' @return None
#' @examples
#' save_data(betas, 'betas', getwd())
#' ... some visualization
#' @export
save_data <- function(data, variable_name, dir) {
if(missing(dir)) {
dir = getwd()
}
data_file = paste(dir, "/", variable_name, '.rds', sep = "")
saveRDS(data, file = data_file)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.