R/preprocessData.R

Defines functions save_data remove_unapplicable_probes remove_zero_parsimony_probes remove_intermediate_probes remove_NA_probes betas_to_consensus_vector create_consensus_vector fit_betas_to_distribution generate_bimodal_distribution remove_cell_types get_reference remove_sex_chr read_IDAT

Documented in betas_to_consensus_vector create_consensus_vector fit_betas_to_distribution generate_bimodal_distribution get_reference read_IDAT remove_cell_types remove_intermediate_probes remove_NA_probes remove_sex_chr remove_unapplicable_probes remove_zero_parsimony_probes save_data

# {VALIDATED}

#' Load beta data
#'
#' Using sesame and sesame, this function loads beta data using IDAT files 
#' stored in a given directory
#' 
#' @param IDAT_dir directory location of where given IDAT file is located
#'
#' @return beta matrix
#' @examples
#' betas <- read_IDAT('~/20191212_GEO_datasets/GSE110554')
#' ... some visualization 
#' @export
read_IDAT <- function(IDAT_dir) {
	library(sesame)
	library(tidyverse)
	
	pfx = searchIDATprefixes(IDAT_dir)
	betas = openSesame(pfx)

	return(betas)
}

#' Remove probes on sex chromosomes {VALIDATED}
#'
#' This function removes probes located on sex chromosomes because these 
#' probes have a disproportionate global methylation status
#' 
#' @param reference reference file
#' @param betas matrix (rows are probe names, columns are cell names)
#'
#' @return beta matrix without probes located on sex chromosomes.
#' @examples
#' betas <- remove_sex_chr(reference, betas)
#' ... some visualization 
#' @export
remove_sex_chr <- function(reference, betas) {
	sex_probes_remove = rownames(reference)[reference[,"seqnames"] == "chrX" | reference[,"seqnames"] == "chrY" | reference[,"seqnames"] == "*"]

	betas = betas[,!(colnames(betas) %in% sex_probes_remove)]
	return(betas)
}

#' Returns reference file {VALIDATED}
#'
#' This function returns a reference file given location. Any preprocessing 
#' steps for the reference will also be added in this function.
#' 
#' @param reference_file directory location of where given IDAT file is located
#'
#' @return beta matrix without probes located on sex chromosomes.
#' @examples
#' reference <- get_reference('~/references/hg38/annotation/R/
#' EPIC.hg38.manifest.rds')
#' ... some visualization 
#' @export
get_reference <- function(reference_file) {
	reference <- readRDS(reference_file)
	probe_names <- names(reference)
	reference <- data.frame(reference)
	rownames(reference) <- probe_names

	return(reference)
}

#' Remove specific cell types from betas matrix {VALIDATED}
#'
#' This function removes specified group names. Common names include MIX or 
#' are typically heterogeneous mixtures of cells.
#' 
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param remove_names list of cell names in betas
#'
#' @return beta matrix without specified cell names
#' @examples
#' betas <- remove_cell_types(betas, c('A.mix', 'B.mix', ...))
#' ... some visualization 
#' @export
remove_cell_types <- function(betas, remove_names) {
	betas <- betas[!(rownames(betas) %in% remove_names),]
	return(betas)
}

#' Generate bimodal distribution {VALIDATED}
#'
#' This function generates a bimodal distribution on the betas matrix data 
#' within cells
#' 
#' @param betas matrix (rows are cell names, columns are probe names)
#'
#' @return matrix specifying mu0, mu1, sigma0, sigma1 across all cell types
#' @examples
#' bi <- generate_bimodal_distribution(betas)
#' ... some visualization 
#' @export
generate_bimodal_distribution <- function(betas) {
	library(BimodalIndex)

	b_data <- betas[, apply(betas, 2, function(x) !any(is.na(x)))]
	bi <- bimodalIndex(b_data, verbose=FALSE)
	return(bi)
}

#' Categorize betas based on bimodal distribution {VALIDATED}
#'
#' This function uses a given bimodal distribution generated within cells
#' in order to categorize betas as either methylated (1), unmethylated (0), or 
#' ambiguous (-1).
#' 
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param bi matrix specifying mu0, mu1, sigma0, sigma1 across all cell types
#'
#' @return betas categorized based on bimodal distribution
#' @examples
#' betas <- fit_betas_to_distribution(betas, bi)
#' ... some visualization 
#' @export
fit_betas_to_distribution <- function(betas, bi) {
	for(i in 1:nrow(betas)) {
		for(k in 1:ncol(betas)) {
	    	if (is.na(betas[i,k])) {              
	   		} else if (betas[i,k] <= (bi[i,1] + 2*bi[i,3])) { 
	    		betas[i,k] = 0
	   		} else if (betas[i,k] >= (bi[i,2] - 2*bi[i,3])) { 
	   			betas[i,k] = 1
	   		} else {
	   			betas[i,k] = -1                   
	  		}
	  	}
	}
	return(betas)
}

#' Create consensus vector of probes for each group {VALIDATED}
#'
#' This function creates consensus vector for each group using majority ruling:
#' across all probes, whichever category (methylated/unmethylated) is 
#' represented in more than 2/3's of the sample will saved. Otherwise, -1 for 
#' ambiguous is saved. A probabilistic model will supplement this in the future.
#' 
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param group_names list of cell names in betas
#'
#' @return consensus vector for each cell in group names
#' @examples
#' consensus_vector <- create_consensus_vector(betas, c('NK', "Monocytes", 
#' ...))
#' ... some visualization 
#' @export
create_consensus_vector <- function(betas, group_names) {

	consensus_vector <- c()

	for (i in 1:length(group_names)) {
		name = group_names[i]

		select_group = betas[grepl(name, rownames(betas)), ]

		n = nrow(select_group)

		S = select_group[1,]
		S[colSums(select_group == 1) > (n - n / 3)] = 1
		S[colSums(select_group == 0) > (n - n / 3)] = 0
		S[colSums(select_group == 0) <= (n - n / 3) & colSums(select_group == 1) <= (n - n / 3)] = -1

		consensus_vector[[name]] = S
	} 

	return(data.frame(consensus_vector))
}

#' Converts betas matrix to consensus vectors of probes for each group 
#' {VALIDATED}
#'
#' This function creates a consensus vector for each group using the betas 
#' matrix
#' 
#' @param reference_file directory location of where given IDAT file is located
#' @param series_matrix_file directory location of where given series matrix 
#' file is located
#' @param betas matrix (rows are cell names, columns are probe names)
#' @param group_names list of cell names in betas
#' @param remove_names list of cell names in betas
#'
#' @return consensus vector data frame for each cell in group names
#' @examples
#' consensus_vector <- betas_to_consensus_vector('~/references/
#' EPIC.hg38.manifest.rds', 'GSE110554/samples_GEOLoadSeriesMatrix.rds', 
#' betas, c('NK', "Monocytes", ...), c('A.mix', 'B.mix', ...))
#' ... some visualization 
#' @export
betas_to_consensus_vector <- function(reference_file, series_matrix_file, betas, group_names, remove_names) {

	betas <- remove_sex_chr(reference_file, betas)
	save_data(betas, 'betas_raw')

	series_matrix <- readRDS(series_matrix_file)

	betas = t(betas)
	rownames(betas) <- series_matrix$title

	betas <- remove_cell_types(betas, remove_names)
	save_data(betas, 'betas_selected_cells')

	bi <- generate_bimodal_distribution(betas)
	save_data(bi, 'bi')

	betas <- fit_betas_to_distribution(betas, bi)
	save_data(betas, 'betas_fit_to_distribution')

	consensus_vector <- create_consensus_vector(betas, group_names)
	names(consensus_vector) <- c('NK.cells', 'Monocytes', 'Tc.cells', 'Neutrophils','Th.cells', 'B.cells')
	save_data(consensus_vector, 'consensus_vector')

	consensus_vector <- remove_unapplicable_probes(consensus_vector)

	return(consensus_vector)
}

#' Removes NA probes {VALIDATED}
#'
#' Selects and removes NA probes from consensus vectors
#' 
#' @param consensus_vector consensus vector data frame for each cell in group 
#' names
#'
#' @return consensus vector without NA probes
#' @examples
#' consensus_vector <- remove_NA_probes(consensus_vector)
#' ... some visualization 
#' @export
remove_NA_probes <- function(consensus_vector) {
	NA_remove <- rownames(consensus_vector)[is.na(rowSums(consensus_vector))]
	print(sprintf("Number of NAs: %d", length(NA_remove)))
	
	consensus_vector = consensus_vector[!(rownames(consensus_vector) %in% NA_remove), ]
	return(consensus_vector)
}

#' Removes intermediate probes {VALIDATED}
#'
#' Selects and removes intermediate probes from consensus vectors
#' 
#' @param consensus_vector consensus vector data frame for each cell in group 
#' names
#'
#' @return consensus vector without intermediate probes
#' @examples
#' consensus_vector <- remove_intermediate_probes(consensus_vector)
#' ... some visualization 
#' @export
remove_intermediate_probes <- function(consensus_vector) {
	nprobes = nrow(consensus_vector)

	intermediate_remove = unique(which(consensus_vector == -1) %% nprobes)
	intermediate_remove[intermediate_remove == 0] = nprobes	
	intermediate_remove <- rownames(consensus_vector)[intermediate_remove]
	print(sprintf("Number of intermediates: %d", length(intermediate_remove)))

	consensus_vector <- consensus_vector[!(rownames(consensus_vector) %in% intermediate_remove), ]

	return(consensus_vector)
}

#' Removes zero parsimony probes {VALIDATED}
#'
#' Selects and removes zero parsimony probes from consensus vectors
#' 
#' @param consensus_vector consensus vector data frame for each cell in group 
#' names
#'
#' @return consensus vector without zero parsimony probes
#' @examples
#' consensus_vector <- remove_zero_parsimony_probes(consensus_vector)
#' ... some visualization 
#' @export
remove_zero_parsimony_probes <- function(consensus_vector) {
	ngroups = ncol(consensus_vector)

	zero_parsimony_remove = rownames(consensus_vector)[rowSums(consensus_vector) == ngroups | rowSums(consensus_vector) == 0]
	print(sprintf("Number of zero parsimony: %d", length(zero_parsimony_remove)))

	consensus_vector <- consensus_vector[!(rownames(consensus_vector) %in% zero_parsimony_remove), ]
	return(consensus_vector)
}

#' Removes all inapplicable probes {VALIDATED}
#'
#' Selects and removes NA, intermediate, and zero parsimony probes from 
#' consensus vectors
#' 
#' @param consensus_vector consensus vector data frame for each cell in group 
#' names
#'
#' @return consensus vector without inapplicable probes
#' @examples
#' consensus_vector <- remove_unapplicable_probes(consensus_vector)
#' ... some visualization 
#' @export
remove_unapplicable_probes <- function(consensus_vector) {
	consensus_vector <- remove_NA_probes(consensus_vector)
	consensus_vector <- remove_intermediate_probes(consensus_vector)
	consensus_vector <- remove_zero_parsimony_probes(consensus_vector)

	return(consensus_vector)
}

#' Saves specified data with variable name to working directory {VALIDATED}
#'
#' Given data, a variable name, and a location for that data to be stored, 
#' this function simply saves data.
#' 
#' @param data data structure
#' @param variable_name name of variable/file
#' @param dir directory to which the data should be saved
#'
#' @return None
#' @examples
#' save_data(betas, 'betas', getwd())
#' ... some visualization 
#' @export
save_data <- function(data, variable_name, dir) {
	if(missing(dir)) {
		dir = getwd()
	}

	data_file = paste(dir, "/", variable_name, '.rds', sep = "")
	saveRDS(data, file = data_file)
}
ethanmoyer/MethylConstruct documentation built on July 10, 2020, 12:28 a.m.