#' Creating eQTL input hdf5 file templates.
#'
#' Generates an hdf5 file with the basic groups for eQTL analysis.
#'
#' @param file_name The name of the file to be created
#'
#' @return HDF5 file with groups phenotypes, genotypes, and covars with subgroups col_info and row_info will be created in the current directory.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export
create_h5_file <- function(file_name){
level1.groups <- c("phenotypes", "genotypes", "covars")
h5createFile(file_name)
for(l1 in 1:length(level1.groups)){
h5createGroup(file_name, level1.groups[l1])
h5createGroup(file_name, paste(level1.groups[l1], "col_info", sep = "/"))
h5createGroup(file_name, paste(level1.groups[l1], "row_info", sep = "/"))
}
h5createGroup(file_name, "K_mx")
H5close()
}
#' Adding data to a given hdf5 file
#'
#' Use in-memory objects to create phenotypes, genotypes and covariate datasets in a given hdf5 file
#'
#' @param file_name Name of the HDF5 file to add the information to
#' @param phenotypes Phenotype matrix to save
#' @param feature_ids Unique phenotype ids
#' @param sample_ids Unique sample ids
#'
#' @return Saves the given objects into the hdf5 generated by the create_eqtl_input_h5() function.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export
h5_add_data <- function(file_name = NULL,
input_matrix = NULL,
feature_ids = NULL,
sample_ids = NULL,
data_type = c("phenotypes","genotypes","covars")){
if (!is.null(file_name)) {
if (!is.null(input_matrix)) {
message(paste("Saving", data_type, "data to", file_name,"\n"))
n_samples <- nrow(input_matrix)
n_feature <- ncol(input_matrix)
if (is.null(feature_ids) & !is.null(colnames(input_matrix))){
message("Using colnames of input_matrix as pheno ids")
feature_ids <- colnames(input_matrix)
} else if (is.null(feature_ids) & is.null(colnames(input_matrix))) {
message("No feature ids detected, using generic feature ids")
feature_ids <- paste0("pheno_",1:ncol(input_matrix))
}
if (is.null(sample_ids) & !is.null(rownames(input_matrix))) {
message("Using rownames of input_matrix as sample ids")
sample_ids <- rownames(input_matrix)
} else if (is.null(sample_ids) &
is.null(colnames(input_matrix))) {
message("No sample ids not detected, using generic sample ids")
sample_ids <- paste0("sample_",1:nrow(input_matrix))
}
h5createDataset(
file_name,
paste0(data_type,"/matrix"),
c(n_samples, n_feature),
chunk = NULL, level = 0
)
h5write(input_matrix, file_name, paste0(data_type,"/matrix"))
message("Saving feature ids to col_info/id")
h5write(feature_ids, file_name, paste0(data_type,"/col_info/id"))
message("Saving sample ids to row_info/id")
h5write(sample_ids, file_name, paste0(data_type,"/row_info/id"))
} else {
message("No input_matrix given, please specify input data")
}
} else {
message("HDF5 file not specified, please specify a file to add information.")
}
}
#' Adding SNP information to an HDF5 file
#'
#' Use in-memory objects to add SNP information to a given HDF5 file
#'
#' @param file_name Name of the HDF5 file to add the information to
#' @param snp_info Dataframe with SNP information for all SNPs present in genotypes/matrix
#' @param snp_id Vector of genotype ids that match the feature ids used in saving the genotype matrix.
#' @param snp_chr Vector of genotype chromosomes in the same order as snp_id entries.
#' @param snp_pos Vector of genotype positions in the same order as snp_id entries.
#'
#' @return Saves the given objects into the hdf5 generated by the create_eqtl_input_h5() function.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export
h5_add_snp_info <- function(file_name = NULL,
snp_id = NULL,
snp_chr = NULL,
snp_pos = NULL){
if(is.null(snp_id)){
stop("SNP id not specified, please specify SNP ids that match genotype feature ids")
}
if(is.null(snp_chr)){
stop("SNP chromosomes not specified.")
}
if(is.null(snp_pos)){
stop("SNP position not specified.")
}
if( length(snp_id) == length(snp_chr) & length(snp_id) == length(snp_pos) ) {
saved_ids <- h5read(file_name, "genotypes/col_info/id")
matched_idx <- match(snp_id, saved_ids)
sorted_chr <- gsub("chr", "", as.character(snp_chr[matched_idx]))
sorted_pos <- snp_pos[matched_idx]
h5write(sorted_chr, file_name, "genotypes/col_info/geno_chr")
h5write(as.numeric(sorted_pos), file_name, "genotypes/col_info/geno_pos")
H5close()
} else {
stop("Input data size mismatch. Aborting function.")
}
}
#' Adding gene information to an HDF5 file
#'
#' Use in-memory objects to add gene information to a given HDF5 file
#'
#' @param file_name Name of the HDF5 file to add the information to
#' @param gene_info Dataframe with gene information for all genes present in genotypes/matrix
#' @param id_col Column name of gene_info that contains id information
#' @param chr_col Column name of gene_info that contains chromosome information
#' @param start_col Column name of gene_info that contains start positions
#' @param end_col Column name of gene_info that contains end positions
#' @param entrez_col Column name of gene_info that contains entrez ids
#' @param symbol_col Column name of gene_info that contains gene symbols
#' @return Saves the given objects into the hdf5 generated by the create_eqtl_input_h5() function.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export
h5_add_pheno_info <- function(file_name = NULL,
pheno_id = NULL,
pheno_chr = NULL,
pheno_start = NULL,
pheno_end = NULL,
pheno_entrez = NULL,
pheno_symbol = NULL){
if(is.null(pheno_id)){
stop("pheno_id not specified, please specify pheno ids that match phenotype feature ids")
}
if(is.null(pheno_chr)){
stop("Phenotype chromosomes not specified.")
}
if(is.null(pheno_start)){
stop("Phenotype start position not specified.")
}
if(is.null(pheno_end)){
stop("Phenotype end position not specified.")
}
if( length(pheno_id) == length(pheno_chr) &
length(pheno_id) == length(pheno_start) &
length(pheno_id) == length(pheno_end) ) {
saved_ids <- h5read(file_name, "phenotypes/col_info/id")
matched_idx <- match(pheno_id, saved_ids)
sorted_chr <- gsub("chr", "", as.character(pheno_chr[matched_idx]))
sorted_start <- pheno_start[matched_idx]
sorted_end <- pheno_end[matched_idx]
h5write(sorted_chr, file_name, "phenotypes/col_info/pheno_chr")
h5write(as.numeric(sorted_start), file_name, "phenotypes/col_info/pheno_start")
h5write(as.numeric(sorted_end), file_name, "phenotypes/col_info/pheno_end")
if(!is.null(pheno_symbol)){
sorted_symbol <- pheno_symbol[matched_idx]
h5write(as.character(sorted_symbol), file_name, "phenotypes/col_info/pheno_symbol")
}
if(!is.null(pheno_entrez)){
sorted_entrez <- pheno_entrez[matched_idx]
h5write(as.character(sorted_entrez), file_name, "phenotypes/col_info/entrez")
}
H5close()
} else {
stop("Input data size mismatch. Aborting function.")
}
}
#' Loading data from a given hdf5 file
#'
#' Load data from a given hdf5 file to an in-memory object
#'
#' @param file_name Name of the HDF5 file to read from
#' @param data_type Data type to create (phenotypes, genotypes, covars)
#'
#' @return Data matrix with row and columns labels.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export
load_h5_data <- function(input_file = NULL, data_type = c("phenotypes","genotypes", "covars")){
data_matrix <- h5read(input_file, paste0(data_type,"/matrix"))
rownames(data_matrix) <- h5read(input_file, paste0(data_type,"/row_info/id"))
colnames(data_matrix) <- h5read(input_file, paste0(data_type,"/col_info/id"))
return(data_matrix)
}
#' Custom ICA function for analyzing gene expression data.
#'
#' Performing ICA on a dataset and create a list object with results.
#' @param h5_file Name of HDF5 file that has the phenotype values saved.
#' @param input_pheno Phenotype matrix with diemnsions N x g
#' @param k.est Number of components to be estimated or method to estimate it.
#' @param scale.pheno Logical value specifying the scaling of row of the phenotype.mx.
#' @return List with the following entries.
#' @keywords keywords
#'
#' @import mclust
#' @export
ica_covar_check <- function(ica_list = NULL,
h5_file = NULL,
covars = NULL,
cor.threshold = 0.05){
if(is.null(covars) & !is.null(h5_file)){
message("Loading covariates from HDF5 file \n")
covars <- load_h5_data(h5_file, "covars")
}
A_mx <- ica_list$A
message("Checking dimensions and column/row names \n")
if(nrow(covars) == ncol(A_mx)){
message("[Good] Number of samples in <covars> and <ica_list> match \n")
} else {
stop("Error: Sample numbers in <covars> and <ica_list> don't match. Stopping script")
}
matching.names <- sum(rownames(covars) %in% colnames(A_mx))
if(matching.names == nrow(covars)){
message("[Good] All samples in <ica_list> are accounted for in <covars> \n")
} else {
stop("Missing sample Information: Check sample names in <covars> and <ica_list>")
}
message("- Checking associations between ICs and covariates \n")
# Anova analysis for covariates vs ICA weights (A matrix)
cov.pval.mx <- ic_covariate_association_test(A_mx,covars)
ica_list$covar_pvals <- cov.pval.mx
attr(ica_list, 'covar_cor') <- "yes"
return(ica_list)
}
#' @import lrgpr
#' @import formula.tools
#' @export
ica_genotype_association <- function(ica_list = NULL,
h5_file = NULL,
genotype.mx = NULL,
sig_threshold = 0.05,
n.cores = 1){
if(is.null(ica_list)){
stop("ICA input missing")
}
if(is.null(genotype.mx) & !is.null(h5_file)){
message("Loading genotypes from HDF5 file \n")
genotype.mx <- load_h5_data(h5_file, "genotypes")
} else if (is.null(genotype.mx) & is.null(h5_file)){
stop("Missing input for genotypes, please specify genotype object or h5 file")
}
ica.loadings <- t(ica_list$A)
ic.vs.geno <- lrgpr::glmApply(ica.loadings ~ SNP,
features = genotype.mx,
nthreads = n.cores)$pValues
colnames(ic.vs.geno) <- rownames(ica_list$A)
#sig <- which(ic.vs.geno < (0.05/length(ic.vs.geno) ), arr.ind = TRUE)
#genetic.factors <- colnames(ic.vs.geno)[unique(sig[,"col"])]
#non.genetic <- colnames(ic.vs.geno)[which(!(colnames(ic.vs.geno) %in% genetic.factors))]
ica_list$geno_pvals <- ic.vs.geno
ica_list$geno_cor_count <- apply(ic.vs.geno, 2, function(x) sum(na.omit(x < (sig_threshold / length(ic.vs.geno)))))
ica_list$non_genetic <- names(ica_list$geno_cor_count[which(ica_list$geno_cor_count < 1)])
attr(ica_list, 'geno_cor') <- "yes"
return(ica_list)
}
#' @export
get_gene_info <- function(ica_list, h5_file){
# gene_id <- h5read(h5_file, "phenotypes/col_info/id")
# gene_chr <- h5read(h5_file, "phenotypes/col_info/pheno_chr")
# gene_start <- h5read(h5_file, "phenotypes/col_info/pheno_start")
gene_info_df <- as.data.frame(h5read(h5_file, "phenotypes/col_info"), stringsAsFactors = FALSE)
gene_info_df$idx <- 1:nrow(gene_info_df)
ica_list$gene_info <- gene_info_df
return(ica_list)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.