R/h5_functions.R

#' Creating eQTL input hdf5 file templates.
#'
#' Generates an hdf5 file with the basic groups for eQTL analysis.
#'
#' @param file_name The name of the file to be created
#'
#' @return HDF5 file with groups phenotypes, genotypes, and covars with subgroups col_info and row_info will be created in the current directory.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export
create_h5_file <- function(file_name){
  level1.groups <- c("phenotypes", "genotypes", "covars")

  h5createFile(file_name)

  for(l1 in 1:length(level1.groups)){
    h5createGroup(file_name, level1.groups[l1])
    h5createGroup(file_name, paste(level1.groups[l1], "col_info", sep = "/"))
    h5createGroup(file_name, paste(level1.groups[l1], "row_info", sep = "/"))
  }

  h5createGroup(file_name, "K_mx")
  H5close()
}


#' Adding data to a given hdf5 file
#'
#' Use in-memory objects to create phenotypes, genotypes and covariate datasets in a given hdf5 file
#'
#' @param file_name Name of the HDF5 file to add the information to
#' @param phenotypes Phenotype matrix to save
#' @param feature_ids Unique phenotype ids
#' @param sample_ids Unique sample ids
#'
#' @return Saves the given objects into the hdf5 generated by the create_eqtl_input_h5() function.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export

h5_add_data <- function(file_name = NULL,
                         input_matrix = NULL,
                         feature_ids = NULL,
                         sample_ids = NULL,
                         data_type = c("phenotypes","genotypes","covars")){
  if (!is.null(file_name)) {
    if (!is.null(input_matrix)) {
      message(paste("Saving", data_type, "data to", file_name,"\n"))

      n_samples <- nrow(input_matrix)
      n_feature <- ncol(input_matrix)


      if (is.null(feature_ids) & !is.null(colnames(input_matrix))){
        message("Using colnames of input_matrix as pheno ids")
        feature_ids <- colnames(input_matrix)
      } else if (is.null(feature_ids) & is.null(colnames(input_matrix))) {
        message("No feature ids detected, using generic feature ids")
        feature_ids <- paste0("pheno_",1:ncol(input_matrix))
      }


      if (is.null(sample_ids) & !is.null(rownames(input_matrix))) {
        message("Using rownames of input_matrix as sample ids")
        sample_ids <- rownames(input_matrix)
      } else if (is.null(sample_ids) &
                 is.null(colnames(input_matrix))) {
                   message("No sample ids not detected, using generic sample ids")
                   sample_ids <- paste0("sample_",1:nrow(input_matrix))
                 }

                 h5createDataset(
                   file_name,
                   paste0(data_type,"/matrix"),
                   c(n_samples, n_feature),
                   chunk = NULL, level = 0
                 )
                 h5write(input_matrix, file_name, paste0(data_type,"/matrix"))

                 message("Saving feature ids to col_info/id")
                 h5write(feature_ids, file_name, paste0(data_type,"/col_info/id"))

                 message("Saving sample ids to row_info/id")
                 h5write(sample_ids, file_name, paste0(data_type,"/row_info/id"))


    } else {
      message("No input_matrix given, please specify input data")
    }

  } else {
    message("HDF5 file not specified, please specify a file to add information.")
  }
}


#' Adding SNP information to an HDF5 file
#'
#' Use in-memory objects to add SNP information to a given HDF5 file
#'
#' @param file_name Name of the HDF5 file to add the information to
#' @param snp_info Dataframe with SNP information for all SNPs present in genotypes/matrix
#' @param snp_id Vector of genotype ids that match the feature ids used in saving the genotype matrix.
#' @param snp_chr Vector of genotype chromosomes in the same order as snp_id entries.
#' @param snp_pos Vector of genotype positions in the same order as snp_id entries.
#'
#' @return Saves the given objects into the hdf5 generated by the create_eqtl_input_h5() function.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export

h5_add_snp_info <- function(file_name = NULL,
                            snp_id = NULL,
                            snp_chr = NULL,
                            snp_pos = NULL){

  if(is.null(snp_id)){
      stop("SNP id not specified, please specify SNP ids that match genotype feature ids")
  }
  if(is.null(snp_chr)){
      stop("SNP chromosomes not specified.")
  }
  if(is.null(snp_pos)){
      stop("SNP position not specified.")
  }

  if( length(snp_id) == length(snp_chr) & length(snp_id) == length(snp_pos) ) {
      saved_ids <- h5read(file_name, "genotypes/col_info/id")
      matched_idx <- match(snp_id, saved_ids)
      sorted_chr <- gsub("chr", "", as.character(snp_chr[matched_idx]))
      sorted_pos <- snp_pos[matched_idx]

      h5write(sorted_chr, file_name, "genotypes/col_info/geno_chr")
      h5write(as.numeric(sorted_pos), file_name, "genotypes/col_info/geno_pos")

      H5close()

  } else {
      stop("Input data size mismatch. Aborting function.")
  }

}

#' Adding gene information to an HDF5 file
#'
#' Use in-memory objects to add gene information to a given HDF5 file
#'
#' @param file_name Name of the HDF5 file to add the information to
#' @param gene_info Dataframe with gene information for all genes present in genotypes/matrix
#' @param id_col Column name of gene_info that contains id information
#' @param chr_col Column name of gene_info that contains chromosome information
#' @param start_col Column name of gene_info that contains start positions
#' @param end_col Column name of gene_info that contains end positions
#' @param entrez_col Column name of gene_info that contains entrez ids
#' @param symbol_col Column name of gene_info that contains gene symbols
#' @return Saves the given objects into the hdf5 generated by the create_eqtl_input_h5() function.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export
h5_add_pheno_info <- function(file_name = NULL,
                             pheno_id = NULL,
                             pheno_chr = NULL,
                             pheno_start = NULL,
                             pheno_end = NULL,
                             pheno_entrez = NULL,
                             pheno_symbol = NULL){

  if(is.null(pheno_id)){
    stop("pheno_id not specified, please specify pheno ids that match phenotype feature ids")
  }
  if(is.null(pheno_chr)){
    stop("Phenotype chromosomes not specified.")
  }
  if(is.null(pheno_start)){
    stop("Phenotype start position not specified.")
  }
  if(is.null(pheno_end)){
    stop("Phenotype end position not specified.")
  }

  if( length(pheno_id) == length(pheno_chr) &
      length(pheno_id) == length(pheno_start) &
      length(pheno_id) == length(pheno_end) ) {
    saved_ids <- h5read(file_name, "phenotypes/col_info/id")
    matched_idx <- match(pheno_id, saved_ids)
    sorted_chr <- gsub("chr", "", as.character(pheno_chr[matched_idx]))
    sorted_start <- pheno_start[matched_idx]
    sorted_end <- pheno_end[matched_idx]

    h5write(sorted_chr, file_name, "phenotypes/col_info/pheno_chr")
    h5write(as.numeric(sorted_start), file_name, "phenotypes/col_info/pheno_start")
    h5write(as.numeric(sorted_end), file_name, "phenotypes/col_info/pheno_end")

    if(!is.null(pheno_symbol)){
      sorted_symbol <- pheno_symbol[matched_idx]
      h5write(as.character(sorted_symbol), file_name, "phenotypes/col_info/pheno_symbol")
    }
    if(!is.null(pheno_entrez)){
      sorted_entrez <- pheno_entrez[matched_idx]
      h5write(as.character(sorted_entrez), file_name, "phenotypes/col_info/entrez")
    }
    H5close()

  } else {
    stop("Input data size mismatch. Aborting function.")
  }

}

#' Loading data from a given hdf5 file
#'
#' Load data from a given hdf5 file to an in-memory object
#'
#' @param file_name Name of the HDF5 file to read from
#' @param data_type Data type to create (phenotypes, genotypes, covars)
#'
#' @return Data matrix with row and columns labels.
#' @keywords eQTL, HDF5
#'
#' @import rhdf5
#' @export

load_h5_data <- function(input_file = NULL, data_type = c("phenotypes","genotypes", "covars")){
    data_matrix <- h5read(input_file, paste0(data_type,"/matrix"))
    rownames(data_matrix) <- h5read(input_file, paste0(data_type,"/row_info/id"))
    colnames(data_matrix) <- h5read(input_file, paste0(data_type,"/col_info/id"))
    return(data_matrix)


}


#' Custom ICA function for analyzing gene expression data.
#'
#' Performing ICA on a dataset and create a list object with results.
#' @param h5_file Name of HDF5 file that has the phenotype values saved.
#' @param input_pheno Phenotype matrix with diemnsions N x g
#' @param k.est Number of components to be estimated or method to estimate it.
#' @param scale.pheno Logical value specifying the scaling of row of the phenotype.mx.
#' @return List with the following entries.
#' @keywords keywords
#'
#' @import mclust
#' @export
ica_covar_check <- function(ica_list = NULL,
                            h5_file = NULL,
                            covars = NULL, 
                            cor.threshold = 0.05){

    if(is.null(covars) & !is.null(h5_file)){
      message("Loading covariates from HDF5 file \n")
      covars <- load_h5_data(h5_file, "covars")
    }

    A_mx <- ica_list$A

    message("Checking dimensions and column/row names \n")

    if(nrow(covars) == ncol(A_mx)){
      message("[Good] Number of samples in <covars> and <ica_list> match \n")
    } else {
      stop("Error: Sample numbers in <covars> and <ica_list> don't match. Stopping script")
    }

    matching.names <- sum(rownames(covars) %in% colnames(A_mx))

    if(matching.names == nrow(covars)){
      message("[Good] All samples in <ica_list> are accounted for in <covars> \n")
    } else {
      stop("Missing sample Information: Check sample names in <covars> and <ica_list>")
    }

    message("- Checking associations between ICs and covariates \n")
    # Anova analysis for covariates vs ICA weights (A matrix)
    cov.pval.mx <- ic_covariate_association_test(A_mx,covars)
    ica_list$covar_pvals <- cov.pval.mx
    attr(ica_list, 'covar_cor') <- "yes"

    return(ica_list)
}

#' @import lrgpr
#' @import formula.tools
#' @export
ica_genotype_association <- function(ica_list = NULL,
                                     h5_file = NULL,
                                     genotype.mx = NULL,
                                     sig_threshold = 0.05,
                                     n.cores = 1){
  if(is.null(ica_list)){
      stop("ICA input missing")
  }

  if(is.null(genotype.mx) & !is.null(h5_file)){
    message("Loading genotypes from HDF5 file \n")
    genotype.mx <- load_h5_data(h5_file, "genotypes")
  } else if (is.null(genotype.mx) & is.null(h5_file)){
    stop("Missing input for genotypes, please specify genotype object or h5 file")
  }

  ica.loadings <- t(ica_list$A)

  ic.vs.geno <- lrgpr::glmApply(ica.loadings ~ SNP,
                         features = genotype.mx,
                         nthreads = n.cores)$pValues

  colnames(ic.vs.geno) <- rownames(ica_list$A)
  #sig <- which(ic.vs.geno < (0.05/length(ic.vs.geno) ), arr.ind = TRUE)

  #genetic.factors <- colnames(ic.vs.geno)[unique(sig[,"col"])]
  #non.genetic <- colnames(ic.vs.geno)[which(!(colnames(ic.vs.geno) %in% genetic.factors))]
  ica_list$geno_pvals <- ic.vs.geno
  ica_list$geno_cor_count <- apply(ic.vs.geno, 2, function(x) sum(na.omit(x < (sig_threshold / length(ic.vs.geno)))))
  ica_list$non_genetic <- names(ica_list$geno_cor_count[which(ica_list$geno_cor_count < 1)])
  attr(ica_list, 'geno_cor') <- "yes"
  return(ica_list)
}

#' @export
get_gene_info <- function(ica_list, h5_file){
#  gene_id <- h5read(h5_file, "phenotypes/col_info/id")
#  gene_chr <- h5read(h5_file, "phenotypes/col_info/pheno_chr")
#  gene_start <- h5read(h5_file, "phenotypes/col_info/pheno_start")
  gene_info_df <- as.data.frame(h5read(h5_file, "phenotypes/col_info"), stringsAsFactors = FALSE)
  gene_info_df$idx <- 1:nrow(gene_info_df)
  ica_list$gene_info <- gene_info_df
  return(ica_list)
}
jinhyunju/icreport documentation built on May 19, 2019, 10:35 a.m.