HOmics: Hierarchical omics

Documented in HOmics.meth

#' Integrates methylation and gene data
#'
#' @param meth.data GenomicRatioSet or  ExpressionSet with methylation data
#' @param pheno.cond.col response variable, column of annotation that contains condition (one of the variable names contained in pheno data obtained with function pData())
#' @param annot.gene.col column of annotation that contains gene names (same annotations as gene.list). Default = "UCSC_RefGene_Name"
#' @param annot.z.col prior information. Column of annotation that contains prior variable (for GenomicRatioSet functions getAnnotation() or fData() for ExpressionSet). Default = "UCSC_RefGene_Group"
#' @param annot.mult.sep separator for annot.gene.col and maybe annot.z.col multiple values. Default = ";"
#' @param z.matrix prior information. Alternative parameter to annot.z.col as matrix with rows the featureNames of meth.data and columns the Zvars can be given. Default = NULL
#' @param pheno.covar.col covariates, column of annotation that contains names of covariates to include in the model (one or more of the variable names contained in pheno data). Default = NULL
#' @param gene.list genes to analyze as a vector with symbols (HUGO)
#' @param seed numerical seed for the use of function set.seed in the generation of the model, for reproducibility
#' @param cores cores in case of parallelization. Default = 1 (no parallelization)
#' @param n.adapt number of iterations for the adaptative phase of the hierarchical model. Default = 1000
#' @param n.chain number of chains of the hierarchical model. Default = 3
#' @param n.iter number of iteractions for the burn in phase or sampling of the hierarchical model. Default = 2000


#' @import dplyr
#' @import Biobase
#' @import minfi
#' @import parallel
#' @import doParallel
#' @import foreach
#' 
#' @return an object of class HOmics
#'  
#' @examples to be built

#' @export HOmics.meth


HOmics.meth <- function(meth.data = GRSet, pheno.cond.col="tissue.ch1", annot.gene.col = "UCSC_RefGene_Name",
                      annot.z.col = c("UCSC_RefGene_Group"), annot.mult.sep = ";", z.matrix = NULL, pheno.covar.col = NULL, 
                      gene.list = genes.u, seed=NULL, cores = 1,
                      n.adapt = 1000, n.chains = 3, n.iter = 2000, 
                      ...)
{
  
  cont = FALSE
  
  #########################
  ##Argument checking #####
  #########################
  call<-match.call() #to be returned at the end
  
  if (is(meth.data, "GenomicRatioSet")){
    pheno.data <- pData(meth.data)
    pheno.data$id <- rownames(pheno.data)
    pheno.data <- as_tibble(pheno.data)

    annot.data <- getAnnotation(meth.data)
    annot.data$cpg <- rownames(annot.data)
    annot.data <- as_tibble(annot.data)
    
    cpg.data <- getBeta(meth.data)
    
  } else if (is(meth.data, "ExpressionSet")) {
    
    pheno.data <- pData(meth.data)
    pheno.data$id <- rownames(pheno.data)
    pheno.data <- as_tibble(pheno.data)
    
    annot.data <- fData(meth.data)
    annot.data$cpg <- rownames(annot.data) 
    annot.data <- as_tibble(annot.data)
    
    cpg.data <- exprs(meth.data)
    
  } else stop("meth.data should be a GenomicRatioSet or an ExpressionSet or a SummarizedExperiment") 
  
  if (is.null(pheno.cond.col) | !(pheno.cond.col %in% colnames(pheno.data))) stop("cond variable should be a variable in phenoData slot")
 
  if (is.null(annot.gene.col) | !(annot.gene.col %in% colnames(annot.data))) stop("annot.gene.col should be a column in the annotations slot")
  
  #########################
  ## Completeness cpgs ####
  #########################
  if (anyNA(cpg.data)) {
    cpg.data <-  cpg.data[complete.cases(cpg.data),]
    annot.data <- annot.data %>% filter(cpg %in% rownames(cpg.data))
    cat("some missing values were detected, only complete features will be selected\n" )
  } 
  
  if (is.null(z.matrix)) {
    if (is.null(annot.z.col) | !(annot.z.col %in% colnames(annot.data))) stop("annot.z.col should be a column in the annotations slot of meth.data")
    
  } else {
    
    #check common cpgs
    cpgs.c<-intersect(rownames(cpg.data),rownames(z.matrix))
    
    if (length(cpgs.c)==0) stop ("z.matrix must contain rownames matching featureNames in meth.data")
    
    else
      
      cat("z.matrix will be used to construct the hierarchical model\n")
      z.matrix <- z.matrix[cpgs.c,]
      cpg.data <- cpg.data[cpgs.c,]
      annot.data <- annot.data %>% filter(cpg %in% cpgs.c)
      
  }
    
  cond.v <- pheno.data %>% dplyr::select (!!as.name(pheno.cond.col)) %>% pull()
  
  if (is.factor(cond.v)) {
    
    if (nlevels(cond.v) != 2 ) {
      
      stop(paste0("vector ", pheno.cond.col, " is a factor but must have 2 levels" )) 
      
    } else {
      cond.min <- levels(cond.v)[1]
      cat(paste0(pheno.cond.col," is a factor, it has been converted to a numerical vector of 0s and 1s with ",cond.min," as the reference level. A hierarchical logistic model will be fitted\n" ))
      cond <- as.numeric(cond.v)-1
      
    }
  } else if (is.character(cond.v)) {
    cond.min <- min(cond.v)
    cat(paste0(pheno.cond.col," is a factor, it has been converted to a numerical vector of 0s and 1s with ",cond.min," as the reference level. A hierarchical logistic model will be fitted\n" ))
    cond <- as.numeric(as.factor(cond.v)) -1
   
  } else if (is.numeric(cond.v)) {
    
    cat(paste0(pheno.cond.col," is a numerical vector, a hierarchical linear model will be constructed\n" ))
    cond <- cond.v
    cont <- TRUE
    
  } else stop ("Unrecognized type of condition")  
 

  rm(cond.v)
  #########################
  ## Covariates  ####
  #########################
  #flag covar
  if (!is.null(pheno.covar.col)){
    
    if (!(pheno.covar.col %in% colnames(pheno.data))) stop("covariate ", pheno.covar.col, " should be a variable in phenoData slot") 
    else   covar.v <- pheno.data %>% dplyr::select (!!as.name(pheno.covar.col)) %>% pull()
    
    if(is.factor(covar.v) | is.character(covar.v)) {
      
      covar.v <- as.numeric(as.factor(covar.v)) -1
      cat(paste0("covariate ", pheno.covar.col, " has been converted to a numerical vector\n" ))
      
    } else cat(paste0("covariate ", pheno.covar.col, " will be included in the hierarchical model, as a continuous covariate\n")) 
    
  } else covar.v=NULL
    
  
  #########################
  #######  Cores  #########
  #########################
  
  if (!is.numeric(cores)) stop("cores should be numeric")

  #########################
  #####    LOOP     #######
  #########################
  
N = length(gene.list)
cl <- makeCluster(cores,type="PSOCK",outfile="output.txt")
registerDoParallel(cl)


cpg.models<- foreach(i=1:N,.packages=c("rjags","tidyverse","MCMCvis"),.export="hmodel") %dopar% {
    gen <- gene.list[i]
    #select annotations for that gene in specified col
    if (is.null(z.matrix)){
      annot.gen <- annot.data %>% 
                  filter(grepl(gen,!!as.name(annot.gene.col))) %>% 
                  dplyr::select(cpg,!!as.name(annot.gene.col),!!as.name(annot.z.col))
      
      if(nrow(annot.gen)>0){ 
        #number of genes max in a field
        nmax <- max(sapply(pull(annot.gen,!!as.name(annot.gene.col)),
                           FUN = function(x) length(unlist(strsplit(x,split=annot.mult.sep)))))
        
       #create 2 tibbles, one for genes and one for z vars
       annot.gen.g <- annot.gen %>% 
                      separate(!!as.name(annot.gene.col), paste("Gene",1:nmax,sep="_"),sep=annot.mult.sep) %>% 
                      dplyr::select(cpg,Gene_1:paste("Gene",nmax,sep="_")) %>%
                      gather("GeneVar","Gene",Gene_1:paste("Gene",nmax,sep="_"))
        
        annot.gen.z <- annot.gen %>% 
                      separate(!!as.name(annot.z.col), paste("GeneGroup",1:nmax,sep="_"),sep=annot.mult.sep) %>%
                      dplyr::select(cpg,GeneGroup_1:paste("GeneGroup",nmax,sep="_")) %>%
                      gather("GeneGroupVar","GeneGroup",GeneGroup_1:paste("GeneGroup",nmax,sep="_"))
        
        annot.gen <- inner_join(annot.gen.g,annot.gen.z,by=c("cpg"="cpg")) %>%
                      filter(Gene==gen, !is.na(GeneGroup)) %>%
                      dplyr::select(cpg,Gene,GeneGroup) %>%
                      distinct()
        
        #genegroup to dummies
        if(nrow(annot.gen)>0){
          z.n <- annot.gen   %>% 
                dplyr::select(-Gene) %>% 
                mutate(var = 1) %>% 
                spread(GeneGroup, var, fill = 0, sep = "_") 
          
          cpgs <- z.n %>% pull(cpg)
          
          z.mat<-as.data.frame(dplyr::select(z.n,-cpg)) 
          rownames(z.mat) <- cpgs
          g.matrix <- cpg.data[cpgs,] 
          
          #matrix restrictions and transformations
          if (is.matrix(g.matrix)) {
            g.matrix<- t(g.matrix) 
          } else if (is.vector(g.matrix)) {
            g.matrix <-as.matrix(g.matrix)
            colnames(g.matrix) <- rownames(data.matrix)[i]
          }
          
          ### Z matrix
          if (is.vector(z.mat)) {
            z.mat <-t(as.matrix(z.mat))
            rownames(z.mat) <- rownames(data.matrix)[i]
          }
          
          hmodel(G = g.matrix, 
                 Z = z.mat,
                 cond = cond, 
                 cont = cont, 
                 covar.matrix = covar.v, 
                 seed = seed,
                 n.adapt = n.adapt,
                 n.chains = n.chains,
                 n.iter = n.iter)  
        }
      }  
    } else {
      annot.gen <- annot.data %>% 
                    filter(grepl(gen,!!as.name(annot.gene.col))) %>% 
                    dplyr::select(cpg,!!as.name(annot.gene.col))
      if(nrow(annot.gen)>0){ 
        #number of genes max in a field
        nmax <- max(sapply(pull(annot.gen,!!as.name(annot.gene.col)),
                           FUN = function(x) length(unlist(strsplit(x,split=annot.mult.sep)))))
        annot.gen.g <- annot.gen %>% 
                      separate(!!as.name(annot.gene.col), paste("Gene",1:nmax,sep="_"),sep=annot.mult.sep) %>% 
                      dplyr::select(cpg,Gene_1:paste("Gene",nmax,sep="_")) %>%
                      gather("GeneVar","Gene",Gene_1:paste("Gene",nmax,sep="_")) %>%
                      filter(Gene==gen) %>% 
                      dplyr::select(cpg,Gene) %>%
                      distinct()
        if(nrow(annot.gen.g)>0){
          cpgs <-  annot.gen.g %>% pull(cpg)
          z.mat <- z.matrix[cpgs,]
          if (is.vector(z.mat)) z.mat <-as.matrix(t(z.mat)) 
          rownames(z.mat) <- cpgs
          g.matrix <- cpg.data[cpgs,] 
          
          #matrix restrictions and transformations
          if (is.matrix(g.matrix)) {
            g.matrix<- t(g.matrix) 
          } else if (is.vector(g.matrix)) {
            g.matrix <-as.matrix(g.matrix)
            colnames(g.matrix) <- rownames(data.matrix)[i]
          }
          
          ### Z matrix
          if (is.vector(z.mat)) {
            z.mat <-t(as.matrix(z.mat))
            rownames(z.mat) <- rownames(data.matrix)[i]
          }
          
          hmodel(G = g.matrix, 
                 Z = z.mat,
                 cond = cond, 
                 cont = cont, 
                 covar.matrix = covar.v, 
                 seed = seed,
                 n.adapt = n.adapt,
                 n.chains = n.chains,
                 n.iter = n.iter)  
 
        }  
      }  
    }
 }
stopCluster(cl)

names(cpg.models) <- gene.list[1:N]

results<-cpg.models
#results$call <- call
class(results) <- "HOmics"
return(results)  
}
# ===============================================================================