R/user_functions.R

Defines functions mspc.consensus.peaks prepare.mspc.table multiple.bed.enrichment prepare.feature.table separate.multiple.features

Documented in mspc.consensus.peaks multiple.bed.enrichment prepare.feature.table prepare.mspc.table separate.multiple.features

#' Separate Multiple Features
#'
#' Divides a bed file with multiple features in multiple files with single features.
#'
#' It is useful, for example, if you use ChromHMM states bed files, in order to be able to use mubeen.
#'
#' @param features_files A character vector with the file path of the multiple-feature bed, or a data.frame/data.table with the data in a bed-like format.
#' @param save_dir The directory where files should be wrote. By default, the folder "new_feature_files" inside of the current working directory
#' @param colf The column of the feature in the bed. By default, the first three columns are chr, start and end, and the 4th column is the feature.
#'
#' @return None
#' @export
#'
separate.multiple.features = function(features_file, save_dir = paste0(getwd(), "/new_feature_files"), colf = 4){

  if (is.character(features_file)){
    features_file = data.table::fread(features_file)
  }

  features_factor = as.factor(features_file[,colf])
  dir.create(save_dir)

  for (feature in split(features_file, features_factor)){
    data.table::fwrite (feature, file = paste0(save_dir, "/", gsub("/","_",as.character(feature[1,colf])),".bed"), sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
  }
}


#' Prepare Feature Table
#'
#' Creates a data.frame with the annotation of different feature beds, including their complete path, the cell type and the feature name.
#'
#' Since the main function of mubeen, \code{\link[mubeen]{multiple.bed.enrichment}}, needs different attributes of the feature bed files, this function provides a convenient way to generate the information. You can provide the vectors with the feature names and files, or, if not, they are automatically generated from the file name (by default, you have to put the feature name, a whitespace and the cell name). If you have a bed file with multiple features, you should previously separate them using the function \code{\link[mubeen]{separate.multiple.features}}
#'
#' @param feature_directory A character vector with the path of the folder with the feature bed files.
#' @param feature_names A character vector of the feature names. This parameter is optional.
#' @param cell_types A character vector of the cell types. This parameter is optional.
#'
#' @return A data frame with 3 columns: 1st feature file paths, 2on cell types, 3rd feature names.
#' @export
#'
prepare.feature.table = function(feature_directory, feature_names=NULL, cell_types=NULL){

  feature_files = setdiff(list.files(feature_directory,full.names=T), list.dirs(feature_directory,recursive = FALSE, full.names = TRUE))

  if (is.null(feature_names)|is.null(cell_types)){
    cellfeature = sapply(strsplit(feature_files, "/"), function(x) trimws(x[length(x)]))
    feature_table = data.frame(feature_file = feature_files, x = cellfeature, stringsAsFactors = FALSE)
    feature_table = tidyr::separate(feature_table, x, into=c("feature_name","cell_type"), sep=" ")
  }
  else {
    feature_table = data.frame(feature_file=feature_files, cell_type=cell_types, feature_name = feature_names, stringsAsFactors=F)
  }

  return(feature_table)
}

#' MUltiple BEd ENrichment
#'
#' From a target bed file, and a background bed file, this function calculates the statistical enrichment in several features (e.g. histone marks), in a given range.
#'
#' The function is able to sequentially repeat that enrichment calculation several times in an arbitrary range and window of the target bed file, shifting all the genomic positions each time.
#'
#' @param targets A character vector with the file path of the bed with the regions of interest, or a data.frame/data.table with the data in a bed-like format.  It can be a vector of one or more files.
#' @param target_labels A character vector indicating the names of the target files, in the same order than the targets parameter.
#' @param background A character vector with the file path of the bed with the background regions, or a data.frame/data.table with the data in a bed-like format
#' @param feature_table A data.frame with the paths of the feature files, and their annotation. You can use \code{\link{prepare.feature.table}}, or make a custom table with the same format.
#' @param bin The size of each overlap checking. If it is 0, the regions are checked as given.
#' @param window The region that should be analyzed on each side (upstream and downstream) of the bed regions. If it is 0, the regions are checked as given.
#' @param thread_number The number of threads that should be use in parallel.
#' @param DMR A boolean indicating if the regions are not of size 1bp, such as, for example, DMRs (differentially methylated regions).
#'
#' @return A list with several data.tables, each one the result of one feature, and the last with the merge of all. Each row contains the information of one bin, with the fisher.test result.
#' @export
#'
multiple.bed.enrichment = function(targets, target_labels = names(targets), background, feature_table, bin=0, window=0, thread_number = 1, DMR=F){

  #Si se ha dado un link se intenta leer los archivos, si no lo convertimos a data.table
  targets = as.list(targets)

  for (i in 1:length(targets)){
    if (is.character(targets[[i]])){targets[[i]] = data.table::fread(targets[[i]])}
    else {data.table::setDT(targets[[i]])}
  }

  if (is.character(background)){background = data.table::fread(background)}
  else {data.table::setDT(background)}

  #Registramos el número de hilos
  `%dopar%` = foreach::`%dopar%`
  `%:%` = foreach::`%:%`

  doParallel::registerDoParallel(thread_number)

  #prueba nested foreach
  x =
    foreach::foreach(target = targets) %:%
      foreach::foreach(feature=feature_table$feature_file) %dopar% {
        message(paste("Calculating intersects of", feature, "with", target ))
        add.multiple.fisher(check.intersect(target, background, feature, bin, window, DMR=DMR))
  }

  #Añadimos los cell_names y feature_names correspondientes a cada lista

  for (i in 1:length(x)){
    for (j in 1:length(x[[i]])){
      x[[i]][[j]]$cell_type = feature_table$cell_type[j]
      x[[i]][[j]]$feature_name = feature_table$feature_name[j]
      x[[i]][[j]]$target = target_labels[i]
    }
  }

  #añadimos a la lista una tabla completa preparada para ggplot2

  x = unlist(x, recursive = FALSE)
  join = data.table::rbindlist(x)
  x[[length(x)+1]] = join
  names(x)[length(x)] = "join_list"

  #Cerramos DoParallel
  doParallel::stopImplicitCluster()

  return(x)
}



#' Prepare MSPC Table
#'
#' Creates a data.frame with the annotation of different batches of bed files to be processed by the MSPC algorithm (Consensus Peaks).
#'
#' Since the MSPC function \code{\link[mubeen]{mspc.consensus.peaks}} requires a data frame with the lists of bed files and their associated feature name and cell type, this function creates automatically the table from a proper folder structure. In order to use this function, you have to organize your bed files in folder, one for each cell type, and include different folders inside of them, one for each feature.
#'
#' @param input_dir The root directory of your bed files collection.

#' @return A data frame with 3 columns: 1st list of input files, separate with a pipe symbol ('|'), 2on cell types, 3rd feature names.
#' @export
#'
prepare.mspc.table = function(input_dir){
  #Cell name, inputs_vector, feature name
  mspc.table = data.frame(input_names = character(),cell_type = character(), feature_name = character(), stringsAsFactors = F)
  cell_directories = list.dirs(input_dir, full.names=TRUE, recursive=FALSE)

  for (cell_dir in cell_directories){
    for(feature_dir in list.dirs(cell_dir, full.names=TRUE, recursive=FALSE)){
      input_names = paste0(setdiff(list.files(feature_dir,full.names=TRUE), list.dirs(feature_dir, full.names = TRUE, recursive = FALSE)), collapse="|")
      mspc.table[nrow(mspc.table)+1,] = c(input_names,basename(cell_dir), basename(feature_dir) )
    }
  }
  return(mspc.table)
}



#' MSPC Consensus Peaks
#'
#' This function is able to collapse several bed files of one or more given features (e.g. histone marks), creating a Consensus Peaks file, a bed file with strong and reproducible peaks between the files. This algorithm (\href{https://genometric.github.io/MSPC/docs/quick_start}{project link}) has been created mainly for data from MACS.
#'
#' MSPC is an independent project, and this function is a wrapper that makes able to call the software from R without preprocessing of bed files. For the order of the columns, the program assumes the structure of the output of ChIP-seq beds from the Blueprint project (web), but you can define the columns to use the function with different input. This function checks if MSPC is installed and, if not, try to download and install the latest version in the folder ~/.mspc . This function only works nowadays in GNU/Linux.
#'
#' @param feature_table A data.frame with the paths of input bed files, and their annotation. You can use \code{\link{prepare.mspc.table}}, or make a custom table with the same format.
#' @param output_dir The directory where the new files will be created
#' @param fold_threshold Minimal enrichment over background to be considered in the files. Every peak with less fold enrichment will be erased and not shown to the MSPC algorithm.
#' @param min_overlap Minimal number of overlaping replicates. This is use as '-c' parameter in the MSPC algorithm. If the number of files in a feature is equal or less than the min_overlap, the min_overlap will be calculated as the number of files minus one.
#' @param qvalue_col The index of the column containing the q_values in bed files.
#' @param pvalue_col  The index of the column containing the p_values in bed files.
#' @param fold_col  The index of the column containing the fold_enrichment in bed files.
#'
#' @return None. The function generates a side effect (calling the MSPC algorithm and creating the final consensus peaks files).
#' @export
#'
mspc.consensus.peaks = function(mspc_table, output_dir, fold_threshold=2, min_overlap=3, qvalue_col = 7, pvalue_col = 8, fold_col = 9){

  `%do%` = foreach::`%do%`

  #Comprobamos si MSPC está instalado en la carpeta correcta
  
  if (!file.exists("~/.mspc/mspc")){
      
      message("MSPC is not installed in the proper folder (~/.mspc). Would you like the program to be downloaded? (Answer yes if you want):")
      
      if (readline()=="yes"){
          dir.create("~/.mspc")
          download.file("https://github.com/Genometric/MSPC/releases/latest/download/linux-x64.zip", "~/.mspc/mspc.zip")
          unzip("~/.mspc/mspc.zip", junkpaths=TRUE, exdir = "~/.mspc")
          Sys.chmod("~/.mspc/mspc", '777')
          message("MSPC was downloaded and installed successfully.")
      }
      else {
          message("Without MSPC installed, the function cannot continue")
          return()
          }
  }
  

  # loop for
  for(i in 1:nrow(mspc_table)) {
    individual.consensus.peaks(cell_type=mspc_table$cell_type[i], feature_name=mspc_table$feature_name[i], inputs_vector=unlist(strsplit(mspc_table$input_names[i],split="\\|")), output_dir=output_dir, fold_threshold=fold_threshold, min_overlap=min_overlap, qvalue_col = qvalue_col, pvalue_col = pvalue_col, fold_col = fold_col)
  }
  
  invisible() #return nothing

}
omorante/mubeen documentation built on June 4, 2020, 9:19 p.m.