R/scMethrix_export.R

Defines functions export_seurat export_bigwigs export_bsseq export_methrix export_beds

Documented in export_beds export_bigwigs export_bsseq export_methrix

#------------------------------------------------------------------------------------------------------------
#' Exports all samples in an \code{\link{scMethrix}} objects into individual bedgraph files
#' @details The structure of the bedgraph files will be a tab-deliminated structure of:
#' Chromosome | CpG start site | CpG end site | methylation score | coverage | Additional assays (if include = TRUE)
#' 
#' If additional assays are used, and headers enabled, it is up to the user to ensure that assay names are not protected in any downstream analysis of the bedgraph files
#' @inheritParams generic_scMethrix_function
#' @param path string; the \code{\link{file.path}} of the directory to save the files
#' @param suffix string; optional suffix to add to the exported bed files 
#' @param include boolean; flag to include the values of non-standard assays in the bedgraph file
#' @param header boolean; flag to add the header onto each column
#' @param na.rm boolean; flag to remove the NA values from the output data
#' @return nothing
#' @examples
#' data('scMethrix_data')
#' export_beds(scMethrix_data,path=paste0(tempdir(),"/export"))
#' @export
export_beds <- function(scm = NULL, path = NULL, suffix = NULL,  include = FALSE, na.rm = TRUE, header = FALSE, verbose = TRUE) {
  
  #- Input Validation --------------------------------------------------------------------------
  meth <- cov <- NULL
  
  .validateExp(scm)
  .validateType(path,c("directory","string"))
  .validateType(suffix,c("string","null"))
  .validateType(include,"boolean")
  .validateType(na.rm,"boolean")
  .validateType(header,"boolean")
  .validateType(verbose,"boolean")
  
  #- Function code -----------------------------------------------------------------------------
  if (verbose) message("Exporting beds to ",path,start_time())
  
  dir.create(path, showWarnings = FALSE)
  
  files <- row.names(scm@colData)
  rrng <- data.table::as.data.table(rowRanges(scm))
  rrng[,c("width","strand"):=NULL]
  
  if (is.null(suffix)) suffix <- "" #TODO: Should switch to some kind of regex input
  
  for (i in 1:length(files)) {
    
    file = files[i]
    
    val <- score(scm)[, file] 
    rrng[,meth := val]
    
    if (has_cov(scm)) {
      val <- counts(scm)[, file] 
      rrng[,cov := val]
    }
    
    if (include) {
      assays <- assays(scm)
    }
    
    if (na.rm) {  out <- stats::na.omit(rrng, cols="meth", invert=FALSE)
    } else {      out <- rrng}
    
    data.table::fwrite(out, paste0(path,"/",file,suffix,".bedgraph"), append = FALSE, sep = "\t", row.names = FALSE, 
           col.names = FALSE, quote = FALSE)
    
    if (verbose) message("Exported ",i," of ",length(files)," (",split_time(), ")")
  }
  
  if (verbose) message("BEDs exported in in ",stop_time())
  
  invisible()
}

#------------------------------------------------------------------------------------------------------------
#' Converts an \code{\link{scMethrix}} object to methrix object
#' @details Removes extra slot data from an \code{\link{scMethrix}} object and changes structure to match
#' \code{\link[methrix]{methrix}} format. A 'counts' assay for coverage values must be present. 
#' Functionality not supported by methrix (e.g. reduced dimensionality)
#' will be discarded.
#' @inheritParams generic_scMethrix_function
#' @param h5_dir Location to save the methrix H5 file
#' @return a \code{\link[methrix]{methrix}} object
#' @examples
#' data('scMethrix_data')
#' # convert_to_methrix(scMethrix_data)
#' @export
export_methrix <- function(scm = NULL, h5_dir = NULL) {
  
  #- Input Validation --------------------------------------------------------------------------
  chr <- m_obj <- NULL
  
  .validateExp(scm)
  .validateType(h5_dir,"string")
  
  #- Function code -----------------------------------------------------------------------------
  rrng <- as.data.table(rowRanges(scm))
  rrng[,c("width","end") := NULL]
  names(rrng) <- c("chr","start","strand")
  
  chrom_size <- data.frame(contig=GenomeInfoDb::seqlevels(rowRanges(scm)),length=width(range(rowRanges(scm))))
  ref_cpgs_chr <- data.frame(chr=GenomeInfoDb::seqlevels(rowRanges(scm)),N=summary(rrng$`chr`))
  
  if (!has_cov(scm)) {
    stop("scMethrix does not contain coverage data. Cannot convert to methrix object")
  }
  
  #TODO: Need to export create_methrix function in the methrix package to use this
  if (is_h5(scm)) {
    # m_obj <- methrix::create_methrix(beta_mat = get_matrix(scm,type="score"), cov_mat = get_matrix(scm,type="counts"),
    #                                  cpg_loci = rrng[, .(chr, start, strand)], is_hdf5 = TRUE, genome_name = scm@metadata$genome,
    #                                  col_data = scm@colData, h5_dir = h5_dir, ref_cpg_dt = ref_cpgs_chr,
    #                                  chrom_sizes = chrom_sizes)#, desc = descriptive_stats)
  } else {
    # m_obj <- methrix::create_methrix(beta_mat = get_matrix(scm,type="score"), cov_mat = get_matrix(scm,type="counts"),
    #                                  cpg_loci = rrng[, .(chr, start, strand)], is_hdf5 = FALSE, 
    #                                  genome_name = scm@metadata$genome, col_data = scm@colData, 
    #                                  ref_cpg_dt = ref_cpgs_chr, chrom_sizes = chrom_sizes)#, desc = descriptive_stats)
  }
  
  return(m_obj) 
}

#' Convert \code{\link{scMethrix}} to \code{bsseq} object
#' @details Takes \code{\link{scMethrix}} object and returns a \code{bsseq} object. 
#' @param scm \code{\link{methrix}} object
#' @param m_assay matrix; the assay containing methylation scores
#' @param c_assay matrix; the assay containing count scores
#' @param path string; the path of the export directory
#' @return An object of class \code{bsseq}
#' @examples
#' \dontrun{
#' data('scMethrix_data')
#' export_bsseq(scMethrix_data)
#' }
#' @export
export_bsseq <- function(scm, m_assay = "score", c_assay="counts", path = NULL) {

  #- Input Validation --------------------------------------------------------------------------
  .validateExp(scm)
  if (!has_cov(scm)) stop("BSSeq requires a coverage matrix.", call. = FALSE)
  .validateAssay(scm,m_assay)
  .validateAssay(scm,c_assay)
  .validateType(path,"string")
  
  # if (anyNA(get_matrix(scm,m_assay)) || anyNA(get_matrix(scm,c_assay)))
  #   warning("NAs present in assay. These will be filled with zero values.")
  
  #- Function code -----------------------------------------------------------------------------
  M_clean <- get_matrix(scm,m_assay) * get_matrix(scm,c_assay)
  M_clean[is.na(M_clean)] <- 0
  C_clean <- get_matrix(scm,c_assay)
  C_clean[is.na(counts(scm))] <- 0
  
  b <- bsseq::BSseq(M = M_clean, Cov = C_clean, pData = colData(scm),
                    gr = rowRanges(scm), sampleNames = rownames(colData(scm)))
  return(b)
}

#' Exports scMethrix object as bigWigs
#' @param scm \code{\link{scMethrix}} object
#' @param assay string; the assay to export. Default is "score"
#' @param path string; Output directory name where the files should be saved. Default tempdir()
#' @param samp_names string; List of sample names to export
#' @examples
#' \dontrun{
#' data('scMethrix_data')
#' export_bigwigs(scm = scMethrix_data, assay = "score", output_dir = tempdir())
#' }
#' @return NULL
#' @importFrom rtracklayer export
#' @importFrom GenomeInfoDb seqlengths
#' @export
export_bigwigs = function(scm, assay = "score", path = tempdir(), samp_names = NULL){

  #- Input Validation --------------------------------------------------------------------------
  .validateExp(scm)
  .validateAssay(scm,assay)
  .validateType(path,"string")
  .validateType(samp_names,"string")
  
  if (is.null(path)){
    stop("A valid path needs to be supplied.", call. = FALSE)
  }

  #- Function code -----------------------------------------------------------------------------
  if (!dir.exists(path)) {
    dir.create(path = path, showWarnings = FALSE, recursive = TRUE)
  }
  
  message("Generating bigWig files...",start_time())
  
  mat_gr <- get_matrix(scm = scm, assay = assay, add_loci = TRUE, in_granges = TRUE)
  
  seql = GenomicRanges::width(range(SummarizedExperiment::rowRanges(scm)))
  names(seql) = levels(GenomeInfoDb::seqnames(scm))
  
  if(is.null(samp_names)){
    samp_names = names(mcols(mat_gr))
  }else{
    samp_names = intersect(samp_names, names(mcols(mat_gr)))
    if(length(samp_names) == 0) stop("Incorrect sample names! No matching samples in the experiment")
  }
  
  for(samp in samp_names){
    op_bw = paste0(path, "/", samp, ".bw")
    message("   Writing ", op_bw)
    samp_gr = mat_gr[,samp]
    names(mcols(samp_gr)) = "score"
    samp_gr = samp_gr[!is.na(samp_gr$score)]
    GenomeInfoDb::seqlengths(samp_gr) = seql[names(GenomeInfoDb::seqlengths(samp_gr))]
    rtracklayer::export(samp_gr, con = paste0(path, "/", samp, ".bw"), format="bigWig")
  }
  
  message("Files generated in ",stop_time())
}


export_seurat <- function(scm,assay="score", path = NULL) {
  
  #- Input Validation --------------------------------------------------------------------------
  .validateExp(scm)
  if (!has_cov(scm)) stop("Seurat requires a coverage matrix.", call. = FALSE)
  .validateAssay(scm,assay)
  .validateType(path,"string")
  
  #- Function code -----------------------------------------------------------------------------
  cnt <- counts(scm)
  rownames(cnt) <- paste0("CpG",1:nrow(cnt))
  cnt[is.na(cnt)] <- 0
  
  scr <- get_matrix(scm = scm, assay = assay)
  rownames(scr) <- paste0("CpG",1:nrow(scr))
  scr[is.na(scr)] <- 0
  
  seur <- Seurat::CreateSeuratObject(counts = cnt, meta.data = as.data.frame(colData(scm)))
  seur <- Seurat::SetAssayData(object = seur, slot = "data", new.data = scr)
  
  # seur <- NormalizeData(object = seur)
  # seur <- FindVariableFeatures(object = seur)
  # seur <- ScaleData(object = seur)
  # seur <- RunPCA(object = seur)
  # seur <- FindNeighbors(object = seur)
  # seur <- FindClusters(object = seur)
  # seur <- RunTSNE(object = seur)
  # DimPlot(object = seur, reduction = "tsne")
  # Set feature metadata, AKA rowData. Super intuitive, right?
  # sce.to.seurat[["RNA"]][[]] <- as.data.frame(rowData(pbmc.sce))
  
  # rownames(rowData(scm)) <- paste0("CpG",1:nrow(scm))
  
  return(seur)
}
CompEpigen/scMethrix documentation built on Nov. 6, 2021, 3:09 p.m.