R/import.R

Defines functions .validateInput .readTwoBams .readBam .process.chr .importHicLib .importHomer .processChiapetName makeGenomicInteractionsFromFile

Documented in .importHicLib .importHomer makeGenomicInteractionsFromFile .processChiapetName .readBam .readTwoBams .validateInput

#' Function to create GenomicInteraction objects from a file
#'
#' Function to create GenomicInteraction objects from a variety of files. The resulting objects contain information
#' on which genomic regions are interacting with each other, and the number of counts supporting each interaction.
#' It is also possible to store information on associated p-values and false-discovery rates (FDR).
#' It is possible to create GenomicInteractions objects for various datasets including Hi-C and ChIA-PET. It is possible
#' to read interactions from a variety of files including BAM files, bed files (BED12 and BEDPE) and from the output
#' from standard processing pipelines, such as HOMER and ChIA-PET tool. GenomicInteractions objects can also be created
#' using calls of the form \code{new('GenomicInteractions', ...)}. For hiclib, it expects the directory in which the files
#' extracted using h5dictToTxt.py from the hdf5 file are located, where as for all of the other file types it expects the full
#' filename. Note that recent versions of hiclib (2015-) cannot export the required data and so this function will only work with
#' older files. Hiclib support will be removed in the next version of GenomicInteractions.
#'
#' @param fn Filename or, if type='hiclib', folder
#' @param type One of 'chiapet.tool', 'bed12', 'bedpe', 'hiclib', 'homer', 'bam', 'two.bams'.
#' @param experiment_name Experiment name.
#' @param description Description of experiment.
#' @param chr_names a vector of chromosome names in order, required for re-naming chromosomes for hiclib import
#' @return a GenomicInteractions object
#'
#' @importFrom Rsamtools scanBamFlag ScanBamParam scanBam bamFlagAsBitMatrix
#' @importFrom IRanges IRanges
#' @importFrom data.table data.table fread .N
#' @importFrom S4Vectors first second
#'
#' @examples
#'
#' k562.rep1 <- makeGenomicInteractionsFromFile(
#'        system.file(package='GenomicInteractions', 'extdata', 'k562.rep1.cluster.pet3+.txt'),
#'        type='chiapet.tool', experiment_name='k562', description='k562 pol2 8wg16')
#'
#' k562.rep1
#'
#' @export

makeGenomicInteractionsFromFile <- function(fn, type, experiment_name = "", description = "", chr_names = NULL) {
    em <- NULL
    if (type == "chiapet.tool") {
        dat <- read.table(fn, header = TRUE, stringsAsFactors = FALSE, sep = "\t")
        if (!all(vapply(dat[, c(2, 3, 5, 6, 7, 8, 9)], is.numeric, logical(1))) || !all(vapply(dat[, c(1, 4)], is.character, logical(1)))) {
            stop(paste("ChIA-PET file does not appear to be in the correct format. Columns should be", "chr1, start1, end1, chr2, start2, end2, pet_count, p-value, q-value"))
        }
        
        anchor_one <- GRanges(dat[, 1], IRanges(dat[, 2], dat[, 3]))
        anchor_two <- GRanges(dat[, 4], IRanges(dat[, 5], dat[, 6]))
        counts <- as.integer(dat[, 7])
        em <- DataFrame(p.value = dat[, 8], fdr = dat[, 9])
        
    } else if (type == "bed12") {
        bedfile <- read.table(fn)
        dat <- .processChiapetName(unique(bedfile[, 4]))
        anchor_one <- GRanges(dat[, "chrom.left."], IRanges(as.integer(dat[, "start.left."]), as.integer(dat[, "end.left."])))
        anchor_two <- GRanges(dat[, "chrom.right."], IRanges(as.integer(dat[, "start.right."]), as.integer(dat[, "end.right."])))
        counts <- as.integer(dat[, "counts"])
        
    } else if (type == "bedpe") {
        dat <- rtracklayer::import(fn, format = "bedpe")
        anchor_one <- first(dat)
        anchor_two <- second(dat)
        if ("score" %in% names(mcols(dat))) {
            counts <- mcols(dat)$score
        } else {
            message("No 'score' column found; setting counts to 1.")
            counts <- rep(1, length(anchor_one))
        }
        em <- mcols(dat)[, names(mcols(dat)) != "score", drop = FALSE]
        
        if (any(counts == 0)) 
            warning("Some counts are set to zero, bedpe score field may represent other data")
        
    } else if (type == "hiclib") {
        warning("hiclib support is deprecated and will be removed in the next version of GenomicInteractions.")
        dat <- .importHicLib(fn, chr_names)
        .validateInput(dat, c("chrm1", "fraglength1", "mid1", "fragid1", "chrm2", "fraglength2", "mid2", "fragid2", "N"))
        anchor_one <- GRanges(dat$chrm1, IRanges(dat$mid1 - round(dat$fraglength1/2), dat$mid1 + round(dat$fraglength1/2)), fragid = dat$fragid1)
        anchor_two <- GRanges(dat$chrm2, IRanges(dat$mid2 - round(dat$fraglength2/2), dat$mid2 + round(dat$fraglength2/2)), fragid = dat$fragid2)
        counts <- dat$N
        
    } else if (type == "homer") {
        dat <- .importHomer(fn)
        required_cols <- c("chr.1.", "start.1.", "end.1.", "chr.2.", "start.2.", "end.2.", "Interaction.Reads")
        .validateInput(dat, required_cols)
        anchor_one <- GRanges(dat$chr.1., IRanges(dat$start.1., dat$end.1.))
        anchor_two <- GRanges(dat$chr.2., IRanges(dat$start.2., dat$end.2.))
        counts <- as.integer(dat$Interaction.Reads)
        
        extra_cols <- names(dat)[!names(dat) %in% required_cols]
        
        em <- DataFrame(dat[, extra_cols])
    } else if (type == "bam") {
        dat <- .readBam(fn)
        anchor_one <- dat[[1]]
        anchor_two <- dat[[2]]
        counts <- as.integer(rep(1, length(anchor_one)))
        
    } else if (type == "two.bams") {
        dat <- .readTwoBams(fn)
        anchor_one <- dat[[1]]
        anchor_two <- dat[[2]]
        counts <- as.integer(rep(1, length(anchor_one)))
        
    } else {
        stop("type is not one of \"chiapet.tool\", \"chiapet.encode\", \"bed12\", \"bedpe\", \"hiclib\", \"homer\", \"bam\", \"two.bams\"")
    }
    
    if (!.isEqualSeqInfo(anchor_one, anchor_two)) {
        seqinfo_both <- merge(seqinfo(anchor_one), seqinfo(anchor_two))
        seqlevels(anchor_one) <- seqlevels(seqinfo_both)
        seqinfo(anchor_one) <- seqinfo_both
        seqlevels(anchor_two) <- seqlevels(seqinfo_both)
        seqinfo(anchor_two) <- seqinfo_both
    }
    
    if (is.null(em)) {
        em <- new("DataFrame", nrows = length(anchor_one))
    }
    
    giobject <- GenomicInteractions(anchor_one, anchor_two, counts = counts, metadata = list(experiment_name = experiment_name, description = description), 
        elementMetadata = em)
    names(mcols(giobject)) <- gsub("elementMetadata.", "", names(mcols(giobject)))
    
    return(giobject)
    
}


# import methods

#' Function to process names relating to interactions stored in bed12 formats.
#'
#' In bed(n) formats interaction data cannot be stored directly due to the inability of the format to handle
#' trans-interactions. It is only capable of storing cis-interactions through the use of blockStarts and blockEnds.
#' As such the information is typically stored as a string of the format chr1:start1..end1-chr2:start2..end2,N
#' (e.g. \'chr1:3268539..3269061-chr10:103124111..103124631,4\'.). This function takes a vector of these strings
#' and converts them to a data.frame.
#'
#' @param x a vector of names stored in the bed file
#'
#' @return a data.frame containing all of the processed information
#'
#' @importFrom stringr str_match
.processChiapetName <- function(x) {
    dat <- str_match(x, "(\\w+):(-?\\d+)\\.\\.(\\d+)-(\\w+):(-?\\d+)\\.\\.(\\d+),(\\d+)")[, 2:8]
    na_by_line <- apply(dat, 1, function(x) any(is.na(x)))
    if (any(na_by_line)) {
        stop(sprintf(paste("Not all names in bed12 file cannot be read in. Interactions should be stored in", "the \"names\" field in bed12 as \"chr:start..end-chr2:start2..end2\".", 
            "First Error: Line %i"), which(na_by_line)[1]))
    }
    dat <- as.data.frame(dat)
    colnames(dat) <- c("chrom.left.", "start.left.", "end.left.", "chrom.right.", "start.right.", "end.right.", "counts")
    dat$start.right. <- as.integer(as.character(dat$start.right.)) + 1L
    dat$start.left. <- as.integer(as.character(dat$start.left.)) + 1L
    dat$end.right. <- as.integer(as.character(dat$end.right.))
    dat$end.left. <- as.integer(as.character(dat$end.left.))
    dat$counts <- as.integer(as.character(dat$counts))
    return(dat)
}

#' Function to read in processed Hi-C interaction data generated by HOMER
#'
#' This function reads in the interaction data outputted by HOMER (http://homer.salk.edu/homer/interactions/).
#'
#' @param fn location of data exported by HOMER
#' @return a data frame containing the relevant information
#' @importFrom utils read.table
.importHomer <- function(fn) {
    HOMER_int.df <- read.table(fn, header = TRUE, stringsAsFactors = FALSE, sep = "\t")
    # convert to closed format, already 1-based
    HOMER_int.df$end.1. <- HOMER_int.df$end.1. - 1
    HOMER_int.df$end.2. <- HOMER_int.df$end.2. - 1
    return(HOMER_int.df)
}


#' Function to read in processed Hi-C interaction data generated by hiclib
#'
#' This function reads in the interaction data processed by [hiclib](https://bitbucket.org/mirnylab/hiclib). The data is extracted from hiclib and
#' exported from its internal HDF5 format to text using h5dictToTxt.py. This function reads in the relevant files present in the
#' directory and generates a data.table containing all of the information. It assumes that there are files with the following
#' names in the directory: fragids1, chrms1, mids1, fraglens1, fragids2, chrms2, mids2, fraglens2, distances.
#'
#' @param dir directory containing the output files generated by hiclib, extracted using h5dictToTxt.py
#' @param chr_names ordered chromosome names (for renaming)
#' @return data.table containing information on the individual interactions
#'
#'
.importHicLib <- function(dir, chr_names) {
    frags <- data.table(fragid1 = fread(paste0(dir, "fragids1"))$V1, chrm1 = .process.chr(fread(paste0(dir, "chrms1"))$V1, chr_names), 
        mid1 = fread(paste0(dir, "mids1"))$V1, fraglength1 = fread(paste0(dir, "fraglens1"))$V1, fragid2 = fread(paste0(dir, "fragids2"))$V1, 
        chrm2 = .process.chr(fread(paste0(dir, "chrms2"))$V1, chr_names), mid2 = fread(paste0(dir, "mids2"))$V1, fraglength2 = fread(paste0(dir, 
            "fraglens2"))$V1, distances = fread(paste0(dir, "distances"))$V1, stringsAsFactors = FALSE)
    
    frags_agg <- frags[, .N, by = names(frags)]
    
    return(frags_agg)
}

.process.chr <- function(chrs, chr_names) {
    if (is.null(chr_names)) {
        warning("No chromosome names supplied for hiclib import: non-numeric chromosomes will not be renamed")
        chrs <- as.numeric(chrs) + 1
        return(paste0("chr", chrs))
    } else {
        chrs <- as.numeric(chrs) + 1
        chrs <- ifelse(chr_names[chrs] %in% c("chrY"), "Y", chrs)
        chrs <- ifelse(chr_names[chrs] %in% c("chrX"), "X", chrs)
        return(paste0("chr", chrs))
    }
}

#' Function to read in interaction-data stored in a BAM file
#'
#' Reads in interactions stored in a BAM file. Assumes that each interaction is represented by pair of PETs
#' with the same qname. The function reads in and determines which anchor is which by examining the
#' isFirstMateRead and isSecondMateRead fields in the BAM file.
#'
#' @param fn name of BAM file containing interaction information
#' @return list of GRanges - storing the anchor information for each interaction
#'
#' @importFrom Rsamtools scanBamFlag ScanBamParam scanBam
#' @import GenomicRanges
#'
.readBam <- function(fn) {
    bf <- scanBamFlag(isPaired = TRUE, isDuplicate = FALSE)
    param <- ScanBamParam(flag = bf, what = c("rname", "qname", "strand", "pos", "seq", "cigar", "flag"))
    b <- scanBam(fn, param = param)
    
    y <- GRanges(b[[1]]$rname, IRanges(b[[1]]$pos, width = width(b[[1]]$seq)), strand = b[[1]]$strand, qname = b[[1]]$qname, bamFlagAsBitMatrix(b[[1]][["flag"]], 
        bitnames = "isFirstMateRead"), bamFlagAsBitMatrix(b[[1]][["flag"]], bitnames = "isSecondMateRead"))
    
    y1 <- y[y$isFirstMateRead == 1]
    y2 <- y[y$isSecondMateRead == 1]
    
    y1 <- y1[y1$qname %in% unique(y1$qname)]
    y2 <- y2[y2$qname %in% unique(y2$qname)]
    
    y1 <- y1[order(y1$qname)]
    y2 <- y2[order(y2$qname)]
    y1$isFirstMateRead <- NULL
    y1$isSecondMateRead <- NULL
    y2$isSecondMateRead <- NULL
    y2$isFirstMateRead <- NULL
    return(list(y1, y2))
    
}

#' Function to read in interaction-data stored in a pair of BAM files
#'
#' Reads in interactions stored in a a pair of BAM files, e.g. from independent
#' alignment of paired-end reads. Assumes that each interaction is represented
#' by pair of PETs with the same qname (this may not always be true. Depending
#' on data origin read qnames may end in '/1' or '/2' to denote first or second
#' read in the pair). The function reads in files, removes unpaired reads, and
#' pairs reads based on macthing qnames.
#'
#' @param fn Character vector of two BAM files with aligned reads.
#' @return list of two GRanges, storing the anchor information for each interaction
#'
#' @importFrom Rsamtools scanBamFlag ScanBamParam scanBam
#' @import GenomicRanges
#'
.readTwoBams <- function(fn) {
    
    if (length(fn) != 2) {
        stop("Must supply two bam files for this import method")
    }
    
    bf <- scanBamFlag(isUnmappedQuery = FALSE)
    param <- ScanBamParam(flag = bf, what = c("rname", "qname", "strand", "pos", "seq"))
    
    message("Reading first bam file...")
    b1 <- scanBam(fn[1], param = param)
    g1 <- GRanges(as.character(b1[[1]]$rname), IRanges(b1[[1]]$pos, width = width(b1[[1]]$seq)), strand = as.character(b1[[1]]$strand), 
        qname = b1[[1]]$qname)
    rm(b1)
    
    message("Reading second bam file...")
    b2 <- scanBam(fn[2], param = param)
    g2 <- GRanges(as.character(b2[[1]]$rname), IRanges(b2[[1]]$pos, width = width(b2[[1]]$seq)), strand = as.character(b2[[1]]$strand), 
        qname = b2[[1]]$qname)
    rm(b2)
    
    message("Removing unpaired reads...")
    g1 <- g1[g1$qname %in% g2$qname]
    g2 <- g2[g2$qname %in% g1$qname]
    
    message("Pairing reads...")
    g1 <- g1[order(g1$qname)]
    g2 <- g2[order(g2$qname)]
    
    return(list(g1, g2))
}

#' Function to validate tabular input
#'
#' Check for columns in a data.frame.
#'
#' @param x A data frame.
#' @param h A character vector containing expected headings
#' @return list of two GRanges, storing the anchor information for each interaction
#'
.validateInput <- function(x, h) {
    h_in_x <- h %in% colnames(x)
    if (!all(h_in_x)) {
        missing <- paste(h[!h_in_x], collapse = ", ")
        stop(paste("Input file does not contain columns", missing))
    }
    invisible(0)
}

Try the GenomicInteractions package in your browser

Any scripts or data that you put into this service are public.

GenomicInteractions documentation built on Nov. 8, 2020, 8:19 p.m.