library(BiocStyle)
library(knitr)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)

Obtaining the read sequences

Libraries are downloaded from the NCBI GEO data series GSE57392, using the SRA accessions listed below. One file is available for each library, i.e., no technical replicates. SRA files are unpacked to yield FASTQ files with the raw read sequences, using the fastq-dump utility from the SRA Toolkit. Note that the SRA files themselves are saved in the local home directory and need to be cleared out.

sra.numbers <- c("SRR1274188", "SRR1274189", "SRR1274190", "SRR1274191")
all.fastq <- paste0(sra.numbers, ".fastq.gz")
for (i in seq_along(sra.numbers)) {
    # Skipping download if the file already exists.
    if (file.exists(all.fastq[i])) { next } 

    # Downloading from NCBI.
    cmd <- "fastq-dump --gzip --skip-technical --dumpbase --clip"
    code <- system(paste(cmd, sra.numbers[i]))
    stopifnot(code==0L)

    # Cleaning out the cache to reduce disk usage.
    extras <- list.files("~/ncbi/public/sra", 
        pattern=sprintf("^%s\\.", sra.numbers[i]), full=TRUE)
    unlink(extras)
}

Reads are aligned to the mm10 genome using r Biocpkg("Rsubread"). This assumes that an index has already been constructed with the prefix index/mm10, see the buildindex() function for details. The type parameter is set to optimize for genomic alignment, rather than alignment to the transcriptome. We only consider uniquely mapped reads to avoid ambiguity due to paralogous regions or repeats.

library(Rsubread)
bam.files <- paste0(sra.numbers, ".bam")
align(index="index/mm10", readfile1=all.fastq, type='dna', unique=TRUE,
    input_format="gzFASTQ", output_file=bam.files)

Post-processing of the BAM files

Alignments in each BAM file are sorted by coordinate.

library(Rsamtools)
for (bam in bam.files) {
    out <- suppressWarnings(sortBam(bam, "h3k27me3_temp"))
    file.rename(out, bam)
}

Potential PCR duplicates are marked using the MarkDuplicates tool from the Picard software suite. MarkDuplicates uses BAM index files if they're available. We don't want it using old indices, so we delete them beforehand if any are present.

indices <- paste0(bam.files, ".bai")
exist.indices <- file.exists(indices)
if (any(exist.indices)) { unlink(indices[exist.indices]) }

# Marking duplicates.
temp.bam <- "h3k27me3_temp.bam"
temp.file <- "h3k27me3_metric.txt"
temp.dir <- "h3k27me3_working"
dir.create(temp.dir)
for (bam in bam.files) {
    out <- suppressWarnings(sortBam(bam, "h3k27me3_temp"))
    file.rename(out, bam)
    code <- system(sprintf("MarkDuplicates I=%s O=%s M=%s \\
        TMP_DIR=%s AS=true REMOVE_DUPLICATES=false \\
        VALIDATION_STRINGENCY=SILENT",
        bam, temp.bam, temp.file, temp.dir))
    stopifnot(code==0L)
    file.rename(temp.bam, bam)
}

Finally, we create indices for all of the BAM files.

indexBam(bam.files)

Wrapping up

We delete all of the unnecessary files that were generated during this procedure.

unlink(all.fastq)
unlink(temp.dir, recursive=TRUE)
unlink(temp.file)    

We also show the session information.

sessionInfo()


LTLA/chipseqDBData documentation built on July 3, 2021, 9:09 a.m.