Using-bedr.R
In bedr: Genomic Region Processing using Tools Such as 'BEDTools', 'BEDOPS' and 'Tabix'

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
# load bedr library
library("bedr");

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------

if (check.binary("bedtools")) {

	# get example regions
	index <- get.example.regions();
	a <- index[[1]];
	b <- index[[2]];

	# region validation
	is.a.valid  <- is.valid.region(a);
	is.b.valid  <- is.valid.region(b);
	a <- a[is.a.valid];
	b <- b[is.b.valid];
		
	# print
	cat(" REGION a: ", a, "\n");
	cat(" REGION b: ", b, "\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# check if already sorted
	is.sorted <- is.sorted.region(a);

	# sort lexographically
	a.sort <- bedr.sort.region(a);

	# sort naturally
	a.sort.natural <- bedr.sort.region(a, method = "natural");

	# sort - explicit call using primary API function bedr()
	b.sort <- bedr(
		engine = "bedtools", 
		input = list(i = b), 
		method = "sort", 
		params = ""
		);

	# print
	cat(" REGION a: ", a.sort, "\n");
	cat(" REGION b: ", a.sort.natural, "\n");
	cat(" REGION c: ", b.sort, "\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# check if already merged (non-overlapping regions)
	is.merged <- is.merged.region(a.sort);
	is.merged <- is.merged.region(b.sort);

	# merge
	a.merge <- bedr.merge.region(a.sort);

	# merge - explicit call using primary API function bedr()
	b.merge <- bedr(
		engine = "bedtools", 
		input = list(i = b.sort), 
		method = "merge", 
		params = ""
		);

	# print
	cat(" REGION a: ", a.merge, "\n");
	cat(" REGION b: ", b.merge, "\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# subtract
	a.sub1 <- bedr.subtract.region(a.merge, b.merge);

	# subtract - explicit call using primary API function bedr()
	a.sub2 <- bedr(
		input = list(a = a.merge, b = b.merge), 
		method = "subtract", 
		params = "-A"
		);

	# print
	cat(" REGION a - sub1: ", a.sub1, "\n");
	cat(" REGION a - sub2: ", a.sub2, "\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# check if present in a region
	is.region <- in.region(a.merge, b.merge);

	# or alternatively R-like in command
	is.region <- a.merge %in.region% b.merge

	# print
	cat(" is.region: ", is.region, "\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# intersect / join
	a.int1 <- bedr.join.region(a.merge, b.merge);
	a.int2 <- bedr(
		input = list(a = a.sort, b = b.sort), 
		method = "intersect", 
		params = "-loj -sorted"
		);

	# multiple join
	d <- get.random.regions(15, chr="chr1", sort = TRUE);
	a.mult <- bedr.join.multiple.region(
		x = list(a.merge, b.merge, bedr.sort.region(d))
		);

	# print
	cat(" REGION a intersect: \n"); print(a.int1); cat("\n");
	cat(" REGION multi (a,b,c) intersect: \n"); print(a.mult); cat("\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# compare a and b set of sequences
	jaccard.stats <- jaccard(a.sort, b.sort);
	reldist.stats <- reldist(a.sort, b.sort);

	# print
	cat(" JACCARD a & b: \n"); print(jaccard.stats); cat("\n");
	cat(" RELDIST a & b: \n"); print(reldist.stats); cat("\n");

	# even better way to run both jaccard and reldist, as well as estimate P value through random permutations
	jaccard.reldist.stats <- test.region.similarity(a.sort, b.sort, n = 40);

	cat(" JACCARD/RELDIST a & b: \n"); print(jaccard.reldist.stats); cat("\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# read example regions file
	regions.file <- system.file("extdata/example-a-region.bed", package = "bedr");
	a <- read.table(regions.file, header = FALSE, stringsAsFactors = FALSE);
	colnames(a) <- c("a.CHROM", "a.START", "a.END", "Score");

	# sort
	a <- bedr.sort.region(a);

	# group by (on first three columns) the score in column 4. Concatenate scores
	a.collapsed <- bedr(
		input = list(i = a), 
		method = "groupby", 
		params = "-g 1,2,3 -c 4 -o collapse"
		);

	# group by (on first three columns) the score in column 4. Compute mean
	a.mean <- bedr(
		input = list(i = a), 
		method = "groupby", 
		params = "-g 1,2,3 -c 4 -o mean"
		);

	# print
	cat(" REGION a groupby (collapsed): \n"); print(a.collapsed); cat("\n");
	cat(" REGION a groupby (mean): \n"); print(a.mean); cat("\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {
	VALID.SV.TYPES <- c('BND', 'CNV', 'DEL', 'DUP', 'INS', 'INV');
	POSITION.COLUMNS <- c('CHROM', 'POS', 'END');

	callerA.filename <- system.file("extdata/callerA.vcf.gz", package = "bedr");
	callerB.filename <- system.file("extdata/callerB.vcf.gz", package = "bedr");

	# read the VCF file
	callerA <- read.vcf(callerA.filename, split.info = TRUE)$vcf;
	callerB <- read.vcf(callerB.filename, split.info = TRUE)$vcf;

	# focus on SVs
	callerA <- callerA[which(callerA$SVTYPE %in% VALID.SV.TYPES), ];
	callerB <- callerB[which(callerB$SVTYPE %in% VALID.SV.TYPES), ];

	# convert to zero-based coordinates
	callerA$POS <- callerA$POS - 1;
	callerB$POS <- callerB$POS - 1;

	# find all overlapping pairs, retrieve size of overlap (bp)
	overlapping.pairs <- bedr.join.region(
		callerA[, POSITION.COLUMNS],
		callerB[, POSITION.COLUMNS],
		report.n.overlap = TRUE,
		check.chr = FALSE
		);
	colnames(overlapping.pairs) <- c(
		'a.CHROM', 'a.POS', 'a.END',
		'b.CHROM', 'b.POS', 'b.END',
		'Overlap'
	    );
	overlapping.pairs$b.POS <- as.numeric(overlapping.pairs$b.POS);
	overlapping.pairs$b.END <- as.numeric(overlapping.pairs$b.END);

	# compute a distance between overlapping pairs
	min.breakpoint.distances <- cbind(
	    overlapping.pairs$a.POS - overlapping.pairs$b.POS,
	    overlapping.pairs$a.END - overlapping.pairs$b.END
	    );

	min.breakpoint.distances <- apply(
	    abs(min.breakpoint.distances),
	    1,
	    min
	    );
	a.length <- overlapping.pairs$a.END - overlapping.pairs$a.POS;
	b.length <- overlapping.pairs$b.END - overlapping.pairs$b.POS;

	overlapping.pairs$distance  <- (min.breakpoint.distances + abs(a.length - b.length)) / 2;

	# print
	cat(" OVERLAPPING PAIRS: \n"); print(head(overlapping.pairs)); cat("\n");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# get Human RefSeq genes (Hg19) in BED format
	refseq.file <- system.file("extdata/ucsc.hg19.RefSeq.chr1-2.txt.gz", package = "bedr");
	refseq <- read.table(refseq.file, header = FALSE, stringsAsFactors = FALSE);

	# sort Refseq and remove chr prefix
	refseq.sorted <- bedr.sort.region(refseq[, 1:4]);
	colnames(refseq.sorted) <- c(POSITION.COLUMNS, "Gene");
	refseq.sorted$CHROM <- gsub("^chr", "", refseq.sorted$CHROM);

	# reuse the SV calls from workflow 1 and add gene identifiers to callerA
	callerA.annotated <- bedr.join.region(
		callerA[, POSITION.COLUMNS],
		refseq.sorted,
		report.n.overlap = TRUE,
		check.chr = FALSE
		);
	colnames(callerA.annotated) <- c(
        'a.CHROM', 'a.POS', 'a.END',
        'Gene.CHROM', 'Gene.POS', 'Gene.END', 'Gene', 'Overlap'
        );

	# reuse the SV calls from workflow 1 and add gene identifiers to callerB
	callerB.annotated <- bedr.join.region(
		callerB[, POSITION.COLUMNS],
		refseq.sorted,
		report.n.overlap = TRUE,
		check.chr = FALSE
		);
	colnames(callerB.annotated) <- c(
        'b.CHROM', 'b.POS', 'b.END',
        'Gene.CHROM', 'Gene.POS', 'Gene.END', 'Gene', 'Overlap'
        );

	# reinstate chr prefix to chromosome names
	callerA.annotated$a.CHROM <- paste('chr', callerA.annotated$a.CHROM, sep = "");
	callerB.annotated$b.CHROM <- paste('chr', callerB.annotated$b.CHROM, sep = "");

	# print
	cat(" CALLER A GENES (chr 1,2): \n"); print(head(callerA.annotated)); cat("\n");
	cat(" CALLER B GENES (chr 1,2): \n"); print(head(callerB.annotated)); cat("\n");
	}

## ----results = "hold", message = FALSE, warnings = FALSE, eval = TRUE, errors = TRUE----
options("warn" = -1);
if (check.binary("bedtools")) {

	# collapse column 7 (Gene) against unique composite key (column 1, 2 and 3)
	callerA.annotated.grouped <- bedr(
		input = list(i = callerA.annotated), 
		method = "groupby", 
		params = "-g 1,2,3 -c 7 -o collapse"
		);

	# collapse column 7 (Gene) against unique composite key (column 1, 2 and 3)
	callerB.annotated.grouped <- bedr(
		input = list(i = callerB.annotated), 
		method = "groupby", 
		params = "-g 1,2,3 -c 7 -o collapse"
		);

	# print
	cat(" CALLER A GENES (chr 1,2) GROUPED: \n"); print(head(callerA.annotated.grouped)); cat("\n");
	cat(" CALLER B GENES (chr 1,2) GROUPED: \n"); print(head(callerB.annotated.grouped)); cat("\n");
	}

## ----results = "hide", message = FALSE, eval = TRUE, errors = TRUE, fig.width = 7, fig.height = 7----
if (check.binary("bedtools")) {

	# merge and plot overlapping regions between the two callers with genes 
	# on chromosome 1 and 2
	callerA.merged <- callerA.annotated[, c('a.CHROM', 'a.POS', 'a.END')];
	callerB.merged <- callerB.annotated[, c('b.CHROM', 'b.POS', 'b.END')];
	callerA.merged <- bedr.merge.region(callerA.merged);
	callerB.merged <- bedr.merge.region(callerB.merged);

	# plot (sub-regions exclusive to a and b, and sub-regions in common)
	bedr.plot.region(
		input = list(
			a = callerA.merged, 
			b = callerB.merged
			),
		params = list(lty = 2, label.col = "black", main = "Genes Overlap"),
		feature = 'interval',
		verbose = FALSE
		);
	}

## ----results = "hide", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# read copy number segmented (regions) data
	cna.file <- system.file("extdata/CNA.segmented.txt.gz", package = "bedr");
	cna.data <- read.table(cna.file, header = TRUE, stringsAsFactors = FALSE);
	cna.data$Chromosome <- paste("chr", cna.data$Chromosome, sep = "");

	# check if regions are valid
	valid.segments <- is.valid.region(cna.data[, c("Chromosome", "Start", "End")]);
	cat(" VALID REGIONS: ", length(which(valid.segments) ==  TRUE), "/", length(valid.segments), "\n");

	# restrict to copy number regions with |log-ratio| > 0.10
	cna.data <- cna.data[which(abs(cna.data$Segment_Mean) > 0.10), ];

	# create a list data-structure for every patient's copy number data
	# and sort them
	cna.data.gain <- list();
	cna.data.loss <- list();
	pga.gain <- list();
	pga.loss <- list();
	sample.ids <- unique(cna.data$Sample);
	hg19.size <- 3137161264;
	for (sample.id in sample.ids) {

		# extract sample specific gains
		gain.index <- which(cna.data$Sample == sample.id & cna.data$Segment_Mean > 0);
		if (length(gain.index) > 0) {

			cna.data.gain[[sample.id]] <- cna.data[
				gain.index,
				2:4
				];

			# sort
			cna.data.gain[[sample.id]] <- bedr.sort.region(
				x = cna.data.gain[[sample.id]],
				method = "natural",
				verbose = FALSE
				);

			# estimate percent genome gained
			pga.gain[[sample.id]] <- sum(
				apply(
					cna.data.gain[[sample.id]],
					1,
					FUN = function(x) { return( (as.numeric(x[3]) - as.numeric(x[2])) ); }
					)
				);
			pga.gain[[sample.id]] <- pga.gain[[sample.id]]/hg19.size*100;
			}

		# extract sample specific losses
		loss.index <- which(cna.data$Sample == sample.id & cna.data$Segment_Mean < 0);
		if (length(loss.index) > 0) {

			cna.data.loss[[sample.id]] <- cna.data[
				loss.index,
				2:4
				];

			# sort
			cna.data.loss[[sample.id]] <- bedr.sort.region(
				x = cna.data.loss[[sample.id]],
				method = "natural",
				verbose = FALSE
				);

			# estimate percent genome loss
			pga.loss[[sample.id]] <- sum(
				apply(
					cna.data.loss[[sample.id]],
					1,
					FUN = function(x) { return( (as.numeric(x[3]) - as.numeric(x[2])) ); }
					)
				);
			pga.loss[[sample.id]] <- pga.loss[[sample.id]]/hg19.size*100;
			}
		}
	}

## ----results = "hide", message = FALSE, eval = TRUE, errors = TRUE, fig.width = 7, fig.height = 5----
if (check.binary("bedtools")) {

	# set graphics params
	par(mfrow = c(1, 2), las = 1);

	# plot histograms of Gain and Loss frequencies
	hist(unlist(pga.gain), xlab = "Percent Gain", main = "Percent Genome Altered (Gain)");
	hist(unlist(pga.loss), xlab = "Percent Loss", main = "Percent Genome Altered (Loss)");
	}

## ----results = "hold", message = TRUE, eval = TRUE, errors = TRUE-------------
if (check.binary("bedtools")) {

	# find minimal common regions (gain)
	mcr.gain <- bedr.join.multiple.region(
		x = cna.data.gain,
		species = "human",
		build = "hg19",
		check.valid = FALSE,
		check.sort = FALSE,
		check.merge = FALSE,
		verbose = FALSE
		);

	# find minimal common regions (loss)
	mcr.loss <- bedr.join.multiple.region(
		x = cna.data.loss,
		species = "human",
		build = "hg19",
		check.valid = FALSE,
		check.sort = FALSE,
		check.merge = FALSE,
		verbose = FALSE
		);

	# reorder by frequency of recurrence
	mcr.gain <- mcr.gain[order(as.numeric(mcr.gain$n.overlaps), decreasing = TRUE), ];
	mcr.loss <- mcr.loss[order(as.numeric(mcr.loss$n.overlaps), decreasing = TRUE), ];

	# print
	cat(" RECURRENT CNA GAINS \n"); print(head(mcr.gain[, 1:5])); cat("\n");
	cat(" RECURRENT CNA LOSSES \n"); print(head(mcr.loss[, 1:5])); cat("\n");
	}
Any scripts or data that you put into this service are public.
bedr documentation built on April 11, 2025, 5:52 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
bedr
Genomic Region Processing using Tools Such as 'BEDTools', 'BEDOPS' and 'Tabix'

inst/doc/Using-bedr.R
In bedr: Genomic Region Processing using Tools Such as 'BEDTools', 'BEDOPS' and 'Tabix'

Try the bedr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bedr Genomic Region Processing using Tools Such as 'BEDTools', 'BEDOPS' and 'Tabix'

inst/doc/Using-bedr.R In bedr: Genomic Region Processing using Tools Such as 'BEDTools', 'BEDOPS' and 'Tabix'

Try the bedr package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

bedr
Genomic Region Processing using Tools Such as 'BEDTools', 'BEDOPS' and 'Tabix'

inst/doc/Using-bedr.R
In bedr: Genomic Region Processing using Tools Such as 'BEDTools', 'BEDOPS' and 'Tabix'