R/read.vcf.R

# The bedr package is copyright (c) 2014 Ontario Institute for Cancer Research (OICR)
# This package and its accompanying libraries is free software; you can redistribute it and/or modify it under the terms of the GPL
# (either version 1, or at your option, any later version) or the Artistic License 2.0.  Refer to LICENSE for the full license text.
# OICR makes no representations whatsoever as to the SOFTWARE contained herein.  It is experimental in nature and is provided WITHOUT
# WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE OR ANY OTHER WARRANTY, EXPRESS OR IMPLIED. OICR MAKES NO REPRESENTATION
# OR WARRANTY THAT THE USE OF THIS SOFTWARE WILL NOT INFRINGE ANY PATENT OR OTHER PROPRIETARY RIGHT.
# By downloading this SOFTWARE, your Institution hereby indemnifies OICR against any loss, claim, damage or liability, of whatsoever kind or
# nature, which may arise from your Institution's respective use, handling or storage of the SOFTWARE.
# If publications result from research using this SOFTWARE, we ask that the Ontario Institute for Cancer Research be acknowledged and/or
# credit be given to OICR scientists, as scientifically appropriate.

read.vcf <- function(x, split.info = FALSE, split.samples = FALSE, nrows = -1, verbose = TRUE) {
	
	catv("READING VCF\n");

	# check file exists
	catv(" * checking if file exists... ");
	if (!file.exists(x)) {
		catv("FAIL\n")
		stop();
		}
	else {
		catv("PASS\n")
		}

	# read header
	catv(" * Reading vcf header...\n")
	con <- file(x);
	open(con);
	no.body <- FALSE;
	x.header <- NULL;
	x.colnames <- NULL;


	# uncompress, fread doesn't natively read compressed, booh.
	if (grepl(".gz$", x)) {
		x.basename <- basename(x);
		x.tmp      <- tempfile(pattern = paste(x.basename, "_", sep = ""));
		# system(paste0("gunzip -c ", x, " > ", x.tmp));
		R.utils::gunzip(filename = x, destname = x.tmp, overwrite = TRUE, remove = FALSE);
		x <- x.tmp;
		}


	# loop over header 
	while (TRUE) {
		x.header.tmp <- readLines(con, n = 1);
		if (0 == length(x.header.tmp)) {
			# no more lines to read in, there is no body content
			no.body <- TRUE;
			break;
			}
		else if (!grepl("^#", x.header.tmp)) {
			# line does not start with the comment character,
			# so assume have read past the header 
			break;
			}
		else if (grepl("^#[Cc]", x.header.tmp)) {
			# parse the column names
			x.colnames    <- unlist(strsplit(x.header.tmp, split = "\t| +"));
			x.colnames[1] <- gsub("^#", "", x.colnames[1])
			}
		else {
			x.header <- c(x.header, x.header.tmp)
			}
		}
	
	close(con);
	catv("   Done\n");

	if (is.null(x.header)) {
		catv(" * No header detected!  Seriously???  Parsing of INFO and FORMAT fields is disabled.  Please fix. \n");
		header.length <- 0;
		}
	else {
		header.length <- length(x.header);
		}

    colnames.in.file <- !is.null(x.colnames);
	if (!colnames.in.file) {
		catv(" * No column names detected!  Now that's just lazy. Please fix for sanity purposes.\n");
			
		# assume the 8 colnames
		x.colnames <- c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO");
		}
		
	if (no.body) {
		# create an empty data frame for the body since 
		# no body content was found 
		x.df <- data.frame(matrix(nrow = 0, ncol = length(x.colnames)));
		colnames(x.df) <- x.colnames;

		# since there's no body, there's no info or sample content to split
		split.info <- FALSE;
		split.samples <- FALSE;
	} else {	
		# read data.  fread is much faster than read.table but imports into a data.table
		catv(" * Reading vcf body...\n")

		# default colClasses
		colClasses <- c("character","integer", "character","character","character","numeric","character", "character");

		# add columns for additional samples
		if (length(x.colnames) > 8) {
			colClasses <- c(colClasses, rep("character", length(x.colnames)-8));
			}

		# check the number of columns match the column names
		# (read past the header and column names, if present)
		x.firstline <- data.table::fread(x, header = FALSE, sep = "\t", skip = header.length + colnames.in.file, nrows = 1);
		if (length(x.firstline) != length(x.colnames)) {
			catv(" * There looks to be more columns than column names.  Please fix!\n");
			if ( length(x.firstline) > length(x.colnames) ) {
				colClasses <- c(colClasses, rep("NULL", length(x.firstline)-length(x.colnames)) );
				}
			}

		# read in content after the header and column names (if present)
		x.df <- data.table::fread(x, header = FALSE, sep = "\t", stringsAsFactors = FALSE, na.strings = ".", colClasses = colClasses, skip = header.length + colnames.in.file, nrows = nrows);

		catv("   Done\n");

		# add colnames
		colnames(x.df)[1:length(x.colnames)] <- x.colnames;

		# get the updated (complete) list of column names
		x.colnames <- colnames(x.df); 
		}

	# add header as attribute
	catv(" * Parse vcf header...\n");
	attr(x.df, "header") <- x.header;

	# add parsed header as list in item
	x.header.parsed <- parse.vcf.header(x.header);
	x.df <- list(header = x.header.parsed, vcf = x.df);
	catv("   Done\n");

	# validate (i.e. tags)

	### SPLIT INFO
	
	if (split.info && !is.null(x.header)) {
		catv(" * Split info...\n");

		# split the info
		x.info.split <- mclapply(x.df$vcf$INFO, hash2vec, split.char = ";", fill.names = x.header.parsed$INFO[,"ID"]); # slow! consider mcapply

		# fill in the missing tabs
		x.info.split <- mclapply(x.info.split, fill.vector, fill.names = x.header.parsed$INFO[,"ID"]); # slow! consider mcapply

		# combine into matrix
		x.info.split <- as.data.frame(do.call(rbind, x.info.split), stringsAsFactors = FALSE);
		colnames(x.info.split) <- x.header.parsed$INFO[,"ID"];

		# apply the data types to the info
		for (i in 1:nrow(x.header.parsed$INFO)) {
	
			if (x.header.parsed$INFO[i,"Type"] %in% c("Integer","Float") & x.header.parsed$INFO[i,"Number"] == 1) {
				x.info.split[,i] <- tryCatch({as.numeric(x.info.split[,i])}, warning = function(w){catv(paste0(" * Problem encountered converting ", x.header.parsed$INFO[i,1], " to numeric, skipping...\n")); x.info.split[,i]});
				}
			else if (x.header.parsed$INFO[i,"Type"] == "Flag") {
				x.info.split[,i] <- !is.na(x.info.split[,i]);
				}
			}

		# merge the split info with the rest of the vcf
		if (length(x.df$vcf) == 8) {
			x.df$vcf <- data.frame(x.df$vcf[,1:7, with = FALSE], x.info.split);
			}
		else {
			x.df$vcf <- data.frame(x.df$vcf[,1:7, with = FALSE], x.info.split, x.df$vcf[,9:length(x.df$vcf), with = FALSE]);
			}
		catv(" * Done\n");
		}

	### SPLIT SAMPLES 
	n.samples     <- length(x.colnames)-9; # original colnames before adding info

	if (split.samples && n.samples > 0 && !is.null(x.header)) {
	
		catv(" * Split samples...\n");
	
		# a helper function to split a single sample
		split.vcf.sample <- function(x, format.string) {
			x[x == "./."] <- paste(rep(".", length(format.string)), collapse = ":"); # yuck
			x.sample.split   <- as.data.frame(matrix(hash2vec(x, split.char = ":"), ncol = length(format.string), byrow = TRUE),  stringsAsFactors = FALSE);
			colnames(x.sample.split) <- format.string;
		
			x.sample.split;
			}

		# gather some information
		format.string <- unlist(strsplit(x.df$vcf$FORMAT[1], split = ":"));

		# if one sample add to main vcf table; if multiple then create a list of GxI matrices i.e. GT, DP, GQ...
		if (n.samples == 1) {
			x.sample.split            <- split.vcf.sample(x.df$vcf[[ncol(x.df$vcf)]], format.string);
			x.df$vcf$FORMAT           <- NULL;
			x.df$vcf[,ncol(x.df$vcf)] <- NULL;
			x.df$vcf <- data.frame(x.df$vcf, x.sample.split);
			}
		else {
			# split each sample
			x.sample.split <- mclapply(x.df$vcf[,(ncol(x.df$vcf)-n.samples+1):ncol(x.df$vcf),with = FALSE], split.vcf.sample,  format.string); # consider mcapply
			x.df$vcf[,(ncol(x.df$vcf)-n.samples+1):ncol(x.df$vcf)] <- NULL;
			
			# loop over each tag and merge
			for (i in 1:length(format.string)) {
				x.df[[format.string[i]]] <- do.call("cbind", mclapply(x.sample.split, "[[", format.string[i])); # consider mcapply
				
				# convert "." to NA
				x.df[[format.string[i]]][x.df[[format.string[i]]] == "."] <- NA
				
				# change data type if possible
				if (x.df$header$FORMAT[x.df$header$FORMAT[,"ID"] == format.string[i],"Type"] %in% c("Integer","Float") & x.df$header$FORMAT[x.df$header$FORMAT[,"ID"] == format.string[i],"Number"] == 1) {
					x.df[[format.string[i]]] <- as.integer(x.df[[format.string[i]]]);
					}
			
				}
				
			}

		catv("   Done\n")
		}

	# stats (snp/indel, pass, ti/tv)

	attr(x.df, "vcf") <- TRUE;

	return(x.df)
	}

###################################################################################################

parse.vcf.header <- function(x) {
### return the header as a list of key values 

	if (is.null(x)) {return(NULL)}

	x.header.parsed <- list();
	x.header.keys   <- list();
	x.header.values <- list();

	# loop over each line
	for (i in 1:length(x)) {
		x.header.key <- x.header.value <- NULL;

		# add quotes for description if missing. yuck.
		if (grepl("Description=",x[i]) && grepl("\">$",(x[i]))) {
			x[i] <- gsub('(Description=)([^\"])', '\\1\"\\2', x[i], perl = TRUE);
			x[i] <- gsub('([^\"])(>$)', '\\1\"\\2', x[i], perl = TRUE);
			}

		# parse primary key values
		x.header.key    <- gsub("^##(.*?)=(<?.*?>?)$", "\\1", x[i], perl = TRUE);
		x.header.value  <- gsub("^##(.*?)=(<?.*?>?)$", "\\2", x[i], perl = TRUE);
		
		# parse secondeary key values only if text in tag <.*>
		if (grepl("^<.*>$", x.header.value)) {
			x.header.value  <- gsub("^<(.*)>$", "\\1", x.header.value, perl = TRUE);
			x.header.value  <- hash2vec(x.header.value);
			}

		# add key values to list
		x.header.keys[[i]]   <- x.header.key;
		x.header.values[[i]] <- x.header.value;

		}

	# is duplicated function
	is.duplicated <- function(x) {duplicated(x) | duplicated(x, fromLast = TRUE)}

	# add unique keys
	unique.keys <- unlist(x.header.keys[!is.duplicated(x.header.keys)]);
	for (i in 1:length(unique.keys)) {
		x.header.values.tmp <- unlist(x.header.values[x.header.keys == unique.keys[i]]);
		x.header.parsed[[unique.keys[i]]] <- x.header.values.tmp;
		}

	# group common keys tables
	repeated.keys <- unique(unlist(x.header.keys[duplicated(x.header.keys)]));

	if (!is.null(repeated.keys)) {

		for (i in 1:length(repeated.keys)) {
			x.header.values.tmp <- x.header.values[x.header.keys == repeated.keys[i]];
			x.header.parsed[[repeated.keys[i]]] <- do.call("rbind", x.header.values.tmp)
			}

		}

	return(x.header.parsed)
	}


###################################################################################################

# hash.split
hash2vec <- function(x, assign.char = "=", split.char = ",", fill.names = NULL) {
### convert key value string to named vector
	
	# split by delimiter
	x <- scan(text = x, what="character", quiet = TRUE, sep = split.char);

	# split key/value
	x.keys <-   sub("^(.*?)=(.*)$", "\\1", x, perl = TRUE) ;
	x.values <- sub("^(.*?)=(.*)$", "\\2", x, perl = TRUE) ;

	# if no hash make key empty
	if (all(x.values == x.keys)) {
		x <- x.values;
		}
	else {
		x <- setNames(x.values, x.keys);
		}

#	# fill missing keys
#	if (!is.null(fill.names)) {
#		x <- fill.vector(x, fill.names);
#		}

	return(x);
	}

fill.vector <- function(x, fill.names) {
### add values to a vector if missing and order.  good for ragged rbinds.
	x.fill <- NULL;

	x <- x[fill.names]

#	for (i in 1:length(fill.names)) {
#		if (is.null(x[fill.names[i]])) {
#			x.fill[fill.names[i]] <- NA;
#			}
#		else {
#			x.fill[fill.names[i]] <- x[fill.names[i]];
#			}
#		}

	return(x);
	}

Try the bedr package in your browser

Any scripts or data that you put into this service are public.

bedr documentation built on May 2, 2019, 11:36 a.m.