popcorn: File-Format Wrangler for Population-Based Genetic Data

Documented in .fix_sex read_beagle_ibd read_bed read_coverage read_fam read_het_plink read_homozyg_plink read_map read_roh

# io.R

#' Encode sex as integer
#' 
#' @param x vector of sex labels, encoded in any sensible way
#' @return integer vector of sex codes: 0=uknown, 1=male, 2=female
#' @details Think of the result as the count of X chromosomes.
#' 
#' @export
.fix_sex <- function(x) {
	
	if (is.character(x) || is.factor(x))
		ifelse(grepl("^[fF]", x), 2L, ifelse(grepl("^[mM]", x), 1L, 0L))
	else
		return(as.integer(x))
	
}

#' Read a PLINK family (*.fam) file
#' 
#' @param ff path to a *.fam file
#' @return dataframe of sample info
#' @details A *.fam file should have columns family ID, individual ID, mom ID, dad ID, sex, phenotype.  PLINK uses
#'   special value -9 to denote missing phenotype, for some reason, while 0=missing for mom and dad ID.
#' 
#' @export
read_fam <- function(ff, ...) {
	
	fam <- tryCatch({
		readr::read_tsv(ff, col_names = FALSE)
	},
	error = function(e) {
		tibble::as_tibble(read.table(ff, header = FALSE, stringsAsFactors = FALSE))
	})
	
	cols <- c("fid","iid","mom","dad","sex","pheno")
	nc <- min(length(cols), ncol(fam))
	colnames(fam)[ 1:nc ] <- cols[1:nc]
	if ("mom" %in% colnames(fam)) {
		fam$mom <- as.character(fam$mom)
	}
	if ("dad" %in% colnames(fam)) {
		fam$dad <- as.character(fam$dad)
	}
	if ("sex" %in% colnames(fam)) {
		fam$sex <- .fix_sex(fam$sex)
	}
	return(fam)
	
}

#' Read a PLINK marker map (*.bim) file
#' 
#' @param ff path to a *.bim file
#' @return dataframe of marker info
#' @details A *.bim file should have columns chromosome, marker ID, genetic position (cM), physical position,
#'   allele 1 (A1=alternate allele), allele 2 (A2=reference allele).
#' 
#' @export
read_map <- function(ff, ...) {
	
	df <- tibble::as_tibble(read.table(ff, header = FALSE, stringsAsFactors = FALSE))
	cols <- c("chr","marker","cM","pos","A1","A2")
	nc <- min(length(cols), ncol(df))
	colnames(df)[ 1:nc ] <- cols[1:nc]
	df$chr <- mouser::factor_chrom(df$chr)
	return(df)
	
}

#' @export
write_fam <- function(ff, fname, ...) {
	
	fam <- tibble::as_tibble(ff)
	cols <- c("fid","iid","mom","dad","sex","pheno")
	nc <- min(length(cols), ncol(fam))
	colnames(fam)[ 1:nc ] <- cols[1:nc]
	if ("mom" %in% colnames(fam)) {
		fam$mom <- as.character(fam$mom)
	}
	if ("dad" %in% colnames(fam)) {
		fam$dad <- as.character(fam$dad)
	}
	if ("sex" %in% colnames(fam)) {
		fam$sex <- .fix_sex(fam$sex)
	}
	readr::write_tsv(ff, fname, col_names = FALSE)
	
}

#' @export
update_fam_file <- function(ff, fam, ...) {
	
	if (!inherits(ff, "data.frame"))
		oldfam <- read_fam(ff)
	else
		oldfam <- ff
	ii <- match(oldfam$iid, fam$iid)
	nona <- !is.na(ii)
	newfam <- oldfam
	newfam$fid[nona] <- as.character(fam$fid[ii[nona]])
	newfam$sex[nona] <- .fix_sex(fam$sex[ii[nona]])
	return(newfam)
	
}

#' @export
read_missingness_plink <- function(ff, ...) {
	
	df <- read.table(ff, header = TRUE, stringsAsFactors = FALSE)
	colnames(df) <- c("chr","marker","nmiss","nind","fmiss")
	df <- tibble::as_tibble(df)
	#print(head(df))
	df$ntyped <- with(df, nind-nmiss)
	df$chr <- factor_chrom(df$chr)
	return(df)
	
}

#' @export
read_freq_plink <- function(ff, ...) {
	
	df <- read.table(ff, header = TRUE, stringsAsFactors = FALSE)
	colnames(df) <- c("chr","marker","A1","A2","freq","nobs")
	df <- tibble::as_tibble(df)
	#print(head(df))
	df$chr <- factor_chrom(df$chr)
	return(df)
	
}

#' @export
read_freq_plink2 <- function(ff, force_chroms = TRUE, ...) {
	
	df <- readr::read_tsv(ff, col_types = "ccccdi")
	colnames(df) <- c("chr","marker","ref","alt","freq","nobs")
	
	if (force_chroms) {
		df$chr <- mouser::factor_chrom(df$chr)
	}
	return(df)
	
}

#' @export
read_het_plink2 <- function(ff, ...) {
	
	#FID    IID     O(HOM)  E(HOM)  OBS_CT  F
	df <- readr::read_tsv(ff, col_types = c("ccidid"))
	colnames(df) <- c("fid","iid","hom_obs","hom_expect","nobs","f_hat")
	return(df)
	
}

#' @export
read_missing_plink2 <- function(ff, what = c("sample","variant"), force_chroms = TRUE, ...) {
	
	what <- match.arg(what)
	
	if (what == "sample") {
		df <- readr::read_tsv(ff, col_types = "cciid")
		colnames(df) <- c("fid","iid","nmiss","nobs","fmiss")
	}
	else if (what == "variant") {
		df <- readr::read_tsv(ff, col_types = "cciid")
		colnames(df) <- c("chr","marker","nmiss","nobs","fmiss")
		if (force_chroms) {
			df$chr <- mouser::factor_chrom(df$chr)
		}
	}
	
	return(df)
	
}

#' @export
read_geno_summary_plink2 <- function(ff, ...) {
	
	df <- readr::read_tsv(ff, col_types = c("cciiiiiiiiii"))
	colnames(df) <- c("fid","iid","n_homref","n_homalt","n_het",
					  "n_diploid_ti","n_diploid_tv","n_diploid_other","n_diploid_singleton",
					  "n_haploid_ref","n_haploid_alt","n_miss")
	return(df)
	
}

#' Read ROH report from PLINK
#' @export
read_homozyg_plink <- function(ff, ...) {
	
	df <- read.table(ff, header = TRUE, stringsAsFactors = FALSE)
	colnames(df) <- c("fid","iid","phe","chr","left","right","start","end","width.kb","nsites","dens","phom","phet")
	df <- tibble::as_tibble(df)
	df <- df[ ,c("iid","chr","left","start","right","end","nsites","dens","phom","phet") ]
	df$width <- with(df, end-start)
	df$chr <- factor_chrom(df$chr)
	return(df)
	
}

#' Read estimated inbreeding report from PLINK
#' @export
read_het_plink <- function(ff, ...) {
	
	df <- read.table(ff, header = TRUE, stringsAsFactors = FALSE)
	colnames(df) <- c("fid","iid","obs","expect","nhet","fhat")
	df <- tibble::as_tibble(df)
	return(df)
	
}

#' Read BEAGLE-format IBD segments
#' 
#' @export
read_beagle_ibd <- function(ff, map = NULL, expand = FALSE, as.ranges = FALSE, ...) {
	
	ibd <- readr::read_tsv(ff, col_names = FALSE)[ ,1:8 ]
	colnames(ibd) <- c("iid1","p1","iid2","p2","chr","start","end","lod")
	ibd$chr <- mouser::factor_chrom(ibd$chr)
	map <- subset(map, !is.na(cM) & cM > 0)
	dups <- duplicated(map[ ,c("chr","pos") ])
	map <- map[ !dups, ]
	
	if (!is.null(map)) {
		start.cM <- dplyr::left_join(ibd, map[ ,c("chr","cM","pos") ], by = c("chr" = "chr", "start" = "pos"))
		end.cM <- dplyr::left_join(ibd, map[ ,c("chr","cM","pos") ], by = c("chr" = "chr", "end" = "pos"))
		ibd$start.cM <- start.cM$cM
		ibd$end.cM <- end.cM$cM
		ibd$width.cM <- with(ibd, end.cM - start.cM)
	}
	
	if (expand) {
		ibd2 <- ibd
		tmp <- ibd2$iid1
		ibd2$iid1 <- ibd2$iid2
		ibd2$iid2 <- tmp
		ibd <- dplyr::bind_rows(ibd, ibd2)
	}
	
	if (as.ranges) {
		ibd.gr <- GenomicRanges::makeGRangesFromDataFrame(ibd, keep.extra.columns = TRUE)
		return(ibd.gr)
	} else {
		return(ibd)
	}
	
}

#' @export
read_refinedIBD <- function(ff, expand = FALSE, row_ids = TRUE, ...) {
	
	ibd <- readr::read_tsv(ff, col_names = c("iid1","p1","iid2","p2","chr","start","end","lod","cM"))
	ibd$.row <- seq_len(nrow(ibd))
	if (expand) {
		ibd2 <- ibd
		tmp <- ibd2$iid1
		ibd2$iid1 <- ibd2$iid2
		ibd2$iid2 <- tmp
		ibd <- dplyr::bind_rows(ibd, ibd2)
	}
	
	return(ibd)
	
}

#' @export
read_ibdne_result <- function(ff, ...) {

	ne <- readr::read_tsv(ff, ...)
	colnames(ne) <- c("gen","Ne","lo","hi")[ seq_len(ncol(ne)) ]
	ne$gen <- as.integer(gsub("^G","", ne$gen))
	return(ne)
	
}

#' Read `bcftools roh` output (mode `-Or``)
#' 
#' @export
read_roh <- function(ff, force.chroms = TRUE, ...) {
	
	roh <- readr::read_tsv(ff, col_names = c("flag","iid","chr","start","end","width","nsites","qual"), comment = "#")
	roh$flag <- NULL
	if (force.chroms)
		roh$chr <- mouser::factor_chrom(roh$chr)
	return(roh)
	
}

# Read BLAT *.psl file
#'
#' @export
read_blat_psl <- function(ff, with.header = FALSE, force.chroms = TRUE, ...) {
	
	nskip <- if (with.header) 3 else 0
	df <- readr::read_tsv(ff, skip = nskip,
						  col_names = c("matches","mismatches","rep.matches","ns","qinserts","qbpins",
						  			  "tinserts","tbpins","strand","qname","qsize","qstart","qend",
						  			  "tname","tsize","tstart","tend","nblocks","blen","qstarts","tstarts"),
						  col_types = "iiiiiiiicciiiciiiiccc")
	
	if (force.chroms) {
		df$chr <- mouser::factor_chrom(df$tname)
		df$start <- df$tstart
		df$end <- df$tend
	}
	
	return(df)
		
}

# Write genomic intervals in BED format
#' @export
write_bed <- function(x, ff = "/dev/stdout", extra = TRUE, is.zero.based = FALSE, ...) {
	
	if (inherits(x, "GRanges")) {
		x <- as.data.frame(x)
		colnames(x)[1] <- "chr"
		x$start <- pmax(x$start - 1L, 1L)
	}
	cn <- colnames(x)
	if ("seqnames" %in% cn) {
		cn[ cn == "seqnames" ] <- "chr"
		colnames(x) <- cn
	}
	
	if (!is.zero.based) {
		x$start <- pmax(x$start - 1L, 1L)
	}
	
	cols <- c("chr","start","end","name","score","strand")
	others <- cn[ !(cn %in% cols) ]
	keep.cols <- cols[ sort(which(cols %in% cn)) ]
	if (extra)
		write.cols <- c(keep.cols, others)
	else
		write.cols <- keep.cols
	write.table(x[ ,write.cols ], ff, 
				col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
	
}

#' Read genomic intervals in BED format
#' @export
read_bed <- function(ff, ..., force.chroms = TRUE) {
	
	df <- readr::read_tsv(ff, col_names = FALSE, comment = "#", ...)
	idx <- 1:min(ncol(df), 6)
	colnames(df)[ idx ] <- c("chr","start","end","name","score","strand")[ idx ]
	if (force.chroms)
		df$chr <- mouser::factor_chrom(df$chr)
	return(df)
	
}

#' Read output from my \code{coverage.py} script
#' 
#' @param path name of file to read
#' @param scrub if \code{TRUE}, remove trailing suffixes (\code{*.sorted}, \code{*.merged} etc) from individual IDs
#' @param as.ranges if \code{TRUE}, convert to a \code{GRanges} object
#' 
#' @details Expected input is a BED-like file with columns *chr*, *start*, *end*, *iid*, *raw read count*,
#' 	*normalized read count* and optionally *count of MQ0 reads*, *proportion of total reads which are MQ0*.
#' 
#' @return a dataframe
read_coverage <- function(path, scrub = TRUE, as.ranges = FALSE, has.MQ0 = FALSE, ...) {
	
	counts <- tryCatch({
		readr::read_tsv(path, col_names = FALSE, comment = "#")
	},
	error = function(e) {
		read.table(path, header = FALSE, comment.char = "#")
	})
	colnames(counts)[1:6] <- c("chr","start","end","iid","nreads","depth")
	if (has.MQ0)
		colnames(counts)[7:8] <- c("MQ0","MQ0.prop")
	
	if (scrub)
		counts$iid <- gsub("\\.\\w+$", "", counts$iid)
	
	if (as.ranges)
		counts <- GenomicRanges::makeGRangesFromDataFrame(counts, starts.in.df.are.0based = TRUE,
														  keep.extra.columns = TRUE)
	
	attr(counts, "filename") <- normalizePath(path)
	return(counts)
	
}

#' @export
read_psmc_result <- function(ff, niter = 25, mu = 1e-8, s = 100, ...) {
	
	cmd <- paste0("awk '/RD.", niter, "/{f=1}/\\/\\//{f=0}f&&/RS/{print $0;}' ", ff)
	rez <- readr::read_tsv(pipe(cmd), col_names = c("what","k","t_k","lambda_k","pi_k","sum_A_kl","A_kk"))
	maxk <- max(rez$k)
	nboot <- sum(rez$k == maxk)
	rez$repno <- floor(seq_len(nrow(rez))/maxk)
	
	cmd <- paste0("awk 'BEGIN{OFS=\"\\t\";}/RD.", niter, "/{f=1}/\\/\\//{f=0}f&&/TR/{print $2,$3}' ", ff)
	pis <- readr::read_tsv(pipe(cmd), col_names = c("theta_0","rho_0"))
	pis$repno <- seq_len(nrow(pis))-1

	rez <- dplyr::left_join(rez, pis) %>%
		dplyr::mutate(Nk = (lambda_k*theta_0/s)/(4*mu))
	return(rez)
	
}

andrewparkermorgan/popcorn documentation built on July 8, 2023, 12:42 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

andrewparkermorgan/popcorn
File-Format Wrangler for Population-Based Genetic Data

R/io.R
In andrewparkermorgan/popcorn: File-Format Wrangler for Population-Based Genetic Data

Documented in .fix_sex read_beagle_ibd read_bed read_coverage read_fam read_het_plink read_homozyg_plink read_map read_roh

R Package Documentation

Browse R Packages

We want your feedback!

andrewparkermorgan/popcorn File-Format Wrangler for Population-Based Genetic Data

R/io.R In andrewparkermorgan/popcorn: File-Format Wrangler for Population-Based Genetic Data

Documented in .fix_sex read_beagle_ibd read_bed read_coverage read_fam read_het_plink read_homozyg_plink read_map read_roh

R Package Documentation

Browse R Packages

We want your feedback!

andrewparkermorgan/popcorn
File-Format Wrangler for Population-Based Genetic Data

R/io.R
In andrewparkermorgan/popcorn: File-Format Wrangler for Population-Based Genetic Data