Nothing
#' importPED
#'
#' Import a PED/MAP file pair
#'
#' This function is to a large extend taken from \code{snpStat::read.pedmap}, but here is internally the \code{data.table::fread} function used
#' that resulted in much faster file processing.
#'
#' To import the data, the ped file can be provided to the \code{file} option and the map file to the \code{snps} option. If no option is given to
#' \code{snps} and the \code{file} option is provided without any file extension, then the ped/map extension are automaticall added
#'
#' @param file ped filename
#' @param n Number of samples to read
#' @param snps map filename
#' @param which Names of SNPS to import
#' @param split Columns separator in ped file
#' @param sep Character that separates Alleles
#' @param na.strings Definition for missing values
#' @param lex.order Logical, lexicographical order
#' @param verbose Logical, verbose output
#'
#' @return a pedmap object
#'
#' @author Daniel Fischer
#'
#' @examples
#'
#' # Define here the location on HDD for the example file
#' pedPath <- system.file("extdata","example.ped", package="GenomicTools.fileHandler")
#' mapPath <- system.file("extdata","example.map", package="GenomicTools.fileHandler")
#' # Import the example ped/map files
#' importPED(file=pedPath, snps=mapPath)
#'
#'
#' @export
importPED <- function (file, n, snps=NULL, which, split = "\t| +", sep = ".", na.strings = "0",
lex.order = FALSE, verbose=TRUE) {
# Easy file definition
baseNameOnly <- TRUE
if(substrRight(file,3)=="ped") baseNameOnly <- FALSE
if(is.null(snps)){
if(baseNameOnly){
snps <- paste(file,"map",sep=".")
file <- paste(file,"ped",sep=".")
if(verbose){
cat("Assume the following files:\n")
if(file.exists(file)){
cat("file:",file,"\n")
} else {
stop("Please check the input for 'file'-option, cannot find",file)
}
if(file.exists(snps)){
cat("file:",snps,"\n")
} else {
stop("Please check the input for 'snps'-option, cannot find",snps)
}
}
} else {
stop("No SNP file provided. Please provide either the basename of the ped/map file to file or ped to file and map to snps.")
}
}
# This file is taken to a large extend from the snpStat package
# The difference, however, is the use of fread that speeds up the function substantially.
r0 <- as.raw(0)
r1 <- as.raw(1)
r2 <- as.raw(2)
r3 <- as.raw(3)
con <- gzfile(file)
open(con)
if (missing(n)) {
n <- 0
repeat {
line <- readLines(con, n = 1)
if (length(line) == 0)
break
n <- n + 1
}
if (n == 0)
stop("Nothing read")
seek(con, 0)
}
gen <- missing(snps)
map <- NULL
if (!gen) {
m <- length(snps)
if (m == 1) {
map <- read.table(snps, comment.char = "")
m <- nrow(map)
if (missing(which)) {
which <- 1
repeat {
snps <- map[, which]
if (!any(duplicated(snps)))
break
if (which == ncol(map))
stop("No unambiguous snp names found on file")
which <- which + 1
}
}
else {
snps <- map[, which]
}
}
}
else {
line <- readLines(con, n = 1)
fields <- strsplit(line, split)[[1]]
m <- (length(fields) - 6)/2
if (m%%2 != 0)
stop("Odd number of fields")
seek(con, 0)
}
nf <- 6 + 2 * m
result <- matrix(raw(n * m), nrow = n)
ped <- character(n)
mem <- character(n)
pa <- character(n)
ma <- character(n)
sex <- numeric(n)
aff <- numeric(n)
rownms <- character(n)
a1 <- a2 <- rep(NA, m)
a1m <- a2m <- rep(TRUE, m)
mallelic <- rep(FALSE, m)
lines <- fread(file, sep= "?", header = FALSE)[[1L]]
for (i in 1:n) {
#line <- readLines(con, n = 1)
line <- lines[i]
fields <- strsplit(line, "\t| +")[[1]]
to.na <- fields %in% na.strings
fields[to.na] <- NA
ped[i] <- fields[1]
mem[i] <- fields[2]
pa[i] <- fields[3]
ma[i] <- fields[4]
sex[i] <- as.numeric(fields[5])
aff[i] <- as.numeric(fields[6])
alleles <- matrix(fields[7:nf], byrow = TRUE, ncol = 2)
one <- two <- rep(FALSE, m)
for (k in 1:2) {
ak <- alleles[, k]
akm <- is.na(ak)
br1 <- !akm & a1m
a1[br1] <- ak[br1]
a1m[br1] <- FALSE
br2 <- !akm & (a1 == ak)
one[br2] <- TRUE
br3 <- !akm & !a1m & (a1 != ak)
br4 <- br3 & a2m
a2[br4] <- ak[br4]
a2m[br4] <- FALSE
br5 <- br3 & (a2 == ak)
two[br5] <- TRUE
mallelic <- mallelic | !(akm | one | two)
}
gt <- rep(r0, m)
gt[one & !two] <- r1
gt[one & two] <- r2
gt[two & !one] <- r3
result[i, ] <- gt
}
close(con)
if (any(a1m))
if(verbose) warning("no data for ", sum(a1m), " loci")
mono <- (a2m & !a1m)
if (any(mono))
if(verbose) warning(sum(mono), " loci were monomorphic")
if (any(mallelic)) {
result[, mallelic] <- r0
if(verbose) warning(sum(mallelic), " loci were multi-allelic --- set to NA")
}
if (gen)
snps <- paste("locus", 1:m, sep = sep)
if (any(duplicated(ped))) {
if (any(duplicated(mem))) {
rnames <- paste(ped, mem, sep = sep)
if (any(duplicated(rnames)))
stop("could not create unique subject identifiers")
}
else rnames <- mem
}
else rnames <- ped
dimnames(result) <- list(rnames, snps)
# result <- new("SnpMatrix", result)
result <- data.table(result)
if (lex.order) {
swa <- (!(is.na(a1) | is.na(a2)) & (a1 > a2))
switch.alleles(result, swa)
a1n <- a1
a1n[swa] <- a2[swa]
a2[swa] <- a1[swa]
a1 <- a1n
}
fam <- data.frame(row.names = rnames, pedigree = ped, member = mem,
father = pa, mother = ma, sex = sex, affected = aff)
if (is.null(map))
map <- data.frame(row.names = snps, snp.names = snps,
allele.1 = a1, allele.2 = a2)
else {
map$allele.1 <- a1
map$allele.2 <- a2
names(map)[which] <- "snp.names"
}
rownames(result) <- fam$member
meta <- list(monomorph=sum(mono), multiallelic=sum(mallelic), missing=sum(a1m), pedFile=file, mapFile=snps)
result <- list(genotypes = result, fam = fam, map = map, meta=meta)
class(result) <- "pedMap"
result
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.