Nothing
read.snpdata.mach <- function(fileroot, tol.af = 0.01, phenotypes = NULL, isuffix = ".mlinfo", dsuffix = ".mldose") {
if (substr(isuffix, nchar(isuffix) - 2, nchar(isuffix)) == ".gz") {
mlinfo <- read.table(gzfile(paste(fileroot, isuffix, sep = "")), header = TRUE, colClasses = "character")
} else {
mlinfo <- read.table(paste(fileroot, isuffix, sep = ""), header = TRUE, colClasses = "character")
}
## read info as characters to prevent small .mlinfo files with Al1 or Al2 all "T" alleles being coerced to logical
names(mlinfo) <- sub("^AL", "Al", names(mlinfo)) # some old files have AL1,AL2
stopifnot(all(c("SNP", "Al1", "Al2", "Freq1") %in% names(mlinfo)))
for (colname in intersect(c("Freq1", "MAF", "Quality", "Rsq"), names(mlinfo))) mlinfo[[colname]] <- as.double(mlinfo[[colname]]) # if present, coerce these columns to double
## we could use something like try(as.numeric(... to make this more general
if (substr(dsuffix, nchar(dsuffix) - 2, nchar(dsuffix)) == ".gz") {
mldose <- read.table(gzfile(paste(fileroot, dsuffix, sep = "")), header = FALSE, col.names = c("MACHID", "DATATYPE", paste(mlinfo$SNP, mlinfo$Al1, sep = "_")), as.is = TRUE)
} else {
mldose <- read.table(paste(fileroot, dsuffix, sep = ""), header = FALSE, col.names = c("MACHID", "DATATYPE", paste(mlinfo$SNP, mlinfo$Al1, sep = "_")), as.is = TRUE)
}
if (nrow(mlinfo) > 0 && any(abs(apply(mldose[ , -2:-1, drop = FALSE], 2, mean)/2 - mlinfo$Freq1) > tol.af)) stop("mlinfo/mldose allele frequency mismatch") # check mean dose corresponds to Freq1 from mlinfo file; we allow empty mlinfo/mldose files
## we should check for MACH-style names with "->" separator
mldose$FID <- sapply(mldose$MACHID, function(ii) unlist(strsplit(ii, "->"))[1])
mldose$IID <- sapply(mldose$MACHID, function(ii) unlist(strsplit(ii, "->"))[2]) # assumes two part names
snpinfo <- subset(mlinfo, select = c("SNP", "Al1", "Al2", "Freq1"))
names(snpinfo) <- c("snp", "coded.allele", "noncoded.allele", "coded.freq")
## note Al1 is the coded.allele in the sense that mldose contains does of Al1
## however mach2qtl inverts effect directions
snpinfo <- cbind(snpinfo, subset(mlinfo, select = setdiff(names(mlinfo), c("SNP", "Al1", "Al2", "Freq1")))) # keep any other columns, user may have to coerce additional mini-mac columns back to double
if (!is.null(phenotypes)) {
stopifnot(is.data.frame(phenotypes))
stopifnot("MACHID" %in% names(phenotypes))
mldose <- cbind(mldose, subset(phenotypes, select = setdiff(names(phenotypes), "MACHID"))[match(mldose$MACHID, phenotypes$MACHID), ])
}
snpdata <- list(snpinfo = snpinfo, data = mldose)
class(snpdata) <- "snpdata"
return(snpdata)
}
read.snpdata.minimac <- function(fileroot, tol.af = 0.01, phenotypes = NULL, isuffix = ".info.gz", dsuffix = ".dose.gz") {
return(read.snpdata.mach(fileroot, tol.af, phenotypes, isuffix, dsuffix))
}
read.snpdata.plink <- function(fileroot, tol.af = 0.01, phenotypes = NULL) {
frq <- read.table(paste(fileroot, "frq", sep = "."), header = TRUE, colClasses = "character") # read as characters to prevent small .frq files with A1 or A2 all "T" alleles being coerced to logical
stopifnot(all(c("SNP", "A1", "A2", "MAF") %in% names(frq)))
for (colname in intersect(c("MAF", "NCHROBS"), names(frq))) frq[[colname]] <- as.double(frq[[colname]]) # if present, coerce these columns to double
raw <- read.table(paste(fileroot, "raw", sep = "."), header = TRUE, as.is = TRUE, check.names = FALSE) # do not check.names since metabochip names like "chr1:11717263" will be broken
stopifnot(all(paste(frq$SNP, frq$A1, sep = "_") == names(raw)[-6:-1])) # assume 6 columns before genotypes
if (nrow(frq) > 0 && any(abs(frq$MAF - apply(raw[ , -6:-1], 2, mean, na.rm = TRUE)/2) > tol.af)) stop("frq/raw allele frequency mismatch") # check mean dose corresponds to MAF from frq file
if (ncol(raw) > 6) for (colid in 7:ncol(raw)) raw[is.na(raw[ , colid]), colid] <- mean(raw[ , colid], na.rm = TRUE) # replace NA genotypes with column mean
snpinfo <- subset(frq, select = c("SNP", "A1", "A2", "MAF"))
names(snpinfo) <- c("snp", "coded.allele", "noncoded.allele", "coded.freq")
snpinfo <- cbind(snpinfo, subset(frq, select = setdiff(names(frq), c("SNP", "A1", "A2", "MAF")))) # keep any other columns
if (!is.null(phenotypes)) {
stopifnot(is.data.frame(phenotypes))
stopifnot(all(c("FID", "IID") %in% names(phenotypes)))
raw <- cbind(raw, subset(phenotypes, select = setdiff(names(phenotypes), c("FID", "IID")))[match(paste(raw$FID, raw$IID, sep = "->"), paste(phenotypes$FID, phenotypes$IID, sep = "->")), ])
}
snpdata <- list(snpinfo = snpinfo, data = raw)
class(snpdata) <- "snpdata"
return(snpdata)
}
read.snpdata.impute <- function(samplefile, genofile, phenotypes = NULL) {
## hack to work around 0 0 0 P ... line
samples <- read.table(samplefile, skip = 2, header = FALSE, as.is = TRUE,
col.names = names(read.table(samplefile, header = TRUE, nrows = 0)))
nsamples <- nrow(samples)
## hope we have right number of colClasses
gts <- read.table(genofile, header = FALSE,
colClasses = c(rep("character", 5), rep("numeric", nrow(samples)*3)))
stopifnot(ncol(gts) == 5 + nsamples*3)
nsnps <- nrow(gts)
snpinfo <- gts[ , 1:5]
names(snpinfo) <- c("chr", "snp", "pos", "noncoded.allele", "coded.allele")
i0 <- seq(from = 1, by = 3, length.out = nsamples)
i1 <- seq(from = 2, by = 3, length.out = nsamples)
i2 <- seq(from = 3, by = 3, length.out = nsamples)
for (snpidx in 1:nsnps) {
pvec <- t(gts[snpidx, 6:ncol(gts)]) # this traverses memory with large stride so only do once
p0 <- pvec[i0, ]
p1 <- pvec[i1, ]
p2 <- pvec[i2, ]
psum <- p0+p1+p2
dose <- ifelse(psum > 0, (p1+2*p2)/psum, NA)
mdose <- mean(dose, na.rm = TRUE)
snpinfo$coded.freq[snpidx] <- mdose/2
dose[is.na(dose)] <- mdose
samples[[paste(snpinfo$snp[snpidx], snpinfo$coded.allele[snpidx], sep = "_")]] <- dose
}
# check on dosages on [0,2]?
if (!is.null(phenotypes)) {
stopifnot(is.data.frame(phenotypes))
stopifnot(all(c("FID", "IID") %in% names(phenotypes)))
samples <- cbind(samples, subset(phenotypes, select = setdiff(names(phenotypes), c("FID", "IID")))[match(paste(samples[ , 1], samples[ , 2], sep = "->"), paste(phenotypes$FID, phenotypes$IID, sep = "->")), ])
}
snpdata <- list(snpinfo = snpinfo, data = samples)
class(snpdata) <- "snpdata"
return(snpdata)
}
allelesAB <- function(A1, A2, sep = "/") return(ifelse(A1 < A2, paste(A1, A2, sep = sep), paste(A2, A1, sep = sep)))
align.snpdata.coding <- function(params, snpdata, ploidy = 2, missing.snp = "fail") {
stopifnot(is.data.frame(params))
stopifnot(all(c("snp", "coded.allele", "noncoded.allele") %in% names(params)))
## snpdata is list with snpdata$snpinfo and snpdata$data returned by read.snpdata.plink or read.snpdata.mldose
stopifnot(all(c("snp", "coded.allele", "noncoded.allele", "coded.freq") %in% names(snpdata$snpinfo)))
if (missing.snp == "okay") {
snpinfo <- data.frame(snp = setdiff(params$snp, snpdata$snpinfo$snp))
if (nrow(snpinfo) > 0) {
snpinfo$coded.allele <- params$coded.allele[match(snpinfo$snp, params$snp)]
snpinfo$noncoded.allele <- params$noncoded.allele[match(snpinfo$snp, params$snp)]
snpinfo$coded.freq <- 0
for (extracol in setdiff(names(snpdata$snpinfo), names(snpinfo))) snpinfo[[extracol]] <- NA
snpdata$snpinfo <- rbind(snpdata$snpinfo, snpinfo)
for (codeB in paste(snpinfo$snp, snpinfo$coded.allele, sep = "_")) snpdata$data[[codeB]] <- 0
}
}
stopifnot(all(params$snp %in% snpdata$snpinfo$snp))
### non-robust workaround to deal with PLINK calling coded allele "0"
for (idx in which(snpdata$snpinfo$snp %in% params$snp & snpdata$snpinfo$coded.allele == "0")) {
jdx <- match(snpdata$snpinfo$snp[idx], params$snp)
if (snpdata$snpinfo$noncoded.allele[idx] == params$coded.allele[jdx]) {
names(snpdata$data) <- sub(paste("^", snpdata$snpinfo$snp[idx], "_", snpdata$snpinfo$coded.allele[idx], "$", sep = ""), paste(params$snp[jdx], params$noncoded.allele[jdx], sep = "_"), names(snpdata$data))
snpdata$snpinfo$coded.allele[idx] <- params$noncoded.allele[jdx]
} else if (snpdata$snpinfo$noncoded.allele[idx] == params$noncoded.allele[jdx]) {
names(snpdata$data) <- sub(paste("^", snpdata$snpinfo$snp[idx], "_", snpdata$snpinfo$coded.allele[idx], "$", sep = ""), paste(params$snp[jdx], params$coded.allele[jdx], sep = "_"), names(snpdata$data))
snpdata$snpinfo$coded.allele[idx] <- params$coded.allele[jdx]
} else {
stop(paste("cannot match alleles for", snpdata$snpinfo$snp[idx]))
}
}
stopifnot(all(allelesAB(params$coded.allele, params$noncoded.allele) == allelesAB(snpdata$snpinfo$coded.allele, snpdata$snpinfo$noncoded.allele)[match(params$snp, snpdata$snpinfo$snp)])) # check alleles match even if inverted
params$codeB <- paste(params$snp, params$coded.allele, sep = "_")
params$codeA <- paste(params$snp, params$noncoded.allele, sep = "_")
## the next section recodes columsn of snpdata$data to coded allele of params if necessary
uset <- unique(subset(params, select = c("codeB", "codeA")))
for (idx in which(!uset$codeB %in% names(snpdata$data))) {
if (uset$codeA[idx] %in% names(snpdata$data)) {
snpdata$data[[uset$codeB[idx]]] <- ploidy - snpdata$data[[uset$codeA[idx]]]
} else {
stop("cannot find dose for ", uset$codeB[idx], "/", uset$codeA[idx])
}
}
stopifnot(all(params$codeB %in% names(snpdata$data)))
rm(uset)
for (colid in which(names(snpdata$data) %in% params$codeB)) {
if (any(is.na(snpdata$data[ , colid]))) stop("missing dosages for", names(snpdata$data)[colid])
if (any(snpdata$data[ , colid] < 0 | snpdata$data[ , colid] > ploidy)) stop("dosages outside [0,ploidy] for", names(snpdata$data)[colid])
}
params$data.coded.freq <- apply(subset(snpdata$data, select = params$codeB), 2, mean)/ploidy
## do some kind of check...
return(list(params = params, snpdata = snpdata))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.