######################################################################
#
# read.cross.mq.R
#
# copyright (c) 2014, INRA (author: Timothee Flutre)
# (Some revisions by Karl Broman)
# last modified Dec, 2018
# first written May, 2014
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 3, as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but without any warranty; without even the implied warranty of
# merchantability or fitness for a particular purpose. See the GNU
# General Public License, version 3, for more details.
#
# A copy of the GNU General Public License, version 3, is available
# at http://www.r-project.org/Licenses/GPL-3
#
# Part of the R/qtl package
# Contains: read.cross.mq, read.cross.mq.loc, read.cross.mq.map,
# read.cross.mq.qua, mq.rmv.comment
# [See read.cross.R for the main read.cross function.]
#
######################################################################
######################################################################
#
# read.cross.mq: read data from an experimental cross in MapQTL (and
# JoinMap) format.
#
# We need three files: a "loc" file containing the genotype data, a
# "map" file containing the linkage group assignments and map
# positions, and a "qua" file containing the phenotypes.
#
# File formats are described in the MapQTL manual available online at
# http://www.kyazma.nl/docs/MQ7Manual.pdf
# For the loc-file, each marker should be on a single line.
# Only 4-way crosses are supported ("CP" type in MapQTL/JoinMap).
#
######################################################################
read.cross.mq <-
function(dir, locfile, mapfile, quafile)
{
if(! missing(dir) && dir != "") {
if(length(grep("/$", dir)) > 0) # strip off ending /
dir <- substr(dir, 1, nchar(dir)-1)
locfile <- file.path(dir, locfile)
mapfile <- file.path(dir, mapfile)
quafile <- file.path(dir, quafile)
}
loc <- read.cross.mq.loc(locfile)
map <- read.cross.mq.map(mapfile)
pheno <- read.cross.mq.qua(quafile)
type <- loc$pop.type # only "4way" for the moment
n.ind <- nrow(loc$genotypes)
n.mar <- ncol(loc$genotypes)
n.phe <- ncol(pheno)
if(nrow(pheno) < n.ind){
msg <- paste("qua-file should have at least the same number of",
" individuals than loc-file")
stop(msg, call.=FALSE)
}
if(nrow(pheno) > n.ind){
pheno <- pheno[-((n.ind+1):nrow(pheno)),]
}
cat(" --Read the following data:\n")
cat("\tNumber of individuals: ", n.ind, "\n")
cat("\tNumber of markers: ", n.mar, "\n")
cat("\tNumber of phenotypes: ", n.phe, "\n")
geno <- list()
for(chr in levels(map$chr)){
geno[[chr]] <- list(data=loc$genotypes[,map$marker[map$chr == chr]],
map=map$pos[map$chr == chr])
names(geno[[chr]]$map) <- map$marker[map$chr == chr]
if(type == "4way")
geno[[chr]]$map <- rbind(geno[[chr]]$map,
map$pos[map$chr == chr])
if(chr %in% c("x", "X")){
class(geno[[chr]]) <- "X"
} else
class(geno[[chr]]) <- "A"
}
cross <- list(geno=geno, pheno=pheno)
class(cross) <- c(type, "cross")
list(cross, FALSE) # FALSE for "don't estimate map"
}
mq_grab_param <-
function(lines, param, longname, filetype)
{
# stuff for error message
if(missing(filetype)) filetype <- ""
else filetype <- paste(" in", filetype, "file")
if(missing(longname)) longname <- param
g <- grep(param, lines)
if(length(g) == 0)
stop("Cannot find ", longname, " in ", filetype)
# remove white space
line <- gsub("\\s+", "", lines[g])
result <- strsplit(line, "=")[[1]][2]
return(list(result, g)) # g is the line number
}
## each marker should be on a single line
## only 4-way crosses (CP type) are handled
read.cross.mq.loc <-
function(locfile){
pop.name <- NULL
pop.type <- NULL
nb.loci <- NULL
nb.inds <- NULL
seg <- NULL
phase <- NULL
classif <- NULL
genotypes <- NULL
lines <- readLines(locfile, warn=FALSE)
# drop comments
lines <- vapply(strsplit(lines, ";"), "[", "", 1)
# drop empty lines
lines <- lines[!is.na(lines)]
blank <- grep("^\\s*$", lines)
if(length(blank) > 0)
lines <- lines[-blank]
## extract the population name
res <- mq_grab_param(lines, "name", "population name", "loc")
pop.name <- res[[1]]
todrop <- res[[2]]
## extract the population type
res <- mq_grab_param(lines, "popt", "population type", "loc")
pop.type <- res[[1]]
todrop <- c(todrop, res[[2]])
pop.types <- c("BC1","F2","RIx","DH","DH1","DH2","HAP","HAP1","CP","BCpxFy",
"IMxFy")
if(! pop.type %in% pop.types){
msg <- paste("unknown population type", pop.type)
stop(msg, call.=FALSE)
}
if(pop.type == "CP"){
pop.type <- "4way"
} else{
msg <- paste("population type", pop.type,
"is not supported (yet)")
stop(msg, call.=FALSE)
}
## extract the number of loci
res <- mq_grab_param(lines, "nloc", "Number of loci", "loc")
nb.loci <- as.numeric(res[[1]])
todrop <- c(todrop, res[[2]])
seg <- rep(NA, nb.loci)
phase <- rep(NA, nb.loci)
classif <- rep(NA, nb.loci)
res <- mq_grab_param(lines, "nind", "Number of individuals", "loc")
nb.inds <- as.numeric(res[[1]])
todrop <- c(todrop, res[[2]])
genotypes <- matrix(NA, nrow=nb.loci, ncol=nb.inds)
rownames(genotypes) <- paste("loc", 1:nb.loci, sep=".")
## colnames(genotypes) <- paste("ind", 1:nb.inds, sep=".")
lines <- lines[-todrop]
spl <- strsplit(lines, "\\s+")
if(length(spl) != nb.loci)
stop("nloc=", nb.loci, " but genotypes are found at ", length(spl), " markers")
spl.lengths <- vapply(spl, length, 1)
if(any(spl.lengths > nb.inds+4))
stop("lines should have no more than ", nb.inds+4, " columns\n",
"Problems in lines", seq(along=spl.lengths)[spl.lengths > nb.inds+4])
rn <- vapply(spl, "[", "", 1)
if(nrow(genotypes) != length(rn))
stop(paste0("nloc = ", nrow(genotypes), ", but .loc file contains ", length(rn), " genotype rows"))
rownames(genotypes) <- rn
if(length(spl) > nb.loci + 1){
msg <- paste("there seems to be more loci (", locus.id-1,
") than indicated in the header (", nb.loci, ")")
stop(msg, call.=FALSE)
}
nb.fields <- rep(NA, length(lines))
for(line.id in 1:length(lines)){
tokens <- spl[[line.id]]
nb.fields[line.id] <- length(tokens)
if(length(tokens) > nb.inds + 1){
for(i in 2:(length(tokens)-nb.inds)){
if(length(grep(pattern="\\{", x=tokens[i])) > 0){
phase[line.id] <- tokens[i]
} else if(length(grep(pattern="\\(", x=tokens[i])) > 0){
classif[line.id] <- tokens[i]
} else
seg[line.id] <- tokens[i] # only for "4way" type
}
}
genotypes[line.id,] <- tokens[(length(tokens)-nb.inds+1):length(tokens)]
}
if(length(unique(nb.fields)) > 1)
stop("some markers have more fields than others")
genotypes <- t(genotypes) # individuals in rows, markers in columns
## convert all missing data to NA
for(i in 1:length(genotypes)){
if(grepl(pattern="\\.", x=genotypes[i]))
genotypes[i] <- gsub(pattern="\\.", replacement="-",
x=genotypes[i])
if(grepl(pattern="u", x=genotypes[i]))
genotypes[i] <- gsub(pattern="u", replacement="-",
x=genotypes[i])
}
genotypes[genotypes == "--"] <- NA
## convert segregation types and genotypes to new MapQtl format
convert.seg <- FALSE
new.seg.types <- c("<abxcd>", "<efxeg>", "<hkxhk>", "<lmxll>", "<nnxnp>")
for(seg.type in unique(seg))
if(! seg.type %in% new.seg.types){ # e.g. <abxac>
convert.seg <- TRUE
break
}
if(convert.seg){
if(pop.type != "4way"){
msg <- paste("can't convert genotypes from old to new format",
"for 4way cross (yet)")
stop(msg, call.=FALSE)
}
for(locus.id in 1:nb.loci){
if(seg[locus.id] %in% new.seg.types){
next
} else if(seg[locus.id] == "<abxac>"){
seg[locus.id] <- "<efxeg>"
tmp <- which(genotypes[,locus.id] == "aa")
genotypes[tmp, locus.id] <- "ee"
tmp <- which(genotypes[,locus.id] == "ab")
genotypes[tmp, locus.id] <- "ef"
tmp <- which(genotypes[,locus.id] == "ac")
genotypes[tmp, locus.id] <- "eg"
tmp <- which(genotypes[,locus.id] == "bc")
genotypes[tmp, locus.id] <- "fg"
} else if(seg[locus.id] == "<abxab>"){
seg[locus.id] <- "<hkxhk>"
tmp <- which(genotypes[,locus.id] == "aa")
genotypes[tmp, locus.id] <- "hh"
tmp <- which(genotypes[,locus.id] == "ab")
genotypes[tmp, locus.id] <- "hk"
tmp <- which(genotypes[,locus.id] == "bb")
genotypes[tmp, locus.id] <- "kk"
tmp <- which(genotypes[,locus.id] == "a-")
genotypes[tmp, locus.id] <- "h-"
tmp <- which(genotypes[,locus.id] == "b-")
genotypes[tmp, locus.id] <- "k-"
} else if(seg[locus.id] == "<abxaa>"){
seg[locus.id] <- "<lmxll>"
tmp <- which(genotypes[,locus.id] == "aa")
genotypes[tmp, locus.id] <- "ll"
tmp <- which(genotypes[,locus.id] == "ab")
genotypes[tmp, locus.id] <- "lm"
} else if(seg[locus.id] == "<aaxab>"){
seg[locus.id] <- "<nnxnp>"
tmp <- which(genotypes[,locus.id] == "aa")
genotypes[tmp, locus.id] <- "nn"
tmp <- which(genotypes[,locus.id] == "ab")
genotypes[tmp, locus.id] <- "np"
} else{
msg <- paste("unrecognized segregation type", seg[locus.id],
"at locus", locus.id)
stop(msg, call.=FALSE)
}
}
}
## check genotypes
proper.genotypes <- c(NA, "ac", "ca", "ad", "da", "bc", "cb", "bd", "db",
"ee", "ef", "fe", "eg", "ge", "fg", "gf",
"hh", "hk", "kh", "kk", "h-", "k-",
"ll", "lm", "ml",
"nn", "np", "pn")
for(genotype in unique(genotypes))
if(! genotype %in% proper.genotypes){
msg <- paste("unrecognized genotype", genotype)
stop(msg, call.=FALSE)
}
## replace all "ca" by "ac", etc -> speed-up next step?
## TODO
## convert genotypes to R/qtl code (mother=AB x father=CD)
for(locus.id in 1:nb.loci){
if(phase[locus.id] == "{0-}"){
if(seg[locus.id] == "<lmxll>"){ # AB=lm ; CD=ll
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] == "ll"){ # AC or AD
genotypes[ind.id,locus.id] <- 5
} else if(genotypes[ind.id,locus.id] %in% c("lm","ml")){ # BC or BD
genotypes[ind.id,locus.id] <- 6
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id], "(should be <lmxll>)")
stop(msg, call.=FALSE)
}
} else if(phase[locus.id] == "{1-}"){
if(seg[locus.id] == "<lmxll>"){ # AB=ml ; CD=ll
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] == "ll"){ # BC or BD
genotypes[ind.id,locus.id] <- 6
} else if(genotypes[ind.id,locus.id] %in% c("lm","ml")){ # AC or AD
genotypes[ind.id,locus.id] <- 5
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id], "(should be <lmxll>)")
stop(msg, call.=FALSE)
}
} else if(phase[locus.id] == "{-0}"){
if(seg[locus.id] == "<nnxnp>"){ # AB=nn ; CD=np
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] == "nn"){ # AC or BC
genotypes[ind.id,locus.id] <- 7
} else if(genotypes[ind.id,locus.id] %in% c("np","pn")){ # AD or BD
genotypes[ind.id,locus.id] <- 8
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id], "(should be <nnxnp>)")
stop(msg, call.=FALSE)
}
} else if(phase[locus.id] == "{-1}"){
if(seg[locus.id] == "<nnxnp>"){ # AB=nn ; CD=pn
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] == "nn"){ # AD or BD
genotypes[ind.id,locus.id] <- 8
} else if(genotypes[ind.id,locus.id] %in% c("np","pn")){ # AC or BC
genotypes[ind.id,locus.id] <- 7
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id], "(should be <nnxnp>)")
stop(msg, call.=FALSE)
}
} else if(phase[locus.id] == "{00}"){
if(seg[locus.id] == "<abxcd>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("ac","ca")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("ad","da")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("bc","cb")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("bd","db")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<efxeg>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("ee")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("eg","ge")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("fe","ef")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("fg","gf")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<hkxhk>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("hh")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("hk","kh")){ # AD or BC
genotypes[ind.id,locus.id] <- 10
} else if(genotypes[ind.id,locus.id] %in% c("kk")){ # BD
genotypes[ind.id,locus.id] <- 4
} else if(genotypes[ind.id,locus.id] %in% c("h-","-h")){ # not BD
genotypes[ind.id,locus.id] <- 14
} else if(genotypes[ind.id,locus.id] %in% c("k-","-k")){ # not AC
genotypes[ind.id,locus.id] <- 11
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id])
stop(msg, call.=FALSE)
}
} else if(phase[locus.id] == "{01}"){
if(seg[locus.id] == "<abxcd>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("ad","da")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("ac","ca")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("bd","db")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("bc","cb")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<efxeg>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("eg","ge")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("ee")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("fg","gf")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("fe","ef")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<hkxhk>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("hk","kh")){ # AC or BD
genotypes[ind.id,locus.id] <- 9
} else if(genotypes[ind.id,locus.id] %in% c("hh")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("kk")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("h-","-h")){ # not BC
genotypes[ind.id,locus.id] <- 12
} else if(genotypes[ind.id,locus.id] %in% c("k-","-k")){ # not AD
genotypes[ind.id,locus.id] <- 13
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id])
stop(msg, call.=FALSE)
}
} else if(phase[locus.id] == "{10}"){
if(seg[locus.id] == "<abxcd>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("bc","cb")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("bd","db")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("ac","ca")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("ad","da")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<efxeg>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("fe","ef")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("fg","gf")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("ee")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("eg","ge")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<hkxhk>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("kh","hk")){ # AC or BD
genotypes[ind.id,locus.id] <- 9
} else if(genotypes[ind.id,locus.id] %in% c("kk")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("hh")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("h-","-h")){ # not AD
genotypes[ind.id,locus.id] <- 13
} else if(genotypes[ind.id,locus.id] %in% c("k-","-k")){ # not BC
genotypes[ind.id,locus.id] <- 12
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id])
stop(msg, call.=FALSE)
}
} else if(phase[locus.id] == "{11}"){
if(seg[locus.id] == "<abxcd>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("bd","db")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("bc","cb")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("ad","da")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("ac","ca")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<efxeg>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("fg","gf")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("fe","ef")){ # AD
genotypes[ind.id,locus.id] <- 3
} else if(genotypes[ind.id,locus.id] %in% c("eg","ge")){ # BC
genotypes[ind.id,locus.id] <- 2
} else if(genotypes[ind.id,locus.id] %in% c("ee")){ # BD
genotypes[ind.id,locus.id] <- 4
}
}
} else if(seg[locus.id] == "<hkxhk>"){
for(ind.id in 1:nb.inds){
if(is.na(genotypes[ind.id,locus.id])){
next
} else if(genotypes[ind.id,locus.id] %in% c("kk")){ # AC
genotypes[ind.id,locus.id] <- 1
} else if(genotypes[ind.id,locus.id] %in% c("kh","hk")){ # AD or BC
genotypes[ind.id,locus.id] <- 10
} else if(genotypes[ind.id,locus.id] %in% c("hh")){ # BD
genotypes[ind.id,locus.id] <- 4
} else if(genotypes[ind.id,locus.id] %in% c("h-","-h")){ # not AC
genotypes[ind.id,locus.id] <- 11
} else if(genotypes[ind.id,locus.id] %in% c("k-","-k")){ # not BD
genotypes[ind.id,locus.id] <- 14
}
}
} else{
msg <- paste("unrecognized segregation ", seg[locus.id],
"at locus", locus.id, "with phase",
phase[locus.id])
stop(msg, call.=FALSE)
}
} else{
msg <- paste("unrecognized phase", phase[locus.id],
"at locus", locus.id)
stop(msg, call.=FALSE)
}
}
storage.mode(genotypes) <- "numeric"
list(pop.name=pop.name, pop.type=pop.type, genotypes=genotypes,
seg=seg, phase=phase, classif=classif)
}
## returns a data.frame with 3 columns: chr (factor), marker (char), pos (num)
read.cross.mq.map <-
function(mapfile){
# read all lines
lines <- readLines(mapfile, warn=FALSE)
# remove comments
lines <- vapply(strsplit(lines, ";"), "[", "", 1)
# drop empty lines
lines <- lines[!is.na(lines)]
blank <- grep("^\\s*$", lines)
if(length(blank) > 0)
lines <- lines[-blank]
# find groups
grouplines <- grep("group", lines)
# add lg name to end of each line
for(i in seq(along=grouplines)) {
# linkage group name
groupname <- strsplit(lines[grouplines[i]], "\\s+")[[1]]
groupname <- groupname[length(groupname)]
first <- grouplines[i]+1
if(i==length(grouplines))
last <- length(lines)
else last <- grouplines[i+1]-1
lines[first:last] <- paste(lines[first:last], groupname)
}
# drop initial lines
if(grouplines[1] > 1)
todrop <- 1:(grouplines[1]-1)
else todrop <- NULL
# also drop the group lines
todrop <- c(todrop, grouplines)
# now the actual dropping
lines <- lines[-todrop]
# split at white space
spl <- strsplit(lines, "\\s+")
# combine into a data frame
genmap <- data.frame(chr=vapply(spl, "[", "", 3),
marker=vapply(spl, "[", "", 1),
pos=as.numeric(vapply(spl, "[", "", 2)),
stringsAsFactors=FALSE)
# make chr as factor
genmap[,1] <- factor(genmap[,1], levels=unique(genmap[,1]))
genmap
}
read.cross.mq.qua <-
function(quafile){
nb.traits <- NULL
nb.inds <- NULL
miss <- NULL
phenotypes <- NULL
lines <- readLines(quafile, warn=FALSE)
# remove comments
lines <- vapply(strsplit(lines, ";"), "[", "", 1)
# drop empty lines
lines <- lines[!is.na(lines)]
blank <- grep("^\\s*$", lines)
if(length(blank) > 0)
lines <- lines[-blank]
## extract the number of traits
res <- mq_grab_param(lines, "ntrt", "Number of traits", "qua")
nb.traits <- as.numeric(res[[1]])
todrop <- res[[2]]
## extract the number of individuals
res <- mq_grab_param(lines, "nind", "Number of individuals", "qua")
nb.inds <- as.numeric(res[[1]])
todrop <- c(todrop, res[[2]])
## extract the symbol for missing values
res <- mq_grab_param(lines, "miss", "Missing value code", "qua")
miss <- res[[1]]
todrop <- c(todrop, res[[2]])
lines <- lines[-todrop]
spl <- strsplit(lines, "\\s+")
ind.id <- 1
trait.id <- 1
trait.names <- c()
for(line.id in 1:length(lines)){
tokens <- spl[[line.id]]
if(trait.id <= nb.traits){
if(length(tokens) == 1){ # one trait name per line
trait.names <- c(trait.names, tokens[1])
trait.id <- trait.id + 1
} else{ # all trait names on the same line
trait.names <- tokens
trait.id <- nb.traits + 1
}
next
} else if(trait.id > nb.traits && is.null(phenotypes)){
if(length(trait.names) != nb.traits){
msg <- paste("there seems to be fewer trait names (",
length(trait.names),
") than indicated in the header (",
nb.traits, ")", sep="")
stop(msg, call.=FALSE)
}
phenotypes <- matrix(NA, nrow=nb.inds, ncol=nb.traits)
}
if(length(tokens) != nb.traits){
msg <- paste0("line ", line.id, " should have ", nb.traits,
" column", ifelse(nb.traits > 1, "s", ""),
" separated by spaces or tabs")
stop(msg, call.=FALSE)
}
phenotypes[ind.id,] <- tokens
ind.id <- ind.id + 1
}
phenotypes[which(phenotypes == miss)] <- NA
phenotypes <- as.data.frame(phenotypes)
for(j in 1:ncol(phenotypes))
phenotypes[,j] <- tryCatch(expr=as.numeric(as.character(phenotypes[,j])),
warning=function(w)
as.factor(phenotypes[,j]))
colnames(phenotypes) <- trait.names
phenotypes
}
# end of read.cross.mq.R
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.