R.v.maj <- as.numeric(R.version$major)
R.v.min.1 <- as.numeric(strsplit(R.version$minor, "\\.")[[1]][1])
if(R.v.maj < 2 || (R.v.maj == 2 && R.v.min.1 < 15))
  stop("requires R >= 2.15", call.=FALSE)

suppressPackageStartupMessages(library(knitr))
opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE, fig.align="center")

Overview

This document aims at presenting how to build genetic maps for any cross between two outbred parents using a pseudo-tescross strategy. The seminal article is from Grattapaglia and Sederoff (1994). First, a map is built for each parent from genotypes coded as in a backcross. Second, both maps are used to deduce parental phases. Third, a consensus map is built for both parents from genotypes coded as in a F2 intercross.

This document requires external packages to be available:

suppressPackageStartupMessages(library(scrm)) # on the CRAN
suppressPackageStartupMessages(library(seriation)) # on the CRAN
suppressPackageStartupMessages(library(qtl)) # on the CRAN
stopifnot(file.exists(Sys.which("carthagene"))) # http://www7.inra.fr/mia/T/CarthaGene/
suppressPackageStartupMessages(library(ASMap)) # on the CRAN
suppressPackageStartupMessages(library(rutilstimflutre)) # on GitHub
stopifnot(compareVersion("0.144.0",
                         as.character(packageVersion("rutilstimflutre")))
          != 1)

This R chunk is used to assess how much time it takes to execute the R code in this document until the end:

t0 <- proc.time()

Simulate genetic data

Set the seed:

set.seed(1859)

Simulate haplotypes and genotypes

In two populations:

nb.genos <- 5 * 10^2
nb.chrs <- 2
chr.len.phy <- 10^5     # chromosome physical length in base pairs
mu <- 10^(-8)           # neutral mutation rate in events / base / generation
u <- mu * chr.len.phy   #  in events / chrom / gen
chr.len.gen <- 10^(-1)  # chromosome genetic length in Morgans
c.r <- chr.len.gen / chr.len.phy # recomb rate in events / base / gen
r <- c.r * chr.len.phy  #  in events / chrom / gen
m <- 10^(-4)            # fraction of a pop replaced from other pops each gen
Ne <- 10^4              # effective population size
theta <- 4 * Ne * u     # scaled neutral mutation rate in events / chrom
rho <- 4 * Ne * r       # scaled recomb rate in events / chrom
M <- 4 * Ne * m         # scaled migration rate in events
genomes <- simulCoalescent(nb.inds=nb.genos,
                           nb.reps=nb.chrs,
                           pop.mut.rate=theta,
                           pop.recomb.rate=rho,
                           chrom.len=chr.len.phy,
                           nb.pops=2,
                           mig.rate=M,
                           get.alleles=TRUE)
table(inds.per.pop <- kmeans(genomes$genos, 2)$cluster)
afs.pop <- estimSnpAf(X=genomes$genos)
mafs.pop <- estimSnpMaf(afs=afs.pop)
A.vr.pop <- estimGenRel(X=genomes$genos, afs=afs.pop, method="vanraden1")
mrk2chr <- setNames(genomes$snp.coords$chr, rownames(genomes$snp.coords))

Check allele frequencies

plotHistAllelFreq(afs=afs.pop, main="Allele frequencies")
plotHistMinAllelFreq(mafs=mafs.pop, main="Minor allele frequencies")

Check additive relationships

imageWithScale(A.vr.pop, main="Additive genetic relationships")
summary(diag(A.vr.pop))
hist(diag(A.vr.pop), col="grey", border="white")
summary(A.vr.pop[upper.tri(A.vr.pop)])
hist(A.vr.pop[upper.tri(A.vr.pop)], col="grey", border="white")

Check LD

if(all(c("GenomicRanges", "S4Vectors", "IRanges", "LDcorSV") %in%
       installed.packages()[,"Package"])){
  chr <- "chr1"
  min.maf <- 0.15
  snps.tokeep <- rownames(genomes$snp.coords[genomes$snp.coords$chr == chr &
                                             mafs.pop >= min.maf,])
  print(length(snps.tokeep))
  ld <- estimLd(X=genomes$genos[,snps.tokeep],
                snp.coords=genomes$snp.coords[snps.tokeep,],
                use.ldcorsv=FALSE)
  ## st <- system.time(
  ##     ld <- estimLd(X=genomes$genos[,snps.tokeep],
  ##                   snp.coords=genomes$snp.coords[snps.tokeep,],
  ##                   K=A.vr.pop,
  ##                   use.ldcorsv=TRUE))
  ## print(st)
  print(nrow(ld))
  print(summary(ld$cor2))
  snp.dist <- distSnpPairs(snp.pairs=ld[, c("loc1","loc2")],
                           snp.coords=genomes$snp.coords[snps.tokeep,])
  plotLd(snp.dist,
         sqrt(ld$cor2), estim="r",
         ## xlim=c(0, 4*10^4),
         main=paste0(length(snps.tokeep), " SNPs with MAF >= ", min.maf,
                     " on ", chr),
         use.density=TRUE,
         span=1/20,
         sample.size=2*nb.genos,
         Ne=Ne, c=c.r,
         add.ohta.kimura=TRUE)
}

Perform controlled crosses

Choose parents

Choose two individuals as parents (one per population):

(idx.parents <- c(sample(x=names(inds.per.pop)[inds.per.pop == 1], size=1),
                  sample(x=names(inds.per.pop)[inds.per.pop == 2], size=1)))
A.vr.pop[idx.parents, idx.parents]
genos.parents <- genomes$genos[idx.parents,]
names.parents <- rownames(genos.parents)
haplos.parents <- getHaplosInds(haplos=genomes$haplos,
                                ind.names=names.parents)

Make offsprings

Cross them several times to make offsprings:

nb.offs <- 200
names.offs <- paste0("off-",
                     sprintf(fmt=paste0("%0", floor(log10(nb.offs))+1, "i"),
                             1:nb.offs))
head(crosses.off <- data.frame(parent1=rep(names.parents[1], nb.offs),
                               parent2=rep(names.parents[2], nb.offs),
                               child=names.offs,
                               stringsAsFactors=FALSE))
loc.crovers.off <- drawLocCrossovers(crosses=crosses.off,
                                     nb.snps=sapply(haplos.parents, ncol),
                                     simplistic=FALSE,
                                     verbose=1)
haplos.offs <- makeCrosses(haplos=haplos.parents, crosses=crosses.off,
                           loc.crossovers=loc.crovers.off,
                           howto.start.haplo=0)
genos.offs <- segSites2allDoses(seg.sites=haplos.offs,
                                ind.ids=getIndNamesFromHaplos(haplos.offs),
                                snp.ids=rownames(genomes$snp.coords))
afs.offs <- estimSnpAf(X=genos.offs)
mafs.offs <- estimSnpMaf(afs=afs.offs)
dim(genos.doses <- rbind(genos.parents, genos.offs))
genos.classes <- genoDoses2genoClasses(X=genos.doses,
                                       alleles=genomes$alleles)
genos.jm <- genoClasses2JoinMap(x=genos.classes)
genos.jm[1:3, 1:14]
tests.seg <- filterSegreg(genos.jm[,-c(1:8)], return.counts=TRUE)

The JoinMap format will be used to encode genotypes from both parents and offsprings as it allows to specify segregation types and phases, and can be natively read by the R qtl package.

Everything is explained in detail in the JoinMap manual, available online. Below is a brief summary.

At each locus, parental alleles are labelled as A and B for the first parent, and C and D for the second parent. Interesting locus must segregate in the progeny, meaning that A, B, C and D shouldn't all correspond to the same allele. Several segregation types exist, depending on the parental genotypes and their alleles:

Moreover, when considering several locus belonging to the same linkage group, their alleles can belong to the same physical chromosome, hence allowing to define linkage phases (haplotypes). In a given parent, two alleles at different locus are said to be "in coupling" when they came from the same grand-parent. Otherwise, they are said to be "in repulsion".

For instance, if a locus $l_1$ is of segregation type <hkxhk> and phase type {00} and another locus $l_2$ is of segregation type <abxcd> and phase type {01}, this means that, in the first parent, the h-allele of $l_1$ and the a-allele of $l_2$ are in coupling (and thus also their k- and b-alleles), and that, in the second parent, the h-allele of $l_1$ is in repulsion with the c-allele of $l_2$ (and thus in coupling with the d-allele of $l_2$).

Plot pedigree

ped <- data.frame(ind=c(names.parents,
                        crosses.off$child),
                  mother=c(rep(NA, length(names.parents)),
                           crosses.off$parent1),
                  father=c(rep(NA, length(names.parents)),
                           crosses.off$parent2),
                  gen=c(rep(0, length(names.parents)),
                        rep(1, nrow(crosses.off))),
                  stringsAsFactors=FALSE)
ped.tmp <- rbind(ped[1:5,],
                 c(ind="off-...", ped[5, -1]),
                 c(ind="off-....", ped[5, -1]),
                 ped[nrow(ped),])
plotPedigree(inds=ped.tmp$ind, mothers=ped.tmp$mother, fathers=ped.tmp$father,
             generations=ped.tmp$gen, main="Pedigree of the controlled cross")

Plot allele frequencies

plotHistAllelFreq(afs=afs.offs, main="Allele frequencies among offsprings")
plotHistMinAllelFreq(mafs=mafs.offs, main="Minor allele frequencies among offsprings")

Check additive genetic relationships

A.vr.cross <- estimGenRel(X=genos.doses, afs=afs.pop, method="vanraden1")
A.t.cross <- estimGenRel(X=genos.doses, afs=afs.pop, method="toro2011_eq10")
cor(c(A.vr.cross), c(A.t.cross))
imageWithScale(A.vr.cross, main="Additive genetic relationships of crosses")
imageWithScale(A.vr.cross[1:10, 1:10],
               main="Additive genetic relationships of crosses (subset)",
               idx.rownames=1:10, idx.colnames=1:10)
summary(diag(A.vr.cross))
summary(A.vr.cross[upper.tri(A.vr.cross)])
summary(A.vr.cross[names.parents[1], grep("off", colnames(A.vr.cross))])
summary(A.vr.cross[names.parents[2], grep("off", colnames(A.vr.cross))])
summary(A.t.cross[names.parents[1], grep("off", colnames(A.t.cross))])
summary(A.t.cross[names.parents[2], grep("off", colnames(A.t.cross))])

Under HWE in a single population, the additive genetic relationships between all parent-child pairs should be centered around 0.5, corresponding to a coancestry coefficient of 1/4.

Check LD

if(all(c("GenomicRanges", "S4Vectors", "IRanges", "LDcorSV") %in%
       installed.packages()[,"Package"])){
  chr <- "chr1"
  min.maf <- 0.1
  max.maf <- 0.49
  snps.tokeep <- rownames(genomes$snp.coords[genomes$snp.coords$chr == chr &
                                             mafs.offs >= min.maf &
                                            mafs.offs <= max.maf,])
  print(length(snps.tokeep))
  ld <- estimLd(X=genos.offs[,snps.tokeep],
                snp.coords=genomes$snp.coords[snps.tokeep,],
                use.ldcorsv=FALSE)
  print(nrow(ld))
  print(summary(ld$cor2))
  snp.dist <- distSnpPairs(snp.pairs=ld[, c("loc1","loc2")],
                           snp.coords=genomes$snp.coords[snps.tokeep,])
  plotLd(snp.dist,
         sqrt(ld$cor2), estim="r",
         main=paste0(length(snps.tokeep), " SNPs with MAF >= ", min.maf,
                     " on ", chr),
         use.density=TRUE)
}

Build genetic maps with R/qtl

This section is partly inspired by the document "Genetic map construction with R/qtl" by Karl Broman (2012).

Prepare the input data

genos.qtl <- joinMap2backcross(x=genos.jm[, -c(1:8)], alias.hom=1,
                               alias.het=2, alias.dup=0, alias.miss=NA,
                               parent.names=colnames(genos.jm)[1:2])
sapply(genos.qtl, dim)
table(genomes$snp.coords[rownames(genos.qtl[[1]]), "chr"])
table(genomes$snp.coords[rownames(genos.qtl[[2]]), "chr"])

Build a map for parent 1 (detailed)

Set up the input object:

cross.par1 <- setupQtlCrossObject(gendat=t(genos.qtl[[1]]), cross.type="bc")
summary(cross.par1)

Look at missing data:

plotMissing(cross.par1)
par(mfrow=c(1,2), las=1)
plot(ntyped(cross.par1), ylab="nb of typed markers",
     xlab="genotypes")
plot(ntyped(cross.par1, "mar"), ylab="nb of typed genotypes",
     xlab="markers")

Identify duplicate individuals:

cg <- comparegeno(cross.par1)
hist(cg[lower.tri(cg)], breaks=seq(0, 1, len=101),
     xlab="nb of matching marker between genotypes", main="")
rug(cg[lower.tri(cg)])
wh <- which(cg > 0.9, arr=TRUE)
dim(wh <- wh[wh[,1] < wh[,2],])
head(wh)
g <- pull.geno(cross.par1)
table(g[11,], g[37,])

Identify duplicate markers:

length(dup <- findDupMarkers(cross.par1, exact.only=FALSE))
head(dup)

Look for markers with distorted segregation patterns:

gt <- geno.table(cross.par1)
gt[gt$P.value < 0.05/totmar(cross.par1),]

Define the linkage groups (two markers will be placed in the same linkage groups if they have estimated recombination fraction <= max.rf and LOD score >= min.lod):

cross.par1 <- est.rf(cross.par1) # see also markerlrt()
checkAlleles(cross.par1, threshold=5) # to check markers with switched alleles
rf <- pull.rf(cross.par1)
lod <- pull.rf(cross.par1, what="lod")
plot(as.numeric(rf), as.numeric(lod), xlab="recombination fraction",
     ylab="LOD score")
## different curves due to different amount of missing data btw marker pairs
## see the CartaGene section below where this plot is reproduced with colors
lg <- formLinkageGroups(cross.par1, max.rf=0.2, min.lod=1.5)
table(lg[,"LG"])
tapply(rownames(lg), factor(lg$LG),
       function(locus){
         table(genomes$snp.coords[gsub("_m", "", locus), "chr"])
       }, simplify=FALSE)
cross.par1 <- formLinkageGroups(cross.par1, max.rf=0.2, min.lod=1.5,
                                reorgMarkers=TRUE)
summary(cross.par1)
plotRF(cross.par1) # upper-right=RF lower-right=LOD
rf <- pull.rf(cross.par1)
lod <- pull.rf(cross.par1, what="lod")
mrk <- markernames(cross.par1, chr=1)[3] # look at 3rd marker from lg 1
par(mfrow=c(2,1))
plot(rf, mrk, bandcol="gray70", ylim=c(0,1))
abline(h=0.5, lty=2)
plot(lod, mrk, bandcol="gray70")

Determine the marker order and inter-marker distances for the linkage groups corresponding to the two distinct chromosomes:

system.time(
    map.par1 <- est.map(cross.par1, chr=c(1,3), map.function="kosambi",
                        verbose=TRUE))
summaryMap(map.par1)
plotMap(map.par1, show.marker.names=TRUE)
cross.par1 <- replace.map(cross.par1, map.par1)
plotRF(cross.par1, chr=c(1,3))
plot(countXO(cross.par1), ylab="Number of crossovers")
xoloc.par1.lg1 <- locateXO(cross.par1, chr=1, full.info=TRUE)
summary(sapply(xoloc.par1.lg1, function(x){
  if(length(x) == 0){
    0
  } else
    nrow(x)
}))

Save the whole map:

map.par1.qtl <- pull.map(cross.par1, chr=c(1,3), as.table=TRUE)
map.par1.qtl <- data.frame(linkage.group=map.par1.qtl$chr,
                           locus=rownames(map.par1.qtl),
                           genetic.distance=map.par1.qtl$pos)
map.par1.qtl$linkage.group <- as.character(map.par1.qtl$linkage.group)
map.par1.qtl$linkage.group[map.par1.qtl$linkage.group == "1"] <- "chr2"
map.par1.qtl$linkage.group[map.par1.qtl$linkage.group == "3"] <- "chr1"
tapply(map.par1.qtl$genetic.distance, factor(map.par1.qtl$linkage.group), max)
file.map.par1.qtl <- paste0("bestmap_qtl_parent1-",
                            colnames(genos.jm)[1], ".txt")
write.table(x=map.par1.qtl, file=file.map.par1.qtl, quote=FALSE, sep="\t",
            row.names=FALSE)

Clean:

file.remove(file.map.par1.qtl)

Build a map for parent 2 (brief)

cross.par2 <- setupQtlCrossObject(gendat=t(genos.qtl[[2]]), cross.type="bc")
summary(cross.par2)
cross.par2 <- est.rf(cross.par2)
lg <- formLinkageGroups(cross.par2, max.rf=0.15, min.lod=1.5)
table(lg[,"LG"])
tapply(rownames(lg), factor(lg$LG),
       function(locus){
         table(genomes$snp.coords[gsub("_m", "", locus), "chr"])
       }, simplify=FALSE)
cross.par2 <- formLinkageGroups(cross.par2, max.rf=0.15, min.lod=1.5,
                                reorgMarkers=TRUE)
summary(cross.par2)
system.time(
    map.par2 <- est.map(cross.par2, chr=c(1,3), map.function="kosambi",
                        verbose=TRUE))
summaryMap(map.par2)
cross.par2 <- replace.map(cross.par2, map.par2)
xoloc.par2.lg1 <- locateXO(cross.par2, chr=1, full.info=TRUE)
summary(sapply(xoloc.par2.lg1, function(x){
  if(length(x) == 0){
    0
  } else
    nrow(x)
}))
map.par2.qtl <- pull.map(cross.par2, chr=c(1,3), as.table=TRUE)
map.par2.qtl <- data.frame(linkage.group=map.par2.qtl$chr,
                           locus=rownames(map.par2.qtl),
                           genetic.distance=map.par2.qtl$pos)
map.par2.qtl$linkage.group <- as.character(map.par2.qtl$linkage.group)
map.par2.qtl$linkage.group[map.par2.qtl$linkage.group == "1"] <- "chr2"
map.par2.qtl$linkage.group[map.par2.qtl$linkage.group == "3"] <- "chr1"
tapply(map.par2.qtl$genetic.distance, factor(map.par2.qtl$linkage.group), max)
file.map.par2.qtl <- paste0("bestmap_qtl_parent2-",
                            colnames(genos.jm)[2], ".txt")
write.table(x=map.par2.qtl, file=file.map.par2.qtl, quote=FALSE, sep="\t",
            row.names=FALSE)

Clean:

file.remove(file.map.par2.qtl)

Build genetic maps with CarthaGene

See de Givry et al (2005).

Prepare the input data

genos.cg <- joinMap2backcross(x=genos.jm[, -c(1:8)], alias.hom="A",
                              alias.het="H", alias.dup="Z", alias.miss="-",
                              parent.names=colnames(genos.jm)[1:2])
sapply(genos.cg, dim)
table(genomes$snp.coords[rownames(genos.cg[[1]]), "chr"])
table(genomes$snp.coords[rownames(genos.cg[[2]]), "chr"])

Build a map for parent 1 (detailed)

Load the data set:

file.par1 <- paste0("genos_carthagene_parent1-", names(genos.cg)[1], ".txt")
writeCartagene(genos.cg[[1]], file.par1)
cg <- openCarthagene(file.par1, "par1")
(out <- runCarthagene(cg, "cgversion"))
(out <- runCarthagene(cg, "dsinfo"))
out.mrkinfo <- runCarthagene(cg, "mrkinfo")
dim(mrk.info <- parseCgMrkinfo(out.mrkinfo))
head(mrk.info)
(out <- runCarthagene(cg, "heapsizeget"))

Look at the pairwise matrices of recombination fraction, LOD and genetic distances:

out.2rf <- runCarthagene(cg, "mrkfr2p")
out.2lod <- runCarthagene(cg, "mrklod2p")
out.2dist <- runCarthagene(cg, "mrkdist2p k")
pwrf <- parseCgPwMatrix(out.2rf, "mrkfr2p", mrk.info)
pwlod <- parseCgPwMatrix(out.2lod, "mrklod2p", mrk.info)
pwdist <- parseCgPwMatrix(out.2dist, "mrkdist2p", mrk.info)
plot(pwrf[upper.tri(pwrf)], pwlod[upper.tri(pwlod)],
     xlab="recombination fraction", ylab="LOD score")
plot(pwrf[upper.tri(pwrf)], pwdist[upper.tri(pwdist)],
     xlab="recombination fraction", ylab="genetic distance (Kosambi)")
pimage(pwrf, main="pairwise recombination fractions")
ord.rf <- seriate(pwrf)
pimage(pwrf, ord.rf, main="pairwise recombination fractions")

Show the cause between the different LOD curves by coloring the plot of recombination fractions versus LOD scores:

pws <- rbind(rf=pwrf[upper.tri(pwrf)], lod=pwlod[upper.tri(pwlod)])
names.per.col <- matrix(rownames(pwrf), nrow=nrow(pwrf), ncol=ncol(pwrf), byrow=FALSE)
names.per.row <- matrix(rownames(pwrf), nrow=nrow(pwrf), ncol=ncol(pwrf), byrow=TRUE)
npc.up <- names.per.col[upper.tri(names.per.col)]
npr.up <- names.per.row[upper.tri(names.per.row)]
colnames(pws) <- paste0(npc.up, "-", npr.up)

## recompute rec frac and LOD for a subset of SNP pairs
idx <- sample.int(n=ncol(pws), size=1000)
pairs <- as.data.frame(cbind(colnames(pws)[idx],
                             do.call(rbind, strsplit(colnames(pws)[idx], "-"))),
                       stringsAsFactors=FALSE)
colnames(pairs) <- c("pair", "mrk1", "mrk2")
pairs$nb.offs <- NA
pairs$nb.rec <- NA
pairs[,c("nb.offs", "nb.rec")] <- do.call(rbind, lapply(1:nrow(pairs), function(i){
  tmp <- genos.cg[[1]][unlist(pairs[i,c("mrk1","mrk2")]),]
  tmp.noNA <- tmp[, apply(tmp, 2, function(x){! any(x == "-")})]
  if(all(dim(table(tmp.noNA[1,], tmp.noNA[2,])) == c(2,2))){
    c(ncol(tmp.noNA), table(tmp.noNA[1,], tmp.noNA[2,])[1,2] +
                      table(tmp.noNA[1,], tmp.noNA[2,])[2,1])
  } else
    c(ncol(tmp.noNA), NA)
}))
pairs$rec.frac <- pairs[,"nb.rec"] / pairs[,"nb.offs"]
pairs[,"lod"] <- lodLinkage(pairs$rec.frac, pairs$nb.offs - pairs$nb.rec,
                            pairs$nb.rec)
plot(pairs$rec.frac, pairs$lod, col=ifelse(pairs$nb.offs == 200, "black", "red"))
legend("top", legend=c("n = 200", "n < 200"), col=c("black", "red"), pch=1, bty="n")
table(pairs$nb.offs)
## different LOD curves depending on the amount of NA's in marker pairs

## compare these recomputed LODs with CarthaGene's LODs
plot(pairs$lod, pws["lod", pairs$pair], xlab="recomputed LOD scores",
     ylab="CarthaGene's LOD scores", las=1)
pairs.above <- pairs$pair[pairs$rec.frac > 0.5]
summary(pws["lod", pairs.above[! is.na(pairs.above)]])
## differences seem to come from marker pairs with recomb frac above 0.5

## pairs of markers with no recomb btw them but a very small CarthaGene's LOD
idx <- which(apply(pws, 2, function(x){
  all(x[1] < 0.05, x[2] < 5)
}))
pws[, idx]
tmp <- as.data.frame(cbind(colnames(pws)[idx],
                           do.call(rbind, strsplit(colnames(pws)[idx], "-"))),
                     stringsAsFactors=FALSE)
colnames(tmp) <- c("pair", "mrk1", "mrk2")
tmp$nb.offs <- NA
tmp$nb.rec <- NA
tmp[, c("nb.offs", "nb.rec")] <- do.call(rbind, lapply(1:nrow(tmp), function(i){
  tmp.NA <- genos.cg[[1]][unlist(tmp[i,c("mrk1","mrk2")]),]
  tmp.noNA <- tmp.NA[, apply(tmp.NA, 2, function(x){! any(x == "-")}), drop=FALSE]
  if(all(dim(table(tmp.noNA[1,], tmp.noNA[2,])) == c(2,2))){
    c(ncol(tmp.noNA), table(tmp.noNA[1,], tmp.noNA[2,])[1,2] +
                      table(tmp.noNA[1,], tmp.noNA[2,])[2,1])
  } else
    c(ncol(tmp.noNA), NA)
}))
tmp
## seems to be marker pairs with too many NA's

Determine the linkage groups, by specifying a distance threshold and a LOD threshold (two markers are put in the same group if their Haldane distance and two-points LOD are below the thresholds):

cmd <- "group 0.2 1.5" # params chosen by trial and error
out.group <- runCarthagene(cg, cmd)
linkgroups.par1 <- parseCgGroup(out.group, mrk.info)
table(linkgroups.par1$linkage.group)

Check that all markers put in the same linkage group belong to the same chromosome:

lg2chr <- tapply(linkgroups.par1$locus, factor(linkgroups.par1$linkage.group),
                 function(locus){
                   setNames(genomes$snp.coords[gsub("_m", "", locus), "chr"],
                            locus)
                 })
lapply(lg2chr, table)

Note that knowing in which linkage group each marker is, is enough to determine parental phases. However, if the marker orders and inter-marker genetic distances are also required, it is necessary to build a genetic map.

Look at double markers:

out.mrkdbl <- runCarthagene(cg, "mrkdouble")
length(out.mrkdbl)
head(out.mrkdbl, n=10)

Determine the marker order and inter-marker distances for the first linkage group, by building a "framework" map:

runCarthagene(cg, "mrkselset [groupget 1]")
system.time(
    out.buildfw <- runCarthagene(cg, "buildfw 3 3 {} 1"))
length(out.buildfw)

Check its reliability:

out.heaprint <- runCarthagene(cg, "heaprint")
(maps.info <- parseCgHeaprint(out.heaprint))
system.time(
    out.flips <- runCarthagene(cg, "flips 4 3 1"))
length(out.flips)
system.time(
    out.polish <- runCarthagene(cg, "polish"))
length(out.polish)
system.time(
    out.squeeze <- runCarthagene(cg, "squeeze 20"))
out.squeeze
out.heaprint <- runCarthagene(cg, "heaprint")
(maps.info <- parseCgHeaprint(out.heaprint))

Get the maps (best is last):

cmd <- paste0("maprintd ", maps.info$id[which.max(maps.info$log10.lik)])
out.maprintd <- runCarthagene(cg, cmd)
map.group1 <- parseCgMaprintd(out.maprintd)
str(map.group1)
head(map.group1)
out.bestprintd <- runCarthagene(cg, "bestprintd")
bestmap.group1 <- parseCgMaprintd(out.bestprintd)
head(bestmap.group1)

Do the same for a linkage group corresponding to the other chromosome:

runCarthagene(cg, "mrkselset [groupget 3]")
system.time(
    out.buildfw <- runCarthagene(cg, "buildfw 3 3 {} 1"))
out.heaprint <- runCarthagene(cg, "heaprint")
(maps.info <- parseCgHeaprint(out.heaprint))
system.time(
    out.flips <- runCarthagene(cg, "flips 4 3 1"))
system.time(
    out.polish <- runCarthagene(cg, "polish"))
system.time(
    out.squeeze <- runCarthagene(cg, "squeeze 20"))
out.heaprint <- runCarthagene(cg, "heaprint")
(maps.info <- parseCgHeaprint(out.heaprint))
out.bestprintd <- runCarthagene(cg, "bestprintd")
bestmap.group3 <- parseCgMaprintd(out.bestprintd)

Save the whole map:

map.par1.cg <-
  rbind(data.frame(linkage.group=rep(unique(lg2chr[[1]][bestmap.group1$locus]),
                                     nrow(bestmap.group1)),
                   locus=bestmap.group1$locus,
                   genetic.distance=bestmap.group1$cum.dist.kosambi,
                   stringsAsFactors=FALSE),
        data.frame(linkage.group=rep(unique(lg2chr[[3]][bestmap.group3$locus]),
                                     nrow(bestmap.group3)),
                   locus=bestmap.group3$locus,
                   genetic.distance=bestmap.group3$cum.dist.kosambi,
                   stringsAsFactors=FALSE))
tapply(map.par1.cg$genetic.distance, factor(map.par1.cg$linkage.group), max)
file.map.par1.cg <- paste0("bestmap_carthagene_parent1-",
                           colnames(genos.jm)[1], ".txt")
write.table(x=map.par1.cg, file=file.map.par1.cg, quote=FALSE, sep="\t",
            row.names=FALSE)
## map.par1.cg <- read.table(file.map.par1.cg, header=TRUE, sep="\t")

Save CarthaGene's state:

file.cgsave <- "cgsave.tcl"
(out <- runCarthagene(cg, paste("cgsave", file.cgsave)))
file.export <- "cg_map.txt"
(out <- runCarthagene(cg, paste("cgexport", file.export, "tuto-genmap-outbreds")))

Close and clean:

closeCarthagene(cg, file=file.par1)
file.remove(file.par1)
file.remove(file.map.par1.cg)

Build a map for parent 2 (brief)

Load the data set:

file.par2 <- paste0("genos_carthagene_parent2-", names(genos.cg)[2], ".txt")
writeCartagene(genos.cg[[2]], file.par2)
cg <- openCarthagene(file.par2, "par2")

Define linkage groups:

linkgroups.par2 <-
  defLinkgroupsWithCarthagene(cg=cg, dist.thresh=0.3, lod.thresh=5,
                              mrk2chr=mrk2chr)

Build the map for the first group:

mrk2phy <- setNames(genomes$snp.coords$pos, rownames(genomes$snp.coords))
bestmap.lg1 <- estMrkOrderGenDistsWithCarthagene(cg, linkgroups.par2, lg.id=1,
                                                 keep.thresh=3, add.thresh=3,
                                                 mrk2phy=mrk2phy)

Do the same for the other chromosome:

bestmap.lg3 <- estMrkOrderGenDistsWithCarthagene(cg, linkgroups.par2, lg.id=3,
                                                 keep.thresh=3, add.thresh=3,
                                                 mrk2phy=mrk2phy)

For data sets of a bigger size, the batchtools package can be used to launch the function estMrkOrderGenDistsWithCarthagene() on each linkage group in parallel.

Save the whole map:

map.par2.cg <-
  rbind(data.frame(linkage.group=rep(1, nrow(bestmap.lg1)),
                   chr="chr2",
                   locus=bestmap.lg1$locus,
                   genetic.distance=bestmap.lg1$cum.dist.kosambi,
                   physical.distance=bestmap.lg1$physical.distance,
                   stringsAsFactors=FALSE),
        data.frame(linkage.group=rep(3, nrow(bestmap.lg3)),
                   chr="chr1",
                   locus=bestmap.lg3$locus,
                   genetic.distance=bestmap.lg3$cum.dist.kosambi,
                   physical.distance=bestmap.lg3$physical.distance,
                   stringsAsFactors=FALSE))
tapply(map.par2.cg$genetic.distance, factor(map.par2.cg$chr), max)
file.map.par2.cg <- paste0("bestmap_carthagene_parent2-",
                           colnames(genos.jm)[2], ".txt")
write.table(x=map.par2.cg, file=file.map.par2.cg, quote=FALSE, sep="\t",
            row.names=FALSE)
## map.par2.cg <- read.table(file.map.par2.cg, header=TRUE, sep="\t")

Close and clean:

closeCarthagene(cg, file=file.par2)
file.remove(file.par2)
file.remove(file.map.par2.cg)

Use R/qtl to explore parental maps

Make a qtl::cross object for the first parent, and look at it:

cross.cg.par1 <- setupQtlCrossObject(gendat=t(genos.qtl[[1]]), cross.type="bc",
                                     genmap=map.par1.cg)
summary(cross.cg.par1)
sum(duplicated(map.par1.cg$genetic.distance[map.par1.cg$linkage.group == "chr2"]))
summaryMap(cross.cg.par1)
plotMap(cross.cg.par1)

Make a qtl::cross object for the second parent, and look at it:

cross.cg.par2 <- setupQtlCrossObject(gendat=t(genos.qtl[[2]]), cross.type="bc",
                                     genmap=map.par2.cg)
summary(cross.cg.par2)
sum(duplicated(map.par2.cg$genetic.distance[map.par2.cg$linkage.group == 1]))
summaryMap(cross.cg.par2)
plotMap(cross.cg.par2)

Plot physical vs genetic distances:

plotPhyVsGenDistances(map.par2.cg, xlim=c(0, chr.len.phy))

Build a consensus map

If both parental genetic maps are similar in terms of recombination rates, it is possible to make a consensus map.

Once linkage groups are determined for both parents, linkage phases can be deduced:

genos.jm.phased <-
  setJoinMapPhasesFromParentalLinkGroups(
      x=genos.jm[,-c(1:8)],
      lg.par1=linkgroups.par1[linkgroups.par1$linkage.group %in% c(1,3),],
      lg.par2=linkgroups.par2[linkgroups.par2$linkage.group %in% c(1,3),])
table(genos.jm.phased$phase, useNA="always")
pairs.seg.phase <- paste0(genos.jm.phased$seg, "_", genos.jm.phased$phase)
table(pairs.seg.phase, useNA="always")

TODO

Build genetic maps with ASMap

Prepare the input data:

dim(X <- genos.doses[, estimSnpMaf(genos.doses[-(1:2),]) > 0.15])
genos.asmap <- genoDoses2ASMap(X=X)
sapply(genos.asmap, dim)

Build a map for parent 1:

system.time(
    cross.mst.par1 <- mstmap(genos.asmap$parent1,
                             pop.type="BC",
                             dist.fun="kosambi",
                             objective.fun="COUNT",
                             p.value=10^(-6), # proba below which markers are in same group
                             miss.thresh=1,
                             trace=TRUE,
                             as.cross=TRUE))
summary(cross.mst.par1)

Identify which linkage group corresponds to which chromosome(s):

plot.map(cross.mst.par1)
heatMap(cross.mst.par1)
nmar(cross.mst.par1)
(lg2chr <- lapply(cross.mst.par1$geno, function(lg){
  table(genomes$snp.coords[gsub("_m", "", names(lg$map)), "chr"])
}))
head(cross.mst.par1$geno$L2$map)
head(cross.mst.par1$geno$L3$map)
head(cross.mst.par1$geno$L7$map)
head(cross.mst.par1$geno$L8$map)
profileMark(cross.mst.par1, crit.val="bonf")
clones <- genClones(cross.mst.par1)
clones$cgd
(pair1 <- c(as.character(clones$cgd[1,1]),
            as.character(clones$cgd[1,2])))
if("A.vr.cross" %in% ls()){
  print(A.vr.cross[pair1, pair1])
  print(A.t.cross[pair1, pair1])
}
loc.crovers.off[[pair1[1]]]
loc.crovers.off[[pair1[2]]]
idx <- grep(paste(pair1, collapse="|"), rownames(haplos.offs$chr1))
plotHaplosMatrix(haplos.offs$chr1[idx[c(1,3,2,4)],], main="Haplotypes (chr1)")
plotHaplosMatrix(haplos.offs$chr2[idx[c(1,3,2,4)],], main="Haplotypes (chr2)")

Build a map for parent 2:

TODO

map.c <- combineMap(cross.mst.par1, cross.mst.par2)

Clean:

if(file.exists("MSToutput.txt"))
  file.remove("MSToutput.txt")

Appendix

t1 <- proc.time(); t1 - t0
print(sessionInfo(), locale=FALSE)


timflutre/rutilstimflutre documentation built on Aug. 18, 2024, 7:43 p.m.