knitr::opts_chunk$set(echo = TRUE)
This document goes through the process of parsing the genotype data originally
described in Kover et al. (2009)
and converting it to a R/qtl2 cross2
object.
The data are downloaded from Richard Mott's server, currently hosted at: http://mtweb.cs.ucl.ac.uk/mus/www/magic/.
According to the website's description:
Note that the original 2009 paper used only 527 MAGIC lines and was based on the TAIR8 genome. The data and resources for MAGIC now available on this page have been expanded to 703 genotyped lines and the SNPs remapped to TAIR9/TAIR10.
This fact might explain some discrepancies in the SNP coordinates between the data processed here and the data processed by Karl Broman, who used the supplementary file from Gnan et al. (2014).
Despite the fact that these are not the actual original data published by
Kover et al. (2009), the final
cross2
object will be saved as kover2009
.
# Load packages library(tidyverse) # Make directory for files dir.create("downloaded_files") dir.create("tidy_files")
# Genotype files source magic_server <- "http://mtweb.cs.ucl.ac.uk/mus/www/magic/" geno_magic <- "magic.15012010.tar.gz" # Download the files download.file(file.path(magic_server, geno_magic), file.path("downloaded_files", geno_magic), method = "wget") # Untar the MAGIC genotypes untar(file.path("downloaded_files", geno_magic), exdir = "downloaded_files")
The format of these files is quite unconventional.
Here, I try to parse them to obtain tabular formats, which are needed for R/qtl2
package.
For each chromosome there's 3 sets of files with the following extensions:
My understanding is that this file contains the prior probability of each accession/strain state in that marker. So, if 3 accessions/strains share the same nucleotide allele, there is a 1/3 prior chance that an individual with that allele inherited this allele from each of those accessions.
This is the most challenging file format of the bunch. First I break down my understanding of the file format. Then I have a chunk of code that parses it.
Here's the first 10 lines of one of the files:
read_lines("downloaded_files/chr1.MAGIC.alleles", n_max = 10)
Regarding the last point, this is what the four lines for each marker refer to:
Regarding the first of those lines, this is the information it contains, tab-separated:
read_lines("downloaded_files/chr1.MAGIC.alleles", skip = 2, n_max = 1)
So, here's the code chunk that processes these different parts, into four tables (per chromosome), one with marker information, one with probability of missing genotype, one with probability of allele 1 and another with probability of allele 2.
tidy_founders <- list.files("downloaded_files/", pattern = ".alleles", full.names = TRUE) %>% map(function(chr_file) { # The first line of the file has information about the number of markers in the ## file and the number of founders ("strains") n_markers <- read_tsv(chr_file, n_max = 1, col_names = FALSE) %>% pull(X2) n_strains <- read_tsv(chr_file, n_max = 1, col_names = FALSE) %>% pull(X4) # The second line has the names of these "strains" strain_ids <- read_tsv(chr_file, skip = 1, n_max = 1, col_names = FALSE) %>% select(-X1) %>% unlist() %>% unname() # Extract the lines with marker information (every 4th value starting at line 3 of the file) # V1 = word "marker" # V2 = the marker name # V3 = the number 3 (what is this?) # V4 = the chromosome number # V5 = the position of the marker (in centimorgan, I assume) markers <- read_lines(chr_file, skip = 2)[seq(1, n_markers * 4, by = 4)] %>% map_dfr(function(i) { read.table(text = i, sep = "\t", header = FALSE, stringsAsFactors = FALSE) }) %>% select(marker = V2, chrom = V4, map = V5) %>% as_tibble() # Extract the missing genotype information (every 4th value starting at line 4 of the file) # V1 = word "allele" # V2 = the value "NA" # V3-V21 = the probability that this genotype is missing in the respective founder strain na_prob <- read_lines(chr_file, skip = 3)[seq(1, n_markers * 4, by = 4)] %>% map_dfr(function(i) { read.table(text = i, sep = "\t", header = FALSE, stringsAsFactors = FALSE) }) %>% select(V2:V21) names(na_prob) <- c("allele", strain_ids) # Extract genotype information for first allele (every 4th value starting at line 5 of the file) # V1 = word "allele" # V2 = the allele # V3-V21 = the probability that this strain carries the respective allele allele1_prob <- read_lines(chr_file, skip = 4)[seq(1, n_markers * 4, by = 4)] %>% map_dfr(function(i) { read.table( text = i, sep = "\t", header = FALSE, colClasses = c("character", "character", rep("numeric", 19)) ) }) %>% select(V2:V21) names(allele1_prob) <- c("allele", strain_ids) # Extract genotype information for second allele (every 4th value starting at line 6 of the file) # V1 = word "allele" # V2 = the allele # V3-V21 = the probability that this strain carries the respective allele allele2_prob <- read_lines(chr_file, skip = 5)[seq(1, n_markers * 4, by = 4)] %>% map_dfr(function(i) { read.table( text = i, sep = "\t", header = FALSE, colClasses = c("character", "character", rep("numeric", 19)) ) }) %>% select(V2:V21) names(allele2_prob) <- c("allele", strain_ids) # Make some sanity checks if(nrow(markers) == n_markers & nrow(na_prob) == n_markers & nrow(allele1_prob) == n_markers & nrow(allele2_prob) == n_markers){ list(bind_cols(markers, na_prob), bind_cols(markers, allele1_prob), bind_cols(markers, allele2_prob)) } else { stop("the number of rows didn't match") } }) # Bind each chromosome's table together ## pmap is useful here - parallel map across the list of lists tidy_founders <- pmap(tidy_founders, bind_rows) # Now bind these further into a single table tidy_founders <- bind_rows(tidy_founders)
This is the output we get:
tidy_founders %>% arrange(chrom, map, marker, !is.na(allele))
And we have r nrow(tidy_founders)/3
markers.
I don't quite understand the missing allele probability, because it's 1/19 for every strain in every marker:
tidy_founders %>% filter(is.na(allele)) %>% filter_at(vars(Bur:Zu), any_vars(. != round(1/19, 3)))
However, we can see that some markers do have missing allele information, some of them for quite a lot of the parental strains:
tidy_founders %>% gather("strain", "prob", Bur:Zu) %>% group_by(marker, strain) %>% summarise(allele = allele[which.max(prob)]) %>% filter(is.na(allele)) %>% count(marker) %>% arrange(desc(n))
Not sure how that information is used by happy.hbrem::happy()
function, but
perhaps this affects imputation in some of these markers.
This is a more straightforward space-delimited text file:
read_delim("downloaded_files/chr1.MAGIC.data", n_max = 10, col_names = FALSE, col_types = cols(.default = col_character()), delim = " ")
But we need to ensure that the markers are named correctly. So I extract the marker names from the ".alleles" files:
geno <- map(1:5, function(i){ # Get the marker names # Read the lines of the file # subset the lines to contain the word marker # collapse vector using "newline" to recreate a "tsv" file # use read.table with text option to read this as a table # extract the second column that contains the marker name markers <- read_lines(paste0("downloaded_files/chr", i, ".MAGIC.alleles"), skip = 2) %>% str_subset("marker") %>% str_c(collapse = "\n") %>% read.table(text = ., sep = "\t", header = FALSE, stringsAsFactors = FALSE) %>% pull(V2) # Read the genotypes file geno <- read_delim(paste0("downloaded_files/chr", i, ".MAGIC.data"), col_names = FALSE, col_types = cols(.default = col_character()), delim = " ") # The first 6 columns are mostly missing data. Retain only the first column (ID) # and every other column starting at 7. We retain only every other column because # all sites are homozygous geno <- geno %>% select(1, seq(7, ncol(geno), by = 2)) # Add informative column names colnames(geno) <- c("line", markers) return(geno) }) # Join all the genotypes - see http://www.brodrigues.co/blog/2016-07-30-merge-a-list-of-datasets-together/ geno <- Reduce(function(x, y) full_join(x, y, by = "line"), geno)
Note: the published genotypes (https://doi.org/10.1371/journal.pgen.1000551.s003) seem to have the wrong alleles, at least compared to what is available from http://signal.salk.edu/atg1001/3.0/gebrowser.php browser. See block of code below for an example:
kover2009 <- read_tsv("https://journals.plos.org/plosgenetics/article/file?id=10.1371/journal.pgen.1000551.s003&type=supplementary", col_types = cols(.default = col_character())) kover2009 <- kover2009 %>% filter(!str_detect(line, "MAGIC") & str_detect(line, "-")) kover2009 <- kover2009 %>% gather("marker", "genotype", -line) %>% mutate(line = str_remove(line, "-.*")) # Compare these two kover2009 %>% filter(marker == "CRY2_1021") tidy_founders %>% gather("line", "prob", Bur:Zu) %>% group_by(marker, line) %>% filter(prob == max(prob)) %>% ungroup() %>% select(marker, line, allele) %>% filter(marker == "CRY2_1021")
These are tab-delimited and reasonably simple.
markers_map <- list.files("downloaded_files/", pattern = ".map", full.names = TRUE) %>% map_dfr(read_tsv, col_types = "cccii")
However, not all r nrow(markers_map)
markers in this map are in the genotype data.
sum(markers_map$marker %in% colnames(geno))
This means we lack physical location information for 9 markers. We could remove those from the dataset, although one might consider retaining them in the future...
# Retain only markers with genotype data markers_map <- markers_map %>% filter(marker %in% colnames(geno)) # Retain only genotypes for markers with map information geno <- geno[, c("line", markers_map$marker)] # Retain only founder genotypes for those markers tidy_founders <- tidy_founders %>% filter(marker %in% markers_map$marker)
It's worth noting that the genetic map location of markers from the tidy_founders
table is a linear function of the physical location:
full_join(markers_map, select(tidy_founders, marker, map), by = "marker") %>% ggplot(aes(bp, map)) + geom_point()
Finally, I produce a table with the reference (Col-0) allele at each marker:
ref_allele <- tidy_founders %>% group_by(marker) %>% summarise(ref = allele[which(Col == max(Col))]) head(ref_allele)
This parsing resulted in four files:
tidy_founders
which contains genotype information for the foundersgeno
which contains genotype information for the MAGIC linesmarkers_map
which contains physical map information for each markerref_allele
which contains the reference allele for each markerhead(tidy_founders) head(geno) head(markers_map)
Each of these contains all the relevant information from Kover et al. 2009 in a tidy set of tables.
Instead of exporting one file per chromosome, we export a single set of files for all chromosomes (the files aren't that big).
From the tidy_founders
table, we need some gimnastics to write into the happy
format:
# First line of file contains number of markers and number of strains paste("markers", length(unique(tidy_founders$marker)), "strains 19", sep = "\t") %>% write_lines("tidy_files/kover2009_happy_alleles.txt") # Second line contains the strain names "strain_names\tBur\tCan\tCol\tCt\tEdi\tHi\tKn\tLer\tMt\tNo\tOy\tPo\tRsch\tSf\tTsu\tWil\tWs\tWu\tZu" %>% write_lines("tidy_files/kover2009_happy_alleles.txt", append = TRUE) # Further lines contain information about each marker ## Here I nest the table, to write each marker sequentially temp <- tidy_founders %>% mutate(marker_header = paste("marker", marker, "3", chrom, map, sep = "\t")) %>% group_by(marker, chrom, map) %>% nest() walk(temp$data, function(i){ # This appends the marker header information into the file i$marker_header %>% unique() %>% write_lines("tidy_files/kover2009_happy_alleles.txt", append = TRUE) # This appends the rest of the information for each allele i %>% mutate(prefix = "allele") %>% select(prefix, allele, Bur:Zu) %>% write_tsv("tidy_files/kover2009_happy_alleles.txt", append = TRUE) }) rm(temp)
We can also write the ".ped" file, which needs two columns for each marker, as well as 6 columns in the beginning:
cbind(geno[, 1], geno[, 1], 0, 0, NA, NA, geno[, rep(2:ncol(geno), each = 2)]) %>% write_tsv("tidy_files/kover2009_happy_genotypes.ped", col_names = FALSE)
Finally, the map file is just straightforward written out:
markers_map %>% write_tsv("tidy_files/kover2009_happy_markers.map")
R/qtl2 requires the following files (more details here):
json
file with information about the crosscsv
files with MAGIC and founder genotypes with an ID
column + a column for each markerNA
for missing genotypescsv
files for genetic and physical maps with columns marker,chr,pos
'{ "description": "Arabidopsis MAGIC", "crosstype": "magic19", "sep": ",", "comment.char": "#", "geno": "kover2009_qtl2_geno.csv", "founder_geno": "kover2009_qtl2_founders.csv", "genotypes": { "1": "1", "2": "2", "3": "3" }, "gmap": "kover2009_qtl2_gmap.csv", "pmap": "kover2009_qtl2_pmap.csv", "alleles": ["Bur", "Can", "Col", "Ct", "Edi", "Hi", "Kn", "Ler", "Mt", "No", "Oy", "Po", "Rsch", "Sf", "Tsu", "Wil", "Ws", "Wu", "Zu"] }' %>% write_lines("./tidy_files/kover2009_qtl2.json")
To convert the genotypes to R/qtl2 format we convert each genotype to numeric code using "1" for reference and "3" for alternative allele (all sites are homozygous, so there is no "2" category).
geno %>% gather("marker", "allele", -line) %>% left_join(ref_allele, by = "marker") %>% mutate(allele = ifelse(allele == ref, "1", "3")) %>% select(-ref) %>% spread(marker, allele) %>% rename(ID = line) %>% write_csv("./tidy_files/kover2009_qtl2_geno.csv")
A similar thing is done for the founder genotypes:
# Need to founders_geno <- tidy_founders %>% group_by(marker) %>% # extract the allele with highest probability summarise_at(vars(Bur:Zu), funs(allele[which(. == max(.))])) %>% # add a reference allele left_join(ref_allele, by = "marker") %>% # convert genotypes to numeric categories mutate_at(vars(-marker, -ref), funs(ifelse(. == ref, "1", "3"))) %>% select(-ref)
head(founders_geno)
This table needs to be transposed though
# Transpose the table founders_geno <- founders_geno %>% as.data.frame() %>% column_to_rownames("marker") %>% as.matrix() %>% t() %>% as_tibble(rownames = "ID") head(founders_geno)
founders_geno %>% write_csv("./tidy_files/kover2009_qtl2_founders.csv")
markers_map %>% select(marker, chr = chromosome, pos = bp) %>% write_csv("./tidy_files/kover2009_qtl2_pmap.csv") tidy_founders %>% distinct(marker, chr = chrom, pos = map) %>% write_csv("./tidy_files/kover2009_qtl2_gmap.csv")
kover2009 <- qtl2::read_cross2("./tidy_files/kover2009_qtl2.json") usethis::use_data(kover2009, overwrite = TRUE, compress = "xz")
# Remove directories unlink("downloaded_files/", recursive = TRUE) unlink("tidy_files/", recursive = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.