In neyhartj/pbsimData: Observed data to support simulations using the pbsim package.

knitr::opts_chunk$set(echo = TRUE)

Introduction

This markdown file outlines the preparation of BOPA SNP genotype data two barley populations:

Spring two-row from 5 U.S. breeding programs (n = 183)
Spring six-row from 2 U.S. breeding programs (n = 768)

# Load packages
library(tidyverse)
library(stringr)
library(qtl)
library(pbsim)

# Define a function to take a matrix and compare all columns and return an index of the identical columns
identical.columns <- function(input.matrix) {

  # Error handling
  input.matrix <- as.matrix(input.matrix)

  # Save the vector 1:ncol(input.matrix)
  col.ind <- 1:ncol(input.matrix)

  # Create a comparison list
  comparison.list <- list()

  # Loop over the number of columns
  for (i in col.ind) {

    # Extract column i
    col.i <- input.matrix[,i]
    # Compare it to all other columns
    compare.i <- apply(X = as.matrix(input.matrix[,setdiff(col.ind, i)]), MARGIN = 2, FUN = function(column) identical(col.i, column) )
    # Find the columns that match
    col.compare <- setdiff(col.ind, i)[compare.i]

    # Add to the list
    comparison.list[[i]] <- sort(c(i, col.compare))
  }

  # Find the unique groups
  comparison.list <- unique(comparison.list)

  # Return the comparison list
  return(comparison.list)
} # Close the function

Data Acquisition

The data were acquired from T3 and downloaded without any filters.

Two maps were downloaded for all SNPs

2009 consensus map from @Close2009
2011 consensus map from @Munoz-Amatriain2011

Processing

Genetic Maps

First the genetic maps of @Close2009 and @Munoz-Amatriain2011 will be jittered so they are consistent in the two-row and six-row genetic data.

# Read in the s2 data
## Directory of the unzipped download folder for the Spring Two-Row CAP Lines
s2_data_dir <- "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomics/BOPA/2R_CAP/2R_CAP_T3_download/"
# Read in the data
s2_genos <- read_tsv(file = file.path(s2_data_dir, "genotype.hmp.txt"))
# Extract the map
s2_snp_info <- s2_genos %>%
  select(rs:pos)

## Directory of the unzipped download folder for the Spring Six-Row CAP Lines
s6_data_dir <- "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomics/BOPA/S6TP_BOPA/"
# Read in the data
s6_genos <- read_tsv(file = file.path(s6_data_dir, "genotype.hmp.txt"))
# Extract the map
s6_snp_info <- s6_genos %>%
  select(rs:pos)


# Find the common markers
common_markers <- intersect(s2_snp_info$rs, s6_snp_info$rs)

# Designate the order of these markers as the ma_map
# Also remove those with unknown positions
ma_map <- s2_snp_info %>%
  filter(rs %in% common_markers,
         chrom != "UNK") %>%
  select(-alleles)

# Read in the Close 2009 map
close_map <- read_tsv(file = "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomics/BOPA/close_et_al_09_genetic_map.txt", skip = 1, col_names = c("rs", "chrom", "pos"))

# Extract only markers in the ma_map
close_map1 <- close_map %>% 
  filter(rs %in% ma_map$rs,
         chrom != "UNK") %>%
  # Convert hecto-Morgans to cM
  rename(cM_pos = pos)

# And reciprocate for the ma map
ma_map1 <- ma_map %>%
  filter(rs %in% close_map1$rs) %>%
  # Convert hecto-Morgans to cM
  mutate(cM_pos = pos / 1000) %>%
  select(-pos)



# Convert each to a qtl map and jitter, the convert back to a table
ma_map2 <- ma_map1 %>% 
  as.data.frame() %>% 
  column_to_rownames("rs") %>% 
  qtl::table2map() %>%
  qtl::jittermap() %>%
  qtl::map2table() %>%
  rownames_to_column("rs") %>%
  rename(chrom = chr, cM_pos = pos)

close_map2 <- close_map1 %>% 
  as.data.frame() %>% 
  column_to_rownames("rs") %>% 
  qtl::table2map() %>%
  qtl::jittermap() %>%
  qtl::map2table() %>%
  rownames_to_column("rs") %>%
  rename(chrom = chr, cM_pos = pos)

Two Row Barley Data

# Extract the metadata (map, chromosome, etc)
snp_info <- s2_genos %>%
  select(rs:pos)

# Calculate cM position from the hecto-Morgan positions given
snp_info1 <- snp_info %>%
  mutate(cM_pos = pos / 1000) %>%
  select(-pos)


# Trim the genotype matrix
s2_genos_mat <- s2_genos %>%
  select(-rs:-pos) %>% 
  apply(X = ., MARGIN = 2, FUN = as.numeric) %>%
  structure(dimnames = list(s2_genos$rs, colnames(.))) %>%
  t()

## Remove markers with > 10% missing data
marker_missing <- is.na(s2_genos_mat) %>%
  colMeans()

s2_genos_mat1 <- s2_genos_mat[,marker_missing <= 0.10]

# Remove entries with > 10% missing data
entry_missing <- is.na(s2_genos_mat1) %>%
  rowMeans()

s2_genos_mat2 <- s2_genos_mat1[entry_missing <= 0.10,]

# Remove markers that are monomorphic over the whole population
marker_polymorphic <- s2_genos_mat2 %>% 
  apply(MARGIN = 2, FUN = n_distinct, na.rm = T) %>%
  {. > 1}

s2_genos_mat3 <- s2_genos_mat2[,marker_polymorphic]

# Trim the snp_info data.frame and remove unknown
snp_info2 <- snp_info1 %>% 
  filter(rs %in% colnames(s2_genos_mat3),
         chrom != "UNK")

# Remove the unknowns from the genotype matrix
s2_genos_mat4 <- s2_genos_mat3[,snp_info2$rs]


### Processing of the marker matrix for use in simulation
# Remove redundant markers
# These are characterized by having the same genotypes across all samples AND fall on the same cM position

# Split the marker information by unique position
unique_snp_list <- snp_info2 %>% 
  split(list(.$chrom, .$cM_pos))

# Remove NULL
unique_snp_list <- unique_snp_list[sapply(unique_snp_list, nrow) != 0]

# Apply a function to each group of unique SNPs
non_redundant_marker_list <- lapply(unique_snp_list, FUN = function(uniq_info) {

  # If the number of markers is 1, just return the marker
  if (nrow(uniq_info) == 1) {
    return(uniq_info)

  } else { # Otherwise look more closely

    # Extract the marker names
    marker.names <- uniq_info$rs

    M.i <- s2_genos_mat4[,marker.names, drop = FALSE]

    # Determine if the genotype calls are the same between markers
    same.genos <- identical.columns(input.matrix = M.i)

    # Choose the first index from each group
    # Gather the names of the chosen markers
    non.redundant.marker.names <- colnames(M.i)[sapply(X = same.genos, FUN = function(group) group[1])]

    # Extract marker info for those markers and return
    snp_info2 %>%
      filter(rs %in% non.redundant.marker.names)

  } })

# Collapse the list
snp_info3 <- bind_rows(non_redundant_marker_list) %>%
  arrange(chrom, cM_pos)

# Subset the genotype matrix again
s2_genos_mat5 <- s2_genos_mat4[,snp_info3$rs]

## Impute
# Set hets to missing
s2_genos_mat5[s2_genos_mat5 == 0] <- NA

# Impute using mean
s2_genos_imputed <- s2_genos_mat5 %>% 
  apply(MARGIN = 2, FUN = function(snp) {
    # Mean genotype
    mean_geno <- mean(snp, na.rm = T)
    # Round
    mean_geno <- ifelse(mean_geno < 0, -1, 1)
    snp[is.na(snp)] <- mean_geno
    return(snp) })

s2_cap_genos <- s2_genos_imputed + 1

s2_snp_info <- snp_info3

# Convert the genotypes to an array of haploid chromosomes
s2_cap_haploid <- split(s2_cap_genos, 1:nrow(s2_cap_genos), drop = FALSE) %>%
  lapply(FUN = function(ind) rbind(ind / 2, ind / 2) )

# Empty array
s2_cap_haploid_array <- array(data = NA, dim = c(2, ncol(s2_cap_genos), length(s2_cap_haploid)),
                              dimnames = list(NULL, colnames(s2_cap_genos), row.names(s2_cap_genos)))

for (k in seq_along(s2_cap_haploid)) {
  s2_cap_haploid_array[,,k] <- s2_cap_haploid[[k]]
}


# Rename
s2_cap_haploid <- s2_cap_haploid_array

# Read in line details
s2_cap_line_info <- read_csv(file = file.path(s2_data_dir, "../2R_CAP_line_details.csv"))

# Subset the lines that are in the genotype data
s2_cap_line_info <-  s2_cap_line_info %>% 
  filter(Name %in% row.names(s2_cap_genos))

line_replacement <- structure(s2_cap_line_info$CAP_Name, names = s2_cap_line_info$Name)

row.names(s2_cap_genos) <- str_replace_all(row.names(s2_cap_genos), line_replacement)

### REMOVE THIS ###
# devtools::use_data(s2_cap_genos, s2_snp_info, s2_cap_haploid, s2_cap_line_info, overwrite = T)

Six-Row Barley Data

## Directory of the unzipped download folder for the Spring Two-Row CAP Lines
s6_data_dir <- "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomic Selection/Genotypic Data/BOPA Markers/S6TP_BOPA/"


# Read in the data
s6_genos <- read_tsv(file = file.path(s6_data_dir, "genotype.hmp.txt"))

# Extract the metadata (map, chromosome, etc)
snp_info <- s6_genos %>%
  select(rs:pos)

# Calculate cM position from the hecto-Morgan positions given
snp_info1 <- snp_info %>%
  mutate(cM_pos = pos / 1000) %>%
  select(-pos)


# Trim the genotype matrix
s6_genos_mat <- s6_genos %>%
  select(-rs:-pos) %>%
  mutate_each(funs = funs(parse_number)) %>%
  as.matrix()

# Add rownames
row.names(s6_genos_mat) <- snp_info1$rs

# Transpose
s6_genos_mat1 <- t(s6_genos_mat)

## Remove markers with > 10% missing data
marker_missing <- is.na(s6_genos_mat1) %>%
  colMeans()

s6_genos_mat2 <- s6_genos_mat1[,marker_missing <= 0.10]

# Remove entries with > 10% missing data
entry_missing <- is.na(s6_genos_mat2) %>%
  rowMeans()

s6_genos_mat3 <- s6_genos_mat2[entry_missing <= 0.10,]

# Keep only markers that are polymorphic in BOTH the MN and ND populations
ND_lines <- str_subset(row.names(s6_genos_mat3), "^ND")
MN_lines <- setdiff(row.names(s6_genos_mat3), ND_lines)

marker_poly_ND <- s6_genos_mat3 %>% 
  subset(row.names(.) %in% ND_lines) %>%
  apply(MARGIN = 2, FUN = n_distinct, na.rm = T) %>%
  {. > 1}

marker_poly_MN <- s6_genos_mat3 %>% 
  subset(row.names(.) %in% MN_lines) %>%
  apply(MARGIN = 2, FUN = n_distinct, na.rm = T) %>%
  {. > 1}


s6_genos_mat4 <- s6_genos_mat3[,marker_poly_MN & marker_poly_ND]

# Trim the snp_info data.frame and remove unknown
snp_info2 <- snp_info1 %>% 
  filter(rs %in% colnames(s6_genos_mat4),
         chrom != "UNK")

# Remove the unknowns from the genotype matrix
s6_genos_mat5 <- s6_genos_mat4[,snp_info2$rs]


### Processing of the marker matrix for use in simulation
# Remove redundant markers
# These are characterized by having the same genotypes across all samples AND fall on the same cM position

# Split the marker information by unique position
unique_snp_list <- snp_info2 %>% 
  split(list(.$chrom, .$cM_pos))

# Remove NULL
unique_snp_list <- unique_snp_list[sapply(unique_snp_list, nrow) != 0]

# Apply a function to each group of unique SNPs
non_redundant_marker_list <- lapply(unique_snp_list, FUN = function(uniq_info) {

  # If the number of markers is 1, just return the marker
  if (nrow(uniq_info) == 1) {
    return(uniq_info)

  } else { # Otherwise look more closely

    # Extract the marker names
    marker.names <- uniq_info$rs

    M.i <- s6_genos_mat5[,marker.names, drop = FALSE]

    # Determine if the genotype calls are the same between markers
    same.genos <- identical.columns(input.matrix = M.i)

    # Choose the first index from each group
    # Gather the names of the chosen markers
    non.redundant.marker.names <- colnames(M.i)[sapply(X = same.genos, FUN = function(group) group[1])]

    # Extract marker info for those markers and return
    snp_info2 %>%
      filter(rs %in% non.redundant.marker.names)

  } })

# Collapse the list
snp_info3 <- bind_rows(non_redundant_marker_list) %>%
  arrange(chrom, cM_pos)

# Subset the genotype matrix again
s6_genos_mat6 <- s6_genos_mat5[,snp_info3$rs]

## Impute
# Set hets to missing
s6_genos_mat6[s6_genos_mat6 == 0] <- NA

# Impute using mean
s6_genos_imputed <- s6_genos_mat6 %>% 
  apply(MARGIN = 2, FUN = function(snp) {
    # Mean genotype
    mean_geno <- mean(snp, na.rm = T)
    # Round
    mean_geno <- ifelse(mean_geno < 0, -1, 1)
    snp[is.na(snp)] <- mean_geno
    return(snp) })

s6_cap_genos <- s6_genos_imputed + 1

s6_snp_info <- snp_info3

# Convert the genotypes to an array of haploid chromosomes
s6_cap_haploid <- split(s6_cap_genos, 1:nrow(s6_cap_genos), drop = FALSE) %>%
  lapply(FUN = function(ind) rbind(ind / 2, ind / 2) )

# Empty array
s6_cap_haploid_array <- array(data = NA, dim = c(2, ncol(s6_cap_genos), length(s6_cap_haploid)),
                              dimnames = list(NULL, colnames(s6_cap_genos), row.names(s6_cap_genos)))

for (k in seq_along(s6_cap_haploid)) {
  s6_cap_haploid_array[,,k] <- s6_cap_haploid[[k]]
}


# Rename
s6_cap_haploid <- s6_cap_haploid_array


### REMOVE THIS ###
# devtools::use_data(s6_cap_genos, s6_snp_info, s6_cap_haploid, overwrite = T)

Genetic Maps

Using the filtered markers as above, extract their positions on the ma_map and the close_map

# Replace the positions in the s2 and s6 snp info data.frames with those from the different genetic maps
s2_snp_info <- s2_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., ma_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))

s6_snp_info <- s6_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., ma_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))

# Now assign them positions from the Close map
s2_snp_info_close <- s2_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., close_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))

s6_snp_info_close <- s6_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., close_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))


# Refilter the marker data for those markers
s2_cap_genos <- s2_cap_genos[,s2_snp_info1$rs]
s2_cap_haploid <- s2_cap_haploid[,s2_snp_info1$rs,]



s6_cap_genos <- s6_cap_genos[,s6_snp_info1$rs]
s6_cap_haploid <- s6_cap_haploid[,s6_snp_info1$rs,]


# Save all this data

devtools::use_data(s2_snp_info, s2_snp_info_close, s2_cap_genos, s2_cap_haploid,
                   s6_snp_info, s6_snp_info_close, s6_cap_genos, s6_cap_haploid, overwrite = T)