knitr::opts_chunk$set(echo = TRUE)

Introduction

This markdown file outlines the preparation of BOPA SNP genotype data two barley populations:

# Load packages
library(tidyverse)
library(stringr)
library(qtl)
library(pbsim)

# Define a function to take a matrix and compare all columns and return an index of the identical columns
identical.columns <- function(input.matrix) {

  # Error handling
  input.matrix <- as.matrix(input.matrix)

  # Save the vector 1:ncol(input.matrix)
  col.ind <- 1:ncol(input.matrix)

  # Create a comparison list
  comparison.list <- list()

  # Loop over the number of columns
  for (i in col.ind) {

    # Extract column i
    col.i <- input.matrix[,i]
    # Compare it to all other columns
    compare.i <- apply(X = as.matrix(input.matrix[,setdiff(col.ind, i)]), MARGIN = 2, FUN = function(column) identical(col.i, column) )
    # Find the columns that match
    col.compare <- setdiff(col.ind, i)[compare.i]

    # Add to the list
    comparison.list[[i]] <- sort(c(i, col.compare))
  }

  # Find the unique groups
  comparison.list <- unique(comparison.list)

  # Return the comparison list
  return(comparison.list)
} # Close the function

Data Acquisition

The data were acquired from T3 and downloaded without any filters.

Two maps were downloaded for all SNPs

Processing

Genetic Maps

First the genetic maps of @Close2009 and @Munoz-Amatriain2011 will be jittered so they are consistent in the two-row and six-row genetic data.

# Read in the s2 data
## Directory of the unzipped download folder for the Spring Two-Row CAP Lines
s2_data_dir <- "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomics/BOPA/2R_CAP/2R_CAP_T3_download/"
# Read in the data
s2_genos <- read_tsv(file = file.path(s2_data_dir, "genotype.hmp.txt"))
# Extract the map
s2_snp_info <- s2_genos %>%
  select(rs:pos)

## Directory of the unzipped download folder for the Spring Six-Row CAP Lines
s6_data_dir <- "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomics/BOPA/S6TP_BOPA/"
# Read in the data
s6_genos <- read_tsv(file = file.path(s6_data_dir, "genotype.hmp.txt"))
# Extract the map
s6_snp_info <- s6_genos %>%
  select(rs:pos)


# Find the common markers
common_markers <- intersect(s2_snp_info$rs, s6_snp_info$rs)

# Designate the order of these markers as the ma_map
# Also remove those with unknown positions
ma_map <- s2_snp_info %>%
  filter(rs %in% common_markers,
         chrom != "UNK") %>%
  select(-alleles)

# Read in the Close 2009 map
close_map <- read_tsv(file = "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomics/BOPA/close_et_al_09_genetic_map.txt", skip = 1, col_names = c("rs", "chrom", "pos"))

# Extract only markers in the ma_map
close_map1 <- close_map %>% 
  filter(rs %in% ma_map$rs,
         chrom != "UNK") %>%
  # Convert hecto-Morgans to cM
  rename(cM_pos = pos)

# And reciprocate for the ma map
ma_map1 <- ma_map %>%
  filter(rs %in% close_map1$rs) %>%
  # Convert hecto-Morgans to cM
  mutate(cM_pos = pos / 1000) %>%
  select(-pos)



# Convert each to a qtl map and jitter, the convert back to a table
ma_map2 <- ma_map1 %>% 
  as.data.frame() %>% 
  column_to_rownames("rs") %>% 
  qtl::table2map() %>%
  qtl::jittermap() %>%
  qtl::map2table() %>%
  rownames_to_column("rs") %>%
  rename(chrom = chr, cM_pos = pos)

close_map2 <- close_map1 %>% 
  as.data.frame() %>% 
  column_to_rownames("rs") %>% 
  qtl::table2map() %>%
  qtl::jittermap() %>%
  qtl::map2table() %>%
  rownames_to_column("rs") %>%
  rename(chrom = chr, cM_pos = pos)

Two Row Barley Data

# Extract the metadata (map, chromosome, etc)
snp_info <- s2_genos %>%
  select(rs:pos)

# Calculate cM position from the hecto-Morgan positions given
snp_info1 <- snp_info %>%
  mutate(cM_pos = pos / 1000) %>%
  select(-pos)


# Trim the genotype matrix
s2_genos_mat <- s2_genos %>%
  select(-rs:-pos) %>% 
  apply(X = ., MARGIN = 2, FUN = as.numeric) %>%
  structure(dimnames = list(s2_genos$rs, colnames(.))) %>%
  t()

## Remove markers with > 10% missing data
marker_missing <- is.na(s2_genos_mat) %>%
  colMeans()

s2_genos_mat1 <- s2_genos_mat[,marker_missing <= 0.10]

# Remove entries with > 10% missing data
entry_missing <- is.na(s2_genos_mat1) %>%
  rowMeans()

s2_genos_mat2 <- s2_genos_mat1[entry_missing <= 0.10,]

# Remove markers that are monomorphic over the whole population
marker_polymorphic <- s2_genos_mat2 %>% 
  apply(MARGIN = 2, FUN = n_distinct, na.rm = T) %>%
  {. > 1}

s2_genos_mat3 <- s2_genos_mat2[,marker_polymorphic]

# Trim the snp_info data.frame and remove unknown
snp_info2 <- snp_info1 %>% 
  filter(rs %in% colnames(s2_genos_mat3),
         chrom != "UNK")

# Remove the unknowns from the genotype matrix
s2_genos_mat4 <- s2_genos_mat3[,snp_info2$rs]


### Processing of the marker matrix for use in simulation
# Remove redundant markers
# These are characterized by having the same genotypes across all samples AND fall on the same cM position

# Split the marker information by unique position
unique_snp_list <- snp_info2 %>% 
  split(list(.$chrom, .$cM_pos))

# Remove NULL
unique_snp_list <- unique_snp_list[sapply(unique_snp_list, nrow) != 0]

# Apply a function to each group of unique SNPs
non_redundant_marker_list <- lapply(unique_snp_list, FUN = function(uniq_info) {

  # If the number of markers is 1, just return the marker
  if (nrow(uniq_info) == 1) {
    return(uniq_info)

  } else { # Otherwise look more closely

    # Extract the marker names
    marker.names <- uniq_info$rs

    M.i <- s2_genos_mat4[,marker.names, drop = FALSE]

    # Determine if the genotype calls are the same between markers
    same.genos <- identical.columns(input.matrix = M.i)

    # Choose the first index from each group
    # Gather the names of the chosen markers
    non.redundant.marker.names <- colnames(M.i)[sapply(X = same.genos, FUN = function(group) group[1])]

    # Extract marker info for those markers and return
    snp_info2 %>%
      filter(rs %in% non.redundant.marker.names)

  } })

# Collapse the list
snp_info3 <- bind_rows(non_redundant_marker_list) %>%
  arrange(chrom, cM_pos)

# Subset the genotype matrix again
s2_genos_mat5 <- s2_genos_mat4[,snp_info3$rs]

## Impute
# Set hets to missing
s2_genos_mat5[s2_genos_mat5 == 0] <- NA

# Impute using mean
s2_genos_imputed <- s2_genos_mat5 %>% 
  apply(MARGIN = 2, FUN = function(snp) {
    # Mean genotype
    mean_geno <- mean(snp, na.rm = T)
    # Round
    mean_geno <- ifelse(mean_geno < 0, -1, 1)
    snp[is.na(snp)] <- mean_geno
    return(snp) })

s2_cap_genos <- s2_genos_imputed + 1

s2_snp_info <- snp_info3

# Convert the genotypes to an array of haploid chromosomes
s2_cap_haploid <- split(s2_cap_genos, 1:nrow(s2_cap_genos), drop = FALSE) %>%
  lapply(FUN = function(ind) rbind(ind / 2, ind / 2) )

# Empty array
s2_cap_haploid_array <- array(data = NA, dim = c(2, ncol(s2_cap_genos), length(s2_cap_haploid)),
                              dimnames = list(NULL, colnames(s2_cap_genos), row.names(s2_cap_genos)))

for (k in seq_along(s2_cap_haploid)) {
  s2_cap_haploid_array[,,k] <- s2_cap_haploid[[k]]
}


# Rename
s2_cap_haploid <- s2_cap_haploid_array

# Read in line details
s2_cap_line_info <- read_csv(file = file.path(s2_data_dir, "../2R_CAP_line_details.csv"))

# Subset the lines that are in the genotype data
s2_cap_line_info <-  s2_cap_line_info %>% 
  filter(Name %in% row.names(s2_cap_genos))

line_replacement <- structure(s2_cap_line_info$CAP_Name, names = s2_cap_line_info$Name)

row.names(s2_cap_genos) <- str_replace_all(row.names(s2_cap_genos), line_replacement)

### REMOVE THIS ###
# devtools::use_data(s2_cap_genos, s2_snp_info, s2_cap_haploid, s2_cap_line_info, overwrite = T)

Six-Row Barley Data

## Directory of the unzipped download folder for the Spring Two-Row CAP Lines
s6_data_dir <- "C:/Users/Jeff/Google Drive/Barley Lab/Projects/Genomic Selection/Genotypic Data/BOPA Markers/S6TP_BOPA/"


# Read in the data
s6_genos <- read_tsv(file = file.path(s6_data_dir, "genotype.hmp.txt"))

# Extract the metadata (map, chromosome, etc)
snp_info <- s6_genos %>%
  select(rs:pos)

# Calculate cM position from the hecto-Morgan positions given
snp_info1 <- snp_info %>%
  mutate(cM_pos = pos / 1000) %>%
  select(-pos)


# Trim the genotype matrix
s6_genos_mat <- s6_genos %>%
  select(-rs:-pos) %>%
  mutate_each(funs = funs(parse_number)) %>%
  as.matrix()

# Add rownames
row.names(s6_genos_mat) <- snp_info1$rs

# Transpose
s6_genos_mat1 <- t(s6_genos_mat)

## Remove markers with > 10% missing data
marker_missing <- is.na(s6_genos_mat1) %>%
  colMeans()

s6_genos_mat2 <- s6_genos_mat1[,marker_missing <= 0.10]

# Remove entries with > 10% missing data
entry_missing <- is.na(s6_genos_mat2) %>%
  rowMeans()

s6_genos_mat3 <- s6_genos_mat2[entry_missing <= 0.10,]

# Keep only markers that are polymorphic in BOTH the MN and ND populations
ND_lines <- str_subset(row.names(s6_genos_mat3), "^ND")
MN_lines <- setdiff(row.names(s6_genos_mat3), ND_lines)

marker_poly_ND <- s6_genos_mat3 %>% 
  subset(row.names(.) %in% ND_lines) %>%
  apply(MARGIN = 2, FUN = n_distinct, na.rm = T) %>%
  {. > 1}

marker_poly_MN <- s6_genos_mat3 %>% 
  subset(row.names(.) %in% MN_lines) %>%
  apply(MARGIN = 2, FUN = n_distinct, na.rm = T) %>%
  {. > 1}


s6_genos_mat4 <- s6_genos_mat3[,marker_poly_MN & marker_poly_ND]

# Trim the snp_info data.frame and remove unknown
snp_info2 <- snp_info1 %>% 
  filter(rs %in% colnames(s6_genos_mat4),
         chrom != "UNK")

# Remove the unknowns from the genotype matrix
s6_genos_mat5 <- s6_genos_mat4[,snp_info2$rs]


### Processing of the marker matrix for use in simulation
# Remove redundant markers
# These are characterized by having the same genotypes across all samples AND fall on the same cM position

# Split the marker information by unique position
unique_snp_list <- snp_info2 %>% 
  split(list(.$chrom, .$cM_pos))

# Remove NULL
unique_snp_list <- unique_snp_list[sapply(unique_snp_list, nrow) != 0]

# Apply a function to each group of unique SNPs
non_redundant_marker_list <- lapply(unique_snp_list, FUN = function(uniq_info) {

  # If the number of markers is 1, just return the marker
  if (nrow(uniq_info) == 1) {
    return(uniq_info)

  } else { # Otherwise look more closely

    # Extract the marker names
    marker.names <- uniq_info$rs

    M.i <- s6_genos_mat5[,marker.names, drop = FALSE]

    # Determine if the genotype calls are the same between markers
    same.genos <- identical.columns(input.matrix = M.i)

    # Choose the first index from each group
    # Gather the names of the chosen markers
    non.redundant.marker.names <- colnames(M.i)[sapply(X = same.genos, FUN = function(group) group[1])]

    # Extract marker info for those markers and return
    snp_info2 %>%
      filter(rs %in% non.redundant.marker.names)

  } })

# Collapse the list
snp_info3 <- bind_rows(non_redundant_marker_list) %>%
  arrange(chrom, cM_pos)

# Subset the genotype matrix again
s6_genos_mat6 <- s6_genos_mat5[,snp_info3$rs]

## Impute
# Set hets to missing
s6_genos_mat6[s6_genos_mat6 == 0] <- NA

# Impute using mean
s6_genos_imputed <- s6_genos_mat6 %>% 
  apply(MARGIN = 2, FUN = function(snp) {
    # Mean genotype
    mean_geno <- mean(snp, na.rm = T)
    # Round
    mean_geno <- ifelse(mean_geno < 0, -1, 1)
    snp[is.na(snp)] <- mean_geno
    return(snp) })

s6_cap_genos <- s6_genos_imputed + 1

s6_snp_info <- snp_info3

# Convert the genotypes to an array of haploid chromosomes
s6_cap_haploid <- split(s6_cap_genos, 1:nrow(s6_cap_genos), drop = FALSE) %>%
  lapply(FUN = function(ind) rbind(ind / 2, ind / 2) )

# Empty array
s6_cap_haploid_array <- array(data = NA, dim = c(2, ncol(s6_cap_genos), length(s6_cap_haploid)),
                              dimnames = list(NULL, colnames(s6_cap_genos), row.names(s6_cap_genos)))

for (k in seq_along(s6_cap_haploid)) {
  s6_cap_haploid_array[,,k] <- s6_cap_haploid[[k]]
}


# Rename
s6_cap_haploid <- s6_cap_haploid_array


### REMOVE THIS ###
# devtools::use_data(s6_cap_genos, s6_snp_info, s6_cap_haploid, overwrite = T)

Genetic Maps

Using the filtered markers as above, extract their positions on the ma_map and the close_map

# Replace the positions in the s2 and s6 snp info data.frames with those from the different genetic maps
s2_snp_info <- s2_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., ma_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))

s6_snp_info <- s6_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., ma_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))

# Now assign them positions from the Close map
s2_snp_info_close <- s2_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., close_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))

s6_snp_info_close <- s6_snp_info %>% 
  select(-cM_pos) %>% 
  left_join(., close_map2, by = c("rs", "chrom")) %>%
  filter(!is.na(cM_pos))


# Refilter the marker data for those markers
s2_cap_genos <- s2_cap_genos[,s2_snp_info1$rs]
s2_cap_haploid <- s2_cap_haploid[,s2_snp_info1$rs,]



s6_cap_genos <- s6_cap_genos[,s6_snp_info1$rs]
s6_cap_haploid <- s6_cap_haploid[,s6_snp_info1$rs,]


# Save all this data

devtools::use_data(s2_snp_info, s2_snp_info_close, s2_cap_genos, s2_cap_haploid,
                   s6_snp_info, s6_snp_info_close, s6_cap_genos, s6_cap_haploid, overwrite = T)

References



neyhartj/pbsimData documentation built on May 31, 2019, 10:36 a.m.