README.md

RhAMPseq: Analysis of rhAmpSeq data logo

R build
status

Installation

You can install the development version from GitHub with:

# install.packages(c("hexSticker", "remotes")
remotes::install_github("villegar/RhAMPseq")

Data description

Author: J.L.

Files

Example

| Locus | Haplotypes | blank__vDNAfen551A01_A01 | 160_271_303__vDNAfen551A01_A02 | 160_271_311__vDNAfen551A01_A03 | | :----------------- | :-------------------------------------------------------------------------------- | :-------------------------- | :---------------------------------- | :---------------------------------- | | rh_chr1_10143990 | 2(0.37);4(0.16);9(0.10);5(0.09);1(0.07);3(0.07);8(0.05);13(0.04);6(0.03);7(0.03); | ./.:0 | 4/2:34,24 | 2/2:71 |

where:

So, if we wanted to know the DNA sequence of this marker, we would look up haplotypes 4 and 2 from the HaplotypeAllele.fasta file.

One other thing to note that is very important. the marker name, rh_ch1_10143990 stands for rhampseq, chromosome 1, position 10143990. This position information does not indicate the position of the polymorphism detected at this marker, it is the position of the primer sequence. So when we map these markers, they will be very near, but not exactly at, these positions.

Initial analysis

Raw data

Load raw data from the 3 files previously described.

raw <- load_data(fasta = "data/HaplotypeAllele.fasta", 
                 hap_geno = "data/hap_genotype",
                 count_mat = "data/readCountMatrixFile")

Haplotypes

# Load haplotypes for mapping family
mapping_family <- read_excel_col("data/Rhampseq_populations.xlsx", "A")
mapping_family_short <- unlist(lapply(mapping_family, 
                                      function(x) gsub("__.*", "", x)))
mapping_family_short <- unique(mapping_family_short)

## Extract genotypes from the hap_genotype file
hap_geno_names <- names(raw$hap_geno)
hap_geno_names_blank <- hap_geno_names[grepl("*blank*", hap_geno_names)]
## Drop "blank" genotypes
# hap_geno_names <- hap_geno_names[!grepl("*blank*", hap_geno_names)]
## Trim 'X' from genotypes [added when importing the data]
hap_geno_names <- gsub("X*", "", hap_geno_names)
## Further trimming of genotypes: 160_271_001__vDNAlon499A03_A01 -> 160_271_001
hap_geno_names <- unlist(lapply(hap_geno_names, 
                                function(x) gsub("__.*", "", x)))

## Lookup for mapping_family genotypes
hap_geno_names_idx <- hap_geno_names %in% mapping_family_short
## Extract matching genotypes
hap_geno_names_sub <- hap_geno_names[hap_geno_names_idx]
hap_geno_sub <- raw$hap_geno[hap_geno_names_idx]

## Extract locus data
locus <- raw$hap_geno$Locus

# Start parallel backend
tictoc::tic("Parallel") # ~12.719 sec (2 CPUs); ~7.923 sec (4 CPUs)
cores <- parallel::detectCores() - 1
cpus <- ifelse(cores > 2, 2, 1)
cl <- parallel::makeCluster(cpus, setup_strategy = "sequential")
doParallel::registerDoParallel(cl)

# Load binary operator for backend
`%dopar%` <- foreach::`%dopar%`
map <- foreach::foreach(i = 1:length(hap_geno_sub), 
                        .combine = cbind) %dopar% {
                          unlist(lapply(hap_geno_sub[, i], get_geno))
                        }
parallel::stopCluster(cl) # Stop cluster
tictoc::toc()
colnames(map) <- names(hap_geno_sub)
rownames(map) <- locus
# map <- cbind(Locus = locus, map)
map_t <- t(map)
colnames(map_t) <- col(map)
rownames(map_t) <- gsub("X*", "", rownames(map_t))
write.csv(map_t, "data/pseudomap.csv")
# map_t[1:5,1:5]

Pseudo-genetic map sample (map_t)

| X | rhMAS_5GT_cons95 | rhMAS_SDI_p2_AG11_chr18_30Mb | rhMAS_SDI_p3_AGL11 | rhMAS_SDI_p4_AGL11 | rhMAS_5GT_700 | | :---------------------------------- | :----------------- | :-------------------------------- | :-------------------- | :-------------------- | :-------------- | | 160_271_303__vDNAfen551A01_A02 | NP | NP | NP | NP | NN | | 160_271_311__vDNAfen551A01_A03 | NN | NN | NP | NP | NN | | 160_271_319__vDNAfen551A01_A04 | NP | NP | NN | NN | NN | | 160_271_327__vDNAfen551A01_A05 | NP | NP | NP | NP | NN | | 160_271_335__vDNAfen551A01_A06 | NP | NP | NP | NP | NN |

Other stuff

marker_names <- names(raw$fasta)
marker_names_no_repeats <- lapply(marker_names, function(x) gsub("#.*", "", x))

length(marker_names_no_repeats) # 984030
length(unique(marker_names_no_repeats)) # 2055
sum_stats <- list() #data.frame(marker = NA, non_zero = NA)
for (loc in raw$hap_geno$Locus) {
  # repeats_idx <- which(marker_names_no_repeats == loc)
  # repeats <- raw$fasta[repeats_idx]
  # repeats_seq <- lapply(repeats, get_seq)

  count_mat_idx <- which(loc == names(raw$count_mat))
  count_mat_sub <- raw$count_mat[count_mat_idx]
  sum_stats[[loc]] <- list(idx = which(count_mat_sub > 0), count = length(unlist(count_mat_sub)))
  #sum_stats <- rbind(sum_stats, c(loc, length(which(count_mat_sub > 0))))
}
# sum_stats[1, ] <- NULL


villegar/RhAMPseq documentation built on Aug. 12, 2021, 8:47 p.m.