data_raw/Process_GTseq.R

############################################################################
# This script processes data from
# /gsap/garage-protistvector/tstraub/ampseq/gtseq_v1/amp_frqs.rds generated by
# Tim Straub 30th Jan 2020 and accessed by Aimee on Feb 3rd 2020 via scp
# aimeet@login.broadinstitute.org:/gsap/garage-protistvector/tstraub/ampseq/gtseq_v1/amp_frqs.rds
# ~/Documents/
#
# The example raw input data include marker positions and allele frequency
# estimates (see the glossary of vignette('paneljudge_example') to better
# understand terms. In this example the markers are microhaplotypes that are
# typed using amplicons, hence "amp_freq". Note that microhaplotypes are not
# point mutations: they are regions of the genome that contain two or more
# single nucleotide polymorphisms (SNPs), and their alleles are sequences of
# nucleotide base pairs. For example, a micohaplotype spanning two biallelic
# SNPs whose nucleotides are either A or T could have up to four alleles: AA,
# AT, TA and TT, whose frequencies among monoclonal samples could be calculated
# by simply counting observed alleles, e.g. frequency of AA
#
# fAA = n(AA) / [n(AA) + n(AT) + n(TA) + n(TT)],
#
# where n(AA) is the count of AA observations etc. To simulate microhaplotype
# data under the HMM model [1], distances between markers are based on their
# middle positions.
#
# Tim's notes (copied from slack): This file contains Senegal, Columbia, French
# Guiana, and Mali data. For some amplicons, there was no variation in some
# countries, so I put in Genotype.1 as a value of 1. A handful of amplicons had
# no useable SNP data and they were excluded from this file (PF3D7_1008700,
# PF3D7_1408100_1, PfHRP2_1, PfHRP3_1, and PvDHFR [vivax, not falciparum]). To
# make it easier to separate out the CTSA amplicons (CSP, TRAP, SERA2, and
# AMA1) from the regular gt-seq panel, those have a ā€œzā€ pre-pended to their
# name and appear on the bottom of the data set (e.g. zTRAP). The genotypes
# have been ordered based on decreasing allele frequency, just in case that
# makes a difference.
#
# ******************************* IMPORTANT ********************************
# In the following we refer to Tim's Genotype.1 as Allele.1 etc. As mentioned
# above, alleles per marker are ordered by their frequencies. More
# specifically, non-zero frequencies must precede zero frequencies
# (additionally in this example analysis, non-zero frequencies monotonically
# decrease). Non-zero frequencies preceding zero frequencies is a compute
# requirement of the HMM of [1], not a statistical requirement (statistically,
# alleles under the HMM of [1] are modelled as realisations of unordered
# categorgical random variables). As such, the names of the alleles per marker
# (Allele.1, Allele.2, etc.) likely correspond to different microhaplotype
# nucleotide sequences in different countries, e.g. the most common allele in
# Sengal may differ to that in Colombia. This is not a problem since data on
# each country should be simulated and analysed separately using the HMM of
# [1]. It would be a problem if we estimated relatedness between parasite from
# different populations, however, e.g. using hmmIBD.
# **************************************************************************
############################################################################
rm(list = ls())
require(plyr) # for dlply

if(exists("markers")){
  stop("Before running this script, ensure paneljudge isn't attached
       (e.g. attached via devtools::load_all())), otherwise problems
       with duplicate names markers and frequencies")
}

# Set working directory to this file location and load example raw input data
amp_frqs = readRDS('amp_frqs.rds')

# Frequencies do not sum to one exactly
freq_sum_range <- range(rowSums(amp_frqs[,grepl("Allele.", colnames(amp_frqs))]))
max(abs(freq_sum_range-1)) # Maximum deviation

# Convert factor columns. Do not use apply (e.g. apply(amp_frqs, 2, is.factor))
# since apply converts data.frame to matrix and surreptitiously convert factors
# to characters s.t. all(apply(amp_frqs, 2, is.factor)) returns FALSE even when
# some columns are factors
factor_inds <- sapply(amp_frqs, is.factor)
amp_frqs[factor_inds] <- lapply(amp_frqs[factor_inds], as.character)

# Rename "Genotype.1" by "Allele.1" etc.
colnames(amp_frqs) <- gsub("Genotype", "Allele", colnames(amp_frqs))

# Rename countries
amp_frqs$Country <- gsub("columbia", "Colombia", amp_frqs$Country)
amp_frqs$Country <- gsub("mali", "Mali", amp_frqs$Country)
amp_frqs$Country <- gsub("senegal", "Senegal", amp_frqs$Country)
amp_frqs$Country <- gsub("frenchguiana", "French Guiana", amp_frqs$Country)

# Separate amplicon data (marker regions) from country specific data (allele frequencies)
markers = amp_frqs[!duplicated(amp_frqs$Amplicon_name), c('Amplicon_name','Chr','Start','Stop')]
rownames(markers) = markers$Amplicon_name

# Add marker length
markers$length <- markers$Stop - markers$Start

# Add the middle position of each marker region
markers$pos = apply(markers[,c('Start','Stop')], 1, function(x)median(as.numeric(x)))

# Add numeric chromosome
markers$chrom = as.numeric(gsub('_v3', '', gsub('Pf3D7_', '', markers$Chr)))

# Manually check all chromosomes are represented: yes
all(sort(unique(markers$chrom)) == 1:14)

# Sort markers by order of increasing chromosome and position
reordered_data_list = list()
for(chr in sort(unique(markers$chrom))){
  x = markers$pos[chr == markers$chrom] # Extract positions per chromosome
  inds = sort.int(x, index.return = T)$ix # Sort positions
  reordered_data_list[[chr]] = markers[chr == markers$chrom, ][inds, ]
}

# Check re-ordered data: returns TRUE if ordered correctly
markers = do.call(rbind, reordered_data_list)
all(sapply(split(markers$pos, markers$chrom),function(x) all(x == cummax(x))))

# Compute distances between middle positions
markers$distances <- c(diff(markers$pos), Inf)
pos_change_chrom <- 1 + which(diff(markers$chrom) != 0) # chromosome switch-points
markers$distances[pos_change_chrom-1] <- Inf

# Extract allele names
Alleles = names(amp_frqs)[grepl('Allele', names(amp_frqs))]

# Separate frequencies by country and check alleles are ordered by frequency per amplicon
ordered_markers = rownames(markers)
frequencies = plyr::dlply(amp_frqs, 'Country', function(x){
  rownames(x) = x$Amplicon_name
  y <- x[ordered_markers, Alleles] # Order amplicons according to chrom and pos
  # Check order of alleles per amplicon and re-order if necessary
  if(!all(apply(y[, Alleles], 1, function(z){all(z == cummin(z))}))){
    writeLines('Reordering frequencies')
    y <- t(apply(y[, Alleles], 1, function(f) sort(as.numeric(f), decreasing = T))) # Order frequencies as numeric
  }
  return(as.matrix(y))
})

# Save example data
save(markers, file = '../data/markers.RData')
save(frequencies, file = '../data/frequencies.RData')
artaylor85/paneljudge documentation built on March 6, 2023, 1:50 a.m.