Analysis_multipanel_multicountry/Process_Sanger_Barcode.R

############################################################################
# This script processes data from on the Sanger spot malaria panel for Senegal,
# Colombia, and French Guiana. The data are stored in the file
# /gsap/garage-protistvector/tstraub/ampseq/Sanger/for_Aimee which was
# generated by Tim Straub on June 4th 2019 and accessed by Aimee on June 14th
# 2019 via scp
# aimeet@login.broadinstitute.org:/gsap/garage-protistvector/tstraub/ampseq/Sanger/for_Aimee/*
# ~/Documents/malaria_mltplx_ampseq/RData/
# The file containing frequencies for all three countries (Senegal, Colombia,
# and French Guiana) combined, All_Sanger_Amplicon_Haplotype_Frqs.Rdata, was
# duplicated and copied from malaria_mltplx_ampseq/RData/ to
# paneljudge/data_raw for use as example data in the R package paneljudge
#
# ************************ IMPORTANT ***************************
# In the data on the sanger barcode provided by Tim June 2019, alleles per
# marker were not ordered by their frequencies. After this script, if
# frequencies are reordered per marker (this is a compute
# requirement of the HMM of [1], not a statistically requirement), numbered
# alleles per marker may refer to a different alleles across
# countries, e.g. "Allele.1" of the first marker in Senegal and "Allele.1" of
# the first marker in Colombia may correspond to different nucleotides.
# This is not a problem since data on each country should be
# simulated and analysed separately using the HMM of [1]. It would be a problem
# if we estimated relatedness between parasite from different populations,
# however, e.g. using hmmIBD.
# **************************************************************
############################################################################
rm(list = ls())
require(plyr) # for dlply

if(exists("markers")){
  stop("Before running this script, ensure paneljudge isn't attached
        (e.g. via devtools::load_all())), otherwise problems with duplicate
        names markers and frequencies")
}

# Set working directory to this file location and load example raw input data
# do not devtools::load_all
load('All_Sanger_Amplicon_Haplotype_Frqs.RData')

# Convert Start and Stop columns to numeric
# This problem was not encountered in Process_GTseq.R
amp_frqs$Start <- as.numeric(as.character(amp_frqs$Start))
amp_frqs$Stop <- as.numeric(as.character(amp_frqs$Start))

# Convert factor columns. Do not use apply (e.g. apply(amp_frqs, 2, is.factor))
# since apply converts data.frame to matrix and surreptitiously convert factors
# to characters s.t. all(apply(amp_frqs, 2, is.factor)) returns FALSE even when
# some columns are factors
factor_inds <- sapply(amp_frqs, is.factor)
amp_frqs[factor_inds] <- lapply(amp_frqs[factor_inds], as.character)

# Rename "Genotype.1" by "Allele.1" etc.
colnames(amp_frqs) <- gsub("Genotype", "Allele", colnames(amp_frqs))

# Rename countries (no Mali in All_Sanger_Amplicon_Haplotype_Frqs.RData as pred-dates amp_frqs.rds)
amp_frqs$Country <- gsub("Columbia", "Colombia", amp_frqs$Country)
amp_frqs$Country <- gsub("Senegal", "Senegal", amp_frqs$Country)
amp_frqs$Country <- gsub("FrenchGuiana", "French Guiana", amp_frqs$Country)

# Separate amplicon data (marker regions) from country specific data (allele frequencies)
markers = amp_frqs[!duplicated(amp_frqs$Amplicon_name), c('Amplicon_name','Chr','Start','Stop')]
rownames(markers) = markers$Amplicon_name

# Add marker length
markers$length <- markers$Stop - markers$Start

# Add the middle position of each marker region
markers$pos = apply(markers[,c('Start','Stop')], 1, function(x)median(as.numeric(x)))

# Add numeric chromosome
markers$chrom = as.numeric(gsub('_v3', '', gsub('Pf3D7_', '', markers$Chr)))

# Manually check all chromosomes are represented: yes
all(sort(unique(markers$chrom)) == 1:14)

# Sort markers by order of increasing chromosome and position
reordered_data_list = list()
for(chr in sort(unique(markers$chrom))){
  x = markers$pos[chr == markers$chrom] # Extract positions per chromosome
  inds = sort.int(x, index.return = T)$ix # Sort positions
  reordered_data_list[[chr]] = markers[chr == markers$chrom, ][inds, ]
}

# Check re-ordered data: returns TRUE if ordered correctly
markers = do.call(rbind, reordered_data_list)
all(sapply(unique(markers$chrom), function(chr){
  x = markers$pos[chr == markers$chrom]
  all(x == cummax(x))
}))

# Compute distances between middle positions
markers$distances <- c(diff(markers$pos), Inf)
pos_change_chrom <- 1 + which(diff(markers$chrom) != 0) # chromosome switch-points
markers$distances[pos_change_chrom-1] <- Inf

# Extract allele names
Alleles = names(amp_frqs)[grepl('Allele', names(amp_frqs))]

# Separate frequencies by country and check alleles are ordered by frequency per amplicon
ordered_markers = rownames(markers)
frequencies = plyr::dlply(amp_frqs, 'Country', function(x){
  rownames(x) = x$Amplicon_name
  y <- x[ordered_markers, Alleles] # Order amplicons according to chrom and pos
  # Check order of alleles per amplicon and re-order if necessary
  if(!all(apply(y[, Alleles], 1, function(z){all(z == cummin(z))}))){
    writeLines('Reordering frequencies')
    y <- t(apply(y[, Alleles], 1, function(f) sort(as.numeric(f), decreasing = T))) # Order frequencies as numeric
  }
  return(as.matrix(y))
})

# Rename s.t. not confused with example markers and
# frequencies processed by Process_GTseq.R
markers_sanger_barcode <- markers
frequencies_sanger_barcode <- frequencies

# Save example data
save(markers_sanger_barcode, file = 'markers_sanger_barcode.RData')
save(frequencies_sanger_barcode, file = 'frequencies_sanger_barcode.RData')
artaylor85/paneljudge documentation built on March 6, 2023, 1:50 a.m.