working_dir <- '/home/phillipl/projects/HaplotypeTracker/code/HaplotypeTracker'
setwd(working_dir)
library(HaplotypeTracker)
input_aa_file <- '/home/phillipl/projects/toolmania/data/CAP177prot/all_timesv3_prot.fasta'
input_dna_file <- '/home/phillipl/projects/toolmania/data/CAP177/all_timesv3.fasta'
output_dir <- '/home/phillipl/projects/HaplotypeTracker/results'
xth_most_freq_seq <- 1
starting_time <- "4250V3"
dataset_order <- c("2000V3", "2020V3", "2030V3", "2040V3", "2050V3", "2060V3", 
"3080V3", "3090V3", "3100V3", "3110V3", "3120V3", "3130V3", "3140V3", 
"3150V3", "3160V3", "4170V3", "4180V3", "4190V3", "4200V3", "4210V3", 
"4220V3", "4230V3", "4240V3", "4250V3")
opts_chunk$set(echo=FALSE)
library(knitr)
library(devtools)
working_dir <- '/home/phillipl/projects/HaplotypeTracker/code/HaplotypeTracker'
setwd(working_dir)
document()
build()
install()
load_all()
knit2html(file.path(working_dir, 'inst/ideas.Rmd'))
browseURL(paste('file://', file.path(getwd(),'ideas.html'), sep=''))

Input Data

seq_dna_data <- readDNAStringSet(input_dna_file)
seq_aa_data <- readAAStringSet(input_aa_file)
seq_data <- seq_aa_data
print(seq_data)
print(get_unique_points_of("time", seq_data, sep = "_", indx = 2))
dput(get_unique_points_of("time", seq_data, sep = "_", indx = 2))

Find starting sequence

filtered_data <- get_data_of("timepoint", seq_data, starting_time)
tab <- table(filtered_data)
hap <- sort(tab, decreasing = T)[xth_most_freq_seq]
prev_seq <- AAString(names(hap))
best_score_possible <- score(pairwiseAlignment(prev_seq, prev_seq, substitutionMatrix =
                               "BLOSUM50"))
haplotypes <- data.frame(selected_seq = names(hap),
                         timepoint = starting_time,
                         occurances = as.numeric(hap),
                         timepoint_size = length(filtered_data),
                         score = as.numeric(NA),
                         best_score_possible = best_score_possible,
                         ss_with_first_seq = best_score_possible)

print(table(tab))
print(prev_seq)
print(best_score_possible)

Determine direction to navigate through datasets

x <- match(starting_time, dataset_order)
if (x > 1 & x < length(dataset_order)){
  direction <- 'both'
} else if (x == 1) {
  direction <- 'forward'
} else if (x == length(dataset_order)){
  direction <- 'backward'
} else {
  stop('something went wrong with the direction detection')
}
print(direction)

Backwards tracking

timepoints <- rev(dataset_order)
timepoints <- setdiff(timepoints, starting_time)
for (timepoint in timepoints){
  filtered_data <- get_data_of("timepoint", seq_data, timepoint)
  alignments <- pairwiseAlignment(gsub('-|~', '', unique(filtered_data)), 
                                  gsub('-|~', '', prev_seq),
                                  substitutionMatrix = "BLOSUM50")
#  for (i in seq_along(unique(filtered_data))){
#    print(i)
#    print(pairwiseAlignment(gsub('-|~', '', unique(filtered_data)[i]), 
#                                  gsub('-|~', '', prev_seq),
#                                  substitutionMatrix = "BLOSUM50"))
#  }
  scores <- score(alignments)
  indx <- which(scores == max(scores))

  tab <- table(filtered_data)
  hap <- tab[match(as.character(pattern(alignments[indx])), attr(tab, 'dimnames')$filtered_data)]
  hap <- hap[hap == max(hap)]
  hap <- hap[1]

  prev_seq <- AAString(names(hap))
  best_score_possible <- score(pairwiseAlignment(prev_seq, prev_seq, 
                                                 substitutionMatrix = "BLOSUM50"))
  ss_with_first_seq <- score(pairwiseAlignment(prev_seq, AAString(haplotypes[1,1]), 
                                                 substitutionMatrix = "BLOSUM50"))

  haplotypes_row <- data.frame(selected_seq = names(hap),
                               timepoint = timepoint,
                               occurances = as.numeric(hap),
                               timepoint_size = length(filtered_data),
                               score = max(scores[indx]),
                               best_score_possible = best_score_possible,
                               ss_with_first_seq = ss_with_first_seq)
  haplotypes <- rbind(haplotypes, haplotypes_row)
}
kable(haplotypes[,2:ncol(haplotypes)])
print(haplotypes[,1])

Occurances by timepoint

p <- ggplot(haplotypes, aes(x = timepoint, y = occurances)) +
     geom_bar(stat = 'identity') + 
     theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p)

Normalized Occurenaces by timepoint

p <- ggplot(haplotypes, aes(x = timepoint, y = occurances/timepoint_size)) +
     geom_bar(stat = 'identity') + 
     theme(axis.text.x = element_text(angle = 90, hjust = 1))
print(p)

Phylogenetics of the chosen sequences

seqs <- AAStringSet(haplotypes[,1])
names(seqs) <- haplotypes[,2]
distmat <- stringDist(seqs)
plot(bionj(distmat))


philliplab/HaplotypeTracker documentation built on May 25, 2019, 5:05 a.m.