Nothing
#R Script written by Bettina Budeus, rokkaku-fight@gmx.de
#from 2012-06-19 to 2012--
#analysis of correlation of HLA and ancestral sequence to look for escape mutations or reversions
#!!!! BEWARE!!!! You have to name the fasta sequences so that phylo can read and write them correctly. It will replace any space charakter with _ and ignore colon!!!!!!
#search for "change" for points where changes are possible
#please check before executing
#libs
library(Biostrings)
library(plyr)
library(plotrix)
#library(ape)
library(phangorn)
library(qvalue)
#filenames, change as required...
files.dir <- "./"
#all patients (all clades w/o consensus!)
#files.sequences <- "Anti-D_m_woX_same_length.fasta"
files.sequences <- "Anti-D_test_hlong.fasta"
#change: p-values will only be calculated for allels with a number of patients above threshold
patnum.threshold <- 0
#change: p-values will only be calculated from contingency tables with row sums above threshold or zero
rowsum.threshold <- -1
#change: threshold level for significance in fisher test search
thr.sig.fi <- 0.05
#some functions needed in the following
#function to create the same output as the old readFASTA from new read.AAStringSet
create_correct_FASTA_input <- function(sequences){
liste <- as.character(sequences, use.names=TRUE)
result_list <- c()
for (i in 1:length(liste)){
seq=liste[i][[1]]
name = names(liste[i])
entry <- list (name, seq)
names(entry) <- c("desc", "seq")
result_list[[i]] = entry
}
return (result_list)
}
ht_get_input_file_known_sequences <- structure(function(
###read aa sequences from fasta file - they have to be aligned in order to work!
path_to_file
### a FASTA file with sequence data. For reference please look in example file.
){
### the sequences from the FASTA file.
sequences <- readAAStringSet(path_to_file)
#c_sequences <- create_correct_FASTA_input(sequences)
.GlobalEnv[["sequences"]] <- sequences
},ex=function(){
ht_get_input_file_known_sequences("SeqFeatR/extdata/Example_epitopes.csv")
})
#read tree file
ht_get_input_file_tree <- structure(function(
### set the input file (tree file of the given sequences).
path_to_file
### a nexus file with tree data. For reference please look in example file.
){
### the tree.
.GlobalEnv[["tree"]] <- read.nexus(path_to_file)
},ex=function(){
ht_wo_get_input_file_tree("SeqFeatR/extdata/Example_tree.nh")
})
#removes "0"s from a string
remove.zeroes <- function(s){
if(substr(s, 1, 1) == "0")
a <- remove.zeroes(substr(s, 2, nchar(s)))
else
a <- s
}
#get indices of non-zero-values
find.neqzero <- function(array){
r <- c()
for(i in 1:length(array)){
if(array[i]!=0)
r <- c(r,i)
}
r
}
#calculates from stupid phydat the original sequences without this patterns thingy
#@input a phydat file
#@return a list of sequences, each character one column
get_original_sequence <- function(phydat){
weight <- attributes(phydat)$weight
index <- attributes(phydat)$index
levels <- attributes(phydat)$levels
names <- attributes(phydat)$names
number_of_sequences <- length(names)
number_of_letters <- length(index)
new_seqs <- array(rep(0,number_of_sequences*number_of_letters), c(number_of_sequences,number_of_letters))
rownames(new_seqs) <- names
for (entry in 1:length(index)){
for (sequence in 1:number_of_sequences){
sequence_pattern <- phydat[sequence][[1]]
new_seqs[sequence,entry] <- sequence_pattern[index[entry]]
}
}
for (row in 1:nrow(new_seqs)){
for (col in 1:ncol(new_seqs)){
AAnr <- as.numeric(new_seqs[row,col][[1]])
new_seqs[row,col] <- levels[AAnr]
}
}
return (new_seqs)
}
#calculates from stupid phydat-phangorn the original sequences without this patterns thingy (differnd because of matrix not single array)
#@input a phydat-phangorn file, such as from ancestral sequence reconstruction
#@return a list of sequences, each character one column
get_original_sequence_ph <- function(phydat){
weight <- attributes(phydat)$weight
index <- attributes(phydat)$index
levels <- attributes(phydat)$levels
names <- attributes(phydat)$names
number_of_sequences <- length(names)
number_of_letters <- length(index)
new_seqs <- array(rep(0,number_of_sequences*number_of_letters), c(number_of_sequences,number_of_letters))
rownames(new_seqs) <- names
for (sequence in 1:number_of_sequences){
for (entry in 1:length(index)){
sequence_pattern <- subset(phydat, sequence)[[1]]
sequence_pattern_names <- rownames(sequence_pattern)
if (is.null(sequence_pattern_names)){
subset <- sequence_pattern[index[entry],]
max <- max(subset)
which <- (which(max == subset))
letter <- levels[which]
if (length(letter) > 1){
new_seqs[sequence,entry] <- letter[1]
}else{
new_seqs[sequence,entry] <- letter
}
}
else {
new_seqs[sequence,entry] <- sequence_pattern_names[index[entry]]
}
}
}
return (new_seqs)
}
#function do make char array from the sequences
make_char_array <- function(){
sequences <- .GlobalEnv[["sequences"]]
min_FASTA_length <- .GlobalEnv[["min_FASTA_length"]]
char_array <- array(rep(0,min_FASTA_length*length(sequences)),c(length(sequences),min_FASTA_length))
for (sequence in 1:length(sequences)){
for (i in 1:min_FASTA_length){
#letter <- (substr(sequences[[sequence]]$seq, i, i))
letter <- subseq(sequences[sequence], i, i)
char_array[sequence,i] <- as.character(letter)
}
}
return (char_array)
}
#function to make list of HLA-types for each sequence
make_HLA_types_array <- function(){
A11 <- .GlobalEnv[["A11"]]
A12 <- .GlobalEnv[["A12"]]
A21 <- .GlobalEnv[["A21"]]
A22 <- .GlobalEnv[["A22"]]
B11 <- .GlobalEnv[["B11"]]
B12 <- .GlobalEnv[["B12"]]
B21 <- .GlobalEnv[["B21"]]
B22 <- .GlobalEnv[["B22"]]
sequences <- .GlobalEnv[["sequences"]]
HLA_array <- array(rep(0,length(sequences)*4),c(length(sequences),4))
for (sequence in 1:length(sequences)){
a1 <- paste("A", remove.zeroes(substr(names(sequences)[sequence], A11, A12)), sep="")
a2 <- paste("A", remove.zeroes(substr(names(sequences)[sequence], A21, A22)), sep="")
b1 <- paste("B", remove.zeroes(substr(names(sequences)[sequence], B11, B12)), sep="")
b2 <- paste("B", remove.zeroes(substr(names(sequences)[sequence], B21, B22)), sep="")
HLA_array[sequence,1] <- a1
HLA_array[sequence,2] <- a2
HLA_array[sequence,3] <- b1
HLA_array[sequence,4] <- b2
}
return (HLA_array)
}
#function to return the potential ancestral AA for given position. It searches the tip.labels for the given sequence and calucalts
get_ancestral_for_this_position <- function(sequence, position){
sequences <- .GlobalEnv[["sequences"]]
tree <- .GlobalEnv[["tree"]]
tree.tip.label <- .GlobalEnv[["tree.tip.label"]]
a_sequences <- .GlobalEnv[["a_sequences"]]
tip.N <- which(tree.tip.label==names(sequence))
ancestor <- Ancestors(tree, tip.N, type=c("parent"))
ancestral_letter <- a_sequences[ancestor, position]
return (ancestral_letter)
}
#function to create table with all positions for better lapply use
make_array <- function(){
min_FASTA_length <- .GlobalEnv[["min_FASTA_length"]]
anc_new_array <- c()
for (i in 1:(min_FASTA_length)){
anc_new_array <- c(anc_new_array, i)
}
return (anc_new_array)
}
#function to find the minmal number of AA pairs at a certain allel to garanty significance
get.threshold.f.fisher.pair <- function(number.sequences, number.pairs, allels.count){
thr.sig.fi <- .GlobalEnv[["thr.sig.fi"]]
patnum.threshold <- .GlobalEnv[["patnum.threshold"]]
length_allels <- .GlobalEnv[["length_allels"]]
allels <- .GlobalEnv[["allels"]]
threshold <- c()
x <- 0
for (allel in 1:length(allels)){
if (allels.count[allel] > patnum.threshold){
min_threshold <- 0
for (x in 0:allels.count[allel]){
y <- allels.count[allel]
l <- sum(allels.count)
for (z in seq(x,(l-y))){
#cat (x,y-x,z-x,l-y-z+x,"\n")
small.table = matrix(c(x,y-x,z-x,l-y-z+x),nrow=2)
if(sum(small.table[1,])*sum(small.table[2,])!=0 & (sum(small.table[1,])<=rowsum.threshold | sum(small.table[2,])<=rowsum.threshold))
next
#calculate fisher's exact test for each pair (allel, acid)
test.result <- fisher.test(small.table)
if (test.result$p.value < thr.sig.fi){
threshold <- rbind(threshold, c(x,y-x,z-x,l-y-z+x,test.result$p.value))
}
}
}
}
}
threshold <- subset(threshold, !duplicated(threshold))
#print (threshold)
#miracle <- ocure
return (threshold)
}
#calculates the length of result list
calculate_length <- function(list){
counter <- 0
for (i in 1:length(list)){
counter <- counter + nrow(list[[i]])
}
return (counter)
}
#create mini_table for calculating all relevant numbers for one position pair i,j
#@result: a table with all relevant data aka the sum for all occurings
create_mini_table_tr <- function(i){
sequences <- .GlobalEnv[["sequences"]]
patnum.threshold <- .GlobalEnv[["patnum.threshold"]]
char_array <- .GlobalEnv[["char_array"]]
HLA_array <- .GlobalEnv[["HLA_array"]]
length_allels <- .GlobalEnv[["length_allels"]]
allels <- .GlobalEnv[["allels"]]
last_line <- c("sum")
mini_table <- array(rep(0, (length_allels+2)),c(length_allels+2), list(c("Anc_New_P",allels,"sum")))
mini_table <- rbind(mini_table, c(rep(0, (length_allels+2))))
for (sequence in 1:length(sequences)){
a1 <- HLA_array [sequence,1]
a2 <- HLA_array [sequence,2]
b1 <- HLA_array [sequence,3]
b2 <- HLA_array [sequence,4]
l1 <- char_array[sequence,i]
l2 <- get_ancestral_for_this_position(sequence, i)
line <- paste(l1,l2,sep = "")
pos.a1 = which(allels == a1)+1
pos.a2 = which(allels == a2)+1
pos.b1 = which(allels == b1)+1
pos.b2 = which(allels == b2)+1
pos.aa = which(mini_table[,1] == line)
if (!length(pos.aa)){
mini_table <- rbind(mini_table, c(0))
rows = nrow(mini_table)
mini_table[rows,1] = line
mini_table[rows,pos.a1] = 1
mini_table[rows,pos.a2] = 1
mini_table[rows,pos.b1] = 1
mini_table[rows,pos.b2] = 1
mini_table[rows,ncol(mini_table)] = sum(as.numeric(mini_table[rows,2:(ncol(mini_table)-1)]))
}
else{
mini_table[pos.aa,pos.a1] = as.numeric(mini_table[pos.aa,pos.a1]) + 1
mini_table[pos.aa,pos.a2] = as.numeric(mini_table[pos.aa,pos.a2]) + 1
mini_table[pos.aa,pos.b1] = as.numeric(mini_table[pos.aa,pos.b1]) + 1
mini_table[pos.aa,pos.b2] = as.numeric(mini_table[pos.aa,pos.b2]) + 1
mini_table[pos.aa,ncol(mini_table)] = sum(as.numeric(mini_table[pos.aa,2:(ncol(mini_table)-1)]))
}
}
#print (mini_table)
for (col in 2:ncol(mini_table)){
last_line <- c(last_line, sum(as.numeric(mini_table[,col])))
}
mini_table <- rbind(mini_table, last_line)
mini_table_r <- mini_table[,(which(as.numeric(mini_table[nrow(mini_table),2:ncol(mini_table)])>patnum.threshold)+1)]
mini_table_r <- cbind (mini_table[,1],mini_table_r)
mini_table_r <- mini_table_r[3:nrow(mini_table_r),]
print (mini_table)
return (mini_table_r)
}
#for controll and test purpose only!!!!
#make_fisher_test <- function(a,b,c,d){
# small.table = matrix(c(a,b-a,c-a,d-b-c+a),nrow=2)
# #print (small.table)
# if(sum(small.table[1,])*sum(small.table[2,])!=0 & (sum(small.table[1,])<=rowsum.threshold | sum(small.table[2,])<=rowsum.threshold))
# next
# if(small.table[1,1]*small.table[2,2] < small.table[1,2]*small.table[2,1]){
# small.table[2,1] <- small.table[2,1]+1
# small.table[1,2] <- small.table[1,2]+1
# }
# else if(small.table[1,1]*small.table[2,2] > small.table[1,2]*small.table[2,1]){
# small.table[1,1] <- small.table[1,1]+1
# small.table[2,2] <- small.table[2,2]+1
# }
# #calculate fisher's exact test for each pair (allel, acid)
# test.result <- fisher.test(small.table)
# return (test.result$p.value)
#}
#main function to call:
# for each position pair:
# generate mini_table
# check if entrys in mini_table are inside threshold for significant F Test
mai <- function(){
anc_new_array <- .GlobalEnv[["anc_new_array"]]
length_allels <- .GlobalEnv[["length_allels"]]
result <- lapply(anc_new_array, FUN=inside_main_tr)
result_wo_zero <- result[!sapply(result, is.null)]
length_of_all <- calculate_length(result_wo_zero)
result_matrix <- matrix(rep(0,length_of_all*4),ncol = 4,dimnames = list(NULL, c("allel", "Anc_New", "First Position", "p-value")))
counter <- 1
for (i in 1:length(result_wo_zero)){
for (j in 1:nrow(result_wo_zero[[i]])){
result_matrix[counter,] <- result_wo_zero[[i]][j,]
counter <- counter+1
}
}
return (result_matrix[order(result_matrix[,1]),])
}
#inside main function with use of apply
inside_main_tr <- function(col){
threshold <- .GlobalEnv[["threshold"]]
i = col[1]
result <- c()
#cat ("col", i, "\n")
mini_table <- create_mini_table_tr(i)
#print (mini_table)
for (col in 2:(ncol(mini_table)-1)){
for (row in 1:(nrow(mini_table)-1)){
a <- as.numeric(mini_table[row,col])
b <- as.numeric(mini_table[nrow(mini_table),col])
c <- as.numeric(mini_table[row,ncol(mini_table)])
d <- as.numeric(mini_table[nrow(mini_table),ncol(mini_table)])
#cat (a,b-a,c-a,d-b-c+a,"\n")
#print (threshold)
is_in <- threshold[which(threshold[,1]==a & threshold[,2]==(b-a) & threshold[,3]==(c-a) & threshold[,4]==(d-c-b+a)),]
#print (is_in)
if (length(is_in)>0){
p_value <- is_in[5]
#print (mini_table)
#print (col)
if (a == 0){
result <- rbind(result,c(colnames(mini_table)[col],mini_table[row,1],i,-(p_value)))
}else{
line <- c(colnames(mini_table)[col],mini_table[row,1],i,p_value)
#print (line)
result <- rbind(result,line)
#print (result)
}
}
}
#print (result)
}
return (result)
}
create_ancestral_sequences <- function(path_to_file_s){
sequences_for_tree <- read.phyDat(path_to_file_s, type="AA", format="fasta")
tree <- .GlobalEnv[["tree"]]
print (tree)
print (names(sequences_for_tree))
fit = pml(tree, sequences_for_tree)
#anc.ml = ancestral.pml(fit, type = "ml")
anc.acctran = ancestral.pars(tree, sequences_for_tree, "ACCTRAN")
#anc.mpr = ancestral.pars(tree, sequences_for_tree, "MPR")
return (anc.acctran)
}
#get_length_of_nested_list <- function(nested_list){
# leng <- 0
# for (i in 1: length(nested_list)){
# leng <- (leng + nrow(nested_list[[i]]))
# }
# return (leng)
#}
#___________________________________________________________
comparewithancestral <- structure(function(#Find possible escape mutations
### Takes a tree and a set of sequences and searches for possible escape mutations
##details<<
##seealso<<
##note<<
path_to_file_sequence_alignment = NULL,
### a FASTA file with sequence data. Homzygot patiens have to have a 00 instead of the HLA typeFor reference please look in
### example file.
path_to_file_nexus_tree = NULL,
### a tree file of the sequences. For reference please look in example file.
save_name_csv,
### the file name of the result file in csv format
patnum_threshold = 1,
### the minimum number of patients of one HLA type to consider in the calculation.
significance_level = 0.05,
### p-value threshold below which the results from fishers exact test should be added to output.
A11,
### the position of the start of the first HLA A Allel in the description block of the FASTA file.
A12,
### the position of the end of the first HLA A Allel in the description block of the FASTA file.
A21,
### the position of the start of the second HLA A Allel in the description block of the FASTA file.
A22,
### the position of the end of the second HLA A Allel in the description block of the FASTA file.
B11,
### the position of the start of the first HLA B Allel in the description block of the FASTA file.
B12,
### the position of the end of the first HLA B Allel in the description block of the FASTA file.
B21,
### the position of the start of the second HLA B Allel in the description block of the FASTA file.
B22
### the position of the end of the second HLA B Allel in the description block of the FASTA file.
){
HLA_and_tree_inner(path_to_file_sequence_alignment, path_to_file_nexus_tree, save_name_csv, patnum_threshold, significance_level, A11, A12, A21, A22, B11, B12, B21, B22)
},ex=function(){
ex <- system.file("extdata", "Example.fasta", package="SeqFeatR")
ep <- system.file("extdata", "Example_tree.nh", package="SeqFeatR")
hlaTree(ex,
ep,
"ancestral_analysis_results.csv",
1,
0.05,
22,
23,
25,
26,
29,
30,
32,
33)
})
HLA_and_tree_inner <- function(path_to_file_s = NULL, path_to_file_t = NULL, save_name_csv, patnum.threshol = 1, thr.sig.f = 0.05, A11, A12, A21, A22, B11, B12, B21, B22){
#read aa sequences from fasta file - they have to be aligned in order to work!
ht_get_input_file_known_sequences(path_to_file_s)
ht_get_input_file_tree(path_to_file_t)
sequences <- .GlobalEnv[["sequences"]]
tree <- .GlobalEnv[["tree"]]
.GlobalEnv[["thr.sig.fi"]] <- thr.sig.f
.GlobalEnv[["patnum.threshold"]] <- patnum.threshol
.GlobalEnv[["A11"]] <- A11
.GlobalEnv[["A12"]] <- A12
.GlobalEnv[["A21"]] <- A21
.GlobalEnv[["A22"]] <- A22
.GlobalEnv[["B11"]] <- B11
.GlobalEnv[["B12"]] <- B12
.GlobalEnv[["B21"]] <- B21
.GlobalEnv[["B22"]] <- B22
min_FASTA_length <- min(width(sequences))
ancestral_sequences <- create_ancestral_sequences(path_to_file_s)
#ancestral_sequences <- load(paste(files.dir, "tree_data.RData",sep=""))
a_sequences <- get_original_sequence_ph(ancestral_sequences)
.GlobalEnv[["a_sequences"]] <- a_sequences
#a_sequences <- read.csv2(paste(files.dir, "tree.csv",sep=""))
#list of the tipnames in correct order (hopefully) to get the correct tip number
tree.tip.label <- tree$tip.label
.GlobalEnv[["tree.tip.label"]] <- tree.tip.label
sequences.count <- length(sequences)
.GlobalEnv[["min_FASTA_length"]] <- min_FASTA_length
#find out all allels that occur in given sequences
allels.all <- c()
allels.count <- c()
new_sequence_names <- c()
for(i in 1:length(sequences)){
#the used values depend on fasta file; they possibly have to be changed - the same holds for the block some lines below
#allels.all holds all allels from the FASTA files, even duplicates
#allels holds no duplicates and none which are only a letter, without number (if the type is 00)
#allels.count counts how often the certain alles is there
f <- paste("A", remove.zeroes(substr(names(sequences)[i], A11, A12)), sep="")
s <- paste("A", remove.zeroes(substr(names(sequences)[i], A21, A22)), sep="")
t <- paste("B", remove.zeroes(substr(names(sequences)[i], B11, B12)), sep="")
o <- paste("B", remove.zeroes(substr(names(sequences)[i], B21, B22)), sep="")
allels.all <- c(allels.all, f, s, t, o)
new_sequence_names <- c(new_sequence_names, paste(f, s, t, o))
}
allels <- sort(unique(allels.all[allels.all!="A" & allels.all!="B"]))
.GlobalEnv[["allels"]] <- allels
for(i in 1:length(allels)){
allels.count <- c(allels.count, sum(allels.all==allels[i]))
}
#possible one letter codes for amino acids
aacids <- c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y","-", "B", "X")
#possible parings of amino acids
par.aacids <- c()
names.par.aacids <- c()
for (i in 1:length(aacids)){
for (j in 1:length(aacids)){
both <- data.frame(c(aacids[i], aacids[j]))
names.par.aacids <- c(names.par.aacids, paste(aacids[i],aacids[j], sep = ""))
#print (both)
par.aacids <- c(par.aacids, both)
}
}
par.aacids <- data.frame(lapply(par.aacids, as.character), stringsAsFactors=FALSE)
.GlobalEnv[["length_of_pairs"]] <- (min_FASTA_length*(min_FASTA_length-1))/2
.GlobalEnv[["char_array"]] <- make_char_array()
.GlobalEnv[["HLA_array"]] <- make_HLA_types_array()
.GlobalEnv[["anc_new_array"]] <- make_array()
.GlobalEnv[["length_allels"]] <- length(allels.count)
length_allels <- .GlobalEnv[["length_allels"]]
counter_a <- array(rep(1, length(allels)), c(length_allels), list(allels))
cat (sequences.count, length(par.aacids), sum(allels.count), "\n")
threshold <- get.threshold.f.fisher.pair(sequences.count, length(par.aacids), allels.count)
.GlobalEnv[["threshold"]] <- threshold
allels.counter <- 0
allels.list <- c()
counter <- 1
result_matrix <- mai()
p_values <- result_matrix[,4]
p_v <- c()
for (entry in 1:length(p_values)){
value <- abs(as.numeric(p_values[entry]))
p_v <- c(p_v,value)
}
#write.csv2(p_values, paste(files.dir, "p_values.csv", sep=""))
#q_value <- qvalue(p=p_v, lambda=0.04, pi0.method="smoother", fdr.level=NULL, robust=TRUE, gui=FALSE, smooth.df=3, smooth.log.pi0=FALSE)
q_value <- qvalue(p=p_v, lambda=0, fdr.level=0.05)
#print (q_value$qvalues)
result_matrix <- cbind(result_matrix, q_value$qvalues)
colnames(result_matrix)[5] <- "q-value"
#print (result_matrix)
write.csv2(result_matrix, save_name_csv)
}
#comparewithancestral("../inst/extdata/Example_aa_for_comparewithancestral.fasta",
# "../inst/extdata/Example_tree.nh",
# "test.csv",
# 1,
# 0.05,
# 10,
# 11,
# 13,
# 14,
# 17,
# 18,
# 20,
# 21)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.