Nothing
#http://www.r-bloggers.com/developing-a-user-friendly-regular-expression-function-easygregexpr/
easyGregexpr <- function(pattern, charvector, ...) {
#require(plyr)
if (storage.mode(charvector) != "character") stop("easyGregexpr expects charvector to be a character vector.")
#identify all matches
regexpMatches <- gregexpr(pattern, charvector, ...)
convertMatches <- c()
for (i in 1:length(regexpMatches)) {
thisLine <- regexpMatches[[i]]
#only append if there is at least one match on this line
if (thisLine[1] != -1) {
convertMatches <- rbind(convertMatches, data.frame(element=i, start=thisLine, end=thisLine + attr(thisLine, "match.length") - 1))
}
}
#if no matches exist, return null (otherwise, will break adply)
if (is.null(convertMatches)) return(NULL)
#We now have a data frame with the line, starting position, and ending position of every match
#Add the matched string to the data.frame
#Use adply to iterate over rows and apply substr func
convertMatches <- adply(convertMatches, 1, function(row) {
row$match <- substr(charvector[row$element], row$start, row$end)
return(as.data.frame(row))
})
#need to convert from factor to character because adply uses stringsAsFactors=TRUE even when overridden
convertMatches$match <- as.character(convertMatches$match)
return(convertMatches)
}
bayes_fac <- function(y, K, m){
rdirichlet = function(n, alpha) {
l <- length(alpha)
x <- matrix(rgamma(l * n, alpha), ncol = l, byrow = TRUE)
sm <- x %*% rep(1, l)
return(x/as.vector(sm))
}
ldirichlet = function(alpha) {
return(rowSums(lgamma(alpha)) - lgamma(rowSums(alpha)))
}
yc = colSums(y)
yr = rowSums(y)
n = sum(yc)
d = dim(y)
I = d[1]
J = d[2]
etaA = rdirichlet(m, yr + 1)
etaB = rdirichlet(m, yc + 1)
Keta = c()
KetaY = c()
for (i in 1:I) {
for (j in 1:J) {
Keta = cbind(Keta, K * etaA[, i] * etaB[, j])
KetaY = cbind(KetaY, K * etaA[, i] * etaB[, j] +
y[i, j])
}
}
logint = ldirichlet(KetaY) - ldirichlet(Keta)
for (i in 1:I) logint = logint - yr[i] * log(etaA[, i])
for (j in 1:J) logint = logint - yc[j] * log(etaB[, j])
int = exp(logint)
return(list(bf = mean(int), nse = sd(int)/sqrt(m)))
}
#search for "change" for points where changes are possible
#please check before executing
#libs
library(Biostrings)
library(plyr)
rowsum.threshold <- -1
#min_FASTA_length <- 0
#epitopes <- c()
#sequences <- c()
#pos.epitopes <- c()
#pos.epi.matrix <- c()
#function to create the same output as the old readFASTA from new read.AAStringSet
create_correct_FASTA_input <- function(sequences){
liste <- as.character(sequences, use.names=TRUE)
result_list <- c()
for (i in 1:length(liste)){
seq=liste[i][[1]]
name = names(liste[i])
entry <- list (name, seq)
names(entry) <- c("desc", "seq")
result_list[[i]] = entry
}
return (result_list)
}
#read epitopes from csv file
ep_wo_set_input_file_known_epitopes <- structure(function(
### set the input file (known epitopes).
path_to_file
### a csv file with known epitopes. For reference please look in example file.
){
### known epitopes
if (path_to_file != ""){
.GlobalEnv[["epitopes"]] <- read.csv2(path_to_file)
}
else {
.GlobalEnv[["epitopes"]] <- c()
}
},ex=function(){
ep_wo_set_input_file_known_epitopes("SeqFeatR/extdata/Example_epitopes.csv")
})
ep_wo_set_input_file_known_sequences <- structure(function(
###read aa sequences from fasta file - they have to be aligned in order to work!
path_to_file
### a FASTA file with sequence data. For reference please look in example file.
){
### the sequences from the FASTA file.
sequences <- readAAStringSet(path_to_file)
consensus <- consensusString(sequences)
.GlobalEnv[["StringSetsequences"]] <- sequences
.GlobalEnv[["consensus"]] <- consensus
c_sequences <- create_correct_FASTA_input(sequences)
.GlobalEnv[["sequences"]] <- sequences
},ex=function(){
ep_wo_set_input_file_known_sequences("SeqFeatR/extdata/Example.fasta")
})
#read possible epitopes from csv file
ep_wo_set_input_file_known_pos_epi <- structure(function(
### set the input file (HLA binding motifs).
path_to_file
### a csv file with HLA binding motifs if available. For reference please look in example file.
){
### the HLA binding motifs.
if (path_to_file != ""){
.GlobalEnv[["pos.epitopes"]] <- read.csv2(path_to_file,stringsAsFactors = FALSE)
}
else {
.GlobalEnv[["pos.epitopes"]] <- c()
}
},ex=function(){
ep_wo_set_input_file_known_pos_epi("SeqFeatR/extdata/Example_HLA_binding_motifs.csv")
})
#some functions needed in the following
#removes "0"s from a string
remove.zeroes <- function(s){
print (substr(s, 1, 1))
if(substr(s, 1, 1) == "0")
a <- remove.zeroes(substr(s, 2, nchar(s)))
else
a <- s
}
#get indices of non-zero-values
find.neqzero <- function(array){
r <- c()
for(i in 1:length(array)){
if(array[i]!=0)
r <- c(r,i)
}
r
}
KD <- function(mat) {
rs = rowSums(mat)
cs = colSums(mat)
eta = rs %*% t(cs)/sum(mat)
Kd = sum(mat) - sum(abs(eta - mat))
Kd
}
#return a vector from 0.1 till lowest 10 based potence of input
get_10_based <- function(value){
vector <- c()
potence <- 0
while (round(value, digits = potence) == 0){
potence <- potence + 1
}
lowest_value <- round(value, digits = potence)
for (i in 1:(potence+1)){
vector <- c(vector, 1*10^-(i))
}
#print (vector)
#miracle <- occure
return (vector)
}
#check if given possible epitope is anywhere in range of possible epitope position
find_epitope_in_given_sequence <- function(pos.epitope, matrix.only.pvals.above.threshold, sequences, allel){
#print (matrix.only.pvals.above.threshold)
#print (pos.epitope)
min_FASTA_length <- .GlobalEnv[["min_FASTA_length"]]
count.letter.inside <- -2
count.praenthesis <- 0
true.false.letter <- FALSE
for (letter in 1:nchar(pos.epitope)){
if (substr(pos.epitope, letter, letter)=="["){
count.praenthesis <- count.praenthesis + 1
true.false.letter <- TRUE
}
else if (substr(pos.epitope, letter, letter)=="]"){
true.false.letter <- FALSE
}
if (true.false.letter && substr(pos.epitope, letter, letter)!="("){
if (true.false.letter && substr(pos.epitope, letter, letter)!=")"){
count.letter.inside <- count.letter.inside+1
}
}
}
sub <- gsub("[[|]|(|A-Z|)]", "", pos.epitope)
length.of.epitope <- (nchar(gsub("[(|)]", "", sub)) - count.letter.inside + count.praenthesis)-1
is_in_seq <- c()
if(class(matrix.only.pvals.above.threshold)==class(2)){
entry <- matrix.only.pvals.above.threshold[1]
min <- max(1, entry-length.of.epitope)
max <- min(entry+length.of.epitope, min_FASTA_length)
for (single_seq in 1:length(sequences)){
is_in_seq <- rbind(is_in_seq, test_for_occurence(pos.epitope, min, max, sequences[single_seq], allel, entry))
}
}else{
for (entry in matrix.only.pvals.above.threshold[,1]){
min <- max(1, entry-length.of.epitope)
max <- min(entry+length.of.epitope, min_FASTA_length)
for (single_seq in 1:length(sequences)){
is_in_seq <- rbind(is_in_seq, test_for_occurence(pos.epitope, min, max, sequences[single_seq], allel, entry))
}
}
}
return (is_in_seq)
}
#tests if given sequence is in certain sequence
#min = min position, max = max position, original = where p-value is
test_for_occurence <- function(pos.epitope, min, max, seq, allel, original_position){
eppi <- gsub("[(|)]", "", pos.epitope)
result_array <- c()
test.str <- as.character(seq)#as.character(subseq(seq, min, max))
corrected.pos.epitope <- gsub("[x]","[A-Z]",eppi)
if (!is.null(easyGregexpr(corrected.pos.epitope,test.str))){
list.of.matches <- easyGregexpr(corrected.pos.epitope,test.str)
# print (list.of.matches)
for (i in 1:nrow(list.of.matches)){
start <- list.of.matches[i, 2]
end <- list.of.matches[i, 3]
result_array <- rbind(result_array, start, end)
}
#cat(min, max, allel, original_position,"\n")
create_pos_epi_csv(result_array, seq, pos.epitope, allel, original_position, test.str)
}
return (result_array)
}
#creates plot data for possible epitopes from list with min, max values
create_pos_epi_plot <- function(pos.epitope_list, min_FASTA_length){
plot <- c()
starting <- FALSE
for (i in 1:min_FASTA_length){
if (any(pos.epitope_list == i) & !starting){
starting <- TRUE
plot <- c(plot, 1)
}
else if (any(pos.epitope_list == i) & starting){
plot <- c(plot, 1)
starting <- FALSE
}
else if (starting){
plot <- c(plot, 1)
}
else{
plot <- c(plot, 0)
}
}
return (plot)
}
create_pos_epi_csv <- function(pos.epitope_list, seq, epitope, allel, original_position, test.str){
pos.epi.matrix <- .GlobalEnv[["pos.epi.matrix"]]
for (entry in 1:(length(pos.epitope_list)/2)){
x <- 1
if (!is.null(pos.epitope_list[x])){
s <- pos.epitope_list[x]-1
e <- pos.epitope_list[x+1]-1
sequence <- test.str
#cat (s, e, sequence, "\n")
#cat (pos.epitope_list[x]-1, original_position, pos.epitope_list[x+1]-1, epitope, allel, sequence, "\n")
pos.epi.matrix_p <- rbind(pos.epi.matrix, c(pos.epitope_list[x]-1, original_position, pos.epitope_list[x+1]-1, epitope, allel, sequence))
.GlobalEnv[["pos.epi.matrix"]] <- pos.epi.matrix_p
}
x <- x+2
}
}
find_smallest_cluster_number <- function(cluster, seqs, seqs_to_comp){
if (length(seqs_to_comp) > 1){
groups <- cutree(cluster, k=seq(from = (length(seqs)-1), to = 1))
result_vectors <- c()
for (i in 1:ncol(groups)){
seqs_in_same_cluster <- test_if_in_one_cluster(groups[,i], seqs_to_comp)
result_vectors <- rbind(result_vectors, seqs_in_same_cluster)
#if(seqs_in_same_cluster){
# return ((ncol(groups)-i)/length(seqs))
#}
}
rS <- rowSums(result_vectors)
positions <- seq(1:ncol(groups))
return (lm(rS ~ positions)$coefficients[1]*-1)
}
else{
return (NA)
}
}
test_if_in_one_cluster <- function(group, seqs_to_comp){
group_number <- 0
number_of_member <- 0
result_vector <- c(0,0,0,0,0,0)
for(i in 1:max(group)){
group_with_max <- group[group == i]
names_of <- names(group_with_max)
is_inside <- as.numeric(names(seqs_to_comp) %in% names_of)
#print (is_inside)
if(100/length(is_inside)*sum(is_inside) >= 40){
result_vector[1] <- 1
}
if(100/length(is_inside)*sum(is_inside) >= 50){
result_vector[2] <- 1
}
if(100/length(is_inside)*sum(is_inside) >= 60){
result_vector[3] <- 1
}
if(100/length(is_inside)*sum(is_inside) >= 70){
result_vector[4] <- 1
}
if(100/length(is_inside)*sum(is_inside) >= 80){
result_vector[5] <- 1
}
if(100/length(is_inside)*sum(is_inside) >= 90){
result_vector[6] <- 1
}
}
return (result_vector)
}
addtolist <- function(items, counter){
p.values.all.all <- .GlobalEnv[["p.values.all.all"]]
p.values.all.all[[counter]] <- items
.GlobalEnv[["p.values.all.all"]] <- p.values.all.all
}
#---------------------------------------------------------------------------------------------
assocpoint <- structure(function(#Find possible epitopes
### Searches in a given sequence for possible epitopes.
##details<< For every position in the sequences a fisher's exact test of (Certain HLA type, Other HLA types, AA and not this AA) is calculated and the p-value is pasted in a table.
## Every HLA type has a column, every position a row. There is also a graphical output generated in which every HLA type has it's own page. In this output, a line is inserted, which shows the user where the p-value limit is considered above to be significant.
## Integrated in this graphic are the known epitopes if there are any and the calculated possible epitopes, which are calculated with the known HLA binding motifs. As an extras information a second graphical output is added in which the user can see the level of significant muations in a 9er sliding window.
## Each point in this graphic is the number of mutations in the next nine sequence positions.
## If pylogenetic comparison was choosen, the programm compares for every lowest p.value below a certain user given value the sequence similarity of the sequences with this trai with the similarity of all sequences with a Phylogenetic biason test and adds the p.values in another column.
## Uses EasyGregExpr from http://www.r-bloggers.com/developing-a-user-friendly-regular-expression-function-easygregexpr/
## Also uses p.adjust from stats package to calculate some p-value correction additionally for the graphical output and as an extra column in the csv output.
## Please consider that you have to use the dna/aa switch depending on your data!
##seealso<< \code{\link{check_for_pos_epi_mut_core}}
##note<< If a patient has a homozygot HLA Allel, then please change the second one to "00" (without ") instead!
path_to_file_sequence_alignment = NULL,
### a FASTA file with sequence data. Homzygot patiens have to have a 00 instead of the HLA typeFor reference please look in
### example file.
path_to_file_known_epitopes = NULL,
### a csv file with known epitopes. For reference please look in example file.
path_to_file_binding_motifs = NULL,
### a csv file with HLA binding motifs if available. For reference please look in example file.
save_name_pdf,
### the file name of the result file in pdf format
save_name_csv,
### the file name of the result file in csv format
dna = FALSE,
### if the data is in DNA or amino Acid Code
patnum_threshold = 1,
### the minimum number of patients of one HLA type to consider in the calculation.
optical_significance_level = 0.05,
### the height of the optical horizontal line in the graphical output. It should be yout chosen p-value limit, like 0.05.
star_significance_level = 0.001,
### the height of the invisible horizontal line above which alle points are marked as stars. Should be a high p-value limit,
### like 0,001.
### Should be the same as optical.significance_leve.
A11,
### the position of the start of the first HLA A Allel in the description block of the FASTA file.
A12,
### the position of the end of the first HLA A Allel in the description block of the FASTA file.
A21,
### the position of the start of the second HLA A Allel in the description block of the FASTA file.
A22,
### the position of the end of the second HLA A Allel in the description block of the FASTA file.
B11,
### the position of the start of the first HLA B Allel in the description block of the FASTA file.
B12,
### the position of the end of the first HLA B Allel in the description block of the FASTA file.
B21,
### the position of the start of the second HLA B Allel in the description block of the FASTA file.
B22,
### the position of the end of the second HLA B Allel in the description block of the FASTA file.
C11,
### the position of the start of the first HLA C Allel in the description block of the FASTA file.
C12,
### the position of the start of the second HLA C Allel in the description block of the FASTA file.
C21,
### the position of the start of the second HLA C Allel in the description block of the FASTA file.
C22,
### the position of the end of the second HLA C Allel in the description block of the FASTA file.
has_C = FALSE,
### if there is a C Allel information
multiple_testing_correction = "bonferroni",
### the statistical correction applied to the p-values. Input can be: "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none".
bayes_factor=FALSE,
### if bayes factor should be applied instead of Fisher's exact test for association. See details.
constant_dirichlet_precision_parameter=FALSE,
### if K is constant
dirichlet_precision_parameter=20,
### The Dirichlet precision parameter for evaluation with Bayes factor. See details.
phylo_bias_check = FALSE,
### if the sequences with the certain trait and Amino Acid with the lowest p.value should be compared in sequence similarity with all sequences. See details.
path_to_file_reference_sequence = NULL,
### Reference sequence for better comparison of results
one_feature=FALSE,
### if there is only one identifier
window_size=9,
### size for the sliding window
epi_plot=FALSE
){
#print (path_to_file_reference_sequence)
#print (multiple_testing_correction)
#print (path_to_file_m)
result <- find_possible_epitopes_inner(path_to_file_sequence_alignment, path_to_file_known_epitopes, path_to_file_binding_motifs, save_name_pdf, save_name_csv, dna, patnum_threshold, optical_significance_level, star_significance_level, optical_significance_level, optical_significance_level, A11, A12, A21, A22, B11, B12, B21, B22, C11, C12, C21, C22, has_C, multiple_testing_correction, bayes_factor, constant_dirichlet_precision_parameter, dirichlet_precision_parameter, phylo_bias_check, path_to_file_reference_sequence, one_feature, window_size, epi_plot)
return(result)
},ex=function(){
ex <- system.file("extdata", "Example.fasta", package="SeqFeatR")
ep <- system.file("extdata", "Example_epitopes.csv", package="SeqFeatR")
co <- system.file("extdata", "Example_HLA_binding_motifs.csv", package="SeqFeatR")
assocpoint(ex,
ep,
co,
"epitope_results.pdf",
"epitope_results.csv",
FALSE,
1,
0.05,
0.001,
0.05,
0.05,
22,
23,
25,
26,
29,
30,
32,
33,
36,
37,
39,
40,
"bonferroni",
FALSE,
reference_sequence = NULL,
one_feature=FALSE,
window_size=9,
pos_epi_plot=FALSE)
})
find_possible_epitopes_inner <- function(path_to_file_s = NULL, path_to_file_p = NULL, path_to_file_m = NULL, save_name, save_name_csv, dna = FALSE, patnum.threshold = 1, optical.significance_level = 0.05, star.significance_level = 0.001, pot.epitope_level = 0.05, window_threshold = 0.05, A11, A12, A21, A22, B11, B12, B21, B22, C11, C12, C21, C22, has_C, statistical_correction = "bonferroni", bayes_factor, constant_dirichlet_precision_parameter, dirichlet_precision_parameter=20, with_phylogenetic_comparison = FALSE, reference_sequence = NULL, one_ident, window_size, pos_epi_plot=FALSE){
if (!is.null(reference_sequence) && reference_sequence == 0){
reference_sequence <- NULL
}
if (!is.null(reference_sequence) && !(reference_sequence == "") ){
ref_seq <- readAAStringSet(reference_sequence)
}
if (!is.null(path_to_file_s) && path_to_file_s == 0){
path_to_file_s <- NULL
}
if (!is.null(path_to_file_s) && !(path_to_file_s == "") ){
ep_wo_set_input_file_known_sequences(path_to_file_s)
}
if (!is.null(path_to_file_p) && path_to_file_p == 0){
path_to_file_p <- NULL
}
if (!is.null(path_to_file_p) && !(path_to_file_p == "") ){
ep_wo_set_input_file_known_epitopes(path_to_file_p)
}
if (!is.null(path_to_file_m) && path_to_file_m == 0){
path_to_file_m <- NULL
}
if (!is.null(path_to_file_m) && !(path_to_file_m == "") ){
ep_wo_set_input_file_known_pos_epi(path_to_file_m)
}
if(window_size <=2){
print ("Beware! Window size smaller than 2! Will be set to 2")
window_size <- 2
}
sequences <- .GlobalEnv[["StringSetsequences"]]
epitopes <- .GlobalEnv[["epitopes"]]
pos.epitopes <- .GlobalEnv[["pos.epitopes"]]
consensus <- .GlobalEnv[["consensus"]]
number_of_sequences <- length(sequences)
if (with_phylogenetic_comparison){
StringSetsequences_p <- .GlobalEnv[["StringSetsequences"]]
StringSetsequences <- StringSetsequences_p
counter <- 0
dist.vec <- dist.vec <- stringDist(StringSetsequences, method = "levenshtein")
mean_dist_all <- mean(dist.vec)
}
#find the shortest length of FASTA String, if the sequences are of different length
#min_FASTA_length holds this value
.GlobalEnv[["min_FASTA_length"]] <- min(width(sequences))
min_FASTA_length <- .GlobalEnv[["min_FASTA_length"]]
#find out all allels that occur in given sequences
allels.all <- c()
allels.count <- c()
new_sequence_names <- c()
if (one_ident){
for(i in 1:length(sequences)){
#the used values depend on fasta file; they possibly have to be changed - the same holds for the block some lines below
#tropismus.all holds all tropismus from the FASTA files, even duplicates
#tropismus holds no duplicates and none which are only a letter, without number (if the type is 00)
#tropismus.count counts how often the certain alles is there
allels.all <- c(allels.all, strsplit(names(sequences)[i], ";")[[1]][2])
new_sequence_names <- c(new_sequence_names, strsplit(names(sequences)[i], ";")[[1]][2])
}
}else{
if(has_C){
for(i in 1:length(sequences)){
#the used values depend on fasta file; they possibly have to be changed - the same holds for the block some lines below
#allels.all holds all allels from the FASTA files, even duplicates
#allels holds no duplicates and none which are only a letter, without number (if the type is 00)
#allels.count counts how often the certain alles is there
f <- paste("A", remove.zeroes(substr(names(sequences)[i], A11, A12)), sep="")
s <- paste("A", remove.zeroes(substr(names(sequences)[i], A21, A22)), sep="")
t <- paste("B", remove.zeroes(substr(names(sequences)[i], B11, B12)), sep="")
o <- paste("B", remove.zeroes(substr(names(sequences)[i], B21, B22)), sep="")
v <- paste("C", remove.zeroes(substr(names(sequences)[i], C11, C12)), sep="")
x <- paste("C", remove.zeroes(substr(names(sequences)[i], C21, C22)), sep="")
allels.all <- c(allels.all, f, s, t, o, v, x)
new_sequence_names <- c(new_sequence_names, paste(f, s, t, o, v, x))
}
}else{
for(i in 1:length(sequences)){
#the used values depend on fasta file; they possibly have to be changed - the same holds for the block some lines below
#allels.all holds all allels from the FASTA files, even duplicates
#allels holds no duplicates and none which are only a letter, without number (if the type is 00)
#allels.count counts how often the certain alles is there
f <- paste("A", remove.zeroes(substr(names(sequences)[i], A11, A12)), sep="")
s <- paste("A", remove.zeroes(substr(names(sequences)[i], A21, A22)), sep="")
t <- paste("B", remove.zeroes(substr(names(sequences)[i], B11, B12)), sep="")
o <- paste("B", remove.zeroes(substr(names(sequences)[i], B21, B22)), sep="")
allels.all <- c(allels.all, f, s, t, o)
new_sequence_names <- c(new_sequence_names, paste(f, s, t, o))
}
}
}
if(has_C){
allels <- sort(unique(allels.all[allels.all!="A" & allels.all!="B" & allels.all!="C"]))
#print (allels)
for(i in 1:length(allels)){
allels.count <- c(allels.count, sum(allels.all==allels[i]))
}
}else{
allels <- sort(unique(allels.all[allels.all!="A" & allels.all!="B"]))
#print (allels)
for(i in 1:length(allels)){
allels.count <- c(allels.count, sum(allels.all==allels[i]))
}
}
#possible one letter codes for amino acids B is for amBigious
#if (!dna){
# aacids <- c("A","C","D","E","F","G","H","I","K","L","M","N","P","Q","R","S","T","V","W","Y","-", "B", "X")
#}else {
# aacids <- c("A","C","G","T","-","R", "Y", "M", "K", "S", "W", "N")
#}
#save all p-values here
all.pvals <- c()
#save all aas here
all.aas <- c()
#save the number of aas here
aa.all_level <- c()
##save fishertable for best AA here
all.aa_1 <- c()
all.aa_2 <- c()
all.aa_3 <- c()
all.aa_4 <- c()
all.odds <- c()
#all standard errors for bayes factor
all_sses <- c()
#save stattest
all.stattest <- c()
#save all above star level here
above.star.all <- c()
#save all pos.epi here
#pos.epi.matrix <- data.frame(start=numeric(), original_position=numeric(), stop=numeric(), epitope=character(), allel=character(), protein_sequence=character())
pos.epi.matrix <- matrix(nrow = 1, ncol = 6)
colnames(pos.epi.matrix) <- (c("start", "original_position", "stop", "epitope", "allel", "protein_sequence") )
.GlobalEnv[["pos.epi.matrix"]] <- pos.epi.matrix
all.pos.epi <- c()
#do the following for each allel with a number of patients over a certain threshold
pdf(paste(save_name, sep=""), width = 11.69, height = 18.27)
# for each allel (where allel.num is the index of the allel in allels):
# if the count of this allel is above patnum.threshold.
# allels.counter counts the allels which are abouve the patnum.threshold
allels.counter <- 0
sequences_names <- names(sequences)
#all svings
p.values.all.all <- list(NULL)
length(p.values.all.all) <- length(allels)
.GlobalEnv[["p.values.all.all"]] <- p.values.all.all
for(allel.num in 1:length(allels)){
#print (allels[allel.num])
if(allels.count[allel.num]>patnum.threshold){
#if (allels[allel.num] == "A24"){
print (allel.num)
current_allel <- allels[allel.num]
has_allel <- grep(current_allel, new_sequence_names)
has_no_allel <- grep(current_allel, new_sequence_names, invert = T)
#table for illustration
allels.counter <- allels.counter + 1
probs.max <- c()
p.values.all <- c()
p.values.min <- c()
odds.ratio.all <- c()
aa_values_m <- c()
aa_level <- c()
aa_1 <- c()
aa_2 <- c()
aa_3 <- c()
aa_4 <- c()
p.values.norm <- c()
p.values2 <- c()
probs.norm <- c()
probs2 <- c()
sse_all <- c()
stattest <- c()
# for each letter in FASTA:
for (j in 1:min_FASTA_length){
#print (j)
#mir <- occ
#for each point in sequence do the following:
#for certain allel calculate fisher's exact test's minimum p-value among the tests for all amino acids (aa/!aa ; allel/!allel)
p.values <- c()
probs <- c()
sse <- c()
odds.ratio <- c()
aa_values <- c()
aa_values2 <- c()
aa_values3 <- c()
aa_values4 <- c()
# for each aa in length of possible aas:
just_this_position <- subseq(sequences, j, j)
mat <- as.matrix(just_this_position)
#print (mat)
aacids <- unique(mat[,1])
subset_has_allel <- mat[has_allel]
subset_not_has_allel <- mat[has_no_allel]
#print (subset_has_allel)
#print (subset_not_has_allel)
#count from tabel for each FASTA letter and then each aa for the allel, the occurence
# 1,1 is: occurency of aa in sequence in allel
# 1,2 is: occurency of all aa in sequence in allel - 1,1
# 2,1 is: occurency of aa in sequence in all allels - 1,1
# 2,2 is: occurency of all aas in sequence - 1,2 - 2,1 + 1,1
#@result: list of p values, sorted by aa from aacids
for(k in 1:length(aacids)){
aa <- aacids[k]
aa_and_h <- length(which(subset_has_allel == aa))
aa_and_n_h <- length(which(subset_not_has_allel == aa))
n_aa_and_h <- length(which(subset_has_allel != aa))
n_aa_and_n_h <- length(which(subset_not_has_allel != aa))
small.table = matrix(c(aa_and_h, n_aa_and_h, aa_and_n_h, n_aa_and_n_h),nrow=2)
#print (small.table)
####BAYES - FACTOR
if(!constant_dirichlet_precision_parameter){
dirichlet_precision_parameter <- KD(small.table)
if (dirichlet_precision_parameter == 0){
dirichlet_precision_parameter <- 0.01
}
}
if (bayes_factor){
m=1000
res <- bayes_fac(small.table, as.numeric(dirichlet_precision_parameter), m)
probs <- c(probs, res$bf)
odds.ratio <- c(odds.ratio, ((aa_and_h * n_aa_and_n_h)/(n_aa_and_h * aa_and_n_h)))
sse <- c(sse, res$nse)
}else{
#only calculate p-values from contingency tables with row sums above threshold or zero
if(sum(small.table[1,])*sum(small.table[2,])!=0 & (sum(small.table[1,])<=rowsum.threshold | sum(small.table[2,])<=rowsum.threshold))
next
#cat (j, k, "\n")
#print(small.table)
#calculate fisher's exact test for each pair (tropis, acid)
test.result <- fisher.test(small.table)
#print (as.character(subseq(ref_seq, j, j)))
p.values <- c(p.values, test.result$p.value)
#print(test.result$p.value)
#print(test.result$estimate)
if (!is.null(reference_sequence) && !(reference_sequence == "") ){
cat((allel.num), "\t", j, "\t", as.character(subseq(ref_seq, j, j)), "\t", (small.table[1,1]), "\t", (small.table[2,1]), "\t", (small.table[1,2]), "\t", (small.table[2,2]), "\t", aacids[k], "\t", test.result$p.value, "\n", file= "logging.csv", append=TRUE)
}else{
cat((allel.num), "\t", j, "\t", (small.table[1,1]), "\t", (small.table[2,1]), "\t", (small.table[1,2]), "\t", (small.table[2,2]), "\t", aacids[k], "\t", test.result$p.value, "\n", file= "logging.csv", append=TRUE)
}
odds.ratio <- c(odds.ratio, test.result$estimate)
}
#print(test.result)
aa_values <- c(aa_values, small.table[1,1])
aa_values2 <- c(aa_values2, small.table[2,1])
aa_values3 <- c(aa_values3, small.table[1,2])
aa_values4 <- c(aa_values4, small.table[2,2])
}
####BAYES - FACTOR
if (bayes_factor){
probs.max <- c(probs.max, max(probs))
position <- which(probs == max(probs))
probs.norm <- c(probs.norm, max(probs))
sse_all <- c(sse_all, sse[position])
p.values.all.little <- cbind(probs, NA)
}else{
p.values.min <- c(p.values.min, 1-min(p.values))
position <- which(p.values == min(p.values))
p.values.norm <- c(p.values.norm, min(p.values))
p.values.all.little <- cbind(p.values, NA)
}
if (length(position)>1){
random_pos <- sample(position, 1)
aa_1 <- c(aa_1, paste("/", aa_values[random_pos]))
aa_2 <- c(aa_2, paste("/", aa_values2[random_pos]))
aa_3 <- c(aa_3, paste("/", aa_values3[random_pos]))
aa_4 <- c(aa_4, paste("/", aa_values4[random_pos]))
aa_values_m <- c(aa_values_m, aacids[random_pos])
aa_level <- c(aa_level, paste("/", aa_values[random_pos]))
odds.ratio.all <- c(odds.ratio.all, odds.ratio[random_pos])
p.values.all.little[random_pos, 2] <- "min"
}
else{
aa_1 <- c(aa_1, aa_values[position])
aa_2 <- c(aa_2, aa_values2[position])
aa_3 <- c(aa_3, aa_values3[position])
aa_4 <- c(aa_4, aa_values4[position])
aa_values_m <- c(aa_values_m, aacids[position])
aa_level <- c(aa_level, aa_values[position])
odds.ratio.all <- c(odds.ratio.all, odds.ratio[position])
p.values.all.little[position, 2] <- "min"
}
#print (p.values.all)
#print (odds.ratio.all)
p.values.all <- rbind(p.values.all, p.values.all.little)
####BAYES - FACTOR
if (bayes_factor & with_phylogenetic_comparison){
print ("Nothing here yet (phylogenetic bias)")
}else{
if (p.values.norm[j] <= optical.significance_level && with_phylogenetic_comparison){
singleAA <- as.character(subseq(StringSetsequences_p, j, j))
seq_number_with_AA <- which(singleAA==aa_values_m[j])
seqs_to_comp <- StringSetsequences_p[seq_number_with_AA]
labls <- which(labels(dist.vec) %in% names(seqs_to_comp))
dist_mut <- as.matrix(dist.vec)[labls,labls]
number_of_cluster <- 1-(mean(dist_mut)/mean_dist_all)
stattest <- c(stattest, as.character(number_of_cluster))
}
else{
stattest <- c(stattest, "/")
}
#print (position)
}
}
addtolist(p.values.all, allel.num)
###### BAYES FACTOR
if (bayes_factor){
all.pvals<-c(all.pvals,probs.norm)
aa.all_level <- cbind(aa.all_level, aa_level)
all.aas <- cbind(all.aas, aa_values_m)
all.aa_1 <- cbind(all.aa_1, aa_1)
all.aa_2 <- cbind(all.aa_2, aa_2)
all.aa_3 <- cbind(all.aa_3, aa_3)
all.aa_4 <- cbind(all.aa_4, aa_4)
all.odds <- cbind(all.odds, odds.ratio.all)
all_sses <- cbind(all_sses, sse_all)
}else{
all.pvals<-c(all.pvals,p.values.norm)
aa.all_level <- cbind(aa.all_level, aa_level)
all.aas <- cbind(all.aas, aa_values_m)
all.aa_1 <- cbind(all.aa_1, aa_1)
all.aa_2 <- cbind(all.aa_2, aa_2)
all.aa_3 <- cbind(all.aa_3, aa_3)
all.aa_4 <- cbind(all.aa_4, aa_4)
all.odds <- cbind(all.odds, odds.ratio.all)
all.stattest <- cbind(all.stattest, stattest)
}
}
}
p.values.all.all <- .GlobalEnv[["p.values.all.all"]]
####BAYES - FACTOR
if (bayes_factor){
all.pvals.corrected <- c()
}else{
all.pvals.corrected <- c()
all.pvals.corrected.all.all <- lapply(p.values.all.all,function(r){p.adjust(r[,1], method = statistical_correction)})
}
for(allel.num in 1:length(allels)){
if(allels.count[allel.num]>patnum.threshold){
#if (allels[allel.num] == "A24"){
p.values.all <- p.values.all.all[[allel.num]]
if(bayes_factor){
probs.max <- as.numeric(p.values.all[which(p.values.all[,2]=="min")])
probs.norm <- probs.max
all.pvals.corrected <- c(all.pvals.corrected, probs.norm)
}else{
all.pvals.corrected.all <- all.pvals.corrected.all.all[[allel.num]]
p.values.min <- 1-as.numeric(all.pvals.corrected.all[which(p.values.all[,2]=="min")])
p.values.norm <- as.numeric(all.pvals.corrected.all[which(p.values.all[,2]=="min")])
all.pvals.corrected <- c(all.pvals.corrected, p.values.norm)
}
# print (p.values.min)
# print (p.values.norm)
# mir <- occ
#evaluate
#get epitopes of this allel
#1:2 is column, first number in allel.epitopes is row
allel.epitopes <- epitopes[epitopes[,3]==allels[allel.num],1:2]
#only evaluate if any epitopes have already been discovered for this allel
#fill allel.epitopes2 with zeros for FASTA-length, then put max of p.values.min for all values between epitope border
allel.epitopes2 <- rep(0,min_FASTA_length)
####BAYES - FACTOR
if (bayes_factor){
if(length(allel.epitopes[,1])>0)
for (i in 1:length(allel.epitopes[,1])){
allel.epitopes2[allel.epitopes[i,1]:allel.epitopes[i,2]] <- 1
}
all.pvals_one_allel<-c(probs.max)
matrix.all.pvals_pre <- matrix(all.pvals_one_allel, ncol=1, dimnames=list(1:length(probs.max),allels[allel.num]))
}else{
if(length(allel.epitopes[,1])>0)
for (i in 1:length(allel.epitopes[,1])){
allel.epitopes2[allel.epitopes[i,1]:allel.epitopes[i,2]] <- 1
}
all.pvals_one_allel<-c(p.values.min)
matrix.all.pvals_pre <- matrix(all.pvals_one_allel, ncol=1, dimnames=list(1:length(p.values.min),allels[allel.num]))
}
#post_production for every allel above threshold:
#first: create a matrix with a column for position,
#second: remove all rows with values beneath a certain threshold
#third: calculate with the rest possible epitopes
matrix.all.pvals <- cbind(1:min_FASTA_length, matrix.all.pvals_pre)
matrix.one.allel <- c()
#columns <- c(1, i+1)
matrix.one.allel <- matrix.all.pvals
rowsums.all.pvals <- rowSums(matrix.one.allel)
removal_counter <- c() #for pval selection
remove_counter <- c() # for sequence clean up
for (i in 1:min_FASTA_length){
remove_counter <- c(remove_counter,0)
if (rowsums.all.pvals[i]<=(matrix.one.allel[i,1]+(1-pot.epitope_level))){ # this is becaus of sum of number + value
removal_counter <- c(removal_counter, 1)
}
else{
removal_counter <- c(removal_counter, 0)
}
}
matrix.only.pvals.above.threshold <- matrix.one.allel[!removal_counter, ]
if (is.null(names(matrix.only.pvals.above.threshold))){
name <- colnames(matrix.only.pvals.above.threshold)
}else{
name <- names(matrix.only.pvals.above.threshold)
}
#print (name)
#Get column names
allel <- name[2]
if(one_ident || is.null(pos.epitopes)){
pos.epitope.plot <- c(0)
}
if (!is.null(name) && !is.null(pos.epitopes)){
#cat ("allel",allel,"\n")
pos.epitopes.f.allel <- c()
for (i in 1:length(pos.epitopes[,1])){
pos.epi <- gsub("A[*]0*", "A", substr(pos.epitopes[i,1],1, 4)) # replaces the *and leading 0s in HLA type for possible epitope
pos.epi <- gsub("B[*]0*", "B", pos.epi)
if (pos.epi == allel){
pos.epitopes.f.allel <- c(pos.epitopes.f.allel, pos.epitopes[i,2])
}
}
pos.epitope_list <- c()
#print (seq)
if ((!is.null(pos.epitopes.f.allel)) && (length(matrix.only.pvals.above.threshold)>0)){
for (pos.epitope in pos.epitopes.f.allel){
pos.epitope_list <- c(pos.epitope_list, find_epitope_in_given_sequence(pos.epitope, matrix.only.pvals.above.threshold, sequences[has_allel], allel))
}
}
pos.epitope.plot <- create_pos_epi_plot(pos.epitope_list, min_FASTA_length)
#print (pos.epitope.plot)
}
#window-slinding-thingy
if (bayes_factor){
number_of_mutations_in_sequence <- c()
for (i in 1:(length(all.pvals_one_allel)-(window_size-1))){
number_of_mutations_in_window <- 0
for (j in 0:(window_size-1)){
value <- all.pvals_one_allel[i+j]
thresh <- 10^window_threshold
if (value > thresh){
number_of_mutations_in_window <- (number_of_mutations_in_window + 1)
}
}
number_of_mutations_in_sequence <- c(number_of_mutations_in_sequence, ((1.0/window_size)*number_of_mutations_in_window))
}
}else{
number_of_mutations_in_sequence <- c()
for (i in 1:(length(all.pvals_one_allel)-(window_size-1))){
number_of_mutations_in_window <- 0
for (j in 0:(window_size-1)){
value <- all.pvals_one_allel[i+j]
thresh <- 1-window_threshold
if (value > thresh){
number_of_mutations_in_window <- (number_of_mutations_in_window + 1)
}
}
number_of_mutations_in_sequence <- c(number_of_mutations_in_sequence, ((1.0/window_size)*number_of_mutations_in_window))
}
}
####BAYES - FACTOR
if (bayes_factor){
highest_prob_value <- max(probs.norm)
probs.norm <- log10(probs.norm)
}else{
lowest_p_value <- min(p.values.norm)
vector_for_yaxis <- get_10_based(min(p.values.norm))
}
####BAYES - FACTOR
if (bayes_factor){
if (pos_epi_plot){
par(fig=c(0,0.75,0.55,1))#, new=TRUE)
if (is.null(path_to_file_m) & !is.null(path_to_file_p)){
plot (panel.first=
c(lines(allel.epitopes2, type="l", col="red")), #known epitopes in red
number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant Bayes Factors", xlab="", xlim=c(0, min_FASTA_length), ylim=c(0,1)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c("known epitopes", paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("red","black"),bty="n", cex=0.8, xjust=0)
}else if (!is.null(path_to_file_m) & is.null(path_to_file_p)){
plot (panel.first=
c(lines(pos.epitope.plot, type="l", col = "darkgoldenrod1")), #known epitopes in red
number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant Bayes Factors", xlab="", xlim=c(0, min_FASTA_length), ylim=c(0,1)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c("possible new epitopes", paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("darkgoldenrod1","black"),bty="n", cex=0.8, xjust=0)
}else if (is.null(path_to_file_p) & is.null(path_to_file_m)){
plot (number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant Bayes Factors", xlab="", xlim=c(0, min_FASTA_length), ylim=c(0,1)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c(paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("black"),bty="n", cex=0.8, xjust=0)
}else{
plot (panel.first=
c(lines(allel.epitopes2, type="l", col="red"), #known epitopes in red
lines(pos.epitope.plot, type="l", col = "darkgoldenrod1")), # possible epitopes in darkgoldenrod1
number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant Bayes Factors", xlab="", xlim=c(0, min_FASTA_length), ylim=c(0,1)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c("known epitopes","possible new epitopes", paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("red","darkgoldenrod1","black"),bty="n", cex=0.8, xjust=0)
}
par(fig=c(0,0.75,0.0,0.6), new=TRUE)
}
plot(probs.norm, type="p", col="black", pch=20, ylab="log10(Bayes Factor)", xlab="Alignment position", xlim=c(0, min_FASTA_length))
if (!pos_epi_plot){
mtext(paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""))
}
corrected_opti <- optical.significance_level
corrected_star <- star.significance_level
abline(h=optical.significance_level, col = "gray60", lty=2)
probs.above_sig_level_norm <- probs.norm
probs.above_star_level_norm <- probs.norm
for (i in 1:length(probs.norm)){
if (probs.norm[i] >= (corrected_opti)){
if (probs.norm[i] >= (corrected_star)){
probs.above_star_level_norm[i] <- probs.norm[i]
}
else{
probs.above_sig_level_norm[i] <- probs.norm[i]
probs.above_star_level_norm[i] <- NA
}
}
else {
probs.above_sig_level_norm[i] <- NA
probs.above_star_level_norm[i] <- NA
}
}
vert_lines <- which(probs.above_star_level_norm > 0)
for (sp in 1:length(vert_lines)){
abline(v=vert_lines[sp], col = "gray60", lty=2)
}
lines(probs.above_sig_level_norm, type="p", col = "red", pch=20) #points above a certain significance level
lines(probs.above_star_level_norm, type="p", col = "red", pch=8) #stars above a certain significance level
}else{
if (pos_epi_plot){
par(fig=c(0,0.75,0.55,1))#, new=TRUE)
if (is.null(path_to_file_m) & !is.null(path_to_file_p)){
plot (panel.first=
c(lines(allel.epitopes2, type="l", col="red")), #known epitopes in red
number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant p-values", xlab="", xlim=c(0, min_FASTA_length), ylim=c(0,1)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c("known epitopes", paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("red","black"),bty="n", cex=0.8, xjust=0)
}else if (!is.null(path_to_file_m) & is.null(path_to_file_p)){
plot (panel.first=
c(lines(pos.epitope.plot, type="l", col = "darkgoldenrod1")), #known epitopes in red
number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant p-values", xlab="", xlim=c(0, min_FASTA_length), ylim=c(0,1)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c("possible new epitopes", paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("darkgoldenrod1","black"),bty="n", cex=0.8, xjust=0)
}else if (is.null(path_to_file_p) & is.null(path_to_file_m)){
plot (number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant p-values", xlab="", xlim=c(0, min_FASTA_length)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c(paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("black"),bty="n", cex=0.8, xjust=0)
}else{
plot (panel.first=
c(lines(allel.epitopes2, type="l", col="red"), #known epitopes in red
lines(pos.epitope.plot, type="l", col = "darkgoldenrod1")), # possible epitopes in darkgoldenrod1
number_of_mutations_in_sequence, type="l", col = "black", main=paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""),ylab="fraction of significant p-values", xlab="", xlim=c(0, min_FASTA_length), ylim=c(0,1)) # plot for percental number of mutations in 9er
par(fig=c(0,1,0.6,1))
legend("right", c("known epitopes","possible new epitopes", paste("# mutations in ", window_size,"er \nsliding window", sep="")),lty=c(1,1,1),lwd=c(1.5,1.5,1.5),col=c("red","darkgoldenrod1","black"),bty="n", cex=0.8, xjust=0)
}
par(fig=c(0,0.75,0.0,0.6), new=TRUE)
}
plot(p.values.norm, type="p", ylim=c(1.0,lowest_p_value), log="y", col="black", yaxt="n", pch=20, ylab="p-values", xlab="Alignment position", xlim=c(0, min_FASTA_length))
if (!pos_epi_plot){
mtext(paste(allels[allel.num]," (",allels.count[allel.num]," sequences)",sep=""))
}
axis(side=2, at=vector_for_yaxis, labels=vector_for_yaxis)
corrected_opti <- optical.significance_level
corrected_star <- star.significance_level
abline(h=optical.significance_level, col = "gray60", lty=2)
p.values.above_sig_level_norm <- p.values.norm
p.values.above_star_level_norm <- p.values.norm
for (i in 1:length(p.values.norm)){
if (p.values.norm[i] <= (corrected_opti)){
if (p.values.norm[i] <= (corrected_star)){
p.values.above_star_level_norm[i] <- p.values.norm[i]
}
else{
p.values.above_sig_level_norm[i] <- p.values.norm[i]
p.values.above_star_level_norm[i] <- 0
}
}
else {
p.values.above_sig_level_norm[i] <- 0
p.values.above_star_level_norm[i] <- 0
}
}
vert_lines <- which(p.values.above_star_level_norm > 0)
for (sp in 1:length(vert_lines)){
abline(v=vert_lines[sp], col = "gray60", lty=2)
}
lines(p.values.above_sig_level_norm, type="p", col = "red", pch=20) #points above a certain significance level
lines(p.values.above_star_level_norm, type="p", col = "red", pch=8) #stars above a certain significance level
}
}
}
#forth: write this epitope in a csv
all.pos.epi <- subset(pos.epi.matrix, !duplicated(pos.epi.matrix))
#all.pos.epi <- pos.epi.matrix[!duplicated(pos.epi.matrix)]
#print (all.pos.epi)
if (nrow(all.pos.epi)>1){
all.pos.epi <- all.pos.epi[2:nrow(all.pos.epi),]
#print(all.pos.epi)
}
####BAYES - FACTOR
if (bayes_factor){
result_output <- (matrix(all.pvals, ncol=sum(allels.count>patnum.threshold), dimnames=list(1:length(probs.max),allels[allels.count>patnum.threshold])))
all.pvals.c_output <- (matrix(all.pvals.corrected, ncol=sum(allels.count>patnum.threshold), dimnames=list(1:length(probs.max),allels[allels.count>patnum.threshold])))
}else{
result_output <- (matrix(all.pvals, ncol=sum(allels.count>patnum.threshold), dimnames=list(1:length(p.values.min),allels[allels.count>patnum.threshold])))
all.pvals.c_output <- (matrix(all.pvals.corrected, ncol=sum(allels.count>patnum.threshold), dimnames=list(1:length(p.values.min),allels[allels.count>patnum.threshold])))
}
result <- c()
if (!is.null(reference_sequence) && !(reference_sequence == "") ){
ref_row <- as.character(subseq(ref_seq, 1, 1))
for (j in 2:width(ref_seq)){
ref_row <- rbind(ref_row, as.character(subseq(ref_seq, j, j)))
}
}
####BAYES - FACTOR
if (bayes_factor){
if (!is.null(reference_sequence) && !(reference_sequence == "") ){
if (is.null(dim(all.aas))){
result <- cbind(result_output, all_sses, all.aas, all.odds, as.character(subseq(ref_seq, j, j)))
}
else {
for (i in 1:ncol(result_output)){
result <- cbind(result, result_output[,i], scale(result_output[,i],center=TRUE,scale=TRUE)[,1], all_sses[,i], all.aas[,i], all.odds[,i], ref_row)
}
}
}else{
if (is.null(dim(all.aas))){
result <- cbind(result_output, all_sses, all.aas, all.odds)
}
else {
for (i in 1:ncol(result_output)){
result <- cbind(result, result_output[,i], scale(result_output[,i],center=TRUE,scale=TRUE)[,1], all_sses[,i], all.aas[,i], all.odds[,i])
}
}
}
colname <- c()
if (!is.null(reference_sequence) && !(reference_sequence == "") ){
for (i in 1:(ncol(result)/6)){
colname <- c(colname, allels[allels.count>patnum.threshold][i], "z-scores", "standard error BF", "AA with lowest p value", "Odds ratio", "ref_seq")
}
}else{
for (i in 1:(ncol(result)/5)){
colname <- c(colname, allels[allels.count>patnum.threshold][i], "z-scores", "standard error BF", "AA with lowest p value", "Odds ratio")
}
}
}else{
if (!is.null(reference_sequence) && !(reference_sequence == "") ){
if (is.null(dim(all.aas))){
result <- cbind(result_output, all.aas, all.odds, all.stattest, as.character(subseq(ref_seq, j, j)))
}
else {
for (i in 1:ncol(result_output)){
result <- cbind(result, result_output[,i], all.pvals.c_output[,i], scale(result_output[,i],center=TRUE,scale=TRUE)[,1], all.aas[,i], all.odds[,i], all.stattest[,i],
ref_row)
}
}
}else{
if (is.null(dim(all.aas))){
result <- cbind(result_output, all.aas, all.odds, all.stattest)
}
else {
for (i in 1:ncol(result_output)){
result <- cbind(result, result_output[,i], all.pvals.c_output[,i], scale(result_output[,i],center=TRUE,scale=TRUE)[,1], all.aas[,i], all.odds[,i], all.stattest[,i])
}
}
}
colname <- c()
if (!is.null(reference_sequence) && !(reference_sequence == "") ){
for (i in 1:(ncol(result)/7)){
colname <- c(colname, allels[allels.count>patnum.threshold][i],"Corrected p-value", "z-scores", "AA with lowest p value", "Odds ratio", "Phylogenetic bias", "ref_seq")
}
}else{
for (i in 1:(ncol(result)/6)){
colname <- c(colname, allels[allels.count>patnum.threshold][i],"Corrected p-value", "z-scores", "AA with lowest p value", "Odds ratio", "Phylogenetic bias")
}
}
}
colnames(result) <- colname
write.csv2(result, paste(save_name_csv, sep=""))
#write all pvalues in csv file
#we have finished our pdf writing
dev.off()
#print (result)
### returns the result as a table of HLA type vs sequence position
return (result)
}
#assocpoint(
#"HBV.fasta",
#"",
#"",
#save_name_pdf = "HBVresults.pdf",
#save_name_csv = "HBVresults.csv",
#dna = FALSE,
#patnum_threshold = 1,
#optical_significance_level = 0.05,
#star_significance_level = 0.001,
#A11 = 15,
#A12 = 18,
#A21 = 20,
#A22 = 23,
#B11 = 26,
#B12 = 29,
#B21 = 31,
#B22 = 34,
#C11 = 37,
#C12 = 40,
#C21 = 42,
#C22 = 45,
#has_C = FALSE,
#multiple_testing_correction = "fdr",
#bayes_factor=FALSE,
#constant_dirichlet_precision_parameter=FALSE,
#dirichlet_precision_parameter=200,
#phylo_bias_check = FALSE,
#path_to_file_reference_sequence = "",
#one_feature=FALSE,
#window_size=9,
#epi_plot=TRUE
#)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.