Test/testing.R

partial_correlations_core <- function(genotype_shared,genotype_rev,phenotype,coords,model=1,size){
  n=dim(genotype_shared)[2]
  data_matrix <- matrix(0,nrow = 2*(coords[2]-coords[1]+1),ncol=dim(genotype_shared)[2])
  matrix_row <- 0
  if (model==1){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype_shared[,i])+I(genotype_shared[,j])+I(genotype_shared[,i]*genotype_shared[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  if (model==2){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype[,i])+I(genotype[,j])+I(genotype[,i]*genotype_rev[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  return(data_matrix)
}

partial_correlations_test <- function(phenotype,coords,model=1,size){
  n=size
  data_matrix <- matrix(0,nrow = 2*(coords[2]-coords[1]+1),ncol=size)
  matrix_row <- 0
  if (model==1){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype1[,i])+I(genotype1[,j])+I(genotype1[,i]*genotype1[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  if (model==2){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype[,i])+I(genotype[,j])+I(genotype[,i]*genotype_rev[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  return(data_matrix)
}

epistatic.correlation_test <- function(phenotype,genotype,parallel=1,test=T,simple=T){ 
  phenotype <- as.matrix(phenotype)
  n<-ncol(genotype)
  coords<-triangular_split(n,parallel)
  if(is.data.frame(genotype)){
    genotype[] <- lapply(genotype, as.numeric)
  }
  if (simple==F || test==T){
    genotype_rev <- genotype
    decide_1<-(genotype_rev==1)
    decide_2<-(genotype_rev==2)
    genotype_rev[decide_1] <- 2
    genotype_rev[decide_2] <- 1
    rm(decide_1)
    rm(decide_2)
    genotype_rev <- as.data.frame(genotype_rev)
  }
  if (test==T && n > 315) {
    x <- as.big.matrix(x = genotype, type = "double", 
                       separated = FALSE, 
                       backingfile = "genotype.file", 
                       descriptorfile = "genotype.desc")
    # get a description of the matrix
    mdesc <- describe(x)
    cl <- makeCluster(parallel)
    registerDoParallel(cl = cl)
    clusterExport(cl,c("partial_correlations_core"))
    message("Running Test")
    message("Estimating run time based on ~100.000 models")
    start.time <- Sys.time()
    test_coords<-triangular_split(1000,parallel)
    snp_matrix <- foreach(j = 1:parallel, .combine='rbind', .verbose=F) %dopar% {
      require(bigmemory)
      require(RcppEigen)
      genotype_shared <- attach.big.matrix("genotype.desc")
      subset <- partial_correlations_core(genotype_shared,1,phenotype,test_coords[j,],model=1)
      return(subset)
    #   n=1000
    #   data_matrix <- matrix(0,nrow = 2*(coords[j,2]-coords[j,1]+1),ncol=1000)
    #   matrix_row <- 0
    #   for (i in coords[j,1]:coords[j,2]){
    #     matrix_row <- matrix_row+1
    #     if (i < n){
    #       for (j in (i+1):n){
    #         tmp_model = fastLm(phenotype ~ I(genotype_shared[,i])+I(genotype_shared[,j])+I(genotype_shared[,i]*genotype_shared[,j]))
    #         data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
    #       }
    #     }
    #   }
    #   return(data_matrix)
    # }
    # message("Running Test")
    # message("Estimating run time based on ~100.000 models")
    # start.time <- Sys.time()
    # genotype_list <- list() 
    # test_coords<-triangular_split(316,parallel)
    #  for (i in 1:dim(test_coords)[1]){
    #   genotype_list[[i]] <- genotype[,test_coords[i,1]:test_coords[i,2]]
    #   print(length(genotype_list))
    #   subset <- partial_correlations_test(genotype_list[[i]],genotype_list[[i]],phenotype,test_coords[i,],316,model=1)
    #   }
    # #snp_matrix <- foreach(j = 1:parallel, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
    #subset <- partial_correlations_test(genotype_list[[j]],genotype_list[[j]],phenotype,dim(genotype_list[[j]])[2],316,model=1)
    #return(subset)
    }
  }
  end.time <- Sys.time()
  time<-as.numeric(end.time-start.time,units="hours")
  model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
  model_time <- round(model_time,digits = 2)
  model_time<-as.character(model_time)
  estimate<-paste(paste("The estimated run time for the simple model is",model_time),"hours",sep=" ")
  message(estimate)
  #snp_matrix <- foreach(j = 1:parallel, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
  # subset <- partial_correlations_test(genotype[,1:316],genotype_rev[,1:316],phenotype,test_coords[j,],model=2)
  #return(subset)
  #}
  #end.time <- Sys.time()
  #time<-as.numeric(end.time-start.time,units="hours")
  #model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
  #model_time <- round(model_time,digits = 2)
  #model_time<-as.character(model_time)
  #estimate<-paste(paste("The estimated run time for the full model is",model_time),"hours",sep=" ")
  #message(estimate)
  epi_cor <- snp_matrix[seq(1,nrow(snp_matrix)-1,2),]
  epi_pvalue <-   snp_matrix[seq(2,nrow(snp_matrix),2),]
  epi_cor_t <- t(epi_cor)
  epi_pvalue_t <- t(epi_pvalue)
  diag(epi_cor_t) <- 1
  diag(epi_pvalue_t) <- 1
  epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
  epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
  colnames(epi_pvalue_t) <- colnames(genotype)
  rownames(epi_pvalue_t) <- colnames(genotype)
  colnames(epi_cor_t) <- colnames(genotype)
  rownames(epi_cor_t) <- colnames(genotype)
  output <-list(epi_pvalue_t,epi_cor_t)
  names(output)<-c("Pvalues","Coefficients")
  return(output)
}
result1<-epistatic.correlation_test(genotype = genotype[1:193,1:1000], phenotype = phenotype$W0_W12 ,parallel = 10, test = T)

result1<-epistatic.correlation(genotype = genotype[1:193,1:1000], phenotype = phenotype$W0_W12 ,threads = 10, test = F, simple = T)
hist(result1$Pvalues)
result1$Coefficients


hist(result$Pvalues)


ids_delete <- read.table("/home/victor/Documents/WISH_files/IDs_delete.txt")
phenotype <- read.table("/home/victor/Documents/ADD_files/pheno_for_victor.txt",header = T)

genotype<-generate.genotype(ped = "/home/victor/Documents/ADD_files/INDICES_1_2.ped",tped = "/home/victor/Documents/ADD_files/INDICES1_1_2.tped",major.freq = 0.90)
genotype<-genotype[!(rownames(genotype) %in% ids_delete[,1]),]
tped <- fread("/home/victor/Documents/ADD_files/INDICES1_1_2.tped", data.table = F)




#### Developing Blocks ####

correlation_blocks <-function(genotype,threshold=0.9,measure="r2"){ 
  if (!(measure %in% c("r2","Dd"))){
    print("unknown LD measure, should be r2 or Dd")
  }
  else{
  snp_block_matrix <- as.matrix(rep(0,dim(genotype)[2]))
  rownames(snp_block_matrix) <- colnames(genotype)
  n_snp <- 1
  start <- 2
  n_block <- 1
  snps <- dim(genotype)[2]
  snp_block_matrix[n_snp,] <- n_block
  block_coords <- list()
  r_vector <- c()
  D_vector <- c()
  Dd_vector <- c()
  while(start <= snps){
      total <- sum(!is.na(genotype[,n_snp]+genotype[,start]))
      usable_values <- !is.na(genotype[,n_snp]+genotype[,start])
      p_AB <- (sum(genotype[usable_values,n_snp]+genotype[usable_values,start] == 0,na.rm = T)+(sum(genotype[usable_values,n_snp]+ genotype[usable_values,start] == 1 ,na.rm = T)/2)+(sum(genotype[usable_values,n_snp]*genotype[usable_values,start] == 1 ,na.rm = T)/2))/total
      p_A <- (sum(genotype[usable_values,n_snp] == 0,na.rm = T)+ sum(genotype[usable_values,n_snp] == 1,na.rm = T)/2)/total
      p_a <- 1-p_A
      p_B <- (sum(genotype[usable_values,start] == 0,na.rm = T)+ sum(genotype[usable_values,start] == 1,na.rm = T)/2)/total
      p_b <- 1-p_B
      D <- p_AB-(p_A*p_B)
      if( D < 0){
        D_min <- max(c(-(p_A*p_B),-(p_a*p_b)))
        Dd <- D/D_min
      }
      else {
        D_min <- min(c(p_A*p_b,p_a*p_B))
        Dd <- D/D_min
      }
      r2 <- D^2/(p_A*p_a*p_B*p_b)
      r_vector <- c(r_vector,r2)
      D_vector <- c(D_vector,D)
      Dd_vector <- c(Dd_vector,Dd)
      if (measure == "r2"){
        similarity <- r2
      }
      if (measure == "Dd"){
        similarity <- Dd
      }
      if (similarity >= threshold & similarity){ 
        snp_block_matrix[start,1] = n_block
        start <- start+1
        if (start > snps){
          block_coords[[n_block]] <- c(n_snp,(start-1))
        }
      } 
      else {
        if (start == snps){
          block_coords[[n_block]] <- c(n_snp,(start-1))
          n_block <- n_block + 1
          snp_block_matrix[snps,1] <- n_block
          block_coords[[n_block]] <- c(start,start)
          start <- start + 1
        }
        else{
        block_coords[[n_block]] <- c(n_snp,(start-1))
        n_snp <- start
        n_block <- n_block+1
        snp_block_matrix[start,1] = n_block
        start <- start +1
        }
      }
  }
  genotype <- as.matrix(genotype)
  new_genotype <- matrix(0L, nrow = dim(genotype)[1], ncol = n_block)
  counter <- 0
  tagging_markers <- c()
  message("Creating Blocks")
  for (coords in block_coords){
    counter <- counter + 1
    if (coords[1] == coords[2]){
      new_genotype[,counter] <- genotype[,coords[1]]
      tagging_markers <- c(tagging_markers,coords[1])
    }
    else {
        selection<- round(median(c(coords[1]:coords[2]))) 
        new_genotype[,counter] <- genotype[,selection] 
        tagging_markers <- c(tagging_markers,selection) 
      }  
    }
  output<-list(new_genotype,block_coords,snp_block_matrix,r_vector,D_vector,Dd_vector,tagging_markers)
  names(output)<-c("genotype","block_coords","snp_id_blocks","r_values","D_values","Dd_values","tagging_markers")
  return(output)
  }
}

genotype_cut<-correlation_blocks(genotype,threshold = 0.8,measure="r2")
geno_block<-(genotype_cut$genotype)
dim(geno_block)

geno_block[,2303]
blocks1<-genotype_cut[[1]]
blocks2<-genotype_cut[[2]]
blocks3<-genotype_cut[[3]]
blocks4<-genotype_cut[[4]]
blocks5<-genotype_cut[[5]]
blocks6<-genotype_cut[[6]]

max(blocks5)
min(blocks5)
hist(blocks4)
hist(blocks5)
hist(blocks6)

dim(genotype)

head(blocks2)

head(blocks3)


model



correlation_blocks2 <-function(genotype,threshold=0.9){ 
  snp_block_matrix <- as.matrix(rep(0,dim(genotype)[2]))
  rownames(snp_block_matrix) <- colnames(genotype)
  n_snp <- 1
  start <- 2
  n_block <- 1
  snps <- dim(genotype)[2]
  snp_block_matrix[n_snp,] <- n_block
  block_coords <- list()
  while(start <= snps){
    if (n_snp == snps){
      snp_block_matrix[snps,1] = n_block
    }
    else{
      #matches<-sum(genotype[,n_snp]/genotype[,start] == 1,na.rm = T)+(sum(c(c(genotype[,n_snp] == 1.5)+ c(genotype[,start] == 1.5)) == 1 ,na.rm = T)/2)
      total <- sum(!is.na(genotype[,n_snp]+genotype[,start]))
      #similarity <- matches/total
      p_MM <- 2*(sum(genotype[,n_snp]*genotype[,start] == 4,na.rm = T)+(sum(genotype[,n_snp]+ genotype[,start] == 3 ,na.rm = T)))
      p_M <- sum(genotype[,n_snp]+genotype[,start] == 3.5 ,na.rm = T)
      similarity <- (p_MM+p_M)/(total*2)
      print(similarity)
      if (similarity >= threshold){
        snp_block_matrix[start,1] = n_block
        start <- start+1
      }
      else {
        block_coords[[n_block]] <- c(n_snp,start)
        n_snp <- start
        n_block <- n_block +1
        snp_block_matrix[start,1] = n_block
        start <- start +1
        if (start%%1000 == 0){
          print(start)
        }
      }
    } 
  }
  genotype <- as.matrix(genotype)
  new_genotype <- matrix(0L, nrow = dim(genotype)[1], ncol = n_block)
  counter <- 0
  message("Creating Blocks")
  for (coords in block_coords){
    counter <- counter + 1
    if (counter%%1000 == 0 ){
      print(counter)
    }
    if (coords[1] == coords[2]){
      new_genotype[,counter] <- genotype[,coords[1]]
    }
    else {
      new_genotype[,counter] <- rowMeans(genotype[,coords[1]:coords[2]],na.rm = T) 
    }
  }
  output<-list(new_genotype,block_coords)
  names(output)<-c("genotype","block_coords","snp_id_blocks")
  return(list(output) 
} 

genotype_cut<-correlation_blocks(genotype)
genotype_cut[[2]]

genotype_cut[,1392]

dim(genotype_cut)


system.time(blocks<-correlation_blocks(genotype,0.9))
dim(blocks)
tail(blocks)
genotype[,blocks[,1] == 1]
sum(blocks[,1] == 1)
woot<-sort((table(blocks)),decreasing = T)
woot[1:100]

correlation_blocks_running <-function(genotype,threshold=0.9){ 
  snp_block_matrix <- as.matrix(rep(0,dim(genotype)[2]))
  rownames(snp_block_matrix) <- colnames(genotype)
  n_snp <- 1
  start <- 2
  n_block <- 1
  snps <- dim(genotype)[2]
  snp_block_matrix[n_snp,] <- n_block
  similarity <- 0
  while(start <= snps){
    if (n_snp == snps){
      snp_block_matrix[snps,1] = n_block
    }
    else{
      matches<-sum(genotype[,n_snp]/genotype[,start] == 1,na.rm = T) + sum((genotype[,n_snp]+genotype[,start]) == 3,na.rm = T)+sum(((genotype[,n_snp] == 1.5 + genotype[,start] == 1.5)==1,na.rm = T)
      total <- sum(!is.na(genotype[,n_snp]+genotype[,start]))
      score<-matches+(total-matches)/2
      similarity <- score/total
      if (similarity/(start-n_snp) >= threshold){
        snp_block_matrix[start,1] = n_block
        start <- start+1
      }
      else {
        n_snp <- start
        n_block <- n_block +1
        snp_block_matrix[start,1] = n_block
        start <- start +1
        similarity <- 0
      }
    }
  }
  
  
} 


blocks<-correlation_blocks_running(genotype)
tail(blocks)


correlation_blocks_network <-function(genotype,threshold=0.9,max_block_size=1000){ 
  snp_block_matrix <- as.matrix(rep(0,dim(genotype)[2]))
  rownames(snp_block_matrix) <- colnames(genotype)
  n_snp <- 1
  start <- 2
  n_block <- 1
  network_size <- 0
  r2_values <- c()
  block_coords <- list()
  snps <- dim(genotype)[2]
  snp_block_matrix[n_snp,] <- n_block
  while(start <= snps){ 
    if (n_snp == snps){
      snp_block_matrix[snps,1] = n_block
    }
    else{ 
      temp_sim <- 0
      network_pairs<-combn(c(n_snp:(start)),2)
      for (i in 1:dim(network_pairs)[2]){
        start_t <- network_pairs[1,i]
        n_snp_t <- network_pairs[2,i]
        # if (n_snp == snps){
        #   snp_block_matrix[snps,1] = n_block
        # }
        #else{
        #matches<-sum(genotype[,n_snp]/genotype[,start] == 1,na.rm = T)+(sum(c(c(genotype[,n_snp] == 1.5)+ c(genotype[,start] == 1.5)) == 1 ,na.rm = T)/2)
        total <- sum(!is.na(genotype[,n_snp_t]+genotype[,start_t]))
        usable_values <- !is.na(genotype[,n_snp_t]+genotype[,start_t])
        #similarity <- matches/total
        p_AB <- (sum(genotype[usable_values,n_snp_t]+genotype[usable_values,start_t] == 0,na.rm = T)+(sum(genotype[usable_values,n_snp_t]+ genotype[usable_values,start_t] == 1 ,na.rm = T)/2)+(sum(genotype[usable_values,n_snp_t]*genotype[usable_values,start_t] == 1 ,na.rm = T)/2))/total
        p_A <- (sum(genotype[usable_values,n_snp_t] == 0,na.rm = T)+ sum(genotype[usable_values,n_snp_t] == 1,na.rm = T)/2)/total
        p_a <- 1-p_A
        p_B <- (sum(genotype[usable_values,start_t] == 0,na.rm = T)+ sum(genotype[usable_values,start_t] == 1,na.rm = T)/2)/total
        p_b <- 1-p_B
        #print(c(p_A,p_B,p_AB,start))
        D <- p_AB-(p_A*p_B)
        r2 <- D^2/(p_A*p_a*p_B*p_b)
        r2_values <- c(r2_values,r2)
        temp_sim <- r2+temp_sim
        network_size <- network_size + 1
      }
      similarity <- temp_sim
      print(c(similarity,similarity/network_size,network_size,start,n_snp))
      if (similarity/network_size >= threshold && network_size < 1001){
        snp_block_matrix[start,1] = n_block
        start <- start+1
        network_size <-0
        if (start > snps){
          block_coords[[n_block]] <- c(n_snp,(start-1))
        }
      }
      else {
        if (start == snps){
          block_coords[[n_block]] <- c(n_snp,(start-1))
          n_block <- n_block + 1
          snp_block_matrix[snps,1] <- n_block
          block_coords[[n_block]] <- c(start,start)
          start <- start + 1
        }
        else{
        block_coords[[n_block]] <- c(n_snp,(start-1))
        n_snp <- start
        n_block <- n_block +1
        snp_block_matrix[start,1] = n_block
        start <- start +1
        network_size <- 0
        }
      }
    }
  }
  genotype <- as.matrix(genotype)
  new_genotype <- matrix(0L, nrow = dim(genotype)[1], ncol = n_block)
  counter <- 0
  tagging_markers <- c()
  message("Creating Blocks")
  for (coords in block_coords){
    counter <- counter + 1
    if (coords[1] == coords[2]){
      new_genotype[,counter] <- genotype[,coords[1]]
      tagging_markers <- c(tagging_markers,coords[1])
    }
    else {
      selection<- round(median(c(coords[1]:coords[2]))) 
      new_genotype[,counter] <- genotype[,selection] 
      tagging_markers <- c(tagging_markers,selection) 
    }  
  }
  output<-list(new_genotype,tagging_markers,snp_block_matrix,r2_values)
  names(output)<-c("genotype","tagging_markers","genotype_block_matrix")
  return(output)
}

blocks<-correlation_blocks_network(genotype,0.8)


max(table(blocks$genotype_block_matrix))

hist(blocks$r2)

dim(blocks[[1]])
dim(blocks[[2]])
tail(blocks[[2]])
tail(blocks[[1]])

length(blocks$tagging_markers)
blocks<-blocks$genotype_block_matrix
max(table(blocks))


max(hist(blocks[[2]]))
dim(blocks[[1]])
tail(blocks[[3]])



matches<-sum(genotype[,1001]/genotype[,1000] == 1,na.rm = T)+sum(c(c(genotype[,1001] == 1.5)+ c(genotype[,1000] == 1.5)) == 1 ,na.rm = T)/2

matches<-sum(tempgen[,1]/tempgen[,2] == 1,na.rm = T)+sum((tempgen[,1]+tempgen[,2]) == 3,na.rm = T)
score<-matches+(sum(!is.na(tempgen[,1]+tempgen[,2]))-matches)/2
sum(c(c(genotype[,1] == 1.5)+ c(genotype[,3] == 1.5)) == 1 ,na.rm = T)

sum((genotype[,1]+genotype[,3]) == 2.5)
sum((genotype[,1]+genotype[,3]) == 3.5)


### New matrix Split ###

#' Calculate the epistatic interaction effect between SNP pairs to construct a 
#' WISH network using a genotype data frame created from genarate.genotype()
#' @import doParallel
#' @import foreach
#' @import RcppEigen
#' @description A WISH network can be built based on epistatic interaction 
#' effects between SNP pairs. Those interaction effects are calculated using
#' linear models. 
#' @usage epistatic.correlation(phenotype, genotype, parallel,test,simple)
#' @param phenotype Dataframe with the rows correspinding to the individuals
#' in the analysis,and columns for the different measured phenotypes and 
#' fixed/random factors. Only give one phenotype column at a time. Phenotypes
#' should be non-categorical continous or discrete/semi-discrete variables. 
#' Make sure that the dataframe contains the same
#' individuals as in the genotype-file, and that those are in the same order.
#' @param genotype Dataframe with the genotype information, resulting from 
#' the function generate.genotype(). Make sure that the dataframe contains the 
#' same individuals as in the phenotype-file, and that those are in the 
#' same order.
#' @param parallel Number of cores to use for parallel execution in the function 
#' registerDoParallel()
#' @param test True or False value indicating if a test run is being perform.
#' If True will calculate the expected time it will take for the full analysis
#' based on calculating 100.000 models with the setting chosen
#' @param simple True or false value indicating if only a major/major and
#' minor/minor directed interaction model are tested (simple=T) or if if 
#' interactions on the major/minor minor axis are tested as well, with the 
#' best one of the two being selected (simple=F).
#' @return A list of two matrices. The first matrix gives the epistatic
#' interaction effects between all the SNP-pairs which were in the input 
#' genotype data) and selected with the pvalue from the GWAS results. 
#' The second matrix are the corresponding pvalues of the parameter 
#' estimates of the epistatic interactions. 
#' @references Lisette J.A. Kogelman and Haja N.Kadarmideen (2014). 
#' Weighted Interaction SNP Hub (WISH) network method for building genetic
#' networks for complex diseases and traits using whole genome genotype data.
#' BMC Systems Biology 8(Suppl 2):S5. 
#' http://www.biomedcentral.com/1752-0509/8/S2/S5.
#' @examples
#' epistatic.correlation(phenotype,genotype,parallel,test,simple)
#' 
#' @export

epistatic.correlation_test <- function(phenotype,genotype,threads=1,test=T,simple=T){ 
  registerDoParallel(threads)
  if ( simple == T){ 
    model <- 1
  }
  else {
    model <- 2
  }
    phenotype < as.matrix(phenotype)
  n<-ncol(genotype)
  if(is.data.frame(genotype)){
    genotype[] <- lapply(genotype, as.numeric)
  }
  if (simple==F || test==T){
    genotype_rev <- genotype
    decide_1<-(genotype_rev==1)
    decide_2<-(genotype_rev==2)
    genotype_rev[decide_1] <- 2
    genotype_rev[decide_2] <- 1
    rm(decide_1)
    rm(decide_2)
    genotype_rev <- as.data.frame(genotype_rev)
  }
  if (test==T && n > 315) {
    message("Running Test")
    message("Estimating run time based on ~100.000 models")
    start.time <- Sys.time()
    coord_splits <-triangular_split(316,threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype[,1:316],genotype_rev[,1:316],phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    end.time <- Sys.time()
    time<-as.numeric(end.time-start.time,units="hours")
    model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
    model_time <- round(model_time,digits = 2)
    model_time<-as.character(model_time)
    estimate<-paste(paste("The estimated run time for the simple model is",model_time),"hours",sep=" ")
    message(estimate)
    coord_splits <-triangular_split(316,threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype[,1:316],genotype_rev[,1:316],phenotype=phenotype,coord_splits[j,],model=2) 
      return(subset)
    }
    end.time <- Sys.time()
    time<-as.numeric(end.time-start.time,units="hours")
    model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
    model_time <- round(model_time,digits = 2)
    model_time<-as.character(model_time)
    estimate<-paste(paste("The estimated run time for the full model is",model_time),"hours",sep=" ")
    message(estimate)
  }
  else if (test==T && n <= 315){ 
    message("Data size too small for testing, running normal analysis")
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    # Running opposite minor/major co-linearity model
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix_rev <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
  }
  else if (test==F && simple==F) {
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    # Running opposite minor/major co-linearity model
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix_rev <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
  }
  if (test == F && simple==F || (test==T && n <= 315)){
    # Transposing and filling out the correlation and pvalue matrix
    epi_cor <- snp_matrix[seq(1,nrow(snp_matrix)-1,2),]
    epi_pvalue <-   snp_matrix[seq(2,nrow(snp_matrix),2),]
    rm(snp_matrix)
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 1
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    epi_cor_t_1 <- epi_cor_t
    epi_pvalue_t_1 <- epi_pvalue_t
    epi_cor_t_1[is.na(epi_cor_t_1)] <- 0
    epi_pvalue_t_1[is.na(epi_pvalue_t_1)] <- 1
    # Opposite assumption correlation and pvalue matrix
    epi_cor <- snp_matrix_rev[seq(1,nrow(snp_matrix_rev)-1,2),]
    epi_pvalue <-   snp_matrix_rev[seq(2,nrow(snp_matrix_rev),2),]
    rm(snp_matrix_rev)
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 1
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    epi_cor_t_2 <- epi_cor_t
    epi_pvalue_t_2 <- epi_pvalue_t
    epi_cor_t_2[is.na(epi_cor_t_2)] <- 0
    epi_pvalue_t_2[is.na(epi_pvalue_t_2)] <- 1
    #Picking the correct model assumption based on lowest pvalue
    decider_matrix <- epi_pvalue_t_1-epi_pvalue_t_2
    epi_pvalue_t <- epi_pvalue_t_1
    epi_pvalue_t[0 < decider_matrix] <- epi_pvalue_t_2[0 < decider_matrix]
    epi_cor_t <- epi_cor_t_1
    epi_cor_t[0 < decider_matrix] <- epi_cor_t_2[0 < decider_matrix]
    colnames(epi_pvalue_t) <- colnames(genotype)
    rownames(epi_pvalue_t) <- colnames(genotype)
    colnames(epi_cor_t) <- colnames(genotype)
    rownames(epi_cor_t) <- colnames(genotype)
    output <-list(epi_pvalue_t,epi_cor_t)
    names(output)<-c("Pvalues","Coefficients")
    return(output)
  }
  else if (test==F && simple==T){ 
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    epi_cor <- snp_matrix[seq(1,nrow(snp_matrix)-1,2),]
    epi_pvalue <-   snp_matrix[seq(2,nrow(snp_matrix),2),]
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 1
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    colnames(epi_pvalue_t) <- colnames(genotype)
    rownames(epi_pvalue_t) <- colnames(genotype)
    colnames(epi_cor_t) <- colnames(genotype)
    rownames(epi_cor_t) <- colnames(genotype)
    output <-list(epi_pvalue_t,epi_cor_t)
    names(output)<-c("Pvalues","Coefficients")
    return(output)
  }
} 

system2("ulimit","-s 16384")
system("ulimit -s")
ids_delete <- read.table("/data/nbg153/ADHD/WISH_files_newest/IDs_delete.txt")
phenotype <- read.table("//data/nbg153/ADHD/WISH_files_newest/pheno_for_victor.txt",header = T)
genotype<-generate.genotype(ped = "/data/nbg153/ADHD/WISH_files_newest/trimmed_ped",tped = "/data/nbg153/ADHD/WISH_files_newest/trimmed_tped",major.freq = 0.9,coding="linear")
genotype<-genotype[!(rownames(genotype) %in% ids_delete[,1]),]
tped <- fread("/data/nbg153/ADHD/WISH_files_newest/INDICES1_1_2.tped", data.table = F)

dim(genotype)
ped <- fread("/data/nbg153/ADHD/WISH_files_newest/trimmed_ped", data.table = F)
tped <- fread("/data/nbg153/ADHD/WISH_files_newest/trimmed_tped", data.table = F)



genotype<-generate.genotype(ped = ped,tped = "/data/nbg153/ADHD/WISH_files_newest/trimmed_tped",major.freq = 0.9)

new_geno <- cbind(genotype,genotype,genotype)

dim(genotype)

system.time(result<-epistatic.correlation(phenotype[,3],genotype,threads= 40,test=F,simple=F))
result2<-epistatic.correlation(phenotype[,3],geno_block,threads= 40,test=F,simple=F)

hist(result2$Pvalues)
r2_bf<-p.adjust(result2$Pvalues)
r2_fdr<-p.adjust(result2$Pvalues,method="fdr")
min(r2_bf)
min(r2_fdr)

min(result2$Pvalues)
which(result$Pvalues < 5.950226e-08)


3204039%%2695
names<-row.names(result$Pvalues)

tped[tped$V2==names[2379],1:5]
tped[tped$V2==names[1189],1:5]

which.min(result$Pvalues[2379,1189])

result$Pvalues[,2379]

dim(result$Pvalues)
r2_bf<-p.adjust(result$Pvalues)
r2_fdr<-p.adjust(result$Pvalues,method="fdr")
min(r2_bf)
min(r2_fdr)
which(r2_fdr < 0.05)

phenotype
triangular_split_test <- function(n,split) {
  if (split == 1){
    boundaries<-matrix(0,nrow=split,ncol=2)
    boundaries[1,1] <- 1
    boundaries[1,2] <- n
  }
  else {
    total_count<-(n*n-n)/2
    splits <- total_count/split
    total <- splits
    row <- c()
    for (i in 1:(split-1)){
      temp_row<-((2*n-1)-sqrt((2*n-1)^2-8*total))/2
      temp_row <- floor(temp_row)
      row <- c(row,temp_row)
      total <- total+splits
    }
    row<-as.vector(c(0,row,n))
    boundaries<-matrix(0,nrow=split,ncol=2)
    for (i in 1:split){
      boundaries[i,1] <- row[i]+1
      boundaries[i,2] <- row[i+1]
    }
  }
  return(boundaries)
}

# Creates coordinates splits for rectangular matrices
square_split <- function(threads,rows) { 
  split <- floor(rows/threads)
  coords <- matrix(0,nrow=2,ncol=threads)
  coords <- c()
  for (i in 1:threads){ 
    if (i != threads){
      coords <-c(coords,(i)*split)
    } 
  }
  coords <- c(0,coords,rows) 
  boundaries<-matrix(0,nrow=threads,ncol=2)
  for (i in 1:threads){
    boundaries[i,1] <- coords[i]+1
    boundaries[i,2] <- coords[i+1]
  }
  return(boundaries)
}

square_split(10,1000)

memory_subspace <- function(n,threads,genotype,genotype_rev=0,phenotype,model) { 
  # Split correlation into n^2/2 section for memory usage saving
  registerDoParallel(threads)
  if (n == 1){ 
    coord_splits <-triangular_split_test(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
  }
  else {
  square_size <- dim(genotype)[2]
  snp_matrix <- matrix(0,nrow=2*square_size,ncol=square_size)
  splits<-floor(square_size/n)
  square_coords <- c()
  for (i in 1:n){
    if (i == n){
      coord <- square_size
    }
    else {
      coord <- i*splits
    }
    square_coords <- c(square_coords,coord)
    }
  pair_coords<-combn(square_coords,2)
  coord_pairs<-cbind(pair_coords,rbind(square_coords,square_coords))
  # Run the new subsets and collect them in the main matrix
  
  for (i in 1:dim(coord_pairs)[2]) { 
    if (coord_pairs[1,i] == coord_pairs[2,i]){
      temp_geno <- genotype[,(coord_pairs[1,i]-splits+1):coord_pairs[1,i]]
      temp_geno_rev <- 0
      if (model == 2){
        temp_geno_rev <- genotype_rev[,(coord_pairs[1,i]-splits+1):coord_pairs[1,i]]
      }
      coord_splits <-triangular_split_test(dim(temp_geno)[2],threads)
      partial_results <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(temp_geno,temp_geno_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
      }
      snp_matrix[((coord_pairs[1,i]-splits+1)*2-1):(coord_pairs[1,i]*2),(coord_pairs[2,i]-splits+1):coord_pairs[2,i]] <- partial_results
    }
    else {
      if (square_size == coord_pairs[1,i]){
          temp_geno_1 <- genotype[,square_coords[n-1]:square_coords[n]]
          temp_geno_2 <- genotype[,(coord_pairs[2,i]-splits+1):coord_pairs[2,i]]
          temp_geno_rev <- 0
          if (model == 2)
          {
            temp_geno_rev <-  genotype_rev[,(coord_pairs[2,i]-splits+1):coord_pairs[2,i]]
          } 
          coords_splits <-square_split(threads,dim(temp_geno_1)[2])
          partial_results <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
            subset <- partial_correlations(temp_geno_1[,coords_splits[j,1]:coords_splits[j,2]],temp_geno_2,temp_geno_rev,phenotype=phenotype,model=model)
            return(subset)
          }
         snp_matrix[(square_coords[n-1]*2-1):(square_coords[n]*2),(coord_pairs[2,i]-splits+1):coord_pairs[2,i]] <- partial_results
      }
      else if (square_size == coord_pairs[2,i]){ 
          temp_geno_2 <- genotype[,square_coords[n-1]:square_coords[n]]
          temp_geno_1 <- genotype[,(coord_pairs[1,i]-splits+1):coord_pairs[1,i]]
          print(dim(temp_geno_1))
          print(dim(temp_geno_2))
          temp_geno_rev <- 0
          if (model == 2){
            temp_geno_rev <- genotype_rev[,square_coords[n-1]:square_coords[n]]
          }
          coords_splits <-square_split(threads,dim(temp_geno_1)[2])
          partial_results <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
            subset <- partial_correlations(temp_geno_1[,coords_splits[j,1]:coords_splits[j,2]],temp_geno_2,temp_geno_rev,phenotype=phenotype,model=model)
            return(subset)
          }
          #snp_matrix[((coord_pairs[1,i]-splits+1)*2-1):(coord_pairs[1,i]*2),square_coords[n-1]:square_coords[n]] <- partial_results
          }
      else { 
        temp_geno_1 <- genotype[,(coord_pairs[1,i]-splits+1):coord_pairs[1,i]]
        temp_geno_2 <- genotype[,(coord_pairs[2,i]-splits+1):coord_pairs[2,i]]
        print(dim(temp_geno_1))
        print(dim(temp_geno_2))
        temp_geno_rev <- 0
        if (model == 2){
          temp_geno_rev <- genotype_rev[,(coord_pairs[2,i]-splits+1):coord_pairs[2,i]]
        }
        coords_splits <-square_split(threads,dim(temp_geno_1)[2])
        partial_results <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
          subset <- partial_correlations(temp_geno_1[,coords_splits[j,1]:coords_splits[j,2]],temp_geno_2,temp_geno_rev,phenotype=phenotype,model=model)
          return(subset)
        }
        #snp_matrix[((coord_pairs[1,i]-splits+1)*2-1):(coord_pairs[1,i]*2),(coord_pairs[2,i]-splits+1):coord_pairs[2,i]] <- partial_results
      }
    } 
  }
  }
  #return(snp_matrix)
}  

result<-memory_subspace(1,20,genotype = genotype,genotype_rev = genotype,phenotype=phenotype[,3],model = 1)

result<-memory_subspace(1,20,genotype = new_new_geno,genotype_rev = new_geno,phenotype=phenotype[,3],model = 1)

new_new_geno <- cbind(new_geno,new_geno)
dim(new_geno)

result<-epistatic.correlation_test(phenotype[,3],genotype,threads= 40,test=T,simple=T)

  # New partial correlation formula based on the non-traingular subsetting
partial_correlations <- function(genotype1,genotype2,genotype2_rev,phenotype,model=1){ 
    size1<-dim(genotype1)[2]
    size2<-dim(genotype2)[2]
    data_matrix <- matrix(0,nrow = 2*(size1),ncol=size2) 
    if (model==1){
      for (i in 1:size1){
          for (j in 1:size2){
            tmp_model = glm(phenotype ~ I(genotype1[,i])+I(genotype2[,j])+I(genotype1[,i]*genotype2[,j]))
            data_matrix[(i*2-1):(i*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
    if (model==2){
      for (i in 1:size1){
        for (j in 1:size2){
          tmp_model = fastLm(phenotype ~ I(genotype1[,i])+I(genotype2[,j])+I(genotype1[,i]*genotype2_rev[,j]))
          data_matrix[(i*2-1):(i*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
    gc(verbose = F)
    return(data_matrix)
  } 


partial_correlations_triangular <- function(genotype_1,genotype_rev_1,phenotype,coords,model=1){
  n=dim(genotype_1)[2]
  data_matrix <- matrix(0,nrow = 2*(coords[2]-coords[1]+1),ncol=dim(genotype_1)[2])
  matrix_row <- 0
  if (model==1){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_1[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  if (model==2){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_rev_1[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  return(data_matrix)
}

partial_correlations <- function(genotype1,genotype2,genotype2_rev,phenotype,model=1){ 
  size1<-dim(genotype1)[2]
  size2<-dim(genotype1)[2]
  genotype2 <- 0
  #print(c(size1,size2)) 
  data_matrix <- matrix(0,nrow = 2*(size1),ncol=size2) 
  if (model==1){
    for (i in 1:size1){
      for (j in 1:size2){
        tmp_model = fastLm(phenotype ~ I(genotype1[,i])+I(genotype1[,j])+I(genotype1[,i]*genotype1[,j]))
        data_matrix[(i*2-1):(i*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
      }
    }
  }
  if (model==2){
    for (i in 1:size1){
      for (j in 1:size2){
        tmp_model = fastLm(phenotype ~ I(genotype1[,i])+I(genotype2[,j])+I(genotype1[,i]*genotype2_rev[,j]))
        data_matrix[(i*2-1):(i*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
      }
    }
  }
  return(data_matrix)
} 



genotype_new<- genotype
genotype_new[genotype_new==2] <- 0
genotype_new[genotype_new==1] <- 4
genotype_new[genotype_new==1.5] <- 1
genotype_new[genotype_new==4] <- 2

values <- c()
for (i in 1:100){
  values <- c(values,max(rnorm(i)))
}
  
plot(1:100,values)
  
  
qnorm(0.1) 
?qnorm




# GLM testing

partial_correlations_triangular_glm <- function(genotype_1,genotype_rev_1,phenotype,coords,model=1){
  n=dim(genotype_1)[2]
  data_matrix <- matrix(0,nrow = 2*(coords[2]-coords[1]+1),ncol=dim(genotype_1)[2])
  matrix_row <- 0
  if (model==1){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = glm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_1[,j]),family=binomial())
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  if (model==2){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = glm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_rev_1[,j]),family=binomial())
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  return(data_matrix)
} 


partial_correlations_glm <- function(genotype_1,genotype_rev_1,phenotype,coords,model=1){
  n=dim(genotype_1)[2]
  data_matrix <- matrix(0,nrow = 2*(coords[2]-coords[1]+1),ncol=dim(genotype_1)[2])
  matrix_row <- 0
  if (model==1){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = glm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_1[,j]),family=binomial())
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  if (model==2){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = glm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_rev_1[,j]),family=binomial())
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  return(data_matrix)
} 

epistatic.correlation_glm <- function(phenotype,genotype,threads=1,test=T,simple=T){ 
  registerDoParallel(threads)
  if ( simple == T){ 
    model <- 1
  }
  else {
    model <- 2
  }
  phenotype < as.matrix(phenotype)
  n<-ncol(genotype)
  if(is.data.frame(genotype)){
    genotype[] <- lapply(genotype, as.numeric)
  }
  if (simple==F || test==T){
    genotype_rev <- genotype
    decide_1<-(genotype_rev==0)
    decide_2<-(genotype_rev==2)
    genotype_rev[decide_1] <- 2
    genotype_rev[decide_2] <- 0
    rm(decide_1)
    rm(decide_2)
    genotype_rev <- as.data.frame(genotype_rev)
  }
  if (test==T && n > 315) {
    message("Running Test")
    message("Estimating run time based on ~100.000 models")
    start.time <- Sys.time()
    coord_splits <-triangular_split(316,threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_glm(genotype[,1:316],genotype_rev[,1:316],phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    end.time <- Sys.time()
    time<-as.numeric(end.time-start.time,units="hours")
    model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
    model_time <- round(model_time,digits = 2)
    model_time<-as.character(model_time)
    estimate<-paste(paste("The estimated run time for the simple model is",model_time),"hours",sep=" ")
    message(estimate)
    coord_splits <-triangular_split(316,threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_glm(genotype[,1:316],genotype_rev[,1:316],phenotype=phenotype,coord_splits[j,],model=2) 
      return(subset)
    }
    end.time <- Sys.time()
    time<-as.numeric(end.time-start.time,units="hours")
    model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
    model_time <- round(model_time,digits = 2)
    model_time<-as.character(model_time)
    estimate<-paste(paste("The estimated run time for the full model is",model_time),"hours",sep=" ")
    message(estimate)
  }
  else if (test==T && n <= 315){ 
    message("Data size too small for testing, running normal analysis")
    coord_splits <-glm_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_glm(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    # Running opposite minor/major co-linearity model
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix_rev <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_glm(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
  }
  else if (test==F && simple==F) {
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_glm(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    # Running opposite minor/major co-linearity model
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix_rev <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_glm(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
  }
  if (test == F && simple==F || (test==T && n <= 315)){
    # Transposing and filling out the correlation and pvalue matrix
    epi_cor <- snp_matrix[seq(1,nrow(snp_matrix)-1,2),]
    epi_pvalue <-   snp_matrix[seq(2,nrow(snp_matrix),2),]
    rm(snp_matrix)
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 1
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    epi_cor_t_1 <- epi_cor_t
    epi_pvalue_t_1 <- epi_pvalue_t
    epi_cor_t_1[is.na(epi_cor_t_1)] <- 0
    epi_pvalue_t_1[is.na(epi_pvalue_t_1)] <- 1
    # Opposite assumption correlation and pvalue matrix
    epi_cor <- snp_matrix_rev[seq(1,nrow(snp_matrix_rev)-1,2),]
    epi_pvalue <-   snp_matrix_rev[seq(2,nrow(snp_matrix_rev),2),]
    rm(snp_matrix_rev)
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 1
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    epi_cor_t_2 <- epi_cor_t
    epi_pvalue_t_2 <- epi_pvalue_t
    epi_cor_t_2[is.na(epi_cor_t_2)] <- 0
    epi_pvalue_t_2[is.na(epi_pvalue_t_2)] <- 1
    #Picking the correct model assumption based on lowest pvalue
    decider_matrix <- epi_pvalue_t_1-epi_pvalue_t_2
    epi_pvalue_t <- epi_pvalue_t_1
    epi_pvalue_t[0 < decider_matrix] <- epi_pvalue_t_2[0 < decider_matrix]
    epi_cor_t <- epi_cor_t_1
    epi_cor_t[0 < decider_matrix] <- epi_cor_t_2[0 < decider_matrix]
    colnames(epi_pvalue_t) <- colnames(genotype)
    rownames(epi_pvalue_t) <- colnames(genotype)
    colnames(epi_cor_t) <- colnames(genotype)
    rownames(epi_cor_t) <- colnames(genotype)
    output <-list(epi_pvalue_t,epi_cor_t)
    names(output)<-c("Pvalues","Coefficients")
    return(output)
  }
  else if (test==F && simple==T){ 
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_glm(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],model=model) 
      return(subset)
    }
    epi_cor <- snp_matrix[seq(1,nrow(snp_matrix)-1,2),]
    epi_pvalue <-   snp_matrix[seq(2,nrow(snp_matrix),2),]
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 1
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    colnames(epi_pvalue_t) <- colnames(genotype)
    rownames(epi_pvalue_t) <- colnames(genotype)
    colnames(epi_cor_t) <- colnames(genotype)
    rownames(epi_cor_t) <- colnames(genotype)
    output <-list(epi_pvalue_t,epi_cor_t)
    names(output)<-c("Pvalues","Coefficients")
    return(output)
  }
}   

rand_pheno <- sample(c(1,0),193,T)
system.time(result2<-epistatic.correlation(rand_pheno,genotype[,1:1000],threads= 10,test=T,glm=T,simple=F)) 

tmp_model<-speedglm(rand_pheno ~ I(genotype[,1])+I(genotype[,2])+I(genotype[,2]*genotype[,1]),family=binomial())


mode1<-formula(phenotype[,3] ~ I(genotype[,1])+I(genotype[,2])+I(genotype[,2]*genotype[,1]))
mode2<-formula(phenotype[,3] ~ I(genotype[,1])+I(genotype[,2])+I(genotype[,2]*genotype[,1]))

#visualization testing



effect_sumarize<-colSums(abs(e_e))
effect_sumarize<-abs(colSums(e_e))

plot(1:7758,effect_sumarize)



mins_p<-apply(p_e,2,min)
plot(1:7758,-log(mins_p))
hist(mins_p)

which.min(mins_p)

effect_sumarize<-colSums(-log(p_e))
hist(effect_sumarize)
which(effect_sumarize > 18000)
hist(effect_sumarize)
effect_sumarize<-abs(colSums(p_e))

plot(1:7758,effect_sumarize)

tped[tped[,2] == "BovineHD1300004594",]

table(tped[,1])
effect_test<-c()
for (i in 1:8000){
  print(i)
  effect_test <- c(effect_test,sum(-log(runif(1000))))
}

hist(effect_test) 






 
#' Function for summarizing individual variant epistasis results 
#' @description Extract summary information from the epistasis analysis 
#' for each variant
#' @usage variant_summary(tped,correlations)
#' @param tped Input tped file as .tped file or data frame. The tped file (.tped)
#' is a transposed ped file, from Plink. 
#' This file contains the SNP and genotype information where one row is a SNP.
#' The first 4 columns of a TPED file are the same as a 4-column MAP file.
#' Then all genotypes are listed for all individuals for each particular SNP on 
#' each line. Again, SNPs are 1,2-coded.#'
#' @param correlations List of epistatic correlations and p-values generated by
#' epistatic.correlation()
#' @return Plots a pseudo manhattan plot
#' @examples
#' pseudo_manahattan(tped,correlations)
#' 
#' @export


summary_matrix <- function(tped,correlations) {
  matrix_base <- tped[tped[,2] %in% rownames(correlations$Pvalues),c(1,2,4)]
  likelyhood_sum <- colSums(-log(correlations$Pvalues))
  effect_sum <- colSums(abs(correlations$Coefficients))
  min_p <- apply(correlations$Pvalues,2,min)
  summary_matrix<-cbind(matrix_base,likelyhood_sum,effect_sum,min_p)
  colnames(summary_matrix) <- c("chr","variant","position","likelihood_sum","Effect_Sum","min_Pvalue")
  return(summary_matrix)
} 



pseudo_manhattan(tped,model_GIFT,values="c") 
test<-variant_matrix(tped,model_GIFT)

tped<-data.table::fread("../caw_project/GIFT_final2.tped",data.table = F)

tped <- tped[!(tped$V1 %in%  c(0,23,24,25,26,27,28,29,30,31,32,33)),]

load("../../nbg153/caw_project/model_GIFT.RData")
genome.interaction(tped,model_GIFT)

correlations <- model_GIFT
genome.interaction <- function(tped,correlations,quantile=0.9) {
  if (is.character(tped)){
    message("loading tped file")
    tped <- fread(tped,data.table=F)
  }
  else if (!is.data.frame(tped)){
    stop("tped file not file or data frame")
  }
  new_P <- (1-correlations$Pvalues)
  map<-tped[tped[,2] %in% rownames(correlations$Pvalues),1:2]
  counter = 0
  ends <- c()
  chr_list <- c()
  for (i in map[,1]){
    if (counter == 0){
      counter <- counter + 1
      starts <- 1
      chr <- i
      chr_list <- c(chr_list,chr)
    }
    else {
      if (chr == i){
        counter <- counter + 1 
        if (counter == dim(map)[1]){
          ends <- c(ends,counter)
        }
      }
      if (chr != i){
        chr <- i
        chr_list <- c(chr_list,chr)
        ends <- c(ends,counter)
        counter <- counter + 1 
        starts <- c(starts,counter)
      }
    }
  }
  coord_splits<-cbind(starts,ends)
  visualization_matrix <- matrix(nrow = length(starts),ncol = length(starts))
  colnames(visualization_matrix) <- chr_list
  rownames(visualization_matrix) <- chr_list
  for (i in 1:length(starts)){
    for (j in 1:length(starts)){
      subset <- c(new_P[coord_splits[i,1]:coord_splits[i,2],coord_splits[j,1]:coord_splits[j,2]])
      subset <- abs(subset)
      visualization_matrix[i,j] <- quantile(subset,quantile,na.rm=T)
    }
  }
  visualization_matrix <- 2*(visualization_matrix-min(visualization_matrix))/(max(visualization_matrix)-min(visualization_matrix))-1
  corrplot(visualization_matrix, type="upper",title= "Pairwise Chromosome Interaction Map",mar=c(0,0,2,0))
}


genome.interaction(tped,model_GIFT)

hist(test$likelihood_sum)
hist(test$Effect_Sum)
hist(test$chr)

dim(correlations$Pvalues)


sim_genotype <- function(samples,snps) {
  sp<-sample(c(0,1,2),samples*snps,T)  
  genotype_matrix <- matrix(sp,nrow=samples,ncol=snps)
  return(genotype_matrix)
}
i=3000
for (i in c(100,500,1000,2000,4000)){
  genotype<-sim_genotype(i,2000)
  phenotypes <- rnorm(i)
  time<-system.time(epistatic.correlation(phenotypes,genotype,40,simple = T,test=F))
  print(time)
}


for (i in c(5,10,15,20,30,40)){
  genotype<-sim_genotype(500,2000)
  phenotypes <- rnorm(500)
  time<-system.time(epistatic.correlation(phenotypes,genotype,simple = T,test=F))
  print(time)
}

for (i in c(5,10,15,20,30,40)){
  genotype<-sim_genotype(500,3000)
  phenotypes <- rnorm(500)
  time<-system.time(epistatic.correlation(phenotypes,threads = 40,genotype,simple = T,test=F))
  print(time)
}



run_1000 <- c(261,140,104,86,58,53)
run_2000 <- c(1038,559,395,327,231,206)
run_3000 <- c(4543,2463,1757,1441,1113,953)/2
cores <- c(5,10,15,20,30,40)




run_data<-as.data.frame(cbind(c(cores,cores,cores),c(run_1000,run_2000,run_3000),c(1000,1000,1000,1000,1000,1000,2000,2000,2000,2000,2000,2000,3000,3000,3000,3000,3000,3000)))

colnames(run_data) <- c("cores","runtime","size")

run_data$size <- as.character(run_data$size)

library(ggplot2)

gg <- ggplot(run_data,aes(x=cores, y=(runtime),col=size )) + 
  geom_point()+
  ggtitle("Multi-thread Scaling")+
  ylab("Seconds")+
  xlab("Threads")
gg <- gg+guides(size=F)
gg <- gg+labs(col="N-variants")
gg

seconds <- c(174,216,277,346,466,551)
samples <- c(100,500,1000,2000,3000,4000)
sample_runtime<-as.data.frame(cbind(seconds,samples))

gg<-ggplot(sample_runtime,aes(x=samples, y=seconds)) + 
  geom_point(colour="blue",size=4)+
  ggtitle("Sample Size Scaling")+
  ylab("seconds")+
  xlab("N-samples")
gg<-gg+guides(size=FALSE)
gg


ygenotype<-sim_genotype(500,1000)
phenotypes <- rnorm(500)
epistatic.correlation(phenotypes,threads = 40,genotype,simple = F,test=F)

class(tped)

genome.interaction(tped,model_GIFT)

pairwise.chr.map <- function(chr1,chr2,tped,correlations,span=10^6) {  
  new_P <- (1-correlations$Pvalues)
  message("loading tped file")
  tped <- fread(tped,data.table=F)
  total_map <- tped[tped[,2] %in% rownames(correlations$Coefficients),c(1,4)]
  total_map[,2] <- as.numeric(total_map[,2])
  snps1<-c(which(total_map[,1] == chr1))
  snps2 <-c(which(total_map[,1] == chr2))
  values <- abs(correlations$Pvalues[snps1,snps2])
  #values <- -log(values)
  #threshold<-quantile(values,0.95)
  #values[values < threshold] <- 0
  
  heatmap3(values,Rowv = NA,Colv = NA)
}
par(oma=c(0,0,2,0))
pairwise.chr.map(1,2,"../caw_project/GIFT_final2.tped",model_GIFT,grid_size = 50)


hist(model_GIFT$Coefficients)

#heatmap3(visualization_matrix,scale="none",main="Pairwise Chromosomal Interaction",Rowv = NA,Colv = NA,xlab=xlabel,ylab=ylabel ,labRow=c("start",rep("",dim(chromosome_choords1)[1]-2),"end"),labCol=c("start",rep("",dim(chromosome_choords2)[1]-2),"end"))







#' Function to plot summary pseudo-manhattan plots of variants.
#' @description Visualize summary statistics for interactions based on total sum 
#' of -loglikelihoods for each variant across all interactions og the sum of
#' effect sizes.
#' @import ggplot2 
#' @usage pseudo_manahattan(tped,correlations)
#' @param tped Input tped file as .tped file or data frame. The tped file (.tped)
#' is a transposed ped file, from Plink. 
#' This file contains the SNP and genotype information where one row is a SNP.
#' The first 4 columns of a TPED file are the same as a 4-column MAP file.
#' Then all genotypes are listed for all individuals for each particular SNP on 
#' each line. Again, SNPs are 1,2-coded.#'
#' @param correlations List of epistatic correlations and p-values generated by
#' epistatic.correlation()
#' @return Plots a pseudo manhattan plot
#' @examples
#' pseudo_manahattan(tped,correlations)
#' 
#' @export


pseudo_manhattan<- function(tped,correlations,values="p"){
  if (is.character(tped)){
    message("loading tped file")
    tped <- fread(tped,data.table=F)
  }
  map<-tped[tped[,2] %in% rownames(correlations$Pvalues),1:2]
  if ((sum(map[,2] == rownames(correlations$Pvalues)))== dim(correlations$Pvalues)[1]){
    if (values=="p"){
      likelyhood_sum <- colSums(-log(correlations$Pvalues))
      #plot(1:length(likelyhood_sum),likelyhood_sum,col=map[,1]%%2+3,xlab="N-variant",ylab="Sum of log-likelihood",main="Pseudo-Manhattan Plot")
      data_p<-as.data.frame(cbind(likelyhood_sum,c(1:length(likelyhood_sum)),map[,1]))
      colnames(data_p)<-c("like_sum","Nvar","chr")
      data_p$chr <- as.factor(data_p$chr)
      gg<-ggplot(data_p,aes(x=Nvar, y=like_sum,col=chr ))+ geom_point()+
        ggtitle("Pseudo-Manhattan Plot")+
        xlab("N-variant")+
        ylab("Sum of -log-likelihood")
      gg <- gg+guides(col=F)
      return(gg)
    }
    if (values=="c"){
      likelyhood_sum <- colSums(abs(correlations$Coefficients))
      data_p<-as.data.frame(cbind(likelyhood_sum,c(1:length(likelyhood_sum)),map[,1]))
      colnames(data_p)<-c("like_sum","Nvar","chr")
      data_p$chr <- as.factor(data_p$chr)
      gg<-ggplot(data_p,aes(x=Nvar, y=like_sum,col=chr ))+ geom_point()+
        ggtitle("Pseudo-Manhattan Plot")+
        xlab("N-variant")+
        ylab("Sum of Effect Sizes")
      gg <- gg+guides(col=F)
      return(gg)
    }
  }
  else{
    message("tped and model output variant names do not match")
  }
}

#' * genotype The tagging genotypes selected from the blocks
#' * tagging_genotype The genotype selected to represent each block. The median genotype, rounded down is selected
#' * genotype_block_matrix A matrix indicating which block each genotype belongs to


pig_fe <-read.table("/data/nbg153/FeedOmics/phenotypes.txt")




ped <- fread("/data/nbg153/ADHD/WISH_files_newest/trimmed_ped", data.table = F)
tped <- fread("/data/nbg153/ADHD/WISH_files_newest/trimmed_tped", data.table = F)

dim(ped)

ped_small <- ped[,1:5006] 
tped_small <- tped[1:2500,]


names <- c()
for (i in 1:dim(ped)[1]){
  names <- c(names, paste("sample",as.character(i),sep=""))
}
names


snp_names <- c()
for (i in 1:dim(tped_small)[1]){
  snp_names <- c(snp_names, paste("snp",as.character(i),sep=""))
}
snp_names



tped_small[,2] <- snp_names
ped_small[,2] <- names


phenotype <-round(runif(203,1,100)) 

data<-as.data.frame(cbind(names,phenotype))

write.table(ped_small,"/data/nbg153/ADHD/WISH_files_newest/test.ped",col.names=F,row.names=F,quote=F)
write.table(tped_small,"/data/nbg153/ADHD/WISH_files_newest/test.tped",col.names=F,row.names=F,quote=F)
write.table(data,"/data/nbg153/ADHD/WISH_files_newest/test_pheno.txt",col.names=F,row.names=F,quote=F)

chr1 <- 1
chr2 <- 2
region1 <- 1


correlations<-model_GIFT  


tped[,1]<- chr

genotype<-generate.genotype(ped,tped)



LD_genotype<-LD_blocks_t(genotype)
LD_genotype$tagging_genotype
genotype1 <- LD_genotype$genotype

results<-epistatic.correlation(phenotype[,2], genotype,threads = 20 ,test=F)

phenotype[,2] <- rnorm(203)+sample(c(1:10),203,replace=T)

write.table(phenotype,"test_pheno.txt",col.names=F,row.names=F,quote=F)

correlations<-epistatic.correlation(phenotype[,2], genotype, threads = 20 ,test=F,simple=F)
hist(correlations$Coefficients)


genome.interaction(tped,correlations) 

write.table(tped,"tped.test",col.names=F,row.names=F,quote=F)
correlations$Coefficients[correlations$Coefficients > 1000 |  correlations$Coefficients < -1000] <-0
hist(correlations$Coefficients)

modules <-generate.modules(correlations,threads=2)


generate.modules <- function(correlations,values="Coefficients",power=c(seq(1,10,0.1),c(12:22)),n.snps=dim(correlations$Coefficients)[1],minClusterSize=50,type="unsigned",threads=2) {
  enableWGCNAThreads(threads)
  if (values=="Pvalue"){
    corr <- (1-correlations$Pvalues)*(correlations$Coefficients/abs(correlations$Coefficients))
  }
  if (values=="Coefficients"){
    temp_corr<-correlations$Coefficients
    temp_corr[temp_corr < 0] <- 0
    temp_corr <- temp_corr/(max(temp_corr))
    corr <- temp_corr
    temp_corr <- correlations$Coefficients
    temp_corr[temp_corr > 0] <- 0
    temp_corr <- temp_corr/(abs(min(temp_corr)))
    corr[temp_corr < 0] <- temp_corr[temp_corr < 0]
  }
  sft = pickSoftThreshold(corr, powerVector = c(seq(1,10,0.1),c(12:30)), verbose = 5)
  connectivity <- adjacency.fromSimilarity(corr, power=sft$powerEstimate,type=type)
  sizeGrWindow(10,5)
  par(mfrow=c(1,2))
  hist(connectivity,xlab="connectivity")
  scaleFreePlot(connectivity)
  par(mfrow=c(1,1))
  #select SNPs for network construction based on connectivity
  select.snps <- corr[rank(-colSums(connectivity),ties.method="first")<=n.snps,rank(-colSums(connectivity),ties.method="first")<=n.snps ]
  select.snps[,c(1:ncol(select.snps))] <- sapply(select.snps[,c(1:ncol(select.snps))], as.numeric)
  #create adjacency matrix (correlation matrix raised to power beta)
  adjMat <- adjacency.fromSimilarity(select.snps,power=sft$powerEstimate,type=type)
  #calculate dissimilarity TOM
  dissTOM <- 1-(TOMsimilarity(adjMat))
  #create gene dendrogram based on diss TOM
  genetree <- flashClust(as.dist(dissTOM), method="average")
  #cut branches of the tree= modules
  dynamicMods = cutreeDynamic(dendro=genetree, distM=dissTOM, 
                              deepSplit=2,pamRespectsDendro=F, minClusterSize=minClusterSize)
  #give modules a color as name
  moduleColors = labels2colors(dynamicMods)
  #sizeGrWindow(8,6)
  #plot dendrogram with the module colors
  plotDendroAndColors(genetree, moduleColors,
                      dendroLabels=F, hang=0.03,
                      addGuide=T, guideHang = 0.05,
                      main="Gene dendrogram and modules")
  output <- list(select.snps,connectivity,adjMat,dissTOM,genetree,dynamicMods,moduleColors,sft$powerEstimate)
  names(output) <- c("SNPs","connectivity","adjMat","dissTom","genetree","modules","modulecolors","power.estimate")
  return(output)
}




#' Visualization of chromosomal pairwise region epistatic interaction strength, based on 
#' statistical significance 
#' @description Visualization of chromosome pairwise region epistatic interaction strength, based on 
#' statistical significance. The value is based of the most signficant epistatic interaction in each
#' region pair, ranging from 1 ( strongest) to 0 (weakest). By defaulty chromosomes are separated into
#' 1 Mb regions, but if SNPs are more spaced out that this it will adjust to the smallest region that fit
#' the data.  
#' @import heatmap3
#' @usage pairwise.chr.map(chr1,chr2,tped,correlations,span=10^6)
#' @param chr1 The name of the first chromosome in the comparison, matching the name
#' from the tped file
#' @param chr2 The name of the second chromosome in the comparison, matching the name
#' from the tped file
#' @param tped The tped file used in generate.genotype(). The SNPs must
#' be sorted by chromosome and position on the chromosome, matching the order of the SNPs in the correlation 
#' matrices. 
#' @param span Region in bp. Default is 1 Mb (10^6)
#' @param correlations List of epistatic correlations and p-values genrated by
#' epistatic.correlation()
#' @return Outputs a plot visualizing the pairwise chromosome region interaction
#' @examples
#'  pairwise.chr.map("1","2",tped,correlations)
#' 
#' @export


test_pairwise.chr.map <- function(chr1,chr2,tped,correlations,span=10^6) {  
  new_P <- (1-correlations$Pvalues)
  #message("loading tped file")
  #tped <- fread(tped,data.table=F)
  total_map <- tped[tped[,2] %in% rownames(correlations$Pvalues),c(1,4)]
  total_map[,2] <- as.numeric(total_map[,2])
  map <-total_map[total_map[,1] == chr1,]
  first_snp <- map[1,2]
  last_snp <- map[dim(map)[1],2]
  #size <- round((last_snp-first_snp)/span,digits=0)
  progress <- 1
  ends <- c()
  for (snp in map[,2]){   
    if ( snp < (first_snp+progress*span)) {
      starts <- 1
      row <- 1
    }
    else {
      while (snp > first_snp+(progress+1)*span) {
        progress <- progress +1
      }
      print("what")
      ends <- c(ends,row)
      row <- row +1
      starts <- c(starts,row)
      progress <- progress + 1
    }
  }
  ends<- c(ends,dim(map)[1])
  chromosome_choords1 <- cbind(starts,ends)
  chromosome_choords1 <- chromosome_choords1 + which(total_map[,1] == chr1)[1]-1
  map <-total_map[total_map[,1] == chr2,]
  first_snp <- map[1,2]
  last_snp <- map[dim(map)[1],2]
  progress <- 1
  ends <- c()
  for (snp in map[,2]){   
    if ( snp < (first_snp+progress*10^6)) {
      #print("!t")
      starts <- 1
      row <- 1
    }
    else {
      while (snp > first_snp+(progress+1)*10^6) {
        progress <- progress +1
      }
      #print("what")
      ends <- c(ends,row)
      row <- row +1
      starts <- c(starts,row)
      progress <- progress + 1
    }
  }
  ends<- c(ends,dim(map)[1])
  chromosome_choords2 <- cbind(starts,ends)
  chromosome_choords2 <- chromosome_choords2 + which(total_map[,1] == chr2)[1]-1
  visualization_matrix <- matrix(nrow = dim(chromosome_choords1)[1],ncol = dim(chromosome_choords2)[1])
  colnames(visualization_matrix) <- 1:(dim(chromosome_choords2)[1])
  rownames(visualization_matrix) <- 1:(dim(chromosome_choords1)[1]) 
  for (i in 1:dim(chromosome_choords1)[1]){
    for (j in 1:dim(chromosome_choords2)[1]){
      subset <- c(new_P[chromosome_choords1[i,1]:chromosome_choords1[i,2],chromosome_choords2[j,1]:chromosome_choords2[j,2]])
      subset <- abs(subset)
      visualization_matrix[i,j] <- max(subset)
      
    }
  }
  xlabel<-paste("Chromosome=",as.character(chr2),", N-regions=",as.character(dim(chromosome_choords2)[1]))
  ylabel<-paste("Chromosome=",as.character(chr1),", N-regions=",as.character(dim(chromosome_choords1)[1]))
  heatmap3(visualization_matrix,scale="none",main="Pairwise Chromosomal Interaction",Rowv = NA,Colv = NA,xlab=xlabel,ylab=ylabel ,labRow=c("start",rep("",dim(chromosome_choords1)[1]-2),"end"),labCol=c("start",rep("",dim(chromosome_choords2)[1]-2),"end"))
}

load("../caw_project/model_GIF05.RData")

tped <- fread("../caw_project/GIFT_final2_filtered05.tped",data.table = F)

p_e <- model_GIFT$Pvalues 
e_e <- model_GIFT$Coefficients 


correlations <- model_GIFT

pvalues <- correlations$Pvalues

chr1 <- 1
chr2 <- 2
grid_size <- 50


test_pairwise.chr.map <- function(chr1,chr2,tped,correlations,grid_size){
  total_map <- tped[tped[,2] %in% rownames(correlations$Pvalues),c(1,4)]
  chr_pair_values <-correlations$Pvalues[which(total_map[,1] == chr1),which(total_map[,1] == chr2)]
  chr_sizes <-dim(chr_pair_values)   
  chr1_coords<-1:chr_sizes[1]
  chr2_coords<-1:chr_sizes[2]
  chr1_chunk<-(length(chr1_coords)/grid_size)
  chr2_chunk<-floor(length(chr2_coords)/grid_size)
  chr1_splits<-split(chr1_coords,ceiling(seq_along(chr1_coords)/chr1_chunk))
  chr2_splits<-split(chr2_coords,ceiling(seq_along(chr2_coords)/chr2_chunk))
  mean_matrix<-matrix(0,nrow=length(chr1_splits),ncol=length(chr2_splits))
  counter <- c(0,0)
  for ( i in chr1_splits){
    counter[1]<- counter[1]+1
    counter[2]<-0
    for (j in chr2_splits){
      counter[2]<- counter[2]+1
           mean_matrix[counter[1],counter[2]] <- mean(chr_pair_values[i,j])
    }
  }
  xlabel<-paste("Chromosome=",as.character(chr2),", N-regions=",as.character(counter[1]))
  ylabel<-paste("Chromosome=",as.character(chr1),", N-regions=",as.character(counter[2]))
  heatmap3(mean_matrix,scale="none",main="Pairwise Chromosomal Interaction",Rowv = NA,Colv = NA,xlab=xlabel,ylab=ylabel,labRow=c("start",rep("",(counter[1]-2)),"end"),labCol=c("start",rep("",(counter[2]-2)),"end"))
}

test_pairwise.chr.map(1,2,tped,correlations,40) 



#### Simulate Genotype ####


uniform_genotype_sampler <- function(n_samples,n_genotypes) {
  genotypes <- matrix(0,nrow = n_samples, ncol= n_genotypes)
  for (i in c(1:n_genotypes)){
    maf<-runif(1,0,0.5)
    maf2 <- maf^2
    genotype <- runif(n_samples)
    mm <-genotype < maf2
    Mm <- genotype >= maf2 & genotype < (maf2+maf*(1-maf))
    MM <- genotype  >=(maf2+maf*(1-maf))
    genotype[mm] <- 2
    genotype[Mm] <- 1.5
    genotype[MM] <- 1
    genotypes[,i] <- genotype
  }
  return(genotypes)
}


blockwise_genotype_sampler <- function(n_samples,n_genotypes) {
  genotypes <- matrix(0,nrow = n_samples, ncol= n_genotypes)
  for (i in c(1:n_genotypes)){
    maf<-runif(1,0,0.5)
    maf2 <- maf^2
    genotype <- runif(n_samples)
    mm <-genotype < maf2
    Mm <- genotype >= maf2 & genotype < (maf2+maf*(1-maf))
    MM <- genotype  >=(maf2+maf*(1-maf))
    genotype[mm] <- 2
    genotype[Mm] <- 1.5
    genotype[MM] <- 1
    genotypes[,i] <- genotype
  }
  return(genotypes)
}





epistasis_layer <- function(genotype,n_pheno){
  n_snps <- dim(genotype)[2]
  epi_pheno <- rep(0,n_pheno)
  effect_sizes<- matrix(0,nrow=n_snps,ncol=n_snps) 
  for ( i in 1:n_snps){
    n_values <- length(i:n_snps)
    x1 <- 1000
    x0 <- 0.001
    y <- runif(n_values)
    n=5
    #power law
    x = ((x1^(n+1) - x0^(n+1))*y + x0^(n+1))^(1/(n+1))
    #rescaling to frequent low values
  
    effect_sizes[i,i:n_snps] <- x
  }
  x_max = max(effect_sizes)
  x <- effect_sizes
  x = (-1*x+max(x))
  x <- x^(log10(dim(genotype)[2]^2))
  x=x/max(x)*10
  for (i in 1:n_snps){
    sign <-sample(c(-1,1),replace=T,n_snps)
    x[,i] <- sign*x[,i]
  }
  x = x*(effect_sizes !=0)
  effect_sizes = x
  for (i in 1:n_snps){
    if (i+1 <= n_snps){
    for (j in (i+1):n_snps){
      epi_geno<-genotype[,i]*genotype[,j]      
      epi_pheno <- epi_pheno+effect_sizes[i,j]*epi_geno
    }  
    }
  }
 return(list(epi_pheno,effect_sizes)) 
}


main_layer <- function(genotype,range){ 
  x1 <- 1000
  x0 <- 0.001
  y <- runif(dim(genotype)[2])
  n=5
  #power law
  x = ((x1^(n+1) - x0^(n+1))*y + x0^(n+1))^(1/(n+1))
  x_max = max(x)
  x = (-1*x+max(x))^log10(length(y))
  sign <-sample(c(-1,1),replace=T,length(x))
  effect_sizes <- x*sign
  pheno <- colSums(effect_sizes*t(genotype))
  range_main <- max(pheno)-min(pheno)
  range_factor <- range_main/range
  effect_sizes <- effect_sizes/range_factor
  filter<- sd(effect_sizes)
  effect_sizes[effect_sizes < sd(effect_sizes)] <- 0
  pheno <- colSums(effect_sizes*t(genotype))
  output<-(list(pheno,effect_sizes,filter))
  names(output)<-c("Phenotype","Effect_sizes","Filter")
  return(output)
}

enviroment_layer <- function(epi_layer,main_layer,n){
  genetic_effects<-epi_layer+main_layer
  baseline <- abs(min(genetic_effects)-max(genetic_effects))
  enviromental_layer <-rnorm(n=n,mean=0.1*baseline,sd=0.05*baseline)
  output <-list(baseline+enviromental_layer+genetic_effects,baseline,enviromental_layer)
  names(output) <- c("Phenotype","baseline","enviromental")
  return(output)
}








cor.prob <- function(X, dfr = nrow(X) - 2) {
  R <- cor(X)
  above <- row(R) < col(R)
  r2 <- R[above]^2
  Fstat <- r2 * dfr / (1 - r2)
  R[above] <- 1 - pf(Fstat, 1, dfr)
    cor.mat <- t(R)
  cor.mat[upper.tri(cor.mat)] <- NA
  cor.mat
}



library(wish)



epistasis_generation <- function(main_pheno,main_effect,genotype,n_snps,hera_exp){ 
  epi_pheno <- rep(0,length(main_pheno))
  total_effects<-sum(main_effect)/hera_exp
  likelyhood<-abs(main_effect %*% t(main_effect))
  likelyhood[likelyhood < min(main_effect)] <- 0
  likelyhood[lower.tri(likelyhood)] <-0 
  effect_vector <- sort(c(likelyhood[likelyhood > 0]),decreasing = T)
  index <- 1
  epi_total <- 0
  while(epi_total < total_effects & index < length(effect_vector)){
    epi_total <- epi_total+effect_vector[index]
    print(c(index,effect_vector[index],total_effects,epi_total))
    index <- index+1
  }
  likelyhood[likelyhood < effect_vector[index]] <- 0
  epi_effects<- likelyhood
  for (i in 1:n_snps){
    if (i+1 <= n_snps){
      for (j in (i+1):n_snps){
        epi_geno<-genotype[,i]*genotype[,j]      
        epi_pheno <- epi_pheno+epi_effects[i,j]*epi_geno
        
      }  
    }
  }
  output<-(list(epi_pheno,epi_effects,filter))
  names(output)<-c("Phenotype","Effect_sizes","Filter")
  return(output)
}


epi_pheno<-epistasis_layer(sampled_data,n_pheno=200)

phenotype<-enviroment_layer(epi_pheno[[1]],main_pheno[[1]],200)
hist(phenotype[[1]])

test_phenotype<-phenotype[[1]]
main_pheno[[1]]

sampled_data<-uniform_genotype_sampler(200,1000)

hist(main_pheno$Effect_sizes[main_pheno$Effect_sizes > 0])
hist(main_pheno$Phenotype)

sum(main_pheno$Effect_sizes > 0)
sum(main_pheno$Effect_sizes > sd(main_pheno$Effect_sizes))


sort(main_pheno$Effect_sizes)
sampled_data<-uniform_genotype_sampler(200,200)
epi_pheno <- epistasis_generation(main_pheno$Phenotype,sampled_data,200,0.1)
main_pheno<-main_layer(sampled_data,range = 20)
hist(main_pheno$Effect_sizes)
final_pheno<-enviroment_layer(epi_pheno$Phenotype,main_pheno$Phenotype,203)

length


library(WISH)

sampled_genotype<-genotype[,sample(651720,1000)]
sampled_genotype[is.na(sampled_genotype)]<-0
dim(sampled_genotype)

main_pheno<-main_layer(as.matrix(sampled_genotype),range = 20)
epi_pheno <- epistasis_generation(main_pheno$Phenotype,main_pheno$Effect_sizes,sampled_genotype,200,0.1)
hist(main_pheno$Effect_sizes)
hist(epi_pheno$Effect_sizes)
final_pheno<-enviroment_layer(epi_pheno$Phenotype,main_pheno$Phenotype,203)
test_corr<-epistatic.correlation(epi_pheno$Phenotype+main_pheno$Phenotype,sampled_genotype,threads = 20,test=F) 



test_corr<-epistatic.correlation(rnorm(203,100,50),sampled_genotype,threads = 20,test=F) 

adjusted<-(p.adjust(test_corr$Pvalue,method="bonferroni"))
sum(adjusted< 0.05, na.rm=T)


test_corr$Pvalues[is.na(test_corr$Pvalues)] <- 1
test_corr$Coefficients[is.na(test_corr$Coefficients)]<-0

test_corr$Coefficients[epi_pheno$Effect_sizes >0]

hist(test_corr$Pvalues[epi_pheno$Effect_sizes >0]) 

hist(epi_pheno$Effect_sizes[test_corr$Pvalues< 0.01]) 

cor(test_corr$Pvalues[epi_pheno$Effect_sizes >0],epi_pheno$Effect_sizes[epi_pheno$Effect_sizes >0])
     
cor(test_corr$Coefficients[epi_pheno$Effect_sizes >0],epi_pheno$Effect_sizes[epi_pheno$Effect_sizes >0])    


plot(test_corr$Coefficients[epi_pheno$Effect_sizes >0],epi_pheno$Effect_sizes[epi_pheno$Effect_sizes >0])    



dim(test_corr$Pvalues)  
dim(epi_pheno$Effect_sizes)  
sum(epi_pheno$Effect_sizes >0)  

cor(test_corr$Coefficients[epi_pheno$Effect_sizes >0],epi_pheno$Effect_sizes[epi_pheno$Effect_sizes >0])   

library(data.table) 
 
tped_variation<-fread("/data/nbg153/ADHD/WISH_files_newest/INDICES_1_2.tped",data.table = F) 
ped1<-fread("/data/nbg153/ADHD/WISH_files_newest/part1.ped",data.table = F) 
ped2<-fread("/data/nbg153/ADHD/WISH_files_newest/part2.ped",data.table = F) 
ped3<-fread("/data/nbg153/ADHD/WISH_files_newest/part3.ped",data.table = F) 
ped_variation<-cbind(ped1,ped2,ped3) 

library(WISH)

genotype<-generate.genotype(ped_variation,tped_variation)
LD_blocks_test <-function(genotype,threshold=0.9,max_block_size=1000){ 
  snp_block_matrix <- as.matrix(rep(0,dim(genotype)[2]))
  rownames(snp_block_matrix) <- colnames(genotype)
  tagging_variants <- c()
  n_snp <- 1
  start <- 2
  n_block <- 1
  network_size <- 0
  block_coords <- list()
  snps <- dim(genotype)[2]
  snp_block_matrix[n_snp,] <- n_block
  while(start <= snps){ 
    print(n_snp)
    if (n_snp == snps){
      snp_block_matrix[snps,1] = n_block
    }
    else{ 
      temp_sim <- 0
      network_pairs<-combn(c(n_snp:(start)),2)
      r2_values <- c()
      for (i in 1:dim(network_pairs)[2]){
        start_t <- network_pairs[1,i]
        n_snp_t <- network_pairs[2,i]
        # if (n_snp == snps){
        #   snp_block_matrix[snps,1] = n_block
        # }
        #else{
        #matches<-sum(genotype[,n_snp]/genotype[,start] == 1,na.rm = T)+(sum(c(c(genotype[,n_snp] == 1.5)+ c(genotype[,start] == 1.5)) == 1 ,na.rm = T)/2)
        total <- sum(!is.na(genotype[,n_snp_t]+genotype[,start_t]))
        usable_values <- !is.na(genotype[,n_snp_t]+genotype[,start_t])
        #similarity <- matches/total
        p_AB <- (sum(genotype[usable_values,n_snp_t]+genotype[usable_values,start_t] == 0,na.rm = T)+(sum(genotype[usable_values,n_snp_t]+ genotype[usable_values,start_t] == 1 ,na.rm = T)/2)+(sum(genotype[usable_values,n_snp_t]*genotype[usable_values,start_t] == 1 ,na.rm = T)/2))/total
        p_A <- (sum(genotype[usable_values,n_snp_t] == 0,na.rm = T)+ sum(genotype[usable_values,n_snp_t] == 1,na.rm = T)/2)/total
        p_a <- 1-p_A
        p_B <- (sum(genotype[usable_values,start_t] == 0,na.rm = T)+ sum(genotype[usable_values,start_t] == 1,na.rm = T)/2)/total
        p_b <- 1-p_B
        #print(c(p_A,p_B,p_AB,start))
        D <- p_AB-(p_A*p_B)
        r2 <- D^2/(p_A*p_a*p_B*p_b)
        r2_values <- c(r2_values,r2)
        temp_sim <- r2+temp_sim
        network_size <- network_size + 1
      }
      similarity <- temp_sim
      #print(c(similarity,similarity/network_size,network_size,start,n_snp))
      if (similarity/network_size >= threshold && network_size < max_block_size){ 
        snp_block_matrix[start,1] = n_block
        start <- start+1
        network_size <-0
        if (start > snps){ 
          block_coords[[n_block]] <- c(n_snp,(start-1))
          for (i in min(network_pairs):max(network_pairs)){
            if (i == min(network_pairs)){
              r2_sum <- sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
              tagging_variant <- i
            }
            else{
                new_r2<-sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
                if(new_r2 > r2_sum){
                  r2_sum <- new_r2
                  tagging_variant <- i
                }
              }
          }
          tagging_variants <- c(tagging_variants,tagging_variant)
          }
        }
      else {
        if (start == snps){
          block_coords[[n_block]] <- c(n_snp,(start-1))
          n_block <- n_block + 1
          snp_block_matrix[snps,1] <- n_block
          block_coords[[n_block]] <- c(start,start)
          tagging_variants <- c(tagging_variants,start)
          start <- start + 1
        }
        else{
          r2_values <- r2_values[network_pairs[1,] != start & network_pairs[2,] != start]
          block_coords[[n_block]] <- c(n_snp,(start-1))
          if (n_snp == start-1){
            tagging_variant <- n_snp
          }
          else{
            network_pairs<-combn(c(n_snp:(start-1)),2)
            print((r2_values))
            print((network_pairs))
            for (i in min(network_pairs):max(network_pairs)){
              if (i == min(network_pairs)){
                r2_sum <- sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
                tagging_variant <- i
              }
              else{
                  new_r2<-sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
                  if(new_r2 > r2_sum){
                    r2_sum <- new_r2
                    tagging_variant <- i
                  }
                }
            }
            }
          tagging_variants <- c(tagging_variants,tagging_variant)
          n_snp <- start
          n_block <- n_block +1
          snp_block_matrix[start,1] = n_block
          start <- start +1
          network_size <- 0
        }
      }
    }
  }
  genotype <- as.matrix(genotype)
  new_genotype <- genotype[,tagging_variants]
  
  output<-list(new_genotype,tagging_variants,snp_block_matrix)
  names(output)<-c("genotype","tagging_genotype","genotype_block_matrix")
  return(output)
}
LD_blocks(genotype[,1:10000])
dim(genotype) 
  
sampled_genotype<-genotype[,sample(651720,200)] 

total_minors<-sum(tped_variation[,5:410] == 1) 
total_majors<-sum(tped_variation[,5:410] == 2) 
total_minors/(total_majors+total_minors) 

freq_minors<-rowSums(tped_variation[,5:410] == 1) 
freq_majors<-rowSums(tped_variation[,5:410] == 2) 

hist(freq_minors)
hist(freq_majors)
median(freq_minors)


LD_analysis<-LD_blocks_test(genotype[,1:10000])

length(LD_analysis[[2]])
length(LD_analysis[[3]])
AQS-Group/WISH documentation built on July 17, 2020, 12:12 a.m.