R/all_functions.R

Defines functions pseudo_manhattan LD_blocks generate.modules pairwise.chr.map genome.interaction epistatic.correlation partial_correlations_triangular triangular_split generate.genotype

Documented in epistatic.correlation generate.genotype generate.modules genome.interaction LD_blocks pairwise.chr.map partial_correlations_triangular pseudo_manhattan triangular_split

#' Import genotype data in the correct format for network construction
#' @import data.table
#' @description For network construction based on both genomic correlations
#' as well as epistatic interactions a genotype matrix has to be
#' created, consisting of one numeric value per SNP, per individual. This function
#' takes Plink output (1,2-coding) to create the genotype matrix which can be used
#' to calculate genomic correlations or epistatic interaction effects 
#' @usage generate.genotype(ped,tped,snp.id=NULL, pvalue=0.05,id.select=NULL,
#' gwas.p=NULL,major.freq=0.95,fast.read=T)
#' @param ped Input ped file as .ped file or data.frame. The ped file (.ped) is an 
#' input file from Plink: The PED file is a
#' white-space (space or tab) delimited file: the first six columns are mandatory:
#' Family ID, Idividual ID, Paternal ID, Maternal ID, 
#' Sex (1=male; 2=female;other=unknown) and Phenotype. The IDs are alphanumeric: 
#' the combination of family and individual ID should uniquely identify a person.
#' A PED file must have 1 and only 1 phenotype in the sixth column.
#' The phenotype can be either a quantitative trait or an affection status 
#' column: PLINK will automatically detect which type
#' (i.e. based on whether a value other than 0, 1, 2 or the missing genotype 
#' code is observed). SNPs are 1,2-coded (1 for major allele,2 for minor allele) 
#' For more information: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
#' @param tped Input tped file as .tped file or data frame. The tped file (.tped)
#' is a transposed ped file, from Plink. 
#' This file contains the SNP and genotype information where one row is a SNP.
#' The first 4 columns of a TPED file are the same as a 4-column MAP file.
#' Then all genotypes are listed for all individuals for each particular SNP on 
#' each line. Again, SNPs are 1,2-coded.
#' @param snp.id  Input SNP ids to use in analysis if not all snps are to be used
#' @param pvalue A value for the cutoff of the SNPs which should be remained 
#' in the matrix, based on the pvalue resulting from the GWAS. Default value
#' is 0.05
#' @param id.select If requested, a subset of individuals can be 
#' selected (e.g. extremes). If nothing inserted, all individuals are in the
#' output
#' @param gwas.p  A vector of the p-values corresponding to 
#' the input SNPs in the ped/tped file or gwas.id vector. If assigned, will 
#' select snps based on the pvalue parameter with a default value of 0.05.
#' @param major.freq Maximum major allele frequency allowed in each variant. 
#' Default value is 0.95.
#' @param fast.read If true will use fread from the data.table package to read
#' the files. This is much faster than read.table, but requires consistent delimeters
#' in the ped and tped file, and a maximum of approximately 950.000 colums in the ped
#' file. This can be increased by changing the stack size (do this only if you
#' know what you are doing)
#' @return A genotype dataframe and the corresponding vector of passing snps in a vector.
#' The genotype data frame has a row for each individual and a column
#'  for each SNP. SNPs are 1,1.5,2 coded: 1 for homozygous for the major 
#'  allele, 1.5 for heterozygous, and 2 for homozygous for the minor allele. 
#'  Missing values are NA coded. 
#' @references Lisette J.A. Kogelman and Haja N.Kadarmideen (2014). 
#' Weighted Interaction SNP Hub (WISH) network method for building genetic
#' networks for complex diseases and traits using whole genome genotype data.
#' BMC Systems Biology 8(Suppl 2):S5. 
#' http://www.biomedcentral.com/1752-0509/8/S2/S5.
#' @examples
#' generate.genotype(ped,tped)
#' 
#' @export
#' 
#' 
#' 
generate.genotype <- function(ped,tped,snp.id=NULL, pvalue=0.05,id.select=NULL,gwas.p=NULL,major.freq=0.95,fast.read=T) {
  if (fast.read == T){
    if (is.character(ped)){
      message("loading ped file")
      ped <- fread(ped,data.table=F)
    }
    else if (!is.data.frame(ped)){
      stop("ped file not file or data frame")
    }
    if (is.character(tped)){
      message("loading tped file")
      tped <- fread(tped,data.table=F)
    }
    else if (!is.data.frame(tped)){
      stop("tped file not file or data frame")
    }
  }
  else {
    if (is.character(ped)){
      message("loading ped file")
      ped <- read.table(ped)
    }
    else if (!is.data.frame(ped)){
      stop("ped file not file or data frame")
    }
    if (is.character(tped)){
      message("loading tped file")
      ped <- read.table(tped)
    }
    else if (!is.data.frame(tped)){
      stop("tped file not file or data frame")
    }
  }
    if ((dim(ped)[1] != (dim(tped)[2]-4)/2) && ((dim(ped)[2]-6)/2 != (dim(tped)[1]))){
    stop("Error: ped-file and tped file dimensions do not fit, make sure file delimiters are consistent")
  } 
  if(is.null(snp.id)){
    snp.id <- tped[,2]
  }
  if(is.null(id.select)){
    id.select <- ped[,2]
  }
  if(is.null(gwas.p)){
    genotype <- matrix(nrow=length(c(id.select)),ncol=length(c(snp.id)))
    rownames(genotype) <- id.select
    colnames(genotype) <- snp.id
    if (length(c(snp.id))==length(c(tped[,2]))){
      snps <- c(1:dim(tped)[1])
    }
    else {
      snps<-which(tped[,2]%in%snp.id)  
    }
    if (length(c(id.select))==length(c(ped[,2]))){
      ids <- c(1:dim(ped)[1])
      ped_trim <- as.matrix(ped[ids,c(rep(2*snps,each=2)-(1:(2*length(snps)))%%2+6)])
    }
    else {
      ids<-which(ped[,2]%in%id.select)
      ped_trim <- as.matrix(ped[ids,c(sort(rep(2*snps,each=2)-(1:(2*length(snps)))%%2)+6)])
    }
    ped_trim[ped_trim==0] <- NA
    for (i in 1:(dim(genotype)[2])){
      genotype[,i] <- rowMeans((ped_trim[,c(2*i-1,2*i)]))
    }
  }
  if(!(is.null(gwas.p))){
    # In case of p-value filtering we want the input SNP IDs to match the pvalue vector
    if(length(as.vector(gwas.p))!=length(as.vector(snp.id))){
      stop("Gwas P-values not same length as SNP IDs")
    }  
    snp.id <- as.vector(snp.id)
    original_length<-length(snp.id)
    snp.id <- snp.id[as.vector(gwas.p) <= pvalue & !(is.na(as.vector(gwas.p)))]
    new_length<-length(snp.id) 
    snp_counts<-paste(as.character(new_length),as.character(original_length), sep = "/") 
    genotype <- matrix(nrow=length(c(id.select)),ncol=length(c(snp.id)))
    rownames(genotype) <- id.select
    colnames(genotype) <- snp.id
    if (length(c(snp.id))==length(c(tped[,2]))){
      snps <- c(1:dim(tped)[1])
      if (length(c(id.select))==length(c(ped[,2]))){
        ids <- c(1:dim(ped)[1])
        ped_trim <- ped[ids,c(rep(2*snps,each=2)-(1:(2*length(snps)))%%2+6)]
      }
      else {
        ids<-which(ped[,2]%in%id.select) 
        ped_trim <- ped[ids,c(sort(rep(2*snps,each=2)-(1:(2*length(snps)))%%2)+6)]
      }
    }
    else {
      snps<-which(tped[,2]%in%snp.id)  
      if (length(c(id.select))==length(c(ped[,2]))){
        ids <- c(1:dim(ped)[1])
        ped_trim <- as.matrix(ped[ids,c(rep(2*snps,each=2)-(1:(2*length(snps)))%%2+6)])
      }
      else {
        ids<-which(ped[,2]%in%id.select) 
        ped_trim <- as.matrix(ped[ids,c(sort(rep(2*snps,each=2)-(1:(2*length(snps)))%%2)+6)])
      }
    }  
    ped_trim[ped_trim==0] <- NA
    for (i in 1:(dim(genotype)[2])){
      genotype[,i] <- rowMeans((ped_trim[,c(2*i-1,2*i)]))
    }
  }
  #Ensuring that we only get variants with enough variation. We remove variants with no minor alleles or/and with a majore allele frequency over 0.95(default)
  passing_snps <- which((colSums((genotype == 2),na.rm = T) < (dim(genotype)[1]*major.freq)) & colSums(is.na(genotype)) < (0.05*dim(genotype)[1]))
  genotype <- genotype[,passing_snps]
  genotype[genotype==2] <- 0
  genotype[genotype==1] <- 4
  genotype[genotype==1.5] <- 1
  genotype[genotype==4] <- 2
  message(paste(length(passing_snps), "passed QC"), sep=" ")
  return(genotype)
}





#' ***Internal Use Function*** This function calculates the row coordinates for splitting triangular sub
#' matrices of quadratic matrices into approximately equally sized partitions
#' for use in in dividing correlation calculations into equal size for 
#' parallelization 
#' @description Internal function for splitting triangular matrices into
#' approximately equal parts
#' @usage triangular_split(n,split)
#' @param n Row and Column length of the n by n matrix the triangular matrix
#' originates from
#' @param split Number of partitions to split the triangular matrix in
#' @return A matrix of row coordinates used for splitting
#' @examples
#' triangular_split(1000,5)
#' 
#' @export


triangular_split <- function(n,split) {
  if (split == 1){
    boundaries<-matrix(0,nrow=split,ncol=2)
    boundaries[1,1] <- 1
    boundaries[1,2] <- n
  }
  else {
    total_count<-(n*n-n)/2
    splits <- total_count/split
    total <- splits
    row <- c()
    for (i in 1:(split-1)){
      temp_row<-((2*n-1)-sqrt((2*n-1)^2-8*total))/2
      temp_row <- floor(temp_row)
      row <- c(row,temp_row)
      total <- total+splits
    }
    row<-as.vector(c(0,row,n))
    boundaries<-matrix(0,nrow=split,ncol=2)
    for (i in 1:split){
      boundaries[i,1] <- row[i]+1
      boundaries[i,2] <- row[i+1]
    }
  }
  return(boundaries)
}

#' ***Internal Use Function*** This function calculates the epistatic correlations
#' in a subset of a matrix space based on coordiantes 
#' @description Internal package function for calculating epsitatic correlations
#' in sub-matrices
#' @usage partial_correlations_triangular(genotype_1,genotype_rev_1,phenotype,coords,
#' glm=F,model=1)
#' @param genotype_1 Dataframe with the genotype information, resulting from 
#' the function generate.genotype(). Make sure that the dataframe contains the 
#' same individuals as in the phenotype-file, and that those are in the 
#' same order.
#' @param genotype_rev_1 Same as genotpye but with reversed genotype coding
#' @param phenotype Dataframe with the rows correspinding to the individuals
#' in the analysis,and columns for the different measured phenotypes and 
#' fixed/random factors. Phenotypes should be continous variables. 
#' @param coords Matrix of row split coordinates for subseting input space
#' @param glm setting controlling if is a lm or glm
#' @param model Specification controlling if MM or Mm directed interaction
#' model is used.
#' @return Epsitatic correlations and P-values for the selected set or subset
#' of the data
#' @examples
#' partial_correlations <- partial_correlaiton_triangular(genotype_1,genotype_rev_1,
#' phenotype,coords,model)
#' 
#' @export

partial_correlations_triangular <- function(genotype_1,genotype_rev_1,phenotype,coords,glm=F,model=1){
  n=dim(genotype_1)[2]
  data_matrix <- matrix(0,nrow = 2*(coords[2]-coords[1]+1),ncol=dim(genotype_1)[2])
  matrix_row <- 0
  if ( glm == F){
  if (model==1){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_1[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  if (model==2){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = fastLm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_rev_1[,j]))
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
  }
  if ( glm == T){
    if (model==1){
    for (i in coords[1]:coords[2]){
      matrix_row <- matrix_row+1
      if (i < n){
        for (j in (i+1):n){
          tmp_model = glm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_1[,j]),family=binomial())
          data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
        }
      }
    }
  }
    if (model==2){
      for (i in coords[1]:coords[2]){
        matrix_row <- matrix_row+1
        if (i < n){
          for (j in (i+1):n){
            tmp_model = glm(phenotype ~ I(genotype_1[,i])+I(genotype_1[,j])+I(genotype_1[,i]*genotype_rev_1[,j]),family=binomial())
            data_matrix[(matrix_row*2-1):(matrix_row*2),j]<-c(tmp_model$coefficients[length(tmp_model$coefficients)],summary(tmp_model)$coefficients[dim(summary(tmp_model)$coefficients)[1],4])
          }
        }
      }
    }
    }
  return(data_matrix)
}





#' Calculate the epistatic interaction effect between SNP pairs  using a genotype 
#' data frame created from genarate.genotype()
#' @import doParallel
#' @import foreach
#' @import RcppEigen
#' @import bigmemory
#' @import parallel
#' @description A WISH network can be built based on epistatic interaction 
#' effects between SNP pairs. Those interaction effects are calculated using
#' linear models. 
#' @usage epistatic.correlation(phenotype,genotype,threads=1,test=T,simple=T,glm=F)
#' @param phenotype Dataframe with the rows correspinding to the individuals
#' in the analysis,and columns for the different measured phenotypes and 
#' fixed/random factors. Only give one phenotype column at a time. Phenotypes
#' should be non-categorical continous or discrete/semi-discrete variables. 
#' Make sure that the dataframe contains the same
#' individuals as in the genotype-file, and that those are in the same order.
#' @param genotype Dataframe with the genotype information, resulting from 
#' the function generate.genotype(). Make sure that the dataframe contains the 
#' same individuals as in the phenotype-file, and that those are in the 
#' same order.
#' @param threads Number of threads to use for parallel execution in the function 
#' registerDoParallel()
#' @param test True or False value indicating if a test run is being perform.
#' If True will calculate the expected time it will take for the full analysis
#' based on calculating 100.000 models with the setting chosen
#' @param simple True or false value indicating if only a major/major and
#' minor/minor directed interaction model are tested (simple=T) or if if 
#' interactions on the major/minor minor axis are tested as well, with the 
#' best one of the two being selected (simple=F).
#' @param glm If T will use a generelized linear model with a binomial link
#' function instead of a regular linear model. This should be used if your
#' phenotype is binary. 
#' @return A list of two matrices. The first matrix gives the epistatic
#' interaction effects between all the SNP-pairs which were in the input 
#' genotype data) and selected with the pvalue from the GWAS results. 
#' The second matrix are the corresponding pvalues of the parameter 
#' estimates of the epistatic interactions. 
#' @references Lisette J.A. Kogelman and Haja N.Kadarmideen (2014). 
#' Weighted Interaction SNP Hub (WISH) network method for building genetic
#' networks for complex diseases and traits using whole genome genotype data.
#' BMC Systems Biology 8(Suppl 2):S5. 
#' http://www.biomedcentral.com/1752-0509/8/S2/S5.
#' @examples
#' epistatic.correlation(phenotype,genotype,threads,test,simple)
#' 
#' @export

epistatic.correlation <- function(phenotype,genotype,threads=1,test=T,simple=T,glm=F){ 
  registerDoParallel(threads)
  if ( simple == T){ 
    model <- 1
  }
  else {
    model <- 2
  }
  phenotype <- as.matrix(phenotype)
  n<-ncol(genotype)
  if(is.data.frame(genotype)){
    genotype[] <- lapply(genotype, as.numeric)
  }
  if (simple==F || test==T){
    genotype_rev <- genotype
    decide_1<-(genotype_rev==0)
    decide_2<-(genotype_rev==2)
    genotype_rev[decide_1] <- 2
    genotype_rev[decide_2] <- 0
    rm(decide_1)
    rm(decide_2)
    genotype_rev <- as.data.frame(genotype_rev)
  }
  if (test==T && n > 315) {
    message("Running Test")
    message("Estimating run time based on ~100.000 models")
    start.time <- Sys.time()
    coord_splits <-triangular_split(316,threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype[,1:316],genotype_rev[,1:316],phenotype=phenotype,coord_splits[j,],glm=glm,model=model) 
      return(subset)
    }
    end.time <- Sys.time()
    time<-as.numeric(end.time-start.time,units="hours")
    model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
    model_time <- round(model_time,digits = 2)
    model_time<-as.character(model_time)
    estimate<-paste(paste("The estimated run time for the simple model is",model_time),"hours",sep=" ")
    message(estimate)
    coord_splits <-triangular_split(316,threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype[,1:316],genotype_rev[,1:316],phenotype=phenotype,coord_splits[j,],glm=glm,model=2) 
      return(subset)
    }
    end.time <- Sys.time()
    time<-as.numeric(end.time-start.time,units="hours")
    model_time<-(((n^2-n)/2)/((316^2-316)/2))*time
    model_time <- round(model_time,digits = 2)
    model_time<-as.character(model_time)
    estimate<-paste(paste("The estimated run time for the full model is",model_time),"hours",sep=" ")
    message(estimate)
  }
  else if (test==T && n <= 315){ 
    message("Data size too small for testing, running normal analysis")
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],glm=glm,model=model) 
      return(subset)
    }
    # Running opposite minor/major co-linearity model
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix_rev <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],glm=glm,model=model) 
      return(subset)
    }
  }
  else if (test==F && simple==F) {
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],glm=glm,model=model) 
      return(subset)
    }
    # Running opposite minor/major co-linearity model
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix_rev <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],glm=glm,model=model) 
      return(subset)
    }
  }
  if (test == F && simple==F || (test==T && n <= 315)){
    # Transposing and filling out the correlation and pvalue matrix
    epi_cor <- snp_matrix[seq(1,nrow(snp_matrix)-1,2),]
    epi_pvalue <-   snp_matrix[seq(2,nrow(snp_matrix),2),]
    rm(snp_matrix)
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 0
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    epi_cor_t_1 <- epi_cor_t
    epi_pvalue_t_1 <- epi_pvalue_t
    epi_cor_t_1[is.na(epi_cor_t_1)] <- 0
    epi_pvalue_t_1[is.na(epi_pvalue_t_1)] <- 1
    # Opposite assumption correlation and pvalue matrix
    epi_cor <- snp_matrix_rev[seq(1,nrow(snp_matrix_rev)-1,2),]
    epi_pvalue <-   snp_matrix_rev[seq(2,nrow(snp_matrix_rev),2),]
    rm(snp_matrix_rev)
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    diag(epi_cor_t) <- 0
    diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    epi_cor_t_2 <- epi_cor_t
    epi_pvalue_t_2 <- epi_pvalue_t
    epi_cor_t_2[is.na(epi_cor_t_2)] <- 0
    epi_pvalue_t_2[is.na(epi_pvalue_t_2)] <- 1
    #Picking the correct model assumption based on lowest pvalue
    decider_matrix <- epi_pvalue_t_1-epi_pvalue_t_2
    epi_pvalue_t <- epi_pvalue_t_1
    epi_pvalue_t[0 < decider_matrix] <- epi_pvalue_t_2[0 < decider_matrix]
    epi_cor_t <- epi_cor_t_1
    epi_cor_t[0 < decider_matrix] <- epi_cor_t_2[0 < decider_matrix]
    colnames(epi_pvalue_t) <- colnames(genotype)
    rownames(epi_pvalue_t) <- colnames(genotype)
    colnames(epi_cor_t) <- colnames(genotype)
    rownames(epi_cor_t) <- colnames(genotype)
    output <-list(epi_pvalue_t,epi_cor_t)
    names(output)<-c("Pvalues","Coefficients")
    return(output)
  }
  else if (test==F && simple==T){ 
    coord_splits <-triangular_split(dim(genotype)[2],threads)
    snp_matrix <- foreach(j = 1:threads, .combine='rbind', .inorder=T, .verbose=F) %dopar% {
      subset <- partial_correlations_triangular(genotype,genotype_rev,phenotype=phenotype,coord_splits[j,],glm=glm,model=model) 
      return(subset)
    }
    epi_cor <- snp_matrix[seq(1,nrow(snp_matrix)-1,2),]
    epi_pvalue <-   snp_matrix[seq(2,nrow(snp_matrix),2),]
    epi_cor_t <- t(epi_cor)
    epi_pvalue_t <- t(epi_pvalue)
    #diag(epi_cor_t) <- 0
    #diag(epi_pvalue_t) <- 1
    epi_cor_t[upper.tri(epi_cor_t)]<- epi_cor[upper.tri(epi_cor)]
    epi_pvalue_t[upper.tri(epi_pvalue_t)]<- epi_pvalue[upper.tri(epi_pvalue)]
    diag(epi_cor_t) <- 0
    diag(epi_pvalue_t) <- 1
    colnames(epi_pvalue_t) <- colnames(genotype)
    rownames(epi_pvalue_t) <- colnames(genotype)
    colnames(epi_cor_t) <- colnames(genotype)
    rownames(epi_cor_t) <- colnames(genotype)
    output <-list(epi_pvalue_t,epi_cor_t)
    names(output)<-c("Pvalues","Coefficients")
    return(output)
  }
} 

#' Visualization of pairwise chromosome epistatic interactions on a genome wide level
#' @description Visualization of the genome wide chromosome pairwise relative strength
#' of epistatic interaction, ranging from 1 (strongest) to 0 (weakest).
#' The strength is based on the 90th percentile quantile (default) of stastistical
#' significance of epistatic interaction between all interactions in each
#' chromosome pair, scaled to 1 to 0. 
#' @import corrplot
#' @usage genome.interaction(tped,correlations,quantile=0.9)
#' @param tped The tped file used in generate.genotype(). The SNPs must
#' be sorted by chromosome, matching the order of the SNPs in the correlation 
#' matrices. 
#' @param correlations List of epistatic correlations and p-values genrated by
#' epistatic.correlation()
#' @param quantile Number from 0 to 1 indicating which quantile to base the
#' visualization on. 
#' @return Outputs a plot visualizing the chromosome interaction map
#' @examples
#'  genome.interaction(tped,correlations)
#' 
#' @export


genome.interaction <- function(tped,correlations,quantile=0.9) {
    if (is.character(tped)){
      message("loading tped file")
      tped <- fread(tped,data.table=F)
    }
    else if (!is.data.frame(tped)){
      stop("tped file not file or data frame")
    }
  new_P <- (1-correlations$Pvalues)
  map<-tped[tped[,2] %in% rownames(correlations$Pvalues),1:2]
  counter = 0
  ends <- c()
  chr_list <- c()
  for (i in map[,1]){
    if (counter == 0){
      counter <- counter + 1
      starts <- 1
      chr <- i
      chr_list <- c(chr_list,chr)
    }
    else {
      if (chr == i){
        counter <- counter + 1 
        if (counter == dim(map)[1]){
          ends <- c(ends,counter)
        }
      }
      if (chr != i){
        chr <- i
        chr_list <- c(chr_list,chr)
        ends <- c(ends,counter)
        counter <- counter + 1 
        starts <- c(starts,counter)
      }
    }
  }
  coord_splits<-cbind(starts,ends)
  visualization_matrix <- matrix(nrow = length(starts),ncol = length(starts))
  colnames(visualization_matrix) <- chr_list
  rownames(visualization_matrix) <- chr_list
  for (i in 1:length(starts)){
    for (j in 1:length(starts)){
      subset <- c(new_P[coord_splits[i,1]:coord_splits[i,2],coord_splits[j,1]:coord_splits[j,2]])
      subset <- abs(subset)
      visualization_matrix[i,j] <- quantile(subset,quantile,na.rm=T)
    }
  }
  visualization_matrix <- (visualization_matrix-min(visualization_matrix))/(max(visualization_matrix)-min(visualization_matrix))
  corrplot(visualization_matrix, type="upper",title= "Pairwise Chromosome Interaction Map",mar=c(0,0,2,0),cl.lim = c(0, 1))
}


#' Visualization of chromosomal pairwise region epistatic interaction strength, based on 
#' statistical significance 
#' @description Visualization of chromosome pairwise region epistatic interaction strength, based on 
#' statistical significance. The value is based of the most signficant epistatic interaction in each
#' region pair, ranging from 1 ( strongest) to 0 (weakest). The original input data and tped file needs
#' to be sorted by chromosome and coordinate.
#' @import heatmap3
#' @usage pairwise.chr.map(chr1,chr2,tped,correlations,50)
#' @param chr1 The name of the first chromosome in the comparison, matching the name
#' from the tped file
#' @param chr2 The name of the second chromosome in the comparison, matching the name
#' from the tped file
#' @param tped The tped file used in generate.genotype(), either as a datafrae or filelink. The SNPs must
#' be sorted by chromosome and position on the chromosome, matching the order of the SNPs in the correlation 
#' matrices. 
#' @param grid_size Number of regions to split each chromosome. This value is approximate as
#' the number variants is not always easily divisible by the number of splits. The actual
#' number of regions are displayed in the output plot
#' @param correlations List of epistatic correlations and p-values genrated by
#' epistatic.correlation()
#' @return Outputs a plot visualizing the pairwise chromosome region interaction
#' @examples
#'  pairwise.chr.map("1","2",tped,correlations)
#' 
#' @export


pairwise.chr.map <- function(chr1,chr2,tped,correlations,grid_size){
  if (is.character(tped)){
    message("loading tped file")
    tped <- fread(tped,data.table=F)
  }
  else if (!is.data.frame(tped)){
    stop("tped file not file or data frame")
  }
  total_map <- tped[tped[,2] %in% rownames(correlations$Pvalues),c(1,4)]
  chr_pair_values <-correlations$Pvalues[which(total_map[,1] == chr1),which(total_map[,1] == chr2)]
  chr_sizes <-dim(chr_pair_values)   
  chr1_coords<-1:chr_sizes[1]
  chr2_coords<-1:chr_sizes[2]
  chr1_chunk<-(length(chr1_coords)/grid_size)
  chr2_chunk<-floor(length(chr2_coords)/grid_size)
  chr1_splits<-split(chr1_coords,ceiling(seq_along(chr1_coords)/chr1_chunk))
  chr2_splits<-split(chr2_coords,ceiling(seq_along(chr2_coords)/chr2_chunk))
  mean_matrix<-matrix(0,nrow=length(chr1_splits),ncol=length(chr2_splits))
  counter <- c(0,0)
  for ( i in chr1_splits){
    counter[1]<- counter[1]+1
    counter[2]<-0
    for (j in chr2_splits){
      counter[2]<- counter[2]+1
      mean_matrix[counter[1],counter[2]] <- mean(chr_pair_values[i,j])
    }
  }
  xlabel<-paste("Chromosome=",as.character(chr2),", N-regions=",as.character(counter[1]))
  ylabel<-paste("Chromosome=",as.character(chr1),", N-regions=",as.character(counter[2]))
  heatmap3(mean_matrix,scale="none",main="Pairwise Chromosomal Interaction",Rowv = NA,Colv = NA,xlab=xlabel,ylab=ylabel,labRow=c("start",rep("",(counter[1]-2)),"end"),labCol=c("start",rep("",(counter[2]-2)),"end"),col=heat.colors(100))
}


#' Function for creation of genomic interaction modules based on WGCNA framework. 
#' @description Generate.modules normalizes the epistatic interaction coefficients
#' and performs hierarchical clustering,SNP selection and parameter selection for 
#' module construction, and generates modules.
#' It uses the WGCNA workflow modified for epistatic correlations
#' and outputs the module components and analysis parameters.
#' @import WGCNA
#' @import flashClust
#' @import dynamicTreeCut
#' @usage generate.modules(correlations,values="Coefficients",power=c(seq(1,10,0.1),c(12:22)),
#' n.snps=dim(correlations$Coefficients)[1],minClusterSize=50,type="unsigned",threads=1)
#' @param correlations List of epistatic correlations and p-values generated by
#' epistatic.correlation()
#' @param power Powers to test for creating scale free network. Only change if the default
#' values don't work
#' @param values  Character which can be "Pvalues" or "Coefficients"
#' Indicating if P-values or Coefficients should be used for analysis.
#' The recomended and defualt values is Coefficients
#' @param n.snps Number of SNPs to select. SNPs are selected by connectivity, so 500 will
#' select the top 500 most connected Snps. Default is to use all
#' @param minClusterSize Minimum module (cluster) size. Default, is 50, but changing this may
#' be recommended in case of sparse SNPs  
#' @param type Type of network to generate. Default is "unsigned", can be "signed" or "signed hybrid"
#' @param threads Number of threads to use if parallelization is possible.
#' @return Plots the network connectivity and the scale and SNP tree clustering with modules found. 
#' Returns a named list with all the data generated:
#' \describe{
#'  \item{SNPs}{SNPs used in the analysis and their correlations}
#'  \item{connectivity}{The connectivity matrix of the SNPs}
#'  \item{adjMat}{The adjacency matrix of the SNPs}
#'  \item{dissTom}{The dissimilarity TOM}
#'  \item{genetree}{The clustering object used for the genetree}
#'  \item{modules}{The module numbers for each SNP, in order of the SNP matrix}
#'  \item{modulcolors}{The colors used in the modules for each SNP}
#'  \item{power.estimate}{The power estimate to generate a scale free network}
#' }
#' @examples
#'  generate.modules(correlations)
#' 
#' @export


generate.modules <- function(correlations,values="Coefficients",power=c(seq(1,10,0.1),c(12:22)),n.snps=dim(correlations$Coefficients)[1],minClusterSize=50,type="unsigned",threads=2) {
  enableWGCNAThreads(threads)
  if (values=="Pvalue"){
    corr <- (1-correlations$Pvalues)*(correlations$Coefficients/abs(correlations$Coefficients))
  }
  if (values=="Coefficients"){
    temp_corr<-correlations$Coefficients
    temp_corr[temp_corr < 0] <- 0
    temp_corr <- temp_corr/(max(temp_corr))
    corr <- temp_corr
    temp_corr <- correlations$Coefficients
    temp_corr[temp_corr > 0] <- 0
    temp_corr <- temp_corr/(abs(min(temp_corr)))
    corr[temp_corr < 0] <- temp_corr[temp_corr < 0]
  }
  sft = pickSoftThreshold(corr, powerVector = c(seq(1,10,0.1),c(12:22)), verbose = 5)
  connectivity <- adjacency.fromSimilarity(corr, power=sft$powerEstimate,type=type)
  sizeGrWindow(10,5)
  par(mfrow=c(1,2))
  hist(connectivity,xlab="connectivity")
  scaleFreePlot(connectivity)
  par(mfrow=c(1,1))
  #select SNPs for network construction based on connectivity
  select.snps <- corr[rank(-colSums(connectivity),ties.method="first")<=n.snps,rank(-colSums(connectivity),ties.method="first")<=n.snps ]
  select.snps[,c(1:ncol(select.snps))] <- sapply(select.snps[,c(1:ncol(select.snps))], as.numeric)
  #create adjacency matrix (correlation matrix raised to power beta)
  adjMat <- adjacency.fromSimilarity(select.snps,power=sft$powerEstimate,type=type)
  #calculate dissimilarity TOM
  dissTOM <- 1-(TOMsimilarity(adjMat))
  #create gene dendrogram based on diss TOM
  genetree <- flashClust(as.dist(dissTOM), method="average")
  #cut branches of the tree= modules
  dynamicMods = cutreeDynamic(dendro=genetree, distM=dissTOM, 
                              deepSplit=2,pamRespectsDendro=F, minClusterSize=minClusterSize)
  #give modules a color as name
  moduleColors = labels2colors(dynamicMods)
  #sizeGrWindow(8,6)
  #plot dendrogram with the module colors
  plotDendroAndColors(genetree, moduleColors,
                      dendroLabels=F, hang=0.03,
                      addGuide=T, guideHang = 0.05,
                      main="Gene dendrogram and modules")
  output <- list(select.snps,connectivity,adjMat,dissTOM,genetree,dynamicMods,moduleColors,sft$powerEstimate)
  names(output) <- c("SNPs","connectivity","adjMat","dissTom","genetree","modules","modulecolors","power.estimate")
  return(output)
}


#' Function for creating blocks of input genotypes based on LD and selecting tagging variants in each
#' block. Genotypes should be sorted by genomic coordinates and chromosome. 
#' @description Given an input set of genotypes from the generate.genotype() function, this function
#' will generate LD blocks based on average LD between all members in each block, using r2 metric of LD.
#' Scanning linearly over the genotype file from the first row, genotypes will be contiously added to blocks
#' as long as the average of all pairwise r2 values of the genotypes in the block are above threshold. When
#' the values goes below the threshold, the last genotype tested will be the base of a new block. For each
#' block the variant with the highest average pairwise r2 value in block will be selected as a tagging genotype. 
#' @usage LD_blocks(genotype,threshold=0.9,max_block_size=1000)
#' @param genotype A genotype matrix from generate.genotype()
#' @param threshold The threshold used for generating blocks, indicating the minimum average pairwise r2
#' value allowed. 
#' @param max_block_size The maximum block size allowed.  
#' @return Returns a named list with the following objects:
#' \describe{
#' \item{genotype}{The tagging genotypes selected from the blocks}
#' \item{tagging_genotype}{The genotype selected to represent each block. The median genotype, rounded down is selected}
#' \item{genotype_block_matrix}{A matrix indicating which block each genotype belongs to}
#' } 
#' @return Returns a named list with the following objects:
#' @examples
#'  LD_blocks(genotype,threshold,max_blocksize)
#' @export



LD_blocks <-function(genotype,threshold=0.9,max_block_size=1000){ 
  snp_block_matrix <- as.matrix(rep(0,dim(genotype)[2]))
  rownames(snp_block_matrix) <- colnames(genotype)
  tagging_variants <- c()
  n_snp <- 1
  start <- 2
  n_block <- 1
  network_size <- 0
  block_coords <- list()
  snps <- dim(genotype)[2]
  snp_block_matrix[n_snp,] <- n_block
  while(start <= snps){ 
    print(n_snp)
    if (n_snp == snps){
      snp_block_matrix[snps,1] = n_block
    }
    else{ 
      temp_sim <- 0
      network_pairs<-combn(c(n_snp:(start)),2)
      r2_values <- c()
      for (i in 1:dim(network_pairs)[2]){
        start_t <- network_pairs[1,i]
        n_snp_t <- network_pairs[2,i]
        # if (n_snp == snps){
        #   snp_block_matrix[snps,1] = n_block
        # }
        #else{
        #matches<-sum(genotype[,n_snp]/genotype[,start] == 1,na.rm = T)+(sum(c(c(genotype[,n_snp] == 1.5)+ c(genotype[,start] == 1.5)) == 1 ,na.rm = T)/2)
        total <- sum(!is.na(genotype[,n_snp_t]+genotype[,start_t]))
        usable_values <- !is.na(genotype[,n_snp_t]+genotype[,start_t])
        #similarity <- matches/total
        p_AB <- (sum(genotype[usable_values,n_snp_t]+genotype[usable_values,start_t] == 0,na.rm = T)+(sum(genotype[usable_values,n_snp_t]+ genotype[usable_values,start_t] == 1 ,na.rm = T)/2)+(sum(genotype[usable_values,n_snp_t]*genotype[usable_values,start_t] == 1 ,na.rm = T)/2))/total
        p_A <- (sum(genotype[usable_values,n_snp_t] == 0,na.rm = T)+ sum(genotype[usable_values,n_snp_t] == 1,na.rm = T)/2)/total
        p_a <- 1-p_A
        p_B <- (sum(genotype[usable_values,start_t] == 0,na.rm = T)+ sum(genotype[usable_values,start_t] == 1,na.rm = T)/2)/total
        p_b <- 1-p_B
        #print(c(p_A,p_B,p_AB,start))
        D <- p_AB-(p_A*p_B)
        r2 <- D^2/(p_A*p_a*p_B*p_b)
        r2_values <- c(r2_values,r2)
        temp_sim <- r2+temp_sim
        network_size <- network_size + 1
      }
      similarity <- temp_sim
      #print(c(similarity,similarity/network_size,network_size,start,n_snp))
      if (similarity/network_size >= threshold && network_size < max_block_size){ 
        snp_block_matrix[start,1] = n_block
        start <- start+1
        network_size <-0
        if (start > snps){ 
          block_coords[[n_block]] <- c(n_snp,(start-1))
          for (i in min(network_pairs):max(network_pairs)){
            if (i == min(network_pairs)){
              r2_sum <- sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
              tagging_variant <- i
            }
            else{
              new_r2<-sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
              if(new_r2 > r2_sum){
                r2_sum <- new_r2
                tagging_variant <- i
              }
            }
          }
          tagging_variants <- c(tagging_variants,tagging_variant)
        }
      }
      else {
        if (start == snps){
          block_coords[[n_block]] <- c(n_snp,(start-1))
          n_block <- n_block + 1
          snp_block_matrix[snps,1] <- n_block
          block_coords[[n_block]] <- c(start,start)
          tagging_variants <- c(tagging_variants,start)
          start <- start + 1
        }
        else{
          r2_values <- r2_values[network_pairs[1,] != start & network_pairs[2,] != start]
          block_coords[[n_block]] <- c(n_snp,(start-1))
          if (n_snp == start-1){
            tagging_variant <- n_snp
          }
          else{
            network_pairs<-combn(c(n_snp:(start-1)),2)
            print((r2_values))
            print((network_pairs))
            for (i in min(network_pairs):max(network_pairs)){
              if (i == min(network_pairs)){
                r2_sum <- sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
                tagging_variant <- i
              }
              else{
                new_r2<-sum(r2_values[network_pairs[1,] == i | network_pairs[2,] == i],na.rm=T)
                if(new_r2 > r2_sum){
                  r2_sum <- new_r2
                  tagging_variant <- i
                }
              }
            }
          }
          tagging_variants <- c(tagging_variants,tagging_variant)
          n_snp <- start
          n_block <- n_block +1
          snp_block_matrix[start,1] = n_block
          start <- start +1
          network_size <- 0
        }
      }
    }
  }
  genotype <- as.matrix(genotype)
  new_genotype <- genotype[,tagging_variants]
  
  output<-list(new_genotype,tagging_variants,snp_block_matrix)
  names(output)<-c("genotype","tagging_genotype","genotype_block_matrix")
  return(output)
}

#' Function to plot summary pseudo-manhattan plots of variants.
#' @description Visualize summary statistics for interactions based on total sum 
#' of -loglikelihoods for each variant across all interactions og the sum of
#' effect sizes.
#' @import ggplot2 
#' @usage pseudo_manhattan(tped,correlations,values="p")
#' @param tped Input tped file as .tped file or data frame. The tped file (.tped)
#' is a transposed ped file, from Plink. 
#' This file contains the SNP and genotype information where one row is a SNP.
#' The first 4 columns of a TPED file are the same as a 4-column MAP file.
#' Then all genotypes are listed for all individuals for each particular SNP on 
#' each line. Again, SNPs are 1,2-coded.#'
#' @param correlations List of epistatic correlations and p-values generated by
#' epistatic.correlation()
#' @param values Values can either be p or for p-values or c for coefficients
#' depending on if you wan't to use the effects sizes for p-values for plotting 
#' @return Plots a pseudo manhattan plot
#' @examples
#' pseudo_manhattan(tped,correlations)
#' 
#' @export


pseudo_manhattan<- function(tped,correlations,values="p"){
  if (is.character(tped)){
    message("loading tped file")
    tped <- fread(tped,data.table=F)
  }
  map<-tped[tped[,2] %in% rownames(correlations$Pvalues),1:2]
  if ((sum(map[,2] == rownames(correlations$Pvalues)))== dim(correlations$Pvalues)[1]){
    if (values=="p"){
      likelyhood_sum <- colSums(-log(correlations$Pvalues))
      #plot(1:length(likelyhood_sum),likelyhood_sum,col=map[,1]%%2+3,xlab="N-variant",ylab="Sum of log-likelihood",main="Pseudo-Manhattan Plot")
      data_p<-as.data.frame(cbind(likelyhood_sum,c(1:length(likelyhood_sum)),map[,1]))
      colnames(data_p)<-c("like_sum","Nvar","chr")
      data_p$chr <- as.factor(data_p$chr)
      gg<-ggplot(data_p,aes(x=Nvar, y=like_sum,col=chr ))+ geom_point()+
        ggtitle("Pseudo-Manhattan Plot")+
        xlab("N-variant")+
        ylab("Sum of -log-likelihood")
      gg <- gg+guides(col=F)
      return(gg)
    }
    if (values=="c"){
      likelyhood_sum <- colSums(abs(correlations$Coefficients))
      data_p<-as.data.frame(cbind(likelyhood_sum,c(1:length(likelyhood_sum)),map[,1]))
      colnames(data_p)<-c("like_sum","Nvar","chr")
      data_p$chr <- as.factor(data_p$chr)
      gg<-ggplot(data_p,aes(x=Nvar, y=like_sum,col=chr ))+ geom_point()+
        ggtitle("Pseudo-Manhattan Plot")+
        xlab("N-variant")+
        ylab("Sum of Effect Sizes")
      gg <- gg+guides(col=F)
      return(gg)
    }
  }
  else{
    message("tped and model output variant names do not match")
  }
}
QSG-Group/WISH documentation built on July 20, 2020, 2:11 p.m.