popgen.stats: Popgen summary statistics calculator

Documented in Tajima_D

#' Tajima_D function
#'
#' @param genome_matrix: a matrix of 0,1's. Each row is an individual. Each column is a polymorphism in the sampled population.
#'
#' @return scalar value of Tajima's D
#' @export
#'
#' @examples Tajima_D(genome_matrix)



##### R Version

Tajima_D<-function(s){
  return((theta_t(s)-theta_w(s))/(v_theta(s)^0.5))
}

#theta_t gives number of pairwise differences, normalised by the number of pairs
theta_t  <- function(genome_matrix){
  dim_genomes<-dim(genome_matrix) #grab dimensions of genome matrix.
  sample_size<-dim_genomes[1] #getting the number of genomes

  #get symmetric manhattan distance matrix
  dist_matrix=as.matrix(dist(genome_matrix,method="manhattan"))

  #sum up elements of matrix and divide by 2 (symmetric matrix)
  pairwise_diff=sum(dist_matrix)*0.5

  #normalize by number of possiblem pairs
  return (pairwise_diff/(sample_size*(sample_size-1)/2))
}

#theta_w is the expected number of segregating sites

theta_w<-function(s){
  sample_size<-dim(s)[1] #get the number of genomes
  segsites<-dim(s)[2]

  #compute the normalizing constant, derived analytically by Tajima. Add reference later!!!
  normalise=0 #initialisation of normalisation constant
  for (i in 1:(sample_size-1)){
    normalise=normalise+1/i
  }
  return ((segsites)/(normalise))
}

#variance of theta was derived analytically by Tajima
v_theta<-function(sequence){
  n=dim(sequence)[2] #get number of samples
  S=dim(sequence)[1] #get num of seg sites

  #initialisation
  a1=0
  a2=0

  for (i in 1:(n-1)){
    a1=a1+1/i
  }
  for (i in 1:(n-1)){
    a2=a2+(1/i^2)
  }

  b1=(n+1)/(3*(n-1))
  b2=2*(n^2+n+3)/(9*n*(n-1))

  c1=b1-(1/a1)
  c2=b2-(n+2)/(a1*n)+(a2/a1^2)

  e1=c1/a1
  e2=c2/(a1^2+a2)

  return=(e1*S+e2*S*(S-1))
}

################# end R version