R/homogeneous_GoF.R

Defines functions get_equi_prob_breaks get_freq get_chi hyper_gof_test get_sample_size sample_sparse sample_graph graph_to_hyper erdos_emp_gof_test erdos_gof_test

Documented in erdos_emp_gof_test erdos_gof_test get_chi get_equi_prob_breaks get_freq get_sample_size graph_to_hyper hyper_gof_test sample_graph sample_sparse

#' get breaks based on equidistant prob
#'
#' for a given hypergeometric
#' distribution gives a set of points
#' so that expected value is
#' greater than or equal to 5
#'
#' @param n number of success in population
#' @param m number of failures in population
#' @param k sample size
#' @param n_obs number of observations
#' @return data frame, col q is cutoff points, p is associated prob. First prob is P(X <= q1)
#' last p is P(X >= qn)
#' @importFrom magrittr '%>%'
#' @export
#' @examples
#' get_equi_prob_breaks(500,500,200,100)
get_equi_prob_breaks = function(n,m,k,n_obs){
  cutoff = 5 / n_obs
  df = data.frame(p = seq(cutoff, 1, cutoff),q=1,real_p = 1)
  for(i in 1:(nrow(df)-1)){
    df[i,'q'] = qhyper(df[i,'p'],m = m,n = n,k = k)
    df[i,'real_p'] = phyper(df[i,'q'],m = m,n = n,k = k)
    df[i+1,'p'] = df[i,'real_p'] + cutoff
    if(df[i+1,'p'] > 1){
      df = df[1:i,]
      break
    }
  }
  df$p = c(df$real_p[1],diff(df$real_p))
  df$p[nrow(df)] = 1 - sum(df$p[-nrow(df)])
  df$q[nrow(df)] = df$q[nrow(df)-1] + 1
  df = df %>% dplyr::select(q,p)
  return(df)
}
#' get vector of frequencies
#'
#' Given vector of data and list of cutoffs points of x gives vector of freq
#' last point is P(X>=qn)
#'
#' @param data observed values
#' @param q vector of cutoff points
#' @return vector of freq
#' @export
#' @note 23-Jan-2015
#' @examples
#' df = get_equi_prob_breaks(500,500,200,100)
#' data = rhyper(100,m = 500,n = 500,k = 200)
#' table(data)
#' df$f = get_freq(data=data,q=df$q)
#' df
get_freq = function(data,q){
  df = data.frame(q = q)
  obs = apply(df,1,function(x){sum(data<=x)})
  obs = c(obs[1],diff(obs))
  obs[length(obs)] = length(data) - sum(obs[-length(obs)])
  return(obs)
}
#' calculates the chi square statistics
#'
#' usual calculations
#'
#' @param obs observed frequencies
#' @param expected frequencies
#' @return chi square
#' @export
#' @examples
#' obs = c(10,10)
#' exp = c(10,10)
#' get_chi(obs,exp)
#' obs = c(5,15)
#' get_chi(obs,exp)
get_chi = function(obs,exp){
  chi = sum((obs - exp)^2 / exp)
  return(chi)
}
#' perform a gof for hypergeometric
#'
#' creates equidistant bins based on hyper then calculates
#' p-value based on chi and also based on empirical
#'
#' @param data
#' @param m number of success in proposed hypergeom
#' @param n number of failures in proposed hypergeom
#' @param k sample size of hypergeom
#' @param n_sims is number of sims for empirical p-value
#' @return list is both p-values and data frame of observed and expected freq
#' so that can plot. Class is 'gof'
#' @export
#' @examples
#' data = rhyper(nn = 1000,m = 500,n = 500,k = 200)
#' gof = hyper_gof_test(data,m = 500,n = 500,k = 200)
#' print(gof)
#' gof = hyper_gof_test(data,m = 500,n = 500,k = 200, n_sims = 100)
#' print(gof)
hyper_gof_test = function(data,m,n,k,n_sims=NA,trace=FALSE){
  df = get_equi_prob_breaks(n = n,m = m,k = k,n_obs = length(data))
  df$freq = get_freq(data = data,q = df$q)
  df = df %>% dplyr::mutate(exp = p * sum(freq))
  gof = list()
  gof$df = df
  gof$chi = get_chi(obs = df$freq,exp = df$exp)
  gof$chi_pv = 1 - pchisq(gof$chi,df = nrow(df) - 1)
  if(trace){
    print(chisq.test(x = df$freq,p = df$p))
  }
  if(!is.na(n_sims)){
    f = function(nn,m,n,k){
      x = rhyper(nn = nn,m = m,n = n,k = k)
      obs = get_freq(x,df$q)
      return(get_chi(obs = obs,exp = df$exp))
    }
    chi_s = replicate(n_sims,f(nn=length(data),m=m,n=n,k=k))
    gof$sim_pv = sum(chi_s >= gof$chi) / length(chi_s)
  }
  if(trace){
    print(chisq.test(x = df$freq,p = df$p,simulate.p.value = TRUE))
  }
  class(gof) = 'gof'
  return(gof)
}
#' choose sample size
#'
#' choose sample size to give maximum variance
#'
#' @param N number of nodes in parent graph
#' @return n sample size
#' @author Jonathan Tuke <simon.tuke@@adelaide.edu.au>
#' @export
#' @note 13-Jan-2015
#' @examples
#' get_sample_size(100)
get_sample_size = function(N){
  Ne = N * (N-1) / 2
  n = (1 + sqrt(1 + 4*Ne)) / 2
  n = round(n)
  return(n)
}
#' Sample a sparse matrix
#'
#' Samples a sparse matrix and returns number of edges
#'
#' @param mat sparse matrix
#' @param n number of row/col to sample
#' @return number of edges
#' @export
sample_sparse = function(mat,n){
  i = sample(1:nrow(mat),n)
  mat = mat[i,i]
  return(sum(mat)/2)
}
#' sample graph
#'
#' takes igraph and samples and gets edges
#'
#' @param g igraph object
#' @param n number of nodes to sample
#' @param k number of sampled graphs
#' @return vector of edges in each sampled graph
#' @author Jonathan Tuke <simon.tuke@@adelaide.edu.au>
#' @export
#' @note 14-Jan-2015
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' sample_graph(g,n=50,100)
sample_graph = function(g,n,k){
  if(n > vcount(g)){
    stop("Cannot sample more nodes than in graph")
  }
  A = get.adjacency(g)
  m = replicate(n = k,sample_sparse(mat = A,n = n))
  return(m)
}
#' graph to hypergeometric
#'
#' takes a graph and sampling size and converts to equivalent hypergeometic
#'
#' @param g igraph object
#' @param n number of nodes sampled
#' @return list of hypergeometric parameters
#' @author Jonathan Tuke <simon.tuke@@adelaide.edu.au>
#' @export
#' @note 25-Jan-2015
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' n <-  get_sample_size(100)
#' graph_to_hyper(g, n)
graph_to_hyper = function(g,n){
  if(class(g) != "igraph"){
    stop("g must be an igraph object")
  }
  k = n * (n-1) / 2
  m = ecount(g)
  N = vcount(g) * (vcount(g) - 1)/2
  n = N - m
  return(c(m=m,n=n,k=k))
}
#' empirical gof test for homogeneity
#'
#' takes a graph and returns empirical p-value for homogeneity
#'
#' @param g graph
#' @param n_sims number of sims for empirical dist
#' @param n_samples number of samples to take from graph
#' @return p-value
#' @export
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' erdos_emp_gof_test(g)
erdos_emp_gof_test = function(g,n_sims=100,n_samples=200){
  ## get observed chi ----
  N = vcount(g)
  e = ecount(g)
  n = get_sample_size(N)
  m = sample_graph(g = g,n = n,k = n_samples)
  para = graph_to_hyper(g = g,n = n)
  chi = hyper_gof_test(data = m,m = para['m'],n = para['n'],k = para['k'])$chi

  ## get empirical dist of chi
  f = function(N,e,n,n_samples){
    g1 = erdos.renyi.game(n = N,p.or.m = e, type="gnm")
    m = sample_graph(g = g,n=n,k = n_samples)
    para = graph_to_hyper(g = g1,n = n)
    gof = hyper_gof_test(data = m,m = para['m'],n = para['n'],k = para['k'])
    return(gof$chi)
  }
  chi_sim = replicate(n_sims,f(N,e,n,n_samples))
  pv = sum(chi_sim >= chi) / length(chi_sim)

  return(pv)
}
#' gof test for homogeneity
#'
#' takes a graph and returns p-value for homogeneity
#'
#' @param g graph
#' @param n_samples number of samples to take from graph
#' @return p-value
#' @export
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' erdos_gof_test(g)
erdos_gof_test = function(g,n_samples=200){
  N = vcount(g)
  e = ecount(g)
  n = get_sample_size(N)
  m = sample_graph(g = g,n = n,k = n_samples)
  para = graph_to_hyper(g = g,n = n)
  pv = hyper_gof_test(data = m,m = para['m'],n = para['n'],k = para['k'])$chi_pv
  return(pv)
}
jonotuke/graphr documentation built on May 19, 2019, 8:37 p.m.