#' get breaks based on equidistant prob
#'
#' for a given hypergeometric
#' distribution gives a set of points
#' so that expected value is
#' greater than or equal to 5
#'
#' @param n number of success in population
#' @param m number of failures in population
#' @param k sample size
#' @param n_obs number of observations
#' @return data frame, col q is cutoff points, p is associated prob. First prob is P(X <= q1)
#' last p is P(X >= qn)
#' @importFrom magrittr '%>%'
#' @export
#' @examples
#' get_equi_prob_breaks(500,500,200,100)
get_equi_prob_breaks = function(n,m,k,n_obs){
cutoff = 5 / n_obs
df = data.frame(p = seq(cutoff, 1, cutoff),q=1,real_p = 1)
for(i in 1:(nrow(df)-1)){
df[i,'q'] = qhyper(df[i,'p'],m = m,n = n,k = k)
df[i,'real_p'] = phyper(df[i,'q'],m = m,n = n,k = k)
df[i+1,'p'] = df[i,'real_p'] + cutoff
if(df[i+1,'p'] > 1){
df = df[1:i,]
break
}
}
df$p = c(df$real_p[1],diff(df$real_p))
df$p[nrow(df)] = 1 - sum(df$p[-nrow(df)])
df$q[nrow(df)] = df$q[nrow(df)-1] + 1
df = df %>% dplyr::select(q,p)
return(df)
}
#' get vector of frequencies
#'
#' Given vector of data and list of cutoffs points of x gives vector of freq
#' last point is P(X>=qn)
#'
#' @param data observed values
#' @param q vector of cutoff points
#' @return vector of freq
#' @export
#' @note 23-Jan-2015
#' @examples
#' df = get_equi_prob_breaks(500,500,200,100)
#' data = rhyper(100,m = 500,n = 500,k = 200)
#' table(data)
#' df$f = get_freq(data=data,q=df$q)
#' df
get_freq = function(data,q){
df = data.frame(q = q)
obs = apply(df,1,function(x){sum(data<=x)})
obs = c(obs[1],diff(obs))
obs[length(obs)] = length(data) - sum(obs[-length(obs)])
return(obs)
}
#' calculates the chi square statistics
#'
#' usual calculations
#'
#' @param obs observed frequencies
#' @param expected frequencies
#' @return chi square
#' @export
#' @examples
#' obs = c(10,10)
#' exp = c(10,10)
#' get_chi(obs,exp)
#' obs = c(5,15)
#' get_chi(obs,exp)
get_chi = function(obs,exp){
chi = sum((obs - exp)^2 / exp)
return(chi)
}
#' perform a gof for hypergeometric
#'
#' creates equidistant bins based on hyper then calculates
#' p-value based on chi and also based on empirical
#'
#' @param data
#' @param m number of success in proposed hypergeom
#' @param n number of failures in proposed hypergeom
#' @param k sample size of hypergeom
#' @param n_sims is number of sims for empirical p-value
#' @return list is both p-values and data frame of observed and expected freq
#' so that can plot. Class is 'gof'
#' @export
#' @examples
#' data = rhyper(nn = 1000,m = 500,n = 500,k = 200)
#' gof = hyper_gof_test(data,m = 500,n = 500,k = 200)
#' print(gof)
#' gof = hyper_gof_test(data,m = 500,n = 500,k = 200, n_sims = 100)
#' print(gof)
hyper_gof_test = function(data,m,n,k,n_sims=NA,trace=FALSE){
df = get_equi_prob_breaks(n = n,m = m,k = k,n_obs = length(data))
df$freq = get_freq(data = data,q = df$q)
df = df %>% dplyr::mutate(exp = p * sum(freq))
gof = list()
gof$df = df
gof$chi = get_chi(obs = df$freq,exp = df$exp)
gof$chi_pv = 1 - pchisq(gof$chi,df = nrow(df) - 1)
if(trace){
print(chisq.test(x = df$freq,p = df$p))
}
if(!is.na(n_sims)){
f = function(nn,m,n,k){
x = rhyper(nn = nn,m = m,n = n,k = k)
obs = get_freq(x,df$q)
return(get_chi(obs = obs,exp = df$exp))
}
chi_s = replicate(n_sims,f(nn=length(data),m=m,n=n,k=k))
gof$sim_pv = sum(chi_s >= gof$chi) / length(chi_s)
}
if(trace){
print(chisq.test(x = df$freq,p = df$p,simulate.p.value = TRUE))
}
class(gof) = 'gof'
return(gof)
}
#' choose sample size
#'
#' choose sample size to give maximum variance
#'
#' @param N number of nodes in parent graph
#' @return n sample size
#' @author Jonathan Tuke <simon.tuke@@adelaide.edu.au>
#' @export
#' @note 13-Jan-2015
#' @examples
#' get_sample_size(100)
get_sample_size = function(N){
Ne = N * (N-1) / 2
n = (1 + sqrt(1 + 4*Ne)) / 2
n = round(n)
return(n)
}
#' Sample a sparse matrix
#'
#' Samples a sparse matrix and returns number of edges
#'
#' @param mat sparse matrix
#' @param n number of row/col to sample
#' @return number of edges
#' @export
sample_sparse = function(mat,n){
i = sample(1:nrow(mat),n)
mat = mat[i,i]
return(sum(mat)/2)
}
#' sample graph
#'
#' takes igraph and samples and gets edges
#'
#' @param g igraph object
#' @param n number of nodes to sample
#' @param k number of sampled graphs
#' @return vector of edges in each sampled graph
#' @author Jonathan Tuke <simon.tuke@@adelaide.edu.au>
#' @export
#' @note 14-Jan-2015
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' sample_graph(g,n=50,100)
sample_graph = function(g,n,k){
if(n > vcount(g)){
stop("Cannot sample more nodes than in graph")
}
A = get.adjacency(g)
m = replicate(n = k,sample_sparse(mat = A,n = n))
return(m)
}
#' graph to hypergeometric
#'
#' takes a graph and sampling size and converts to equivalent hypergeometic
#'
#' @param g igraph object
#' @param n number of nodes sampled
#' @return list of hypergeometric parameters
#' @author Jonathan Tuke <simon.tuke@@adelaide.edu.au>
#' @export
#' @note 25-Jan-2015
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' n <- get_sample_size(100)
#' graph_to_hyper(g, n)
graph_to_hyper = function(g,n){
if(class(g) != "igraph"){
stop("g must be an igraph object")
}
k = n * (n-1) / 2
m = ecount(g)
N = vcount(g) * (vcount(g) - 1)/2
n = N - m
return(c(m=m,n=n,k=k))
}
#' empirical gof test for homogeneity
#'
#' takes a graph and returns empirical p-value for homogeneity
#'
#' @param g graph
#' @param n_sims number of sims for empirical dist
#' @param n_samples number of samples to take from graph
#' @return p-value
#' @export
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' erdos_emp_gof_test(g)
erdos_emp_gof_test = function(g,n_sims=100,n_samples=200){
## get observed chi ----
N = vcount(g)
e = ecount(g)
n = get_sample_size(N)
m = sample_graph(g = g,n = n,k = n_samples)
para = graph_to_hyper(g = g,n = n)
chi = hyper_gof_test(data = m,m = para['m'],n = para['n'],k = para['k'])$chi
## get empirical dist of chi
f = function(N,e,n,n_samples){
g1 = erdos.renyi.game(n = N,p.or.m = e, type="gnm")
m = sample_graph(g = g,n=n,k = n_samples)
para = graph_to_hyper(g = g1,n = n)
gof = hyper_gof_test(data = m,m = para['m'],n = para['n'],k = para['k'])
return(gof$chi)
}
chi_sim = replicate(n_sims,f(N,e,n,n_samples))
pv = sum(chi_sim >= chi) / length(chi_sim)
return(pv)
}
#' gof test for homogeneity
#'
#' takes a graph and returns p-value for homogeneity
#'
#' @param g graph
#' @param n_samples number of samples to take from graph
#' @return p-value
#' @export
#' @examples
#' g = igraph::erdos.renyi.game(100,0.1)
#' erdos_gof_test(g)
erdos_gof_test = function(g,n_samples=200){
N = vcount(g)
e = ecount(g)
n = get_sample_size(N)
m = sample_graph(g = g,n = n,k = n_samples)
para = graph_to_hyper(g = g,n = n)
pv = hyper_gof_test(data = m,m = para['m'],n = para['n'],k = para['k'])$chi_pv
return(pv)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.