R/sample_population.R

Defines functions get_sample_data sample_population sample_temporal rztpois sample_survival sample_missing sample_nested is.nested

Documented in get_sample_data

## function which tells whether consecutive columns are nested, assuming that the first one is the highest level
is.nested <-function(x) sapply(2:ncol(x), function(i) nrow(unique(x[,c(i-1,i)])) == length(unique(x[,i])) )

## function
## param 
sample_nested <- function(data_structure, param, plot=FALSE){
	
	groups <- colnames(param)
	
	## check naming
	if(!all(groups %in% c(colnames(data_structure),"observation"))){
		stop("Names in sample order need to be in data_structure")
	}
	
	## if there isn't an observation level specified make one
	if("observation" %in% groups & !"observation" %in% colnames(data_structure)){
	 data_structure[,"observation"] <- 1:nrow(data_structure)
	}

	## test if nested
	if(!all(is.nested(data_structure[,groups]))){
		stop("Grouping factors are not nested in the order provided")
	}

	# check that all levels of factor have at least the maximum samples

		# for(i in colnames(data_structure)){
		# 	if(min(table(data_structure[,i]) ) < max(simple[[i]])) stop("not all levels of ",i, " have the maximum number of samples")
		# }

	# for (i in colnames(param)){
	# 	if(length(unique(data_structure[,i])) < max(param[,i])) stop("not all levels of ",i, " have the maximum number of samples")
	# }

		

	# for each parameter set (row in param)
	apply(param,1, function(j){
		
		## get the right number of levels for the highest group
		index<-which(data_structure[,groups[1]] %in% sample(unique(data_structure[,groups[1]]), j[1], replace=FALSE))

		##then cascade through nested groups to get right number of levels within each
		if(ncol(param)>1){
			for(i in 2:ncol(param)){
				index_new <- sort(which(data_structure[index,groups[i]] %in% c(tapply(data_structure[index,groups[i]],data_structure[index,groups[i-1]], function(x){
					if(length(x)>1) sample(unique(x),j[i], replace=FALSE) else x
				}), recursive=TRUE)))
				index <- index[index_new]
			}	
		}

		index
	}, simplify=FALSE)

		# At each level, user can specify proportion or integer of samples at a given level if proportion then round to nearest integer?
		# Apply separately to each response?

# possible alternative data entry
# simple = c("individual(50)/observation(10)","individual(100)/observation(5)")
# structure=simple
# structure <- gsub("\\s","",structure)
# comp_list <- strsplit(structure, "\\/")
# comp_list_N <- lapply(comp_list,extract_N)
# comp_names <- lapply(comp_list,extract_name)

}


sample_missing <- function(pop_data, param, plot=FALSE){
	## check that missingness predictors are are in y or predictors (error message will be given if they;re not, but might be worth making more informative one)
	## by default center and scale predictors
	## option to plot missingness function
	lapply(1:pop_data$n_pop, function(i){	

		## put together y and predictors and scale
		## scaling means that all coefficients are comparable
		dat <- as.data.frame(apply(cbind(pop_data$y[[i]],pop_data$predictors[[i]]),2,scale) )

		l <- sapply(param, function(x) {
			y <- eval(parse(text=x),envir = dat)
		  if(length(y)==1) y <- rep(y, nrow(dat))
		  return(y)
		})
		e <- stats::plogis(l)
		o <- apply(e,2,function(x)as.logical(stats::rbinom(length(x),1,x)))
		indices  <- apply(o,2,which)
		if(length(param)==1){
			list(which(o))
		}else{
			apply(o,2,which)
		}
	})
	# could make total N constant ;using the probabilities e with sample

	# plot(dat$env,o)
	# plot(dat$ind,o)
	# plot(l,o)

	# x <- seq(-4,4,0.1)
	# plot(x,stats::plogis(x*0.5), type="l",ylim=c(0,1))
	# plot(x,stats::rbinom(length(x),1,stats::plogis(x*0.5)),ylim=c(0,1))

}


sample_survival <- function(y,ID,age,death=1,all=TRUE){
	do.call(c,lapply(split(data.frame(y=y,age=age,index=1:length(y)),ID), function(x){
	  if(death %in% x$y) {
	  	# make sure age is right order
	  	x_order <- x[order(x$age),]
	  	if(death==0){x_order$y<-abs(x_order$y-1)}
	  	# subset until death, and then return indexes 
	 		x_order[1:which(cumsum(x_order$y)==1)[1],"index"]
		}else if(all){
			x$index
		}else{
			NULL
		}
	}))
}



#https://stat.ethz.ch/pipermail/r-help/2005-May/070680.html
rztpois <- function(N, lambda) stats::qpois(stats::runif(N, stats::dpois(0, lambda), 1), lambda)


sample_temporal <- function(data_structure, time, group, variance, n, balanced=TRUE, plot=FALSE){
	# Which grouping factor is time
	# Which grouping factor is temporally dependent
	# Sampling parameters - between group variance in sampling across time
	# list(time = c(day, month), group = c(ind, pop), variance = c(0.5, 0,6))

	all_levels <- unique(data_structure[,group])
  N_levels <- length(all_levels)

  Tsamp <- length(unique(data_structure[,time]))
  # Tsamp - number of time points

  Tmin <- min(unique(data_structure[,time]))
  ## work out within group and between group range
  TsampB <- round(Tsamp*variance,0)
  TsampW <- Tsamp-TsampB
  ## 
  if(! TsampW >= n) stop("Number of time steps not enough to implement this variance")


  if(plot){
    plot(NA, xlim=c(1,Tsamp), ylim=c(1,N_levels))
    graphics::abline( h=1:N_levels, col="grey")
  }

  indices <- sort(c(lapply(1:N_levels, function(x){ 
  			if(!balanced) n <- rztpois(1,n)
        Tx <- sort(sample(1:TsampW,n, replace=FALSE)) + sample(1:TsampB,1) + Tmin -1
        if(plot) graphics::points(Tx,rep(x,n), pch=19, col=x)
        # Tx
        # all_levels[x]
        index1 <- which(data_structure[,group]==all_levels[x])
        index1[which(data_structure[index1,time] %in% Tx)]
        
      }), recursive=TRUE))
  if(plot)graphics::points(individual~day,data_structure[indices,])

  indices
}




## @title sample_population
## @description Sample population level data
## @param x A squid object, created using simulate_population().
## @param type Type of sampling, needs to be one of 'nested', 'missing' or temporal. See details.
## @param param A set of parameters, specific to the sampling type. See details.
## @param plot Logical. Should illustrative plots be made - defaults to FALSE.
## @details ...
## @return 
sample_population <- function(x){
	
	type <- x$sample_type
	param <- x$sample_param
	plot <- x$sample_plot
	# if(class(x) != "squid"){
	# 	stop("x needs to be class 'squid'")
	# }

	if(type=="nested"){
		if(!is.matrix(param)){
		  stop("param needs to be a matrix for type='nested'")
		}
		indices <- lapply(1:x$n_pop,function(y) sample_nested(x$data_structure, param, plot))
	
	}else if(type=="missing"){
		if(!is.vector(param)){
		  stop("param needs to be a vector for type='missing'")
		}
		indices <- sample_missing(x, param, plot)
	
	}else if(type=="survival"){
		if(!is.list(param)){
		  stop("param needs to be a list for type='survival'")
		}
		if(!all(sapply(param,is.vector))){
		  stop("All elements of param list must be vectors for type='survival'")
		}
		if(!all(sapply(param,length)==1)) {
		  stop("vectors in param list must be length 1 for type='survival'")
		}
		if(!all(c("y", "ID", "age")	%in% names(param))) {
			stop("param list must include 'y', 'ID' and 'age' for type='survival'")
		}
		if(!is.null(param[["all"]]) && !is.logical(param[["all"]]) ){
			stop("'all' in param list must logical for type='survival'")
		}
		indices <- lapply(x$y,function(z) { # for each population
			list(sample_survival(
				y=z[,param[["y"]]], 
				age=x$data_structure[,param[["age"]]], 
				ID=x$data_structure[,param[["ID"]]],
				death=if(is.null(param[["death"]])){1}else{param[["death"]]},
				all= if(is.null(param[["all"]])){TRUE}else{param[["all"]]} ) )
		})

	}else if(type=="temporal"){
		if(!is.list(param)){
		  stop("param needs to be a list for type='temporal'")
		}
		if(!all(sapply(param,is.vector))){
		  stop("All elements of param list must be vectors for type='temporal'")
		}
		vec_length <- sapply(param,length)
		if(!all(vec_length%in%c(1,max(vec_length)))){
		  stop("vectors in param list must be same length or length 1, for type='temporal'")
		}
		param <- lapply(param,function(y) if(length(y)==1) {rep(y,max(vec_length))} else {y})
		indices <- lapply(1:x$n_pop,function(y) { # for each population
			lapply(1:max(vec_length), function(i) { # for each set of parameters
				do.call(sample_temporal, c(list(data_structure=x$data_structure),lapply(param,function(z) z[i]), plot=FALSE)) # 
			})
		})
	}else {
		stop("type must be 'nested', 'missing', 'survival' or 'temporal'")
	}

	# pop_data$sample_param <- list(type=type, param=param)
	indices
	# pop_data$samples <- indices
	# pop_data
	## apply to each population 
	## incorporate into squid object

## plotting - maybe plot only first population, but all parameter combinations

}



#' @title get_sample_data
#' @description Extracts sampled data from a squid object
#' @param x an R object of class 'squid'
#' @param sample_set Integer - which sample set to return. Defaults to 1
#' @param list Logical - whether to return data as a list or data_table (FALSE; default).
#' @param ... further arguments passed to or from other methods.
#' @export
get_sample_data <- function(x, sample_set=1, list=FALSE,...){
  
  pop_list <- lapply(1:x$n_pop,function(i) {
  	# data.table::data.table(cbind(x$y[[i]],x$predictors[[i]],x$data_structure,squid_pop=i)[x$samples[[i]][[sample_set]],])
  	as.data.frame(cbind(x$y[[i]],x$predictors[[i]],x$data_structure,squid_pop=i)[x$samples[[i]][[sample_set]],])
  })
  
  if(list){
    return(pop_list)
  }else{
    do.call(rbind,pop_list)
  }
}

## think it would be a good idea to provide functionality that allowed a user to chose different variables to have missing data
squid-group/squidSim documentation built on Dec. 15, 2024, 12:26 p.m.