R/generate_data.R

Defines functions generate_seed_data permute_data wrap_permute_data generate_dists_data generate_effs_data get_seed_data

Documented in generate_dists_data generate_effs_data generate_seed_data get_seed_data permute_data wrap_permute_data

# ---------------------------
# generate_data.R
# Generate seed_data and send it
#	back.
# ---------------------------
# Necessary functions
# ---------------------------

#' get_seed_data
#'
#' @export
get_seed_data = function(seed, n, dims, dists, effs){
	set.seed(seed)

	# Unprotected cluster
	# TODO: why 2*n?
	unprot = mvrnorm(2*n, rep(5, dims), diag(dims))
	cent_unprot = colMeans(unprot)

	if (!is.null(globals_list$par_effs) && globals_list$par_effs){
		effs_data_list = foreach(eff=effs, .inorder=FALSE, .export=c("globals_list"), .packages=c("ksmthesis")) %dopar% {
			return(generate_effs_data(eff, dists, unprot, cent_unprot))
		}

	} else {
		effs_data_list = lapply(effs, generate_effs_data, dists, unprot, cent_unprot)
	}

	if (is.null(globals_list$do_permute) || !globals_list$do_permute){
		effs_data = do.call(rbind, effs_data_list)
		return(effs_data)

	} else {
		effs_data = lapply(effs_data_list, function(x){
			return(x$dists_data)
		})
		effs_data = do.call(rbind, effs_data)

		# NOTE: jank
		permuted_effs_inf_data = lapply(effs_data_list, function(x){
			 return(x$permuted_dists_data$inf)
		})
		permuted_effs_inf_data = do.call(rbind, permuted_effs_inf_data)

		permuted_effs_prot_data = lapply(effs_data_list, function(x){
			 return(x$permuted_dists_data$prot)
		})
		permuted_effs_prot_data = do.call(rbind, permuted_effs_prot_data)
	}

	gc()

	# if globals_list$do_permute:
	#	[[1]] is the data
	#	[[2]] is a list of length 2
	#		Both are matrices of effs, dists, permuted labels, perm_iters
	return(list(effs_data=effs_data, permuted_effs_data=list(inf=permuted_effs_inf_data,
								    prot=permuted_effs_prot_data)))
}

# NOTE: eff, NOT effs
#' generate_effs_data
#'
#' @export
generate_effs_data = function(eff, dists, unprot, cent_unprot){
	if (length(eff) > 1){
		warning("Please pass a single efficacy value. Aborting...", .call=TRUE)
		quit()
	}

	num_prot = floor(eff * globals_list$n)
	num_unprot = globals_list$n - num_prot

	dists_data_list = lapply(dists, generate_dists_data, eff, unprot, cent_unprot, num_prot, num_unprot)

	if (is.null(globals_list$do_permute) || !globals_list$do_permute){
		dists_data = do.call(rbind, dists_data_list)
		return(dists_data)

	} else {
		dists_data = lapply(dists_data_list, function(x){ return(x$data) })
		dists_data = do.call(rbind, dists_data)

		# NOTE: jank
		permuted_dists_inf_data = lapply(dists_data_list, function(x){ return(x$permuted_data$inf) })
		permuted_dists_inf_data = do.call(rbind, permuted_dists_inf_data)

		permuted_dists_prot_data = lapply(dists_data_list, function(x){ return(x$permuted_data$prot) })
		permuted_dists_prot_data = do.call(rbind, permuted_dists_prot_data)
	}

	return(list(dists_data=dists_data, permuted_dists_data=list(inf=permuted_dists_inf_data,
								    prot=permuted_dists_prot_data)))
}

#' generate_dists_data
#'
#' @export
generate_dists_data = function(dist, eff, unprot, cent_unprot, num_prot, num_unprot){
	cent_prot = cent_unprot + (dist / sqrt(2))
	prot = mvrnorm(globals_list$n, cent_prot, diag(length(cent_prot)))

	# ---------------------------
	# B. Generate infected and uninfected
				    
	inf_unprot = sample(1:globals_list$n, globals_list$n, replace=FALSE)
	inf = as.matrix(unprot[inf_unprot,])
	uninf = rbind(prot[sample(1:globals_list$n, num_prot, replace=FALSE),],
		      unprot[sample(1:globals_list$n, num_unprot, replace=FALSE),])

	# Jitter points in uninf that are the same as those in inf (sampled from unprot)
	# This serves no statistical purpose, but makes visualization less fraught,
	#	i.e. no thinking there's a problem (cough mislabeling as RNs)
	#	when points of different shapes are overlapping.
	tmp = rbind(inf, uninf)
	dupe_idx = (1:nrow(tmp))[duplicated(tmp)] - nrow(inf)
	uninf[dupe_idx,] = apply(uninf[dupe_idx,], 2, jitter, factor=500)
	tmp = rbind(inf, uninf)

	num_rows = nrow(inf) + nrow(uninf)

	prot_groups = c(rep(globals_list$UnprotNum, nrow(inf)),
			rep(globals_list$ProtNum, num_prot),
			rep(globals_list$UnprotNum, num_unprot))

	inf_groups = c(rep(globals_list$InfNum, nrow(inf)),
		       rep(globals_list$UninfNum, nrow(uninf)))

	data = as.matrix(data.frame(eff=rep(eff, num_rows),
				    dist=rep(dist, num_rows),
				    rbind(inf, uninf),
				    inf_groups, prot_groups))
	data = apply(data, 2, as.numeric)

	if (is.null(globals_list$do_permute) || !globals_list$do_permute){
		return(data)

	} else {
		permuted_data = wrap_permute_data(data)
	}

	return(list(data=data, permuted_data=permuted_data))
}

# ---------------------------
# Requires:
# globals_list$permute_iters
# library(prodlim)

#' wrap_permute_data
#'
#' @export
wrap_permute_data = function(dist_data){
	if (is.null(globals_list$permute_iters)){
		warning("globals_list$permute_iters not set - defaulting to 100. Buckle up!", call.=TRUE)
		globals_list$permute_iters <<- 100
	}

	permuted_dist_data = lapply(1:globals_list$permute_iters, permute_data, dist_data)

	# NOTE: jank
	permuted_dist_inf_data = lapply(permuted_dist_data, function(x){ return(x$inf) })
	permuted_dist_prot_data = lapply(permuted_dist_data, function(x){ return(x$prot) })

	permuted_dist_inf_data = do.call(rbind, permuted_dist_inf_data)
	permuted_dist_prot_data = do.call(rbind, permuted_dist_prot_data)

	gc()
	return(list(inf=permuted_dist_inf_data, prot=permuted_dist_prot_data))
}

# ---------------------------
# Requires:
# fcan

# Reassign ratio number of inf and uninf to the opposite group number.
# Ratio of 0.5 is "perfectly permuted", i.e. as opposite as possible without
#	being an inversion.
#' permute_data
#'
#' @export
permute_data = function(iter, dist_data, ratio=0.5){
	permuted_inf_groups = dist_data[,"inf_groups"]
	permuted_prot_groups = dist_data[,"prot_groups"]

	status_types = c("inf", "prot")
	groups_list = list(inf=dist_data[,"inf_groups"], prot=dist_data[,"prot_groups"])

	results = vector("list", length(groups_list))
	for (i in 1:length(groups_list)){
		group_data = groups_list[[i]]
		tmp = group_data
		groups_levels = unique(group_data)

		# If not, we have nothing to permute
		if (length(groups_levels) > 1){
			# NOTE: jank. Only works for two groups; "switching" labels doesn't
			#	mean anything for more than two groups.
			for (group in groups_levels){
				group_indices = which(group_data == group)
				permute_indices = sample(group_indices, ratio*length(group_indices), replace=FALSE)

				# Switch the groups
				tmp[permute_indices] = groups_levels[which(groups_levels != group)]
			}
		}

		permuted_data = dist_data
		if (class(permuted_data)[1] == "data.frame"){
			toss = c("V1", "V2", "V3")

		} else if (class(permuted_data)[1] == "matrix"){
			toss = c("X1", "X2", "X3")
		}

		cname = paste(status_types[i], "_groups", sep='')

		permuted_data = permuted_data[,!colnames(permuted_data) %in% toss]
		permuted_data[,cname] = tmp
		permuted_data = cbind(permuted_data, perm_iter=iter)

		results[[i]] = permuted_data
	}

	names(results) = status_types

	gc()
	return(results)
}


# ---------------------------
# WRAPPER FOR IT ALL
# ---------------------------

# User doesn't care how you get it, just that they get it
#' generate_seed_data
#'
#' @export
generate_seed_data = function(seeds, dims, dists, effs, results_dir, dogen){
	if (dogen){
		g_s_d_expr = expression(
			get_seed_data(seed, globals_list$n, dims, dists, effs)
		)

		cat("Generating data...\n")
		if (globals_list$par_seeds){
			seed_data = foreach (seed=seeds, .inorder=FALSE, .export=c("globals_list"), .packages=c("ksmthesis")) %dopar% {
				return(eval(g_s_d_expr))
			}

		} else {
			seed_data = foreach (seed=seeds) %do% {
				return(eval(g_s_d_expr))
			}
		}

		names(seed_data) = seeds
		save(seed_data, file=paste(results_dir, "seed_data", sep=''))

	} else {
		load(paste(results_dir, "seed_data", sep=''))
	}

	return(seed_data)
}
kmorrisongr/ksmthesis documentation built on Oct. 5, 2020, 6:41 a.m.