R/summary_data.R

Defines functions calculate_overlap calculate_overlap_2 draw_betas_multi_sample ld_multiplier greedy_remove read_ld_matrices write_ld_matrices generate_ld_matrices_slow generate_ld_matrices get_regions variant_reference

variant_reference <- function(bfile, plink_bin=genetics.binaRies::get_plink_binary(), fn=tempfile())
{
	shell <- ifelse(Sys.info()['sysname'] == "Windows", "cmd", "sh")

	fun1 <- paste0(
		shQuote(plink_bin, type=shell),
		" --bfile ", shQuote(bfile, type=shell),
		" --freq ",
		" --out ", shQuote(fn, type=shell)
	)
	message("Generating MAF")
	system(fun1, ignore.stdout=TRUE)

	message("Reading variant info")
	frq <- data.table::fread(paste0(fn, ".frq"))
	bim <- data.table::fread(paste0(bfile, ".bim"))

	names(bim) <- c("CHR", "RSID", "GP", "POS", "EA", "NEA")
	bim$EAF <- frq$MAF
	bim <- subset(bim, !is.na(EAF))
	return(bim)
}


get_regions <- function(pop="ASN")
{
	system.file(paste0("extdata/ldetect/", pop, ".bed"), package="simulateGP") %>%
		data.table::fread(.data, header=TRUE) %>%
		dplyr::mutate(
			chr=as.numeric(gsub("chr", "", chr)),
			start=as.numeric(start),
			stop=as.numeric(stop)
		) %>% 
		dplyr::as_tibble()
}


generate_ld_matrices <- function(regions, varref, bfile, plink_bin=genetics.binaRies::get_plink_binary(), fn=tempfile())
{
	shell <- ifelse(Sys.info()['sysname'] == "Windows", "cmd", "sh")
	message("Calculating LD for ", nrow(regions), " regions")
	l <- list()
	for(i in 1:nrow(regions))
	{
		message("Region ", i, " of ", nrow(regions))
		variants <- subset(varref, CHR == regions$chr[i] & POS > regions$start[i] & POS < regions$stop[i])
		l[[i]] <- list(
			info=variants,
			ld=ieugwasr::ld_matrix(variants$RSID, bfile=bfile, plink_bin=plink_bin, with_alleles=FALSE)
		)
	}
}

# LD from 
generate_ld_matrices_slow <- function(regions, varref, bfile, plink_bin=genetics.binaRies::get_plink_binary(), fn=tempfile())
{
	shell <- ifelse(Sys.info()['sysname'] == "Windows", "cmd", "sh")
	message("Calculating LD for ", nrow(regions), " regions")
	l <- list()
	for(i in 1:nrow(regions))
	{
		message("Region ", i, " of ", nrow(regions))
		variants <- subset(varref, CHR == regions$chr[i] & POS > regions$start[i] & POS < regions$stop[i])
		write.table(data.frame(variants$RSID), file=fn, row.names=F, col.names=F, quote=F)
		fun2 <- paste0(
			shQuote(plink_bin, type=shell),
			" --bfile ", shQuote(bfile, type=shell),
			" --extract ", shQuote(fn, type=shell), 
			" --recode A ", 
			" --out ", shQuote(fn, type=shell)
		)
		system(fun2, ignore.stdout=TRUE)
		x <- data.table::fread(paste0(fn, ".raw")) %>% {.data[,-c(1:6)]} %>% as.matrix()
		l[[i]] <- list(
			info=variants,
			ld=stats::cor(x, use="pair")
		)
		unlink(paste0(fn, ".raw"))
	}
}

write_ld_matrices <- function(r, fn)
{
	y <- r[lower.tri(r, diag=FALSE)]
	yi <- round(y*127) %>% as.integer
	fn <- file(fn, "wb")
	writeBin(nrow(r)-1, fn, integer())
	writeBin(yi, fn, size=1)
	close(fn)
}

read_ld_matrices <- function(fn)
{
	fn <- file(fn, "rb")
	n <- readBin(fn, 1, integer())
	r <- readBin(fn, integer(), n*(n-1)/2, size=1, signed=TRUE) / 127
	R <- diag(n+1)
	R[lower.tri(R, diag=FALSE)] <- r
	R[upper.tri(R, diag=FALSE)] <- r
	return(R)
}



greedy_remove <- function(r, threshold=0.99)
{
	diag(r) <- 0
	flag <- 1
	rem <- c()
	nom <- colnames(r)
	while(flag == 1)
	{
		message("iteration")
		count <- apply(r, 2, function(x) sum(x >= threshold))
		if(any(count > 0))
		{
			worst <- which.max(count)[1]
			rem <- c(rem, names(worst))
			r <- r[-worst,-worst]
		} else {
			flag <- 0
		}
	}
	return(which(nom %in% rem))
}


ld_multiplier <- function(varref, ld)
{
	varref$beta_rho <- NA
	varref$b_rho <- NA
	for(i in 1:length(ld))
	{
		message("Region ", i, " of ", length(ld))
		m1 <- match(ld[[i]]$info$RSID, varref$RSID) %>% na.exclude %>% as.numeric
		variants_dat <- varref$RSID[m1]
		if(length(variants_dat) > 0)
		{
			m2 <- match(variants_dat, rownames(ld[[i]]$ld))
			r <- ld[[i]]$ld[m2,m2]
			varref$beta_rho[m1] <- varref$beta[m1] %*% r
			varref$b_rho[m1] <- varref$b[m1] %*% r
		}
	}
	varref$beta_rho_se <- expected_se(varref$beta_rho, varref$EAF, varref$N, varref$vy)
	varref$b_rho_se <- expected_se(varref$b_rho, varref$EAF, varref$N, varref$vy)
	return(varref)
}


draw_betas_multi_sample <- function(b, se, N, pcor, Nrep=1)
{
	stopifnot(length(b) == length(se))
	stopifnot(length(b) == nrow(N))
	stopifnot(nrow(N) == nrow(N))
	stopifnot(all(dim(N) == dim(pcor)))
	stopifnot(all(diag(N) == 1))
	stopifnot(all(diag(pcor) == 1))
	ses <- diag(se)
	covmat <- ses %*% (pcor * N) %*% ses
	MASS::mvrnorm(Nrep, b, covmat)
}


calculate_overlap_2 <- function(n1, n2, n_overlap)
{
	n_overlap / sqrt(n1 * n2)
}


calculate_overlap <- function(n, N_overlap)
{
	stopifnot(length(n) == nrow(N_overlap))
	stopifnot(nrow(N_overlap) == ncol(N_overlap))
	N_overlap / sqrt(n %*% t(n))
}
explodecomputer/simulateGP documentation built on April 3, 2024, 2:38 a.m.