#' Script to create pseudo pathways from all pathways run through GSEA.
#' Used as negative control for long-range LD correlation analysis
#' @param hcInDir (char) path to directory with input files
#' (PLINK binary files for each high-confidence pathway)
#' @param allInDir (char) path to directory with input files
#' (PLINK binary files for all GSEA-run pathways)
#' @param pseudoOutDir (char) path to directory to store output files
#' (PLINK binary files for each random pseudo pathways)
createPseudo <- function(hcInDir, allInDir, pseudoOutDir) {
# Read in list of SNPs per each high-confidence pathway
cat("*Reading high-confidence pathway files...")
hc_paths <- list.files(path=hcInDir, pattern="*.snps", full.names=T)
hc_path_snps <- sapply(hc_paths, readLines)
hc_path_size <- sapply(hc_path_snps, length) #n SNPs per pathway
cat(" done.\n")
print(hc_path_size)
# Read in lists of all pathway SNPs as generated by getSNPlists.R
# SNPs will be randomly selected from these pathways to create pseudo pathways
# that reflect the size of each high-confidence pathway
cat("*Reading all GSEA pathway files...")
all_paths <- list.files(path=allInDir, pattern="*.snps", full.names=T)
all_path_snps <- sapply(all_paths, readLines)
cat(" done.\n")
# Remove all high-confidence pathway SNPs from them
# We want to control for our highly enriched SNP set when creating the
# set of random pathways
cat("*Removing all high-confidence pathway SNPs from all pathway files...")
hc_snp_df <- sprintf("%s/snps_unique_high-confidence.txt", hcInDir)
for (i in 1:length(all_paths)) {
system(sprintf("grep -v -x -f %s %s > %s", hc_snp_df, all_paths[i],
sprintf("%s_sans_hc.snps", file_path_sans_ext(all_paths[i]))))
}
cat(" done.\n")
# Use bash to remove all empty files after removing SNPs, since the
# high-confidence pathways now have no SNPs left
system(sprintf("find %s -size 0 -print0 | xargs -0 rm", allInDir))
all_paths_no_hc <- list.files(path=allInDir, pattern="*sans_hc.snps",
full.names=T)
all_path_snps_no_hc <- sapply(all_paths_no_hc, readLines)
no_snp_sample <- melt(all_path_snps_no_hc)
no_snp_sample_unique <- no_snp_sample[unique(no_snp_sample$value), ]
cat(sprintf("Random sampling pseudo pathways from %i SNPs.\n",
nrow(no_snp_sample_unique)))
# Create empty list of k character vectors (k = # of pseudo pathways to make)
# Each pseudo pathway will have i SNPs (i = # of SNPs per hc pathway)
pseudo <- rep(list(c()), length(hc_paths))
no_paths <- 1:length(all_paths_no_hc) #number each pathway to sample from
set.seed(1)
for (k in 1:length(hc_paths)) {
cat(sprintf("*Creating pseudo path %i\n", k))
for (i in 1:hc_path_size[k]) {
n <- sample(no_paths, 1)
pseudo[[k]][i] <- sample(all_path_snps_no_hc[[n]], 1, replace=F)
cat(sprintf("\tRandomly sampling snp from pathway #%i... \ttotal = %i\n",
n, i))
}
cat(" done.\n\n")
write.table(pseudo[[k]],
file=sprintf("%s/pseudo_pathway_%i.snps", pseudoOutDir, k),
col=F, row=F, quote=F)
}
# Create PLINK files per pseudo pathway (for snpStats)
pseudo_paths <- list.files(path=pseudoOutDir, pattern="*.snps", full.names=T)
for (i in 1:length(pseudo_paths)) {
str1 <- sprintf("%s --bed %s.bed --bim %s.bim --fam %s --extract %s",
PLINK, genoF, genoF, realFAM, pseudo_paths[i])
str2 <- sprintf("--make-bed --allow-no-sex --out %s",
file_path_sans_ext(pseudo_paths[i]))
cmd <- sprintf("%s %s", str1, str2)
system(cmd)
}
# Concatenate SNP files into master file
cat("*Combining all SNPs into master SNP file...")
system(sprintf("cat %s/*.snps > %s/snps_all.txt", pseudoOutDir, pseudoOutDir))
cat(sprintf(" file written to %s/snps_all.txt.\n", pseudoOutDir))
cat("*Writing file with only unique SNPs...")
snps <- read.table(sprintf("%s/snps_all.txt", pseudoOutDir), h=F, as.is=T)
snps_unique <- as.data.frame(unique(snps))
write.table(snps_unique,
file=sprintf("%s/snps_uniques.txt", pseudoOutDir),
col=F, row=F, quote=F)
cat(sprintf(" file written to %s/snps_unique.txt.\n", pseudoOutDir))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.