R/~old/dev/makePseudoPaths.R

Defines functions createPseudo

#' Script to create pseudo pathways from all pathways run through GSEA.
#' Used as negative control for long-range LD correlation analysis

#' @param hcInDir (char) path to directory with input files
#' 		(PLINK binary files for each high-confidence pathway)
#' @param allInDir (char) path to directory with input files
#' 		(PLINK binary files for all GSEA-run pathways)
#' @param pseudoOutDir (char) path to directory to store output files
#' 		(PLINK binary files for each random pseudo pathways)

createPseudo <- function(hcInDir, allInDir, pseudoOutDir) {

  # Read in list of SNPs per each high-confidence pathway
  cat("*Reading high-confidence pathway files...")
  hc_paths <- list.files(path=hcInDir, pattern="*.snps", full.names=T)
  hc_path_snps <- sapply(hc_paths, readLines)
  hc_path_size <- sapply(hc_path_snps, length) #n SNPs per pathway
  cat(" done.\n")
  print(hc_path_size)

  # Read in lists of all pathway SNPs as generated by getSNPlists.R
  # SNPs will be randomly selected from these pathways to create pseudo pathways
  # that reflect the size of each high-confidence pathway
  cat("*Reading all GSEA pathway files...")
  all_paths <- list.files(path=allInDir, pattern="*.snps", full.names=T)
  all_path_snps <- sapply(all_paths, readLines)
  cat(" done.\n")

  # Remove all high-confidence pathway SNPs from them
  # We want to control for our highly enriched SNP set when creating the
  # set of random pathways
  cat("*Removing all high-confidence pathway SNPs from all pathway files...")
  hc_snp_df <- sprintf("%s/snps_unique_high-confidence.txt", hcInDir)
  for (i in 1:length(all_paths)) {
    system(sprintf("grep -v -x -f %s %s > %s", hc_snp_df, all_paths[i],
            sprintf("%s_sans_hc.snps", file_path_sans_ext(all_paths[i]))))
  }
  cat(" done.\n")

  # Use bash to remove all empty files after removing SNPs, since the
  # high-confidence pathways now have no SNPs left
  system(sprintf("find %s -size 0 -print0 | xargs -0 rm", allInDir))

  all_paths_no_hc <- list.files(path=allInDir, pattern="*sans_hc.snps",
                                full.names=T)
  all_path_snps_no_hc <- sapply(all_paths_no_hc, readLines)

  no_snp_sample <- melt(all_path_snps_no_hc)
  no_snp_sample_unique <- no_snp_sample[unique(no_snp_sample$value), ]

  cat(sprintf("Random sampling pseudo pathways from %i SNPs.\n",
        nrow(no_snp_sample_unique)))

  # Create empty list of k character vectors (k = # of pseudo pathways to make)
  # Each pseudo pathway will have i SNPs (i = # of SNPs per hc pathway)
  pseudo <- rep(list(c()), length(hc_paths))
  no_paths <- 1:length(all_paths_no_hc) #number each pathway to sample from

  set.seed(1)
  for (k in 1:length(hc_paths)) {
    cat(sprintf("*Creating pseudo path %i\n", k))
    for (i in 1:hc_path_size[k]) {
      n <- sample(no_paths, 1)
      pseudo[[k]][i] <- sample(all_path_snps_no_hc[[n]], 1, replace=F)
      cat(sprintf("\tRandomly sampling snp from pathway #%i... \ttotal = %i\n",
                  n, i))
    }
    cat(" done.\n\n")
    write.table(pseudo[[k]],
                file=sprintf("%s/pseudo_pathway_%i.snps", pseudoOutDir, k),
                col=F, row=F, quote=F)
  }

  # Create PLINK files per pseudo pathway (for snpStats)
  pseudo_paths <- list.files(path=pseudoOutDir, pattern="*.snps", full.names=T)

  for (i in 1:length(pseudo_paths)) {
    str1 <- sprintf("%s --bed %s.bed --bim %s.bim --fam %s --extract %s",
                    PLINK, genoF, genoF, realFAM, pseudo_paths[i])
    str2 <- sprintf("--make-bed --allow-no-sex --out %s",
                    file_path_sans_ext(pseudo_paths[i]))
    cmd <- sprintf("%s %s", str1, str2)
    system(cmd)
  }

  # Concatenate SNP files into master file
  cat("*Combining all SNPs into master SNP file...")
  system(sprintf("cat %s/*.snps > %s/snps_all.txt", pseudoOutDir, pseudoOutDir))
  cat(sprintf(" file written to %s/snps_all.txt.\n", pseudoOutDir))

  cat("*Writing file with only unique SNPs...")
  snps <- read.table(sprintf("%s/snps_all.txt", pseudoOutDir), h=F, as.is=T)
  snps_unique <- as.data.frame(unique(snps))
  write.table(snps_unique,
              file=sprintf("%s/snps_uniques.txt", pseudoOutDir),
              col=F, row=F, quote=F)
  cat(sprintf(" file written to %s/snps_unique.txt.\n", pseudoOutDir))
}
BaderLab/POPPATHR documentation built on Dec. 17, 2021, 9:53 a.m.