R/top_snps_pval_ld2_newformat.R

#'@title Rank SNPs by pval, prune for LD
#'@description Selects a set of top SNP based on p-value that are in minimal LD.
#'Performs the same function as top_snps_pval_ld but with different input format.
#'@param ld_files List of h5 files containing pairwise ld. One for each chromosome.
#'@param snp_info_rds RDS file with snp info
#'@param pvals Vector of pvalues (optional)
#'@param snps List of snps
#'@param r2_thresh r^2 threshold for pruning
#'@param pval_thresh Maximum pvalue
#'@param cores Number of cores to use
#'@export
top_snps_pval_ld2_new <- function(ld_files, snp_info_rds,
                              snps, pval_cols=c(),
                              pval_thresh = Inf, r2_thresh=0.01,
                              cores=parallel::detectCores()-1){

  if(is.finite(pval_thresh) & length(pval_cols)==0){
    stop("pval_thresh is given but pvals is missing.\n")
  }
  p1 <- nrow(snps)
  stopifnot("SNP" %in% names(snps))
  if(length(pval_cols)==0){
    snps <- mutate(snps, p = sample(1:p1, size=p1, replace=FALSE)/p1)
    cat("No p-values are provided so SNPs will be ordered randomly.\n")
    pval_cols <- c("p")
    pval_thresh <- Inf
  }
  cat("You have provided information for ", p1, " snps.\n")
  k <- length(pval_cols)

  snp_info <- readRDS(snp_info_rds)

  snps <- snps %>% mutate(min_pval = apply(snps[, pval_cols], 1, min)) %>%
          filter(min_pval < pval_thresh)
  snp_info <- right_join(snp_info, snps, by="SNP")

  chrom_chunk <- snp_info %>% filter(!is.na(region_id)) %>%
                 group_by(region_id) %>%
                 summarize(chr = first(chr))  %>%
                 select(chr, region_id)
  cl <- makeCluster(cores, type="PSOCK")
  clusterExport(cl, varlist=c("chrom_chunk","pval_thresh", "pval_cols", "k",
                              "r2_thresh", "ld_files", "snp_info"), envir=environment())
  clusterEvalQ(cl, library(plyr))
  clusterEvalQ(cl, library(dplyr))
  keep <- parApply(cl, chrom_chunk, 1, FUN=function(y){
  #keep <- apply(chrom_chunk, 1, FUN=function(y){
                     my_snp_info <- filter(snp_info, region_id == as.numeric(y[2]))
                     ld <- EigenH5::read_df_h5(ld_files[as.numeric(y[1])], paste0("LD_DF/", y[2], "/LD")) %>%
                           filter(colSNP %in% my_snp_info$SNP & rowSNP %in% my_snp_info$SNP)
                     o_c <- select(my_snp_info, pval_cols) %>%
                            alply(., 2, function(x){
                                order(unlist(x), decreasing=FALSE)[seq(sum(x < pval_thresh))]
                            })
                     snps_c <- my_snp_info$SNP
                     keep_c <- llply(o_c, function(x){
                              k_c <- c()
                              while(length(x) > 0){
                                snp <- snps_c[x[1]]
                                k_c <- c(k_c, snp)
                                if(length(x) == 1){
                                  x <- c()
                                  next
                                }
                                myld <- subset(ld, rowSNP==snp | colSNP==snp)
                                if(any(myld$r2 > r2_thresh)){
                                  remove_snps <- subset(myld, r2 > r2_thresh) %>%
                                                 with(., unique( c(rowSNP,colSNP)))
                                  remove_ix <- which(snps_c %in% remove_snps)
                                }else{
                                  remove_ix <- x[1]
                                }
                                x <- subset(x, !x %in% remove_ix)
                              }
                              return(k_c)
                              })
                      return(keep_c)
                      })
  stopCluster(cl)
  if(k == 1){
    keep <- unlist(keep)
    return(keep)
  }
  keep_list <- list()
  for(i in seq(k)){
    keep_list[[i]] <- lapply(keep, function(x){x[[i]]}) %>% unlist()
  } 
  return(keep_list)
}
jean997/sherlockAsh documentation built on May 18, 2019, 11:45 p.m.