R/top_snps_pval_ld2.R

#'@title Rank SNPs by pval, prune for LD
#'@description Selects a set of top SNP based on p-value that are in minimal LD.
#'Performs the same function as top_snps_pval_ld but with different input format.
#'@param pairwise_ld Should have four columns (rowsnp, colsnp r^2, chunk)
#'@param ld_chunk_files List of RDS files containing pairwise ld. Provide either pairwise_ld or ld_chunk_files.
#'@param pvals Vector of pvalues (optional)
#'@param snp_list List of snps
#'@param ld_chunks chunk number for each SNP.
#'If chunk has missing values, those SNPs are assumed not in LD with any other SNPs.
#'@param pval_thresh Maximum pvalue
#'@param max_snps Maximum number of SNPs to retain
#'@param ld_thresh Maximum pairwise correlation between SNPs
#'@export
top_snps_pval_ld2 <- function(pairwise_ld, ld_chunk_files,
                              pvals, snp_list, ld_chunks,
                              pval_thresh = Inf, r2_thresh=0.01,
                              cores=parallel::detectCores()-1){
  if(missing(pairwise_ld) & missing(ld_chunk_files)) stop("Please provide either pairwise_ld or ld_chunk_files\n")
  if(!missing(pairwise_ld)){
    names(pairwise_ld) <- c("rowsnp", "colsnp", "r2", "ld_chunk")
    pwld <- T
   }else{
     pwld <- F
   }

  if(is.finite(pval_thresh) & missing(pvals)){
    stop("pval_thresh is given but pvals is missing.\n")
  }
  p <- length(snp_list)
  cat("You have provided information for ", p, " snps.\n")
  stopifnot(length(ld_chunks) == p)
  #dat <- data.frame("snp"=snp_list, "ld_chunk"=ld_chunks)
  if(missing(pvals)){
    o <- sample(1:p, size=p, replace=FALSE)
    cat("No p-values are provided so SNPs will be ordered randomly.\n")
  }else{
    stopifnot(length(pvals)==p)
    o <- order(pvals, decreasing=F)
    ix <- sum(pvals < pval_thresh)
    cat(ix, " SNPs pass p-value threshold.\n")
    o <- o[1:ix]
    #dat$pval <- pvals
  }
  #dat$order <- o
  if(anyNA(ld_chunks)){
    nasnps <- subset(snp_list, is.na(ld_chunks))
    if(!missing(pvals)){
      napvals <- subset(pvals, is.na(ld_chunks))
      pvals <- subset(pvals, !is.na(ld_chunks))
      keep <- subset(nasnps, napvals < pval_thresh)
    }else{
      keep <- nasnps
    }
    snp_list <- subset(snp_list, !is.na(ld_chunks))
    ld_chunks <- subset(ld_chunks, !is.na(ld_chunks))
  }else{
    keep <- c()
  }
  chunks <- sort(unique(ld_chunks[o]))
  cl <- makeCluster(cores, type="FORK")
  keep_c <- parSapply(cl=cl, chunks, FUN=function(c){
    k <- c()
    cat("chunk ", c, "..")
    o_c <- subset(o, ld_chunks[o]==c)
    if(pwld){
      ld_c <- subset(pairwise_ld, ld_chunk == c)
    }else{
      ld_c <- readRDS(ld_chunk_files[c])
    }
    while(length(o_c) > 0){
      k <- c(k, snp_list[o_c[1]])
      snp <- snp_list[o_c[1]]
      myld <- subset(ld_c, rowsnp==snp | colsnp==snp)
      if(any(myld$r2 > r2_thresh)){
        remove_snps <- subset(myld, r2 > r2_thresh) %>%
                       with(., unique( c(rowsnp,colsnp)))
        remove_ix <- which(snp_list %in% remove_snps)
      }else{
        remove_ix <- o_c[1]
        remove_snps <- snp_list[o_c[1]]
      }
      ld_c <- subset(ld_c, !rowsnp %in% remove_snps & !colsnp %in% remove_snps)
      o_c <- subset(o_c, !o_c %in% remove_ix)
      cat(length(o_c), " ")
    }
    return(k)
  })
  stopCluster(cl)
  keep <- c(keep, unlist(keep_c))
  if(!missing(pvals)){
    keep_pvals <- pvals[snp_list %in% keep]
    o <- order(keep_pvals, decreasing=FALSE)
    keep <- keep[o]
  }
  return(keep)
}
jean997/sherlockAsh documentation built on May 18, 2019, 11:45 p.m.