R/LDprune.R

Defines functions LDprune

Documented in LDprune

#' LD prune a genotype matrix
#'
#' The function prunes SNPs that are in LD using PLINK
#'
#' @param W matrix, dosage of SNPs
#' @param snpList character vector, vector of SNPs in W
#' @param snpLocs data frame, SNP locations in MatrixEQTL form
#' @param fileName character, name for PLINK intermediate files
#' @param windowSize integer, window size for PLINK pruning
#' @param numSNPShift integer, shifting window for PLINK pruning
#' @param ldThresh numeric, LD threshold for PLINK pruning
#'
#' @return list of pruned genotype matrix, vector of SNP names, and location data frame
#'
#' @importFrom data.table fread
#' @importFrom data.table fwrite
#'
#' @export
LDprune <- function(W,
                    snpList,
                    snpLocs,
                    fileName,
                    windowSize,
                    numSNPShift,
                    ldThresh,
                    verbose = T,
                    snpAnnot = NULL){

  if (!dir.exists('temp/')){ dir.create('temp/') }
  genfile = paste0('temp/',fileName,'.gen')
  samplefile = paste0('temp/',fileName,'.sample')
  bedfile = paste0('temp/',fileName)
  outFile = paste0('temp/',fileName)

  df = as.data.frame(matrix(nrow = 1,ncol =2))
  ids = colnames(W)
  if (is.null(rownames(W))){
    rownames(W) = snpList
  }
  geno = as.data.frame(matrix(ncol=ncol(W)+4,nrow = nrow(W)))
  colnames(geno) <- c('SNP','Pos','A1','A2',ids)
  W = W[match(snpList,rownames(W)),]
  geno[,5:ncol(geno)] = W
  geno$SNP = snpList
  onlyThese <- thisSNP[thisSNP$snpid %in% geno$SNP,]
  onlyThese = onlyThese[match(rownames(W),onlyThese$snpid),]
  geno <- geno[geno$SNP %in% snpLocs$snpid,]
  onlyThese <- onlyThese[match(onlyThese$snpid,geno$SNP),]
  geno = geno[!duplicated(geno$SNP),]
  onlyThese = onlyThese[!is.na(onlyThese$snpid),]
  onlyThese = onlyThese[!duplicated(onlyThese$snpid),]
  onlyThese = subset(onlyThese,snpid %in% geno$SNP)
  geno = subset(geno,SNP %in% onlyThese$snpid)
  chr <- onlyThese$chr
  geno$Pos <- onlyThese$pos
  if (is.null(snpAnnot)){
    geno$A1 <- unlist(lapply(strsplit(geno$SNP,':'),function(x) as.character(x[3])))
    geno$A2 <- unlist(lapply(strsplit(geno$SNP,':'),function(x) as.character(x[4])))}
  if (!is.null(snpAnnot)){
    snpA = subset(snpAnnot,SNP %in% geno$SNP)
    snpA = snpA[order(snpA$SNP),]
    geno$A1 = snpA$REF
    geno$A2 = snpA$ALT
  }
  chr_dosage <- cbind(chr,geno)
  rm(geno)

  new.levels <- c('1 0 0','0 1 0','0 0 1')
  matrix.alleles <- round(as.matrix(chr_dosage[,6:ncol(chr_dosage)] + 1))
  impute2.format <- matrix(new.levels[matrix.alleles],ncol=ncol(matrix.alleles))
  gen <- cbind(chr_dosage[,1:5],impute2.format)
  gen[is.na(gen)] <- '<NA>'

  require(data.table)
  data.table::fwrite(gen,genfile,row.names=FALSE,
                     col.names = FALSE, quote = FALSE, sep='\t')

  sample <- as.data.frame(matrix(nrow=(ncol(chr_dosage)-4),ncol=5))
  colnames(sample) <- c('ID_1','ID_2','missing','gender','pheno')
  sample$ID_1[2:nrow(sample)] <- paste('1A',2:nrow(sample),sep='')
  sample$ID_2[2:nrow(sample)] <- colnames(chr_dosage)[6:ncol(chr_dosage)]
  sample$missing[2:nrow(sample)] <- 0
  sample$gender[2:nrow(sample)] <- 0
  sample$pheno[2:nrow(sample)] <- rnorm(nrow(sample)-1)
  sample[1,] <- c(0,0,0,'D','P')
  write.table(sample,samplefile,row.names=FALSE,
              col.names = TRUE, quote = FALSE)

  if (!verbose) {sink('/dev/null')}
  system(paste('plink',
               '--gen',genfile,
               '--sample',samplefile,
               '--make-bed',
               '--allow-no-sex',
               '--out',bedfile),
         intern = !verbose)


  system(paste('plink','--bfile',bedfile,
               '--indep-pairwise',windowSize,numSNPShift,ldThresh,
               '--out',outFile),
         intern = !verbose)

  if (!verbose) {sink()}

  file.remove(genfile,samplefile)

  s <- as.character(data.table::fread(paste0(outFile,
                                             '.prune.in'),
                                      header=F)$V1)

  if (nrow(W) == length(snpList)){
    W = t(W)
  }
  W <- W[,which(snpList %in% s)]
  q <- snpList[snpList %in% s]
  snpList <- q[match(s,q)]
  onlyThese <- subset(onlyThese,snpid %in% snpList)
  snpList = snpList[snpList %in% onlyThese$snpid]

  return(list(W = W,
              snpList = snpList,
              onlyThese = onlyThese))


}
bhattacharya-a-bt/MOSTWAS documentation built on April 6, 2023, 12:20 a.m.