#!/usr/bin/env Rscript
# RN Luben, UCL Institute of Ophthalomolgy, 2021
# Script to automate as much as possible the running of UKB variables against the Regenie pipeline including GCTA
# This script is designed to be general and should work on both a cluster/supercomputer like Myraid or on a single node machine like Oregano
# This version of the script generalises the pipeline further to non-UK Biobank studies. This is a coding branch which should eventually lead
# to one unified script that can be used for UKB or any other study. To do this, all UKB specific references will need to be found and generalised
# To do:
# - Work out how to create "Thin" bfile genetic sets. GCTA will only work if these are defined.
# - Stamp process ID on additional filenames and directories. Perhaps add it to the project readme directory.
###################
# Myriad #
###################
# Prior to running this script on Myriad, load/unload the following modules. These commands can be put into .bashrc to run on login
# module unload gcc-libs/4.9.2
# module unload compilers/intel/2018/update3
# module unload mpi/intel/2018/update3/intel
# module load r/new
suppressPackageStartupMessages({
library(tidyverse)
library(haven)
library(stringr)
library(readr)
library(glue)
library(janitor)
library(future.batchtools)
library(future)
library(listenv)
library(fs)
})
# Function that will display a message both to stdout and to a log file
CatTee <- function(Message,MessageFile=MessageLogfile,Screen=TRUE) {
if(Screen) {cat(Message)}
cat(Message, file=MessageFile, append=TRUE)
}
# Messages
Messages <- c()
Messages[0] <- "[0] Preliminaries .. "
Messages[1] <- "[1] Check that we have been given the necessary information .. "
Messages[2] <- "[2] Check for a project directory (the 'project') variable. Create it if it does not exist; check it is empty .. "
Messages[3] <- "[3] Create the standard subdirectory structure .. "
Messages[4] <- "[4] Standard file locations some assigned according to host .. "
Messages[5] <- "[5] Read in datasets from external files as necessary .. "
Messages[6] <- "[6] Join datasets to main dataset. Common variables in the extra data override those in the main phenotype dataset .. "
Messages[7] <- "[7] Filter participant withdrawals .. "
Messages[8] <- "[8] Filter the dataset, removing any subject with missing main exposure .. "
Messages[9] <- "[9] Create main exposure \"MainExpo.txt\" and covariate \"Covar.txt\" text files .. "
Messages[10] <- "[10] Write some keep files, passed to Regenie to force it to restrict to certain individuals .. "
Messages[11] <- "[11] Examine exposure and decide if it is necessary to subset the directly called data"
Messages[12] <- "[12] Grid related settings for Regenie .. "
Messages[13] <- "[13] Define Regenie templates .. "
Messages[14] <- "[14] Submit step1 job .. "
Messages[15] <- "[15] Waiting for Regenie Step1 to complete .. "
Messages[16] <- "[16] Submit step2 jobs for chromosomes 1 to 22 .. "
Messages[17] <- "[17] GCTA COJO template and other preparation .. "
Messages[18] <- "[18] GCTA COJO script generation .. "
Messages[19] <- "[19] Run GCTA COJO jobs for each chromosome .. "
Messages[50] <- "\xE2\x9C\x94\n"
Messages[51] <- "\nNOTE: Use \"RegeniePipeline --help\" to show help on using the script\n"
Messages[52] <- "\nNOTE: Arguments in operation:\n"
Messages[53] <- "\nNOTE: End of Regenie section\n\n"
Messages[54] <- "\nNOTE: Found process ID from earlier run: "
Messages[55] <- "\nNOTE: Finished GCTA pipeline\n"
Tick <- 50
# Initialisation section
ArgList <- R.utils::commandArgs(asValues = TRUE)
exit <- function() { invokeRestart("abort") }
make_names_unique <- function(x) { make.names(x,unique=TRUE) }
Usage <- "\nUsage: RegeniePipeline [OPTION]...\n
Automate, as much as possible, the running of study variables against the Regenie pipeline\n
Options can either be provided on the command line or using environment variables. Command line options
have presidence over environment options.
Non-mandatory options are shown with square brackets
--project=DIR The location for files created during this pipeline. If DIR exists
then it must be empty. If DIR does not exist, it will be created
--exposure=VARIABLE The main study exposure to be used by the pipeline
--runsection=[regenie|gcta|step1|step2] Only run a particular section. Options are regenie,step2,gcta. Defaults to everything.
--Resume=[true|false] To resume an existing project, set as true. Existing files will not be overwritten. Default is false
--BinaryTrait=[true|false] If the main exposures are binary and coded 0=control,1=case,NA=missing then set to true. Default is false
The covariates eg PCAs will always be considered as continuous
--PThresh=[NUMBER] Threshhold above which p-values are discarded. Defaults to 0.05
--PhenoStudyFile=STATA DATASET File containing study phenotypes - outcomes and exposures. Must have \"StudyID\" as unique identifier
--PhenoStudyDescFile=STATA DATASET File containing descriptions of study phenotypes. Must have columns \"name\" and \"description\"
--PhenoExtraDataFile=[STATA DATASET] Optional additional dataset, in Stata .dta format, which will be linked
to the default dataset. Variables in this dataset will taken presendence
over those with the same name in the default dataset. Must have \"StudyID\" as unique identifier
--NonEuroFile=TEXT FILE File containing a list of study non-Europeans. Tab delimited with header
--StudyBFile=TEXT FILE Prefix for directly called genotype files in bed/bim/fam format
--StudyBGenFile=TEXT FILE Filename for imputed genotype files in \"bgen\" format. Use {chr} (chr in braces) in place of chr1, chr2 etc.
--StudySampleFile=TEXT FILE Filename for imputed genotype \"sample\" file for use with corresponding bgen files
--ThinDir=DIR Location of imputed GWAS files in bfile format for GCTA created using a subset of individuals (hence thin)
and excluding non-Europeans subject and SNPs failing QC
--NonImputedFile=TEXT FILE List of study participants with directly called GWAS but no imputed GWAS. Space delimited without header
--OptOutFile=[TEXT FILE] File of study participants who have chosen to opt out and do not wish their data to be used
--CatCovars=STRING Comma separated string containing variable names of the categorical covariates to be used
--ContCovars=STRING Comma separated string containing variable names of the continuous covariates to be used, including PCAs
--QCPassSNPFile=TEXT FILE File containing list of SNPs/variants that have passed QC
--QCPassFile=TEXT FILE File containing list of study participants who passed QC. Tab delimited without header
--StudyID=STRING String containing the name of the study ID which uniquely identifies individuals
--StudyName=STRING String containing the name of the study - used in output filenames etc
--Regenie=FILE Location of the regenie executable
--Plink2=FILE Location of the plink2 executable
--GCTA=FILE Location of the GCTA executable
--help Show this help and exit
\n"
cat(Messages[0])
# Read arguments from environmental variables, first defining Args$project as a placeholder ina new tibble
ArgsEnv=tibble(help=FALSE)
ArgsEnv$project <-Sys.getenv("PIPELINE_project")
ArgsEnv$exposure <- Sys.getenv("PIPELINE_exposure")
ArgsEnv$runsection <- Sys.getenv("PIPELINE_runsection")
ArgsEnv$Resume <- Sys.getenv("PIPELINE_Resume")
ArgsEnv$BinaryTrait <- Sys.getenv("PIPELINE_BinaryTrait")
ArgsEnv$PThresh <- Sys.getenv("PIPELINE_PThresh")
ArgsEnv$PhenoStudyFile <- Sys.getenv("PIPELINE_PhenoStudyFile")
ArgsEnv$PhenoStudyDescFile <- Sys.getenv("PIPELINE_PhenoStudyDescFile")
ArgsEnv$PhenoExtraDataFile <- Sys.getenv("PIPELINE_PhenoExtraDataFile")
ArgsEnv$NonEuroFile <- Sys.getenv("PIPELINE_NonEuroFile")
ArgsEnv$StudyBFile <- Sys.getenv("PIPELINE_StudyBFile")
ArgsEnv$StudyBGenFile <- Sys.getenv("PIPELINE_StudyBGenFile")
ArgsEnv$StudySampleFile <- Sys.getenv("PIPELINE_StudySampleFile")
ArgsEnv$ThinDir <- Sys.getenv("PIPELINE_ThinDir")
ArgsEnv$NonImputedFile <- Sys.getenv("PIPELINE_NonImputedFile")
ArgsEnv$OptOutFile <- Sys.getenv("PIPELINE_OptOutFile")
ArgsEnv$CatCovars <- Sys.getenv("PIPELINE_CatCovars")
ArgsEnv$ContCovars <- Sys.getenv("PIPELINE_ContCovars")
ArgsEnv$QCPassSNPFile <- Sys.getenv("PIPELINE_QCPassSNPFile")
ArgsEnv$QCPassFile <- Sys.getenv("PIPELINE_QCPassFile")
ArgsEnv$StudyID <- Sys.getenv("PIPELINE_StudyID")
ArgsEnv$StudyName <- Sys.getenv("PIPELINE_StudyName")
ArgsEnv$Regenie <- Sys.getenv("PIPELINE_Regenie")
ArgsEnv$Plink2 <- Sys.getenv("PIPELINE_Plink2")
ArgsEnv$GCTA <- Sys.getenv("PIPELINE_GCTA")
# Read arguments from the command line
ArgsPos <- match("args",names(ArgList))+1
ArgLen <- length(names(ArgList))
if(is.na(ArgsPos)) {
ArgsCL <- tibble(help=FALSE)
} else {
ArgsCL <- ArgList %>% as_tibble(.name_repair=make_names_unique) %>% select(all_of(ArgsPos):all_of(ArgLen))
}
Args <- bind_cols(ArgsCL,ArgsEnv %>% select(all_of(setdiff(names(ArgsEnv),names(ArgsCL)))))
# Args <- Args %>% mutate(project=ifelse(ArgsCL$project=="Default" && ArgsEnv$project!="",ArgsEnv$project,ArgsCL$project))
# Find version number, assuming it appears in the calling script as the last 3 characters. Readlink is called assuming RegeniePipeline is a symlink.
CallingScript <- commandArgs()[grepl("--file",commandArgs())]
CallingScript <- Sys.readlink(substr(CallingScript,8,nchar(CallingScript)))
Args$version <- ifelse(length(CallingScript)>0,substr(CallingScript,nchar(CallingScript)-2,nchar(CallingScript)),"Unknown")
# Assign a process ID
if(tolower(Args$Resume) != "true") {
ProcessID <- Sys.getpid()
} else {
ProcessID <- unlist(str_split(Sys.glob(paste0(Args$project,"/Logs/RegeniePipeline_*.log")),"[_.]"))
ProcessID <- ProcessID[length(ProcessID)-1]
cat(Messages[54],ProcessID,"\n")
}
Args$ProcessID <- ProcessID
# Show all defined arguments
cat(Messages[51])
cat(Messages[52])
Args %>% select(-help) %>% glimpse
# Args %>% glimpse
# Show usage and exit. An improvement would be to pick up some args from the old log when Resume=true
if(Args$help==TRUE || Args$project=="" || Args$exposure=="" || Args$PhenoStudyFile=="" || Args$PhenoStudyDescFile=="") {
if(Args$project=="") {cat("ERROR: You must provide a project location\n")}
if(Args$exposure=="") {cat("ERROR: You must provide an exposure\n")}
if(Args$PhenoStudyFile=="") {cat("ERROR: You must provide a file containing phenotypes and covariates\n")}
if(Args$PhenoStudyDescFile=="") {cat("ERROR: You must provide a file that describes your phenotype file\n")}
cat(Usage)
exit()
}
# Ensure there is a trailing slash on the path names
Args$project <- paste0(Args$project,"/")
Args$ThinDir <- paste0(Args$ThinDir,"/")
# Define a message logfile
MessageLogfile <- paste0(Args$project,"Logs/RegeniePipeline_",Args$ProcessID,".log")
cat(Messages[Tick])
cat(Messages[1])
# Convert comma separated strings to vectors
CatCovars <- unlist(strsplit(Args$CatCovars, split=","))
ContCovars <- unlist(strsplit(Args$ContCovars, split=","))
# CatCovars <- eval(parse(text=Args$CatCovars))
# PCA <- unlist(strsplit(Args$PCA, split=","))
# PCA <- eval(parse(text=Args$PCA))
StudyID <- Args$StudyID
# Other derived variables
StudyBFileFam <- paste0(Args$StudyBFile,".fam")
cat(Messages[Tick])
cat(Messages[2])
dir.create(file.path(Args$project),showWarnings = FALSE)
if(length(list.files(file.path(Args$project)))>0 && tolower(Args$Resume) !="true") {
cat("\nERROR: Project directory is not empty.\n\n")
print(Args$project)
exit()
}
setwd(file.path(Args$project))
cat(Messages[Tick])
cat(Messages[3])
dir.create(file.path(Args$project,"Data"),showWarnings = FALSE)
dir.create(file.path(Args$project,"Links"),showWarnings = FALSE)
dir.create(file.path(Args$project,"Logs"),showWarnings = FALSE)
dir.create(file.path(Args$project,"Output"),showWarnings = FALSE)
dir.create(file.path(Args$project,"R"),showWarnings = FALSE)
dir.create(file.path(Args$project,"Readme"),showWarnings = FALSE)
dir.create(file.path(Args$project,"Scripts"),showWarnings = FALSE)
dir.create(file.path(Args$project,"Templates"),showWarnings = FALSE)
dir.create(file.path(Args$project,"GCTA"),showWarnings = FALSE)
cat(Messages[Tick])
invisible(file.create(MessageLogfile))
cat(Messages[52], file=MessageLogfile, append=TRUE)
cat(capture.output(Args %>% select(-help) %>% glimpse), sep = "\n", file=MessageLogfile, append=TRUE)
CatTee(c(Messages[1],Messages[Tick],Messages[2],Messages[Tick],Messages[3],Messages[Tick]), Screen=FALSE)
CatTee(Messages[4])
Host <- NA_character_
Host <- ifelse(grepl('oregano',Sys.info()[["nodename"]] ),'oregano',Host)
Host <- ifelse(grepl('myriad',Sys.info()[["nodename"]] ),'myriad',Host)
BasicVars <- c(StudyID,CatCovars,ContCovars)
StandardVars <- c(BasicVars,Args$exposure)
BaseDir <- Args$project
DataDir <- paste0(Args$project,"/Data/")
LogDir <- paste0(Args$project,"/Logs/")
ScriptDir <- paste0(Args$project,"/Scripts/")
OutputDir <- paste0(Args$project,"/Output/")
GCTADir <- paste0(Args$project,"/GCTA/")
BinaryTrait <- ifelse(tolower(Args$BinaryTrait)=="true","--bt","")
PlinkBT <- ifelse(tolower(Args$BinaryTrait)=="true","--1","")
PThresh <- ifelse(Args$PThresh!="" && !is.null(Args$PThresh),paste0("--pThresh=",Args$PThresh),"--pThresh=0.05")
Args$runsection <- tolower(Args$runsection)
# Generic Grid related settings
Wallclock = 12
Memory = 10
numThreadsStep1 <- ifelse(Host=="myriad",4,30)
numThreadsFixed = 4
numThreadsStep2Graded <- c(5,5,4,4,3,3,3,3,2,3,3,2,2,2,2,2,2,2,1,1,1,1)
cat(Messages[Tick])
if(Args$runsection %in% c("regenie","step1","step2","")) {
cat(Messages[5])
# Read the description file
Description <- haven::read_dta(Args$PhenoStudyDescFile)
# Does the description contain FID and IID. The style of double identifier used in older plink formats is not ideal. Generally
# individual ID (IID) should be used as identifer, not FID which is an optional family ID somestimes coded 0.
# If FID and IID are in the description file, assume they are needed throughout
FIDIID <- intersect(Description %>% select(name) %>% pull,c("FID","IID"))
# Add FID and IID to the variables if they exist
BasicVars <- c(FIDIID,BasicVars)
StandardVars <- c(FIDIID,StandardVars)
# Is the exposure in the description file?
ExpCheck <- Description %>% filter(name==all_of(Args$exposure)) %>% count %>% pull %>% as.logical
if(ExpCheck) {
# Define Vars as the study ID and the continuous and categorical covariates
Vars <- haven::read_dta(Args$PhenoStudyFile, col_select=all_of(StandardVars))
} else {
# Define Vars as the study ID and the continuous and categorical covariates plus the exposure variable
Vars <- haven::read_dta(Args$PhenoStudyFile, col_select=all_of(BasicVars))
}
# UKB specific command to alter the variable names of the PCAs to PC1,PC2 etc. Rather than hardcoded this rename,
# it would be better to find another way eg. incorporate it into the calling environment.
# Vars <- Vars %>% rename_with(str_replace, pattern = "n_22009_0_", replacement = "PC", matches("n_22009_0_"))
NonEuro <- readr::read_tsv(Args$NonEuroFile, col_names=TRUE,show_col_types = FALSE) %>% mutate(!!StudyID := FID)
# AllStudyGeno <- read_delim(StudyBFileFam,col_names=FALSE,delim=" ") %>% rename(FID=X1,IID=X2,Father=X3,Mother=X4,Sex=X5,Pheno=X6)
# These files DO contain double IDs FID and IID variables. However, StudyID (set to IID) is also added for later joins
qcpass <- readr::read_tsv(Args$QCPassFile,col_names=FALSE,show_col_types = FALSE) %>% rename(FID=X1,IID=X2) %>% mutate(!!StudyID := FID)
NotImputed <- readr::read_tsv(Args$NonImputedFile,col_names=FALSE,show_col_types = FALSE) %>% rename(FID=X1,IID=X2) %>% mutate(!!StudyID := FID)
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","step2","")) {
cat(Messages[6])
# Merge the additional phenotype dataset, prioritising its variables over the main pheno file
if(!is.null(Args$PhenoExtraDataFile)) {
if(Args$PhenoExtraDataFile!="") {
ProvidedData <- haven::read_dta(file.path(Args$PhenoExtraDataFile))
Vars <- Vars %>% select(setdiff(names(Vars),names(ProvidedData)),all_of(StudyID))
Vars <- left_join(Vars,ProvidedData,by=StudyID)
}
}
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","step2","")) {
cat(Messages[7])
if(!is.null(Args$OptOutFile)) {
if(Args$OptOutFile!="") {
OptOut <- readr::read_tsv(Args$OptOutFile, col_names=FALSE,show_col_types = FALSE) %>% rename(!!StudyID := X1)
Vars <- anti_join(Vars,OptOut,by=StudyID)
}
}
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","step2","")) {
cat(Messages[8])
MainExpo <- Vars %>% select(all_of(StudyID),all_of(FIDIID),all_of(Args$exposure)) %>% na.omit()
Covars <- Vars %>% select(all_of(StudyID),all_of(FIDIID),all_of(CatCovars),all_of(ContCovars)) %>% na.omit()
cat(Messages[Tick])
cat(Messages[9])
MainExpoFile <- paste0(DataDir,"MainExpo.txt")
CovarFile <- paste0(DataDir,"Covar.txt")
MainExpo %>%
mutate( FID = if("FID" %in% colnames(.)) FID else !!as.name(StudyID)) %>%
mutate( IID = if("IID" %in% colnames(.)) IID else !!as.name(StudyID)) %>%
select(FID,IID,all_of(Args$exposure)) %>%
write_delim(MainExpoFile, col_names=TRUE,delim=" ")
Covars %>%
mutate( FID = if("FID" %in% colnames(.)) FID else !!as.name(StudyID)) %>%
mutate( IID = if("IID" %in% colnames(.)) IID else !!as.name(StudyID)) %>%
select(FID,IID,all_of(CatCovars),all_of(ContCovars)) %>%
write_delim(CovarFile, col_names=TRUE,delim=" ")
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","step2","")) {
cat(Messages[10])
# Notice the keep file (and other files) do not have headers. The keep file must be in FID,IID format
# when written out but minus the header.
Keep <- MainExpo %>% select(all_of(StudyID),all_of(FIDIID))
Keep <- anti_join(Keep,NonEuro,by=StudyID)
Keep <- anti_join(Keep,NotImputed,by=StudyID) %>% inner_join(qcpass,by=StudyID,suffix = c("", ".y")) %>% select(FID,IID)
KeepFile <- paste0(DataDir,Args$exposure,"_keep.txt")
write_tsv(Keep, KeepFile, col_names=FALSE)
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","")) {
cat(Messages[11])
# NOTE that an existing .loco file can be used instead of rerunning Regenie Step 1 if it has previously
# been created for a cohort on exactly the same individuals. **NEED AN ARGUMENT SO LOCO FILE CAN BE PROVIDED**
# If n for the exposure is much smaller than the study
# total (eg an eye measure such as iopcc3) , then it is worth creating a subset of the called data using Plink. Since Regenie
# imputes missing values, such imputation is very time consuming when there are substantial missings in the exposure.
# Calculate and check proportion of study participants with exposure as a percentage of all those with imputed data
# Run plink, if necessary, limiting to those with exposure
# Note the use of the R version of here docs together with stringr::str_glue
# The plink command is currently run on the header node rather than being submitted as a job
# Note that plink uses all available threads minus 1, so the --threads parameter is probably not needed
# A bug corrected here relates to definition of cases and controls. Regenie expects 1=case, 0=Control, NA=Missing
# while plink likes 1=Control 2=Case and 0=Missing. However, Plink has an argument "-1" which changes the categories
# to Regenie style. This forces the phenotype to be case/control so can only be used when the BinaryTrait argument is set.
# This is done using the PlinkBT variable
ExpoProportion <- MainExpo %>% count %>% pull / Vars %>% count %>% pull
if(ExpoProportion < 0.9) {
cat(" .. proportion < 0.9 NEED TO run plink .. ")
PlinkCommand_Step11 <- r"(
{Args$Plink2} \
--bfile {Args$StudyBFile} \
--pheno {DataDir}/MainExpo.txt \
--require-pheno {Args$exposure} \
--make-bed {PlinkBT} \
--out {DataDir}/{Args$StudyName}_{Args$exposure}_allChrs
)"
PlinkCommand <- stringr::str_glue(PlinkCommand_Step11)
PlinkCommandFile <- paste0(ScriptDir,"/Plink_",ProcessID,".sh")
PlinkCommandLog <- paste0(LogDir,"/Plink_",ProcessID,".log")
readr::write_lines(PlinkCommand,PlinkCommandFile)
# If necessary, this will also need to be made into a job
sys::exec_wait("bash",PlinkCommandFile,std_out=PlinkCommandLog)
StudyBFile <- glue("{DataDir}/{Args$StudyName}_{Args$exposure}_allChrs")
} else {
StudyBFile <- Args$StudyBFile
cat(" .. proportion >= 0.9 NOT running plink .. ")
}
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","step2","")) {
cat(Messages[12])
JobName <- "RegenieRun"
LogFile <- paste0(LogDir,"RegenieRun.log")
GridLogFile <- paste0(LogDir,"GridLog_",ProcessID,".log")
SourceLogFile <- paste0(LogDir,"SourceLog_",ProcessID,".log")
# Template file. Note that Rscript can be used in place of bash as in the following:
## "Rscript -e \'batchtools::doJobCollection(\"<%= uri %>\")\'",
TemplateFile <- c(
"#!/bin/bash",
"#$ -N <%= JobName %>",
"#$ -j y",
"#$ -o <%= GridLogFile %>",
"#$ -cwd",
"#$ -V",
glue("#$ -l h_rt={Wallclock}:0:0"),
glue("#$ -l mem={Memory}G"),
glue("#$ -pe smp {numThreadsFixed}"),
"bash \'batchtools::doJobCollection(\"<%= uri %>\")\'",
"exit 0"
)
TFileName <- paste0(BaseDir,"Templates/batchtools.sge.tmpl")
TemplateFile %>% write_lines(TFileName)
# Note that batchtools_sge is only relevant to Myriad which uses Sun Grid Engine. For Oregano we need batchtools_local or batchtools_multicore
if (Host=="myriad") {
plan(batchtools_sge,template = TFileName)
} else if(Host=="oregano") {
plan(batchtools_multicore)
} else {
cat("ERROR: Invalid host. Available hosts are: myriad oregano\n")
exit()
}
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","step2","")) {
cat(Messages[13])
RegenieStep1Template <- r"(
cd {OutputDir}
{Args$Regenie} \
--step 1 \
--bed {StudyBFile} \
--extract {Args$QCPassSNPFile} \
--keep {Args$QCPassFile} \
--phenoFile {MainExpoFile} \
--phenoCol {Args$exposure} \
--covarFile {CovarFile} \
--covarColList {Args$CatCovars},{Args$ContCovars} \
--catCovarList {Args$CatCovars} \
--firth --approx \
--bsize 1000 \
--lowmem {BinaryTrait} \
--lowmem-prefix {DataDir}/Regenie_Step1_Lowmem.txt \
--out {Args$StudyName}_{Args$exposure}_step1_BT \
--threads {numThreadsStep1}
)"
# Note that the --split argument is no longer valid syntax in Regenie version 2.2.4
RegenieStep2Template <- r"(
cd {OutputDir}
{Args$Regenie} \
--step 2 \
--bgen {Args$StudyBGenFile} \
--sample {Args$StudySampleFile} \
--phenoFile {MainExpoFile} \
--phenoCol {Args$exposure} \
--phenoColList {Args$exposure} \
--covarFile {CovarFile} \
--covarColList {Args$CatCovars},{Args$ContCovars} \
--catCovarList {Args$CatCovars} \
--keep {KeepFile} \
--pred {Args$StudyName}_{Args$exposure}_step1_BT_pred.list \
--firth {BinaryTrait} --approx {PThresh} \
--bsize 100 \
--out {Args$StudyName}_{Args$exposure}_step2_BT_{chr} \
--threads {numThreadsStep2}
)"
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","")) {
cat(Messages[14])
GridfileStep1 <- stringr::str_glue(RegenieStep1Template) %>% str_trim %>% gsub("\\ +", " ",.)
GridFileStep1Name <- paste0(ScriptDir,"GridFileStep1_",ProcessID,".sh")
GridFileStep1Log <- paste0(ScriptDir,"GridFileStep1_",ProcessID,".log")
readr::write_lines(GridfileStep1,GridFileStep1Name)
Sys.chmod(GridFileStep1Name, "777", use_umask = FALSE)
# Previously step1 was just run (on the header node or wherever) but since Myriad has tightened its policy it is now
# submitted as a job
##sys::exec_wait("bash",GridFileStep1Name,std_out=GridFileStep1Log)
if(Host=="myriad") {
RegenieRunStep1 %<-% {
sys::exec_wait(GridFileStep1Name,std_out=GridFileStep1Log)
}
} else if(Host=="oregano") {
RegenieRunStep1 %<-% {
sys::exec_wait(GridFileStep1Name,std_out=GridFileStep1Log)
}
}
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step1","")) {
cat(Messages[15])
JobRunning<- futureOf(RegenieRunStep1)
while (!resolved(JobRunning)) {
Sys.sleep(60)
cat("o")
}
cat(Messages[Tick])
}
if(Args$runsection %in% c("regenie","step2","")) {
cat(Messages[16])
# The double str_glue interpolates a variable in a variable eg {chr} in BGen filename
GridFileStep2Names <- c()
GridFileStep2Logs <- c()
for (chr in 1:22) {
numThreadsStep2 <- ifelse(Host=="myriad",numThreadsFixed,ifelse(Host=="oregano",numThreadsStep2Graded[chr],NA_character_))
GridfileStep2 <- str_glue(str_glue(RegenieStep2Template)) %>% str_trim %>% gsub("\\ +", " ",.)
GridFileStep2Name <- paste0(ScriptDir,"GridFileStep2_Chr",chr,"_",ProcessID,".sh")
GridFileStep2Names[chr] <- GridFileStep2Name
GridFileStep2Log <- paste0(ScriptDir,"GridFileStep2_Chr",chr,"_",ProcessID,".log")
GridFileStep2Logs[chr] <- GridFileStep2Log
readr::write_lines(GridfileStep2,GridFileStep2Name)
Sys.chmod(GridFileStep2Name, "777", use_umask = FALSE)
# f_RegenieToGrid(chr)
}
# This is crying out to be a loop!! But no obvious way to achieve this. Hence 22 individual lines (for the moment)
RegenieRunStep2Chr1 %<-% { sys::exec_wait(GridFileStep2Names[1],std_out=GridFileStep2Logs[1]) }
RegenieRunStep2Chr2 %<-% { sys::exec_wait(GridFileStep2Names[2],std_out=GridFileStep2Logs[2]) }
RegenieRunStep2Chr3 %<-% { sys::exec_wait(GridFileStep2Names[3],std_out=GridFileStep2Logs[3]) }
RegenieRunStep2Chr4 %<-% { sys::exec_wait(GridFileStep2Names[4],std_out=GridFileStep2Logs[4]) }
RegenieRunStep2Chr5 %<-% { sys::exec_wait(GridFileStep2Names[5],std_out=GridFileStep2Logs[5]) }
RegenieRunStep2Chr6 %<-% { sys::exec_wait(GridFileStep2Names[6],std_out=GridFileStep2Logs[6]) }
RegenieRunStep2Chr7 %<-% { sys::exec_wait(GridFileStep2Names[7],std_out=GridFileStep2Logs[7]) }
RegenieRunStep2Chr8 %<-% { sys::exec_wait(GridFileStep2Names[8],std_out=GridFileStep2Logs[8]) }
RegenieRunStep2Chr9 %<-% { sys::exec_wait(GridFileStep2Names[9],std_out=GridFileStep2Logs[9]) }
RegenieRunStep2Chr10 %<-% { sys::exec_wait(GridFileStep2Names[10],std_out=GridFileStep2Logs[10]) }
RegenieRunStep2Chr11 %<-% { sys::exec_wait(GridFileStep2Names[11],std_out=GridFileStep2Logs[11]) }
RegenieRunStep2Chr12 %<-% { sys::exec_wait(GridFileStep2Names[12],std_out=GridFileStep2Logs[12]) }
RegenieRunStep2Chr13 %<-% { sys::exec_wait(GridFileStep2Names[13],std_out=GridFileStep2Logs[13]) }
RegenieRunStep2Chr14 %<-% { sys::exec_wait(GridFileStep2Names[14],std_out=GridFileStep2Logs[14]) }
RegenieRunStep2Chr15 %<-% { sys::exec_wait(GridFileStep2Names[15],std_out=GridFileStep2Logs[15]) }
RegenieRunStep2Chr16 %<-% { sys::exec_wait(GridFileStep2Names[16],std_out=GridFileStep2Logs[16]) }
RegenieRunStep2Chr17 %<-% { sys::exec_wait(GridFileStep2Names[17],std_out=GridFileStep2Logs[17]) }
RegenieRunStep2Chr18 %<-% { sys::exec_wait(GridFileStep2Names[18],std_out=GridFileStep2Logs[18]) }
RegenieRunStep2Chr19 %<-% { sys::exec_wait(GridFileStep2Names[19],std_out=GridFileStep2Logs[19]) }
RegenieRunStep2Chr20 %<-% { sys::exec_wait(GridFileStep2Names[20],std_out=GridFileStep2Logs[20]) }
RegenieRunStep2Chr21 %<-% { sys::exec_wait(GridFileStep2Names[21],std_out=GridFileStep2Logs[21]) }
RegenieRunStep2Chr22 %<-% { sys::exec_wait(GridFileStep2Names[22],std_out=GridFileStep2Logs[22]) }
# Wait for the future job to be resolved. Again, should be a loop
JobRunning1 <- futureOf(RegenieRunStep2Chr1); while (!resolved(JobRunning1)) { Sys.sleep(60); cat("o") }
JobRunning2 <- futureOf(RegenieRunStep2Chr2); while (!resolved(JobRunning2)) { Sys.sleep(60); cat("o") }
JobRunning3 <- futureOf(RegenieRunStep2Chr3); while (!resolved(JobRunning3)) { Sys.sleep(60); cat("o") }
JobRunning4 <- futureOf(RegenieRunStep2Chr4); while (!resolved(JobRunning4)) { Sys.sleep(60); cat("o") }
JobRunning5 <- futureOf(RegenieRunStep2Chr5); while (!resolved(JobRunning5)) { Sys.sleep(60); cat("o") }
JobRunning6 <- futureOf(RegenieRunStep2Chr6); while (!resolved(JobRunning6)) { Sys.sleep(60); cat("o") }
JobRunning7 <- futureOf(RegenieRunStep2Chr7); while (!resolved(JobRunning7)) { Sys.sleep(60); cat("o") }
JobRunning8 <- futureOf(RegenieRunStep2Chr8); while (!resolved(JobRunning8)) { Sys.sleep(60); cat("o") }
JobRunning9 <- futureOf(RegenieRunStep2Chr9); while (!resolved(JobRunning9)) { Sys.sleep(60); cat("o") }
JobRunning10 <- futureOf(RegenieRunStep2Chr10); while (!resolved(JobRunning10)) { Sys.sleep(60); cat("o") }
JobRunning11 <- futureOf(RegenieRunStep2Chr11); while (!resolved(JobRunning11)) { Sys.sleep(60); cat("o") }
JobRunning12 <- futureOf(RegenieRunStep2Chr12); while (!resolved(JobRunning12)) { Sys.sleep(60); cat("o") }
JobRunning13 <- futureOf(RegenieRunStep2Chr13); while (!resolved(JobRunning13)) { Sys.sleep(60); cat("o") }
JobRunning14 <- futureOf(RegenieRunStep2Chr14); while (!resolved(JobRunning14)) { Sys.sleep(60); cat("o") }
JobRunning15 <- futureOf(RegenieRunStep2Chr15); while (!resolved(JobRunning15)) { Sys.sleep(60); cat("o") }
JobRunning16 <- futureOf(RegenieRunStep2Chr16); while (!resolved(JobRunning16)) { Sys.sleep(60); cat("o") }
JobRunning17 <- futureOf(RegenieRunStep2Chr17); while (!resolved(JobRunning17)) { Sys.sleep(60); cat("o") }
JobRunning18 <- futureOf(RegenieRunStep2Chr18); while (!resolved(JobRunning18)) { Sys.sleep(60); cat("o") }
JobRunning19 <- futureOf(RegenieRunStep2Chr19); while (!resolved(JobRunning19)) { Sys.sleep(60); cat("o") }
JobRunning20 <- futureOf(RegenieRunStep2Chr20); while (!resolved(JobRunning20)) { Sys.sleep(60); cat("o") }
JobRunning21 <- futureOf(RegenieRunStep2Chr21); while (!resolved(JobRunning21)) { Sys.sleep(60); cat("o") }
JobRunning22 <- futureOf(RegenieRunStep2Chr22); while (!resolved(JobRunning22)) { Sys.sleep(60); cat("o") }
# Allow a little time
Sys.sleep(60)
LogCheck <- system(paste0("grep 'End time' ",OutputDir,Args$StudyName,"_",Args$exposure,"*step2*.log | wc -l"), intern = TRUE) %>% as.integer
if(LogCheck!=22) {
cat("ERROR: Some step 2 logs did not finish (no \"End time\"). Likely error. ")
}
if(length(Sys.glob(file.path(paste0(OutputDir,Args$StudyName,"_",Args$exposure), "*step2*.log")))>0) {
cat("ERROR: No step 2 log files were created")
}
cat(Messages[Tick])
}
# Give a message saying end of Regenie section
if(Args$runsection %in% c("regenie","step2","step1","")) {
cat(Messages[53])
}
if(Args$runsection %in% c("gcta","")) {
cat(Messages[17])
# Here GCTA COJO is called to determine the GWAS hits. Since step 14 generally takes a long time to complete
# the script will need to pause here and wait for Regenie to finish. May need to use the "resolved" function to do
# this - something like resolved(RegenieRun) (TRUE or FALSE) but have not tried this. An alternative is to look at the
# Regenie log files and search for 'End time' which appears at the end of a completed log.
# Define suffix and prefix character created by Regenie
# FROM CONFIG FILE GwasPrefix <- "{StudyName}_{Args$exposure}_step2_BT_"
GwasPrefix <- paste0(Args$StudyName,"_",Args$exposure,"_step2_BT_")
GwasSuffix <- paste0("_",Args$exposure,".regenie")
# Location of imputed GWAS files in bfile format. For European cohorts such as UKB, these files should exclude non-Europeans and any SNPs failing QC
ThinDir <- Args$ThinDir
# ThinDir <- ifelse(Host=="myriad",paste0(Args$studypath,"Robert/GCTA/LD_Files/UKBB10000/"),ifelse(Host=="oregano",paste0("/hddraid5/Store/Reference/LD_Files/UKBB10000/"),NA_character_))
# Grid related settings
JobName <- "GCTARun"
LogFile <- paste0(LogDir,"GCTARun.log")
GridLogFile <- paste0(LogDir,"GCTAGridLog_",ProcessID,".log")
TemplateFile2 <- c(
"#!/bin/bash",
"#$ -N <%= JobName %>",
"#$ -j y",
"#$ -o <%= GridLogFile %>",
"#$ -cwd",
"#$ -V",
glue("#$ -l h_rt={Wallclock}:0:0"),
glue("#$ -l mem={Memory}G"),
glue("#$ -pe smp {numThreadsFixed}"),
"Rscript -e \'batchtools::doJobCollection(\"<%= uri %>\")\'",
"exit 0"
)
TFileName2 <- paste0(BaseDir,"Templates/batchtools.sge2.tmpl")
TemplateFile2 %>% write_lines(TFileName2)
# Note that batchtools_sge is only relevant to Myriad which uses Sun Grid Engine. For Oregano we need batchtools_local or batchtools_multicore
# However, here https://rdrr.io/cran/future.batchtools/man/batchtools_multicore.html it says:
## We highly recommend using future::multisession (sic!) futures of the future package instead of multicore batchtools futures.
## In fact, since this Linux and we can fork, then future::multicore is available to us.
if (Host=="myriad") {
plan(batchtools_sge,template = TFileName2)
} else if(Host=="oregano") {
# plan(future.batchtools::batchtools_multicore)
plan(future::multicore, workers=60)
} else {
cat("ERROR: Invalid host. Available hosts are: myriad oregano\n")
exit()
}
cat(Messages[Tick])
}
if(Args$runsection %in% c("gcta","")) {
cat(Messages[18])
# Define GCTA template. Note that hash-pling cannot be indented.
CreateMAFileGCTA <- r"(
#!/usr/bin/Rscript
suppressPackageStartupMessages({{
library(dplyr)
library(purrr)
library(haven)
library(readr)
library(janitor)
library(stringr)
}})
BaseDir <- "{BaseDir}"
ProcessID <- "{ProcessID}"
ThinDir <- "{ThinDir}"
OutputDir <- "{OutputDir}"
GwasPrefix <- "{GwasPrefix}"
GwasSuffix <- "{GwasSuffix}"
GCTADir <- "{GCTADir}"
chr <- "{chr}"
numThreadsGCTA <- {numThreadsGCTA}
# Construct the name of the Regenie Step2 file
RegenieStep2File <- paste0(OutputDir,GwasPrefix,chr,GwasSuffix)
# Read in the text file and tidy up the column names (making them lower case etc)
df <- read_delim(RegenieStep2File, delim=" ", col_names=TRUE, show_col_types = FALSE)
df <- janitor::clean_names(df)
# Create a p-value from the minus log base10 p
df <- df %>% mutate(p=1/(10^log10p))
# Summary of the info scores to 2 dps
df %>% mutate(info_1dp = as.character(round(info, digits=2))) %>% count(info_1dp)
# Info QC score - remove those below 0.8 ( alternatively 0.9 would be another choice)
df <- df %>% filter(info>=0.8)
# Note that A1 is assigned allele1 and A2 allele0, reversing the order they appear in the Regenie output file
df <- df %>% select(SNP=id,A1=allele1,A2=allele0,freq=a1freq,b=beta,se,p,N=n)
# Create the filename for the meta-analysis (MA) and write the file
ma_filename <- paste0(GCTADir,"/Chr",chr,".ma")
write_delim(df,ma_filename, delim=" ")
Sys.sleep(10)
GCTACommand <- paste0("gcta64 --bfile ",ThinDir,"Thin_Imp_chr",chr," --chr ",chr," --maf 0.01 --cojo-file ",ma_filename, " --cojo-slct --out ",GCTADir,"gcta_chr",chr," --threads ",numThreadsGCTA)
print(GCTACommand)
system(GCTACommand)
)"
cat(Messages[Tick])
}
if(Args$runsection %in% c("gcta","")) {
cat(Messages[19])
# Run GCTA jobs for each chromosome
# Submit the jobs for chromosomes 1 to 22 for Regenie output
GridFileGCTANames <- c()
GridFileGCTALogs <- c()
for (chr in 1:22) {
numThreadsGCTA <- ifelse(Host=="myriad",numThreadsFixed,ifelse(Host=="oregano",numThreadsStep2Graded[chr],NA_character_))
GridFileGCTA <- stringr::str_glue(CreateMAFileGCTA)
GridFileGCTAName <- paste0(ScriptDir,"GridFileGCTA_Chr",chr,"_",ProcessID,".R")
GridFileGCTANames[chr] <- GridFileGCTAName
GridFileGCTALog <- paste0(LogDir,"GridGCTALog_",chr,"_",ProcessID,".log")
GridFileGCTALogs[chr] <- GridFileGCTALog
readr::write_lines(GridFileGCTA,GridFileGCTAName)
Sys.chmod(GridFileGCTAName, "777", use_umask = FALSE)
}
# This is crying out to be a loop!! But no obvious way to achieve this. Hence 22 individual lines (for the moment)
GCTARunChr1 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[1])) }
GCTARunChr2 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[2])) }
GCTARunChr3 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[3])) }
GCTARunChr4 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[4])) }
GCTARunChr5 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[5])) }
GCTARunChr6 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[6])) }
GCTARunChr7 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[7])) }
GCTARunChr8 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[8])) }
GCTARunChr9 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[9])) }
GCTARunChr10 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[10])) }
GCTARunChr11 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[11])) }
GCTARunChr12 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[12])) }
GCTARunChr13 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[13])) }
GCTARunChr14 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[14])) }
GCTARunChr15 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[15])) }
GCTARunChr16 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[16])) }
GCTARunChr17 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[17])) }
GCTARunChr18 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[18])) }
GCTARunChr19 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[19])) }
GCTARunChr20 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[20])) }
GCTARunChr21 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[21])) }
GCTARunChr22 %<-% { system(paste0("R CMD BATCH --no-save ",GridFileGCTANames[22])) }
# Wait for the future job to be resolved. Again, should be a loop
GCTAJobRunning1 <- futureOf(GCTARunChr1); while (!resolved(GCTAJobRunning1)) { Sys.sleep(60); cat("o") }
GCTAJobRunning2 <- futureOf(GCTARunChr2); while (!resolved(GCTAJobRunning2)) { Sys.sleep(60); cat("o") }
GCTAJobRunning3 <- futureOf(GCTARunChr3); while (!resolved(GCTAJobRunning3)) { Sys.sleep(60); cat("o") }
GCTAJobRunning4 <- futureOf(GCTARunChr4); while (!resolved(GCTAJobRunning4)) { Sys.sleep(60); cat("o") }
GCTAJobRunning5 <- futureOf(GCTARunChr5); while (!resolved(GCTAJobRunning5)) { Sys.sleep(60); cat("o") }
GCTAJobRunning6 <- futureOf(GCTARunChr6); while (!resolved(GCTAJobRunning6)) { Sys.sleep(60); cat("o") }
GCTAJobRunning7 <- futureOf(GCTARunChr7); while (!resolved(GCTAJobRunning7)) { Sys.sleep(60); cat("o") }
GCTAJobRunning8 <- futureOf(GCTARunChr8); while (!resolved(GCTAJobRunning8)) { Sys.sleep(60); cat("o") }
GCTAJobRunning9 <- futureOf(GCTARunChr9); while (!resolved(GCTAJobRunning9)) { Sys.sleep(60); cat("o") }
GCTAJobRunning10 <- futureOf(GCTARunChr10); while (!resolved(GCTAJobRunning10)) { Sys.sleep(60); cat("o") }
GCTAJobRunning11 <- futureOf(GCTARunChr11); while (!resolved(GCTAJobRunning11)) { Sys.sleep(60); cat("o") }
GCTAJobRunning12 <- futureOf(GCTARunChr12); while (!resolved(GCTAJobRunning12)) { Sys.sleep(60); cat("o") }
GCTAJobRunning13 <- futureOf(GCTARunChr13); while (!resolved(GCTAJobRunning13)) { Sys.sleep(60); cat("o") }
GCTAJobRunning14 <- futureOf(GCTARunChr14); while (!resolved(GCTAJobRunning14)) { Sys.sleep(60); cat("o") }
GCTAJobRunning15 <- futureOf(GCTARunChr15); while (!resolved(GCTAJobRunning15)) { Sys.sleep(60); cat("o") }
GCTAJobRunning16 <- futureOf(GCTARunChr16); while (!resolved(GCTAJobRunning16)) { Sys.sleep(60); cat("o") }
GCTAJobRunning17 <- futureOf(GCTARunChr17); while (!resolved(GCTAJobRunning17)) { Sys.sleep(60); cat("o") }
GCTAJobRunning18 <- futureOf(GCTARunChr18); while (!resolved(GCTAJobRunning18)) { Sys.sleep(60); cat("o") }
GCTAJobRunning19 <- futureOf(GCTARunChr19); while (!resolved(GCTAJobRunning19)) { Sys.sleep(60); cat("o") }
GCTAJobRunning20 <- futureOf(GCTARunChr20); while (!resolved(GCTAJobRunning20)) { Sys.sleep(60); cat("o") }
GCTAJobRunning21 <- futureOf(GCTARunChr21); while (!resolved(GCTAJobRunning21)) { Sys.sleep(60); cat("o") }
GCTAJobRunning22 <- futureOf(GCTARunChr22); while (!resolved(GCTAJobRunning22)) { Sys.sleep(60); cat("o") }
# Allow a little time
Sys.sleep(60)
cat(Messages[Tick])
}
CatTee(Messages[55])
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.