# TODO(brielin): A lot of this code doesn't work now that the data is on AWS. require(tidyverse) require(foreach) require(igraph) require(devtools) require(egg) require(GGally) devtools::load_all()
# Download raw sumstats files. # TODO(brielin): remove local paths and setup working dir from scratch? n_cores = 8 ukbb_ldsc_results_file <- "/gpfs/commons/projects/UKBB/sumstats/ukb31063_h2_topline.02Oct2019.tsv.gz" ukbb_ldsc_manifest_file <- "/gpfs/commons/projects/UKBB/sumstats/ukb31063_ldsc_sumstat_manifest.tsv.gz" save_dir <- "/gpfs/commons/projects/UKBB/sumstats/ldsc_raw" # SLE: rare in ukbb, not sure if theres two easy GWAS to pull # RA: common enough to make exception and/or to use Okada as discovery 20002_1464 # psoriasis: common enough for exception 20002_1453 # crohns: heritable enough for exception 20002_1462 # T1D: Not sure if UKBB diabetes is this? check RG with below # T2d: could make exception 20002_1223 exceptions <- c( "20002_1464", # RA "20002_1453", # psoriasis "20002_1462", # crohns "20002_1223" # T2D ) ukbb_ldsc_results <- readr::read_tsv(gzfile(ukbb_ldsc_results_file)) ukbb_ldsc_manifest <- readr::read_tsv(gzfile(ukbb_ldsc_manifest_file)) ukbb_ldsc_results <- ukbb_ldsc_results %>% filter((h2_z > 4 & isNotPrimary == F & Neff > 4500 & isBadOrdinal == F)|(phenotype %in% exceptions)) ukbb_info <- dplyr::left_join(ukbb_ldsc_results, ukbb_ldsc_manifest, by = c("phenotype", "description")) %>% filter(has_v2_replacement == F, !grepl("dilution_factor", ldsc_sumstat_file)) %>% group_by(phenotype, description) %>% mutate(count = n()) %>% ungroup() %>% filter(count == 3) %>% mutate(dest_file = paste0(save_dir, "/", phenotype, ".", sex.y, ".tsv.bgz")) # future::plan("multisession", workers = n_cores) # furrr::future_walk2(ukbb_info$ldsc_sumstat_dropbox, ukbb_info$dest_file, ~ if(!file.exists(.y)) download.file(.x, .y, method = "wget"))
# Calculate M-F rg to filter on M-F phenotype mismatches. n_cores = 8 ref_ld_chr = "/gpfs/commons/projects/UKBB/sumstats/ldsc_results/eur_w_ld_chr/" w_ld_chr = "/gpfs/commons/projects/UKBB/sumstats/ldsc_results/eur_w_ld_chr/" ldsc_mf_result_dir = "/gpfs/commons/projects/UKBB/sumstats/ldsc_results/male_female_rg/" rg_info <- ukbb_info %>% dplyr::select(phenotype, description, sex.y, dest_file, Neff) %>% tidyr::pivot_wider(names_from = sex.y, values_from = dest_file) %>% dplyr::mutate(ldsr_result_file = paste0(ldsc_mf_result_dir, phenotype, ".mfrg")) # future::plan("multisession", workers = n_cores) # rg_info %>% dplyr::select(female, male, ldsr_result_file) %>% # furrr::future_pwalk(function(female, male, ldsr_result_file){ # if(!file.exists(paste0(ldsr_result_file, ".log"))){ # system(paste0("ldsc.py --rg ", female, ",", male, " --ref-ld-chr ", ref_ld_chr, # " --w-ld-chr ", w_ld_chr, " --out ", ldsr_result_file)) # } # }) rg_values <- purrr::map_df(rg_info$ldsr_result_file, function(.f){ lines <- read_lines(paste0(.f, ".log")) result <- strsplit(lines[length(lines) - 3], "\\s+")[[1]] return(list("rg" = as.double(result[3]), "se" = as.double(result[4]))) }) rg_info <- rg_info %>% dplyr::mutate(mf_rg = rg_values$rg, mf_rg_se = rg_values$se, ci = mf_rg + 2*mf_rg_se, z0 = mf_rg/mf_rg_se, z1 = (1-mf_rg)/mf_rg_se) rg_info %>% ggplot() + geom_histogram(aes(x=mf_rg)) + xlim(0, 1) rg_info %>% ggplot() + geom_histogram(aes(x=ci)) + xlim(0, 1) rg_cutoff = 0.5 z0_cutoff = 2 selected_phenos_ <- rg_info %>% filter(mf_rg > rg_cutoff, z0 > z0_cutoff)
# Calculate genetic correlation to filter near-identical phenotypes. ref_ld_chr = "/gpfs/commons/projects/UKBB/sumstats/ldsc_results/eur_w_ld_chr/" w_ld_chr = "/gpfs/commons/projects/UKBB/sumstats/ldsc_results/eur_w_ld_chr/" ldsc_selected_result_dir = "/gpfs/commons/projects/UKBB/sumstats/ldsc_results/selected_rg/" # 661 phenotypes selected_phenos <- selected_phenos %>% dplyr::mutate(ldsr_selected_file = paste0(ldsc_selected_result_dir, phenotype, ".rg")) all_phenos = do.call(function(...){paste(..., sep = ",")}, as.list(selected_phenos$both_sexes)) # future::plan("multisession", workers = n_cores) # furrr::future_pwalk( # selected_phenos, ~ if(!file.exists(paste0(..12, ".log"))) { # system(paste0("ldsc.py --rg ", ..4, ",", all_phenos, " --ref-ld-chr ", ref_ld_chr, " --w-ld-chr ", w_ld_chr, " --out ", ..13)) # }) run_and_parse_ldsc_rg <- function(pheno1_file, pheno2_files, ref_ld_chr_dir, w_ld_chr_dir, out_file){ # if(!file.exists(paste0(out_file, ".log"))){ system(paste0("ldsc.py --rg ", pheno1_file, ",", pheno2_files, " --ref-ld-chr ", ref_ld_chr_dir, " --w-ld-chr ", w_ld_chr_dir, " --out ", out_file)) # } n_lines <- length(read_lines(paste0(out_file, ".log"))) n_phenos <- length(strsplit(pheno2_files, ",")[[1]]) result_table <- readr::read_delim(paste0(out_file, ".log"), skip = n_lines - n_phenos - 4, delim = " ", trim_ws = T, n_max = n_phenos) return(result_table) } ldsc_settings <- tibble( pheno1_file = selected_phenos$both_sexes, pheno2_files = all_phenos, ref_ld_chr_dir = ref_ld_chr, w_ld_chr_dir = w_ld_chr, out_file = selected_phenos$ldsr_selected_file)
# sjob_run_ldsc <- rslurm::slurm_apply( # run_and_parse_ldsc_rg, ldsc_settings, jobname = "ldsc", nodes = nrow(ldsc_settings), # cpus_per_node = 1, slurm_options = list(time = "12:00:00", mem = "8G"), submit = T)
# IMIDs to check: SLE, RA, psoriasis, allergy, asthma, CB, CrD, CeD, IBD, MS, T1D, T2D, UC # SLE: rare in ukbb, not sure if theres two easy GWAS to pull # RA: common enough to make exception and/or to use Okada as discovery 20002_1464 mfrg na # psoriasis: common enough for exception 20002_1453 good # allergy: included 6152_9 good # DVT: included 6152_5 mfrg < 0.5 # asthma included 6152_8 good actualy 20002_1111 # chronic bronch: included 6152_6 good # crohns: heritable enough for exception 20002_1462 good # celiac: should be included 20002_1456 mfrg z0 < 2 # IBD: potential not heritable in UKBB # MS: too hard # T1D: Not sure if UKBB diabetes is this? check RG with below # T2d: could make exception 20002_1223 mfrg fail # UC: should be in or make exception K51 good rg_max = 0.9 sjob_run_ldsc <- rslurm::slurm_job(jobname = "ldsc", nodes = nrow(ldsc_settings)) rg_results <- rslurm::get_slurm_out(sjob_run_ldsc, outtype = "table") rg_results <- rg_results %>% dplyr::left_join(selected_phenos, by = c("p1" = "both_sexes")) %>% dplyr::left_join(selected_phenos, by = c("p2" = "both_sexes"), suffix = c(".1", ".2")) %>% select(phenotype.1, phenotype.2, description.1, description.2, rg, se, z, p, mf_rg.1, mf_rg_se.1, mf_rg.2, mf_rg_se.2, Neff.1, Neff.2)
# Looking at rg_results at this stage, there are many "none of the above" # phenotypes that are highly genetically anti-correlated with important disease # phenotypes, but have larger sample sizes. We manually filter these to avoid # removing important disease phenotypes instead of "non"-phenotypes. filter_phenotypes = c( "20107_100", # Illnesses of father: None of the above (group 1) "20107_101", # Illnesses of father: None of the above (group 2) "20110_100", # Illnesses of mother: None of the above (group 1) "20111_100", # Illnesses of siblings: None of the above (group 1) "6138_100", # Qualifications: None of the above "6145_100", # Illness, injury, bereavement, stress in last 2 years: None of the above "6146_100", # Attendance/disability/mobility allowance: None of the above # "6149_100", # Mouth/teeth dental problems: None of the above "6150_100", # Vascular/heart problems diagnosed by doctor: None of the above "6152_100", # Blood clot, DVT, bronchitis, emphysema, asthma, rhinitis, eczema, allergy diagnosed by doctor: None of the above "6154_100", # Medication for pain relief, constipation, heartburn: None of the above "6155_100", # Vitamin and mineral supplements: None of the above "6157_100", # Why stopped smoking: None of the above "6159_100", # Pain type(s) experienced in last month: None of the above "6160_100", # Leisure/social activities: None of the above "6164_100", # Types of physical activity in last 4 weeks: None of the above "6179_100", # Mineral and other dietary supplements: None of the above "48_irnt" # Waste circumference has rg > 0.9 with BMI and slightly more samples, but BMI is more interperatable ) pheno_by_neff <- (selected_phenos %>% dplyr::filter(!(phenotype %in% filter_phenotypes)) %>% dplyr::arrange(Neff, mf_rg))$phenotype for(i in 1:length(pheno_by_neff)) { pheno = pheno_by_neff[i] remaining_phenos <- pheno_by_neff[i:length(pheno_by_neff)] # print(filter(rg_results, phenotype.1 == pheno, phenotype.2 != pheno, !(phenotype.2 %in% filter_phenotypes), phenotype.2 %in% remaining_phenos, abs(rg) > 0.9)) n_high_cor <- nrow(filter(rg_results, phenotype.1 == pheno, phenotype.2 != pheno, !(phenotype.2 %in% filter_phenotypes), phenotype.2 %in% remaining_phenos, abs(rg) > 0.9)) if(n_high_cor > 0) filter_phenotypes <- c(filter_phenotypes, pheno) # print(length(filter_phenotypes)) } selected_phenos <- filter(selected_phenos, !(phenotype %in% filter_phenotypes)) rg_result <- filter(rg_results, !(phenotype.1 %in% filter_phenotypes), !(phenotype.2 %in% filter_phenotypes)) save(selected_phenos, file = "/gpfs/commons/home/bbrown/ukbb_network/saved_rdata/selected_phenotypes.Rdata") save(rg_results, file = "/gpfs/commons/home/bbrown/ukbb_network/saved_rdata/rg_results.Rdata")
ukbb_full_loc = "/nfs/scratch/bbrown/ukbb_full_raw/" ukbb_full_files <- ukbb_info %>% filter(phenotype %in% selected_phenos$phenotype, sex.y %in% c("male", "female")) %>% select(phenotype, sex.y, gwas_dropbox) %>% mutate(ukbb_full_file = paste0(ukbb_full_loc, phenotype, ".", sex.y, ".tsv.bgz")) # future::plan("multisession", workers = n_cores) # furrr::future_walk2(ukbb_full_files$gwas_dropbox, ukbb_full_files$ukbb_full_file, ~ if(!file.exists(.y)) download.file(.x, .y, method = "wget"))
# Now we need to clump sumstats using plink on slurm. # # This needs a dups list not to file. Run this on your reference bed and # point dups to the output. # # for i in $(seq 1 22); do # cut -f 2 "${REFERENCE_BED}${i}.bim" | sort | uniq -d > "ukbb_dups_chr${i}.txt" # done load("/gpfs/commons/home/bbrown/ukbb_network/saved_rdata/selected_phenotypes.Rdata") p_thresh = 0.0001 reference_bed = "/nfs/scratch/bbrown/ukbb_ref/ukb_imp_cleaned_rel_hwe_chr" dups = "/nfs/scratch/bbrown/ukbb_ref/ukbb_dups_chr" out_dir = "/nfs/scratch/bbrown/clumped/" gwas_dir = "/nfs/scratch/bbrown/ukbb_full_raw/" ldsc_dir = "/nfs/scratch/bbrown/ldsc_raw/" variants_file = "/nfs/scratch/bbrown/variants.tsv.bgz" split_and_clump <- function(out_dir, chr, gwas_file, ldsc_file, variants_file, reference_bed, dups, p_thresh){ if(!dir.exists(out_dir)) system(paste0("mkdir -p ", out_dir)) bim_file <- paste0(reference_bed, chr, ".bim") bim_data <- readr::read_tsv(bim_file, col_names = c("chr", "rsid", "cm", "pos", "A1", "A2")) fv <- function(x, pos) subset(x, rsid %in% bim_data$rsid) variant_data <- readr::read_tsv_chunked(gzfile(variants_file), DataFrameCallback$new(fv)) %>% dplyr::select(variant, chr, pos, ref, alt, rsid, AF, minor_allele, minor_AF) fs <- function(x, pos) subset(x, variant %in% variant_data$variant) gwas_data <- readr::read_tsv_chunked(gzfile(gwas_file), DataFrameCallback$new(fs)) %>% dplyr::select(variant, minor_allele, minor_AF, n_complete_samples, beta, se, tstat, pval) fl <- function(x, pos) subset(x, SNP %in% variant_data$rsid) ldsc_data <- readr::read_tsv_chunked(gzfile(ldsc_file), DataFrameCallback$new(fl)) sumstats <- variant_data %>% left_join(gwas_data, by = c("variant")) %>% select(rsid, chr, pos, ref, alt, AF, n_complete_samples, beta, se, tstat, pval) dup_snps <- unique(sumstats$rsid[duplicated(sumstats$rsid)]) sumstats <- dplyr::filter(sumstats, !(rsid %in% dup_snps)) %>% dplyr::arrange(chr, pos) ldsc_data <- dplyr::filter(ldsc_data, SNP %in% sumstats$rsid) %>% arrange(SNP) ss_ldsc <- dplyr::filter(sumstats, rsid %in% ldsc_data$SNP) %>% arrange(rsid) num_mismatch <- sum(sign(ldsc_data$Z) != sign(ss_ldsc$tstat)) if(num_mismatch == nrow(ss_ldsc)){ sumstats <- dplyr::mutate(sumstats, beta=-beta, tstat=-tstat) } else if(num_mismatch > 0){ stop("Error: mismatches between LDSC and SS data.") } chr_chunk_file <- paste0(out_dir, "chr", chr, ".tsv") print(chr_chunk_file) readr::write_tsv(sumstats, chr_chunk_file) plink_command <- paste0( "plink --bfile ", reference_bed, chr, # " --thin-indiv-count 100000 ", " --clump ", chr_chunk_file, " --clump-snp-field rsid", " --clump-r2 0.05 --clump-kb 500 --clump-field \"pval\" --clump-p1 ", p_thresh, " --exclude ", dups, chr, ".txt", " --out ", out_dir, "chr", chr) system(plink_command) clumpfile <- paste0(out_dir, "chr", chr, ".clumped") snpfile <- paste0(out_dir, "chr", chr, ".snps") if(file.exists(clumpfile)) system(paste0("tail -n +2 ", clumpfile, " | awk '{print $3}' > ", snpfile)) } selected_phenos_all <- selected_phenos selected_phenos <- filter(selected_phenos, phenotype == "21001_irnt") clump_settings <- tibble( out_dir = rep(paste0(out_dir, selected_phenos$phenotype, ".male/"), each = 22), chr = rep(c(1:22), times = length(selected_phenos$phenotype)), gwas_file = rep(paste0(gwas_dir, selected_phenos$phenotype, ".male.tsv.bgz"), each = 22), ldsc_file = rep(paste0(ldsc_dir, selected_phenos$phenotype, ".male.tsv.bgz"), each = 22), variants_file = variants_file, reference_bed = reference_bed, dups = dups, p_thresh = p_thresh ) sjob_run_clump <- rslurm::slurm_apply( split_and_clump, clump_settings, jobname = "clump", nodes = ceiling(nrow(clump_settings)), cpus_per_node = 1, slurm_options = list(time = "12:00:00", mem = "32G", constraint = "v5"), submit = T)
# Filter summary statistics down to clump index SNPs in at least one phenotype. n_cores = 8 all_snps = "/gpfs/commons/projects/UKBB/sumstats/snps_to_use.txt" filter_script = "ukbb_analysis/filter_ukbb_sumstats.bash" gwas_raw_dir = "/gpfs/commons/projects/UKBB/sumstats/ukbb_full_raw/" clump_dir = "/gpfs/commons/projects/UKBB/sumstats/clumped/" variants_file = "/gpfs/commons/projects/UKBB/sumstats/variants.tsv.bgz" load("/gpfs/commons/home/bbrown/ukbb_network/saved_rdata/selected_phenotypes.Rdata") # purrr::walk(selected_phenos$phenotype, ~ system(paste0("cat ", clump_dir, .x, ".male/chr*.snps | sort | uniq > ", clump_dir, .x, ".male/all.snps" ))) # system(paste0("cat ", clump_dir, "*/*.snps | sort | uniq > ", all_snps)) future::plan("multisession", workers = n_cores) furrr::future_walk(selected_phenos$phenotype, ~ system(paste0("bash ../", filter_script, " ", gwas_raw_dir, .x, ".male.tsv.bgz ", variants_file, " ", all_snps))) furrr::future_walk(selected_phenos$phenotype, ~ system(paste0("bash ../", filter_script, " ", gwas_raw_dir, .x, ".female.tsv.bgz ", variants_file, " ", all_snps)))
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.