knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(gwhap) RESOURCE_ROOT = "<YOUR_LOCAL_RESOURCE_FS>"
Download the Rutgers genetic map v3 (Kosambi)
download_rutgers_map(RESOURCE_ROOT)
Create the augmented genetic map using the rutgers genetic map v3
snp_physical_positions = "/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb_hap_" rutgers_map = "/neurospin/brainomics/bio_resources/rutgers_map_v3" output = '/neurospin/tmp/ymekki/new_repo/gwhap/augmented_genetic_map' create_augmented_genetic_map(snp_physical_positions=snp_physical_positions, genetic_map_dir=rutgers_map, save_genetic_map=TRUE, output=output)
Create blocs with a single delta
augmented_genetic_map_dir = '/neurospin/tmp/ymekki/new_repo/gwhap/augmented_genetic_map' output = '/neurospin/tmp/ymekki/new_repo/gwhap/blocs' augmented_map_df = get_augmented_genetic_map(augmented_genetic_map_dir, chromosomes=1:23) df_blocks = create_blocks(augmented_map_df, delta=1e-3, save_blocs=TRUE, output=output)
Set parameters :
# get blocs blocs_dir = '/neurospin/tmp/ymekki/new_repo/gwhap/blocs' df_blocs = get_blocs(blocs_dir) # bgen file bgen_file_path = '/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb_hap' # phenotype phenotype_path = '/neurospin/ukb/workspace/pheno_boxcox/all_pheno_res.tsv' phenotype = read_delim(phenotype_path, delim='\t') # index path sample_file_path = '/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb25251_hap' # output output = '/neurospin/tmp/ymekki/new_repo/gwhap/T1_subjects/haplotypes'
The function handle one blocs at a time. In order to determine the haplotypes for the blocs created using the whole genome, we need to loop over them.
The determination of the haplotypes for 119.000 blocs is estimated to 1.5 days. In order to reduce the run time, we use the mcmapply function. Basically, this function will paralelize the process.
Please refer to the code below to implement the determination of haplotypes using all the blocs determined in the above step.
for(chromosome in unique(df_blocs$chr)){ # get the start an end bp position of each bloc start = df_blocs$fromBp[df_blocs$chr == chromosome] end = df_blocs$toBp[df_blocs$chr == chromosome] #__________________________________________________________________________________________________ # In the bgen file, the IID are annonymized. # It is constitued of, among others, a list of sample that contain the anonymized IID indexed. # The .sample file is used as a reference table. It contains the IID of the participant indexed. # Here, the index can be used as a foreign key to toggle from the anonymized IID to the real ones. # Warning: please notice that .sample file has two header lines. # Be sure to remove the second header and starting the index account from the second line. #__________________________________________________________________________________________________ # set bgen chromosome file path bgen_chromosome_file_path = sprintf('%s_%s_v2.bgen', bgen_file_path, chromosome) # read the index file sample_index_file = read_delim(sprintf('%s_%s_v2_s487395.sample', sample_file_path, chromosome), delim=' ') # remove the first line sample_index_file = sample_index_file[-1, ] # get the index as columns setDT(sample_index_file, keep.rownames = TRUE)[] # filtre on the phenotype IID and get the index as well as the participant IID ordered samples_index = as.numeric(sample_index_file[sample_index_file$ID_2 %in% phenotype$IID, ]$rn) samples_index_ID = as.numeric(sample_index_file[sample_index_file$ID_2 %in% phenotype$IID, ]$ID_2) # set bgi file path bgi_file_path = sprintf("%s.bgi", bgen_chromosome_file_path) # get snps position using the bgi file allVar = get_bgi_file(file_path = bgi_file_path) # get the bgen ID codification dummybg = get_bgen_file(file_path = bgen_chromosome_file_path, start = allVar$position[1], end = allVar$position[1], samples = c(), chromosome = '', max_entries_per_sample = 4) # filter the partcipant bgen ID using their index mysamples = dummybg$samples[samples_index] # determine haplotypes for all blocs of one chromosome haplotype_combined = legacy_determine_haplotypes_per_chromosome(chromosome=chromosome, start, end, bgen_file = bgen_chromosome_file_path, sample_iid = samples_index_ID, sample_bgen_iid_code = mysamples, max_entries_per_sample=4, nb_core=detectCores()-2, verbose=FALSE) # save the haplotypes in tsv file (very heavy, you should consider to use an other way to save the haplotypes) #save_haplotypes(haplotype_combined, chromosome=chr, output=output) # save the haplotypes in RData object save(haplotype_combined, file=sprintf('%s/haplotypes_%s.RData', output, chromosome), compress=T) }
At the end, the haplotypes concerning one chromosome are binded by column and then saved.
Since, the test step implies that the haplotypes are tested per blocs, you need to reverse the binding process. Please refer to the section below to get how to do that.
The function is designed to handle one blocs at the time. In order to apply the three test on all haplotypes, we need to loop over them.
In order to reduce the run time, we use the mcmapply function. Basically, this function will paralelize the process.
Please refer to the code below to implement the tests using all haplotypes determined in the above step.
chr = "chr1" # filtre sur les IDs common phenotype and genotype # read index file sample_semi_path = '/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb25251_hap' sample_index_file = read_delim(sprintf('%s_%s_v2_s487395.sample', sample_semi_path, chr), delim=' ') sample_index_file = sample_index_file[-1, ] phenotypes_sulci = fread("/neurospin/ukb/workspace/pheno_boxcox/all_pheno_res.tsv") Y_filtred = phenotypes_sulci[, c('IID', 'FCMpost_left')] output = '/neurospin/tmp/ymekki/new_repo/gwhap/T1_subjects/lm_test'
for (i in 1:22){ output_chromosome = sprintf('%s/chr_%s', output, i) dir.create(output_chromosome) dir.create(sprintf('%s/bloc_test_results', output_chromosome)) dir.create(sprintf('%s/complete_test_results', output_chromosome)) dir.create(sprintf('%s/single_test_results', output_chromosome)) # load haplotype_combined haplotypes_path = '/neurospin/tmp/ymekki/new_repo/gwhap/T1_subjects/haplotypes/old_haplotypes_' load(sprintf('%schr%s.RData', haplotypes_path, i)) chr = sprintf('chr%s', i) #_____________________________________________________________________________________ # During the haplotypes determination steps, # the haplotypes concerning one chromosome were binded by column and then saved. # The test haplotype function implies that each bloc is testes separtelty. # Therefore, you need to reverse the binding process. This is done as following: # first, identify the blocs using the haplotypes code (see code below) # then, filter the haplotypes object using the bloc code identified above # (see lm_test_haplotypes_per_bloc function) #_____________________________________________________________________________________ # separate the blocs # replace NA added by cbind by the chromosome code colnames(haplotype_combined) = gsub("NA", chr, colnames(haplotype_combined)) # get the start and end bloc's position start = na.omit(unique(vapply(strsplit(colnames(haplotype_combined),"_"), `[`, 2, FUN.VALUE=character(1)))) end = na.omit(unique(vapply(strsplit(colnames(haplotype_combined),"_"), `[`, 3, FUN.VALUE=character(1)))) # concatenate the start and end bloc position blocs = sprintf('%s_%s', start, end) # run the test results_chr = mcmapply(FUN = lm_test_haplotypes_per_bloc, blocs = blocs, haplotype_combined = list(haplotype_combined), sample_index_file = list(sample_index_file), Y_filtred = list(Y_filtred), output_chromosome = output_chromosome, mc.cores = 30) }
summary_haplotypes_test(output, threshold = 5e-6, verbose=FALSE)
lm_test_haplotypes_per_bloc = function(blocs, haplotype_combined, sample_index_file, Y_filtred, output_chromosome, kind='all'){ # change Y_filtred column name colnames(Y_filtred) = c('IID', 'phenotype') # get only the haplotypes corresponding to bloc got as input columns_blocs = str_subset(colnames(haplotype_combined), blocs) # filter the haplotypes corresponding to the bloc haplotype_one_bloc = haplotype_combined[, columns_blocs] # This is done in order to be sure that the subject are well alligned # get the index columns setDT(haplotype_one_bloc, keep.rownames = TRUE)[] # merge the index file and the haplotypes using the index haplotype_one_bloc$rn <- as.integer(haplotype_one_bloc$rn) df_tmp_merged = merge(sample_index_file, haplotype_one_bloc, by.x='ID_2', by.y="rn") # merge the df obtained above with the phenotype using the ID df_merged = merge(df_tmp_merged, Y_filtred, by.x = 'ID_2', by.y = 'IID') # set X, Y and run the haplotypique test X = df_merged[, columns_blocs] Y = data.frame(phenotype = df_merged[, 'phenotype']) # perform the test results = lm_test_haplotypes(X, Y, kind=kind) # save the results write.table(data.frame(results$bloc), sprintf('%s/bloc_test_results/%s.tsv', output_chromosome, blocs), sep="\t", row.names=FALSE, quote=FALSE) write.table(data.frame(results$complete), sprintf('%s/complete_test_results/%s.tsv', output_chromosome, blocs), sep="\t", row.names=FALSE, quote=FALSE) write.table(data.frame(results$single), sprintf('%s/single_test_results/%s.tsv', output_chromosome, blocs), sep="\t", row.names=FALSE, quote=FALSE) return(results) }
If you need to residualise your phenotypes, please refer to this code as an example:
covariates = fread('/neurospin/ukb/workspace/pheno_boxcox/covar_SexAgeMRI_10PCSArray_19k.cov') X_covariates_names = c("IID", "Array", "Sex", "Age", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10") phenotypes_sulci = fread("/neurospin/ukb/workspace/pheno_boxcox/all_sulci_boxcox.tsv") phenotype_FCMpost_left = phenotypes_sulci[, c('IID', 'FCMpost_left')] Y_residualised = phenotype_residualisation(phenotype=phenotype_FCMpost_left, covariates=covariates[, ..X_covariates_names], verbose=FALSE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.