In yasmina-mekki/gwhap: Genome wide haplotype analysis

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

library(gwhap)

RESOURCE_ROOT = "<YOUR_LOCAL_RESOURCE_FS>"

Download the Rutgers genetic map

Download the Rutgers genetic map v3 (Kosambi)

download_rutgers_map(RESOURCE_ROOT)

Create the augmented genetic map

Create the augmented genetic map using the rutgers genetic map v3

snp_physical_positions = "/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb_hap_"
rutgers_map = "/neurospin/brainomics/bio_resources/rutgers_map_v3"
output = '/neurospin/tmp/ymekki/new_repo/gwhap/augmented_genetic_map'
create_augmented_genetic_map(snp_physical_positions=snp_physical_positions, genetic_map_dir=rutgers_map, save_genetic_map=TRUE, output=output)

Blocs creation

Create blocs with a single delta

augmented_genetic_map_dir = '/neurospin/tmp/ymekki/new_repo/gwhap/augmented_genetic_map'
output = '/neurospin/tmp/ymekki/new_repo/gwhap/blocs'
augmented_map_df = get_augmented_genetic_map(augmented_genetic_map_dir, chromosomes=1:23)
df_blocks = create_blocks(augmented_map_df, delta=1e-3, save_blocs=TRUE, output=output)

Determine the haplotypes

Set parameters :

# get blocs
blocs_dir = '/neurospin/tmp/ymekki/new_repo/gwhap/blocs'
df_blocs  = get_blocs(blocs_dir)

# bgen file
bgen_file_path = '/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb_hap'

# phenotype
phenotype_path = '/neurospin/ukb/workspace/pheno_boxcox/all_pheno_res.tsv'
phenotype = read_delim(phenotype_path, delim='\t')

# index path
sample_file_path = '/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb25251_hap'

# output
output = '/neurospin/tmp/ymekki/new_repo/gwhap/T1_subjects/haplotypes'

The function handle one blocs at a time. In order to determine the haplotypes for the blocs created using the whole genome, we need to loop over them.

The determination of the haplotypes for 119.000 blocs is estimated to 1.5 days. In order to reduce the run time, we use the mcmapply function. Basically, this function will paralelize the process.

Please refer to the code below to implement the determination of haplotypes using all the blocs determined in the above step.

for(chromosome in unique(df_blocs$chr)){

  # get the start an end bp position of each bloc
  start     = df_blocs$fromBp[df_blocs$chr == chromosome]
  end       = df_blocs$toBp[df_blocs$chr == chromosome]


  #__________________________________________________________________________________________________
  # In the bgen file, the IID are annonymized.
  # It is constitued of, among others, a list of sample that contain the anonymized IID indexed.
  # The .sample file is used as a reference table. It contains the IID of the participant indexed.
  # Here, the index can be used as a foreign key to toggle from the anonymized IID to the real ones.
  # Warning: please notice that .sample file has two header lines.
  # Be sure to remove the second header and starting the index account from the second line.
  #__________________________________________________________________________________________________


  # set bgen chromosome file path
  bgen_chromosome_file_path = sprintf('%s_%s_v2.bgen', bgen_file_path, chromosome)

  # read the index file
  sample_index_file = read_delim(sprintf('%s_%s_v2_s487395.sample', sample_file_path, chromosome), delim='  ')

  # remove the first line
  sample_index_file = sample_index_file[-1, ]

  # get the index as columns
  setDT(sample_index_file, keep.rownames = TRUE)[]


  # filtre on the phenotype IID and get the index as well as the participant IID ordered
  samples_index = as.numeric(sample_index_file[sample_index_file$ID_2 %in% phenotype$IID, ]$rn)
  samples_index_ID = as.numeric(sample_index_file[sample_index_file$ID_2 %in% phenotype$IID, ]$ID_2)

  # set bgi file path
  bgi_file_path = sprintf("%s.bgi", bgen_chromosome_file_path)

  # get snps position using the bgi file
  allVar = get_bgi_file(file_path = bgi_file_path)

  # get the bgen ID codification
  dummybg = get_bgen_file(file_path = bgen_chromosome_file_path,
                          start = allVar$position[1],
                          end = allVar$position[1],
                          samples = c(),
                          chromosome = '',
                          max_entries_per_sample = 4)

  # filter the partcipant bgen ID using their index
  mysamples = dummybg$samples[samples_index]

  # determine haplotypes for all blocs of one chromosome
  haplotype_combined = legacy_determine_haplotypes_per_chromosome(chromosome=chromosome,
                                                           start,
                                                           end,
                                                           bgen_file = bgen_chromosome_file_path,
                                                           sample_iid = samples_index_ID,
                                                           sample_bgen_iid_code = mysamples,
                                                           max_entries_per_sample=4,
                                                           nb_core=detectCores()-2,
                                                           verbose=FALSE)

  # save the haplotypes in tsv file (very heavy, you should consider to use an other way to save the haplotypes)
  #save_haplotypes(haplotype_combined, chromosome=chr, output=output)

  # save the haplotypes in RData object
  save(haplotype_combined, file=sprintf('%s/haplotypes_%s.RData', output, chromosome), compress=T)
}

At the end, the haplotypes concerning one chromosome are binded by column and then saved.

Since, the test step implies that the haplotypes are tested per blocs, you need to reverse the binding process. Please refer to the section below to get how to do that.

Test the haplotypes

The function is designed to handle one blocs at the time. In order to apply the three test on all haplotypes, we need to loop over them.

In order to reduce the run time, we use the mcmapply function. Basically, this function will paralelize the process.

Please refer to the code below to implement the tests using all haplotypes determined in the above step.

Set parameters

chr = "chr1"

# filtre sur les IDs common phenotype and genotype
# read index file
sample_semi_path = '/neurospin/ukb/genetic/GENETIC_DATA_500k/HAPLOTYPES/v2/ukb25251_hap'
sample_index_file = read_delim(sprintf('%s_%s_v2_s487395.sample', sample_semi_path, chr), delim='  ')
sample_index_file = sample_index_file[-1, ]

phenotypes_sulci = fread("/neurospin/ukb/workspace/pheno_boxcox/all_pheno_res.tsv")
Y_filtred = phenotypes_sulci[, c('IID', 'FCMpost_left')]

output = '/neurospin/tmp/ymekki/new_repo/gwhap/T1_subjects/lm_test'

Apply the test

for (i in 1:22){

  output_chromosome = sprintf('%s/chr_%s', output, i)

  dir.create(output_chromosome)
  dir.create(sprintf('%s/bloc_test_results', output_chromosome))
  dir.create(sprintf('%s/complete_test_results', output_chromosome))
  dir.create(sprintf('%s/single_test_results', output_chromosome))


  # load haplotype_combined
  haplotypes_path = '/neurospin/tmp/ymekki/new_repo/gwhap/T1_subjects/haplotypes/old_haplotypes_'
  load(sprintf('%schr%s.RData', haplotypes_path, i))


  chr = sprintf('chr%s', i)

  #_____________________________________________________________________________________
  # During the haplotypes determination steps,
  # the haplotypes concerning one chromosome were binded by column and then saved.
  # The test haplotype function implies that each bloc is testes separtelty.
  # Therefore, you need to reverse the binding process. This is done as following:
  # first, identify the blocs using the haplotypes code (see code below)
  # then, filter the haplotypes object using the bloc code identified above
  # (see lm_test_haplotypes_per_bloc function)
  #_____________________________________________________________________________________

  # separate the blocs

  # replace NA added by cbind by the chromosome code
  colnames(haplotype_combined) = gsub("NA", chr, colnames(haplotype_combined))

  # get the start and end bloc's position
  start = na.omit(unique(vapply(strsplit(colnames(haplotype_combined),"_"), `[`, 2, FUN.VALUE=character(1))))
  end = na.omit(unique(vapply(strsplit(colnames(haplotype_combined),"_"), `[`, 3, FUN.VALUE=character(1))))

  # concatenate the start and end bloc position
  blocs = sprintf('%s_%s', start, end)

  # run the test
  results_chr = mcmapply(FUN = lm_test_haplotypes_per_bloc,
                         blocs = blocs,
                         haplotype_combined = list(haplotype_combined),
                         sample_index_file = list(sample_index_file),
                         Y_filtred = list(Y_filtred),
                         output_chromosome = output_chromosome,
                         mc.cores = 30)
}

Get the summary of the results. It consist on filtering on the signicant p values.

summary_haplotypes_test(output, threshold = 5e-6, verbose=FALSE)

lm_test_haplotypes_per_bloc = function(blocs, haplotype_combined, sample_index_file, Y_filtred, output_chromosome, kind='all'){

  # change Y_filtred column name
  colnames(Y_filtred) = c('IID', 'phenotype')

  # get only the haplotypes corresponding to bloc got as input
  columns_blocs = str_subset(colnames(haplotype_combined), blocs)

  # filter the haplotypes corresponding to the bloc
  haplotype_one_bloc = haplotype_combined[, columns_blocs]

  # This is done in order to be sure that the subject are well alligned

  # get the index columns
  setDT(haplotype_one_bloc, keep.rownames = TRUE)[]

  # merge the index file and the haplotypes using the index
  haplotype_one_bloc$rn <- as.integer(haplotype_one_bloc$rn)
  df_tmp_merged = merge(sample_index_file, haplotype_one_bloc, by.x='ID_2', by.y="rn")

  # merge the df obtained above with the phenotype using the ID
  df_merged = merge(df_tmp_merged, Y_filtred, by.x = 'ID_2', by.y = 'IID')

  # set X, Y and run the haplotypique test
  X = df_merged[, columns_blocs]
  Y = data.frame(phenotype = df_merged[, 'phenotype'])

  # perform the test
  results = lm_test_haplotypes(X, Y, kind=kind)

  # save the results
  write.table(data.frame(results$bloc), sprintf('%s/bloc_test_results/%s.tsv', output_chromosome, blocs), sep="\t", row.names=FALSE, quote=FALSE)
  write.table(data.frame(results$complete), sprintf('%s/complete_test_results/%s.tsv', output_chromosome, blocs), sep="\t", row.names=FALSE, quote=FALSE)
  write.table(data.frame(results$single), sprintf('%s/single_test_results/%s.tsv', output_chromosome, blocs), sep="\t", row.names=FALSE, quote=FALSE)

  return(results)
}

Residualisation

If you need to residualise your phenotypes, please refer to this code as an example:

covariates = fread('/neurospin/ukb/workspace/pheno_boxcox/covar_SexAgeMRI_10PCSArray_19k.cov')
X_covariates_names = c("IID", "Array", "Sex", "Age", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8", "PC9", "PC10")

phenotypes_sulci = fread("/neurospin/ukb/workspace/pheno_boxcox/all_sulci_boxcox.tsv")
phenotype_FCMpost_left = phenotypes_sulci[, c('IID', 'FCMpost_left')]

Y_residualised = phenotype_residualisation(phenotype=phenotype_FCMpost_left, covariates=covariates[, ..X_covariates_names], verbose=FALSE)