#' Calculates SNP-SNP association statistics and perform exploratory analyses
#' within pathways
#'
#' @param pop_one (char) character code for the first population (e.g., CEU).
#' @param pop_two (char) character code for the second population (e.g., YRI).
#' @param snp2gene_file (char) path to file with snp-to-gene mappings
#' (output of SNP2gene.R).
#' @param ASSOC_FDR (interger) FDR value to define significant pathway coevolution
#' (default=0.05).
#' @param enrich_folder (char) path to selection-enriched pathway SNP lists.
#' @param unenrich_folder (char) path to unenriched pathway SNP lists.
#' @param output_folder (char) path to output directory.
#'
#' @return none
#' @export
#'
SNPassocWPM <- function(pop_one, pop_two, snp2gene_file, ASSOC_FDR=0.05,
enrich_folder, unenrich_folder, output_folder) {
# Read in snp2gene file to assign SNPs to genes
snp2gene <- read.table(snp2gene_file, h=FALSE, as.is=TRUE)
snp2gene <- snp2gene[,-3]
# Calculate SNP-SNP association statistics
cat("Calculating SNP-SNP association within pathways\n")
# NOTE: argument 'depth' specifies the max. separation b/w pairs of SNPs
# to be considered, so that depth=1 would specify calculation of LD b/w
# immediately adjacent SNPs. For our purposes we want to determine LD
# b/w all SNPs in each pathway despite their distance from each other,
# so we specify depth in ld() as ((number of SNPs)-1)
calcAssoc <- function(pathway_set, population_cohort) {
# Initiate vectors and lists to fill up with data
ss <- c(); snp_map <- c()
snp_assoc <- list(); pairwise_df <- list(); diff_df <- list()
# Read PLINK files for each pathway
if (pathway_set == "enriched") { snp_folder <- enrich_folder }
if (pathway_set == "unenriched") { snp_folder <- unenrich_folder }
# Define pathway files
bed <- list.files(path=snp_folder, pattern="*.bed", full.names=TRUE)
bim <- list.files(path=snp_folder, pattern="*.bim", full.names=TRUE)
fam <- list.files(path=snp_folder, pattern="*.fam", full.names=TRUE)
# Stop if files are missing
if (!length(bed) | !length(bim) | !length(fam)) {
stop("Pathway files are missing (did you run writePathFiles.R?)\n")
} else {
cat(sprintf("\nReading in %s %s pathways\n", length(bed), pathway_set))
}
# Skip SNP assocation calculation for pathways with <= 1 SNP
skip <- sapply(bim, readLines)
skip <- which(sapply(skip, length) <= 1)
if (length(skip) > 0) {
cat(sprintf("Skipping LD calculation for pathway %s. Not enough SNPs\n\n", skip))
bed <- bed[-skip]
bim <- bim[-skip]
fam <- fam[-skip]
} else {
cat("All pathways have enough SNPs for LD calculation. Commencing...\n\n")
}
Sys.sleep(3)
# Run SNP-SNP analysis for each pathway
for (i in seq_along(bim)) {
# Convert PLINK files to snpStats input format
# Output object is a list with 3 elements ($genotypes, $fam, $map)
# NOTE: order is important!
path_name <- gsub("\\..*", "", basename(bim[i]))
cat(sprintf("(%s) %s\n", i, path_name))
ss[[i]] <- read.plink(bed[i], bim[i], fam[i])
# Subset genotypes by population
pop_code <- ifelse(population_cohort == pop_one, 1, 2)
pop_genotypes <- which(ss[[i]]$fam$affected == pop_code)
ss[[i]]$genotypes <- ss[[i]]$genotypes[pop_genotypes,]
cat(sprintf("* Subsetting genotypes for %s population (%s genotypes)\n",
population_cohort, length(pop_genotypes)))
print(ss[[i]]$genotypes)
# Calculate SNP association for all SNPs in pathway
snp_assoc[[i]] <- ld(ss[[i]]$genotypes,
stats="R.squared",
depth=ncol(ss[[i]]$genotypes)-1)
snp_map <- ss[[i]]$map # genomic location of each SNP
# Covert association matrix into a data frame
assoc_df <- as.matrix(snp_assoc[[i]]) # convert sparseMatrix to regular matrix
lowerTriangle(assoc_df, byrow=FALSE, diag=TRUE) <- NA # replace lower triangle and diag with NA
assoc_df <- melt(assoc_df)
colnames(assoc_df)[3] <- "Rsquared"
cat(sprintf("* Calculating r2 association for all SNP-SNP pairs (%s pairs)\n",
nrow(assoc_df)))
# Generate pariwise distance table for each SNP-SNP pair
colnames(assoc_df)[1] <- "snp.name"
snp_map <- subset(snp_map, select=c("snp.name", "chromosome", "position"))
pairwise <- merge(snp_map, assoc_df, by="snp.name")
colnames(pairwise)[1:4] <- c("snp_1", "chr_1", "pos_1", "snp.name")
pairwise <- merge(snp_map, pairwise, by="snp.name")
colnames(pairwise) <- c("snp_1", "chr_1", "pos_1", "snp_2", "chr_2", "pos_2", "Rsquared")
pairwise$dist <- abs(pairwise$pos_1 - pairwise$pos_2)
pairwise$pathway <- path_name
# Assign SNPs to genes
colnames(snp2gene) <- c("snp_1", "gene_1")
pairwise <- left_join(pairwise, snp2gene, by="snp_1")
colnames(snp2gene) <- c("snp_2", "gene_2")
pairwise <- left_join(pairwise, snp2gene, by="snp_2")
# Select for SNP-SNP pairs on different chromosomes
pairwise_df[[i]] <- pairwise
diff_df[[i]] <- filter(pairwise_df[[i]], chr_1 != chr_2)
cat(sprintf("* Subsetting for trans-chromosomal SNP-SNP pairs (%s pairs)\n\n",
nrow(diff_df[[i]])))
}
# Generate summary files
all_pairs_df <- do.call("rbind", pairwise_df)
diff_pairs_df <- do.call("rbind", diff_df)
cat(sprintf("Completed SNP-SNP association analysis (%s, %s, %s pathways)\n",
population_cohort, pathway_set, length(bim)))
cat(sprintf(" --> %i total trans-chromosomal pairs\n\n", nrow(diff_pairs_df)))
# Prepare for output
diff_pairs_df$population <- population_cohort
diff_pairs_df$set <- pathway_set
# Return out
return(diff_pairs_df)
}
# Calculation of all pairwise SNP combinations
# n is the number of SNPs in each pathway, divided by 2 to account for
# duplicate pairs ("n choose 2")
# combos <- function(n) {
# return( n*(n-1)/2 ) }
# Calculate trans-chromosomal SNP association within enriched and
# unenriched pathways per population cohort
enrich_pop_one <- calcAssoc("enriched", pop_one)
enrich_pop_two <- calcAssoc("enriched", pop_two)
unenrich_pop_one <- calcAssoc("unenriched", pop_one)
unenrich_pop_two <- calcAssoc("unenriched", pop_two)
## SIGNIFICANCE CALCULATION
cat("\nDetermining significant within-pathway coevolution\n")
# Merge all results together
dat <- rbind(enrich_pop_one, unenrich_pop_one, enrich_pop_two, unenrich_pop_two)
# Initiate lists
enrich_path_ld <- list()
ks_pval <- list()
res <- list()
# Determine significance of trans-chromosomal SNP-SNP association
# per enriched pathway vs. cumulative set of unenriched pathways
# via the KS test (alternative=less{the CDF of x lies below+right of y})
calcSig <- function(population_cohort) {
# Separate data by enriched and unenriched pathways
enriched <- filter(dat, population == population_cohort & set == "enriched")
unenriched <- filter(dat, population == population_cohort & set == "unenriched")
# Prepare p-value dataframe
pval_df <- data.frame(
pathway=unique(enriched$pathway),
pvalue=NA,
qvalue=NA
)
# Calculate p-value per enriched pathway
for (path in unique(enriched$pathway)) {
test_path <- subset(enriched, pathway == path)
ks_test <-
suppressWarnings(
ks.test(
test_path$Rsquared,
unenriched$Rsquared,
alternative="less",
))
# Insert pvalue in pval_df
ks_pval <- ks_test$p.value
pval_df[which(pval_df$pathway == path), "pvalue"] <- ks_pval
}
# FDR-correct pvalues for multiple hypothesis testing
pval_df <- pval_df[order(pval_df$pvalue),]
pval_df$qvalue <- p.adjust(pval_df$pvalue, method="BH")
# Quantify number of significant pathways
n_sig <- filter(pval_df, qvalue <= ASSOC_FDR)
cat(sprintf("* Pathways with significant coevolution in %s population: %s\n",
population_cohort, nrow(n_sig)))
# Get summary statistics for final output dataframe
nsnp <- enriched %>% group_by(pathway) %>% summarise(n_snps=length(Rsquared), .groups="keep")
stat <- enriched %>% group_by(pathway) %>% summarise(mean_r2=mean(Rsquared, na.rm=TRUE), .groups="keep")
perc <- enriched %>% group_by(pathway) %>% summarise(perc_99=quantile(Rsquared, 0.99, na.rm=TRUE), .groups="keep")
pval_df <- reduce(list(nsnp, stat, perc, pval_df), left_join, by="pathway")
pval_df <- as.data.frame(pval_df)
pval_df <- pval_df[order(pval_df$qvalue),]
# Write out results
fname <- sprintf("%s/table_WPM_enrich_sig_coevolution_%s.txt", output_folder, population_cohort)
write.table(format(pval_df, digits=3), file=fname, col=TRUE, row=FALSE, quote=FALSE, sep="\t")
cat(sprintf("** Table of pathway signifiance values written to %s\n", fname))
# Return out results
return(pval_df)
}
# Run on both populations
results_pop_one <- calcSig(population_cohort=pop_one)
results_pop_two <- calcSig(population_cohort=pop_two)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.