#' Generate a selection of markers appropriate for haplotype generation
#'
#' This function applies a series of filters on the genotype data provided (either
#' in vcf or hapmap format) and outputs a list of markers kept at different stages
#' of the analysis.
#'
#' The behaviour of the function can be finely tuned by providing the function call
#' with an \code{analysis_parameters} object with appropriate contents.
#'
#' See function \code{genotype.filter} for a list of criteria used to filter the
#' markers kept for haplotype generation.
#'
#' So far, the function can only take hapmap and vcf files as input. The name of
#' the input file must be provided in the list of analysis parameters, but there
#' should also be an option allowing the user to provide a genotype object as input
#' to this function. This would allow the user an easier way to look at their input
#' data before carrying the analysis.
#'
#' @param analysis_parameters a list of analysis parameters generated by the function
#' \code{build.analysis.parameters}.
#' @param snp_data To be completed.
#' @param verbose To be completed.
#'
#' @return A snp_data containing the subset of the chosen markers.
#'
#' @export
#'
#' @examples
#' NULL
#'
haplo_selection <- function(analysis_parameters, snp_data = NULL, verbose = TRUE) {
# Reading input data according to format
if(is.null(snp_data)) {
# Read the data from file if no snp_data object has been provided
if(analysis_parameters$File_format == "hapmap") {
all_markers <- read_hapmap(filename = analysis_parameters$Input_file)
} else if (analysis_parameters$File_format == "vcf") {
all_markers <- read_vcf(filename = analysis_parameters$Input_file)
} else {
stop("File format should be vcf or hapmap. Other formats not recognized")
}
} else {
# Otherwise the data is used directly
all_markers <- snp_data
}
# Avoids problems later on if markers have numeric names
if(is.numeric(all_markers[["Markers"]][["rs"]])) {
all_markers[["Markers"]][["rs"]] <- paste0("X", all_markers[["Markers"]][["rs"]])
colnames(all_markers[["Genotypes"]]) <- paste0("X", colnames(all_markers[["Genotypes"]]))
}
# Initializing the output object with analysis parameters
results <- list(Parameters = analysis_parameters)
# Adding kinship and structure data to the analysis (if provided)
if(!is.null(analysis_parameters$Kinship_file)) {
results[["Kinship"]] <- read_kinship(analysis_parameters$Kinship_file)
} else {
results[["Kinship"]] <- NULL
}
if(!is.null(analysis_parameters$Structure_file)) {
results[["Structure"]] <- read_structure(analysis_parameters$Structure_file)
} else {
results[["Structure"]] <- NULL
}
# Adding the whole set of markers to the results object
results[["All_markers"]] <- all_markers
# Ensuring that the names are the same across the different objects
# A warning should be issued here when the names are not the same
if(!is.null(results[["Structure"]])) rownames(results[["Structure"]]) <- rownames(results[["All_markers"]][["Genotypes"]]@.Data)
if(!is.null(results[["Kinship"]])) rownames(results[["Kinship"]]) <- rownames(results[["All_markers"]][["Genotypes"]]@.Data)
# Store gene center in local variable for conciseness
gene_center <- analysis_parameters$Gene_center
# Remove markers on the wrong chromosome, heterozygous markers and alleles
# with no alternate genotypes.
filtered_markers <- genotype_filter(snp_data = all_markers,
chrom = analysis_parameters$Gene_chromosome,
center_pos = gene_center,
max_distance_to_gene = analysis_parameters$Maximum_marker_to_gene_distance,
max_missing_threshold = analysis_parameters$Maximum_missing_allele_frequency,
max_het_threshold = analysis_parameters$Maximum_heterozygous_frequency,
min_alt_threshold = analysis_parameters$Minimum_alternate_allele_frequency,
min_allele_count = analysis_parameters$Minimum_allele_count,
verbose = verbose
)
# The rest of the computations occur only if the "snp_data" is suitable.
# There should be an else statement that results in a warning to the user.
# Or else it should just be some kind of "stopifnot" statement which throws
# an error if there are no markers kept.
if(is_valid(snp_data = filtered_markers, center_pos = gene_center)) {
# Compute linkage disequilibrium
# These results will be used for the clustering and use the non-corrected R2
filtered_LD <- compute_ld(filtered_markers,
measure = analysis_parameters$Cluster_R2_measure,
kinship = results[["Kinship"]],
structure = results[["Structure"]])
filtered_markers[["LD"]] <- filtered_LD[["LD"]]
filtered_markers[["LD_measure"]] <- filtered_LD[["LD_measure"]]
#Adding the object containing filtered markers to the output
results[["Filtered_markers"]] <- filtered_markers
# Concatenate adjacent markers with LD = 1 and keep only the first such marker
clustered_markers <- cluster(snp_data = filtered_markers,
center_pos = gene_center,
block_threshold = analysis_parameters$Marker_cluster_threshold)
# Adding the corrected LD values to the object
if(analysis_parameters$Cluster_R2_measure == analysis_parameters$R2_measure) {
clustered_markers[["LD_measure"]] <- analysis_parameters$R2_measure
} else {
clustered_LD <- compute_ld(snp_data = clustered_markers,
measure = analysis_parameters[["R2_measure"]],
kinship = results[["Kinship"]],
structure = results[["Structure"]])
clustered_markers[["LD"]] <- clustered_LD[["LD"]]
clustered_markers[["LD_measure"]] <- clustered_LD[["LD_measure"]]
}
# Adding the clustered markers to the output
results[["Clustered_markers"]] <- clustered_markers
# Remove markers which have no LD > 0.5 across the gene center
selected_clusters <-
cluster_selection(snp_data = clustered_markers,
center_pos = gene_center,
ld_threshold = analysis_parameters$Marker_independence_threshold,
max_pair_distance = analysis_parameters$Maximum_flanking_pair_distance,
max_marker_distance = analysis_parameters$Maximum_marker_to_gene_distance)
if(is_valid(snp_data = selected_clusters, center_pos = gene_center)) {
results[["Selected_clusters"]] <- selected_clusters
results[["Selected_markers"]] <- restore_markers(snp_original = filtered_markers,
snp_target = results[["Selected_clusters"]])
results[["Haplotypes"]] <- haplotypes(results[["Selected_markers"]],
results[["Selected_clusters"]][["Markers"]],
center_pos = gene_center)
} else {
warning("No markers were kept following the clustering phase.")
results$Selected_clusters <- NA
results$Selected_markers <- NA
results$Haplotypes <- NA
return(results)
}
} else {
warning("No markers were kept following the filtering phase.")
results$Clustered_markers <- NA
results$Selected_clusters <- NA
results$Selected_markers <- NA
results$Haplotypes <- NA
return(results)
}
return(results)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.