R/haplo_selection.R

Defines functions haplo_selection

Documented in haplo_selection

#' Generate a selection of markers appropriate for haplotype generation
#'
#' This function applies a series of filters on the genotype data provided (either
#' in vcf or hapmap format) and outputs a list of markers kept at different stages
#' of the analysis.
#'
#' The behaviour of the function can be finely tuned by providing the function call
#' with an \code{analysis_parameters} object with appropriate contents.
#'
#' See function \code{genotype.filter} for a list of criteria used to filter the
#' markers kept for haplotype generation.
#'
#' So far, the function can only take hapmap and vcf files as input. The name of
#' the input file must be provided in the list of analysis parameters, but there
#' should also be an option allowing the user to provide a genotype object as input
#' to this function. This would allow the user an easier way to look at their input
#' data before carrying the analysis.
#'
#' @param analysis_parameters a list of analysis parameters generated by the function
#'   \code{build.analysis.parameters}.
#' @param snp_data To be completed.
#' @param verbose To be completed.
#'
#' @return A snp_data containing the subset of the chosen markers.
#'
#' @export
#'
#' @examples
#' NULL
#'
haplo_selection <- function(analysis_parameters, snp_data = NULL, verbose = TRUE) {

  # Reading input data according to format

  if(is.null(snp_data)) {
    # Read the data from file if no snp_data object has been provided
    if(analysis_parameters$File_format == "hapmap") {
      all_markers <- read_hapmap(filename = analysis_parameters$Input_file)

    } else if (analysis_parameters$File_format == "vcf") {
      all_markers <- read_vcf(filename = analysis_parameters$Input_file)

    } else {
      stop("File format should be vcf or hapmap. Other formats not recognized")
    }
  } else {
    # Otherwise the data is used directly
    all_markers <- snp_data
  }


  # Avoids problems later on if markers have numeric names
  if(is.numeric(all_markers[["Markers"]][["rs"]])) {
    all_markers[["Markers"]][["rs"]] <- paste0("X", all_markers[["Markers"]][["rs"]])
    colnames(all_markers[["Genotypes"]]) <- paste0("X", colnames(all_markers[["Genotypes"]]))
  }

  # Initializing the output object with analysis parameters
  results <- list(Parameters = analysis_parameters)

  # Adding kinship and structure data to the analysis (if provided)
  if(!is.null(analysis_parameters$Kinship_file)) {
    results[["Kinship"]] <- read_kinship(analysis_parameters$Kinship_file)
  } else {
    results[["Kinship"]] <- NULL
  }

  if(!is.null(analysis_parameters$Structure_file)) {
    results[["Structure"]] <- read_structure(analysis_parameters$Structure_file)
  } else {
    results[["Structure"]] <- NULL
  }

  # Adding the whole set of markers to the results object
  results[["All_markers"]] <- all_markers

  # Ensuring that the names are the same across the different objects
  # A warning should be issued here when the names are not the same
  if(!is.null(results[["Structure"]])) rownames(results[["Structure"]]) <- rownames(results[["All_markers"]][["Genotypes"]]@.Data)
  if(!is.null(results[["Kinship"]])) rownames(results[["Kinship"]]) <- rownames(results[["All_markers"]][["Genotypes"]]@.Data)

  # Store gene center in local variable for conciseness
  gene_center <- analysis_parameters$Gene_center

  # Remove markers on the wrong chromosome, heterozygous markers and alleles
  # with no alternate genotypes.
  filtered_markers <- genotype_filter(snp_data = all_markers,
                                      chrom = analysis_parameters$Gene_chromosome,
                                      center_pos = gene_center,
                                      max_distance_to_gene = analysis_parameters$Maximum_marker_to_gene_distance,
                                      max_missing_threshold = analysis_parameters$Maximum_missing_allele_frequency,
                                      max_het_threshold = analysis_parameters$Maximum_heterozygous_frequency,
                                      min_alt_threshold = analysis_parameters$Minimum_alternate_allele_frequency,
                                      min_allele_count = analysis_parameters$Minimum_allele_count,
                                      verbose = verbose
                                      )

  # The rest of the computations occur only if the "snp_data" is suitable.
  #  There should be an else statement that results in a warning to the user.
  #  Or else it should just be some kind of "stopifnot" statement which throws
  #  an error if there are no markers kept.

  if(is_valid(snp_data = filtered_markers, center_pos = gene_center)) {
    # Compute linkage disequilibrium
    # These results will be used for the clustering and use the non-corrected R2
    filtered_LD <- compute_ld(filtered_markers,
                              measure = analysis_parameters$Cluster_R2_measure,
                              kinship = results[["Kinship"]],
                              structure = results[["Structure"]])

    filtered_markers[["LD"]]         <- filtered_LD[["LD"]]
    filtered_markers[["LD_measure"]] <- filtered_LD[["LD_measure"]]

    #Adding the object containing filtered markers to the output
    results[["Filtered_markers"]] <- filtered_markers

    # Concatenate adjacent markers with LD = 1 and keep only the first such marker
    clustered_markers <- cluster(snp_data = filtered_markers,
                                 center_pos = gene_center,
                                 block_threshold = analysis_parameters$Marker_cluster_threshold)

    # Adding the corrected LD values to the object
    if(analysis_parameters$Cluster_R2_measure == analysis_parameters$R2_measure) {
      clustered_markers[["LD_measure"]] <- analysis_parameters$R2_measure
    } else {
      clustered_LD <- compute_ld(snp_data = clustered_markers,
                                 measure = analysis_parameters[["R2_measure"]],
                                 kinship = results[["Kinship"]],
                                 structure = results[["Structure"]])

      clustered_markers[["LD"]]         <- clustered_LD[["LD"]]
      clustered_markers[["LD_measure"]] <- clustered_LD[["LD_measure"]]
    }

    # Adding the clustered markers to the output
    results[["Clustered_markers"]] <- clustered_markers

    # Remove markers which have no LD > 0.5 across the gene center
    selected_clusters <-
      cluster_selection(snp_data = clustered_markers,
                        center_pos = gene_center,
                        ld_threshold = analysis_parameters$Marker_independence_threshold,
                        max_pair_distance = analysis_parameters$Maximum_flanking_pair_distance,
                        max_marker_distance = analysis_parameters$Maximum_marker_to_gene_distance)

    if(is_valid(snp_data = selected_clusters, center_pos = gene_center)) {
      results[["Selected_clusters"]] <- selected_clusters
      results[["Selected_markers"]] <- restore_markers(snp_original = filtered_markers,
                                                       snp_target   = results[["Selected_clusters"]])
      results[["Haplotypes"]] <- haplotypes(results[["Selected_markers"]],
                                            results[["Selected_clusters"]][["Markers"]],
                                            center_pos = gene_center)
    } else {
      warning("No markers were kept following the clustering phase.")
      results$Selected_clusters <- NA
      results$Selected_markers  <- NA
      results$Haplotypes        <- NA
      return(results)
    }

  } else {
    warning("No markers were kept following the filtering phase.")
    results$Clustered_markers <- NA
    results$Selected_clusters <- NA
    results$Selected_markers  <- NA
    results$Haplotypes        <- NA
    return(results)
  }

  return(results)
}
malemay/HaplotypeMiner documentation built on Feb. 6, 2024, 3:29 a.m.