R/genphys_fit.R

Defines functions genphys_fit

Documented in genphys_fit

# genphys_fit
#' Model the relationship between genetical and physical genome positions
#'
#' Fit a penalized thin plate regression spline (Wood, 2003) through genetic position in centiMorgan and physical genome positions in (Mega) base pairs.
#' @param MF.obj A mapfuser object with a reference map loaded and either a consensus map created with mapfuser or a genetic loaded with read.maps
#' @param type Fit physical genome positions vs. consensus genetic map made with mapfuser or an individual genetic map
#' @param map Name of the genetic map to use when the consensus map is not used for fitting a P-spline and recombination rate calculation
#' @param z discard invididual data points based on z-score threshold for scaled pearson residuals
#' @param chromosomes The chromosomes to fit a P-spline, default to all chromosomes
#' @return The input object is returned with added components recombination rate at a 0.1 Mbp interval,
#' the general additive model fit (gam) with penalized spline fits per chromosome,
#' and predictions of centiMorgan positions used for fitting. Markers that have been removed due to z-threshold are saved to the config slot.
#' @author Dennis van Muijen
#' @examples
#' \dontshow{
#' fpath <- system.file("extdata", package="mapfuser")
#' maps <- list.files(fpath, pattern = "Bur", full.names = TRUE)
#' MF.obj <- read_maps(mapfiles = maps, sep = ",", header = TRUE, type = "delim")
#' ref_file <- list.files(fpath, pattern = "reference", full.names = TRUE)
#' MF.obj <- read_ref(MF.obj = MF.obj, ref_file = ref_file, sep = ",", header = TRUE, type = "delim")
#' MF.obj <- genphys_fit(MF.obj,chromosomes = 1, map = "Col-0_Bur-0.csv", type = "map")
#' }
#' \dontrun{
#' MF.obj <- genphys_fit(MF.obj, type = "consensus", z = 5, chromosomes = 1:5, map = NULL)
#' MF.obj <- genphys_fit(MF.obj, type = "map", z = 5, chromosomes = 1:5, map = "Col-0_Blh-1.csv")
#' # Plot the result
#' plot(MF.obj, which = "mareymap", maps = "consensus", chr = 1:5)
#' }
#' @export

genphys_fit <- function(MF.obj, type = c("consensus", "map"), z = 5, chromosomes = NULL, map = NULL) {
  if (class(MF.obj) != "mapfuser") {
    stop("Object should be of class mapfuser")
  }
  if (is.null(MF.obj$ref_map)) {
    stop("Load a reference map first")
  }
  if (is.null(chromosomes)) {
    chromosomes <- MF.obj$config$chromosomes
  }
  if (type == "consensus") {
    df <- inner_join(MF.obj$ref_map, MF.obj$result$consensus_map, by = "Marker") %>%
      arrange(!! as.name("Chr"), !! as.name("Position_physical"))
  }
  if (type == "map") {
    map_frame <- MF.obj$input[[map]]
    df <- inner_join(MF.obj$ref_map, map_frame, by = "Marker") %>%
      arrange(!! as.name("Chr"), !! as.name("Position_physical"))
  }
  MF.obj$config$removed_markers <- NULL
  MF.obj$pspline$recombination_rate <- NULL
  MF.obj$pspline$gam_models <- list()
  MF.obj$pspline$gam_fitted <- NULL

  for (i in seq_along(chromosomes)) {
    df_i <- filter(df, (!!quote(LG)) == chromosomes[i] & (!!quote(Chr)) == chromosomes[i])
    ## Through error if dataset is small
    if (nrow(df_i) <= 5) {
      stop(paste0("Not enough data points to fit Pspline for chromosome ", chromosomes[i]))
    }
    k <- (nrow(df_i) / 2) %>%
      trunc()
    if (k > 25) {
      k <- 25
    }
    set.seed(987428)
    MF.obj$pspline$gam_models[[i]] <- gam(
      Position ~ s(Position_physical, bs = "tp", fx = FALSE, k = k),
      data = df_i, method = "GCV.Cp"
    )
    df_i$Predictions <- predict(MF.obj$pspline$gam_models[[i]], df_i)
    markers_toremove <- which(residuals(MF.obj$pspline$gam_models[[i]], type = "scaled.pearson") %>%
      abs() > z)
    while (length(markers_toremove) != 0) {
      df_i <- df_i[-markers_toremove, ]
      MF.obj$config$removed_markers <- rbind(MF.obj$config$removed_markers, df_i[markers_toremove, ])
      MF.obj$pspline$gam_models[[i]] <- NULL
      MF.obj$pspline$gam_models[[i]] <- gam(
        Position ~ s(Position_physical, bs = "tp", fx = FALSE, k = k),
        data = df_i, method = "GCV.Cp"
      )
      df_i$Predictions <- predict(MF.obj$pspline$gam_models[[i]], df_i)
      markers_toremove <- NULL
    }
    MF.obj$pspline$gam_fitted <- rbind(MF.obj$pspline$gam_fitted, df_i)
  }
  for (i in seq_along(chromosomes)) {
    rr_df <- filter(MF.obj$ref_map, (!!quote(Chr)) == chromosomes[i])
    r_rate <- data.frame(Position_physical = seq(min(rr_df$Position_physical), max(rr_df$Position_physical), by = 0.01))
    preds <- predict(MF.obj$pspline$gam_models[[i]], r_rate)
    names(preds) <- NULL
    r_rate$cM_per_Mbp <- c(diff(preds, differences = 1) * 10, NA)
    r_rate$Chr <- chromosomes[i]
    MF.obj$pspline$recombination_rate <- rbind(MF.obj$pspline$recombination_rate, r_rate)
  }
  MF.obj$pspline$recombination_rate$cM_per_Mbp[MF.obj$pspline$recombination_rate$cM_per_Mbp < 0] <- 0
  MF.obj$pspline$recombination_rate <- MF.obj$pspline$recombination_rate %>%
    na.omit()
  names(MF.obj$pspline$gam_models) <- chromosomes
  return(MF.obj)
}

Try the mapfuser package in your browser

Any scripts or data that you put into this service are public.

mapfuser documentation built on Oct. 10, 2017, 5:07 p.m.