R/roh_window.R

Defines functions roh_window

Documented in roh_window

#' Sliding window to capture the high frequent ROH region
#'
#'
#' This function is used to summarize ROH by window size
#' Contain three functions
#' First is to generate sliding windows by the customized size for all Chromosomes
#' Second is to find out the overlap between sliding windows and roh by Chromosome, Start and End position
#' Third is to summary results, count the roh frequency in each sliding window
#'
#'
#' @param roh roh file from plink results, or generated by cnv_clean for CNVPartition results
#' @param window_size the length of sliding windows for dividing the chromosome, the unit is Mb
#' @param max_chr The maximum number of chromosome
#' @param length_group Vector, must be 5 integer unit in 'Mb'. The default value are 1, 2, 4, 8 and 16 Mb, used for category the length group of ROH
#' @param length_autosomal Important, total autosomal length of the corresponding species, used in calculated the F_ROH
#' @param threshold the proportion used in the roh windows distribution plot. For example 1000 samples, if threshold set as 0.5,
#' and 1000*0.5 = 500, the value of histogram above 500 will be color as red and the group below 500 will filling with other color
#' @param folder set name of new created folder
#' @param col_higher set the color of bar which frequency higher than threshold in high frequent ROH plot
#' @param col_lower set the color of bar which frequency lower than threshold in high frequent ROH plot
#' @param height_1 set the height of final high frequent ROH plot, units ='cm'
#' @param width_1 set the width of final high frequent ROH plot, units = 'cm'
#' @param legend_x set position of legend, 0 to 1 indicate from left to right
#' @param legend_y set position of legend, 0 to 1 indicate from bottom to top
#' @param ncol_1 set how many columns to present in facet plot
#' @param x_label customize the label of x axis in ROH plot, default text is "Physical Position (Mb)"
#' @param y_label customize the label of y axis in ROH plot, default text is "Number of Samples"
#'
#' @import dplyr
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom scales unit_format
#' @importFrom stats sd
#' @importFrom utils read.table write.table
#'
#' @return
#' Summary results of ROH by sliding windows across all chromosomes.
#' @export roh_window
#'
roh_window <- function(roh, window_size = 1, max_chr = 29, length_autosomal = 2489.386, threshold = 0.3, length_group = c(1, 2, 4, 8, 16), folder = "roh_window", col_higher = "red", col_lower = "black", height_1 = 18, width_1 = 21, legend_x = 0.9, legend_y = 0.1, ncol_1 = 6, x_label = "Physical Position (Mb)", y_label = "Number of Samples") {
  if(!(dir.exists(folder))){
    dir.create(folder)
    print(paste0("New foler ", folder, " was create in working directory."))
  }
  ###################################################################################
  #1.set module of windows for single chromosome
  window_module <- function(chr, size, chr_length){
    n_interval <- ceiling(chr_length/size) #calculate the number of interval to generate

    Chr <- rep(x = chr, times = n_interval)

    if (chr_length %% size == 0){
      Start <- seq(from = 0, to = chr_length - size, by = size)
      End <- seq(from = size, to = chr_length, by = size)
    } else{
      Start <- seq(from = 0, to = chr_length, by = size)
      End <- seq(from = size, to = chr_length + size, by = size)
    }

    window <- data.frame(Chr, Start, End)
    return(window)
  }

  #2.call all windows
  set_window <- function(win_size){
    # ars <- data.frame("chr" = c(29:1),
    #                   "length" = c( 51.098607, 45.94015, 45.612108, 51.992305,
    #                                 42.350435, 62.317253, 52.498615, 60.773035, 69.862954,
    #                                 71.974595, 63.449741, 65.820629, 73.167244, 81.013979,
    #                                 85.00778, 82.403003, 83.472345, 87.216183, 106.982474,
    #                                 103.308737, 105.454467, 113.31977, 110.682743, 117.80634,
    #                                 120.089316, 120.000601, 121.005158, 136.231102, 158.53411))

    # extract chr and maximum length from roh data
    chr_len <- roh %>%
                  group_by(Chr) %>%
                  summarise(length = max(End) / 1000000) %>%
                  arrange(as.numeric(Chr)) %>%
                  select(Chr, length)

    windows <- data.frame()
    for(i in 1:nrow(chr_len)){
      window <- window_module(chr = chr_len$Chr[i], chr_length = chr_len$length[i], size = win_size)
      windows <- rbind(windows, window)
    }

    windows <- dplyr::arrange(windows, Chr, Start) %>%
      mutate(window_id = paste0("Win_", seq(from = 1, to = nrow(windows), by = 1))) %>%
      mutate(Start_win = Start*1000000, End_win = End*1000000) %>%
      select(c("Chr", "Start_win", "End_win", "window_id"))
    windows <- data.table::setDT(windows)
    return(windows)
  }

  #3. plot roh
  plot_roh <- function(roh_freq, threshold, col_higher = "red", col_lower = "blue", legend_x  = 0.9, legend_y = 0.2, ncol_1 = 6, x_label = x_label, y_label = y_label){
    roh_freq = roh_freq
    roh_freq <- roh_freq %>%
      mutate(Group = if_else(prop_roh >= threshold, true = "group_1", false = "group_2"))
    #png(res = 300, filename = "roh_window.png",width = 4000, height = 2000, bg = "transparent")
    color_group <- c(group_1 = col_higher,  group_2 = col_lower)
    ggplot(roh_freq, aes(x = Start_win, y = number_roh, fill = Group)) +
      geom_col() +
      #geom_hline(yintercept = 100) +
      theme_classic() +
      theme(legend.position = c(legend_x, legend_y)) +
      scale_fill_manual(values = color_group,name = "Proportion",
                        labels = c(paste0(">=", threshold*100, "%"),
                                   paste0("<", threshold*100, "%"))) +
      scale_x_continuous(labels = scales::unit_format(scale = 1e-6, unit = NULL)) +
      facet_wrap(~as.factor(Chr), scales =  "free_x", ncol = ncol_1) +
      labs(x = x_label, y = y_label)
    #dev.off()
  }

  # 4 function to unite ROH
  # only three columns required, Chr, Start, End
  merge_roh <- function(roh_input) {
    if (nrow(roh_input) == 1) {
      return(roh_input)
    }

    roh_input <- roh_input[order(roh_input$Chr, roh_input$Start_win),]
    roh_union = roh_input[1, ]

    for (i in 2:nrow(roh_input)) {
      rest_roh <- roh_input[i, ]

      #note the accuracy of interval
      if (all.equal(roh_union$End_win[nrow(roh_union)], rest_roh$Start_win) == TRUE) {
          roh_union$End_win[nrow(roh_union)] <- rest_roh$End_win
         } else {
           roh_union <- bind_rows(roh_union, rest_roh)
         }
    }
    return(roh_union)
  }


  #5. sub_function to unite roh from high frequent roh list
  #process by chromosome, then combine all together
  unite_roh_bychr <- function(high_freq){

    #build an empty table to save results
    roh_uni <- data.frame()

    #get the unique chr number, it will used in next for loop
    chr <- sort(unique(high_freq$Chr))

    #unite roh by chromosome
    for (i in chr){
      roh_chr <- high_freq[which(high_freq$Chr == i), ]
      roh_chr <- merge_roh(roh_input = roh_chr)
      roh_uni <- rbind(roh_uni, roh_chr)
      cat(paste0("High frequency ROH regions on Chromosome ", i, " has been processed.\n"))
    }

    return(roh_uni)
  }

  #4. fine mapping ROH regions
  #this function used to uniting adjacent roh
  unite_roh <- function(roh_freq, high_freq_threshold){
    #read roh data, default list from HandyCNV
    #roh_freq <- read.table(file = roh_freq, header = T, sep = "\t")
    high_freq <- roh_freq %>%
      filter(prop_roh >= high_freq_threshold) %>%
      select(Chr, Start_win, End_win)

    #unite roh regions
    high_freq_union <- unite_roh_bychr(high_freq = high_freq)

    return(high_freq_union)
  }
  ####################################################################################

  if(typeof(roh) == "character"){
    roh <- fread(file = roh)
  } else {
    roh = roh
  }

  #check if the input is a Plink results
  #convert to the standard format if it is
  plink_roh_names <- c("FID", "IID", "PHE", "CHR", "SNP1", "SNP2", "POS1", "POS2",
                       "KB", "NSNP", "DENSITY", "PHOM", "PHET")
  if(length(colnames(roh)) == length(plink_roh_names)){
    cat("ROH with input file in PLINK format was detected, coverting to HandyCNV standard formats\n")
    colnames(roh) <- c("FID", "Sample_ID", "PHE", "Chr", "Start_SNP", "End_SNP",
                       "Start", "End", "Length", "Num_SNP", "DENSITY", "PHOM", "PHET")
    handycnv_name <- c("Sample_ID",	"Chr", "Start", "End", "Num_SNP",	"Length", "Start_SNP",	"End_SNP")
    roh <- roh %>% select(handycnv_name) %>% mutate(CNV_Value = 2)
    roh$Length <- roh$Length * 1000 #because the unit is kb in default plink results
  }
  roh <- roh %>% filter(Chr >=1 & Chr <= max_chr)
  roh <- data.table::setDT(roh)

  #length group
  cat("To group by length...\n")
  len_int <- as.vector(length_group)
  len_int_bp <- len_int * 1000000 #convert to Mb
  roh_length <- roh %>%
                mutate(Length_group = case_when(Length < len_int_bp[1] ~ paste0("<", len_int[1], "Mb"),
                                                Length >= len_int_bp[1] & Length < len_int_bp[2] ~ paste0(len_int[1], "-", len_int[2], "Mb"),
                                                Length >= len_int_bp[2] & Length < len_int_bp[3] ~ paste0(len_int[2], "-", len_int[3], "Mb"),
                                                Length >= len_int_bp[3] & Length < len_int_bp[4] ~ paste0(len_int[3], "-", len_int[4], "Mb"),
                                                Length >= len_int_bp[4] & Length < len_int_bp[5] ~ paste0(len_int[4], "-", len_int[5], "Mb"),
                                                Length >= len_int_bp[5]  ~ paste0(">=", len_int[5], "Mb")))

  length_summary <- roh_length %>%
                    group_by(Length_group) %>%
                    summarise(number_ROH = n(),
                              Mean = round(mean(Length, na.rm = T), digits = 0),
                              SD = round(sd(Length, na.rm = T), digits = 0),
                              Min = min(Length, na.rm = T),
                              Max = max(Length, na.rm = T)) %>%
                    mutate(Proportion = round(number_ROH / sum(number_ROH), digits = 3),
                           N_per_animal = round(number_ROH / length(unique(roh$Sample_ID)), digits = 1))
  cat("The summary of length groups as following: \n")
  print(length_summary)

  fwrite(length_summary, file = paste0(folder, "/length_group_summary.txt"), sep = "\t", quote = FALSE)


  #summary of roh for individual
  cat("Calculating inbreeding coefficient...\n")
  if(all(len_int == c(1, 2, 4, 8, 16))){
    cat("by length group...\n")

    indiv_roh <- roh_length %>%
      group_by(Sample_ID) %>%
      summarise(n = n_distinct(Start),
                total_length = sum(Length)/1000000,
                Mean = round(mean(Length) /1000000, digits = 2),
                SD = round(sd(Length)/1000000, digits = 2),
                min_length = min(Length)/1000000,
                max_length = max(Length)/1000000) %>%
      mutate(Froh_1Mb = round(total_length/length_autosomal, digits = 3))


    inbreeding_roh_2mb <- roh_length %>%
      filter(!Length_group == "<1Mb" & !Length_group == "1-2Mb") %>%
      group_by(Sample_ID) %>%
      summarise(Froh_2Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))

    inbreeding_roh_4mb <- roh_length %>%
      filter(!Length_group == "<1Mb" & !Length_group == "1-2Mb" & !Length_group == "2-4Mb") %>%
      group_by(Sample_ID) %>%
      summarise(Froh_4Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))

    inbreeding_roh_8mb <- roh_length %>%
      filter(!Length_group == "<1Mb" & !Length_group == "1-2Mb" & !Length_group == "2-4Mb" & !Length_group == "4-8Mb") %>%
      group_by(Sample_ID) %>%
      summarise(Froh_8Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))

    inbreeding_roh_16mb <- roh_length %>%
      filter(Length_group == ">=16Mb") %>%
      group_by(Sample_ID) %>%
      summarise(Froh_16Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))

    indiv_roh_inbreeding <- indiv_roh %>%
                            left_join(x = ., y = inbreeding_roh_2mb, by = "Sample_ID") %>%
                            left_join(x = ., y = inbreeding_roh_4mb, by = "Sample_ID") %>%
                            left_join(x = ., y = inbreeding_roh_8mb, by = "Sample_ID") %>%
                            left_join(x = ., y = inbreeding_roh_16mb, by = "Sample_ID")

    cat("Inbreeding coefficient of Individual as following:\n")
    print(indiv_roh_inbreeding)
    fwrite(indiv_roh_inbreeding, file = paste0(folder, "/indiv_roh_inbreeding.txt"), sep = "\t", quote = FALSE)
  } else {
    cat("by 1Mb length...\n")
    indiv_roh <- roh_length %>%
      group_by(Sample_ID) %>%
      summarise(n = n_distinct(Start),
                total_length = sum(Length)/1000000,
                Mean = round(mean(Length) /1000000, digits = 2),
                SD = round(sd(Length)/1000000, digits = 2),
                min_length = min(Length)/1000000,
                max_length = max(Length)/1000000) %>%
      mutate(Froh_1Mb = round(total_length/length_autosomal, digits = 3))

    cat("Inbreeding coefficient of Individual as following:\n")
    print(indiv_roh)
    fwrite(indiv_roh, file = paste0(folder, "/indiv_roh.txt"), sep = "\t", quote = FALSE)
  }



  total_n_sample <- length(unique(roh$Sample_ID))

  cat(paste0("Setting the sliding window in ", window_size, " Mb interval...\n"))
  w <- set_window(win_size = window_size)

  setkey(w, Chr, Start_win, End_win)
  win_over <- data.table::foverlaps(x = roh, y = w, by.x = c("Chr", "Start", "End"), type = "any")

  cat("Counting the frequency of ROHs that overlapped to sliding windows...\n")
  roh_freq <- win_over %>%
    group_by(window_id) %>%
    summarise(number_roh = n_distinct(Sample_ID)) %>%
    merge(., y = w, by = "window_id") %>% #reassign the position for windows
    mutate(prop_roh = round(number_roh / total_n_sample, 3)) %>%
    arrange(-number_roh)


  cat("The top 10 ROHs by frequency are as follows:\n")
  print(roh_freq[1:10, ])
  fwrite(roh_freq, file = paste0(folder, "/roh_freq.txt"), sep = "\t", quote = FALSE)


  #unite high frequency ROH region
  cat("Uniting the adjacent sliding windows that with ROH frequency greater than ", threshold * 100, "% of the sample size...\n")
  hfroh <- unite_roh(roh_freq = roh_freq, high_freq_threshold = threshold)

  if(nrow(hfroh) >= 1){
    hfroh <- hfroh %>%
      mutate(Interval_ID = paste0("ROH_", seq(1, nrow(hfroh), 1)),
             Length = round(End_win - Start_win + 1, digits = 0)) %>%
      select(Interval_ID, Chr = Chr, Start = Start_win, End = End_win, Length)

    print(hfroh)
  } else {
    cat("No ROH regions was passed ", threshold * 100, "% threshold.\n")
  }

  cat(paste0("The total length of high frequency ROH regions about ", round(sum(hfroh$Length), digits = 0), " bp.\n"))
  write.table(x = hfroh, file = paste0(folder, "/high_freq_roh.txt"), quote = F, row.names = F, col.names = T, sep = "\t")


  #plot roh
  cat("Plotting frequncy of ROH by windows...\n")
  png(res = 350,filename = paste0(folder, "/roh_freq_windows.png"), width = width_1, height = height_1, bg = "transparent", units = "cm")
  windows_plot <- plot_roh(roh_freq = roh_freq,
                           threshold = threshold,
                           col_higher = col_higher,
                           col_lower = col_lower,
                           legend_x = legend_x,
                           legend_y = legend_y,
                           ncol_1 = ncol_1, x_label = x_label, y_label = y_label)
  print(windows_plot)
  dev.off()
  #plot_roh(roh_freq = roh_freq, threshold = threshold)
  #return(hfroh)
  cat("Task done.\n")
}
JH-Zhou/HandyCNV documentation built on Dec. 18, 2021, 12:25 a.m.