#' Sliding window to capture the high frequent ROH region
#'
#'
#' This function is used to summarize ROH by window size
#' Contain three functions
#' First is to generate sliding windows by the customized size for all Chromosomes
#' Second is to find out the overlap between sliding windows and roh by Chromosome, Start and End position
#' Third is to summary results, count the roh frequency in each sliding window
#'
#'
#' @param roh roh file from plink results, or generated by cnv_clean for CNVPartition results
#' @param window_size the length of sliding windows for dividing the chromosome, the unit is Mb
#' @param max_chr The maximum number of chromosome
#' @param length_group Vector, must be 5 integer unit in 'Mb'. The default value are 1, 2, 4, 8 and 16 Mb, used for category the length group of ROH
#' @param length_autosomal Important, total autosomal length of the corresponding species, used in calculated the F_ROH
#' @param threshold the proportion used in the roh windows distribution plot. For example 1000 samples, if threshold set as 0.5,
#' and 1000*0.5 = 500, the value of histogram above 500 will be color as red and the group below 500 will filling with other color
#' @param folder set name of new created folder
#' @param col_higher set the color of bar which frequency higher than threshold in high frequent ROH plot
#' @param col_lower set the color of bar which frequency lower than threshold in high frequent ROH plot
#' @param height_1 set the height of final high frequent ROH plot, units ='cm'
#' @param width_1 set the width of final high frequent ROH plot, units = 'cm'
#' @param legend_x set position of legend, 0 to 1 indicate from left to right
#' @param legend_y set position of legend, 0 to 1 indicate from bottom to top
#' @param ncol_1 set how many columns to present in facet plot
#' @param x_label customize the label of x axis in ROH plot, default text is "Physical Position (Mb)"
#' @param y_label customize the label of y axis in ROH plot, default text is "Number of Samples"
#'
#' @import dplyr
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom scales unit_format
#' @importFrom stats sd
#' @importFrom utils read.table write.table
#'
#' @return
#' Summary results of ROH by sliding windows across all chromosomes.
#' @export roh_window
#'
roh_window <- function(roh, window_size = 1, max_chr = 29, length_autosomal = 2489.386, threshold = 0.3, length_group = c(1, 2, 4, 8, 16), folder = "roh_window", col_higher = "red", col_lower = "black", height_1 = 18, width_1 = 21, legend_x = 0.9, legend_y = 0.1, ncol_1 = 6, x_label = "Physical Position (Mb)", y_label = "Number of Samples") {
if(!(dir.exists(folder))){
dir.create(folder)
print(paste0("New foler ", folder, " was create in working directory."))
}
###################################################################################
#1.set module of windows for single chromosome
window_module <- function(chr, size, chr_length){
n_interval <- ceiling(chr_length/size) #calculate the number of interval to generate
Chr <- rep(x = chr, times = n_interval)
if (chr_length %% size == 0){
Start <- seq(from = 0, to = chr_length - size, by = size)
End <- seq(from = size, to = chr_length, by = size)
} else{
Start <- seq(from = 0, to = chr_length, by = size)
End <- seq(from = size, to = chr_length + size, by = size)
}
window <- data.frame(Chr, Start, End)
return(window)
}
#2.call all windows
set_window <- function(win_size){
# ars <- data.frame("chr" = c(29:1),
# "length" = c( 51.098607, 45.94015, 45.612108, 51.992305,
# 42.350435, 62.317253, 52.498615, 60.773035, 69.862954,
# 71.974595, 63.449741, 65.820629, 73.167244, 81.013979,
# 85.00778, 82.403003, 83.472345, 87.216183, 106.982474,
# 103.308737, 105.454467, 113.31977, 110.682743, 117.80634,
# 120.089316, 120.000601, 121.005158, 136.231102, 158.53411))
# extract chr and maximum length from roh data
chr_len <- roh %>%
group_by(Chr) %>%
summarise(length = max(End) / 1000000) %>%
arrange(as.numeric(Chr)) %>%
select(Chr, length)
windows <- data.frame()
for(i in 1:nrow(chr_len)){
window <- window_module(chr = chr_len$Chr[i], chr_length = chr_len$length[i], size = win_size)
windows <- rbind(windows, window)
}
windows <- dplyr::arrange(windows, Chr, Start) %>%
mutate(window_id = paste0("Win_", seq(from = 1, to = nrow(windows), by = 1))) %>%
mutate(Start_win = Start*1000000, End_win = End*1000000) %>%
select(c("Chr", "Start_win", "End_win", "window_id"))
windows <- data.table::setDT(windows)
return(windows)
}
#3. plot roh
plot_roh <- function(roh_freq, threshold, col_higher = "red", col_lower = "blue", legend_x = 0.9, legend_y = 0.2, ncol_1 = 6, x_label = x_label, y_label = y_label){
roh_freq = roh_freq
roh_freq <- roh_freq %>%
mutate(Group = if_else(prop_roh >= threshold, true = "group_1", false = "group_2"))
#png(res = 300, filename = "roh_window.png",width = 4000, height = 2000, bg = "transparent")
color_group <- c(group_1 = col_higher, group_2 = col_lower)
ggplot(roh_freq, aes(x = Start_win, y = number_roh, fill = Group)) +
geom_col() +
#geom_hline(yintercept = 100) +
theme_classic() +
theme(legend.position = c(legend_x, legend_y)) +
scale_fill_manual(values = color_group,name = "Proportion",
labels = c(paste0(">=", threshold*100, "%"),
paste0("<", threshold*100, "%"))) +
scale_x_continuous(labels = scales::unit_format(scale = 1e-6, unit = NULL)) +
facet_wrap(~as.factor(Chr), scales = "free_x", ncol = ncol_1) +
labs(x = x_label, y = y_label)
#dev.off()
}
# 4 function to unite ROH
# only three columns required, Chr, Start, End
merge_roh <- function(roh_input) {
if (nrow(roh_input) == 1) {
return(roh_input)
}
roh_input <- roh_input[order(roh_input$Chr, roh_input$Start_win),]
roh_union = roh_input[1, ]
for (i in 2:nrow(roh_input)) {
rest_roh <- roh_input[i, ]
#note the accuracy of interval
if (all.equal(roh_union$End_win[nrow(roh_union)], rest_roh$Start_win) == TRUE) {
roh_union$End_win[nrow(roh_union)] <- rest_roh$End_win
} else {
roh_union <- bind_rows(roh_union, rest_roh)
}
}
return(roh_union)
}
#5. sub_function to unite roh from high frequent roh list
#process by chromosome, then combine all together
unite_roh_bychr <- function(high_freq){
#build an empty table to save results
roh_uni <- data.frame()
#get the unique chr number, it will used in next for loop
chr <- sort(unique(high_freq$Chr))
#unite roh by chromosome
for (i in chr){
roh_chr <- high_freq[which(high_freq$Chr == i), ]
roh_chr <- merge_roh(roh_input = roh_chr)
roh_uni <- rbind(roh_uni, roh_chr)
cat(paste0("High frequency ROH regions on Chromosome ", i, " has been processed.\n"))
}
return(roh_uni)
}
#4. fine mapping ROH regions
#this function used to uniting adjacent roh
unite_roh <- function(roh_freq, high_freq_threshold){
#read roh data, default list from HandyCNV
#roh_freq <- read.table(file = roh_freq, header = T, sep = "\t")
high_freq <- roh_freq %>%
filter(prop_roh >= high_freq_threshold) %>%
select(Chr, Start_win, End_win)
#unite roh regions
high_freq_union <- unite_roh_bychr(high_freq = high_freq)
return(high_freq_union)
}
####################################################################################
if(typeof(roh) == "character"){
roh <- fread(file = roh)
} else {
roh = roh
}
#check if the input is a Plink results
#convert to the standard format if it is
plink_roh_names <- c("FID", "IID", "PHE", "CHR", "SNP1", "SNP2", "POS1", "POS2",
"KB", "NSNP", "DENSITY", "PHOM", "PHET")
if(length(colnames(roh)) == length(plink_roh_names)){
cat("ROH with input file in PLINK format was detected, coverting to HandyCNV standard formats\n")
colnames(roh) <- c("FID", "Sample_ID", "PHE", "Chr", "Start_SNP", "End_SNP",
"Start", "End", "Length", "Num_SNP", "DENSITY", "PHOM", "PHET")
handycnv_name <- c("Sample_ID", "Chr", "Start", "End", "Num_SNP", "Length", "Start_SNP", "End_SNP")
roh <- roh %>% select(handycnv_name) %>% mutate(CNV_Value = 2)
roh$Length <- roh$Length * 1000 #because the unit is kb in default plink results
}
roh <- roh %>% filter(Chr >=1 & Chr <= max_chr)
roh <- data.table::setDT(roh)
#length group
cat("To group by length...\n")
len_int <- as.vector(length_group)
len_int_bp <- len_int * 1000000 #convert to Mb
roh_length <- roh %>%
mutate(Length_group = case_when(Length < len_int_bp[1] ~ paste0("<", len_int[1], "Mb"),
Length >= len_int_bp[1] & Length < len_int_bp[2] ~ paste0(len_int[1], "-", len_int[2], "Mb"),
Length >= len_int_bp[2] & Length < len_int_bp[3] ~ paste0(len_int[2], "-", len_int[3], "Mb"),
Length >= len_int_bp[3] & Length < len_int_bp[4] ~ paste0(len_int[3], "-", len_int[4], "Mb"),
Length >= len_int_bp[4] & Length < len_int_bp[5] ~ paste0(len_int[4], "-", len_int[5], "Mb"),
Length >= len_int_bp[5] ~ paste0(">=", len_int[5], "Mb")))
length_summary <- roh_length %>%
group_by(Length_group) %>%
summarise(number_ROH = n(),
Mean = round(mean(Length, na.rm = T), digits = 0),
SD = round(sd(Length, na.rm = T), digits = 0),
Min = min(Length, na.rm = T),
Max = max(Length, na.rm = T)) %>%
mutate(Proportion = round(number_ROH / sum(number_ROH), digits = 3),
N_per_animal = round(number_ROH / length(unique(roh$Sample_ID)), digits = 1))
cat("The summary of length groups as following: \n")
print(length_summary)
fwrite(length_summary, file = paste0(folder, "/length_group_summary.txt"), sep = "\t", quote = FALSE)
#summary of roh for individual
cat("Calculating inbreeding coefficient...\n")
if(all(len_int == c(1, 2, 4, 8, 16))){
cat("by length group...\n")
indiv_roh <- roh_length %>%
group_by(Sample_ID) %>%
summarise(n = n_distinct(Start),
total_length = sum(Length)/1000000,
Mean = round(mean(Length) /1000000, digits = 2),
SD = round(sd(Length)/1000000, digits = 2),
min_length = min(Length)/1000000,
max_length = max(Length)/1000000) %>%
mutate(Froh_1Mb = round(total_length/length_autosomal, digits = 3))
inbreeding_roh_2mb <- roh_length %>%
filter(!Length_group == "<1Mb" & !Length_group == "1-2Mb") %>%
group_by(Sample_ID) %>%
summarise(Froh_2Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))
inbreeding_roh_4mb <- roh_length %>%
filter(!Length_group == "<1Mb" & !Length_group == "1-2Mb" & !Length_group == "2-4Mb") %>%
group_by(Sample_ID) %>%
summarise(Froh_4Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))
inbreeding_roh_8mb <- roh_length %>%
filter(!Length_group == "<1Mb" & !Length_group == "1-2Mb" & !Length_group == "2-4Mb" & !Length_group == "4-8Mb") %>%
group_by(Sample_ID) %>%
summarise(Froh_8Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))
inbreeding_roh_16mb <- roh_length %>%
filter(Length_group == ">=16Mb") %>%
group_by(Sample_ID) %>%
summarise(Froh_16Mb = round(sum(Length) / length_autosomal / 1000000, digits = 3))
indiv_roh_inbreeding <- indiv_roh %>%
left_join(x = ., y = inbreeding_roh_2mb, by = "Sample_ID") %>%
left_join(x = ., y = inbreeding_roh_4mb, by = "Sample_ID") %>%
left_join(x = ., y = inbreeding_roh_8mb, by = "Sample_ID") %>%
left_join(x = ., y = inbreeding_roh_16mb, by = "Sample_ID")
cat("Inbreeding coefficient of Individual as following:\n")
print(indiv_roh_inbreeding)
fwrite(indiv_roh_inbreeding, file = paste0(folder, "/indiv_roh_inbreeding.txt"), sep = "\t", quote = FALSE)
} else {
cat("by 1Mb length...\n")
indiv_roh <- roh_length %>%
group_by(Sample_ID) %>%
summarise(n = n_distinct(Start),
total_length = sum(Length)/1000000,
Mean = round(mean(Length) /1000000, digits = 2),
SD = round(sd(Length)/1000000, digits = 2),
min_length = min(Length)/1000000,
max_length = max(Length)/1000000) %>%
mutate(Froh_1Mb = round(total_length/length_autosomal, digits = 3))
cat("Inbreeding coefficient of Individual as following:\n")
print(indiv_roh)
fwrite(indiv_roh, file = paste0(folder, "/indiv_roh.txt"), sep = "\t", quote = FALSE)
}
total_n_sample <- length(unique(roh$Sample_ID))
cat(paste0("Setting the sliding window in ", window_size, " Mb interval...\n"))
w <- set_window(win_size = window_size)
setkey(w, Chr, Start_win, End_win)
win_over <- data.table::foverlaps(x = roh, y = w, by.x = c("Chr", "Start", "End"), type = "any")
cat("Counting the frequency of ROHs that overlapped to sliding windows...\n")
roh_freq <- win_over %>%
group_by(window_id) %>%
summarise(number_roh = n_distinct(Sample_ID)) %>%
merge(., y = w, by = "window_id") %>% #reassign the position for windows
mutate(prop_roh = round(number_roh / total_n_sample, 3)) %>%
arrange(-number_roh)
cat("The top 10 ROHs by frequency are as follows:\n")
print(roh_freq[1:10, ])
fwrite(roh_freq, file = paste0(folder, "/roh_freq.txt"), sep = "\t", quote = FALSE)
#unite high frequency ROH region
cat("Uniting the adjacent sliding windows that with ROH frequency greater than ", threshold * 100, "% of the sample size...\n")
hfroh <- unite_roh(roh_freq = roh_freq, high_freq_threshold = threshold)
if(nrow(hfroh) >= 1){
hfroh <- hfroh %>%
mutate(Interval_ID = paste0("ROH_", seq(1, nrow(hfroh), 1)),
Length = round(End_win - Start_win + 1, digits = 0)) %>%
select(Interval_ID, Chr = Chr, Start = Start_win, End = End_win, Length)
print(hfroh)
} else {
cat("No ROH regions was passed ", threshold * 100, "% threshold.\n")
}
cat(paste0("The total length of high frequency ROH regions about ", round(sum(hfroh$Length), digits = 0), " bp.\n"))
write.table(x = hfroh, file = paste0(folder, "/high_freq_roh.txt"), quote = F, row.names = F, col.names = T, sep = "\t")
#plot roh
cat("Plotting frequncy of ROH by windows...\n")
png(res = 350,filename = paste0(folder, "/roh_freq_windows.png"), width = width_1, height = height_1, bg = "transparent", units = "cm")
windows_plot <- plot_roh(roh_freq = roh_freq,
threshold = threshold,
col_higher = col_higher,
col_lower = col_lower,
legend_x = legend_x,
legend_y = legend_y,
ncol_1 = ncol_1, x_label = x_label, y_label = y_label)
print(windows_plot)
dev.off()
#plot_roh(roh_freq = roh_freq, threshold = threshold)
#return(hfroh)
cat("Task done.\n")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.