#' Convert coordinates of genomic intervals
#'
#' The input file should have at least three columns: Chr, Start and End.
#'
#' Quality control Principle of conversion
#' 1.End must larger than Start, i.e. the length of the interval must be a positive value
#' 2.Both the Start and End position must match rows in the def_tar_map file
#' 3.The difference of length between Converted Interval and Original Intervals should be less than threshold `len_diff_threshold`
#' 4.If allow to replace the missing points by nearest point, new interval should fit all above conditions. For Start position, matching proxy from down stream, for End position, matching proxy from up stream
#'
#' @param input_tar the interval list, the version of reference genome should be the same as one in the 'def_tar_map'.
#' @param input_def the interval list the version of reference genome should be the same as one in the 'def_tar_map'.
#' @param def_tar_map map file contains coordinates in both version of map. default file was generated by the 'convert_map' function.
#' @param proxy_range the range to match the nearest proxy position to missing location SNP
#' @param len_diff_threshold the maximum length difference allowed between the converted Intervals and original intervals
#' @param folder create new folder to save results
#'
#' @import dplyr
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#'
#' @return Interval with converted map
#' @export convert_coord
#'
convert_coord <- function(input_tar = NULL, input_def = NULL, def_tar_map, proxy_range = 20000, len_diff_threshold = 100000, folder = "convert_coord"){
if(!file.exists(folder)){
dir.create(folder)
print(paste0("A new folder ", folder, " was created in working directory."))
}
map <- fread(def_tar_map)
#check map file format
standard_map <- c("Name", "Chr_def", "Morgan_def", "Position_def", "Chr_tar", "Mb_tar", "Position_tar", "Match", "Type")
if(all(colnames(map) == standard_map)){
print("'def_tar_map' file passed input check...")
} else{
print("'def_tar_map' file failed input check! This file should be generated by the 'convert_map' funcion. The standard format is as follows:")
print(standard_map)
}
#convert from ARS to UMD
if (is.null(input_def)) {
cnvr <- fread(input_tar)
orig_colnames <- colnames(cnvr)
#first step, matching the start position
cnvr_start <- dplyr::left_join(x = cnvr, y = map, by = c("Chr" = "Chr_tar", "Start" = "Position_tar")) %>%
rename(Start_def = Position_def, Start_Match = Match) %>%
unique()
#second step, matching the end position
cnvr_end <- dplyr::left_join(x = cnvr_start, y = map, by = c("Chr" = "Chr_tar", "End" = "Position_tar")) %>%
rename(End_def = Position_def, End_Match = Match)
#third step, extract all information we need
cnvr_new_coord <- cnvr_end %>%
select(c(orig_colnames, "Start_def", "End_def", "Start_Match", "End_Match")) %>%
unique(.) %>%
replace(., is.na(.), values = "Missing") #replace the missing position as 'Missing'
if(!missing(proxy_range)){
#assign the nearest location for the missing position
#if missing of Start, assign the most nearest smaller position within 200 kb distance
#if missing of End, assign the most nearest larger position within 200 kb distance
missing_start <- which(cnvr_new_coord$Start_def == "Missing")
if(!(length(missing_start) == 0)){
print(paste0(length(missing_start), " start position are Missing in target map file, the most nearest larger position will be assigned"))
for(i in 1:length(missing_start)){
proxy_ponit <- map %>%
filter(Chr_tar == cnvr_new_coord$Chr[missing_start[i]] &
Position_tar >= cnvr_new_coord$Start[missing_start[i]] &
Position_tar < cnvr_new_coord$Start[missing_start[i]] + proxy_range) %>%
select(Position_def) %>%
filter(Position_def > 0)
if(nrow(proxy_ponit) == 0){
cnvr_new_coord$Start_def[missing_start[i]] <- "Missing"
} else{
cnvr_new_coord$Start_def[missing_start[i]] <- min(proxy_ponit$Position_def)
}
}
}
missing_end <- which(cnvr_new_coord$End_def == "Missing")
if(!(length(missing_end) == 0)){
print(paste0(length(missing_end), " end positions are Missing in target map file, the most nearest smaller position will be assigned"))
for(i in 1:length(missing_end)){
proxy_ponit <- map %>%
filter(Chr_tar == cnvr_new_coord$Chr[missing_end[i]] &
Position_tar < cnvr_new_coord$Start[missing_end[i]] &
Position_tar >= cnvr_new_coord$Start[missing_end[i]] - proxy_range) %>%
select(Position_def) %>%
filter(Position_def > 0)
if(nrow(proxy_ponit) == 0){
cnvr_new_coord$End_def[missing_end[i]] <- "Missing"
} else{
cnvr_new_coord$End_def[missing_end[i]] <- max(proxy_ponit$Position_def)
}
}
}
#summary the quality of conversion
print("Conversion Quality Check for Start Positions")
print(table(cnvr_new_coord$Start_Match))
print("Conversion Quality Check for End Positions")
print(table(cnvr_new_coord$End_Match))
}
cnvr_stat_1 <- cnvr_new_coord %>%
filter(!(End_def == "Missing" | Start_def == "Missing")) %>%
mutate(original_len = End - Start + 1,
converted_len = as.numeric(End_def) - as.numeric(Start_def) + 1,
length_diff = abs(converted_len - original_len),
qc_check = if_else(converted_len > 0 & Start_Match == "Match" & End_Match == "Match" & length_diff <= len_diff_threshold,
true = "Pass",
false = "Fail"))
cnvr_stat_2 <- cnvr_new_coord %>%
filter(End_def == "Missing" | Start_def == "Missing") %>%
mutate(original_len = End - Start + 1,
converted_len = "Missing",
length_diff = "Missing",
qc_check = "Fail")
cnvr_stat_all <- base::rbind(cnvr_stat_1, cnvr_stat_2)
cnvr_right_clean <- cnvr_stat_all %>%
filter(qc_check =="Pass") %>%
#select(c("CNVR_ID", "Chr", "Start_def", "End_def", "converted_len")) %>%
#rename(Start = "Start_def", End = "End_def", Length = "converted_len") %>%
setDT()
print(paste0(nrow(cnvr_right_clean), " Intervals passed quality check."))
print(table(cnvr_stat_all$qc_check))
#check total length change
clean_conv_len <- sum( as.integer(cnvr_right_clean$converted_len) )
clean_orig_len <- sum(cnvr_right_clean$original_len)
total_orig_len <- sum(cnvr_stat_all$original_len)
failed_convert_len <- total_orig_len - clean_orig_len
diff_clean_orig_conv <- clean_conv_len - clean_orig_len
summary_convert <- data.frame("Item" = c("Total Original", "Successful Original","Successful Conversion", "Failed Conversion", "Converted minus Original"),
"Length" = c(total_orig_len, clean_conv_len, clean_orig_len, failed_convert_len, diff_clean_orig_conv),
"Number" = c(nrow(cnvr_stat_all), nrow(cnvr_right_clean), "-", nrow(cnvr_stat_all)- nrow(cnvr_right_clean), "-"))
print(summary_convert)
fwrite(cnvr_stat_all, paste0(folder, "/converted_coord.txt"), sep ="\t", quote = FALSE)
fwrite(cnvr_right_clean, paste0(folder, "/converted_coord.correct"), sep = "\t", quote = FALSE)
fwrite(summary_convert, paste0(folder, "/summary.txt"), sep ="\t", quote = FALSE)
}
else {
cnvr = fread(input_def)
orig_colnames <- colnames(cnvr)
#convert from UMD to ARS
cnvr_start <- dplyr::left_join(x = cnvr, y = map, by =c("Chr" = "Chr_def", "Start" = "Position_def")) %>%
rename(Start_tar = Position_tar, Start_Match = Match)
#second step, matching the end position
cnvr_end <- dplyr::left_join(x = cnvr_start, y = map, by = c("Chr" = "Chr_def", "End" = "Position_def")) %>%
rename(End_tar = Position_tar, End_Match = Match)
#third step, extract all information we need
cnvr_new_coord <- cnvr_end %>%
select(c(orig_colnames, "Start_tar", "End_tar", "Start_Match", "End_Match")) %>%
unique(.) %>%
replace(., is.na(.), values = "Missing") #replace the missing position as 'Missing'
#assign the nearest location for the missing position
#if missing of Start, assign the most nearest larger position within 200 kb distance
#if missing of End, assign the most nearest larger position within 200 kb distance
if(!missing(proxy_range)){
missing_start <- which(cnvr_new_coord$Start_tar == "Missing")
if(!(length(missing_start) == 0)){
print(paste0(length(missing_start), " start positions are Missing in target map file, the most nearest larger position will be assigned"))
for(i in 1:length(missing_start)){
proxy_ponit <- map %>%
filter(Chr_def == cnvr_new_coord$Chr[missing_start[i]] &
Position_def >= cnvr_new_coord$Start[missing_start[i]] &
Position_def < cnvr_new_coord$Start[missing_start[i]] + proxy_range) %>%
select(Position_tar) %>%
filter(Position_tar > 0)
if(nrow(proxy_ponit) == 0){
cnvr_new_coord$Start_tar[missing_start[i]] <- "Missing"
} else{
cnvr_new_coord$Start_tar[missing_start[i]] <- min(proxy_ponit$Position_tar)
}
}
}
missing_end <- which(cnvr_new_coord$End_tar == "Missing")
if(!(length(missing_end) == 0)){
print(paste0(length(missing_end), " end positions are Missing in target map file, the most nearest smaller position will be assigned"))
for(i in 1:length(missing_end)){
proxy_ponit <- map %>%
filter(Chr_def == cnvr_new_coord$Chr[missing_end[i]] &
Position_def < cnvr_new_coord$Start[missing_end[i]] &
Position_def >= cnvr_new_coord$Start[missing_end[i]] - proxy_range) %>%
select(Position_tar) %>%
filter(Position_tar > 0)
if(nrow(proxy_ponit) == 0){
cnvr_new_coord$End_tar[missing_end[i]] <- "Missing"
} else{
cnvr_new_coord$End_tar[missing_end[i]] <- max(proxy_ponit$Position_tar)
}
}
}
#summary the quality of conversion
print("The quality check of converion on Start Position as below: FALSE means not match between two version.")
print(table(cnvr_new_coord$Start_Match))
print("The quality check of converion on End Position as below: FALSE means not match between two version.")
print(table(cnvr_new_coord$End_Match))
}
cnvr_stat_1 <- cnvr_new_coord %>%
filter(!(End_tar == "Missing" | Start_tar == "Missing")) %>%
mutate(original_len = End - Start + 1,
converted_len = as.numeric(End_tar) - as.numeric(Start_tar) + 1,
length_diff = abs(converted_len - original_len),
qc_check = if_else(converted_len > 0 & Start_Match == "Match" & End_Match == "Match" & length_diff <= len_diff_threshold,
true = "Pass",
false = "Fail"))
cnvr_stat_2 <- cnvr_new_coord %>%
filter(End_tar == "Missing" | Start_tar == "Missing") %>%
mutate(original_len = End - Start + 1,
converted_len = "Missing",
length_diff = "Missing",
qc_check = "Fail")
cnvr_stat_all <- base::rbind(cnvr_stat_1, cnvr_stat_2)
cnvr_right_clean <- cnvr_stat_all %>%
filter(qc_check =="Pass") %>%
#select(c("CNVR_ID", "Chr", "Start_tar", "End_tar", "converted_len")) %>%
#rename(Start = "Start_tar", End = "End_tar", Length = "converted_len") %>%
setDT()
print(paste0(nrow(cnvr_right_clean), " Intervals passed quality check."))
print(table(cnvr_stat_all$qc_check))
#check total length change
clean_conv_len <- sum( as.integer(cnvr_right_clean$converted_len) )
clean_orig_len <- sum(cnvr_right_clean$original_len)
total_orig_len <- sum(cnvr_stat_all$original_len)
failed_convert_len <- total_orig_len - clean_orig_len
diff_clean_orig_conv <- clean_conv_len - clean_orig_len
summary_convert <- data.frame("Item" = c("Total Original", "Succeed Original ","Succeed Convertion", "Failed convertion", "Difference Converted minus Original"),
"Length" = c(total_orig_len, clean_conv_len, clean_orig_len, failed_convert_len, diff_clean_orig_conv),
"Number" = c(nrow(cnvr_stat_all), nrow(cnvr_right_clean), "-", nrow(cnvr_stat_all)- nrow(cnvr_right_clean), "-"))
print(summary_convert)
fwrite(cnvr_stat_all, paste0(folder, "/converted_coord.txt"), sep ="\t", quote = FALSE)
fwrite(cnvr_right_clean, paste0(folder, "/converted_coord.correct"), sep = "\t", quote = FALSE)
fwrite(summary_convert, paste0(folder, "/summary.txt"), sep ="\t", quote = FALSE)
}
}
#' Title compare_interval
#' compare cnvr, each input file should contain Chr, Start and End columns,
#' the Chr column should be the number only, for example: 1 not chr1
#' @param interval_1 the interval list with ARS1.2 coordinates, not limited on ARS once given the right map file
#' @param interval_2 the interval list with UMD3.1 coordinates, not limited on UMD once given the right map file
#' @param map map file contains coordinates in both version of map. only need in comparison between the results from different versions. default file is generated from convert_map function
#'
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#'
#' @return overlap
#' @export compare_interval
#'
compare_interval <- function(interval_1, interval_2, map = NULL){
if(typeof(interval_1) == "character"){
interval_1 <- fread(file = interval_1, header = TRUE)
} else {
interval_1 = interval_1
}
interval_1$Chr <- as.factor(interval_1$Chr)
interval_1$Start <- as.integer(interval_1$Start)
interval_1$End <- as.integer(interval_1$End)
setDT(interval_1)
if(typeof(interval_2) == "character"){
interval_2 <- fread(file = interval_2, header = TRUE)
} else {
interval_2 = interval_2
}
interval_2$Chr <- as.factor(interval_2$Chr)
interval_2$Start <- as.integer(interval_2$Start)
interval_2$End <- as.integer(interval_2$End)
setDT(interval_2)
#set key for interval_1
setkey(interval_1, Chr, Start, End)
names(interval_2) <- paste0(colnames(interval_2),"_2")
#find overlap
overlap <- data.table::foverlaps(x = interval_2, y = interval_1, by.x = c("Chr_2", "Start_2", "End_2"), type = "any")
overlap <- overlap %>%
filter(Start > 0) %>%
mutate(overlap_len = pmin(End_2, End) - pmax(Start, Start_2) + 1)
fwrite(overlap, file = "overlap.txt", sep ="\t", quote = FALSE)
print("task done")
return(overlap)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.