#' Generate CNV Regions (CNVR)
#'
#' Generate CNVRs, summarize individual CNVR type, report frequency and type of CNVR
#'
#' @param clean_cnv a clean CNV file that was generated by the clean_cnv function
#' @param roh ROH file from the clean_cnv function; only CNVPartition results will generate ROH results
#' @param chr_set set the maximum number of chromosomes in the CNV list
#' @param folder set the name of the output folder.
#' @import dplyr
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom reshape2 dcast
#'
#' @return Four or five tables: CNVR list, brief summary, and individual CNV and CNVR ID.
#' @export call_cnvr
#'
call_cnvr <- function(clean_cnv, roh = NULL, chr_set = 29, folder = "cnvr") {
if(!file.exists(folder)){
dir.create(folder)
cat(paste0("Output folder '", folder, "' was created in the working directory.\n"))
}
if(typeof(clean_cnv) == "character"){
clean_cnv <- data.table::fread(file = clean_cnv, sep = "\t", header = TRUE)
} else {
clean_cnv = clean_cnv
}
check_colnames <- c("Sample_ID", "Chr", "Start", "End", "CNV_Value", "Length")
if(!all(check_colnames %in% names(clean_cnv))){
cat("Lack of necessary information. The input file shoud have at least five columns as follows:")
print(check_colnames)
stop("Standard input file should be generated by the 'clean_cnv' function!")
}
merge_cnvr <- function(cnv) {
if (nrow(cnv) == 1) {
return(cnv)
}
cnv <- cnv[order(cnv$Chr, cnv$Start),]
cnvr_union = cnv[1, ]
for (i in 2:nrow(cnv)) {
rest_cnv <- cnv[i, ]
if (cnvr_union$End[nrow(cnvr_union)] < rest_cnv$Start) {
cnvr_union <- bind_rows(cnvr_union, rest_cnv)
} else if (cnvr_union$End[nrow(cnvr_union)] == rest_cnv$Start) {
cnvr_union$End[nrow(cnvr_union)] <- rest_cnv$End
}
if (rest_cnv$End > cnvr_union$End[nrow(cnvr_union)]) {
cnvr_union$End[nrow(cnvr_union)] <- rest_cnv$End
}
}
return(cnvr_union)
}
cnvr <- data.frame()
for (i in 1:chr_set){
cnv_chr <- clean_cnv[which(clean_cnv$Chr == i), ]
if(nrow(cnv_chr) >= 1){
cnvr_chr <- merge_cnvr(cnv = cnv_chr)
cnvr <- rbind(cnvr, cnvr_chr)
cat(paste0("Chromosome ", i, " has been processed.\n"))
} else {
cat(paste0("No CNVs detected on Chromosome", i, ".\n"))
}
}
#extract CNVR information, recode for CNVR
cnvr_union <- cnvr[, c("Chr", "Start", "End")]
cnvr_union$CNVR_ID <- paste0("CNVR_", seq(1, nrow(cnvr_union), 1))
cnvr_union <- setDT(cnvr_union)
clean_cnv <- setDT(clean_cnv)
data.table::setkey(cnvr_union, Chr, Start, End)
cnv_cnvr <- data.table::foverlaps(clean_cnv, cnvr_union)
names(cnv_cnvr)[names(cnv_cnvr) == "i.Start"] <- "CNV_Start"
names(cnv_cnvr)[names(cnv_cnvr) == "i.End"] <- "CNV_End"
#add frequency of CNVR
cnvr_frequency <- cnv_cnvr %>%
group_by(CNVR_ID) %>%
summarize(Frequency = n(),
n_Sample = length(unique(Sample_ID)))
cnvr_union_f <- merge(cnvr_union, cnvr_frequency, by = "CNVR_ID", sort = F)
fwrite(cnvr_union_f, file = paste0(folder, "/no_freq_cnvr.txt"), sep = "\t", quote = FALSE)
if (is.null(roh)) {
#add type of CNVR
cnvr_type <- cnv_cnvr %>% group_by(CNVR_ID) %>% count(CNV_Value, name = "Count")
cnvr_type2 <- reshape2::dcast(cnvr_type, CNVR_ID ~ CNV_Value, value.var = "Count")
cnvr_type2$Type <- NA
#check cnv value completeness
cnv_value = c("0", "1", "3", "4")
if (!all(cnv_value %in% colnames(cnvr_type2))){
if (!(length(cnvr_type2$`0`) > 0)){
cnvr_type2$`0` <- NA
cat("No 0-copy type detected in CNV list.\n")
}
if (!(length(cnvr_type2$`1`) > 0)){
cnvr_type2$`1` <- NA
cat("No 1-copy type detected in CNV list.\n")
}
if (!(length(cnvr_type2$`3`) > 0)){
cnvr_type2$`3` <- NA
cat("No 3-copy type detected in CNV list.\n")
}
if (!(length(cnvr_type2$`4`) > 0)){
cnvr_type2$`4` <- NA
cat("No 4-copy type detected in CNV list.\n")
}
}
for (i in 1:nrow(cnvr_type2)) {
if (is.na(cnvr_type2[i, c("0")]) & is.na(cnvr_type2[i, c("1")])){
cnvr_type2$Type[i] <- "Gain"
}
else if (is.na(cnvr_type2[i, c("3")]) & is.na(cnvr_type2[i, c("4")])) {
cnvr_type2$Type[i] <- "Loss"
}
else{cnvr_type2$Type[i] <- "Mixed"}
}
cnvr_f_type <- merge(cnvr_union_f, cnvr_type2, sort = F)
cnvr_f_type$Length <- cnvr_f_type$End - cnvr_f_type$Start + 1
cat(paste0(nrow(cnvr_f_type), " CNVRs generated in total.\n"))
#The overall summary
cnvr_summary <- cnvr_f_type %>%
group_by(Type) %>%
summarise(N = n(), "Average Length" = round(mean(Length), digits = 0), "Min Length" = min(Length), "Max Length" = max(Length), "Total Length" = sum(Length, na.rm = T))
#Partial summary
cnvr_chr_summary <- cnvr_f_type %>%
group_by(Chr) %>%
summarise("Total Length" = sum(Length, na.rm = T), "Number of CNVR" = n())
cat("Overall summary of CNVRs:\n")
print(cnvr_summary)
cat("Summary of CNVRs on each Chromosome:\n")
print(cnvr_chr_summary)
fwrite(cnv_cnvr, file = paste0(folder, "/individual_cnv_cnvr.txt"), sep = "\t", quote = FALSE)
fwrite(cnvr_f_type, file = paste0(folder, "/cnvr.txt"), sep = "\t", quote = FALSE)
fwrite(cnvr_summary, file = paste0(folder, "/cnvr_summary.txt"), sep = "\t", quote = FALSE, col.names = TRUE)
fwrite(cnvr_chr_summary, file = paste0(folder, "/cnvr_chr_summary.txt"), sep = "\t", quote = FALSE, col.names = TRUE)
return(cnvr_f_type)
if(file.exists(paste0(folder,"/cnvr.txt")) & file.exists(paste0(folder, "/individual_cnv_cnvr.txt"))) {
cat(paste0("Task done, CNVR results were saved in the '", folder, "' directory.\n"))
} else {warning("No output file produced. Please check the format of your input file!!")}
}
else {
cnvr_union_f$length <- cnvr_union_f$End - cnvr_union_f$Start + 1
fwrite(cnvr_union_f, file = paste0("call_cnvr_", folder, "/roh.txt"), sep = "\t", quote = FALSE)
if(file.exists(paste0(folder, "/roh.txt"))) {
cat(paste0("Task done, ROH results saved in the '", folder, "' directory.\n"))
} else {warning("No output file produced. Please check the format of your input file!!")}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.