R/call_cnvr.R

Defines functions call_cnvr

Documented in call_cnvr

#' Generate CNV Regions (CNVR)
#'
#' Generate CNVRs, summarize individual CNVR type, report frequency and type of CNVR
#'
#' @param clean_cnv a clean CNV file that was generated by the clean_cnv function
#' @param roh ROH file from the clean_cnv function; only CNVPartition results will generate ROH results
#' @param chr_set set the maximum number of chromosomes in the CNV list
#' @param folder set the name of the output folder.
#' @import dplyr
#' @importFrom data.table fread fwrite setkey foverlaps setDT
#' @importFrom reshape2 dcast
#'
#' @return Four or five tables: CNVR list, brief summary, and individual CNV and CNVR ID.
#' @export call_cnvr
#'
call_cnvr <- function(clean_cnv, roh = NULL, chr_set = 29, folder = "cnvr") {
  if(!file.exists(folder)){
    dir.create(folder)
    cat(paste0("Output folder '", folder, "' was created in the working directory.\n"))
  }

   if(typeof(clean_cnv) == "character"){
     clean_cnv <- data.table::fread(file = clean_cnv, sep = "\t", header = TRUE)
   } else {
     clean_cnv = clean_cnv
   }

  check_colnames <- c("Sample_ID", "Chr", "Start", "End", "CNV_Value", "Length")
  if(!all(check_colnames %in% names(clean_cnv))){
    cat("Lack of necessary information. The input file shoud have at least five columns as follows:")
    print(check_colnames)
    stop("Standard input file should be generated by the 'clean_cnv' function!")
  }

  merge_cnvr <- function(cnv) {
    if (nrow(cnv) == 1) {
      return(cnv)
    }

    cnv <- cnv[order(cnv$Chr, cnv$Start),]
    cnvr_union = cnv[1, ]

    for (i in 2:nrow(cnv)) {
      rest_cnv <- cnv[i, ]

      if (cnvr_union$End[nrow(cnvr_union)] < rest_cnv$Start) {
        cnvr_union <- bind_rows(cnvr_union, rest_cnv)
      } else if (cnvr_union$End[nrow(cnvr_union)] == rest_cnv$Start) {
        cnvr_union$End[nrow(cnvr_union)] <- rest_cnv$End
      }
      if (rest_cnv$End > cnvr_union$End[nrow(cnvr_union)]) {
        cnvr_union$End[nrow(cnvr_union)] <- rest_cnv$End
      }
    }
    return(cnvr_union)
  }

  cnvr <- data.frame()

  for (i in 1:chr_set){
    cnv_chr <- clean_cnv[which(clean_cnv$Chr == i), ]
    if(nrow(cnv_chr) >= 1){
      cnvr_chr <- merge_cnvr(cnv = cnv_chr)
      cnvr <- rbind(cnvr, cnvr_chr)
      cat(paste0("Chromosome ", i, " has been processed.\n"))
    } else {
      cat(paste0("No CNVs detected on Chromosome", i, ".\n"))
    }
  }

  #extract CNVR information, recode for CNVR
  cnvr_union <- cnvr[, c("Chr", "Start", "End")]
  cnvr_union$CNVR_ID <- paste0("CNVR_", seq(1, nrow(cnvr_union), 1))
  cnvr_union <- setDT(cnvr_union)
  clean_cnv <- setDT(clean_cnv)
  data.table::setkey(cnvr_union, Chr, Start, End)
  cnv_cnvr <- data.table::foverlaps(clean_cnv, cnvr_union)
  names(cnv_cnvr)[names(cnv_cnvr) == "i.Start"] <- "CNV_Start"
  names(cnv_cnvr)[names(cnv_cnvr) == "i.End"] <- "CNV_End"

  #add frequency of CNVR
  cnvr_frequency <- cnv_cnvr %>%
                   group_by(CNVR_ID) %>%
                   summarize(Frequency = n(),
                             n_Sample = length(unique(Sample_ID)))

  cnvr_union_f <- merge(cnvr_union, cnvr_frequency, by = "CNVR_ID", sort = F)
  fwrite(cnvr_union_f, file = paste0(folder, "/no_freq_cnvr.txt"), sep = "\t", quote = FALSE)

  if (is.null(roh)) {
    #add type of CNVR
    cnvr_type <- cnv_cnvr %>% group_by(CNVR_ID) %>% count(CNV_Value, name = "Count")
    cnvr_type2 <- reshape2::dcast(cnvr_type, CNVR_ID ~ CNV_Value, value.var = "Count")
    cnvr_type2$Type <- NA

    #check cnv value completeness

    cnv_value = c("0", "1", "3", "4")
    if (!all(cnv_value %in% colnames(cnvr_type2))){

      if (!(length(cnvr_type2$`0`) > 0)){
        cnvr_type2$`0` <- NA
        cat("No 0-copy type detected in CNV list.\n")
        }

      if (!(length(cnvr_type2$`1`) > 0)){
        cnvr_type2$`1` <- NA
        cat("No 1-copy type detected in CNV list.\n")
        }

      if (!(length(cnvr_type2$`3`) > 0)){
        cnvr_type2$`3` <- NA
        cat("No 3-copy type detected in CNV list.\n")
        }

      if (!(length(cnvr_type2$`4`) > 0)){
        cnvr_type2$`4` <- NA
        cat("No 4-copy type detected in CNV list.\n")
        }
      }

    for (i in 1:nrow(cnvr_type2)) {
      if (is.na(cnvr_type2[i, c("0")]) & is.na(cnvr_type2[i, c("1")])){
        cnvr_type2$Type[i] <- "Gain"
      }

      else if (is.na(cnvr_type2[i, c("3")]) & is.na(cnvr_type2[i, c("4")])) {
        cnvr_type2$Type[i] <- "Loss"
      }

      else{cnvr_type2$Type[i] <- "Mixed"}
    }

    cnvr_f_type <- merge(cnvr_union_f, cnvr_type2, sort = F)
    cnvr_f_type$Length <- cnvr_f_type$End - cnvr_f_type$Start + 1

    cat(paste0(nrow(cnvr_f_type), " CNVRs generated in total.\n"))

    #The overall summary
    cnvr_summary <- cnvr_f_type %>%
      group_by(Type) %>%
      summarise(N = n(), "Average Length" = round(mean(Length), digits = 0), "Min Length" = min(Length), "Max Length" = max(Length), "Total Length" = sum(Length, na.rm = T))

    #Partial summary
    cnvr_chr_summary <- cnvr_f_type %>%
      group_by(Chr) %>%
      summarise("Total Length" = sum(Length, na.rm = T), "Number of CNVR" = n())

    cat("Overall summary of CNVRs:\n")
    print(cnvr_summary)
    cat("Summary of CNVRs on each Chromosome:\n")
    print(cnvr_chr_summary)

    fwrite(cnv_cnvr, file = paste0(folder, "/individual_cnv_cnvr.txt"), sep = "\t", quote = FALSE)
    fwrite(cnvr_f_type, file = paste0(folder, "/cnvr.txt"), sep = "\t", quote = FALSE)
    fwrite(cnvr_summary, file = paste0(folder, "/cnvr_summary.txt"), sep = "\t", quote = FALSE, col.names = TRUE)
    fwrite(cnvr_chr_summary, file = paste0(folder, "/cnvr_chr_summary.txt"), sep = "\t", quote = FALSE, col.names = TRUE)

    return(cnvr_f_type)

    if(file.exists(paste0(folder,"/cnvr.txt")) & file.exists(paste0(folder, "/individual_cnv_cnvr.txt"))) {
      cat(paste0("Task done, CNVR results were saved in the '", folder, "' directory.\n"))
    } else {warning("No output file produced. Please check the format of your input file!!")}
  }

  else {
    cnvr_union_f$length <- cnvr_union_f$End - cnvr_union_f$Start + 1
    fwrite(cnvr_union_f, file = paste0("call_cnvr_", folder, "/roh.txt"), sep = "\t", quote = FALSE)

    if(file.exists(paste0(folder, "/roh.txt"))) {
      cat(paste0("Task done, ROH results saved in the '", folder, "' directory.\n"))
    } else {warning("No output file produced. Please check the format of your input file!!")}
  }
}
JH-Zhou/HandyCNV documentation built on Dec. 18, 2021, 12:25 a.m.