#' Read Illumina array raw data
#'
#' \code{read_finalreport} handles inputs of data in FinalReport-like format
#'
#' This function handles the input, pre-processing and temporary storage (as RDS
#' files) of the the markers-level raw data for each sample starting from
#' FinalReport-like files (any plain text file with columns header can be read).
#' Input must be one file per sample.
#'
#' @param DT_path, character, the path to the directory with the raw data (i.e.
#' the splitted final reports)
#' @param rds_path, path to the directory where RDS should be stored.
#' @param pref,suff, eventual prefix an suffix (e.g. ".txt"). If not necessary
#' must be set to \code{NA}.
#' @param sample_list, a \code{data.table}, the output of
#' \code{\link{read_metadt}}.
#' @param markers, a \code{data.table}, the output of
#' \code{\link{read_NGS_intervals}} or \code{\link{read_finalreport_snps}},
#' depending on the initial data type.
#' @param chr_col, name of the column containing the chromosome information in
#' the input file. Default is \code{"Chr"}.
#' @param pos_col, name of the column containing the SNPs position information
#' in the input file. Default is \code{"Position"}.
#' @param LRR_col, name of the column containing the LRR information in the
#' input file. Default is \code{"Log R Ratio"}.
#' @param BAF_col, name of the column containing the BAF information in the
#' input file. Default is \code{"B Allele Freq"}.
#'
#' @return nothing, this function saves the results on disk.
#'
#' @export
#'
#' @examples
#'
#' read_finalreport_raw(DT_path = system.file("extdata", package = "CNVgears"),
#' pref = "Final_report_example", suff = ".txt",
#' rds_path = file.path("tmp_RDS"),
#' markers = markers_examples,
#' sample_list = cohort_examples[sample_ID == "NA12878", ])
read_finalreport_raw <- function(DT_path, rds_path, pref, suff,
sample_list, markers, chr_col = "Chr",
pos_col = "Position", LRR_col = "Log R Ratio",
BAF_col = "B Allele Freq") {
# check inputs
if (!dir.exists(DT_path)) stop("File do not exist, typo(s)?\n")
# create the directory for the RDS if don't exist
dir.create(rds_path, showWarnings = FALSE)
for (i in 1:nrow(sample_list)) {
cat("Reading data for sample #:", i ,"\n")
DT_path_samp <- file.path(DT_path, paste0(pref[!is.na(pref)], sample_list$sample_ID[i],
suff[!is.na(suff)]))
# read file, excluding eventual header using fread()
columns <- c(chr_col, pos_col, LRR_col, BAF_col)
DT <- fread(DT_path_samp, skip = chr_col, select = columns)
colnames(DT) <- c("chr", "start", "log2R", "BAF")
# this is (MOSTLY) copy-pasted from read_NGS_raw.R, could be a function
sex <- sample_list$sex[i]
# standardize chr
DT <- chr_uniform(DT)
if (sex == 2) DT <- DT[chr %in% as.character(1:23)]
# standardize
DT[, `:=` (start = as.integer(start), end = as.integer(start), log2R = as.numeric(log2R))]
# col order
DT <- DT[, .(chr, start, end, log2R, BAF)]
# sort
setorder(DT, chr, start)
# compute and add these columns: P_ID (match from markers), P_CN, seg_ID, copyratio ..?
for (cc in unique(DT$chr)) {
DT[chr == cc, P_ID := markers[chr == cc, ][match(DT[chr == cc, start],
markers[chr == cc, start]), P_ID]]
}
DT[, `:=` (P_CN = round((2^log2R)*2), copyratio = 2^log2R)]
# save in RDS one chromosome at time in the specified path
if (sex == 1) chrs <- as.character(1:24)
else chrs <- as.character(1:23)
cat("Writing data for sample #:", i ,"\n")
for (cc in chrs) {
rds_file <- file.path(rds_path, paste0(sample_list$sample_ID[i], "_chr", cc, ".rds"))
# cat("INFO: saving", rds_file, "\n")
saveRDS(DT[chr == cc, ], file = rds_file)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.