R/prepare.R

Defines functions summary_to_fitpoly get_R_theta clean_summary read_axiom find_header_line read_illumina_array qploidy_read_vcf

Documented in clean_summary find_header_line get_R_theta qploidy_read_vcf read_axiom read_illumina_array summary_to_fitpoly

globalVariables(c("ind", "zscore", "chr", "write.csv"))



#' Convert VCF File to Qploidy Data
#'
#' This function converts a VCF file into a format compatible with Qploidy 
#' analysis. It extracts genotype and allele depth information and formats 
#' it into a data frame.
#'
#' @param vcf_file Path to the VCF file.
#' @param geno Logical. If TRUE, the output columns will include MarkerName, 
#' SampleName, geno, and prob. If FALSE, the output will include MarkerName, 
#' SampleName, X, Y, R, and ratio.
#' @param geno.pos Logical. If TRUE, the output will include MarkerName, 
#' Chromosome, and Position columns.
#' @return A data frame containing the processed VCF data.
#' @export
#' @import vcfR
#' @importFrom tidyr pivot_longer
qploidy_read_vcf <- function(vcf_file, geno = FALSE, geno.pos = FALSE) {

  checks <- vcf_sanity_check(vcf_file)

  # Check if the required checks are TRUE
  required <- c("VCF_header", "VCF_columns", "GT", "allele_counts", 
                "samples", "chrom_info", "pos_info")
  if(!all(checks$checks[required])) {
    stop(paste(checks$message[1,required][which(!checks$checks[required])], 
               collapse = "\n"))
  }

  # Check if the required checks are FALSE
  required_not <- c("multiallelics", "phased_GT")
  if(any(checks$checks[required_not])) {
    stop(paste(checks$message[2,required_not][which(checks$checks[required_not])], 
               collapse = "\n"))
  }

  vcf <- read.vcfR(vcf_file,  verbose = FALSE)

  if(!(geno || geno.pos)){
    DP <- extract.gt(vcf, "AD")

    mknames <- pivot_longer(data.frame(mks = rownames(DP), DP), 
                            cols = 2:(ncol(DP)+1))

    dp_split <- strsplit(mknames$value, ",")

    # remove markers without two alleles
    dp_split_filt <- lapply(dp_split, function(x){
      if(length(x) != 2) return(c(NA,NA)) else return(x)
    })

    ref <- as.numeric(sapply(dp_split_filt, "[[", 1))
    alt <- as.numeric(sapply(dp_split_filt, "[[", 2))

    data_qploidy <- data.frame(MarkerName = mknames$mks,
                               SampleName = mknames$name,
                               X= ref,
                               Y= alt,
                               R = alt+ref,
                               ratio = alt/(ref+alt))
  } else if(geno){
    GT <- extract.gt(vcf, "GT")
    GT <- pivot_longer(data.frame(mks = rownames(GT), GT), 
                       cols = 2:(ncol(GT)+1))
    GT$value <- stringr::str_count(GT$value, "1")
    colnames(GT) <- c("MarkerName","SampleName","geno")

    prob <- extract.gt(vcf, "MPP")
    prob <- pivot_longer(data.frame(mks = rownames(prob), prob), 
                         cols = 2:(ncol(prob)+1))
    colnames(prob) <- c("MarkerName","SampleName","prob")

    data_qploidy <- merge.data.frame(GT, prob, by = c("MarkerName", 
                                                      "SampleName"))
    data_qploidy$prob <- as.numeric(data_qploidy$prob)


  } else if(geno.pos){
    data_qploidy <- data.frame("MarkerName" = vcf@fix[,3],
                               "Chromosome" = vcf@fix[,1],
                               "Position" = vcf@fix[,2])


  }

  return(data_qploidy)
}

#' Read Illumina Array Files
#'
#' This function reads Illumina array files and processes them into a format 
#' suitable for Qploidy analysis. It adds a suffix to sample IDs if multiple 
#' files are provided.
#'
#' @param ... One or more Illumina array filenames.
#' @return A data frame containing the processed Illumina array data.
#' @export
#' @importFrom tidyr pivot_longer
read_illumina_array <- function(...){
  files <- list(...)

  raw_all <- vector()
  for (i in seq_along(files)) {
    skipn <- find_header_line(files[[i]], word = "SNP Name")
    raw <- vroom(files[[i]], delim = "\t", skip = skipn-1)
    raw1 <- raw[,c("SNP Name","Sample ID","X","Y")]
    if(length(files) > 1) raw1$`Sample ID` <- paste0(raw1$`Sample ID`, "_",i)
    raw_all <- rbind(raw_all, raw1)
  }

  fitpoly_potato_input <- cbind(raw_all, R = raw_all$X + raw_all$Y, 
                                ratio = raw_all$Y/(raw_all$X + raw_all$Y))
  colnames(fitpoly_potato_input)[1:2] <- c("MarkerName", "SampleName")
  return(fitpoly_potato_input)
}


#' Find the Header Line in a File
#'
#' This function scans a file to locate the first line containing a specific 
#' keyword, such as 'probeset_id'. It is useful for identifying the starting 
#' point of data in files with headers or metadata.
#'
#' @param summary_file The path to the file to be scanned.
#' @param max_lines The maximum number of lines to scan. Default is 6000.
#' @param word The keyword to search for in the first column. Default is 
#' "probeset_id".
#' @return The line number where the keyword is found.
#' 
#' @export
find_header_line <- function(summary_file, word = "probeset_id", 
                             max_lines = 6000) {
  con <- file(summary_file, open = "r")
  on.exit(close(con))
  
  for (i in 1:max_lines) {
    line <- readLines(con, n = 1)
    if (length(line) == 0) break  # EOF
    if (startsWith(line, word)) return(i)
  }
  
  stop(substitute(word)," not found in the first column within the first ", 
       max_lines, " lines.")
}

#' Convert Axiom Array Summary File to Qploidy Input
#'
#' This function processes an Axiom array summary file and converts it into a 
#' format compatible with Qploidy and fitpoly analysis.
#'
#' @param summary_file Path to the Axiom summary file.
#' @param ind_names Optional. A file with two columns: Plate_name (sample IDs 
#' in the summary file) and Sample_Name (desired sample names).
#' @param atan Logical. If TRUE, calculates theta using atan2.
#' @return A data frame formatted for Qploidy analysis, containing the following columns:
#'   - `MarkerName`: Marker identifiers.
#'   - `SampleName`: Sample identifiers (if `ind_names` is provided, these will be updated accordingly).
#'   - `X`: Reference allele intensity (calculated if applicable).
#'   - `Y`: Alternative allele intensity (calculated if applicable).
#'   - `R`: Total signal intensity (calculated if applicable).
#'   - `ratio`: Allelic ratio (theta, calculated if applicable).
#'   - Additional columns may be included depending on the input data and processing steps.
#' @export
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr pivot_wider
read_axiom <- function(summary_file, ind_names = NULL, atan = FALSE){

  skip_line <- find_header_line(summary_file)
  raw <- vroom(summary_file, skip = as.numeric(skip_line) -1)

  # Guarantee that every A has a B
  summary_filt <- clean_summary(raw)

  if(!is.null(ind_names)){
    ind.names <- vroom(ind_names)

    new.names <- ind.names$Sample_Name[match(colnames(summary_filt$A_probes), 
                                             ind.names$Plate_Name)]
    colnames(summary_filt$A_probes)[which(!is.na(new.names))] <- 
      new.names[which(!is.na(new.names))]
    colnames(summary_filt$B_probes)[which(!is.na(new.names))] <- 
      new.names[which(!is.na(new.names))]
  }

  R_theta <- get_R_theta(cleaned_summary = summary_filt, atan)

  fitpoly_input <- summary_to_fitpoly(R_theta$R_all, R_theta$theta_all)

  return(fitpoly_input)
}


#' Clean Axiom Summary File
#'
#' This function removes consecutive A allele probes from an Axiom summary 
#' file.
#'
#' @param summary_df A data frame containing A and B probe intensities.
#' @return A list with cleaned A and B probes.
#' @examples NULL
#' @export
clean_summary <- function(summary_df){
  summaries_filt <- summary_df
  if(length(grep("Contig", summary_df$probeset_id)) > 0) 
    summaries_filt <- summary_df[-grep("Contig", summaries_filt$probeset_id),]
  if(length(grep("contig", summaries_filt$probeset_id)) > 0) 
    summaries_filt <- summaries_filt[-grep("contig", summaries_filt$probeset_id),]
  if(length(grep("comp", summaries_filt$probeset_id)) > 0) 
    summaries_filt <- summaries_filt[-grep("comp", summaries_filt$probeset_id),]

  A_probes <- summaries_filt[seq(1,dim(summaries_filt)[1], 2),]
  B_probes <- summaries_filt[seq(2,dim(summaries_filt)[1], 2),]
  return(list(A_probes=A_probes, B_probes = B_probes))
}

#' Get R and Theta Values from Summary File
#'
#' This function calculates R and theta values from a cleaned summary file. 
#' It optionally performs standard normalization by plate and markers.
#'
#' @param cleaned_summary A summary object from the clean_summary function.
#' @param atan Logical. If TRUE, calculates theta using atan2.
#' @return A list containing the following elements:
#'   - `R_all`: A data frame where each row corresponds to a marker, and columns represent total signal intensity (R) values for each sample.
#'   - `theta_all`: A data frame where each row corresponds to a marker, and columns represent allelic ratio (theta) values for each sample.
#'   - Both data frames include a `MarkerName` column as the first column, which contains marker identifiers.
#' @export
#' @importFrom dplyr mutate
get_R_theta <- function(cleaned_summary, atan = FALSE){
  R_all <- as.data.frame(cleaned_summary$A_probes[,-1] + 
                         cleaned_summary$B_probes[,-1])
  if(atan){
    theta_all <- as.data.frame((atan2(as.matrix(cleaned_summary$B_probes[,-1]), 
                                      as.matrix(cleaned_summary$A_probes[,-1])))/(pi/2))
  }  else {
    theta_all <- as.data.frame(cleaned_summary$B_probes[,-1]/(cleaned_summary$B_probes[,-1] + 
                                                              cleaned_summary$A_probes[,-1]))
  }

  probes_names <-  cleaned_summary$A_probes[,1]
  probes_names <- gsub("-A","", as.vector(probes_names$probeset_id))

  R_all <- cbind(MarkerName = probes_names, R_all)
  theta_all <- cbind(MarkerName = probes_names, theta_all)

  colnames(theta_all)[1] <- colnames(R_all)[1] <- "MarkerName"

  return(list(R_all=R_all, theta_all = theta_all))
}

#' Convert Summary Data to FitPoly-Compatible Format
#'
#' This function processes R (total signal intensity) and theta (allelic ratio) values
#' to generate a data frame compatible with the FitPoly tool. It calculates X and Y
#' values (reference and alternative allele intensities, respectively) and combines
#' them with R and theta into a long-format data frame.
#'
#' @param R_all A data frame containing total signal intensity (R) values. The first
#' column should be `MarkerName`, and subsequent columns should represent samples.
#' @param theta_all A data frame containing allelic ratio (theta) values. The first
#' column should be `MarkerName`, and subsequent columns should represent samples.
#'
#' @return A data frame in long format with the following columns:
#'   - `MarkerName`: Marker identifiers.
#'   - `SampleName`: Sample identifiers.
#'   - `X`: Reference allele intensity.
#'   - `Y`: Alternative allele intensity.
#'   - `R`: Total signal intensity.
#'   - `ratio`: Allelic ratio (theta).
#'
#' @details The function calculates X and Y values as follows:
#'   - `X = R * (1 - theta)`
#'   - `Y = R * theta`
#' The resulting data frame is in a long format, where each row corresponds to a
#' specific marker-sample combination.
#'
#' @importFrom tidyr pivot_longer
#' @export
summary_to_fitpoly <- function(R_all, theta_all){
  Y_all <- R_all[,-1]*theta_all[,-1]
  X_all <- R_all[,-1] - Y_all

  X_all <- cbind(MarkerName= R_all[,1], X_all)
  Y_all <- cbind(MarkerName = R_all[,1], Y_all)

  R <- pivot_longer(R_all, cols = 2:ncol(R_all), values_to = "R", 
                    names_to = "SampleName")
  theta <- pivot_longer(theta_all, cols = 2:ncol(theta_all), values_to = "ratio", 
                        names_to = "SampleName")
  X <- pivot_longer(X_all, cols = 2:ncol(X_all), values_to = "X", 
                    names_to = "SampleName")
  Y <- pivot_longer(Y_all, cols = 2:ncol(Y_all), values_to = "Y", 
                    names_to = "SampleName")

  fitpoly_input <- cbind(X, Y = Y[,3], R = R[,3], ratio = theta[,3])

  return(fitpoly_input)
}

Try the Qploidy package in your browser

Any scripts or data that you put into this service are public.

Qploidy documentation built on June 8, 2025, 10 a.m.