R/RcppExports.R

Defines functions read_vcf_cpp read_vcf_multisamps_cpp merge_gq_svsites_cpp merge_ac_svsites_cpp aldist

Documented in aldist merge_ac_svsites_cpp merge_gq_svsites_cpp read_vcf_cpp read_vcf_multisamps_cpp

# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#' Fast computation, using edlib, of the edit distance for each pair of sequences
#' @title Align sequences and compute edit distance
#' @param seq1 a vector holding the first sequence of each pair to align
#' @param seq2 a vector holding the second sequence of each pair to align
#' @return a vector with edit distances
#' @author Jean Monlong
#' @keywords internal
aldist <- function(seq1, seq2) {
    # Both vectors are handed, unchanged, to the compiled `_sveval_aldist` routine.
    .Call(
        `_sveval_aldist`,
        seq1,
        seq2
    )
}

#' Allele counts summarized at the SV site level. The variant ids used here are
#' the ones created by read_vcf_multisamps_cpp
#' @title Count alleles in SV sites across samples
#' @param filename the path to the VCF file (unzipped or gzipped).
#' @param use_gz is the VCF file gzipped?
#' @param sv_sites a list where each element is one sv site, defined by its variant IDs.
#' @return matrix with allele counts for each sv site (rows) and sample (columns)
#' @author Jean Monlong
#' @keywords internal
merge_ac_svsites_cpp <- function(filename, use_gz, sv_sites) {
    # Arguments are forwarded as-is to the compiled `_sveval_merge_ac_svsites_cpp` routine.
    .Call(
        `_sveval_merge_ac_svsites_cpp`,
        filename,
        use_gz,
        sv_sites
    )
}

#' Genotype quality extracted at the SV site level. The variant ids used here are
#' the ones created by read_vcf_multisamps_cpp
#' @title Genotype quality in SV sites across samples
#' @param filename the path to the VCF file (unzipped or gzipped).
#' @param use_gz is the VCF file gzipped?
#' @param sv_sites a list where each element is one sv site, defined by its variant IDs.
#' @return matrix with genotype quality for each sv site (rows) and sample (columns)
#' @author Jean Monlong
#' @keywords internal
merge_gq_svsites_cpp <- function(filename, use_gz, sv_sites) {
    # Arguments are forwarded as-is to the compiled `_sveval_merge_gq_svsites_cpp` routine.
    .Call(
        `_sveval_merge_gq_svsites_cpp`,
        filename,
        use_gz,
        sv_sites
    )
}

#' The INFO field of each VCF record is used in priority. Information missing
#' from INFO is guessed from the REF/ALT sequences.
#' When ALT defines multiple alleles, they are split and the allele count is
#' extracted from the GT field.
#'
#' Alleles are split and, for each, the allele count is computed across samples.
#' @title Read VCF using CPP reader
#' @param filename the path to the VCF file (unzipped or gzipped).
#' @param use_gz is the VCF file gzipped?
#' @param min_sv_size minimum variant size to keep in bp. Variants shorter than this
#' will be skipped. Default is 10.
#' @param shorten_ref should the REF sequence be shortened to the first 10 bp. Default is TRUE
#' @param shorten_alt should the ALT sequence be shortened to the first 10 bp. Default is TRUE
#' @param check_inv guess if a variant is an inversion by aligning REF with the
#' reverse complement of ALT. If >80\% similar (and REF and ALT>10bp), variant is classified as INV.
#' Default is FALSE
#' @return data.frame with variant and genotype information
#' @author Jean Monlong
#' @keywords internal
read_vcf_multisamps_cpp <- function(filename,
                                    use_gz,
                                    min_sv_size = 10L,
                                    shorten_ref = TRUE,
                                    shorten_alt = TRUE,
                                    check_inv = FALSE) {
    # Arguments are forwarded, in signature order, to the compiled
    # `_sveval_read_vcf_multisamps_cpp` routine.
    .Call(
        `_sveval_read_vcf_multisamps_cpp`,
        filename,
        use_gz,
        min_sv_size,
        shorten_ref,
        shorten_alt,
        check_inv
    )
}

#' The INFO field of each VCF record is used in priority. Information missing
#' from INFO is guessed from the REF/ALT sequences.
#' When ALT defines multiple alleles, they are split and the allele count is
#' extracted from the GT field.
#'
#' Alleles are split and, for each, column 'ac' reports the allele count. Notable cases include
#' 'ac=-1' for no/missing calls (e.g. './.'), and 'ac=0' on the first allele to report hom ref
#' variants. These cases are often filtered later with 'ac>0' to keep only non-ref calls. If
#' the VCF contains no samples or if no sample selection is forced (sample_name='*'), 'ac' will
#' contain '-1' for all variants in the VCF.
#' @title Read VCF using CPP reader
#' @param filename the path to the VCF file (unzipped or gzipped).
#' @param use_gz is the VCF file gzipped?
#' @param sample_name which sample to process. If not found, uses first sample in VCF file.
#' If "*", force no sample selection
#' @param min_sv_size minimum variant size to keep in bp. Variants shorter than this
#' will be skipped. Default is 10.
#' @param shorten_ref should the REF sequence be shortened to the first 10 bp. Default is TRUE
#' @param shorten_alt should the ALT sequence be shortened to the first 10 bp. Default is TRUE
#' @param gq_field which field from FORMAT should be used as genotype quality. Default is "GQ".
#' If not found, QUAL will be used
#' @param check_inv guess if a variant is an inversion by aligning REF with the
#' reverse complement of ALT. If >80\% similar (and REF and ALT>10bp), variant is classified as INV.
#' Default is FALSE
#' @param keep_nocalls should we keep variants/alleles with missing genotypes (e.g. "./.").
#' Default is FALSE
#' @param other_fields name of another field from INFO to extract. Default is an empty
#' character vector.
#' @return data.frame with variant and genotype information
#' @author Jean Monlong
#' @keywords internal
read_vcf_cpp <- function(filename,
                         use_gz,
                         sample_name = "",
                         min_sv_size = 10L,
                         shorten_ref = TRUE,
                         shorten_alt = TRUE,
                         gq_field = "GQ",
                         check_inv = FALSE,
                         keep_nocalls = FALSE,
                         other_fields = as.character(c())) {
    # Arguments are forwarded, in signature order, to the compiled
    # `_sveval_read_vcf_cpp` routine.
    .Call(
        `_sveval_read_vcf_cpp`,
        filename,
        use_gz,
        sample_name,
        min_sv_size,
        shorten_ref,
        shorten_alt,
        gq_field,
        check_inv,
        keep_nocalls,
        other_fields
    )
}
jmonlong/sveval documentation built on July 31, 2023, 7:50 p.m.