r format(Sys.Date(), "%d/%m/%Y")
knitr::opts_chunk$set(collapse = TRUE,comment = "#>") vcf="/Users/km28/Documents/Karim/Packages/Input_Data.vcf.gz" metadata="/Users/km28/Documents/Karim/Packages/SampleMetadata.txt" outDir="/Users/km28/Documents/Karim/Packages/Test"
``` {r eval=FALSE} library(devtools) devtools::install_github("FellouMada/rSNPdata", build_vignettes = TRUE) library(rSNPdata)
## get SNPdata object the functions in this package require a `SNPdata` object. This can be generated using the `get_snpdata` function with the following arguments: 1. the input VCF file (required) 2. the file with the samples metadata (required) 3. the path to the folder where the output files will be stored (optional) 4. the gene ontology annotation file (optional). If not provided, the default file that was downloaded from https://plasmodb.org/plasmo/app/downloads/Current_Release/Pfalciparum3D7/gaf/ will be used 5. the gene annotation file (optional). If not provided, the default file that was downloaded from https://plasmodb.org/plasmo/app/downloads/Current_Release/Pfalciparum3D7/gff/ will be used type `?get_snpdata` for more details ```r snpdata=get_snpdata(vcf.file = vcf, meta.file = metadata, output.dir = outDir, gaf = NULL, gff = NULL)
print(snpdata)
the compute_MAF
requires a SNPdata object, the name of the genotype table from which the MAF should be calculated (either "GT" for the raw genotypes, "Phased" for phased genotypes, "Phased_Imputed" for phased and imputed genotypes).
include.het determines whether to account for heterozygous sites in MAF calculation.
type ?compute_MAF
for more details
snpdata = compute_MAF(snpdata, include.het=FALSE, mat.name="GT") head(snpdata$details)
use the filter_snps_samples
function to remove the SNPs and samples that do not satisfy the user criteria.
snpdata = filter_snps_samples(snpdata, min.qual=10, max.missing.sites=0.2, max.missing.samples=0.2, maf.cutoff=0.01) print(snpdata) hist(snpdata$details$MAF, 100) grid()
the calculate_Fws
function requires a SNPdata object
type ?calculate_Fws
for more details
snpdata = calculate_Fws(snpdata) head(snpdata$meta)
use the phase_mixed_genotypes
function to phase the mixed genotypes.
snpdata = phase_mixed_genotypes(snpdata, nsim=100) head(snpdata$Phased[,1:5])
use the impute_missing_genotypes
function to impute the missing genotypes. the genotype
argument determines which genotype table should be used to impute the missing data ("GT" to impute from raw data, "Phased" to impute from the phased data).
snpdata = impute_missing_genotypes(snpdata, genotype="GT", nsim=100) head(snpdata$Imputed[,1:5])
to get the data from chromosome 7 ("Pf3D7_07_v3"), use the select_chrom
chrom7_snpdata = select_chrom(snpdata, chrom="Pf3D7_07_v3") print(chrom7_snpdata)
if you want to remove some loci from the data, use the drop_snps
function and provide it with a data frame with 2 columns, "Chrom" and "Pos", representing the genomic coordinates of the loci to be dropped or the chrom, start and end position of the loci to be dropped
snp.to.be.dropped = chrom7_snpdata$details %>% select(Chrom,Pos) idx = sample(1:nrow(snp.to.be.dropped),100, replace = FALSE) snp.to.be.dropped = snp.to.be.dropped[idx,] reduced_snpdata = drop_snps(chrom7_snpdata, snp.to.be.dropped) print(reduced_snpdata) reduced_snpdata = drop_snps(chrom7_snpdata, chrom="Pf3D7_10_v3", start=100, end=500) print(reduced_snpdata)
if you want to remove some samples from the data, use the drop_samples
function and provide it with a vector of samples to be dropped.
idx = sample(1:nrow(reduced_snpdata$meta),10, replace = FALSE) samples.to.be.dropped = reduced_snpdata$meta$sample[idx] reduced_snpdata = drop_samples(reduced_snpdata, samples.to.be.dropped) print(reduced_snpdata)
In this package, Weir & Cockerham's Fst is calculated using the vcflib
tools.
The from option specifies the name of the column in the metadata table with the groups between which Fst should be calculated.
the calculate_wcFst
returns a SNPdata
object with an additionnal field named Fst
. This is a list that contains data frames representing each the Fst results between a given pair-wise population.
snpdata=get_snpdata(vcf.file = vcf, meta.file = metadata, output.dir = outDir) snpdata = compute_MAF(snpdata, include.het=FALSE, mat.name="GT") snpdata = filter_snps_samples(snpdata, min.qual=10, max.missing.sites=0.2, max.missing.samples=0.2, maf.cutoff=0.01) snpdata = calculate_Fws(snpdata) snpdata = calculate_wcFst(snpdata, from="Country", groups=c("Senegal","Gambia")) names(snpdata$Fst) head(snpdata$Fst$Senegal_vs_Gambia)
The identity by state (IBS) represents the proportion of similar loci between 2 pairs of isolates. The calculate_IBS
function returns the dissimilarity matrix i.e. (1-IBS). It returns a SNPdata
object with an additionnal field named IBS
.
snpdata = calculate_IBS(snpdata, mat.name="GT")
The linkage between pairs of SNPs is calculated using the vcftools
program.
## calculate LD between pairs of SNPs across chromosome 4 and 5 snpdata = calculate_LD(snpdata, min.r2=0.2, inter.chrom=FALSE, chroms=c("Pf3D7_04_v3","Pf3D7_05_v3")) ## calculate LD between pairs of SNPs across all chromosomes snpdata = calculate_LD(snpdata, min.r2=0.2, inter.chrom=FALSE, chroms=NULL)
groups = c("DongoroBa","Chogen") idx = which(snpdata$meta$Location %in% groups) samples.to.be.dropped = snpdata$meta$sample[-idx] reduced_snpdata = drop_samples(snpdata, samples.to.be.dropped) print(reduced_snpdata) res.iR = calculate_iR(reduced_snpdata, mat.name="Phased", family="Location", number.cores=4) dim(res.iR$iR$Chogen_vs_DongoroBa)
relatedness between pairs of isolates from a given pairs of population. The relatedness is calculated using the GT
table. The value of the from
option is the name of the column in the metadata data frame that represents the sample's population of origin. To remove regions of the genome under selection, use the sweepRegions
option. You can specify a vector of populations to compute the relatedness for as a value for the groups option.
## calculate relatedness between pairs of isolates from Chogen and DongoroBa snpdata = impute_missing_genotypes(snpdata, genotype="GT", nsim=10) snpdata = calculate_relatedness(snpdata, mat.name="Imputed", from="Location", sweepRegions=NULL, groups=c("Chogen","DongoroBa")) relatedness = snpdata$relatedness$df relatedness.matrix = snpdata$relatedness$matrix ## calculate relatedness between pairs of isolates from all locations snpdata = calculate_relatedness(snpdata, mat.name="GT", family="Location", sweepRegions=NULL, groups=NULL) ## calculate relatedness between pairs of isolates from all locations after excluding the selective sweep regions Chrom = c("Pf3D7_04_v3","Pf3D7_05_v3","Pf3D7_07_v3") Start = c(723088, 932890, 378222) End = c(774914, 987149, 431317) selection.region = data.frame(cbind(Chrom, Start, End), stringsAsFactors = FALSE) snpdata = calculate_relatedness(snpdata, mat.name="GT", family="Location", sweepRegions=selection.region, groups=NULL)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.