date: r format(Sys.Date(), "%d/%m/%Y")

knitr::opts_chunk$set(collapse = TRUE,comment = "#>")
vcf="/Users/km28/Documents/Karim/Packages/Input_Data.vcf.gz"
metadata="/Users/km28/Documents/Karim/Packages/SampleMetadata.txt"
outDir="/Users/km28/Documents/Karim/Packages/Test"

Install

``` {r eval=FALSE} library(devtools) devtools::install_github("FellouMada/rSNPdata", build_vignettes = TRUE) library(rSNPdata)

## get SNPdata object      
the functions in this package require a `SNPdata` object. This can be generated using the `get_snpdata` function with the following arguments:    
1. the input VCF file (required)    
2. the file with the samples metadata (required)    
3. the path to the folder where the output files will be stored (optional)  
4. the gene ontology annotation file (optional). If not provided, the default file that was downloaded from https://plasmodb.org/plasmo/app/downloads/Current_Release/Pfalciparum3D7/gaf/ will be used    
5. the gene annotation file (optional). If not provided, the default file that was downloaded from https://plasmodb.org/plasmo/app/downloads/Current_Release/Pfalciparum3D7/gff/ will be used
type `?get_snpdata` for more details      
```r
snpdata=get_snpdata(vcf.file = vcf, meta.file = metadata, output.dir = outDir, gaf = NULL, gff = NULL)

print the SNPdata object

print(snpdata)

calculate the SNPs minor allele frequency

the compute_MAF requires a SNPdata object, the name of the genotype table from which the MAF should be calculated (either "GT" for the raw genotypes, "Phased" for phased genotypes, "Phased_Imputed" for phased and imputed genotypes).
include.het determines whether to account for heterozygous sites in MAF calculation.
type ?compute_MAF for more details

snpdata = compute_MAF(snpdata, include.het=FALSE, mat.name="GT")
head(snpdata$details)

filter the SNPdata object

use the filter_snps_samples function to remove the SNPs and samples that do not satisfy the user criteria.

snpdata = filter_snps_samples(snpdata, min.qual=10, max.missing.sites=0.2, max.missing.samples=0.2, maf.cutoff=0.01)
print(snpdata)
hist(snpdata$details$MAF, 100)
grid()

calculate the within-host genetics diversity index (Fws)

the calculate_Fws function requires a SNPdata object
type ?calculate_Fws for more details

snpdata =  calculate_Fws(snpdata)
head(snpdata$meta)

mixed genotypes phasing

use the phase_mixed_genotypes function to phase the mixed genotypes.

snpdata = phase_mixed_genotypes(snpdata, nsim=100)
head(snpdata$Phased[,1:5])

impute the missing data

use the impute_missing_genotypes function to impute the missing genotypes. the genotype argument determines which genotype table should be used to impute the missing data ("GT" to impute from raw data, "Phased" to impute from the phased data).

snpdata = impute_missing_genotypes(snpdata, genotype="GT", nsim=100)
head(snpdata$Imputed[,1:5])

select data for a given chromosome

to get the data from chromosome 7 ("Pf3D7_07_v3"), use the select_chrom

chrom7_snpdata = select_chrom(snpdata, chrom="Pf3D7_07_v3")
print(chrom7_snpdata)

drop a set of SNPs from the data

if you want to remove some loci from the data, use the drop_snps function and provide it with a data frame with 2 columns, "Chrom" and "Pos", representing the genomic coordinates of the loci to be dropped or the chrom, start and end position of the loci to be dropped

snp.to.be.dropped = chrom7_snpdata$details %>% select(Chrom,Pos)
idx = sample(1:nrow(snp.to.be.dropped),100, replace = FALSE)
snp.to.be.dropped = snp.to.be.dropped[idx,]
reduced_snpdata = drop_snps(chrom7_snpdata, snp.to.be.dropped)
print(reduced_snpdata)
reduced_snpdata = drop_snps(chrom7_snpdata, chrom="Pf3D7_10_v3", start=100, end=500)
print(reduced_snpdata)

drop a set of samples from the data

if you want to remove some samples from the data, use the drop_samples function and provide it with a vector of samples to be dropped.

idx = sample(1:nrow(reduced_snpdata$meta),10, replace = FALSE)
samples.to.be.dropped = reduced_snpdata$meta$sample[idx]
reduced_snpdata = drop_samples(reduced_snpdata, samples.to.be.dropped)
print(reduced_snpdata)

Fst

In this package, Weir & Cockerham's Fst is calculated using the vcflib tools.
The from option specifies the name of the column in the metadata table with the groups between which Fst should be calculated. the calculate_wcFst returns a SNPdata object with an additionnal field named Fst. This is a list that contains data frames representing each the Fst results between a given pair-wise population.

snpdata=get_snpdata(vcf.file = vcf, meta.file = metadata, output.dir = outDir)
snpdata = compute_MAF(snpdata, include.het=FALSE, mat.name="GT")
snpdata = filter_snps_samples(snpdata, min.qual=10, max.missing.sites=0.2, max.missing.samples=0.2, maf.cutoff=0.01)
snpdata =  calculate_Fws(snpdata)
snpdata = calculate_wcFst(snpdata, from="Country", groups=c("Senegal","Gambia"))
names(snpdata$Fst)
head(snpdata$Fst$Senegal_vs_Gambia)

IBS

The identity by state (IBS) represents the proportion of similar loci between 2 pairs of isolates. The calculate_IBS function returns the dissimilarity matrix i.e. (1-IBS). It returns a SNPdata object with an additionnal field named IBS.

snpdata = calculate_IBS(snpdata, mat.name="GT")

LD

The linkage between pairs of SNPs is calculated using the vcftools program.

## calculate LD between pairs of SNPs across chromosome 4 and 5
snpdata = calculate_LD(snpdata, min.r2=0.2, inter.chrom=FALSE, chroms=c("Pf3D7_04_v3","Pf3D7_05_v3"))

## calculate LD between pairs of SNPs across all chromosomes
snpdata = calculate_LD(snpdata, min.r2=0.2, inter.chrom=FALSE, chroms=NULL)

iR

groups = c("DongoroBa","Chogen")
idx = which(snpdata$meta$Location %in% groups)
samples.to.be.dropped = snpdata$meta$sample[-idx]
reduced_snpdata = drop_samples(snpdata, samples.to.be.dropped)
print(reduced_snpdata)
res.iR = calculate_iR(reduced_snpdata, mat.name="Phased", family="Location", number.cores=4)
dim(res.iR$iR$Chogen_vs_DongoroBa)

relatedness

relatedness between pairs of isolates from a given pairs of population. The relatedness is calculated using the GT table. The value of the from option is the name of the column in the metadata data frame that represents the sample's population of origin. To remove regions of the genome under selection, use the sweepRegions option. You can specify a vector of populations to compute the relatedness for as a value for the groups option.

## calculate relatedness between pairs of isolates from Chogen and DongoroBa

snpdata = impute_missing_genotypes(snpdata, genotype="GT", nsim=10)
snpdata = calculate_relatedness(snpdata, mat.name="Imputed", from="Location", sweepRegions=NULL, groups=c("Chogen","DongoroBa"))
relatedness = snpdata$relatedness$df
relatedness.matrix = snpdata$relatedness$matrix

## calculate relatedness between pairs of isolates from all locations
snpdata = calculate_relatedness(snpdata, mat.name="GT", family="Location", sweepRegions=NULL, groups=NULL)

## calculate relatedness between pairs of isolates from all locations after excluding the selective sweep regions
Chrom = c("Pf3D7_04_v3","Pf3D7_05_v3","Pf3D7_07_v3")
Start = c(723088, 932890, 378222)
End = c(774914, 987149, 431317)
selection.region = data.frame(cbind(Chrom, Start, End), stringsAsFactors = FALSE)
snpdata = calculate_relatedness(snpdata, mat.name="GT", family="Location", sweepRegions=selection.region, groups=NULL)


FellouMada/rSNPdata documentation built on June 12, 2022, 7:29 p.m.