R/popgen.R
In PopGenKit: Useful functions for (batch) file conversion and data resampling in microsatellite datasets

popgen <-
function(datafile,ndigit=3, freq.overall=T, freq.by.pop=T, genetic.stats=T, pairwise.fst=T)
{
input=read.table(file=datafile, sep='\t', colClasses='character')
noext= gsub("[.].*$","",datafile)

nloci=ncol(input)-2

#transformer pour trouver npops et popsizes
whereNbSamples=grep('NbSamples',input[,1])
npops=gsub( "[^0123456789]", "", input[whereNbSamples,1])
npops=as.numeric(npops)

whereSampleSize =grep('SampleSize',input[,1])
popsizes=as.numeric(c(gsub('[^0123456789]', "", input[whereSampleSize,1])))

#reconnaître les noms des pops
whereSampleName =grep('SampleName',input[,1])
popnames=(c(gsub('SampleName=', "", input[whereSampleName,1])))

#créer une matrice nind x nloci seulement
matinput=input[,3:ncol(input)]
xsums=rep(NA,times=nrow(matinput))
for (i in 1:nrow(matinput)){ xsums[i]=sum(nchar(matinput[i,])) }
emptyrows=which(xsums==0)
matinput=matinput[-emptyrows,]

#déterminer le nombre dallèles/locus pour formatter output
kvec=vector(mode='numeric',nloci)
for (i in 1:nloci)
{
alleles=matinput[,i]
vec=unique(alleles)
vec=paste(vec,collapse='')
vec=gsub( "[^[:alnum:]]", "", vec)
k=nchar(vec)/ndigit
kvec[i]=k
}

MAX=max(kvec)

#créer le tableau de résultats
results=matrix(NA,2*nloci,MAX)

missing=rep(NA, times=nloci)
nbk=rep(NA, times=nloci)
n.alleles=rep(NA, times=nloci)
for (j in 1:nloci)
{ 
alleles=matinput[,j]
totaln=length(alleles)
vec=unique(alleles)
vec=paste(vec,collapse='')
vec=gsub( "[^[:alnum:]]", "", vec)
k=nchar(vec)/ndigit

sampsize=paste(alleles,collapse='')
sampsize=gsub( "[^[:alnum:]]", "", sampsize)
sampsize=(nchar(sampsize)/ndigit)

missingABS=length(grep('[?]',alleles))
missing[j]=round((100*(missingABS/totaln)),2)

nbk[j]=k

n.alleles[j]=sampsize/2

for (m in 1:k)
{
alleleID=substr(vec,(m*ndigit)-(ndigit-1),m*ndigit)
results[(2*j)-1,m]=alleleID
count=0
for (z in 1:length(alleles))
{
if (alleles[z]==alleleID) count=count+1
}
results[2*j,m]=round(count/sampsize,3)
}
}

#trier les allèles en ordre croissant dans le output
for (j in 1:nloci)
{
ordre=order(results[(2*j)-1,])
results[(2*j)-1,]=results[(2*j)-1,ordre]
results[(2*j),]=results[(2*j),ordre]
}

#ajouter une colonne au début avec le no de locus et le % de données manquantes
loc.col=NULL
missing.col=NULL
k.col=NULL
n.alleles.col=NULL

for (i in 1:nloci) {
loc.col=c(loc.col,i,NA) 
missing.col=c(missing.col,missing[i],NA) 
k.col=c(k.col,nbk[i],NA)
n.alleles.col=c(n.alleles.col,n.alleles[i],NA)
}

table.results=cbind(loc.col,n.alleles.col,missing.col, k.col, results)


#mettre les cellules NA vides pour lesthétisme !
for (r in 1:nrow(table.results))
{
for (c in 1:ncol(table.results))
{
if (is.na(table.results[r,c])==T) table.results[r,c]=''
}
} 

col.name=rep('',times=ncol(table.results))
col.name[1]= 'Locus#'
col.name[2]= 'n'
col.name[3]= 'Miss.%'
col.name[4]= 'k'
col.name[5]= 'Allele frequencies'
colnames(table.results)=col.name

filename=paste(noext,'_Overall_freq.txt',sep='')

if (freq.overall==T)  {
write.table(table.results, file=filename, quote=F, row.names=F, col.names=T, sep='\t') }


#ici les analyses par population commencent

allpopresults=NULL
matpopresults=NULL
matpopcounts=NULL
popsizes2=2*popsizes

for (v in 1:npops)
{
popmissing=rep(NA,times=nloci)
popnbk=rep(NA,times=nloci)
pop.n.alleles=rep(NA,times=nloci)

popresults=matrix(NA,2*nloci,MAX)
popcounts=matrix(NA,2*nloci,MAX)

popno=v
if (v==1) first=1
if (v>1) first=sum(popsizes2[1:(popno-1)])+1
last=sum(popsizes2[1:popno])
popdata=matinput[first:last,]

for (j in 1:nloci)
{ 
alleles=popdata[,j]
vecalleles=results[(2*j)-1,]
k=nbk[j]

sampsize=paste(alleles,collapse='')
sampsize=gsub( "[^0123456789]", "", sampsize)
sampsize=(nchar(sampsize)/ndigit)

missingABS=length(grep('[?]',alleles))
popmissing[j]=round((100*(missingABS/popsizes2[v])),2)

pop.n.alleles[j]=sampsize/2

for (m in 1:k)
{
alleleID=vecalleles[m]
count=0
for (z in 1:length(alleles))
{
if (alleles[z]==alleleID) count=count+1
}
popresults[(2*j),m]= round(count/sampsize,3)
popresults[(2*j)-1,]=vecalleles

popcounts[(2*j),m]= count
popcounts[(2*j)-1,]=vecalleles

if (popresults[2*j,m]=='NaN') popresults[2*j,m]=0 
}

popnbk[j]= length(which(popresults[2*j,]>0))
}
pop.missing.col=NULL
pop.k.col=NULL
pop.n.alleles.col=NULL

for (i in 1:nloci) {
pop.missing.col=c(pop.missing.col,popmissing[i],NA) 
pop.k.col=c(pop.k.col,popnbk[i],NA)
pop.n.alleles.col=c(pop.n.alleles.col,pop.n.alleles[i],NA)
}

table.popresults=cbind(loc.col,pop.n.alleles.col,pop.missing.col, pop.k.col, popresults)

blank.row=rep(NA,times=ncol(table.popresults))
blank.row[1]=popnames[v]
allpopresults=rbind(allpopresults,blank.row,table.popresults)

matpopresults=rbind(matpopresults,popresults)

matpopcounts=rbind(matpopcounts,popcounts)
}

NAallpopresults=allpopresults
#mettre les cellules NA vides pour lesthétisme !
for (r in 1:nrow(allpopresults))
{
for (c in 1:ncol(allpopresults))
{
if (is.na(allpopresults[r,c])==T) allpopresults[r,c]=''
}
} 

colnames(allpopresults)=col.name

filename2=paste(noext,'_Pops_freq.txt',sep='')
if (freq.by.pop==T) {
write.table(allpopresults, file=filename2, quote=F, row.names=F, col.names=T, sep='\t')
}

#ici toutes les variables à mettre dans le tableau (certaines requises pour alleles prives)

allpopn=NAallpopresults[,2]
allpopn=allpopn[!is.na(allpopn)]
allpopk=NAallpopresults[,4]
allpopk=allpopk[!is.na(allpopk)]
first.col=NAallpopresults[,1]
first.col=first.col[!is.na(first.col)]



#ici pour calculer Ho
if (genetic.stats==T) {
allpopHo=NULL
for (v in 1:npops)
{
popHo=rep(NA,times=nloci)

popno=v
if (v==1) first=1
if (v>1) first=sum(popsizes2[1:(popno-1)])+1
last=sum(popsizes2[1:popno])
popdata=matinput[first:last,]

for (j in 1:nloci)
{ 
if (allpopk[((v-1)*nloci)+j]>0) {
alleles=popdata[,j]
vec.missing=(which(alleles[]=='?'))
if (sum(vec.missing)==0) alleles2=alleles else alleles2=alleles[-vec.missing]
true.n=length(alleles2)/2
count=0
for (n in 1:(true.n))
{
second.allele= alleles2[(2*n)]
if (alleles2[(2*n)-1]!=second.allele) count=count+1 
}

popHo[j]= round(count/true.n, 3)
}}

allpopHo=cbind(allpopHo,popHo)
}


#ici pour calculer He

allpopHe=NULL

for (v in 1:npops)
{
popHe=rep(NA,times=nloci)

popno=v
first=(((2*popno)-2)*nloci)+1
last=2*popno*nloci
freqdata=matpopresults[first:last,]

for (j in 1:nloci)
{ 
if (allpopk[((v-1)*nloci)+j]>0) {
popHe[j]=round(1-sum(as.numeric(freqdata[2*j,])^2, na.rm=T),3)
}}

allpopHe=cbind(allpopHe,popHe)
}


#ici pour calculer les alleles privés

allpopprivate=NULL
num.allpopk=as.numeric(allpopk)

for (v in 1:npops)
{
count=0
popno=v
otherpops=c(1:npops)
otherpops=otherpops[-v]
private.pop=rep(NA,times=nloci)
for (j in 1:nloci)
{ 
if (allpopk[((v-1)*nloci)+j]>0) {
vecalleleID=matpopresults[((2*j)-1),]
vecalleleID=vecalleleID[!is.na(vecalleleID)]
nbk=length(vecalleleID)

private=0
vec=rep(NA,times=length(otherpops))
for (i in 1:length(otherpops))
{
vec[i]=(2*(otherpops[i]-1)*nloci+(2*j))
}
freqvec=matpopresults[vec,]

for (k in 1:nbk)
{
alleleID=vecalleleID[k]
freqk=freqvec[,k]
freqk=as.numeric(freqk)
specific=as.numeric(matpopresults[(2*(v-1)*nloci+(2*j)),k])
if (specific>0) {
if (sum(freqk)==0) private=private+1 }
}
private.pop[j]=private
}
}
allpopprivate=cbind(allpopprivate,private.pop)
}


#ici pour calculer la richesse allélique
#avant de commencer les calculs, on doit savoir le minimum sample size pour chaque locus

minima=matrix(NA, nrow=npops, ncol=nloci)
for (v in 1:npops)
{
popno=v
if (v==1) first=1
if (v>1) first=sum(popsizes2[1:(popno-1)])+1
last=sum(popsizes2[1:popno])
popdata=matinput[first:last,]

for (j in 1:nloci)
{ 
if (allpopk[((v-1)*nloci)+j]>0) {
alleles=popdata[,j]
vec.missing=(which(alleles[]=='?'))
#if (sum(vec.missing)==popsizes[v]) minima[v,j]=NA else 
if (sum(vec.missing)==0) alleles2=alleles else alleles2=alleles[-vec.missing]
minima[v,j]=length(alleles2)/2 
}}
}

lowest=rep(NA,times=nloci)
for (i  in 1 : nloci)
{ lowest[i]=min(minima[,i], na.rm=T)  }

#calculer pour vrai

pop.richness=NULL

for (v in 1:npops)
{
popno=v
if (v==1) first=1
if (v>1) first=sum(popsizes2[1:(popno-1)])+1
last=sum(popsizes2[1:popno])
popdata=matinput[first:last,]
loc.richness=rep(NA, times=nloci)

for (j in 1:nloci)
{ 
if (allpopk[((v-1)*nloci)+j]>0) {
alleles=popdata[,j]
vec.missing=(which(alleles[]=='?'))
if (sum(vec.missing)==0) alleles2=alleles else alleles2=alleles[-vec.missing]
mat.alleles=matrix(alleles2, ncol=2, byrow=T)
sim=rep(NA,1000)
for (n in 1:1000)
{
 ech=sample(1:nrow(mat.alleles),lowest[j],replace=F)
resampled=mat.alleles[ech,]
sim[n]= length(unique(as.vector(resampled)))
}
loc.richness[j]=round(mean(sim), 2)
}}
pop.richness=cbind(pop.richness,loc.richness)

}




#créer le tableau de résultats
table.Ho=as.matrix(as.vector(allpopHo), ncol=1)
table.He=as.matrix(as.vector(allpopHe), ncol=1)
table.private=as.matrix(as.vector(allpopprivate), ncol=1)
table.richness=as.matrix(as.vector(pop.richness), ncol=1)



table.stats=cbind(allpopn,allpopk, table.richness,table.private, table.Ho,table.He)

for (p in 2:npops)
{ table.stats=rbind(table.stats[1:(((p-1)*nloci)+p-2),], NA, table.stats[(((p-1)*nloci)+p-1):nrow(table.stats),]) }

table.stats=rbind(NA,table.stats)
table.stats=cbind(first.col,table.stats)


col.name=rep('',times=ncol(table.stats))
col.name[1]= 'Locus#'
col.name[2]= 'n'
col.name[3]= 'k'
col.name[4]= 'Ar'
col.name[5]= 'priv'
col.name[6]= 'Ho'
col.name[7]= 'He'
colnames(table.stats)=col.name

filename3=paste(noext,'_Stats.txt',sep='')

#mettre les cellules NA vides pour lesthétisme !
for (r in 1:nrow(table.stats))
{
for (c in 1:ncol(table.stats))
{
if (is.na(table.stats[r,c])==T) table.stats[r,c]=''
}
} 

write.table(table.stats, file=filename3, quote=F, row.names=F, col.names=T, sep='\t')

}

#calculs de Fst

if (pairwise.fst==T) {

v1=c(1:(npops-1))
v2=c(2:npops)

pair=as.matrix(expand.grid(v1,v2,stringsAsFactors=T))

if (npops>2) {
remove=NULL
for (i in 1:nrow(pair))
{
un=pair[i,1]
deux=pair[i,2]
if (un>=deux) remove=c(remove,i)
}
pair2=pair[-remove,]
} else { pair2=pair   }

nbcomp=nrow(pair2)

all.loci.fst=NULL
all.sums=NULL
for (j in 1:nloci)
{
table.fst=matrix(NA,npops,npops)
table.sums=matrix(NA, npops, npops)
for (t in 1:nbcomp)
{
pop1=pair2[t,1]
pop2=pair2[t,2]

first1=(((2*pop1)-2)*nloci)+1
last1=2*pop1*nloci
pop1freqs=matpopresults[first1:last1,]
pop1counts=matpopcounts[first1:last1,]

first2=(((2*pop2)-2)*nloci)+1
last2=2*pop2*nloci
pop2freqs=matpopresults[first2:last2,]
pop2counts=matpopcounts[first2:last2,]

locus1freq=as.numeric(pop1freqs[j*2,])
locus2freq=as.numeric(pop2freqs[j*2,])

locus1counts=as.numeric(pop1counts[j*2,])
locus2counts=as.numeric(pop2counts[j*2,])

sum1=sum(locus1counts, na.rm=T)
sum2=sum(locus2counts, na.rm=T)

counts.all=locus1counts+locus2counts
sum.all=sum(counts.all, na.rm=T)
freq.all=counts.all/sum.all

He1=1-sum(locus1freq^2, na.rm=T)
He2=1-sum(locus2freq^2, na.rm=T)

Hs=((He1*sum1)+(He2*sum2))/sum.all
Ht=1-sum(freq.all^2, na.rm=T)
Fst=round(1-(Hs/Ht), 4)

table.fst[pop1,pop2]=Fst
table.sums[pop1,pop2]=sum.all

}

#ajouter les zeros sur la diagonale
for (i in 1:npops){
for (j in 1:npops) {
if (i==j) table.fst[i,j]=0 }}

all.loci.fst=rbind(all.loci.fst, table.fst)
all.sums=rbind(all.sums,table.sums)
}

#calculer multilocus Fst (moyenne pondérée)
multilocus.fst=matrix(NA, npops, npops)
for (t in 1:nbcomp)
{
pop1=pair2[t,1]
pop2=pair2[t,2]

weighted.freq=0
overall.sum=0
exclude.loci=0
for (j in 1:nloci) {
if (is.na(all.loci.fst[((npops*(j-1))+pop1),pop2])== T) 
{exclude.loci=exclude.loci+1}
else
{this.weighted.freq=all.loci.fst[((npops*(j-1))+pop1),pop2]*all.sums[((npops*(j-1))+pop1),pop2]
weighted.freq=weighted.freq+this.weighted.freq
overall.sum=overall.sum+all.sums[((npops*(j-1))+pop1),pop2] }}

multilocus.fst[pop1,pop2]=round(weighted.freq/overall.sum, 4)
if (exclude.loci>0) multilocus.fst[pop1,pop2]=paste(multilocus.fst[pop1,pop2], '!!', exclude.loci, sep='')

}

#ajouter les zeros sur la diagonale
for (i in 1:npops){
for (j in 1:npops) {
if (i==j) multilocus.fst[i,j]=0 }}



filename4=paste(noext,'_singlelocusFst.txt',sep='')

filename5=paste(noext,'_multilocusFst.txt',sep='')


for (t in 1:nbcomp)
{
pop1=pair2[t,1]
pop2=pair2[t,2]
for (j in 1:nloci) {
if (all.loci.fst[((npops*(j-1))+pop1),pop2]== 'NaN') all.loci.fst[((npops*(j-1))+pop1),pop2]= 'NoFst'
if (is.na(all.loci.fst[((npops*(j-1))+pop1),pop2])== T) all.loci.fst[((npops*(j-1))+pop1),pop2]= 'NoFst'
}}

#ajouter le nom des loci dans all.loci.fst
locus.name=NULL
for (j in 1:nloci) {
locus.name=c(locus.name,j,rep(NA, times=npops-1)) }

all.loci.fst=cbind(locus.name,all.loci.fst)

#mettre les cellules NA vides pour lesthétisme !
for (r in 1:nrow(all.loci.fst))
{for (c in 1:ncol(all.loci.fst)){
if (is.na(all.loci.fst[r,c])==T) all.loci.fst[r,c]='' }
} 

for (r in 1:nrow(multilocus.fst))
{for (c in 1:ncol(multilocus.fst)){
if (is.na(multilocus.fst[r,c])==T) multilocus.fst[r,c]='' }
} 

col.name=rep('',times=ncol(all.loci.fst))
col.name[1]= 'Locus#'
colnames(all.loci.fst)=col.name


write.table(all.loci.fst, file=filename4, quote=F, row.names=F, col.names=T, sep='\t')
write.table(multilocus.fst, file=filename5, quote=F, row.names=F, col.names=F, sep='\t')

}


return(list('matinput'=matinput,  'popsizes'=popsizes,  'matpopresults'=matpopresults, 'NAallpopresults'=NAallpopresults, 'matpopcounts'=matpopcounts, 'allpopk'=allpopk, 'all.loci.fst'=all.loci.fst, 'all.sums'=all.sums))
}
Any scripts or data that you put into this service are public.
PopGenKit documentation built on May 2, 2019, 3:33 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PopGenKit
Useful functions for (batch) file conversion and data resampling in microsatellite datasets

R/popgen.R
In PopGenKit: Useful functions for (batch) file conversion and data resampling in microsatellite datasets

Try the PopGenKit package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PopGenKit Useful functions for (batch) file conversion and data resampling in microsatellite datasets

R/popgen.R In PopGenKit: Useful functions for (batch) file conversion and data resampling in microsatellite datasets

Try the PopGenKit package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

PopGenKit
Useful functions for (batch) file conversion and data resampling in microsatellite datasets

R/popgen.R
In PopGenKit: Useful functions for (batch) file conversion and data resampling in microsatellite datasets