Nothing
#' Read in a file of allele frequencies
#'
#' Reads in a file of alleles in a particular format.
#'
#' This function reads frequencies in the rectangular allele freqency table
#' format used by FSI Genetics and other journals. This file format assumes a
#' comma separated value file (CSV) (although the column delimeter can be
#' specified). The first column should be labelled 'Allele' and contain the STR
#' allele designations that are used in the data set. The remaining columns
#' will have the locus name as a header, and frequencies that are either blank,
#' zero, or non-zero. Blanks or zeros are used to specify that the allele is
#' not observed (and not used) at the locus. The final row of the file should
#' start with 'N' or 'n' in the first column and give the number of individuals
#' typed (or the number of alleles recorded) in assessing the frequency of the
#' alleles.
#'
#' The second format is a very particular 'Curran' text format. The first line
#' contains the number of loci in the multiplex. The next line will contain the
#' name of the first locus and the number of alleles, nA, the locus separated
#' by a comma. The next nA lines contain the allele number (from 1 to nA), the
#' STR designation of the allele, and the frequency separated by commas. This
#' pattern is repeated for each locus. In the future this function will read
#' the rectangular allele freqency table used by FSI Genetics and other
#' journals.
#'
#' @param strPath The file from which to read the frequencies
#' @param FSIGenFormat Tells the function whether the file is either in FSI
#' Genetics format (see below) or 'Curran' format
#' @param delim This argument is used when \code{FSIGenFormat} is \code{TRUE},
#' and is the regular expression used to delimit columns of the table. it is
#' set to a single comma by default, and multiple delimiters are considered
#' empty separate fields. There probably should be an additional argument which
#' specifies the missing or empty cell symbol, but I won't programme this
#' unless somebody asks for it
#' @return a list containing two vectors and a list, loci, counts, and freqs.
#' The vector loci is a vector of the locus names in the frequency file. The
#' vector counts is a vector of the number of individuals (or sometimes
#' alleles) typed at each locus. This will null if the 'Curran' format is used.
#' The list freqs, is a list of vectors with each vector containing the
#' frequencies of the alleles at the locus. The names of the elements of the
#' vectors are the STR allele designations.
#' @author James M. Curran
#' @export readFreqs
readFreqs = function(strPath, FSIGenFormat = TRUE, delim = ','){
Freqs = list(loci = NULL, freqs = NULL, counts = NULL)
if(FSIGenFormat){
## expect the information to start with Alleles, and end with n
f1 = file(strPath, open = 'r', encoding = 'UTF-8-BOM')
if(isOpen(f1)){
Lines = readLines(f1)
close(f1)
stripBlankLines = function(Lines){
allDelim = paste0("^[", delim, "]*$")
Lines = Lines[!grepl(allDelim, Lines)]
}
Lines = stripBlankLines(Lines)
alleleStart = -1
## look for a line starting with alleles
if(!any(grepl('^[^,]*[Aa]llele[^,]*,', Lines))){
stop("The header line should start with 'Allele'")
}else{
alleleStart = grep('^[Aa]llele', Lines)
if(length(alleleStart) > 1){
stop("There are two or more lines starting with Allele")
}
}
## look for line starting with n
countStart = -1
if(!any(grepl('^([Nn]|total)', Lines))){
stop("The frequency file should have locus allele counts")
}else{
countStart = grep('^([Nn]|total)', Lines)
if(length(countStart) > 1){
warning("There should only one locus allele count line. Using first ")
countStart = countStart[1]
}
}
if(countStart == -1 || alleleStart == -1){
stop("Header and footer lines not found")
}else{
n = length(Lines)
Lines = Lines[alleleStart:countStart]
if((n - length(Lines)) > 0){
cat(paste("Dropped", n - length(Lines), "lines\n"))
}
locusLine = Lines[1]
Tokens = unlist(strsplit(locusLine, delim))
Loci = toupper(Tokens[-1])
nLoci = length(Loci)
cat(paste("Detected", nLoci, "loci\n"))
countLine = Lines[length(Lines)]
Tokens = unlist(strsplit(countLine, delim))
counts = as.numeric(Tokens[-1])
Lines = Lines[-c(1,length(Lines))]
freqs = vector(length = nLoci, mode = "list")
names(freqs) = Loci
for(line in Lines){
Tokens = unlist(strsplit(line, delim))
a = Tokens[1]
fx = as.numeric(Tokens[-1])
for(loc in 1:nLoci){
if(!is.na(fx[loc]) && fx[loc] > 0){
f = fx[loc]
if(f > 1){
stop("Error: frequency greater than 1")
}
freqs[[loc]] = c(freqs[[loc]], f)
names(freqs[[loc]])[length(freqs[[loc]])] = a
}
}
}
cat(paste("Successfully processed", length(Lines), "lines\n"))
Freqs$freqs = freqs
Freqs$loci = Loci
Freqs$counts = counts
}
}else{
stop(paste("Couldn't open", strPath, "for reading"))
}
}else{ ## Curran format
f1 = file(strPath, open = 'r', encoding = 'UTF-8-BOM')
if(isOpen(f1)){
Lines = readLines(f1)
nLines = length(Lines)
## cat(paste("Read", nLines, "lines from", strPath,"\n"))
close(f1)
}else{
stop(paste("Couldn't open :", strPath, "for reading"))
}
Freqs = NULL
nLoci = as.numeric(Lines[1])
## cat(paste(nLoci, "\n"))
Loci = rep("", nLoci)
freqs = vector(nLoci, mode = "list")
currLine = 2
for(nLoc in 1:nLoci){
Tokens = unlist(strsplit(Lines[currLine], ","))
currLine = currLine + 1
Loci[nLoc] = Tokens[1]
nAlleles = as.numeric(Tokens[2])
Alleles = rep("", nAlleles)
freqs[[nLoc]] = rep(0, nAlleles)
for(nA in 1:nAlleles){
Tokens = unlist(strsplit(Lines[currLine], ","))
currLine = currLine + 1
Alleles[nA] = Tokens[2]
freqs[[nLoc]][nA] = as.numeric(Tokens[3])
}
names(freqs[[nLoc]]) = Alleles
}
names(freqs) = Loci
Freqs$loci = Loci
Freqs$freqs = freqs
}
return(Freqs)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.