trim <- function (x) gsub("^\\s+|\\s+$", "", x)
#' Reading data from Genepop.
#'
#' Import Genepop format file and convert to format compatible with GenePlot.
#'
#' File should have locus names at the top of the file, either as single-line
#' list with commas, or as one name per line. Locus names are assumed to stop at
#' the line before the first instance of POP.
#'
#' Population names: By default, Genepop format does not include population
#' names. If the individuals in a pop within the file do not have unique IDs
#' then by default the whole population will be given the ID of the first
#' individual as the population name and then the individuals in that population
#' will be given auto-generated unique IDs. Otherwise, if the individuals in the
#' population do have unique IDs then the populations will be named Pop1, Pop2
#' etc. according to the order in which they appear in the file.
#' If there is a mixture in the file, then pops with unique ID individuals will
#' be named Popx where x is their position in the file, and pops with non-unique
#' ID individuals will be given the ID of their first individual as their popname.
#' Use \code{pop_names} to define the pop names at the point of reading in the file.
#'
#' @param file_path String definining path of the file to read, which can have any
#' extension.
#'
#' @param header (default TRUE) Boolean, indicate whether there is an additional
#' descriptive line at the top of the Genepop-format file (TRUE) or whether
#' the first line is the start of the locus names (FALSE).
#'
#' @param diploid (default TRUE) Boolean, indicates whether data is diploid (TRUE)
#' or haploid (FALSE).
#'
#' @param digits_per_allele (default 2) Indicates whether data uses 2 or 3 digits
#' per allele.
#'
#' @param pop_names (default NULL) Character vector (optional). Define the names
#' of the populations, in the order that they appear in the file.
#'
#' @returns A list containing the following components:
#' #' \describe{
#' \item{\code{locnames}}{Character vector of the locus names.}
#' \item{\code{pop_data}}{The data, in a data frame, with two columns labelled as 'id' and
#' 'pop', and with two additional columns per locus, labelled in the format
#' Loc1.a1, Loc1.a2, Loc2.a1, Loc2.a2, etc.}
#' }
#'
#' @importFrom utils flush.console read.table write.table
#'
#' @export
read_genepop_format <- function(file_path,header=TRUE,diploid=TRUE,digits_per_allele=2,
pop_names=NULL)
{
if (!(digits_per_allele %in% 2:3)) stop("digits_per_allele must be 2 or 3.")
if (header) loc_start_line <- 2
else loc_start_line <- 1
loc_end_line <- NA
con <- file(file_path)
open(con)
pop_start_lines <- vector()
current_line <- 1
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0)
{
if (trimws(tolower(line)) == 'pop') {
if (is.na(loc_end_line)) loc_end_line <- current_line-1
pop_start_lines <- c(pop_start_lines,current_line+1)
}
current_line <- current_line + 1
}
close(con)
### Read loci names
if (loc_end_line == loc_start_line) {
## If loci are a single comma-separated line, read in accordingly, but
## can't replace failed reading with artificial locus names because don't
## know how many loci there are if there was an error reading them in
raw_locnames <- read.table(file_path,header=FALSE,sep=",",skip=loc_start_line-1,nrows=1,colClasses="character")
nloci <- ncol(raw_locnames)
} else {
nloci <- loc_end_line-loc_start_line+1
raw_locnames <- tryCatch({
read.table(file_path,header=FALSE,sep="",skip=loc_start_line-1,nrows=nloci,colClasses="character")
}, error = function(err) { return(paste0("loc-",1:nloci)) })
}
locnames <- trimws(unlist(raw_locnames))
names(locnames) <- NULL
### Read allelic data pop by pop
npops <- length(pop_start_lines)
pop_end_lines <- c(pop_start_lines[2:npops]-2,current_line-1)
raw_data_list <- lapply(1:npops,function(pop){
dat <- tryCatch({
raw_tab <- read.table(file_path, header=FALSE, sep="", colClasses="character",
skip = pop_start_lines[pop]-1, nrows = pop_end_lines[pop]-pop_start_lines[pop]+1)
## Remove the comma at the end of each identifier that is required in
## the Genepop format but not used by GenePlot -- but first check if
## the comma is in a separate column or within the text in column 1
## (Easypop generates Genepop files where there is a space between the
## population index and the comma in each line, and so when read in
## the comma appears as a separate column)
if (raw_tab[1,2] == ",") raw_tab <- raw_tab[,-2]
else raw_tab[,1] <- sapply(raw_tab[,1],function(txt) unlist(strsplit(txt,",")))
## When adding ids or pop_names, note that the Genepop data is expected
## to turn up with pop as the FIRST column and id as the SECOND, so
## the next stage of this file will swap the columns around. So at
## this stage, add ids as column AFTER pop_names or add pop_names as
## the FIRST column.
## If IDs are non-unique,make unique ones and use the first ID as the popname
if (length(unique(raw_tab[,1])) < nrow(raw_tab))
{
pop_name_raw <- raw_tab[1,1]
## Convert any purely numeric popnames into "Pop#" to avoid errors in GenePlot
## Put the attempt to convert name to numeric inside a tryCatch
## because we are fine with it producing a warning about NAs
## introduced by coercion so don't want the user to see the warning
pop_name <- tryCatch({
suppressWarnings({
if (!is.na(as.numeric(pop_name_raw))) pop_name <- paste0("Pop",pop_name_raw)
else pop_name <- pop_name_raw
})
}, error = function(e) {
print(paste('error:', e))
})
raw_tab <- cbind(rep(pop_name,times=nrow(raw_tab)),paste0(pop_name,"-",1:nrow(raw_tab)),raw_tab[,2:ncol(raw_tab)])
}
else ## If IDs are unique, make up popname
{
pop_name <- paste0("Pop",pop)
raw_tab <- cbind(rep(pop_name,nrow(raw_tab)),raw_tab)
}
## Make the first two column names "pop" and "id" so that if some pops
## had all unique IDs and other pops didn't, the first two columns still
## end up being called "pop" and "id" instead of the code call used
## to generate them
names(raw_tab)[1] <- "pop"
names(raw_tab)[2] <- "id"
## Now make sure to return the raw_tab object
raw_tab
}, error = function(err) { return(NULL) })
})
raw_data <- do.call(rbind,raw_data_list)
### Tidy up allelic data and convert into GenePlot format
pop_data <- data.frame(id=raw_data[,2],pop=raw_data[,1]) ## Note switch order of id,pop
if (diploid){
for (locIdx in 1:nloci)
{
if (locIdx+2 <= ncol(raw_data))
{
## Extract the data for each of the alleles, converting it into numeric format instead of factors
raw_alleles <- raw_data[,locIdx+2]
# if (any(nchar(raw_alleles) < 2*digits_per_allele)) browser()
allele1 <- as.numeric(sapply(raw_alleles,substr,start=1,stop=digits_per_allele))
allele2 <- as.numeric(sapply(raw_alleles,substr,start=digits_per_allele+1,stop=2*digits_per_allele))
pop_data <- cbind(pop_data,allele1,allele2)
}
}
names(pop_data) <- c("id","pop", unlist(lapply(locnames, function(loc) c(paste0(loc,".a1"),paste0(loc,".a2")))))
}
else
{
pop_data <- cbind(pop_data,as.numeric(raw_data[3:ncol(raw_data)]))
names(pop_data) <- c("id","pop", paste0(locnames,".a1"))
}
## Convert the pop column into characters to avoid errors in GenePlot
pop_data$id <- as.character(pop_data$id)
pop_data$pop <- as.character(pop_data$pop)
## If the user has entered pop_names, use those
if (!is.null(pop_names)) {
dat_pop_names <- unique(pop_data$pop)
if (length(pop_names) != length(dat_pop_names)) stop("The length of pop_names does not match the number of pops in the data. Please check and try again.")
for (pop in 1:length(dat_pop_names)) {
pop_data$pop[which(pop_data$pop == dat_pop_names[pop])] <- pop_names[pop]
}
}
list(pop_data=pop_data,locnames=locnames)
}
write_genepop_format <- function(dat, outfile, locnames_all, delete_loc=NULL, min_loci_per_indiv=1){
## genepop.dataformat.func 9/5/12
## The output from this function is ready for sending to any Genepop-format
## application.
## The input data (dat) has missing code 0. It uses 2 digits per allele, and
## works only for diploid data.
##
## To remove specific loci from the analysis, it's easiest to keep
## locnames_all as the full list of loci, and specify which loci should be
## removed using delete_loc : e.g. delete_loc = c(1, 2) will remove the
## first two of the loci named in locnames_all.
## The primary purpose of argument locnames_all is to allow different loci
## sets for different species, e.g. might want to use locnames_all =
## locnames_rat or locnames_all = locnames_stoat.
##
## If delete_loc=NULL, no loci are removed.
##
## For Genepop format, the alleles are numbered as follows, within each locus.
## 00 = missing code
## 01, 02, 03, ... alleles sampled at the locus.
## Information about the allele label is lost, e.g. an individual with
## genotype 96, 102 might become 0103 under this scheme, meaning that allele
## 96 is given label 01, and allele 102 is given label 03.
## The order of labels 01, 02, etc follows the order dictated by the R
## function table().
##
## This function assumes a maximum of 99 alleles at any single locus, which
## will be labelled 00 (missing code), 01, 02, ..., 99. If there are more
## than 99 alleles at a locus, the code needs to be updated to 3-digit
## format by adding extra leading zeros: 001, 002, ..., 099.
##
##
## Minimum Loci:
##
## This function only includes individuals with at least min_loci_per_indiv
## typed loci - e.g. a reasonable minimum might be 8. This aims to exclude
## altogether individuals with low-quality DNA samples.
## For example, if an individual has poor-quality DNA and is typed as a
## homozygote at one of its "successful" loci, it might actually be a
## heterozygote at that loci but the second allele failed to amplify.
##
## NOTE that min.loci applies to the FULL data from locnames_all: so if an
## individual has 8 present loci out of 10, it counts as 8 for purposes of
## exclusion, even if only (say) 5 of these coincide with the actual loci of
## interest after delete_loc have been deleted. The rationale is that this
## is an individual with reasonable data quality, evidenced by 8 out of 10
## successful loci.
##
## min_loci_per_indiv defaults to 1, to include all individuals with any
## data present.
##
## EXAMPLE:
## The following command will prompt to accept the outfile name
## "genepop.gbi.dat.txt", and write results into it:
## genepop.dataformat.func(gbi.dat, min_loci_per_indiv=8)
##--------------------------------------------------------------------------------------------
## Create default name for outfile if not supplied:
if(missing(outfile)){
outfile <- paste("genepop", substitute(dat), "txt", sep=".")
cat("No name supplied for outfile: use", outfile, "?\n")
cat("Enter to agree, or Ctrl-C to cancel.\n")
readline()
}
##--------------------------------------------------------------------------------------------
## Exclude poor-quality individuals with < min_loci_per_indiv loci from the
## FULL data in locnames_all:
if(min_loci_per_indiv>1){
## Find number of non-missing loci (allele anything except 0) for each
## individual.
## If there is missing data at a locus, the loc_a1 entry and loc_a2
## entry will both be 0.
## Use only the loc_a1 entry.
nloc_dat <- dat[, paste0(locnames_all, ".a1")]
nloc <- apply(nloc_dat, 1, function(x) length(x[x!=0]))
cat("WARNING! Excluding all individuals with <", min_loci_per_indiv,
" loci out of ", length(locnames_all), " loci available.\n")
## Remove individuals from dat if they have too few loci:
dat <- dat[nloc >= min_loci_per_indiv,]
}
##--------------------------------------------------------------------------------------------
## Create the names of the desired loci:
if(is.null(delete_loc)) locnames_for_use <- locnames_all
else locnames_for_use <- locnames_all[-delete_loc]
##--------------------------------------------------------------------------------------------
## Isolate the column numbers for the required columns:
colnames_a1 <- paste0(locnames_for_use, ".a1")
colnames_a2 <- paste0(locnames_for_use, ".a2")
colnames_locus <- sort(c(colnames_a1, colnames_a2))
dat_out <- dat[,c("id", "pop", colnames_locus)]
##--------------------------------------------------------------------------------------------
## Next order the data frame by population, so that each population
## occupies a block of rows in the data frame:
dat_out <- dat_out[order(dat_out$pop), ]
##--------------------------------------------------------------------------------------------
## For each locus in turn, replace the allele labels with 00 (missing), 01, 02, ...10, 11, ...
for(loc in locnames_for_use){
col_a1 <- paste0(loc, ".a1")
col_a2 <- paste0(loc, ".a2")
loc_a1 <- dat_out[, col_a1]
loc_a2 <- dat_out[, col_a2]
## Identify the labels of all alleles in the data for this locus:
locboth <- c(loc_a1, loc_a2)
loctab <- table(locboth)
loctab_names <- names(loctab)
## Number of visible alleles (non-null):
nvisible_alleles <- length(loctab[loctab_names!="0"])
new_names <- as.character(1:nvisible_alleles)
## Add leading zeros:
new_names[1:9] <- paste("0", new_names[1:9], sep="")
## Add the missing code if there are any missing data at this locus.
## Missing (0) becomes 00.
if(any(loctab_names=="0")) new_names <- c("00", new_names)
## Replace the old allele names in dat_out with the new ones:
for(al in 1:length(loctab_names)){
dat_out[,col_a1][as.character(loc_a1)==loctab_names[al]] <- new_names[al]
dat_out[,col_a2][as.character(loc_a2)==loctab_names[al]] <- new_names[al]
}
}
## The results are now of the form 00, 01, 02, ... where the order of the new label
## is the same as the numeric order of the old label.
## Now add a comma after the population name for each individual, and paste
## together the columns for each locus.
## dat_new looks something like this:
## id loc1 loc2 loc3 ...
## Awa, 1010 0611 0303 ...
## Awa, 0505 0711 0102 ... etc.
# dat_new <- data.frame(id=paste0(dat_out$pop, ","))
dat_new <- data.frame(id=paste0(dat_out$id,","))
for(loc in locnames_for_use){
col_a1 <- paste(loc, ".a1", sep="")
col_a2 <- paste(loc, ".a2", sep="")
dat_new[,loc] <- paste0(dat_out[,col_a1], dat_out[,col_a2])
}
## Split into populations:
dat_split <- split(dat_new, dat_out$pop)
##--------------------------------------------------------------------------------------------
## Now write the results into outfile in the required Genepop format:
cat(outfile, file=outfile, "\n", append=F)
for(ln in paste(sort(locnames_for_use))) cat(ln, "\n", file=outfile, append=T)
write_output <- function(pop){
cat("POP\n", file=outfile, append=T)
write.table(pop, outfile, row.names=F, col.names=F, quote=F, sep=" ", append=T)
}
lapply(dat_split, write_output)
cat("\nFile", outfile, "has been created in Genepop format.\n")
cat("Genepop On The Web is at http://genepop.curtin.edu.au/\n")
invisible()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.