loci2migrate <- function(locipath, poppath, filename) {
# read the loci file and the pop file
loci <- scan(locipath, sep = '\n', what = 'char', quiet = TRUE)
popdat <- read.table(poppath, sep = '\t')
# initialize
j <- 0
j_end <- ind <- seq <- locus <- pop <- NULL
# loop thru each line
for (i in seq_along(loci)) {
if (!startsWith(loci[i], '//')) { # read the sequence
j <- j + 1 # j keep track of ind #
sp_ln <- unlist(strsplit(loci[i], ' '))
ind[j] <- sp_ln[1]
seq[j] <- tail(sp_ln, 1)
} else { # read the locus label
if (is.null(j_end))
j_end = 1
sp_ln <- unlist(strsplit(loci[i], ' '))
# get the label and clean other symbols
locus_tmp <- gsub('*', '', tail(sp_ln, 1), fixed = TRUE)
locus_tmp <- gsub('-', '', locus_tmp, fixed = TRUE)
locus[j_end:j] <- gsub('|', '', locus_tmp, fixed = TRUE)
j_end <- j + 1 # j_end keep track of last ind of last locus
}
}
# check for spelling error in ind label
wrong_spell_in_pop <- !(popdat[, 1] %in% ind)
if (any(wrong_spell_in_pop)) {
stop('Not all ind label are found in the .loci file, please check whether\n',
'you spelt the below entry wrong in the pop file* or you wish to ',
'remove \nthe entry from the pop file:\n',
paste0(popdat[which(wrong_spell), 1], ' (line', which(wrong_spell),
')\n'), '*Note: label is case sensitive')
}
wrong_spell_in_loci <- !(ind %in% popdat[, 1]) # quick fix
if (any(wrong_spell_in_loci)) {
stop('loci file contains ind label not found in pop file, please check\n',
'whether there is an spelling error in loci file, specifically ',
'the following ind labels:\n',
paste(ind[which(!ind %in% popdat[, 1])], collapse = ', '),
'\n*Note: label is case sensitive')
}
# check pop, anything starts with number?
pop_start_num <- grepl("^[0-9]", as.character(popdat[, 2]))
if (any(pop_start_num))
stop('Population name should not starts with a number, perhaps change these ',
'names?:\n', paste(popdat[pop_start_num, 2], collapse = ', '))
# match population
for (i in seq_along(ind)) {
pop[i] <- as.character(popdat[, 2][popdat[, 1] == ind[i]])
}
# sequence length
seqlen <- sapply(seq, nchar, USE.NAMES = FALSE)
# open connection for writing
filecon <- file(filename, "wt")
# remove missing locus (locus that not present in all populations)
locus <- as.factor(locus)
pop <- as.factor(pop)
locus_pop_i <- matrix(NA, nrow = length(levels(locus)),
ncol = length(levels(pop)))
for (i in seq_along(levels(pop))) {
locus_pop_i[, i] <- table(locus[pop == levels(pop)[i]]) != 0
}
locus_keep_idx_levels <- apply(locus_pop_i, 1, all) # keep if all are TRUE
locus_keep_idx_ind <- locus %in% levels(locus)[locus_keep_idx_levels]
# write some metadata
cat(file = filecon,
length(levels(pop)),
length(levels(droplevels(locus[locus_keep_idx_ind]))),
basename(locipath), '\n')
if (any(!locus_keep_idx_levels)) {
cat(file = filecon,
'# These loci were not found in all populations and hence removed:\n#',
levels(locus)[!locus_keep_idx_levels], '\n')
}
# combine all extracted info into a big dataframe
dat <- data.frame(ind = ind, locus = locus, pop = as.factor(pop), seq = seq,
seqlen = seqlen)[locus_keep_idx_ind, ]
dat$locus <- droplevels(dat$locus)
# and clear memory
rm(ind, locus, pop, seq, seqlen)
# first sort by pop
for (i in seq_along(levels(dat$pop))) {
dat_sub <- subset(dat, dat$pop == levels(dat$pop)[i])
dat_sub$locus <- droplevels(dat_sub$locus)
# write summary
seqlen_total <- as.integer(xtabs(dat_sub$seqlen~dat_sub$locus))
n_ind_locus <- table(dat_sub$locus)
if (i == 1)
cat(file = filecon, seqlen_total/n_ind_locus, '\n')
cat(file = filecon, n_ind_locus, levels(dat$pop)[i], '\n')
# then sort by locus
for (j in seq_along(levels(dat_sub$locus))) {
locus_j <- levels(dat_sub$locus)[j]
# write locus label
cat(file = filecon, paste('#', locus_j, '\n'))
dat_sub2 <- subset(dat_sub, dat_sub$locus == locus_j)
# then write ind and seq
for (k in seq_along(dat_sub2$ind)) {
# check if ind label more than 10char, if true then trim
ind_to_write <- as.character(dat_sub2$ind[k])
ind_to_write <- ifelse(nchar(ind_to_write) <= 10, ind_to_write,
substring(ind_to_write, 1, 10))
cat(file = filecon, ind_to_write, '\t',
as.character(dat_sub2$seq[k]), '\n')
}
}
}
# give a warning if trimming of ind label was done
if (any(sapply(as.character(dat$ind), nchar) > 10))
warning('ind labels with >10 characters found and were trimmed to first 10')
# close file end writing
close(filecon)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.