Nothing
Biokey <- function(data, from="", to="", recalculate=TRUE, internal=FALSE, force=FALSE) {
mat <- matrix(c(
"bracket", "backreferenced",
"bracket", "indented",
"bracket", "serial",
"bracket", "newick",
##
"branched", "bracket",
"branched", "indented",
"branched", "serial",
"branched", "newick",
##
"indented", "bracket",
"indented", "serial",
"indented", "newick",
##
"serial", "bracket",
"serial", "indented",
"serial", "newick",
##
"classif", "newick",
"classif", "table",
##
"newick", "classif",
##
"table", "classif"
), ncol=2, byrow=TRUE, dimnames=list(NULL, c("from", "to")))
available.pairs <- apply(mat, 1, paste, collapse=" => ")
if (!force & !paste(from, to, sep=" => ") %in% available.pairs) {
stop("only these conversions are possible:\n ",
paste0(available.pairs, "\n "),
"use force=TRUE to overrun")
}
##
.depth <- function(vec) { # calculate depths, e.g., indents in indented keys
depths <- numeric(length(vec))
for(n in 2:length(depths)) {
if(vec[n] != vec[n-1]) {
m <- match(vec[n], vec[1:(n-1)])
if(!is.na(m)) {
depths[n] <- depths[m]
} else {
depths[n] <- depths[n-1] + 1
}
} else {
depths[n] <- depths[n-1]
}
}
depths
}
##
data <- as.data.frame(data, stringsAsFactors=FALSE) # in case 'key' is a matrix
##
if (from == "newick") {
tmp <- data[1, 1] # "newick" string should be in 'data' first cell
tmp <- gsub(":[0-9.]+", "", tmp) # remove branch lengths, if any
oprs <- which(unlist(strsplit(tmp, NULL)) == "(") # positions of opening parentheses
lbls <- oprs # labels
for (i in seq_along(oprs)) {
.x <- gsub("\\(([^()]|(?R))*\\)", "", substr(tmp, oprs[i], nchar(tmp)), perl=TRUE) # removes all fully match parentheses with content, leaves labels, commas and "tails"
lbls[i] <- gsub("[(,);].*$", "", .x) # removes everything after the supposed label, works because labels separated at least by comma
}
lbls[lbls == ""] <- "NA" # if there are no node labels, use NA-like _strings_
tmp <- gsub("\\)[^(,);]+", "\\)", tmp) # now remove node labels, if any
tmp <- gsub("^\\((.*) *,([^(,);]+)\\);", "(\\2,\\1));", tmp) # if the root node name is the last, it goes to the first position
tmp <- gsub("[ ;]", "", tmp) # remove spaces because we will use them later, also remove semicolon in the end
tmp <- trimws(gsub("([(,)])", " \\1 ", tmp)) # add spaces around symbols to split and remove trailing/leading spaces
tmp <- unlist(strsplit(tmp, " ")) # split by spaces and remove list "shell"
tmp <- tmp[!tmp %in% c(",", ";")] # remove chunks with commas and semicolons
maxrank <- max(table(tmp)) + 2 # biggest possible rank, because first opening bracket introduces two ranks (group and lower)
rrs <- Recode(tmp, c("(", ")"), c(-1, 1)) # opening parenthesis lowers rank, closing makes it higher
rrs <- suppressWarnings(as.numeric(rrs)) # because all text strings will issue warnings
rrs[is.na(rrs)] <- 0 # ... so these past text strings will be zeroes now
rrs <- maxrank + cumsum(rrs) # kind of relative ranks with maxrank base (without maxrank base, they might be negative)
tmp <- tmp[tmp != ")"] # now remove closing parentheses
ids <- ifelse(!(tmp == "("), 1, rrs) # if terminals, rank=1, if higher, rank is relative rank
ids <- as.numeric(as.factor(as.numeric(ids))) # yes, funny but allows to recalculate ranks from lowers to highest
tmp[tmp == "("] <- lbls # opening parentheses turn into labels
tmp <- gsub("_", " ", tmp) # replace Newick's underscores with spaces
key <- cbind(ids, NA, tmp, NA)
key <- key[key[, 3] != "", ] # remove empty terminals
}
if (from == "bracket") {
key <- cbind(data, suppressWarnings(as.numeric(data[, 3]))) # goto's into 4th column
key[, 3] <- ifelse(is.na(key[, 4]), key[, 3], "") # 3rd column with terminals only
for(n in 2:nrow(key)) {
jump <- key[n-1, 4] # 4th column must contain goto's
if(!is.na(jump)) {
first <- key[1:(n-1), ]
move2 <- (seq_len(nrow(key)))[key[, 1] == jump]
second <- key[move2, ]
third <- key[-c(1:(n-1), move2), ]
key <- rbind(first, second, third)
row.names(key) <- NULL
}
}
}
if (from == "branched") { # the most simple format, similar to internal 'key'
key <- cbind(data, NA) # add fake 4th column
}
if (from == "indented") { # similar to 'branched' but have idents as first column
key <- cbind(data[, -1], NA) # skip indents (they will be recalculated if needed)
}
if (from == "serial") { # similar to branched but ids are two ref colums (id + pair)
idsf <- paste(data[, 1], data[, 2])
idsr <- paste(data[, 2], data[, 1])
newid <- idsf
for (n in seq_along(idsf)) newid[n] <- newid[which(idsr %in% idsf[n])]
newid <- as.numeric(factor(newid, levels=unique(newid)))
key <- cbind(newid, data[, 3:4], NA) # discard old ref columns, add fake 4th column
}
if (from == "classif") {
data <- data[, 1:2] # if there are more columns, ignore them
data <- data[!duplicated(data[, 1:2]), ] # remove row duplicates (possible result of line above)
if (to != "table") { # we need _relative_ ranks and higher groups separate from terminals
data[, 1] <- as.numeric(as.factor(data[, 1])) # convert absolute numeric ranks to relative
data[, 3] <- character(nrow(data))
higher <- data[, 1] > min(data[, 1])
data[higher, 3] <- data[higher, 2] # keep higher names in 3rd column (temporary)
data[higher, 2] <- "" # remove names of all higher groups
dd <- diff(data[, 1]) # how many rows to add
dd <- abs(sum(dd[dd < -1] + 1)) # each minus below -1 is one more row
for(n in 1:(dd + nrow(data)-1)) { # insert fake rows to propagate all ranks
dfs <- data[n, 1] - data[n+1, 1]
if(dfs > 1) {
before <- data[1:n, ]
after <- data[-(1:n), ]
insert <- data[n, ]
insert[, 1] <- data[n, 1] - 1 # intermediate rank here
insert[, 2] <- "" # no terminal here
insert[, 3] <- "NA" # fake higher group
data <- rbind(before, insert, after)
row.names(data) <- NULL
}
}
key <- cbind(data[, 1], data[, 3], data[, 2], NA) # add fake 4th column
} else { # for "table" (maybe also for keys?), we need to keep original ranks and names
key <- cbind(data[, 1], NA, data[, 2], NA) # add fake 2nd (classifs do not have descriptions) and 4th columns
}
}
if (from == "table") {
data <- sapply(data, Ditto)
data <- data[, rev(colnames(data))]
ranks <- data
ranks[] <- ""
ranks[1, ] <- colnames(ranks)
ranks <- apply(ranks, 2, Fill)
ranks.data <- paste(t(ranks), t(data))
ranks.data <- gsub("[0-9.]+ *$", "", ranks.data)
ranks.data <- ranks.data[ranks.data != ""]
ranks.data <- gsub("([0-9.]+) ([^0-9]+)$", "\\1\t\\2", ranks.data)
ranks.data <- do.call(rbind, strsplit(ranks.data, "\t"))
key <- data.frame(ranks.data[, 1], NA, ranks.data[, 2], NA, stringsAsFactors=FALSE)
}
## at this point, all "from" conversions should finish their output and return internal 'key'
## 'key' is the universal internal type: linear, branched, with four columns:
## (1) ids or ranks (numbers),
## (2) descriptions or higher categories (text),
## (3) terminals (text),
## (4) goto's (if any, otherwise NAs) -- needed only to _output_ bracket (e.g., with recalculated numbers or with backreferences)
colnames(key) <- c("id", "description", "terminal", "goto")
key[is.na(key[, 3]), 3] <- "" # sometimes, terminals contain NAs instead of empty strings (depends on input format)
##
## recalculation:
if (from == "classif" | from == "newick" | from == "table") recalculate <- FALSE # these types do not need recalculation
if (recalculate) {
newids <- as.numeric(factor(key[, 1], levels=unique(key[, 1]))) # recalculate (1) ids in order of appearance
newgotos <- Recode(key[, 4], key[, 1], newids) # and (2) goto's
key[, 1] <- newids
key[, 4] <- newgotos
}
## "to" part:
##
if (to == "classif") {
res <- key[, c(1, 3)] # classifs should have ranks a.k.a. ids and terminals
colnames(res) <- c("RANK", "NAME") # not absolutely necessary but adds understanding
}
if (to == "indented") {
indents <- .depth(key[, 1])
res <- cbind(indents, key[, 1:3]) # indents as numbers
}
if (to == "serial") {
refs <- numeric(nrow(key))
for(n in seq_along(refs)) {
w <- which(key[, 1] %in% key[n, 1])
refs[n] <- w[w != n]
}
refs <- cbind(seq_along(refs), refs) # refs as two columns (id and pair); to make typographic, add parentheses around 'refs'
res <- cbind(refs, key[, 2:3]) # add two new id columns and skip 1st column with old ids
}
if (to == "bracket") {
for (n in seq_len(nrow(key))) {
if(key[n, 3] == "") key[n, 3] <- key[n+1, 1] # goto's taken from next step and placed into 3rd column
}
res <- key[order(key[, 1]), 1:3] # now theses and anti-theses are together
}
if (to == "backreferenced") {
back <- numeric(nrow(key))
for (n in seq_len(nrow(key))) {
if(key[n, 3] == "") {
key[n, 3] <- key[n+1, 1] # goto taken from next step id and placed into 4th column
back[n+1] <- key[n, 1] # backreference for next step taken from previous step id
}
}
back[back == 0] <- "" # replace skipped back's with empty strings
key <- cbind(key[, 1], back, key[, 2:3]) # backrefrences as separate character column, to make typographic, add parentheses around 'back'
res <- key[order(key[, 1]), ] # now theses and anti-theses are together
}
if (to == "newick") {
indents <- .depth(key[, 1])
dfs <- indents - c(indents[2:(length(indents))], 0)
brt <- Recode(sign(dfs), c(-1, 0, 1), c("(", ",", ")"))
mul <- abs(dfs) + (dfs == 0)
for (i in seq_along(brt)) brt[i] <- paste(rep(brt[i], mul[i]), collapse="")
tmp <- cbind(key[, 3], brt)
tmp <- paste0(paste0(t(tmp), colalpse=""), collapse="")
tmp <- gsub(")(", "),(", tmp, fixed=TRUE)
tmp <- gsub("([A-z0-9])\\(", "\\1,\\(", tmp) # 'name(' -> 'name,('
tmp <- gsub("\\)([A-z0-9])", "\\),\\1", tmp) # ')name' -> '),name'
tmp <- gsub("(,", "(", tmp, fixed=TRUE) # remove left empty terminals
tmp <- gsub(",)", ")", tmp, fixed=TRUE) # remove right empty terminals
tmp <- gsub(",*\\(\\)", "", tmp) # some unknown empties
tmp <- gsub(",,+", ",", tmp) # remove empty terminals, again
tmp <- gsub(" ", "_", tmp, fixed=TRUE) # Newick dislikes spaces, at least in terminals
nn <- paste0("(", tmp, ");") # add root node and Newick's "end of tree"
if (from == "classif") {
lbls <- key[grep("\\(", brt), 2] # labels for opening parentheses (including double and more)
lbls <- c("", lbls) # add empty root node label
lbls[lbls == "NA"] <- "" # make "NA" string labels (fake subgroups) empty
.PPadd <- function(txt, labels){ # adds labels to closing matches of the each opening parenthesis
txts <- unlist(strsplit(txt, NULL))
cpp <- opp <- which(txts == "(")
txtn <- Recode4(txts, c("(", ")"), c(-1, 1), 0)
txtl <- length(txtn)
for (i in seq_along(opp)) {
pos <- (opp[i] + which(cumsum(txtn[opp[i]:txtl]) == 0)[1]) - 1 # we need the first match
txts[pos] <- paste0(txts[pos], labels[i])
}
paste0(txts, collapse="")
}
res <- .PPadd(nn, lbls)
} else {
res <- nn
}
}
if (to == "table") {
allranks <- sort(unique(key[, 1]))
for (i in seq_along(allranks)) {
tmp <- key[, 3]
tmp[(as.numeric(key[, 1]) < as.numeric(allranks[i]))] <- "" # empty values of lower ranks
tmp <- Fill(tmp) # fill them from values above
assign(allranks[i], tmp) # make as many variables as there are ranks
}
res <- do.call(cbind, mget(allranks))
res <- res[as.numeric(key[, 1]) == min(as.numeric(allranks)), ]
}
if (internal) res <- key # allows to output internal 4-column object
res
}
Numranks <- function(nums=NULL, ranks=NULL, add=NULL, empty="Species") {
if(!is.null(nums) & !is.null(ranks)) stop("either 'nums' or 'ranks' (but not both) must be specified")
mat <- matrix(c(
0.5, "Varietes",
0.8, "Subspecies",
1.0, "Species",
1.2, "Subsectio",
1.5, "Sectio",
1.8, "Subgenus",
2.0, "Genus",
2.2, "Subtribus",
2.5, "Tribus",
2.8, "Subfamilia",
3.0, "Familia",
3.2, "Superfamilia",
3.5, "Infraordo",
3.8, "Subordo",
4.0, "Ordo",
4.2, "Superordo",
4.5, "Infraclassis",
4.8, "Subclassis",
5.0, "Classis",
5.2, "Superclassis",
5.8, "Subphylum",
6.0, "Phylum",
6.2, "Superphylum",
6.5, "Infraregnum",
6.8, "Subregnum",
7.0, "Regnum",
7.2, "Superregnum"
), ncol=2, byrow=TRUE, dimnames=list(NULL, c("num", "rank")))
if(!is.null(add)) {
mat <- rbind(mat, matrix(c(add, names(add)), ncol=2))
}
if (is.null(nums) & is.null(ranks)) res <- data.frame(mat)
if(!is.null(nums) & is.null(ranks)) {
res <- Recode(nums, mat[, 1], mat[, 2])
}
if(is.null(nums) & !is.null(ranks)) {
ranks <- gsub("[^a-z]", "", tolower(ranks))
ranks <- Recode(ranks,
c("tribe", "subtribe", "section", "subsection", "family", "subfamily", "superfamiliy", "order", "suborder", "superorder", "kingdom", "subkingdom"),
c("tribus","subtribus", "sectio", "subsectio", "familia", "subfamilia", "superfamilia", "ordo", "subordo", "superordo", "regnum", "subregnum"))
ranks[ranks == ""] <- empty
.cap <- function(.x) { # from help(toupper)
sapply(strsplit(.x, split=" "), function(.xx)
paste0(toupper(substring(.xx, 1, 1)), substring(.xx, 2), collapse=" "))
}
res <- as.numeric(RecodeR(.cap(ranks), mat[, 2], mat[, 1]))
}
res
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.