Nothing
## ====================================================
## Computing distances between sequences
## ====================================================
seqdistOO <- function(seqdata, method, refseq=NULL, norm=FALSE,
indel=1, sm=NA, with.missing=FALSE, full.matrix=TRUE,
kweights=rep(1.0, ncol(seqdata)), tpow=1,
expcost=0.5, context=1-2*expcost,
link="mean", h=0.5, nu=0,
transindel="constant", otto, previous=FALSE,
addcolumn=TRUE, numcpu=1,
breaks=NULL, step=1, overlap=FALSE, weighted=TRUE) {
gc(FALSE)
warnings(" [>] seqdistOO is still under development. Some configuration of parameters may produce errors.")
## Checking method name
metlist <- c("OM", "OMopt", "LCP", "LCS", "LCSopt", "RLCP", "DHD", "HAM",
"NMS", "NMSMST", "SVRspell",
"CHI2", "EUCLID",
"OMloc", "OMspell", "OMslen", "OMstran", "TWED")
OMvariant <- c("OM", "OMloc", "OMspell", "OMslen")
if (missing(method)) {
stop(" [!] You should specify a method to compute the distances. It must be one of: ", paste(metlist,collapse=" "))
}
if (!method %in% metlist) {
stop(" [!] Method must be one of: ", paste(metlist, collapse=" "), call.=FALSE)
}
debut <- proc.time()
if(method=="OM" && length(indel)>1){
method <- "OMloc"
}
## Checking correct arguments
if (!inherits(seqdata,"stslist")) {
stop(" [!] data is not a state sequence object, use 'seqdef' function to create one", call.=FALSE)
}
if (inherits(refseq, "stslist")) {
a.ref <- alphabet(refseq)
a.seq <- alphabet(seqdata)
if ((length(a.ref)!=length(a.seq)) | (!suppressWarnings(all(a.ref==a.seq)))) {
stop(" [!] same alphabet must be assigned to 'refseq' and 'seqdata'!", call.=FALSE)
}
}
if (method=="SVRspell") {
method <- "NMSMST"
SVRspell <- TRUE
} else{
SVRspell <- FALSE
}
if (method=="OMopt") {
method <- "OM"
optimized <- TRUE
}
else if (method=="LCSopt") {
method <- "LCS"
optimized <- TRUE
} else if(method %in% c("CHI2", "EUCLID", "OMstran")){
if(method=="CHI2"){
distances <- chisqdist(seqdata, breaks=breaks, step=step, with.missing=with.missing, norm=norm, weighted=weighted, overlap=overlap)
} else if(method=="EUCLID"){
distances <- chisqdist(seqdata, breaks=breaks, step=step, with.missing=with.missing, norm=norm, weighted=FALSE, overlap=overlap, euclid=TRUE)
} else if(method=="OMstran"){
distances <- seqdistOMTrans(seqObj=seqdata, indel=indel, sm=sm, transmetric=transindel, stateweight=otto, with.missing=with.missing, norm=norm, previous=previous, addcolumn=addcolumn, full.matrix=full.matrix, weighted=weighted, numcpu=numcpu)
}
if (full.matrix && inherits(distances, "dist")) {
return(TraMineR:::dist2matrix(distances))
}
else {
return(distances)
}
}
else {
optimized <- FALSE
}
## Taking care of correct normalization settings
if (is.logical(norm)) {
if (norm) {
## Normalize using Elzinga for LCP, LCS, RLCP
if (method %in% c("LCP", "LCS", "RLCP")) {
norm <- 2
} else {## Normalize using Abbott for OM, HAM, and DHD
norm <- 1
}
} else {
norm <- 0
}
} else if (is.character(norm)) {
## Using normalization name
## Cast to integer for c code and normdist function
## Match return the position, removing 1 to start at zero
normIndice <- match(norm, c("none", "maxlength", "gmean", "maxdist", "YujianBo")) -1
if (is.na(normIndice)) { ##Not found
stop(" [!] Unknown distance normalization method ", norm)
}
norm <- normIndice
} else {
stop(" [!] Unknown distance normalization method ", norm)
}
## Checking missing values
if (!with.missing && any(seqdata==attr(seqdata,"nr"))) {
stop("found missing values in sequences, please set 'with.missing=TRUE' to nevertheless compute distances")
}
if (method == "OM" && is.character(sm)) {
if (sm == "TRATE" || sm =="CONSTANT") {
sm <- seqsubm(seqdata, method=sm, cval=2, with.missing=with.missing, miss.cost=2)
} else {
stop(" [!] Unknown method ", sm, " to compute substitution costs")
}
}
if (method == "OM" && is.na(sm)) {
stop(" [!] sm=NA not allowed with method='OM'. Specify a valid sm value!")
}
## =====================
## Base information
## =====================
n <- nrow(seqdata)
alphabet <- attr(seqdata,"alphabet")
alphsize <- length(alphabet)
message(" [>] ",n," sequences with ", alphsize, " distinct events/states")
## Gaps in sequences
if (with.missing) {
alphabet <- c(alphabet,attr(seqdata,"nr"))
alphsize <- length(alphabet)
message(" [>] including missing value as additional state" )
}
## Checking methods that are treated the same
methodname <- method
if (method == "LCS") {
method <- "OM"
sm <- suppressMessages(seqsubm(seqdata, method="CONSTANT", cval=2, with.missing=with.missing, miss.cost=2))
indel <- 1
} else if (method == "HAM") {
method <- "DHD"
if (!is.null(dim(sm))) {
TraMineR.checkcost(sma=sm, seqdata=seqdata, with.missing=with.missing)
if(is.matrix(sm)){
costs <- array(0, dim=c(alphsize, alphsize, ncol(seqdata)))
for(i in 1:ncol(seqdata)){
costs[,,i] <- sm
}
sm <- costs
}
} else {
sm <- suppressMessages(seqsubm(seqdata, "CONSTANT", cval=1, with.missing=with.missing,
miss.cost=1, time.varying=TRUE))
}
}
## ===========================
## Checking correct size of sm
## ===========================
## Checking if substitution cost matrix contains values for each state
## and if the triangle inequality is respected
if (method %in% OMvariant) {
TraMineR.checkcost(sma=sm, seqdata=seqdata, with.missing=with.missing, indel=indel)
}
## Checking if substitution cost matrix contains values for each state
## and if the triangle inequality is respected
if(methodname == "DHD"){
## User entered substitution cost
if(length(sm)>1){
TraMineR.checkcost(sma=sm, seqdata=seqdata, with.missing=with.missing)
}
else {
sm <- seqsubm(seqdata, "TRATE", cval=4, with.missing=with.missing,
miss.cost=4, time.varying=TRUE)
}
}
## ==============
## Preparing data
## ==============
if(method %in% c("NMSMST", "OMspell")) {
seqduree <- seqdur(seqdata, with.missing=with.missing)^tpow
seqdatadss <- seqdss(seqdata, with.missing=with.missing)
}else{
seqduree <- NULL
}
seqdata <- seqnum(seqdata, with.missing=with.missing)
## Selecting distinct sequences only and saving the indexes
dseq <- unique(seqdata)
mcorr <- match(seqconc(seqdata), seqconc(dseq))
if(method %in% c("NMSMST", "OMspell")) {
mcorrReverse <- match(seqconc(dseq), seqconc(seqdata))
seqduree <- seqduree[mcorrReverse,]
dseq <- seqnum(seqdatadss[mcorrReverse,], with.missing=with.missing)
}
nd <- nrow(dseq)
message(" [>] ", nd," distinct sequences")
slength <- seqlength(dseq)
if (method=="OMslen") {
seqduree <- matrix(0, nrow=nrow(dseq), ncol=ncol(dseq))
ddur <- seqdur(dseq)
for(i in 1:nrow(dseq)){
y <- ddur[i, !is.na(ddur[i,])]
seqduree[i, 1:sum(y)] <- rep(y, times=y)
}
seqduree <- seqduree^(-1*h)
}
dseq <- TraMineR:::seqasnum(dseq, with.missing=with.missing)
message(" [>] min/max sequence length: ",min(slength),"/",max(slength))
## ===================================
## Preparing data for Hamming distance
## ===================================
if (method=="DHD") {
if(length(unique(slength))>1) {
## Hamming is not defined for sequence of different length
stop(methodname, " distance can only be computed between sequences of equal length")
}
## Here we use the indel parameter for the C function
## But it contain the maximum possible cost of the hamming distance
indel <- 0
for (i in 1:max(slength)) {
indel <- indel + max(sm[,,i])
}
}
message(" [>] computing distances using ", methodname,
ifelse(norm!=0," normalized", ""), " metric")
params <- list()
if (method=="OM") {
## One for OM, 2 for LCP
disttype <- as.integer(1)
params[["scost"]] <- sm
params[["alphasize"]] <- alphsize
params[["indel"]] <- indel
}
else if (method=="LCP") {
disttype <- as.integer(2) ## One for OM, 2 for LCP
params[["sign"]] <- as.integer(1)
}
else if (method=="RLCP") {
disttype <- as.integer(3) ## One for OM, 2 for LCP
params[["sign"]] <- as.integer(-1)
}
else if (method=="DHD") {
disttype <- as.integer(4) ## 4 for DHD or HAM
params[["scost"]] <- sm
params[["alphasize"]] <- alphsize
params[["maxdist"]] <- indel
}
else if (method=="NMS") {
## One for OM, 2 for LCP
disttype <- as.integer(5)
kweights2 <- double(ncol(dseq))
kweights2[1:min(ncol(dseq),length(kweights))] <- kweights[1:min(ncol(dseq),length(kweights))]
params[["kweights"]] <- as.double(kweights2)
params[["distMethod"]] <- as.integer(2)
params[["distTransform"]] <- as.integer(link=="log")
if(!is.na(sm)){
disttype <- as.integer(12)
params[["softmatch"]] <- sm
params[["alphasize"]] <- alphsize
if (nrow(sm)!=alphsize | ncol(sm)!=alphsize) {
stop(" [!] size of soft matching matrix must be ", alphsize,"x", alphsize)
}
eg <- eigen(sm)
if(any(eg$values<0)){
warning(" [!] matching between states is not semipositive definite. Eigen values: ", paste(eg$values, collapse=" "))
}
}
}
else if (method=="NMSMST") {
## One for OM, 2 for LCP
kweights2 <- double(ncol(dseq))
kweights2[1:min(ncol(dseq),length(kweights))] <- kweights[1:min(ncol(dseq),length(kweights))]
disttype <- as.integer(6)
params[["seqdur"]] <- as.double(seqduree)
params[["kweights"]] <- as.double(kweights2)
params[["distMethod"]] <- as.integer(2)
params[["distTransform"]] <- as.integer(link=="log")
if(length(sm)>1||SVRspell){
if(SVRspell){ ## SVRspell
disttype <- as.integer(13)
if(length(sm)==1){
sm <- matrix(0, ncol=alphsize, nrow=alphsize)
diag(sm) <- 1
}
} else {
disttype <- as.integer(11)
}
params[["softmatch"]] <- sm
params[["alphasize"]] <- alphsize
if (nrow(sm)!=alphsize | ncol(sm)!=alphsize) {
stop(" [!] size of soft matching matrix must be ", alphsize,"x", alphsize)
}
eg <- eigen(sm)
if(any(eg$values < -1e-07)){
warning(" [!] matching between states is not semipositive definite. Eigen values: ", paste(round(eg$values, 3), collapse=" "))
}
}
}else if (method=="OMloc") {
## One for OM, 2 for LCP
disttype <- as.integer(7)
params[["scost"]] <- sm
params[["alphasize"]] <- alphsize
params[["indel"]] <- indel
if(length(indel)>1){
params[["indels"]] <- indel
params[["indelmethod"]] <- as.integer(0)
}else {
params[["indels"]] <- rep(indel, alphsize)
if(link=="min"){
params[["indelmethod"]] <- as.integer(2)
} else if(link=="mean"){
## Maximum indel cost
params[["indel"]] <- max(sm)*(expcost+context)
params[["indelmethod"]] <- as.integer(1)
} else if(link=="previous"){
disttype <- as.integer(9)
params[["firststatemethod"]] <- as.integer(1)
}
else if(link=="previous_nofirst"){
disttype <- as.integer(9)
params[["firststatemethod"]] <- as.integer(0)
}
message(" [>] 2*expcost+context=", 2*expcost+context)
}
params[["localcost"]] <- context
params[["timecost"]] <- expcost
}
else if (method=="OMspell") {
## One for OM, 2 for LCP
params[["seqdur"]] <- as.double(seqduree-1)
disttype <- as.integer(8)
params[["scost"]] <- sm
params[["alphasize"]] <- alphsize
params[["indel"]] <- indel
params[["timecost"]] <- expcost
}
else if (method=="OMslen") {
## One for OM, 2 for LCP
params[["seqdur"]] <- as.double(seqduree)
disttype <- as.integer(10)
if(link=="mean"){
params[["sublink"]] <- as.integer(1)
params[["scost"]] <- sm/2
}
else if (link=="gmean"){
params[["sublink"]] <- as.integer(0)
params[["scost"]] <- sm
}
else if (link=="max"){
params[["sublink"]] <- as.integer(2)
params[["scost"]] <- sm
}
params[["alphasize"]] <- alphsize
params[["indel"]] <- indel
} else if (method=="TWED") {
## One for OM, 2 for LCP
disttype <- as.integer(14)
params[["scost"]] <- sm
params[["alphasize"]] <- alphsize
params[["indel"]] <- indel
params[["lambda"]] <- h
params[["nu"]] <- nu
}else {
stop(" [!] unsupported distance method ", method)
}
## Function and arguments
if (!missing(refseq) && !is.null(refseq)) {
if (is.numeric(refseq) && refseq>0 && refseq <= nrow(seqdata)) {
message(" [>] using sequence ", refseq,": ",
suppressMessages(seqformat(seqdata[refseq,], from="STS", to="SPS", compressed=TRUE)), " as reference")
refseqid <- mcorr[refseq]
} else {
stop("[!] invalid reference sequence", call.=FALSE)
}
## Getting refseq
##User specified
# if (inherits(refseq,"stslist") && nrow(refseq)==1) {
# compseq <- refseq
# message(" [>] using (external) sequence ",
# suppressMessages(seqformat(compseq, from="STS", to="SPS", compressed=TRUE)), " as reference")
# }
# ## Most frequent sequence as reference
# else if (refseq==0) {
# mfseq <- seqtab(seqdata, tlim=1)
# message(" [>] using most frequent sequence as reference: ",
# suppressMessages(seqformat(mfseq, from="STS", to="SPS", compressed=TRUE)))
# idxmfseq <- suppressMessages(seqfind(mfseq, seqdata))
# message(" [>] most frequent sequence appears ", length(idxmfseq), " times")
# compseq <- seqdata[idxmfseq[1],]
# }
# ## Indice of sequence given as reference
# else
# ## Length of compseq
## lcompseq <- seqlength(compseq)
## Vector of distance
## m <- vector(mode="numeric", length=nd)
## compseq <- TraMineR:::seqasnum(seqnum(compseq), with.missing=with.missing)
## ddseq <- rbind(dseq[1,], dseq)
## ddseq[1,1:lcompseq] <- compseq[1:lcompseq]
## slength <- c(lcompseq, slength)
## SEXP cstringrefseqdistanceOO(SEXP Ssequences, SEXP seqdim, SEXP lenS, SEXP paramS, SEXP normS, SEXP disttypeS, SEXP refseqS) {
distances <- .Call(TMR_cstringrefseqdistanceOO,
as.integer(dseq),
as.integer(dim(dseq)),
as.integer(slength),
params,
as.integer(norm),
disttype,
as.integer(refseqid))
## distances <- distances[2:length(distances)]
## Constructing the final distance vector
#mcorr <- match(seqconc(seqdata),seqconc(dseq))
distances <- distances[mcorr]
names(distances) <- NULL
}
else { ## !Refseq
magicSeq <- order(mcorr)
magicIndex <- c(unique(rank(mcorr, ties.method="min")), nrow(seqdata)+1)-1
#SEXP cstringdistanceOO(SEXP Ssequences, SEXP seqdim, SEXP lenS, SEXP paramS, SEXP normS, SEXP magicIndexS, SEXP magicSeqS, SEXP disttypeS)
if(numcpu==1){
distances <- .Call(TMR_cstringdistanceOO,
as.integer(dseq),
as.integer(dim(dseq)),
as.integer(slength),
params,
as.integer(norm),
as.integer(magicIndex),
as.integer(magicSeq),
disttype)
} else {
distances <- .Call(TMR_cstringdistanceOOomp,
as.integer(dseq),
as.integer(dim(dseq)),
as.integer(slength),
params,
as.integer(norm),
as.integer(magicIndex),
as.integer(magicSeq),
disttype,
as.integer(numcpu))
}
if(method=="NMS"||method=="SVRspell"||method=="NMSMST"){
distances <- sqrt(distances)
}
## Setting some attributes for the dist object
class(distances) <- "dist"
attr(distances,"Size") <- length(magicSeq)
attr(distances,"method") <- method
attr(distances, "Labels") <- dimnames(seqdata)[[1]]
attr(distances, "Diag") <- FALSE
attr(distances, "Upper") <- FALSE
}
fin <- proc.time()
totaltime <- format(round(difftime(as.POSIXct(sum(fin[1:2]), origin="1960-01-01"), as.POSIXct(sum(debut[1:2]), origin="1960-01-01")), 3))
message(" [>] total time: ", totaltime)
if (full.matrix && inherits(distances, "dist")) {
return(TraMineR:::dist2matrix(distances))
}
else {
return(distances)
}
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.