Nothing
#' Fit the supervised classifier under partition exchangeability
#'
#' Fits the model according to training data x, where x is assumed to follow
#' the Poisson-Dirichlet distribution, and discrete labels y.
#' @param x data vector, or matrix with rows as data points and columns as features.
#' @param y training data label vector of length equal to the amount of rows in `x`.
#' @details This function is used to learn the model parameters from the
#' training data, and gather them into an object that is used by the
#' classification algorithms `tMarLab()` and `tSimLab()`. The parameters it learns
#' are the Maximum Likelihood Estimate of the \eqn{\psi} of each feature within
#' each class in the training data. It also records the frequencies of the data
#' for each feature within each class as well. These are used in calculating the
#' predictive probability of each test data being in each of the classes.
#' @return Returns an object used as training data objects for the classification
#' algorithms `tMarLab()` and `tSimLab()`.
#' @return If `x` is multidimensional, each list described below is returned for each dimension.
#' @return Returns a list of classwise lists, each with components:
#' @return `frequencies`: the frequencies of values in the class.
#' @return `psi`: the Maximum Likelihood estimate of \eqn{\psi} for the class.
#' @keywords Fit training data
#' @export
#' @examples
#' ## Create training data x and its class labels y from Poisson-Dirichlet distributions
#' ## with different psis:
#' set.seed(111)
#' x1<-rPD(5000,10)
#' x2<-rPD(5000,100)
#' x<-c(x1,x2)
#' y1<-rep("1", 5000)
#' y2<-rep("2", 5000)
#' y<-c(y1,y2)
#' fit<-classifier.fit(x,y)
#'
#' ## With multidimensional x:
#' set.seed(111)
#' x1<-cbind(rPD(5000,10),rPD(5000,50))
#' x2<-cbind(rPD(5000,100),rPD(5000,500))
#' x<-rbind(x1,x2)
#' y1<-rep("1", 5000)
#' y2<-rep("2", 5000)
#' y<-c(y1,y2)
#' fit<-classifier.fit(x,y)
classifier.fit <- function(x, y) {
if(length(dim(x))>1){
feature_wise_results<-apply(x, 2, function(z) classifier.fit(z, y))
return(feature_wise_results)
}
y<-sapply(y, toString)
results<-list()
classes<-unique(y)
for (yi in classes) {
cdata<-x[which(yi==y)]
cw.freqs<-table(cdata)
cw.abunds<-abundances(cw.freqs)
cw.PsiMLE<-MLEp(cw.abunds)
results[[yi]]<-list(frequencies=cw.freqs, psi=cw.PsiMLE)
}
return(results)
}
#' Marginally predicted labels of the test data given training data classification.
#'
#' Classifies the test data `x` based on the training data object.
#' The test data is considered i.i.d., so each
#' data point is classified one by one.
#' @param training A training data object from the function `classifier.fit()`.
#' @param x Test data vector or matrix with rows as data points and columns as features.
#' @return A vector of predicted labels for test data x.
#' @keywords Marginal classifier
#' @export
#' @references Amiryousefi A. Asymptotic supervised predictive classifiers under
#' partition exchangeability. . 2021. <https://arxiv.org/abs/2101.10950>.
#' @references Corander, J., Cui, Y., Koski, T., and Siren, J.: Have I seen you before?
#' Principles of Bayesian predictive classification revisited. Springer, Stat.
#' Comput. 23, (2011), 59–73, (<\doi{10.1007/s11222-011-9291-7}>).
#' @details
#' Independently assigns a class label for each test data point according to a
#' \eqn{maximum \, a \, posteriori} rule. The predictive probability of data point
#' \eqn{x_i} arising from class \eqn{c} assuming the training data of size \eqn{m_c} in the class
#' arises from a Poisson-Dirichlet(\eqn{\hat{\psi}_c}) distribution is:
#' \deqn{\hat{\psi}_c / (m_c + \hat{\psi}_c),}
#' if no value equal to \eqn{x_i} exists in the training data of class \eqn{c}, and
#' \deqn{m_{ci} / (m_c + \hat{\psi}_c),}
#' if there does, where \eqn{m_{ci}} is the frequency of the value of \eqn{x_i}
#' in the training data.
#' @examples
#' ## Create random samples x from Poisson-Dirichlet distributions with different
#' ## psis, treating each sample as coming from a class of its own:
#' set.seed(111)
#' x1<-rPD(10500,10)
#' x2<-rPD(10500,1000)
#' test.ind1<-sample.int(10500,500) # Sample test datasets from the
#' test.ind2<-sample.int(10500,500) # original samples
#' x<-c(x1[-test.ind1],x2[-test.ind2])
#' ## create training data labels:
#' y1<-rep("1", 10000)
#' y2<-rep("2", 10000)
#' y<-c(y1,y2)
#'
#' ## Test data t, with first half belonging to class "1", second have in "2":
#' t1<-x1[test.ind1]
#' t2<-x2[test.ind2]
#' t<-c(t1,t2)
#'
#' fit<-classifier.fit(x,y)
#'
#' ## Run the classifier, which returns
#' tM<-tMarLab(fit, t)
#'
#' ##With multidimensional x:
#' set.seed(111)
#' x1<-cbind(rPD(5500,10),rPD(5500,50))
#' x2<-cbind(rPD(5500,100),rPD(5500,500))
#' test.ind1<-sample.int(5500,500)
#' test.ind2<-sample.int(5500,500)
#' x<-rbind(x1[-test.ind1,],x2[-test.ind2,])
#' y1<-rep("1", 5000)
#' y2<-rep("2", 5000)
#' y<-c(y1,y2)
#' fit<-classifier.fit(x,y)
#' t1<-x1[test.ind1,]
#' t2<-x2[test.ind2,]
#' t<-rbind(t1,t2)
#'
#' tM<-tMarLab(fit, t)
tMarLab <- function(training, x) {
pred<-c()
if(names(training[[1]])[1]=="frequencies"){training<-list(training)}
nfeatures<-length(training)
classes<-names(training[[1]])
x<-matrix(x, ncol = nfeatures)
for (i in 1:length(x[,1])) { ####problem: how to for entire row
probs<-cbind(classes, rep(0, length(classes))) #vector to collect prob of each class
j<-0 #class index for prob vector
xi<-x[i,]
for (class in classes) {
j<-j+1
for(feat in 1:nfeatures) {
xd<-toString(xi[feat])
if (xd %in% names(training[[feat]][[class]][["frequencies"]])) {
probs[j,2]<- as.double(probs[j,2]) + log( training[[feat]][[class]][["frequencies"]][[xd]] /
(sum(training[[feat]][[class]][["frequencies"]]) + training[[feat]][[class]][["psi"]]) )
} else {
probs[j,2]<- as.double(probs[j,2]) + log( training[[feat]][[class]][["psi"]] /
(sum(training[[feat]][[class]][["frequencies"]]) + training[[feat]][[class]][["psi"]]) )
}
}
}
pred<-c(pred, probs[which.max(probs[,2]),1], use.names=FALSE)
}
return(pred)
}
#' Simultaneously predicted labels of the test data given the training data classification.
#'
#' Classifies the test data `x` based on the training data object.
#' All of the test data is used simultaneously to make the classification.
#' @param training A training data object from the function `classifier.fit()`.
#' @param x Test data vector or matrix with rows as data points and columns as features.
#' @return A vector of predicted labels for test data x.
#' @keywords Simultaneous classifier
#' @usage tSimLab(training, x)
#' @export
#' @details
#' The test data are first labeled with the marginal classifier. The simultaneous
#' classifier then iterates over all test data, assigning each a label by finding
#' the maximum predictive probability given the current classification structure of
#' the test data as a whole. This is repeated until the classification structure
#' converges after iterating over all data.
#' @references Amiryousefi A. Asymptotic supervised predictive classifiers under
#' partition exchangeability. . 2021. <https://arxiv.org/abs/2101.10950>.
#' @references Corander, J., Cui, Y., Koski, T., and Siren, J.: Have I seen you before?
#' Principles of Bayesian predictive classification revisited. Springer, Stat.
#' Comput. 23, (2011), 59–73, (<\doi{10.1007/s11222-011-9291-7}>).
#' @examples
#' ## Create random samples x from Poisson-Dirichlet distributions with different
#' ## psis, treating each sample as coming from a class of its own:
#' set.seed(111)
#' x1<-rPD(1050,10)
#' x2<-rPD(1050,1000)
#' test.ind1<-sample.int(1050,50) # Sample test datasets from the
#' test.ind2<-sample.int(1050,50) # original samples
#' x<-c(x1[-test.ind1],x2[-test.ind2])
#' ## create training data labels:
#' y1<-rep("1", 1000)
#' y2<-rep("2", 1000)
#' y<-c(y1,y2)
#'
#' ## Test data t, with first half belonging to class "1", second have in "2":
#' t1<-x1[test.ind1]
#' t2<-x2[test.ind2]
#' t<-c(t1,t2)
#'
#' fit<-classifier.fit(x,y)
#'
#' ## Run the classifier, which returns
#' tS<-tSimLab(fit, t)
#'
#' ##With multidimensional x:
#' set.seed(111)
#' x1<-cbind(rPD(500,1),rPD(500,5))
#' x2<-cbind(rPD(500,10),rPD(500,50))
#' test.ind1<-sample.int(500,50)
#' test.ind2<-sample.int(500,50)
#' x<-rbind(x1[-test.ind1,],x2[-test.ind2,])
#' y1<-rep("1", 450)
#' y2<-rep("2", 450)
#' y<-c(y1,y2)
#' fit<-classifier.fit(x,y)
#' t1<-x1[test.ind1,]
#' t2<-x2[test.ind2,]
#' t<-rbind(t1,t2)
#'
#' tS<-tSimLab(fit, t)
tSimLab <- function(training, x) {
if(names(training[[1]])[1]=="frequencies"){training<-list(training)}
classes<-names(training[[1]])
nfeatures<-length(training)
x<-matrix(x, ncol = nfeatures)
#collect class Dirichlet priors
#pred<-cbind(x,tMarLab(training,x)) #using the new tMarLab. pred is the iterator
#predS <- cbind(rep(NULL, length(x)), rep(NULL, length(x))) # predS is the previous value of pred
pred<-tMarLab(training, x)
predS<-rep(NULL, length(pred))
#while (sum(pred[,2]==predS[,2])!=length(x[,1])) {
while (sum(pred==predS)!=length(x[,1])) {
predS<-pred
for (i in 1:length(x[,1])) {
probsByClass<-c()
j=0
for (class in classes) {
pred[i] <- NaN
j=j+1
prob<-0
#combined frequencies of training and test data in the same class
#add a line that takes the currently predicted test item out of test data
for (feat in 1:nfeatures) {
L<-list(table(x[which(pred==class),feat]), training[[feat]][[class]][["frequencies"]])#combined frequencies of training and test data in the same class
freqs<-tapply(unlist(L), names(unlist(L)), sum)
xid<-toString(x[i,feat])
if (xid %in% names(freqs)) {
prob<-prob + log( freqs[[xid]] /
(sum(freqs) + training[[feat]][[class]][["psi"]]) )
} else {
prob<-prob + log( training[[feat]][[class]][["psi"]] /
(sum(freqs) + training[[feat]][[class]][["psi"]]) )
}
}
probsByClass <- append(probsByClass, prob)
}
pred[i] <- classes[which.max(probsByClass)]
}
}
return(pred)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.