How to use RSDA 3.2.1"

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 6,
  fig.height = 5
)
library(RSDA)

RSDA Package version 3.3

Oldemar Rodríguez R.

Installing the package

CRAN

install.packages("RSDA", dependencies=TRUE)

Github

devtools::install_github("PROMiDAT/RSDA")

How to read a Symbolic Table from a CSV file with RSDA?

ex3 <- read.sym.table(file = 'tsym1.csv', header=TRUE, sep=';',dec='.', row.names=1)
ex3

How to save a Symbolic Table in a CSV file with RSDA?

write.sym.table(ex3, file = 'tsymtemp.csv', sep = ';',dec = '.',
                row.names = TRUE, col.names = TRUE)

Symbolic Data Frame Example in RSDA

data(example3)
example3
example3[2,]
example3[,3]
example3[2:3,5]
example3$F1

How to generated a symbolic data table from a classic data table in RSDA?

data(ex1_db2so)
ex1_db2so

The classic.to.sym function allows to convert a traditional table into a symbolic one, to this we must indicate the following parameters.

Example 1

result <- classic.to.sym(x = ex1_db2so, 
                         concept = c(state, sex),
                         variables = c(county, group, age))
result

We can add new variables indicating the type we want them to be.

result <- classic.to.sym(x = ex1_db2so, 
                         concept = c("state", "sex"),
                         variables = c(county, group, age),
                         age_hist = sym.histogram(age, breaks = pretty(ex1_db2so$age, 5)))
result

Example 2

data(USCrime)
head(USCrime)
result  <- classic.to.sym(x = USCrime,
                          concept = state, 
                          variables= c(NumInShelters,
                                       NumImmig,
                                       ViolentCrimesPerPop),
                          ViolentCrimesPerPop_hist = sym.histogram(ViolentCrimesPerPop,
                                                                   breaks = pretty(USCrime$ViolentCrimesPerPop,5)))
result

Example 3

data("ex_mcfa1") 
head(ex_mcfa1)
sym.table <- classic.to.sym(x = ex_mcfa1, 
                            concept = suspect, 
                            variables=c(hair,
                                        eyes,
                                        region),
                            default.categorical = sym.set)
sym.table

Example 4

We can modify the function that will be applied by default to the categorical variables

sym.table <- classic.to.sym(x = ex_mcfa1, 
                            concept = suspect,
                            default.categorical = sym.set)
sym.table

Converting a SODAS 1.0 *.SDS files to RSDA files

hani3101 <- SDS.to.RSDA(file.path = "hani3101.sds")
hani3101
# We can save the file in CSV to RSDA format as follows:
write.sym.table(hani3101,
                file='hani3101.csv',
                sep=';',
                dec='.',
                row.names=TRUE,
                col.names=TRUE)

Converting a SODAS 2.0 *.XML files to RSDA files

abalone <- SODAS.to.RSDA("abalone.xml")
abalone
write.sym.table(abalone,
                file='abalone.csv',
                sep=';',
                dec='.',
                row.names = TRUE,
                col.names = TRUE)

Basic statistics

Symbolic Mean

data(example3)
mean(example3$F1)
mean(example3[,1])
mean(example3$F2)
mean(example3[,2])
mean(example3$F2,method = "interval")
mean(example3[,2],method = "interval")

Symbolic median

median(example3$F1)
median(example3[,1])
median(example3$F2)
median(example3[,2])
median(example3$F6, method = 'interval')
median(example3[,6], method = 'interval')

Variance and standard deviation

var(example3[,1])
var(example3[,2])
var(example3$F6)
var(example3$F6, method = 'interval')
var(example3$F6, method = 'billard')
sd(example3$F1)
sd(example3$F2)
sd(example3$F6)
sd(example3$F6, method = 'interval')
sd(example3$F6, method = 'billard')

Symbolic correlation

cor(example3$F1, example3$F4)
cor(example3[,1], example3[,4])
cor(example3$F2, example3$F6, method = 'centers')
cor(example3$F2, example3$F6, method = 'billard')

Radar plot for intervals

library(ggpolypath)

data(oils)
oils <- RSDA:::to.v3(RSDA:::to.v2(oils))
sym.radar.plot(oils[2:3,])
sym.radar.plot(oils[2:5,])

res <- interval.histogram.plot(oils[,2],
                               n.bins = 4,
                               col = c(2,3,4,5))
res

res <- interval.histogram.plot(oils[,3],
                               n.bins = 3,
                               main = "Histogram",
                               col = c(2, 3, 4))
res

Distances for intervals

Gowda-Diday

data("oils")
DM <- sym.dist.interval(sym.data = oils[,1:4],
                        method = "Gowda.Diday")
model <- hclust(DM)
plot(model, hang = -1)

Ichino

DM <- sym.dist.interval(sym.data= oils[,1:4],
                        method = "Ichino")
model <- hclust(DM)
plot(model, hang = -1)

Hausdorff

DM <- sym.dist.interval(sym.data = oils[,c(1,2,4)],
                        gamma = 0.5,
                        method = "Hausdorff",
                        normalize = FALSE,
                        SpanNormalize = TRUE,
                        euclidea = TRUE,
                        q = 2)
model <- hclust(DM)
plot(model, hang = -1)

Linear regression for intervals

Training

data(int_prost_train)
data(int_prost_test)
res.cm <- sym.lm(formula = lpsa~., sym.data = int_prost_train, method = 'cm')
res.cm

Prediction

pred.cm <- sym.predict(model = res.cm, new.sym.data = int_prost_test)

Testing

RMSE.L(int_prost_test$lpsa, pred.cm$Fitted)
RMSE.U(int_prost_test$lpsa, pred.cm$Fitted)
R2.L(int_prost_test$lpsa, pred.cm$Fitted)
R2.U(int_prost_test$lpsa, pred.cm$Fitted)
deter.coefficient(int_prost_test$lpsa, pred.cm$Fitted)

LASSO regression for intervals

data(int_prost_train)
data(int_prost_test)

Training

res.cm.lasso <- sym.glm(sym.data = int_prost_train,
                        response = 9,
                        method = 'cm',
                        alpha = 1,
                        nfolds = 10,
                        grouped = TRUE)

Prediction

pred.cm.lasso <- sym.predict(res.cm.lasso,
                             response = 9,
                             int_prost_test,
                             method = 'cm')

Testing

plot(res.cm.lasso)
plot(res.cm.lasso$glmnet.fit, "lambda", label=TRUE)
RMSE.L(int_prost_test$lpsa,pred.cm.lasso)
RMSE.U(int_prost_test$lpsa,pred.cm.lasso) 
R2.L(int_prost_test$lpsa,pred.cm.lasso) 
R2.U(int_prost_test$lpsa,pred.cm.lasso) 
deter.coefficient(int_prost_test$lpsa, pred.cm.lasso)

RIDGE regression for intervals

Training

data(int_prost_train)
data(int_prost_test)

res.cm.ridge <- sym.glm(sym.data = int_prost_train,
                        response = 9,
                        method = 'cm',
                        alpha = 0,
                        nfolds = 10,
                        grouped = TRUE)

Prediction

pred.cm.ridge <- sym.predict(res.cm.ridge,
                             response = 9,
                             int_prost_test,
                             method = 'cm')

Testing

plot(res.cm.ridge)
plot(res.cm.ridge$glmnet.fit, "lambda", label=TRUE)
RMSE.L(int_prost_test$lpsa, pred.cm.ridge)
RMSE.U(int_prost_test$lpsa, pred.cm.ridge)
R2.L(int_prost_test$lpsa, pred.cm.ridge)
R2.U(int_prost_test$lpsa, pred.cm.ridge)
deter.coefficient(int_prost_test$lpsa, pred.cm.ridge)

PCA for intervals

Example 1

data("oils")
res <- sym.pca(oils,'centers')
plot(res, choix = "ind")
plot(res, choix = "var")

Example 2

res <- sym.pca(oils,'tops')
plot(res, choix = "ind")

Example 3

res <- sym.pca(oils, 'principal.curves')
plot(res, choix = "ind")

Example 4

res <- sym.pca(oils,'optimized.distance')
plot(res, choix = "ind")
plot(res, choix = "var")

Example 5

res <- sym.pca(oils,'optimized.variance')
plot(res, choix = "ind")
plot(res, choix = "var")

Symbolic Multiple Correspondence Analysis

Example 1

data("ex_mcfa1") 
ex_mcfa1
sym.table <- classic.to.sym(x = ex_mcfa1, 
                            concept = suspect, 
                            default.categorical = sym.set)
sym.table
res <- sym.mcfa(sym.table, c(2,3))
mcfa.scatterplot(res[,2], res[,3], sym.data = sym.table, pos.var = c(2,3))
res <- sym.mcfa(sym.table, c(2,3,4))
mcfa.scatterplot(res[,2], res[,3], sym.data = sym.table, pos.var = c(2,3,4))

Symbolic UMAP

Ejemplo Oils

datos <- oils
datos
x <- sym.umap(datos)
x
plot(x)

Ejemplo Cardiological

datos <- Cardiological
datos
x <- sym.umap(datos)
x
plot(x)

Length of intervals

data(oils)
datos <- oils
interval.length(datos)

PCA Histogram

Hardwood Data

data("hardwoodBrito")
Hardwood.histogram<-hardwoodBrito
Hardwood.cols<-colnames(Hardwood.histogram)
Hardwood.names<-row.names(Hardwood.histogram)
Hardwood.histogram

Hardwood.histogram[[1]][[1]]

Weighted Center Matrix

weighted.center<-weighted.center.Hist.RSDA(Hardwood.histogram)

Bin Matrix

BIN.Matrix<-matrix(rep(3,length(Hardwood.cols)*length(Hardwood.names)),nrow = length(Hardwood.names))

PCA

pca.hist<-sym.histogram.pca(Hardwood.histogram,BIN.Matrix)
pca.hist$classic.PCA
pca.hist$sym.hist.matrix.PCA

Plots

ACER.p1<-Sym.PCA.Hist.PCA.k.plot(data.sym.df = pca.hist$Bins.df,
                             title.graph = " ",
                             concepts.name = c("ACER"),
                             title.x = "First Principal Component (84.83%)",
                             title.y = "Frequency",
                             pca.axes = 1)

ACER.p1
ALL.p1<-Sym.PCA.Hist.PCA.k.plot(data.sym.df = pca.hist$Bins.df,
                    title.graph = " ",
                    concepts.name = unique(pca.hist$Bins.df$Object.Name),
                    title.x = "First Principal Component (84.83%)",
                    title.y = "Frequency",
                    pca.axes = 1)

ALL.p1
Hardwood.quantiles.PCA<-quantiles.RSDA(pca.hist$sym.hist.matrix.PCA,3)

label.name<-"Hard Wood"
Title<-"First Principal Plane"
axes.x.label<- "First Principal Component (84.83%)"
axes.y.label<- "Second Principal Component (9.70%)"
concept.names<-c("ACER")
var.names<-c("PC.1","PC.2")

quantile.ACER.plot<-Percentil.Arrow.plot(Hardwood.quantiles.PCA,
                     concept.names,
                     var.names,
                     Title,
                     axes.x.label,
                     axes.y.label,
                     label.name
                     )

quantile.ACER.plot
label.name<-"Hard Wood"
Title<-"First Principal Plane"
axes.x.label<- "First Principal Component (84.83%)"
axes.y.label<- "Second Principal Component (9.70%)"
concept.names<-row.names(Hardwood.quantiles.PCA)
var.names<-c("PC.1","PC.2")

quantile.plot<-Percentil.Arrow.plot(Hardwood.quantiles.PCA,
                     concept.names,
                     var.names,
                     Title,
                     axes.x.label,
                     axes.y.label,
                     label.name
                     )

quantile.plot
label.name<-"Hard Wood"
Title<-"First Principal Plane"
axes.x.label<- "PC 1 (84.83%)"
axes.y.label<- "PC 2 (9.70%)"
concept.names<-c("ACER")
var.names<-c("PC.1","PC.2")

plot.3D.HW<-sym.quantiles.PCA.plot(Hardwood.quantiles.PCA,
                               concept.names,
                               var.names,
                               Title,
                               axes.x.label,
                               axes.y.label,
                               label.name)

plot.3D.HW
concept.names<-row.names(Hardwood.quantiles.PCA)
sym.all.quantiles.plot(Hardwood.quantiles.PCA,
                               concept.names,
                               var.names,
                               Title,
                               axes.x.label,
                               axes.y.label,
                               label.name)
sym.all.quantiles.mesh3D.plot(Hardwood.quantiles.PCA,
                               concept.names,
                               var.names,
                               Title,
                               axes.x.label,
                               axes.y.label,
                               label.name)

KS

Hardwood.quantiles.PCA.2<-quantiles.RSDA.KS(pca.hist$sym.hist.matrix.PCA,100)
h<-Hardwood.quantiles.PCA.2[[1]][[1]]
tmp<-HistRSDAToEcdf(h)

h2<-Hardwood.quantiles.PCA.2[[1]][[2]]
tmp2<-HistRSDAToEcdf(h2)

h3<-Hardwood.quantiles.PCA.2[[1]][[3]]
tmp3<-HistRSDAToEcdf(h3)

h4<-Hardwood.quantiles.PCA.2[[1]][[4]]
tmp4<-HistRSDAToEcdf(h4)

h5<-Hardwood.quantiles.PCA.2[[1]][[5]]
tmp5<-HistRSDAToEcdf(h5)

breaks.unique<-unique(c(h$breaks,h2$breaks,h3$breaks,h4$breaks,h5$breaks))
tmp.unique<-breaks.unique[order(breaks.unique)]

tmp<-tmp(v = tmp.unique)
tmp2<-tmp2(v = tmp.unique)
tmp3<-tmp3(v = tmp.unique)
tmp4<-tmp4(v = tmp.unique)
tmp5<-tmp5(v = tmp.unique)
abs_dif <-  abs(tmp2 - tmp)
# La distancia Kolmogorov–Smirnov es el máximo de las distancias absolutas.
distancia_ks <- max(abs_dif)
distancia_ks
library(tidyr)
# Se unen los valores calculados en un dataframe.
df.HW <- data.frame(
  PC.1 = tmp.unique,
  ACER = tmp,
  ALNUS = tmp2,
  FRAXINUS = tmp3,
  JUGLANS = tmp4,
  QUERCUS = tmp5
) %>%
  pivot_longer(
    cols = c(ACER, ALNUS,FRAXINUS,JUGLANS,QUERCUS),
    names_to = "HardWood",
    values_to = "ecdf"
  )

grafico_ecdf <- ggplot(data = df.HW,
                       aes(x = PC.1, y = ecdf, color = HardWood)) +
  geom_line(size = 1) +
  labs(
    color = "Hardwood",
    y = "Empirical Cumulative Distribution "
  ) +
  theme_bw() +
  theme(legend.position = "bottom",
        plot.title = element_text(size = 12))+geom_line()

grafico_ecdf


Try the RSDA package in your browser

Any scripts or data that you put into this service are public.

RSDA documentation built on Nov. 10, 2023, 5:06 p.m.