#
# All the methods related to loading data and making it into the structures
# expected by the rest of the library.
#
#' Extracts attribute values from a string
#'
#' From a string link "att1=v1,att2=v2",
#' returns a named list containing names "att1" and "att2"
#' and values "v1" and "v2". An error (\link{stop}) is raised
#' if the string is malformed.
#'
#' @param name the string to decode.
#'
#' @return a named list with attribute as key and value for each
#'
#' @keywords internal
#'
extract_attributes_values <- function(name) {
kv <- unlist(strsplit(name,","))
if (length(kv) < 1) {
stop("the name should not be empty but was'",name,"'")
}
k2v <- strsplit(kv,"=")
# check the validity (we expect pairs !)
if (!all(lapply(k2v,length)==2)) {
stop("invalid name ",name,"; we expect a scheme like att1=v1,att2=vx")
}
k <- unlist(lapply(k2v,'[',1))
v <- unlist(lapply(k2v,'[',2))
# return as a named list
setNames(v,k)
}
#' Relax the dataset by changing zeros with low probability
#'
#' @param x the case to print
#' @param by the value to use to replace zeros (defaults to 1e-6)
#' @param ... ignored
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
relax.zeros <- function(x, by=.Machine$double.eps, ...) {
UseMethod("relax.zeros", x)
}
relax.zeros.data.frame <- function(x, by=.Machine$double.eps, ...) {
x[x == 0] <- by
normalise.data.frame(x)
}
#' Creates a sample organized for dpp manipulation.
#'
#' The resulting object contains the sample and a dictionary of data.
#' If no weight column is provided, one will be created with value 1 (uniform weights).
#'
#' @param data a data frame containing a sample (weighted list) of entities
#' @param encoding a dictionary containing information about the variables of the sample
#' @param weight.colname the name of the column containing the weights
#' @return a sample
#'
#' @examples
#' # to read a CSV file as a sample
#' f <- system.file("extdata", "logements.csv", package = "gosp.dpp")
#' m <- read.csv(f, sep=";", dec=",", check.names=FALSE)
#' df <- as.data.frame(m, check.names=FALSE)
#' dictionary <- list('surface'=list('small'=1, 'medium'=2, 'large'=3))
#' create_sample(data=df, encoding=dictionary, weight.colname="weight")
#'
#' # to create a sample from random data
#' # ... create 100 entities being either male of female
#' df <- data.frame(gender=sample(1:2, size=100, replace=TRUE))
#' # ... describe the encoding of data
#' dictionary <- list("gender"=list("male"=1,"female"=2))
#' create_sample(data=df, encoding=dictionary)
#' # ... the weights columns was created automatically
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
#' @importFrom stats setNames
#'
create_sample <- function(data, encoding=NULL, weight.colname=NULL) {
# test types if parameters
if (!is.data.frame(data)) {
stop("data is expected to be a dataframe")
}
if (is.null(weight.colname)) {
# if no weight column is given, then create a weight column filled by 1.0
weight.colname <- "_auto_weight"
data[ , weight.colname] <- rep(1, times=nrow(data))
} else {
# ensure the weight column does exist in the dataset
if (!weight.colname %in% colnames(data)) {
stop(paste("There is no column weight.colname='",weight.colname,"' in the data",sep=""))
}
}
# if no encoding is provided, then create one with a direct mapping
if (is.null(encoding)) {
encoding <- list()
}
for (name in colnames(data)) {
uniques <- unique(data[,name])
if (
# skip the columns already defined
is.null(encoding[[name]])
# skip when columns have a huge cardinality,
# as they might be ids, weights... which don't require a dictionnary
&& (length(uniques) < nrow(data))
# don't create a dictionary for weights
&& (name != weight.colname)
) {
values <- sort(uniques)
keys <- as.character(values)
encoding[[name]] <- setNames(values, keys)
message("no dictionary provided for the column ", name, " of the sample; creating a dictionary ",paste(encoding[[name]],collapse=","))
}
}
# ensure the inputs are consistent
encoding.table <- list()
# build the inverse dictionary
decoding <- list()
for (attname in names(encoding)) {
values <- list()
for (label in names(encoding[[attname]])) {
code <- encoding[[attname]][[label]]
# add the correspondance with the code used in dataframes
encoding.table[[attname]][make.names(label)] <- code
values[code] <- label
}
decoding[[attname]] <- values
}
res <- list(
sample=data,
dictionary=list(
encoding=encoding,
encoding.table=encoding.table,
decoding=decoding,
colname.weight=weight.colname
)
)
class(res) <- "dpp_sample"
return(res)
}
#' Display a sample for Direct Probabilistic Peering
#'
#' @param x the matching probabilities to print
#' @param ... ignored
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
print.dpp_sample <- function(x, ...) {
cat("Sample containing ", nrow(x$sample), " elements ",sep="")
cat("having ",length(x$dictionary$encoding)," columns:",paste(names(x$dictionary$encoding),collapse=","),sep="")
cat(" (weight column:",x$dictionary$colname.weight,")\n",sep="")
}
#' Coerce the a dpp sample into a data frame
#'
#' Extracts the data frame of a sample prepare by function \code{\link{create_sample}}
#'
#' @param x the sample to convert
#' @param ... further parameters are ignored quietly
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
as.data.frame.dpp_sample <- function(x, ...) {
x$sample
}
relax.zeros.dpp_sample <- function(x, by=.Machine$double.eps, ...) {
x$sample <- relax.zeros.data.frame(x$sample)
x
}
#' Sets row names for degrees distribution
#'
#' Changes the row names so their numbering
#' starts at 0 instead of 1. It's fitting the
#' actual semantics of this type of table.
#'
#' @param df a dataframe
#' @return the dataframe with fixed names
#'
#' @keywords internal
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
set.rownames.degrees <- function(df) {
row.names(df) <- 0:(nrow(df)-1)
df
}
# TODO manage multiple attributes
#
#' Creates a table storing probabilities for degrees
#'
#' @param probabilities a data frame containing the probabilities
#' @param norm if TRUE, will normalize the table so the columns sum up to 1 (defaults to TRUE)
#'
#' @examples
#' # create a table describing degrees depending to size; the bigger the size, the highest the degree
#' p <- data.frame(
#' 'size=0'=c(0.2, 0.8, 0, 0, 0),
#' 'size=1'=c(0.15, 0.8, 0.05, 0, 0),
#' 'size=2'=c(0.05, 0.8, 0.1, 0.05, 0),
#' check.names=FALSE
#' )
#' create_degree_probabilities_table(probabilities=p)
#'
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
create_degree_probabilities_table <- function(probabilities, norm=TRUE) {
# check inputs
if (!is.data.frame(probabilities)) {
stop("probabilities should be a data frame")
}
# ensure the given colnames(dwellings_households$pdi)
for (name in colnames(probabilities)) {
# ensure probabilities sum to 1
if (norm) {
probabilities[,name] <- normalise(probabilities[,name])
} else if (abs(sum(probabilities[,name]) - 1.0) >= 0.0000000001) {
stop(paste("invalid probabilities for ", name,
": should sum to 1 but sums to ",
sum(probabilities[,name]), sep=""))
}
}
idx2k2v <- lapply(colnames(probabilities),extract_attributes_values)
# do all the parameters have the same length of attributes ?
if (length(unique(lapply(idx2k2v, length))) != 1) {
stop("all the column names should contain the same count of attributes")
}
# TODO check attributes !
# list the attribute names concerned by the table
attributes.names <- unique(unlist(lapply(idx2k2v, names)))
# forge the result
res <- list(
data=set.rownames.degrees(probabilities),
attributes=attributes.names
)
class(res) <- "dpp_degree_cpt"
return(res)
}
#' Display a sample for Direct Probabilistic Peering
#'
#' @param x the degrees distribution of probabilities to display
#' @param ... ignored
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
print.dpp_degree_cpt <- function(x, ...) {
cat("distribution of degrees depending to attributes '", paste(x$attributes, collapse=","),"':\n",sep="")
print(x$data)
}
#' Coerce a distribution of degrees into a data frame
#'
#' Extracts the data frame of the distribution of degrees.
#'
#' @param x the distribution of degrees created by \code{\link{create_degree_probabilities_table}}
#' @param ... further parameters are ignored quietly
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
as.data.frame.dpp_degree_cpt <- function(x, ...) {
x$data
}
relax.zeros.dpp_degree_cpt <- function(x, by=.Machine$double.eps, ...) {
x$data <- relax.zeros.data.frame(x$data)
normalise(x)
}
# TODO manage multiple attributes
#
#' Creates a table storing matching probabilities
#'
#'
#'
#' @param data a data frame containing matching probabilities
#' @param norm if TRUE, will normalize the table so the totals sum up to one (defaults to TRUE)
#' @return a matching probability table ready to be used for usage with \code{\link{matching.prepare}}
#'
#' @examples
#'
#' dwellings_households.pij <- create_matching_probabilities_table(
#' data.frame(
#' 'surface=1'=c(0.2, 0.1, 0.05, 0.025),
#' 'surface=2'=c(0.0375, 0.125, 0.1, 0.05),
#' 'surface=3'=c(0.0125, 0.025, 0.1, 0.175),
#' row.names=c("size=1", "size=2", "size=3", "size=4"),
#' check.names=FALSE
#' )
#' )
#' print(dwellings_households.pij)
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
create_matching_probabilities_table <- function(data, norm=TRUE) {
# check inputs
if (!is.data.frame(data)) {
stop("data should be a data frame")
}
data_normalized <- if (norm) normalise(data) else data
if (abs(sum(data_normalized) - 1.0) >= 1e-6) {
stop(paste("the pairing probabilities should sum up to 1.0 but sum up to", sum(data_normalized)))
}
# create list of Ai
Ai.idx2k2v <- lapply(colnames(data_normalized),extract_attributes_values)
# do all the parameters have the same length of attributes ?
if (length(unique(lapply(Ai.idx2k2v, length))) != 1) {
stop("all the column names should contain the same count of attributes")
}
# list the attribute names concerned by the table
Ai <- unique(unlist(lapply(Ai.idx2k2v, names)))
# create list of Bj
Bj.idx2k2v <- lapply(rownames(data_normalized),extract_attributes_values)
# do all the parameters have the same length of attributes ?
if (length(unique(lapply(Bj.idx2k2v, length))) != 1) {
stop("all the row names should contain the same count of attributes")
}
# list the attribute names concerned by the table
Bj <- unique(unlist(lapply(Bj.idx2k2v, names)))
# TODO check input
res <- list(data=data_normalized, Ai=Ai,Bi=Bj)
class(res) <- "dpp_matching_probas"
res
}
#' Display matching probabilities
#'
#' @param x the matching probabilities to print
#' @param ... ignored
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
print.dpp_matching_probas <- function(x, ...) {
cat("matching probabilities given '", x$Ai,"' and '",x$Bi,"':\n",sep="")
print(x$data)
}
#' Coerce pairing probabilities into a data frame
#'
#' Extracts the data frame of pairing probabilities prepared by function \code{\link{create_matching_probabilities_table}}
#'
#' @param x the pairing probabilities to convert
#' @param ... further parameters are ignored quietly
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
as.data.frame.dpp_matching_probas <- function(x, ...) {
x$data
}
relax.zeros.dpp_matching_probas <- function(x, by=.Machine$double.eps, ...) {
x$data <- relax.zeros.data.frame(x$data)
x
}
#' Replace underscores by variables and modalities
#'
#' In a vector like c("1_A","1_B", "2_A", "2_B"),
#' given variable names c("x","y"),
#' returns a vector c("x=1&y=A","x=1&y=B", "x=2&y=A", "x=2&y=B")
#'
#' @param vec a vector of strings
#' @param vars a vector of variable names
#' @return a vector of strings
#'
#' @keywords internal
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
underscore_to_varmod <- function(vec, vars) {
# replace the first elem
vec2 <- paste(vars[1], vec, sep="=")
for (v in tail(vars, -1)) {
vec2 <- gsub("_", paste("&",v,"=",sep=""), vec2)
}
vec2
}
#' Create pairing probabilities
#'
#' Takes a dataframe containing a sample, and extracts a joint probabilitie table
#' ready to be passed to \code{\link{create_matching_probabilities_table}}.
#'
#' @param df the dataframe
#' @param var.row the variables to use as rows
#' @param var.col the variables to use as cols
#' @param var.weight the name of the variable to use for weight
#' @return a dataframe structured in two dimensions as joint probabilities
#'
#' @export
#'
#' @author Samuel Thiriot <samuel.thiriot@res-ear.ch>
#'
#' @importFrom reshape2 dcast
#'
measure_pairing_from_df <- function(df, var.row, var.col, var.weight) {
# TODO check params
formula <- paste(paste(var.row, collapse="+"), "~", paste(var.col, collapse="+"))
# aggregate
casted <- dcast(df, formula, fun.aggregate=sum, value.var=var.weight)
# replace row names
rownames_underscore <- apply(casted[,var.row,drop=F], 1, paste, collapse="_")
row.names(casted) <- underscore_to_varmod(rownames_underscore, var.row)
for (i in 1:length(var.row)) {
casted <- casted[-1]
}
# replace col names
colnames(casted) <- underscore_to_varmod(colnames(casted), var.col)
# return a normalised version
normalise(casted)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.