#' Change values in a categorical vector to a lower or higher value
#'
#' @param num
#' the number of categories above you want to steal from
#' if you are giving and not stealing, you always give to the category
#' directly above and num=1
#' @param rank.col
#' the colum in which the rankings will be
#' @param i
#' the interation you are up to in the while/for loop
#' @param new.all.dat
#' the data you are working on and want modified
#' @param n.change
#' the number of observations that need moving into or out of each
#' category to get the requested proportions
#'
#' @return
#' the categorical vector after modified
#'
#' @seealso This function is called by the function \code{\link{modifyProps}}
#'
#' @export
change.cat <- function(num, rank.col, i, new.all.dat, n.change) {
if (sign(n.change[i])==1) {
steal=FALSE
}
else if (sign(n.change[i])== -1) {
steal=TRUE
}
#identify those in the category you want to change
#if giving it is category i
#if stealing it is category i+1 or above
ch.cat.id = which(new.all.dat[,1]==(i+num*steal))
#put in separate dataset
dat.ch.cat = new.all.dat[ch.cat.id,]
#add the rankings of the propensity scores as the last column
dat.ch.cat = cbind(dat.ch.cat, rank(dat.ch.cat[,i+(num-1)+1],
ties.method="random"))
if (steal==FALSE) {
#calculate cut-off for deciding which ones to change to a higher value
cut.off = nrow(dat.ch.cat) - n.change[i] + 1
#change those that are greater than or equal to the cut-off to have a higher value
dat.ch.cat[,1][dat.ch.cat[,rank.col]>=cut.off] <- (i+1)
}
if (steal==TRUE) {
#calculate cut-off for deciding which ones to change to a lower value
cut.off = n.change[i]*(-1)
#change those that are below the cut-off to have a lower value
#
dat.ch.cat[,1][dat.ch.cat[,rank.col]<=cut.off] <- i
}
#identify those not in the current category (category i) and put in separate
#dataset
rest.id = which(new.all.dat[,1]!=(i+num*steal))
dat.rest = new.all.dat[rest.id,]
#combine the datasets so all data is together again
new.all.dat = rbind(dat.ch.cat[,-rank.col], dat.rest)
return(new.all.dat)
}
#' Change the default simulated values to proportions requested by the user.
#' The values of a vector are changed so that the proportions of each discrete value
#' are that requested by the user.
#' The values that are changed are the ones with the highest propensity to do so
#' The propensity score(s) of an observation to be a higher (or lower) value can
#' be given as input to the function.
#' If the propensity score(s) are not provided by the user then random
#' propensity scores are generated.
#'
#' @param default.vec
#' a vector after a run of the simulation. The values of this
#' variable will be changed in accordance with what the user requests
#'
#' @param desired_props
#' a vector that is the proportions requested by the user.
#' The vector is the length of the number of distinct values of the variable
#' being modified.
#'
#' @param propens
#' matrix or vector of the propensity scores for each child
#' For binary variables there is one column of propensity scores: the
#' propensities to change from a 0 to a 1.
#' For categorical variables with more than two categories there are multiple
#' columns of propensity scores: E.g. for a three category variables the
#' propensities to change from category 1 to category 2 are in the first
#' column and the propensities to change from category 2 to category 3 are
#' in the second column.
#'
#' @param accuracy
#' gives how close the end proportions are allowed to be away from the desired proportions before an error message is given
#' - the default is 0.01.
#' If the '.accuracy' global variable exists, its value will be used instead of that in function call.
#'
#' @note Assumptions made by the function:
#' It is assumed that the proportions given in props are given in consectuive
#' increasing order (e.g. {0,1}, {1, 2, 3} or {2, 5, 9, 23}). If the user
#' wants to make it so no observations are in a particular category the value
#' 0 must be put in the corresponding place in the vector props
#' If the propensity scores (propens) are provided by the user then it is assumed
#' that default.vec and propens are given in the same order and exactly the
#' same children are in each vector (i.e. there are no children in one vector
#' that are not in the other). In other words, the propensity score for a
#' specific child is in the same row in propens as that same child's value of
#' the variable in default.vec.
#'
#' @return
#' a modified vector with the proportions requested.
#'
#' @seealso This function calls \code{\link{change.cat}}
#'
#' @export
modifyProps <- function(default.vec, desired_props, propens=NULL, accuracy=.01) {
if (exists(".accuracy")) {accuracy<-.accuracy}
if (is.null(desired_props) || any(is.na(desired_props)) || length(default.vec) == 0) {
#no props, silently do nothing
return(default.vec)
}
if (is.null(default.vec)) {
stop(gettextf("%s is NULL", as.character(sys.call())[2]))
}
#keep original vector
orig.default.vec <- default.vec
#dim(propens)
#dim(orig.default.vec)
if (!is.null(propens)
&& (
(length(dim(propens)) != 2 && length(orig.default.vec) != length(propens))
|| (length(dim(propens)) == 2 && length(orig.default.vec) != dim(propens)[1])
) ) {
firstParamName <- as.character(sys.call())[2]
stop(gettextf("Propensities must be the same length as %s ",firstParamName))
}
#check that the number of categories in the provided vector of data
#(default.vec) is the same number of categories as given in desired_props
num.cat <- length(table(orig.default.vec))
num.cat2 <- length(desired_props)
if (num.cat!=num.cat2) {
note2 = cat("Note: Length of desired_props not equal to number of categories in variable:",
"\n", "Assumed that there were unobserved categories in the variable",
"\n")
}
#n is the number of children in one year
n <- length(orig.default.vec)
type <- is(orig.default.vec)[1]
#if propensity scores not provided then create them
if (length(propens)==0 || any(is.na(propens))) {
note2 <- "Note: No propensity scores available so random propensity scores were created\n"
cat(note2)
#col.num is the number of vectors of propensity scores required.
#E.g. for a 2-category varaible, 1 vector of propensity scores is required
#and for a 3-category variable, 2 vectors of propensity scores are
#required etc.
col.num <- num.cat2 - 1
#create random propensity score
propens.score <- NULL
for (i in 1:col.num) {
propens.score <- c(propens.score, runif(n))
}
propens <- matrix(propens.score, ncol=col.num, nrow=n)
}
#if the the default.vec values are not consecutive numbers starting at 1 change them
#so they are
if (type=="factor") {
tab.names <- names(table(orig.default.vec))
} else if ((type=="integer")|(type=="numeric")) {
tab.names <- as.numeric(names(table(orig.default.vec)))
} else {
stop("Add additional type in modifyProps()")
}
#if (sum(tab.names!= 1:length(desired_props))>=1) {
# default.vec2 <- numeric(length(default.vec))
# for (i in 1:length(desired_props)) {
# default.vec2[default.vec==tab.names[i]] <- i
# }
# default.vec <- default.vec2
#}
#browser()
#match propensity scores to children
#(this function assumed that default.vec and propens have the same children
#in the same order)
#the last column is a child identifier so I can put them back in the right
#order
#browser()
# !!!!! Need to check this over!!!
if(!is.null(attr(desired_props, "levels")) && type=="factor")
default.vec <- as.numeric(factor(default.vec, levels = attr(desired_props, "levels")))
else
default.vec <- as.numeric(factor(default.vec))
all.dat <- data.frame(default.vec, propens, 1:n)
new.all.dat <- all.dat
#rank.col identifies the column that the rankings will be in
#this is used later in the change.cat function where the propensities are converted to ranks
rank.col <- ncol(all.dat) + 1
#create table of given data and calculate current proportions and the numbers
#that need moving into or out of categories to get the requested proportions
#(n.change)
tab <- table(new.all.dat[,1])
cats <- c(1:length(desired_props))
#if any categories in desired_props are not present in default.vec then this merge is needed to fix the
#problem
tab.df <- merge(cats, tab, by = 1, all=TRUE)
#after the merge any categories in cat that were not present in tab appear as NAs
#these NAS are changed to 0s
na.id <- which(is.na(tab.df$Freq))
tab.df$Freq[na.id] <- 0
tab.df$x <- 1:nrow(tab.df)
current.props <- tab.df$Freq/sum(tab.df$Freq)
n.change <- round(current.props*n) - round(desired_props*n)
#(e.g n.change[1] is the excess/deficient number of observations in the first
#category in observed data)
num <- 1
i <- 1 #i = current category
#browser()
while (i < length(desired_props)) {
if (n.change[i]==0) {
#if no change needs to be made for category i then move onto next category
i <- i + 1
num <- 1
} else if (sign(n.change[i])==1) {
#category i has too many obs - give to a higher category
num <- 1
new.all.dat <- change.cat(num, rank.col, i, new.all.dat, n.change)
i <- i + 1
} else if (sign(n.change[i])== -1) {
#category i has too few obs - steal from a higher category
new.all.dat <- change.cat(num, rank.col, i, new.all.dat, n.change)
num <- num + 1
#the maximum value of num is the number of categories above to steal
#from
#by incrementing i here we ensure that we don't get into an infinite
#loop due to the while condition never being satisfied
if (num>(length(desired_props)-(i-1))) {
i <- i + 1
}
}
#at this point changes have been made for one iteration of category i (may need more than
#one iteration if couldn't steal enough observations from the category
#immediately above it
#create table of current data (tab.df) and calculate numbers that need
#moving into or out of categories to get the requested proportions
#(n.change)
tab <- table(new.all.dat[,1])
cats <- c(1:length(desired_props))
tab.df <- merge(cats, tab, by = 1, all.x=TRUE)
na.id <- which(is.na(tab.df$Freq))
tab.df$Freq[na.id] <- 0
tab.df$x <- 1:nrow(tab.df)
current.props <- tab.df$Freq/sum(tab.df$Freq)
n.change <- round(current.props*n) - round(desired_props*n)
}
#check if requested proportions acheived
if (sum(abs(desired_props - current.props))>accuracy) {
# give output with warning
#(output should all be correct if I have thought of everything and made no
#mistakes)
#change back to orignal category names
#put observations back in the right order
warning("Proportions may not be correct - Source code may need to be modified to handle this situaion")
}
#change back to orignal category names (if they were changed earlier)
#e.g. if the orginal variable was a {0, 1} variable then all 0s would have
#been changed to 1s and alls 1s changed to 2s. At this step, after the
#changes to get the right proportions, the 1s are changed back to 0s and
#the 2s are changed back to 1s.
if(!any( tab.names == 1:length(desired_props))){
default.vec2 = numeric(nrow(new.all.dat))
if (type=="factor") {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- levels(orig.default.vec)[i]
default.vec2 <- factor(default.vec2, levels=levels(orig.default.vec))
}
new.all.dat[,1] = default.vec2
} else if ((type=="integer")|(type=="numeric")) {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- tab.names[i]
}
new.all.dat[,1] = default.vec2
} else {
stop("Add additional type in modifyProps()")
}
}
#browser()
#and put children back in the right order
new.all.dat2 = new.all.dat[order(new.all.dat[,rank.col-1]),]
return(new.all.dat2[,1])
}
#' Takes a categorical var specified in separate binary level variables
#' and applies modifyProps and returns the result as a list of modified
#' binary level variables.
#'
#' @param vecs.list
#' vectors to modify, eg: simframe[binLevelVarnames]
#' @param desiredProps
#' desired proportions
#' @param propens
#' propensities, if ANY
#'
#' @return
#' same list of vectors, but with proportions modified
#'
#' @export
modifyPropsAsBinLevels <- function (vecs.list, desiredProps, propens=NULL) {
#catvar <- binary.levels.combine(simframe[binLevelVarnames])
cats <- seq(length(vecs.list))
catvar <- binary.levels.combine(vecs.list)
# prop.table(table(catvar))
adjcatvar <- modifyProps(catvar, desiredProps, propens)
# quick check: this will fail if modifyProps returns any NAs
# like for example, where there is only 1 category catvar
assert(!is.na(adjcatvar))
# prop.table(table(adjcatvar))
# split back into seperate level vars
result <- binary.levels.split(adjcatvar, f=cats)
result.names <- names(vecs.list)[as.integer(names(result))]
structure(result, names=names(vecs.list))
}
#' Calls modifyProps on a subset, returning the whole vector, but with the subset modified
#'
#' @param desired_props
#' a vector that is the proportions requested by the user.
#' The vector is the length of the number of distinct values of the variable
#' being modified.
#'
#' @param default.vec
#' a vector after a run of the simulation. The values of this
#' variable will be changed in accordance with what the user requests
#'
#' @param propens
#' matrix or vector of the propensity scores for each child
#' For binary variables there is one column of propensity scores: the
#' propensities to change from a 0 to a 1.
#' For categorical variables with more than two categories there are multiple
#' columns of propensity scores: E.g. for a three category variables the
#' propensities to change from category 1 to category 2 are in the first
#' column and the propensities to change from category 2 to category 3 are
#' in the second column.
#'
#' @param logiset
#' logical vector indicating which observations to include, or NULL to include all.
#'
#' @param accuracy
#' gives how close the end proportions are allowed to be away from the desired
#' proportions - the default is 0.01. It is passed to function modifyProps().
#'
#'
#' @note Assumptions made by the function:
#' It is assumed that the proportions given in props are given in consectuive
#' increasing order (e.g. {0,1}, {1, 2, 3} or {2, 5, 9, 23}). If the user
#' wants to make it so no observations are in a particular category the value
#' 0 must be put in the corresponding place in the vector props
#' If the propensity scores (propens) are provided by the user then it is assumed
#' that default.vec and propens are given in the same order and exactly the
#' same children are in each vector (i.e. there are no children in one vector
#' that are not in the other). In other words, the propensity score for a
#' specific child is in the same row in propens as that same child's value of
#' the variable in default.vec.
#'
#' @return
#' a vector with the subset modified
#'
#' @seealso This function calls \code{\link{modifyProps}}
#'
#' @export
modifypropsVarSingle_on_subset<-function(default.vec, desired_props, propens=NULL, logiset=NULL, accuracy=.01) {
if (is.null(logiset)) {logiset<-rep(TRUE, length(default.vec))}
default.df<-as.data.frame(default.vec)
propens<-subset(propens, logiset)
#adding a temporary ID variable - a rank column - onto a copy of the simframe portion
#will enable the subsets to be put back into the same order later
n<-dim(default.df)[1]
sf<-data.frame(default.df,1:n)
rankcolnum<-2
#subsetting the copy of the simframe according to logiset
subset_to_change<-subset(sf,logiset)
#keeping those not in the logiset - those that aren't to be passed to modifyprops
rest_not_to_be_modified<-subset(sf,!logiset)
#modifying the logiset
subset_to_change_modified <- modifyProps(subset_to_change[,-rankcolnum], desired_props, propens, accuracy)
#putting changed set back with those that weren't in the logiset
new_sf<-rbind(as.matrix(subset_to_change_modified), as.matrix(rest_not_to_be_modified[,1]))
original.position<-rbind(as.matrix(subset_to_change[,rankcolnum]), as.matrix(rest_not_to_be_modified[,rankcolnum]))
#putting the records back in their orignal order according to the rank column created earlier
new_sf[order(original.position),]
}
#' Runs modifyProps on a continuous variable
#' Takes a continuous variable, converts it to a categorical variable using the binbreaks,
#' modifyProps is then called on that categorical variable.
#' The categorical variable is then converted back to a continuos variable using the catToContModels
#'
#' @param x.cont
#' a continuous variable to be adjusted
#'
#' @param desired_props
#' desired proportions
#'
#' @param catToContModels
#' a list of models which will to used to convert the adjusted categorical variable back
#' to continuous
#'
#' @param cont.binbreaks
#' binbreaks for the continuous variable to be adjusted
#'
#' @param propens
#' matrix or vector of the propensity scores for each child
#' For binary variables there is one column of propensity scores: the
#' propensities to change from a 0 to a 1.
#' For categorical variables with more than two categories there are multiple
#' columns of propensity scores: E.g. for a three category variables the
#' propensities to change from category 1 to category 2 are in the first
#' column and the propensities to change from category 2 to category 3 are
#' in the second column.
#'
#' @param logiset
#' logical vector indicating which observations to include, or NULL to include all.
#'
#' @param accuracy
#' gives how close the end proportions are allowed to be away from the desired
#' proportions before an error message is given - the default is 0.01.
#' If the '.accuracy' global variable exists, its value will be used instead of that in
#' function call.
#'
#' @param envir
#' environment in which to evaluate model variables.
#'
#' @return
#' an 'adjusted' continuous variable that if binned will have the same proportions as requested in desired_props
#'
#' @export
modifyPropsContinuous <- function(x.cont, desired_props, catToContModels, cont.binbreaks, propens=NULL, logiset=NULL, accuracy=.01, envir=parent.frame()) {
x.cat <- bin(x.cont, cont.binbreaks)
adj.x.cat <- modifyProps(x.cat, desired_props, propens, accuracy)
adj.x.cont <- predSimModSelect(adj.x.cat, catToContModels, cont.binbreaks, logiset, envir)
adj.x.cont
}
#' A wrapper for modifyProps.
#' Subsets by the logiset call the appropriate version of modifyProps
#' (modifyPropsContinuous or modifyProps) then, if modifyProps was called on a logiset,
#' combine and reorder the data using combine.and.reorder().
#' Called in adjustCatVar(), adjustContVar, and applyContAdjustmentToSimframe().
#'
#' @param x
#' A categorical vector to be adjusted.
#'
#' @param desiredProps
#' Vector of desired proportions
#'
#' @param propens
#' propensity scores used to decide who should change categories
#'
#' @param logiset
#' A TRUE/FALSE vector indicating the subset of units to apply the scenari (change in
#' proportions) to
#'
#' @param catToContModels
#' A list of models which will to used to convert the adjusted categorical variable back
#' to continuous.
#'
#' @param cont.binbreaks
#' Binbreaks for the variable being adjusted if exist. Used to ensure the imputed
#' continuous values (only for continuous variables) are within the bounds of the
#' category.
#'
#' @param envir
#' environment - for the MELC MSM is usually the simframe of the scenario environment.
#'
#' @return
#' an adjusted vector, either categorical or continuous depending on whether catToCont
#' models were provided.
#'
#' @export
adjust.proportions <- function(x, desiredProps, propens=NULL, logiset=NULL, catToContModels=NULL, cont.binbreaks=NULL, envir=parent.frame()) {
#browser()
if (!is.null(logiset) && sum(logiset)>0) {
#subset the propensities according to the logiset
propens_subset <- if (!is.null(propens)) { subsetFirstDimension(propens, logiset) } else NULL
#subset x according to the logiset
subset_to_change <- x[logiset]
if (!is.null(catToContModels)) {
subset_to_change_modified <- modifyPropsContinuous(subset_to_change, desiredProps, catToContModels, cont.binbreaks, propens_subset, logiset, accuracy=.05, envir)
} else {
subset_to_change_modified <- modifyProps(subset_to_change, desiredProps, propens_subset, accuracy=.05) |> as.character() |> as.numeric()
}
non.modified.x <- x[!logiset]
non.modified.x <- non.modified.x |> as.character() |> as.numeric()
modified.in.order <- combine.and.reorder(subset_to_change_modified, non.modified.x, logiset)
return(modified.in.order)
} else {
#there is no logiset and the scenario is applied to all units
if (!is.null(catToContModels)) {
adj.x.cont <- modifyPropsContinuous(x, desiredProps, catToContModels, cont.binbreaks, propens, envir=envir)
return(adj.x.cont)
} else {
adj.x.cat <- modifyProps(x, desiredProps, propens, accuracy=.05)
return(adj.x.cat)
}
}
}
#' Combines and reorders (so correct original order) after modifyProps has been called on
#' a logiset.
#' Called in adjust.proportions().
#'
#' @param modified.x
#' A vector of modified values (i.e. those tht were in the logiset. Either categorical or
#' continuous.
#'
#' @param non.modified.x
#' A vector of non-modified values of the same variable (i.e. those tht were not in the
#' logiset. Either categorical or continuous.
#'
#' @param logiset
#' A TRUE/FALSE vetor defining which units are in the logical subset on which modifyProps
#' was called.
#'
#' @return
#' A combined and reordered vector. Contains evereyone in the population in the correct
#' order.
#'
#' @export
combine.and.reorder <- function(modified.x, non.modified.x, logiset) {
n = length(modified.x) + length(non.modified.x)
original.position <- 1:n
modified.out.of.order <- rbind(cbind(modified.x, original.position[logiset]), cbind(non.modified.x, original.position[!logiset]))
modified.in.order <- modified.out.of.order[order(modified.out.of.order[,2]), 1]
return(modified.in.order)
}
########### Add missing categories ##############
modifyProps.Mengdan <- function(default.vec, desired_props, propens=NULL, accuracy=.01) {
# If the length of desired_props not equal to number of categories in variable which means
# there were unobserved categories in the variable, then we choose the units randomly from the
# the category in majority and apply the missing categories to them.
addMissingCategories <- function(){
varname <- attr(desired_props,"varname")
# The categories of the variable
#categories<-as.vector(dict_demo$codings[[varname]])
#binbreak <- attr(desired_props,"binbreak")
#if(length(binbreak)==0){
#categories <- names(binbreak)[-1]
#}else{
categories <- attr(desired_props,"levels")
#}
num.missingCategories=0
# The list to store all missing categories
missingCategories <- list()
# Store all missing categories into missingCategories
for(i in categories){
if(!any(default.vec == categories[i])){
num.missingCategories <- num.missingCategories+1
missingCategories[[num.missingCategories]] <- categories[i]
}
}
# check the type of the categories and get the category in majority
if (type=="factor") {
majorityCategory <- names(which.max(table(default.vec)))
} else if ((type=="integer")|(type=="numeric")) {
majorityCategory <- as.numeric(names(which.max(table(default.vec))))
} else {
stop("Add additional type in modifyProps()")
}
# A logical vector to store if each element belongs to the category in majority
isMajority <- default.vec == majorityCategory
# The number of units in majority category
num.majority <- max(table(default.vec))
# Choose the units randomly from the category in majority
randomPeople <- sample(1:num.majority, num.missingCategories, replace=TRUE)
# Apply the missing units to random units and store into default.vec
if(num.majority >= num.missingCategories){
for(i in (1:length(missingCategories))){
default.vec[isMajority][randomPeople[i]] <<- missingCategories[[i]]
}
}else{
stop("Error: The total number of micro-units in the subgroup being adjusted should be at least the number of categories in the variable being adjusted")
}
}
#if the the default.vec values are not consecutive numbers starting at 1 change them
#so they are
convertCategoryName <- function(){
if (type=="factor") {
tab.names <<- names(table(default.vec))
} else if ((type=="integer")|(type=="numeric")) {
tab.names <<- as.numeric(names(table(default.vec)))
} else {
stop("Add additional type in modifyProps()")
}
if (sum(tab.names!= 1:length(desired_props))>=1) {
default.vec2 <- numeric(length(default.vec))
for (i in 1:length(desired_props)) {
default.vec2[default.vec==tab.names[i]] <- i
}
default.vec <<- default.vec2
}
}
#check if requested proportions acheived with the accuracy and transform back to orignal category names.
checking <- function(){
if (sum(abs(desired_props - current.props))<=accuracy) {
#if correct
#change back to orignal category names (if they were changed earlier)
#e.g. if the orginal variable was a {0, 1} variable then all 0s would have
#been changed to 1s and alls 1s changed to 2s. At this step, after the
#changes to get the right proportions, the 1s are changed back to 0s and
#the 2s are changed back to 1s.
if (sum(tab.names!= 1:length(desired_props))>=1) {
default.vec2 = numeric(nrow(new.all.dat))
if (type=="factor") {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- levels(orig.default.vec)[i]
default.vec2 <- factor(default.vec2, levels=levels(orig.default.vec))
}
new.all.dat[,1] = default.vec2
} else if ((type=="integer")|(type=="numeric")) {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- tab.names[i]
}
new.all.dat[,1] = default.vec2
} else {
stop("Add additional type in modifyProps()")
}
}
#and put children back in the right order
new.all.dat2 = new.all.dat[order(new.all.dat[,rank.col-1]),]
return(new.all.dat2[,1])
} else if (sum(abs(desired_props - current.props))>accuracy) {
#if not correct - still do these things but give output with warning
#(output should all be correct if I have thought of everything and made no
#mistakes)
#change back to orignal category names
if (sum(tab.names!= 1:length(desired_props))>=1) {
default.vec2 = numeric(nrow(new.all.dat))
if (type=="factor") {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- levels(orig.default.vec)[i]
default.vec2 <- factor(default.vec2, levels=levels(orig.default.vec))
}
new.all.dat[,1] = default.vec2
} else if ((type=="integer")|(type=="numeric")) {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- tab.names[i]
}
new.all.dat[,1] = default.vec2
} else {
stop("Add additional type in modifyProps()")
}
}
#put observations back in the right order
new.all.dat2 = new.all.dat[order(new.all.dat[,rank.col-1]),]
warning("Proportions may not be correct - Source code may need to be modified to handle this situaion")
return(new.all.dat2[,1])
}
}
if (exists(".accuracy")) {accuracy<-.accuracy}
if (is.null(desired_props) || any(is.na(desired_props))) {
#no props, silently do nothing
return(default.vec)
}
if (is.null(default.vec)) {
stop(gettextf("%s is NULL", as.character(sys.call())[2]))
}
if(length(default.vec)==0){
#if the default vector is empty, silently do nothing
return(default.vec)
}
#keep original vector
orig.default.vec <- default.vec
#n is the number of children in one year
n <- length(orig.default.vec)
type <- is(orig.default.vec)[1]
dim(propens)
dim(orig.default.vec)
if (!is.null(propens)
&& (
(length(dim(propens)) != 2 && length(orig.default.vec) != length(propens))
|| (length(dim(propens)) == 2 && length(orig.default.vec) != dim(propens)[1])
) ) {
firstParamName <- as.character(sys.call())[2]
stop(gettextf("Propensities must be the same length as %s ",firstParamName))
}
#check that the number of categories in the provided vector of data
#(default.vec) is the same number of categories as given in desired_props
# If not, add the missing categories to the provided vector.
num.cat <- length(table(orig.default.vec))
num.cat2 <- length(desired_props)
if (num.cat!=num.cat2) {
note2 = cat("Note: Length of desired_props not equal to number of categories in variable:",
"\n", "Assumed that there were unobserved categories in the variable",
"\n")
addMissingCategories()
}
#if propensity scores not provided then create them
if (length(propens)==0 || any(is.na(propens))) {
note2 <- "Note: No propensity scores available so random propensity scores were created\n"
cat(note2)
#col.num is the number of vectors of propensity scores required.
#E.g. for a 2-category varaible, 1 vector of propensity scores is required
#and for a 3-category variable, 2 vectors of propensity scores are
#required etc.
col.num <- num.cat2 - 1
#create random propensity score
propens.score <- NULL
for (i in 1:col.num) {
propens.score <- c(propens.score, runif(n))
}
propens <- matrix(propens.score, ncol=col.num, nrow=n)
}
convertCategoryName()
#match propensity scores to children
#(this function assumed that default.vec and propens have the same children
#in the same order)
#the last column is a child identifier so I can put them back in the right
#order
all.dat <- data.frame(default.vec, propens, 1:n)
new.all.dat <- all.dat
#rank.col identifies the column that the rankings will be in
#this is used later in the change.cat function where the propensities are converted to ranks
rank.col <- ncol(all.dat) + 1
#create table of given data and calculate current proportions and the numbers
#that need moving into or out of categories to get the requested proportions
#(n.change)
tab <- table(new.all.dat[,1])
cats <- c(1:length(desired_props))
#if any categories in desired_props are not present in default.vec then this merge is needed to fix the
#problem
tab.df <- merge(cats, tab, by = 1, all=TRUE)
#after the merge any categories in cat that were not present in tab appear as NAs
#these NAS are changed to 0s
na.id <- which(is.na(tab.df$Freq))
tab.df$Freq[na.id] <- 0
tab.df$x <- 1:nrow(tab.df)
current.props <- tab.df$Freq/sum(tab.df$Freq)
n.change <- round(current.props*n) - round(desired_props*n)
#(e.g n.change[1] is the excess/deficient number of observations in the first
#category in observed data)
num <- 1
i <- 1 #i = current category
while (i < length(desired_props)) {
if (n.change[i]==0) {
#if no change needs to be made for category i then move onto next category
i <- i + 1
num <- 1
} else if (sign(n.change[i])==1) {
#category i has too many obs - give to a higher category
num <- 1
new.all.dat <- change.cat(num, rank.col, i, new.all.dat, n.change)
i <- i + 1
} else if (sign(n.change[i])== -1) {
#category i has too few obs - steal from a higher category
new.all.dat <- change.cat(num, rank.col, i, new.all.dat, n.change)
num <- num + 1
#the maximum value of num is the number of categories above to steal
#from
#by incrementing i here we ensure that we don't get into an infinite
#loop due to the while condition never being satisfied
if (num>(length(desired_props)-(i-1))) {
i <- i + 1
}
}
#at this point changes have been made for one iteration of category i (may need more than
#one iteration if couldn't steal enough observations from the category
#immediately above it
#create table of current data (tab.df) and calculate numbers that need
#moving into or out of categories to get the requested proportions
#(n.change)
tab <- table(new.all.dat[,1])
cats <- c(1:length(desired_props))
tab.df <- merge(cats, tab, by = 1, all.x=TRUE)
na.id <- which(is.na(tab.df$Freq))
tab.df$Freq[na.id] <- 0
tab.df$x <- 1:nrow(tab.df)
current.props <- tab.df$Freq/sum(tab.df$Freq)
n.change <- round(current.props*n) - round(desired_props*n)
}
checking()
}
########### Tidy up the original function (modifyProps2) ##############
modifyProps1 <- function(default.vec, desired_props, propens=NULL, accuracy=.01) {
#if the the default.vec values are not consecutive numbers starting at 1 change them
#so they are
convertCategoryName <- function(){
if (type=="factor") {
tab.names <<- names(table(default.vec))
} else if ((type=="integer")|(type=="numeric")) {
tab.names <<- as.numeric(names(table(default.vec)))
} else {
stop("Add additional type in modifyProps()")
}
if (sum(tab.names!= 1:length(desired_props))>=1) {
default.vec2 <- numeric(length(default.vec))
for (i in 1:length(desired_props)) {
default.vec2[default.vec==tab.names[i]] <- i
}
default.vec <<- default.vec2
}
}
#check if requested proportions acheived with the accuracy and transform back to orignal category names.
checking <- function(){
if (sum(abs(desired_props - current.props))<=accuracy) {
#if correct
#change back to orignal category names (if they were changed earlier)
#e.g. if the orginal variable was a {0, 1} variable then all 0s would have
#been changed to 1s and alls 1s changed to 2s. At this step, after the
#changes to get the right proportions, the 1s are changed back to 0s and
#the 2s are changed back to 1s.
if (sum(tab.names!= 1:length(desired_props))>=1) {
default.vec2 = numeric(nrow(new.all.dat))
if (type=="factor") {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- levels(orig.default.vec)[i]
default.vec2 <- factor(default.vec2, levels=levels(orig.default.vec))
}
new.all.dat[,1] = default.vec2
} else if ((type=="integer")|(type=="numeric")) {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- tab.names[i]
}
new.all.dat[,1] = default.vec2
} else {
stop("Add additional type in modifyProps()")
}
}
#and put children back in the right order
new.all.dat2 = new.all.dat[order(new.all.dat[,rank.col-1]),]
return(new.all.dat2[,1])
} else if (sum(abs(desired_props - current.props))>accuracy) {
#if not correct - still do these things but give output with warning
#(output should all be correct if I have thought of everything and made no
#mistakes)
#change back to orignal category names
if (sum(tab.names!= 1:length(desired_props))>=1) {
default.vec2 = numeric(nrow(new.all.dat))
if (type=="factor") {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- levels(orig.default.vec)[i]
default.vec2 <- factor(default.vec2, levels=levels(orig.default.vec))
}
new.all.dat[,1] = default.vec2
} else if ((type=="integer")|(type=="numeric")) {
for (i in 1:length(desired_props)) {
default.vec2[new.all.dat[,1]==i] <- tab.names[i]
}
new.all.dat[,1] = default.vec2
} else {
stop("Add additional type in modifyProps()")
}
}
#put observations back in the right order
new.all.dat2 = new.all.dat[order(new.all.dat[,rank.col-1]),]
warning("Proportions may not be correct - Source code may need to be modified to handle this situaion")
return(new.all.dat2[,1])
}
}
if (exists(".accuracy")) {accuracy<-.accuracy}
if (is.null(desired_props) || any(is.na(desired_props))) {
#no props, silently do nothing
return(default.vec)
}
if (is.null(default.vec)) {
stop(gettextf("%s is NULL", as.character(sys.call())[2]))
}
#keep original vector
orig.default.vec <- default.vec
#n is the number of children in one year
n <- length(orig.default.vec)
type <- is(orig.default.vec)[1]
dim(propens)
dim(orig.default.vec)
if (!is.null(propens)
&& (
(length(dim(propens)) != 2 && length(orig.default.vec) != length(propens))
|| (length(dim(propens)) == 2 && length(orig.default.vec) != dim(propens)[1])
) ) {
firstParamName <- as.character(sys.call())[2]
stop(gettextf("Propensities must be the same length as %s ",firstParamName))
}
#check that the number of categories in the provided vector of data
#(default.vec) is the same number of categories as given in desired_props
# If not, add the missing categories to the provided vector.
num.cat <- length(table(orig.default.vec))
num.cat2 <- length(desired_props)
if (num.cat!=num.cat2) {
note2 = cat("Note: Length of desired_props not equal to number of categories in variable:",
"\n", "Assumed that there were unobserved categories in the variable",
"\n")
}
#if propensity scores not provided then create them
if (length(propens)==0 || any(is.na(propens))) {
note2 <- "Note: No propensity scores available so random propensity scores were created\n"
cat(note2)
#col.num is the number of vectors of propensity scores required.
#E.g. for a 2-category varaible, 1 vector of propensity scores is required
#and for a 3-category variable, 2 vectors of propensity scores are
#required etc.
col.num <- num.cat2 - 1
#create random propensity score
propens.score <- NULL
for (i in 1:col.num) {
propens.score <- c(propens.score, runif(n))
}
propens <- matrix(propens.score, ncol=col.num, nrow=n)
}
convertCategoryName()
#match propensity scores to children
#(this function assumed that default.vec and propens have the same children
#in the same order)
#the last column is a child identifier so I can put them back in the right
#order
all.dat <- data.frame(default.vec, propens, 1:n)
new.all.dat <- all.dat
#rank.col identifies the column that the rankings will be in
#this is used later in the change.cat function where the propensities are converted to ranks
rank.col <- ncol(all.dat) + 1
#create table of given data and calculate current proportions and the numbers
#that need moving into or out of categories to get the requested proportions
#(n.change)
tab <- table(new.all.dat[,1])
cats <- c(1:length(desired_props))
#if any categories in desired_props are not present in default.vec then this merge is needed to fix the
#problem
tab.df <- merge(cats, tab, by = 1, all=TRUE)
#after the merge any categories in cat that were not present in tab appear as NAs
#these NAS are changed to 0s
na.id <- which(is.na(tab.df$Freq))
tab.df$Freq[na.id] <- 0
tab.df$x <- 1:nrow(tab.df)
current.props <- tab.df$Freq/sum(tab.df$Freq)
n.change <- round(current.props*n) - round(desired_props*n)
#(e.g n.change[1] is the excess/deficient number of observations in the first
#category in observed data)
num <- 1
i <- 1 #i = current category
while (i < length(desired_props)) {
if (n.change[i]==0) {
#if no change needs to be made for category i then move onto next category
i <- i + 1
num <- 1
} else if (sign(n.change[i])==1) {
#category i has too many obs - give to a higher category
num <- 1
new.all.dat <- change.cat(num, rank.col, i, new.all.dat, n.change)
i <- i + 1
} else if (sign(n.change[i])== -1) {
#category i has too few obs - steal from a higher category
new.all.dat <- change.cat(num, rank.col, i, new.all.dat, n.change)
num <- num + 1
#the maximum value of num is the number of categories above to steal
#from
#by incrementing i here we ensure that we don't get into an infinite
#loop due to the while condition never being satisfied
if (num>(length(desired_props)-(i-1))) {
i <- i + 1
}
}
#at this point changes have been made for one iteration of category i (may need more than
#one iteration if couldn't steal enough observations from the category
#immediately above it
#create table of current data (tab.df) and calculate numbers that need
#moving into or out of categories to get the requested proportions
#(n.change)
tab <- table(new.all.dat[,1])
cats <- c(1:length(desired_props))
tab.df <- merge(cats, tab, by = 1, all.x=TRUE)
na.id <- which(is.na(tab.df$Freq))
tab.df$Freq[na.id] <- 0
tab.df$x <- 1:nrow(tab.df)
current.props <- tab.df$Freq/sum(tab.df$Freq)
n.change <- round(current.props*n) - round(desired_props*n)
}
checking()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.