#Modified October, 2019 to have a variable number of items per scale
#Modified Sept 8, 2019 to include weighted scoring
#Modified April 15, 2019 to make structure somewhat cleaner
#Minor fix December 22, 2021 to correctly allow for correlation input
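#
#A usage sketch (commented out, not run at load time): the canonical call,
#assuming the bfi data and bfi.dictionary from psychTools are installed.
#n.iter > 1 requests bootstrap aggregation of the item selection.
# bs <- bestScales(psychTools::bfi, criteria = c("gender", "education", "age"),
#          n.iter = 10, dictionary = psychTools::bfi.dictionary[1:3])
# bs   #prints the pooled derivation and cross-validation summary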
"bestScales" <-
function(x, #the raw data or a correlation matrix
criteria, # the criteria (name) to predict
min.item=NULL,max.item=NULL,delta=0, #parameters for multiple solutions
cut=.1,n.item =10,wtd.cut = 0,wtd.n=10, #just one solution per criterion
n.iter =1,folds=1,p.keyed=.9, #how many bootstraps (n.iter) or folds
overlap=FALSE,dictionary=NULL,check=TRUE,impute="none", log.p=FALSE, digits=2) {
cl <- match.call()
first <- TRUE
#check for bad input -- the Mollycoddle option
if(is.vector(criteria) && any( !(criteria %in% colnames(x)) )) {
cat("\nCriteria names are incorrectly specified. Offending items are ", criteria[which(!(criteria %in% colnames(x)))],"\n")
stop("I am sorry. I am stopping because you did not specify the criteria correctly. See above. ")}
#further check if the dictionary is specified correctly
if(!is.null(dictionary)) if(length(dictionary) < 1) stop("I am sorry. I am stopping because you did not specify the dictionary correctly. ")
#check and delete variables with no variance (by default)
if(check) {item.var <- apply(x,2,sd,na.rm=TRUE)
bad <- which((item.var <= 0)|is.na(item.var))
if((length(bad) > 0) ) {
for (baddy in 1:length(bad)) {message( "Item = ",colnames(x)[bad][baddy], " had no variance and was deleted")}
x <- x[,-bad]
}
}
#check various parameters
#
frac <- 1
if(folds > 1) {frac = 1/folds
if(n.iter !=folds) {n.iter <- folds
cat('Number of iterations set to the number of folds = ',n.iter,'\n') }
}
if(n.iter>1 && isCorrelation(x)) {n.iter <- 1
folds <- 1
cat("Number of iterations and number of folds set to 1 because the you are using a correlation matrix\n")
}
set.seed(NULL)
old.seed <- .Random.seed[42] #we save this if we want to do k-fold cross validation
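#A minimal sketch (commented out) of the k-fold logic used in short() below:
#reseeding with the saved value reproduces the same permutation on every
#iteration, so dropping consecutive blocks of size n.obs/folds yields
#non-overlapping hold-out sets. Illustrative values only.
# n.obs <- 100; folds <- 4; frac <- 1/folds
# set.seed(17)                                  #stands in for old.seed
# perm <- sample(1:n.obs, n.obs, FALSE)
# set.seed(17); perm2 <- sample(1:n.obs, n.obs, FALSE)
# identical(perm, perm2)                        #TRUE -- same order every time
# fold1.out <- perm[1:(frac*n.obs)]             #held out on iteration 1
# fold2.out <- perm[(frac*n.obs + 1):(2*frac*n.obs)]  #held out on iteration 2
# intersect(fold1.out, fold2.out)               #integer(0) -- no overlap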
####
#first, define function to be parallelized
#mcmapply for parallel, mapply for debugging
short <- function(i,x,n.obs,criteria,cut,n.item,impute,digits,dictionary,frac,log.p=FALSE,min.item,max.item) {#this is the function that is iterated
multi.score <- NULL
multi.cross <- NULL
if(n.iter > 1) {
if(!isCorrelation(x)) { ss <- (1:n.obs)
if(frac==1) {ss <- sample(ss,n.obs,replace=TRUE) #bootstrap resampling ss is 'in the bag'
} else {
set.seed(old.seed) #this will take the same random sequence for this run so that we can do k fold
ss <- sample(ss,n.obs,FALSE) # this is the 1:n.obs in the same random order for each of the k fold
ss <- ss[-(((i-1)*frac*n.obs +1):(i*frac*n.obs))] #this drops frac cases out each trial
}
#the main function for finding the best items is right here
#probably don't need to pass dictionary every time, since we rarely examine a single pass
scores <- bScales(x[ss,],criteria=criteria,cut=cut,
n.item =n.item,overlap=overlap,dictionary=dictionary,impute=impute,digits=digits,log.p=log.p)
#These next two lines then try to find the optimal number of items for this pass
#Not clear if we really need to do this for every iteration, perhaps just for the final pooled r
if(!is.null(min.item)){
multi.score <- fastValidity(items=x[ss,],criteria=criteria,r=NULL,overlap=overlap, nlow=min.item,nhigh=max.item)
multi.cross <- fastCrossValidity(new.items = x[-ss,],r=multi.score$r,item.order=multi.score$item.order,criteria=criteria,
nlow=min.item,nhigh=max.item,delta=0,overlap=overlap,optimal.n=multi.score$optimal.unit.n,optimal.wtd.n = multi.score$optimal.wtd.n)} else {multi.score <- NULL}
} else {message("iterative solutions not possible for correlation matrices")
n.iter <- 1
}} else { # a correlation matrix or n.iter = 1
scores <- bScales(x,criteria=criteria,cut=cut,
n.item =n.item, overlap=overlap, dictionary=dictionary, impute=impute, digits=digits,log.p=log.p)
if(!is.null(min.item)){ multi.score <- fastValidity(items=x,criteria=criteria,r=NULL,overlap=overlap, nlow=min.item,nhigh=max.item)} else {multi.score <- NULL}
}
key.list <- keys2list(scores$key) #this converts the -1 and 1s to a list with the variable names
if(n.iter > 1) {
cross <- scoreFast(key.list,x[-ss,],impute=impute,min=1,max=6) #note: min and max are fixed here but arguably should be taken from the data
validity <- diag(cor(cross,x[-ss,criteria],use="pairwise"))
#now, add the two new functions FastValidity and FastCrossValidity
#if we just want to do the optimal number of items on the summaries, we don't need to return the multi.scores here
short.result <- list(r = c(scores$r,validity),key.list=key.list,R = scores$R,multi.scores=multi.score,multi.cross=multi.cross)
} else {short.result <- scores # this is the list of various objects from bScales
short.result$key.list <- key.list
short.result$multi.score <- multi.score
short.result$multi.cross <- multi.cross}
class(short.result) <- c("psych","bestScales")
return(short.result) } #this is the result from 1 iteration of all criteria
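#What keys2list does with the -1/0/1 key matrix returned by bScales, in
#miniature (commented out; toy key matrix, not part of the package data):
# key <- matrix(c(1, -1, 0, 0, 1, 1), ncol = 2,
#        dimnames = list(c("A1", "A2", "A3"), c("crit1", "crit2")))
# keys2list(key)   #list(crit1 = c("A1", "-A2"), crit2 = c("A2", "A3"))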
###
###
#begin the main function
#if criteria is a separate data frame, combine x and criteria
#there are several alternative forms for criteria
#it is either a column name of x, or it is a separate data.frame/matrix
if(!is.null(dim(criteria)) || (length(criteria) == NROW(x))) { x <- cbind(x,criteria)
if(NCOL(criteria) > 1 ){criteria <- colnames(criteria) } else {criteria <- "criteria"}
#criteria <- colnames(criteria)
}
n.obs <- nrow(x)
#if((n.iter ==1)| first ) { #don't bother to parallelize, just do one trial
if((n.iter ==1)) {
first.result <- short(1,x,n.obs=n.obs,criteria=criteria,cut=cut,n.item=n.item,impute=impute,digits=digits,dictionary=dictionary,frac=1,min.item=min.item,max.item=max.item)
first <- FALSE
result <- first.result
result$best.keys <- lapply(first.result$short.key,function(x) create.ordered.key(x))
} else {first.result <- NULL}
#the case for n.iter > 1. We want to parallelize this because we are working pretty hard
if(n.iter > 1) {
result <- list()
#This does the work across n.iter and across all criteria
result <- mapply(short,c(1:n.iter),MoreArgs=list(x,n.obs=n.obs,criteria=criteria,cut=cut,n.item=n.item,impute=impute,digits=digits,dictionary=dictionary,frac=frac,min.item=min.item,max.item=max.item))
#result <- mcmapply(short,c(1:n.iter),MoreArgs=list(x,n.obs=n.obs,criteria=criteria,cut=cut,n.item=n.item,impute=impute,digits=digits,dictionary=dictionary,frac=frac,min.item=min.item,max.item=max.item))
#we have done the heavy lifting, now we need to prepare various results for output.
if(delta > 0) { delta <- delta /sqrt(n.obs)}
result <- organize.results(result,x,n.iter=n.iter,p.keyed=p.keyed,dictionary=dictionary,wtd.cut=wtd.cut,wtd.n = wtd.n,overlap=overlap,min.item=min.item,max.item=max.item,delta=delta) #makes the function a bit cleaner by doing this in its own function
#save the keys and the summary
} else { #we just did a single pass, the simple summaries are already there
#but we should organize them a little
if(!is.null(result$scores)){final.means <- colMeans(result$scores,na.rm=TRUE)
final.sd <- apply(result$scores,2,sd,na.rm=TRUE)} else {final.means<- NA
final.sd <- NA}
if(length(criteria) > 1 ) {crit.mean <- colMeans(x[,criteria],na.rm=TRUE)
crit.sd <- apply(x[,criteria],2,sd,na.rm=TRUE)} else {
crit.mean <- mean(x[,criteria],na.rm=TRUE)
crit.sd <- sd(x[,criteria],na.rm=TRUE)}
if(any(is.na(final.means))) {result$final.stats <- data.frame(r=result$r,crit.m=crit.mean,crit.sd =crit.sd) } else {
result$final.stats <- data.frame(mean=final.means,sd=final.sd,r=result$r,crit.m=crit.mean,crit.sd =crit.sd)}
result$items <- NULL
}
result$Call <- cl
result$first.result <- first.result
class(result) <- c("psych","bestScales")
return(result)
}
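#A second usage sketch (commented out): supplying min.item and max.item asks
#for validities at every scale length in that range, so that the optimal
#number of items is reported as well. Illustrative call, assuming psychTools.
# bs.multi <- bestScales(psychTools::bfi, criteria = "gender", n.iter = 10,
#          min.item = 2, max.item = 20)
# bs.multi$optimal   #derivation and cross-validated r by number of items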
#####################
#######################################
#This function takes the results from short for many trials and then tries to make sense of them
######################################
organize.results <- function(result,x=NA,n.iter=1,p.keyed=.9,dictionary=NULL,wtd.cut,wtd.n,overlap,min.item,max.item,delta) {
#The results are n.iter lists, each with validity,keys,R, and the results from multi.score
validity <- list()
#validity is a list of elements repeated n.iter times
#first are the validities
#then are the keys
#then are the item by criteria correlations
#then the multi.score matrices
keys <- R.list <- multi.valid <- multi.cross <- list()
#take the list from all the iterations, and organize them in a more meaningful way
for(i in (1:n.iter)) {
validity[[i]] <- result[["r",i]]
keys[[i]] <- result[["key.list",i]]
R.list[[i]] <- result [["R",i,drop=FALSE]]
if(!is.null(min.item)) { multi.valid [[i]] <- result[["multi.scores",i,drop=FALSE]]
multi.cross [[i]] <- result[["multi.cross",i,drop=FALSE]]
}
}
replicated.items <- bestReplicatedItems(keys)
items <- list()
item.mean <- list()
best.keys <- list()
criteria <- names(replicated.items)
optimal.n <- optimal.wtd.n <- optimal.unit.deriv <- optimal.wtd.deriv <- optimal.cross.unit <- optimal.cross.wtd <- cross.n <- cross.wtd.n <- list()
#we can find the optimal length for all criteria at once
if(!is.null(min.item)) {
for (i in 1:n.iter) {
optimal.n[[i]] <- multi.valid[[i]][["optimal.unit.n"]]
optimal.wtd.n[[i]] <- multi.valid[[i]][["optimal.wtd.n"]]
optimal.unit.deriv[[i]] <- multi.valid[[i]][["optimal.unit.deriv"]]
optimal.wtd.deriv[[i]] <- multi.valid[[i]][["optimal.wtd.deriv"]]
optimal.cross.unit[[i]] <- multi.cross[[i]][["cross.unit"]]
optimal.cross.wtd[[i]] <- multi.cross[[i]][["cross.wtd"]]
cross.n[[i]] <- multi.cross[[i]][["cross.n"]]
cross.wtd.n[[i]] <- multi.cross[[i]][["cross.wtd.n"]]
}
optimal.n.mean <-apply(matrix(unlist(optimal.n),nrow=n.iter,byrow=TRUE),2,median)
optimal.wtd.mean <- apply(matrix(unlist(optimal.wtd.n),nrow=n.iter,byrow=TRUE),2,median)
optimal.unit.deriv <- colMeans(matrix(unlist(optimal.unit.deriv),nrow=n.iter,byrow=TRUE))
optimal.wtd.deriv <- colMeans(matrix(unlist(optimal.wtd.deriv),nrow=n.iter,byrow=TRUE))
optimal.cross.unit <- colMeans(matrix(unlist(optimal.cross.unit),nrow=n.iter,byrow=TRUE))
optimal.cross.wtd <- colMeans(matrix(unlist(optimal.cross.wtd),nrow=n.iter,byrow=TRUE))
cross.n <- apply(matrix(unlist(cross.n),nrow=n.iter,byrow=TRUE),2,median)
cross.wtd.n <- apply(matrix(unlist(cross.wtd.n),nrow=n.iter,byrow=TRUE),2,median)
}
#but we need to find the item statistics one criterion at a time
for(j in 1:length(criteria)) {
#first, find the means and standard deviations for each selected item
rep.item <- replicated.items[[j]][replicated.items[[j]] >= n.iter * p.keyed]
if(length(rep.item)==0) rep.item <- replicated.items[[j]][1]
# if(length(criteria) > 1 ) {for (i in 1:n.iter) { item.mean[[i]] <- R.list[[i]][names(replicated.items[[j]][replicated.items[[j]] > n.iter * p.keyed]),criteria[j]] }
# } else { for (i in 1:n.iter) {item.mean[[i]] <- R.list[[i]][names(replicated.items[[j]][replicated.items[[j]] > n.iter * p.keyed])] } }
for (i in 1:n.iter) {if(length(criteria) > 1) {
item.mean[[i]] <- R.list[[i]][names(rep.item),criteria[j]]} else {item.mean[[i]] <- R.list[[i]][names(rep.item)]}
}
item.m <- matrix(unlist(item.mean),nrow=n.iter,byrow=TRUE)
colnames(item.m) <- names(rep.item)
means = colMeans(item.m,na.rm=TRUE)
sds <- apply(item.m,2,sd,na.rm=TRUE)
# Freq <- colSums(!is.na(item.m)) #This is the total number of items and just reflect n.iter
Freq <- as.vector(rep.item)
names(Freq) <- names(rep.item)
#items [[criteria[j] ]] <- cbind(replicated.items[[j]],Freq=Freq,mean.r=means,sd.r = sds,dictionary[names(replicated.items[[j]]),])
items[[criteria[j]]] <- cbind(Freq,mean.r = means,sd.r = sds,dictionary[names(rep.item),,drop=FALSE])
items[[criteria[j]]] <- psychTools::dfOrder(items [[criteria[j] ]],"-mean.r",absolute=TRUE) #sort on the mean.r column
# items[[criteria[j]]] <- items[[criteria[j]]][items[[criteria[j]]][,"Freq"] >= n.iter * p.keyed,]
#now prepare the best.keys list
if(!is.null(dim(items[[criteria[[j]] ]] ))){ direction <- sign(items[[criteria[[j]] ]][,"mean.r"])
direction <- as.matrix(direction)
rownames(direction) <- rownames(items[[criteria[[j]] ]])
count <- items[[criteria[[j]]]][,"Freq"]} else {
if(!is.null(items[[criteria[[j]] ]])) {
# items [[criteria[j] ]] <- cbind(Freq=Freq,mean.r=means,sd.r = sds,dictionary[names(replicated.items[[j]]),])
items [[criteria[j] ]] <- cbind(Freq=Freq,mean.r=means,sd.r = sds,dictionary[names(replicated.items[[j]]),])
direction <- sign(items[[criteria[[j]] ]]["mean.r"])
names(direction) <- names(Freq)
direction <- as.matrix(direction)
count <- items[[criteria[[j]]]][1] }
else {count <- 0}
}
count <- count >= n.iter*p.keyed
if(sum(count,na.rm=TRUE) > 0) {
best.keys[[j]] <- rownames(direction)[count]
direction <- direction[count,drop=FALSE]
if(length(direction)> 1) { best.keys[[j]][direction < 0] <- paste0("-", best.keys[[j]][direction < 0]) }
if((length(direction) ==1) && (!is.na(direction))) {
best.keys[[j]][direction < 0] <- paste0("-", best.keys[[j]][direction < 0]) }
} else { best.keys[[j]] <- NA
}
}
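#The signed-key convention built above, in miniature (commented out):
#negatively keyed items get a "-" prefix, which scoreFast understands.
# direction <- c(A1 = -1, A2 = 1)
# keys <- names(direction)
# keys[direction < 0] <- paste0("-", keys[direction < 0])
# keys   # "-A1" "A2"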
#Find the mean zero-order correlation of each item with each criterion
#We do this by pooling the data in R.list
mean.raw.r <- matrix(unlist(R.list),ncol=NCOL(R.list[[1]]) * NROW(R.list[[1]]),byrow=TRUE )
sd.raw.r <- apply(mean.raw.r,2,sd,na.rm=TRUE)
sd.raw.r <- matrix(sd.raw.r,ncol=length(criteria))
mean.raw.r <- matrix(colMeans(mean.raw.r,na.rm=TRUE),ncol=length(criteria))
if(length(criteria) == 1) {colnames(mean.raw.r) <- criteria
rownames(mean.raw.r) <- names(R.list[[1]])} else {colnames(mean.raw.r) <- colnames(R.list[[1]])
rownames(mean.raw.r) <- rownames(R.list[[1]])}
final.mean.r <- mean.raw.r
mean.raw.r[abs(mean.raw.r) < wtd.cut] <- 0
#now, drop all except the wtd.n items
ny <- length(criteria)
nvar <- NROW(mean.raw.r)
if(ny > 1 ) {ord <- apply(abs(mean.raw.r[,criteria]),2,order,decreasing=TRUE)
for (i in 1:ny) {mean.raw.r[ord[(wtd.n+1):nvar,i],criteria[i]] <- 0
}
} else {
ord <- order(abs(mean.raw.r),decreasing=TRUE)
for (i in 1:ny) {mean.raw.r[ord[(wtd.n+1):nvar]] <- 0 }
}
N.wtd <- colSums(abs(mean.raw.r) >0)
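#The weight-trimming step above, in miniature (commented out): zero all but
#the wtd.n largest absolute correlations so scoreWtd gets a sparse weight
#vector.
# w <- c(.3, -.25, .05, .15, -.02)
# wtd.n <- 2
# w[order(abs(w), decreasing = TRUE)[(wtd.n + 1):length(w)]] <- 0
# w   # 0.30 -0.25 0.00 0.00 0.00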
if(length(best.keys) == length(criteria)) names(best.keys) <- criteria
#Find the results for best keys
final.scale <- scoreFast(best.keys,x) #these are the unit weighted
final.raw.scale <- scoreWtd(mean.raw.r,x) #these are the zero order weighted scores
final.raw.valid <- diag(cor(final.raw.scale,x[,criteria,drop=FALSE],use="pairwise") )
final.valid <- diag(cor(final.scale, x[,criteria,drop=FALSE],use="pairwise") )
final.means <- colMeans(final.scale,na.rm=TRUE)
final.sd <- apply(final.scale,2,sd,na.rm=TRUE)
crit.mean <- colMeans(x[,criteria,drop=FALSE],na.rm=TRUE)
crit.sd <- apply(x[,criteria,drop=FALSE],2,sd,na.rm=TRUE)
result.df <- data.frame(matrix(unlist(validity),ncol=2*length(criteria),byrow=TRUE))
colnames(result.df) <-c(paste("derivation",criteria),paste("validation",criteria))
if(!is.null(min.item)) {
multi.derivation.df <- data.frame(n=optimal.n.mean,unit=optimal.unit.deriv,n.wtd=optimal.wtd.mean,wtd=optimal.wtd.deriv,valid.n=cross.n,valid.unit=optimal.cross.unit,valid.wtd.n = cross.wtd.n,valid.wtd=optimal.cross.wtd)
rownames(multi.derivation.df ) <- criteria
} else {multi.derivation.df <- NULL}
ncriteria <- length(criteria)
if(!is.null(min.item)){
final.multi.validities <- fastValidity(x,criteria,r=final.mean.r, nlow=min.item, nhigh=max.item,overlap=overlap)
final.order <- final.multi.validities$item.order
final.item.valid.list <- list()
for (j in 1 : ny ) {if(!is.null(dictionary)) {final.item.valid.list[[criteria[j]]] <- data.frame(item=rownames(final.mean.r)[final.order[1:max.item,j]]
,r=final.mean.r[final.order[1:max.item,j],j],unit = final.multi.validities$unit.deriv[,j],wtd=final.multi.validities$wtd.deriv[,j],dictionary[rownames(final.mean.r)[final.order[1:max.item,j]],])
} else { final.item.valid.list[[criteria[j]]] <- data.frame(item=rownames(final.mean.r)[final.order[1:max.item,j]],r=final.mean.r[final.order[1:max.item,j],j],unit = final.multi.validities$unit.deriv[,j],wtd=final.multi.validities$wtd.deriv[,j])}
}
} else {final.multi.validities <- NULL
final.item.valid.list <- NULL}
#now, organize the output object into a reasonable order
result <- list()
results <- list() #to hold things we don't actually want to return
#now get out the items and incremental validities for each scale
results$means = colMeans(result.df,na.rm=TRUE)
results$sd <- apply(result.df,2,sd,na.rm=TRUE)
result$summary <- data.frame(derivation.mean= results$means[1:ncriteria],derivation.sd = results$sd[1:ncriteria],validation.m=results$means[(ncriteria+1):(2*ncriteria)],
validation.sd =results$sd[(ncriteria+1):(2*ncriteria)],final.valid = final.valid,final.wtd=final.raw.valid,N.wtd=N.wtd )
rownames(result$summary) <- criteria
#result <- list(validity = result.df,multi.validities=multi.derivation.df,items=items,replicated.items =replicated.items,keys = keys,final.mean.r =final.mean.r,multi.validities=multi.valid)
result$optimal <- multi.derivation.df
result$best.keys <- best.keys
result$weights <- mean.raw.r
result$final.item.list <- final.item.valid.list
result$multi.validities <- final.multi.validities
result$items <- items
if(!is.null(min.item)) {
result$optimal.keys <- optimal.keys(final.multi.validities,delta=delta)
result$optimal.weights <- optimal.weights(final.multi.validities,delta=delta)
n.optimal.unit <- sapply(result$optimal.keys,length)
n.optimal.wtd <- apply(result$optimal.weights,2,function(x) sum(abs(x) > 0) )
result$optimal <- data.frame(result$optimal,n.final=n.optimal.unit,n.wtd.final = n.optimal.wtd)
}
result$stats <- data.frame(mean=results$means,se=results$sd)
# result$final <- final.valid
result$scores <- final.scale
result$wtd.scores <- final.raw.scale
result$final.stats <- data.frame(mean=final.means,sd=final.sd,r=final.valid,crit.mean = crit.mean,crit.sd=crit.sd,final.wtd=final.raw.valid,N.wtd=N.wtd)
# result$sd.weights <- sd.raw.r
# result$final.raw <- final.raw.valid
return(result)
}
########################################### #end of organize results
###########################################
#This one actually does the work -- but should not process n.item at this point
"bScales" <-
function(x,criteria,cut=.1,n.item =10, overlap=FALSE,dictionary=NULL,impute="median",digits=2,log.p=FALSE) {
#created 20/2/14
#find the scales based upon the items that most correlate with a criterion
#pure dust bowl empiricism
#modified 13/3/15 to handle the problem of missing item labels
#Completely redone June 2017 to allow for raw data and bootstrapping
##
#begin the main function
##Basically two cases:
#a correlation matrix is provided and we do basic matrix algebra
#or raw data are provided (getting ready to do bootstrapping) and we find just the necessary correlations
nvar <- ncol(x)
n.item <- min(n.item,nvar-1)
if(isCorrelation(x) || (all(colnames(x) %in% rownames(x)))) {r <- x # case 1
raw <- FALSE
nvar <- NROW(r)
} else { #case 2
y <- x[,criteria]
if(log.p) {r <- log(corr.test(x,y)$p)} else { r <- cor(x,y,use="pairwise")}
colnames(r) <- criteria
x <- as.matrix(x)
raw <- TRUE
n.obs <- NROW(x)}
#don't actually need to have a square matrix
ny <- length(criteria)
nc <- length(cut)
ni <- length(n.item) #number of items per scale to find
ord.name <- NULL
if(length(cut) == 1) cut <- rep(cut,ny)
if(length(n.item) == 1) n.item <- rep(n.item,ny) #
#We have the correlations with the criteria, we can
#this next part just finds the cut values to use
if(!overlap) {r[criteria,criteria] <- 0} else {for(i in 1:ny) r[criteria[i],criteria[i]] <- 0}
if(ny > 1 ) {ord <- apply(abs(r[,criteria]),2,order,decreasing=TRUE)
for (i in 1:ny) {cut[i] <- max(cut[i],abs(r[ord[n.item[i],i],criteria[i]]))
}
} else {
ord <- order(abs(r[,criteria]),decreasing=TRUE)
for (i in 1:ny) {cut[i] <- max(cut[i],abs(r[ord[n.item[i]+1],criteria])) }
}
# cut has been adjusted
#The unit weights
key <- matrix(0,ncol=ny,nrow=nvar)
key[t(t(r[,criteria]) >= cut)] <- 1
key[t(t(r[,criteria]) <= -cut)]<- -1
rownames(key) <- rownames(r)
colnames(key) <- criteria
k <- key #this just gets it to be a matrix of the right size and names
#colnames(key) <- paste(criteria,"S",sep=".")
#colnames(key) <- criteria
#now, drop those items from the keys that are not used
used <- rowSums(abs(key))
key <- key[used > 0,,drop=FALSE]
x <- x[,used >0,drop=FALSE]
if(NCOL(x) < 1) stop("No items met the criteria. Try changing the cut value.")
#now, if we have raw data, find the correlation of the composite scale with the criteria
#if we have raw data, then we find the scales from the data
if(raw) { #case 2
#score <- matrix(NA,ncol=ny,nrow=nrow(x))
#for (i in (1:ny)) {
# score[,i] <- rowSums(t((key[,i])* t(x)),na.rm=TRUE)
# }
score <- scoreFast(key,x,impute=impute,min=1,max=6) #min and max should not be fixed values
R <- diag(cor(score,y,use="pairwise")) #the validities
re <- r[,criteria]
ni <- colSums(abs(key))
} else { #case 1 (from a correlation matrix)
score <- NULL
r <- r[,used > 0,drop=FALSE]
k<- key #get the matrix dimensions right
ss <- rownames(key)
if(any(is.na(r))) {#Are there any bad values
for(i in 1:ny) {#key[,i] <- findBad(key[,i],r) #Drop the bad items from any scoring key
k[,i] <- colSums((key[,i]) * (r[ss,ss]),na.rm=TRUE)} #replace matrix addition with a colSums
k <- t(k)
} else {#otherwise, don't bother
C <- t(t(key) %*% t(r[criteria,,drop=FALSE])) #criterion covariance
V <- t(key) %*% r[ used > 0,] %*% key #predictor variance
# k <- t(t(key) %*% t(r[criteria,,drop=FALSE])) #we can do the matrix multiply because there are no bad data
}
# V <- t(k) %*% key #this is the covariance of the criteria with criteria
# C <- k[criteria,]
if(ny < 2) {re <- r[criteria,]
R <- C/sqrt(V)} else {
# R <- diag(C/sqrt(V))
R <- C /sqrt(diag(V))
#re <- diag(k[criteria,])/sqrt(diag(C))
}
ni <- colSums(abs(key))
#R <- cov2cor(C)
r <- t(r)
re <- r[,criteria]
}
short.key <- list()
value <- list()
#R is the correlation with the criterion
#re is the correlations of each item with the criteria
for(i in 1:ny) {short.key[[criteria[i]]] <- round(key.value(key[,i,drop=FALSE],r),digits=digits)
#actually we should not bother with the dictionary here, just at the summary level
if(!is.null(dictionary)) {if(!is.factor(dictionary)) {temp <- lookup(rownames(short.key[[criteria[i]]]),dictionary)
#this next line needs to be rethought -- the merge command is very slow
value[[criteria[[i]]]] <- merge(short.key[[i]],temp,by="row.names",all.x=TRUE,sort=FALSE)
rownames( value[[criteria[[i]]]]) <- value[[criteria[[i]]]][,1]
value[[criteria[[i]]]] <- value[[criteria[[i]]]][-1] #drop the redundant Row.names column that merge added
ord <- order(abs(value[[criteria[[i]]]][[criteria[[i]]]]),decreasing=TRUE)
value[[criteria[[i]]]] <- value[[criteria[[i]]]][ord,]
}
}}
bScales.results <- list(r=R,n.items=ni,R=re,cut=cut,short.key=short.key,value=value,key=key,ordered=ord.name,scores=score)
class(bScales.results) <- c("psych","bestScales") #This is the solution for one pass
return(bScales.results)
}
################################
#various minor functions used in bestScales
#first, declare a function to identify the bad items and drop them from the keys
findBad <- function(key,r) {
ss <- abs(key) > 0
rss <- r[ss,ss]
if(any(is.na(rss))){ #some of these are bad
n.bad <- apply(rss,1,function(x) sum(is.na(x)))
key[names(which.max(n.bad))] <- 0
key <- findBad(key,r)} #recurse until no NAs remain among the keyed items
return(key)
}
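#Illustrative use of findBad (commented out; toy matrix): the item with the
#most missing correlations is zeroed first (ties go to the first item), and
#the recursion stops once no NAs remain among the keyed items.
# r <- matrix(c(1, .5, NA, .5, 1, .4, NA, .4, 1), 3, 3,
#        dimnames = list(c("A", "B", "C"), c("A", "B", "C")))
# key <- c(A = 1, B = 1, C = -1)
# findBad(key, r)   #A is dropped; B and C remain keyed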
key.value <- function(key,r) {
kn <- names(key[abs(key[,1]) > 0,1])
if(is.null(kn)) kn <- names(which(abs(key[,1]) > 0))
cn <- colnames(key)
ord <- order(abs(r[kn,cn]),decreasing=TRUE)
kn <- kn[ord]
result <- r[kn,cn,drop=FALSE]
return(result)
}
#added the test for length>1 for the null case 4/16/23
create.ordered.key <- function(x) {
if(length(x)>1 ){
for (i in 1: length(x)) {
if(sign(x[i])<0 ) rownames(x)[i] <- paste0("-",rownames(x)[i])
}}
return(rownames(x))
}
"finalsummary" <- function(r,keys) {
}
"bestReplicatedItems" <- function( L) {
n.iters <- length(L)
# n.vars <- NCOL(L[[1]])
vars <- names(L[[1]])
n.vars<- length(vars)
item.nums <- list()
one.criterion <- list()
for (j in 1:n.vars) { #do this over the criteria
for (i in 1:n.iters) {one.criterion[[i]] <- L[[i]][j] }
select <- sub("-","",unlist(one.criterion))
item.nums[[vars[j]]] <- sort(table(select),decreasing=TRUE)
nam.items <- names(item.nums)
}
item.nums <- as.vector(item.nums)
names(item.nums) <- nam.items
return(item.nums)
}
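#bestReplicatedItems in miniature (commented out): it counts how often each
#item is keyed (in either direction) across the bootstrapped key lists.
# keys <- list(list(crit = c("A1", "-A2")), list(crit = c("A1", "A3")))
# bestReplicatedItems(keys)   #$crit: A1 replicates twice, A2 and A3 once each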
predict_bestScales <- function(object,data,new.data)
{keys <- object$keys
if (is.null(keys)){ keys<- object$key.list
keys <- make.keys(data,keys) }
stats <- describe(data,fast=TRUE,ranges=FALSE)
old.mean <- stats$mean
old.sigm <- stats$sd
z.data <- scale(new.data,center=stats$mean,scale=stats$sd)
z.data[is.na(z.data)] <- 0
predicted <- z.data %*% keys
return(predicted)
}
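#A hedged usage sketch (commented out) for this internal helper: new cases
#are standardized by the derivation-sample statistics and then multiplied by
#the unit-weight keys. new.data must have the same columns as data.
# fit <- bestScales(psychTools::bfi[1:1000, ], criteria = "gender")
# zpred <- predict_bestScales(fit, data = psychTools::bfi[1:1000, ],
#          new.data = psychTools::bfi[1001:1200, ])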
#does not do what I want
predict_wtdScales <- function(object,data,new.data)
{weights <- object$weights
stats <- describe(data,fast=TRUE,ranges=FALSE)
old.mean <- stats$mean
old.sigm <- stats$sd
predicted <- scoreWtd(weights,new.data)
predicted <- scale(predicted,center=stats$mean,scale=stats$sd)
return(predicted)
}
#####
#print_psych.bestScales is called from the print.psych function
print_psych.bestScales <- function(x,digits=2,short=NULL,...) {
cat("\nCall = ")
print(x$Call)
if(!is.null(x$items)) {
print(x$summary,digits=digits)
if(!is.null(x$optimal)) {
cat("\n Optimal number of items, derivation and cross validation\n")
print(x$optimal,digits=digits) }
# x$replicated.items
items <- x$items
size <- NCOL(items[[1]])
nvar <- length(items)
if(is.null(x$optimal)) {
cat("\n Best items on each scale with counts of replications\n")
for(i in 1:nvar) {
cat("\n Criterion = ",names(items[i]),"\n")
if(length(items[[i]]) > 3) {
temp <- data.frame(items[[i]])
temp[2:3] <- round(temp[2:3],digits)} else{temp <- items[[i]]
temp[2:3] <- round(temp[2:3],digits)
}
print(temp)
}} else {
items <- x$final.item.list
cat("\n Best items on each scale with cumulative validities\n")
for(i in 1:nvar) {
cat("\n Criterion = ",names(items[i]),"\n")
temp <- data.frame(items[[i]])
temp <- temp[-1]
if(length(items[[i]]) > 3) {
temp[1:3] <- round(temp[1:3],digits)} else{temp <- items[[i]]
temp[1:3] <- round(temp[1:3],digits)
}
print(temp)
}}
# print(items)
} else {
df <- data.frame(correlation=x$r,n.items = x$n.items)
cat("The items most correlated with the criteria yield r's of \n")
print(round(df,digits=digits))
if(length(x$value) > 0) {cat("\nThe best items, their correlations and content are \n")
print(x$value) } else {cat("\nThe best items and their correlations are \n")
for(i in 1:length(x$short.key)) {print(round(x$short.key[[i]],digits=digits))}
}
}
}
#end of print.psych.bestScales
#These next two functions were added in October, 2019 to find the maximum cross-validated validity as a function of the number of items
#ValidityList <- mapply(FastValidity,c(1:ny),MoreArgs=list(nlow=nlow,nhigh=nhigh)) #probably not that helpful
### Find predictions and validities for scales from nlow to nhigh number of items
fastValidity <- function(items, criteria,r=NULL,nlow,nhigh,overlap) { #r and criteria are found before
ny <-length(criteria)
if(is.null(r)) r <- cor(items,items[criteria],use="pairwise")
wtd.validity <- unit.validity <- matrix(NA,nrow=nhigh,ncol= ny)
if(!overlap) {r[criteria,criteria] <- 0} else {for(i in 1:ny) r[criteria[i],criteria[i]] <- 0}
colnames(unit.validity) <- criteria
colnames(wtd.validity) <- criteria
item.min <- min(items,na.rm=TRUE)
item.max <- max(items,na.rm = TRUE)
if(item.max < 10) {
item.range.correction <- item.max - item.min + 1} else item.range.correction <- 7 #this is the case where we include some weird items, like age
for(scale in 1:ny) {
if(ny > 1 ) {ord <- apply(abs(r[,criteria,drop=FALSE]),2,order,decreasing=TRUE)
} else {
ord <- matrix(order(abs(r[,criteria,drop=FALSE]),decreasing=TRUE))
rownames(ord) <- rownames(r)
colnames(ord) <- criteria
}
abs.item <- t(t(items) * sign(r[,scale]) + sign(r[,scale] < 0) *item.range.correction ) #this gets all the items to be scored and adds in the max - min + 1
wtd.item <- t(t(items) * r[,scale] + sign(r[,scale] < 0) *item.range.correction ) #these are the wtd item scores
for (j in nlow: nhigh){
# temp <- abs.item[,ord[1:j,scale,drop=FALSE]]
# wtd.temp <- wtd.item[,ord[1:j,scale,drop=FALSE]]
if(j > 1) {scores <- rowMeans( abs.item[,ord[1:j,scale,drop=FALSE]] ,na.rm=TRUE)
wtd.scores <- rowMeans(wtd.item[,ord[1:j,scale,drop=FALSE]],na.rm=TRUE)
} else {scores <- abs.item[,ord[1:j,scale,drop=FALSE]]
wtd.scores <- wtd.item[,ord[1:j,scale,drop=FALSE]]}
unit.validity[j,scale] <- cor(scores,items[,criteria[scale]],use="pairwise")
wtd.validity [j,scale] <- cor(wtd.scores,items[,criteria[scale]],use="pairwise")
}
}
optimal.unit.n <- apply(unit.validity,2,which.max)
optimal.wtd.n <- apply(wtd.validity,2,which.max)
optimal.unit.valid <- apply(unit.validity,2,max)
optimal.wtd.valid <- apply(wtd.validity,2,max)
result <- list(optimal.unit.n=optimal.unit.n, optimal.wtd.n = optimal.wtd.n,
optimal.unit.deriv=optimal.unit.valid, optimal.wtd.deriv=optimal.wtd.valid,
unit.deriv=unit.validity,wtd.deriv = wtd.validity,item.order = ord,r=r,item.range.correction=item.range.correction)
return(result)
}
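#The reverse-scoring trick above, in miniature (commented out): multiplying
#by sign(r) and adding max - min + 1 for negatively keyed items is ordinary
#item reversal up to an additive constant, and constants do not change the
#correlations that these scores feed into.
# x <- c(1, 3, 6)        #an item scored 1..6
# rev1 <- 7 - x          #conventional reversal: (max + min) - x
# rev2 <- -x + 6         #sign flip plus item.range.correction
# cor(rev1, rev2)        #1 -- identical up to a constant shift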
#This takes the item order from fastValidity and applies it to a new data set,using the old correlations
fastCrossValidity <- function(new.items,r,item.order,criteria,nlow,nhigh,overlap,optimal.n,optimal.wtd.n,delta=0,item.range.correction=0) { #r and order are from the derivation set
ny <-length(criteria)
#r and item.order are from the derivation sample
wtd.cross <- unit.cross <- matrix(NA,nrow=nhigh,ncol= ny)
if(!overlap) {r[criteria,criteria] <- 0} else {for(i in 1:ny) r[criteria[i],criteria[i]] <- 0}
colnames(unit.cross) <- criteria
colnames(wtd.cross) <- criteria
ord <- item.order
for(scale in 1:ny) {
abs.item <- t(t(new.items) * sign(r[,scale]) + sign(r[,scale] < 0) * item.range.correction)
#this gets all the items to be scored
wtd.item <- t(t(new.items) * r[,scale] + item.range.correction) #these are the wtd item scores
for (j in nlow: nhigh){
temp <- abs.item[,ord[1:j,scale,drop=FALSE]]
wtd.temp <- wtd.item[,ord[1:j,scale,drop=FALSE]]
if(j > 1) {scores <- rowMeans(temp[,1:j],na.rm=TRUE)
wtd.scores <- rowMeans(wtd.temp[,1:j],na.rm=TRUE)
} else {scores <- abs.item[,ord[1:j,scale,drop=FALSE]]
wtd.scores <- wtd.item[,ord[1:j,scale,drop=FALSE]]}
unit.cross[j,scale] <- cor(scores,new.items[,criteria[scale]],use="pairwise")
wtd.cross [j,scale] <- cor(wtd.scores,new.items[,criteria[scale]],use="pairwise")
}
cross.unit.valid <- apply(unit.cross,2,max)
cross.wtd.valid <- apply(wtd.cross,2,max)
# temp <- apply(unit.cross,2,function(x) which(x >= (max(x) - delta)))
#if(is.list(temp)) {cross.unit.n <- sapply(temp,function(x) x[1],simplify=TRUE) } else {cross.unit.n <- temp[1]}
cross.unit.n <- apply(unit.cross,2,which.max)
cross.wtd.n <- apply(wtd.cross,2,which.max)
optimal.cross.unit <- diag(unit.cross[optimal.n,1:ny,drop=FALSE])
optimal.cross.wtd <-diag( wtd.cross[optimal.wtd.n,1:ny,drop=FALSE])
}
result <- list(unit.cross=unit.cross, wtd.cross = wtd.cross, cross.unit= optimal.cross.unit,
cross.wtd=optimal.cross.wtd, cross.n=cross.unit.n, cross.wtd.n=cross.wtd.n)
return(result)
}
optimal.keys <- function(L,delta=0) {
#take the information from multi.validities and create a keys.list and a weights matrix
criteria <- names(L[["optimal.unit.n"]])
n <-L[["optimal.unit.n"]]
unit.cross <- L[["unit.deriv"]]
if(delta>0) {
temp <- apply(unit.cross,2,function(x) which(x >= (max(x,na.rm=TRUE) - delta)))
if(is.list(temp)) {n <- sapply(temp,function(x) x[1],simplify=TRUE) } else {if (is.vector(temp)) {n <- temp} else {n <- temp[1,,drop=FALSE]}}
}
item.order <- L [["item.order"]]
var.names <- rownames(L[["r"]])
r <- L [["r"]]
keys <- direction <- list()
for (j in 1:length(criteria)) {
keys[[j]] <- var.names[item.order[1:n[j],j]]
direction <- sign(r[item.order[1:n[j],j],j])
keys[[j]][direction <0 ] <- paste0("-",keys[[j]][direction < 0])
}
names(keys) <- criteria
return(keys)
}
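#The delta rule used above, in miniature (commented out): pick the shortest
#scale whose derivation validity is within delta of the maximum.
# valid <- c(.20, .28, .30, .31, .31)     #validity by number of items
# delta <- .02
# which(valid >= max(valid) - delta)[1]   #3 -- three items are good enough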
optimal.weights <- function(L,delta=0) {
#take the information from multi.validities and create a keys.list and a weights matrix
criteria <- names(L[["optimal.unit.n"]])
n <-L[["optimal.unit.n"]]
wtd.cross <- L[["wtd.deriv"]]
if(delta>0) {
temp <- apply(wtd.cross,2,function(x) which(x >= (max(x,na.rm=TRUE) - delta)))
if(is.list(temp)) {n <- sapply(temp,function(x) x[1],simplify=TRUE) } else {if (is.vector(temp)) {n <- temp} else {n <- temp[1,,drop=FALSE]}}
}
item.order <- L [["item.order"]]
var.names <- rownames(L[["r"]])
weights <- L [["r"]]
cut <- abs(diag(weights[diag(item.order[n,,drop=FALSE]),,drop=FALSE]))
weights[t(abs(t(weights)) < cut)] <- 0
return(weights)
}