#' NOT CURRENTLY EXPORTED! In Future this could potentially be made to a separate R package
#' THIS parseScript_SPSS function should be used 100% moving forward.
#' Old/Previous SPSS script parsers should be slowly transitioned to utilize this function when possible to maximize code use.
#' @title Parse SPSS Syntax Script for Fixed-Width Data Files
#' @description Parses an SPSS Syntax Script (.sps) file to return information relating to fixed-width data files.
#' @param spsFilePath a character value of the file path to the SPSS script to parse.
#' @param verbose a logical value to indicate if the user wishes to print parsing activity to the console. Default value is \code{FALSE}.
#' @param outputFormat a named argument to indicate which output format the resulting object should be. See details for information on each format.
#' Currently, only the \code{data.frame} format is supported.
#' @param encoding a character value to indicate the encoding specification that is used by \code{readLines} base function for the \code{spsFilePath} parameter.
#' Only adjust this parameter if the original file encoding of the file is known, is not producing correct string values, or other errors occur.
#' See \code{?readLines} help for details about its use for file encoding, and additional details.
#' @details The SPSS syntax script parser is focused on gathering details for use with fixed-width data files. This function scans for the following SPSS commands:
#' \itemize{
#' \item{FILE HANDLE}
#' \item{DATA LIST}
#' \item{VARIABLE LABEL}
#' \item{MISSING VALUE}
#' \item{VALUE LABEL}
#' }
#' The \code{outputFormat} specified will determine the result object returned. This function currently supports the following formats.
#' \itemize{
#' \item{data.frame}
#' \itemize{
#' \item{variableName - The variable name as defined in the script}
#' \item{Start - The start number index of the variable defined for the fixed-width format layout}
#' \item{End - The end number index of the variable defined for the fixed-width format layout}
#' \item{Width - The length of how many columns the variable uses in the fixed-width format layout}
#' \item{Attributes - Any SPSS attributes that are defined in the DATA LIST command. This is typically only for field formatting.}
#' \item{RecordNumber - Some fixed-width data files are considered "multi-line" where one record of data can span multiple rows in the file.
#' The RecordNumber indicates which line the variable is assigned.}
#' \item{Labels - The descriptive label associated with the variable name to give more detail or context.}
#' \item{labelValues - For categorical variables a stored value will typically be assigned a longer label/definition. This string identifies these mappings.
#' The '^' symbol is used to delimit each individual label value. Then additionally, the '=' is used to split the value from the left side of the '=' symbol,
#' and the remaining right-hand side of '=' is the text label for that value.}
#' \item{dataType - A best-guess of the data type (either 'numeric' or 'character') without actually examining the data-file.}
#' \item{missingValues - If a MISSING VALUE clause is included in the script this will list the values that are considered 'Missing'.
#' If multiple values specified, they will be delimited by a ';' (semi-colon) symbol.}
#' }
#' }
#' @return returns an object containing information specified by the \code{outputFormat} argument.
#' @author Tom Fink
parseScript_SPSS <- function(spsFilePath, verbose = FALSE, outputFormat = c("data.frame"), encoding = getOption("encoding")) {
  # ---- argument validation ----
  if (!file.exists(spsFilePath)) {
    stop(paste0("Unable to locate ", dQuote("spsFilePath"), " file at location: ", dQuote(spsFilePath)))
  }
  if (!is.logical(verbose) || length(verbose) != 1 || is.na(verbose)) {
    stop(paste0("The ", sQuote("verbose"), " argument must be a logical vector of length 1."))
  }
  outputFormat <- match.arg(outputFormat)

  # ---- read the script ----
  # blocking = FALSE here so it reads entire file instead of first break
  fCon <- file(spsFilePath, open = "rt", blocking = FALSE, encoding = encoding)
  on.exit(close(fCon), add = TRUE) # always release the connection, even when parsing fails
  spsText <- tryCatch(readLines(fCon),
    error = function(e) {
      stop(paste0("Unable to read data file, readLines call failed: ", conditionMessage(e)))
    }
  )
  # collapse as one single string, easier to parse going forward
  spsText <- paste(spsText, sep = "", collapse = "\n")

  # break the text into its command parts for further processing. certain comments are removed here
  cmdText <- parseSPSS_COMMANDS(spsText)

  # returns the commands whose text starts with the given (case-insensitive) command regex
  commandsMatching <- function(pattern) {
    unname(cmdText[grepl(pattern, cmdText, ignore.case = TRUE)])
  }
  fileHandleText <- commandsMatching("^\\s*FILE HANDLE")
  dataListText <- commandsMatching("^\\s*DATA LIST")
  variableLabelText <- commandsMatching("^\\s*VARIABLE LABEL(S){0,1}")
  missingValueText <- commandsMatching("^\\s*MISSING VALUE(S){0,1}")
  valueLabelText <- commandsMatching("^\\s*VALUE LABEL(S){0,1}")

  # procure return list of items
  retList <- list()

  if (length(fileHandleText) > 0) {
    if (verbose) {
      cat(paste0("Parsing Script File Handle...\n"))
    }
    retList$FileHandle <- lapply(fileHandleText, parseSPSS_FILEHANDLE)
  }

  # all of the variable names defined by DATA LIST; required by the label/missing parsers downstream
  allVars <- character(0)
  if (length(dataListText) > 0) {
    if (verbose) {
      cat(paste0("Parsing Data List...\n"))
    }
    if (length(dataListText) > 1) {
      stop(paste0("Unable to handle multiple DATA LIST calls in SPSS script. Please revise the script and retry."))
    }
    dataListData <- lapply(dataListText, parseSPSS_DATALIST)
    allVars <- as.character(unlist(lapply(dataListData, function(d) {
      d$DataList$VariableName
    })))
    retList$DataList <- dataListData
  }

  # the remaining commands can only be resolved against the DATA LIST variables
  requireDataList <- function(what) {
    if (length(allVars) == 0) {
      stop(paste0("DATA LIST Command required in script for parsing ", what, "."))
    }
  }

  if (length(variableLabelText) > 0) {
    requireDataList("variable labels")
    if (verbose) {
      cat(paste0("Parsing Variable Labels...\n"))
    }
    retList$VariableLabels <- lapply(variableLabelText, function(txt) {
      parseSPSS_VARIABLELABEL(txt, allVars, verbose)
    })
  }

  if (length(missingValueText) > 0) {
    requireDataList("missing values")
    if (verbose) {
      cat(paste0("Parsing Missing Values...\n"))
    }
    retList$MissingValues <- lapply(missingValueText, function(txt) {
      parseSPSS_MISSINGVALUES(txt, allVars)
    })
  }

  if (length(valueLabelText) > 0) {
    requireDataList("value labels")
    if (verbose) {
      cat(paste0("Parsing Value Labels...\n"))
    }
    retList$ValueLabels <- lapply(valueLabelText, function(txt) {
      parseSPSS_VALUELABEL(txt, allVars)
    })
  }

  outDF <- formatSPSS_Output(retList, outputFormat = outputFormat)
  return(outDF)
}
# Removes SPSS comments from a script held as a single character string.
#
# spsTxt: the script text.
# Returns the text with the following comment forms removed and surrounding
# whitespace trimmed:
#   1. '*' comments that run to a terminating '.' at the end of a line
#   2. '/* ... */' block comments
#   3. COMMENT commands that run to the next '.'
# Any '*' found inside a single- or double-quoted string is preserved.
parseSPSS_RemoveComments <- function(spsTxt) {
  # placeholder that temporarily stands in for a '*' inside quoted text so the
  # comment expressions below cannot mistake it for the start of a comment
  rVal <- "|~{?}~|"
  txt <- spsTxt # working copy of text

  # mask any '*' characters that are inside double quotes
  dblQ_match <- gregexpr("([\"]).+?([\"])", txt)
  dblQ_txt <- regmatches(txt, dblQ_match)
  dblQ_txt <- lapply(dblQ_txt, function(quoted) {
    gsub("*", rVal, x = quoted, fixed = TRUE)
  })
  regmatches(txt, dblQ_match) <- dblQ_txt # apply change back

  # mask any '*' characters that are inside single quotes
  sngQ_match <- gregexpr("([\']).+?([\'])", txt)
  sngQ_txt <- regmatches(txt, sngQ_match)
  sngQ_txt <- lapply(sngQ_txt, function(quoted) {
    gsub("*", rVal, x = quoted, fixed = TRUE)
  })
  regmatches(txt, sngQ_match) <- sngQ_txt # apply change back

  # regular comments start line with (*) and end in a period, this is most typical
  # also can accept '\*' and ends in '.'
  commentRE1 <- "(^|\\s*|\\/)[*](.|\\n)+?[.]\\s*?(\\n|$)"
  m <- gregexpr(commentRE1, txt, perl = TRUE)
  regmatches(txt, m) <- ""

  # Block comments start with (/*) and end with a terminator (*/)
  commentRE2 <- "(\\/[*])(.|\\n)+?([*]\\/)"
  m <- gregexpr(commentRE2, txt, perl = TRUE, ignore.case = TRUE)
  regmatches(txt, m) <- ""

  # The COMMENT command starts the comment and ends with a period (.)
  commentRE3 <- "(^|\\s)(COMMENT)(.|\\n)+?([.])"
  m <- gregexpr(commentRE3, txt, perl = TRUE, ignore.case = TRUE)
  regmatches(txt, m) <- ""

  txt <- trimws(txt)
  # restore the masked '*' characters
  txt <- gsub(rVal, "*", txt, fixed = TRUE)
  return(txt)
}
# Parses one FILE HANDLE command.
#
# spsTxt: the text of a single FILE HANDLE command.
# Returns a named list: 'FILEHANDLE' holds the handle name, followed by one
# element for each of the NAME, MODE, RECFORM, LRECL and ENCODING subcommands
# found in the text (surrounding quotes are stripped from the value).
# A subcommand value is read up to the next whitespace character.
parseSPSS_FILEHANDLE <- function(spsTxt) {
  txt <- gsub("(FILE HANDLE)", "", spsTxt, ignore.case = TRUE)
  txt <- sub("[.]\\s*\\n$", "", txt)

  hnRE <- regexpr("^\\s*(\\w*){1}", txt) # get first word (alphanumeric and _ characters)
  hnTxt <- regmatches(txt, hnRE)
  handleName <- trimws(gsub("/", "", hnTxt, fixed = TRUE))
  retList <- list(FILEHANDLE = handleName)

  # drop the handle name so only the subcommands remain; it is literal text, not a pattern
  txt <- sub(hnTxt, "", txt, fixed = TRUE)

  subcommands <- c("NAME", "MODE", "RECFORM", "LRECL", "ENCODING")
  for (sc in subcommands) {
    re <- paste0("(", sc, ")", ".*?(\\s|$)")
    reM <- regexpr(re, txt, ignore.case = TRUE)
    if (reM > 0) { # match found for subcommand
      scTxt <- trimws(regmatches(txt, reM))
      scTxt <- sub(sc, "", scTxt, ignore.case = TRUE) # remove the subcommand text
      scTxt <- sub("=", "", scTxt, fixed = TRUE) # remove the first '=' assignment character
      # is the command value quoted?
      if (grepl("^[\"'].*[\"']$", scTxt)) {
        scTxt <- sub("^[\"']", "", scTxt) # remove opening double or single quote character
        scTxt <- sub("[\"']$", "", scTxt) # remove ending double or single quote character
      }
      retList[[sc]] <- trimws(scTxt) # add param to the return list
    } # end if(reM > 0)
  } # end for(sc in subcommands)

  return(retList)
}
# Parses one fixed-format DATA LIST command.
#
# spsTxt: the text of a single DATA LIST command.
# Returns a list with:
#   FileHandle   - the handle named by the FILE= subcommand (zero-length if absent)
#   RecordNumber - the number of '/'-separated record sections in the command
#   DataList     - a data.frame of the variable definitions (see
#                  parseSPSS_DATALIST_Items) plus, for multi-record commands,
#                  a RecordNumber column giving the record each variable is on
# Stops if the command uses the FREE or LIST formats, or defines no records.
parseSPSS_DATALIST <- function(spsTxt) {
  # the first token is the command and its subcommands, each later token is one record
  tokens <- spss_CustomSplitter(spsTxt, splitChar = "/")
  cmdToken <- tokens[1]
  dataTokens <- tokens[-1]

  # remove the 'DATA LIST' command
  cmdToken <- sub("^\\s*DATA LIST", "", cmdToken, ignore.case = TRUE)

  fileM <- regexpr("FILE\\s*[=]\\s*\\w+", cmdToken, ignore.case = TRUE)
  fileTxt <- regmatches(cmdToken, fileM)
  fileTxt <- sub("FILE", "", fileTxt, ignore.case = TRUE)
  fileTxt <- sub("=", "", fileTxt, fixed = TRUE)
  fileHandle <- trimws(fileTxt) # the file handle associated with this data list command

  regmatches(cmdToken, fileM) <- "" # remove our file = xxxx from the argument, leaves remaining commands
  cmdToken <- trimws(cmdToken)

  # FIXED is the DEFAULT format subcommand, so not necessary be in string
  # check if either FREE or LIST subcommands specified as we don't handle them
  badM <- regexpr("\\s*(FREE|LIST)\\s*", cmdToken, ignore.case = TRUE)
  if (isTRUE(badM > 0)) {
    stop("Unable to parse a DATA LIST command using FREE or LIST subcommands.")
  }

  recN <- length(dataTokens)
  if (recN == 0) {
    stop("Unable to parse a DATA LIST command: no variable definitions were found after a '/' separator.")
  }

  if (recN > 1) { # split the string by the table records for processing
    for (i in seq_along(dataTokens)) {
      # remove the leading record number, line breaks and the ending '.' terminator
      chunkTxt <- sub("^\\d+", "", dataTokens[i])
      chunkTxt <- gsub("\n", "", chunkTxt)
      chunkTxt <- sub("[.]\\s*?[\n]$", "", chunkTxt)

      tmpItem <- parseSPSS_DATALIST_Items(chunkTxt)
      tmpItem$RecordNumber <- i
      if (i == 1) {
        items <- tmpItem
      } else {
        items <- base::rbind(items, tmpItem)
      }
    }
  } else { # single record only. may or may not have record number as part of the listing
    # remove line breaks and the ending '.' terminator
    dataTokens <- gsub("\n", "", dataTokens)
    dataTokens <- sub("[.]\\s*?[\n]$", "", dataTokens)
    items <- parseSPSS_DATALIST_Items(dataTokens)
  }

  # ensure all names are valid and not duplicated. this must happen BEFORE the
  # output object is built, otherwise the cleaned names never reach the caller
  items$VariableName <- make.names(items$VariableName, unique = TRUE)

  # define output object
  outputList <- list(
    FileHandle = fileHandle,
    RecordNumber = recN,
    DataList = items
  )
  return(outputList)
}
# Parses the variable definitions of one DATA LIST record.
#
# txt: the text of a single record section, e.g. "id 1-4 name 5-14 (A) flag 15".
# Returns a data.frame with one row per variable, ordered by start position
# and passed through parseSPSS_fixSpacingFWF, with the columns:
#   VariableName, StartPosition, EndPosition, Width, Attributes
parseSPSS_DATALIST_Items <- function(txt) {
  # tokens begin with a word, then digits for starting position
  # usually it has a '-' character and another digit to signify ending position, but not always
  # lastly, there may be additional attributes enclosed in parenthesis to signify alphanumeric (A)
  # or numeric formatting precision (6)
  tokensM <- gregexpr("(\\w|[_])+\\s*\\d+(\\s*[-]\\s*\\d+){0,1}\\s*([(]\\w+[)]){0,1}", txt)
  tokens <- regmatches(txt, tokensM)[[1]]

  # varname is the first word of the token
  varsM <- regexpr("(\\w|[_])+", tokens)
  vars <- trimws(regmatches(tokens, varsM))
  regmatches(tokens, varsM) <- "" # remove the variables now that we have them isolated

  # attributes are enclosed in parens e.g., (A), (6)
  attribM <- regexpr("([(]\\w+[)])", tokens)
  attrib <- regmatches(tokens, attribM)
  regmatches(tokens, attribM) <- "" # remove the attributes from tokens

  # cleanup the attributes as not all variables have them defined
  attribClean <- rep("", times = length(vars))
  attribClean[attribM > 0] <- trimws(attrib) # maintains ordering

  # the first number left in the token is the start position
  startM <- regexpr("\\d+", tokens)
  startPos <- as.numeric(regmatches(tokens, startM))
  regmatches(tokens, startM) <- ""

  # remove the '-' from the string now to isolate the end position
  tokens <- sub("-", "", tokens, fixed = TRUE)
  endPos <- as.numeric(trimws(tokens)) # a blank remainder converts to NA

  # if there is no ending position defined, use the start position (length of 1).
  # only the undefined entries are filled; defined end positions are kept
  noEnd <- is.na(endPos)
  endPos[noEnd] <- startPos[noEnd]

  width <- (endPos - startPos) + 1

  # build output object
  df <- data.frame(
    VariableName = vars,
    StartPosition = startPos,
    EndPosition = endPos,
    Width = width,
    Attributes = attribClean,
    stringsAsFactors = FALSE
  )

  # fix FWF spacing to ensure there are no gaps in the definition
  df <- parseSPSS_fixSpacingFWF(df[order(startPos), ])
  return(df)
}
# Parses one VARIABLE LABELS command.
#
# spsTxt:   the spss script block to be parsed.
# varNames: the variable names defined that will be used for matching.
# verbose:  accepted for interface compatibility; not consulted by this function.
# Returns a data.frame with one row per entry of 'varNames' (original order),
# with columns 'varNames' (lower-cased) and 'varLabel' (NA when the command
# defines no label for that variable). A label continued on following lines
# with '+' is concatenated onto the label it continues. When a variable is
# labelled more than once a warning is raised and the last label is kept.
parseSPSS_VARIABLELABEL <- function(spsTxt, varNames, verbose = FALSE) {
  # prep: remove the command text and the terminator from the string
  spsTxt <- sub("^\\s*VARIABLE LABEL(S){0,1}", "", spsTxt, ignore.case = TRUE)
  spsTxt <- sub("[.]\\s*(\n|$)", "", spsTxt)

  retDF <- data.frame(sortOrder = seq_along(varNames), varNames = tolower(varNames), stringsAsFactors = FALSE)

  # each token is a word (or a '+' continuation marker) followed by a quoted label, e.g.:
  # + " TO 1988"
  # country " Country three-digit ISO code"
  # schoolid " School ID (unique)"
  tokensM <- gregexpr("(\\w|[+])+\\s*((\"(?:\\\\.|[^\"\\\\])*?\")|(\'(?:\'.|[^\'])*?\'))(\\s|\\n|$)", spsTxt, perl = TRUE)
  tokens <- trimws(unlist(regmatches(spsTxt, tokensM)))

  # grab the varname
  wordValM <- regexpr("(\\w|_|[+])+\\s*", tokens)
  wordVal <- trimws(unlist(regmatches(tokens, wordValM)))

  # fold '+' continuation tokens into the label they continue
  idx <- which(wordVal == "+")
  if (length(idx) > 0) {
    appendTokens <- tokens[idx]
    appendIdx <- rep(0, times = length(appendTokens))

    # walk backwards from each continuation to the variable token it belongs to
    for (i in rev(seq_along(idx))) {
      ii <- idx[i] - 1
      while (ii >= 1 && wordVal[ii] == "+") {
        ii <- ii - 1
      }
      appendIdx[i] <- ii # stays 0 when nothing precedes the continuation
    }

    # cleanup the appending tokens
    # remove the '+'/append character from the token text, then its quotes
    appendTokens <- trimws(sub("+", "", appendTokens, fixed = TRUE))
    appendTokens <- substr(appendTokens, 2, nchar(appendTokens) - 1)

    # apply the appending tokens
    for (i in seq_along(appendIdx)) {
      if (appendIdx[i] < 1) {
        next # continuation with no label to continue; nothing to append to
      }
      iT <- tokens[appendIdx[i]]
      aT <- appendTokens[i]
      # create the new token string with the appended text placed before the closing quote
      iT <- paste0(substr(iT, 1, nchar(iT) - 1), aT, substr(iT, nchar(iT), nchar(iT)))
      tokens[appendIdx[i]] <- iT # apply the new label back to the variable
    }

    # remove the appending tokens from the words and tokens
    tokens <- tokens[-idx]
    wordVal <- wordVal[-idx]

    # the match needs to then be recalculated as it can't be simply dropped
    wordValM <- regexpr("(\\w|_|[+])+\\s*", tokens)
  }

  # clear out the word from the token
  regmatches(tokens, wordValM) <- ""
  tokens <- trimws(tokens)

  # strip off the beginning/ending quote characters (either double or single quotes)
  tokens <- substr(tokens, 2, nchar(tokens) - 1)

  # cleanup any text escapes
  tokens <- gsub("''", "'", tokens, fixed = TRUE) # a ('') is escaped to just a single quote (')
  tokens <- gsub("\\\"", "\"", tokens, fixed = TRUE) # a (\") is escaped to just a double quote (")
  tokens <- gsub("\\n", "", tokens, fixed = TRUE) # a (\n) should be removed from the token, we don't want the wrapping

  # create a data.frame to help merging later
  tokenDF <- data.frame(varNames = tolower(wordVal), varLabel = trimws(tokens), stringsAsFactors = FALSE)

  # check for duplicates in vars and reconcile them
  if (anyDuplicated(tokenDF$varNames)) {
    dupeVars <- unique(tokenDF$varNames[duplicated(tokenDF$varNames)])
    warning(paste0(
      "Duplicate Variable Labels Defined For Variable(s): ", paste(sQuote(dupeVars), collapse = ","),
      "\n", "The last defined variable label will be used."
    ))
    for (dv in dupeVars) {
      rowIdx <- which(tokenDF$varNames == dv)
      rowIdx <- rowIdx[rowIdx != max(rowIdx)] # every occurrence except the last
      tokenDF <- tokenDF[-rowIdx, ]
    }
  }

  retDF <- merge(retDF, tokenDF, all.x = TRUE, all.y = FALSE, by = "varNames")
  retDF <- retDF[order(retDF$sortOrder), ] # re-order based on the original sort order
  retDF$sortOrder <- NULL # drop the sorting
  row.names(retDF) <- seq_len(nrow(retDF)) # renumber the row.names

  return(retDF)
}
# Parses one VALUE LABELS command.
#
# spsTxt:   the text of a single VALUE LABELS command.
# varNames: the variable names defined by the DATA LIST command.
# Returns a data.frame with one row per entry of 'varNames' and the columns:
#   variableName - the lower-cased variable name
#   valType      - 'character' when the labelled values are quoted, 'numeric'
#                  when they are bare numbers, '' when the command does not
#                  mention the variable
#   valLblDesc   - the value labels formatted as 'value=label' pairs that are
#                  delimited by '^' ('' when none)
# The 'A TO B' form is expanded to every variable positioned from A to B.
# Numeric range definitions (e.g. 1 - 4) are left out of 'valLblDesc'.
parseSPSS_VALUELABEL <- function(spsTxt, varNames) {
  retDF <- data.frame(
    variableName = tolower(varNames),
    valType = rep("", times = length(varNames)),
    valLblDesc = rep("", times = length(varNames)),
    stringsAsFactors = FALSE
  )

  # Check if 'TO' keyword is used. vars will be a length of 3
  # e.g., "V1 to V3" OR Var1 TO Var10
  # when either end of the range is not a known variable the names are returned as given
  expandVarRange <- function(vars, allNames) {
    if (length(vars) == 3 && grepl("^to$", vars[2], ignore.case = TRUE)) {
      p1 <- which(allNames == vars[1])
      p2 <- which(allNames == vars[3])
      if (length(p1) > 0 && length(p2) > 0) {
        vars <- allNames[p1[1]:p2[1]]
      }
    }
    vars
  }

  spsTxt <- sub("^\\s*VALUE LABEL(S){0,1}", "", spsTxt, ignore.case = TRUE)
  tokens <- spss_CustomSplitter(spsTxt, splitChar = "/")
  tokens <- trimws(tokens)
  tokens <- tokens[nchar(tokens) > 0] # drop any blank tokens

  # define the regular expression for grabbing both the quoted value, and the quoted value label
  quotedDef_Regex <- "([\'\"]).+?([\'\"])\\s+[\'\"].+[\'\"](\\s|\\n|$)(\\s*[+]\\s*[\'\"].+[\'\"](\\s|\\n|$)){0,5}"

  # get the index to the quoted assignment definition, if not quoted, then it's numeric
  quotedTokenIdx <- which(grepl(quotedDef_Regex, tokens, ignore.case = TRUE, perl = TRUE))
  quotedTokens <- tokens[quotedTokenIdx]

  # numeric tokens will have a number assignment that's not within quotes for a label value
  if (length(quotedTokenIdx) == 0) {
    numericTokens <- tokens
  } else {
    numericTokens <- tokens[-quotedTokenIdx]
  }

  # parse the quoted tokens first
  if (length(quotedTokens) > 0) {
    quotedM <- gregexpr(quotedDef_Regex, quotedTokens, ignore.case = TRUE, perl = TRUE)
    tokenTemp <- regmatches(quotedTokens, quotedM) # this is now the quoted value along with the quoted label
    regmatches(quotedTokens, quotedM) <- "" # remove the assignment from the token::leaves only the variables left
    quotedTokens <- trimws(quotedTokens)

    quotedM <- gregexpr("[A-z]\\w*", quotedTokens, ignore.case = TRUE, perl = TRUE)
    qVars <- regmatches(quotedTokens, quotedM) # list of jagged vectors containing the associated variable names

    for (i in seq_along(tokenTemp)) {
      z <- tokenTemp[[i]]
      if (length(z) == 0) {
        next # no value/label definition was recognised in this token
      }
      vars <- tolower(qVars[[i]])

      # the leading quoted text is the value; what remains is its label
      m <- gregexpr("^[\'\"].*?[\'\"]", z, ignore.case = TRUE, perl = TRUE)
      valLbl <- regmatches(z, m)
      valLbl <- substr(valLbl, 2, nchar(valLbl) - 1)
      regmatches(z, m) <- ""
      z <- trimws(z)

      lblLbl <- substr(z, 2, nchar(z) - 1)
      lblLbl <- gsub("[\'\"]\\s*[+]\\s*[\'\"]", "", lblLbl) # for any line concatenations we want to remove and leave full string
      # ensure our collapse character isn't present in the text!
      lblLbl <- gsub("^", "", lblLbl, fixed = TRUE)

      lblFormatted <- paste(valLbl, lblLbl, sep = "=", collapse = "^")

      vars <- expandVarRange(vars, retDF$variableName)
      retDF[retDF$variableName %in% vars, "valType"] <- "character"
      retDF[retDF$variableName %in% vars, "valLblDesc"] <- lblFormatted
    }
  } # end if (length(quotedTokens) > 0)

  # then process the numeric tokens
  # allows for up to 5 concatenations to the label!!
  if (length(numericTokens) > 0) {
    numericDef_Regex <- "\\s+(\\d*|[.]\\d*)\\s*([-]){0,1}\\s*(\\d+|[.]\\d+)\\s+[\'\"].+[\'\"](\\s|\\n|$)(\\s*[+]\\s*[\'\"].+[\'\"](\\s|\\n|$)){0,5}"
    numericM <- gregexpr(numericDef_Regex, numericTokens, ignore.case = TRUE, perl = TRUE)
    tokenTemp <- regmatches(numericTokens, numericM)
    regmatches(numericTokens, numericM) <- "" # remove the assignment from the token::leaves only the variables left
    numericTokens <- trimws(numericTokens)

    quotedM <- gregexpr("[A-z]\\w*", numericTokens, ignore.case = TRUE, perl = TRUE)
    qVars <- regmatches(numericTokens, quotedM) # list of jagged vectors containing the associated variable names

    for (i in seq_along(tokenTemp)) {
      z <- tokenTemp[[i]]
      if (length(z) == 0) {
        next # no value/label definition was recognised in this token
      }
      z <- trimws(z) # matches generally have a leading space here due to the regex specification!
      vars <- tolower(qVars[[i]])

      # the leading number (or number range) is the value; what remains is its label
      m <- gregexpr("^(\\d*|[.]\\d*)\\s*([-]){0,1}\\s*(\\d+|[.]\\d+)", z, ignore.case = TRUE, perl = TRUE)
      valLbl <- trimws(regmatches(z, m)) # trimws unlists here!
      regmatches(z, m) <- ""
      z <- trimws(z)

      lblLbl <- substr(z, 2, nchar(z) - 1)
      lblLbl <- gsub("[\'\"]\\s*[+]\\s*[\'\"]", "", lblLbl) # for any line concatenations we want to remove and leave full string
      # ensure our collapse character isn't present in the text!
      lblLbl <- gsub("^", "", lblLbl, fixed = TRUE)

      # check for any numeric ranges, exclude them if found (e.g. 0 - 100; 1-4)
      include <- !grepl("\\d+\\s*[-]\\s*\\d+", valLbl, ignore.case = TRUE)

      # ranges are excluded here
      lblFormatted <- paste(valLbl[include], lblLbl[include], sep = "=", collapse = "^")

      vars <- expandVarRange(vars, retDF$variableName)
      retDF[retDF$variableName %in% vars, "valType"] <- "numeric"
      retDF[retDF$variableName %in% vars, "valLblDesc"] <- lblFormatted
    }
  } # end if (length(numericTokens) > 0)

  return(retDF)
}
# Splits text on a single character while leaving quoted text intact.
#
# inTxt:         a single character string to split.
# splitChar:     the one-character delimiter to split on (default '/').
# nextCharRegEx: optional regular expression the character FOLLOWING the
#                delimiter must match for the split to happen (the end of the
#                text is treated as being followed by a line break). Useful if
#                the splitChar needs additional validation.
# Returns a character vector of the pieces, without the delimiters. Text that
# has no delimiter and no quote characters is returned unchanged as one piece.
# Delimiters inside double or single quotes are ignored. Within double quotes
# a quote preceded by a backslash does not end the string; within single
# quotes a doubled quote ('') is an escaped quote and does not end the string.
# If the text ends inside an unterminated quote the trailing piece is not returned.
spss_CustomSplitter <- function(inTxt, splitChar = "/", nextCharRegEx = NULL) {
  checkNextChar <- is.character(nextCharRegEx)

  charArr <- strsplit(inTxt, "")[[1]]
  endPos <- length(charArr)

  # only the delimiter and the quote characters can change the parsing state
  checkChars <- which(charArr %in% c(splitChar, "\"", "'"))
  if (length(checkChars) == 0) {
    return(inTxt) # nothing to split on, the text is a single token
  }

  tokenStart <- 1
  allTokens <- c()
  inDblQuotes <- FALSE
  inSglQuotes <- FALSE
  skipPos <- 0 # position of the second quote of an escaped '' pair
  lastIdx <- length(checkChars)

  for (i in seq_len(lastIdx)) {
    pos <- checkChars[i]

    if (pos != skipPos) {
      x <- charArr[pos]
      prevX <- if (pos > 1) charArr[pos - 1] else ""
      nextX <- if (pos < endPos) charArr[pos + 1] else "\n"

      if (x == "\"" && !inSglQuotes && prevX != "\\") {
        inDblQuotes <- !inDblQuotes
      }
      if (x == "'" && !inDblQuotes) {
        if (inSglQuotes && nextX == "'") {
          skipPos <- pos + 1 # escaped quote: stay inside the string
        } else {
          inSglQuotes <- !inSglQuotes
        }
      }
      if (x == splitChar && !inDblQuotes && !inSglQuotes) {
        nextCharOK <- !checkNextChar || grepl(nextCharRegEx, nextX, ignore.case = TRUE)
        if (nextCharOK) {
          allTokens <- c(allTokens, substr(inTxt, tokenStart, pos - 1))
          tokenStart <- pos + 1
        }
      }
    }

    if (i == lastIdx && !inDblQuotes && !inSglQuotes) { # end of the text, be sure to add it
      allTokens <- c(allTokens, substr(inTxt, tokenStart, endPos))
    }
  }

  return(allTokens)
}
# Parses one MISSING VALUES command to know which values are considered
# 'missings' for analysis.
#
# spsTxt:   the text of a single MISSING VALUES command.
# varNames: the variable names defined by the DATA LIST command.
# Returns a data.frame with one row per entry of 'varNames' and the columns:
#   variableName  - the name as supplied in 'varNames'
#   missingValues - the values inside the parentheses with ',' changed to ';'
#                   ('' when the command does not mention the variable)
#   valueType     - 'character' when the values are quoted, otherwise
#                   'numeric' ('' when the variable is not mentioned)
# The 'A TO B' form is expanded to every variable positioned from A to B.
parseSPSS_MISSINGVALUES <- function(spsTxt, varNames) {
  spsTxt <- sub("^\\s*MISSING VALUE(S){0,1}", "", spsTxt, ignore.case = TRUE)

  retDF <- data.frame(
    variableName = varNames,
    missingValues = rep("", times = length(varNames)),
    valueType = rep("", times = length(varNames)), stringsAsFactors = FALSE
  )
  lowerNames <- tolower(retDF$variableName)

  tokens <- spss_CustomSplitter(spsTxt, splitChar = "/")
  tokens <- trimws(tokens)
  tokens <- gsub("\n", "", tokens, fixed = TRUE)
  tokens <- paste0(tokens, collapse = "\n")

  # each definition is one or more variable names followed by the values in parentheses
  m <- gregexpr("(\\w+\\s*){1,}[(](\\s|\\w|[ ,]|[\"'])+[)]", tokens)
  valDef <- regmatches(tokens, m)
  valDef <- trimws(valDef[[1]]) # from list to vector

  for (i in seq_along(valDef)) {
    iVals <- valDef[[i]]

    m <- gregexpr("[(].*[)]", iVals) # get the part in the parenthesis
    missVals <- regmatches(iVals, m)[[1]] # will only have one value
    assignVars <- trimws(gsub(missVals, "", iVals, fixed = TRUE))

    missVals <- gsub("([(]|[)])", "", missVals)
    isQuoted <- grepl("[\"']", missVals)
    missVals <- gsub("(\"|')", "", missVals) # remove the quotes from the string

    # change the value separator from a ',' to ';' character
    missVals <- gsub(",", ";", missVals, fixed = TRUE)

    # split out all tokens of the variables
    m <- gregexpr("\\w+", assignVars)
    assignVars <- trimws(regmatches(assignVars, m)[[1]])

    # need to expand the variables that have the 'TO' range keyword
    idxTO <- which(grepl("^to$", assignVars, ignore.case = TRUE))
    if (length(idxTO) > 0) {
      appendVars <- c()
      # expand the variables to include the variables defined between the vars
      for (idxVal in idxTO) {
        if (idxVal <= 1 || idxVal >= length(assignVars)) {
          next # 'TO' without a variable on both sides; nothing to expand
        }
        aVarRow <- which(lowerNames == tolower(assignVars[idxVal - 1]))
        bVarRow <- which(lowerNames == tolower(assignVars[idxVal + 1]))
        # only expand when both ends of the range are known variables
        if (length(aVarRow) > 0 && length(bVarRow) > 0) {
          appendVars <- c(appendVars, retDF$variableName[aVarRow[1]:bVarRow[1]])
        }
      }
      assignVars <- assignVars[-idxTO] # drop the 'TO' variables
      assignVars <- c(assignVars, appendVars) # add the 'range' variables
    }

    isAssigned <- lowerNames %in% tolower(assignVars)
    retDF$missingValues[isAssigned] <- missVals
    retDF$valueType[isAssigned] <- if (isQuoted) "character" else "numeric"
  }

  return(retDF)
}
# Splits the script up into its command parts by the ending '.', then cleans
# up the text a bit.
#
# spsTxt: the whole script as a single character string.
# Returns a character vector with one trimmed element per command. A '.' only
# ends a command when it is followed by whitespace or the end of the text, so
# decimal points inside numbers are kept. Blank items and items that begin
# with '*' or COMMENT (comments) are dropped.
parseSPSS_COMMANDS <- function(spsTxt) {
  items <- spss_CustomSplitter(spsTxt, splitChar = ".", nextCharRegEx = "(\\s|\\n)")
  items <- trimws(items) # trim any spacing before or after
  items <- items[nchar(items) > 0]

  # drop any items beginning with a '*', or uses the COMMENT command, as those are comments
  items <- items[!grepl("^([*]|COMMENT)", items, ignore.case = TRUE)]

  return(items)
}
# Creates a formatted output object from the parsed script parts.
#
# resList:      the list of parsed command results built by parseScript_SPSS.
# outputFormat: the name of the output format; only 'data.frame' is supported.
# Returns the object built for the requested format, and stops for any other
# format rather than silently returning NULL.
formatSPSS_Output <- function(resList, outputFormat) {
  if (outputFormat == "data.frame") {
    return(formatSPSS_Output_dataframe(resList))
  }
  stop(paste0("Unsupported ", sQuote("outputFormat"), " value: ", dQuote(outputFormat)))
}
# Combines the parsed command results into a single data.frame.
#
# resList: list that may hold 'DataList', 'VariableLabels', 'ValueLabels' and
#          'MissingValues' elements, each a list with one entry per command.
# Returns the DATA LIST definition with its columns renamed to variableName,
# Start, End, Width and Attributes, plus the columns Labels, labelValues,
# dataType and missingValues. The added columns are always present and are ''
# where the script supplies nothing, except that 'dataType' is completed by
# parseSPSS_validateDataType. When several commands of one kind exist, a later
# command never erases a value set by an earlier one with an empty value.
formatSPSS_Output_dataframe <- function(resList) {
  # at this time only one 'DATA LIST' command is supported, use as the basis for the data.frame
  if (length(resList$DataList) == 0) {
    stop("DATA LIST Command required in script for building the output data.frame.")
  }
  df <- resList$DataList[[1]]$DataList

  # rename to match expected column names for EdSurvey
  names(df)[grepl("variablename", names(df), ignore.case = TRUE)] <- "variableName"
  names(df)[grepl("startposition", names(df), ignore.case = TRUE)] <- "Start"
  names(df)[grepl("endposition", names(df), ignore.case = TRUE)] <- "End"
  names(df)[grepl("width", names(df), ignore.case = TRUE)] <- "Width"
  names(df)[grepl("attributes", names(df), ignore.case = TRUE)] <- "Attributes"

  dfNames <- tolower(df$variableName)

  # copies 'incoming' (one value per parsed row) into 'current' at the matched
  # df rows. rows with no match are skipped, and an empty/NA incoming value
  # never replaces a value that is already filled in
  fillColumn <- function(current, rowIdx, incoming) {
    isBlank <- function(v) {
      is.na(v) | !nzchar(v)
    }
    incoming <- as.character(incoming)
    found <- !is.na(rowIdx)
    rowIdx <- rowIdx[found]
    incoming <- incoming[found]
    overwrite <- !isBlank(incoming) | isBlank(current[rowIdx])
    current[rowIdx[overwrite]] <- incoming[overwrite]
    current
  }

  df$Labels <- rep("", times = nrow(df))
  for (x in resList$VariableLabels) {
    m <- match(tolower(x$varNames), dfNames)
    df$Labels <- fillColumn(df$Labels, m, x$varLabel)
  }

  df$labelValues <- rep("", times = nrow(df))
  df$dataType <- rep("", times = nrow(df))
  for (x in resList$ValueLabels) {
    m <- match(tolower(x$variableName), dfNames)
    df$labelValues <- fillColumn(df$labelValues, m, x$valLblDesc)
    df$dataType <- fillColumn(df$dataType, m, x$valType)
  }

  df$missingValues <- rep("", times = nrow(df))
  for (x in resList$MissingValues) {
    m <- match(tolower(x$variableName), dfNames)
    df$missingValues <- fillColumn(df$missingValues, m, x$missingValues)
  }

  df <- parseSPSS_validateDataType(df)
  return(df)
}
# Validates the data.frame's 'dataType' column to ensure it has a value, and is
# consistent with the attributes from the data list.
#
# df: data.frame with an 'Attributes' column and, optionally, a 'dataType' column.
# Returns 'df' with every undefined 'dataType' entry (column absent, NA or '')
# filled in: 'numeric' when the variable's Attributes contain a digit (a
# numeric format), otherwise 'character'. Defined entries are left untouched.
parseSPSS_validateDataType <- function(df) {
  dataType <- df[["dataType"]]
  if (is.null(dataType)) {
    dataType <- rep("", times = nrow(df))
  }
  dataType <- as.character(dataType)

  undefined <- is.na(dataType) | !nzchar(dataType)
  hasNumericFormat <- grepl("\\d", df[["Attributes"]])

  # if no dataType defined and has a numeric format attribute consider it numeric
  dataType[undefined & hasNumericFormat] <- "numeric"
  # otherwise if no data type, it will be classified as character
  dataType[undefined & !hasNumericFormat] <- "character"

  df$dataType <- dataType
  return(df)
}
# Takes a built data.frame from the parseSPSS_DATALIST_Items function and
# ensures there are no gaps in the FWF spacing. If gaps exist then it fills
# them with filler columns for proper spacing.
#
# df: data.frame ordered by start position with the columns VariableName,
#     StartPosition, EndPosition, Width and Attributes.
# Returns 'df' unchanged when every variable starts directly after the
# previous one ends (and the first starts at position 1). Otherwise a
# 'zzz_Filler' row with the '(A)' attribute is inserted ahead of each row
# whose width differs from the distance to the previous end position, the
# start/end positions are recomputed from the widths, and the variable names
# are made syntactically valid and unique.
# NOTE(review): overlapping definitions also differ from that distance and
# would produce a filler with a negative width - confirm whether the scripts
# this is used with can redefine columns.
parseSPSS_fixSpacingFWF <- function(df) {
  if (nrow(df) == 0) {
    return(df) # nothing defined, nothing to space out
  }

  # get the expected widths based on the end positions, returns a vector
  endPos <- df$EndPosition
  df$TestWidth <- c(endPos[1], endPos[-1] - endPos[-length(endPos)])

  # determine which rows do not line up with the previous end position
  issues <- which(df$Width != df$TestWidth)

  # shortcut out if no problems
  if (length(issues) == 0) {
    df$TestWidth <- NULL # remove temporary column
    return(df)
  }

  # reverse issue order so that the remaining row indexes stay correct
  for (issue in rev(issues)) {
    # issue is a row index with a problem
    # make a new row
    newrow <- df[issue, ]
    # set the width to fill the gap
    newrow$Width <- newrow$TestWidth - newrow$Width
    newrow$VariableName <- "zzz_Filler"
    newrow$Attributes <- "(A)" # make sure it can be read in

    # shift the rows from 'issue' onward down by one, then place the filler
    shiftRows <- issue:nrow(df)
    df[shiftRows + 1, ] <- df[shiftRows, ]
    df[issue, ] <- newrow
  }

  # recalibrate the start/end positions after placing in the filler columns
  df$StartPosition <- c(1, 1 + cumsum(df$Width))[seq_len(nrow(df))]
  df$EndPosition <- cumsum(df$Width)
  df$TestWidth <- NULL # remove our temporary column

  # ensure unique names
  df$VariableName <- make.names(df$VariableName, unique = TRUE)
  return(df)
}
