R/oplike.R
In dexter: Data Management and Analysis of Tests

Documented in read_oplm_par start_new_project_from_oplm

#' Start a new project from oplm files
#'
#' Creates a dexter project database and fills it with response data based on a .dat and .scr file
#'
#'
#' @param dbname filename/path of new dexter project database (will be overwritten if already exists)
#' @param scr_path path to the .scr file
#' @param dat_path path to the .dat file
#' @param booklet_position vector of start and end of booklet position in the dat file, e.g. c(1,4), 
#' all positions are counted from 1, start and end are both inclusive. If NULL, this is read from the scr file.
#' @param responses_start start position of responses in the .dat file. If NULL, this is read from the scr file.
#' @param response_length length of individual responses, default=1
#' @param person_id optionally, a vector of start and end position of person_id in the .dat file.
#' If NULL, person id's will be auto-generated.
#' @param missing_character vector of character(s) used to indicate missing in .dat file, 
#' default is to use both a space and a 9 as missing characters.
#' @param use_discrim if TRUE, the scores for the responses will be multiplied by the
#' discrimination parameters of the items
#' @param skip_invalid_booklets whether to skip lines that have a booklet number that is not listed in the .scr file. This supports 
#' an extremely poor but unfortunately common practice of excluding students by changing their booklets id's or inserting random 
#' comments in the data file. If `skip_invalid_booklets` is TRUE such lines will be skipped, if FALSE they will generate an error
#' @return a database connection object.
#' 
#' @details
#' start_new_project_from_oplm builds a complete dexter database from a .dat and .scr file in
#' the proprietary oplm format. Four custom variables are added to the database: 
#' booklet_on_off, oplm_marginal, item_local_on_off, item_global_on_off. These are taken from the .scr file
#' and can be used in predicates in the various dexter functions.
#' 
#' `booklet_position` and `responses_start` can be inferred from the .scr file but since they
#' are sometimes misspecified in the .scr file they can be overridden. `response_length` is not 
#' inferred from the .scr file since any value other than 1 is usually a mistake.
#' 
#' @examples
#'\dontrun{\donttest{
#' db = start_new_project_from_oplm('test.db',
#'    'path_to_scr_file', 'path_to_dat_file', 
#'    booklet_position=c(1,3), responses_start=101,
#'    person_id=c(50,62))
#'
#' prms = fit_enorm(db, 
#'    item_global_on_off==1 & item_local_on_off==1 & booklet_on_off==1)
#' 
#' 
#'}}
start_new_project_from_oplm = function(dbname, scr_path, dat_path, 
                                       booklet_position = NULL, responses_start = NULL, response_length = 1, 
                                       person_id = NULL, missing_character = c(' ','9'), use_discrim=FALSE,
                                       skip_invalid_booklets = FALSE)
{

  check_file(scr_path)
  check_file(dat_path)
  
  scr = readSCR(scr_path)
  scr$itemLabels = trimws(scr$itemLabels)
  
  if(is.null(booklet_position))
  {
    booklet_position = scr$booklet_position
  } else
  {
    booklet_position = rng_fl(booklet_position)
  }  
  
  if(is.null(responses_start))
    responses_start = scr$responses_start
  
  if(response_length==1)
  { 
    rsplit = function(s) strsplit(s,"")
  } else if(response_length==2)
  {
    rsplit = function(s) lapply(strsplit(s, ""), function(x){paste0(x[c(TRUE, FALSE)], x[c(FALSE, TRUE)])})
  } else if(response_length==3)
  {
    rsplit = function(s) lapply(strsplit(s, ""), function(x){paste0(x[c(TRUE, FALSE, FALSE)], x[c(FALSE, TRUE, FALSE)], x[c(FALSE, FALSE, TRUE)])})
  } else 
  {
    stop('response length cannot be larger than 3.')
  }
  missing_character = trimws(missing_character)
  
  if(max(nchar(missing_character))>response_length) 
      stop('missing character may not be longer than the length of an individual response')
  
  if(!is.null(person_id))
    person_id = rng_fl(person_id)

  if(anyDuplicated(scr$itemLabels))
  {
    warning("screenfile contains duplicate itemlabels, labels have been adjusted to make them unique")
    scr$itemLabels = sprintf('i%04i_%s',1:length(scr$itemLabels),scr$itemLabels)
  }
  
  rules = tibble(item_id = rep(scr$itemLabels, as.integer(scr$maxScore)+1+length(missing_character)), 
                 response = unlist(lapply(scr$maxScore, function(n) c(0:n,missing_character))),
                 item_score = unlist(lapply(scr$maxScore, function(n) c(0:n,rep(0,length(missing_character))))))
  
  if(use_discrim) rules$item_score = rules$item_score * rep(scr$discrim, as.integer(scr$maxScore)+1+length(missing_character))
  
  db = start_new_project(rules, dbname)
  
  finished = FALSE
  dbTransaction(db,{
    con = file(dat_path, "r")
    
    dbExecute(db,'ALTER TABLE dxbooklets ADD COLUMN booklet_on_off INTEGER NOT NULL DEFAULT 1;')
    dbExecute(db,'ALTER TABLE dxbooklet_design ADD COLUMN item_local_on_off INTEGER NOT NULL DEFAULT 1;')
    dbExecute(db,'ALTER TABLE dxitems ADD COLUMN item_global_on_off INTEGER NOT NULL DEFAULT 1;')
    
    dbExecute_param(db,'UPDATE dxitems SET item_global_on_off=:onoff WHERE item_id=:item;', 
                tibble(onoff=scr$globCal, item=scr$itemLabels))
  
    design = tibble(booklet_id = as.character(unlist(lapply(1:scr$nBook, function(x) rep(x,scr$nitBook[x])))),
                    item_nbr = unlist(scr$itemsBook),
                    item_position = unlist(lapply(scr$itemsBook, function(x) 1:length(x))),
                    onoff = unlist(scr$itemsOn)) |>
              inner_join(tibble(item_nbr = 1:scr$nit, item_id = scr$itemLabels), by='item_nbr') |>
              arrange(.data$booklet_id, .data$item_position) 
            
    dbExecute_param(db,'INSERT INTO dxbooklets(booklet_id, booklet_on_off) VALUES(:b,:onoff);',
                      tibble(b=as.character(1:scr$nBook),onoff=scr$bookOn))
    dbExecute_param(db,'INSERT INTO dxbooklet_design(booklet_id, item_id, item_position, item_local_on_off) 
                            VALUES(:booklet_id,:item_id,:item_position, :onoff);', 
					select(design, 'booklet_id', 'item_id', 'item_position', 'onoff'))     
    
    if(length(scr$margLabels)>1)
    {
      dbExecute(db,"ALTER TABLE dxbooklets ADD COLUMN oplm_marginal VARCHAR(50) NOT NULL DEFAULT '<unknown>';")
      dbExecute_param(db,'UPDATE dxbooklets SET oplm_marginal=:marg WHERE booklet_id=:booklet_id;', 
                      tibble(booklet_id=as.character(1:scr$nBook), marg=scr$margLabels[scr$margBook]))
    }
    
    vp = 1
    
    short_line = NULL
    while ( TRUE ) {
      lines = readLines(con, n = 5000, encoding='ascii')
      if ( length(lines) == 0 ) {
        break
      }
      cat('\rreading lines:', vp, '-', as.integer(vp + length(lines)-1))
      bkl = suppressWarnings({as.integer(substr(lines, booklet_position[1], booklet_position[2]))})
      
      if(skip_invalid_booklets )
      {
        skip = is.na(bkl) | coalesce(bkl,0L) > scr$nBook
        lines = lines[!skip]
        bkl = bkl[!skip]
      } else
      {
        if(anyNA(bkl))
        {
          cat('\n')
          stop_(paste0("empty booklet id's at position (", paste0(booklet_position, collapse=', ' ),")"))
        } else if(min(bkl) < 1 || max(bkl) > scr$nBook)
        {
          cat('\n')
          stop_(format_plural("The following booklet_id['s] in data [is/are] not present in the .scr file:\n%s", setdiff(bkl, 1:scr$nBook)))
        }
      }
      
      if(scr$expanded==0)
      {
        rsp = rsplit(substring(lines, 
                          responses_start, 
                          responses_start + scr$nitBook[bkl] * response_length-1))
        
        if(any(sapply(rsp,length) < scr$nitBook[bkl]))
        {
          lnbr = which(sapply(rsp,length) < scr$nitBook[bkl])
          short_line = c(short_line, lnbr + vp - 1)
          sapply(scr$nitBook[bkl[lnbr]] - sapply(rsp[lnbr], length), rep, x = ' ')
          
          rsp[lnbr] = mapply(c, 
                             rsp[lnbr],  
                             lapply(scr$nitBook[bkl[lnbr]] - sapply(rsp[lnbr], length), rep, x = ' '),
                             SIMPLIFY = FALSE)
        }
      } else
      {
        rsp = rsplit(substring(lines, responses_start, responses_start + scr$nit-1L))
        rsp = mapply(rsp,bkl,SIMPLIFY = FALSE, FUN=\(r,bk) r[scr$itemsBook[[bk]]])
      } 

      bkl=as.character(bkl)
      
      if(is.null(person_id))
      {
        prs = as.character(vp:(vp+length(lines)-1) )
      } else 
      {
        prs = trimws(substr(lines,person_id[1],person_id[2]))
      }
      vp = vp+length(lines)

      data = tibble(booklet_id = rep(bkl, sapply(rsp,length)), 
                    person_id = rep(prs, sapply(rsp,length)),
                    response=trimws(unlist(rsp)), 
                    item_position = unlist(sapply(rsp, function(n) 1:length(n),simplify=FALSE))) |>
        inner_join(design, by=c('booklet_id','item_position')) |>
        select('person_id', 'booklet_id', 'item_id', 'response')
      
      dbExecute_param(db,'INSERT INTO dxpersons(person_id) VALUES(:person_id);',tibble(person_id=prs) )
      dbExecute_param(db,'INSERT INTO dxadministrations(person_id,booklet_id) VALUES(:person_id,:booklet_id);', 
                      tibble(person_id=prs,booklet_id=bkl))

      dbExecute_param(db,
                'INSERT INTO dxresponses(booklet_id,person_id,item_id,response) 
                                  VALUES(:booklet_id,:person_id,:item_id,:response);', 
                select(data, 'booklet_id', 'person_id', 'item_id', 'response'))
    
    }
    
    close(con)
    finished=TRUE
    if(!is.null(short_line))
    {
      cat('\nLines in file that are too short:\n')
      print(short_line)
      warning('Some lines are too short for the number of responses that are supposed to be present ',
              'according to the design', call.=FALSE)
    }
    cat('\nvalidating constraints...\n')
 }, on_error = function(e)
    {
      if(!finished) close(con)
   
      if(finished && grepl('foreign key', e$message,ignore.case=TRUE))
      {

        # there are three deferred foreign key constraints
        # that are not caught until the end of the transaction (for efficiency reasons)
        # we can test them one by one
        unknown_responses = dbGetQuery(db,'SELECT item_id, response FROM dxresponses 
                                            EXCEPT SELECT item_id, response FROM dxscoring_rules;')
        if(nrow(unknown_responses) > 0)
        {
          cat('\nThe following responses were found in the data but they are not defined in the .scr file or coded as missing responses.')
          cat('Possible causes are that not all missing characters are correctly specified, your screen and dat files do not match ')
          cat('or responses_start is incorrect.\n')
          unknown_responses = dbGetQuery(db, 'SELECT item_id, response, COUNT(*) AS tally FROM dxresponses GROUP BY item_id, response;') |>
            semi_join(unknown_responses, by=c('item_id','response'))
                  
          print(unknown_responses)
          e$message = 'Invalid responses, see output'
        } 
        # FOREIGN KEY (booklet_id, item_id) is highly unlikely to be violated. 
        # because this will most likely manifest in the previous constraint
        # FOREIGN KEY (booklet_id, person_id) can also not be wrong in this function because they are entered together

      }
      
      dbRollback(db)  
      dbDisconnect(db)
      stop(e)
    }, on_error_rollback=FALSE)
  return(db)
}



#' Read item parameters from oplm PAR or CML files
#'
#' @param par_path path to a file in the (binary) OPLM PAR format or the human readable CML format
#' @return depends on the input. For .PAR files a data.frame with columns: item_id, item_score, beta, nbr,
#' for .CML files also several statistics columns that are outputted by OPLM as part of the calibration.
#' 
#' @details
#' It is very occasionally useful to calibrate new items on an existing scale. This
#' function offers the possibility to read parameters from the proprietary oplm format 
#' so that they can be used to fix a new calibration in Dexter on an existing scale of items
#' that were calibrated in oplm.
#' 
#' @examples
#' \dontrun{
#' \donttest{
#' par = read_oplm_par('/parameters.PAR')
#' f = fit_enorm(db, fixed_params=par)
#' }}
read_oplm_par = function(par_path){
  if(grepl('\\.par$',par_path,perl=TRUE, ignore.case=TRUE))
  {
    return(
      lapply(readPAR(par_path), function(p)
      {
        tibble(item_id=trimws(p$label),
               item_score=c(1:p$ncat)*p$discr,
               beta=p$delta,
               nbr=p$ilabel)
      }) |> 
        bind_rows() |>
        arrange(.data$item_id, .data$item_score) |>
        df_format()
    )
  } else
  {
    return(df_format(readCML(par_path)))
  }
}


### occasionally useful mainly for testing purposes to get pars from cml files  ###

readCML = function(cml_path)
{
  con = file(cml_path, "r")
  l = readLines(con, skipNul = TRUE)
  close(con)
  
  i = min(which(grepl('^\\s+RESULTS CALIBRATION\\.',l,perl=TRUE)))
  colnames = strsplit(trimws(l[i + 2]),'\\s+',perl=TRUE)[[1]]
  
  l = l[(i + 5):length(l)]
  l = l[1:(min(which(grepl('^-+$',l,perl=TRUE) == TRUE))-1)]
  
  pars = do.call(rbind, 
                 lapply(strsplit(trimws(l),'\\s+',perl=TRUE),
                        function(x)
                        {
                          if(length(x) == 0) return(NULL)
                          if(substr(x[1], 1, 1) == '[') x = c(NA, NA, gsub('\\D+','',x[1]), x[2:length(x)])
                          x[x=='---'] = NA
                          x
                        })
                 ) |>
          as.data.frame(stringsAsFactors = FALSE) 
  
  pars[,c(1,3:ncol(pars))] = lapply(pars[,c(1,3:ncol(pars))],as.numeric)
  colnames(pars) = gsub('\\.+$','',make.names(tolower(colnames)))
  colnames(pars)[2:4] = c('item_id','item_score','beta') 
  
  # fill out the NA's in the first two columns
  pars |> 
    fill('nr', 'item_id') |> 
    rename(nbr = 'nr') |>
	df_format()
}



oplm_r2c =  function(x)
{
  y = try({sapply(x,rawToChar)}, silent=TRUE)
  if(inherits(y, "try-error"))
    y = sapply(x,intToUtf8)
  
  z = try({trimws(y)}, silent=TRUE)
  if(inherits(z, "try-error"))
    return(y)
  z
}

readSCR = function (scrfile) 
{
  z = file(scrfile, "rb")
  n = readBin(z, 'int', size = 2, 3)
  nit = n[3]
  itemLabels = sapply(1:nit, function(x) {
    sl = readBin(z, 'int', size = 1, 1)
    rawToChar(readBin(z, 'raw', n = 8)[1:sl])
  })
  globCal = readBin(z, 'int', size = 1, nit)
  discrim = readBin(z, 'int', size = 1, nit)
  maxScore = readBin(z, 'int', size = 1, nit)
  parFixed = readBin(z, 'int', size = 1, nit)
  four = readBin(z, 'int', size = 1, 4)
  sl = readBin(z, 'int', size = 1, 1)
  jobname = rawToChar(readBin(z, 'raw', n = 12)[1:sl])
  five = readBin(z, 'int', size = 1, 5)
  sl = readBin(z, 'int', size = 1, 1)
  title = rawToChar(readBin(z, 'raw', n = 79)[1:sl])
  for (i in 1:20) {
    sl = readBin(z, 'int', size = 1, 1)
    if (sl > 0) 
      someComment = rawToChar(readBin(z, 'raw', n = sl))
  }
  sl = readBin(z, 'int', size = 1, 1)
  dataDir = rawToChar(readBin(z, 'raw', n = 60)[1:sl])
  sl = readBin(z, 'int', size = 1, 1)
  dataFile = rawToChar(readBin(z, 'raw', n = 12)[1:sl])
  expanded = readBin(z, 'int', size = 1, 1)
  expanded = 1 - expanded
  sl = readBin(z, 'int', size = 1, 1)
  fmt = rawToChar(readBin(z, 'raw', n = sl))
  nb = readBin(z, 'int', size = 2, 1)
  inUse = readBin(z, 'int', size = 1, nb)
  nMarg = readBin(z, 'int', size = 1, 1)
  nStat = readBin(z, 'int', size = 1, 1)
  margLabels = sapply(1:nMarg, function(x) {
    sl = readBin(z, 'int', size = 1, 1)
  readBin(z, 'raw', n = 8)[1:sl]
  },simplify=FALSE)
  statLabels = sapply(1:nStat, function(x) {
    sl = readBin(z, 'int', size = 1, 1)
    readBin(z, 'raw', n = 8)[1:sl]
  })
  nitb = readBin(z, 'int', size = 2, nb)
  datum = readBin(z, 'int', size = 2, 5)
  margBook = statBook = rep(NA, nb)
  itemsBook = vector(mode = "list", length = nb)
  itemsOn = vector(mode = "list", length = nb)
  for (i in 1:nb) {
    rubbish = readBin(z, 'int', size = 2, 1)
    margBook[i] = readBin(z, 'int', size = 1, 1)
    statBook[i] = readBin(z, 'int', size = 1, 1)
    itemsBook[[i]] = readBin(z, 'int', size = 2, nitb[i])
    itemsOn[[i]] = readBin(z, 'int', size = 1, nitb[i])
  }
  close(z)

  fmt = as.integer(unlist(regmatches(fmt, gregexpr('\\d+',fmt,perl=TRUE))))

  list(nit = nit, nMarg = nMarg, nStat = nStat, itemLabels = itemLabels, 
       booklet_position = c(fmt[1], fmt[1] + fmt[2] - 1L),
       responses_start = fmt[3], response_length = fmt[5],
       margLabels = oplm_r2c(margLabels), 
       globCal = globCal, 
       discrim = discrim, maxScore = maxScore, 
       nBook = nb, bookOn = inUse, nitBook = nitb, 
       margBook = margBook, statBook = statBook, 
       itemsBook = itemsBook, itemsOn = itemsOn, 
       expanded = expanded)
}


readPAR = function(parfile)
{
  z = file(parfile, "rb")
  tit = rawToChar(readBin(z, 'raw', size = 1, 80))
  n = readBin(z, 'int', size = 2, 4)
  nit = n[3]
  nstat = -1 * n[1]
  statlab = sapply(1:nstat, function(x) rawToChar(readBin(z, 'raw', size = 1, 8)))
  pars = vector(mode = "list", length = nit)
  for (i in 1:nit) 
  {
    iLab = readBin(z, 'int', size = 2, 1)
    label = rawToChar(readBin(z, 'raw', size = 1, 8))
    disc = readBin(z, 'int', size = 2, 1)
    ncat = readBin(z, 'int', size = 2, 1)
    if (nstat > 1) 
    {
      idumm = readBin(z, 'int', size = 2, 1)
      dummy = readBin(z, 'int', size = 2, idumm)
    }
    delta = readBin(z, numeric(), size = 8, ncat)
    pars[[i]] = list(ncat = ncat, discr = disc, delta = delta, 
                    ilabel = iLab, label=label)
  }
  close(z)
  pars
}