R/alignmentout.R


## Generating table(s) of Threshold and Loading Parameters
#' @title alignmentout
#' @description Generates table(s) with estimates, alignment values, and R-squares for thresholds and loadings
#' @details
#' Splits an Mplus output file into multiple parts.
#' Produces table(s) with estimates, alignment values, and R-squares for thresholds and loadings.
#' All results are stored in the folder 'Output_<current date>' in the working directory (or in the folder given by the directory argument).
#' @author Hai Nguyen \email{hnguye72@@uic.edu}, Tianxiu Wang, Ariel Aloe, Rachel Gordon
#' @export alignmentout
#' @import tidyr stringr openxlsx
#' @param infile path and file name of the Mplus output file
#' @param directory folder in which to store the output; if NULL (the default), a folder named 'Output_<current date>' is created in the working directory
#' @return Multiple text files, CSV files, and one Excel file with multiple tabs, written to the output folder.
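#' @examples
#' \dontrun{
#' # A minimal sketch of a call; "alignment_results.out" is a hypothetical
#' # Mplus alignment output file in the working directory
#' alignmentout(infile = "alignment_results.out")
#'
#' # Write the tables to a user-specified folder instead of 'Output_<date>'
#' alignmentout(infile = "alignment_results.out", directory = "AlignmentTables")
#' }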


alignmentout <- function(infile = "", directory = NULL) {

  # Inform users of the information reported ----------------------------------------------------
  ## 1. Enter an Mplus output file
  ## 2. Number of Groups: latent classes
  ## 3. Factors: number of continuous latent variables
  ## 4. Items: number of dependent variables
  ## 5. Threshold: number of categories - 1
  cat("\nThe function provides information from Mplus alignment output, including: \n - Groups (latent classes),\n - Factors (continuous latent variables),\n - Items (dependent variables) and\n - Categories of each Item (equal to Thresholds + 1).\n")
  cat("\nIn addition, you may find from the folder 'Output_current date', if not specifying a directory, in the working directory:\n - the multiple text files which split from the origin Mplus output\n - the thresholds, loadings tables (CSV format) and\n - especially, a combined Excel file with all separated spreadsheets")

  ## 1. Enter an Mplus output file======================================================
  # Directly input the filepath as an argument in the function
  #infile <- readline(prompt="Enter path and Mplus output file (separated by /):\n")

  ### Create a folder to store the output
  if (is.null(directory)) {
    filepath <- paste0("Output","_", Sys.Date())
    filepath.misc <- paste0("Output","_", Sys.Date(),"/Misc") # clean up: put all un-necessary files in filepath.misc
  } else {
    filepath <- directory
    filepath.misc <- paste0(directory,"/Misc")
  }

  if (!dir.exists(file.path(filepath))) dir.create(file.path(filepath))
  if (!dir.exists(file.path(filepath.misc))) dir.create(file.path(filepath.misc))

  ### Split the Mplus output file into 6 parts using the package's support function `mplussplit`:
  ###  ext1_input instructions - ext2_summary of analysis
  ###  ext3_model fit information - ext4_model results
  ###  ext5_alignment output - ext6_savedata information
  mplussplit(outpath = filepath.misc, inputfile = infile)

  ## 2- Number of Groups: latent classes===============================================
  ext1<-readLines(paste0(filepath.misc, "/ext1_input instructions.txt"))
  g<-grep("^.*classes( )?=.*", ext1, ignore.case = T, value=T)

  g.line<-grep("^.*KNOWNCLASS.*", ext1, ignore.case = T, value=T)
  g.line.n<-grep("^.*KNOWNCLASS.*", ext1, ignore.case = T)
  Group <- as.numeric(str_extract_all(g,"\\d+"))
  Group.name <- gsub("^.*KNOWNCLASS( )?=( )?c\\(|( )?=( )?.*", "", g.line)

  # Group.cat <- unlist(strsplit(gsub("^.*KNOWNCLASS = c\\(\\w+( )?=( )?|\\) ;.*", "", g.line), split=" "))
  Group.cat <- unlist(strsplit(str_squish(gsub("^.*KNOWNCLASS = c\\(\\w+( )?=( )?|\\) ;.*", "", g.line)), split=" "))

  ## If the KNOWNCLASS categories wrap onto additional lines, keep reading
  ## successive lines until all Group categories have been collected
  g.line.n.add <- g.line.n
  while (length(Group.cat) < Group) {
    g.line.n.add <- g.line.n.add + 1 # advance one line per iteration so the loop cannot re-read the same line forever
    # Group.cat.add <- unlist(strsplit(gsub("^[[:space:]]+|\\)( )?;.*", "", trimws(ext1[g.line.n.add])), split=" "))
    Group.cat.add <- unlist(strsplit(str_squish(gsub("^[[:space:]]+|\\)( )?;.*", "", trimws(ext1[g.line.n.add]))), split = " "))

    Group.cat <- c(Group.cat, Group.cat.add)
  }

  cat("- The Number of Groups (Latent Classes):", Group,"\n")
  cat("- The Name of Groups (Latent Classes):", Group.name, "with categories of", Group.cat,"\n")

  ## 3- Factors: number of continuous latent variables=================================
  ext2<-readLines(paste0(filepath.misc, "/ext2_summary of analysis.txt"))
  m<-grep("Continuous latent variables",ext2)
  Factor<-unlist(str_extract_all(ext2[m+1],"\\w+"))
  Factor.n <- length(Factor)

  cat("- The Number of Factors:", Factor.n, ",including ", Factor,"\n")

  ## 4- Items: number of dependent variables============================================
  n<-grep("Binary and ordered categorical", ext2, ignore.case = T)
  Item.name<-str_extract_all(ext2[n+1],"\\w+")[[1]]
  for (i in (n+2):(m-1)){
    Item.name<-c(Item.name,str_extract_all(ext2[i],"\\w+")[[1]])
    if (i==(m-1)) Item.name<-Item.name[!is.na(Item.name)]
  }
  Item.n <-  length(Item.name)
  Item.name<-sapply(Item.name, FUN = function(x)str_sub(x,1,8)) # limit each name to 8 characters

  cat("- The Number of Items:", Item.n, "including:", Item.name, "\n")

  ## 5- Categories of each Item (Threshold: number of categories - 1)===================
  s <- k <-grep("UNIVARIATE PROPORTIONS AND COUNTS FOR CATEGORICAL VARIABLES", ext2, ignore.case = T) + 2 # s: stands for start
  while (!grepl("^[ \t\n]*$", ext2[k])) {
    k<-k+1
    e<-k #e: stands for end
  }
  Category<-vector(mode = "numeric", length=length(Item.name))
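  # Walk through the univariate-proportions block: for each item, advance to the
  # line where the next item's name appears; the line just above it is the last
  # "Category k" row of the current item, and the digit(s) right after the word
  # "Category" give its number of categories (the last item is read from the
  # final line of the block)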
  for (i in 2:length(Item.name)){
    while ((s<e)&(!str_detect(ext2[s], Item.name[i]))){
      s<-s+1
    }
    Category[i-1]<-as.numeric(str_sub(ext2[s-1],str_locate(ext2[s-1],"y")+2, str_locate(ext2[s-1],"y")+3))
  }
  Category[length(Item.name)]<-as.numeric(str_sub(ext2[e-1],str_locate(ext2[e-1],"y")+2, str_locate(ext2[e-1],"y")+3))
  Threshold <- Category - 1
  Item.cat.df <- data.frame(cbind(Item.name, Category, Threshold))

  cat("- The Number of Categories and Threshold in each Item:\n")
  print(Item.cat.df, row.names = FALSE)


  # Build first 2 columns of model table: Item and Factor------------------------------
  f.stri<-NA; f.stri.c<-NA; f<-NA
  by.items <- vector(mode = "list", length = length(Factor)) #empty_list
  for (i in 1:length(Factor)) {
    f.stri[i]   <- paste0(Factor[i],"(.*) BY ")
    f.stri.c[i] <- paste0("^\\s+",Factor[i],"(.*) BY\\s+|\\s+;.*")
    f           <- grep(f.stri[i], ext1, ignore.case = T)
    if (length(f) > 1){
      by.items[[i]] <- toupper(gsub(f.stri.c[i], "", ext1[f], ignore.case = T))
    } else {
      by.items[[i]] <- toupper(gsub(f.stri.c[i], "", ext1[f], ignore.case = T))

      by.items.next <- c()
      while (!grepl(";", ext1[f])){
        by.items.next <- toupper(gsub("^[[:space:]]+|([[:space:]]+)?;.*", "", ext1[f+1], ignore.case = T))
        by.items[[i]] <- paste(by.items[[i]],by.items.next)
        f <- f+1
      }
    }


  }

  if (length(f) > 1){
    Item<-unlist(by.items) #matches with Item.name
  } else {
    Item<-unlist(strsplit(as.character(by.items), " "))
  }
  #Item<-unlist(by.items) #matches with Item.name
  #Item<-unlist(strsplit(as.character(by.items), " "))
  Item.orig<-Item # preserve the original item names (before truncation)
  Item<-sapply(Item, FUN = function(x)str_sub(x,1,8)) # limit each name to 8 characters
  Factor.by<-rep(Factor[1],length(strsplit(by.items[[1]]," ")[[1]]))
  if (length(Factor)!=1){
    for (i in 2:length(Factor)) {
      f<-rep(Factor[i],length(strsplit(by.items[[i]]," ")[[1]]))
      Factor.by<-append(Factor.by,f)
    }
  }


  model.table<-data.frame(cbind(Item,Factor.by), row.names = NULL) #Global environment?
  model.table$Item<-as.character(model.table$Item)

  model.table.loadings<-model.table #save for building loadings table later

  Itemstring<-gsub(", ","|",toString(union(Item,Item.orig))) # build the regex alternation "ITEM1|ITEM2|..." used in the patterns below

  ## Because an item can have up to max(Threshold) thresholds, we need that many copies of model.table.threshold===============
  Threshold.max<-max(Threshold)
  model.table.threshold <- vector(mode = "list", length = Threshold.max) #empty_list
  for (i in 1:Threshold.max){
    model.table.threshold[[i]]<-model.table
  }

  # Build Threshold estimate and SE-------------------------------------------------------
  ## Split the Model Results section into one file per latent class=====================================
  latentsplit(filepath = filepath.misc, inputfile="ext4_model results.txt")

  Threshold.df <- vector(mode = "list", length = Threshold.max) #empty_list
  for (i in 1:Threshold.max){
    Threshold.df[[i]]<-vector(mode = "list", length = Group)
  }

  ## Loop over thresholds 1 to Threshold.max of model.table.threshold=====================================
  for (i in 1:Threshold.max){
    col.thres<-paste0("Threshold",i,"_G")
    col.se<-paste0("Threshold",i,"_SE_G")
    ### Loop over groups 1 to Group for Threshold.df[[i]], then merge each into model.table.threshold[[i]] by Item
    for (j in 1:Group){
      readfile<-paste0(filepath.misc,"/LatentClass ",j,".txt")
      Threshold.file <- readLines(readfile)
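      # MODEL RESULTS rows look like "ITEM$i  Estimate  S.E.  Est./S.E.  P-Value";
      # the pattern captures the item name plus the four numeric columns, and
      # dropping columns 1, 5 and 6 of the match matrix keeps Item, Estimate, S.E.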
      pattern<-sprintf("^ +(%s)\\$%s +([-+]?\\d+.\\d+) +([-+]?\\d+.\\d+) +([-+]?\\d+.\\d+) +([-+]?\\d+.\\d+)",Itemstring,i)
      matches<-str_match(Threshold.file,pattern)
      Threshold.df[[i]][[j]] <- drop_na(data.frame(matches[,-c(1,5,6)]))
      Th<-paste0(col.thres,j)
      Se<-paste0(col.se,j)
      colnames(Threshold.df[[i]][[j]]) <- c("Item",Th,Se)
      Threshold.df[[i]][[j]]$Item<-as.character(Threshold.df[[i]][[j]]$Item)

      #### merge by Item
      model.table.threshold[[i]] <- dplyr::full_join(model.table.threshold[[i]], Threshold.df[[i]][[j]], by="Item")
    }

  }

  # Build the Threshold[i]_Weighted_Average & Threshold[i]_R-square columns-----------------------------
  ## Split then obtain the part needed for Invariant estimates==============================
  paraextract(paste0(filepath.misc, "/ext5_alignment output.txt"),"ALIGNMENT OUTPUT","Loadings",paste0(filepath.misc, "/Threshold_Invariant_Rsq.txt"))
  Threshold.file <- readLines(paste0(filepath.misc, "/Threshold_Invariant_Rsq.txt"))

  ## Select the lines that contain the information==========================================
  st<-grep("^ Threshold.*|^ Weighted Average Value Across Invariant Groups:.*|^ R-square/Explained variance/Invariance index:.*|^ Approximate Invariance Was Not Found For This Parameter.", Threshold.file, ignore.case = T,value=T)

  ## Obtain the part needed for Invariant-Noninvariant values================================
  paraextract(paste0(filepath.misc, "/ext4_model results.txt"),"APPROXIMATE MEASUREMENT INVARIANCE \\(NONINVARIANCE\\) FOR GROUPS","FACTOR MEAN COMPARISON AT THE 5% SIGNIFICANCE LEVEL IN DESCENDING ORDER",paste0(filepath.misc, "/Invariant_Noninvariant.txt"))

  Invariance.file <- readLines(paste0(filepath.misc, "/Invariant_Noninvariant.txt"))

  invariancesplit(inputfile=paste0(filepath.misc,"/Invariant_Noninvariant.txt"), filepath = filepath.misc)

  Threshold.Invariance.file <- readLines(paste0(filepath.misc, "/ThresholdInvariance.txt"))

  ### Adjust for the indent from multi-groups---------------------------------------------
  Threshold.Invariance.file <- Threshold.Invariance.file[-c(1,length(Threshold.Invariance.file))]

  Threshold.Invariant.df <- vector(mode = "list", length = Threshold.max) #empty_list

  empty_lines = grepl('^\\s*$', Threshold.Invariance.file)
  Threshold.Invariance.file <- Threshold.Invariance.file[! empty_lines]
  Threshold.Invariance.file <- str_squish(Threshold.Invariance.file) #reduces repeated whitespace inside a string
  #l.digit<-grep('^\\d',Threshold.Invariance.file)
  l.digit<-grep('^\\d|^\\(',Threshold.Invariance.file) # minor modified on 3/15/2021
  if (length(l.digit) != 0) {
    for (i in 1:length(l.digit)){
      Threshold.Invariance.file[l.digit[i]-1] <- paste(Threshold.Invariance.file[l.digit[i]-1],Threshold.Invariance.file[l.digit[i]])
    }
    Threshold.Invariance.file <- Threshold.Invariance.file[-c(l.digit)]
  }

  ## Build the table from information obtained above=========================================
  for (i in 1:Threshold.max){
    ### Create a table with Group+3 columns (Item, Threshold[i]_Invariant_G[j], Weighted_Average and R-square) and Item.n rows
    df <- data.frame(matrix(ncol = (Group+3), nrow = length(Item)))
    name<-NA
    col.thres.invariant<-paste0("Threshold",i,"_Invariant_G")
    col.thres.weighted<-paste0("Threshold",i,"_Weighted_Average")
    col.thres.rsq<-paste0("Threshold",i,"_R-square")
    for (j in 1:Group){
      name[j]<-paste0(col.thres.invariant,j)
    }
    x <- c("Item", name, col.thres.weighted, col.thres.rsq)
    colnames(df) <- x
    Threshold.Invariant.df[[i]]<-df
    rm(df)


    k<-1

    for (j in 1:(length(st))){

      if (grepl("Threshold", st[j]) & str_sub(st[j],-1,-1)==i){
        Threshold.Invariant.df[[i]][k,1]=str_trim(substr(st[j],11,nchar(st[j])-2))

        if (grepl("Approximate Invariance Was Not Found For This Parameter", st[j+1])){
          for (l in 1:Group){
            Threshold.Invariant.df[[i]][k,l+1]="False"
          }
          Threshold.Invariant.df[[i]][k,l+2]=NA
          Threshold.Invariant.df[[i]][k,l+3]=NA
        } else {

          l.matchedline.threshold <- grep(sprintf("%s\\$%s",str_trim(substr(st[j],11,nchar(st[j])-2)),i), Threshold.Invariance.file)
          l.line.threshold <- unlist(strsplit(Threshold.Invariance.file[l.matchedline.threshold], split = " "))

          for (l in 1:Group){
            if (str_detect(l.line.threshold[l+1], "\\(")) {
              Threshold.Invariant.df[[i]][k,l+1]="False"
            } else {
              Threshold.Invariant.df[[i]][k,l+1]="True"
            }
          }
          Threshold.Invariant.df[[i]][k,l+2]=as.numeric(substr(st[j+1],nchar(st[j+1])-6,nchar(st[j+1])))
          Threshold.Invariant.df[[i]][k,l+3]=as.numeric(substr(st[j+2],nchar(st[j+2])-6,nchar(st[j+2])))
        }

        k<-k+1
      } else {next}
    }

    Threshold.Invariant.df[[i]] <- Threshold.Invariant.df[[i]][!is.na(Threshold.Invariant.df[[i]]$Item),]
    model.table.threshold[[i]] <- dplyr::full_join(model.table.threshold[[i]], Threshold.Invariant.df[[i]], by="Item")
  }

  # Now build the loadings table, starting with the estimates and SEs------------------
  Loadings.df<-vector(mode = "list", length = Group)
  ## Loop over groups 1 to Group for Loadings.df, then merge each into model.table.loadings by Item
  for (j in 1:Group){
    readfile<-paste0(filepath.misc,"/LatentClass ",j,".txt")
    Loadings.file <- readLines(readfile)
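    # Same row layout as the threshold estimates above: capture the item name and
    # four numeric columns, then keep only Item, Estimate and S.E.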

    pattern<-sprintf("^ +(%s) +([-+]?\\d+.\\d+) +([-+]?\\d+.\\d+) +([-+]?\\d+.\\d+) +([-+]?\\d+.\\d+)", Itemstring)
    matches<-str_match(Loadings.file, pattern)

    Loadings.df[[j]] <- drop_na(data.frame(matches[,-c(1,5,6)]))

    Lg<-paste0("Loadings_G",j)
    Se<-paste0("Loadings_SE_G",j)
    colnames(Loadings.df[[j]]) <- c("Item",Lg,Se)
    Loadings.df[[j]]$Item<-trimws(as.character(str_sub(Loadings.df[[j]]$Item,1,8)))

    #### here we merge by Item
    model.table.loadings<-dplyr::full_join(model.table.loadings, Loadings.df[[j]], by="Item")
  }


  # Continue with the loadings invariance table, then merge it into the preceding table-----
  ## Extract the paragraph that contains the information=============================
  paraextract(paste0(filepath.misc, "/ext5_alignment output.txt"),"Loadings","SAMPLE STATISTICS FOR ESTIMATED FACTOR SCORES|DIAGRAM INFORMATION", paste0(filepath.misc, "/Loadings_Invariant_Rsq.txt"))

  ## Load the extracted paragraph (above)=====================================================
  Loadings.file <- readLines(paste0(filepath.misc, "/Loadings_Invariant_Rsq.txt"))

  ## Select the lines that contain the information=========================================
  st<-grep("^ Loadings for.*|^ Weighted Average Value Across Invariant Groups:.*|^ R-square/Explained variance/Invariance index:.*|^ Approximate Invariance Was Not Found For This Parameter.", Loadings.file, value=T, ignore.case=T)

  loadings.invariant.df <- data.frame(matrix(ncol = (Group+3), nrow = length(Item))) #Item.n may be different
  col.load.invariant<-NA
  for (j in 1:Group){
    col.load.invariant[j]<-paste0("Loadings_Invariant_G",j)
  }

  colnames(loadings.invariant.df) <- c("Item", col.load.invariant, "Loadings_Weighted_Average", "Loadings_R_square")

  ## Build the table=======================================================================

  Loadings.Invariance.file <- readLines(paste0(filepath.misc, "/LoadingsInvariance.txt"))

  ### Adjust for the indent from multi-groups---------------------------------------------
  Loadings.Invariance.file <- Loadings.Invariance.file[-length(Loadings.Invariance.file)]

  #Threshold.Invariant.df <- vector(mode = "list", length = Threshold.max) #empty_list

  empty_lines = grepl('^\\s*$', Loadings.Invariance.file)
  Loadings.Invariance.file <- Loadings.Invariance.file[! empty_lines]
  Loadings.Invariance.file <- str_squish(Loadings.Invariance.file) #reduces repeated whitespace inside a string
  #l.digit<-grep('^\\d',Threshold.Invariance.file)
  l.digit<-grep('^\\d|^\\(',Loadings.Invariance.file) # minor modified on 8/19/2021
  if (length(l.digit) != 0) {
    for (i in 1:length(l.digit)){
      Loadings.Invariance.file[l.digit[i]-1] <- paste(Loadings.Invariance.file[l.digit[i]-1],Loadings.Invariance.file[l.digit[i]])
    }
    Loadings.Invariance.file <- Loadings.Invariance.file[-c(l.digit)]
  }



  k<-1

  for (j in 1:(length(st))){

    if (grepl("Loadings for", st[j])){
      loadings.invariant.df[k,1]=str_trim(substr(st[j],14,nchar(st[j])))

      if (grepl("Approximate Invariance Was Not Found For This Parameter", st[j+1])){
        for (l in 1:Group){
          loadings.invariant.df[k,l+1]="False"
        }
        loadings.invariant.df[k,l+2]=NA
        loadings.invariant.df[k,l+3]=NA
      } else {

        l.matchedline <- grep(str_trim(substr(st[j],14,nchar(st[j]))),Loadings.Invariance.file)
        l.line <- unlist(strsplit(gsub("^\\s+|\\s+"," ", sub("^\\s+","", Loadings.Invariance.file[l.matchedline])), split=" "))

        for (l in 1:Group){
          #loadings.invariant.df[k,l+1]="True"
          if (str_detect(l.line[l+1], "\\(")){
            loadings.invariant.df[k,l+1]="False"
          } else {
            loadings.invariant.df[k,l+1]="True"
          }
        }
        loadings.invariant.df[k,l+2]=as.numeric(substr(st[j+1],nchar(st[j+1])-6,nchar(st[j+1])))
        loadings.invariant.df[k,l+3]=as.numeric(substr(st[j+2],nchar(st[j+2])-6,nchar(st[j+2])))
      }

      k<-k+1
    } else {next}
  }

  ## Link to the previous table============================================================
  model.table.loadings <- dplyr::full_join(model.table.loadings, loadings.invariant.df, by="Item")

  # Create one Excel file with all spreadsheets----------------------------------------
  completed.table.threshold <- vector(mode = "list", length = Threshold.max) #empty_list
  for (i in 1:Threshold.max){
    completed.table.threshold[[i]]<-model.table.threshold[[i]]
    utils::write.csv(model.table.threshold[[i]], paste0(filepath,"/threshold",i,".csv"), row.names=FALSE)
  }

  completed.table.loadings <- model.table.loadings
  utils::write.csv(model.table.loadings, paste0(filepath,"/loadings.csv"), row.names=FALSE)

  of <- paste0(filepath,"/alignment_tables.xlsx")
  OUT <- createWorkbook()
  for(i in 1:Threshold.max){
    tname<-paste("Threshold ",i,sep="")
    addWorksheet(OUT, tname)
    writeData(OUT, sheet = tname, x = completed.table.threshold[[i]])
  }
  addWorksheet(OUT, "Loadings")
  writeData(OUT, sheet = "Loadings", x = completed.table.loadings)
  saveWorkbook(OUT,of,overwrite=TRUE)
}