fasterParser.R

## ======================================================================
## This is a much faster version of the oldParser.R
## Code by Tobias Kächele & Felix Schönbrodt
## ======================================================================

library(stringi)
library(dplyr)

# returns the reported precision (i.e., number of decimals) from a string
# e.g. decimals("0.120") == 3
decimals <- function(str) {
  locations <- stri_locate_first_fixed(str, '.')[,1]
  decs <- nchar(str) - locations
  decs[is.na(locations)] <- 0
  decs
}

is.one <- function(v) {
  !is.na(v) & v==1
}


# parses a multiline string
parse_ES <- function(txt, round_up = FALSE) {

  if(is.null(txt) || nchar(txt) == 0) {
    return(NULL)
  }
  
  # split into lines
  txt.lines <- stri_split_lines(txt)[[1]]

  # remove all comments
  txt.lines <-  stri_replace_first_regex(txt.lines, '#.*$', '')

  # convert chains of whitespace characters to a single space
  txt.lines <- stri_replace_all_regex(txt.lines, "\\s+", " ")

  # trim all lines
  txt.lines <- stri_trim_both(txt.lines)

  # find all non empty lines (indices_not_empty represents correct line numbers!)
  indices_not_empty <- which(!stri_isempty(txt.lines))

  # remove all empty lines
  txt.lines <- txt.lines[indices_not_empty]

  # number of (non-empty) lines
  nlines <- length(txt.lines)

  if(nlines == 0) {
    return(NULL)
  }
  
  # allocate space for error and warning messages
  errors <- rep("", nlines)
  warning <- rep("", nlines)

  # definition of all column names of output matrix
  TYPE            <-  1
  DF1             <-  2
  DF2             <-  3
  STAT            <-  4
  SIGN            <-  5 # e.g. -1 if stat is -5.3
  P.REPORTED      <-  6 # e.g. 0.12
  P.REPORTED.DECS <-  7 # e.g. 2 if p.reported is 0.12
  P.COMP          <-  8
  CRIT.VALUE      <-  9
  ONE.TAILED      <- 10
  P.VALUE         <- 11
  P.VALUE.ONE     <- 12
  P.ACTUAL        <- 13
  G               <- 14
  D               <- 15
  N.APPROX        <- 16
  SIGNIFICANT     <- 17
  P.REPORTED.ERROR  <- 18
  P.REPORTED.ERROR.DIRECTION   <- 19
  IS.FOCAL        <- 20
  PARSE.ERROR     <- 21
  D.REPORTED        <- 22
  D.REPORTED.SIGN   <- 23
  D.REPORTED.DECS   <- 24
  D.REPORTED.LOWER  <- 25
  D.REPORTED.UPPER  <- 26
  D.REPORTED.ERROR.DIRECTION  <- 27
  D.REPORTED.ERROR  <- 28
  REPORTING.ERROR <- 29
  P.VALUE.LOG       <- 30
  D.VAR       <- 31
  STUDYDESIGN <- 32

  # output matrix for all data in numeric form
  BIG <- matrix(NA, nrow = nlines, ncol = 32)

  # find study labels and extract them (everything before ":")
  extraction <- stri_match_first_regex(txt.lines, '^ *(.*?) *(?:(?<=\\)) *(.+?) *)?: *')
  txt.lines.edited <- txt.lines
  indices_not_na <- which(!is.na(extraction[,1]))
  if(length(indices_not_na)) {
    txt.lines.edited[indices_not_na] <- stri_replace_first_fixed(txt.lines.edited[indices_not_na], extraction[indices_not_na,1], '')
  }

  # init paper ids with auto id
  PAPER_ID <- paste0(".", as.character(1:nlines))

  # init study ids with NA
  STUDY_ID <- rep("", nlines)

  # get indices for certain rows
  indices_paper_id <- which(!is.na(extraction[,2]))
  indices_study_id <- which(!is.na(extraction[,3]))

  # set paper id respectively
  if(length(indices_paper_id))
    PAPER_ID[indices_paper_id] <- extraction[indices_paper_id, 2]

  if(length(indices_study_id))
    STUDY_ID[indices_study_id] <- extraction[indices_study_id, 3]

  # is study focal? Underscore at start of id indicates a non-focal test statistic
  BIG[,IS.FOCAL] <- ( stri_sub(PAPER_ID, 1, 1) != "_" )

  # definition of statistic types and typestrings array
  TYPE_T <- 1
  TYPE_CHI2 <- 2
  TYPE_F <- 3
  TYPE_R <- 4
  TYPE_Z <- 5
  TYPE_P <- 6
  TYPESTRINGS <- c('t','chi2','f','r','z', 'p')

  # find statistic and extract it
  extraction <- stri_match_first_regex(txt.lines.edited, ' *\\b(t|chi2|f|r|z|p)(?: *\\( *((?:\\d*\\.)?\\d+)(?: *, *((?:\\d*\\.)?\\d+))? *\\))? *= *(-?(?:\\d*\\.)?\\d+)[ ,;]*', case_insensitive=TRUE)
  indices_not_na <- which(!is.na(extraction[,1]))
  if( length(indices_not_na)){
    txt.lines.edited[indices_not_na] <- stri_replace_first_fixed(txt.lines.edited[indices_not_na], extraction[indices_not_na,1], ' ')
  }

  # mark lines without statistic as error
  indices_no_statistic <- which(is.na(extraction[,1]))
  if (length(indices_no_statistic) > 0)
    errors[indices_no_statistic] <- paste0(errors[indices_no_statistic], "\nNo statistic given!")
  
  # store numeric representation for type of statistic
  type.factor <- factor(stri_trans_tolower(extraction[,2]), TYPESTRINGS)
  BIG[,TYPE] <- unclass(type.factor)

  # store first argument enclosed in braces
  BIG[,DF1]  <- as.numeric(extraction[,3])

  # store second argument enclosed in braces
  BIG[,DF2]  <- as.numeric(extraction[,4])

  # store value of statistic
  BIG[,STAT] <- as.numeric(extraction[,5])

  # store sign of value of statistic
  BIG[,SIGN] <- sign(BIG[,STAT])

  # round statistic if necessary
  if (round_up == TRUE) {
    decPlaces  <- decimals(extraction[,5])
    BIG[,STAT] <- BIG[,STAT] + BIG[,SIGN] * (4.999 / 10^(decPlaces+1))
  }

  # remove sign from value of statistic
  BIG[,STAT] <- abs(BIG[,STAT])



  # vectors of indices for each type
  is_t         <- BIG[,TYPE] == TYPE_T
  is_chi2      <- BIG[,TYPE] == TYPE_CHI2
  is_f         <- BIG[,TYPE] == TYPE_F
  is_r         <- BIG[,TYPE] == TYPE_R
  is_z         <- BIG[,TYPE] == TYPE_Z
  is_p         <- BIG[,TYPE] == TYPE_P
  has_df1      <- !is.na(BIG[,DF1])
  has_df2      <- !is.na(BIG[,DF2])
  indices_t    <- which(is_t)
  indices_chi2 <- which(is_chi2)
  indices_f    <- which(is_f)
  indices_r    <- which(is_r)
  indices_z    <- which(is_z)
  indices_pdirect <- which(is_p)

  # find p-value and extract it
  # (if the p-value is given as test statistic)
  extraction <- stri_match_first_regex(txt.lines.edited, ' *\\bp *(<|<=|=|>) *0*((?:\\d*\\.)?\\d+)[ ,;]*', case_insensitive=TRUE)
  p.reported.str <- rep("", nlines)
  indices_not_na <- which(!is.na(extraction[,1]))
  if( length(indices_not_na)) {
    p.reported.str[indices_not_na] <- paste0("p ", extraction[indices_not_na,2], " ", extraction[indices_not_na,3])
    txt.lines.edited[indices_not_na] <- stri_replace_first_fixed(txt.lines.edited[indices_not_na], extraction[indices_not_na,1], ' ')
  }



  # store p-value
  BIG[,P.REPORTED] <- as.numeric(extraction[,3])

  # store numeric value for each comparator used with p-value
  BIG[,P.COMP] <- unclass(factor(extraction[,2], c('<','<=', '=','>')))

  # store number of decimals of p-value
  BIG[,P.REPORTED.DECS] <- decimals(extraction[,3])

  # get indices depending on p-value specification
  indices_p     <- which(!is.na(BIG[,P.REPORTED]))
  indices_p_lt  <- which(BIG[,P.COMP] == 1)
  indices_p_leq <- which(BIG[,P.COMP] == 2)
  indices_p_eq  <- which(BIG[,P.COMP] == 3)
  indices_p_gt  <- which(BIG[,P.COMP] == 4)

  # find critical value and extract it
  extraction <- stri_match_first_regex(txt.lines.edited, ' *\\bcrit *= *((?:\\d*\\.)?\\d+)[ ,;]*', case_insensitive=TRUE)
  indices_not_na <- which(!is.na(extraction[,1]))
  if( length(indices_not_na)) {
    txt.lines.edited[indices_not_na] <- stri_replace_first_fixed(txt.lines.edited[indices_not_na], extraction[indices_not_na,1], ' ')
  }

  # store critical value
  BIG[,CRIT.VALUE] <- as.numeric(extraction[,2])

  # find "one-tailed" and extract it
  extraction <- stri_match_first_regex(txt.lines.edited, ' *\\b(one-tailed|1-tailed|one|1t)\\b[ ,;]*', case_insensitive=TRUE)
  # store if one-tailed was specified
  BIG[,ONE.TAILED] <- !is.na(extraction[,1])
  indices_one_tailed <- which(BIG[,ONE.TAILED] == 1)
  if( length(indices_one_tailed)) {
    txt.lines.edited[indices_one_tailed] <- stri_replace_first_fixed(txt.lines.edited[indices_one_tailed], extraction[indices_one_tailed,1], ' ')
  }
  
  
  # ---------------------------------------------------------------------
  #  find study design for t-values: between subjects (BS) or within-subjects (WS)?
  extraction <- stri_match_first_regex(txt.lines.edited, ' *\\b(BS|WS)\\b[ ,;]*', case_insensitive=TRUE)
  
  BIG[,STUDYDESIGN] <- as.numeric(factor(extraction[,2], levels=c("BS", "WS")))
  indices_T_BS <- which(BIG[,STUDYDESIGN] == 1)
  indices_T_WS <- which(BIG[,STUDYDESIGN] == 2)
  if( length(indices_T_BS)) {
    txt.lines.edited[indices_T_BS] <- stri_replace_first_fixed(txt.lines.edited[indices_T_BS], extraction[indices_T_BS,1], ' ')
  }
  if( length(indices_T_WS)) {
    txt.lines.edited[indices_T_WS] <- stri_replace_first_fixed(txt.lines.edited[indices_T_WS], extraction[indices_T_WS,1], ' ')
  }
  

  # find Cohen's d with and without CI/SE and extract it
  extraction <- stri_match_first_regex(txt.lines.edited, ' *\\bd *= *(-)?0*((?:\\d*\\.)?\\d+) *(?:\\[ *(-?(?:\\d*\\.)?\\d+) *[,;] *(-?(?:\\d*\\.)?\\d+) *\\]|[,; ]*se *= *((?:\\d*\\.)?\\d+))?[ ,;]*', case_insensitive=TRUE)
  indices_cohens_d <- which(!is.na(extraction[,1]))
  if( length(indices_cohens_d)) {
    txt.lines.edited[indices_cohens_d] <- stri_replace_first_fixed(txt.lines.edited[indices_cohens_d], extraction[indices_cohens_d,1], ' ')
  }

  # store Cohen's d and number of decimals
  BIG[,D.REPORTED] <- as.numeric(extraction[,3])
  BIG[,D.REPORTED.SIGN] <- ifelse(is.na(extraction[,2]), 1, -1)
  BIG[,D.REPORTED] <- BIG[,D.REPORTED] * BIG[,D.REPORTED.SIGN]

  BIG[,D.REPORTED.DECS] <- decimals(extraction[,3])

  # has Cohen's d?
  indices_cohens_d <- which(!is.na(BIG[,D.REPORTED]))

  # save how it was reported
  d.reported.str <- rep("", nlines)
  if(length(indices_cohens_d)) {
    d.reported.str[indices_cohens_d] <- paste0("d = ", extraction[indices_cohens_d, 3])
  }

  # CI specified for Cohen's d?
  indices_cohens_d_has_ci <- which(!is.na(extraction[,4]))

  # store Cohen's d upper and lower limits if specified
  if(length(indices_cohens_d_has_ci)) {
      BIG[indices_cohens_d_has_ci, D.REPORTED.LOWER] <- as.numeric(extraction[indices_cohens_d_has_ci, 4])
      BIG[indices_cohens_d_has_ci, D.REPORTED.UPPER] <- as.numeric(extraction[indices_cohens_d_has_ci, 5])

      # check ci
      indices_cohens_d_ci_not_in_order <- which(BIG[, D.REPORTED.LOWER] > BIG[, D.REPORTED.UPPER])
      if(length(indices_cohens_d_ci_not_in_order))
        errors[indices_cohens_d_ci_not_in_order] <- paste0(errors[indices_cohens_d_ci_not_in_order], "\nLower bounds of CI for Cohen's d are greater than upper bounds!")

      indices_cohens_d_ci_not_in_order <- which(BIG[, D.REPORTED.LOWER] > BIG[,D.REPORTED])
      if(length(indices_cohens_d_ci_not_in_order))
        errors[indices_cohens_d_ci_not_in_order] <- paste0(errors[indices_cohens_d_ci_not_in_order], "\nLower bounds of CI for Cohen's d are greater than Cohen's d!")

      indices_cohens_d_ci_not_in_order <- which(BIG[, D.REPORTED.UPPER] < BIG[,D.REPORTED])
      if(length(indices_cohens_d_ci_not_in_order))
        errors[indices_cohens_d_ci_not_in_order] <- paste0(errors[indices_cohens_d_ci_not_in_order], "\nUpper bounds of CI for Cohen's d are less than Cohen's d!")
  }


  # SE specified for Cohen's d?
  indices_cohens_d_has_se <- which(!is.na(extraction[,6]))

  # compute Cohen's d upper and lower limits using SE if specified and store them
  if(length(indices_cohens_d_has_se)) {
      SE <- as.numeric(extraction[indices_cohens_d_has_se, 6]) * qnorm(0.975)
      BIG[indices_cohens_d_has_se, D.REPORTED.LOWER] <- BIG[indices_cohens_d_has_se, D.REPORTED] - SE
      BIG[indices_cohens_d_has_se, D.REPORTED.UPPER] <- BIG[indices_cohens_d_has_se, D.REPORTED] + SE
  }

  # now switch and change sign of upper and lower limits if Cohens'd is negative
  indices_cohens_d_neg <- which(BIG[,D.REPORTED] < 0)
  if(length(indices_cohens_d_neg)) {
    BIG[indices_cohens_d_neg, c(D.REPORTED.UPPER, D.REPORTED.LOWER)] <- -BIG[indices_cohens_d_neg, c(D.REPORTED.LOWER, D.REPORTED.UPPER)]
    BIG[indices_cohens_d_neg, D.REPORTED] <- -BIG[indices_cohens_d_neg, D.REPORTED]
  }


  # trim rest which couldn't be parsed
  txt.lines.edited <- stri_trim_both(txt.lines.edited)

  # find lines which have an unparseable part
  indices_wrong_syntax <- which(!stri_isempty(txt.lines.edited) )
  if(length(indices_wrong_syntax) >0 )
    errors[indices_wrong_syntax] <- paste0(errors[indices_wrong_syntax], "\nSyntax error. Are there any illegal expressions? Are there conflicting definitions?")

  # set default crit.value when no crit value has been specified
  indices_crit_na <- which(is.na(BIG[, CRIT.VALUE]))
  BIG[indices_crit_na, CRIT.VALUE] <- ifelse(BIG[indices_crit_na, ONE.TAILED], .10, .05)


  # ---------------------------------------------------------------------
  #  error detection / define error messages
  
  indices_df1_missing <- which((is_t | is_chi2 | is_r) & !has_df1)
  if(length(indices_df1_missing))
    errors[indices_df1_missing] <- paste0(errors[indices_df1_missing], "\nStatistic needs specification of df!")

  indices_df2_missing <- which(is_f & !has_df2)
  if(length(indices_df2_missing))
    errors[indices_df2_missing] <- paste0(errors[indices_df2_missing], "\nStatistic needs specification of second df!")

  indices_excessive_df2 <- which((is_t | is_r | is_z) & has_df2)
  if(length(indices_excessive_df2))
    errors[indices_excessive_df2] <- paste0(errors[indices_excessive_df2], "\nStatistic has two dfs but only one df allowed!")

  indices_df_zero <- which(BIG[,DF1] == 0 || BIG[,DF2] == 0)
  if(length(indices_df_zero))
    errors[indices_df_zero] <- paste0(errors[indices_df_zero], "\nDfs of statistic must be greater than zero!")

  indices_df1_real <- which((is_chi2 | is_z | is_r) & (round(BIG[,DF1]) != BIG[,DF1] ))
  if(length(indices_df1_real))
    errors[indices_df1_real] <- paste0(errors[indices_df1_real], "\nFirst df of statistic must be an integer value!")

  indices_df2_real <- which(is_chi2 & (round(BIG[,DF2]) != BIG[,DF2] ))
  if(length(indices_df2_real))
    errors[indices_df2_real] <- paste0(errors[indices_df2_real], "\nSecond df of statistic must be an integer value!")

  indices_stat_neg <- which((is_f | is_chi2) & BIG[,SIGN] == -1)
  if(length(indices_stat_neg))
    errors[indices_stat_neg] <- paste0(errors[indices_stat_neg], "\nStatistic must be greater or equal 0!")
  
  indices_p_outofrange <- which(is_p & (BIG[ ,STAT] > 1 | BIG[,SIGN] == -1))
  if(length(indices_p_outofrange))
    errors[indices_p_outofrange] <- paste0(errors[indices_p_outofrange], "\np-values must be between 0 and 1!")
  
  indices_stat_out_of_bounds <- which(is_r & BIG[,STAT] > 1)
  if(length(indices_stat_out_of_bounds))
    errors[indices_stat_out_of_bounds] <- paste0(errors[indices_stat_out_of_bounds], "\nStatistic must be >= -1 and <= +1!")

  indices_p_out_of_bounds <- which(BIG[,P.REPORTED] > 1)
  if(length(indices_p_out_of_bounds))
    errors[indices_p_out_of_bounds] <- paste0(errors[indices_p_out_of_bounds], "\np-value must be less or equal 1!")

  indices_crit_out_of_bounds <- which(BIG[,CRIT.VALUE] > 1)
  if(length(indices_crit_out_of_bounds))
    errors[indices_crit_out_of_bounds] <- paste0(errors[indices_crit_out_of_bounds], "\nCritical value must be less or equal 1!")
  
  indices_tF_without_design <- which((is_t | is_f) & is.na(BIG[,STUDYDESIGN]))
  if(length(indices_tF_without_design))
    warning[indices_tF_without_design] <- paste0(warning[indices_tF_without_design], "\nWarning: t- or F-value is reported without design. <i>Effect size computations assume a <u>between-subject</u> design!</i> Add \"; BS\" for between subject design; add \"; WS\" for within subject design)")


  # compute t-statistic
  if(length(indices_t))
  {
    BIG[indices_t, P.VALUE] <- pt(BIG[indices_t, STAT], BIG[indices_t, DF1], lower.tail=FALSE) * 2
	BIG[indices_t, P.VALUE.LOG] <- pt(BIG[indices_t, STAT], BIG[indices_t, DF1], lower.tail=FALSE, log.p = TRUE) + log(2)
	
    BIG[indices_t, N.APPROX] <- BIG[indices_t, DF1] + 2
	
	# Which studies are between-SS, which within-SS?
	BS_design <- which(BIG[,TYPE] == TYPE_T & (BIG[,STUDYDESIGN] == 1 | is.na(BIG[,STUDYDESIGN])))
	WS_design <- which(BIG[,TYPE] == TYPE_T & BIG[,STUDYDESIGN] == 2)
	
	# compute between effect size
	BIG[BS_design, D] <- (2*BIG[BS_design, STAT] / sqrt(BIG[BS_design, N.APPROX])) * BIG[BS_design, SIGN]
	
	# compute within effect size
	BIG[WS_design, D] <- (BIG[WS_design, STAT] / sqrt(BIG[WS_design, N.APPROX])) * BIG[WS_design, SIGN]
	
	BIG[indices_t, G] <- BIG[indices_t, D] * ( 1- (3/(4 * BIG[indices_t, N.APPROX] - 1)))
    
	#BIG[indices_t, D.VAR] <- 4/BIG[indices_t, N.APPROX] + BIG[indices_t, D]^2 / (2*BIG[indices_t, N.APPROX])
	BIG[indices_t, D.VAR] <- (4 + BIG[indices_t, D]^2) / BIG[indices_t, N.APPROX]
  }

  # compute pearson's r
  if(length(indices_r))
  {
	# t value from correlation
	t.r <- BIG[indices_r, STAT]*sqrt(BIG[indices_r, DF1] / (1 - BIG[indices_r, STAT]^2))
    BIG[indices_r, P.VALUE] <- pt(t.r, BIG[indices_r, DF1], lower.tail=FALSE) * 2
	BIG[indices_r, P.VALUE.LOG] <- pt(t.r, BIG[indices_r, DF1], lower.tail=FALSE, log.p = TRUE) + log(2)
    BIG[indices_r, D] <- BIG[indices_r, SIGN] * (2 * BIG[indices_r, STAT]) / sqrt(1 - BIG[indices_r, STAT]^2)
    BIG[indices_r, G] <- BIG[indices_r, D] * (1 - (3 / (4 * BIG[indices_r, DF1] - 1)))
    BIG[indices_r, N.APPROX] <- BIG[indices_r, DF1] + 2

	var.r <- (1 - BIG[indices_r, STAT]^2)^2/(BIG[indices_r, N.APPROX] - 1)
	BIG[indices_r, D.VAR] <-  4 * var.r/(1 - BIG[indices_r, STAT]^2)^3
  }

  # compute f-statistic
  if(length(indices_f))
  {
    indices_f_df1_is_1 <- which(BIG[,TYPE] == 3 & BIG[,DF1] == 1)
    if(length(indices_f_df1_is_1))
    {
      BIG[indices_f_df1_is_1, N.APPROX] <- BIG[indices_f_df1_is_1, DF2] + 2
	  
  	  # Which studies are between-SS, which within-SS?
  	  BS_design <- which(BIG[,TYPE] == TYPE_F & BIG[,DF1] == 1 & (BIG[,STUDYDESIGN] == 1 | is.na(BIG[,STUDYDESIGN])))
  	  WS_design <- which(BIG[,TYPE] == TYPE_F & BIG[,DF1] == 1 & BIG[,STUDYDESIGN] == 2)
	  
  	  # compute between effect size
  	  BIG[BS_design, D] <- (2*sqrt(BIG[BS_design, STAT]) / sqrt(BIG[BS_design, N.APPROX])) * BIG[BS_design, SIGN]
	  
  	  # compute within effect size
  	  BIG[WS_design, D] <- sqrt(BIG[WS_design, STAT] /BIG[WS_design, N.APPROX]) * BIG[WS_design, SIGN]

      BIG[indices_f_df1_is_1, G] <- BIG[indices_f_df1_is_1, D] * ( 1- (3/(4 * BIG[indices_f_df1_is_1, N.APPROX] - 1)))
	  
	  #BIG[indices_f_df1_is_1, D.VAR] <- 4/BIG[indices_f_df1_is_1, N.APPROX] + BIG[indices_f_df1_is_1, D]^2 / (2*BIG[indices_f_df1_is_1, N.APPROX])
	  BIG[indices_f_df1_is_1, D.VAR] <- (4 + BIG[indices_f_df1_is_1, D]^2) / BIG[indices_f_df1_is_1, N.APPROX]
    }
	
	BIG[indices_f, P.VALUE.LOG] <- pf(BIG[indices_f, STAT], BIG[indices_f, DF1], BIG[indices_f, DF2], lower.tail=FALSE, log.p = TRUE)
	BIG[indices_f, P.VALUE] <- exp(BIG[indices_f, P.VALUE.LOG])
  }

  # compute z-value
  if(length(indices_z) ) {
    BIG[indices_z, P.VALUE] <- pnorm(BIG[indices_z, STAT], lower.tail=FALSE) * 2
	BIG[indices_z, P.VALUE.LOG] <- pnorm(BIG[indices_z, STAT], lower.tail=FALSE, log.p = TRUE) + log(2)

    indices_z_df_exists <- which(BIG[, TYPE] == TYPE_Z & !is.na(BIG[, DF1]))
    if(length(indices_z_df_exists) ){
      # If a number is provided for z it's the sample size
      BIG[indices_z_df_exists, N.APPROX] <- BIG[indices_z_df_exists,DF1]  
      BIG[indices_z_df_exists, D] <- (BIG[indices_z_df_exists, STAT] / sqrt(BIG[indices_z_df_exists, N.APPROX])) * BIG[indices_z_df_exists, SIGN]
      BIG[indices_z_df_exists, G] <- BIG[indices_z_df_exists, D] * (1 - (3 / (4 * BIG[indices_z_df_exists, N.APPROX] - 1)))
    }
  }

  # compute chi2-statistic
  if(length(indices_chi2)){
    # If two numbers are provided for chi2, the first are the dfs, the second is the sample size
    BIG[indices_chi2, P.VALUE] <- pchisq(BIG[indices_chi2, STAT], BIG[indices_chi2, DF1], lower.tail=FALSE)
	BIG[indices_chi2, P.VALUE.LOG] <- pchisq(BIG[indices_chi2, STAT], BIG[indices_chi2, DF1], lower.tail=FALSE, log.p = TRUE)

    indices_chi2_with_n <- which(BIG[,TYPE] == TYPE_CHI2 & BIG[,DF1] == 1 & !is.na(BIG[,DF2]))

    if(length(indices_chi2_with_n)) {
      BIG[indices_chi2_with_n, N.APPROX] <- BIG[indices_chi2_with_n, DF2]
      BIG[indices_chi2_with_n, D] <- sqrt(BIG[indices_chi2_with_n, STAT] / BIG[indices_chi2_with_n, N.APPROX])
      BIG[indices_chi2_with_n, D] <- 2 * BIG[indices_chi2_with_n, D] * sqrt((BIG[indices_chi2_with_n, N.APPROX] - 1)/(BIG[indices_chi2_with_n, N.APPROX] * (1 - BIG[indices_chi2_with_n, D]^2))) * abs(BIG[indices_chi2_with_n, D])/BIG[indices_chi2_with_n, D]
      BIG[indices_chi2_with_n, G] <- BIG[indices_chi2_with_n, D] * (1 - (3/(4 * (BIG[indices_chi2_with_n, N.APPROX]-2) - 1)))
    }
  }
  
  # compute directly entered p-value
  if(length(indices_pdirect) ) {
	
    BIG[indices_pdirect, P.REPORTED] <- BIG[indices_pdirect, STAT]
	
	# assume that the directly reported p-value is the correct p-value;	
	BIG[indices_pdirect, P.VALUE] <- BIG[indices_pdirect, STAT]
	BIG[indices_pdirect, P.VALUE.LOG] <- log(BIG[indices_pdirect, STAT])
	
	# If one-tailed, the actual p-value is double the size
	BIG[indices_pdirect, P.VALUE][BIG[indices_pdirect, ONE.TAILED] == 1] <- BIG[indices_pdirect, P.VALUE][BIG[indices_pdirect, ONE.TAILED] == 1]*2
	BIG[indices_pdirect, P.VALUE.LOG][BIG[indices_pdirect, ONE.TAILED] == 1] <- BIG[indices_pdirect, P.VALUE.LOG][BIG[indices_pdirect, ONE.TAILED] == 1]+ log(2)

	indices_pdirect_df_exists <- which(BIG[, TYPE] == TYPE_P & !is.na(BIG[, DF1]))
    if(length(indices_pdirect_df_exists) ){
		
	# conversion formula for converting p to d, see: https://books.google.de/books?id=GC42CwAAQBAJ&pg=PA100&lpg=PA100&dq=meta-analysis+convert+chi2+to+d+degrees+of+freedom&source=bl&ots=_c4EHEyRis&sig=yIaUDAbQ3RfPvLTfEE7-thpEXys&hl=de&sa=X&ved=0ahUKEwiyp-XntdnLAhWipnIKHab5CaA4ChDoAQhfMAg#v=onepage&q=meta-analysis%20convert%20chi2%20to%20d%20degrees%20of%20freedom&f=false
	
  	  BIG[indices_pdirect_df_exists, N.APPROX] <- BIG[indices_pdirect_df_exists, DF1]+2
	
	  d <- (qnorm(1-(BIG[indices_pdirect_df_exists, STAT]/2))*2)/sqrt(BIG[indices_pdirect_df_exists, N.APPROX])
      BIG[indices_pdirect_df_exists, D] <- d * BIG[indices_pdirect_df_exists, SIGN]
	  
      BIG[indices_pdirect_df_exists, G] <- BIG[indices_pdirect_df_exists, D] * (1 - (3 / (4 * BIG[indices_pdirect_df_exists, N.APPROX] - 1)))
    }
  }


  # store significance (computed p-value must be less than (un)specified critical value)
  BIG[, SIGNIFICANT] <- BIG[, P.VALUE] < BIG[, CRIT.VALUE]

  # store p-value one-tailed by dividing p-value in half
  BIG[, P.VALUE.ONE] <- BIG[, P.VALUE] / 2

  # "actual p-value" must be in accordance to one-tailed specification
  BIG[, P.ACTUAL] <- BIG[, P.VALUE]
  BIG[indices_one_tailed, P.ACTUAL] <- BIG[indices_one_tailed, P.VALUE.ONE]

  # init error
  #BIG[, P.REPORTED.ERROR]     <- rep(0, nlines)
  #BIG[, P.REPORTED.ERROR.DIRECTION] <- rep(0, nlines)

  # check all "p < ?" specifications for reporting errors
  if(length(indices_p_lt)){
    BIG[indices_p_lt, P.REPORTED.ERROR] <- BIG[indices_p_lt, P.ACTUAL] >= BIG[indices_p_lt, P.REPORTED]
    BIG[indices_p_lt, P.REPORTED.ERROR.DIRECTION] <- -BIG[indices_p_lt, P.REPORTED.ERROR]
  }

  # check all "p <= ?" specifications for reporting errors
  if(length(indices_p_leq)) {
    BIG[indices_p_leq, P.REPORTED.ERROR] <- BIG[indices_p_leq, P.ACTUAL] > BIG[indices_p_leq, P.REPORTED]
    BIG[indices_p_leq, P.REPORTED.ERROR.DIRECTION] <- -BIG[indices_p_leq, P.REPORTED.ERROR]
  }

  # check all "p > ?" specifications for reporting errors
  if(length(indices_p_gt)){
    BIG[indices_p_gt, P.REPORTED.ERROR] <- BIG[indices_p_gt, P.ACTUAL] <= BIG[indices_p_gt, P.REPORTED]
    BIG[indices_p_gt, P.REPORTED.ERROR.DIRECTION] <- BIG[indices_p_gt, P.REPORTED.ERROR]
  }

  # check all "p = ?" specifications for reporting errors
  if(length(indices_p_eq)){
    difference <- BIG[indices_p_eq, P.REPORTED] - round(BIG[indices_p_eq, P.ACTUAL], BIG[indices_p_eq, P.REPORTED.DECS])
    BIG[indices_p_eq, P.REPORTED.ERROR.DIRECTION] <- sign(difference)
    BIG[indices_p_eq, P.REPORTED.ERROR] <- difference != 0
  }

  # check d
  indices_cohens_d2 <- which(!is.na(BIG[, D]) & !is.na(BIG[, D.REPORTED]))
  if(length(indices_cohens_d2)) {
    difference <- BIG[indices_cohens_d2, D.REPORTED] - round(BIG[indices_cohens_d2, D], BIG[indices_cohens_d2, D.REPORTED.DECS])
    BIG[indices_cohens_d2, D.REPORTED.ERROR.DIRECTION] <- sign(difference)
    BIG[indices_cohens_d2, D.REPORTED.ERROR] <- difference != 0
  }

 
  # ---------------------------------------------------------------------
  # Bring errors into a nice shape
  
  # find indices of lines with and without error
  has_no_error <- stri_isempty(errors)
  indices_no_error <- which(has_no_error)
  indices_error    <- which(!has_no_error)

  # produce error message
  ERRORS <- NULL
  if(length(indices_error)) {
    ERRORS <- matrix(
      c(
        as.character(indices_not_empty[indices_error]),
        txt.lines[indices_error],
        errors[indices_error]
      ),
      ncol=3
    )
  }
  
  # ---------------------------------------------------------------------
  # Bring warnings into a nice shape
  
  # find indices of lines with and without error
  has_no_warning <- stri_isempty(warning)
  indices_no_warning <- which(has_no_warning)
  indices_warning    <- which(!has_no_warning)

  # produce warning message
  WARNINGS <- NULL
  if(length(indices_warning)) {
    WARNINGS <- matrix(
      c(
        as.character(indices_not_empty[indices_warning]),
        txt.lines[indices_warning],
        warning[indices_warning]
      ),
      ncol=3
    )
  }


  p.reported.error.direction <- c("smaller", "", "", "bigger")[match(BIG[,P.REPORTED.ERROR.DIRECTION], c(-1,0,NA,1))]
  d.reported.error.direction <- c("smaller", "", "", "bigger")[match(BIG[,D.REPORTED.ERROR.DIRECTION], c(-1,0,NA,1))]

  global.reporting.error <- is.one(BIG[,P.REPORTED.ERROR]) | is.one(BIG[,D.REPORTED.ERROR])

  # convert data to data.frame as return value
  res <- data.frame(
    line = indices_not_empty,
    paper_id = PAPER_ID,
    study_id = STUDY_ID,
    focal = as.logical(BIG[,IS.FOCAL]),
    type	= type.factor,
    df1 	= BIG[,DF1],
    df2 	= BIG[,DF2],
    d		  = BIG[,D],
    g		  = BIG[,G],
    n.approx = BIG[,N.APPROX],
    statistic = BIG[,STAT],
    p.value	= BIG[,P.VALUE],
    p.value.one	= BIG[,P.VALUE.ONE],
    p.reported = p.reported.str,
    p.crit	= BIG[,CRIT.VALUE],
    significant = as.logical(BIG[,SIGNIFICANT]),
    one.tailed = as.logical(BIG[,ONE.TAILED]),
    reporting.error = as.logical(BIG[,P.REPORTED.ERROR]),
    error.direction = p.reported.error.direction,
    parse.error = !has_no_error,
    d.reported = BIG[,D.REPORTED],
    d.reported.str = d.reported.str,
    d.reported.lower = BIG[,D.REPORTED.LOWER],
    d.reported.upper = BIG[,D.REPORTED.UPPER],
    d.reported.error = as.logical(BIG[,D.REPORTED.ERROR]),
    d.reported.error.direction = d.reported.error.direction,
    global.reporting.error = global.reporting.error,
	p.value.log	= BIG[,P.VALUE.LOG],
	d.var = BIG[, D.VAR],
	d.se = sqrt(BIG[, D.VAR]),
	studydesign = BIG[, STUDYDESIGN],
    stringsAsFactors = FALSE
  )


  # add attribute warnings to object "res"
  attr(res, 'ERRORS') <- ERRORS
  attr(res, 'WARNINGS') <- WARNINGS

  # return data.frame
  return(res)
}


parse_ES("t")
#
# parse_ES("t(72) = 4.80; BS
# t(72) = 4.80; WS
# t(72) = 5.80; WS")
#
# (p <- parse_ES("t(47, 4)=2.1"))
# (p <- parse_ES("t(47, 4)=2.1; BS"))
# (p <- parse_ES("t(47, 4)=2.1; WS"))
#
# (p <- parse_ES("F(1, 47)=4.41"))
# (p <- parse_ES("F(1, 47)=4.41; BS"))
# (p <- parse_ES("F(1, 47)=4.41; WS"))
nicebread/p-checker documentation built on May 24, 2023, 8:35 a.m.