Nothing
# Copyright 2010-2021 Meik Michalke <meik.michalke@hhu.de>
#
# This file is part of the R package koRpus.
#
# koRpus is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# koRpus is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with koRpus. If not, see <http://www.gnu.org/licenses/>.
# this internal function does the real readability calculations,
# so it's mostly called by readability()
## TODO:
# Fry Graph
# Raygor Estimate Graph
# https://en.wikipedia.org/wiki/Raygor_readability_estimate
# - harris-jacobson wide range (noch keine formel)
# - dolch sight words suite (noch keine formel)
# - mcalpine EFLAW
# devereaux (smith, s. kane)
# - grade eqivalent zum DRP
rdb_indices <- matrix(
c(
# fast sylls params validated
TRUE, FALSE, TRUE, TRUE, # ARI
FALSE, FALSE, FALSE, TRUE, # ARI.NRI
FALSE, FALSE, FALSE, FALSE, # ARI.simple
TRUE, FALSE, TRUE, TRUE, # Bormuth
TRUE, TRUE, TRUE, FALSE, # Coleman
TRUE, FALSE, TRUE, TRUE, # Coleman.Liau
TRUE, FALSE, TRUE, TRUE, # Dale.Chall
FALSE, FALSE, FALSE, TRUE, # Dale.Chall.old
FALSE, FALSE, FALSE, TRUE, # Dale.Chall.PSK
TRUE, FALSE, TRUE, FALSE, # Danielson.Bryan
TRUE, FALSE, TRUE, FALSE, # Dickes.Steiwer
TRUE, FALSE, FALSE, TRUE, # DRP
TRUE, TRUE, TRUE, FALSE, # ELF
TRUE, TRUE, TRUE, TRUE, # Farr.Jenkins.Paterson
FALSE, TRUE, FALSE, TRUE, # Farr.Jenkins.Paterson.PSK
TRUE, TRUE, TRUE, TRUE, # Flesch
FALSE, TRUE, FALSE, FALSE, # Flesch.Brouwer
FALSE, TRUE, FALSE, FALSE, # Flesch.de
FALSE, TRUE, FALSE, TRUE, # Flesch.es
FALSE, TRUE, FALSE, FALSE, # Flesch.fr
TRUE, TRUE, TRUE, TRUE, # Flesch.Kincaid
FALSE, TRUE, FALSE, FALSE, # Flesch.nl
FALSE, TRUE, FALSE, TRUE, # Flesch.nl-b
FALSE, TRUE, FALSE, TRUE, # Flesch.PSK
FALSE, TRUE, FALSE, TRUE, # Flesch.Szigriszt
FALSE, TRUE, TRUE, TRUE, # FOG
FALSE, TRUE, FALSE, TRUE, # FOG.NRI
FALSE, TRUE, FALSE, TRUE, # FOG.PSK
TRUE, TRUE, TRUE, TRUE, # FORCAST
FALSE, TRUE, FALSE, TRUE, # FORCAST.RGL
TRUE, TRUE, FALSE, FALSE, # Fucks
TRUE, FALSE, TRUE, FALSE, # Gutierrez
TRUE, FALSE, TRUE, FALSE, # Harris.Jacobson
TRUE, TRUE, TRUE, TRUE, # Linsear.Write
TRUE, FALSE, TRUE, TRUE, # LIX
TRUE, TRUE, TRUE, FALSE, # nWS
TRUE, FALSE, TRUE, TRUE, # RIX
TRUE, TRUE, TRUE, TRUE, # SMOG
FALSE, TRUE, FALSE, FALSE, # SMOG.C
FALSE, TRUE, FALSE, FALSE, # SMOG.de
FALSE, TRUE, FALSE, TRUE, # SMOG.simple
TRUE, FALSE, TRUE, TRUE, # Spache
FALSE, FALSE, FALSE, TRUE, # Spache.de
FALSE, FALSE, FALSE, TRUE, # Spache.old
TRUE, TRUE, TRUE, FALSE, # Strain
TRUE, FALSE, TRUE, FALSE, # Traenkle.Bailer
TRUE, TRUE, TRUE, FALSE, # TRI
TRUE, TRUE, TRUE, TRUE, # Tuldava
TRUE, TRUE, TRUE, TRUE, # Wheeler.Smith
FALSE, TRUE, FALSE, TRUE # Wheeler.Smith.de
),
ncol=4,
byrow=TRUE,
dimnames=list(
c(
"ARI",
"ARI.NRI",
"ARI.simple",
"Bormuth",
"Coleman",
"Coleman.Liau",
"Dale.Chall",
"Dale.Chall.old",
"Dale.Chall.PSK",
"Danielson.Bryan",
"Dickes.Steiwer",
"DRP",
"ELF",
"Farr.Jenkins.Paterson",
"Farr.Jenkins.Paterson.PSK",
"Flesch",
"Flesch.Brouwer",
"Flesch.de",
"Flesch.es",
"Flesch.fr",
"Flesch.Kincaid",
"Flesch.nl",
"Flesch.nl-b",
"Flesch.PSK",
"Flesch.Szigriszt",
"FOG",
"FOG.NRI",
"FOG.PSK",
"FORCAST",
"FORCAST.RGL",
"Fucks",
"Gutierrez",
"Harris.Jacobson",
"Linsear.Write",
"LIX",
"nWS",
"RIX",
"SMOG",
"SMOG.C",
"SMOG.de",
"SMOG.simple",
"Spache",
"Spache.de",
"Spache.old",
"Strain",
"Traenkle.Bailer",
"TRI",
"Tuldava",
"Wheeler.Smith",
"Wheeler.Smith.de"
),
c(
"fast",
"sylls",
"params",
"validated"
)
)
)
## function validate_parameters()
# - given: character string or list of parameter names
# - index: name of the index that all values of "given" qill be checked against
# if "all" a basic check is done whether params for unknown indizes are given
validate_parameters <- function(given, index){
if(identical(index, "all")){
valid <- rdb_indices[,"params"]
given <- names(given)
index <- "parameters"
} else {
valid <- sapply(unlist(rdb_parameters(index=index)), function(x) TRUE)
given <- names(unlist(given))
}
if(
any(
all(identical(index, "parameters"), isTRUE(all(valid[given]))),
all(sum(valid[given], na.rm=TRUE) == length(given), length(valid) == length(given))
)
){
return(TRUE)
} else {
invalid_params <- given[!given %in% names(valid)]
missing_params <- names(valid)[!names(valid) %in% given]
check_location <- paste0(" in \"", index, "\"")
if(length(missing_params)){
stop(simpleError(paste0("Missing parameters", check_location, ": \"", paste(missing_params, collapse="\", \""), "\"")))
} else {
stop(simpleError(paste0("Invalid parameters", check_location, ": \"", paste(invalid_params, collapse="\", \""), "\"")))
}
}
} ## end function validate_parameters()
## function check_parameters()
# takes provided parameters (if any) and tries to figure out if they should be
# used as-is or need to be replaced because they are a shortcut string
# the function calls validate_parameters() and returns a list of two
# elements, "p" with the parameters to use and "f" with the name of the
# index flavour
# - index: character string, name of the readability index
# - given: a list of parameters to check
# - short: a named list of character strings, where element names are shortcuts used
# as parameters and the value is to be used as its flavour; if missing
# the flavour will be "default", "custom", or identical to the shortcut
check_parameters <- function(
index,
given,
flav_names=NULL
){
if(length(given)){
if(all(length(given) == 1, is.null(names(given)))){
# should be a shortcut
flavour <- given
params <- rdb_parameters(index=index, flavour=given)
} else {
# should be custom parameters
validate_parameters(given=given, index=index)
flavour <- ifelse(
identical(given, rdb_parameters(index=index, flavour="default")),
"default",
"custom"
)
params <- given
}
} else {
# no parameters given, fall back to defaults
flavour <- "default"
params <- rdb_parameters(index=index, flavour=flavour)
}
# finally check for flavour name replacement
if(length(flav_names[[flavour]])){
flavour <- flav_names[[flavour]]
}
return(list(p=params, f=flavour))
} ## end function check_parameters()
## additional options:
# - analyze.text:
# TRUE for kRp.rdb.formulae(), i.e. get values from text object examination,
# FALSE to calculate with the values in txt.features
# - txt.features: a named list with key values needed to calculate
#' @importFrom sylly hyphenText hyphenText<-
kRp.rdb.formulae <- function(
txt.file=NULL,
hyphen=NULL,
index=c(),
parameters=list(),
word.lists=list(),
fileEncoding="UTF-8",
sentc.tag="sentc",
nonword.class="nonpunct",
nonword.tag=c(),
quiet=FALSE,
keep.input=NULL,
analyze.text=TRUE,
txt.features=list()
){
## TODO: validation
if(identical(index, "validation")){
message(paste0("
The following implementations have already been checked against various tools
to validate the correctness of calculation. This doesn't mean they always came to identical
results at once, since the accuracy of input data (like number of syllables or sentences)
varies considerably. But if these differences were manually corrected, the results were similar/identical:
- ARI [OUT, RDS, FRT, WDC]
- NRI [RDS (labeled as \"simplified\")]
- Bormuth Mean Cloze [RDS]
- Coleman-Liau [OUT, RDS]
- Dale-Chall [RDS]
- PSK [RDS]
- Dale-Chall (1948) [OKP]
- DRP [RDS]
- Farr-Jenkins-Paterson [RDS]
- PSK [RDS]
- Flesch [OUT, RDS, TAL, LLB, JRT, WDC]
- PSK [RDS]
- Szigriszt (es) [INF]
- Flesch.es [INF]
- Flesch-Kincaid [OUT, RDS, JRT, WDC]
- FOG [GFI, RDS, OUT, JRT, WDC]
- PSK [RDS]
- FORCAST [RDS]
- Harris-Jacobson (HJ2) [RDS]
- LIX [RDS]
- Linsear Write [FRT]
- RIX [RDS]
- SMOG [OUT, RDS, WDC]
- Spache [RDS]
- Wheeler-Smith [RDS, WHE]
These measures produce plausible results, but need checking:
- ARI simplified
- Coleman Formulas (1-4)
- Danielson-Bryan (1-2)
- Dickes-Steiwer's Handformel
- Easy Listening Formula
- Flesch.de
- Flesch.fr
- Flesch.nl
- some papers use 0.33 and other 0.93 for the average sentence length parameter!
- Flesch.Brouwer (nl)
- Fucks
- Gutierrez
- Harris-Jacobson (1-5)
- Neue Wiener Sachtextformeln (1-4)
- SMOG Qu
- SMOG C
- Strain
- Traenkle-Bailer
These measures look bogus:
- TRI
Tools used:
FRT: https://www.readabilityformulas.com/free-readability-formula-tests.php (17 apr 2021)
GFI: http://gunning-fog-index.com
INF: INFLESZ v1.0, http://www.legibilidad.com/home/descargas.html
JRT: https://juicystudio.com/services/readability.php (17 apr 2021)
LLB: http://www.leichtlesbar.ch/html/ (17 apr 2021)
OKP: http://www.lefthandlogic.com/htmdocs/tools/okapi/okapi.php
OUT: https://www.online-utility.org/english/readability_test_and_improve.jsp (17 apr 2021)
RDS: Readability Studio, version 3.2.7.0 (14 jan 2011)
TAL: http://www.textalyser.net
WDC: http://wordscount.info/wc/jsp/clear/analyze_smog.jsp (original SMOG implementation)
Other:
WHE: example from original article by Wheeler & Smith"
))
return(invisible(NULL))
} else {}
# see if just the default parameters should be returned:
if(identical(parameters, "dput")){
# rdb_parameters() is defined in koRpus-internal.rdb.params.grades.R
return(rdb_parameters("dput"))
} else {}
# check for given magic numbers
if(!is.list(parameters)){
stop(simpleError("Missing accurate paramteter list!"))
} else {
# first complain if unknown parameters supplied
validate_parameters(
names(parameters),
index="all"
)
}
all.valid.indices <- rownames(rdb_indices)
# activate all?
if(identical(index, "all")){
index <- rownames(rdb_indices)
} else {}
if(identical(index, "fast")){
# this should be like the defaults but without FOG
index <- rownames(rdb_indices)[rdb_indices[, "fast"]]
} else {}
need.sylls <- rownames(rdb_indices)[rdb_indices[, "sylls"]]
# use for keep.input
dropHyphen <- FALSE
if(isTRUE(analyze.text)){
#######################
## analyze.text=TRUE ##
#######################
lang <- language(txt.file)
docID <- doc_id(txt.file)
if(identical(nonword.class, "nonpunct")){
nonword.class <- kRp.POS.tags(lang, tags=c("punct","sentc"), list.classes=TRUE)
} else {}
tagged.words.only <- filterByClass(txt.file, corp.rm.class=nonword.class, corp.rm.tag=nonword.tag, update.desc=NULL)
if(is.null(describe(txt.file, doc_id=docID)[["all.words"]])){
describe(txt.file, doc_id=docID)[["all.words"]] <- tagged.words.only[["token"]]
} else {}
txt.desc <- describe(txt.file, doc_id=docID)
# check how to handle the hyphen parameter
# first see if there's results to re-use
if(all(hasFeature(txt.file, "hyphen"), is.null(hyphen))){
hyphen <- corpusHyphen(txt.file)
} else {}
# we don't need hyphenation for certain formulas,
# then we'll skip that step automatically
if(any(index %in% need.sylls)){
if(is.null(hyphen)){
hyphen <- hyphen(txt.file, corp.rm.class=nonword.class, corp.rm.tag=nonword.tag, quiet=quiet)
} else {
stopifnot(inherits(hyphen, "kRp.hyphen"))
dropHyphen <- TRUE
}
sylls.available <- TRUE
} else {
if(inherits(hyphen, "kRp.hyphen")){
sylls.available <- TRUE
dropHyphen <- TRUE
} else {
# we'll just create an empty object to not disturb the other functions
hyphen <- new("kRp.hyphen")
sylls.available <- FALSE
dropHyphen <- TRUE
}
}
if(identical(sentc.tag, "sentc")){
sentc.tag <- kRp.POS.tags(lang, tags="sentc", list.tags=TRUE)
} else {}
## some global calculations, for use in several indices
# we'll use some default minimum values for the number of sentences
# and average sentence length
txt.words.only <- txt.desc$all.words
num.sentences <- ifelse(txt.desc$sentences < 1, 1, txt.desc$sentences)
num.words <- txt.desc$words
num.all.chars <- txt.desc$all.chars
# txt.desc$letters had all digits removed
num.letters <- txt.desc$letters[["all"]]
lttr.distrib <- txt.desc$lttr.distrib
fixed.letters <- txt.desc$letters
avg.sntc.len <- ifelse(txt.desc$sentences < 1, num.words, txt.desc$avg.sentc.length)
avg.word.len <- txt.desc$avg.word.length
sntc.per.word <- num.sentences / num.words
sntc.per100 <- sntc.per.word * 100
lett.per100 <- num.letters * 100 / num.words
num.punct <- txt.desc$punct
if(isTRUE(sylls.available)){
num.syll <- describe(hyphen)[["num.syll"]]
num.monosyll <- describe(hyphen)[["syll.distrib"]]["num", 1]
pct.monosyll <- num.monosyll * 100 / num.words
syll.distrib <- describe(hyphen)[["syll.distrib"]]
syll.uniq.distrib <- describe(hyphen)[["syll.uniq.distrib"]]
avg.syll.word <- describe(hyphen)[["avg.syll.word"]]
syll.per100 <- describe(hyphen)[["syll.per100"]]
fixed.syllables <- distrib.to.fixed(syll.distrib, all.values=num.syll, idx="s")
} else {
num.syll <- NA
num.monosyll <- NA
pct.monosyll <- NA
syll.distrib <- NA
syll.uniq.distrib <- NA
avg.syll.word <- NA
syll.per100 <- NA
fixed.syllables <- NULL
}
# check if the "Dickes.Steiwer" parameters are set for TTR
DS.case.sens <- check_parameters(
index="Dickes.Steiwer",
given=parameters[["Dickes.Steiwer"]]
)[["case.sens"]]
num.TTR <- slot(TTR(txt.file, case.sens=DS.case.sens, quiet=TRUE), "TTR")
# some word classes needed by one formula or the other
num.conjunctions <- sum(taggedText(txt.file)$wclass %in% "conjunction", na.rm=TRUE)
num.prepositions <- sum(taggedText(txt.file)$wclass %in% "preposition", na.rm=TRUE)
num.pronouns <- sum(taggedText(txt.file)$wclass %in% "pronoun", na.rm=TRUE)
num.foreign <- sum(taggedText(txt.file)$wclass %in% "foreign", na.rm=TRUE)
only.kRp.tags <- all(taggedText(txt.file)[["tag"]] %in% kRp.POS.tags("kRp", list.tags=TRUE))
} else {
########################
## analyze.text=FALSE ##
########################
# get needed values from txt.features
txt.desc <- txt.features
txt.words.only <- txt.features$words # needed for measures with word lists
num.sentences <- txt.features$sentences
num.words <- txt.features$words
num.all.chars <- txt.features$all.chars # danielson-bryan
num.letters <- txt.features$letters[["all"]]
num.punct <- txt.features$punct
lttr.distrib <- distrib.from.fixed(txt.features$letters, all.words=num.words, idx="l", unaccounted="x")
txt.desc[["lttr.distrib"]] <- lttr.distrib
fixed.letters <- txt.features$letters
avg.sntc.len <- num.words / num.sentences
avg.word.len <- num.letters / num.words
sntc.per.word <- num.sentences / num.words
sntc.per100 <- sntc.per.word * 100
lett.per100 <- avg.word.len * 100
num.syll <- txt.features$syllables[["all"]]
num.monosyll <- txt.features$syllables[["s1"]]
pct.monosyll <- num.monosyll * 100 / num.words
syll.distrib <- distrib.from.fixed(txt.features$syllables, all.words=num.words, idx="s", unaccounted="x")
syll.uniq.distrib <- txt.features$syll.uniq.distrib
fixed.syllables <- txt.features$syllables
avg.syll.word <- num.syll / num.words
syll.per100 <- avg.syll.word * 100
num.TTR <- txt.features$TTR
num.conjunctions <- txt.features$conjunctions
num.prepositions <- txt.features$prepositions
num.pronouns <- txt.features$pronouns
num.foreign <- txt.features$foreign
only.kRp.tags <- FALSE
if(is.null(txt.features$lang)){
lang <- character()
} else {
lang <- txt.features$lang
}
if(!inherits(hyphen, "kRp.hyphen")){
hyphen <- new("kRp.hyphen",
desc=list(
num.syll=num.syll,
syll.distrib=syll.distrib,
syll.uniq.distrib=NA,
avg.syll.word=avg.syll.word,
syll.per100=syll.per100
))
} else {
dropHyphen <- TRUE
}
}
# keep the descriptives in the results
desc <- list(
sentences=num.sentences,
words=num.words,
letters=fixed.letters,
all.chars=num.all.chars,
syllables=fixed.syllables,
lttr.distrib=lttr.distrib,
syll.distrib=syll.distrib,
syll.uniq.distrib=syll.uniq.distrib,
punct=num.punct,
conjunctions=num.conjunctions,
prepositions=num.prepositions,
pronouns=num.pronouns,
foreign=num.foreign,
TTR=num.TTR,
avg.sentc.length=avg.sntc.len,
avg.word.length=avg.word.len,
avg.syll.word=avg.syll.word,
sntc.per.word=sntc.per.word,
sntc.per100=sntc.per100,
lett.per100=lett.per100,
syll.per100=syll.per100,
FOG.hard.words=NULL,
Bormuth.NOL=NULL,
Dale.Chall.NOL=NULL,
Harris.Jacobson.NOL=NULL,
Spache.NOL=NULL
)
# initialize result object
all.results <- kRp_readability(
lang=lang,
desc=desc,
param=parameters
)
if(isTRUE(analyze.text)){
if(isTRUE(keep.input)){
feature(txt.file, "hyphen") <- hyphen
} else if(is.null(keep.input)){
if(!isTRUE(dropHyphen)){
feature(txt.file, "hyphen") <- hyphen
} else {}
} else {}
} else {}
#####################################
## readability measures start here ##
#####################################
## Automated Readability Index (ARI)
if("ARI" %in% index){
pf <- check_parameters(
index="ARI",
given=parameters[["ARI"]],
flav_names=list(simple="simplified")
)
ARI.grade <- pf[["p"]][["asl"]] * avg.sntc.len + pf[["p"]][["awl"]] * avg.word.len - pf[["p"]][["const"]]
if(identical(pf[["f"]], "simplified")){
slot(all.results, "ARI") <- list(flavour=pf[["f"]], index=ARI.grade, grade=NA)
} else {
slot(all.results, "ARI") <- list(flavour=pf[["f"]], index=NA, grade=ARI.grade)
}
} else{}
# recursive calls for alternative shortcuts
if("ARI.NRI" %in% index){
# setting hyphen to NULL here, because otherwise we could end up with pseudo hyphen objects crashing the function
slot(all.results, "ARI.NRI") <- slot(kRp.rdb.formulae(txt.file, hyphen=NULL, index=c("ARI"), parameters=list(ARI="NRI"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "ARI")
} else {}
if("ARI.simple" %in% index){
# setting hyphen to NULL here, because otherwise we could end up with pseudo hyphen objects crashing the function
slot(all.results, "ARI.simple") <- slot(kRp.rdb.formulae(txt.file, hyphen=NULL, index=c("ARI"), parameters=list(ARI="simple"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "ARI")
} else {}
## Bormuth Grade Level
if("Bormuth" %in% index | "DRP" %in% index ){
# check for word list and add it to params, otherwise skip this index
if(!"Bormuth" %in% names(word.lists) | is.null(word.lists[["Bormuth"]])){
warning("Bormuth: Missing word list, hence not calculated.", call.=FALSE)
} else {
bormuth.word.list <- read.word.list(word.lists[["Bormuth"]], encoding=fileEncoding)
pf <- check_parameters(
index="Bormuth",
given=parameters[["Bormuth"]]
)
fam.words.all.txt <- difficult.words(txt.words.only, bormuth.word.list)
fam.words.txt <- fam.words.all.txt[["num.listed"]]/num.words
fam.words.nol <- fam.words.all.txt[["words.not.listed"]]
bmc <- pf[["p"]][["meanc"]][["const"]] - (pf[["p"]][["meanc"]][["awl"]] * avg.word.len) + (pf[["p"]][["meanc"]][["afw"]] * fam.words.txt^3) -
(pf[["p"]][["meanc"]][["asl1"]] * avg.sntc.len) + (pf[["p"]][["meanc"]][["asl2"]] * avg.sntc.len^2) - (pf[["p"]][["meanc"]][["asl3"]] * avg.sntc.len^3)
cloze.crit <- pf[["p"]][["clz"]] / 100
# here comes the rather long formula...
bm.grade <- pf[["p"]][["grade"]][["const"]] + (pf[["p"]][["grade"]][["m1"]] * bmc) - (pf[["p"]][["grade"]][["m2"]] * bmc^2) + (pf[["p"]][["grade"]][["m3"]] * bmc^3) +
(pf[["p"]][["grade"]][["c1"]] * cloze.crit) - (pf[["p"]][["grade"]][["c2"]] * cloze.crit^2) - (pf[["p"]][["grade"]][["c3"]] * cloze.crit^3) -
(pf[["p"]][["grade"]][["mc1"]] * bmc * cloze.crit) + (pf[["p"]][["grade"]][["mc2"]] * (bmc * cloze.crit)^2) - (pf[["p"]][["grade"]][["mc3"]] * (bmc * cloze.crit)^3)
# preserve hard words in the desc slot
if(isTRUE(analyze.text)){
slot(all.results, "desc")[["Bormuth.NOL"]] <- length(fam.words.nol)
} else {
slot(all.results, "desc")[["Bormuth.NOL"]] <- bormuth.word.list
}
slot(all.results, "Bormuth") <- list(flavour=pf[["f"]], word.list=bormuth.word.list, not.on.list=fam.words.nol, pct.fam=fam.words.txt * 100, mean.cloze=bmc * 100, grade=bm.grade)
}
} else{}
## Coleman Formulas
if("Coleman" %in% index){
# this formula needs proper POS tags; skip if missing
if(only.kRp.tags){
# this text was just tagged with tokenize() and misses important tags
warning("Coleman: POS tags are not elaborate enough, can't count pronouns and prepositions. Formulae skipped.", call.=FALSE)
} else {
pf <- check_parameters(
index="Coleman",
given=parameters[["Coleman"]]
)
coleman.words <- describe(hyphen)[["syll.distrib"]]["cum.sum", pf[["p"]][["syll"]]] * 100 / num.words
coleman.sentc <- sntc.per100
coleman.pronoun <- num.pronouns * 100 / num.words
coleman.prepos <- num.prepositions * 100 / num.words
Coleman1 <- (pf[["p"]][["clz1"]][["word"]] * coleman.words) - pf[["p"]][["clz1"]][["const"]]
Coleman2 <- (pf[["p"]][["clz2"]][["word"]] * coleman.words) + (pf[["p"]][["clz2"]][["sntc"]] * coleman.sentc) - pf[["p"]][["clz2"]][["const"]]
Coleman3 <- (pf[["p"]][["clz3"]][["word"]] * coleman.words) + (pf[["p"]][["clz3"]][["sntc"]] * coleman.sentc) + (pf[["p"]][["clz3"]][["pron"]] * coleman.pronoun) - pf[["p"]][["clz3"]][["const"]]
Coleman4 <- (pf[["p"]][["clz4"]][["word"]] * coleman.words) + (pf[["p"]][["clz4"]][["sntc"]] * coleman.sentc) + (pf[["p"]][["clz4"]][["pron"]] * coleman.pronoun) - (pf[["p"]][["clz4"]][["prep"]] * coleman.prepos) - pf[["p"]][["clz4"]][["const"]]
slot(all.results, "Coleman") <- list(flavour=pf[["f"]], num.pron=coleman.pronoun, num.prep=coleman.prepos, C1=Coleman1, C2=Coleman2, C3=Coleman3, C4=Coleman4)
}
} else {}
## Coleman-Liau
if("Coleman.Liau" %in% index){
pf <- check_parameters(
index="Coleman.Liau",
given=parameters[["Coleman.Liau"]]
)
co.li.ECP <- pf[["p"]][["ecp"]][["const"]] - (pf[["p"]][["ecp"]][["char"]] * lett.per100) + (pf[["p"]][["ecp"]][["sntc"]] * sntc.per100)
co.li.grade <- (pf[["p"]][["grade"]][["ecp"]] * (co.li.ECP / 100)) + pf[["p"]][["grade"]][["const"]]
co.li.short <- pf[["p"]][["short"]][["awl"]] * avg.word.len - pf[["p"]][["short"]][["spw"]] * sntc.per.word - pf[["p"]][["short"]][["const"]]
slot(all.results, "Coleman.Liau") <- list(flavour=pf[["f"]], ECP=co.li.ECP, grade=co.li.grade, short=co.li.short)
} else{}
## Dale-Chall
## TODO: remove names and numbers, they're not counted as unfamiliar!
if("Dale.Chall" %in% index){
# check for word list and add it to params, otherwise skip this index
if(!"Dale.Chall" %in% names(word.lists) | is.null(word.lists[["Dale.Chall"]])){
warning("Dale-Chall: Missing word list, hence not calculated.", call.=FALSE)
} else {
pf <- check_parameters(
index="Dale.Chall",
given=parameters[["Dale.Chall"]],
flav_names=list(
PSK="Powers-Sumner-Kearl",
old="Dale-Chall (1948)",
default="New Dale-Chall (1995)"
)
)
dale.chall.word.list <- read.word.list(word.lists[["Dale.Chall"]], encoding=fileEncoding)
diff.words.all.txt <- difficult.words(txt.words.only, dale.chall.word.list)
diff.words.txt <- diff.words.all.txt[["pct.not.listed"]]
diff.words.nol <- diff.words.all.txt[["words.not.listed"]]
if(identical(pf[["f"]], "Powers-Sumner-Kearl")){
dale.chall.raw <- pf[["p"]][["const"]] + (pf[["p"]][["asl"]] * avg.sntc.len) + (pf[["p"]][["dword"]] * diff.words.txt)
dale.chall.grade <- get.grade.level(dale.chall.raw, measure="Dale.Chall.PSK")
} else if(identical(pf[["f"]], "Dale-Chall (1948)")){
dale.chall.raw <- (pf[["p"]][["dword"]] * diff.words.txt) + (pf[["p"]][["asl"]] * avg.sntc.len) + pf[["p"]][["const"]]
dale.chall.grade <- get.grade.level(dale.chall.raw, measure="Dale.Chall")
} else {
dale.chall.raw <- pf[["p"]][["const"]] - (pf[["p"]][["dword"]] * diff.words.txt) - (pf[["p"]][["asl"]] * avg.sntc.len)
dale.chall.grade <- get.grade.level(dale.chall.raw, measure="Dale.Chall.new")
}
# preserve hard words in the desc slot
if(isTRUE(analyze.text)){
slot(all.results, "desc")[["Dale.Chall.NOL"]] <- length(diff.words.nol)
} else {
slot(all.results, "desc")[["Dale.Chall.NOL"]] <- dale.chall.word.list
}
slot(all.results, "Dale.Chall") <- list(flavour=pf[["f"]], word.list=dale.chall.word.list, not.on.list=diff.words.nol, pct=diff.words.txt, raw=dale.chall.raw,
grade=dale.chall.grade[["grade"]], grade.min=dale.chall.grade[["grade.min"]],
age=dale.chall.grade[["age"]], age.min=dale.chall.grade[["age.min"]])
}
} else{}
# recursive calls for alternative shortcuts
if("Dale.Chall.PSK" %in% index){
slot(all.results, "Dale.Chall.PSK") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Dale.Chall"), parameters=list(Dale.Chall="PSK"),
word.lists=list(Dale.Chall=word.lists[["Dale.Chall"]]),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Dale.Chall")
} else {}
if("Dale.Chall.old" %in% index){
slot(all.results, "Dale.Chall.old") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Dale.Chall"), parameters=list(Dale.Chall="old"),
word.lists=list(Dale.Chall=word.lists[["Dale.Chall"]]),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Dale.Chall")
} else {}
## Danielson-Bryan
# since after tokenization we cannot really count
# the blanks any more, we'll assume one blank between
# every two words, hence blanks = num.words - 1
if("Danielson.Bryan" %in% index){
pf <- check_parameters(
index="Danielson.Bryan",
given=parameters[["Danielson.Bryan"]]
)
db.num.blanks <- num.words - 1
db.avg.let.blnk <- num.all.chars / db.num.blanks
db.avg.let.sntc <- num.all.chars / num.sentences
DB1 <- (pf[["p"]][["db1"]][["cpb"]] * db.avg.let.blnk) + (pf[["p"]][["db1"]][["cps"]] * db.avg.let.sntc) - pf[["p"]][["db1"]][["const"]]
DB2 <- pf[["p"]][["db2"]][["const"]] - (pf[["p"]][["db2"]][["cpb"]] * db.avg.let.blnk) - (pf[["p"]][["db2"]][["cps"]] * db.avg.let.sntc)
DB2.grade <- get.grade.level(DB2, measure="Danielson.Bryan")
slot(all.results, "Danielson.Bryan") <- list(flavour=pf[["f"]], avg.blank=db.avg.let.blnk, avg.sentc=db.avg.let.sntc,
DB1=DB1, DB2=DB2, DB2.grade=DB2.grade[["grade"]], DB2.grade.min=DB2.grade[["grade.min"]])
} else {}
## Degrees of Reading Power (DRP)
if("DRP" %in% index){
# check for word list and add it to params, otherwise skip this index
if(identical(slot(all.results, "Bormuth"), list(NA))){
warning("DRP: Missing Bormuth Mean Cloze, hence not calculated.", call.=FALSE)
} else {
# re-use the Bormuth score that we just calculated
drp.final <- (1 - slot(all.results, "Bormuth")[["mean.cloze"]]) * 100
slot(all.results, "DRP") <- list(DRP=drp.final)
}
} else {}
## Dickes-Steiwer
# only "shortcut formula" due to missing information from POS tagging
# if more elaborate info on pronouns was available,
# the "computer formula" could easily be added
if("Dickes.Steiwer" %in% index){
pf <- check_parameters(
index="Dickes.Steiwer",
given=parameters[["Dickes.Steiwer"]]
)
Dickes.Steiwer.score <- pf[["p"]][["const"]] - (log(avg.word.len + 1) * pf[["p"]][["awl"]]) - (log(avg.sntc.len + 1) * pf[["p"]][["asl"]]) - (num.TTR * pf[["p"]][["ttr"]])
slot(all.results, "Dickes.Steiwer") <- list(flavour=pf[["f"]], TTR=num.TTR, Dickes.Steiwer=Dickes.Steiwer.score)
} else {}
## Easy Listening Formula
if("ELF" %in% index){
pf <- check_parameters(
index="ELF",
given=parameters[["ELF"]]
)
ELF.exsyls <- long.words(pf[["p"]][["syll"]] - 1, hyphen=hyphen)
ELF.score <- ELF.exsyls / num.sentences
slot(all.results, "ELF") <- list(flavour=pf[["f"]], num.exsyls=ELF.exsyls, ELF=ELF.score)
} else {}
## "Fasse-Dich-Kurz-Index" (FDK-Index, Rolf W. Schirm)
# 1. Zaehlen Sie die Saetze (ohne Ueberschrift, Gruss oder Anrede)
# 2. Zaehlen Sie die Woerter, die mehr als zwei Silben haben (ohne Namen und Fachwoerter)
# 3. Errechnen Sie den "Fasse-Dich-Kurz-Index (FDK-Index)" nach der Formel:
#
# FDK = Anzahl der Woerter mit mehr als 2 Silben * 10 / Anzah1 der Saetze
#
# FDK bis 10: Sehr knapper Stil (Telegramm, Fernscheiben)
# FDK 11 - 25: Zeitgemaesser Stil (kurz, knapp, praezise)
# FDK 26 - 50: Weitschweifiger Stil (haeufig im geschaeftlichen Schriftverkehr)
# FDK ueber 50: Schwuelstiger, unklarer Stil
## Farr-Jenkins-Paterson
# simplified Flesch RE
# monsy: percentage of monosyllabic words
# asl: number of sentenvces
if("Farr.Jenkins.Paterson" %in% index){
pf <- check_parameters(
index="Farr.Jenkins.Paterson",
given=parameters[["Farr.Jenkins.Paterson"]],
flav_names=list(PSK="Powers-Sumner-Kearl")
)
fjp.RE <- pf[["p"]][["const"]] - (pf[["p"]][["asl"]] * avg.sntc.len) + (pf[["p"]][["monsy"]] * pct.monosyll)
fjp.grade <- get.grade.level(fjp.RE, measure="Flesch")
slot(all.results, "Farr.Jenkins.Paterson") <- list(flavour=pf[["f"]], FJP=fjp.RE, grade=fjp.grade[["grade"]], grade.min=fjp.grade[["grade.min"]])
} else {}
# recursive calls for alternative shortcuts
if("Farr.Jenkins.Paterson.PSK" %in% index){
slot(all.results, "Farr.Jenkins.Paterson.PSK") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Farr.Jenkins.Paterson"), parameters=list(Farr.Jenkins.Paterson="PSK"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Farr.Jenkins.Paterson")
} else {}
## Flesch Reading Ease
if("Flesch" %in% index){
pf <- check_parameters(
index="Flesch",
given=parameters[["Flesch"]],
flav_names=list(
default="en (Flesch)",
en="en (Flesch)",
de="de (Amstad)",
es="es (Fernandez-Huerta)",
"es-s"="es (Szigriszt)",
nl="nl (Douma)",
"nl-b"="nl (Brouwer)",
fr="fr (Kandel-Moles)",
PSK="Powers-Sumner-Kearl"
)
)
Flesch.RE <- pf[["p"]][["const"]] - (pf[["p"]][["asl"]] * avg.sntc.len) - (pf[["p"]][["asw"]] * avg.syll.word)
if(identical(pf[["f"]], "Powers-Sumner-Kearl")){
slot(all.results, "Flesch") <- list(flavour=pf[["f"]], RE=NA, grade=Flesch.RE, grade.min=Flesch.RE, age=(Flesch.RE + 5))
} else {
Flesch.grade <- get.grade.level(Flesch.RE, measure="Flesch")
slot(all.results, "Flesch") <- list(flavour=pf[["f"]], RE=Flesch.RE, grade=Flesch.grade[["grade"]], grade.min=Flesch.grade[["grade.min"]], age=NA)
}
} else {}
# recursive calls for alternative shortcuts
if("Flesch.PSK" %in% index){
slot(all.results, "Flesch.PSK") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Flesch"), parameters=list(Flesch="PSK"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Flesch")
} else {}
if("Flesch.de" %in% index){
slot(all.results, "Flesch.de") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Flesch"), parameters=list(Flesch="de"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Flesch")
} else {}
if("Flesch.es" %in% index){
slot(all.results, "Flesch.es") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Flesch"), parameters=list(Flesch="es"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Flesch")
} else {}
if("Flesch.Szigriszt" %in% index){
# Flesch-Szigriszt (Indice de Legibilidad de Flesch-Szigriszt (IFSZ))
# see http://www.legibilidad.com/home/acercade.html
slot(all.results, "Flesch.Szigriszt") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Flesch"), parameters=list(Flesch="es-s"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Flesch")
} else {}
if("Flesch.fr" %in% index){
slot(all.results, "Flesch.fr") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Flesch"), parameters=list(Flesch="fr"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Flesch")
} else {}
if("Flesch.nl" %in% index){
slot(all.results, "Flesch.nl") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Flesch"), parameters=list(Flesch="nl"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Flesch")
} else {}
if("Flesch.Brouwer" %in% index){
slot(all.results, "Flesch.Brouwer") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Flesch"), parameters=list(Flesch="nl-b"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Flesch")
} else {}
## Flesch-Kincaid Grade Level
if("Flesch.Kincaid" %in% index){
pf <- check_parameters(
index="Flesch.Kincaid",
given=parameters[["Flesch.Kincaid"]]
)
Flesch.Kincaid.GL <- (pf[["p"]][["asl"]] * avg.sntc.len) + (pf[["p"]][["asw"]] * avg.syll.word) - pf[["p"]][["const"]]
slot(all.results, "Flesch.Kincaid") <- list(flavour=pf[["f"]], grade=Flesch.Kincaid.GL, age=Flesch.Kincaid.GL+5)
} else {}
## FORCAST
if("FORCAST" %in% index){
pf <- check_parameters(
index="FORCAST",
given=parameters[["FORCAST"]],
flav_names=list(RGL="precise reading grade level")
)
FORCAST.monosyll <- describe(hyphen)[["syll.distrib"]]["cum.sum", pf[["p"]][["syll"]]] * 150 / num.words
FORCAST.grade <- pf[["p"]][["const"]] - (FORCAST.monosyll * pf[["p"]][["mult"]])
FORCAST.age <- FORCAST.grade + 5
slot(all.results, "FORCAST") <- list(flavour=pf[["f"]], grade=FORCAST.grade, age=FORCAST.age)
} else {}
# recursive calls for alternative shortcuts
if("FORCAST.RGL" %in% index){
slot(all.results, "FORCAST.RGL") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("FORCAST"), parameters=list(FORCAST="RGL"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "FORCAST")
} else {}
## Fucks Stilcharakteristik
if("Fucks" %in% index){
Fucks.raw <- avg.syll.word * avg.sntc.len
Fucks.grade <- sqrt(Fucks.raw)
slot(all.results, "Fucks") <- list(Fucks=Fucks.raw, grade=Fucks.grade)
} else {}
## Gunning FOG Index
if("FOG" %in% index){
pf <- check_parameters(
index="FOG",
given=parameters[["FOG"]],
flav_names=list(
PSK="Powers-Sumner-Kearl",
NRI="New FOG (NRI)"
)
)
FOG.hyphen <- hyphen
# a list for dropped tokens
FOG.dropped <- list(names=NA, combi=NA, verbs=NA)
if(isTRUE(analyze.text)){
# exclude certain words
# proper nouns/names will completely be omitted
FOG.names <- which(tagged.words.only[["wclass"]] == "name")
# check for verbs ending in -es, -ed, or -ing (or anything else set in pf[["p"]][["suffix"]])
# these endings must not be counted as syllables
FOG.verbs <- which(tagged.words.only[["wclass"]] == "verb")
FOG.verb.suffix <- paste0("(", paste(pf[["p"]][["suffix"]], collapse="|"), ")$")
FOG.verbs <- FOG.verbs[grepl(FOG.verb.suffix, tagged.words.only[FOG.verbs,"token"])]
# count one syllable less for these
if(length(FOG.verbs) > 0){
FOG.dropped[["verbs"]] <- tagged.words.only[FOG.verbs[hyphenText(FOG.hyphen)[FOG.verbs,"syll"] == pf[["p"]][["syll"]]],]
hyphenText(FOG.hyphen)[FOG.verbs,"syll"] <- hyphenText(FOG.hyphen)[FOG.verbs,"syll"] - 1
} else {}
# syllables of combined words must be counted separately
# \p{Pd} matches dashes, indicating hyphenated/combined words
FOG.combi <- which(grepl("\\p{Pd}", tagged.words.only[["token"]], perl=TRUE))
# next step: split into seperate parts, count syllables for each and
# drop all if none of the parts is long enough on its own
FOG.combi.dopped <- c()
for(combi in FOG.combi){
combi.to.check <- unlist(strsplit(tagged.words.only[combi, "token"], "\\p{Pd}", perl=TRUE))
# quite liberal check here, just to ensure that we don't run into empty strings, e.g. when a single dash was tagged as a word
if(all(any(nchar(combi.to.check, type="width") >= pf[["p"]][["syll"]]), all(nchar(combi.to.check, type="width") >= 1))){
## TODO: should this be cached or not?
FOG.this.combi <- hyphen(combi.to.check, hyph.pattern=lang, quiet=TRUE)
if(all(hyphenText(FOG.this.combi)[["syll"]] < pf[["p"]][["syll"]])){
# don't count this as a hard word
FOG.combi.dopped <- combi
} else {}
} else {
FOG.combi.dopped <- combi
}
}
FOG.num.syll <- hyphenText(FOG.hyphen)$syll
FOG.all.dropped <- c()
# disarm the found names
if(length(FOG.names) > 0){
FOG.dropped[["names"]] <- tagged.words.only[FOG.names,]
FOG.all.dropped <- c(FOG.all.dropped, FOG.names)
} else {}
if(length(FOG.combi.dopped) > 0){
FOG.dropped[["combi"]] <- tagged.words.only[FOG.combi.dopped,]
FOG.all.dropped <- c(FOG.all.dropped, FOG.combi.dopped)
} else {}
if(length(FOG.all.dropped) > 0){
FOG.num.syll[FOG.all.dropped] <- 0
} else {}
FOG.hard.words.absolute <- sum(FOG.num.syll > 2, na.rm=TRUE)
} else {
FOG.hard.words.absolute <- txt.features$FOG.hard.words
}
FOG.hard.words <- FOG.hard.words.absolute * 100 / num.words
if(identical(pf[["f"]], "Powers-Sumner-Kearl")){
Gunning.FOG <- pf[["p"]][["const"]] + (pf[["p"]][["asl"]] * avg.sntc.len) + (pf[["p"]][["hword"]] * FOG.hard.words)
} else if(identical(pf[["f"]], "New FOG (NRI)")){
FOG.easy.words <- (num.words - FOG.hard.words) * 100 / num.words
Gunning.FOG <- (((FOG.easy.words + (pf[["p"]][["hword"]] * FOG.hard.words)) / sntc.per100) - pf[["p"]][["const"]]) / pf[["p"]][["div"]]
} else {
if(isTRUE(analyze.text)){
FOG.hard.words <- sum(FOG.num.syll > (pf[["p"]][["syll"]] - 1), na.rm=TRUE) * 100 / num.words
} else {}
Gunning.FOG <- (avg.sntc.len + FOG.hard.words) * pf[["p"]][["const"]]
}
# preserve hard words in the desc slot
slot(all.results, "desc")[["FOG.hard.words"]] <- FOG.hard.words.absolute
slot(all.results, "FOG") <- list(flavour=pf[["f"]], dropped=FOG.dropped, FOG.hard.words=FOG.hard.words.absolute, FOG=Gunning.FOG)
} else {}
# recursive calls for alternative shortcuts
if("FOG.PSK" %in% index){
slot(all.results, "FOG.PSK") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("FOG"), parameters=list(FOG="PSK"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "FOG")
} else {}
if("FOG.NRI" %in% index){
slot(all.results, "FOG.NRI") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("FOG"), parameters=list(FOG="NRI"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "FOG")
} else {}
## Gutierrez
# see https://legible.es/blog/comprensibilidad-gutierrez-de-polini/
if("Gutierrez" %in% index){
pf <- check_parameters(
index="Gutierrez",
given=parameters[["Gutierrez"]]
)
Gutierrez.C <- pf[["p"]][["const"]] - (pf[["p"]][["awl"]] * avg.word.len) - (pf[["p"]][["asl"]] * avg.sntc.len)
slot(all.results, "Gutierrez") <- list(flavour=pf[["f"]], Gutierrez=Gutierrez.C)
} else {}
## Harris-Jacobson
# V1: percent unfamiliar words ("short list", grades 1 + 2)
# V2: asl
# V3: percent of long words (6+ letters)
# correction grade table see harris & jacobson (1974), p. 9
if("Harris.Jacobson" %in% index){
# check for word list and add it to params, otherwise skip four formulae of this index
if(!"Harris.Jacobson" %in% names(word.lists) | is.null(word.lists[["Harris.Jacobson"]])){
warning("Harris.Jacobson: Missing word list, hence not calculated.", call.=FALSE)
} else {
pf <- check_parameters(
index="Harris.Jacobson",
given=parameters[["Harris.Jacobson"]]
)
hj.word.list <- read.word.list(word.lists[["Harris.Jacobson"]], encoding=fileEncoding)
hj.diff.words.all.txt <- difficult.words(txt.words.only, hj.word.list)
hj.diff.words.txt <- hj.diff.words.all.txt[["pct.not.listed"]]
hj.diff.words.nol <- hj.diff.words.all.txt[["words.not.listed"]]
# percentage of long words
hj.long.words <- (long.words(pf[["p"]][["char"]], txt.desc=txt.desc) * 100) / num.words
# HJ1 <- 0.094 * V1 + 0.168 * V2 + 0.502
Harris.Jacobson1 <- (pf[["p"]][["hj1"]][["dword"]] * hj.diff.words.txt) + (pf[["p"]][["hj1"]][["asl"]] * avg.sntc.len) + pf[["p"]][["hj1"]][["const"]]
# HJ2 <- 0.140 * V1 + 0.153 * V2 + 0.560
Harris.Jacobson2 <- (pf[["p"]][["hj2"]][["dword"]] * hj.diff.words.txt) + (pf[["p"]][["hj2"]][["asl"]] * avg.sntc.len) + pf[["p"]][["hj2"]][["const"]]
# HJ3 <- 0.158 * V2 + 0.055 * V3 + 0.355
Harris.Jacobson3 <- (pf[["p"]][["hj3"]][["asl"]] * avg.sntc.len) + (pf[["p"]][["hj3"]][["lword"]] * hj.long.words) + pf[["p"]][["hj3"]][["const"]]
# HJ4 <- 0.070 * V1 + 0.125 * V2 + 0.037 * V3 + 0.497 ## best validity, formula of choice
Harris.Jacobson4 <- (pf[["p"]][["hj4"]][["dword"]] * hj.diff.words.txt) + (pf[["p"]][["hj4"]][["asl"]] * avg.sntc.len) + (pf[["p"]][["hj4"]][["lword"]] * hj.long.words) + pf[["p"]][["hj4"]][["const"]]
# HJ5 <- 0.118 * V1 + 0.134 * V2 + 0.032 * V3 + 0.424 ## > 3. grade
Harris.Jacobson5 <- (pf[["p"]][["hj5"]][["dword"]] * hj.diff.words.txt) + (pf[["p"]][["hj5"]][["asl"]] * avg.sntc.len) + (pf[["p"]][["hj5"]][["lword"]] * hj.long.words) + pf[["p"]][["hj5"]][["const"]]
# preserve hard words in the desc slot
if(isTRUE(analyze.text)){
slot(all.results, "desc")[["Harris.Jacobson.NOL"]] <- length(hj.diff.words.nol)
} else {
slot(all.results, "desc")[["Harris.Jacobson.NOL"]] <- hj.word.list
}
slot(all.results, "Harris.Jacobson") <- list(flavour=pf[["f"]],
word.list=hj.word.list, not.on.list=hj.diff.words.nol, pct=hj.diff.words.txt,
pct.long=hj.long.words,
HJ1=Harris.Jacobson1, HJ2=Harris.Jacobson2, HJ3=Harris.Jacobson3, HJ4=Harris.Jacobson4, HJ5=Harris.Jacobson5)
}
} else {}
## Laesbarhetsindex (LIX)
if("LIX" %in% index){
pf <- check_parameters(
index="LIX",
given=parameters[["LIX"]]
)
LIX.long.words <- long.words(pf[["p"]][["char"]], txt.desc=txt.desc)
LIX.idx <- (num.words / num.sentences) + ((LIX.long.words * pf[["p"]][["const"]]) / num.words)
LIX.rating <- get.grade.level(LIX.idx, measure="LIX")
LIX.grade <- get.grade.level(LIX.idx, measure="LIX.grade")
slot(all.results, "LIX") <- list(flavour=pf[["f"]], index=LIX.idx, rating=LIX.rating[["grade"]], grade=LIX.grade[["grade"]], grade.min=LIX.grade[["grade.min"]])
} else {}
## Linsear Write
if("Linsear.Write" %in% index){
pf <- check_parameters(
index="Linsear.Write",
given=parameters[["Linsear.Write"]]
)
hard.words <- (long.words(pf[["p"]][["long.syll"]] - 1, hyphen=hyphen) * 100) / num.words
easy.words <- 100 - ((long.words(pf[["p"]][["short.syll"]], hyphen=hyphen) * 100) / num.words)
Linsear.Write.raw <- (easy.words + (hard.words * 3)) / sntc.per100
if(Linsear.Write.raw > pf[["p"]][["thrs"]]){
Linsear.Write.RM <- Linsear.Write.raw / 2
} else {
Linsear.Write.RM <- (Linsear.Write.raw - 2) / 2
}
slot(all.results, "Linsear.Write") <- list(flavour=pf[["f"]], easy.words=easy.words, hard.words=hard.words, raw=Linsear.Write.raw, grade=Linsear.Write.RM)
} else {}
## Neue Wiener Sachtextformeln
if("nWS" %in% index){
pf <- check_parameters(
index="nWS",
given=parameters[["nWS"]]
)
wien.MS <- long.words(pf[["p"]][["ms.syll"]] - 1, hyphen=hyphen) / num.words
wien.SL <- avg.sntc.len
wien.IW <- long.words(pf[["p"]][["iw.char"]], txt.desc=txt.desc) / num.words
wien.ES <- describe(hyphen)[["syll.distrib"]]["cum.sum", pf[["p"]][["es.syll"]]] / num.words
nWS1 <- (pf[["p"]][["nws1"]][["ms"]] * wien.MS) + (pf[["p"]][["nws1"]][["sl"]] * wien.SL) + (pf[["p"]][["nws1"]][["iw"]] * wien.IW) - (pf[["p"]][["nws1"]][["es"]] * wien.ES) - pf[["p"]][["nws1"]][["const"]]
nWS2 <- (pf[["p"]][["nws2"]][["ms"]] * wien.MS) + (pf[["p"]][["nws2"]][["sl"]] * wien.SL) + (pf[["p"]][["nws2"]][["iw"]] * wien.IW) - pf[["p"]][["nws2"]][["const"]]
nWS3 <- (pf[["p"]][["nws3"]][["ms"]] * wien.MS) + (pf[["p"]][["nws3"]][["sl"]] * wien.SL) - pf[["p"]][["nws3"]][["const"]]
nWS4 <- (pf[["p"]][["nws4"]][["sl"]] * wien.SL) + (pf[["p"]][["nws4"]][["ms"]] * wien.MS) - pf[["p"]][["nws4"]][["const"]]
slot(all.results, "Wiener.STF") <- list(flavour=pf[["f"]], nWS1=nWS1, nWS2=nWS2, nWS3=nWS3, nWS4=nWS4)
} else {}
## RIX
if("RIX" %in% index){
pf <- check_parameters(
index="RIX",
given=parameters[["RIX"]]
)
RIX.long.words <- long.words(pf[["p"]][["char"]], txt.desc=txt.desc)
RIX.idx <- RIX.long.words / num.sentences
RIX.grade <- get.grade.level(RIX.idx, measure="RIX")
slot(all.results, "RIX") <- list(flavour=pf[["f"]], index=RIX.idx, grade=RIX.grade[["grade"]], grade.min=RIX.grade[["grade.min"]])
} else {}
## Simple Measure of Gobbledygook (SMOG)
if("SMOG" %in% index){
pf <- check_parameters(
index="SMOG",
given=parameters[["SMOG"]],
flav_names=list(
de="de (\"Qu\", Bamberger-Vanecek)",
C="Fomula C",
simple="simplified"
)
)
SMOG.grade <- pf[["p"]][["sqrt"]] * sqrt((pf[["p"]][["fact"]] * (long.words(pf[["p"]][["syll"]] - 1, hyphen=hyphen) / num.sentences)) + pf[["p"]][["sqrt.const"]]) + pf[["p"]][["const"]]
slot(all.results, "SMOG") <- list(flavour=pf[["f"]], grade=SMOG.grade, age=SMOG.grade + 5)
} else {}
# recursive calls for alternative shortcuts
if("SMOG.de" %in% index){
slot(all.results, "SMOG.de") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("SMOG"), parameters=list(SMOG="de"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "SMOG")
} else {}
if("SMOG.C" %in% index){
slot(all.results, "SMOG.C") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("SMOG"), parameters=list(SMOG="C"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "SMOG")
} else {}
if("SMOG.simple" %in% index){
slot(all.results, "SMOG.simple") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("SMOG"), parameters=list(SMOG="simple"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "SMOG")
} else {}
## Spache
if("Spache" %in% index){
# check for word list and add it to params, otherwise skip this index
if(!"Spache" %in% names(word.lists) | is.null(word.lists[["Spache"]])){
warning("Spache: Missing word list, hence not calculated.", call.=FALSE)
} else {
pf <- check_parameters(
index="Spache",
given=parameters[["Spache"]],
flav_names=list(
old="First formula (1953)"
)
)
spache.word.list <- read.word.list(word.lists[["Spache"]], encoding=fileEncoding)
diff.words.all.txt <- difficult.words(txt.words.only, spache.word.list, only.once=TRUE)
diff.words.txt <- diff.words.all.txt[["pct.not.listed"]]
diff.words.nol <- diff.words.all.txt[["words.not.listed"]]
Spache.grade <- pf[["p"]][["asl"]] * avg.sntc.len + pf[["p"]][["dword"]] * diff.words.txt + pf[["p"]][["const"]]
# preserve hard words in the desc slot
if(isTRUE(analyze.text)){
slot(all.results, "desc")[["Spache.NOL"]] <- length(diff.words.nol)
} else {
slot(all.results, "desc")[["Spache.NOL"]] <- spache.word.list
}
slot(all.results, "Spache") <- list(flavour=pf[["f"]], word.list=spache.word.list, not.on.list=diff.words.nol, pct=diff.words.txt, grade=Spache.grade)
}
} else {}
# recursive calls for alternative shortcuts
if("Spache.old" %in% index){
slot(all.results, "Spache.old") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Spache"), parameters=list(Spache="old"),
word.lists=list(Spache=word.lists[["Spache"]]),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Spache")
} else {}
## Strain Index
# https://strainindex.wordpress.com/2007/09/25/hello-world/
if("Strain" %in% index){
pf <- check_parameters(
index="Strain",
given=parameters[["Strain"]]
)
# syllables per 3 sentences divided by 10
Strain.id <- (num.syll / (num.sentences / pf[["p"]][["sent"]])) / pf[["p"]][["const"]]
slot(all.results, "Strain") <- list(flavour=pf[["f"]], index=Strain.id)
} else {}
## Traenkle-Bailer
# new Dickes-Steiwer for german texts
if("Traenkle.Bailer" %in% index){
# this formula needs proper POS tags; skip if missing
if(only.kRp.tags){
# this text was just tagged with tokenize() and misses important tags
warning("Traenkle.Bailer: POS tags are not elaborate enough, can't count prepositions and conjuctions. Formulae skipped.", call.=FALSE)
} else {
pf <- check_parameters(
index="Traenkle.Bailer",
given=parameters[["Traenkle.Bailer"]]
)
if(all(num.prepositions == 0, num.conjunctions == 0)){
warning("Traenkle.Bailer: No tokens tagged as preposition or conjunction found. Was the text properly tagged?", call.=FALSE)
} else {}
TrBa.pct.prep <- num.prepositions * 100 / num.words
TrBa.pct.conj <- num.conjunctions * 100 / num.words
# TB1 = 224.6814 - (79.8304 * ln(awl + 1)) - (12.24032 * ln(asl + 1)) - (1.292857 * %prepos)
Traenkle.Bailer1 <- pf[["p"]][["TB1"]][["const"]] - (log(avg.word.len + 1) * pf[["p"]][["TB1"]][["awl"]]) - (log(avg.sntc.len + 1) * pf[["p"]][["TB1"]][["asl"]]) - (TrBa.pct.prep * pf[["p"]][["TB1"]][["prep"]])
# TB2 = 234.1063 - (96.11069 * ln(awl + 1)) - (2.05444 * %prepos) - (1.02805 * %conjct)
Traenkle.Bailer2 <- pf[["p"]][["TB2"]][["const"]] - (log(avg.word.len + 1) * pf[["p"]][["TB2"]][["awl"]]) - (TrBa.pct.prep * pf[["p"]][["TB2"]][["prep"]]) - (TrBa.pct.conj * pf[["p"]][["TB2"]][["conj"]])
slot(all.results, "Traenkle.Bailer") <- list(flavour=pf[["f"]], pct.prep=TrBa.pct.prep, pct.conj=TrBa.pct.conj, TB1=Traenkle.Bailer1, TB2=Traenkle.Bailer2)
}
} else {}
## TRI -- Kuntzsch's Text-Redundanz-Index
# zit. nach Klein, H. (2002). Lesbarkeit und Verstaendlichkeit von Texten (Teil 2). Technische Dokumentation, 2002/07.
# http://www.doku.net/ausgabe/200207.htm (2011-05-09)
if("TRI" %in% index){
pf <- check_parameters(
index="TRI",
given=parameters[["TRI"]]
)
num.one.syll <- num.words - long.words(pf[["p"]][["syll"]], hyphen=hyphen)
TRI <- (num.one.syll * pf[["p"]][["word"]]) - (num.punct * pf[["p"]][["pnct"]]) - (num.foreign * pf[["p"]][["frgn"]]) - pf[["p"]][["const"]]
slot(all.results, "TRI") <- list(flavour=pf[["f"]], short=num.one.syll, punct=num.punct, foreign=num.foreign, TRI=TRI)
} else {}
## Tuldava
if("Tuldava" %in% index){
pf <- check_parameters(
index="Tuldava",
given=parameters[["Tuldava"]]
)
Tuldava.score <- ((pf[["p"]][["syll"]] * num.syll) / (pf[["p"]][["word1"]] * num.words)) * log(((pf[["p"]][["word2"]] * num.words) / (pf[["p"]][["sent"]] * num.sentences)))
# Tuldava.score <- avg.syll.word * log(avg.sntc.len)
slot(all.results, "Tuldava") <- list(flavour=pf[["f"]], Tuldava=Tuldava.score)
} else {}
## Wheeler-Smith
if("Wheeler.Smith" %in% index){
pf <- check_parameters(
index="Wheeler.Smith",
given=parameters[["Wheeler.Smith"]],
flav_names=list(
de="de (Bamberger & Vanecek)"
)
)
if(identical(pf[["f"]], "de (Bamberger & Vanecek)")){
grade.measure <- "Wheeler.Smith.de"
} else {
grade.measure <- "Wheeler.Smith"
}
Wheeler.Smith.long.words <- 10 * long.words(pf[["p"]][["syll"]] - 1, hyphen=hyphen) / num.words
Wheeler.Smith.score <- avg.sntc.len * Wheeler.Smith.long.words
Wheeler.Smith.grade <- get.grade.level(Wheeler.Smith.score, measure=grade.measure)
slot(all.results, "Wheeler.Smith") <- list(flavour=pf[["f"]], score=Wheeler.Smith.score,
grade=Wheeler.Smith.grade[["grade"]], grade.min=Wheeler.Smith.grade[["grade.min"]])
} else {}
# recursive calls for alternative shortcuts
if("Wheeler.Smith.de" %in% index){
slot(all.results, "Wheeler.Smith.de") <- slot(kRp.rdb.formulae(txt.file, hyphen=hyphen, index=c("Wheeler.Smith"), parameters=list(Wheeler.Smith="de"),
sentc.tag=sentc.tag,
nonword.class=nonword.class, nonword.tag=nonword.tag, analyze.text=analyze.text, txt.features=txt.features, quiet=TRUE, keep.input=FALSE), "Wheeler.Smith")
} else {}
# if("" %in% names(parameters)){
# pf <- check_parameters(
# index="",
# given=parameters[[""]],
# flav_names=list(
# =""
# )
# )
# slot(all.results, "<nm>") <- <frml>
# } else {}
## for the time being, give a warning until all implementations have been validated
needs.warning <- index %in% rownames(rdb_indices)[!rdb_indices[, "validated"]]
if(all(!isTRUE(quiet), any(needs.warning))){
warning(paste0("The implementations of these formulas are still subject to validation:\n ",
paste(index[needs.warning], collapse=", "),
"\n Use the results with caution, even if they seem plausible!",
"\n See readability(index=\"validation\") for more details."), call.=FALSE)
} else {}
return(all.results)
}
############################
## internal helper functions
## function read.word.list()
# this function tries to get the actual word vector out of a given
# word list, be it a vector, matrix, data.frame or file name
read.word.list <- function(word.list, encoding="UTF-8"){
if(all(length(word.list) == 1, is.numeric(word.list))){
# we got a number, seems to be no word list, but already the result
return(word.list)
} else {}
if(((is.data.frame(word.list) | is.matrix(word.list)) & isTRUE(ncol(word.list) == 1)) |
(is.vector(word.list) & length(word.list) > 1)){
local.word.list <- as.character(unlist(word.list))
} else if(check.file(word.list, mode="exist")){
wl.file.con <- file(word.list, open="r")
local.word.list <- as.character(unlist(readLines(wl.file.con, encoding=encoding)))
close(wl.file.con)
}
return(local.word.list)
} ## function read.word.list()
## function long.words()
long.words <- function(min.num, txt.desc=NULL, hyphen=NULL){
if(!is.null(txt.desc)){
# will return the number of words with at least min.num characters!
if(min.num %in% colnames(txt.desc[["lttr.distrib"]])){
# can be that column name and numer are different, e.g. no words with three letters
# therefore, we transform the number into character
result <- txt.desc[["lttr.distrib"]]["cum.inv", as.character(min.num)]
} else {
result <- 0
}
} else if(!is.null(hyphen)){
if(min.num %in% colnames(describe(hyphen)[["syll.distrib"]])){
result <- describe(hyphen)[["syll.distrib"]]["cum.inv", as.character(min.num)]
} else {
result <- 0
}
} else {
stop(simpleError("internal bug: long.words() called without data!"))
}
return(result)
} ## end function long.words()
## function difficult.words()
# this part calculates the percentage of words in the analyzed text
# that are not found in a given list of words. this list must be
# a data.frame/matrix with one column or a vector.
difficult.words <- function(words.only, word.list, only.once=FALSE){
# define the built-in word lists
# it's not enough to have the data file around
## this much doesn't work as of now, we can't ship any word list due to copyright restrictions
# the code is left intact though, should we get permission to include word lists in the future...
if(length(word.list) == 1){
if(is.numeric(word.list)){
# we got numbers, seems to be no word list, but almost already the result
if(!is.numeric(words.only)){
# ok, this is not a number, so i guess it's actual words, let's simply count them
words.only <- length(words.only)
} else {}
pct.not.listed <- word.list * 100 / words.only
result <- list(words.not.listed=word.list, num.not.listed=word.list, pct.not.listed=pct.not.listed,
num.listed=(words.only-word.list), pct.listed=(100-pct.not.listed))
return(result)
} else if(word.list %in% c(
# list internal lists here ...
"some.free.word.list")){
wlist.to.load <- parse(text=paste0("if(!exists(\"", word.list, "\", envir=.koRpus.env, inherits=FALSE)){
data(", word.list, ", package=\"koRpus\", envir=.koRpus.env)} else {}
local.word.list <- as.vector(unlist(get(\"", word.list, "\", envir=.koRpus.env)))"))
eval(wlist.to.load)
} else {
stop(simpleError(paste0("Invalid word list: \"", word.list, "\".\nPlease provide a valid file name, or a data.frame/matrix with one column, or a character vector!")))
}
} else if((is.vector(word.list)) & length(word.list) > 1){
local.word.list <- word.list
} else {
stop(simpleError(paste0("Invalid word list: \"", word.list, "\".\nPlease provide a valid file name, or a data.frame/matrix with one column, or a character vector!")))
}
# make sure it's a vector, and we don't care about cases
local.word.list <- tolower(local.word.list)
local.tokens <- tolower(words.only)
# Space counts unfamiliar words only once
if(isTRUE(only.once)){
words.not.listed <- unique(words.only[!local.tokens %in% local.word.list])
# number of tokens not on the list
num.not.listed <- sum(!unique(local.tokens) %in% local.word.list)
} else {
words.not.listed <- words.only[!local.tokens %in% local.word.list]
# number of tokens not on the list
num.not.listed <- sum(!local.tokens %in% local.word.list)
}
pct.not.listed <- num.not.listed * 100 / length(local.tokens)
result <- list(words.not.listed=words.not.listed, num.not.listed=num.not.listed, pct.not.listed=pct.not.listed,
num.listed=(length(local.tokens)-num.not.listed), pct.listed=(100-pct.not.listed))
return(result)
} ## end function difficult.words()
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.