# R/FixVerbs.R In PersianStemmer: Persian Stemmer for Text Analysis

```r
FixVerbs <-
function(texts, Context) {
  # Detect Persian verb constructions in `texts` and replace each one with its
  # stem followed by a digit tag.
  #
  # Args:
  #   texts:   a single string whose tokens are separated by spaces.
  #   Context: scalar logical. When TRUE, a participle (a word ending in "heh")
  #            is only rewritten if some other token of the text looks like a
  #            conjugation of the same stem; when FALSE it is always rewritten.
  #            Either way the auxiliary words that follow it are blanked.
  #
  # Returns:
  #   The text with recognised verbs replaced by tagged stems, auxiliaries
  #   removed and runs of spaces collapsed. An empty input is returned as is.
  #
  # Digit tags appended by this function:
  #   "0" stems found in steps 1-4 and 6-9 (collected as past roots)
  #   "1" stems found in steps 5 and 10 (collected as present roots)
  #   "2" stems found in step 11 (collected as both present and past roots)
  #   "3" tokens recognised afterwards from a collected root
  #   "4" inserted after a final "d" that step 11 keeps (giving "...d42")
  #   "5" tokens whose alef / ye spelling variants were unified
  #
  # `trim()` is not defined in this block; presumably a whitespace-trimming
  # helper provided elsewhere in the package.

  # ---- Prefix building blocks ----
  neg <- "\u0646"               # "n"    (negation)
  mi <- "\u0645\u06CC"          # "mi"   (progressive)
  nemi <- "\u0646\u0645\u06CC"  # "nemi" (negative progressive)

  # ---- Auxiliary word lists ----
  # Present Perfect Suffixes
  preper <- c("\u0627\u0645", "\u0627\u06CC", "\u0627\u0633\u062A",
              "\u0627\u06CC\u0645", "\u0627\u06CC\u062F", "\u0627\u0646\u062F")
  # Past Perfect Suffixes
  pasper <- c("\u0628\u0648\u062F\u0645", "\u0628\u0648\u062F\u06CC",
              "\u0628\u0648\u062F", "\u0628\u0648\u062F\u06CC\u0645",
              "\u0628\u0648\u062F\u06CC\u062F", "\u0628\u0648\u062F\u0646\u062F")
  # Past Subjunctive Suffixes
  passub <- c("\u0628\u0627\u0634\u0645", "\u0628\u0627\u0634\u06CC",
              "\u0628\u0627\u0634\u062F", "\u0628\u0627\u0634\u06CC\u0645",
              "\u0628\u0627\u0634\u06CC\u062F", "\u0628\u0627\u0634\u0646\u062F")
  # Supplement Suffixes
  sup <- c("\u0634\u062F\u0647", "\u0628\u0648\u062F\u0647")
  # Negative forms of the past perfect, past subjunctive and supplement words
  paslistneg <- c(paste0(nemi, pasper), paste0(neg, pasper),
                  paste0(nemi, passub), paste0(neg, passub),
                  paste0(nemi, sup), paste0(neg, sup))
  # Every auxiliary that may follow a participle (positive and negative)
  paslist <- c(preper, pasper, passub, sup,
               paste0(mi, pasper), paste0(mi, passub), paste0(mi, sup),
               paslistneg)

  # Future Prefixes
  futurepossin <- c("\u062E\u0648\u0627\u0647\u0645",
                    "\u062E\u0648\u0627\u0647\u06CC",
                    "\u062E\u0648\u0627\u0647\u062F")
  futureposplu <- c("\u062E\u0648\u0627\u0647\u06CC\u0645",
                    "\u062E\u0648\u0627\u0647\u06CC\u062F",
                    "\u062E\u0648\u0627\u0647\u0646\u062F")
  futurepos <- c(futurepossin, futureposplu)
  futureneg <- paste0(neg, futurepos)
  future <- c(futurepos, futureneg)
  # Progressive Future Prefixes, grouped by number and by polarity
  futureposprogsin <- paste0(mi, futurepossin)
  futureposprogplu <- paste0(mi, futureposplu)
  futureprogsin <- c(futureposprogsin, paste0(neg, futureposprogsin))
  futureprogplu <- c(futureposprogplu, paste0(neg, futureposprogplu))
  futurenegprog <- paste0(neg, c(futureposprogsin, futureposprogplu))
  futureprog <- c(futureprogsin, futureprogplu)

  # Simple Present Suffixes (singular: one letter, plural: two letters)
  pressinsuf <- c("\u0645", "\u06CC", "\u062F")
  presplusuf <- c("\u06CC\u0645", "\u06CC\u062F", "\u0646\u062F")

  # Passive auxiliaries: simple / progressive, present / past
  simprepasspos <- c("\u0634\u0648\u0645", "\u0634\u0648\u06CC",
                     "\u0634\u0648\u062F", "\u0634\u0648\u06CC\u0645",
                     "\u0634\u0648\u06CC\u062F", "\u0634\u0648\u0646\u062F")
  simpaspasspos <- c("\u0634\u062F\u0645", "\u0634\u062F\u06CC",
                     "\u0634\u062F", "\u0634\u062F\u06CC\u0645",
                     "\u0634\u062F\u06CC\u062F", "\u0634\u062F\u0646\u062F")
  simprepassneg <- paste0(neg, simprepasspos)
  simpaspassneg <- paste0(neg, simpaspasspos)
  proprepasspos <- paste0(mi, simprepasspos)
  propaspasspos <- paste0(mi, simpaspasspos)
  proprepassneg <- paste0(neg, proprepasspos)
  propaspassneg <- paste0(neg, propaspasspos)
  propass <- c(proprepasspos, proprepassneg, propaspasspos, propaspassneg)
  propassneg <- c(proprepassneg, propaspassneg)
  # All Passive Past and Present auxiliaries (Simple and Progressive)
  pass <- c(simprepasspos, simprepassneg, simpaspasspos, simpaspassneg,
            propass)
  passneg <- c(propassneg, simprepassneg, simpaspassneg)

  # Supplement words before Progressive Past and Present
  # (ex: "dashtam" miraftam, "daram" miravam)
  suppresinpro <- c("\u062F\u0627\u0631\u0645", "\u062F\u0627\u0631\u06CC",
                    "\u062F\u0627\u0631\u062F")
  suppreplupro <- c("\u062F\u0627\u0631\u06CC\u0645",
                    "\u062F\u0627\u0631\u06CC\u062F",
                    "\u062F\u0627\u0631\u0646\u062F")
  supprepro <- c(suppresinpro, suppreplupro)
  suppassinpro <- c("\u062F\u0627\u0634\u062A\u0645",
                    "\u062F\u0627\u0634\u062A\u06CC",
                    "\u062F\u0627\u0634\u062A")
  suppasplupro <- c("\u062F\u0627\u0634\u062A\u06CC\u0645",
                    "\u062F\u0627\u0634\u062A\u06CC\u062F",
                    "\u062F\u0627\u0634\u062A\u0646\u062F")
  suppaspro <- c(suppassinpro, suppasplupro)

  # This is a list of words that can be confused with verbs.
  NoStemVerb <- c("\u0647\u0645\u0647",              # hameh
                  "\u0639\u062F\u0647",              # edeh
                  "\u0646\u0627\u0645\u0647",        # nameh
                  "\u0645\u06CC\u0644\u0627\u062F\u06CC")  # miladi

  # ---- Regex fragments ----
  verb_prefix_rx <- "^(|\u0646\u0645\u06CC|\u0645\u06CC|\u0646)"
  participle_suffix_rx <-
    "(|\u0646|\u0645|\u06CC|\u06CC\u0645|\u06CC\u062F|\u0646\u062F)$"
  present_suffix_rx <-
    "(|\u0645|\u06CC|\u062F|\u06CC\u0645|\u06CC\u062F|\u0646\u062F)$"
  past_suffix_rx <-
    "(|\u0646|\u0645|\u06CC|\u062F|\u06CC\u0645|\u06CC\u062F|\u0646\u062F)$"
  tag_rx <- "(\u0030|\u0031|\u0032|\u0034\u0032|\u0033)$"

  # ---- Local helpers ----
  # TRUE when the first `k` characters of `chars` spell `target`.
  head_is <- function(chars, k, target) {
    paste(chars[seq_len(k)], collapse = "") == target
  }

  # TRUE for a word of at least three letters ending in "heh".
  is_participle <- function(chars) {
    length(chars) >= 3 && chars[length(chars)] == "\u0647"
  }

  # Turn a participle into its tagged stem ("...0", with "n" in front when
  # `negated`). Returns NULL when Context is on and no other token of the
  # current text matches prefix + stem + personal ending.
  participle_stem <- function(chars, negated, strip_prefix = TRUE) {
    chars[length(chars)] <- ""
    if (strip_prefix) {
      # "nemi" keeps its leading "n" so the negation stays on the stem.
      if (head_is(chars, 3, nemi)) chars[2:3] <- ""
      if (head_is(chars, 2, mi)) chars[1:2] <- ""
    }
    stem <- paste0(chars, collapse = "")
    if (Context) {
      probe <- paste0(verb_prefix_rx, stem, participle_suffix_rx)
      if (!any(grepl(probe, tokens))) return(NULL)
    }
    tagged <- paste0(stem, "\u0030")
    if (negated) paste0(neg, tagged) else tagged
  }

  # Strip "mi"/"nemi" and the last `drop` letters from a progressive verb,
  # putting "n" back in front when the verb started with it.
  progressive_stem <- function(chars, drop) {
    lead <- chars[1]
    if (head_is(chars, 3, nemi)) chars[1:3] <- ""
    if (head_is(chars, 2, mi)) chars[1:2] <- ""
    n <- length(chars)
    if (drop == 1L) chars[n] <- ""
    if (drop == 2L) chars[(n - 1):n] <- ""
    stem <- paste(chars, collapse = "")
    if (lead == neg) paste0(neg, stem) else stem
  }

  # Add the alef / ye spelling variants of every root.
  expand_roots <- function(roots) {
    unique(c(roots,
             gsub("^\u06CC\u0627", "\u0622", roots),
             gsub("^\u0622", "\u06CC\u0627", roots),
             gsub("^\u0622", "\u0627", roots),
             gsub("^\u0627", "\u0622", roots),
             # NOTE(review): the "!" is matched literally - confirm intent.
             gsub("^\u06CC\u0627!", "\u0627", roots),
             gsub("^\u0646\u0622", "\u0646\u06CC\u0627", roots),
             gsub("^\u0646\u0627", "\u0646\u06CC\u0627", roots)))
  }

  # Roots without a leading "n", plus doubly negated roots minus one "n".
  positive_roots <- function(roots) {
    unique(c(roots[!grepl("^\u0646", roots)],
             gsub("^\u0646", "", grep("^\u0646\u0646", roots, value = TRUE))))
  }

  # Replace every conjugation of `root` in `words` by the root tagged "3".
  # Roots starting with alef take an extra "ye" after the "be"/"n" prefix.
  mark_root_forms <- function(words, root, past) {
    glide <- if (grepl("^\u0627", root)) "\u06CC" else ""
    if (past) {
      pos_rx <- paste0("^(|", mi, ")")
      neg_rx <- paste0("^(", neg, glide, "|", nemi, ")")
      suffix_rx <- past_suffix_rx
    } else {
      pos_rx <- paste0("^(|\u0628", glide, ")")
      neg_rx <- paste0("^", neg, glide)
      suffix_rx <- present_suffix_rx
    }
    words <- gsub(paste0(pos_rx, root, suffix_rx),
                  paste0(root, "\u0033"), words)
    gsub(paste0(neg_rx, root, suffix_rx),
         paste0(neg, glide, root, "\u0033"), words)
  }

  # Rewrite the tagged form `from` and every conjugation of `to` as `to` + "5".
  unify_spelling <- function(words, from, to) {
    words <- gsub(paste0("^", from, tag_rx), paste0(to, "\u0035"), words)
    gsub(paste0(to, present_suffix_rx), paste0(to, "\u0035"), words)
  }

  # ---- Join detached "mi" / "nemi" prefixes to the following word ----
  tokens <- strsplit(texts, " ")[[1]]
  if (length(tokens) == 0L) {
    return(texts)
  }
  # The last token is skipped: there is no following word to attach it to.
  for (i in seq_len(length(tokens) - 1L)) {
    if (tokens[i] %in% c(nemi, mi)) {
      tokens[i + 1] <- paste0(tokens[i], tokens[i + 1])
      tokens[i] <- ""
    }
  }
  texts <- paste(tokens, collapse = " ")
  texts <- trim(gsub(" {2,}", " ", texts))
  tokens <- strsplit(texts, " ")[[1]]

  # ---- Detect and stem the verbs, looking at up to four tokens at once ----
  # Reading past the end of `tokens` yields NA, which matches no word list.
  for (i in seq_along(tokens)) {
    word1 <- tokens[i]
    word2 <- tokens[i + 1]
    word3 <- tokens[i + 2]
    word4 <- tokens[i + 3]
    word1str <- strsplit(word1, "")[[1]]
    word2str <- strsplit(word2, "")[[1]]
    stemmable1 <- !(word1 %in% NoStemVerb)

    # 1- Absolute Past Perfect - Passive
    # (ex.neveshteh shodeh budeh ast)
    if (stemmable1 && word4 %in% paslist && word3 %in% paslist &&
        word2 %in% paslist && is_participle(word1str)) {
      negated <- word2 %in% paslistneg || word3 %in% paslistneg
      stem <- participle_stem(word1str, negated)
      if (!is.null(stem)) tokens[i] <- stem
      tokens[(i + 1):(i + 3)] <- ""
      next
    }

    # 2- Past Perfect, Present Perfect and Subjunctive Past - Passive
    # (ex.neveshteh shodeh ast, neveshteh shodeh bud, neveshteh shodeh bashad)
    if (stemmable1 && word3 %in% paslist && word2 %in% paslist &&
        is_participle(word1str)) {
      negated <- word2 %in% paslistneg || word3 %in% paslistneg
      stem <- participle_stem(word1str, negated)
      if (!is.null(stem)) tokens[i] <- stem
      tokens[(i + 1):(i + 2)] <- ""
      next
    }

    # 3- Future - Passive
    # (eg. neveshteh khahad shod)
    if (stemmable1 && word3 %in% c("\u0634\u062F", "\u0646\u0634\u062F") &&
        word2 %in% future && is_participle(word1str)) {
      negated <- word2 %in% futureneg || word3 == "\u0646\u0634\u062F"
      stem <- participle_stem(word1str, negated, strip_prefix = FALSE)
      if (!is.null(stem)) tokens[i] <- stem
      tokens[(i + 1):(i + 2)] <- ""
      next
    }

    # 4- Progressive Past and Present with Supplement Prefixes - Passive
    # (eg. dasht neveshteh mishod, darad neveshteh mishavad)
    if (!(word2 %in% NoStemVerb) && word1 %in% c(supprepro, suppaspro) &&
        word3 %in% propass && is_participle(word2str)) {
      stem <- participle_stem(word2str, word3 %in% propassneg,
                              strip_prefix = FALSE)
      if (!is.null(stem)) tokens[i + 1] <- stem
      tokens[i] <- ""
      tokens[i + 2] <- ""
      next
    }

    # 5- Progressive Present with Supplement Prefixes - Active
    # (eg.darad minevisad)
    if (word1 %in% supprepro &&
        (head_is(word2str, 2, mi) || head_is(word2str, 3, nemi))) {
      # The personal ending has the same length as the supplement's ending.
      drop <- if (word1 %in% suppresinpro) 1L else 2L
      tokens[i] <- ""
      tokens[i + 1] <- paste0(progressive_stem(word2str, drop), "\u0031")
      next
    }

    # 6- Progressive Past with Supplement Prefixes - Active
    # (eg.dasht minevesht)
    if (word1 %in% suppaspro &&
        (head_is(word2str, 2, mi) || head_is(word2str, 3, nemi))) {
      # Bare "dasht" carries no personal ending, so nothing is dropped for it.
      drop <- if (word1 %in% suppasplupro) {
        2L
      } else if (word1 != "\u062F\u0627\u0634\u062A") {
        1L
      } else {
        0L
      }
      tokens[i] <- ""
      tokens[i + 1] <- paste0(progressive_stem(word2str, drop), "\u0030")
      next
    }

    # 7- Past Perfect, Present Perfect and Subjunctive Past - Active
    # (eg. neveshteh bud, neveshteh ast, neveshteh bashad)
    if (stemmable1 && word2 %in% paslist && is_participle(word1str)) {
      stem <- participle_stem(word1str, word2 %in% paslistneg)
      if (!is.null(stem)) tokens[i] <- stem
      tokens[i + 1] <- ""
      next
    }

    # 8- Simple and Progressive Past and Present - Passive
    # (eg. neveshteh mishavad, neveshteh mishod)
    if (stemmable1 && word2 %in% pass && is_participle(word1str)) {
      stem <- participle_stem(word1str, word2 %in% passneg)
      if (!is.null(stem)) tokens[i] <- stem
      tokens[i + 1] <- ""
      next
    }

    # 9- Simple Future - Active
    # (eg. khahad nevesht)
    if (word1 %in% future && length(word2str) >= 2) {
      stem <- paste(word2str, collapse = "")
      if (word1 %in% futureneg) stem <- paste0(neg, stem)
      tokens[i] <- paste0(stem, "\u0030")
      tokens[i + 1] <- ""
      next
    }

    # 10- Progressive Future - Active
    # (eg. mikhahad benevisad)
    if (word1 %in% futureprog && length(word2str) >= 3 &&
        word2str[1] %in% c("\u0628", neg)) {
      n2 <- length(word2str)
      has_ending <- word2str[n2] %in% pressinsuf ||
        paste(word2str[(n2 - 1):n2], collapse = "") %in% presplusuf
      if (has_ending) {
        if (word1 %in% futureprogplu) word2str[(n2 - 1):n2] <- ""
        if (word1 %in% futureprogsin) word2str[n2] <- ""
        lead <- word2str[1]
        word2str[1] <- ""
        stem <- paste(word2str, collapse = "")
        if (word1 %in% futurenegprog || lead == neg) {
          stem <- paste0(neg, stem)
        }
        tokens[i] <- ""
        tokens[i + 1] <- paste0(stem, "\u0031")
        next
      }
    }

    # 11- Progressive Present and Past - Active
    # (eg. minevesht, minevisad)
    if (stemmable1 && length(word1str) >= 4 &&
        (head_is(word1str, 2, mi) || head_is(word1str, 3, nemi))) {
      n1 <- length(word1str)
      has_ending <- word1str[n1] %in% pressinsuf ||
        paste(word1str[(n1 - 1):n1], collapse = "") %in% presplusuf
      if (has_ending) {
        lead <- word1str[1]
        if (head_is(word1str, 2, mi)) {
          word1str <- word1str[3:length(word1str)]
        }
        if (head_is(word1str, 3, nemi)) {
          word1str <- word1str[4:length(word1str)]
        }
        n1 <- length(word1str)
        if (n1 > 2 &&
            paste(word1str[(n1 - 1):n1], collapse = "") %in% presplusuf) {
          # A plural ending right after alef or vav is left in place.
          if (!(word1str[n1 - 2] %in% c("\u0627", "\u0648")) && n1 > 3) {
            word1str[(n1 - 1):n1] <- ""
          }
        }
        if (word1str[n1] %in% pressinsuf && word1str[n1] != "\u062F") {
          word1str[n1] <- ""
        }
        # A final "d" is kept and marked with "4".
        if (word1str[n1] == "\u062F") word1str[n1] <- "\u062F\u0034"
        stem <- paste(word1str, collapse = "")
        if (lead == neg) stem <- paste0(neg, stem)
        tokens[i] <- paste0(stem, "\u0032")
        next
      }
    }
  }

  # ---- Use the roots found above to stem the remaining verbs ----
  # present roots
  preroot <- grep("\u0031$|\u0032$", tokens, value = TRUE)
  preroot <- expand_roots(
    unique(gsub("\u0031$|\u0032$|\u0034\u0032$", "", preroot))
  )
  prerootpos <- positive_roots(preroot)
  prerootneg <- preroot[grepl("^\u0646", preroot)]
  # past roots
  pasroot <- grep("\u0030$|\u0032$", tokens, value = TRUE)
  # NOTE(review): the pattern starts with an empty alternative, kept as is -
  # confirm it is intended.
  pasroot <- expand_roots(
    unique(gsub("|\u0030$|\u0032$|\u0034\u0032$", "", pasroot))
  )
  pasrootpos <- positive_roots(pasroot)
  pasrootneg <- pasroot[grepl("^\u0646", pasroot)]

  # Positive roots are applied before negative ones, present before past.
  for (root in c(prerootpos, prerootneg)) {
    tokens <- mark_root_forms(tokens, root, past = FALSE)
  }
  for (root in c(pasrootpos, pasrootneg)) {
    tokens <- mark_root_forms(tokens, root, past = TRUE)
  }

  # ---- Unify verbs starting with "alef", "ye", and "ye and alef" ----
  yaA <- grep("\u0030$|\u0031$|\u0032$|\u0033$", tokens, value = TRUE)
  yaA <- gsub("\u0030$|\u0031$|\u0032$|\u0034\u0032$|\u0033$", "", yaA)
  y <- unique(grep("^\u06CC\u0627|^\u06CC", yaA, value = TRUE))
  a <- unique(grep("^\u0627", yaA, value = TRUE))
  A <- unique(grep("^\u0622", yaA, value = TRUE))
  for (form in y) {
    variant <- gsub("^\u06CC\u0627", "\u0622", form)
    if (variant %in% A) {
      tokens <- unify_spelling(tokens, form, variant)
      next
    }
    variant <- gsub("^\u06CC", "\u0627", form)
    if (variant %in% a) {
      tokens <- unify_spelling(tokens, form, variant)
    }
  }
  for (form in a) {
    variant <- gsub("^\u0627", "\u0622", form)
    if (variant %in% A) {
      tokens <- unify_spelling(tokens, form, variant)
      next
    }
    variant <- gsub("^\u0627", "\u06CC\u0627", form)
    if (variant %in% y) {
      tokens <- unify_spelling(tokens, form, variant)
    }
  }

  tokens <- gsub("^\u0646\u0622", "\u0646\u06CC\u0627", tokens)
  texts <- paste(tokens, collapse = " ")
  trim(gsub(" {2,}", " ", texts))
}
```

## Try the PersianStemmer package in your browser

Any scripts or data that you put into this service are public.

PersianStemmer documentation built on June 28, 2019, 5:03 p.m.