R/stemmer.R

Defines functions removeSuffixes removePrefixes doStemming2 doStemming rYah rTamarbutta rHeh rYahTamarbutta rYahHeh rYahNun rWawNun rAlifTa rAlifNun rHaAlif rWaw rLamLam rFaAlifLam rKafAlifLam rBaAlifLam rWawAlifLam rAlifLam reverse.transliterate transliterate removeStopWords cleanLatinChars cleanChars fixAlifs clean removeNewlineChars removeDiacritics removePunctuation removeNumbers removeFarsiNumbers removeArabicNumbers removeEnglishNumbers stem stemArabic

Documented in cleanChars cleanLatinChars doStemming fixAlifs removeArabicNumbers removeDiacritics removeEnglishNumbers removeFarsiNumbers removeNewlineChars removeNumbers removePrefixes removePunctuation removeStopWords removeSuffixes reverse.transliterate stem stemArabic transliterate

## An arabic stemmer, modeled after after the light10 stemmer, but with substantial modifications
## Richard Nielsen
## This version: 7/14/2022

###########################################################
## A list of all chars in the Arabic unicode range
## I use this below to demonstrate which characters are being cleaned
## by the cleanChars() function.
## Build the 16 three-hex-digit prefixes "060".."06F" covering the Arabic block.
triplet <- c(paste("0",c(60:69),sep=""),paste("06",c("A","B","C","D","E","F"),sep=""))
## Append the final hex digit to enumerate all 256 code points 0600-06FF.
ArabicUnicodeChars <- as.vector(sapply(triplet,function(x){paste(x,c(0:9,c("A","B","C","D","E","F")),sep="")}))
## Turn each code point into a "\uXXXX" escape string...
x1 <- paste0("\\u",ArabicUnicodeChars)
## ...and parse each escape inside a quoted literal to obtain the actual character.
ArabicUnicodeChars <- sapply(x1,function(x){parse(text = paste0("'", x, "'"))[[1]]})
## Drop the intermediate objects so only ArabicUnicodeChars remains.
rm(x1,triplet)


###########################################################
## Trim function
## This is used throughout to trim whitespace
trim <- function (x) gsub("^\\s+|\\s+$", "", x)


###########################################################
## package all the stemmer functions together
## Full Arabic stemming pipeline: character cleaning, stopword removal,
## prefix/suffix stripping, and (optionally) transliteration to Latin chars.
##
## dat                 : character vector of Arabic text.
## cleanChars          : drop all chars outside Latin + Arabic alphabet
##                       (flag deliberately shares its name with the function).
## cleanLatinChars     : drop all Latin characters (same naming convention).
## transliteration     : if TRUE, transliterate the result to Latin characters.
## returnStemList      : if TRUE, also return a word -> stem mapping.
## defaultStopwordList : use the built-in stopword list.
## customStopwordList  : extra stopwords (Arabic, or transliterated Latin).
## dontStemTheseWords  : words protected from stemming (default: variants of
##                       "allah"); Latin entries are reverse-transliterated.
stemArabic <- function(dat, cleanChars=TRUE, cleanLatinChars=TRUE, 
                 transliteration=TRUE, returnStemList=FALSE,
                 defaultStopwordList=TRUE, customStopwordList=NULL,
                 dontStemTheseWords=c("allh","llh")){
  dat <- removeNewlineChars(dat)  ## gets rid of \n\r\t\f\v
  dat <- removePunctuation(dat)  ## gets rid of punctuation
  dat <- removeDiacritics(dat)  ## gets rid of Arabic diacritics
  dat <- removeEnglishNumbers(dat)  ## gets rid of English numbers
  dat <- removeArabicNumbers(dat)  ## gets rid of Arabic numbers
  dat <- removeFarsiNumbers(dat)  ## gets rid of Farsi numbers
  dat <- fixAlifs(dat)  ## standardizes different hamzas on alif seats
  if(cleanChars){dat <- cleanChars(dat)}  ## removes all unicode chars except Latin chars and Arabic alphabet
  if(cleanLatinChars){dat <- cleanLatinChars(dat)}  ## removes all Latin chars
  ## BUG FIX: pass the defaultStopwordList argument through instead of
  ## hard-coding TRUE, so defaultStopwordList=FALSE is honored.
  dat <- removeStopWords(dat, defaultStopwordList=defaultStopwordList,
                         customStopwordList=customStopwordList)$text
  ## reverse-transliterate any protected words supplied in Latin characters
  if(!is.null(dontStemTheseWords)) {
    hasArabic <- vapply(dontStemTheseWords, function(x) {
      grepl("[\u0600-\u0700]", x)
    }, logical(1))
    dontStemTheseWords[!hasArabic] <- vapply(dontStemTheseWords[!hasArabic],
                                             reverse.transliterate, character(1))
  }
  ## do the stemming
  if(returnStemList){
    ## doStemming also returns the word -> stem mapping for each element
    tmp <- lapply(dat,function(x){doStemming(x, dontstem = dontStemTheseWords)})
    dat <- lapply(tmp,function(x){x$text})
    stemlist <- lapply(tmp,function(x){x$stemmedWords})
    if(transliteration){dat <- transliterate(dat)}  ## performs transliteration
    return(list(text=dat,stemlist=stemlist))
  } else {
    dat <- unlist(lapply(dat,function(x){removePrefixes(x, dontstem = dontStemTheseWords)}))  ## removes prefixes
    dat <- unlist(lapply(dat,function(x){removeSuffixes(x, dontstem = dontStemTheseWords)}))  ## removes suffixes
    if(transliteration){dat <- transliterate(dat)}  ## performs transliteration
    return(dat)
  }
}

## The deprecated function
## Deprecated predecessor of stemArabic(); kept for backward compatibility.
## Same pipeline, except stemming is applied to the whole vector at once
## rather than element-wise. See stemArabic() for parameter documentation.
stem <- function(dat, cleanChars=TRUE, cleanLatinChars=TRUE, 
                 transliteration=TRUE, returnStemList=FALSE,
                 defaultStopwordList=TRUE, customStopwordList=NULL,
                 dontStemTheseWords=c("allh","llh")){
  #.Deprecated("stemArabic") ## wouldn't pass CRAN checks with this
  dat <- removeNewlineChars(dat)  ## gets rid of \n\r\t\f\v
  dat <- removePunctuation(dat)  ## gets rid of punctuation
  dat <- removeDiacritics(dat)  ## gets rid of Arabic diacritics
  dat <- removeEnglishNumbers(dat)  ## gets rid of English numbers
  dat <- removeArabicNumbers(dat)  ## gets rid of Arabic numbers
  dat <- removeFarsiNumbers(dat)  ## gets rid of Farsi numbers
  dat <- fixAlifs(dat)  ## standardizes different hamzas on alif seats
  if(cleanChars){dat <- cleanChars(dat)}  ## removes all unicode chars except Latin chars and Arabic alphabet
  if(cleanLatinChars){dat <- cleanLatinChars(dat)}  ## removes all Latin chars
  ## BUG FIX: pass the defaultStopwordList argument through instead of
  ## hard-coding TRUE, so defaultStopwordList=FALSE is honored.
  dat <- removeStopWords(dat, defaultStopwordList=defaultStopwordList,
                         customStopwordList=customStopwordList)$text
  ## reverse-transliterate any protected words supplied in Latin characters
  if(!is.null(dontStemTheseWords)) {
    hasArabic <- vapply(dontStemTheseWords, function(x) {
      grepl("[\u0600-\u0700]", x)
    }, logical(1))
    dontStemTheseWords[!hasArabic] <- vapply(dontStemTheseWords[!hasArabic],
                                             reverse.transliterate, character(1))
  }
  ## do the stemming
  if(returnStemList){
    tmp <- doStemming(dat, dontstem = dontStemTheseWords) ## removes prefixes and suffixes, and can return a list matching words to stemmed words
    dat <- tmp$text
    stemlist <- tmp$stemmedWords
    if(transliteration){dat <- transliterate(dat)}  ## performs transliteration
    return(list(text=dat,stemlist=stemlist))
  } else {
    dat <- removePrefixes(dat, dontstem = dontStemTheseWords)  ## removes prefixes
    dat <- removeSuffixes(dat, dontstem = dontStemTheseWords)  ## removes suffixes
    if(transliteration){dat <- transliterate(dat)}  ## performs transliteration
    return(dat)
  }
}

###########################################################
## remove numbers

## Removes Latin character numbers
## Replace ASCII digits 0-9 with spaces, then normalize whitespace.
removeEnglishNumbers <- function(texts){
  texts <- gsub('[0-9]', ' ', texts)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}

## Removes Arabic character numbers
## Replace Arabic-Indic digits (U+0660-U+0669) with spaces, then normalize whitespace.
removeArabicNumbers <- function(texts){
  texts <- gsub('[\u0660-\u0669]', ' ', texts)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}

## Removes Farsi character numbers
## Replace Extended Arabic-Indic (Farsi) digits (U+06F0-U+06F9) with spaces,
## then normalize whitespace.
removeFarsiNumbers <- function(texts){
  texts <- gsub('[\u06f0-\u06f9]', ' ', texts)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}

## bundles the three functions for removing numbers
## Remove English, Arabic, and Farsi digits in one call by chaining the
## three dedicated number-removal functions.
removeNumbers <- function(texts){
  removeFarsiNumbers(removeArabicNumbers(removeEnglishNumbers(texts)))
}

###########################################################
## clean out junk characters

## removes punctuation
## Replace punctuation with spaces, then normalize whitespace.
removePunctuation <- function(texts){
  ## Arabic-specific punctuation marks -> space
  texts <- gsub('\u060c|\u061b|\u061f|\u066c|\u066d|\u06d4|\u06dd|\u06de|\u06e9', ' ', texts)
  ## assorted junk characters that show up in scraped text -> space
  texts <- gsub('[\u200C-\u200F]|&nbsp|~|\u2018|\u2022|\u2013|\u2026|\u201c|\u201d|\u2019|\ufd3e|\ufd3f', ' ', texts)
  ## NOTE: a latin-1 pattern ('\xbb|\xab|\xf7|\xb7') was removed 7/14/22
  ## because it broke CRAN checks ("unable to translate ... to a wide string").
  ## general punctuation class -> space
  texts <- gsub("[[:punct:]]", " ", texts)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}

## removes diacritic marks indicating short Arabic vowels
## Delete Arabic diacritic marks (short vowels, tanwin, tatweel, Koranic
## annotation signs). These sit above/below base letters, so they are
## removed outright rather than replaced with spaces.
removeDiacritics <- function(texts){
  marks <- '[\u0610-\u061a]|\u0640|[\u064b-\u065f]|\u0670|[\u06d6-\u06dc]|[\u06df-\u06e4]|[\u06e7-\u06e8]|[\u06ea-\u06ed]'
  texts <- gsub(marks, '', texts)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}
## see which diacritics are removed
#grep('[\u0610-\u061a]|\u0640|[\u064b-\u065f]|\u0670|[\u06d6-\u06dc]|[\u06df-\u06e4]|[\u06e7-\u06e8]|[\u06ea-\u06ed]',  ArabicUnicodeChars, value=T)


## remove \n \r \t \f \v
## Replace \n \r \t \f \v with spaces, then normalize whitespace.
removeNewlineChars <- function(texts){
  texts <- gsub('\n|\r|\t|\f|\v', ' ', texts)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}


## bundle the functions for punctuation, diacritics, and newlines together. Mostly to preserve possible backward compatibility with older versions
## Convenience wrapper: strip punctuation, diacritics, and newline-type
## characters in one call (kept for backward compatibility).
clean <- function(texts){
  removeNewlineChars(removeDiacritics(removePunctuation(texts)))
}


#######################################################
## Standardize the alifs

## Normalize alif variants (alif madda, alif with hamza above/below,
## alif wasla and friends) to the bare alif U+0627.
fixAlifs <- function(texts){
  gsub('\u0622|\u0623|\u0625|[\u0671-\u0673]|\u0675', '\u0627', texts)
}


#######################################################
## clean up the characters

## This function removes any characters in the text that are not in either the Latin unicode range
## or in the Arabic alphabet + "p".  To see which characters are retained, uncomment and run this
## line of code: ArabicUnicodeChars[!sapply(ArabicUnicodeChars,cleanChars)==""]
## Drop every character outside the basic Latin range and the core Arabic
## alphabet (+ "peh"). To see exactly which Arabic-block characters survive,
## run: ArabicUnicodeChars[!sapply(ArabicUnicodeChars,cleanChars)==""]
cleanChars <- function(texts){
  # unicode block reference: http://jrgraphix.net/research/unicode_blocks.php
  dropped <- '[\u00A0-\u0600]|[\u0600-\u0621]|[\u063b-\u0640]|[\u064b-\u065f]|[\u066a-\u067d]|[\u067f-\u06ff]|[\u0700-\uFB4F]|[\uFB50-\uFDFF]|[\uFE00-\uFFFF]'
  texts <- gsub(dropped, '', texts)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}

## This function removes all latin characters using unicode ranges
## Remove all Latin-script characters using the Unicode script property
## (catches accented Latin letters, not just ASCII [a-zA-Z]).
cleanLatinChars <- function(texts){
  # unicode block reference: http://jrgraphix.net/research/unicode_blocks.php
  texts <- gsub("\\p{Latin}", "", texts, perl = TRUE)
  ## squeeze repeated spaces, then strip leading/trailing whitespace
  texts <- gsub(" {2,}", " ", texts)
  gsub("^\\s+|\\s+$", "", texts)
}

#######################################################
## Remove stopwords

## Removes stopwords from a list that I've put together

## Remove Arabic stopwords from each element of a character vector.
##
## texts: character vector of space-separated Arabic text.
## defaultStopwordList: if TRUE, use the built-in list of prepositions,
##   pronouns, particles, and connectors below.
## customStopwordList: optional extra stopwords; entries containing no Arabic
##   characters are assumed to be transliterated and are converted back.
## Returns a list: $text (cleaned texts) and $arabicStopwordList (list used).
removeStopWords <- function(texts, defaultStopwordList=TRUE, customStopwordList=NULL){
  
  # Split up the words...
  textsSplit <- strsplit(texts," ")
  
  # Prepositions (comments give the package's transliteration of each word)
  preps  <- c('\u0641\u064a',  #fy
              '\u0641\u064a\u0647',  #fyh
              '\u0641\u064a\u0647\u0627',  #fyha
              '\u0641\u064a\u0647\u0645',  #fyhm
              '\u0639\u0644\u0649',  #3lA
              '\u0639\u0644\u064a\u0643',  #3lyk
              '\u0639\u0644\u064a\u0643\u0645',  #3lykm
              '\u0639\u0644\u064a\u0646\u0627',  #3lyna
              '\u0639\u0644\u064a\u0647',  #3lyh
              '\u0639\u0644\u064a\u0647\u0627',  #3lyha
              '\u0639\u0644\u064a\u0647\u0645',  #3lyhm
              '\u0639\u0644\u064a',  #3ly
              '\u0628\u0647',  #bh
              '\u0628\u0647\u0627',  #bha
              '\u0628\u0647\u0645',  #bhm
              '\u0628\u0647\u0630\u0627',  #bhia
              '\u0628\u0630\u0644\u0643',  #bilk
              '\u0628\u0643',  #bk
              '\u0628\u0643\u0645',  #bkm
              '\u0628\u0643\u0644',  #bkl
              '\u0628\u0645\u0627',  #bma
              '\u0628\u0645\u0646',  #bmn
              '\u0628\u0646\u0627',  #bna
              '\u0644\u0647',  #lh
              '\u0644\u0647\u0627',  #lha
              '\u0644\u0647\u0645',  #lhm
              '\u0645\u0639',  #m3
              '\u0645\u0639\u0647',  #m3h
              '\u0645\u0639\u0647\u0627',  #m3ha
              '\u0645\u0639\u0647\u0645',  #m3hm
              '\u0639\u0646',  #3n
              '\u0639\u0646\u0627',  #3na
              '\u0639\u0646\u0647',  #3nh
              '\u0639\u0646\u0647\u0627',  #3nha
              '\u0639\u0646\u0647\u0645',  #3nhm
              '\u062a\u062d\u062a',  #t7t
              '\u062d\u062a\u0649',  #7tA
              '\u0641\u0648\u0642',  #fwQ
              '\u0641\u0648\u0642\u064e',  #fwQ?
              '\u0628\u062c\u0627\u0646\u0628',  #bjanb
              '\u0623\u0645\u0627\u0645',  #amam
              '\u0623\u0645\u0627\u0645\u064e',  #amam?
              '\u0627\u0645\u0627\u0645',  #amam
              '\u062e\u0627\u0631\u062c',  #Karj
              '\u0628\u0627\u0644\u062e\u0627\u0631\u062c',  #balKarj
              '\u062d\u0648\u0644\u064e',  #7wl?
              '\u062d\u0648\u0644',  #7wl
              '\u0631\u063a\u0645',  #rGm
              '\u0628\u0627\u0644\u0631\u063a\u0645',  #balrGm
              '\u0631\u063a\u0645\u064e',  #rGm?
              '\u0645\u0646\u0630',  #mni
              '\u0645\u0646\u0630\u064f',  #mni?
              '\u0645\u0646',  #mn
              '\u062e\u0644\u0627\u0644',  #Klal
              '\u062e\u0644\u0627\u0644\u064e',  #Klal?
              '\u062d\u0648\u0644',  #7wl
              '\u062d\u0648\u0644\u064e',  #7wl?
              '\u0642\u0628\u0644',  #Qbl
              '\u0642\u0628\u0644\u064e',  #Qbl?
              '\u0648\u0641\u0642\u0627',  #wfQa
              '\u0625\u0644\u0649',  #alA
              '\u0627\u0644\u0649\u0648\u0631\u0627\u0621\u064e',  #alAwraq?
              '\u0648\u0631\u0627\u0621',  #wraq
              '\u0628\u064a\u0646\u064e',  #byn?
              '\u0628\u064a\u0646',  #byn
              '\u0628\u064a\u0646\u0647\u0645',  #bynhm
              '\u0628\u064a\u0646\u0647\u0645\u0627',  #bynhma
              '\u0628\u064a\u0646\u0643\u0645',  #bynkm
              '\u0628\u064a\u0646\u0645\u0627',  #bynma
              '\u0628\u062f\u0648\u0646',  #bdwn
              '\u0644\u0643\u0646',  #lkn
              '\u0628\u0627\u062a\u062c\u0627\u0647',  #batjah
              '\u0623\u0642\u0644',  #aQl
              '\u0627\u0642\u0644',  #aQl
              '\u0627\u0643\u062b\u0631'  #akUr
  )
  
  
  # Demonstrative, subject and relative pronouns
  pronouns <- c('\u0647\u0630\u0627',  #hia
                '\u0647\u0630\u0647',  #hih
                '\u0630\u0644\u0643',  #ilk
                '\u062a\u0644\u0643',  #tlk
                '\u0647\u0624\u0644\u064e\u0627\u0621',  #hol?aq
                '\u0647\u0624\u0644\u0627\u0621',  #holaq
                '\u0627\u0648\u0644\u0627\u0626\u0643',  #awla5k
                '\u0647\u0630\u0627\u0646',  #hian
                '\u0647\u0630\u064a\u0646\u0647\u062a\u0627\u0646',  #hiynhtan
                '\u0647\u062a\u064a\u0646\u0623\u0646\u0627',  #htynana
                '\u0627\u0646\u0627',  #ana
                '\u0623\u0646\u062a',  #ant
                '\u0647\u0645\u0627',  #hma
                '\u0623\u0646\u062a\u064e',  #ant?
                '\u0627\u0646\u062a',  #ant
                '\u0623\u0646\u062a',  #ant
                '\u0623\u0646\u062a\u0650',  #ant?
                '\u0627\u0646\u062a\u0647\u0648',  #anthw
                '\u0647\u0648\u064e',  #hw?
                '\u0647\u0648',  #hw
                '\u0647\u064a',  #hy
                '\u0647\u064a\u064e',  #hy?
                '\u0646\u062d\u0646',  #n7n
                '\u0623\u0646\u062a\u0645',  #antm
                '\u0627\u0646\u062a\u0645',  #antm
                '\u0623\u0646\u062a\u0645',  #antm
                '\u0627\u0646\u062a\u0645',  #antm
                '\u0647\u064f\u0645',  #h?m
                '\u0647\u0645',  #hm
                '\u0644\u0647\u0645',  #lhm
                '\u0645\u0646\u0647\u0645',  #mnhm
                '\u0648\u0647\u0645',  #whm
                '\u0627\u0644\u062a\u064a',  #alty
                '\u0627\u0644\u0630\u064a',  #aliy
                '\u0627\u0644\u0644\u0630\u0627\u0646',  #allian
                '\u0627\u0644\u0644\u0630\u064a\u0646',  #alliyn
                '\u0627\u0644\u0644\u062a\u0627\u0646',  #alltan
                '\u0627\u0644\u0644\u062a\u064a\u0646')  #alltyn
  
  # Particles
  particles <- c('\u0627\u0646',  #an
                 '\u0648\u0627\u0646',  #wan
                 '\u0625\u0646',  #an
                 '\u0625\u0646\u0647',  #anh
                 '\u0625\u0646\u0647\u0627',  #anha
                 '\u0625\u0646\u0647\u0645',  #anhm
                 '\u0625\u0646\u0647\u0645\u0627',  #anhma
                 '\u0625\u0646\u064a',  #any
                 '\u0648\u0625\u0646',  #wan
                 '\u0648\u0623\u0646',  #wan
                 '\u0627\u0646',  #an
                 '\u0627\u0646\u0647',  #anh
                 '\u0627\u0646\u0647\u0627',  #anha
                 '\u0627\u0646\u0647\u0645',  #anhm
                 '\u0627\u0646\u0647\u0645\u0627',  #anhma
                 '\u0627\u0646\u064a',  #any
                 '\u0623\u0646\u0643',  #ank
                 '\u0625\u0646\u0643',  #ank
                 '\u0627\u0646\u0643',  #ank
                 '\u0623\u0646\u0643\u0645',  #ankm
                 '\u0625\u0646\u0643\u0645',  #ankm
                 '\u0627\u0646\u0643\u0645',  #ankm
                 '\u0627\u0646\u0646\u0627',  #anna
                 '\u0648\u0627\u0646',  #wan
                 '\u0648\u0627\u0646',  #wan
                 '\u0623\u0646',  #an
                 '\u0627\u0646',  #an
                 '\u0623\u0644\u0627',  #ala
                 '\u0628\u0623\u0646',  #ban
                 '\u0627\u0646',  #an
                 '\u0627\u0644\u0627',  #ala
                 '\u0628\u0627\u0646',  #ban
                 '\u0628\u0627\u0646\u0647\u0645',  #banhm
                 '\u0623\u0646\u0647',  #anh
                 '\u0623\u0646\u0647\u0627',  #anha
                 '\u0623\u0646\u0647\u0645',  #anhm
                 '\u0623\u0646\u0647\u0645\u0627',  #anhma
                 '\u0627\u0646\u0647',  #anh
                 '\u0627\u0646\u0647\u0627',  #anha
                 '\u0627\u0646\u0647\u0645',  #anhm
                 '\u0627\u0646\u0647\u0645\u0627',  #anhma
                 '\u0623\u0630',  #ai
                 '\u0627\u0630',  #ai
                 '\u0627\u0630\u0627',  #aia
                 '\u0625\u0630',  #ai
                 '\u0625\u0630\u0627',  #aia
                 '\u0648\u0625\u0630',  #wai
                 '\u0648\u0625\u0630\u0627',  #waia
                 '\u0627\u0630',  #ai
                 '\u0627\u0630',  #ai
                 '\u0627\u0630\u0627',  #aia
                 '\u0627\u0630',  #ai
                 '\u0627\u0630\u0627',  #aia
                 '\u0641\u0627\u0630\u0627',  #faia
                 '\u0645\u0627\u0630\u0627',  #maia
                 '\u0648\u0627\u0630',  #wai
                 '\u0648\u0627\u0630\u0627',  #waia
                 '\u0644\u0648\u0644\u0627',  #lwla
                 '\u0644\u0648',  #lw
                 '\u0648\u0644\u0648\u0633\u0648\u0641',  #wlwswf
                 '\u0644\u0646',  #ln
                 '\u0645\u0627',  #ma
                 '\u0644\u0645',  #lm
                 '\u0648\u0644\u0645',  #wlm
                 '\u0623\u0645\u0627',  #ama
                 '\u0627\u0645\u0627',  #ama
                 '\u0644\u0627',  #la
                 '\u0648\u0644\u0627',  #wla
                 '\u0625\u0644\u0627',  #ala
                 '\u0627\u0644\u0627',  #ala
                 '\u0623\u0645',  #am
                 '\u0623\u0648',  #aw
                 '\u0627\u0645',  #am
                 '\u0627\u0648',  #aw
                 '\u0628\u0644',  #bl
                 '\u0642\u062f',  #Qd
                 '\u0648\u0642\u062f',  #wQd
                 '\u0644\u0642\u062f',  #lQd
                 '\u0623\u0646\u0645\u0627',  #anma
                 '\u0625\u0646\u0645\u0627',  #anma
                 '\u0628\u0644',  #bl
                 '\u0627\u0646\u0645\u0627',  #anma
                 '\u0627\u0646\u0645\u0627',  #anma
                 '\u0648')  #w
  
  # Connectors
  connectors <- c('\u0628\u0645\u0627',  #bma
                  '\u0643\u0645\u0627',  #kma
                  '\u0644\u0645\u0627',  #lma
                  '\u0644\u0623\u0646',  #lan
                  '\u0644\u0627\u0646',  #lan
                  '\u0644\u064a', #ly
                  '\u0644\u0649', #ly
                  '\u0644\u0647\u0630\u0623', #lhia
                  '\u0644\u0630\u0623', #lia
                  '\u0644\u0623\u0646\u0647',  #lanh
                  '\u0644\u0623\u0646\u0647\u0627',  #lanha
                  '\u0644\u0623\u0646\u0647\u0645',  #lanhm
                  '\u0644\u0627\u0646',  #lan
                  '\u0644\u0627\u0646\u0647',  #lanh
                  '\u0644\u0627\u0646\u0647\u0627',  #lanha
                  '\u0644\u0627\u0646\u0647\u0645',  #lanhm
                  '\u062b\u0645',  #Um
                  '\u0623\u064a\u0636\u0627',  #ayDa
                  '\u0627\u064a\u0636\u0627',  #ayDa
                  '\u0643\u0630\u0644\u0643',  #kilk
                  '\u0642\u0628\u0644',  #Qbl
                  '\u0628\u0639\u062f',  #b3d
                  '\u0644\u0643\u0646',  #lkn
                  '\u0648\u0644\u0643\u0646',  #wlkn
                  '\u0644\u0643\u0646\u0647',  #lknh
                  '\u0644\u0643\u0646\u0647\u0627',  #lknha
                  '\u0644\u0643\u0646\u0647\u0645',  #lknhm
                  '\u0641\u0642\u0637',  #fQT
                  '\u0631\u063a\u0645',  #rGm
                  '\u0628\u0627\u0644\u0631\u063a\u0645',  #balrGm
                  '\u0628\u0641\u0636\u0644',  #bfDl
                  '\u062d\u064a\u062b',  #7yU
                  '\u0628\u062d\u064a\u062b',  #b7yU
                  '\u0644\u0643\u064a',  #lky
                  '\u0647\u0646\u0627',  #hna
                  '\u0647\u0646\u0627\u0643',  #hnak
                  '\u0628\u0633\u0628\u0628',  #bsbb
                  '\u0630\u0627\u062a',  #iat
                  '\u0630\u0648',  #iw
                  '\u0630\u064a',  #iy
                  '\u0630\u0649',  #iy
                  '\u0648\u0647', #wh
                  '\u064a\u0627',  #ya
                  '\u0627\u0646\u0645\u0627',  #anma
                  '\u0641\u0647\u0630\u0627',  #fhia
                  '\u0641\u0647\u0648',  #fhw
                  '\u0641\u0645\u0627',  #fma
                  '\u0641\u0645\u0646',  #fmn
                  '\u0641\u064a\u0645\u0627', #fyma
                  '\u0641\u0647\u0644',  #fhl
                  '\u0648\u0647\u0644',  #whl
                  '\u0641\u0647\u0624\u0644\u0627\u0621',  #fholaq
                  '\u0643\u0630\u0627', #kia
                  '\u0644\u0630\u0644\u0643', #lilk
                  '\u0644\u0645\u0627\u0630\u0627', #lmaia
                  '\u0644\u0645\u0646', #lmn
                  '\u0644\u0646\u0627',  #lna
                  '\u0645\u0646\u0627',  #mna
                  '\u0645\u0646\u0643',  #mnk
                  '\u0645\u0646\u0643\u0645',  #mnkm
                  '\u0645\u0646\u0647\u0645\u0627',  #mnhm
                  '\u0645\u0646\u0647\u0645\u0627',  #mnhma
                  '\u0644\u0643', #lk
                  '\u0648\u0644\u0648', #wlw
                  '\u0645\u0645\u0627', #mma
                  '\u0648\u0645\u0627', #wma
                  '\u0648\u0645\u0646', #wmn
                  '\u0639\u0646\u062f',  #3nd
                  '\u0639\u0646\u062f\u0647\u0645',  #3ndhm
                  '\u0639\u0646\u062f\u0645\u0627',  #3ndma
                  '\u0639\u0646\u062f\u0646\u0627',  #3ndna
                  '\u0639\u0646\u0647\u0645\u0627',  #3nhma
                  '\u0639\u0646\u0643',  #3nk
                  '\u0627\u0630\u0646',  #ain
                  '\u0627\u0644\u0630\u064a',  #aliy
                  '\u0641\u0627\u0646\u0627',  #fana
                  '\u0641\u0627\u0646\u0647\u0645',  #fanhm
                  '\u0641\u0647\u0645',  #fhm
                  '\u0641\u0647',  #fh
                  '\u0641\u0643\u0644',  #fkl
                  '\u0644\u0643\u0644',  #lkl
                  '\u0644\u0643\u0645',  #lkm
                  '\u0641\u0644\u0645',  #flm
                  '\u0641\u0644\u0645\u0627',  #flma
                  '\u0641\u064a\u0643',  #fyk
                  '\u0641\u064a\u0643\u0645',  #fykm
                  '\u0644\u0647\u0630\u0627')    # lhia
  
  all <- c(preps,pronouns,particles,connectors)
  ## include alif-normalized variants of every stopword
  all <- unique(c(all,fixAlifs(all)))
  
  ## if the defaultStopwordList = FALSE, then don't use any of the default stopwords
  if(!defaultStopwordList){all <- c()}
  
  ## add in the custom stopword list if provided
  if(!is.null(customStopwordList)){
    ## first check if the custom list is in Arabic or Latin chars
    hasArabic <- vapply(customStopwordList, function(x){grepl("[\u0600-\u0700]", x)}, logical(1))
    ## transliterate those that don't have arabic
    customStopwordList[!hasArabic] <- vapply(customStopwordList[!hasArabic], reverse.transliterate, character(1))
    ## add the custom stopword list to "all"
    all <- c(all, customStopwordList)
  }
  
  ## BUG FIX: seq_along() instead of 1:length() so zero-length input does
  ## not produce the bogus sequence 1:0 and index out of bounds.
  for(j in seq_along(textsSplit)){
    ts <- textsSplit[[j]]
    ## blank out every stopword token in one vectorized step
    ts[ts %in% all] <- ""
    ts <- paste(ts, collapse=" ")
    ts <- trim(gsub(" {2,}"," ", ts))
    textsSplit[[j]] <- ts
  }
  textsSplit <- unlist(textsSplit)
  ## unlist() of an empty list is NULL; return character(0) instead
  if(is.null(textsSplit)) textsSplit <- character(0)
  return(list(text=textsSplit,arabicStopwordList=all))
}

#######################################################
## Transliterate from arabic to latin characters for use with text analysis software.
## All transliteration is 1-to-1, except for some special characters which I provide 
## transliteration for but generally recommend cleaning out instead.

## Transliterate Arabic text to Latin characters. Every mapping is a
## one-to-one substitution of a single Arabic code point for a single
## Latin/digit symbol, so the replacements are order-independent.
transliterate <- function(texts){
  mapping <- c(
    # The alphabet
    '\u0627'='a', '\u0649'='A', '\u0628'='b', '\u062a'='t', '\u062b'='U',
    '\u062c'='j', '\u062d'='7', '\u062e'='K', '\u062f'='d', '\u0630'='i',
    '\u0631'='r', '\u0632'='z', '\u0633'='s', '\u0634'='W', '\u0635'='S',
    '\u0636'='D', '\u0637'='T', '\u0638'='Z', '\u0639'='3', '\u063a'='G',
    '\u0641'='f', '\u0642'='Q', '\u0643'='k', '\u0644'='l', '\u0645'='m',
    '\u0646'='n', '\u0647'='h', '\u0648'='w', '\u064a'='y',
    # Hamzas (hamza-on-alif variants collapse to plain 'a')
    '\u0623'='a', '\u0625'='a', '\u0624'='o', '\u0626'='5', '\u0621'='q',
    '\u0622'='a',
    # taa-marbuta
    '\u0629'='0',
    # Rare characters: peh, heh variant, keheh, taa/noon/yeh variants,
    # alef wasla, presentation forms, Farsi yeh variants, dotted kehehs
    '\u067E'='p', '\u06C1'='h', '\u06A9'='k', '\u0679'='t', '\u06BA'='n',
    '\u06D2'='y', '\u06cc'='y', '\u0671'='a', '\ufedf'='l', '\uFEEB'='h',
    '\u063f'='y', '\u063d'='y', '\u063e'='y', '\u063b'='k', '\u063c'='k'
  )
  for (arabicChar in names(mapping)) {
    texts <- gsub(arabicChar, mapping[[arabicChar]], texts)
  }
  texts
}



#######################################################
## Transliterate from Latin to Arabic

## this reverses the transliteration, except that it cannot reproduce hamzas on alifs (they 
## all come out as plain alifs).

## Reverse the transliteration: map Latin transliteration symbols back to
## Arabic characters. Hamza seats cannot be recovered (they all collapsed
## to 'a' going forward), so every 'a' comes back as a plain alif.
##
## texts: character vector; all elements are split into single characters
##   and re-joined into ONE string (matching the original behavior).
## Returns a single UTF-8 string.
##
## Fixes vs. the previous version: (1) the per-character sapply/gsub chain
## is replaced by a single match() table lookup (same output, O(n));
## (2) a dead re-mapping of 'a' to tanwiin (\u064b) is dropped -- it could
## never fire because every 'a' was already converted to alif first.
reverse.transliterate <- function(texts){
  ## Latin symbol -> Arabic character (inverse of transliterate())
  mapping <- c(
    # The alphabet
    a='\u0627', A='\u0649', b='\u0628', t='\u062a', U='\u062b', j='\u062c',
    '7'='\u062d', K='\u062e', d='\u062f', i='\u0630', r='\u0631', z='\u0632',
    s='\u0633', W='\u0634', S='\u0635', D='\u0636', T='\u0637', Z='\u0638',
    '3'='\u0639', G='\u063a', f='\u0641', Q='\u0642', k='\u0643', l='\u0644',
    m='\u0645', n='\u0646', h='\u0647', w='\u0648', y='\u064a',
    # Hamzas
    o='\u0624', '5'='\u0626', q='\u0621',
    # taa-marbuta and other special letters
    '0'='\u0629',
    # Rare characters: Arabic "peh" -- ba with three dots
    p='\u067E'
  )
  chars <- unlist(strsplit(texts, ""))
  hit <- match(chars, names(mapping))
  ## characters with no mapping (unexpected symbols) pass through unchanged
  chars[!is.na(hit)] <- mapping[hit[!is.na(hit)]]
  out <- paste(chars, collapse="")
  Encoding(out) <- "UTF-8"
  return(out)
}


############################################################
## removes prefixes and suffixes, roughly in the same way as the light10 stemmer

############################################################
## Slower, more useful stemming.
## The first stemmer ("doStemming") is a bit slower on benchmarks
## but does some useful things:  1) it's modular, and 2) it returns
## a list of the stemmed and unstemmed terms so you can back out
## what words are associated with what stems.

## The arguments specify, for each prefix, the length that the word must
## be in order to have a prefix removed.
## Note that I only allow one prefix to be taken off each word.
## I also have a list of words not to stem (variants of allah).

## These are the component functions for removing each prefix
## These functions expect a word split into letters
## alif-lam
rAlifLam <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[1:2],collapse="") == '\u0627\u0644'){word[1:2] <- ""}}; return(paste(word,collapse=""))}
## waw-alif-lam
rWawAlifLam <- function(word, minwordlen=5){if(length(word) >= minwordlen){if(paste(word[1:3],collapse="") == '\u0648\u0627\u0644'){word[1:3] <- ""}}; return(paste(word,collapse=""))}
## ba-alif-lam
rBaAlifLam <- function(word, minwordlen=5){if(length(word) >= minwordlen){if(paste(word[1:3],collapse="") == '\u0628\u0627\u0644'){word[1:3] <- ""}}; return(paste(word,collapse=""))}
## kaf-alif-lam
rKafAlifLam <- function(word, minwordlen=5){if(length(word) >= minwordlen){if(paste(word[1:3],collapse="") == '\u0643\u0627\u0644'){word[1:3] <- ""}}; return(paste(word,collapse=""))}
## fa-alif-lam
rFaAlifLam <- function(word, minwordlen=5){if(length(word) >= minwordlen){if(paste(word[1:3],collapse="") == '\u0641\u0627\u0644'){word[1:3] <- ""}}; return(paste(word,collapse=""))}
## lam-lam
rLamLam <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[1:2],collapse="") == '\u0644\u0644'){word[1:2] <- ""}}; return(paste(word,collapse=""))}
## waw
rWaw <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[1],collapse="") == '\u0648'){word[1] <- ""}}; return(paste(word,collapse=""))}

## suffixes
## ha-alif
rHaAlif <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[ (length(word)-1):length(word) ],collapse="") == '\u0647\u0627'){word[ (length(word)-1):length(word) ] <- ""}}; return(paste(word,collapse=""))}
## alif-nun
rAlifNun <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[ (length(word)-1):length(word) ],collapse="") == '\u0627\u0646'){word[ (length(word)-1):length(word) ] <- ""}}; return(paste(word,collapse=""))}
## alif-ta
rAlifTa <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[ (length(word)-1):length(word) ],collapse="") == '\u0627\u062a'){word[ (length(word)-1):length(word) ] <- ""}}; return(paste(word,collapse=""))}
## waw-nun
rWawNun <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[ (length(word)-1):length(word) ],collapse="") == '\u0648\u0646'){word[ (length(word)-1):length(word) ] <- ""}}; return(paste(word,collapse=""))}
## yah-nun
rYahNun <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[ (length(word)-1):length(word) ],collapse="") == '\u064a\u0646'){word[ (length(word)-1):length(word) ] <- ""}}; return(paste(word,collapse=""))}
## yah-heh
rYahHeh <- function(word, minwordlen=4){if(length(word) >= minwordlen){if(paste(word[ (length(word)-1):length(word) ],collapse="") == '\u064a\u0647'){word[ (length(word)-1):length(word) ] <- ""}}; return(paste(word,collapse=""))}
## yah-ta marbutta: strip a trailing yah+ta-marbutta from a character-split word
rYahTamarbutta <- function(word, minwordlen=4){
  n <- length(word)
  if (n >= minwordlen && paste0(word[n - 1], word[n]) == '\u064a\u0629') {
    word <- word[seq_len(n - 2)]
  }
  paste(word, collapse="")
}
## heh: strip a trailing heh (possessive) from a character-split word
rHeh <- function(word, minwordlen=3){
  n <- length(word)
  if (n >= minwordlen && word[n] == '\u0647') {
    word <- word[-n]
  }
  paste(word, collapse="")
}
## ta marbutta: strip a trailing ta-marbutta (feminine ending) from a
## character-split word
rTamarbutta <- function(word, minwordlen=3){
  n <- length(word)
  if (n >= minwordlen && word[n] == '\u0629') {
    word <- word[-n]
  }
  paste(word, collapse="")
}
## yah: strip a trailing yah from a character-split word
rYah <- function(word, minwordlen=3){
  n <- length(word)
  if (n >= minwordlen && word[n] == '\u064a') {
    word <- word[-n]
  }
  paste(word, collapse="")
}


## bundle them all together
## Run the prefix removers, then the suffix removers, over each word of a
## space-separated string. At most one prefix and one suffix are removed per
## word (the first remover that changes the word wins); words listed in
## `dontstem` (spellings of "allah") are never stemmed.
## Returns list(text = stemmed string,
##              stemmedWords = stemmed words named by their original forms).
doStemming <- function(texts, dontstem =  c('\u0627\u0644\u0644\u0647','\u0644\u0644\u0647')){
  words <- strsplit(texts, " ")[[1]]
  ## only do work if there are actually words to stem
  if (length(words) > 0) {
    originals <- words
    ## remover functions, in the order each gets a chance at a word
    prefixRemovers <- list(rAlifLam, rWawAlifLam, rBaAlifLam, rKafAlifLam,
                           rFaAlifLam, rLamLam, rWaw)
    suffixRemovers <- list(rHaAlif, rAlifNun, rAlifTa, rWawNun, rYahNun,
                           rYahHeh, rYahTamarbutta, rHeh, rTamarbutta, rYah)
    ## apply the first remover that changes the word; leave dontstem words alone
    applyFirstMatch <- function(w, removers) {
      if (w %in% dontstem) return(w)
      chars <- strsplit(w, "")[[1]]
      for (f in removers) {
        stemmed <- f(chars)
        if (stemmed != w) return(stemmed)
      }
      w
    }
    for (k in seq_along(words)) words[k] <- applyFirstMatch(words[k], prefixRemovers)
    for (k in seq_along(words)) words[k] <- applyFirstMatch(words[k], suffixRemovers)
    ## name each stemmed word by the original it came from
    names(words) <- originals
  }
  list(text = paste(words, collapse = " "), stemmedWords = words)
}


############################################################
## another stemmer that is even faster but doesn't allow
## different word lengths for different prefixes and suffixes

## Stem each word of a space-separated string with one anchored regex
## substitution for prefixes and one for suffixes (so at most one prefix and
## one suffix are removed per word). Words in `dontstem` (spellings of
## "allah") are never stemmed.
## Returns list(text = stemmed string,
##              stemmedWords = stemmed words named by the original words).
doStemming2 <- function(texts, dontstem =  c('\u0627\u0644\u0644\u0647','\u0644\u0644\u0647')){
  # Split up the words...
  textsSplit <- strsplit(texts," ")[[1]]
  ts0 <- textsSplit
  
  # define prefixes  -- move these down below the other block and uncomment perl=T to do look-aheads
  # NOTE: this first batch of patterns (look-around guards that emulate the
  # per-affix minimum word lengths of doStemming) is intentionally dead code:
  # every variable is immediately reassigned by the guard-free batch below.
  # To activate the guarded behavior, move this batch after the next one and
  # pass perl=TRUE to the gsub() calls (look-arounds require PCRE).
  AlifLam <- '\u0627\u0644(?=.{2,})'
  WawAlifLam <- '\u0648\u0627\u0644(?=..)'
  BaAlifLam <- '\u0628\u0627\u0644(?=..)'
  KafAlifLam <- '\u0643\u0627\u0644(?=..)'
  FaAlifLam <- '\u0641\u0627\u0644(?=..)'
  LamLam <- '\u0644\u0644(?=..)'
  Waw <- '\u0648(?=...)'
  # define suffixes
  HaAlif <- '(?<=..)\u0647\u0627'
  AlifNun <- '(?<=..)\u0627\u0646'
  AlifTa <- '(?<=..)\u0627\u062a'
  WawNun <- '(?<=..)\u0648\u0646'
  YahNun <- '(?<=..)\u064a\u0646'
  YahHeh <- '(?<=..)\u064a\u0647'
  YahTamarbutta <- '(?<=..)\u064a\u0629'
  Heh <- '(?<=..)\u0647'
  Tamarbutta <- '(?<=..)\u0629'
  Yah <- '(?<=..)\u064a'
  
  # define prefixes (active set: no minimum-length guards)
  AlifLam <- '\u0627\u0644'
  WawAlifLam <- '\u0648\u0627\u0644'
  BaAlifLam <- '\u0628\u0627\u0644'
  KafAlifLam <- '\u0643\u0627\u0644'
  FaAlifLam <- '\u0641\u0627\u0644'
  LamLam <- '\u0644\u0644'
  Waw <- '\u0648'
  # define suffixes (active set)
  HaAlif <- '\u0647\u0627'
  AlifNun <- '\u0627\u0646'
  AlifTa <- '\u0627\u062a'
  WawNun <- '\u0648\u0646'
  YahNun <- '\u064a\u0646'
  YahHeh <- '\u064a\u0647'
  YahTamarbutta <- '\u064a\u0629'
  Heh <- '\u0647'
  Tamarbutta <- '\u0629'
  Yah <- '\u064a'
  
  ## BUG FIX: guard against zero-length input, matching doStemming().
  ## Without this, 1:length(textsSplit) iterates over c(1, 0) when the text
  ## is empty: textsSplit[1] is NA, and textsSplit[0] makes the %in% test
  ## return logical(0), which errors inside if().
  if(length(textsSplit) > 0){
    ## prefixes: the alternation is anchored at ^, so at most one removal
    for(i in 1:length(textsSplit)){
      word0 <- textsSplit[i]
      if(!(word0 %in% dontstem)){
        textsSplit[i] <- gsub(paste0("^",paste(AlifLam,WawAlifLam,BaAlifLam,KafAlifLam,FaAlifLam,LamLam,Waw,sep="|^")), '', word0)#, perl=T)
      }
    }
    # suffixes: anchored at $, so again at most one removal per word
    for(i in 1:length(textsSplit)){
      word0 <- textsSplit[i]
      if(!(word0 %in% dontstem)){
        textsSplit[i] <- gsub(paste0(paste(HaAlif,AlifNun,AlifTa,WawNun,YahNun,YahHeh,YahTamarbutta,Heh,Tamarbutta,Yah,sep="$|"),"$"), '', word0)#, perl=T)
      }
    }
    ## name each stemmed word by the original it came from
    names(textsSplit) <- ts0
  }
  ## return the texts pasted back together
  texts <- paste(textsSplit,collapse=" ")
  return(list(text=texts, stemmedWords=textsSplit))
}


############################################################
## faster, less flexible stemming

## The next two functions, "removePrefixes" and "removeSuffixes" are a bit faster
## on benchmarks but not modular and don't return a list matching words to stemmed words.

## The arguments specify, for each suffix, the length that the word must
## be in order to have a suffix removed.
## Note that I only allow one suffix to be taken off each word.
## I also have a list of words not to stem (variants of allah).

removePrefixes <- function(texts, x1=4, x2=4, x3=5, x4=5, x5=5,
                           x6=5, x7=4, dontstem =  c('\u0627\u0644\u0644\u0647','\u0644\u0644\u0647') ){
  ## Remove at most one prefix from each space-separated word of `texts`
  ## and return the result as a single string.
  ##
  ## The x* arguments give the minimum word length (in characters) a word
  ## must have before the corresponding prefix is stripped:
  ##   x2 alif-lam, x3 waw-alif-lam, x4 ba-alif-lam, x5 kaf-alif-lam,
  ##   x6 fa-alif-lam, x7 lam-lam, x1 bare waw (tried last).
  ## Words in `dontstem` (spellings of "allah") are never stemmed.
  ##
  ## BUG FIX: the second default `dontstem` entry was 'u0644\u0644\u0647'
  ## (missing the leading backslash on the first escape), so the intended
  ## word "llh" was never protected from stemming.
  
  ## same regex as the file-level trim(); inlined so the function is
  ## self-contained
  trimWS <- function(x) gsub("^\\s+|\\s+$", "", x)
  
  ## ordered prefix table: unicode prefix string -> minimum word length;
  ## the first entry that matches a word wins
  prefixes <- list(
    list(p = '\u0627\u0644',       minlen = x2),  ## alif-lam
    list(p = '\u0648\u0627\u0644', minlen = x3),  ## waw-alif-lam
    list(p = '\u0628\u0627\u0644', minlen = x4),  ## ba-alif-lam
    list(p = '\u0643\u0627\u0644', minlen = x5),  ## kaf-alif-lam
    list(p = '\u0641\u0627\u0644', minlen = x6),  ## fa-alif-lam
    list(p = '\u0644\u0644',       minlen = x7),  ## lam-lam
    list(p = '\u0648',             minlen = x1)   ## waw
  )
  
  # Split up the words...
  textsSplit <- strsplit(texts, " ")[[1]]
  ## if there are actually words to stem
  if (length(textsSplit) > 0) {
    for (i in seq_along(textsSplit)) {
      word <- textsSplit[i]
      ## skip words on the do-not-stem list
      if (!(word %in% dontstem)) {
        for (pf in prefixes) {
          if (nchar(word) >= pf$minlen && startsWith(word, pf$p)) {
            textsSplit[i] <- trimWS(substring(word, nchar(pf$p) + 1))
            break  ## only one prefix is ever removed per word
          }
        }
      }
    }
  }
  texts <- trimWS(paste(textsSplit, collapse = " "))
  trimWS(gsub(" {2,}", " ", texts))
}


############################################################
## remove suffixes, roughly in the same way as the light10 stemmer

## The arguments specify, for each suffix, the length that the word must
## be in order to have a suffix removed.
## Note that I only allow one suffix to be taken off each word.
## I also have a list of words not to stem (variants of allah).

removeSuffixes <- function(texts, x1=4, x2=4, x3=4, x4=4, x5=4, x6=4, x7=4, x8=3, x9=3, x10=3,
                           dontstem =  c('\u0627\u0644\u0644\u0647','\u0644\u0644\u0647')){
  ## Remove at most one suffix from each space-separated word of `texts`
  ## (roughly following the light10 stemmer) and return a single string.
  ##
  ## The x* arguments give the minimum word length (in characters) a word
  ## must have before the corresponding suffix is stripped:
  ##   x1 ha-alif, x2 alif-nun, x3 alif-ta, x4 waw-nun, x5 yah-nun,
  ##   x6 yah-heh, x7 yah-ta-marbutta, x8 heh, x9 ta-marbutta, x10 yah.
  ## Words in `dontstem` (spellings of "allah") are never stemmed.
  ##
  ## BUG FIXES:
  ##  - the second default `dontstem` entry was 'u0644\u0644\u0647' (missing
  ##    the leading backslash), so "llh" was never protected and lost its heh;
  ##  - the yah rule was gated on x9 instead of x10, leaving x10 unused
  ##    (defaults are both 3, so default behavior is unchanged).
  
  ## same regex as the file-level trim(); inlined so the function is
  ## self-contained
  trimWS <- function(x) gsub("^\\s+|\\s+$", "", x)
  
  ## ordered suffix table: unicode suffix string -> minimum word length;
  ## the first entry that matches a word wins
  suffixes <- list(
    list(s = '\u0647\u0627', minlen = x1),   ## ha-alif
    list(s = '\u0627\u0646', minlen = x2),   ## alif-nun
    list(s = '\u0627\u062a', minlen = x3),   ## alif-ta
    list(s = '\u0648\u0646', minlen = x4),   ## waw-nun
    list(s = '\u064a\u0646', minlen = x5),   ## yah-nun
    list(s = '\u064a\u0647', minlen = x6),   ## yah-heh
    list(s = '\u064a\u0629', minlen = x7),   ## yah-ta marbutta
    list(s = '\u0647',       minlen = x8),   ## heh
    list(s = '\u0629',       minlen = x9),   ## ta marbutta
    list(s = '\u064a',       minlen = x10)   ## yah (now correctly uses x10)
  )
  
  # Split up the words...
  textsSplit <- strsplit(texts, " ")[[1]]
  ## if there are actually words to stem
  if (length(textsSplit) > 0) {
    for (i in seq_along(textsSplit)) {
      word <- textsSplit[i]
      ## skip words on the do-not-stem list
      if (!(word %in% dontstem)) {
        for (sf in suffixes) {
          if (nchar(word) >= sf$minlen && endsWith(word, sf$s)) {
            textsSplit[i] <- trimWS(substr(word, 1, nchar(word) - nchar(sf$s)))
            break  ## only one suffix is ever removed per word
          }
        }
      }
    }
  }
  texts <- trimWS(paste(textsSplit, collapse = " "))
  trimWS(gsub(" {2,}", " ", texts))
}

############################################################

## NOTE: the lines below are footer text scraped from the package's web
## documentation page, not R code; they are commented out so the file parses.
## Try the arabicStemR package in your browser.
## Any scripts or data that you put into this service are public.
## arabicStemR documentation built on July 18, 2022, 9:06 a.m.