7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -6,7 +6,8 @@ Date: 2019-03-25
Authors@R: c(person("Stefan", "Feuerriegel", email="sentiment@sfeuerriegel.com",
                    role=c("aut", "cre")),
             person("Nicolas", "Proellochs", email="nicolas.proellochs@is.uni-freiburg.de",
                    role=c("aut")))
                    role=c("aut")),
             person("Chung-hong", "Chan", email = "chainsawtiney@gmail.com", role = c("ctb"), comment = c(ORCID = "0000-0002-6232-7530")))
Description: Performs a sentiment analysis of textual contents in R. This implementation
    utilizes various existing dictionaries, such as Harvard IV, or finance-specific
    dictionaries. Furthermore, it can also create customized dictionaries. The latter
@@ -20,7 +21,7 @@ Depends:
Imports:
    tm (>= 0.6),
    qdapDictionaries,
    ngramrr (>= 0.1),
    tau,
    moments,
    stringdist,
    glmnet,
@@ -34,5 +35,5 @@ Suggests:
    XML,
    mgcv
LazyData: true
RoxygenNote: 6.1.1
RoxygenNote: 7.1.1
VignetteBuilder: knitr
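
The Imports change swaps the ngramrr dependency for tau, whose textcnt() now backs the tokenizers ported into R/ngrams.R below. A minimal sketch of that call, assuming tau is installed (the return value is a named vector of n-gram counts):

library(tau)

# word bigrams, mirroring the taungram() wrapper added below
textcnt("hello hello how low", method = "string", n = 2,
        tolower = FALSE, split = "[[:space:]]+")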
68 changes: 66 additions & 2 deletions R/ngrams.R
@@ -1,3 +1,67 @@
### ported from ngramrr

taungram <- function(text, n = 1, tolower = FALSE, split = "[[:space:]]+", ...) {
  r <- tau::textcnt(text, method = 'string', n = n, tolower = tolower, split = split, ...)
  return(Reduce(c, sapply(seq_along(r), function(x) rep(names(r[x]), r[x]))))
}

tauchar <- function(text, n = 1, tolower = FALSE, split = "[[:space:]]+", rmEOL = FALSE, ngmin = 1, ...) {
  r <- tau::textcnt(text, method = 'ngram', n = n, tolower = tolower, split = split, ...)
  g <- unlist(sapply(seq_along(r), function(x) rep(names(r[x]), r[x])))
  if (rmEOL) {
    # drop n-grams containing the "_" boundary marker added by tau::textcnt
    g <- g[grep("_", g, invert = TRUE)]
  }
  if (ngmin > 1 && ngmin <= n) {
    g <- Filter(function(x) nchar(x) >= ngmin, g)
  }
  return(g)
}

# General purpose n-gram tokenizer
#
# A non-Java-based n-gram tokenizer for use with the tm package. Supports both character and word n-grams.
#
# @param x input string.
# @param char logical, use character n-grams; char = FALSE denotes word n-grams.
# @param ngmin integer, minimum order of n-gram
# @param ngmax integer, maximum order of n-gram
# @param rmEOL logical, remove n-grams with the EOL marker
# @return vector of n-grams
# @examples
# require(tm)
#
# nirvana <- c("hello hello hello how low", "hello hello hello how low",
# "hello hello hello how low", "hello hello hello",
# "with the lights out", "it's less dangerous", "here we are now", "entertain us",
# "i feel stupid", "and contagious", "here we are now", "entertain us",
# "a mulatto", "an albino", "a mosquito", "my libido", "yeah", "hey yay")
#
# ngramrr(nirvana[1], ngmax = 3)
# ngramrr(nirvana[1], ngmax = 3, char = TRUE)
# nirvanacor <- Corpus(VectorSource(nirvana))
# TermDocumentMatrix(nirvanacor, control = list(tokenize = function(x) ngramrr(x, ngmax = 3)))
#
# # Character n-gram
#
# TermDocumentMatrix(nirvanacor, control = list(tokenize =
#   function(x) ngramrr(x, char = TRUE, ngmax = 3), wordLengths = c(1, Inf)))
ngramrr <- function(x, char = FALSE, ngmin = 1, ngmax = 2, rmEOL = TRUE) {
  if (ngmin > ngmax) {
    stop("ngmax must be higher than or equal to ngmin")
  }
  y <- paste(x, collapse = " ") # TermDocumentMatrix can pass a character vector; collapse it first
  if (char) {
    return(tauchar(y, n = ngmax, rmEOL = rmEOL, ngmin = ngmin))
  }
  # cap the maximum order at the number of words in the input
  sentencelength <- length(unlist(strsplit(y, split = " ")))
  return(Reduce(c, Map(function(n) taungram(y, n), seq(from = ngmin, to = min(ngmax, sentencelength)))))
}


rep_gram <- function(text, n) {
  r <- stringdist::qgrams(text, q = n)
  g <- unlist(sapply(seq_along(r), function(x) rep(colnames(r)[x], r[x])))
@@ -47,11 +111,11 @@ ngram_tokenize <- function(x, char=FALSE, ngmin=1, ngmax=3) {
  if (!is.logical(char)) {
    stop("Customized routine only supports char grams")
  }

  y <- paste(x, collapse=" ") # hint from ngramrr package
  if (char) {
    return(rep_grams(y, ngmin = ngmin, ngmax = ngmax))
  } else {
    return(ngramrr::ngramrr(x, char=char, ngmin=ngmin, ngmax=ngmax))
    return(ngramrr(x, char=char, ngmin=ngmin, ngmax=ngmax))
  }
}
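
With the port in place, the tokenizer can still be exercised through tm just as in the roxygen example above. A minimal sketch, assuming tm is installed and the helpers in R/ngrams.R are loaded (the corpus text is illustrative):

library(tm)

# VCorpus is used here because recent tm versions ignore custom
# tokenizers for the default SimpleCorpus
docs <- VCorpus(VectorSource(c("hello hello how low", "with the lights out")))
tdm <- TermDocumentMatrix(docs, control = list(
  tokenize = function(x) ngram_tokenize(x, ngmax = 2),
  wordLengths = c(1, Inf)))
inspect(tdm)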
4 changes: 3 additions & 1 deletion man/DictionaryGI.Rd

Some generated files are not rendered by default.

4 changes: 3 additions & 1 deletion man/DictionaryHE.Rd

4 changes: 3 additions & 1 deletion man/DictionaryLM.Rd

1 change: 0 additions & 1 deletion man/SentimentAnalysis.Rd

8 changes: 6 additions & 2 deletions man/SentimentDictionaryWeighted.Rd

72 changes: 54 additions & 18 deletions man/analyzeSentiment.Rd

54 changes: 42 additions & 12 deletions man/countWords.Rd

8 changes: 6 additions & 2 deletions man/enetEstimation.Rd