7 changes: 4 additions & 3 deletions DESCRIPTION
@@ -6,7 +6,8 @@ Date: 2019-03-25
Authors@R: c(person("Stefan", "Feuerriegel", email="sentiment@sfeuerriegel.com",
                    role=c("aut", "cre")),
             person("Nicolas", "Proellochs", email="nicolas.proellochs@is.uni-freiburg.de",
                    role=c("aut")))
                    role=c("aut")),
             person("Chung-hong", "Chan", email = "chainsawtiney@gmail.com", role = c("ctb"), comment = c(ORCID = "0000-0002-6232-7530")))
Description: Performs a sentiment analysis of textual contents in R. This implementation
    utilizes various existing dictionaries, such as Harvard IV, or finance-specific
    dictionaries. Furthermore, it can also create customized dictionaries. The latter
@@ -20,7 +21,7 @@ Depends:
Imports:
    tm (>= 0.6),
    qdapDictionaries,
    ngramrr (>= 0.1),
    tau,
    moments,
    stringdist,
    glmnet,
@@ -34,5 +35,5 @@ Suggests:
    XML,
    mgcv
LazyData: true
RoxygenNote: 6.1.1
RoxygenNote: 7.1.1
VignetteBuilder: knitr
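
The Imports change swaps the ngramrr dependency for tau, whose textcnt() now backs the tokenizers ported into R/ngrams.R below. A minimal sketch of that call, assuming tau is installed (the return value is a named vector of n-gram counts):

library(tau)

# word bigrams, mirroring the taungram() wrapper added below
textcnt("hello hello how low", method = "string", n = 2,
        tolower = FALSE, split = "[[:space:]]+")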
68 changes: 66 additions & 2 deletions R/ngrams.R
@@ -1,3 +1,67 @@
### ported from ngramrr

taungram <- function(text, n = 1, tolower = FALSE, split = "[[:space:]]+", ...) {
  r <- tau::textcnt(text, method = 'string', n = n, tolower = tolower, split = split, ...)
  return(Reduce(c, sapply(seq_along(r), function(x) rep(names(r[x]), r[x]))))
}

tauchar <- function(text, n = 1, tolower = FALSE, split = "[[:space:]]+", rmEOL = FALSE, ngmin = 1, ...) {
  r <- tau::textcnt(text, method = 'ngram', n = n, tolower = tolower, split = split, ...)
  g <- unlist(sapply(seq_along(r), function(x) rep(names(r[x]), r[x])))
  if (rmEOL) {
    # drop n-grams containing the "_" boundary marker added by tau::textcnt
    g <- g[grep("_", g, invert = TRUE)]
  }
  if (ngmin > 1 && ngmin <= n) {
    g <- Filter(function(x) nchar(x) >= ngmin, g)
  }
  return(g)
}

# General purpose n-gram tokenizer
#
# A non-Java-based n-gram tokenizer for use with the tm package. Supports both character and word n-grams.
#
# @param x input string.
# @param char logical, use character n-grams; char = FALSE denotes word n-grams.
# @param ngmin integer, minimum order of n-gram
# @param ngmax integer, maximum order of n-gram
# @param rmEOL logical, remove n-grams with the EOL marker
# @return vector of n-grams
# @examples
# require(tm)
#
# nirvana <- c("hello hello hello how low", "hello hello hello how low",
# "hello hello hello how low", "hello hello hello",
# "with the lights out", "it's less dangerous", "here we are now", "entertain us",
# "i feel stupid", "and contagious", "here we are now", "entertain us",
# "a mulatto", "an albino", "a mosquito", "my libido", "yeah", "hey yay")
#
# ngramrr(nirvana[1], ngmax = 3)
# ngramrr(nirvana[1], ngmax = 3, char = TRUE)
# nirvanacor <- Corpus(VectorSource(nirvana))
# TermDocumentMatrix(nirvanacor, control = list(tokenize = function(x) ngramrr(x, ngmax = 3)))
#
# # Character n-gram
#
# TermDocumentMatrix(nirvanacor, control = list(tokenize =
#   function(x) ngramrr(x, char = TRUE, ngmax = 3), wordLengths = c(1, Inf)))
ngramrr <- function(x, char = FALSE, ngmin = 1, ngmax = 2, rmEOL = TRUE) {
  if (ngmin > ngmax) {
    stop("ngmax must be higher than or equal to ngmin")
  }
  y <- paste(x, collapse = " ") # TermDocumentMatrix can pass a character vector; collapse it first
  if (char) {
    return(tauchar(y, n = ngmax, rmEOL = rmEOL, ngmin = ngmin))
  }
  # cap the maximum order at the number of words in the input
  sentencelength <- length(unlist(strsplit(y, split = " ")))
  return(Reduce(c, Map(function(n) taungram(y, n), seq(from = ngmin, to = min(ngmax, sentencelength)))))
}


rep_gram <- function(text, n) {
  r <- stringdist::qgrams(text, q = n)
  g <- unlist(sapply(seq_along(r), function(x) rep(colnames(r)[x], r[x])))
@@ -47,11 +111,11 @@ ngram_tokenize <- function(x, char=FALSE, ngmin=1, ngmax=3) {
  if (!is.logical(char)) {
    stop("Customized routine only supports char grams")
  }

  y <- paste(x, collapse=" ") # hint from ngramrr package
  if (char) {
    return(rep_grams(y, ngmin = ngmin, ngmax = ngmax))
  } else {
    return(ngramrr::ngramrr(x, char=char, ngmin=ngmin, ngmax=ngmax))
    return(ngramrr(x, char=char, ngmin=ngmin, ngmax=ngmax))
  }
}
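
With the port in place, the tokenizer can still be exercised through tm just as in the roxygen example above. A minimal sketch, assuming tm is installed and the helpers in R/ngrams.R are loaded (the corpus text is illustrative):

library(tm)

# VCorpus is used here because recent tm versions ignore custom
# tokenizers for the default SimpleCorpus
docs <- VCorpus(VectorSource(c("hello hello how low", "with the lights out")))
tdm <- TermDocumentMatrix(docs, control = list(
  tokenize = function(x) ngram_tokenize(x, ngmax = 2),
  wordLengths = c(1, Inf)))
inspect(tdm)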
4 changes: 3 additions & 1 deletion man/DictionaryGI.Rd

Some generated files are not rendered by default.

4 changes: 3 additions & 1 deletion man/DictionaryHE.Rd

4 changes: 3 additions & 1 deletion man/DictionaryLM.Rd

1 change: 0 additions & 1 deletion man/SentimentAnalysis.Rd

8 changes: 6 additions & 2 deletions man/SentimentDictionaryWeighted.Rd

72 changes: 54 additions & 18 deletions man/analyzeSentiment.Rd

54 changes: 42 additions & 12 deletions man/countWords.Rd

8 changes: 6 additions & 2 deletions man/enetEstimation.Rd