Skip to content
This repository was archived by the owner on Jan 10, 2025. It is now read-only.

Commit b227fad

Browse files
author
Ilia Lazarev
committed
Fix Bag of words reference #19
1 parent 1d43a36 commit b227fad

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

bibliography.bib

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,4 +1086,17 @@ @Article{pyrkov2019
10861086
publisher = {{MDPI} {AG}},
10871087
}
10881088
1089+
@InProceedings{weikang2016,
1090+
author = {Rui, Weikang and Xing, Kai and Jia, Yawei},
1091+
editor = {Lehner, Franz and Fteimi, Nora},
1092+
title = {{BOWL}: Bag of Word Clusters Text Representation Using Word Embeddings},
1093+
booktitle = {Knowledge Science, Engineering and Management},
1094+
year = {2016},
1095+
publisher = {Springer International Publishing},
1096+
address = {Cham},
1097+
pages = {3--14},
1098+
abstract = {The text representation is fundamental for text mining and information retrieval. The Bag Of Words (BOW) and its variants (e.g. TF-IDF) are very basic text representation methods. Although the BOW and TF-IDF are simple and perform well in tasks like classification and clustering, its representation efficiency is extremely low. Besides, word level semantic similarity is not captured which results failing to capture text level similarity in many situations. In this paper, we propose a straightforward Bag Of Word cLusters (BOWL) representation for texts in a higher level, much lower dimensional space. We exploit the word embeddings to group semantically close words and consider them as a whole. The word embeddings are trained on a large corpus and incorporate extensive knowledge. We demonstrate on three benchmark datasets and two tasks, that BOWL representation shows significant advantages in terms of representation accuracy and efficiency.},
1099+
isbn = {978-3-319-47650-6}
1100+
}
1101+
10891102
@Comment{jabref-meta: databaseType:bibtex;}

manuscript.tex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ \section{Experimental demonstration of QASOFM}
465465
``Quantum Machine Learning'' (QML),
466466
``Cancer'' (MED)
467467
and ``Gene Expression'' (BIO).
468-
Abstracts were vectorized by the bag-of-words model in order to choose most defining words in each data set (see Fig.~\ref{fig:vectorized_sample}) \cite{mctear2016}.
468+
Abstracts were vectorized by the bag-of-words model~\cite{weikang2016} in order to choose most defining words in each data set (see Fig.~\ref{fig:vectorized_sample}) \cite{mctear2016}.
469469
This model represents text as a multiset (``bag'') of its words, taking into account only the multiplicity of words.
470470
When preparing the bag-of-words, we excluded the words that appear in only one abstract or in more than 4 abstracts; we also excluded the word ``level'' from consideration because its frequent overlap between the clusters causes instabilities for both classical and quantum algorithms.
471471
We restricted our bag-of-words size to the 9 most frequent words from the full bags-of-words due to the limited number of available qubits.

0 commit comments

Comments
 (0)