Skip to content

Commit c6ceab3

Browse files
committed
modify test case
1 parent b03c80b commit c6ceab3

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

lda/ldacvb0_cpp/ldacvb0/ldacvb0.hpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,14 +154,18 @@ identify and count frequency of vocabulary
154154
*/
155155
template <class WORD>
156156
class Vocabularies {
157+
bool uses_stopwords;
157158
public:
158159
std::vector<WORD> vocalist;
159160
std::unordered_map<WORD, IdCount> voca;
160161

162+
Vocabularies() : uses_stopwords(true) {}
163+
Vocabularies(bool excludes_stopwords) : uses_stopwords(!excludes_stopwords) {}
164+
161165
size_t add(const WORD &word) {
162166
WORD key(word);
163167
normalize(key);
164-
if (STOPWORDS.find(key)!=STOPWORDS.end()) return SIZE_MAX;
168+
if (uses_stopwords && STOPWORDS.find(key)!=STOPWORDS.end()) return SIZE_MAX;
165169
auto x = voca.find(key);
166170
if (x != voca.end()) {
167171
x->second.count += 1;
@@ -258,10 +262,11 @@ class Documents : public std::vector<Document> {
258262

259263
public:
260264
Documents() : N(0), rexword(REXWORD) {
261-
// TODO : stop words
262265
}
263266
Documents(const std::regex &r) : N(0), rexword(r) {
264267
}
268+
Documents(const std::regex &r, bool excludes_stopwords) : N(0), rexword(r), vocabularies(excludes_stopwords) {
269+
}
265270

266271
bool add(const CHAR* p, const CHAR* end) {
267272
std::regex_iterator<const CHAR*> i( p, end, rexword );

lda/ldacvb0_cpp/ldacvb0_test/test.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ CYBOZU_TEST_AUTO(test_vocabularies_for_std_string)
1818
" The/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, "
1919
"which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/cc thanks/nns of/in the/at "
2020
"City/nn-tl of/in-tl Atlanta/np-tl ''/'' for/in the/at manner/nn in/in which/wdt the/at election/nn was/bedz conducted/vbn ./.";
21-
cybozu::ldacvb0::Documents<std::string, char> d(cybozu::ldacvb0::REXWORD_WITH_POS);
21+
cybozu::ldacvb0::Documents<std::string, char> d(cybozu::ldacvb0::REXWORD_WITH_POS, true);
2222
d.add(st);
2323
cybozu::ldacvb0::Vocabularies<std::string>& v = d.vocabularies;
24-
CYBOZU_TEST_EQUAL(v.size(), 42);
24+
CYBOZU_TEST_EQUAL(v.size(), 37);
2525
CYBOZU_TEST_EQUAL(v.count("the"), 8);
2626
CYBOZU_TEST_EQUAL(v.count("tHE"), 8);
27-
CYBOZU_TEST_EQUAL(v.count("in"), 2);
27+
CYBOZU_TEST_EQUAL(v.count("in"), 0);
2828
CYBOZU_TEST_EQUAL(v.count("grand"), 1);
2929

3030
CYBOZU_TEST_EQUAL(v.count("the/at"), 0);
@@ -146,7 +146,7 @@ void printgamma(const cybozu::ldacvb0::Mat& gamma) {
146146

147147
CYBOZU_TEST_AUTO(test_lda_cvb0_initialization)
148148
{
149-
Documents<std::string, char> docs;
149+
Documents<std::string, char> docs(cybozu::ldacvb0::REXWORD, true);
150150
docs.vocabularies.add("a"); // 0
151151
docs.vocabularies.add("b"); // 1
152152
docs.vocabularies.add("c"); // 2

0 commit comments

Comments
 (0)