Skip to content

Commit 8154c56

Browse files
committed
support 64bit build
1 parent 32a2741 commit 8154c56

File tree

6 files changed

+200
-16
lines changed

6 files changed

+200
-16
lines changed

lda/ldacvb0_cpp/ldacvb0.sln

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,27 @@ EndProject
1111
Global
1212
GlobalSection(SolutionConfigurationPlatforms) = preSolution
1313
Debug|Win32 = Debug|Win32
14+
Debug|x64 = Debug|x64
1415
Release|Win32 = Release|Win32
16+
Release|x64 = Release|x64
1517
EndGlobalSection
1618
GlobalSection(ProjectConfigurationPlatforms) = postSolution
1719
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|Win32.ActiveCfg = Debug|Win32
1820
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|Win32.Build.0 = Debug|Win32
21+
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|x64.ActiveCfg = Debug|x64
22+
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|x64.Build.0 = Debug|x64
1923
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|Win32.ActiveCfg = Release|Win32
2024
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|Win32.Build.0 = Release|Win32
25+
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|x64.ActiveCfg = Release|x64
26+
{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|x64.Build.0 = Release|x64
2127
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|Win32.ActiveCfg = Debug|Win32
2228
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|Win32.Build.0 = Debug|Win32
29+
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|x64.ActiveCfg = Debug|x64
30+
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|x64.Build.0 = Debug|x64
2331
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|Win32.ActiveCfg = Release|Win32
2432
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|Win32.Build.0 = Release|Win32
33+
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|x64.ActiveCfg = Release|x64
34+
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|x64.Build.0 = Release|x64
2535
EndGlobalSection
2636
GlobalSection(SolutionProperties) = preSolution
2737
HideSolutionNode = FALSE

lda/ldacvb0_cpp/ldacvb0/ldacvb0.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ void printnwk(const cybozu::ldacvb0::LDA_CVB0& model, const std::string& word) {
2121

2222
std::cout << "[" << word << "]" << std::endl;
2323
std::cout << "( ";
24-
for (int k=0;k<model.K_;++k) {
24+
for (size_t k=0;k<model.K_;++k) {
2525
std::cout << *(i+k) << " ";
2626
}
2727
std::cout << ")" << std::endl;
@@ -40,6 +40,7 @@ void printHighFreqWords(const cybozu::ldacvb0::Documents<STRING, CHAR> &docs) {
4040
int main(int argc, char* argv[]) {
4141

4242
int K = 20, I = 100, N_WORDS = 20;
43+
size_t ldf = 1, udf = 0; // lower and upper limit of document frequency
4344
double alpha = 0.1;
4445
double beta = 0.01;
4546
bool isCorpusWithPos = false;
@@ -59,6 +60,12 @@ int main(int argc, char* argv[]) {
5960
} else if (st == "-n") {
6061
if (++i>=argc) goto ERROR_OPT_N;
6162
N_WORDS = atoi(argv[i]);
63+
} else if (st == "--ldf") {
64+
if (++i>=argc) goto ERROR_OPT_DF;
65+
ldf = atoi(argv[i]);
66+
} else if (st == "--udf") {
67+
if (++i>=argc) goto ERROR_OPT_DF;
68+
udf = atoi(argv[i]);
6269
} else if (st == "-a") {
6370
if (++i>=argc) goto ERROR_OPT_A;
6471
alpha = atof(argv[i]);
@@ -73,26 +80,32 @@ int main(int argc, char* argv[]) {
7380
}
7481

7582
{
76-
cybozu::ldacvb0::Documents<std::string, char> docs(isCorpusWithPos?cybozu::ldacvb0::REXWORD_WITH_POS:cybozu::ldacvb0::REXWORD);
83+
cybozu::ldacvb0::Documents<std::string, char> orgdocs(isCorpusWithPos?cybozu::ldacvb0::REXWORD_WITH_POS:cybozu::ldacvb0::REXWORD), docs;
7784

7885
for(auto i=files.begin(), iend=files.end();i!=iend;++i) {
7986
try {
8087
cybozu::Mmap map(*i);
8188
const char *p = map.get();
8289
const char *end = p + map.size();
83-
docs.add(p, end);
90+
orgdocs.add(p, end);
8491
} catch (std::exception& e) {
8592
printf("%s\n", e.what());
8693
}
8794
}
8895

96+
size_t M = orgdocs.size();
97+
size_t orgV = orgdocs.vocabularies.size();
98+
if (orgV <= 0) goto ERROR_NO_VOCA;
99+
100+
if (udf == 0) udf = M / 2;
101+
truncDocFreq(docs, orgdocs, ldf, udf);
102+
89103
size_t V = docs.vocabularies.size();
90-
size_t M = docs.size();
91104
if (V <= 0) goto ERROR_NO_VOCA;
92105

93106
std::cout << "M = " << M;
94107
std::cout << ", N = " << docs.N;
95-
std::cout << ", V = " << V << std::endl;
108+
std::cout << ", V = " << V << " / " << orgV << std::endl;
96109
std::cout << "K = " << K << ", alpha = " << alpha << ", beta = " << beta << std::endl;
97110

98111
cybozu::ldacvb0::LDA_CVB0 model(K, V, alpha, beta, docs);
@@ -162,6 +175,9 @@ int main(int argc, char* argv[]) {
162175
ERROR_OPT_N:
163176
p = "[ERROR] -n option needs positive integer";
164177
goto ERROR_EXIT;
178+
ERROR_OPT_DF:
179+
p = "[ERROR] --ldf/udf option needs integer";
180+
goto ERROR_EXIT;
165181
ERROR_OPT_A:
166182
p = "[ERROR] -a option needs positive real number";
167183
goto ERROR_EXIT;

lda/ldacvb0_cpp/ldacvb0/ldacvb0.hpp

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ class Documents : public std::vector<Document> {
228228
public:
229229
Vocabularies<STRING> vocabularies;
230230
int N;
231-
std::unordered_map<size_t, int> docfreq;
231+
std::unordered_map<size_t, size_t> docfreq;
232232

233233
private:
234234
const std::regex &rexword;
@@ -280,11 +280,43 @@ class Documents : public std::vector<Document> {
280280

281281

282282

283+
template <class STRING, class CHAR>
284+
void truncDocFreq(Documents<STRING, CHAR> &docs, const Documents<STRING, CHAR> &orgdocs, size_t ldf, size_t udf) {
285+
std::unordered_map<size_t, size_t> conv;
286+
for (auto i=orgdocs.docfreq.begin(), iend=orgdocs.docfreq.end();i!=iend;++i) {
287+
size_t df = i->second;
288+
if (df <= ldf || df >= udf) continue;
289+
290+
size_t oldid = i->first;
291+
size_t newid = docs.vocabularies.vocalist.size();
292+
const std::string &w = orgdocs.vocabularies.vocalist[oldid];
293+
int c = orgdocs.vocabularies.voca.at(w).count;
294+
295+
conv[oldid] = newid;
296+
docs.vocabularies.vocalist.push_back(w);
297+
docs.vocabularies.voca[w] = IdCount(newid, c);
298+
docs.N += c;
299+
docs.docfreq[newid] = df;
300+
}
301+
302+
for (auto j=orgdocs.begin(), jend=orgdocs.end();j!=jend;++j) {
303+
docs.push_back(Document());
304+
Document &doc = docs.back();
305+
for (auto i=j->begin(), iend=j->end();i!=iend;++i) {
306+
auto x = conv.find(i->id);
307+
if (x!=conv.end()) {
308+
doc.push_back(Term(conv.at(i->id), i->freq));
309+
}
310+
}
311+
}
312+
}
313+
314+
283315

284316
/*
285317
286318
*/
287-
void parameter_init(Vec& n_wk, Vec& n_jk, Vec& n_k, const Documents<std::string, char>& docs, const int K) {
319+
void parameter_init(Vec& n_wk, Vec& n_jk, Vec& n_k, const Documents<std::string, char>& docs, const size_t K) {
288320
const size_t M = docs.size();
289321
const size_t V = docs.vocabularies.size();
290322
n_wk.resize(V*K);
@@ -297,7 +329,7 @@ inline void update_for_word(
297329
Vec& gamma_k,
298330
Vec::iterator i_wk_buf, Vec::iterator i_jk_buf, Vec::iterator i_k_buf,
299331
Vec::const_iterator i_wk, Vec::const_iterator i_jk, Vec::const_iterator i_k,
300-
const size_t w, const int freq, const int K
332+
const size_t w, const int freq, const size_t K
301333
) {
302334
i_wk += w * K;
303335
i_wk_buf += w * K;
@@ -328,14 +360,14 @@ class LDA_CVB0 {
328360
private:
329361
mutable Vec phi; // for worddist at perplexity calculation
330362
public:
331-
int K_, V_;
363+
size_t K_, V_;
332364
double alpha_;
333365
double beta_;
334366
Vec n_wk1, n_wk2, n_jk1, n_jk2, n_k1, n_k2;
335367
Vec *n_wk, *n_wk_buf, *n_jk, *n_jk_buf, *n_k, *n_k_buf;
336368
Mat gamma_jik;
337369
const Documents<std::string, char>& docs_;
338-
LDA_CVB0(int K, int V, double alpha, double beta, const Documents<std::string, char>& docs) :
370+
LDA_CVB0(size_t K, size_t V, double alpha, double beta, const Documents<std::string, char>& docs) :
339371
K_(K), V_(V), alpha_(alpha), beta_(beta), docs_(docs),
340372
n_wk(&n_wk1), n_wk_buf(&n_wk2), n_jk(&n_jk1), n_jk_buf(&n_jk2), n_k(&n_k1), n_k_buf(&n_k2) {
341373
parameter_init(n_wk1, n_jk1, n_k1, docs, K);
@@ -356,7 +388,7 @@ class LDA_CVB0 {
356388
int freq = i->freq;
357389

358390
double sum = 0;
359-
for (auto ai = aph.begin(), i_wk = n_wk->begin() + w * K, i_jk = j_jk, i_k = n_k->begin();
391+
for (Vec::iterator ai = aph.begin(), i_wk = n_wk->begin() + w * K, i_jk = j_jk, i_k = n_k->begin();
360392
ai != aend; ++ai, ++i_wk, ++i_jk, ++i_k) {
361393
sum += *ai = *i_wk * *i_jk / *i_k;
362394
}
@@ -369,7 +401,7 @@ class LDA_CVB0 {
369401
Vec& gamma = gamma_jik.back();
370402
dd.draw(gamma, aph);
371403

372-
for (auto gi = gamma.begin(), gend = gamma.end(),
404+
for (Vec::iterator gi = gamma.begin(), gend = gamma.end(),
373405
i_wk = n_wk->begin() + w * K, i_jk = j_jk, i_k = n_k->begin();
374406
gi != gend; ++gi, ++i_wk, ++i_jk, ++i_k) {
375407
double g = *gi * freq;
@@ -434,10 +466,10 @@ class LDA_CVB0 {
434466
auto i = n_jk->begin(), iend = n_jk->end();
435467
while(i!=iend) {
436468
double sum = 0;
437-
for (int k=0;k<K_;++k) {
469+
for (size_t k=0;k<K_;++k) {
438470
sum += *(i+k);
439471
}
440-
for (int k=0;k<K_;++k) {
472+
for (size_t k=0;k<K_;++k) {
441473
dist.push_back(*i++/sum);
442474
}
443475
}
@@ -457,7 +489,7 @@ class LDA_CVB0 {
457489
auto vend = vec.end();
458490
for(;j!=jend;++j) {
459491
double sum = 0;
460-
for(int k=0;k<K_;++k) sum += *(i_jk+k);
492+
for(size_t k=0;k<K_;++k) sum += *(i_jk+k);
461493
for(auto v = vec.begin(); v!=vend; ++v) {
462494
*v = *i_jk++ / sum;
463495
}

lda/ldacvb0_cpp/ldacvb0/ldacvb0.vcxproj

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,18 @@
55
<Configuration>Debug</Configuration>
66
<Platform>Win32</Platform>
77
</ProjectConfiguration>
8+
<ProjectConfiguration Include="Debug|x64">
9+
<Configuration>Debug</Configuration>
10+
<Platform>x64</Platform>
11+
</ProjectConfiguration>
812
<ProjectConfiguration Include="Release|Win32">
913
<Configuration>Release</Configuration>
1014
<Platform>Win32</Platform>
1115
</ProjectConfiguration>
16+
<ProjectConfiguration Include="Release|x64">
17+
<Configuration>Release</Configuration>
18+
<Platform>x64</Platform>
19+
</ProjectConfiguration>
1220
</ItemGroup>
1321
<PropertyGroup Label="Globals">
1422
<ProjectGuid>{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}</ProjectGuid>
@@ -22,28 +30,51 @@
2230
<UseDebugLibraries>true</UseDebugLibraries>
2331
<CharacterSet>MultiByte</CharacterSet>
2432
</PropertyGroup>
33+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
34+
<ConfigurationType>Application</ConfigurationType>
35+
<UseDebugLibraries>true</UseDebugLibraries>
36+
<CharacterSet>MultiByte</CharacterSet>
37+
</PropertyGroup>
2538
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
2639
<ConfigurationType>Application</ConfigurationType>
2740
<UseDebugLibraries>false</UseDebugLibraries>
2841
<WholeProgramOptimization>true</WholeProgramOptimization>
2942
<CharacterSet>MultiByte</CharacterSet>
3043
</PropertyGroup>
44+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
45+
<ConfigurationType>Application</ConfigurationType>
46+
<UseDebugLibraries>false</UseDebugLibraries>
47+
<WholeProgramOptimization>true</WholeProgramOptimization>
48+
<CharacterSet>MultiByte</CharacterSet>
49+
</PropertyGroup>
3150
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
3251
<ImportGroup Label="ExtensionSettings">
3352
</ImportGroup>
3453
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
3554
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
3655
</ImportGroup>
56+
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
57+
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
58+
</ImportGroup>
3759
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
3860
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
3961
</ImportGroup>
62+
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
63+
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
64+
</ImportGroup>
4065
<PropertyGroup Label="UserMacros" />
4166
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
4267
<LinkIncremental>true</LinkIncremental>
4368
</PropertyGroup>
69+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
70+
<LinkIncremental>true</LinkIncremental>
71+
</PropertyGroup>
4472
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
4573
<LinkIncremental>false</LinkIncremental>
4674
</PropertyGroup>
75+
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
76+
<LinkIncremental>false</LinkIncremental>
77+
</PropertyGroup>
4778
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
4879
<ClCompile>
4980
<PrecompiledHeader>
@@ -58,6 +89,20 @@
5889
<GenerateDebugInformation>true</GenerateDebugInformation>
5990
</Link>
6091
</ItemDefinitionGroup>
92+
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
93+
<ClCompile>
94+
<PrecompiledHeader>
95+
</PrecompiledHeader>
96+
<WarningLevel>Level3</WarningLevel>
97+
<Optimization>Disabled</Optimization>
98+
<PreprocessorDefinitions>NOMINMAX;_CRT_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
99+
<AdditionalIncludeDirectories>..\cybozulib\include</AdditionalIncludeDirectories>
100+
</ClCompile>
101+
<Link>
102+
<SubSystem>Console</SubSystem>
103+
<GenerateDebugInformation>true</GenerateDebugInformation>
104+
</Link>
105+
</ItemDefinitionGroup>
61106
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
62107
<ClCompile>
63108
<WarningLevel>Level3</WarningLevel>
@@ -76,6 +121,24 @@
76121
<OptimizeReferences>true</OptimizeReferences>
77122
</Link>
78123
</ItemDefinitionGroup>
124+
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
125+
<ClCompile>
126+
<WarningLevel>Level3</WarningLevel>
127+
<PrecompiledHeader>
128+
</PrecompiledHeader>
129+
<Optimization>MaxSpeed</Optimization>
130+
<FunctionLevelLinking>true</FunctionLevelLinking>
131+
<IntrinsicFunctions>true</IntrinsicFunctions>
132+
<PreprocessorDefinitions>NOMINMAX;_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
133+
<AdditionalIncludeDirectories>..\cybozulib\include</AdditionalIncludeDirectories>
134+
</ClCompile>
135+
<Link>
136+
<SubSystem>Console</SubSystem>
137+
<GenerateDebugInformation>true</GenerateDebugInformation>
138+
<EnableCOMDATFolding>true</EnableCOMDATFolding>
139+
<OptimizeReferences>true</OptimizeReferences>
140+
</Link>
141+
</ItemDefinitionGroup>
79142
<ItemGroup>
80143
<ClCompile Include="ldacvb0.cpp" />
81144
</ItemGroup>

0 commit comments

Comments
 (0)