Skip to content

Commit 2937985

Browse files
committed
adding user dict interface
1 parent dc96bb3 commit 2937985

File tree

6 files changed

+55
-11
lines changed

6 files changed

+55
-11
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ CppJieba是"结巴"中文分词的C++版本
1313
+ 内置分词服务,在linux环境下可安装使用。mac因为没有自带`epoll`,无法使用该分词服务,使用示例请看[libcppjieba]。
1414
+ [libcppjieba] 最简单易懂的CppJieba头文件库使用示例。
1515
+ 项目自带较为完善的单元测试,核心功能中文分词的稳定性接受过线上环境检验。
16+
+ 支持加载自定义用户词典。
1617

1718
## Usage & Example
1819

@@ -181,6 +182,12 @@ Full方法切出所有字典里的词语。
181182

182183
Query方法先使用Mix方法切词,对于切出来的较长的词再使用Full方法。
183184

185+
### 自定义用户词典
186+
187+
```
188+
```
189+
190+
184191
### 关键词抽取
185192

186193
```

src/DictTrie.hpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,10 +60,10 @@ namespace CppJieba
6060
_minWeight = MAX_DOUBLE;
6161
_setInitFlag(false);
6262
}
63-
DictTrie(const string& filePath)
63+
DictTrie(const string& dictPath, const string& userDictPath = "")
6464
{
6565
new (this) DictTrie();
66-
_setInitFlag(init(filePath));
66+
_setInitFlag(init(dictPath, userDictPath));
6767
}
6868
~DictTrie()
6969
{
@@ -80,9 +80,12 @@ namespace CppJieba
8080
_loadDict(dictPath, _nodeInfos);
8181
_calculateWeight(_nodeInfos);
8282
_minWeight = _findMinWeight(_nodeInfos);
83+
8384
if(userDictPath.size())
8485
{
85-
_loadUserDict(dictPath, _minWeight, UNKNOWN_TAG, _nodeInfos);
86+
double maxWeight = _findMaxWeight(_nodeInfos);
87+
_loadUserDict(userDictPath, maxWeight, UNKNOWN_TAG, _nodeInfos);
88+
LogDebug("load userdict[%s] ok.", userDictPath.c_str());
8689
}
8790
_shrink(_nodeInfos);
8891
_trie = _creatTrie(_nodeInfos);
@@ -167,6 +170,15 @@ namespace CppJieba
167170
}
168171
return ret;
169172
}
173+
double _findMaxWeight(const vector<DictUnit>& nodeInfos) const
174+
{
175+
double ret = MIN_DOUBLE;
176+
for(size_t i = 0; i < nodeInfos.size(); i++)
177+
{
178+
ret = max(nodeInfos[i].weight, ret);
179+
}
180+
return ret;
181+
}
170182

171183
void _calculateWeight(vector<DictUnit>& nodeInfos) const
172184
{

src/MPSegment.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,20 +35,20 @@ namespace CppJieba
3535

3636
public:
3737
MPSegment(){_setInitFlag(false);};
38-
explicit MPSegment(const string& dictPath)
38+
explicit MPSegment(const string& dictPath, const string& userDictPath = "")
3939
{
40-
_setInitFlag(init(dictPath));
40+
_setInitFlag(init(dictPath, userDictPath));
4141
};
4242
virtual ~MPSegment(){};
4343
public:
44-
bool init(const string& dictPath)
44+
bool init(const string& dictPath, const string& userDictPath = "")
4545
{
4646
if(_getInitFlag())
4747
{
4848
LogError("already inited before now.");
4949
return false;
5050
}
51-
_dictTrie.init(dictPath);
51+
_dictTrie.init(dictPath, userDictPath);
5252
assert(_dictTrie);
5353
LogInfo("MPSegment init(%s) ok", dictPath.c_str());
5454
return _setInitFlag(true);

src/MixSegment.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,17 @@ namespace CppJieba
1515
HMMSegment _hmmSeg;
1616
public:
1717
MixSegment(){_setInitFlag(false);};
18-
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict)
18+
explicit MixSegment(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
1919
{
20-
_setInitFlag(init(mpSegDict, hmmSegDict));
20+
_setInitFlag(init(mpSegDict, hmmSegDict, userDict));
2121
assert(_getInitFlag());
2222
}
2323
virtual ~MixSegment(){}
2424
public:
25-
bool init(const string& mpSegDict, const string& hmmSegDict)
25+
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
2626
{
2727
assert(!_getInitFlag());
28-
if(!_mpSeg.init(mpSegDict))
28+
if(!_mpSeg.init(mpSegDict, userDict))
2929
{
3030
LogError("_mpSeg init");
3131
return false;

test/unittest/TSegments.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,21 @@ TEST(MixSegmentTest, Test1)
5858
ASSERT_EQ(words, vector<string>(res2, res2 + sizeof(res2)/sizeof(res2[0])));
5959
}
6060

61+
TEST(MixSegmentTest, UserDict)
62+
{
63+
MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../test/testdata/userdict.utf8");
64+
//MixSegment segment("../dict/extra_dict/jieba.dict.small.utf8", "../dict/hmm_model.utf8", "../dict/extra_dict/jieba.dict.small.utf8");
65+
ASSERT_TRUE(segment);
66+
const char* str = "令狐冲是云计算方面的专家";
67+
vector<string> words;
68+
ASSERT_TRUE(segment.cut(str, words));
69+
print(words);
70+
exit(0);
71+
72+
//* 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
73+
// 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
74+
}
75+
6176
TEST(MPSegmentTest, Test1)
6277
{
6378
MPSegment segment("../dict/extra_dict/jieba.dict.small.utf8");;

test/unittest/TTrie.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,13 @@ TEST(DictTrieTest, Test1)
5454
// print(vec);
5555
}
5656

57+
TEST(DictTrieTest, UserDict)
58+
{
59+
DictTrie trie(DICT_FILE);
60+
ASSERT_TRUE(trie);
61+
string word = "云计算";
62+
Unicode unicode;
63+
ASSERT_TRUE(TransCode::decode(word, unicode));
64+
print((*trie.find(unicode.begin(), unicode.end())));
65+
exit(0);
66+
}

0 commit comments

Comments
 (0)