Skip to content

Commit 44e966b

Browse files
author
zhaoyihua
committed
create docsim
1 parent b1fcb75 commit 44e966b

File tree

8 files changed

+128
-0
lines changed

8 files changed

+128
-0
lines changed

docsim/__init__.py

Whitespace-only changes.

docsim/config.cfg

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
[path]
2+
train_dir = xxx
3+
4+
5+
[segment]
6+
# precise mode (not full mode)
cut_all = False
7+
HMM = False
8+
customized_dict = ""
9+
10+
11+
[method]
12+
method = bow
13+

docsim/segmenter.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import os, sys
2+
import jieba
3+
from utils import get_config_store
4+
5+
6+
7+
class Segmenter(object):
    """Tokenize lines, files and folders of text with jieba.

    Settings come from the ``[segment]`` section of the configuration
    returned by ``get_config_store()``:

    * ``cut_all`` -- boolean, forwarded to ``jieba.cut`` by ``cut_files``
      and ``cut_folder``.
    * ``HMM`` -- boolean, stored as ``self.HMM``; the methods of this class
      do not forward it to jieba.
    * ``customized_dict`` -- optional path of a user dictionary.
    """

    # Spellings (compared case-insensitively) that switch an option on.
    _TRUE_VALUES = frozenset(("1", "yes", "true", "on"))

    def __init__(self):
        """Load the segmentation settings and the optional user dictionary."""
        self.segmenter = jieba

        config = get_config_store()
        # ConfigParser.get() returns raw strings; "False" would be truthy if
        # stored unchanged, so convert the flags to real booleans.
        self.cut_all = self._as_bool(config.get("segment", "cut_all"))
        self.HMM = self._as_bool(config.get("segment", "HMM"))
        customized_dict = self._clean_path(
            config.get("segment", "customized_dict"))
        if customized_dict:
            self.load_dict(customized_dict)

    @classmethod
    def _as_bool(cls, raw):
        """Convert a raw option string to bool, ignoring a ``# comment`` tail.

        Any value that is not one of ``_TRUE_VALUES`` is treated as False.
        """
        value = raw.split("#", 1)[0].strip().lower()
        return value in cls._TRUE_VALUES

    @staticmethod
    def _clean_path(raw):
        """Strip whitespace and one pair of enclosing quotes from *raw*.

        ConfigParser keeps quotes verbatim, so ``customized_dict = ""`` would
        otherwise be read as the two-character, truthy string ``""``.
        """
        text = raw.strip()
        if len(text) >= 2 and text[0] == text[-1] and text[0] in ("'", '"'):
            text = text[1:-1].strip()
        return text

    def load_dict(self, filename):
        """Load the user dictionary *filename* into the segmenter.

        If *filename* is not an existing file, a message is written to
        stderr and nothing is loaded.
        """
        if not os.path.isfile(filename):
            sys.stderr.write("cannot find dictionary %s." % filename)
            return
        self.segmenter.load_userdict(filename)

    def cut_line(self, input_line, cut_all=False):
        """Cut a single line and return its tokens as a list.

        Tokens that are empty or consist only of whitespace are dropped.
        """
        seg_list = self.segmenter.cut(input_line, cut_all=cut_all)
        return [x for x in seg_list if x.strip()]

    def cut_one_file(self, filename, cut_all=False):
        """Cut every line of *filename* and return a list of token lists.

        Lines that yield no tokens are skipped. If *filename* is not an
        existing file, a message is written to stderr and an empty list is
        returned.
        """
        if not os.path.isfile(filename):
            sys.stderr.write("%s doesn't exist." % filename)
            return []
        doc = []
        # The context manager closes the file even if cutting raises.
        with open(filename, 'r') as fin:
            for line in fin:
                seg_list = self.cut_line(line, cut_all)
                if seg_list:
                    doc.append(seg_list)
        return doc

    def cut_files(self, file_list):
        """Cut every file in *file_list* and merge the results.

        Returns one flat list of token lists (one entry per non-empty line),
        in the order of *file_list*. Uses the configured ``self.cut_all``.
        """
        docs = []
        for filename in file_list:
            docs.extend(self.cut_one_file(filename, self.cut_all))
        return docs

    def cut_folder(self, path):
        """Cut all regular files directly inside the folder *path*.

        Files are processed in sorted name order; sub-directories are
        skipped. If *path* is not a directory, a message is written to
        stderr and an empty list is returned.
        """
        if not os.path.isdir(path):
            sys.stderr.write("%s doesn't exist." % path)
            return []
        # os.listdir() yields bare names, so join them with the folder path.
        candidates = [os.path.join(path, name)
                      for name in sorted(os.listdir(path))]
        return self.cut_files([p for p in candidates if os.path.isfile(p)])
76+

docsim/segmenter.pyc

2.79 KB
Binary file not shown.

docsim/test.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from segmenter import Segmenter
2+
3+
4+
# Smoke test: segment a sample sentence and print its tokens joined by '|'.
seg = Segmenter()
tokens = seg.cut_line("hello world")
# The parenthesised single-argument form prints the same on Python 2 and 3.
print('|'.join(tokens))

docsim/utils.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
import sys, os
2+
from ConfigParser import ConfigParser
3+
from os.path import join, dirname, abspath
4+
5+
6+
# Module-wide ConfigParser instance; stays None until set_config_file() or
# get_config_store() creates one.
CONFIG = None
7+
8+
def relative(*paths):
    '''
    Join the given path components onto the directory containing this module
    '''
    module_dir = dirname(abspath(__file__))
    return join(module_dir, *paths)
10+
11+
12+
def set_config_file(filename):
    '''
    Replace the module-wide configuration with one read from filename
    '''
    global CONFIG
    # Publish the fresh parser before reading, exactly as callers expect:
    # CONFIG is already replaced even if read() fails part-way.
    CONFIG = parser = ConfigParser()
    parser.read(filename)
19+
20+
21+
def get_config_store():
    '''
    Return the module-wide configuration, reading config.cfg (located next
    to this module) the first time no configuration has been set
    '''
    global CONFIG
    if CONFIG is not None:
        return CONFIG
    CONFIG = ConfigParser()
    CONFIG.read(relative("config.cfg"))
    return CONFIG

docsim/utils.pyc

1.05 KB
Binary file not shown.

tradeOnEbay/ebay/trading.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,7 @@ def addItem( title, description, primaryCategoryId, startPrice='0.99',
6969
pass
7070
site_elem = add_elem(item_elem, "Site", site)
7171

72+
request = ET.tostring(root, 'utf-8')
73+
return get_response(oname, request, encoding)
74+
75+

0 commit comments

Comments
 (0)