Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions python/src/sentencepiece/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# This file was automatically generated by SWIG (https://www.swig.org).
# Version 4.1.0
# Version 4.1.1
#
# Do not make changes to this file unless you know what you are doing - modify
# the SWIG interface file instead.
Expand Down Expand Up @@ -258,6 +258,12 @@ def ResetVocabulary(self):
def LoadVocabulary(self, filename, threshold):
    """Forward to the native `SentencePieceProcessor_LoadVocabulary` binding.

    Args:
      filename: passed through unchanged as the binding's second argument.
      threshold: passed through unchanged as the binding's third argument.

    Returns:
      Whatever the native binding returns.
    """
    native_call = _sentencepiece.SentencePieceProcessor_LoadVocabulary
    return native_call(self, filename, threshold)

def SetAddDummyPrefix(self, add_dummy_whitespace):
    """Forward to the native `SentencePieceProcessor_SetAddDummyPrefix` binding.

    Args:
      add_dummy_whitespace: passed through unchanged as the binding's second
        argument.

    Returns:
      Whatever the native binding returns.
    """
    native_call = _sentencepiece.SentencePieceProcessor_SetAddDummyPrefix
    return native_call(self, add_dummy_whitespace)

def GetAddDummyPrefix(self):
    """Forward to the native `SentencePieceProcessor_GetAddDummyPrefix` binding.

    Returns:
      Whatever the native binding returns for this object.
    """
    native_call = _sentencepiece.SentencePieceProcessor_GetAddDummyPrefix
    return native_call(self)

def CalculateEntropy(self, *args):
    """Forward to the native `SentencePieceProcessor_CalculateEntropy` binding.

    Args:
      *args: positional arguments passed through unchanged after `self`.

    Returns:
      Whatever the native binding returns.
    """
    native_call = _sentencepiece.SentencePieceProcessor_CalculateEntropy
    return native_call(self, *args)

Expand Down Expand Up @@ -551,6 +557,14 @@ def EncodeAsImmutableProto(self, input, **kwargs):
return self.Encode(input=input, out_type='immutable_proto', **kwargs)


def EncodeNoDummyPrefix(self, input, **kwargs):
    """Encode `input` with the add-dummy-prefix flag temporarily set to False.

    The current flag is read with `GetAddDummyPrefix`, switched off with
    `SetAddDummyPrefix(False)`, and put back once `Encode` has finished.

    Args:
      input: forwarded to `self.Encode` as the `input` keyword.
      **kwargs: extra keyword arguments forwarded unchanged to `self.Encode`.

    Returns:
      Whatever `self.Encode` returns.

    Raises:
      Anything `self.Encode` raises; the saved flag is restored before the
      exception propagates.
    """
    # NOTE(review): the flag is flipped on this object itself, so concurrent
    # calls on the same instance could observe each other's setting - confirm
    # before sharing one processor across threads.
    old_add_dummy_prefix = self.GetAddDummyPrefix()
    self.SetAddDummyPrefix(False)
    try:
        return self.Encode(input=input, **kwargs)
    finally:
        # Restore even when Encode raises; otherwise a failed call would leave
        # the flag stuck at False for every later call on this object.
        self.SetAddDummyPrefix(old_add_dummy_prefix)


def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
    """Call `self.Encode` with sampling enabled and `out_type=str`.

    Args:
      input: forwarded to `self.Encode` as the `input` keyword.
      nbest_size: forwarded unchanged (defaults to None).
      alpha: forwarded unchanged (defaults to None).
      **kwargs: extra keyword arguments forwarded unchanged to `self.Encode`.

    Returns:
      Whatever `self.Encode` returns.
    """
    sampled = self.Encode(
        input=input,
        nbest_size=nbest_size,
        alpha=alpha,
        out_type=str,
        enable_sampling=True,
        **kwargs
    )
    return sampled
Expand Down Expand Up @@ -700,7 +714,6 @@ def SampleEncodeAndScore(self,
if include_best and not wor:
raise RuntimeError('When include_best is True, We must specify "wor = True".')


def _encode(text):
if out_type is int:
return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
Expand Down Expand Up @@ -847,6 +860,14 @@ def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs)
return self.Decode(input=input, out_type=out_type, **kwargs)


def DecodeNoDummyPrefix(self, input, **kwargs):
    """Decode `input` with the add-dummy-prefix flag temporarily set to False.

    The current flag is read with `GetAddDummyPrefix`, switched off with
    `SetAddDummyPrefix(False)`, and put back once `Decode` has finished.

    Args:
      input: forwarded to `self.Decode` as the `input` keyword.
      **kwargs: extra keyword arguments forwarded unchanged to `self.Decode`.

    Returns:
      Whatever `self.Decode` returns.

    Raises:
      Anything `self.Decode` raises; the saved flag is restored before the
      exception propagates.
    """
    # NOTE(review): the flag is flipped on this object itself, so concurrent
    # calls on the same instance could observe each other's setting - confirm
    # before sharing one processor across threads.
    old_add_dummy_prefix = self.GetAddDummyPrefix()
    self.SetAddDummyPrefix(False)
    try:
        return self.Decode(input=input, **kwargs)
    finally:
        # Restore even when Decode raises; otherwise a failed call would leave
        # the flag stuck at False for every later call on this object.
        self.SetAddDummyPrefix(old_add_dummy_prefix)


def CalculateEntropy(self, input, alpha, num_threads=None):
"""Calculate sentence entropy"""
if type(input) is list:
Expand Down
23 changes: 19 additions & 4 deletions python/src/sentencepiece/sentencepiece.i
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
for (int n = 0; n < num_threads; ++n) { \
pool.Schedule([&]() { \
size_t i = 0; \
while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
auto out = enable_sampling ? \
self->Sample##FuncName(ins[i], \
nbest_size, alpha) : \
Expand All @@ -275,7 +275,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
for (int n = 0; n < num_threads; ++n) { \
pool.Schedule([&]() { \
size_t i = 0; \
while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
CheckIds(ins[i], self->GetPieceSize()); \
auto out = self->FuncName(ins[i]); \
ConvertToUnicodeSpans(&out); \
Expand Down Expand Up @@ -664,7 +664,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
for (int n = 0; n < num_threads; ++n) {
pool.Schedule([&]() {
size_t i = 0;
while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {
while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) {
outs[i] = self->CalculateEntropy(ins[i], alpha);
}
});
Expand Down Expand Up @@ -832,6 +832,14 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
return self.Encode(input=input, out_type='immutable_proto', **kwargs)


def EncodeNoDummyPrefix(self, input, **kwargs):
    """Encode `input` with the add-dummy-prefix flag temporarily set to False.

    The current flag is read with `GetAddDummyPrefix`, switched off with
    `SetAddDummyPrefix(False)`, and put back once `Encode` has finished.

    Args:
      input: forwarded to `self.Encode` as the `input` keyword.
      **kwargs: extra keyword arguments forwarded unchanged to `self.Encode`.

    Returns:
      Whatever `self.Encode` returns.

    Raises:
      Anything `self.Encode` raises; the saved flag is restored before the
      exception propagates.
    """
    # NOTE(review): the flag is flipped on this object itself, so concurrent
    # calls on the same instance could observe each other's setting - confirm
    # before sharing one processor across threads.
    old_add_dummy_prefix = self.GetAddDummyPrefix()
    self.SetAddDummyPrefix(False)
    try:
        return self.Encode(input=input, **kwargs)
    finally:
        # Restore even when Encode raises; otherwise a failed call would leave
        # the flag stuck at False for every later call on this object.
        self.SetAddDummyPrefix(old_add_dummy_prefix)


def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
    """Call `self.Encode` with sampling enabled and `out_type=str`.

    Args:
      input: forwarded to `self.Encode` as the `input` keyword.
      nbest_size: forwarded unchanged (defaults to None).
      alpha: forwarded unchanged (defaults to None).
      **kwargs: extra keyword arguments forwarded unchanged to `self.Encode`.

    Returns:
      Whatever `self.Encode` returns.
    """
    sampled = self.Encode(
        input=input,
        nbest_size=nbest_size,
        alpha=alpha,
        out_type=str,
        enable_sampling=True,
        **kwargs
    )
    return sampled
Expand Down Expand Up @@ -981,7 +989,6 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
if include_best and not wor:
raise RuntimeError('When include_best is True, We must specify "wor = True".')


def _encode(text):
if out_type is int:
return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
Expand Down Expand Up @@ -1128,6 +1135,14 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
return self.Decode(input=input, out_type=out_type, **kwargs)


def DecodeNoDummyPrefix(self, input, **kwargs):
    """Decode `input` with the add-dummy-prefix flag temporarily set to False.

    The current flag is read with `GetAddDummyPrefix`, switched off with
    `SetAddDummyPrefix(False)`, and put back once `Decode` has finished.

    Args:
      input: forwarded to `self.Decode` as the `input` keyword.
      **kwargs: extra keyword arguments forwarded unchanged to `self.Decode`.

    Returns:
      Whatever `self.Decode` returns.

    Raises:
      Anything `self.Decode` raises; the saved flag is restored before the
      exception propagates.
    """
    # NOTE(review): the flag is flipped on this object itself, so concurrent
    # calls on the same instance could observe each other's setting - confirm
    # before sharing one processor across threads.
    old_add_dummy_prefix = self.GetAddDummyPrefix()
    self.SetAddDummyPrefix(False)
    try:
        return self.Decode(input=input, **kwargs)
    finally:
        # Restore even when Decode raises; otherwise a failed call would leave
        # the flag stuck at False for every later call on this object.
        self.SetAddDummyPrefix(old_add_dummy_prefix)


def CalculateEntropy(self, input, alpha, num_threads=None):
"""Calculate sentence entropy"""
if type(input) is list:
Expand Down
128 changes: 102 additions & 26 deletions python/src/sentencepiece/sentencepiece_wrap.cxx
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
/* ----------------------------------------------------------------------------
* This file was automatically generated by SWIG (https://www.swig.org).
* Version 4.1.0
* Version 4.1.1
*
* Do not make changes to this file unless you know what you are doing - modify
* the SWIG interface file instead.
* ----------------------------------------------------------------------------- */


#define SWIG_VERSION 0x040100
#define SWIG_VERSION 0x040101
#define SWIGPYTHON
#define SWIG_PYTHON_DIRECTOR_NO_VTABLE

Expand Down Expand Up @@ -3393,7 +3393,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
for (int n = 0; n < num_threads; ++n) { \
pool.Schedule([&]() { \
size_t i = 0; \
while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
auto out = enable_sampling ? \
self->Sample##FuncName(ins[i], \
nbest_size, alpha) : \
Expand All @@ -3417,7 +3417,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
for (int n = 0; n < num_threads; ++n) { \
pool.Schedule([&]() { \
size_t i = 0; \
while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
CheckIds(ins[i], self->GetPieceSize()); \
auto out = self->FuncName(ins[i]); \
ConvertToUnicodeSpans(&out); \
Expand Down Expand Up @@ -3766,6 +3766,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)



/* Converts a Python object to a C++ bool.
 * Only objects that pass PyBool_Check are accepted; anything else, or a
 * PyObject_IsTrue result of -1, yields SWIG_ERROR. On success the value is
 * stored through |val| when |val| is non-null and SWIG_OK is returned. */
SWIGINTERN int
SWIG_AsVal_bool (PyObject *obj, bool *val)
{
  if (!PyBool_Check(obj)) {
    return SWIG_ERROR;
  }

  const int truth = PyObject_IsTrue(obj);
  if (truth == -1) {
    return SWIG_ERROR;
  }

  if (val != nullptr) {
    *val = (truth != 0);
  }
  return SWIG_OK;
}


/* Builds a Python bool object from a C++ bool by handing 1 or 0 to
 * PyBool_FromLong. */
SWIGINTERNINLINE PyObject*
SWIG_From_bool (bool value)
{
  const long as_long = value ? 1 : 0;
  return PyBool_FromLong(as_long);
}


/* Getting isfinite working pre C99 across multiple platforms is non-trivial. Users can provide SWIG_isfinite on older platforms. */
#ifndef SWIG_isfinite
/* isfinite() is a macro for C99 */
Expand Down Expand Up @@ -3828,30 +3849,9 @@ SWIGINTERNINLINE PyObject*
return PyInt_FromLong((long) value);
}


/* Builds a Python bool object from a C++ bool by handing 1 or 0 to
 * PyBool_FromLong. */
SWIGINTERNINLINE PyObject*
SWIG_From_bool (bool value)
{
  const long as_long = value ? 1 : 0;
  return PyBool_FromLong(as_long);
}

/* Extension helper: forwards |arg| to SentencePieceProcessor::Load on |self|
 * and returns the resulting Status. */
SWIGINTERN sentencepiece::util::Status
sentencepiece_SentencePieceProcessor_LoadFromFile(
    sentencepiece::SentencePieceProcessor *self, absl::string_view arg) {
  return (*self).Load(arg);
}

/* Converts a Python object to a C++ bool.
 * Only objects that pass PyBool_Check are accepted; anything else, or a
 * PyObject_IsTrue result of -1, yields SWIG_ERROR. On success the value is
 * stored through |val| when |val| is non-null and SWIG_OK is returned. */
SWIGINTERN int
SWIG_AsVal_bool (PyObject *obj, bool *val)
{
  if (!PyBool_Check(obj)) {
    return SWIG_ERROR;
  }

  const int truth = PyObject_IsTrue(obj);
  if (truth == -1) {
    return SWIG_ERROR;
  }

  if (val != nullptr) {
    *val = (truth != 0);
  }
  return SWIG_OK;
}

SWIGINTERN std::vector< int > sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){
auto ids = enable_sampling ?
self->SampleEncodeAsIds(text, nbest_size, alpha) :
Expand Down Expand Up @@ -4016,7 +4016,7 @@ SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateE
for (int n = 0; n < num_threads; ++n) {
pool.Schedule([&]() {
size_t i = 0;
while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {
while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) {
outs[i] = self->CalculateEntropy(ins[i], alpha);
}
});
Expand Down Expand Up @@ -5072,6 +5072,80 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadVocabulary(PyObject *self,
}


/* Python-callable wrapper for SentencePieceProcessor::SetAddDummyPrefix.
 * Unpacks two Python arguments (the processor object and a bool), converts
 * them, calls SetAddDummyPrefix on the processor, and returns a Python bool
 * built from the returned Status's ok(). The `fail` label returns NULL. */
SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetAddDummyPrefix(PyObject *self, PyObject *args) {
PyObject *resultobj = 0;
sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ;
bool arg2 ;
void *argp1 = 0 ;
int res1 = 0 ;
bool val2 ;
int ecode2 = 0 ;
PyObject *swig_obj[2] ;
sentencepiece::util::Status result;

/* Split |args| into swig_obj[0] (processor) and swig_obj[1] (flag). */
if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SetAddDummyPrefix", 2, 2, swig_obj)) SWIG_fail;
/* Argument 1: recover the C++ processor pointer from the Python object. */
res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 );
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SetAddDummyPrefix" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'");
}
arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1);
/* Argument 2: convert the Python object to a C++ bool via SWIG_AsVal_bool. */
ecode2 = SWIG_AsVal_bool(swig_obj[1], &val2);
if (!SWIG_IsOK(ecode2)) {
SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_SetAddDummyPrefix" "', argument " "2"" of type '" "bool""'");
}
arg2 = static_cast< bool >(val2);
{
try {
result = (arg1)->SetAddDummyPrefix(arg2);
ReleaseResultObject(resultobj);
}
/* A Status thrown as an exception is reported through SWIG_exception. */
catch (const sentencepiece::util::Status &status) {
SWIG_exception(ToSwigError(status.code()), status.ToString().c_str());
}
}
{
/* A non-ok Status is reported through SWIG_exception; otherwise ok() is
 * converted to a Python bool. */
if (!(&result)->ok()) {
SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str());
}
resultobj = SWIG_From_bool((&result)->ok());
}
return resultobj;
fail:
return NULL;
}


/* Python-callable wrapper for SentencePieceProcessor::GetAddDummyPrefix.
 * Treats |args| as the single processor object, converts it to the C++
 * pointer, calls GetAddDummyPrefix, and returns the result as a Python bool.
 * The `fail` label returns NULL. */
SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetAddDummyPrefix(PyObject *self, PyObject *args) {
PyObject *resultobj = 0;
sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ;
void *argp1 = 0 ;
int res1 = 0 ;
PyObject *swig_obj[1] ;
bool result;

/* |args| is used directly as the one argument; no tuple is unpacked here. */
if (!args) SWIG_fail;
swig_obj[0] = args;
/* Recover the C++ processor pointer from the Python object. */
res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 | 0 );
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetAddDummyPrefix" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'");
}
arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1);
{
try {
result = (bool)(arg1)->GetAddDummyPrefix();
ReleaseResultObject(resultobj);
}
/* A Status thrown as an exception is reported through SWIG_exception. */
catch (const sentencepiece::util::Status &status) {
SWIG_exception(ToSwigError(status.code()), status.ToString().c_str());
}
}
resultobj = SWIG_From_bool(static_cast< bool >(result));
return resultobj;
fail:
return NULL;
}


SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(PyObject *self, Py_ssize_t nobjs, PyObject **swig_obj) {
PyObject *resultobj = 0;
sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ;
Expand Down Expand Up @@ -8752,6 +8826,8 @@ static PyMethodDef SwigMethods[] = {
{ "SentencePieceProcessor_SetVocabulary", _wrap_SentencePieceProcessor_SetVocabulary, METH_VARARGS, NULL},
{ "SentencePieceProcessor_ResetVocabulary", _wrap_SentencePieceProcessor_ResetVocabulary, METH_O, NULL},
{ "SentencePieceProcessor_LoadVocabulary", _wrap_SentencePieceProcessor_LoadVocabulary, METH_VARARGS, NULL},
{ "SentencePieceProcessor_SetAddDummyPrefix", _wrap_SentencePieceProcessor_SetAddDummyPrefix, METH_VARARGS, NULL},
{ "SentencePieceProcessor_GetAddDummyPrefix", _wrap_SentencePieceProcessor_GetAddDummyPrefix, METH_O, NULL},
{ "SentencePieceProcessor_CalculateEntropy", _wrap_SentencePieceProcessor_CalculateEntropy, METH_VARARGS, NULL},
{ "SentencePieceProcessor_GetPieceSize", _wrap_SentencePieceProcessor_GetPieceSize, METH_O, NULL},
{ "SentencePieceProcessor_PieceToId", _wrap_SentencePieceProcessor_PieceToId, METH_VARARGS, NULL},
Expand Down
10 changes: 6 additions & 4 deletions src/normalizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ void Normalizer::Init() {

util::Status Normalizer::Normalize(absl::string_view input,
std::string *normalized,
std::vector<size_t> *norm_to_orig) const {
std::vector<size_t> *norm_to_orig,
bool add_dummy_prefix) const {
norm_to_orig->clear();
normalized->clear();

Expand Down Expand Up @@ -126,7 +127,7 @@ util::Status Normalizer::Normalize(absl::string_view input,
// With this prefix, "world" and "hello world" are converted into
// "_world" and "_hello_world", which help the trainer to extract
// "_world" as one symbol.
if (!treat_whitespace_as_suffix_ && spec_->add_dummy_prefix()) add_ws();
if (!treat_whitespace_as_suffix_ && add_dummy_prefix) add_ws();

bool is_prev_space = spec_->remove_extra_whitespaces();
while (!input.empty()) {
Expand Down Expand Up @@ -177,7 +178,7 @@ util::Status Normalizer::Normalize(absl::string_view input,
}

// Adds a space symbol as a suffix (default is false)
if (treat_whitespace_as_suffix_ && spec_->add_dummy_prefix()) add_ws();
if (treat_whitespace_as_suffix_ && add_dummy_prefix) add_ws();

norm_to_orig->push_back(consumed);

Expand All @@ -189,7 +190,8 @@ util::Status Normalizer::Normalize(absl::string_view input,
// Convenience overload: normalizes |input| and returns only the normalized
// string. The alignment vector is a discarded local and the returned Status
// is dropped via IgnoreError().
std::string Normalizer::Normalize(absl::string_view input) const {
std::vector<size_t> norm_to_orig;
std::string normalized;
// NOTE(review): a three-argument call and a four-argument call both appear
// here; this looks like the before/after lines of a diff shown together -
// confirm that only one of them is meant to remain.
Normalize(input, &normalized, &norm_to_orig).IgnoreError();
// The flag is taken from the normalizer spec and passed explicitly.
const bool add_dummy_prefix = spec_->add_dummy_prefix();
Normalize(input, &normalized, &norm_to_orig, add_dummy_prefix).IgnoreError();
return normalized;
}

Expand Down
5 changes: 3 additions & 2 deletions src/normalizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,13 @@ class Normalizer {
// This function can do the following normalizations:
// - Character normalization.
// (NFKC / full-width to half-width conversion etc).
// - Adds a prefix space.
// - Adds a prefix space (controlled by |add_dummy_prefix|).
// - Replaces a space with a meta symbol.
// - Removes leading, trailing, and other redundant spaces.
virtual util::Status Normalize(absl::string_view input,
std::string *normalized,
std::vector<size_t> *norm_to_orig) const;
std::vector<size_t> *norm_to_orig,
bool add_dummy_prefix) const;

// Returns a normalized string without alignments.
// This function is used in sentencepiece training.
Expand Down
Loading