google · mpu · Aug 31, 2023 · Aug 31, 2023
diff --git a/python/src/sentencepiece/__init__.py b/python/src/sentencepiece/__init__.py
@@ -1,5 +1,5 @@
 # This file was automatically generated by SWIG (https://www.swig.org).
-# Version 4.1.0
+# Version 4.1.1
 #
 # Do not make changes to this file unless you know what you are doing - modify
 # the SWIG interface file instead.
@@ -258,6 +258,12 @@ def ResetVocabulary(self):
     def LoadVocabulary(self, filename, threshold):
         return _sentencepiece.SentencePieceProcessor_LoadVocabulary(self, filename, threshold)
 
+    def SetAddDummyPrefix(self, add_dummy_whitespace):
+        """Forward the flag to the native SentencePieceProcessor_SetAddDummyPrefix binding.
+
+        The native wrapper converts add_dummy_whitespace with SWIG_AsVal_bool,
+        which accepts only a real Python bool.
+        """
+        return _sentencepiece.SentencePieceProcessor_SetAddDummyPrefix(self, add_dummy_whitespace)
+
+    def GetAddDummyPrefix(self):
+        """Return the bool reported by the native SentencePieceProcessor_GetAddDummyPrefix binding."""
+        return _sentencepiece.SentencePieceProcessor_GetAddDummyPrefix(self)
+
     def CalculateEntropy(self, *args):
         return _sentencepiece.SentencePieceProcessor_CalculateEntropy(self, *args)
 
@@ -551,6 +557,14 @@ def EncodeAsImmutableProto(self, input, **kwargs):
       return self.Encode(input=input, out_type='immutable_proto', **kwargs)
 
 
+    def EncodeNoDummyPrefix(self, input, **kwargs):
+      """Encode input with the add-dummy-prefix flag temporarily switched off.
+
+      The current flag is read with GetAddDummyPrefix, set to False for the
+      duration of the Encode call, and then put back. The restore runs in a
+      finally clause so that an exception raised by Encode does not leave
+      the processor with the dummy prefix disabled for later calls.
+
+      Args:
+        input: value forwarded unchanged as Encode(input=...).
+        **kwargs: extra keyword arguments forwarded to Encode.
+
+      Returns:
+        Whatever Encode returns for the given arguments.
+      """
+      old_add_dummy_prefix = self.GetAddDummyPrefix()
+      self.SetAddDummyPrefix(False)
+      try:
+        return self.Encode(input=input, **kwargs)
+      finally:
+        self.SetAddDummyPrefix(old_add_dummy_prefix)
+
+
     def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
       return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                          out_type=str, enable_sampling=True, **kwargs)
@@ -700,7 +714,6 @@ def SampleEncodeAndScore(self,
       if include_best and not wor:
         raise RuntimeError('When include_best is True, We must specify "wor = True".')
 
-
       def _encode(text):
         if out_type is int:
           return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
@@ -847,6 +860,14 @@ def DecodeIdsAsImmutableProto(self, input, out_type='immutable_proto', **kwargs)
       return self.Decode(input=input, out_type=out_type, **kwargs)
 
 
+    def DecodeNoDummyPrefix(self, input, **kwargs):
+      """Decode input with the add-dummy-prefix flag temporarily switched off.
+
+      The current flag is read with GetAddDummyPrefix, set to False for the
+      duration of the Decode call, and then put back. The restore runs in a
+      finally clause so that an exception raised by Decode does not leave
+      the processor with the dummy prefix disabled for later calls.
+
+      Args:
+        input: value forwarded unchanged as Decode(input=...).
+        **kwargs: extra keyword arguments forwarded to Decode.
+
+      Returns:
+        Whatever Decode returns for the given arguments.
+      """
+      old_add_dummy_prefix = self.GetAddDummyPrefix()
+      self.SetAddDummyPrefix(False)
+      try:
+        return self.Decode(input=input, **kwargs)
+      finally:
+        self.SetAddDummyPrefix(old_add_dummy_prefix)
+
+
     def CalculateEntropy(self, input, alpha, num_threads=None):
       """Calculate sentence entropy"""
       if type(input) is list:

diff --git a/python/src/sentencepiece/sentencepiece.i b/python/src/sentencepiece/sentencepiece.i
@@ -251,7 +251,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     for (int n = 0;  n < num_threads; ++n) {                            \
       pool.Schedule([&]() {                                             \
           size_t i = 0;                                                 \
-          while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
+          while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
             auto out = enable_sampling ?                                \
                        self->Sample##FuncName(ins[i],                   \
                                               nbest_size, alpha) :      \
@@ -275,7 +275,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     for (int n = 0;  n < num_threads; ++n) {                            \
       pool.Schedule([&]() {                                             \
           size_t i = 0;                                                 \
-          while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
+          while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
             CheckIds(ins[i], self->GetPieceSize());                     \
             auto out = self->FuncName(ins[i]);                          \
             ConvertToUnicodeSpans(&out);                                \
@@ -664,7 +664,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
       for (int n = 0;  n < num_threads; ++n) {
         pool.Schedule([&]() {
            size_t i = 0;
-           while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {
+           while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) {
              outs[i] = self->CalculateEntropy(ins[i], alpha);
            }
          });
@@ -832,6 +832,14 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     return self.Encode(input=input, out_type='immutable_proto', **kwargs)
 
 
+  def EncodeNoDummyPrefix(self, input, **kwargs):
+    """Encode input with the add-dummy-prefix flag temporarily switched off.
+
+    The current flag is read with GetAddDummyPrefix, set to False for the
+    duration of the Encode call, and then put back. The restore runs in a
+    finally clause so that an exception raised by Encode does not leave
+    the processor with the dummy prefix disabled for later calls.
+
+    Args:
+      input: value forwarded unchanged as Encode(input=...).
+      **kwargs: extra keyword arguments forwarded to Encode.
+
+    Returns:
+      Whatever Encode returns for the given arguments.
+    """
+    old_add_dummy_prefix = self.GetAddDummyPrefix()
+    self.SetAddDummyPrefix(False)
+    try:
+      return self.Encode(input=input, **kwargs)
+    finally:
+      self.SetAddDummyPrefix(old_add_dummy_prefix)
+
+
   def SampleEncodeAsPieces(self, input, nbest_size=None, alpha=None, **kwargs):
     return self.Encode(input=input, nbest_size=nbest_size, alpha=alpha,
                        out_type=str, enable_sampling=True, **kwargs)
@@ -981,7 +989,6 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     if include_best and not wor:
       raise RuntimeError('When include_best is True, We must specify "wor = True".')
 
-
     def _encode(text):
       if out_type is int:
         return self._SampleEncodeAndScoreAsIds(text, num_samples, alpha, wor, include_best,
@@ -1128,6 +1135,14 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     return self.Decode(input=input, out_type=out_type, **kwargs)
 
 
+  def DecodeNoDummyPrefix(self, input, **kwargs):
+    """Decode input with the add-dummy-prefix flag temporarily switched off.
+
+    The current flag is read with GetAddDummyPrefix, set to False for the
+    duration of the Decode call, and then put back. The restore runs in a
+    finally clause so that an exception raised by Decode does not leave
+    the processor with the dummy prefix disabled for later calls.
+
+    Args:
+      input: value forwarded unchanged as Decode(input=...).
+      **kwargs: extra keyword arguments forwarded to Decode.
+
+    Returns:
+      Whatever Decode returns for the given arguments.
+    """
+    old_add_dummy_prefix = self.GetAddDummyPrefix()
+    self.SetAddDummyPrefix(False)
+    try:
+      return self.Decode(input=input, **kwargs)
+    finally:
+      self.SetAddDummyPrefix(old_add_dummy_prefix)
+
+
   def CalculateEntropy(self, input, alpha, num_threads=None):
     """Calculate sentence entropy"""
     if type(input) is list:

diff --git a/python/src/sentencepiece/sentencepiece_wrap.cxx b/python/src/sentencepiece/sentencepiece_wrap.cxx
@@ -1,13 +1,13 @@
 /* ----------------------------------------------------------------------------
  * This file was automatically generated by SWIG (https://www.swig.org).
- * Version 4.1.0
+ * Version 4.1.1
  *
  * Do not make changes to this file unless you know what you are doing - modify
  * the SWIG interface file instead.
  * ----------------------------------------------------------------------------- */
 
 
-#define SWIG_VERSION 0x040100
+#define SWIG_VERSION 0x040101
 #define SWIGPYTHON
 #define SWIG_PYTHON_DIRECTOR_NO_VTABLE
 
@@ -3393,7 +3393,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     for (int n = 0;  n < num_threads; ++n) {                            \
       pool.Schedule([&]() {                                             \
           size_t i = 0;                                                 \
-          while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
+          while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
             auto out = enable_sampling ?                                \
                        self->Sample##FuncName(ins[i],                   \
                                               nbest_size, alpha) :      \
@@ -3417,7 +3417,7 @@ inline void InitNumThreads(const std::vector<T> &ins, int *num_threads) {
     for (int n = 0;  n < num_threads; ++n) {                            \
       pool.Schedule([&]() {                                             \
           size_t i = 0;                                                 \
-          while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) { \
+          while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) { \
             CheckIds(ins[i], self->GetPieceSize());                     \
             auto out = self->FuncName(ins[i]);                          \
             ConvertToUnicodeSpans(&out);                                \
@@ -3766,6 +3766,27 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc)
 
 
 
+/* Converts a Python object to a C++ bool.
+ * Only objects that pass PyBool_Check are accepted; anything else, or a
+ * failing PyObject_IsTrue, yields SWIG_ERROR. *val is written only when
+ * val is non-null. Returns SWIG_OK on success. */
+SWIGINTERN int
+SWIG_AsVal_bool (PyObject *obj, bool *val)
+{
+  int r;
+  if (!PyBool_Check(obj))
+    return SWIG_ERROR;
+  r = PyObject_IsTrue(obj);
+  if (r == -1)
+    return SWIG_ERROR;
+  if (val) *val = r ? true : false;
+  return SWIG_OK;
+}
+
+
+/* Converts a C++ bool to a Python bool object via PyBool_FromLong. */
+SWIGINTERNINLINE PyObject*
+  SWIG_From_bool  (bool value)
+{
+  return PyBool_FromLong(value ? 1 : 0);
+}
+
+
 /* Getting isfinite working pre C99 across multiple platforms is non-trivial. Users can provide SWIG_isfinite on older platforms. */
 #ifndef SWIG_isfinite
 /* isfinite() is a macro for C99 */
@@ -3828,30 +3849,9 @@ SWIGINTERNINLINE PyObject*
   return PyInt_FromLong((long) value);
 }
 
-
-SWIGINTERNINLINE PyObject*
-  SWIG_From_bool  (bool value)
-{
-  return PyBool_FromLong(value ? 1 : 0);
-}
-
 SWIGINTERN sentencepiece::util::Status sentencepiece_SentencePieceProcessor_LoadFromFile(sentencepiece::SentencePieceProcessor *self,absl::string_view arg){
     return self->Load(arg);
   }
-
-SWIGINTERN int
-SWIG_AsVal_bool (PyObject *obj, bool *val)
-{
-  int r;
-  if (!PyBool_Check(obj))
-    return SWIG_ERROR;
-  r = PyObject_IsTrue(obj);
-  if (r == -1)
-    return SWIG_ERROR;
-  if (val) *val = r ? true : false;
-  return SWIG_OK;
-}
-
 SWIGINTERN std::vector< int > sentencepiece_SentencePieceProcessor__EncodeAsIds(sentencepiece::SentencePieceProcessor const *self,absl::string_view text,bool enable_sampling,int nbest_size,float alpha,bool add_bos,bool add_eos,bool reverse,bool emit_unk_piece){
     auto ids = enable_sampling ?
                self->SampleEncodeAsIds(text, nbest_size, alpha) :
@@ -4016,7 +4016,7 @@ SWIGINTERN std::vector< float > sentencepiece_SentencePieceProcessor__CalculateE
       for (int n = 0;  n < num_threads; ++n) {
         pool.Schedule([&]() {
            size_t i = 0;
-           while ((i = std::atomic_fetch_add(&index, 1)) < outs.size()) {
+           while ((i = std::atomic_fetch_add(&index, size_t{1})) < outs.size()) {
              outs[i] = self->CalculateEntropy(ins[i], alpha);
            }
          });
@@ -5072,6 +5072,80 @@ SWIGINTERN PyObject *_wrap_SentencePieceProcessor_LoadVocabulary(PyObject *self,
 }
 
 
+/* Python entry point for SentencePieceProcessor_SetAddDummyPrefix.
+ * Expects exactly two arguments: the wrapped processor and a Python bool.
+ * Calls SetAddDummyPrefix on the processor; a non-ok Status is reported
+ * through SWIG_exception, otherwise the ok() value is returned as a
+ * Python bool. Returns NULL on any failure path. */
+SWIGINTERN PyObject *_wrap_SentencePieceProcessor_SetAddDummyPrefix(PyObject *self, PyObject *args) {
+  PyObject *resultobj = 0;
+  sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ;
+  bool arg2 ;
+  void *argp1 = 0 ;
+  int res1 = 0 ;
+  bool val2 ;
+  int ecode2 = 0 ;
+  PyObject *swig_obj[2] ;
+  sentencepiece::util::Status result;
+  
+  if (!SWIG_Python_UnpackTuple(args, "SentencePieceProcessor_SetAddDummyPrefix", 2, 2, swig_obj)) SWIG_fail;
+  /* Argument 1: the wrapped SentencePieceProcessor pointer. */
+  res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 |  0 );
+  if (!SWIG_IsOK(res1)) {
+    SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_SetAddDummyPrefix" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); 
+  }
+  arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1);
+  /* Argument 2: the flag, converted with SWIG_AsVal_bool. */
+  ecode2 = SWIG_AsVal_bool(swig_obj[1], &val2);
+  if (!SWIG_IsOK(ecode2)) {
+    SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "SentencePieceProcessor_SetAddDummyPrefix" "', argument " "2"" of type '" "bool""'");
+  } 
+  arg2 = static_cast< bool >(val2);
+  {
+    try {
+      result = (arg1)->SetAddDummyPrefix(arg2);
+      ReleaseResultObject(resultobj);
+    }
+    catch (const sentencepiece::util::Status &status) {
+      SWIG_exception(ToSwigError(status.code()), status.ToString().c_str());
+    }
+  }
+  {
+    /* Map the returned Status: error -> SWIG exception, ok -> Python bool. */
+    if (!(&result)->ok()) {
+      SWIG_exception(ToSwigError((&result)->code()), (&result)->ToString().c_str());
+    }
+    resultobj = SWIG_From_bool((&result)->ok());
+  }
+  return resultobj;
+fail:
+  return NULL;
+}
+
+
+/* Python entry point for SentencePieceProcessor_GetAddDummyPrefix.
+ * args is used directly as the single wrapped processor object.
+ * Calls GetAddDummyPrefix on the processor and returns the result as a
+ * Python bool. Returns NULL on any failure path. */
+SWIGINTERN PyObject *_wrap_SentencePieceProcessor_GetAddDummyPrefix(PyObject *self, PyObject *args) {
+  PyObject *resultobj = 0;
+  sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ;
+  void *argp1 = 0 ;
+  int res1 = 0 ;
+  PyObject *swig_obj[1] ;
+  bool result;
+  
+  if (!args) SWIG_fail;
+  swig_obj[0] = args;
+  /* Argument 1: the wrapped SentencePieceProcessor pointer. */
+  res1 = SWIG_ConvertPtr(swig_obj[0], &argp1,SWIGTYPE_p_sentencepiece__SentencePieceProcessor, 0 |  0 );
+  if (!SWIG_IsOK(res1)) {
+    SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "SentencePieceProcessor_GetAddDummyPrefix" "', argument " "1"" of type '" "sentencepiece::SentencePieceProcessor *""'"); 
+  }
+  arg1 = reinterpret_cast< sentencepiece::SentencePieceProcessor * >(argp1);
+  {
+    try {
+      result = (bool)(arg1)->GetAddDummyPrefix();
+      ReleaseResultObject(resultobj);
+    }
+    catch (const sentencepiece::util::Status &status) {
+      SWIG_exception(ToSwigError(status.code()), status.ToString().c_str());
+    }
+  }
+  resultobj = SWIG_From_bool(static_cast< bool >(result));
+  return resultobj;
+fail:
+  return NULL;
+}
+
+
 SWIGINTERN PyObject *_wrap_SentencePieceProcessor_CalculateEntropy__SWIG_0(PyObject *self, Py_ssize_t nobjs, PyObject **swig_obj) {
   PyObject *resultobj = 0;
   sentencepiece::SentencePieceProcessor *arg1 = (sentencepiece::SentencePieceProcessor *) 0 ;
@@ -8752,6 +8826,8 @@ static PyMethodDef SwigMethods[] = {
 	 { "SentencePieceProcessor_SetVocabulary", _wrap_SentencePieceProcessor_SetVocabulary, METH_VARARGS, NULL},
 	 { "SentencePieceProcessor_ResetVocabulary", _wrap_SentencePieceProcessor_ResetVocabulary, METH_O, NULL},
 	 { "SentencePieceProcessor_LoadVocabulary", _wrap_SentencePieceProcessor_LoadVocabulary, METH_VARARGS, NULL},
+	 { "SentencePieceProcessor_SetAddDummyPrefix", _wrap_SentencePieceProcessor_SetAddDummyPrefix, METH_VARARGS, NULL},
+	 { "SentencePieceProcessor_GetAddDummyPrefix", _wrap_SentencePieceProcessor_GetAddDummyPrefix, METH_O, NULL},
 	 { "SentencePieceProcessor_CalculateEntropy", _wrap_SentencePieceProcessor_CalculateEntropy, METH_VARARGS, NULL},
 	 { "SentencePieceProcessor_GetPieceSize", _wrap_SentencePieceProcessor_GetPieceSize, METH_O, NULL},
 	 { "SentencePieceProcessor_PieceToId", _wrap_SentencePieceProcessor_PieceToId, METH_VARARGS, NULL},

diff --git a/src/normalizer.cc b/src/normalizer.cc
@@ -71,7 +71,8 @@ void Normalizer::Init() {
 
 util::Status Normalizer::Normalize(absl::string_view input,
                                    std::string *normalized,
-                                   std::vector<size_t> *norm_to_orig) const {
+                                   std::vector<size_t> *norm_to_orig,
+                                   bool add_dummy_prefix) const {
   norm_to_orig->clear();
   normalized->clear();
 
@@ -126,7 +127,7 @@ util::Status Normalizer::Normalize(absl::string_view input,
   // With this prefix, "world" and "hello world" are converted into
   // "_world" and "_hello_world", which help the trainer to extract
   // "_world" as one symbol.
-  if (!treat_whitespace_as_suffix_ && spec_->add_dummy_prefix()) add_ws();
+  if (!treat_whitespace_as_suffix_ && add_dummy_prefix) add_ws();
 
   bool is_prev_space = spec_->remove_extra_whitespaces();
   while (!input.empty()) {
@@ -177,7 +178,7 @@ util::Status Normalizer::Normalize(absl::string_view input,
   }
 
   // Adds a space symbol as a suffix (default is false)
-  if (treat_whitespace_as_suffix_ && spec_->add_dummy_prefix()) add_ws();
+  if (treat_whitespace_as_suffix_ && add_dummy_prefix) add_ws();
 
   norm_to_orig->push_back(consumed);
 
@@ -189,7 +190,8 @@ util::Status Normalizer::Normalize(absl::string_view input,
 std::string Normalizer::Normalize(absl::string_view input) const {
   std::vector<size_t> norm_to_orig;
   std::string normalized;
-  Normalize(input, &normalized, &norm_to_orig).IgnoreError();
+  // This overload takes the prefix setting from the normalizer spec.
+  const bool add_dummy_prefix = spec_->add_dummy_prefix();
+  Normalize(input, &normalized, &norm_to_orig, add_dummy_prefix).IgnoreError();
   return normalized;
 }
 

diff --git a/src/normalizer.h b/src/normalizer.h
@@ -82,12 +82,13 @@ class Normalizer {
   // This function can do the following normalizations:
   // - Character normalization.
   //   (NFKC / full-width to half-width conversion etc).
-  // - Adds a prefix space.
+  // - Adds a prefix space (controlled by |add_dummy_prefix|).
   // - Replaces a space with a meta symbol.
   // - Removing heading, tailing and other redundant spaces.
   virtual util::Status Normalize(absl::string_view input,
                                  std::string *normalized,
-                                 std::vector<size_t> *norm_to_orig) const;
+                                 std::vector<size_t> *norm_to_orig,
+                                 bool add_dummy_prefix) const;
 
   // Returns a normalized string without alignments.
   // This function is used in sentencepiece training.