Implement UTF-8 string encoding

James McLaughlin · James McLaughlin · commit cb49edb7bb7b · 2014-09-04T12:37:45.000+01:00
diff --git a/json-builder.c b/json-builder.c
@@ -39,6 +39,8 @@
     #define snprintf _snprintf
 #endif
 
+#include "utf8proc.h"
+
 const static json_serialize_opts default_opts =
 {
    json_serialize_mode_single_line,
@@ -458,20 +460,122 @@ json_value * json_object_merge (json_value * objectA, json_value * objectB)
 static size_t measure_string (unsigned int length,
                               const json_char * str)
 {
+   /* Measures how many bytes serialize_string will write for the first
+    * `length` bytes of `str`: 2 for a character with a short escape, 6 for
+    * any other control character (backslash, 'u', four hex digits) and the
+    * UTF-8 length for everything else.  Measuring ends once
+    * utf8proc_iterate leaves c at -1 (presumably end of input or an
+    * undecodable sequence).
+    */
+   ssize_t i = 0;
+   int32_t c = -1;
+   size_t measured_length = 0;
 
-   /* TODO encoding
-    */
-   return strlen (str);
+   for(;;)
+   {
+      i += utf8proc_iterate ((const uint8_t *) str + i, length - i, &c);
+
+      if(c == -1)
+         break;
+
+      switch (c)
+      {
+      case '"':
+      case '\\':
+      case '/':
+      case '\b':
+      case '\f':
+      case '\n':
+      case '\r':
+      case '\t':
+
+         measured_length += 2;
+         break;
+
+      default:
+
+         /* JSON does not allow raw control characters (below 0x20), so
+          * one without a short escape above costs a full 6-byte u-escape.
+          */
+         if(c < 0x20)
+            measured_length += 6;
+         else
+            measured_length += utf8proc_measure_char (c);
+
+         break;
+      }
+   }
+
+   return measured_length;
 }
 
+/* Appends a backslash followed by `c` at the output cursor `buf`, which must
+ * be in scope where the macro is used.
+ */
+#define PRINT_ESCAPED(c) do {  \
+   *buf ++ = '\\';             \
+   *buf ++ = (c);              \
+} while(0)
+
+/* Writes the first `length` bytes of `str` into `buf` as the body of a JSON
+ * string (no surrounding quotes, no terminator) and returns the number of
+ * bytes written.  `buf` is expected to have room for
+ * measure_string (length, str) bytes.
+ */
 static size_t serialize_string (json_char * buf,
                                 unsigned int length,
                                 const json_char * str)
 {
-   /* TODO encoding
-    */
-   memcpy (buf, str, length);
-   return length;
+   static const char hex_digits [] = "0123456789abcdef";
+
+   ssize_t i = 0;
+   int32_t c = -1;
+   json_char * orig_buf;
+
+   orig_buf = buf;
+
+   for(;;)
+   {
+      i += utf8proc_iterate ((const uint8_t *) str + i, length - i, &c);
+
+      if(c == -1)
+         break;
+
+      switch (c)
+      {
+      case '"':   PRINT_ESCAPED ('\"');  continue;
+      case '\\':  PRINT_ESCAPED ('\\');  continue;
+      case '/':   PRINT_ESCAPED ('/');   continue;
+      case '\b':  PRINT_ESCAPED ('b');   continue;
+      case '\f':  PRINT_ESCAPED ('f');   continue;
+      case '\n':  PRINT_ESCAPED ('n');   continue;
+      case '\r':  PRINT_ESCAPED ('r');   continue;
+      case '\t':  PRINT_ESCAPED ('t');   continue;
+
+      default:
+
+         if(c < 0x20)
+         {
+            /* Control characters without a short escape (NUL included)
+             * may not appear raw in JSON.  The four digits are stored by
+             * hand so no string terminator is written after them.
+             */
+            PRINT_ESCAPED ('u');
+            *buf ++ = '0';
+            *buf ++ = '0';
+            *buf ++ = hex_digits [(c >> 4) & 0xF];
+            *buf ++ = hex_digits [c & 0xF];
+         }
+         else
+         {
+            buf += utf8proc_encode_char (c, (uint8_t *) buf);
+         }
+
+         break;
+      }
+   }
+
+   return buf - orig_buf;
 }
 
 size_t json_measure (json_value * value)
diff --git a/utf8proc.c b/utf8proc.c
@@ -163,6 +163,26 @@ bool utf8proc_codepoint_valid(int32_t uc) {
   else return true;
 }
 
+/* Added in json-builder.  Gives the byte length 'uc' would occupy as UTF-8,
+ * or 0 when 'uc' is negative or not below 0x110000.
+ */
+ssize_t utf8proc_measure_char(int32_t uc) {
+  if (uc < 0x00 || uc >= 0x110000)
+    return 0;
+
+  /* NOTE(review): one byte for these two, presumably to stay in step with
+   * utf8proc_encode_char -- confirm against its body. */
+  if (uc == 0xFFFE || uc == 0xFFFF)
+    return 1;
+
+  if (uc >= 0x10000)
+    return 4;
+  if (uc >= 0x800)
+    return 3;
+
+  return (uc >= 0x80) ? 2 : 1;
+}
+
 ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
   if (uc < 0x00) {
     return 0;
diff --git a/utf8proc.h b/utf8proc.h
@@ -45,6 +45,9 @@
  *    (i.e. UTF-8 encoded UTF-16 surrogates)
  *  - support for korean hangul characters
  *  Unicode Version 5.0.0 is supported.
+ *
+ *
+ *  json-builder note: utf8proc_measure_char has been added.
  */
 
 
@@ -260,11 +263,21 @@ bool utf8proc_codepoint_valid(int32_t uc);
  *  Returns 1, if the given unicode code-point is valid, otherwise 0.
  */
 
+ssize_t utf8proc_measure_char(int32_t uc);
+/*
+ *  Note: This function is not in the original utf8proc and was added in json-builder.
+ *
+ *  Returns the length the unicode char with the code point 'uc' would have as
+ *  a UTF-8 string.
+ *  In case of success the length is returned, otherwise 0.
+ *  This function does not check if 'uc' is a valid unicode code point.
+ */
+
 ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
 /*
  *  Encodes the unicode char with the code point 'uc' as an UTF-8 string in
  *  the byte array being pointed to by 'dst'. This array has to be at least
- *  4 bytes long.
+ *  utf8proc_measure_char(uc) bytes long; 4 bytes is always enough.
  *  In case of success the number of bytes written is returned,
  *  otherwise 0.
  *  This function does not check if 'uc' is a valid unicode code point.