Skip to content

Commit cb49edb

Browse files
author
James McLaughlin
committed
Implement UTF-8 string encoding
1 parent cf0b194 commit cb49edb

File tree

3 files changed

+118
-8
lines changed

3 files changed

+118
-8
lines changed

json-builder.c

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
#define snprintf _snprintf
4040
#endif
4141

42+
#include "utf8proc.h"
43+
4244
const static json_serialize_opts default_opts =
4345
{
4446
json_serialize_mode_single_line,
@@ -458,20 +460,95 @@ json_value * json_object_merge (json_value * objectA, json_value * objectB)
458460
static size_t measure_string (unsigned int length,
459461
const json_char * str)
460462
{
463+
ssize_t i = 0;
464+
int32_t c = -1;
465+
size_t measured_length = 0;
461466

462-
/* TODO encoding
463-
*/
464-
return strlen (str);
467+
for(;;)
468+
{
469+
i += utf8proc_iterate ((const uint8_t *) str + i, length - i, &c);
470+
471+
if(c == -1)
472+
break;
473+
474+
switch (c)
475+
{
476+
case '"':
477+
case '\\':
478+
case '/':
479+
case '\b':
480+
case '\f':
481+
case '\n':
482+
case '\r':
483+
case '\t':
484+
485+
measured_length += 2;
486+
break;
487+
488+
case 0:
489+
490+
measured_length += 6;
491+
break;
492+
493+
default:
494+
495+
measured_length += utf8proc_measure_char (c);
496+
break;
497+
};
498+
};
499+
500+
return measured_length;
465501
}
466502

503+
#define PRINT_ESCAPED(c) do { \
504+
*buf ++ = '\\'; \
505+
*buf ++ = (c); \
506+
} while(0); \
507+
467508
static size_t serialize_string (json_char * buf,
468509
unsigned int length,
469510
const json_char * str)
470511
{
471-
/* TODO encoding
472-
*/
473-
memcpy (buf, str, length);
474-
return length;
512+
ssize_t i = 0;
513+
int32_t c = -1;
514+
json_char * orig_buf;
515+
516+
orig_buf = buf;
517+
518+
for(;;)
519+
{
520+
i += utf8proc_iterate ((const uint8_t *) str + i, length - i, &c);
521+
522+
if(c == -1)
523+
break;
524+
525+
switch (c)
526+
{
527+
case '"': PRINT_ESCAPED ('\"'); continue;
528+
case '\\': PRINT_ESCAPED ('\\'); continue;
529+
case '/': PRINT_ESCAPED ('/'); continue;
530+
case '\b': PRINT_ESCAPED ('b'); continue;
531+
case '\f': PRINT_ESCAPED ('f'); continue;
532+
case '\n': PRINT_ESCAPED ('n'); continue;
533+
case '\r': PRINT_ESCAPED ('r'); continue;
534+
case '\t': PRINT_ESCAPED ('t'); continue;
535+
536+
case 0:
537+
538+
PRINT_ESCAPED ('u');
539+
sprintf (buf, "%04x", c);
540+
buf += 4;
541+
542+
break;
543+
544+
default:
545+
546+
buf += utf8proc_encode_char (c, (uint8_t *) buf);
547+
break;
548+
};
549+
};
550+
551+
return buf - orig_buf;
475552
}
476553

477554
size_t json_measure (json_value * value)

utf8proc.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,26 @@ bool utf8proc_codepoint_valid(int32_t uc) {
163163
else return true;
164164
}
165165

166+
/* Added in json-builder
167+
*/
168+
ssize_t utf8proc_measure_char(int32_t uc) {
  /* Negative values and anything past 0x10FFFF have no encoding: length 0. */
  if (uc < 0x00 || uc >= 0x110000) return 0;

  /* 0xFFFE and 0xFFFF are reported as a single byte.
   * NOTE(review): presumably mirrors how utf8proc_encode_char emits these
   * two values - confirm against its implementation.
   */
  if (uc == 0xFFFE || uc == 0xFFFF) return 1;

  if (uc < 0x80) return 1;     /* 0x00 .. 0x7F */
  if (uc < 0x800) return 2;    /* 0x80 .. 0x7FF */
  if (uc < 0x10000) return 3;  /* 0x800 .. 0xFFFD */
  return 4;                    /* 0x10000 .. 0x10FFFF */
}
185+
166186
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
167187
if (uc < 0x00) {
168188
return 0;

utf8proc.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@
4545
* (i.e. UTF-8 encoded UTF-16 surrogates)
4646
* - support for korean hangul characters
4747
* Unicode Version 5.0.0 is supported.
48+
*
49+
*
50+
* json-builder note: utf8proc_measure_char has been added.
4851
*/
4952

5053

@@ -260,11 +263,21 @@ bool utf8proc_codepoint_valid(int32_t uc);
260263
* Returns 1, if the given unicode code-point is valid, otherwise 0.
261264
*/
262265

266+
ssize_t utf8proc_measure_char(int32_t uc);
267+
/*
268+
* Note: This function is not in the original utf8proc and was added in json-builder.
269+
*
270+
* Returns the length the unicode char with the code point 'uc' would have as
271+
* a UTF-8 string.
272+
* On success the length in bytes (1 to 4) is returned; 0 is returned if 'uc'
* is negative or greater than 0x10FFFF.
273+
* This function does not check if 'uc' is a valid unicode code point.
274+
*/
275+
263276
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
264277
/*
265278
* Encodes the unicode char with the code point 'uc' as an UTF-8 string in
266279
* the byte array being pointed to by 'dst'. This array has to be at least
267-
* 4 bytes long.
280+
* utf8proc_measure_char(uc) bytes long (4 bytes is always sufficient).
268281
* In case of success the number of bytes written is returned,
269282
* otherwise 0.
270283
* This function does not check if 'uc' is a valid unicode code point.

0 commit comments

Comments
 (0)