@@ -143,6 +143,11 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
143143        {
144144          break ;
145145        }
146+         case  ' b'  :
147+         {
148+           *current_p = ' \b '  ;
149+           break ;
150+         }
146151        case  ' f'  :
147152        {
148153          *current_p = ' \f '  ;
@@ -163,10 +168,19 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
163168          *current_p = ' \t '  ;
164169          break ;
165170        }
166-         case  ' b '  :
171+         case  ' u '  :
167172        {
168-           *current_p = ' \b '  ;
169-           break ;
173+           lit_code_point_t  code_point;
174+ 
175+           if  (!(lit_read_code_point_from_hex  (current_p + 1 , 4 , &code_point)))
176+           {
177+             return ;
178+           }
179+ 
180+           current_p += 5 ;
181+           write_p += lit_code_point_to_utf8  (code_point, write_p);
182+           continue ;
183+           /*  FALLTHRU */ 
170184        }
171185        default :
172186        {
@@ -177,6 +191,57 @@ ecma_builtin_json_parse_string (ecma_json_token_t *token_p) /**< token argument
177191    *write_p++ = *current_p++;
178192  }
179193
194+   /* 
195+    * Post processing surrogate pairs. 
196+    * 
197+    * The general issue is that surrogate fragments can come from
198+    * the original stream and can also be constructed by \u sequences.
199+    * We need to construct code points from them.
200+    * 
201+    * Example: JSON.parse ('"\\ud801\udc00"') === "\ud801\udc00" 
202+    *          The first \u is parsed by JSON, the second is by the lexer. 
203+    * 
204+    * The rewrite happens in-place, since the write pointer never
205+    * gets ahead of the read pointer. We also cannot create a UTF-8
206+    * iterator, because the lit_is_utf8_string_valid assertion may fail.
207+    */  
208+ 
209+   lit_utf8_byte_t  *read_p = token_p->u .string .start_p ;
210+   lit_utf8_byte_t  *read_end_p = write_p;
211+   write_p = read_p;
212+ 
213+   while  (read_p < read_end_p)
214+   {
215+     lit_code_point_t  code_point;
216+     read_p += lit_read_code_point_from_utf8  (read_p,
217+                                              (lit_utf8_size_t ) (read_end_p - read_p),
218+                                              &code_point);
219+ 
220+     /*  lit_is_code_unit_high_surrogate expects an ecma_char_t argument,
221+        so code points above the maximum UTF-16 code unit must not be tested. */
222+     if  (read_p < read_end_p
223+         && code_point <= LIT_UTF16_CODE_UNIT_MAX
224+         && lit_is_code_unit_high_surrogate  ((ecma_char_t ) code_point))
225+     {
226+       lit_code_point_t  next_code_point;
227+       lit_utf8_size_t  next_code_point_size = lit_read_code_point_from_utf8  (read_p,
228+                                                                             (lit_utf8_size_t ) (read_end_p - read_p),
229+                                                                             &next_code_point);
230+ 
231+       if  (next_code_point <= LIT_UTF16_CODE_UNIT_MAX
232+           && lit_is_code_unit_low_surrogate  ((ecma_char_t ) next_code_point))
233+       {
234+         code_point = lit_convert_surrogate_pair_to_code_point  ((ecma_char_t ) code_point,
235+                                                                (ecma_char_t ) next_code_point);
236+         read_p += next_code_point_size;
237+       }
238+     }
239+     write_p += lit_code_point_to_utf8  (code_point, write_p);
240+   }
241+ 
242+   JERRY_ASSERT  (lit_is_utf8_string_valid  (token_p->u .string .start_p ,
243+                                           (lit_utf8_size_t ) (write_p - token_p->u .string .start_p )));
244+ 
180245  token_p->u .string .size  = (lit_utf8_size_t ) (write_p - token_p->u .string .start_p );
181246  token_p->current_p  = current_p + 1 ;
182247  token_p->type  = string_token;
@@ -757,17 +822,17 @@ ecma_builtin_json_parse (ecma_value_t this_arg __attr_unused___, /**< 'this' arg
757822                  ret_value);
758823
759824  ecma_string_t  *string_p = ecma_get_string_from_value  (string);
760-   ecma_length_t  length  = (uint32_t ) ecma_string_get_length  (string_p);
761-   size_t  buffer_size = sizeof  (lit_utf8_byte_t ) * (length  + 1 );
825+   ecma_length_t  string_size  = (uint32_t ) ecma_string_get_size  (string_p);
826+   size_t  buffer_size = sizeof  (lit_utf8_byte_t ) * (string_size  + 1 );
762827
763828  MEM_DEFINE_LOCAL_ARRAY  (str_start_p, buffer_size, lit_utf8_byte_t );
764829
765830  ecma_string_to_utf8_string  (string_p, str_start_p, (ssize_t ) buffer_size);
766-   str_start_p[length ] = LIT_BYTE_NULL;
831+   str_start_p[string_size ] = LIT_BYTE_NULL;
767832
768833  ecma_json_token_t  token;
769834  token.current_p  = str_start_p;
770-   token.end_p  = str_start_p + length ;
835+   token.end_p  = str_start_p + string_size ;
771836
772837  ecma_value_t  final_result = ecma_builtin_json_parse_value  (&token);
773838
0 commit comments