Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 55 additions & 60 deletions ext/json/parser/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -477,23 +477,24 @@ static const signed char digit_values[256] = {
-1, -1, -1, -1, -1, -1, -1
};

/*
 * Decode the four hexadecimal digits of a \uXXXX escape sequence.
 *
 * state - parser state, only forwarded to raise_parse_error_at().
 * sp    - first hex digit, i.e. the byte right after the "\u" prefix.
 * spe   - end of the readable input; no byte at or past spe is read.
 *
 * Returns the decoded 16-bit code unit (0x0000..0xFFFF) as a uint32_t.
 * Surrogate pairing is left to the caller.
 *
 * Reports "incomplete unicode character escape sequence" through
 * raise_parse_error_at(), positioned at sp - 2 (the backslash of the
 * escape), when fewer than four bytes remain before spe or when any of
 * the four bytes is not a hexadecimal digit.
 * NOTE(review): this relies on raise_parse_error_at() not returning --
 * confirm against its definition.
 */
static uint32_t unescape_unicode(JSON_ParserState *state, const char *sp, const char *spe)
{
    /* Compare the remaining length instead of computing `spe - 4`, so that
     * no pointer before the start of the buffer is ever formed. */
    if (RB_UNLIKELY((spe - sp) < 4)) {
        raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
    }

    /* digit_values is indexed by byte value, so read the input as unsigned. */
    const unsigned char *p = (const unsigned char *)sp;

    const signed char b0 = digit_values[p[0]];
    const signed char b1 = digit_values[p[1]];
    const signed char b2 = digit_values[p[2]];
    const signed char b3 = digit_values[p[3]];

    /* Non-hex bytes map to a negative table entry; OR-ing the four lookups
     * keeps the sign bit if any of them is negative, so one test suffices. */
    if (RB_UNLIKELY((signed char)(b0 | b1 | b2 | b3) < 0)) {
        raise_parse_error_at("incomplete unicode character escape sequence at %s", state, sp - 2);
    }

    return ((uint32_t)b0 << 12) | ((uint32_t)b1 << 8) | ((uint32_t)b2 << 4) | (uint32_t)b3;
}

#define GET_PARSER_CONFIG \
Expand Down Expand Up @@ -643,7 +644,7 @@ static inline VALUE json_string_fastpath(JSON_ParserState *state, JSON_ParserCon
/*
 * Bookkeeping for backslash locations found while scanning an escaped
 * string. It is filled in by the string scanner and read by
 * json_next_backslash().
 */
typedef struct _json_unescape_positions {
/* Number of entries currently stored in `positions`. */
long size;
/* Caller-provided array of recorded backslash locations (pointers into the input). */
const char **positions;
/* Set in the same branch that increments `additional_backslashes`, and tested
 * before falling back to memchr().
 * NOTE(review): this looks like the pre-change field of this diff, superseded
 * by the counter below -- confirm whether it should remain. */
bool has_more;
/* Count of backslashes seen but not stored in `positions` (presumably because
 * the array was full -- the guarding condition is outside this view). It is
 * decremented each time json_next_backslash() falls back to memchr(). */
unsigned long additional_backslashes;
} JSON_UnescapePositions;

static inline const char *json_next_backslash(const char *pe, const char *stringEnd, JSON_UnescapePositions *positions)
Expand All @@ -657,7 +658,8 @@ static inline const char *json_next_backslash(const char *pe, const char *string
}
}

if (positions->has_more) {
if (positions->additional_backslashes) {
positions->additional_backslashes--;
return memchr(pe, '\\', stringEnd - pe);
}

Expand Down Expand Up @@ -707,50 +709,43 @@ NOINLINE(static) VALUE json_string_unescape(JSON_ParserState *state, JSON_Parser
case 'f':
APPEND_CHAR('\f');
break;
case 'u':
if (pe > stringEnd - 5) {
raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
} else {
uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
pe += 3;
/* To handle values above U+FFFF, we take a sequence of
* \uXXXX escapes in the U+D800..U+DBFF then
* U+DC00..U+DFFF ranges, take the low 10 bits from each
* to make a 20-bit number, then add 0x10000 to get the
* final codepoint.
*
* See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
* Surrogate Pairs in UTF-16", and 23.6 "Surrogates
* Area".
*/
if ((ch & 0xFC00) == 0xD800) {
pe++;
if (pe > stringEnd - 6) {
raise_parse_error_at("incomplete surrogate pair at %s", state, p);
}
if (pe[0] == '\\' && pe[1] == 'u') {
uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);

if ((sur & 0xFC00) != 0xDC00) {
raise_parse_error_at("invalid surrogate pair at %s", state, p);
}

ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
| (sur & 0x3FF));
pe += 5;
} else {
raise_parse_error_at("incomplete surrogate pair at %s", state, p);
break;
case 'u': {
uint32_t ch = unescape_unicode(state, ++pe, stringEnd);
pe += 3;
/* To handle values above U+FFFF, we take a sequence of
* \uXXXX escapes in the U+D800..U+DBFF then
* U+DC00..U+DFFF ranges, take the low 10 bits from each
* to make a 20-bit number, then add 0x10000 to get the
* final codepoint.
*
* See Unicode 15: 3.8 "Surrogates", 5.3 "Handling
* Surrogate Pairs in UTF-16", and 23.6 "Surrogates
* Area".
*/
if ((ch & 0xFC00) == 0xD800) {
pe++;
if (RB_LIKELY((pe <= stringEnd - 6) && memcmp(pe, "\\u", 2) == 0)) {
uint32_t sur = unescape_unicode(state, pe + 2, stringEnd);

if (RB_UNLIKELY((sur & 0xFC00) != 0xDC00)) {
raise_parse_error_at("invalid surrogate pair at %s", state, p);
}
}

char buf[4];
int unescape_len = convert_UTF32_to_UTF8(buf, ch);
MEMCPY(buffer, buf, char, unescape_len);
buffer += unescape_len;
p = ++pe;
ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF));
pe += 5;
} else {
raise_parse_error_at("incomplete surrogate pair at %s", state, p);
break;
}
}

char buf[4];
int unescape_len = convert_UTF32_to_UTF8(buf, ch);
MEMCPY(buffer, buf, char, unescape_len);
buffer += unescape_len;
p = ++pe;
break;
}
default:
if ((unsigned char)*pe < 0x20) {
if (!config->allow_control_characters) {
Expand Down Expand Up @@ -992,7 +987,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi
JSON_UnescapePositions positions = {
.size = 0,
.positions = backslashes,
.has_more = false,
.additional_backslashes = 0,
};

do {
Expand All @@ -1007,7 +1002,7 @@ static VALUE json_parse_escaped_string(JSON_ParserState *state, JSON_ParserConfi
backslashes[positions.size] = state->cursor;
positions.size++;
} else {
positions.has_more = true;
positions.additional_backslashes++;
}
state->cursor++;
break;
Expand Down
11 changes: 7 additions & 4 deletions include/ruby/internal/core/rtypeddata.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,14 +109,17 @@
/** @cond INTERNAL_MACRO */
#define RTYPEDDATA_P RTYPEDDATA_P
#define RTYPEDDATA_TYPE RTYPEDDATA_TYPE
#define TYPED_DATA_EMBEDDED ((VALUE)1)
#define TYPED_DATA_PTR_MASK (~(TYPED_DATA_EMBEDDED))
/** @endcond */

/**
* Macros to see if each corresponding flag is defined.
*/
#define RUBY_TYPED_FREE_IMMEDIATELY RUBY_TYPED_FREE_IMMEDIATELY
#define RUBY_TYPED_FROZEN_SHAREABLE RUBY_TYPED_FROZEN_SHAREABLE
#define RUBY_TYPED_WB_PROTECTED RUBY_TYPED_WB_PROTECTED
#define RUBY_TYPED_PROMOTED1 RUBY_TYPED_PROMOTED1
/** @endcond */

#define TYPED_DATA_EMBEDDED ((VALUE)1)
#define TYPED_DATA_PTR_MASK (~(TYPED_DATA_EMBEDDED))

/**
* @private
Expand Down
1 change: 1 addition & 0 deletions test/json/json_parser_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def test_invalid_surogates
assert_raise(JSON::ParserError) { parse('"\\uD800"') }
assert_raise(JSON::ParserError) { parse('"\\uD800_________________"') }
assert_raise(JSON::ParserError) { parse('"\\uD800\\u0041"') }
assert_raise(JSON::ParserError) { parse('"\\uD800\\u004') }
end

def test_parse_big_integers
Expand Down