Skip to content

Commit cabf7dd

Browse files
committed
reverse constants on big endian
1 parent 43a87fa commit cabf7dd

File tree

2 files changed

+103
-36
lines changed

2 files changed

+103
-36
lines changed

include/boost/json/detail/utf8.hpp

+72-21
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,22 @@
1919
BOOST_JSON_NS_BEGIN
2020
namespace detail {
2121

22+
#ifdef BOOST_JSON_BIG_ENDIAN
23+
# define BOOST_JSON_MK_NUM(b1, b2) 0x ## b2 ## b1
24+
# define BOOST_JSON_MK_NUM2(b1, b2) 0x ## b2 ## b1 ## 0000
25+
# define BOOST_JSON_MK_NUM3(b1, b2, b3) 0x ## b3 ## b2 ## b1 ## 00
26+
# define BOOST_JSON_MK_NUM4(b1, b2, b3, b4) 0x ## b4 ## b3 ## b2 ## b1
27+
# define BOOST_JSON_UTF8_KIND(b) (b & 0xFF)
28+
# define BOOST_JSON_UTF8_LENGTH(b) (b >> 8)
29+
#else
30+
# define BOOST_JSON_MK_NUM(b1, b2) 0x ## b1 ## b2
31+
# define BOOST_JSON_MK_NUM2(b1, b2) 0x ## b1 ## b2
32+
# define BOOST_JSON_MK_NUM3(b1, b2, b3) 0x ## b1 ## b2 ## b3
33+
# define BOOST_JSON_MK_NUM4(b1, b2, b3, b4) 0x ## b1 ## b2 ## b3 ## b4
34+
# define BOOST_JSON_UTF8_KIND(b) (b >> 8)
35+
# define BOOST_JSON_UTF8_LENGTH(b) (b & 0xFF)
36+
#endif
37+
2238
template<int N>
2339
std::uint32_t
2440
load_little_endian(void const* p)
@@ -38,6 +54,7 @@ inline
3854
uint16_t
3955
classify_utf8(char c)
4056
{
57+
// for little endian
4158
// 0x000 = invalid
4259
// 0x102 = 2 bytes, second byte [80, BF]
4360
// 0x203 = 3 bytes, second byte [A0, BF]
@@ -46,6 +63,7 @@ classify_utf8(char c)
4663
// 0x504 = 4 bytes, second byte [90, BF]
4764
// 0x604 = 4 bytes, second byte [80, BF]
4865
// 0x704 = 4 bytes, second byte [80, 8F]
66+
// for big endian the bytes are reversed
4967
static constexpr uint16_t first[128]
5068
{
5169
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
@@ -57,13 +75,41 @@ classify_utf8(char c)
5775
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
5876
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
5977

60-
0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
61-
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
62-
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
63-
0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
64-
0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
65-
0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,
66-
0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
78+
BOOST_JSON_MK_NUM(00, 00), BOOST_JSON_MK_NUM(00, 00),
79+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
80+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
81+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
82+
83+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
84+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
85+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
86+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
87+
88+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
89+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
90+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
91+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
92+
93+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
94+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
95+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
96+
BOOST_JSON_MK_NUM(01, 02), BOOST_JSON_MK_NUM(01, 02),
97+
98+
BOOST_JSON_MK_NUM(02, 03), BOOST_JSON_MK_NUM(03, 03),
99+
BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03),
100+
BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03),
101+
BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03),
102+
103+
BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03),
104+
BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03),
105+
BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(04, 03),
106+
BOOST_JSON_MK_NUM(03, 03), BOOST_JSON_MK_NUM(03, 03),
107+
108+
BOOST_JSON_MK_NUM(05, 04), BOOST_JSON_MK_NUM(06, 04),
109+
BOOST_JSON_MK_NUM(06, 04), BOOST_JSON_MK_NUM(06, 04),
110+
BOOST_JSON_MK_NUM(07, 04), BOOST_JSON_MK_NUM(00, 00),
111+
BOOST_JSON_MK_NUM(00, 00), BOOST_JSON_MK_NUM(00, 00),
112+
67113
0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
68114
};
69115
return first[static_cast<unsigned char>(c & 0x7F)];
@@ -74,30 +120,33 @@ bool
74120
is_valid_utf8(const char* p, uint16_t first)
75121
{
76122
uint32_t v;
77-
switch(first >> 8)
123+
switch(BOOST_JSON_UTF8_KIND(first))
78124
{
79125
default:
80126
return false;
81127

82128
// 2 bytes, second byte [80, BF]
83129
case 1:
84-
v = load_little_endian<2>(p);
85-
return (v & 0xC000) == 0x8000;
130+
std::memcpy(&v, p, 2);
131+
return (v & BOOST_JSON_MK_NUM2(C0,00)) == BOOST_JSON_MK_NUM2(80,00);
86132

87133
// 3 bytes, second byte [A0, BF]
88134
case 2:
89-
v = load_little_endian<3>(p);
90-
return (v & 0xC0E000) == 0x80A000;
135+
std::memcpy(&v, p, 3);
136+
return (v & BOOST_JSON_MK_NUM3(C0,E0,00))
137+
== BOOST_JSON_MK_NUM3(80,A0,00);
91138

92139
// 3 bytes, second byte [80, BF]
93140
case 3:
94-
v = load_little_endian<3>(p);
95-
return (v & 0xC0C000) == 0x808000;
141+
std::memcpy(&v, p, 3);
142+
return (v & BOOST_JSON_MK_NUM3(C0,C0,00))
143+
== BOOST_JSON_MK_NUM3(80,80,00);
96144

97145
// 3 bytes, second byte [80, 9F]
98146
case 4:
99-
v = load_little_endian<3>(p);
100-
return (v & 0xC0E000) == 0x808000;
147+
std::memcpy(&v, p, 3);
148+
return (v & BOOST_JSON_MK_NUM3(C0,E0,00))
149+
== BOOST_JSON_MK_NUM3(80,80,00);
101150

102151
// 4 bytes, second byte [90, BF]
103152
case 5:
@@ -106,13 +155,15 @@ is_valid_utf8(const char* p, uint16_t first)
106155

107156
// 4 bytes, second byte [80, BF]
108157
case 6:
109-
v = load_little_endian<4>(p);
110-
return (v & 0xC0C0C000) == 0x80808000;
158+
std::memcpy(&v, p, 4);
159+
return (v & BOOST_JSON_MK_NUM4(C0,C0,C0,00))
160+
== BOOST_JSON_MK_NUM4(80,80,80,00);
111161

112162
// 4 bytes, second byte [80, 8F]
113163
case 7:
114-
v = load_little_endian<4>(p);
115-
return (v & 0xC0C0F000) == 0x80808000;
164+
std::memcpy(&v, p, 4);
165+
return (v & BOOST_JSON_MK_NUM4(C0,C0,F0,00))
166+
== BOOST_JSON_MK_NUM4(80,80,80,00);
116167
}
117168
}
118169

@@ -139,7 +190,7 @@ class utf8_sequence
139190
uint8_t
140191
length() const noexcept
141192
{
142-
return first_ & 0xFF;
193+
return BOOST_JSON_UTF8_LENGTH(first_);
143194
}
144195

145196
bool

test/utf8.cpp

+31-15
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ BOOST_JSON_NS_BEGIN
1717
class utf8_test
1818
{
1919
public:
20+
::test_suite::log_type log;
21+
2022
void
2123
testLoadLittleEndian()
2224
{
@@ -50,21 +52,30 @@ class utf8_test
5052
void
5153
testClassifyUtf8()
5254
{
53-
BOOST_TEST((detail::classify_utf8('\x00') & 0xFF) == 0);
55+
BOOST_TEST(detail::classify_utf8('\x00') == 0);
5456
// from code point U+0080 (0xC280 in UTF-8)
55-
BOOST_TEST((detail::classify_utf8('\xC2') & 0xFF) == 2);
57+
BOOST_TEST(detail::classify_utf8('\xC2') == BOOST_JSON_MK_NUM(01, 02));
5658
// from code point U+07FF (0xDFBF in UTF-8)
57-
BOOST_TEST((detail::classify_utf8('\xDF') & 0xFF) == 2);
59+
BOOST_TEST(detail::classify_utf8('\xDF') == BOOST_JSON_MK_NUM(01, 02));
5860
// from code point U+0800 (0xE0A080 in UTF-8)
59-
BOOST_TEST((detail::classify_utf8('\xE0') & 0xFF) == 3);
60-
// from code point U+0FFFF (0xEFBFBF in UTF-8)
61-
BOOST_TEST((detail::classify_utf8('\xEF') & 0xFF) == 3);
62-
// from code point U+010000 (0xF0908080 in UTF-8)
63-
BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4);
64-
// from code point U+010000 (0xF0908080 in UTF-8)
65-
BOOST_TEST((detail::classify_utf8('\xF0') & 0xFF) == 4);
66-
// from code point U+010FFFF (0xF48FBFBF in UTF-8)
67-
BOOST_TEST((detail::classify_utf8('\xF4') & 0xFF) == 4);
61+
BOOST_TEST(detail::classify_utf8('\xE0') == BOOST_JSON_MK_NUM(02, 03));
62+
// from code point U+D7B0 (0xED9EB0 in UTF-8)
63+
BOOST_TEST(detail::classify_utf8('\xED') == BOOST_JSON_MK_NUM(04, 03));
64+
// from code point U+FFFF (0xEFBFBF in UTF-8)
65+
BOOST_TEST(detail::classify_utf8('\xEF') == BOOST_JSON_MK_NUM(03, 03));
66+
// from code point U+10000 (0xF0908080 in UTF-8)
67+
BOOST_TEST(detail::classify_utf8('\xF0') == BOOST_JSON_MK_NUM(05, 04));
68+
// from code point U+40000 (0xF1808080 in UTF-8)
69+
BOOST_TEST(detail::classify_utf8('\xF1') == BOOST_JSON_MK_NUM(06, 04));
70+
// from code point U+C0000 (0xF3808080 in UTF-8)
71+
BOOST_TEST(detail::classify_utf8('\xF3') == BOOST_JSON_MK_NUM(06, 04));
72+
// from code point U+10FFFF (0xF48FBFBF in UTF-8)
73+
BOOST_TEST(detail::classify_utf8('\xF4') == BOOST_JSON_MK_NUM(07, 04));
74+
75+
BOOST_TEST(BOOST_JSON_UTF8_KIND(detail::classify_utf8('\xC2')) == 1);
76+
BOOST_TEST(BOOST_JSON_UTF8_LENGTH(detail::classify_utf8('\xC2')) == 2);
77+
BOOST_TEST(BOOST_JSON_UTF8_KIND(detail::classify_utf8('\xF4')) == 7);
78+
BOOST_TEST(BOOST_JSON_UTF8_LENGTH(detail::classify_utf8('\xF4')) == 4);
6879
}
6980

7081
void
@@ -78,9 +89,14 @@ class utf8_test
7889
BOOST_TEST(is_valid_utf8("\xC2\x80")); // code point U+0080
7990
BOOST_TEST(is_valid_utf8("\xDF\xBF")); // code point U+07FF
8091
BOOST_TEST(is_valid_utf8("\xE0\xA0\x80")); // code point U+0800
81-
BOOST_TEST(is_valid_utf8("\xEF\xBF\xBF")); // from code point U+0FFFF
82-
BOOST_TEST(is_valid_utf8("\xF0\x90\x80\x80")); // code point U+010000
83-
BOOST_TEST(is_valid_utf8("\xF4\x8F\xBF\xBF")); // code point U+010FFFF
92+
BOOST_TEST(is_valid_utf8("\xED\x9E\xB0")); // code point U+D7B0
93+
BOOST_TEST(is_valid_utf8("\xEF\xBF\xBF")); // from code point U+FFFF
94+
BOOST_TEST(is_valid_utf8("\xF0\x90\x80\x80")); // code point U+10000
95+
BOOST_TEST(is_valid_utf8("\xF1\x80\x80\x80")); // code point U+80000
96+
BOOST_TEST(is_valid_utf8("\xF3\x80\x80\x80")); // code point U+C00000
97+
BOOST_TEST(is_valid_utf8("\xF4\x8F\xBF\xBF")); // code point U+10FFFF
98+
BOOST_TEST(is_valid_utf8("\xD0\x9F"));
99+
BOOST_TEST(is_valid_utf8("\xF0\x9F\x98\xB9"));
84100

85101
BOOST_TEST(! is_valid_utf8("\x80"));
86102
BOOST_TEST(! is_valid_utf8("\xBF"));

0 commit comments

Comments
 (0)