Skip to content

Commit 1c8c03a

Browse files
committed
utf-8 masks depend on endianness
Rather than byte-reversing the loaded 32-bit value when checking whether the analysed code units form a valid UTF-8 encoding, we change the mask constants based on the system's endianness.
1 parent 3db8f18 commit 1c8c03a

File tree

1 file changed

+48
-15
lines changed

1 file changed

+48
-15
lines changed

include/boost/json/detail/utf8.hpp

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,30 @@ namespace boost {
2121
namespace json {
2222
namespace detail {
2323

24+
/** Pack four bytes into a 32-bit constant (primary template).

    This is the definition used for every byte order that has no
    explicit specialization, including the default
    `endian::order::little`.

    @param b4 Byte placed in bits 24..31 (most significant).
    @param b3 Byte placed in bits 16..23.
    @param b2 Byte placed in bits 8..15.
    @param b1 Byte placed in bits 0..7 (least significant).

    @return The combined 32-bit value.
*/
template<endian::order = endian::order::little>
constexpr
std::uint32_t
make_u32_impl(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1)
{
    // Widen before shifting: a std::uint8_t operand is promoted to
    // (signed) int, so shifting a value >= 0x80 left by 24 would move
    // a bit into the sign position where int is 32 bits wide.
    return (static_cast<std::uint32_t>(b4) << 24)
        | (static_cast<std::uint32_t>(b3) << 16)
        | (static_cast<std::uint32_t>(b2) << 8)
        | static_cast<std::uint32_t>(b1);
}
31+
32+
template<>
33+
constexpr
34+
std::uint32_t
35+
make_u32_impl<endian::order::big>(
36+
std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1)
37+
{
38+
return (b1 << 24) | (b2 << 16) | (b3 << 8) | b4;
39+
}
40+
41+
constexpr
42+
std::uint32_t
43+
make_u32(std::uint8_t b4, std::uint8_t b3, std::uint8_t b2, std::uint8_t b1)
44+
{
45+
return make_u32_impl<endian::order::native>(b4, b3, b2, b1);
46+
}
47+
2448
template<int N>
2549
std::uint32_t
2650
load_little_endian(void const* p)
@@ -70,46 +94,55 @@ inline
7094
bool
7195
is_valid_utf8(const char* p, uint16_t first)
7296
{
73-
uint32_t v;
97+
std::uint32_t v;
7498
switch(first >> 8)
7599
{
76100
default:
77101
return false;
78102

79103
// 2 bytes, second byte [80, BF]
80104
case 1:
81-
v = load_little_endian<2>(p);
82-
return (v & 0xC000) == 0x8000;
105+
std::memcpy(&v, p, 2);
106+
return ( v & make_u32(0x00,0x00,0xC0,0x00) )
107+
== make_u32(0x00,0x00,0x80,0x00);
83108

84109
// 3 bytes, second byte [A0, BF]
85110
case 2:
86-
v = load_little_endian<3>(p);
87-
return (v & 0xC0E000) == 0x80A000;
111+
std::memcpy(&v, p, 3);
112+
return ( v & make_u32(0x00,0xC0,0xE0,0x00) )
113+
== make_u32(0x00,0x80,0xA0,0x00);
88114

89115
// 3 bytes, second byte [80, BF]
90116
case 3:
91-
v = load_little_endian<3>(p);
92-
return (v & 0xC0C000) == 0x808000;
117+
std::memcpy(&v, p, 3);
118+
return ( v & make_u32(0x00,0xC0,0xC0,0x00) )
119+
== make_u32(0x00,0x80,0x80,0x00);
93120

94121
// 3 bytes, second byte [80, 9F]
95122
case 4:
96-
v = load_little_endian<3>(p);
97-
return (v & 0xC0E000) == 0x808000;
123+
std::memcpy(&v, p, 3);
124+
return ( v & make_u32(0x00,0xC0,0xE0,0x00) )
125+
== make_u32(0x00,0x80,0x80,0x00);
98126

99127
// 4 bytes, second byte [90, BF]
100128
case 5:
101-
v = load_little_endian<4>(p);
102-
return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
129+
std::memcpy(&v, p, 4);
130+
return ( ( ( v & make_u32(0xC0,0xC0,0xF0,0x00) )
131+
+ make_u32(0x7F,0x7F,0x70,0xFF) )
132+
| make_u32(0x00,0x00,0x30,0xFF) )
133+
== make_u32(0x00,0x00,0x30,0xFF);
103134

104135
// 4 bytes, second byte [80, BF]
105136
case 6:
106-
v = load_little_endian<4>(p);
107-
return (v & 0xC0C0C000) == 0x80808000;
137+
std::memcpy(&v, p, 4);
138+
return ( v & make_u32(0xC0,0xC0,0xC0,0x00) )
139+
== make_u32(0x80,0x80,0x80,0x00);
108140

109141
// 4 bytes, second byte [80, 8F]
110142
case 7:
111-
v = load_little_endian<4>(p);
112-
return (v & 0xC0C0F000) == 0x80808000;
143+
std::memcpy(&v, p, 4);
144+
return ( v & make_u32(0xC0,0xC0,0xF0,0x00) )
145+
== make_u32(0x80,0x80,0x80,0x00);
113146
}
114147
}
115148

0 commit comments

Comments
 (0)