xref: /llvm-project/libcxx/test/std/localization/codecvt_unicode.pass.cpp (revision 3497500946c9b6a1b2e1452312a24c41ee412b34)
1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS -D_LIBCPP_ENABLE_CXX26_REMOVED_CODECVT
10 
11 // Requires the fix in 390840f.
12 // XFAIL: using-built-library-before-llvm-18
13 
14 #include <algorithm>
15 #include <cassert>
16 #include <codecvt>
17 #include <locale>
18 
19 #include "test_macros.h"
20 
// One row of a "conversion succeeds" table.
struct test_offsets_ok {
  // Number of source elements handed to the conversion; the tests expect all
  // of them to be consumed.
  size_t in_size;
  // Number of destination elements the conversion is expected to produce.
  size_t out_size;
};
// One row of a "conversion stops early with partial" table.
struct test_offsets_partial {
  // Number of source elements handed to the conversion.
  size_t in_size;
  // Number of destination elements made available to the conversion.
  size_t out_size;
  // Offset from the start of the source at which in_next must end up.
  size_t expected_in_next;
  // Offset from the start of the destination at which out_next must end up.
  size_t expected_out_next;
};
31 
// One row of a "conversion fails with error" table. CharT is the type of the
// element that gets injected into the source to corrupt it.
template <typename CharT>
struct test_offsets_error {
  // Number of source elements handed to the conversion.
  size_t in_size;
  // Number of destination elements made available to the conversion.
  size_t out_size;
  // Offset from the start of the source at which in_next must end up.
  size_t expected_in_next;
  // Offset from the start of the destination at which out_next must end up.
  size_t expected_out_next;
  // Value temporarily stored into the source before converting...
  CharT replace_char;
  // ...and the source index at which it is stored.
  size_t replace_pos;
};
41 
// Element count of a built-in array. The argument must be an actual array,
// not a pointer.
#define array_size(x) (sizeof(x) / sizeof((x)[0]))
43 
44 using std::begin;
45 using std::char_traits;
46 using std::codecvt_base;
47 using std::copy;
48 using std::end;
49 
50 template <class InternT, class ExternT>
utf8_to_utf32_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)51 void utf8_to_utf32_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
52   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
53   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
54   const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
55   static_assert(array_size(input) == 11, "");
56   static_assert(array_size(expected) == 5, "");
57 
58   ExternT in[array_size(input)];
59   InternT exp[array_size(expected)];
60   copy(begin(input), end(input), begin(in));
61   copy(begin(expected), end(expected), begin(exp));
62   assert(char_traits<ExternT>::length(in) == 10);
63   assert(char_traits<InternT>::length(exp) == 4);
64   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}};
65   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
66     test_offsets_ok t                = *it;
67     InternT out[array_size(exp) - 1] = {};
68     assert(t.in_size <= array_size(in));
69     assert(t.out_size <= array_size(out));
70     mbstate_t state          = {};
71     const ExternT* in_next   = nullptr;
72     InternT* out_next        = nullptr;
73     codecvt_base::result res = codecvt_base::ok;
74 
75     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
76     assert(res == cvt.ok);
77     assert(in_next == in + t.in_size);
78     assert(out_next == out + t.out_size);
79     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
80     if (t.out_size < array_size(out))
81       assert(out[t.out_size] == 0);
82 
83     state   = mbstate_t();
84     int len = cvt.length(state, in, in + t.in_size, t.out_size);
85     assert(len >= 0);
86     assert(static_cast<size_t>(len) == t.in_size);
87   }
88 
89   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
90     test_offsets_ok t            = *it;
91     InternT out[array_size(exp)] = {};
92     assert(t.in_size <= array_size(in));
93     assert(t.out_size <= array_size(out));
94     mbstate_t state          = {};
95     const ExternT* in_next   = nullptr;
96     InternT* out_next        = nullptr;
97     codecvt_base::result res = codecvt_base::ok;
98 
99     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
100     assert(res == cvt.ok);
101     assert(in_next == in + t.in_size);
102     assert(out_next == out + t.out_size);
103     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
104     if (t.out_size < array_size(out))
105       assert(out[t.out_size] == 0);
106 
107     state   = mbstate_t();
108     int len = cvt.length(state, in, in + t.in_size, array_size(out));
109     assert(len >= 0);
110     assert(static_cast<size_t>(len) == t.in_size);
111   }
112 }
113 
114 template <class InternT, class ExternT>
utf8_to_utf32_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)115 void utf8_to_utf32_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
116   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
117   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
118   const char32_t expected[]   = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
119   static_assert(array_size(input) == 11, "");
120   static_assert(array_size(expected) == 5, "");
121 
122   ExternT in[array_size(input)];
123   InternT exp[array_size(expected)];
124   copy(begin(input), end(input), begin(in));
125   copy(begin(expected), end(expected), begin(exp));
126   assert(char_traits<ExternT>::length(in) == 10);
127   assert(char_traits<InternT>::length(exp) == 4);
128 
129   test_offsets_partial offsets[] = {
130       {1, 0, 0, 0}, // no space for first CP
131 
132       {3, 1, 1, 1}, // no space for second CP
133       {2, 2, 1, 1}, // incomplete second CP
134       {2, 1, 1, 1}, // incomplete second CP, and no space for it
135 
136       {6, 2, 3, 2}, // no space for third CP
137       {4, 3, 3, 2}, // incomplete third CP
138       {5, 3, 3, 2}, // incomplete third CP
139       {4, 2, 3, 2}, // incomplete third CP, and no space for it
140       {5, 2, 3, 2}, // incomplete third CP, and no space for it
141 
142       {10, 3, 6, 3}, // no space for fourth CP
143       {7, 4, 6, 3},  // incomplete fourth CP
144       {8, 4, 6, 3},  // incomplete fourth CP
145       {9, 4, 6, 3},  // incomplete fourth CP
146       {7, 3, 6, 3},  // incomplete fourth CP, and no space for it
147       {8, 3, 6, 3},  // incomplete fourth CP, and no space for it
148       {9, 3, 6, 3},  // incomplete fourth CP, and no space for it
149   };
150 
151   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
152     test_offsets_partial t           = *it;
153     InternT out[array_size(exp) - 1] = {};
154     assert(t.in_size <= array_size(in));
155     assert(t.out_size <= array_size(out));
156     assert(t.expected_in_next <= t.in_size);
157     assert(t.expected_out_next <= t.out_size);
158     mbstate_t state          = {};
159     const ExternT* in_next   = nullptr;
160     InternT* out_next        = nullptr;
161     codecvt_base::result res = codecvt_base::ok;
162 
163     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
164     assert(res == cvt.partial);
165     assert(in_next == in + t.expected_in_next);
166     assert(out_next == out + t.expected_out_next);
167     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
168     if (t.expected_out_next < array_size(out))
169       assert(out[t.expected_out_next] == 0);
170 
171     state   = mbstate_t();
172     int len = cvt.length(state, in, in + t.in_size, t.out_size);
173     assert(len >= 0);
174     assert(static_cast<size_t>(len) == t.expected_in_next);
175   }
176 }
177 
178 template <class InternT, class ExternT>
utf8_to_utf32_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)179 void utf8_to_utf32_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
180   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
181   const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
182   const char32_t expected[]   = {'b', 0x0448, 0xD700, 0x10AAAA, 0};
183   static_assert(array_size(input) == 11, "");
184   static_assert(array_size(expected) == 5, "");
185 
186   ExternT in[array_size(input)];
187   InternT exp[array_size(expected)];
188   copy(begin(input), end(input), begin(in));
189   copy(begin(expected), end(expected), begin(exp));
190   assert(char_traits<ExternT>::length(in) == 10);
191   assert(char_traits<InternT>::length(exp) == 4);
192 
193   // There are 5 classes of errors in UTF-8 decoding
194   // 1. Missing leading byte
195   // 2. Missing trailing byte
196   // 3. Surrogate CP
197   // 4. Overlong sequence
198   // 5. CP out of Unicode range
199   test_offsets_error<unsigned char> offsets[] = {
200 
201       // 1. Missing leading byte. We will replace the leading byte with
202       // non-leading byte, such as a byte that is always invalid or a trailing
203       // byte.
204 
205       // replace leading byte with invalid byte
206       {1, 4, 0, 0, 0xFF, 0},
207       {3, 4, 1, 1, 0xFF, 1},
208       {6, 4, 3, 2, 0xFF, 3},
209       {10, 4, 6, 3, 0xFF, 6},
210 
211       // replace leading byte with trailing byte
212       {1, 4, 0, 0, 0b10101010, 0},
213       {3, 4, 1, 1, 0b10101010, 1},
214       {6, 4, 3, 2, 0b10101010, 3},
215       {10, 4, 6, 3, 0b10101010, 6},
216 
217       // 2. Missing trailing byte. We will replace the trailing byte with
218       // non-trailing byte, such as a byte that is always invalid or a leading
219       // byte (simple ASCII byte in our case).
220 
221       // replace first trailing byte with ASCII byte
222       {3, 4, 1, 1, 'z', 2},
223       {6, 4, 3, 2, 'z', 4},
224       {10, 4, 6, 3, 'z', 7},
225 
226       // replace first trailing byte with invalid byte
227       {3, 4, 1, 1, 0xFF, 2},
228       {6, 4, 3, 2, 0xFF, 4},
229       {10, 4, 6, 3, 0xFF, 7},
230 
231       // replace second trailing byte with ASCII byte
232       {6, 4, 3, 2, 'z', 5},
233       {10, 4, 6, 3, 'z', 8},
234 
235       // replace second trailing byte with invalid byte
236       {6, 4, 3, 2, 0xFF, 5},
237       {10, 4, 6, 3, 0xFF, 8},
238 
239       // replace third trailing byte
240       {10, 4, 6, 3, 'z', 9},
241       {10, 4, 6, 3, 0xFF, 9},
242 
243       // 2.1 The following test-cases raise doubt whether error or partial should
244       // be returned. For example, we have 4-byte sequence with valid leading
245       // byte. If we hide the last byte we need to return partial. But, if the
246       // second or third byte, which are visible to the call to codecvt, are
247       // malformed then error should be returned.
248 
249       // replace first trailing byte with ASCII byte, also incomplete at end
250       {5, 4, 3, 2, 'z', 4},
251       {8, 4, 6, 3, 'z', 7},
252       {9, 4, 6, 3, 'z', 7},
253 
254       // replace first trailing byte with invalid byte, also incomplete at end
255       {5, 4, 3, 2, 0xFF, 4},
256       {8, 4, 6, 3, 0xFF, 7},
257       {9, 4, 6, 3, 0xFF, 7},
258 
259       // replace second trailing byte with ASCII byte, also incomplete at end
260       {9, 4, 6, 3, 'z', 8},
261 
262       // replace second trailing byte with invalid byte, also incomplete at end
263       {9, 4, 6, 3, 0xFF, 8},
264 
265       // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
266       // CP U+D700
267       {6, 4, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
268       {6, 4, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
269       {6, 4, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
270       {6, 4, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
271 
272       // 4. Overlong sequence. The CPs in the input are chosen such as modifying
273       // just the leading byte is enough to make them overlong, i.e. for the
274       // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
275       // zeroes.
276       {3, 4, 1, 1, 0b11000000, 1},  // make the 2-byte CP overlong
277       {3, 4, 1, 1, 0b11000001, 1},  // make the 2-byte CP overlong
278       {6, 4, 3, 2, 0b11100000, 3},  // make the 3-byte CP overlong
279       {10, 4, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
280 
281       // 5. CP above range
282       // turn U+10AAAA into U+14AAAA by changing its leading byte
283       {10, 4, 6, 3, 0b11110101, 6},
284       // turn U+10AAAA into U+11AAAA by changing its 2nd byte
285       {10, 4, 6, 3, 0b10011010, 7},
286   };
287   for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
288     test_offsets_error<unsigned char> t = *it;
289     InternT out[array_size(exp) - 1]    = {};
290     assert(t.in_size <= array_size(in));
291     assert(t.out_size <= array_size(out));
292     assert(t.expected_in_next <= t.in_size);
293     assert(t.expected_out_next <= t.out_size);
294     ExternT old_char  = in[t.replace_pos];
295     in[t.replace_pos] = t.replace_char;
296 
297     mbstate_t state          = {};
298     const ExternT* in_next   = nullptr;
299     InternT* out_next        = nullptr;
300     codecvt_base::result res = codecvt_base::ok;
301 
302     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
303     assert(res == cvt.error);
304     assert(in_next == in + t.expected_in_next);
305     assert(out_next == out + t.expected_out_next);
306     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
307     if (t.expected_out_next < array_size(out))
308       assert(out[t.expected_out_next] == 0);
309 
310     state   = mbstate_t();
311     int len = cvt.length(state, in, in + t.in_size, t.out_size);
312     assert(len >= 0);
313     assert(static_cast<size_t>(len) == t.expected_in_next);
314 
315     in[t.replace_pos] = old_char;
316   }
317 }
318 
319 template <class InternT, class ExternT>
utf8_to_utf32_in(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)320 void utf8_to_utf32_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
321   utf8_to_utf32_in_ok(cvt);
322   utf8_to_utf32_in_partial(cvt);
323   utf8_to_utf32_in_error(cvt);
324 }
325 
326 template <class InternT, class ExternT>
utf32_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)327 void utf32_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
328   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
329   const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
330   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
331   static_assert(array_size(input) == 5, "");
332   static_assert(array_size(expected) == 11, "");
333 
334   InternT in[array_size(input)];
335   ExternT exp[array_size(expected)];
336   copy(begin(input), end(input), begin(in));
337   copy(begin(expected), end(expected), begin(exp));
338   assert(char_traits<InternT>::length(in) == 4);
339   assert(char_traits<ExternT>::length(exp) == 10);
340 
341   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}};
342   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
343     test_offsets_ok t                = *it;
344     ExternT out[array_size(exp) - 1] = {};
345     assert(t.in_size <= array_size(in));
346     assert(t.out_size <= array_size(out));
347     mbstate_t state          = {};
348     const InternT* in_next   = nullptr;
349     ExternT* out_next        = nullptr;
350     codecvt_base::result res = codecvt_base::ok;
351 
352     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
353     assert(res == cvt.ok);
354     assert(in_next == in + t.in_size);
355     assert(out_next == out + t.out_size);
356     assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
357     if (t.out_size < array_size(out))
358       assert(out[t.out_size] == 0);
359   }
360 }
361 
362 template <class InternT, class ExternT>
utf32_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)363 void utf32_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
364   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
365   const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
366   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
367   static_assert(array_size(input) == 5, "");
368   static_assert(array_size(expected) == 11, "");
369 
370   InternT in[array_size(input)];
371   ExternT exp[array_size(expected)];
372   copy(begin(input), end(input), begin(in));
373   copy(begin(expected), end(expected), begin(exp));
374   assert(char_traits<InternT>::length(in) == 4);
375   assert(char_traits<ExternT>::length(exp) == 10);
376 
377   test_offsets_partial offsets[] = {
378       {1, 0, 0, 0}, // no space for first CP
379 
380       {2, 1, 1, 1}, // no space for second CP
381       {2, 2, 1, 1}, // no space for second CP
382 
383       {3, 3, 2, 3}, // no space for third CP
384       {3, 4, 2, 3}, // no space for third CP
385       {3, 5, 2, 3}, // no space for third CP
386 
387       {4, 6, 3, 6}, // no space for fourth CP
388       {4, 7, 3, 6}, // no space for fourth CP
389       {4, 8, 3, 6}, // no space for fourth CP
390       {4, 9, 3, 6}, // no space for fourth CP
391   };
392   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
393     test_offsets_partial t           = *it;
394     ExternT out[array_size(exp) - 1] = {};
395     assert(t.in_size <= array_size(in));
396     assert(t.out_size <= array_size(out));
397     assert(t.expected_in_next <= t.in_size);
398     assert(t.expected_out_next <= t.out_size);
399     mbstate_t state          = {};
400     const InternT* in_next   = nullptr;
401     ExternT* out_next        = nullptr;
402     codecvt_base::result res = codecvt_base::ok;
403 
404     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
405     assert(res == cvt.partial);
406     assert(in_next == in + t.expected_in_next);
407     assert(out_next == out + t.expected_out_next);
408     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
409     if (t.expected_out_next < array_size(out))
410       assert(out[t.expected_out_next] == 0);
411   }
412 }
413 
414 template <class InternT, class ExternT>
utf32_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)415 void utf32_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
416   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
417   const char32_t input[]         = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
418   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
419   static_assert(array_size(input) == 5, "");
420   static_assert(array_size(expected) == 11, "");
421 
422   InternT in[array_size(input)];
423   ExternT exp[array_size(expected)];
424   copy(begin(input), end(input), begin(in));
425   copy(begin(expected), end(expected), begin(exp));
426   assert(char_traits<InternT>::length(in) == 4);
427   assert(char_traits<ExternT>::length(exp) == 10);
428 
429   test_offsets_error<InternT> offsets[] = {
430 
431       // Surrogate CP
432       {4, 10, 0, 0, 0xD800, 0},
433       {4, 10, 1, 1, 0xDBFF, 1},
434       {4, 10, 2, 3, 0xDC00, 2},
435       {4, 10, 3, 6, 0xDFFF, 3},
436 
437       // CP out of range
438       {4, 10, 0, 0, 0x00110000, 0},
439       {4, 10, 1, 1, 0x00110000, 1},
440       {4, 10, 2, 3, 0x00110000, 2},
441       {4, 10, 3, 6, 0x00110000, 3}};
442 
443   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
444     test_offsets_error<InternT> t    = *it;
445     ExternT out[array_size(exp) - 1] = {};
446     assert(t.in_size <= array_size(in));
447     assert(t.out_size <= array_size(out));
448     assert(t.expected_in_next <= t.in_size);
449     assert(t.expected_out_next <= t.out_size);
450     InternT old_char  = in[t.replace_pos];
451     in[t.replace_pos] = t.replace_char;
452 
453     mbstate_t state          = {};
454     const InternT* in_next   = nullptr;
455     ExternT* out_next        = nullptr;
456     codecvt_base::result res = codecvt_base::ok;
457 
458     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
459     assert(res == cvt.error);
460     assert(in_next == in + t.expected_in_next);
461     assert(out_next == out + t.expected_out_next);
462     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
463     if (t.expected_out_next < array_size(out))
464       assert(out[t.expected_out_next] == 0);
465 
466     in[t.replace_pos] = old_char;
467   }
468 }
469 
470 template <class InternT, class ExternT>
utf32_to_utf8_out(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)471 void utf32_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
472   utf32_to_utf8_out_ok(cvt);
473   utf32_to_utf8_out_partial(cvt);
474   utf32_to_utf8_out_error(cvt);
475 }
476 
477 template <class InternT, class ExternT>
test_utf8_utf32_cvt(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)478 void test_utf8_utf32_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
479   utf8_to_utf32_in(cvt);
480   utf32_to_utf8_out(cvt);
481 }
482 
483 template <class InternT, class ExternT>
utf8_to_utf16_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)484 void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
485   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
486   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
487   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
488   static_assert(array_size(input) == 11, "");
489   static_assert(array_size(expected) == 6, "");
490 
491   ExternT in[array_size(input)];
492   InternT exp[array_size(expected)];
493   copy(begin(input), end(input), begin(in));
494   copy(begin(expected), end(expected), begin(exp));
495   assert(char_traits<ExternT>::length(in) == 10);
496   assert(char_traits<InternT>::length(exp) == 5);
497 
498   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}};
499   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
500     test_offsets_ok t                = *it;
501     InternT out[array_size(exp) - 1] = {};
502     assert(t.in_size <= array_size(in));
503     assert(t.out_size <= array_size(out));
504     mbstate_t state          = {};
505     const ExternT* in_next   = nullptr;
506     InternT* out_next        = nullptr;
507     codecvt_base::result res = codecvt_base::ok;
508 
509     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
510     assert(res == cvt.ok);
511     assert(in_next == in + t.in_size);
512     assert(out_next == out + t.out_size);
513     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
514     if (t.out_size < array_size(out))
515       assert(out[t.out_size] == 0);
516 
517     state   = mbstate_t();
518     int len = cvt.length(state, in, in + t.in_size, t.out_size);
519     assert(len >= 0);
520     assert(static_cast<size_t>(len) == t.in_size);
521   }
522 
523   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
524     test_offsets_ok t            = *it;
525     InternT out[array_size(exp)] = {};
526     assert(t.in_size <= array_size(in));
527     assert(t.out_size <= array_size(out));
528     mbstate_t state          = {};
529     const ExternT* in_next   = nullptr;
530     InternT* out_next        = nullptr;
531     codecvt_base::result res = codecvt_base::ok;
532 
533     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
534     assert(res == cvt.ok);
535     assert(in_next == in + t.in_size);
536     assert(out_next == out + t.out_size);
537     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
538     if (t.out_size < array_size(out))
539       assert(out[t.out_size] == 0);
540 
541     state   = mbstate_t();
542     int len = cvt.length(state, in, in + t.in_size, array_size(out));
543     assert(len >= 0);
544     assert(static_cast<size_t>(len) == t.in_size);
545   }
546 }
547 
548 template <class InternT, class ExternT>
utf8_to_utf16_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)549 void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
550   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
551   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
552   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
553   static_assert(array_size(input) == 11, "");
554   static_assert(array_size(expected) == 6, "");
555 
556   ExternT in[array_size(input)];
557   InternT exp[array_size(expected)];
558   copy(begin(input), end(input), begin(in));
559   copy(begin(expected), end(expected), begin(exp));
560   assert(char_traits<ExternT>::length(in) == 10);
561   assert(char_traits<InternT>::length(exp) == 5);
562 
563   test_offsets_partial offsets[] = {
564       {1, 0, 0, 0}, // no space for first CP
565 
566       {3, 1, 1, 1}, // no space for second CP
567       {2, 2, 1, 1}, // incomplete second CP
568       {2, 1, 1, 1}, // incomplete second CP, and no space for it
569 
570       {6, 2, 3, 2}, // no space for third CP
571       {4, 3, 3, 2}, // incomplete third CP
572       {5, 3, 3, 2}, // incomplete third CP
573       {4, 2, 3, 2}, // incomplete third CP, and no space for it
574       {5, 2, 3, 2}, // incomplete third CP, and no space for it
575 
576       {10, 3, 6, 3}, // no space for fourth CP
577       {10, 4, 6, 3}, // no space for fourth CP
578       {7, 5, 6, 3},  // incomplete fourth CP
579       {8, 5, 6, 3},  // incomplete fourth CP
580       {9, 5, 6, 3},  // incomplete fourth CP
581       {7, 3, 6, 3},  // incomplete fourth CP, and no space for it
582       {8, 3, 6, 3},  // incomplete fourth CP, and no space for it
583       {9, 3, 6, 3},  // incomplete fourth CP, and no space for it
584       {7, 4, 6, 3},  // incomplete fourth CP, and no space for it
585       {8, 4, 6, 3},  // incomplete fourth CP, and no space for it
586       {9, 4, 6, 3},  // incomplete fourth CP, and no space for it
587 
588   };
589 
590   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
591     test_offsets_partial t           = *it;
592     InternT out[array_size(exp) - 1] = {};
593     assert(t.in_size <= array_size(in));
594     assert(t.out_size <= array_size(out));
595     assert(t.expected_in_next <= t.in_size);
596     assert(t.expected_out_next <= t.out_size);
597     mbstate_t state          = {};
598     const ExternT* in_next   = nullptr;
599     InternT* out_next        = nullptr;
600     codecvt_base::result res = codecvt_base::ok;
601 
602     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
603     assert(res == cvt.partial);
604     assert(in_next == in + t.expected_in_next);
605     assert(out_next == out + t.expected_out_next);
606     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
607     if (t.expected_out_next < array_size(out))
608       assert(out[t.expected_out_next] == 0);
609 
610     state   = mbstate_t();
611     int len = cvt.length(state, in, in + t.in_size, t.out_size);
612     assert(len >= 0);
613     assert(static_cast<size_t>(len) == t.expected_in_next);
614   }
615 }
616 
617 template <class InternT, class ExternT>
utf8_to_utf16_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)618 void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
619   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
620   const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
621   const char16_t expected[]   = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
622   static_assert(array_size(input) == 11, "");
623   static_assert(array_size(expected) == 6, "");
624 
625   ExternT in[array_size(input)];
626   InternT exp[array_size(expected)];
627   copy(begin(input), end(input), begin(in));
628   copy(begin(expected), end(expected), begin(exp));
629   assert(char_traits<ExternT>::length(in) == 10);
630   assert(char_traits<InternT>::length(exp) == 5);
631 
632   // There are 5 classes of errors in UTF-8 decoding
633   // 1. Missing leading byte
634   // 2. Missing trailing byte
635   // 3. Surrogate CP
636   // 4. Overlong sequence
637   // 5. CP out of Unicode range
638   test_offsets_error<unsigned char> offsets[] = {
639 
640       // 1. Missing leading byte. We will replace the leading byte with
641       // non-leading byte, such as a byte that is always invalid or a trailing
642       // byte.
643 
644       // replace leading byte with invalid byte
645       {1, 5, 0, 0, 0xFF, 0},
646       {3, 5, 1, 1, 0xFF, 1},
647       {6, 5, 3, 2, 0xFF, 3},
648       {10, 5, 6, 3, 0xFF, 6},
649 
650       // replace leading byte with trailing byte
651       {1, 5, 0, 0, 0b10101010, 0},
652       {3, 5, 1, 1, 0b10101010, 1},
653       {6, 5, 3, 2, 0b10101010, 3},
654       {10, 5, 6, 3, 0b10101010, 6},
655 
656       // 2. Missing trailing byte. We will replace the trailing byte with
657       // non-trailing byte, such as a byte that is always invalid or a leading
658       // byte (simple ASCII byte in our case).
659 
660       // replace first trailing byte with ASCII byte
661       {3, 5, 1, 1, 'z', 2},
662       {6, 5, 3, 2, 'z', 4},
663       {10, 5, 6, 3, 'z', 7},
664 
665       // replace first trailing byte with invalid byte
666       {3, 5, 1, 1, 0xFF, 2},
667       {6, 5, 3, 2, 0xFF, 4},
668       {10, 5, 6, 3, 0xFF, 7},
669 
670       // replace second trailing byte with ASCII byte
671       {6, 5, 3, 2, 'z', 5},
672       {10, 5, 6, 3, 'z', 8},
673 
674       // replace second trailing byte with invalid byte
675       {6, 5, 3, 2, 0xFF, 5},
676       {10, 5, 6, 3, 0xFF, 8},
677 
678       // replace third trailing byte
679       {10, 5, 6, 3, 'z', 9},
680       {10, 5, 6, 3, 0xFF, 9},
681 
682       // 2.1 The following test-cases raise doubt whether error or partial should
683       // be returned. For example, we have 4-byte sequence with valid leading
684       // byte. If we hide the last byte we need to return partial. But, if the
685       // second or third byte, which are visible to the call to codecvt, are
686       // malformed then error should be returned.
687 
688       // replace first trailing byte with ASCII byte, also incomplete at end
689       {5, 5, 3, 2, 'z', 4},
690       {8, 5, 6, 3, 'z', 7},
691       {9, 5, 6, 3, 'z', 7},
692 
693       // replace first trailing byte with invalid byte, also incomplete at end
694       {5, 5, 3, 2, 0xFF, 4},
695       {8, 5, 6, 3, 0xFF, 7},
696       {9, 5, 6, 3, 0xFF, 7},
697 
698       // replace second trailing byte with ASCII byte, also incomplete at end
699       {9, 5, 6, 3, 'z', 8},
700 
701       // replace second trailing byte with invalid byte, also incomplete at end
702       {9, 5, 6, 3, 0xFF, 8},
703 
704       // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
705       // CP U+D700
706       {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
707       {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
708       {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
709       {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
710 
711       // 4. Overlong sequence. The CPs in the input are chosen such as modifying
712       // just the leading byte is enough to make them overlong, i.e. for the
713       // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
714       // zeroes.
715       {3, 5, 1, 1, 0b11000000, 1},  // make the 2-byte CP overlong
716       {3, 5, 1, 1, 0b11000001, 1},  // make the 2-byte CP overlong
717       {6, 5, 3, 2, 0b11100000, 3},  // make the 3-byte CP overlong
718       {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
719 
720       // 5. CP above range
721       // turn U+10AAAA into U+14AAAA by changing its leading byte
722       {10, 5, 6, 3, 0b11110101, 6},
723       // turn U+10AAAA into U+11AAAA by changing its 2nd byte
724       {10, 5, 6, 3, 0b10011010, 7},
725   };
726   for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
727     test_offsets_error<unsigned char> t = *it;
728     InternT out[array_size(exp) - 1]    = {};
729     assert(t.in_size <= array_size(in));
730     assert(t.out_size <= array_size(out));
731     assert(t.expected_in_next <= t.in_size);
732     assert(t.expected_out_next <= t.out_size);
733     ExternT old_char  = in[t.replace_pos];
734     in[t.replace_pos] = t.replace_char;
735 
736     mbstate_t state          = {};
737     const ExternT* in_next   = nullptr;
738     InternT* out_next        = nullptr;
739     codecvt_base::result res = codecvt_base::ok;
740 
741     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
742     assert(res == cvt.error);
743     assert(in_next == in + t.expected_in_next);
744     assert(out_next == out + t.expected_out_next);
745     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
746     if (t.expected_out_next < array_size(out))
747       assert(out[t.expected_out_next] == 0);
748 
749     state   = mbstate_t();
750     int len = cvt.length(state, in, in + t.in_size, t.out_size);
751     assert(len >= 0);
752     assert(static_cast<size_t>(len) == t.expected_in_next);
753 
754     in[t.replace_pos] = old_char;
755   }
756 }
757 
// Driver for the UTF-8 to UTF-16 direction: runs the ok, partial and error
// scenario groups, in that order, against the supplied facet.
template <class InternT, class ExternT>
void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in_ok(cvt);
  utf8_to_utf16_in_partial(cvt);
  utf8_to_utf16_in_error(cvt);
}
764 
765 template <class InternT, class ExternT>
utf16_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)766 void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
767   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
768   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
769   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
770   static_assert(array_size(input) == 6, "");
771   static_assert(array_size(expected) == 11, "");
772 
773   InternT in[array_size(input)];
774   ExternT exp[array_size(expected)];
775   copy(begin(input), end(input), begin(in));
776   copy(begin(expected), end(expected), begin(exp));
777   assert(char_traits<InternT>::length(in) == 5);
778   assert(char_traits<ExternT>::length(exp) == 10);
779 
780   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}};
781   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
782     test_offsets_ok t                = *it;
783     ExternT out[array_size(exp) - 1] = {};
784     assert(t.in_size <= array_size(in));
785     assert(t.out_size <= array_size(out));
786     mbstate_t state          = {};
787     const InternT* in_next   = nullptr;
788     ExternT* out_next        = nullptr;
789     codecvt_base::result res = codecvt_base::ok;
790 
791     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
792     assert(res == cvt.ok);
793     assert(in_next == in + t.in_size);
794     assert(out_next == out + t.out_size);
795     assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
796     if (t.out_size < array_size(out))
797       assert(out[t.out_size] == 0);
798   }
799 }
800 
801 template <class InternT, class ExternT>
utf16_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)802 void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
803   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
804   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
805   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
806   static_assert(array_size(input) == 6, "");
807   static_assert(array_size(expected) == 11, "");
808 
809   InternT in[array_size(input)];
810   ExternT exp[array_size(expected)];
811   copy(begin(input), end(input), begin(in));
812   copy(begin(expected), end(expected), begin(exp));
813   assert(char_traits<InternT>::length(in) == 5);
814   assert(char_traits<ExternT>::length(exp) == 10);
815 
816   test_offsets_partial offsets[] = {
817       {1, 0, 0, 0}, // no space for first CP
818 
819       {2, 1, 1, 1}, // no space for second CP
820       {2, 2, 1, 1}, // no space for second CP
821 
822       {3, 3, 2, 3}, // no space for third CP
823       {3, 4, 2, 3}, // no space for third CP
824       {3, 5, 2, 3}, // no space for third CP
825 
826       {5, 6, 3, 6}, // no space for fourth CP
827       {5, 7, 3, 6}, // no space for fourth CP
828       {5, 8, 3, 6}, // no space for fourth CP
829       {5, 9, 3, 6}, // no space for fourth CP
830 
831       {4, 10, 3, 6}, // incomplete fourth CP
832 
833       {4, 6, 3, 6}, // incomplete fourth CP, and no space for it
834       {4, 7, 3, 6}, // incomplete fourth CP, and no space for it
835       {4, 8, 3, 6}, // incomplete fourth CP, and no space for it
836       {4, 9, 3, 6}, // incomplete fourth CP, and no space for it
837   };
838   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
839     test_offsets_partial t           = *it;
840     ExternT out[array_size(exp) - 1] = {};
841     assert(t.in_size <= array_size(in));
842     assert(t.out_size <= array_size(out));
843     assert(t.expected_in_next <= t.in_size);
844     assert(t.expected_out_next <= t.out_size);
845     mbstate_t state          = {};
846     const InternT* in_next   = nullptr;
847     ExternT* out_next        = nullptr;
848     codecvt_base::result res = codecvt_base::ok;
849 
850     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
851     assert(res == cvt.partial);
852     assert(in_next == in + t.expected_in_next);
853     assert(out_next == out + t.expected_out_next);
854     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
855     if (t.expected_out_next < array_size(out))
856       assert(out[t.expected_out_next] == 0);
857   }
858 }
859 
860 template <class InternT, class ExternT>
utf16_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)861 void utf16_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
862   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
863   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
864   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
865   static_assert(array_size(input) == 6, "");
866   static_assert(array_size(expected) == 11, "");
867 
868   InternT in[array_size(input)];
869   ExternT exp[array_size(expected)];
870   copy(begin(input), end(input), begin(in));
871   copy(begin(expected), end(expected), begin(exp));
872   assert(char_traits<InternT>::length(in) == 5);
873   assert(char_traits<ExternT>::length(exp) == 10);
874 
875   // The only possible error in UTF-16 is unpaired surrogate code units.
876   // So we replace valid code points (scalar values) with lone surrogate CU.
877   test_offsets_error<InternT> offsets[] = {
878       {5, 10, 0, 0, 0xD800, 0},
879       {5, 10, 0, 0, 0xDBFF, 0},
880       {5, 10, 0, 0, 0xDC00, 0},
881       {5, 10, 0, 0, 0xDFFF, 0},
882 
883       {5, 10, 1, 1, 0xD800, 1},
884       {5, 10, 1, 1, 0xDBFF, 1},
885       {5, 10, 1, 1, 0xDC00, 1},
886       {5, 10, 1, 1, 0xDFFF, 1},
887 
888       {5, 10, 2, 3, 0xD800, 2},
889       {5, 10, 2, 3, 0xDBFF, 2},
890       {5, 10, 2, 3, 0xDC00, 2},
891       {5, 10, 2, 3, 0xDFFF, 2},
892 
893       // make the leading surrogate a trailing one
894       {5, 10, 3, 6, 0xDC00, 3},
895       {5, 10, 3, 6, 0xDFFF, 3},
896 
897       // make the trailing surrogate a leading one
898       {5, 10, 3, 6, 0xD800, 4},
899       {5, 10, 3, 6, 0xDBFF, 4},
900 
901       // make the trailing surrogate a BMP char
902       {5, 10, 3, 6, 'z', 4},
903   };
904 
905   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
906     test_offsets_error<InternT> t    = *it;
907     ExternT out[array_size(exp) - 1] = {};
908     assert(t.in_size <= array_size(in));
909     assert(t.out_size <= array_size(out));
910     assert(t.expected_in_next <= t.in_size);
911     assert(t.expected_out_next <= t.out_size);
912     InternT old_char  = in[t.replace_pos];
913     in[t.replace_pos] = t.replace_char;
914 
915     mbstate_t state          = {};
916     const InternT* in_next   = nullptr;
917     ExternT* out_next        = nullptr;
918     codecvt_base::result res = codecvt_base::ok;
919 
920     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
921     assert(res == cvt.error);
922     assert(in_next == in + t.expected_in_next);
923     assert(out_next == out + t.expected_out_next);
924     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
925     if (t.expected_out_next < array_size(out))
926       assert(out[t.expected_out_next] == 0);
927 
928     in[t.replace_pos] = old_char;
929   }
930 }
931 
// Driver for the UTF-16 to UTF-8 direction (cvt.out): runs the ok, partial and
// error scenario groups, in that order, against the supplied facet.
template <class InternT, class ExternT>
void utf16_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf16_to_utf8_out_ok(cvt);
  utf16_to_utf8_out_partial(cvt);
  utf16_to_utf8_out_error(cvt);
}
938 
// Exercises a UTF-8 <-> UTF-16 facet in both directions: first the decoding
// checks (utf8_to_utf16_in), then the encoding checks (utf16_to_utf8_out).
template <class InternT, class ExternT>
void test_utf8_utf16_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_utf16_in(cvt);
  utf16_to_utf8_out(cvt);
}
944 
945 template <class InternT, class ExternT>
utf8_to_ucs2_in_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)946 void utf8_to_ucs2_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
947   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
948   const unsigned char input[] = "b\u0448\uAAAA";
949   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0};
950   static_assert(array_size(input) == 7, "");
951   static_assert(array_size(expected) == 4, "");
952 
953   ExternT in[array_size(input)];
954   InternT exp[array_size(expected)];
955   copy(begin(input), end(input), begin(in));
956   copy(begin(expected), end(expected), begin(exp));
957   assert(char_traits<ExternT>::length(in) == 6);
958   assert(char_traits<InternT>::length(exp) == 3);
959 
960   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}};
961   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
962     test_offsets_ok t                = *it;
963     InternT out[array_size(exp) - 1] = {};
964     assert(t.in_size <= array_size(in));
965     assert(t.out_size <= array_size(out));
966     mbstate_t state          = {};
967     const ExternT* in_next   = nullptr;
968     InternT* out_next        = nullptr;
969     codecvt_base::result res = codecvt_base::ok;
970 
971     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
972     assert(res == cvt.ok);
973     assert(in_next == in + t.in_size);
974     assert(out_next == out + t.out_size);
975     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
976     if (t.out_size < array_size(out))
977       assert(out[t.out_size] == 0);
978 
979     state   = mbstate_t();
980     int len = cvt.length(state, in, in + t.in_size, t.out_size);
981     assert(len >= 0);
982     assert(static_cast<size_t>(len) == t.in_size);
983   }
984 
985   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
986     test_offsets_ok t            = *it;
987     InternT out[array_size(exp)] = {};
988     assert(t.in_size <= array_size(in));
989     assert(t.out_size <= array_size(out));
990     mbstate_t state          = {};
991     const ExternT* in_next   = nullptr;
992     InternT* out_next        = nullptr;
993     codecvt_base::result res = codecvt_base::ok;
994 
995     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
996     assert(res == cvt.ok);
997     assert(in_next == in + t.in_size);
998     assert(out_next == out + t.out_size);
999     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1000     if (t.out_size < array_size(out))
1001       assert(out[t.out_size] == 0);
1002 
1003     state   = mbstate_t();
1004     int len = cvt.length(state, in, in + t.in_size, array_size(out));
1005     assert(len >= 0);
1006     assert(static_cast<size_t>(len) == t.in_size);
1007   }
1008 }
1009 
1010 template <class InternT, class ExternT>
utf8_to_ucs2_in_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1011 void utf8_to_ucs2_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1012   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1013   const unsigned char input[] = "b\u0448\uAAAA";
1014   const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0};
1015   static_assert(array_size(input) == 7, "");
1016   static_assert(array_size(expected) == 4, "");
1017 
1018   ExternT in[array_size(input)];
1019   InternT exp[array_size(expected)];
1020   copy(begin(input), end(input), begin(in));
1021   copy(begin(expected), end(expected), begin(exp));
1022   assert(char_traits<ExternT>::length(in) == 6);
1023   assert(char_traits<InternT>::length(exp) == 3);
1024 
1025   test_offsets_partial offsets[] = {
1026       {1, 0, 0, 0}, // no space for first CP
1027 
1028       {3, 1, 1, 1}, // no space for second CP
1029       {2, 2, 1, 1}, // incomplete second CP
1030       {2, 1, 1, 1}, // incomplete second CP, and no space for it
1031 
1032       {6, 2, 3, 2}, // no space for third CP
1033       {4, 3, 3, 2}, // incomplete third CP
1034       {5, 3, 3, 2}, // incomplete third CP
1035       {4, 2, 3, 2}, // incomplete third CP, and no space for it
1036       {5, 2, 3, 2}, // incomplete third CP, and no space for it
1037   };
1038 
1039   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1040     test_offsets_partial t           = *it;
1041     InternT out[array_size(exp) - 1] = {};
1042     assert(t.in_size <= array_size(in));
1043     assert(t.out_size <= array_size(out));
1044     assert(t.expected_in_next <= t.in_size);
1045     assert(t.expected_out_next <= t.out_size);
1046     mbstate_t state          = {};
1047     const ExternT* in_next   = nullptr;
1048     InternT* out_next        = nullptr;
1049     codecvt_base::result res = codecvt_base::ok;
1050 
1051     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1052     assert(res == cvt.partial);
1053     assert(in_next == in + t.expected_in_next);
1054     assert(out_next == out + t.expected_out_next);
1055     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1056     if (t.expected_out_next < array_size(out))
1057       assert(out[t.expected_out_next] == 0);
1058 
1059     state   = mbstate_t();
1060     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1061     assert(len >= 0);
1062     assert(static_cast<size_t>(len) == t.expected_in_next);
1063   }
1064 }
1065 
1066 template <class InternT, class ExternT>
utf8_to_ucs2_in_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1067 void utf8_to_ucs2_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1068   const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
1069   const char16_t expected[]   = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
1070   static_assert(array_size(input) == 11, "");
1071   static_assert(array_size(expected) == 6, "");
1072 
1073   ExternT in[array_size(input)];
1074   InternT exp[array_size(expected)];
1075   copy(begin(input), end(input), begin(in));
1076   copy(begin(expected), end(expected), begin(exp));
1077   assert(char_traits<ExternT>::length(in) == 10);
1078   assert(char_traits<InternT>::length(exp) == 5);
1079 
1080   // There are 5 classes of errors in UTF-8 decoding
1081   // 1. Missing leading byte
1082   // 2. Missing trailing byte
1083   // 3. Surrogate CP
1084   // 4. Overlong sequence
1085   // 5. CP out of Unicode range
1086   test_offsets_error<unsigned char> offsets[] = {
1087 
1088       // 1. Missing leading byte. We will replace the leading byte with
1089       // non-leading byte, such as a byte that is always invalid or a trailing
1090       // byte.
1091 
1092       // replace leading byte with invalid byte
1093       {1, 5, 0, 0, 0xFF, 0},
1094       {3, 5, 1, 1, 0xFF, 1},
1095       {6, 5, 3, 2, 0xFF, 3},
1096       {10, 5, 6, 3, 0xFF, 6},
1097 
1098       // replace leading byte with trailing byte
1099       {1, 5, 0, 0, 0b10101010, 0},
1100       {3, 5, 1, 1, 0b10101010, 1},
1101       {6, 5, 3, 2, 0b10101010, 3},
1102       {10, 5, 6, 3, 0b10101010, 6},
1103 
1104       // 2. Missing trailing byte. We will replace the trailing byte with
1105       // non-trailing byte, such as a byte that is always invalid or a leading
1106       // byte (simple ASCII byte in our case).
1107 
1108       // replace first trailing byte with ASCII byte
1109       {3, 5, 1, 1, 'z', 2},
1110       {6, 5, 3, 2, 'z', 4},
1111       {10, 5, 6, 3, 'z', 7},
1112 
1113       // replace first trailing byte with invalid byte
1114       {3, 5, 1, 1, 0xFF, 2},
1115       {6, 5, 3, 2, 0xFF, 4},
1116       {10, 5, 6, 3, 0xFF, 7},
1117 
1118       // replace second trailing byte with ASCII byte
1119       {6, 5, 3, 2, 'z', 5},
1120       {10, 5, 6, 3, 'z', 8},
1121 
1122       // replace second trailing byte with invalid byte
1123       {6, 5, 3, 2, 0xFF, 5},
1124       {10, 5, 6, 3, 0xFF, 8},
1125 
1126       // replace third trailing byte
1127       {10, 5, 6, 3, 'z', 9},
1128       {10, 5, 6, 3, 0xFF, 9},
1129 
1130       // 2.1 The following test-cases raise doubt whether error or partial should
1131       // be returned. For example, we have 4-byte sequence with valid leading
1132       // byte. If we hide the last byte we need to return partial. But, if the
1133       // second or third byte, which are visible to the call to codecvt, are
1134       // malformed then error should be returned.
1135 
1136       // replace first trailing byte with ASCII byte, also incomplete at end
1137       {5, 5, 3, 2, 'z', 4},
1138       {8, 5, 6, 3, 'z', 7},
1139       {9, 5, 6, 3, 'z', 7},
1140 
1141       // replace first trailing byte with invalid byte, also incomplete at end
1142       {5, 5, 3, 2, 0xFF, 4},
1143       {8, 5, 6, 3, 0xFF, 7},
1144       {9, 5, 6, 3, 0xFF, 7},
1145 
1146       // replace second trailing byte with ASCII byte, also incomplete at end
1147       {9, 5, 6, 3, 'z', 8},
1148 
1149       // replace second trailing byte with invalid byte, also incomplete at end
1150       {9, 5, 6, 3, 0xFF, 8},
1151 
1152       // 3. Surrogate CP. We modify the second byte (first trailing) of the 3-byte
1153       // CP U+D700
1154       {6, 5, 3, 2, 0b10100000, 4}, // turn U+D700 into U+D800
1155       {6, 5, 3, 2, 0b10101100, 4}, // turn U+D700 into U+DB00
1156       {6, 5, 3, 2, 0b10110000, 4}, // turn U+D700 into U+DC00
1157       {6, 5, 3, 2, 0b10111100, 4}, // turn U+D700 into U+DF00
1158 
1159       // 4. Overlong sequence. The CPs in the input are chosen such as modifying
1160       // just the leading byte is enough to make them overlong, i.e. for the
1161       // 3-byte and 4-byte CP the second byte (first trailing) has enough leading
1162       // zeroes.
1163       {3, 5, 1, 1, 0b11000000, 1},  // make the 2-byte CP overlong
1164       {3, 5, 1, 1, 0b11000001, 1},  // make the 2-byte CP overlong
1165       {6, 5, 3, 2, 0b11100000, 3},  // make the 3-byte CP overlong
1166       {10, 5, 6, 3, 0b11110000, 6}, // make the 4-byte CP overlong
1167 
1168       // 5. CP above range
1169       // turn U+10AAAA into U+14AAAA by changing its leading byte
1170       {10, 5, 6, 3, 0b11110101, 6},
1171       // turn U+10AAAA into U+11AAAA by changing its 2nd byte
1172       {10, 5, 6, 3, 0b10011010, 7},
1173       // Don't replace anything, show full 4-byte CP U+10AAAA
1174       {10, 4, 6, 3, 'b', 0},
1175       {10, 5, 6, 3, 'b', 0},
1176       // Don't replace anything, show incomplete 4-byte CP at the end. It's still
1177       // out of UCS2 range just by seeing the first byte.
1178       {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1179       {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1180       {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP
1181       {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1182       {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1183       {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP
1184   };
1185   for (test_offsets_error<unsigned char>* it = begin(offsets); it != end(offsets); ++it) {
1186     test_offsets_error<unsigned char> t = *it;
1187     InternT out[array_size(exp) - 1]    = {};
1188     assert(t.in_size <= array_size(in));
1189     assert(t.out_size <= array_size(out));
1190     assert(t.expected_in_next <= t.in_size);
1191     assert(t.expected_out_next <= t.out_size);
1192     ExternT old_char  = in[t.replace_pos];
1193     in[t.replace_pos] = t.replace_char;
1194 
1195     mbstate_t state          = {};
1196     const ExternT* in_next   = nullptr;
1197     InternT* out_next        = nullptr;
1198     codecvt_base::result res = codecvt_base::ok;
1199 
1200     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1201     assert(res == cvt.error);
1202     assert(in_next == in + t.expected_in_next);
1203     assert(out_next == out + t.expected_out_next);
1204     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1205     if (t.expected_out_next < array_size(out))
1206       assert(out[t.expected_out_next] == 0);
1207 
1208     state   = mbstate_t();
1209     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1210     assert(len >= 0);
1211     assert(static_cast<size_t>(len) == t.expected_in_next);
1212 
1213     in[t.replace_pos] = old_char;
1214   }
1215 }
1216 
// Driver for the UTF-8 to UCS-2 direction (cvt.in / cvt.length): runs the ok,
// partial and error scenario groups, in that order, against the supplied facet.
template <class InternT, class ExternT>
void utf8_to_ucs2_in(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in_ok(cvt);
  utf8_to_ucs2_in_partial(cvt);
  utf8_to_ucs2_in_error(cvt);
}
1223 
1224 template <class InternT, class ExternT>
ucs2_to_utf8_out_ok(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1225 void ucs2_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1226   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1227   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0};
1228   const unsigned char expected[] = "b\u0448\uAAAA";
1229   static_assert(array_size(input) == 4, "");
1230   static_assert(array_size(expected) == 7, "");
1231 
1232   InternT in[array_size(input)];
1233   ExternT exp[array_size(expected)];
1234   copy(begin(input), end(input), begin(in));
1235   copy(begin(expected), end(expected), begin(exp));
1236   assert(char_traits<InternT>::length(in) == 3);
1237   assert(char_traits<ExternT>::length(exp) == 6);
1238 
1239   test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}};
1240   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1241     test_offsets_ok t                = *it;
1242     ExternT out[array_size(exp) - 1] = {};
1243     assert(t.in_size <= array_size(in));
1244     assert(t.out_size <= array_size(out));
1245     mbstate_t state          = {};
1246     const InternT* in_next   = nullptr;
1247     ExternT* out_next        = nullptr;
1248     codecvt_base::result res = codecvt_base::ok;
1249 
1250     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1251     assert(res == cvt.ok);
1252     assert(in_next == in + t.in_size);
1253     assert(out_next == out + t.out_size);
1254     assert(char_traits<ExternT>::compare(out, exp, t.out_size) == 0);
1255     if (t.out_size < array_size(out))
1256       assert(out[t.out_size] == 0);
1257   }
1258 }
1259 
1260 template <class InternT, class ExternT>
ucs2_to_utf8_out_partial(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1261 void ucs2_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1262   // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP
1263   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0};
1264   const unsigned char expected[] = "b\u0448\uAAAA";
1265   static_assert(array_size(input) == 4, "");
1266   static_assert(array_size(expected) == 7, "");
1267 
1268   InternT in[array_size(input)];
1269   ExternT exp[array_size(expected)];
1270   copy(begin(input), end(input), begin(in));
1271   copy(begin(expected), end(expected), begin(exp));
1272   assert(char_traits<InternT>::length(in) == 3);
1273   assert(char_traits<ExternT>::length(exp) == 6);
1274 
1275   test_offsets_partial offsets[] = {
1276       {1, 0, 0, 0}, // no space for first CP
1277 
1278       {2, 1, 1, 1}, // no space for second CP
1279       {2, 2, 1, 1}, // no space for second CP
1280 
1281       {3, 3, 2, 3}, // no space for third CP
1282       {3, 4, 2, 3}, // no space for third CP
1283       {3, 5, 2, 3}, // no space for third CP
1284   };
1285   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1286     test_offsets_partial t           = *it;
1287     ExternT out[array_size(exp) - 1] = {};
1288     assert(t.in_size <= array_size(in));
1289     assert(t.out_size <= array_size(out));
1290     assert(t.expected_in_next <= t.in_size);
1291     assert(t.expected_out_next <= t.out_size);
1292     mbstate_t state          = {};
1293     const InternT* in_next   = nullptr;
1294     ExternT* out_next        = nullptr;
1295     codecvt_base::result res = codecvt_base::ok;
1296 
1297     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1298     assert(res == cvt.partial);
1299     assert(in_next == in + t.expected_in_next);
1300     assert(out_next == out + t.expected_out_next);
1301     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1302     if (t.expected_out_next < array_size(out))
1303       assert(out[t.expected_out_next] == 0);
1304   }
1305 }
1306 
1307 template <class InternT, class ExternT>
ucs2_to_utf8_out_error(const std::codecvt<InternT,ExternT,mbstate_t> & cvt)1308 void ucs2_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
1309   const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1310   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
1311   static_assert(array_size(input) == 6, "");
1312   static_assert(array_size(expected) == 11, "");
1313 
1314   InternT in[array_size(input)];
1315   ExternT exp[array_size(expected)];
1316   copy(begin(input), end(input), begin(in));
1317   copy(begin(expected), end(expected), begin(exp));
1318   assert(char_traits<InternT>::length(in) == 5);
1319   assert(char_traits<ExternT>::length(exp) == 10);
1320 
1321   test_offsets_error<InternT> offsets[] = {
1322       {3, 6, 0, 0, 0xD800, 0},
1323       {3, 6, 0, 0, 0xDBFF, 0},
1324       {3, 6, 0, 0, 0xDC00, 0},
1325       {3, 6, 0, 0, 0xDFFF, 0},
1326 
1327       {3, 6, 1, 1, 0xD800, 1},
1328       {3, 6, 1, 1, 0xDBFF, 1},
1329       {3, 6, 1, 1, 0xDC00, 1},
1330       {3, 6, 1, 1, 0xDFFF, 1},
1331 
1332       {3, 6, 2, 3, 0xD800, 2},
1333       {3, 6, 2, 3, 0xDBFF, 2},
1334       {3, 6, 2, 3, 0xDC00, 2},
1335       {3, 6, 2, 3, 0xDFFF, 2},
1336 
1337       // make the leading surrogate a trailing one
1338       {5, 10, 3, 6, 0xDC00, 3},
1339       {5, 10, 3, 6, 0xDFFF, 3},
1340 
1341       // make the trailing surrogate a leading one
1342       {5, 10, 3, 6, 0xD800, 4},
1343       {5, 10, 3, 6, 0xDBFF, 4},
1344 
1345       // make the trailing surrogate a BMP char
1346       {5, 10, 3, 6, 'z', 4},
1347 
1348       // don't replace anything in the test cases bellow, just show the surrogate
1349       // pair (fourth CP) fully or partially
1350       {5, 10, 3, 6, 'b', 0},
1351       {5, 7, 3, 6, 'b', 0}, // no space for fourth CP
1352       {5, 8, 3, 6, 'b', 0}, // no space for fourth CP
1353       {5, 9, 3, 6, 'b', 0}, // no space for fourth CP
1354 
1355       {4, 10, 3, 6, 'b', 0}, // incomplete fourth CP
1356       {4, 7, 3, 6, 'b', 0},  // incomplete fourth CP, and no space for it
1357       {4, 8, 3, 6, 'b', 0},  // incomplete fourth CP, and no space for it
1358       {4, 9, 3, 6, 'b', 0},  // incomplete fourth CP, and no space for it
1359   };
1360 
1361   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1362     test_offsets_error<InternT> t    = *it;
1363     ExternT out[array_size(exp) - 1] = {};
1364     assert(t.in_size <= array_size(in));
1365     assert(t.out_size <= array_size(out));
1366     assert(t.expected_in_next <= t.in_size);
1367     assert(t.expected_out_next <= t.out_size);
1368     InternT old_char  = in[t.replace_pos];
1369     in[t.replace_pos] = t.replace_char;
1370 
1371     mbstate_t state          = {};
1372     const InternT* in_next   = nullptr;
1373     ExternT* out_next        = nullptr;
1374     codecvt_base::result res = codecvt_base::ok;
1375 
1376     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1377     assert(res == cvt.error);
1378     assert(in_next == in + t.expected_in_next);
1379     assert(out_next == out + t.expected_out_next);
1380     assert(char_traits<ExternT>::compare(out, exp, t.expected_out_next) == 0);
1381     if (t.expected_out_next < array_size(out))
1382       assert(out[t.expected_out_next] == 0);
1383 
1384     in[t.replace_pos] = old_char;
1385   }
1386 }
1387 
// Driver for the UCS-2 to UTF-8 direction (cvt.out): runs the ok, partial and
// error scenario groups, in that order, against the supplied facet.
template <class InternT, class ExternT>
void ucs2_to_utf8_out(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  ucs2_to_utf8_out_ok(cvt);
  ucs2_to_utf8_out_partial(cvt);
  ucs2_to_utf8_out_error(cvt);
}
1394 
// Exercises a UTF-8 <-> UCS-2 facet in both directions: first the decoding
// checks (utf8_to_ucs2_in), then the encoding checks (ucs2_to_utf8_out).
template <class InternT, class ExternT>
void test_utf8_ucs2_cvt(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
  utf8_to_ucs2_in(cvt);
  ucs2_to_utf8_out(cvt);
}
1400 
// Byte order used when serializing UTF-16 code units into a byte stream.
enum utf16_endianess {
  utf16_big_endian,   // most significant byte first
  utf16_little_endian // least significant byte first
};

// Writes every element of [f, l) to `o` as two bytes, in the byte order
// selected by `e`, and returns the output iterator advanced past the last
// byte written. Only the low 16 bits of each element are emitted.
template <class Iter1, class Iter2>
Iter2 utf16_to_bytes(Iter1 f, Iter1 l, Iter2 o, utf16_endianess e) {
  // Shift that brings the byte to emit first into the low 8 bits; the other
  // byte uses the complementary shift.
  const int first_shift  = (e == utf16_big_endian) ? 8 : 0;
  const int second_shift = 8 - first_shift;

  while (f != l) {
    *o = (*f >> first_shift) & 0xFF;
    ++o;
    *o = (*f >> second_shift) & 0xFF;
    ++o;
    ++f;
  }
  return o;
}
1417 
1418 template <class InternT>
utf16_to_utf32_in_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1419 void utf16_to_utf32_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1420   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1421   const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1422   static_assert(array_size(input) == 6, "");
1423   static_assert(array_size(expected) == 5, "");
1424 
1425   char in[array_size(input) * 2];
1426   InternT exp[array_size(expected)];
1427   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1428   copy(begin(expected), end(expected), begin(exp));
1429 
1430   test_offsets_ok offsets[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}, {10, 4}};
1431   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1432     test_offsets_ok t                = *it;
1433     InternT out[array_size(exp) - 1] = {};
1434     assert(t.in_size <= array_size(in));
1435     assert(t.out_size <= array_size(out));
1436     mbstate_t state          = {};
1437     const char* in_next      = nullptr;
1438     InternT* out_next        = nullptr;
1439     codecvt_base::result res = codecvt_base::ok;
1440 
1441     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1442     assert(res == cvt.ok);
1443     assert(in_next == in + t.in_size);
1444     assert(out_next == out + t.out_size);
1445     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1446     if (t.out_size < array_size(out))
1447       assert(out[t.out_size] == 0);
1448 
1449     state   = mbstate_t();
1450     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1451     assert(len >= 0);
1452     assert(static_cast<size_t>(len) == t.in_size);
1453   }
1454 
1455   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1456     test_offsets_ok t            = *it;
1457     InternT out[array_size(exp)] = {};
1458     assert(t.in_size <= array_size(in));
1459     assert(t.out_size <= array_size(out));
1460     mbstate_t state          = {};
1461     const char* in_next      = nullptr;
1462     InternT* out_next        = nullptr;
1463     codecvt_base::result res = codecvt_base::ok;
1464 
1465     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1466     assert(res == cvt.ok);
1467     assert(in_next == in + t.in_size);
1468     assert(out_next == out + t.out_size);
1469     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1470     if (t.out_size < array_size(out))
1471       assert(out[t.out_size] == 0);
1472 
1473     state   = mbstate_t();
1474     int len = cvt.length(state, in, in + t.in_size, array_size(out));
1475     assert(len >= 0);
1476     assert(static_cast<size_t>(len) == t.in_size);
1477   }
1478 }
1479 
1480 template <class InternT>
utf16_to_utf32_in_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1481 void utf16_to_utf32_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1482   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1483   const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1484   static_assert(array_size(input) == 6, "");
1485   static_assert(array_size(expected) == 5, "");
1486 
1487   char in[array_size(input) * 2];
1488   InternT exp[array_size(expected)];
1489   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1490   copy(begin(expected), end(expected), begin(exp));
1491 
1492   test_offsets_partial offsets[] = {
1493       {2, 0, 0, 0}, // no space for first CP
1494       {1, 1, 0, 0}, // incomplete first CP
1495       {1, 0, 0, 0}, // incomplete first CP, and no space for it
1496 
1497       {4, 1, 2, 1}, // no space for second CP
1498       {3, 2, 2, 1}, // incomplete second CP
1499       {3, 1, 2, 1}, // incomplete second CP, and no space for it
1500 
1501       {6, 2, 4, 2}, // no space for third CP
1502       {5, 3, 4, 2}, // incomplete third CP
1503       {5, 2, 4, 2}, // incomplete third CP, and no space for it
1504 
1505       {10, 3, 6, 3}, // no space for fourth CP
1506       {7, 4, 6, 3},  // incomplete fourth CP
1507       {8, 4, 6, 3},  // incomplete fourth CP
1508       {9, 4, 6, 3},  // incomplete fourth CP
1509       {7, 3, 6, 3},  // incomplete fourth CP, and no space for it
1510       {8, 3, 6, 3},  // incomplete fourth CP, and no space for it
1511       {9, 3, 6, 3},  // incomplete fourth CP, and no space for it
1512   };
1513 
1514   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1515     test_offsets_partial t           = *it;
1516     InternT out[array_size(exp) - 1] = {};
1517     assert(t.in_size <= array_size(in));
1518     assert(t.out_size <= array_size(out));
1519     assert(t.expected_in_next <= t.in_size);
1520     assert(t.expected_out_next <= t.out_size);
1521     mbstate_t state          = {};
1522     const char* in_next      = nullptr;
1523     InternT* out_next        = nullptr;
1524     codecvt_base::result res = codecvt_base::ok;
1525 
1526     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1527     assert(res == cvt.partial);
1528     assert(in_next == in + t.expected_in_next);
1529     assert(out_next == out + t.expected_out_next);
1530     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1531     if (t.expected_out_next < array_size(out))
1532       assert(out[t.expected_out_next] == 0);
1533 
1534     state   = mbstate_t();
1535     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1536     assert(len >= 0);
1537     assert(static_cast<size_t>(len) == t.expected_in_next);
1538   }
1539 }
1540 
1541 template <class InternT>
utf16_to_utf32_in_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1542 void utf16_to_utf32_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1543   char16_t input[]          = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1544   const char32_t expected[] = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1545   static_assert(array_size(input) == 6, "");
1546   static_assert(array_size(expected) == 5, "");
1547 
1548   InternT exp[array_size(expected)];
1549   copy(begin(expected), end(expected), begin(exp));
1550 
1551   // The only possible error in UTF-16 is unpaired surrogate code units.
1552   // So we replace valid code points (scalar values) with lone surrogate CU.
1553   test_offsets_error<char16_t> offsets[] = {
1554       {10, 4, 0, 0, 0xD800, 0},
1555       {10, 4, 0, 0, 0xDBFF, 0},
1556       {10, 4, 0, 0, 0xDC00, 0},
1557       {10, 4, 0, 0, 0xDFFF, 0},
1558 
1559       {10, 4, 2, 1, 0xD800, 1},
1560       {10, 4, 2, 1, 0xDBFF, 1},
1561       {10, 4, 2, 1, 0xDC00, 1},
1562       {10, 4, 2, 1, 0xDFFF, 1},
1563 
1564       {10, 4, 4, 2, 0xD800, 2},
1565       {10, 4, 4, 2, 0xDBFF, 2},
1566       {10, 4, 4, 2, 0xDC00, 2},
1567       {10, 4, 4, 2, 0xDFFF, 2},
1568 
1569       // make the leading surrogate a trailing one
1570       {10, 4, 6, 3, 0xDC00, 3},
1571       {10, 4, 6, 3, 0xDFFF, 3},
1572 
1573       // make the trailing surrogate a leading one
1574       {10, 4, 6, 3, 0xD800, 4},
1575       {10, 4, 6, 3, 0xDBFF, 4},
1576 
1577       // make the trailing surrogate a BMP char
1578       {10, 4, 6, 3, 'z', 4},
1579   };
1580 
1581   for (test_offsets_error<char16_t>* it = begin(offsets); it != end(offsets); ++it) {
1582     test_offsets_error<char16_t> t = *it;
1583     char in[array_size(input) * 2];
1584     InternT out[array_size(exp) - 1] = {};
1585     assert(t.in_size <= array_size(in));
1586     assert(t.out_size <= array_size(out));
1587     assert(t.expected_in_next <= t.in_size);
1588     assert(t.expected_out_next <= t.out_size);
1589     char16_t old_char    = input[t.replace_pos];
1590     input[t.replace_pos] = t.replace_char; // replace in input, not in in
1591     utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1592 
1593     mbstate_t state          = {};
1594     const char* in_next      = nullptr;
1595     InternT* out_next        = nullptr;
1596     codecvt_base::result res = codecvt_base::ok;
1597 
1598     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1599     assert(res == cvt.error);
1600     assert(in_next == in + t.expected_in_next);
1601     assert(out_next == out + t.expected_out_next);
1602     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1603     if (t.expected_out_next < array_size(out))
1604       assert(out[t.expected_out_next] == 0);
1605 
1606     state   = mbstate_t();
1607     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1608     assert(len >= 0);
1609     assert(static_cast<size_t>(len) == t.expected_in_next);
1610 
1611     input[t.replace_pos] = old_char;
1612   }
1613 }
1614 
1615 template <class InternT>
utf32_to_utf16_out_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1616 void utf32_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1617   const char32_t input[]    = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1618   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1619   static_assert(array_size(input) == 5, "");
1620   static_assert(array_size(expected) == 6, "");
1621 
1622   InternT in[array_size(input)];
1623   char exp[array_size(expected) * 2];
1624   copy(begin(input), end(input), begin(in));
1625   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1626 
1627   test_offsets_ok offsets[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}, {4, 10}};
1628   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1629     test_offsets_ok t             = *it;
1630     char out[array_size(exp) - 2] = {};
1631     assert(t.in_size <= array_size(in));
1632     assert(t.out_size <= array_size(out));
1633     mbstate_t state          = {};
1634     const InternT* in_next   = nullptr;
1635     char* out_next           = nullptr;
1636     codecvt_base::result res = codecvt_base::ok;
1637 
1638     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1639     assert(res == cvt.ok);
1640     assert(in_next == in + t.in_size);
1641     assert(out_next == out + t.out_size);
1642     assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1643     if (t.out_size < array_size(out))
1644       assert(out[t.out_size] == 0);
1645   }
1646 }
1647 
1648 template <class InternT>
utf32_to_utf16_out_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1649 void utf32_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1650   const char32_t input[]    = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1651   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1652   static_assert(array_size(input) == 5, "");
1653   static_assert(array_size(expected) == 6, "");
1654 
1655   InternT in[array_size(input)];
1656   char exp[array_size(expected) * 2];
1657   copy(begin(input), end(input), begin(in));
1658   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1659 
1660   test_offsets_partial offsets[] = {
1661       {1, 0, 0, 0}, // no space for first CP
1662       {1, 1, 0, 0}, // no space for first CP
1663 
1664       {2, 2, 1, 2}, // no space for second CP
1665       {2, 3, 1, 2}, // no space for second CP
1666 
1667       {3, 4, 2, 4}, // no space for third CP
1668       {3, 5, 2, 4}, // no space for third CP
1669 
1670       {4, 6, 3, 6}, // no space for fourth CP
1671       {4, 7, 3, 6}, // no space for fourth CP
1672       {4, 8, 3, 6}, // no space for fourth CP
1673       {4, 9, 3, 6}, // no space for fourth CP
1674   };
1675   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1676     test_offsets_partial t        = *it;
1677     char out[array_size(exp) - 2] = {};
1678     assert(t.in_size <= array_size(in));
1679     assert(t.out_size <= array_size(out));
1680     assert(t.expected_in_next <= t.in_size);
1681     assert(t.expected_out_next <= t.out_size);
1682     mbstate_t state          = {};
1683     const InternT* in_next   = nullptr;
1684     char* out_next           = nullptr;
1685     codecvt_base::result res = codecvt_base::ok;
1686 
1687     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1688     assert(res == cvt.partial);
1689     assert(in_next == in + t.expected_in_next);
1690     assert(out_next == out + t.expected_out_next);
1691     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1692     if (t.expected_out_next < array_size(out))
1693       assert(out[t.expected_out_next] == 0);
1694   }
1695 }
1696 
1697 template <class InternT>
utf32_to_utf16_out_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1698 void utf32_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1699   const char32_t input[]    = {'b', 0x0448, 0xAAAA, 0x10AAAA, 0};
1700   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1701   static_assert(array_size(input) == 5, "");
1702   static_assert(array_size(expected) == 6, "");
1703 
1704   InternT in[array_size(input)];
1705   char exp[array_size(expected) * 2];
1706   copy(begin(input), end(input), begin(in));
1707   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1708 
1709   test_offsets_error<InternT> offsets[] = {
1710 
1711       // Surrogate CP
1712       {4, 10, 0, 0, 0xD800, 0},
1713       {4, 10, 1, 2, 0xDBFF, 1},
1714       {4, 10, 2, 4, 0xDC00, 2},
1715       {4, 10, 3, 6, 0xDFFF, 3},
1716 
1717       // CP out of range
1718       {4, 10, 0, 0, 0x00110000, 0},
1719       {4, 10, 1, 2, 0x00110000, 1},
1720       {4, 10, 2, 4, 0x00110000, 2},
1721       {4, 10, 3, 6, 0x00110000, 3}};
1722 
1723   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
1724     test_offsets_error<InternT> t = *it;
1725     char out[array_size(exp) - 2] = {};
1726     assert(t.in_size <= array_size(in));
1727     assert(t.out_size <= array_size(out));
1728     assert(t.expected_in_next <= t.in_size);
1729     assert(t.expected_out_next <= t.out_size);
1730     InternT old_char  = in[t.replace_pos];
1731     in[t.replace_pos] = t.replace_char;
1732 
1733     mbstate_t state          = {};
1734     const InternT* in_next   = nullptr;
1735     char* out_next           = nullptr;
1736     codecvt_base::result res = codecvt_base::ok;
1737 
1738     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1739     assert(res == cvt.error);
1740     assert(in_next == in + t.expected_in_next);
1741     assert(out_next == out + t.expected_out_next);
1742     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
1743     if (t.expected_out_next < array_size(out))
1744       assert(out[t.expected_out_next] == 0);
1745 
1746     in[t.replace_pos] = old_char;
1747   }
1748 }
1749 
1750 template <class InternT>
test_utf16_utf32_cvt(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1751 void test_utf16_utf32_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1752   utf16_to_utf32_in_ok(cvt, endianess);
1753   utf16_to_utf32_in_partial(cvt, endianess);
1754   utf16_to_utf32_in_error(cvt, endianess);
1755   utf32_to_utf16_out_ok(cvt, endianess);
1756   utf32_to_utf16_out_partial(cvt, endianess);
1757   utf32_to_utf16_out_error(cvt, endianess);
1758 }
1759 
1760 template <class InternT>
utf16_to_ucs2_in_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1761 void utf16_to_ucs2_in_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1762   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1763   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1764   static_assert(array_size(input) == 4, "");
1765   static_assert(array_size(expected) == 4, "");
1766 
1767   char in[array_size(input) * 2];
1768   InternT exp[array_size(expected)];
1769   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1770   copy(begin(expected), end(expected), begin(exp));
1771 
1772   test_offsets_ok offsets[] = {{0, 0}, {2, 1}, {4, 2}, {6, 3}};
1773   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1774     test_offsets_ok t                = *it;
1775     InternT out[array_size(exp) - 1] = {};
1776     assert(t.in_size <= array_size(in));
1777     assert(t.out_size <= array_size(out));
1778     mbstate_t state          = {};
1779     const char* in_next      = nullptr;
1780     InternT* out_next        = nullptr;
1781     codecvt_base::result res = codecvt_base::ok;
1782 
1783     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1784     assert(res == cvt.ok);
1785     assert(in_next == in + t.in_size);
1786     assert(out_next == out + t.out_size);
1787     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1788     if (t.out_size < array_size(out))
1789       assert(out[t.out_size] == 0);
1790 
1791     state   = mbstate_t();
1792     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1793     assert(len >= 0);
1794     assert(static_cast<size_t>(len) == t.in_size);
1795   }
1796 
1797   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1798     test_offsets_ok t            = *it;
1799     InternT out[array_size(exp)] = {};
1800     assert(t.in_size <= array_size(in));
1801     assert(t.out_size <= array_size(out));
1802     mbstate_t state          = {};
1803     const char* in_next      = nullptr;
1804     InternT* out_next        = nullptr;
1805     codecvt_base::result res = codecvt_base::ok;
1806 
1807     res = cvt.in(state, in, in + t.in_size, in_next, out, end(out), out_next);
1808     assert(res == cvt.ok);
1809     assert(in_next == in + t.in_size);
1810     assert(out_next == out + t.out_size);
1811     assert(char_traits<InternT>::compare(out, exp, t.out_size) == 0);
1812     if (t.out_size < array_size(out))
1813       assert(out[t.out_size] == 0);
1814 
1815     state   = mbstate_t();
1816     int len = cvt.length(state, in, in + t.in_size, array_size(out));
1817     assert(len >= 0);
1818     assert(static_cast<size_t>(len) == t.in_size);
1819   }
1820 }
1821 
1822 template <class InternT>
utf16_to_ucs2_in_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1823 void utf16_to_ucs2_in_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1824   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1825   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1826   static_assert(array_size(input) == 4, "");
1827   static_assert(array_size(expected) == 4, "");
1828 
1829   char in[array_size(input) * 2];
1830   InternT exp[array_size(expected)];
1831   utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1832   copy(begin(expected), end(expected), begin(exp));
1833 
1834   test_offsets_partial offsets[] = {
1835       {2, 0, 0, 0}, // no space for first CP
1836       {1, 1, 0, 0}, // incomplete first CP
1837       {1, 0, 0, 0}, // incomplete first CP, and no space for it
1838 
1839       {4, 1, 2, 1}, // no space for second CP
1840       {3, 2, 2, 1}, // incomplete second CP
1841       {3, 1, 2, 1}, // incomplete second CP, and no space for it
1842 
1843       {6, 2, 4, 2}, // no space for third CP
1844       {5, 3, 4, 2}, // incomplete third CP
1845       {5, 2, 4, 2}, // incomplete third CP, and no space for it
1846   };
1847 
1848   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
1849     test_offsets_partial t           = *it;
1850     InternT out[array_size(exp) - 1] = {};
1851     assert(t.in_size <= array_size(in));
1852     assert(t.out_size <= array_size(out));
1853     assert(t.expected_in_next <= t.in_size);
1854     assert(t.expected_out_next <= t.out_size);
1855     mbstate_t state          = {};
1856     const char* in_next      = nullptr;
1857     InternT* out_next        = nullptr;
1858     codecvt_base::result res = codecvt_base::ok;
1859 
1860     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1861     assert(res == cvt.partial);
1862     assert(in_next == in + t.expected_in_next);
1863     assert(out_next == out + t.expected_out_next);
1864     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1865     if (t.expected_out_next < array_size(out))
1866       assert(out[t.expected_out_next] == 0);
1867 
1868     state   = mbstate_t();
1869     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1870     assert(len >= 0);
1871     assert(static_cast<size_t>(len) == t.expected_in_next);
1872   }
1873 }
1874 
1875 template <class InternT>
utf16_to_ucs2_in_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1876 void utf16_to_ucs2_in_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1877   char16_t input[]          = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1878   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
1879   static_assert(array_size(input) == 6, "");
1880   static_assert(array_size(expected) == 6, "");
1881 
1882   InternT exp[array_size(expected)];
1883   copy(begin(expected), end(expected), begin(exp));
1884 
1885   // The only possible error in UTF-16 is unpaired surrogate code units.
1886   // Additionally, because the target encoding is UCS-2, a proper pair of
1887   // surrogates is also error. Simply, any surrogate CU is error.
1888   test_offsets_error<char16_t> offsets[] = {
1889       {6, 3, 0, 0, 0xD800, 0},
1890       {6, 3, 0, 0, 0xDBFF, 0},
1891       {6, 3, 0, 0, 0xDC00, 0},
1892       {6, 3, 0, 0, 0xDFFF, 0},
1893 
1894       {6, 3, 2, 1, 0xD800, 1},
1895       {6, 3, 2, 1, 0xDBFF, 1},
1896       {6, 3, 2, 1, 0xDC00, 1},
1897       {6, 3, 2, 1, 0xDFFF, 1},
1898 
1899       {6, 3, 4, 2, 0xD800, 2},
1900       {6, 3, 4, 2, 0xDBFF, 2},
1901       {6, 3, 4, 2, 0xDC00, 2},
1902       {6, 3, 4, 2, 0xDFFF, 2},
1903 
1904       // make the leading surrogate a trailing one
1905       {10, 5, 6, 3, 0xDC00, 3},
1906       {10, 5, 6, 3, 0xDFFF, 3},
1907 
1908       // make the trailing surrogate a leading one
1909       {10, 5, 6, 3, 0xD800, 4},
1910       {10, 5, 6, 3, 0xDBFF, 4},
1911 
1912       // make the trailing surrogate a BMP char
1913       {10, 5, 6, 3, 'z', 4},
1914 
1915       // don't replace anything in the test cases bellow, just show the surrogate
1916       // pair (fourth CP) fully or partially (just the first surrogate)
1917       {10, 5, 6, 3, 'b', 0},
1918       {8, 5, 6, 3, 'b', 0},
1919       {9, 5, 6, 3, 'b', 0},
1920 
1921       {10, 4, 6, 3, 'b', 0},
1922       {8, 4, 6, 3, 'b', 0},
1923       {9, 4, 6, 3, 'b', 0},
1924   };
1925 
1926   for (test_offsets_error<char16_t>* it = begin(offsets); it != end(offsets); ++it) {
1927     test_offsets_error<char16_t> t = *it;
1928     char in[array_size(input) * 2];
1929     InternT out[array_size(exp) - 1] = {};
1930     assert(t.in_size <= array_size(in));
1931     assert(t.out_size <= array_size(out));
1932     assert(t.expected_in_next <= t.in_size);
1933     assert(t.expected_out_next <= t.out_size);
1934     char16_t old_char    = input[t.replace_pos];
1935     input[t.replace_pos] = t.replace_char; // replace in input, not in in
1936     utf16_to_bytes(begin(input), end(input), begin(in), endianess);
1937 
1938     mbstate_t state          = {};
1939     const char* in_next      = nullptr;
1940     InternT* out_next        = nullptr;
1941     codecvt_base::result res = codecvt_base::ok;
1942 
1943     res = cvt.in(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1944     assert(res == cvt.error);
1945     assert(in_next == in + t.expected_in_next);
1946     assert(out_next == out + t.expected_out_next);
1947     assert(char_traits<InternT>::compare(out, exp, t.expected_out_next) == 0);
1948     if (t.expected_out_next < array_size(out))
1949       assert(out[t.expected_out_next] == 0);
1950 
1951     state   = mbstate_t();
1952     int len = cvt.length(state, in, in + t.in_size, t.out_size);
1953     assert(len >= 0);
1954     assert(static_cast<size_t>(len) == t.expected_in_next);
1955 
1956     input[t.replace_pos] = old_char;
1957   }
1958 }
1959 
1960 template <class InternT>
ucs2_to_utf16_out_ok(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1961 void ucs2_to_utf16_out_ok(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1962   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1963   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1964   static_assert(array_size(input) == 4, "");
1965   static_assert(array_size(expected) == 4, "");
1966 
1967   InternT in[array_size(input)];
1968   char exp[array_size(expected) * 2];
1969   copy(begin(input), end(input), begin(in));
1970   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
1971 
1972   test_offsets_ok offsets[] = {{0, 0}, {1, 2}, {2, 4}, {3, 6}};
1973   for (test_offsets_ok* it = begin(offsets); it != end(offsets); ++it) {
1974     test_offsets_ok t             = *it;
1975     char out[array_size(exp) - 2] = {};
1976     assert(t.in_size <= array_size(in));
1977     assert(t.out_size <= array_size(out));
1978     mbstate_t state          = {};
1979     const InternT* in_next   = nullptr;
1980     char* out_next           = nullptr;
1981     codecvt_base::result res = codecvt_base::ok;
1982 
1983     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
1984     assert(res == cvt.ok);
1985     assert(in_next == in + t.in_size);
1986     assert(out_next == out + t.out_size);
1987     assert(char_traits<char>::compare(out, exp, t.out_size) == 0);
1988     if (t.out_size < array_size(out))
1989       assert(out[t.out_size] == 0);
1990   }
1991 }
1992 
1993 template <class InternT>
ucs2_to_utf16_out_partial(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)1994 void ucs2_to_utf16_out_partial(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
1995   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0};
1996   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0};
1997   static_assert(array_size(input) == 4, "");
1998   static_assert(array_size(expected) == 4, "");
1999 
2000   InternT in[array_size(input)];
2001   char exp[array_size(expected) * 2];
2002   copy(begin(input), end(input), begin(in));
2003   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
2004 
2005   test_offsets_partial offsets[] = {
2006       {1, 0, 0, 0}, // no space for first CP
2007       {1, 1, 0, 0}, // no space for first CP
2008 
2009       {2, 2, 1, 2}, // no space for second CP
2010       {2, 3, 1, 2}, // no space for second CP
2011 
2012       {3, 4, 2, 4}, // no space for third CP
2013       {3, 5, 2, 4}, // no space for third CP
2014   };
2015   for (test_offsets_partial* it = begin(offsets); it != end(offsets); ++it) {
2016     test_offsets_partial t        = *it;
2017     char out[array_size(exp) - 2] = {};
2018     assert(t.in_size <= array_size(in));
2019     assert(t.out_size <= array_size(out));
2020     assert(t.expected_in_next <= t.in_size);
2021     assert(t.expected_out_next <= t.out_size);
2022     mbstate_t state          = {};
2023     const InternT* in_next   = nullptr;
2024     char* out_next           = nullptr;
2025     codecvt_base::result res = codecvt_base::ok;
2026 
2027     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2028     assert(res == cvt.partial);
2029     assert(in_next == in + t.expected_in_next);
2030     assert(out_next == out + t.expected_out_next);
2031     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2032     if (t.expected_out_next < array_size(out))
2033       assert(out[t.expected_out_next] == 0);
2034   }
2035 }
2036 
2037 template <class InternT>
ucs2_to_utf16_out_error(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)2038 void ucs2_to_utf16_out_error(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2039   const char16_t input[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2040   const char16_t expected[] = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
2041   static_assert(array_size(input) == 6, "");
2042   static_assert(array_size(expected) == 6, "");
2043 
2044   InternT in[array_size(input)];
2045   char exp[array_size(expected) * 2];
2046   copy(begin(input), end(input), begin(in));
2047   utf16_to_bytes(begin(expected), end(expected), begin(exp), endianess);
2048 
2049   test_offsets_error<InternT> offsets[] = {
2050       {3, 6, 0, 0, 0xD800, 0},
2051       {3, 6, 0, 0, 0xDBFF, 0},
2052       {3, 6, 0, 0, 0xDC00, 0},
2053       {3, 6, 0, 0, 0xDFFF, 0},
2054 
2055       {3, 6, 1, 2, 0xD800, 1},
2056       {3, 6, 1, 2, 0xDBFF, 1},
2057       {3, 6, 1, 2, 0xDC00, 1},
2058       {3, 6, 1, 2, 0xDFFF, 1},
2059 
2060       {3, 6, 2, 4, 0xD800, 2},
2061       {3, 6, 2, 4, 0xDBFF, 2},
2062       {3, 6, 2, 4, 0xDC00, 2},
2063       {3, 6, 2, 4, 0xDFFF, 2},
2064 
2065       // make the leading surrogate a trailing one
2066       {5, 10, 3, 6, 0xDC00, 3},
2067       {5, 10, 3, 6, 0xDFFF, 3},
2068 
2069       // make the trailing surrogate a leading one
2070       {5, 10, 3, 6, 0xD800, 4},
2071       {5, 10, 3, 6, 0xDBFF, 4},
2072 
2073       // make the trailing surrogate a BMP char
2074       {5, 10, 3, 6, 'z', 4},
2075 
2076       // don't replace anything in the test cases bellow, just show the surrogate
2077       // pair (fourth CP) fully or partially (just the first surrogate)
2078       {5, 10, 3, 6, 'b', 0},
2079       {5, 8, 3, 6, 'b', 0},
2080       {5, 9, 3, 6, 'b', 0},
2081 
2082       {4, 10, 3, 6, 'b', 0},
2083       {4, 8, 3, 6, 'b', 0},
2084       {4, 9, 3, 6, 'b', 0},
2085   };
2086 
2087   for (test_offsets_error<InternT>* it = begin(offsets); it != end(offsets); ++it) {
2088     test_offsets_error<InternT> t = *it;
2089     char out[array_size(exp) - 2] = {};
2090     assert(t.in_size <= array_size(in));
2091     assert(t.out_size <= array_size(out));
2092     assert(t.expected_in_next <= t.in_size);
2093     assert(t.expected_out_next <= t.out_size);
2094     InternT old_char  = in[t.replace_pos];
2095     in[t.replace_pos] = t.replace_char;
2096 
2097     mbstate_t state          = {};
2098     const InternT* in_next   = nullptr;
2099     char* out_next           = nullptr;
2100     codecvt_base::result res = codecvt_base::ok;
2101 
2102     res = cvt.out(state, in, in + t.in_size, in_next, out, out + t.out_size, out_next);
2103     assert(res == cvt.error);
2104     assert(in_next == in + t.expected_in_next);
2105     assert(out_next == out + t.expected_out_next);
2106     assert(char_traits<char>::compare(out, exp, t.expected_out_next) == 0);
2107     if (t.expected_out_next < array_size(out))
2108       assert(out[t.expected_out_next] == 0);
2109 
2110     in[t.replace_pos] = old_char;
2111   }
2112 }
2113 
2114 template <class InternT>
test_utf16_ucs2_cvt(const std::codecvt<InternT,char,mbstate_t> & cvt,utf16_endianess endianess)2115 void test_utf16_ucs2_cvt(const std::codecvt<InternT, char, mbstate_t>& cvt, utf16_endianess endianess) {
2116   utf16_to_ucs2_in_ok(cvt, endianess);
2117   utf16_to_ucs2_in_partial(cvt, endianess);
2118   utf16_to_ucs2_in_error(cvt, endianess);
2119   ucs2_to_utf16_out_ok(cvt, endianess);
2120   ucs2_to_utf16_out_partial(cvt, endianess);
2121   ucs2_to_utf16_out_error(cvt, endianess);
2122 }
2123 
2124 using std::codecvt;
2125 using std::codecvt_utf16;
2126 using std::codecvt_utf8;
2127 using std::codecvt_utf8_utf16;
2128 using std::has_facet;
2129 using std::locale;
2130 using std::use_facet;
2131 
test_utf8_utf32_codecvts()2132 void test_utf8_utf32_codecvts() {
2133   typedef codecvt<char32_t, char, mbstate_t> codecvt_c32;
2134   const locale& loc_c = locale::classic();
2135   assert(has_facet<codecvt_c32>(loc_c));
2136 
2137   const codecvt_c32& cvt = use_facet<codecvt_c32>(loc_c);
2138   test_utf8_utf32_cvt(cvt);
2139 
2140   codecvt_utf8<char32_t> cvt2;
2141   test_utf8_utf32_cvt(cvt2);
2142 
2143 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2144   codecvt_utf8<wchar_t> cvt3;
2145   test_utf8_utf32_cvt(cvt3);
2146 #endif
2147 
2148 #ifndef TEST_HAS_NO_CHAR8_T
2149   typedef codecvt<char32_t, char8_t, mbstate_t> codecvt_c32_c8;
2150   assert(has_facet<codecvt_c32_c8>(loc_c));
2151   const codecvt_c32_c8& cvt4 = use_facet<codecvt_c32_c8>(loc_c);
2152   test_utf8_utf32_cvt(cvt4);
2153 #endif
2154 }
2155 
test_utf8_utf16_codecvts()2156 void test_utf8_utf16_codecvts() {
2157   typedef codecvt<char16_t, char, mbstate_t> codecvt_c16;
2158   const locale& loc_c = locale::classic();
2159   assert(has_facet<codecvt_c16>(loc_c));
2160 
2161   const codecvt_c16& cvt = use_facet<codecvt_c16>(loc_c);
2162   test_utf8_utf16_cvt(cvt);
2163 
2164   codecvt_utf8_utf16<char16_t> cvt2;
2165   test_utf8_utf16_cvt(cvt2);
2166 
2167   codecvt_utf8_utf16<char32_t> cvt3;
2168   test_utf8_utf16_cvt(cvt3);
2169 
2170 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
2171   codecvt_utf8_utf16<wchar_t> cvt4;
2172   test_utf8_utf16_cvt(cvt4);
2173 #endif
2174 
2175 #ifndef TEST_HAS_NO_CHAR8_T
2176   typedef codecvt<char16_t, char8_t, mbstate_t> codecvt_c16_c8;
2177   assert(has_facet<codecvt_c16_c8>(loc_c));
2178   const codecvt_c16_c8& cvt5 = use_facet<codecvt_c16_c8>(loc_c);
2179   test_utf8_utf16_cvt(cvt5);
2180 #endif
2181 }
2182 
test_utf8_ucs2_codecvts()2183 void test_utf8_ucs2_codecvts() {
2184   codecvt_utf8<char16_t> cvt;
2185   test_utf8_ucs2_cvt(cvt);
2186 
2187 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2188   codecvt_utf8<wchar_t> cvt2;
2189   test_utf8_ucs2_cvt(cvt2);
2190 #endif
2191 }
2192 
test_utf16_utf32_codecvts()2193 void test_utf16_utf32_codecvts() {
2194   codecvt_utf16<char32_t> cvt;
2195   test_utf16_utf32_cvt(cvt, utf16_big_endian);
2196 
2197   codecvt_utf16<char32_t, 0x10FFFF, std::little_endian> cvt2;
2198   test_utf16_utf32_cvt(cvt2, utf16_little_endian);
2199 
2200 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && !defined(TEST_SHORT_WCHAR)
2201   codecvt_utf16<wchar_t> cvt3;
2202   test_utf16_utf32_cvt(cvt3, utf16_big_endian);
2203 
2204   codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2205   test_utf16_utf32_cvt(cvt4, utf16_little_endian);
2206 #endif
2207 }
2208 
test_utf16_ucs2_codecvts()2209 void test_utf16_ucs2_codecvts() {
2210   codecvt_utf16<char16_t> cvt;
2211   test_utf16_ucs2_cvt(cvt, utf16_big_endian);
2212 
2213   codecvt_utf16<char16_t, 0x10FFFF, std::little_endian> cvt2;
2214   test_utf16_ucs2_cvt(cvt2, utf16_little_endian);
2215 
2216 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(TEST_SHORT_WCHAR)
2217   codecvt_utf16<wchar_t> cvt3;
2218   test_utf16_ucs2_cvt(cvt3, utf16_big_endian);
2219 
2220   codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> cvt4;
2221   test_utf16_ucs2_cvt(cvt4, utf16_little_endian);
2222 #endif
2223 }
2224 
main()2225 int main() {
2226   test_utf8_utf32_codecvts();
2227   test_utf8_utf16_codecvts();
2228   test_utf8_ucs2_codecvts();
2229   test_utf16_utf32_codecvts();
2230   test_utf16_ucs2_codecvts();
2231 }
2232