DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Mercurial (b6d82b1a6b02)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/Utf8.h"

#include "mozilla/ArrayUtils.h"
#include "mozilla/Assertions.h"
#include "mozilla/EnumSet.h"
#include "mozilla/IntegerRange.h"
#include "mozilla/TextUtils.h"

using mozilla::ArrayLength;
using mozilla::DecodeOneUtf8CodePoint;
using mozilla::EnumSet;
using mozilla::IntegerRange;
using mozilla::IsAscii;
using mozilla::IsValidUtf8;
using mozilla::Utf8Unit;

// Disable the C++ 2a warning. See bug #1509926
#if defined(__clang__) && (__clang_major__ >= 6)
#  pragma clang diagnostic push
#  pragma clang diagnostic ignored "-Wc++2a-compat"
#endif

static void TestUtf8Unit() {
  Utf8Unit c('A');
  MOZ_RELEASE_ASSERT(c.toChar() == 'A');
  MOZ_RELEASE_ASSERT(c == Utf8Unit('A'));
  MOZ_RELEASE_ASSERT(c != Utf8Unit('B'));
  MOZ_RELEASE_ASSERT(c.toUint8() == 0x41);

  unsigned char asUnsigned = 'A';
  MOZ_RELEASE_ASSERT(c.toUnsignedChar() == asUnsigned);
  MOZ_RELEASE_ASSERT(Utf8Unit('B').toUnsignedChar() != asUnsigned);

  Utf8Unit first('@');
  Utf8Unit second('#');

  MOZ_RELEASE_ASSERT(first != second);

  first = second;
  MOZ_RELEASE_ASSERT(first == second);
}

template <typename Char>
struct ToUtf8Units {
 public:
  explicit ToUtf8Units(const Char* aStart, const Char* aEnd)
      : lead(Utf8Unit(aStart[0])), iter(aStart + 1), end(aEnd) {
    MOZ_RELEASE_ASSERT(!IsAscii(aStart[0]));
  }

  const Utf8Unit lead;
  const Char* iter;
  const Char* const end;
};

class AssertIfCalled {
 public:
  template <typename... Args>
  void operator()(Args&&... aArgs) {
    MOZ_RELEASE_ASSERT(false, "AssertIfCalled instance was called");
  }
};

// NOTE: For simplicity in treating |aCharN| identically regardless whether it's
//       a string literal or a more-generalized array, we require |aCharN| be
//       null-terminated.

template <typename Char, size_t N>
static void ExpectValidCodePoint(const Char (&aCharN)[N],
                                 char32_t aExpectedCodePoint) {
  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
                     "array must be null-terminated for |aCharN + N - 1| to "
                     "compute the value of |aIter| as altered by "
                     "DecodeOneUtf8CodePoint");

  ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
  auto simple =
      DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
  MOZ_RELEASE_ASSERT(simple.isSome());
  MOZ_RELEASE_ASSERT(*simple == aExpectedCodePoint);
  MOZ_RELEASE_ASSERT(simpleUnit.iter == simpleUnit.end);

  ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
  auto complex = DecodeOneUtf8CodePoint(
      complexUnit.lead, &complexUnit.iter, complexUnit.end, AssertIfCalled(),
      AssertIfCalled(), AssertIfCalled(), AssertIfCalled(), AssertIfCalled());
  MOZ_RELEASE_ASSERT(complex.isSome());
  MOZ_RELEASE_ASSERT(*complex == aExpectedCodePoint);
  MOZ_RELEASE_ASSERT(complexUnit.iter == complexUnit.end);
}

enum class InvalidUtf8Reason {
  BadLeadUnit,
  NotEnoughUnits,
  BadTrailingUnit,
  BadCodePoint,
  NotShortestForm,
};

template <typename Char, size_t N>
static void ExpectInvalidCodePointHelper(const Char (&aCharN)[N],
                                         InvalidUtf8Reason aExpectedReason,
                                         uint8_t aExpectedUnitsAvailable,
                                         uint8_t aExpectedUnitsNeeded,
                                         char32_t aExpectedBadCodePoint,
                                         uint8_t aExpectedUnitsObserved) {
  MOZ_RELEASE_ASSERT(aCharN[N - 1] == 0,
                     "array must be null-terminated for |aCharN + N - 1| to "
                     "compute the value of |aIter| as altered by "
                     "DecodeOneUtf8CodePoint");

  ToUtf8Units<Char> simpleUnit(aCharN, aCharN + N - 1);
  auto simple =
      DecodeOneUtf8CodePoint(simpleUnit.lead, &simpleUnit.iter, simpleUnit.end);
  MOZ_RELEASE_ASSERT(simple.isNothing());
  MOZ_RELEASE_ASSERT(static_cast<const void*>(simpleUnit.iter) == aCharN);

  EnumSet<InvalidUtf8Reason> reasons;
  uint8_t unitsAvailable;
  uint8_t unitsNeeded;
  char32_t badCodePoint;
  uint8_t unitsObserved;

  struct OnNotShortestForm {
    EnumSet<InvalidUtf8Reason>& reasons;
    char32_t& badCodePoint;
    uint8_t& unitsObserved;

    void operator()(char32_t aBadCodePoint, uint8_t aUnitsObserved) {
      reasons += InvalidUtf8Reason::NotShortestForm;
      badCodePoint = aBadCodePoint;
      unitsObserved = aUnitsObserved;
    }
  };

  ToUtf8Units<Char> complexUnit(aCharN, aCharN + N - 1);
  auto complex = DecodeOneUtf8CodePoint(
      complexUnit.lead, &complexUnit.iter, complexUnit.end,
      [&reasons]() { reasons += InvalidUtf8Reason::BadLeadUnit; },
      [&reasons, &unitsAvailable, &unitsNeeded](uint8_t aUnitsAvailable,
                                                uint8_t aUnitsNeeded) {
        reasons += InvalidUtf8Reason::NotEnoughUnits;
        unitsAvailable = aUnitsAvailable;
        unitsNeeded = aUnitsNeeded;
      },
      [&reasons, &unitsObserved](uint8_t aUnitsObserved) {
        reasons += InvalidUtf8Reason::BadTrailingUnit;
        unitsObserved = aUnitsObserved;
      },
      [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
                                                uint8_t aUnitsObserved) {
        reasons += InvalidUtf8Reason::BadCodePoint;
        badCodePoint = aBadCodePoint;
        unitsObserved = aUnitsObserved;
      },
      [&reasons, &badCodePoint, &unitsObserved](char32_t aBadCodePoint,
                                                uint8_t aUnitsObserved) {
        reasons += InvalidUtf8Reason::NotShortestForm;
        badCodePoint = aBadCodePoint;
        unitsObserved = aUnitsObserved;
      });
  MOZ_RELEASE_ASSERT(complex.isNothing());
  MOZ_RELEASE_ASSERT(static_cast<const void*>(complexUnit.iter) == aCharN);

  bool alreadyIterated = false;
  for (InvalidUtf8Reason reason : reasons) {
    MOZ_RELEASE_ASSERT(!alreadyIterated);
    alreadyIterated = true;

    switch (reason) {
      case InvalidUtf8Reason::BadLeadUnit:
        break;

      case InvalidUtf8Reason::NotEnoughUnits:
        MOZ_RELEASE_ASSERT(unitsAvailable == aExpectedUnitsAvailable);
        MOZ_RELEASE_ASSERT(unitsNeeded == aExpectedUnitsNeeded);
        break;

      case InvalidUtf8Reason::BadTrailingUnit:
        MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
        break;

      case InvalidUtf8Reason::BadCodePoint:
        MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
        MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
        break;

      case InvalidUtf8Reason::NotShortestForm:
        MOZ_RELEASE_ASSERT(badCodePoint == aExpectedBadCodePoint);
        MOZ_RELEASE_ASSERT(unitsObserved == aExpectedUnitsObserved);
        break;
    }
  }
}

// NOTE: For simplicity in treating |aCharN| identically regardless whether it's
//       a string literal or a more-generalized array, we require |aCharN| be
//       null-terminated in all these functions.

template <typename Char, size_t N>
static void ExpectBadLeadUnit(const Char (&aCharN)[N]) {
  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadLeadUnit, 0xFF,
                               0xFF, 0xFFFFFFFF, 0xFF);
}

template <typename Char, size_t N>
static void ExpectNotEnoughUnits(const Char (&aCharN)[N],
                                 uint8_t aExpectedUnitsAvailable,
                                 uint8_t aExpectedUnitsNeeded) {
  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotEnoughUnits,
                               aExpectedUnitsAvailable, aExpectedUnitsNeeded,
                               0xFFFFFFFF, 0xFF);
}

template <typename Char, size_t N>
static void ExpectBadTrailingUnit(const Char (&aCharN)[N],
                                  uint8_t aExpectedUnitsObserved) {
  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadTrailingUnit, 0xFF,
                               0xFF, 0xFFFFFFFF, aExpectedUnitsObserved);
}

template <typename Char, size_t N>
static void ExpectNotShortestForm(const Char (&aCharN)[N],
                                  char32_t aExpectedBadCodePoint,
                                  uint8_t aExpectedUnitsObserved) {
  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::NotShortestForm, 0xFF,
                               0xFF, aExpectedBadCodePoint,
                               aExpectedUnitsObserved);
}

template <typename Char, size_t N>
static void ExpectBadCodePoint(const Char (&aCharN)[N],
                               char32_t aExpectedBadCodePoint,
                               uint8_t aExpectedUnitsObserved) {
  ExpectInvalidCodePointHelper(aCharN, InvalidUtf8Reason::BadCodePoint, 0xFF,
                               0xFF, aExpectedBadCodePoint,
                               aExpectedUnitsObserved);
}

static void TestIsValidUtf8() {
  // Note we include the U+0000 NULL in this one -- and that's fine.
  static const char asciiBytes[] = u8"How about a nice game of chess?";
  MOZ_RELEASE_ASSERT(IsValidUtf8(asciiBytes, ArrayLength(asciiBytes)));

  static const char endNonAsciiBytes[] = u8"Life is like a 🌯";
  MOZ_RELEASE_ASSERT(
      IsValidUtf8(endNonAsciiBytes, ArrayLength(endNonAsciiBytes) - 1));

  static const unsigned char badLeading[] = {0x80};
  MOZ_RELEASE_ASSERT(!IsValidUtf8(badLeading, ArrayLength(badLeading)));

  // Byte-counts

  // 1
  static const char oneBytes[] = u8"A";  // U+0041 LATIN CAPITAL LETTER A
  constexpr size_t oneBytesLen = ArrayLength(oneBytes);
  static_assert(oneBytesLen == 2, "U+0041 plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(oneBytes, oneBytesLen));

  // 2
  static const char twoBytes[] = u8"؆";  // U+0606 ARABIC-INDIC CUBE ROOT
  constexpr size_t twoBytesLen = ArrayLength(twoBytes);
  static_assert(twoBytesLen == 3, "U+0606 in two bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(twoBytes, twoBytesLen));

  ExpectValidCodePoint(twoBytes, 0x0606);

  // 3
  static const char threeBytes[] = u8"";  // U+1A1E BUGINESE PALLAWA
  constexpr size_t threeBytesLen = ArrayLength(threeBytes);
  static_assert(threeBytesLen == 4, "U+1A1E in three bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(threeBytes, threeBytesLen));

  ExpectValidCodePoint(threeBytes, 0x1A1E);

  // 4
  static const char fourBytes[] =
      u8"🁡";  // U+1F061 DOMINO TILE HORIZONTAL-06-06
  constexpr size_t fourBytesLen = ArrayLength(fourBytes);
  static_assert(fourBytesLen == 5, "U+1F061 in four bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(fourBytes, fourBytesLen));

  ExpectValidCodePoint(fourBytes, 0x1F061);

  // Max code point
  static const char maxCodePoint[] = u8"􏿿";  // U+10FFFF
  constexpr size_t maxCodePointLen = ArrayLength(maxCodePoint);
  static_assert(maxCodePointLen == 5, "U+10FFFF in four bytes plus nul");
  MOZ_RELEASE_ASSERT(IsValidUtf8(maxCodePoint, maxCodePointLen));

  ExpectValidCodePoint(maxCodePoint, 0x10FFFF);

  // One past max code point
  static const unsigned char onePastMaxCodePoint[] = {0xF4, 0x90, 0x80, 0x80,
                                                      0x0};
  constexpr size_t onePastMaxCodePointLen = ArrayLength(onePastMaxCodePoint);
  MOZ_RELEASE_ASSERT(!IsValidUtf8(onePastMaxCodePoint, onePastMaxCodePointLen));

  ExpectBadCodePoint(onePastMaxCodePoint, 0x110000, 4);

  // Surrogate-related testing

  // (Note that the various code unit sequences here are null-terminated to
  // simplify life for ExpectValidCodePoint, which presumes null termination.)

  static const unsigned char justBeforeSurrogates[] = {0xED, 0x9F, 0xBF, 0x0};
  constexpr size_t justBeforeSurrogatesLen =
      ArrayLength(justBeforeSurrogates) - 1;
  MOZ_RELEASE_ASSERT(
      IsValidUtf8(justBeforeSurrogates, justBeforeSurrogatesLen));

  ExpectValidCodePoint(justBeforeSurrogates, 0xD7FF);

  static const unsigned char leastSurrogate[] = {0xED, 0xA0, 0x80, 0x0};
  constexpr size_t leastSurrogateLen = ArrayLength(leastSurrogate) - 1;
  MOZ_RELEASE_ASSERT(!IsValidUtf8(leastSurrogate, leastSurrogateLen));

  ExpectBadCodePoint(leastSurrogate, 0xD800, 3);

  static const unsigned char arbitraryHighSurrogate[] = {0xED, 0xA2, 0x87, 0x0};
  constexpr size_t arbitraryHighSurrogateLen =
      ArrayLength(arbitraryHighSurrogate) - 1;
  MOZ_RELEASE_ASSERT(
      !IsValidUtf8(arbitraryHighSurrogate, arbitraryHighSurrogateLen));

  ExpectBadCodePoint(arbitraryHighSurrogate, 0xD887, 3);

  static const unsigned char arbitraryLowSurrogate[] = {0xED, 0xB7, 0xAF, 0x0};
  constexpr size_t arbitraryLowSurrogateLen =
      ArrayLength(arbitraryLowSurrogate) - 1;
  MOZ_RELEASE_ASSERT(
      !IsValidUtf8(arbitraryLowSurrogate, arbitraryLowSurrogateLen));

  ExpectBadCodePoint(arbitraryLowSurrogate, 0xDDEF, 3);

  static const unsigned char greatestSurrogate[] = {0xED, 0xBF, 0xBF, 0x0};
  constexpr size_t greatestSurrogateLen = ArrayLength(greatestSurrogate) - 1;
  MOZ_RELEASE_ASSERT(!IsValidUtf8(greatestSurrogate, greatestSurrogateLen));

  ExpectBadCodePoint(greatestSurrogate, 0xDFFF, 3);

  static const unsigned char justAfterSurrogates[] = {0xEE, 0x80, 0x80, 0x0};
  constexpr size_t justAfterSurrogatesLen =
      ArrayLength(justAfterSurrogates) - 1;
  MOZ_RELEASE_ASSERT(IsValidUtf8(justAfterSurrogates, justAfterSurrogatesLen));

  ExpectValidCodePoint(justAfterSurrogates, 0xE000);
}

static void TestDecodeOneValidUtf8CodePoint() {
  // NOTE: DecodeOneUtf8CodePoint decodes only *non*-ASCII code points that
  //       consist of multiple code units, so there are no ASCII tests below.

  // Length two.

  ExpectValidCodePoint(u8"€", 0x80);  // <control>
  ExpectValidCodePoint(u8"©", 0xA9);   // COPYRIGHT SIGN
  ExpectValidCodePoint(u8"", 0xB6);   // PILCROW SIGN
  ExpectValidCodePoint(u8"¾", 0xBE);   // VULGAR FRACTION THREE QUARTERS
  ExpectValidCodePoint(u8"÷", 0xF7);   // DIVISION SIGN
  ExpectValidCodePoint(u8"ÿ", 0xFF);   // LATIN SMALL LETTER Y WITH DIAERESIS
  ExpectValidCodePoint(u8"Ā", 0x100);  // LATIN CAPITAL LETTER A WITH MACRON
  ExpectValidCodePoint(u8"IJ", 0x132);  // LATIN CAPITAL LETTER LIGATURE IJ
  ExpectValidCodePoint(u8"ͼ", 0x37C);  // GREEK SMALL DOTTED LUNATE SIGMA SYMBOL
  ExpectValidCodePoint(u8"Ӝ",
                       0x4DC);  // CYRILLIC CAPITAL LETTER ZHE WITTH DIAERESIS
  ExpectValidCodePoint(u8"۩", 0x6E9);   // ARABIC PLACE OF SAJDAH
  ExpectValidCodePoint(u8"߿", 0x7FF);  // <not assigned>

  // Length three.

  ExpectValidCodePoint(u8"", 0x800);    // SAMARITAN LETTER ALAF
  ExpectValidCodePoint(u8"", 0x841);    // MANDAIC LETTER AB
  ExpectValidCodePoint(u8"", 0x8FF);  // ARABIC MARK SIDEWAYS NOON GHUNNA
  ExpectValidCodePoint(u8"", 0xB86);    // TAMIL LETTER AA
  ExpectValidCodePoint(u8"",
                       0xF03);  // TIBETAN MARK GTER YIG MGO -UM GTER TSHEG MA
  ExpectValidCodePoint(
      u8"",
      0xFC9);  // TIBETAN SYMBOL NOR BU (but on my system it really looks like
               // SOFT-SERVE ICE CREAM FROM ABOVE THE PLANE if you ask me)
  ExpectValidCodePoint(u8"", 0x102A);           // MYANMAR LETTER AU
  ExpectValidCodePoint(u8"", 0x168F);           // OGHAM LETTER RUIS
  ExpectValidCodePoint("\xE2\x80\xA8", 0x2028);  // (the hated) LINE SEPARATOR
  ExpectValidCodePoint("\xE2\x80\xA9",
                       0x2029);           // (the hated) PARAGRAPH SEPARATOR
  ExpectValidCodePoint(u8"", 0x262C);    // ADI SHAKTI
  ExpectValidCodePoint(u8"", 0x32AE);   // CIRCLED IDEOGRAPH RESOURCE
  ExpectValidCodePoint(u8"", 0x33D6);   // SQUARE MOL
  ExpectValidCodePoint(u8"", 0xA504);    // VAI SYLLABLE WEEN
  ExpectValidCodePoint(u8"", 0xD7D5);   // HANGUL JONGSEONG RIEUL-SSANGKIYEOK
  ExpectValidCodePoint(u8"", 0xD7FF);  // <not assigned>
  ExpectValidCodePoint(u8"", 0xE000);    // <Private Use>
  ExpectValidCodePoint(u8"", 0xF9F2);   // CJK COMPATIBILITY IDEOGRAPH-F9F
  ExpectValidCodePoint(
      u8"", 0xFDFD);  // ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHHHEEEEM
  ExpectValidCodePoint(u8"￿", 0xFFFF);  // <not assigned>

  // Length four.
  ExpectValidCodePoint(u8"𐀀", 0x10000);      // LINEAR B SYLLABLE B008 A
  ExpectValidCodePoint(u8"𔑀", 0x14440);   // ANATOLIAN HIEROGLYPH A058
  ExpectValidCodePoint(u8"𝛗", 0x1D6D7);      // MATHEMATICAL BOLD SMALL PHI
  ExpectValidCodePoint(u8"💩", 0x1F4A9);      // PILE OF POO
  ExpectValidCodePoint(u8"🔫", 0x1F52B);      // PISTOL
  ExpectValidCodePoint(u8"🥌", 0x1F94C);   // CURLING STONE
  ExpectValidCodePoint(u8"🥏", 0x1F94F);   // FLYING DISC
  ExpectValidCodePoint(u8"𠍆", 0x20346);     // CJK UNIFIED IDEOGRAPH-20346
  ExpectValidCodePoint(u8"𡠺", 0x2183A);     // CJK UNIFIED IDEOGRAPH-2183A
  ExpectValidCodePoint(u8"񁟶", 0x417F6);   // <not assigned>
  ExpectValidCodePoint(u8"񾠶", 0x7E836);   // <not assigned>
  ExpectValidCodePoint(u8"󾽧", 0xFEF67);      // <Plane 15 Private Use>
  ExpectValidCodePoint(u8"􏿿", 0x10FFFF);  //
}

static void TestDecodeBadLeadUnit() {
  // These tests are actually exhaustive.

  unsigned char badLead[] = {'\0', '\0'};

  for (uint8_t lead : IntegerRange(0b1000'0000, 0b1100'0000)) {
    badLead[0] = lead;
    ExpectBadLeadUnit(badLead);
  }

  {
    uint8_t lead = 0b1111'1000;
    do {
      badLead[0] = lead;
      ExpectBadLeadUnit(badLead);
      if (lead == 0b1111'1111) {
        break;
      }

      lead++;
    } while (true);
  }
}

static void TestTooFewOrBadTrailingUnits() {
  // Lead unit indicates a two-byte code point.

  char truncatedTwo[] = {'\0', '\0'};
  char badTrailTwo[] = {'\0', '\0', '\0'};

  for (uint8_t lead : IntegerRange(0b1100'0000, 0b1110'0000)) {
    truncatedTwo[0] = lead;
    ExpectNotEnoughUnits(truncatedTwo, 1, 2);

    badTrailTwo[0] = lead;
    for (uint8_t trail : IntegerRange(0b0000'0000, 0b1000'0000)) {
      badTrailTwo[1] = trail;
      ExpectBadTrailingUnit(badTrailTwo, 2);
    }

    for (uint8_t trail : IntegerRange(0b1100'0000, 0b1111'1111)) {
      badTrailTwo[1] = trail;
      ExpectBadTrailingUnit(badTrailTwo, 2);
    }
  }

  // Lead unit indicates a three-byte code point.

  char truncatedThreeOne[] = {'\0', '\0'};
  char truncatedThreeTwo[] = {'\0', '\0', '\0'};
  unsigned char badTrailThree[] = {'\0', '\0', '\0', '\0'};

  for (uint8_t lead : IntegerRange(0b1110'0000, 0b1111'0000)) {
    truncatedThreeOne[0] = lead;
    ExpectNotEnoughUnits(truncatedThreeOne, 1, 3);

    truncatedThreeTwo[0] = lead;
    ExpectNotEnoughUnits(truncatedThreeTwo, 2, 3);

    badTrailThree[0] = lead;
    badTrailThree[2] = 0b1011'1111;  // make valid to test overreads
    for (uint8_t mid : IntegerRange(0b0000'0000, 0b1000'0000)) {
      badTrailThree[1] = mid;
      ExpectBadTrailingUnit(badTrailThree, 2);
    }
    {
      uint8_t mid = 0b1100'0000;
      do {
        badTrailThree[1] = mid;
        ExpectBadTrailingUnit(badTrailThree, 2);
        if (mid == 0b1111'1111) {
          break;
        }

        mid++;
      } while (true);
    }

    badTrailThree[1] = 0b1011'1111;
    for (uint8_t last : IntegerRange(0b0000'0000, 0b1000'0000)) {
      badTrailThree[2] = last;
      ExpectBadTrailingUnit(badTrailThree, 3);
    }
    {
      uint8_t last = 0b1100'0000;
      do {
        badTrailThree[2] = last;
        ExpectBadTrailingUnit(badTrailThree, 3);
        if (last == 0b1111'1111) {
          break;
        }

        last++;
      } while (true);
    }
  }

  // Lead unit indicates a four-byte code point.

  char truncatedFourOne[] = {'\0', '\0'};
  char truncatedFourTwo[] = {'\0', '\0', '\0'};
  char truncatedFourThree[] = {'\0', '\0', '\0', '\0'};

  unsigned char badTrailFour[] = {'\0', '\0', '\0', '\0', '\0'};

  for (uint8_t lead : IntegerRange(0b1111'0000, 0b1111'1000)) {
    truncatedFourOne[0] = lead;
    ExpectNotEnoughUnits(truncatedFourOne, 1, 4);

    truncatedFourTwo[0] = lead;
    ExpectNotEnoughUnits(truncatedFourTwo, 2, 4);

    truncatedFourThree[0] = lead;
    ExpectNotEnoughUnits(truncatedFourThree, 3, 4);

    badTrailFour[0] = lead;
    badTrailFour[2] = badTrailFour[3] = 0b1011'1111;  // test for overreads
    for (uint8_t second : IntegerRange(0b0000'0000, 0b1000'0000)) {
      badTrailFour[1] = second;
      ExpectBadTrailingUnit(badTrailFour, 2);
    }
    {
      uint8_t second = 0b1100'0000;
      do {
        badTrailFour[1] = second;
        ExpectBadTrailingUnit(badTrailFour, 2);
        if (second == 0b1111'1111) {
          break;
        }

        second++;
      } while (true);
    }

    badTrailFour[1] = badTrailFour[3] = 0b1011'1111;  // test for overreads
    for (uint8_t third : IntegerRange(0b0000'0000, 0b1000'0000)) {
      badTrailFour[2] = third;
      ExpectBadTrailingUnit(badTrailFour, 3);
    }
    {
      uint8_t third = 0b1100'0000;
      do {
        badTrailFour[2] = third;
        ExpectBadTrailingUnit(badTrailFour, 3);
        if (third == 0b1111'1111) {
          break;
        }

        third++;
      } while (true);
    }

    badTrailFour[2] = 0b1011'1111;
    for (uint8_t fourth : IntegerRange(0b0000'0000, 0b1000'0000)) {
      badTrailFour[3] = fourth;
      ExpectBadTrailingUnit(badTrailFour, 4);
    }
    {
      uint8_t fourth = 0b1100'0000;
      do {
        badTrailFour[3] = fourth;
        ExpectBadTrailingUnit(badTrailFour, 4);
        if (fourth == 0b1111'1111) {
          break;
        }

        fourth++;
      } while (true);
    }
  }
}

static void TestBadSurrogate() {
  // These tests are actually exhaustive.

  ExpectValidCodePoint("\xED\x9F\xBF", 0xD7FF);  // last before surrogates
  ExpectValidCodePoint("\xEE\x80\x80", 0xE000);  // first after surrogates

  // First invalid surrogate encoding is { 0xED, 0xA0, 0x80 }.  Last invalid
  // surrogate encoding is { 0xED, 0xBF, 0xBF }.

  char badSurrogate[] = {'\xED', '\0', '\0', '\0'};

  for (char32_t c = 0xD800; c < 0xE000; c++) {
    badSurrogate[1] = 0b1000'0000 ^ ((c & 0b1111'1100'0000) >> 6);
    badSurrogate[2] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));

    ExpectBadCodePoint(badSurrogate, c, 3);
  }
}

static void TestBadTooBig() {
  // These tests are actually exhaustive.

  ExpectValidCodePoint("\xF4\x8F\xBF\xBF", 0x10'FFFF);  // last code point

  // Four-byte code points are
  //
  //   0b1111'0xxx 0b10xx'xxxx 0b10xx'xxxx 0b10xx'xxxx
  //
  // with 3 + 6 + 6 + 6 == 21 unconstrained bytes, so the structurally
  // representable limit (exclusive) is 2**21 - 1 == 2097152.

  char tooLargeCodePoint[] = {'\0', '\0', '\0', '\0', '\0'};

  for (char32_t c = 0x11'0000; c < (1 << 21); c++) {
    tooLargeCodePoint[0] =
        0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
    tooLargeCodePoint[1] =
        0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
    tooLargeCodePoint[2] =
        0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
    tooLargeCodePoint[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));

    ExpectBadCodePoint(tooLargeCodePoint, c, 4);
  }
}

static void TestBadCodePoint() {
  TestBadSurrogate();
  TestBadTooBig();
}

static void TestNotShortestForm() {
  {
    // One-byte in two-byte.

    char oneInTwo[] = {'\0', '\0', '\0'};

    for (char32_t c = '\0'; c < 0x80; c++) {
      oneInTwo[0] = 0b1100'0000 ^ ((c & 0b0111'1100'0000) >> 6);
      oneInTwo[1] = 0b1000'0000 ^ ((c & 0b0000'0011'1111));

      ExpectNotShortestForm(oneInTwo, c, 2);
    }

    // One-byte in three-byte.

    char oneInThree[] = {'\0', '\0', '\0', '\0'};

    for (char32_t c = '\0'; c < 0x80; c++) {
      oneInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
      oneInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
      oneInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));

      ExpectNotShortestForm(oneInThree, c, 3);
    }

    // One-byte in four-byte.

    char oneInFour[] = {'\0', '\0', '\0', '\0', '\0'};

    for (char32_t c = '\0'; c < 0x80; c++) {
      oneInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
      oneInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
      oneInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
      oneInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));

      ExpectNotShortestForm(oneInFour, c, 4);
    }
  }

  {
    // Two-byte in three-byte.

    char twoInThree[] = {'\0', '\0', '\0', '\0'};

    for (char32_t c = 0x80; c < 0x800; c++) {
      twoInThree[0] = 0b1110'0000 ^ ((c & 0b1111'0000'0000'0000) >> 12);
      twoInThree[1] = 0b1000'0000 ^ ((c & 0b0000'1111'1100'0000) >> 6);
      twoInThree[2] = 0b1000'0000 ^ ((c & 0b0000'0000'0011'1111));

      ExpectNotShortestForm(twoInThree, c, 3);
    }

    // Two-byte in four-byte.

    char twoInFour[] = {'\0', '\0', '\0', '\0', '\0'};

    for (char32_t c = 0x80; c < 0x800; c++) {
      twoInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
      twoInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
      twoInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
      twoInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));

      ExpectNotShortestForm(twoInFour, c, 4);
    }
  }

  {
    // Three-byte in four-byte.

    char threeInFour[] = {'\0', '\0', '\0', '\0', '\0'};

    for (char32_t c = 0x800; c < 0x1'0000; c++) {
      threeInFour[0] = 0b1111'0000 ^ ((c & 0b1'1100'0000'0000'0000'0000) >> 18);
      threeInFour[1] = 0b1000'0000 ^ ((c & 0b0'0011'1111'0000'0000'0000) >> 12);
      threeInFour[2] = 0b1000'0000 ^ ((c & 0b0'0000'0000'1111'1100'0000) >> 6);
      threeInFour[3] = 0b1000'0000 ^ ((c & 0b0'0000'0000'0000'0011'1111));

      ExpectNotShortestForm(threeInFour, c, 4);
    }
  }
}

static void TestDecodeOneInvalidUtf8CodePoint() {
  TestDecodeBadLeadUnit();
  TestTooFewOrBadTrailingUnits();
  TestBadCodePoint();
  TestNotShortestForm();
}

static void TestDecodeOneUtf8CodePoint() {
  TestDecodeOneValidUtf8CodePoint();
  TestDecodeOneInvalidUtf8CodePoint();
}

int main() {
  TestUtf8Unit();
  TestIsValidUtf8();
  TestDecodeOneUtf8CodePoint();
  return 0;
}

#if defined(__clang__) && (__clang_major__ >= 6)
#  pragma clang diagnostic pop
#endif