DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Header

Mercurial (a1b6b5f58066)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsUnicodeRange.h"

/**********************************************************************
 * Unicode subranges as defined in unicode 3.0
 * x-western  -> latin
 *  0000 - 036f
 *  1e00 - 1eff
 *  2000 - 206f  (general punctuation)
 *  20a0 - 20cf  (currency symbols)
 *  2100 - 214f  (letterlike symbols)
 *  2150 - 218f  (Number Forms)
 * el         -> greek
 *  0370 - 03ff
 *  1f00 - 1fff
 * x-cyrillic -> cyrillic
 *  0400 - 04ff
 * he         -> hebrew
 *  0590 - 05ff
 * ar         -> arabic
 *  0600 - 06ff
 *  fb50 - fdff (arabic presentation forms)
 *  fe70 - feff (arabic presentation forms b)
 * th         -> thai
 *  0e00 - 0e7f
 * ko        -> korean
 *  ac00 - d7af  (hangul Syllables)
 *  1100 - 11ff    (jamo)
 *  3130 - 318f (hangul compatibility jamo)
 * ja
 *  3040 - 309f (hiragana)
 *  30a0 - 30ff (katakana)
 * zh-CN
 * zh-TW
 *
 * CJK
 *  3100 - 312f (bopomofo)
 *  31a0 - 31bf (bopomofo extended)
 *  3000 - 303f (CJK Symbols and Punctuation)
 *  2e80 - 2eff (CJK radicals supplement)
 *  2f00 - 2fdf (Kangxi Radicals)
 *  2ff0 - 2fff (Ideographic Description Characters)
 *  3190 - 319f (kanbun)
 *  3200 - 32ff (Enclosed CJK letters and Months)
 *  3300 - 33ff (CJK compatibility)
 *  3400 - 4dbf (CJK Unified Ideographs Extension A)
 *  4e00 - 9faf (CJK Unified Ideographs)
 *  f900 - fa5f (CJK Compatibility Ideographs)
 *  fe30 - fe4f (CJK compatibility Forms)
 *  ff00 - ffef (halfwidth and fullwidth forms)
 *
 * Armenian
 *  0530 - 058f
 * Syriac
 *  0700 - 074f
 * Thaana
 *  0780 - 07bf
 * Devanagari
 *  0900 - 097f
 * Bengali
 *  0980 - 09ff
 * Gurmukhi
 *  0a00 - 0a7f
 * Gujarati
 *  0a80 - 0aff
 * Oriya
 *  0b00 - 0b7f
 * Tamil
 *  0b80 - 0bff
 * Telugu
 *  0c00 - 0c7f
 * Kannada
 *  0c80 - 0cff
 * Malayalam
 *  0d00 - 0d7f
 * Sinhala
 *  0d80 - 0def
 * Lao
 *  0e80 - 0eff
 * Tibetan
 *  0f00 - 0fbf
 * Myanmar
 *  1000 - 109f
 * Georgian
 *  10a0 - 10ff
 * Ethiopic
 *  1200 - 137f
 * Cherokee
 *  13a0 - 13ff
 * Canadian Aboriginal Syllabics
 *  1400 - 167f
 * Ogham
 *  1680 - 169f
 * Runic
 *  16a0 - 16ff
 * Khmer
 *  1780 - 17ff
 * Mongolian
 *  1800 - 18af
 * Misc - superscripts and subscripts
 *  2070 - 209f
 * Misc - Combining Diacritical Marks for Symbols
 *  20d0 - 20ff
 * Misc - Arrows
 *  2190 - 21ff
 * Misc - Mathematical Operators
 *  2200 - 22ff
 * Misc - Miscellaneous Technical
 *  2300 - 23ff
 * Misc - Control picture
 *  2400 - 243f
 * Misc - Optical character recognition
 *  2440 - 245f
 * Misc - Enclose Alphanumerics
 *  2460 - 24ff
 * Misc - Box Drawing
 *  2500 - 257f
 * Misc - Block Elements
 *  2580 - 259f
 * Misc - Geometric Shapes
 *  25a0 - 25ff
 * Misc - Miscellaneous Symbols
 *  2600 - 267f
 * Misc - Dingbats
 *  2700 - 27bf
 * Misc - Braille Patterns
 *  2800 - 28ff
 * Yi Syllables
 *  a000 - a48f
 * Yi radicals
 *  a490 - a4cf
 * Alphabetic Presentation Forms
 *  fb00 - fb4f
 * Misc - Combining half Marks
 *  fe20 - fe2f
 * Misc - small form variants
 *  fe50 - fe6f
 * Misc - Specials
 *  fff0 - ffff
 *********************************************************************/

// gUnicodeSubrangeTable is a set of 16-entry subtables, each indexed by one
// hex digit of a BMP code point (see FindCharUnicodeRange):
//   - subtable 0 is indexed by the most significant digit (ch >> 12);
//   - an entry "kRangeTableBase + N" found there selects subtable N, which is
//     indexed by the second digit ((ch & 0x0f00) >> 8);
//   - an entry "kRangeTableBase + N" found in a second-level subtable selects
//     subtable N, which is indexed by the third digit ((ch & 0x00f0) >> 4);
//   - kRangeTertiaryTable in a second-level subtable defers the lookup to
//     gUnicodeTertiaryRangeTable.
// Any entry below kRangeTableBase is the final range for the code point.
#define NUM_OF_SUBTABLES 10
#define SUBTABLE_SIZE 16

static const uint8_t gUnicodeSubrangeTable[NUM_OF_SUBTABLES][SUBTABLE_SIZE] = {
    {
        // subtable 0: table for X--- (first level, one entry per 4096 code
        // points)
        kRangeTableBase + 1,  // u0xxx
        kRangeTableBase + 2,  // u1xxx
        kRangeTableBase + 3,  // u2xxx
        kRangeSetCJK,         // u3xxx
        kRangeSetCJK,         // u4xxx
        kRangeSetCJK,         // u5xxx
        kRangeSetCJK,         // u6xxx
        kRangeSetCJK,         // u7xxx
        kRangeSetCJK,         // u8xxx
        kRangeSetCJK,         // u9xxx
        kRangeTableBase + 4,  // uaxxx
        kRangeKorean,         // ubxxx
        kRangeKorean,         // ucxxx
        kRangeTableBase + 5,  // udxxx
        kRangePrivate,        // uexxx
        kRangeTableBase + 6   // ufxxx
    },
    {
        // subtable 1: table for 0X--
        kRangeSetLatin,  // u00xx
        kRangeSetLatin,  // u01xx
        kRangeSetLatin,  // u02xx
        kRangeGreek,     // u03xx     XXX 0300-036f is in fact
                         // kRangeCombiningDiacriticalMarks
        kRangeCyrillic,  // u04xx
        kRangeTableBase +
            7,  // u05xx, includes Cyrillic supplement, Hebrew, and Armenian
        kRangeArabic,         // u06xx
        kRangeTertiaryTable,  // u07xx
        kRangeUnassigned,     // u08xx
        kRangeTertiaryTable,  // u09xx
        kRangeTertiaryTable,  // u0axx
        kRangeTertiaryTable,  // u0bxx
        kRangeTertiaryTable,  // u0cxx
        kRangeTertiaryTable,  // u0dxx
        kRangeTertiaryTable,  // u0exx
        kRangeTibetan         // u0fxx
    },
    {
        // subtable 2: table for 1X--
        kRangeTertiaryTable,  // u10xx
        kRangeKorean,         // u11xx
        kRangeEthiopic,       // u12xx
        kRangeTertiaryTable,  // u13xx
        kRangeCanadian,       // u14xx
        kRangeCanadian,       // u15xx
        kRangeTertiaryTable,  // u16xx
        kRangeKhmer,          // u17xx
        kRangeMongolian,      // u18xx
        kRangeUnassigned,     // u19xx
        kRangeUnassigned,     // u1axx
        kRangeUnassigned,     // u1bxx
        kRangeUnassigned,     // u1cxx
        kRangeUnassigned,     // u1dxx
        kRangeSetLatin,       // u1exx
        kRangeGreek           // u1fxx
    },
    {
        // subtable 3: table for 2X--
        kRangeSetLatin,               // u20xx
        kRangeSetLatin,               // u21xx
        kRangeMathOperators,          // u22xx
        kRangeMiscTechnical,          // u23xx
        kRangeControlOpticalEnclose,  // u24xx
        kRangeBoxBlockGeometrics,     // u25xx
        kRangeMiscSymbols,            // u26xx
        kRangeDingbats,               // u27xx
        kRangeBraillePattern,         // u28xx
        kRangeUnassigned,             // u29xx
        kRangeUnassigned,             // u2axx
        kRangeUnassigned,             // u2bxx
        kRangeUnassigned,             // u2cxx
        kRangeUnassigned,             // u2dxx
        kRangeSetCJK,                 // u2exx
        kRangeSetCJK                  // u2fxx
    },
    {
        // subtable 4: table for aX--
        kRangeYi,          // ua0xx
        kRangeYi,          // ua1xx
        kRangeYi,          // ua2xx
        kRangeYi,          // ua3xx
        kRangeYi,          // ua4xx
        kRangeUnassigned,  // ua5xx
        kRangeUnassigned,  // ua6xx
        kRangeUnassigned,  // ua7xx
        kRangeUnassigned,  // ua8xx
        kRangeUnassigned,  // ua9xx
        kRangeUnassigned,  // uaaxx
        kRangeUnassigned,  // uabxx
        kRangeKorean,      // uacxx
        kRangeKorean,      // uadxx
        kRangeKorean,      // uaexx
        kRangeKorean       // uafxx
    },
    {
        // subtable 5: table for dX--
        kRangeKorean,     // ud0xx
        kRangeKorean,     // ud1xx
        kRangeKorean,     // ud2xx
        kRangeKorean,     // ud3xx
        kRangeKorean,     // ud4xx
        kRangeKorean,     // ud5xx
        kRangeKorean,     // ud6xx
        kRangeKorean,     // ud7xx
        kRangeSurrogate,  // ud8xx
        kRangeSurrogate,  // ud9xx
        kRangeSurrogate,  // udaxx
        kRangeSurrogate,  // udbxx
        kRangeSurrogate,  // udcxx
        kRangeSurrogate,  // uddxx
        kRangeSurrogate,  // udexx
        kRangeSurrogate   // udfxx
    },
    {
        // subtable 6: table for fX--
        kRangePrivate,        // uf0xx
        kRangePrivate,        // uf1xx
        kRangePrivate,        // uf2xx
        kRangePrivate,        // uf3xx
        kRangePrivate,        // uf4xx
        kRangePrivate,        // uf5xx
        kRangePrivate,        // uf6xx
        kRangePrivate,        // uf7xx
        kRangePrivate,        // uf8xx
        kRangeSetCJK,         // uf9xx
        kRangeSetCJK,         // ufaxx
        kRangeArabic,         // ufbxx, includes alphabetic presentation forms
        kRangeArabic,         // ufcxx
        kRangeArabic,         // ufdxx
        kRangeTableBase + 8,  // ufexx
        kRangeTableBase +
            9  // uffxx, halfwidth and fullwidth forms, includes Specials
    },
    {
        // subtable 7: table for 0x0500 - 0x05ff (third level, one entry per
        // 16 code points)
        kRangeCyrillic,  // u050x
        kRangeCyrillic,  // u051x
        kRangeCyrillic,  // u052x
        kRangeArmenian,  // u053x
        kRangeArmenian,  // u054x
        kRangeArmenian,  // u055x
        kRangeArmenian,  // u056x
        kRangeArmenian,  // u057x
        kRangeArmenian,  // u058x
        kRangeHebrew,    // u059x
        kRangeHebrew,    // u05ax
        kRangeHebrew,    // u05bx
        kRangeHebrew,    // u05cx
        kRangeHebrew,    // u05dx
        kRangeHebrew,    // u05ex
        kRangeHebrew     // u05fx
    },
    {
        // subtable 8: table for 0xfe00 - 0xfeff (third level)
        kRangeSetCJK,  // ufe0x
        kRangeSetCJK,  // ufe1x
        kRangeSetCJK,  // ufe2x
        kRangeSetCJK,  // ufe3x
        kRangeSetCJK,  // ufe4x
        kRangeSetCJK,  // ufe5x
        kRangeSetCJK,  // ufe6x
        kRangeArabic,  // ufe7x
        kRangeArabic,  // ufe8x
        kRangeArabic,  // ufe9x
        kRangeArabic,  // ufeax
        kRangeArabic,  // ufebx
        kRangeArabic,  // ufecx
        kRangeArabic,  // ufedx
        kRangeArabic,  // ufeex
        kRangeArabic   // ufefx
    },
    {
        // subtable 9: table for 0xff00 - 0xffff (third level)
        kRangeSetCJK,    // uff0x, fullwidth latin
        kRangeSetCJK,    // uff1x, fullwidth latin
        kRangeSetCJK,    // uff2x, fullwidth latin
        kRangeSetCJK,    // uff3x, fullwidth latin
        kRangeSetCJK,    // uff4x, fullwidth latin
        kRangeSetCJK,    // uff5x, fullwidth latin
        kRangeSetCJK,    // uff6x, halfwidth katakana
        kRangeSetCJK,    // uff7x, halfwidth katakana
        kRangeSetCJK,    // uff8x, halfwidth katakana
        kRangeSetCJK,    // uff9x, halfwidth katakana
        kRangeSetCJK,    // uffax, halfwidth hangul jamo
        kRangeSetCJK,    // uffbx, halfwidth hangul jamo
        kRangeSetCJK,    // uffcx, halfwidth hangul jamo
        kRangeSetCJK,    // uffdx, halfwidth hangul jamo
        kRangeSetCJK,    // uffex, fullwidth symbols
        kRangeSpecials,  // ufffx, Specials
    },
};

// Most scripts between U+0700 and U+16FF are assigned a chunk of 128 (0x80)
// code points, so the number of entries in the tertiary range table for that
// range is obtained by dividing (0x1700 - 0x0700) by 128.
// Exceptions: Ethiopic, Tibetan, Hangul Jamo and Canadian Aboriginal
// Syllabics take multiple chunks, and Ogham and Runic share a single chunk.
#define TERTIARY_TABLE_SIZE ((0x1700 - 0x0700) / 0x80)

// Indexed by (ch - 0x0700) >> 7 in FindCharUnicodeRange; each entry covers one
// chunk of 128 code points, and its comment names the start of that chunk
// (e.g. "u078x" is the chunk U+0780 - U+07FF).
// Entries marked "place holder" belong to u..xx rows that the second-level
// subtables of gUnicodeSubrangeTable already map to a final range instead of
// kRangeTertiaryTable, so the lookup never gets as far as this table for them.
static const uint8_t gUnicodeTertiaryRangeTable[TERTIARY_TABLE_SIZE] = {
    // table for 0x0700 - 0x16ff
    kRangeSyriac,      // u070x
    kRangeThaana,      // u078x
    kRangeUnassigned,  // u080x  place holder(resolved in the 2ndary tab.)
    kRangeUnassigned,  // u088x  place holder(resolved in the 2ndary tab.)
    kRangeDevanagari,  // u090x
    kRangeBengali,     // u098x
    kRangeGurmukhi,    // u0a0x
    kRangeGujarati,    // u0a8x
    kRangeOriya,       // u0b0x
    kRangeTamil,       // u0b8x
    kRangeTelugu,      // u0c0x
    kRangeKannada,     // u0c8x
    kRangeMalayalam,   // u0d0x
    kRangeSinhala,     // u0d8x
    kRangeThai,        // u0e0x
    kRangeLao,         // u0e8x
    kRangeTibetan,     // u0f0x  place holder(resolved in the 2ndary tab.)
    kRangeTibetan,     // u0f8x  place holder(resolved in the 2ndary tab.)
    kRangeMyanmar,     // u100x
    kRangeGeorgian,    // u108x
    kRangeKorean,      // u110x  place holder(resolved in the 2ndary tab.)
    kRangeKorean,      // u118x  place holder(resolved in the 2ndary tab.)
    kRangeEthiopic,    // u120x  place holder(resolved in the 2ndary tab.)
    kRangeEthiopic,    // u128x  place holder(resolved in the 2ndary tab.)
    kRangeEthiopic,    // u130x
    kRangeCherokee,    // u138x
    kRangeCanadian,    // u140x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,    // u148x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,    // u150x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,    // u158x  place holder(resolved in the 2ndary tab.)
    kRangeCanadian,    // u160x
    kRangeOghamRunic   // u168x  this contains two scripts, Ogham & Runic
};

// Maps a code point to one of the kRange* identifiers.
//
// Code points above U+FFFF are classified by plane only: plane 1 yields
// kRangeSMP, plane 2 yields kRangeSetCJK, and every other plane yields
// kRangeHigherPlanes.
//
// BMP code points are resolved through gUnicodeSubrangeTable one hex digit at
// a time. A two-level index is almost enough; the exceptions are u03xx and
// u05xx. u05xx (like ufexx and uffxx) is split further by a third-level
// subtable. u03xx is not: the combining diacritical marks in it are reported
// as kRangeGreek, because the font code this was written for does not care
// about that distinction. Anyone adopting this module for another purpose
// should be aware of the limitation; the tables can be extended if needed.
// Indic, Southeast Asian and some other scripts between U+0700 and U+16FF are
// resolved through gUnicodeTertiaryRangeTable instead.
uint32_t FindCharUnicodeRange(uint32_t ch) {
  // Supplementary planes: only the plane number matters.
  if (ch > 0xFFFF) {
    switch (ch >> 16) {
      case 1:
        return kRangeSMP;
      case 2:
        return kRangeSetCJK;
      default:
        return kRangeHigherPlanes;
    }
  }

  // First level: the most significant hex digit picks an entry in subtable 0.
  const uint32_t firstLevel = gUnicodeSubrangeTable[0][ch >> 12];
  if (firstLevel < kRangeTableBase) {
    // The whole 4096 code point span maps to a single range.
    return firstLevel;
  }

  // Second level: the entry names a subtable, indexed by the second digit.
  const uint32_t secondDigit = (ch >> 8) & 0x0f;
  const uint32_t secondLevel =
      gUnicodeSubrangeTable[firstLevel - kRangeTableBase][secondDigit];
  if (secondLevel < kRangeTableBase) {
    return secondLevel;
  }

  // Third level: another subtable, indexed by the third digit.
  if (secondLevel < kRangeTertiaryTable) {
    const uint32_t thirdDigit = (ch >> 4) & 0x0f;
    return gUnicodeSubrangeTable[secondLevel - kRangeTableBase][thirdDigit];
  }

  // Otherwise the code point lies in U+0700 - U+16FF, which is described in
  // blocks of 128 code points by the tertiary table.
  return gUnicodeTertiaryRangeTable[(ch - 0x0700) >> 7];
}