DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Implementation

Mercurial (31ec81b5d7bb)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
/*
*******************************************************************************
*   Copyright (C) 2001-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  bocsu.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   Author: Markus W. Scherer
*
*   Modification history:
*   05/18/2001  weiv    Made into separate module
*/

#ifndef BOCSU_H
#define BOCSU_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

U_NAMESPACE_BEGIN

class ByteSink;

U_NAMESPACE_END

/*
 * "BOCSU"
 * Binary Ordered Compression Scheme for Unicode
 *
 * Specific application:
 *
 * Encode a Unicode string for the identical level of a sort key.
 * Restrictions:
 * - byte stream (unsigned 8-bit bytes)
 * - lexical order of the identical-level run must be
 *   the same as code point order for the string
 * - avoid byte values 0, 1, 2
 *
 * Method: Slope Detection
 * Remember the previous code point (initial 0).
 * For each cp in the string, encode the difference to the previous one.
 *
 * With a compact encoding of differences, this yields good results for
 * small scripts and UTF-like results otherwise.
 *
 * Encoding of differences:
 * - Similar to a UTF, encoding the length of the byte sequence in the lead bytes.
 * - Does not need to be friendly for decoding or random access
 *   (trail byte values may overlap with lead/single byte values).
 * - The signedness must be encoded as the most significant part.
 *
 * We encode differences with few bytes if their absolute values are small.
 * For correct ordering, we must treat the entire value range -10ffff..+10ffff
 * in ascending order, which forbids encoding the sign and the absolute value separately.
 * Instead, we split the lead byte range in the middle and encode non-negative values
 * going up and negative values going down.
 *
 * For very small absolute values, the difference is added to a middle byte value
 * for single-byte encoded differences.
 * For somewhat larger absolute values, the difference is divided by the number
 * of byte values available, the modulo is used for one trail byte, and the remainder
 * is added to a lead byte avoiding the single-byte range.
 * For large absolute values, the difference is similarly encoded in three bytes.
 *
 * This encoding does not use byte values 0, 1, 2, but uses all other byte values
 * for lead/single bytes so that the middle range of single bytes is as large
 * as possible.
 * Note that the lead byte ranges overlap some, but that the sequences as a whole
 * are well ordered. I.e., even if the lead byte is the same for sequences of different
 * lengths, the trail bytes establish correct order.
 * It would be possible to encode slightly larger ranges for each length (>1) by
 * subtracting the lower bound of the range. However, that would also slow down the
 * calculation.
 *
 * For the actual string encoding, an optimization moves the previous code point value
 * to the middle of its Unicode script block to minimize the differences in
 * same-script text runs.
 */

/* Do not use byte values 0, 1, 2 because they are separators in sort keys. */
#define SLOPE_MIN           3
#define SLOPE_MAX           0xff
#define SLOPE_MIDDLE        0x81

#define SLOPE_TAIL_COUNT    (SLOPE_MAX-SLOPE_MIN+1)

#define SLOPE_MAX_BYTES     4

/*
 * Number of lead bytes:
 * 1        middle byte for 0
 * 2*80=160 single bytes for !=0
 * 2*42=84  for double-byte values
 * 2*3=6    for 3-byte values
 * 2*1=2    for 4-byte values
 *
 * The sum must be <=SLOPE_TAIL_COUNT.
 *
 * Why these numbers?
 * - There should be >=128 single-byte values to cover 128-blocks
 *   with small scripts.
 * - There should be >=20902 single/double-byte values to cover Unihan.
 * - It helps CJK Extension B some if there are 3-byte values that cover
 *   the distance between them and Unihan.
 *   This also helps to jump among distant places in the BMP.
 * - Four-byte values are necessary to cover the rest of Unicode.
 *
 * Symmetrical lead byte counts are for convenience.
 * With an equal distribution of even and odd differences there is also
 * no advantage to asymmetrical lead byte counts.
 */
#define SLOPE_SINGLE        80
#define SLOPE_LEAD_2        42
#define SLOPE_LEAD_3        3
#define SLOPE_LEAD_4        1

/* The difference value range for single-byters. */
#define SLOPE_REACH_POS_1   SLOPE_SINGLE
#define SLOPE_REACH_NEG_1   (-SLOPE_SINGLE)

/* The difference value range for double-byters. */
#define SLOPE_REACH_POS_2   (SLOPE_LEAD_2*SLOPE_TAIL_COUNT+(SLOPE_LEAD_2-1))
#define SLOPE_REACH_NEG_2   (-SLOPE_REACH_POS_2-1)

/* The difference value range for 3-byters. */
#define SLOPE_REACH_POS_3   (SLOPE_LEAD_3*SLOPE_TAIL_COUNT*SLOPE_TAIL_COUNT+(SLOPE_LEAD_3-1)*SLOPE_TAIL_COUNT+(SLOPE_TAIL_COUNT-1))
#define SLOPE_REACH_NEG_3   (-SLOPE_REACH_POS_3-1)

/* The lead byte start values. */
#define SLOPE_START_POS_2   (SLOPE_MIDDLE+SLOPE_SINGLE+1)
#define SLOPE_START_POS_3   (SLOPE_START_POS_2+SLOPE_LEAD_2)

#define SLOPE_START_NEG_2   (SLOPE_MIDDLE+SLOPE_REACH_NEG_1)
#define SLOPE_START_NEG_3   (SLOPE_START_NEG_2-SLOPE_LEAD_2)

/*
 * Integer division and modulo with negative numerators
 * yields negative modulo results and quotients that are one more than
 * what we need here.
 */
#define NEGDIVMOD(n, d, m) { \
    (m)=(n)%(d); \
    (n)/=(d); \
    if((m)<0) { \
        --(n); \
        (m)+=(d); \
    } \
}

U_CFUNC void
u_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink);

U_CFUNC int32_t
u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p);

U_CFUNC uint8_t *
u_writeDiff(int32_t diff, uint8_t *p);

#endif /* #if !UCONFIG_NO_COLLATION */

#endif