DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Header

Mercurial (31ec81b5d7bb)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
/*
*******************************************************************************
*   Copyright (C) 2001-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  bocsu.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   Author: Markus W. Scherer
*
*   Modification history:
*   05/18/2001  weiv    Made into separate module
*/


#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/bytestream.h"
#include "unicode/utf16.h"
#include "bocsu.h"

/*
 * encode one difference value -0x10ffff..+0x10ffff in 1..3 bytes,
 * preserving lexical order
 */
U_CFUNC uint8_t *
u_writeDiff(int32_t diff, uint8_t *p) {
    if(diff>=SLOPE_REACH_NEG_1) {
        if(diff<=SLOPE_REACH_POS_1) {
            *p++=(uint8_t)(SLOPE_MIDDLE+diff);
        } else if(diff<=SLOPE_REACH_POS_2) {
            *p++=(uint8_t)(SLOPE_START_POS_2+(diff/SLOPE_TAIL_COUNT));
            *p++=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
        } else if(diff<=SLOPE_REACH_POS_3) {
            p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            diff/=SLOPE_TAIL_COUNT;
            p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            *p=(uint8_t)(SLOPE_START_POS_3+(diff/SLOPE_TAIL_COUNT));
            p+=3;
        } else {
            p[3]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            diff/=SLOPE_TAIL_COUNT;
            p[2]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            diff/=SLOPE_TAIL_COUNT;
            p[1]=(uint8_t)(SLOPE_MIN+diff%SLOPE_TAIL_COUNT);
            *p=SLOPE_MAX;
            p+=4;
        }
    } else {
        int32_t m;

        if(diff>=SLOPE_REACH_NEG_2) {
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            *p++=(uint8_t)(SLOPE_START_NEG_2+diff);
            *p++=(uint8_t)(SLOPE_MIN+m);
        } else if(diff>=SLOPE_REACH_NEG_3) {
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[2]=(uint8_t)(SLOPE_MIN+m);
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[1]=(uint8_t)(SLOPE_MIN+m);
            *p=(uint8_t)(SLOPE_START_NEG_3+diff);
            p+=3;
        } else {
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[3]=(uint8_t)(SLOPE_MIN+m);
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[2]=(uint8_t)(SLOPE_MIN+m);
            NEGDIVMOD(diff, SLOPE_TAIL_COUNT, m);
            p[1]=(uint8_t)(SLOPE_MIN+m);
            *p=SLOPE_MIN;
            p+=4;
        }
    }
    return p;
}

/*
 * Encode the code points of a string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if "prev" is always moved
 * into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan
 * area into the middle of that area.
 * Note that the identical-level run in a sort key is generated from
 * NFD text - there are never Hangul characters included.
 */
U_CFUNC void
u_writeIdenticalLevelRun(const UChar *s, int32_t length, icu::ByteSink &sink) {
    char scratch[64];
    int32_t capacity;

    UChar32 prev=0;
    int32_t i=0;
    while(i<length) {
        char *buffer=sink.GetAppendBuffer(1, length*2, scratch, (int32_t)sizeof(scratch), &capacity);
        uint8_t *p;
        // We must have capacity>=SLOPE_MAX_BYTES in case u_writeDiff() writes that much,
        // but we do not want to force the sink.GetAppendBuffer() to allocate
        // for a large min_capacity because we might actually only write one byte.
        if(capacity<16) {
            buffer=scratch;
            capacity=(int32_t)sizeof(scratch);
        }
        p=reinterpret_cast<uint8_t *>(buffer);
        uint8_t *lastSafe=p+capacity-SLOPE_MAX_BYTES;
        while(i<length && p<=lastSafe) {
            if(prev<0x4e00 || prev>=0xa000) {
                prev=(prev&~0x7f)-SLOPE_REACH_NEG_1;
            } else {
                /*
                 * Unihan U+4e00..U+9fa5:
                 * double-bytes down from the upper end
                 */
                prev=0x9fff-SLOPE_REACH_POS_2;
            }

            UChar32 c;
            U16_NEXT(s, i, length, c);
            p=u_writeDiff(c-prev, p);
            prev=c;
        }
        sink.Append(buffer, (int32_t)(p-reinterpret_cast<uint8_t *>(buffer)));
    }
}

U_CFUNC int32_t
u_writeIdenticalLevelRunTwoChars(UChar32 first, UChar32 second, uint8_t *p) {
    uint8_t *p0 = p;
    if(first<0x4e00 || first>=0xa000) {
        first=(first&~0x7f)-SLOPE_REACH_NEG_1;
    } else {
        /*
         * Unihan U+4e00..U+9fa5:
         * double-bytes down from the upper end
         */
        first=0x9fff-SLOPE_REACH_POS_2;
    }

    p=u_writeDiff(second-first, p);
    return (int32_t)(p-p0);
}

#endif /* #if !UCONFIG_NO_COLLATION */