DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Implementation

Mercurial (31ec81b5d7bb)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205
/*
 **********************************************************************
 *   Copyright (C) 2005-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */

#ifndef __CSRMBCS_H
#define __CSRMBCS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_CONVERSION

#include "csrecog.h"

U_NAMESPACE_BEGIN

// "Character"  iterated character class.
//    Recognizers for specific mbcs encodings make their "characters" available
//    by providing a nextChar() function that fills in an instance of IteratedChar
//    with the next char from the input.
//    The returned characters are not converted to Unicode, but remain as the raw
//    bytes (concatenated into an int) from the codepage data.
//
//  For Asian charsets, use the raw input rather than the input that has been
//   stripped of markup.  Detection only considers multi-byte chars, effectively
//   stripping markup anyway, and double byte chars do occur in markup too.
//
class IteratedChar : public UMemory
{
public:
    uint32_t charValue;             // 1-4 bytes from the raw input data
    int32_t  index;
    int32_t  nextIndex;
    UBool    error;
    UBool    done;

public:
    IteratedChar();
    //void reset();
    int32_t nextByte(InputText* det);
};


class CharsetRecog_mbcs : public CharsetRecognizer {

protected:
    /**
     * Test the match of this charset with the input text data
     *      which is obtained via the CharsetDetector object.
     *
     * @param det  The CharsetDetector, which contains the input text
     *             to be checked for being in this charset.
     * @return     Two values packed into one int  (Damn java, anyhow)
     *             <br/>
     *             bits 0-7:  the match confidence, ranging from 0-100
     *             <br/>
     *             bits 8-15: The match reason, an enum-like value.
     */
    int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;

public:

    virtual ~CharsetRecog_mbcs();

    /**
     * Get the IANA name of this charset.
     * @return the charset name.
     */

    const char *getName() const = 0;
    const char *getLanguage() const = 0;
    UBool match(InputText* input, CharsetMatch *results) const = 0;

    /**
     * Get the next character (however many bytes it is) from the input data
     *    Subclasses for specific charset encodings must implement this function
     *    to get characters according to the rules of their encoding scheme.
     *
     *  This function is not a method of class IteratedChar only because
     *   that would require a lot of extra derived classes, which is awkward.
     * @param it  The IteratedChar "struct" into which the returned char is placed.
     * @param det The charset detector, which is needed to get at the input byte data
     *            being iterated over.
     * @return    True if a character was returned, false at end of input.
     */
    virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;

};


/**
 *   Shift-JIS charset recognizer.
 *
 */
class CharsetRecog_sjis : public CharsetRecog_mbcs {
public:
    virtual ~CharsetRecog_sjis();

    UBool nextChar(IteratedChar *it, InputText *det) const;

    UBool match(InputText* input, CharsetMatch *results) const;

    const char *getName() const;
    const char *getLanguage() const;

};


/**
 *   EUC charset recognizers.  One abstract class that provides the common function
 *             for getting the next character according to the EUC encoding scheme,
 *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
 *
 */
class CharsetRecog_euc : public CharsetRecog_mbcs
{
public:
    virtual ~CharsetRecog_euc();

    const char *getName() const = 0;
    const char *getLanguage() const = 0;

    UBool match(InputText* input, CharsetMatch *results) const = 0;
    /*
     *  (non-Javadoc)
     *  Get the next character value for EUC based encodings.
     *  Character "value" is simply the raw bytes that make up the character
     *     packed into an int.
     */
    UBool nextChar(IteratedChar *it, InputText *det) const;
};

/**
 * The charset recognize for EUC-JP.  A singleton instance of this class
 *    is created and kept by the public CharsetDetector class
 */
class CharsetRecog_euc_jp : public CharsetRecog_euc
{
public:
    virtual ~CharsetRecog_euc_jp();

    const char *getName() const;
    const char *getLanguage() const;

    UBool match(InputText* input, CharsetMatch *results) const;
};

/**
 * The charset recognize for EUC-KR.  A singleton instance of this class
 *    is created and kept by the public CharsetDetector class
 */
class CharsetRecog_euc_kr : public CharsetRecog_euc
{
public:
    virtual ~CharsetRecog_euc_kr();

    const char *getName() const;
    const char *getLanguage() const;

    UBool match(InputText* input, CharsetMatch *results) const;
};

/**
 *
 *   Big5 charset recognizer.
 *
 */
class CharsetRecog_big5 : public CharsetRecog_mbcs
{
public:
    virtual ~CharsetRecog_big5();

    UBool nextChar(IteratedChar* it, InputText* det) const;

    const char *getName() const;
    const char *getLanguage() const;

    UBool match(InputText* input, CharsetMatch *results) const;
};


/**
 *
 *   GB-18030 recognizer. Uses simplified Chinese statistics.
 *
 */
class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
{
public:
    virtual ~CharsetRecog_gb_18030();

    UBool nextChar(IteratedChar* it, InputText* det) const;

    const char *getName() const;
    const char *getLanguage() const;

    UBool match(InputText* input, CharsetMatch *results) const;
};

U_NAMESPACE_END

#endif
#endif /* __CSRMBCS_H */