DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Implementation

Mercurial (31ec81b5d7bb)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
/*
**********************************************************************
* Copyright (C) 1999-2011, International Business Machines Corporation
* and others. All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#ifndef RBT_PARS_H
#define RBT_PARS_H

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION
#ifdef __cplusplus

#include "unicode/uobject.h"
#include "unicode/parseerr.h"
#include "unicode/unorm.h"
#include "rbt.h"
#include "hash.h"
#include "uvector.h"

U_NAMESPACE_BEGIN

// Forward declarations: in this header these types are used only through
// pointers, references, or a friend declaration, so their full definitions
// are not needed here.
class TransliterationRuleData;
class UnicodeFunctor;
class ParseData;
class RuleHalf;
class ParsePosition;
class StringMatcher;

/**
 * Parser for transliterator rule strings.  Call parse() and then read the
 * results from the public data members (dataVector, idBlockVector,
 * compoundFilter).  Copying is disallowed: the copy constructor and the
 * assignment operator are declared private with no implementation.
 */
class TransliteratorParser : public UMemory {

 public:

    /**
     * PUBLIC data member.
     * A Vector of TransliterationRuleData objects, one for each discrete group
     * of rules in the rule set.
     */
    UVector dataVector;

    /**
     * PUBLIC data member.
     * A Vector of UnicodeStrings containing all of the ID blocks in the rule set.
     */
    UVector idBlockVector;

    /**
     * PUBLIC data member containing the parsed compound filter, if any.
     * See orphanCompoundFilter() for handing ownership to the caller.
     */
    UnicodeSet* compoundFilter;

 private:

    /**
     * The current data object for which we are parsing rules.
     */
    TransliterationRuleData* curData;

    /**
     * Direction of the rules being parsed.
     * NOTE(review): presumably set from the 'direction' argument of parse();
     * confirm against the implementation.
     */
    UTransDirection direction;

    /**
     * Parse error information.
     */
    UParseError parseError;

    /**
     * Temporary symbol table used during parsing.
     */
    ParseData* parseData;

    /**
     * Temporary vector of matcher variables.  When parsing is complete, this
     * is copied into the array data.variables.  As with data.variables,
     * element 0 corresponds to character data.variablesBase.
     */
    UVector variablesVector;

    /**
     * Temporary table of variable names.  When parsing is complete, this is
     * copied into data.variableNames.
     */
    Hashtable variableNames;    
    
    /**
     * String of standins for segments.  Used during the parsing of a single
     * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UnicodeString segmentStandins;

    /**
     * Vector of StringMatcher objects for segments.  Used during the
     * parsing of a single rule.
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds
     * to StringMatcher object segmentObjects.elementAt(0), etc.
     */
    UVector segmentObjects;

    /**
     * The next available stand-in for variables.  This starts at some point in
     * the private use area (discovered dynamically) and increments up toward
     * <code>variableLimit</code>.  At any point during parsing, available
     * variables are <code>variableNext..variableLimit-1</code>.
     */
    UChar variableNext;

    /**
     * The last available stand-in for variables.  This is discovered
     * dynamically.  At any point during parsing, available variables are
     * <code>variableNext..variableLimit-1</code>.
     */
    UChar variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal
     * an error, in case we are defining this variable, e.g., "$a = [a-z];".
     * Instead, we save the name of the undefined variable, and substitute
     * in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    UnicodeString undefinedVariableName;

    /**
     * The stand-in character for the 'dot' set, represented by '.' in
     * patterns.  This is allocated the first time it is needed, and
     * reused thereafter.
     */
    UChar dotStandIn;

public:

    /**
     * Constructor.
     * @param statusReturn  reference to an error code.
     *                      NOTE(review): presumably set on construction
     *                      failure; confirm against the implementation.
     */
    TransliteratorParser(UErrorCode &statusReturn);

    /**
     * Destructor.
     */
    ~TransliteratorParser();

    /**
     * Parse the given string as a sequence of rules and cause this object
     * to implement those rules.  Any previous rules are discarded.
     * Typically this method is called exactly once after construction.
     *
     * The rules are parsed in the given direction.  After this call
     * returns, query the public data members for results.  The caller
     * owns the 'data' and 'compoundFilter' data members after this
     * call returns.
     *
     * NOTE(review): earlier wording of this comment said the rules are
     * separated by newline characters ('\n'), while the parameter
     * description below says ';'.  Confirm the separator against the
     * implementation.
     * NOTE(review): no member named 'data' is declared in this class; the
     * ownership statement above presumably refers to the contents of
     * 'dataVector'.  Confirm.
     *
     * @param rules      rules, separated by ';'
     * @param direction  either FORWARD or REVERSE.
     * @param pe         Struct to receive information on position
     *                   of error if an error is encountered
     * @param ec         Output param set to success/failure code.
     */
    void parse(const UnicodeString& rules,
               UTransDirection direction,
               UParseError& pe,
               UErrorCode& ec);

    /**
     * Return the compound filter parsed by parse().  Caller owns result.
     * @return the compound filter parsed by parse().
     */
    UnicodeSet* orphanCompoundFilter();

private:

    /**
     * Parse the given rule string in the given direction.
     * NOTE(review): presumably the worker behind the public parse();
     * confirm against the implementation.
     * @param rules      the rules to parse (input only; passed by const
     *                   reference).
     * @param direction  either FORWARD or REVERSE.
     * @param status     reference to the error code, which this method may
     *                   update.
     */
    void parseRules(const UnicodeString& rules,
                    UTransDirection direction,
                    UErrorCode& status);

    /**
     * MAIN PARSER.  Parse the next rule in the given rule string, starting
     * at pos.  Return the index after the last character parsed.  Do not
     * parse characters at or after limit.
     *
     * Important:  The character at pos must be a non-whitespace character
     * that is not the comment character.
     *
     * This method handles quoting, escaping, and whitespace removal.  It
     * parses the end-of-rule character.  It recognizes context and cursor
     * indicators.  Once it does a lexical breakdown of the rule at pos, it
     * creates a rule object and adds it to our rule list.
     * @param rule       the rule string to parse (input only; passed by
     *                   const reference).
     * @param pos        the starting position.
     * @param limit      index past the last character of the rule.
     * @param status     reference to the error code, which this method may
     *                   update.
     * @return           the index after the last character parsed.
     */
    int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Set the variable range to [start, end] (inclusive).
     * @param start    the start value of the range.
     * @param end      the end value of the range.
     * @param status   reference to the error code, which this method may
     *                 update.
     */
    void setVariableRange(int32_t start, int32_t end, UErrorCode& status);

    /**
     * Assert that the given character is NOT within the variable range.
     * If it is, return FALSE.  This is necessary to ensure that the
     * variable range does not overlap characters used in a rule.
     * @param ch     the given character.
     * @return       True, if the given character is NOT within the variable range.
     */
    UBool checkVariableRange(UChar32 ch) const;

    /**
     * Set the maximum backup to 'backup', in response to a pragma
     * statement.
     * @param backup    the new value to be set.
     */
    void pragmaMaximumBackup(int32_t backup);

    /**
     * Begin normalizing all rules using the given mode, in response
     * to a pragma statement.
     * @param mode    the given mode.
     */
    void pragmaNormalizeRules(UNormalizationMode mode);

    /**
     * Return true if the given rule looks like a pragma.
     * @param rule  the rule string to examine.
     * @param pos   offset to the first non-whitespace character
     * of the rule.
     * @param limit index past the last character of the rule.
     * @return true if the given rule looks like a pragma.
     */
    static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit);

    /**
     * Parse a pragma.  This method assumes resemblesPragma() has
     * already returned true.
     * @param rule   the rule string containing the pragma.
     * @param pos    offset to the first non-whitespace character
     * of the rule.
     * @param limit  index past the last character of the rule.
     * @param status reference to the error code, which this method may
     * update.
     * @return the position index after the final ';' of the pragma,
     * or -1 on failure.
     */
    int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status);

    /**
     * Called by main parser upon syntax error.  Search the rule string
     * for the probable end of the rule.  Of course, if the error is that
     * the end of rule marker is missing, then the rule end will not be found.
     * In any case the rule start will be correctly reported.
     * @param parseErrorCode error code.
     * @param msg error description (this parameter is unnamed in the
     * declaration below).
     * @param start position of first character of current rule.
     * @param status reference to the error code, which this method may
     * update.
     * @return start position of first character of current rule.
     */
    int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start,
                        UErrorCode& status);

    /**
     * Parse a UnicodeSet out, store it, and return the stand-in character
     * used to represent it.
     *
     * @param rule    the rule for UnicodeSet.
     * @param pos     the position in pattern at which to start parsing.
     * @param status  reference to the error code, which this method may
     *                update.
     * @return        the stand-in character used to represent it.
     */
    UChar parseSet(const UnicodeString& rule,
                   ParsePosition& pos,
                   UErrorCode& status);

    /**
     * Generate and return a stand-in for a new UnicodeFunctor.  Store
     * the matcher (adopt it).
     * @param adopted the UnicodeFunctor to be adopted.
     * @param status  reference to the error code, which this method may
     *                update.
     * @return        a stand-in for a new UnicodeFunctor.
     */
    UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status);

    /**
     * Return the standin for segment seg (1-based).
     * @param seg    the given segment.
     * @param status reference to the error code, which this method may
     *               update.
     * @return       the standIn character for the given segment.
     */
    UChar getSegmentStandin(int32_t seg, UErrorCode& status);

    /**
     * Set the object for segment seg (1-based).
     * @param seg      the given segment.
     * @param adopted  the StringMatcher to be adopted.
     * @param status   reference to the error code, which this method may
     *                 update.
     */
    void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status);

    /**
     * Return the stand-in for the dot set.  It is allocated the first
     * time and reused thereafter.
     * @param status  reference to the error code, which this method may
     *                update.
     * @return    the stand-in for the dot set.
     */
    UChar getDotStandIn(UErrorCode& status);

    /**
     * Append the value of the given variable name to the given
     * UnicodeString.
     * @param name    the variable name to be appended.
     * @param buf     the given UnicodeString to append to.
     * @param status  reference to the error code, which this method may
     *                update.
     */
    void appendVariableDef(const UnicodeString& name,
                           UnicodeString& buf,
                           UErrorCode& status);

    /**
     * Glue method to get around access restrictions in C++.
     * (The declaration below is commented out.)
     */
    /*static Transliterator* createBasicInstance(const UnicodeString& id,
                                               const UnicodeString* canonID);*/

    // RuleHalf is granted access to this class's private members.
    friend class RuleHalf;

    // Disallowed methods; no impl.
    /**
     * Copy constructor (private and unimplemented, to forbid copying).
     */
    TransliteratorParser(const TransliteratorParser&);
    
    /**
     * Assignment operator (private and unimplemented, to forbid assignment).
     */
    TransliteratorParser& operator=(const TransliteratorParser&);
};

U_NAMESPACE_END

#endif /* #ifdef __cplusplus */

/**
 * Strip/convert the following from the transliterator rules:
 * comments
 * newlines
 * white space at the beginning and end of a line
 * unescape \u notation
 *
 * The target buffer must be the same size as the source.
 *
 * This declaration sits outside the __cplusplus guard, so it is also
 * visible to C callers.
 *
 * @param source     the rule text to process.
 * @param sourceLen  length of source.
 *                   NOTE(review): presumably counted in UChars; confirm
 *                   against the implementation.
 * @param target     buffer that receives the processed text.
 * @param status     pointer to the error code.
 * @return NOTE(review): presumably the length of the text written to
 *         target; confirm against the implementation.
 * @internal
 */
U_CAPI int32_t
utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status);

#endif /* #if !UCONFIG_NO_TRANSLITERATION */

#endif