DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Implementation

Mercurial (31ec81b5d7bb)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
/*
***************************************************************************
* Copyright (C) 2008-2011, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*
*  uspoof_impl.h
*
*    Implemenation header for spoof detection
*
*/

#ifndef USPOOFIM_H
#define USPOOFIM_H

#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "utrie2.h"
#include "unicode/uscript.h"
#include "unicode/udata.h"


#if !UCONFIG_NO_NORMALIZATION

#ifdef __cplusplus

U_NAMESPACE_BEGIN

// The maximium length (in UTF-16 UChars) of the skeleton replacement string resulting from
//   a single input code point.  This is function of the unicode.org data.
#define USPOOF_MAX_SKELETON_EXPANSION 20

// The default stack buffer size for copies or conversions or normalizations
// of input strings being checked.  (Used in multiple places.)
#define USPOOF_STACK_BUFFER_SIZE 100

// Magic number for sanity checking spoof data.
#define USPOOF_MAGIC 0x3845fdef

class SpoofData;
struct SpoofDataHeader;
struct SpoofStringLengthsElement;
class ScriptSet;

/**
  *  Class SpoofImpl corresponds directly to the plain C API opaque type
  *  USpoofChecker.  One can be cast to the other.
  */
class SpoofImpl : public UObject  {
public:
	SpoofImpl(SpoofData *data, UErrorCode &status);
	SpoofImpl();
	virtual ~SpoofImpl();

    /** Copy constructor, used by the user level uspoof_clone() function.
     */
    SpoofImpl(const SpoofImpl &src, UErrorCode &status);
    
    static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status);
    static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status);

    /** Get the confusable skeleton transform for a single code point.
     *  The result is a string with a length between 1 and 18.
     *  @param    tableMask  bit flag specifying which confusable table to use.
     *                       One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
     *  @return   The length in UTF-16 code units of the substition string.
     */  
    int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const;

    /** Set and Get AllowedLocales, implementations of the corresponding API */
    void setAllowedLocales(const char *localesList, UErrorCode &status);
    const char * getAllowedLocales(UErrorCode &status);

    // Add (union) to the UnicodeSet all of the characters for the scripts used for
    // the specified locale.  Part of the implementation of setAllowedLocales.
    void addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status);


    /** parse a hex number.  Untility used by the builders.   */
    static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);

    // Implementation for Whole Script tests.
    // Return the test bit flag to be ORed into the eventual user return value
    //    if a Spoof opportunity is detected.
    void wholeScriptCheck(
        const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const;
	    
    /** Scan a string to determine how many scripts it includes.
     * Ignore characters with script=Common and scirpt=Inherited.
     * @param    text     The UChar text to be scanned
     * @param    length   The length of the input text, -1 for nul termintated.
     * @param    pos      An out parameter, set to the first input postion at which
     *                    a second script was encountered, ignoring Common and Inherited.
     * @param    status   For errors.
     * @return            the number of (non-common,inherited) scripts encountered,
     *                    clipped to a max of two.
     */
    int32_t scriptScan(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const;

    static UClassID U_EXPORT2 getStaticClassID(void);
    virtual UClassID getDynamicClassID(void) const;

    //
    // Data Members
    //

    int32_t           fMagic;             // Internal sanity check.
    int32_t           fChecks;            // Bit vector of checks to perform.

    SpoofData        *fSpoofData;
    
    const UnicodeSet *fAllowedCharsSet;   // The UnicodeSet of allowed characters.
                                          //   for this Spoof Checker.  Defaults to all chars. 

    const char       *fAllowedLocales;    // The list of allowed locales.
};



//
//  Confusable Mappings Data Structures
//
//    For the confusable data, we are essentially implementing a map,
//       key:    a code point
//       value:  a string.  Most commonly one char in length, but can be more.
//
//    The keys are stored as a sorted array of 32 bit ints.
//             bits 0-23    a code point value
//             bits 24-31   flags
//                24:  1 if entry applies to SL table
//                25:  1 if entry applies to SA table
//                26:  1 if entry applies to ML table
//                27:  1 if entry applies to MA table
//                28:  1 if there are multiple entries for this code point.
//                29-30:  length of value string, in UChars.
//                         values are (1, 2, 3, other)
//        The key table is sorted in ascending code point order.  (not on the
//        32 bit int value, the flag bits do not participate in the sorting.)
//
//        Lookup is done by means of a binary search in the key table.
//
//    The corresponding values are kept in a parallel array of 16 bit ints.
//        If the value string is of length 1, it is literally in the value array.
//        For longer strings, the value array contains an index into the strings table.
//
//    String Table:
//       The strings table contains all of the value strings (those of length two or greater)
//       concatentated together into one long UChar (UTF-16) array.
//
//       The array is arranged by length of the strings - all strings of the same length
//       are stored together.  The sections are ordered by length of the strings -
//       all two char strings first, followed by all of the three Char strings, etc.
//
//       There is no nul character or other mark between adjacent strings.
//
//    String Lengths table
//       The length of strings from 1 to 3 is flagged in the key table.
//       For strings of length 4 or longer, the string length table provides a
//       mapping between an index into the string table and the corresponding length.
//       Strings of these lengths are rare, so lookup time is not an issue.
//       Each entry consists of
//            uint16_t      index of the _last_ string with this length
//            uint16_t      the length
//

// Flag bits in the Key entries
#define USPOOF_SL_TABLE_FLAG (1<<24)
#define USPOOF_SA_TABLE_FLAG (1<<25)
#define USPOOF_ML_TABLE_FLAG (1<<26)
#define USPOOF_MA_TABLE_FLAG (1<<27)
#define USPOOF_KEY_MULTIPLE_VALUES (1<<28)
#define USPOOF_KEY_LENGTH_SHIFT 29
#define USPOOF_KEY_LENGTH_FIELD(x) (((x)>>29) & 3)


struct SpoofStringLengthsElement {
    uint16_t      fLastString;         // index in string table of last string with this length
    uint16_t      fStrLength;           // Length of strings
};


//-------------------------------------------------------------------------------
//
//  ScriptSet - Wrapper class for the Script code bit sets that are part of the
//              whole script confusable data.
//
//              This class is used both at data build and at run time.
//              The constructor is only used at build time.
//              At run time, just point at the prebuilt data and go.
//  
//-------------------------------------------------------------------------------
class ScriptSet: public UMemory {
  public:
    ScriptSet();
    ~ScriptSet();

    UBool operator == (const ScriptSet &other);
    ScriptSet & operator = (const ScriptSet &other);

    void Union(const ScriptSet &other);
    void Union(UScriptCode script);
    void intersect(const ScriptSet &other);
    void intersect(UScriptCode script);
    void setAll();
    void resetAll();
    int32_t countMembers();

  private:
    uint32_t  bits[6];
};




//-------------------------------------------------------------------------------
//
//  NFDBuffer   A little class to handle the NFD normalization that is
//               needed on incoming identifiers to be checked.
//               Takes care of buffer handling and normalization
//
//               Instances of this class are intended to be stack-allocated.
//
//               TODO:  how to map position offsets back to user values?
//
//--------------------------------------------------------------------------------
class NFDBuffer: public UMemory {
public:
    NFDBuffer(const UChar *text, int32_t length, UErrorCode &status);
    ~NFDBuffer();
    const UChar *getBuffer();
    int32_t getLength();

  private:
    const UChar *fOriginalText;
    UChar       *fNormalizedText;
    int32_t      fNormalizedTextLength;
    UChar        fSmallBuf[USPOOF_STACK_BUFFER_SIZE];
};





//-------------------------------------------------------------------------------------
//
//  SpoofData
//
//    A small class that wraps the raw (usually memory mapped) spoof data.
//    Serves two primary functions:
//      1.  Convenience.  Contains real pointers to the data, to avoid dealing with
//          the offsets in the raw data.
//      2.  Reference counting.  When a spoof checker is cloned, the raw data is shared
//          and must be retained until all checkers using the data are closed.
//    Nothing in this struct includes state that is specific to any particular
//    USpoofDetector object.
//
//---------------------------------------------------------------------------------------
class SpoofData: public UMemory {
  public:
    static SpoofData *getDefault(UErrorCode &status);   // Load standard ICU spoof data.
    SpoofData(UErrorCode &status);   // Create new spoof data wrapper.
                                     // Only used when building new data from rules.
    
    // Constructor for use when creating from prebuilt default data.
    //   A UDataMemory is what the ICU internal data loading functions provide.
    //   The udm is adopted by the SpoofData.
    SpoofData(UDataMemory *udm, UErrorCode &status);

    // Constructor for use when creating from serialized data.
    //
    SpoofData(const void *serializedData, int32_t length, UErrorCode &status);

    //  Check raw Spoof Data Version compatibility.
    //  Return TRUE it looks good.
    static UBool validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status);
    ~SpoofData();                    // Destructor not normally used.
                                     // Use removeReference() instead.
    // Reference Counting functions.
    //    Clone of a user-level spoof detector increments the ref count on the data.
    //    Close of a user-level spoof detector decrements the ref count.
    //    If the data is owned by us, it will be deleted when count goes to zero.
    SpoofData *addReference(); 
    void removeReference();

    // Reserve space in the raw data.  For use by builder when putting together a
    //   new set of data.  Init the new storage to zero, to prevent inconsistent
    //   results if it is not all otherwise set by the requester.
    //  Return:
    //    pointer to the new space that was added by this function.
    void *reserveSpace(int32_t numBytes, UErrorCode &status);

    // initialize the pointers from this object to the raw data.
    void initPtrs(UErrorCode &status);

    // Reset all fields to an initial state.
    // Called from the top of all constructors.
    void reset();
    
    SpoofDataHeader             *fRawData;          // Ptr to the raw memory-mapped data
    UBool                       fDataOwned;         // True if the raw data is owned, and needs
                                                    //  to be deleted when refcount goes to zero.
    UDataMemory                 *fUDM;              // If not NULL, our data came from a
                                                    //   UDataMemory, which we must close when
                                                    //   we are done.

    uint32_t                    fMemLimit;          // Limit of available raw data space
    int32_t                     fRefCount;

    // Confusable data
    int32_t                     *fCFUKeys;
    uint16_t                    *fCFUValues;
    SpoofStringLengthsElement   *fCFUStringLengths;
    UChar                       *fCFUStrings;

    // Whole Script Confusable Data
    UTrie2                      *fAnyCaseTrie;
    UTrie2                      *fLowerCaseTrie;
    ScriptSet                   *fScriptSets;
    };
    

//---------------------------------------------------------------------------------------
//
//  Raw Binary Data Formats, as loaded from the ICU data file,
//    or as built by the builder.
//
//---------------------------------------------------------------------------------------
struct SpoofDataHeader {
    int32_t       fMagic;                // (0x3845fdef)
    uint8_t       fFormatVersion[4];     // Data Format. Same as the value in struct UDataInfo
                                         //   if there is one associated with this data.
    int32_t       fLength;               // Total lenght in bytes of this spoof data,
                                         //   including all sections, not just the header.

    // The following four sections refer to data representing the confusable data
    //   from the Unicode.org data from "confusables.txt"

    int32_t       fCFUKeys;               // byte offset to Keys table (from SpoofDataHeader *)
    int32_t       fCFUKeysSize;           // number of entries in keys table  (32 bits each)

    // TODO: change name to fCFUValues, for consistency.
    int32_t       fCFUStringIndex;        // byte offset to String Indexes table
    int32_t       fCFUStringIndexSize;    // number of entries in String Indexes table (16 bits each)
                                          //     (number of entries must be same as in Keys table

    int32_t       fCFUStringTable;        // byte offset of String table
    int32_t       fCFUStringTableLen;     // length of string table (in 16 bit UChars)

    int32_t       fCFUStringLengths;      // byte offset to String Lengths table
    int32_t       fCFUStringLengthsSize;  // number of entries in lengths table. (2 x 16 bits each)


    // The following sections are for data from confusablesWholeScript.txt
    
    int32_t       fAnyCaseTrie;           // byte offset to the serialized Any Case Trie
    int32_t       fAnyCaseTrieLength;     // Length (bytes) of the serialized Any Case Trie
    
    int32_t       fLowerCaseTrie;         // byte offset to the serialized Lower Case Trie
    int32_t       fLowerCaseTrieLength;   // Length (bytes) of the serialized Lower Case Trie

    int32_t       fScriptSets;            // byte offset to array of ScriptSets
    int32_t       fScriptSetsLength;      // Number of ScriptSets (24 bytes each)
    

    // The following sections are for data from xidmodifications.txt
    
    
    int32_t       unused[15];              // Padding, Room for Expansion
    
 }; 



    
//
//  Structure for the Whole Script Confusable Data
//    See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
//    Whole Script confusable data
//
//  The data provides mappings from code points to a set of scripts
//    that contain characters that might be confused with the code point.
//  There are two mappings, one for lower case only, and one for characters
//    of any case.
//
//  The actual data consists of a utrie2 to map from a code point to an offset,
//  and an array of UScriptSets (essentially bit maps) that is indexed
//  by the offsets obtained from the Trie.
//
//


U_NAMESPACE_END
#endif /* __cplusplus */

/**
  * Endianness swap function for binary spoof data.
  * @internal
  */
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
            UErrorCode *status);


#endif

#endif  /* USPOOFIM_H */