DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Mostly copied and pasted from
// third_party/rust/chardetng/src/lib.rs , so
// "top-level directory of this distribution" above refers to
// third_party/rust/chardetng/

#ifndef mozilla_EncodingDetector_h
#define mozilla_EncodingDetector_h

#include "mozilla/Encoding.h"

namespace mozilla {
class EncodingDetector;
};  // namespace mozilla

#define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector

#include "chardetng.h"

namespace mozilla {

/**
 * A Web browser-oriented detector for guessing what character
 * encoding a stream of bytes is encoded in.
 *
 * The bytes are fed to the detector incrementally using the `feed`
 * method. The current guess of the detector can be queried using
 * the `guess` method. The guessing parameters are arguments to the
 * `guess` method rather than arguments to the constructor in order
 * to enable the application to check if the arguments affect the
 * guessing outcome. (The specific use case is to disable UI for
 * re-running the detector with UTF-8 allowed and the top-level
 * domain name ignored if those arguments don't change the guess.)
 */
class EncodingDetector final {
 public:
  ~EncodingDetector() = default;

  static void operator delete(void* aDetector) {
    chardetng_encoding_detector_free(
        reinterpret_cast<EncodingDetector*>(aDetector));
  }

  /**
   * Creates a new instance of the detector.
   */
  static inline UniquePtr<EncodingDetector> Create() {
    UniquePtr<EncodingDetector> detector(chardetng_encoding_detector_new());
    return detector;
  }

  /**
   * Inform the detector of a chunk of input.
   *
   * The byte stream is represented as a sequence of calls to this
   * method such that the concatenation of the arguments to this
   * method form the byte stream. It does not matter how the application
   * chooses to chunk the stream. It is OK to call this method with
   * a zero-length byte slice.
   *
   * The end of the stream is indicated by calling this method with
   * `aLast` set to `true`. In that case, the end of the stream is
   * considered to occur after the last byte of the `aBuffer` (which
   * may be zero-length) passed in the same call. Once this method
   * has been called with `last` set to `true` this method must not
   * be called again.
   *
   * If you want to perform detection on just the prefix of a longer
   * stream, do not pass `aLast=true` after the prefix if the stream
   * actually still continues.
   *
   * Returns `true` if after processing `aBuffer` the stream has
   * contained at least one non-ASCII byte and `false` if only
   * ASCII has been seen so far.
   *
   * # Panics
   *
   * If this method has previously been called with `aLast` set to `true`.
   */
  inline bool Feed(Span<const uint8_t> aBuffer, bool aLast) {
    return chardetng_encoding_detector_feed(this, aBuffer.Elements(),
                                            aBuffer.Length(), aLast);
  }

  /**
   * Guess the encoding given the bytes pushed to the detector so far
   * (via `Feed()`), the top-level domain name from which the bytes were
   * loaded, and an indication of whether to consider UTF-8 as a permissible
   * guess.
   *
   * The `aTld` argument takes the rightmost DNS label of the hostname of the
   * host the stream was loaded from in lower-case ASCII form. That is, if
   * the label is an internationalized top-level domain name, it must be
   * provided in its Punycode form. If the TLD that the stream was loaded
   * from is unavalable, an empty `Spane` may be passed instead, which is
   * equivalent to passing a `Span` for "com".
   *
   * If the `aAllowUTF8` argument is set to `false`, the return value of
   * this method won't be `UTF_8_ENCODING`. When performing detection
   * on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
   * unless the user has taken a specific contextual action to request an
   * override. This way, Web developers cannot start depending on UTF-8
   * detection. Such reliance would make the Web Platform more brittle.
   *
   * Returns the guessed encoding.
   *
   * # Panics
   *
   * If `aTld` contains non-ASCII, period, or upper-case letters. (The panic
   * condition is intentionally limited to signs of failing to extract the
   * label correctly, failing to provide it in its Punycode form, and failure
   * to lower-case it. Full DNS label validation is intentionally not performed
   * to avoid panics when the reality doesn't match the specs.)
   */
  inline mozilla::NotNull<const mozilla::Encoding*> Guess(
      Span<const char> aTLD, bool aAllowUTF8) const {
    return WrapNotNull(chardetng_encoding_detector_guess(
        this, aTLD.Elements(), aTLD.Length(), aAllowUTF8));
  }

 private:
  EncodingDetector() = delete;
  EncodingDetector(const EncodingDetector&) = delete;
  EncodingDetector& operator=(const EncodingDetector&) = delete;
};

};  // namespace mozilla

#endif  // mozilla_EncodingDetector_h