// mozilla-central/extensions/spellcheck/src/mozInlineSpellWordUtil.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozInlineSpellWordUtil.h"

#include <algorithm>

#include <utility>

#include "mozilla/BinarySearch.h"

#include "mozilla/EditorBase.h"

#include "mozilla/HTMLEditor.h"

#include "mozilla/Logging.h"

#include "mozilla/dom/Element.h"

#include "nsDebug.h"

#include "nsAtom.h"

#include "nsComponentManagerUtils.h"

#include "nsUnicodeProperties.h"

#include "nsServiceManagerUtils.h"

#include "nsIContent.h"

#include "nsTextFragment.h"

#include "nsRange.h"

#include "nsContentUtils.h"

#include "nsIFrame.h"

using namespace mozilla;

static LazyLogModule sInlineSpellWordUtilLog{"InlineSpellWordUtil"};

// IsIgnorableCharacter
//
//    These characters are ones that we should ignore in input.
//
//    8-bit overload: the only ignorable character is SOFT HYPHEN (0xAD).
//    Returns true when |ch| is that character.
inline bool IsIgnorableCharacter(char ch) {
  return (ch == static_cast<char>(0xAD));  // SOFT HYPHEN
}

// 16-bit overload of IsIgnorableCharacter: returns true for the two soft
// hyphen code points that should be ignored in input.
inline bool IsIgnorableCharacter(char16_t ch) {
  return (ch == 0xAD ||   // SOFT HYPHEN
          ch == 0x1806);  // MONGOLIAN TODO SOFT HYPHEN
}

// IsConditionalPunctuation
//
//    Some characters (like apostrophes) require characters on each side to be
//    part of a word, and are otherwise punctuation.
//
//    8-bit overload: recognizes the ASCII apostrophe and MIDDLE DOT (0xB7).
inline bool IsConditionalPunctuation(char ch) {
  return (ch == '\'' ||                    // APOSTROPHE
          ch == static_cast<char>(0xB7));  // MIDDLE DOT
}

// 16-bit overload of IsConditionalPunctuation: recognizes the ASCII
// apostrophe, RIGHT SINGLE QUOTATION MARK and MIDDLE DOT.
inline bool IsConditionalPunctuation(char16_t ch) {
  return (ch == '\'' ||    // APOSTROPHE
          ch == 0x2019 ||  // RIGHT SINGLE QUOTATION MARK
          ch == 0x00B7);   // MIDDLE DOT
}

// Returns true for characters that may or may not act as a DOM word
// separator depending on their context: '@', ':', '.', '/', '-' and any
// conditional punctuation.
static bool IsAmbiguousDOMWordSeprator(char16_t ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  return (ch == '@' || ch == ':' || ch == '.' || ch == '/' || ch == '-' ||
          IsConditionalPunctuation(ch));
}

// 8-bit overload; widens |ch| and defers to the 16-bit overload.
static bool IsAmbiguousDOMWordSeprator(char ch) {
  // This class may be CHAR_CLASS_SEPARATOR, but it depends on context.
  //
  // Widen through unsigned char first: where plain char is signed, casting
  // 0xB7 (MIDDLE DOT) straight to char16_t sign-extends it to 0xFFB7 and the
  // 16-bit check would never match it.
  return IsAmbiguousDOMWordSeprator(
      static_cast<char16_t>(static_cast<unsigned char>(ch)));
}

// IsDOMWordSeparator
//
//    Determines if the given character should be considered as a DOM Word
//    separator. Basically, this is whitespace, although it could also have
//    certain punctuation that we know ALWAYS breaks words. This is important.
//    For example, we can't have any punctuation that could appear in a URL
//    or email address in this, because those need to always fit into a single
//    DOM word.
//
//    8-bit overload: space, tab, LF, CR and NO-BREAK SPACE (0xA0).
static bool IsDOMWordSeparator(char ch) {
  // simple spaces or no-break space
  return (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' ||
          ch == static_cast<char>(0xA0));
}

// 16-bit overload of IsDOMWordSeparator: ASCII whitespace plus a fixed set of
// non-ASCII space characters.
static bool IsDOMWordSeparator(char16_t ch) {
  // simple spaces
  if (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r') {
    return true;
  }

  // complex spaces - check only if char isn't ASCII (uncommon)
  if (ch >= 0xA0 && (ch == 0x00A0 ||  // NO-BREAK SPACE
                     ch == 0x2002 ||  // EN SPACE
                     ch == 0x2003 ||  // EM SPACE
                     ch == 0x2009 ||  // THIN SPACE
                     ch == 0x3000)) {  // IDEOGRAPHIC SPACE
    return true;
  }

  // otherwise not a space
  return false;
}

// Compares this node/offset pair with a range boundary. They are equal when
// the boundary's container is mNode and the boundary has a valid offset equal
// to mOffset.
bool NodeOffset::operator==(
    const mozilla::RangeBoundary& aRangeBoundary) const {
  if (aRangeBoundary.Container() != mNode) {
    return false;
  }

  const Maybe<uint32_t> rangeBoundaryOffset =
      aRangeBoundary.Offset(RangeBoundary::OffsetFilter::kValidOffsets);

  // mOffset is signed; it has to be non-negative for the unsigned comparison
  // below to be meaningful.
  MOZ_ASSERT(mOffset >= 0);
  return rangeBoundaryOffset &&
         (*rangeBoundaryOffset == static_cast<uint32_t>(mOffset));
}

// A NodeOffsetRange equals an nsRange when both its begin and its end match
// the range's start and end boundaries respectively.
bool NodeOffsetRange::operator==(const nsRange& aRange) const {
  return mBegin == aRange.StartRef() && mEnd == aRange.EndRef();
}

// static
//
// Builds a mozInlineSpellWordUtil for the given editor. Returns Nothing()
// (after an NS_WARN_IF warning) when the editor has no document or no root
// node.
Maybe<mozInlineSpellWordUtil> mozInlineSpellWordUtil::Create(
    const EditorBase& aEditorBase) {
  dom::Document* document = aEditorBase.GetDocument();
  if (NS_WARN_IF(!document)) {
    return Nothing();
  }

  const bool isContentEditableOrDesignMode = aEditorBase.IsHTMLEditor();

  // Find the root node for the editor. For contenteditable the mRootNode could
  // change to shadow root if the begin and end are inside the shadowDOM.
  nsINode* rootNode = aEditorBase.GetRoot();
  if (NS_WARN_IF(!rootNode)) {
    return Nothing();
  }

  mozInlineSpellWordUtil util{*document, isContentEditableOrDesignMode,
                              *rootNode};
  return Some(std::move(util));
}

// Returns true when aNode is a text node that should be spell checked, i.e. a
// text node whose parent is not an HTML <script> or <style> element.
static inline bool IsSpellCheckingTextNode(nsINode* aNode) {
  nsIContent* parent = aNode->GetParent();
  if (parent &&
      parent->IsAnyOfHTMLElements(nsGkAtoms::script, nsGkAtoms::style)) {
    return false;
  }
  return aNode->IsText();
}

typedef void (*OnLeaveNodeFunPtr)(nsINode* aNode, void* aClosure);

// Find the next node in the DOM tree in preorder.

// Calls OnLeaveNodeFunPtr when the traversal leaves a node, which is

// why we can't just use GetNextNode here, sadly.

static nsINode* FindNextNode(nsINode* aNode, const nsINode* aRoot,

                             OnLeaveNodeFunPtr aOnLeaveNode, void* aClosure) {

  MOZ_ASSERT(aNode, "Null starting node?");

  nsINode* next = aNode->GetFirstChild();

  if (next) return next;

  // Don't look at siblings or otherwise outside of aRoot

  if (aNode == aRoot) return nullptr;

  next = aNode->GetNextSibling();

  if (next) return next;

  // Go up

  for (;;) {

    if (aOnLeaveNode) {

      aOnLeaveNode(aNode, aClosure);

    next = aNode->GetParent();

    if (next == aRoot || !next) return nullptr;

    aNode = next;

    next = aNode->GetNextSibling();

    if (next) return next;

// aNode is not a text node. Find the first text node starting at aNode/aOffset
// in a preorder DOM traversal.
//
// Returns the first node at or after that position for which
// IsSpellCheckingTextNode() is true, or nullptr if the traversal (bounded by
// aRoot) runs out of nodes.
static nsINode* FindNextTextNode(nsINode* aNode, int32_t aOffset,
                                 const nsINode* aRoot) {
  MOZ_ASSERT(aNode, "Null starting node?");
  MOZ_ASSERT(!IsSpellCheckingTextNode(aNode),
             "FindNextTextNode should start with a non-text node");

  nsINode* checkNode;
  // Need to start at the aOffset'th child
  nsIContent* child = aNode->GetChildAt_Deprecated(aOffset);

  if (child) {
    checkNode = child;
  } else {
    // aOffset was beyond the end of the child list.
    // goto next node after the last descendant of aNode in
    // a preorder DOM traversal.
    checkNode = aNode->GetNextNonChildNode(aRoot);
  }

  while (checkNode && !IsSpellCheckingTextNode(checkNode)) {
    checkNode = checkNode->GetNextNode(aRoot);
  }
  return checkNode;
}

// mozInlineSpellWordUtil::SetPositionAndEnd
//
//    We have two ranges "hard" and "soft". The hard boundary is simply
//    the scope of the root node. The soft boundary is that which is set
//    by the caller of this class by calling this function. If this function is
//    not called, the soft boundary is the same as the hard boundary.
//
//    When we reach the soft boundary (mSoftText.GetEnd()), we keep
//    going until we reach the end of a word. This allows the caller to set the
//    end of the range to anything, and we will always check whole multiples of
//    words. When we reach the hard boundary we stop no matter what.
//
//    There is no beginning soft boundary. This is because we only go to the
//    previous node once, when finding the previous word boundary in
//    SetPosition(). You might think of the soft boundary as being this initial
//    position.
//
//    Returns NS_ERROR_FAILURE when, in contenteditable/designMode, the two
//    nodes live in different subtrees; otherwise propagates any failure from
//    EnsureWords() and returns NS_OK on success.
nsresult mozInlineSpellWordUtil::SetPositionAndEnd(nsINode* aPositionNode,
                                                   int32_t aPositionOffset,
                                                   nsINode* aEndNode,
                                                   int32_t aEndOffset) {
  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: pos=(%p, %i), end=(%p, %i)", __FUNCTION__, aPositionNode,
           aPositionOffset, aEndNode, aEndOffset));

  MOZ_ASSERT(aPositionNode, "Null begin node?");
  MOZ_ASSERT(aEndNode, "Null end node?");

  MOZ_ASSERT(mRootNode, "Not initialized");

  // Find an appropriate root if we are dealing with contenteditable nodes
  // which are in the shadow DOM.
  if (mIsContentEditableOrDesignMode) {
    nsINode* rootNode = aPositionNode->SubtreeRoot();
    if (rootNode != aEndNode->SubtreeRoot()) {
      return NS_ERROR_FAILURE;
    }

    if (mozilla::dom::ShadowRoot::FromNode(rootNode)) {
      mRootNode = rootNode;
    }
  }

  mSoftText.Invalidate();

  if (!IsSpellCheckingTextNode(aPositionNode)) {
    // Start at the start of the first text node after aNode/aOffset.
    aPositionNode = FindNextTextNode(aPositionNode, aPositionOffset, mRootNode);
    aPositionOffset = 0;
  }
  NodeOffset softBegin = NodeOffset(aPositionNode, aPositionOffset);

  if (!IsSpellCheckingTextNode(aEndNode)) {
    // End at the start of the first text node after aEndNode/aEndOffset.
    aEndNode = FindNextTextNode(aEndNode, aEndOffset, mRootNode);
    aEndOffset = 0;
  }
  NodeOffset softEnd = NodeOffset(aEndNode, aEndOffset);

  nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
  if (NS_FAILED(rv)) {
    return rv;
  }

  // A begin position that does not map into the soft text is not an error;
  // mNextWordIndex is simply left as it was.
  int32_t textOffset = MapDOMPositionToSoftTextOffset(mSoftText.GetBegin());
  if (textOffset < 0) {
    return NS_OK;
  }

  mNextWordIndex = FindRealWordContaining(textOffset, HINT_END, true);
  return NS_OK;
}

// Makes sure mSoftText and mRealWords are populated for the given soft
// boundaries. Does nothing if the soft text is already valid. On failure of
// BuildRealWords() the error is returned and the soft text stays invalid
// (mRealWords has been cleared by then).
nsresult mozInlineSpellWordUtil::EnsureWords(NodeOffset aSoftBegin,
                                             NodeOffset aSoftEnd) {
  if (mSoftText.mIsValid) {
    return NS_OK;
  }

  mSoftText.AdjustBeginAndBuildText(std::move(aSoftBegin), std::move(aSoftEnd),
                                    mRootNode);

  mRealWords.Clear();
  Result<RealWords, nsresult> realWords = BuildRealWords();
  if (realWords.isErr()) {
    return realWords.unwrapErr();
  }

  mRealWords = realWords.unwrap();
  mSoftText.mIsValid = true;
  return NS_OK;
}

// Creates an nsRange covering aWord by mapping the word's soft-text begin and
// end offsets back to DOM positions. Returns the result of MakeRange().
nsresult mozInlineSpellWordUtil::MakeRangeForWord(const RealWord& aWord,
                                                  nsRange** aRange) const {
  NodeOffset begin =
      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
  NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
  return MakeRange(begin, end, aRange);
}

// Same mapping as MakeRangeForWord(), but stores the result in
// *aNodeOffsetRange as a NodeOffsetRange instead of creating an nsRange.
void mozInlineSpellWordUtil::MakeNodeOffsetRangeForWord(
    const RealWord& aWord, NodeOffsetRange* aNodeOffsetRange) {
  NodeOffset begin =
      MapSoftTextOffsetToDOMPosition(aWord.mSoftTextOffset, HINT_BEGIN);
  NodeOffset end = MapSoftTextOffsetToDOMPosition(aWord.EndOffset(), HINT_END);
  *aNodeOffsetRange = NodeOffsetRange(begin, end);
}

// mozInlineSpellWordUtil::GetRangeForWord
//
//    Produces in *aRange the range of the real word containing the DOM
//    position aWordNode/aWordOffset. If the position cannot be mapped into
//    the soft text, or no real word contains it, a collapsed range at that
//    position is produced instead.
nsresult mozInlineSpellWordUtil::GetRangeForWord(nsINode* aWordNode,
                                                 int32_t aWordOffset,
                                                 nsRange** aRange) {
  // Set our soft end and start
  NodeOffset pt(aWordNode, aWordOffset);

  // Rebuild the soft text unless it is valid and already collapsed at |pt|.
  if (!mSoftText.mIsValid || pt != mSoftText.GetBegin() ||
      pt != mSoftText.GetEnd()) {
    mSoftText.Invalidate();
    NodeOffset softBegin = pt;
    NodeOffset softEnd = pt;
    nsresult rv = EnsureWords(std::move(softBegin), std::move(softEnd));
    if (NS_FAILED(rv)) {
      return rv;
    }
  }

  int32_t offset = MapDOMPositionToSoftTextOffset(pt);
  if (offset < 0) {
    return MakeRange(pt, pt, aRange);
  }
  int32_t wordIndex = FindRealWordContaining(offset, HINT_BEGIN, false);
  if (wordIndex < 0) {
    return MakeRange(pt, pt, aRange);
  }
  return MakeRangeForWord(mRealWords[wordIndex], aRange);
}

// This is to fix characters that the spellchecker may not like.
//
// Writes into aOutput the aLen characters of aInput starting at aPos, with
// ignorable characters dropped and RIGHT SINGLE QUOTATION MARK replaced by an
// ASCII apostrophe. aOutput is truncated first.
static void NormalizeWord(const nsAString& aInput, int32_t aPos, int32_t aLen,
                          nsAString& aOutput) {
  aOutput.Truncate();
  for (int32_t i = 0; i < aLen; i++) {
    char16_t ch = aInput.CharAt(i + aPos);

    // remove ignorable characters from the word
    if (IsIgnorableCharacter(ch)) {
      continue;
    }

    // the spellchecker doesn't handle curly apostrophes in all languages
    if (ch == 0x2019) {  // RIGHT SINGLE QUOTATION MARK
      ch = '\'';
    }

    aOutput.Append(ch);
  }
}

// mozInlineSpellWordUtil::GetNextWord
//
//    FIXME-optimization: we shouldn't have to generate a range every single
//    time. It would be better if the inline spellchecker didn't require a
//    range unless the word was misspelled. This may or may not be possible.
//
//    Fills aWord with the word at mNextWordIndex (normalized text, node/offset
//    range, skip flag) and advances the index. Returns false, with
//    aWord.mSkipChecking set and mNextWordIndex reset to -1, when there are no
//    more words.
bool mozInlineSpellWordUtil::GetNextWord(Word& aWord) {
  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: mNextWordIndex=%d", __FUNCTION__, mNextWordIndex));

  if (mNextWordIndex < 0 || mNextWordIndex >= int32_t(mRealWords.Length())) {
    mNextWordIndex = -1;
    aWord.mSkipChecking = true;
    return false;
  }

  const RealWord& realWord = mRealWords[mNextWordIndex];
  MakeNodeOffsetRangeForWord(realWord, &aWord.mNodeOffsetRange);
  ++mNextWordIndex;
  aWord.mSkipChecking = !realWord.mCheckableWord;
  ::NormalizeWord(mSoftText.GetValue(), realWord.mSoftTextOffset,
                  realWord.mLength, aWord.mText);

  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: returning: %s (skip=%d)", __FUNCTION__,
           NS_ConvertUTF16toUTF8(aWord.mText).get(), aWord.mSkipChecking));

  return true;
}

// mozInlineSpellWordUtil::MakeRange
//
//    Convenience function for creating a range over the current document.
//
//    Bails out early (via NS_ENSURE_ARG_POINTER) when aBegin has no node, and
//    returns NS_ERROR_NOT_INITIALIZED when there is no document. On success
//    the new range is handed to the caller through *aRange.
nsresult mozInlineSpellWordUtil::MakeRange(NodeOffset aBegin, NodeOffset aEnd,
                                           nsRange** aRange) const {
  NS_ENSURE_ARG_POINTER(aBegin.mNode);
  if (!mDocument) {
    return NS_ERROR_NOT_INITIALIZED;
  }

  ErrorResult error;
  RefPtr<nsRange> range = nsRange::Create(aBegin.mNode, aBegin.mOffset,
                                          aEnd.mNode, aEnd.mOffset, error);
  if (NS_WARN_IF(error.Failed())) {
    return error.StealNSResult();
  }
  MOZ_ASSERT(range);
  range.forget(aRange);
  return NS_OK;
}

// static
//
// Creates an nsRange from a NodeOffsetRange. A creation failure is only
// reported through NS_WARNING_ASSERTION; whatever nsRange::Create() produced
// is returned as-is.
already_AddRefed<nsRange> mozInlineSpellWordUtil::MakeRange(
    const NodeOffsetRange& aRange) {
  IgnoredErrorResult ignoredError;
  RefPtr<nsRange> range =
      nsRange::Create(aRange.Begin().Node(), aRange.Begin().Offset(),
                      aRange.End().Node(), aRange.End().Offset(), ignoredError);
  NS_WARNING_ASSERTION(!ignoredError.Failed(), "Creating a range failed");
  return range.forget();
}

/*********** Word Splitting ************/

// Classification assigned to a character while splitting a DOM word.
enum CharClass {
  CHAR_CLASS_WORD = 0,         // part of a word
  CHAR_CLASS_SEPARATOR = 1,    // separates words
  CHAR_CLASS_END_OF_INPUT = 2  // no character: past the end of the text
};

// State for splitting one DOM word into real words. T is the string type
// holding the DOM word text (8-bit or 16-bit).
template <class T>
struct MOZ_STACK_CLASS WordSplitState {
  // The text being split; only referenced, never copied.
  const T& mDOMWordText;
  // Current position within mDOMWordText.
  int32_t mDOMWordOffset = 0;
  // Class of the character at mDOMWordOffset.
  CharClass mCurCharClass = CHAR_CLASS_END_OF_INPUT;

  explicit WordSplitState(const T& aString) : mDOMWordText(aString) {}

  // Classifies the character at aIndex. aRecurse says whether neighbouring
  // characters may be inspected to resolve conditional punctuation.
  CharClass ClassifyCharacter(int32_t aIndex, bool aRecurse) const;

  // Moves one character forward and refreshes mCurCharClass.
  void Advance();
  // Advances while the current character is a separator.
  void AdvanceThroughSeparators();
  // Advances while the current character is a word character.
  void AdvanceThroughWord();

  // Reports whether a special word like an email address or URL starts at
  // the current position. This allows arbitrary word breaking rules to be
  // used for these special entities, as long as they can not contain
  // whitespace.
  bool IsSpecialWord() const;

  // Similar to IsSpecialWord except that this takes a split word as
  // input. This checks for things that do not require special word-breaking
  // rules.
  bool ShouldSkipWord(int32_t aStart, int32_t aLength) const;

  // Finds the last sequence of DOM word separators before aBeforeOffset and
  // returns the offset to its first element.
  Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
      int32_t aBeforeOffset) const;

  // Returns the character at aIndex widened to char16_t.
  char16_t GetUnicharAt(int32_t aIndex) const;
};

// WordSplitState::ClassifyCharacter
//
//    Classifies the character at aIndex as CHAR_CLASS_WORD or
//    CHAR_CLASS_SEPARATOR. aIndex may equal the text length, which is
//    reported as a separator. When aRecurse is false, conditional punctuation
//    is not disambiguated by looking at its neighbours and counts as a
//    separator.
template <class T>
CharClass WordSplitState<T>::ClassifyCharacter(int32_t aIndex,
                                               bool aRecurse) const {
  MOZ_ASSERT(aIndex >= 0 && aIndex <= int32_t(mDOMWordText.Length()),
             "Index out of range");
  if (aIndex == int32_t(mDOMWordText.Length())) {
    return CHAR_CLASS_SEPARATOR;
  }

  // this will classify the character, we want to treat "ignorable" characters
  // such as soft hyphens, and also ZWJ and ZWNJ as word characters.
  nsUGenCategory charCategory =
      mozilla::unicode::GetGenCategory(GetUnicharAt(aIndex));
  if (charCategory == nsUGenCategory::kLetter ||
      IsIgnorableCharacter(mDOMWordText[aIndex]) ||
      mDOMWordText[aIndex] == 0x200C /* ZWNJ */ ||
      mDOMWordText[aIndex] == 0x200D /* ZWJ */) {
    return CHAR_CLASS_WORD;
  }

  // If conditional punctuation is surrounded immediately on both sides by word
  // characters it also counts as a word character.
  if (IsConditionalPunctuation(mDOMWordText[aIndex])) {
    if (!aRecurse) {
      // not allowed to look around, this punctuation counts like a separator
      return CHAR_CLASS_SEPARATOR;
    }

    // check the left-hand character
    if (aIndex == 0) {
      return CHAR_CLASS_SEPARATOR;
    }
    if (ClassifyCharacter(aIndex - 1, false) != CHAR_CLASS_WORD) {
      return CHAR_CLASS_SEPARATOR;
    }
    // If the previous character is a word-char, make sure that it's not a
    // special dot character.
    if (mDOMWordText[aIndex - 1] == '.') {
      return CHAR_CLASS_SEPARATOR;
    }

    // now we know left char is a word-char, check the right-hand character
    if (aIndex == int32_t(mDOMWordText.Length() - 1)) {
      return CHAR_CLASS_SEPARATOR;
    }

    if (ClassifyCharacter(aIndex + 1, false) != CHAR_CLASS_WORD) {
      return CHAR_CLASS_SEPARATOR;
    }
    // If the next character is a word-char, make sure that it's not a
    // special dot character.
    if (mDOMWordText[aIndex + 1] == '.') {
      return CHAR_CLASS_SEPARATOR;
    }

    // char on either side is a word, this counts as a word
    return CHAR_CLASS_WORD;
  }

  // The dot character, if appearing at the end of a word, should
  // be considered part of that word.  Example: "etc.", or
  // abbreviations.
  // The preceding character therefore has to be a word character (and not
  // itself a dot); a dot that follows a separator stays punctuation.
  if (aIndex > 0 && mDOMWordText[aIndex] == '.' &&
      mDOMWordText[aIndex - 1] != '.' &&
      ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
    return CHAR_CLASS_WORD;
  }

  // all other punctuation
  if (charCategory == nsUGenCategory::kSeparator ||
      charCategory == nsUGenCategory::kOther ||
      charCategory == nsUGenCategory::kPunctuation ||
      charCategory == nsUGenCategory::kSymbol) {
    // Don't break on hyphens, as hunspell handles them on its own.
    if (aIndex > 0 && mDOMWordText[aIndex] == '-' &&
        mDOMWordText[aIndex - 1] != '-' &&
        ClassifyCharacter(aIndex - 1, false) == CHAR_CLASS_WORD) {
      // A hyphen is only meaningful as a separator inside a word
      // if the previous and next characters are a word character.
      if (aIndex == int32_t(mDOMWordText.Length()) - 1) {
        return CHAR_CLASS_SEPARATOR;
      }
      if (mDOMWordText[aIndex + 1] != '.' &&
          ClassifyCharacter(aIndex + 1, false) == CHAR_CLASS_WORD) {
        return CHAR_CLASS_WORD;
      }
    }
    return CHAR_CLASS_SEPARATOR;
  }

  // any other character counts as a word
  return CHAR_CLASS_WORD;
}

// WordSplitState::Advance
//
//    Moves to the next character and updates mCurCharClass: END_OF_INPUT once
//    the offset reaches the end of the text, otherwise the (recursive)
//    classification of the new current character.
template <class T>
void WordSplitState<T>::Advance() {
  MOZ_ASSERT(mDOMWordOffset >= 0, "Negative word index");
  MOZ_ASSERT(mDOMWordOffset < (int32_t)mDOMWordText.Length(),
             "Length beyond end");

  mDOMWordOffset++;
  if (mDOMWordOffset >= (int32_t)mDOMWordText.Length()) {
    mCurCharClass = CHAR_CLASS_END_OF_INPUT;
  } else {
    mCurCharClass = ClassifyCharacter(mDOMWordOffset, true);
  }
}

// WordSplitState::AdvanceThroughSeparators
//
//    Advances until the current character is no longer a separator.
template <class T>
void WordSplitState<T>::AdvanceThroughSeparators() {
  while (mCurCharClass == CHAR_CLASS_SEPARATOR) {
    Advance();
  }
}

// WordSplitState::AdvanceThroughWord
//
//    Advances until the current character is no longer a word character.
template <class T>
void WordSplitState<T>::AdvanceThroughWord() {
  while (mCurCharClass == CHAR_CLASS_WORD) {
    Advance();
  }
}

// WordSplitState::IsSpecialWord
//
//    Returns true when the text from the current offset to the end looks like
//    an email address (an '@' with word characters on both sides) or a URL
//    (a first ':' directly followed by '/', or preceded by one of a few
//    well-known protocol names).
template <class T>
bool WordSplitState<T>::IsSpecialWord() const {
  // Search for email addresses. We simply define these as any sequence of
  // characters with an '@' character in the middle. The DOM word is already
  // split on whitespace, so we know that everything to the end is the address
  int32_t firstColon = -1;
  for (int32_t i = mDOMWordOffset; i < int32_t(mDOMWordText.Length()); i++) {
    if (mDOMWordText[i] == '@') {
      // only accept this if there are unambiguous word characters (don't bother
      // recursing to disambiguate apostrophes) on each side. This prevents
      // classifying, e.g. "@home" as an email address

      // Use this condition to only accept words with '@' in the middle of
      // them. It works, but the inline spellchecker doesn't like this. The
      // problem is that you type "fhsgfh@" that's a misspelled word followed
      // by a symbol, but when you type another letter "fhsgfh@g" that first
      // word needs to be unmarked misspelled. It doesn't do this. It only
      // checks the current position for potentially removing a spelling range.
      if (i > 0 && ClassifyCharacter(i - 1, false) == CHAR_CLASS_WORD &&
          i < (int32_t)mDOMWordText.Length() - 1 &&
          ClassifyCharacter(i + 1, false) == CHAR_CLASS_WORD) {
        return true;
      }
    } else if (mDOMWordText[i] == ':' && firstColon < 0) {
      firstColon = i;

      // If the first colon is followed by a slash, consider it a URL
      // This will catch things like asdf://foo.com
      if (firstColon < (int32_t)mDOMWordText.Length() - 1 &&
          mDOMWordText[firstColon + 1] == '/') {
        return true;
      }
    }
  }

  // Check the text before the first colon against some known protocols. It
  // is impossible to check against all protocols, especially since you can
  // plug in new protocols. We also don't want to waste time here checking
  // against a lot of obscure protocols.
  if (firstColon > mDOMWordOffset) {
    nsString protocol(
        Substring(mDOMWordText, mDOMWordOffset, firstColon - mDOMWordOffset));
    if (protocol.EqualsIgnoreCase("http") ||
        protocol.EqualsIgnoreCase("https") ||
        protocol.EqualsIgnoreCase("news") ||
        protocol.EqualsIgnoreCase("file") ||
        protocol.EqualsIgnoreCase("javascript") ||
        protocol.EqualsIgnoreCase("data") || protocol.EqualsIgnoreCase("ftp")) {
      return true;
    }
  }

  // not anything special
  return false;
}

// WordSplitState::ShouldSkipWord
//
//    Returns true when any character in [aStart, aStart + aLength) has the
//    Unicode general category "number".
template <class T>
bool WordSplitState<T>::ShouldSkipWord(int32_t aStart, int32_t aLength) const {
  int32_t last = aStart + aLength;

  // check to see if the word contains a digit
  for (int32_t i = aStart; i < last; i++) {
    if (mozilla::unicode::GetGenCategory(GetUnicharAt(i)) ==
        nsUGenCategory::kNumber) {
      return true;
    }
  }

  // not special
  return false;
}

// Scans backwards from aBeforeOffset for the last run of DOM word separators
// and returns the offset of the first character of that run, or Nothing() if
// no separator precedes aBeforeOffset. A character counts as a separator here
// if IsDOMWordSeparator() accepts it, or if it is not an ambiguous separator
// and ClassifyCharacter() reports CHAR_CLASS_SEPARATOR for it.
template <class T>
Maybe<int32_t> WordSplitState<T>::FindOffsetOfLastDOMWordSeparatorSequence(
    const int32_t aBeforeOffset) const {
  for (int32_t i = aBeforeOffset - 1; i >= 0; --i) {
    if (IsDOMWordSeparator(mDOMWordText[i]) ||
        (!IsAmbiguousDOMWordSeprator(mDOMWordText[i]) &&
         ClassifyCharacter(i, true) == CHAR_CLASS_SEPARATOR)) {
      // Be greedy, find as many separators as we can
      for (int32_t j = i - 1; j >= 0; --j) {
        if (IsDOMWordSeparator(mDOMWordText[j]) ||
            (!IsAmbiguousDOMWordSeprator(mDOMWordText[j]) &&
             ClassifyCharacter(j, true) == CHAR_CLASS_SEPARATOR)) {
          i = j;
        } else {
          break;
        }
      }
      return Some(i);
    }
  }
  return Nothing();
}

// 16-bit text: the stored character already is a char16_t.
template <>
char16_t WordSplitState<nsDependentSubstring>::GetUnicharAt(
    int32_t aIndex) const {
  return mDOMWordText[aIndex];
}

// 8-bit text: widen through uint8_t so that bytes >= 0x80 are zero-extended
// rather than sign-extended.
template <>
char16_t WordSplitState<nsDependentCSubstring>::GetUnicharAt(
    int32_t aIndex) const {
  return static_cast<char16_t>(static_cast<uint8_t>(mDOMWordText[aIndex]));
}

// Returns true when aNode is an HTML <br> element.
static inline bool IsBRElement(nsINode* aNode) {
  return aNode->IsHTMLElement(nsGkAtoms::br);
}

/**
 * Given a TextNode, finds the last sequence of DOM word separators before
 * aBeforeOffset and returns the offset to its first element.
 *
 * @param aContent the TextNode to check.
 * @param aBeforeOffset the offset in the TextNode before which we will search
 *        for the DOM separator. You can pass INT32_MAX to search the entire
 *        length of the string.
 * @return the offset of the first character of that separator sequence, or
 *         Nothing() if none was found.
 */
static Maybe<int32_t> FindOffsetOfLastDOMWordSeparatorSequence(
    nsIContent* aContent, int32_t aBeforeOffset) {
  const nsTextFragment* textFragment = aContent->GetText();
  MOZ_ASSERT(textFragment, "Where is our text?");
  // Clamp the search limit to the length of the text.
  int32_t end = std::min(aBeforeOffset, int32_t(textFragment->GetLength()));

  // Run the search on whichever representation (16-bit or 8-bit) the
  // fragment stores.
  if (textFragment->Is2b()) {
    nsDependentSubstring targetText(textFragment->Get2b(), end);
    WordSplitState<nsDependentSubstring> state(targetText);
    return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
  }

  nsDependentCSubstring targetText(textFragment->Get1b(), end);
  WordSplitState<nsDependentCSubstring> state(targetText);
  return state.FindOffsetOfLastDOMWordSeparatorSequence(end);
}

/**
 * Check if there's a DOM word separator before aBeforeOffset in this node.
 * Always returns true if it's a BR element.
 * aSeparatorOffset is set to the index of the first character in the last
 * separator if any is found (0 for BR elements).
 *
 * This function does not modify aSeparatorOffset when it returns false.
 * Nodes that are neither BR elements nor spell-checkable text nodes never
 * contain a separator.
 */
static bool ContainsDOMWordSeparator(nsINode* aNode, int32_t aBeforeOffset,
                                     int32_t* aSeparatorOffset) {
  if (IsBRElement(aNode)) {
    *aSeparatorOffset = 0;
    return true;
  }

  if (!IsSpellCheckingTextNode(aNode)) {
    return false;
  }

  const Maybe<int32_t> separatorOffset =
      FindOffsetOfLastDOMWordSeparatorSequence(aNode->AsContent(),
                                               aBeforeOffset);
  if (separatorOffset) {
    *aSeparatorOffset = *separatorOffset;
    return true;
  }

  return false;
}

// Returns true when aNode is an element that breaks words: a <br>, or an
// element with a primary frame whose display is not inline flow or is a list
// item. Non-elements and frameless elements are never break elements.
static bool IsBreakElement(nsINode* aNode) {
  if (!aNode->IsElement()) {
    return false;
  }

  dom::Element* element = aNode->AsElement();
  if (element->IsHTMLElement(nsGkAtoms::br)) {
    return true;
  }

  // If we don't have a frame, we don't consider ourselves a break
  // element.  In particular, words can span us.
  nsIFrame* frame = element->GetPrimaryFrame();
  if (!frame) {
    return false;
  }

  auto* disp = frame->StyleDisplay();
  // Anything that's not an inline element is a break element.
  // XXXbz should replaced inlines be break elements, though?
  // Also should inline-block and such be break elements?
  //
  // FIXME(emilio): We should teach the spell checker to deal with generated
  // content (it doesn't at all), then remove the IsListItem() check, as there
  // could be no marker, etc...
  return !disp->IsInlineFlow() || disp->IsListItem();
}

// Closure passed (as void*) to CheckLeavingBreakElement while walking the DOM
// with FindNextNode.
struct CheckLeavingBreakElementClosure {
  // Set to true once the traversal has left at least one break element.
  bool mLeftBreakElement;
};

// OnLeaveNodeFunPtr callback: records in the CheckLeavingBreakElementClosure
// pointed to by aClosure that a break element has been left. Once the flag is
// set, IsBreakElement() is no longer evaluated.
static void CheckLeavingBreakElement(nsINode* aNode, void* aClosure) {
  CheckLeavingBreakElementClosure* cl =
      static_cast<CheckLeavingBreakElementClosure*>(aClosure);
  if (!cl->mLeftBreakElement && IsBreakElement(aNode)) {
    cl->mLeftBreakElement = true;
  }
}

// Normalizes aWord in place using the file-local ::NormalizeWord helper over
// its whole length.
void mozInlineSpellWordUtil::NormalizeWord(nsAString& aWord) {
  nsAutoString result;
  ::NormalizeWord(aWord, 0, aWord.Length(), result);
  aWord = result;
}

// Stores the soft boundaries, moves the begin back to a word boundary, and
// rebuilds mValue (the soft text) together with mDOMMapping (which DOM text
// node range each piece of mValue came from).
//
// Phase 1 walks backwards from aBegin until it finds a node containing a DOM
// word separator, a break element, or a node outside aRootNode.
// Phase 2 walks forwards from there, appending text until the soft end has
// been seen and a separator or break element is reached.
void mozInlineSpellWordUtil::SoftText::AdjustBeginAndBuildText(
    NodeOffset aBegin, NodeOffset aEnd, const nsINode* aRootNode) {
  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug, ("%s", __FUNCTION__));

  mBegin = std::move(aBegin);
  mEnd = std::move(aEnd);

  // First we have to work backwards from mBegin to find a text node
  // containing a DOM word separator, a non-inline-element
  // boundary, or the hard start node. That's where we'll start building the
  // soft string from.
  nsINode* node = mBegin.mNode;
  int32_t firstOffsetInNode = 0;
  int32_t checkBeforeOffset = mBegin.mOffset;
  while (node) {
    if (ContainsDOMWordSeparator(node, checkBeforeOffset, &firstOffsetInNode)) {
      if (node == mBegin.mNode) {
        // If we find a word separator on the first node, look at the preceding
        // word on the text node as well.
        if (firstOffsetInNode > 0) {
          // Try to find the previous word boundary in the current node. If
          // we can't find one, start checking previous sibling nodes (if any
          // adjacent ones exist) to see if we can find any text nodes with
          // DOM word separators. We bail out as soon as we see a node that is
          // not a text node, or we run out of previous sibling nodes. In the
          // event that we simply cannot find any preceding word separator, the
          // offset is set to 0, and the soft text beginning node is set to the
          // "most previous" text node before the original starting node, or
          // kept at the original starting node if no previous text nodes exist.
          //
          // NOTE(review): only mBegin.mNode is moved to the previous sibling
          // here; |node|, which the forward pass below starts from, is not.
          // Confirm this is intended.
          int32_t newOffset = 0;
          if (!ContainsDOMWordSeparator(node, firstOffsetInNode - 1,
                                        &newOffset)) {
            nsIContent* prevNode = node->GetPreviousSibling();
            while (prevNode && IsSpellCheckingTextNode(prevNode)) {
              mBegin.mNode = prevNode;
              const Maybe<int32_t> separatorOffset =
                  FindOffsetOfLastDOMWordSeparatorSequence(prevNode, INT32_MAX);
              if (separatorOffset) {
                newOffset = *separatorOffset;
                break;
              }
              prevNode = prevNode->GetPreviousSibling();
            }
          }
          firstOffsetInNode = newOffset;
        } else {
          firstOffsetInNode = 0;
        }

        MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
                ("%s: adjusting mBegin.mOffset from %i to %i.", __FUNCTION__,
                 mBegin.mOffset, firstOffsetInNode));
        mBegin.mOffset = firstOffsetInNode;
      }
      break;
    }
    // Every node before the first one is searched over its full length.
    checkBeforeOffset = INT32_MAX;
    if (IsBreakElement(node)) {
      // Since GetPrevNode follows tree *preorder*, we're about to traverse up
      // out of 'node'. Since node induces breaks (e.g., it's a block), don't
      // bother trying to look outside it, just stop now.
      break;
    }
    // GetPrevNode below expects aRootNode to be an ancestor of node.
    if (!node->IsInclusiveDescendantOf(aRootNode)) {
      break;
    }
    node = node->GetPrevNode(aRootNode);
  }

  // Now build up the string moving forward through the DOM until we reach
  // the soft end and *then* see a DOM word separator, a non-inline-element
  // boundary, or the hard end node.
  mValue.Truncate();
  mDOMMapping.Clear();
  bool seenSoftEnd = false;
  while (node) {
    if (node == mEnd.mNode) {
      seenSoftEnd = true;
    }

    bool exit = false;
    if (IsSpellCheckingTextNode(node)) {
      nsIContent* content = static_cast<nsIContent*>(node);
      MOZ_ASSERT(content, "Where is our content?");
      const nsTextFragment* textFragment = content->GetText();
      MOZ_ASSERT(textFragment, "Where is our text?");
      uint32_t lastOffsetInNode = textFragment->GetLength();

      if (seenSoftEnd) {
        // check whether we can stop after this
        for (uint32_t i =
                 node == mEnd.mNode ? AssertedCast<uint32_t>(mEnd.mOffset) : 0;
             i < textFragment->GetLength(); ++i) {
          if (IsDOMWordSeparator(textFragment->CharAt(i))) {
            exit = true;
            // stop at the first separator after the soft end point
            lastOffsetInNode = i;
            break;
          }
        }
      }

      if (firstOffsetInNode >= 0 &&
          static_cast<uint32_t>(firstOffsetInNode) < lastOffsetInNode) {
        const uint32_t len = lastOffsetInNode - firstOffsetInNode;
        // Record the mapping before appending so that it captures the soft
        // text offset at which this node's contribution starts.
        mDOMMapping.AppendElement(DOMTextMapping(
            NodeOffset(node, firstOffsetInNode), mValue.Length(), len));

        const bool ok = textFragment->AppendTo(
            mValue, static_cast<uint32_t>(firstOffsetInNode), len,
            mozilla::fallible);
        if (!ok) {
          // probably out of memory, remove from mDOMMapping
          mDOMMapping.RemoveLastElement();
          exit = true;
        }
      }

      // Only the first text node starts at a non-zero offset.
      firstOffsetInNode = 0;
    }

    if (exit) {
      break;
    }

    CheckLeavingBreakElementClosure closure = {false};
    node = FindNextNode(node, aRootNode, CheckLeavingBreakElement, &closure);
    if (closure.mLeftBreakElement || (node && IsBreakElement(node))) {
      // We left, or are entering, a break element (e.g., block). Maybe we can
      // stop now.
      if (seenSoftEnd) {
        break;
      }
      // Record the break
      mValue.Append(' ');
    }
  }

  MOZ_LOG(sInlineSpellWordUtilLog, LogLevel::Debug,
          ("%s: got DOM string: %s", __FUNCTION__,
           NS_ConvertUTF16toUTF8(mValue).get()));
}

// Tokenizes the soft text into "real words".
//
// The soft text is scanned once. Every maximal run of characters for which
// IsDOMWordSeparator() is false forms a "DOM word"; each such run is handed to
// SplitDOMWordAndAppendTo(), which appends the real words it finds to the
// result array.
//
// Returns the collected words, or Err(rv) carrying the first failure code
// reported by SplitDOMWordAndAppendTo().
auto mozInlineSpellWordUtil::BuildRealWords() const
    -> Result<RealWords, nsresult> {
  const auto& softText = mSoftText.GetValue();
  const int32_t softTextLength = static_cast<int32_t>(softText.Length());

  RealWords words;
  // Start of the DOM word currently being scanned, or -1 while we are between
  // DOM words.
  int32_t pendingStart = -1;

  for (int32_t pos = 0; pos < softTextLength; ++pos) {
    if (!IsDOMWordSeparator(softText.CharAt(pos))) {
      // Inside a DOM word; remember where it began if this is its first
      // character.
      if (pendingStart < 0) {
        pendingStart = pos;
      }
      continue;
    }

    if (pendingStart < 0) {
      // Separator following another separator (or at the very start).
      continue;
    }

    // A separator terminates the DOM word [pendingStart, pos).
    const nsresult rv = SplitDOMWordAndAppendTo(pendingStart, pos, words);
    if (NS_FAILED(rv)) {
      return Err(rv);
    }
    pendingStart = -1;
  }

  // The text may end in the middle of a DOM word; flush that last one too.
  if (pendingStart >= 0) {
    const nsresult rv =
        SplitDOMWordAndAppendTo(pendingStart, softTextLength, words);
    if (NS_FAILED(rv)) {
      return Err(rv);
    }
  }

  return words;
}

/*********** DOM/realwords<->mSoftText.GetValue() mapping functions

 * ************/

int32_t mozInlineSpellWordUtil::MapDOMPositionToSoftTextOffset(

    const NodeOffset& aNodeOffset) const {

  if (!mSoftText.mIsValid) {

    NS_ERROR("Soft text must be valid if we're to map into it");

    return -1;

  for (int32_t i = 0; i < int32_t(mSoftText.GetDOMMapping().Length()); ++i) {

    const DOMTextMapping& map = mSoftText.GetDOMMapping()[i];

    if (map.mNodeOffset.mNode == aNodeOffset.mNode) {

      // Allow offsets at either end of the string, in particular, allow the

      // offset that's at the end of the contributed string

      int32_t offsetInContributedString =

          aNodeOffset.mOffset - map.mNodeOffset.mOffset;

      if (offsetInContributedString >= 0 &&

          offsetInContributedString <= map.mLength)

        return map.mSoftTextOffset + offsetInContributedString;

      return -1;

  return -1;

namespace {

template <class T>

class FirstLargerOffset {

  int32_t mSoftTextOffset;

 public:

  explicit FirstLargerOffset(int32_t aSoftTextOffset)

      : mSoftTextOffset(aSoftTextOffset) {}

  int operator()(const T& t) const {

    // We want the first larger offset, so never return 0 (which would

    // short-circuit evaluation before finding the last such offset).

    return mSoftTextOffset < t.mSoftTextOffset ? -1 : 1;

};

template <class T>

bool FindLastNongreaterOffset(const nsTArray<T>& aContainer,

                              int32_t aSoftTextOffset, size_t* aIndex) {

  if (aContainer.Length() == 0) {

    return false;

  BinarySearchIf(aContainer, 0, aContainer.Length(),

                 FirstLargerOffset<T>(aSoftTextOffset), aIndex);

  if (*aIndex > 0) {

    // There was at least one mapping with offset <= aSoftTextOffset. Step back

    // to find the last element with |mSoftTextOffset <= aSoftTextOffset|.

    *aIndex -= 1;

  } else {

    // Every mapping had offset greater than aSoftTextOffset.

    MOZ_ASSERT(aContainer[*aIndex].mSoftTextOffset > aSoftTextOffset);

  return true;

}  // namespace

// Translates an offset in the soft text back into a DOM position.
//
// aHint resolves the ambiguity at a boundary between two mappings: with
// HINT_END the end of the preceding mapping is preferred when
// aSoftTextOffset coincides with it.
//
// Returns NodeOffset(nullptr, -1) when the soft text is invalid, when there
// are no mappings, or when the offset is not covered by the selected mapping.
NodeOffset mozInlineSpellWordUtil::MapSoftTextOffsetToDOMPosition(
    int32_t aSoftTextOffset, DOMMapHint aHint) const {
  MOZ_ASSERT(mSoftText.mIsValid,
             "Soft text must be valid if we're to map out of it");
  if (!mSoftText.mIsValid) {
    return NodeOffset(nullptr, -1);
  }

  const auto& mappings = mSoftText.GetDOMMapping();

  // Locate the last mapping, if any, with mSoftTextOffset <= aSoftTextOffset.
  size_t candidate = 0;
  if (!FindLastNongreaterOffset(mappings, aSoftTextOffset, &candidate)) {
    return NodeOffset(nullptr, -1);
  }

  // With HINT_END, a position sitting exactly at the end of the previous
  // mapping is reported there instead of at the start of this mapping.
  if (aHint == HINT_END && candidate > 0) {
    const DOMTextMapping& previous = mappings[candidate - 1];
    if (previous.mSoftTextOffset + previous.mLength == aSoftTextOffset) {
      return NodeOffset(previous.mNodeOffset.mNode,
                        previous.mNodeOffset.mOffset + previous.mLength);
    }
  }

  // The end of this mapping may be returned even for HINT_BEGIN. That only
  // happens when no mapping starts at this point; the original author was not
  // fully sure this is OK.
  const DOMTextMapping& current = mappings[candidate];
  const int32_t delta = aSoftTextOffset - current.mSoftTextOffset;
  if (delta < 0 || delta > current.mLength) {
    return NodeOffset(nullptr, -1);
  }
  return NodeOffset(current.mNodeOffset.mNode,
                    current.mNodeOffset.mOffset + delta);
}

// static
// Writes a short name for aHint into aResult: "begin" for HINT_BEGIN and
// "end" for HINT_END. aResult is left unmodified for any other value.
void mozInlineSpellWordUtil::ToString(const DOMMapHint aHint,
                                      nsACString& aResult) {
  switch (aHint) {
    case HINT_BEGIN: {
      aResult.AssignLiteral("begin");
      return;
    }
    case HINT_END: {
      aResult.AssignLiteral("end");
      return;
    }
  }
}

int32_t mozInlineSpellWordUtil::FindRealWordContaining(

    int32_t aSoftTextOffset, DOMMapHint aHint, bool aSearchForward) const {

  if (MOZ_LOG_TEST(sInlineSpellWordUtilLog, LogLevel::Debug)) {

    nsAutoCString hint;

    mozInlineSpellWordUtil::ToString(aHint, hint);

    MOZ_LOG(

        sInlineSpellWordUtilLog, LogLevel::Debug,

        ("%s: offset=%i, hint=%s, searchForward=%i.", __FUNCTION__,

         aSoftTextOffset, hint.get(), static_cast<int32_t>(aSearchForward)));

  MOZ_ASSERT(mSoftText.mIsValid,

             "Soft text must be valid if we're to map out of it");

  if (!mSoftText.mIsValid) return -1;

  // Find the last word, if any, such that mRealWords[index].mSoftTextOffset

  // <= aSoftTextOffset

  size_t index;

  bool found = FindLastNongreaterOffset(mRealWords, aSoftTextOffset, &index);

  if (!found) {

    return -1;

  // 'index' is now the last word, if any, such that

  // mSoftTextOffset <= aSoftTextOffset.

  // If we're doing HINT_END, then we may want to return the end of the

  // the previous word instead of the start of this word

  if (aHint == HINT_END && index > 0) {

    const RealWord& word = mRealWords[index - 1];

    if (word.EndOffset() == aSoftTextOffset) {

      return index - 1;

  // We allow ourselves to return the end of this word even if we're

  // doing HINT_BEGIN. This will only happen if there is no word which this

  // point is the start of. I'm not 100% sure this is OK...

  const RealWord& word = mRealWords[index];

  int32_t offset = aSoftTextOffset - word.mSoftTextOffset;

  if (offset >= 0 && offset <= static_cast<int32_t>(word.mLength)) return index;

  if (aSearchForward) {

    if (mRealWords[0].mSoftTextOffset > aSoftTextOffset) {

      // All words have mSoftTextOffset > aSoftTextOffset

      return 0;

    // 'index' is the last word such that mSoftTextOffset <= aSoftTextOffset.

    // Word index+1, if it exists, will be the first with

    // mSoftTextOffset > aSoftTextOffset.

    if (index + 1 < mRealWords.Length()) return index + 1;

  return -1;

// mozInlineSpellWordUtil::SplitDOMWordAndAppendTo
//
//    Splits the DOM word occupying [aStart, aEnd) of the soft text into real
//    words and appends them to aRealWords. Offsets stored in the appended
//    RealWord entries are relative to the whole soft text.
//
//    If, after leading separators, the remainder is recognised by
//    IsSpecialWord(), it is appended as a single word with its third
//    constructor argument set to false. Otherwise every word found by
//    AdvanceThroughWord() is appended with that argument set to the negation
//    of ShouldSkipWord().
//
//    Returns NS_ERROR_OUT_OF_MEMORY when a fallible append fails, NS_OK
//    otherwise.
nsresult mozInlineSpellWordUtil::SplitDOMWordAndAppendTo(
    int32_t aStart, int32_t aEnd, nsTArray<RealWord>& aRealWords) const {
  nsDependentSubstring targetText(mSoftText.GetValue(), aStart, aEnd - aStart);
  WordSplitState<nsDependentSubstring> state(targetText);
  state.mCurCharClass = state.ClassifyCharacter(0, true);

  state.AdvanceThroughSeparators();

  const bool hasInput = state.mCurCharClass != CHAR_CLASS_END_OF_INPUT;
  if (hasInput && state.IsSpecialWord()) {
    // Everything from the current position to the end of the DOM word forms
    // one special word.
    int32_t specialWordLength =
        state.mDOMWordText.Length() - state.mDOMWordOffset;
    if (aRealWords.AppendElement(
            RealWord(aStart + state.mDOMWordOffset, specialWordLength, false),
            fallible)) {
      return NS_OK;
    }
    return NS_ERROR_OUT_OF_MEMORY;
  }

  while (state.mCurCharClass != CHAR_CLASS_END_OF_INPUT) {
    state.AdvanceThroughSeparators();
    if (state.mCurCharClass == CHAR_CLASS_END_OF_INPUT) {
      break;
    }

    // Remember where the word begins, then advance to its end.
    const int32_t wordBegin = state.mDOMWordOffset;
    state.AdvanceThroughWord();
    const int32_t wordLength = state.mDOMWordOffset - wordBegin;

    const bool notSkipped = !state.ShouldSkipWord(wordBegin, wordLength);
    if (!aRealWords.AppendElement(
            RealWord(aStart + wordBegin, wordLength, notSkipped), fallible)) {
      return NS_ERROR_OUT_OF_MEMORY;
    }
  }

  return NS_OK;
}