# tokenizer.py - mozsearch

# -*- coding: utf-8 -*-

# Copyright JS Foundation and other contributors, https://js.foundation/

# Redistribution and use in source and binary forms, with or without

# modification, are permitted provided that the following conditions are met:

#   * Redistributions of source code must retain the above copyright

#     notice, this list of conditions and the following disclaimer.

#   * Redistributions in binary form must reproduce the above copyright

#     notice, this list of conditions and the following disclaimer in the

#     documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

# ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY

# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF

# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import absolute_import, unicode_literals

from collections import deque

from .objects import Object

from .error_handler import ErrorHandler

from .scanner import Scanner, SourceLocation, Position, RegExp

from .token import Token, TokenName

class BufferEntry(Object):
    """One item queued in the tokenizer's output buffer.

    ``type`` and ``value`` are always supplied by the caller; ``regex``,
    ``range`` and ``loc`` are optional and stay ``None`` until filled in.
    The parameter names shadow builtins on purpose: they are part of the
    keyword interface used by callers.
    """

    def __init__(self, type, value, regex=None, range=None, loc=None):
        # Required fields.
        self.type, self.value = type, value
        # Optional extras, all defaulting to None.
        self.regex, self.range, self.loc = regex, range, loc

class Reader(object):
    """Remember recent tokens to decide whether ``/`` starts a regex.

    ``values`` receives one entry per appended token: the token's value for
    punctuators and keywords, ``None`` for every other token type.
    ``curly`` and ``paren`` hold the index in ``values`` of the most recently
    appended ``{`` and ``(`` respectively (``-1`` until one has been seen).
    """

    # Tokens after which a following ``function`` is an expression.
    _BEFORE_FUNCTION_EXPRESSION = frozenset((
        '(', '{', '[', 'in', 'typeof', 'instanceof', 'new',
        'return', 'case', 'delete', 'throw', 'void',
        # assignment operators
        '=', '+=', '-=', '*=', '**=', '/=', '%=', '<<=', '>>=', '>>>=',
        '&=', '|=', '^=', ',',
        # binary/unary operators
        '+', '-', '*', '**', '/', '%', '++', '--', '<<', '>>', '>>>', '&',
        '|', '^', '!', '~', '&&', '||', '?', ':', '===', '==', '>=',
        '<=', '<', '>', '!=', '!==',
    ))

    def __init__(self):
        self.values = []
        self.curly = self.paren = -1

    def _valueAt(self, index):
        """Return ``self.values[index]``, or ``None`` when out of range.

        Negative indexes are treated as out of range instead of wrapping
        around to the end of the list, so a look-behind that reaches before
        the first recorded token reads as "no token" rather than picking up
        an unrelated later token or raising ``IndexError``.
        """
        if 0 <= index < len(self.values):
            return self.values[index]
        return None

    # A function following one of those tokens is an expression.
    def beforeFunctionExpression(self, t):
        """Return True if ``t`` is a token after which ``function`` is an expression."""
        return t in self._BEFORE_FUNCTION_EXPRESSION

    # Determine if forward slash (/) is an operator or part of a regular expression
    # https://github.com/mozilla/sweet.js/wiki/design
    def isRegexStart(self):
        """Return True if a ``/`` at the current point may begin a regex literal.

        The decision is based only on the tokens recorded through
        ``append``; with no tokens recorded the answer is True.
        """
        if not self.values:
            return True

        previous = self.values[-1]

        if previous in ('this', ']'):
            return False

        if previous == ')':
            # Look at the token just before the matching-looking '('.
            keyword = self._valueAt(self.paren - 1)
            return keyword in ('if', 'while', 'for', 'with')

        if previous == '}':
            # Dividing a function by anything makes little sense,
            # but we have to check for that.
            if self._valueAt(self.curly - 3) == 'function':
                # Anonymous function, e.g. function(){} /42
                check = self._valueAt(self.curly - 4)
                return not self.beforeFunctionExpression(check) if check else False
            if self._valueAt(self.curly - 4) == 'function':
                # Named function, e.g. function f(){} /42/
                check = self._valueAt(self.curly - 5)
                return not self.beforeFunctionExpression(check) if check else True
            return True

        # Any other punctuator/keyword allows a regex; a non-punctuator,
        # non-keyword token (recorded as None) does not.
        return previous is not None

    def append(self, token):
        """Record ``token``, updating the last-seen ``{`` / ``(`` positions."""
        if token.type in (Token.Punctuator, Token.Keyword):
            if token.value == '{':
                self.curly = len(self.values)
            elif token.value == '(':
                self.paren = len(self.values)
            self.values.append(token.value)
        else:
            self.values.append(None)

class Config(Object):
    """Holder for tokenizer options.

    The four named options are stored under their own names. Any other
    keyword argument is copied onto the instance as an attribute with the
    same name.
    """

    def __init__(self, tolerant=None, comment=None, range=None, loc=None, **options):
        self.tolerant, self.comment = tolerant, comment
        self.range, self.loc = range, loc
        # Expose unrecognised options as plain attributes.
        for name in options:
            setattr(self, name, options[name])

class Tokenizer(object):
    """Hand out tokens (and, optionally, comments) one at a time.

    Parameters:
        code: source text handed to ``Scanner``.
        options: mapping expanded into ``Config`` as keyword arguments; the
            ``tolerant``, ``comment``, ``range`` and ``loc`` entries are
            read here.
    """

    def __init__(self, code, options):
        self.config = Config(**options)

        self.errorHandler = ErrorHandler()
        self.errorHandler.tolerant = self.config.tolerant

        self.scanner = Scanner(code, self.errorHandler)
        self.scanner.trackComment = self.config.comment

        self.trackRange = self.config.range
        self.trackLoc = self.config.loc
        # Entries produced but not yet returned by getNextToken().
        self.buffer = deque()
        self.reader = Reader()

    def errors(self):
        """Return the error list held by the error handler."""
        return self.errorHandler.errors

    def getNextToken(self):
        """Return the next buffered entry, or ``None`` when nothing is left.

        When the buffer is empty, comments are scanned first (and queued as
        ``BlockComment`` / ``LineComment`` entries if comment tracking is
        on), then at most one token is scanned and queued. ``range`` and
        ``loc`` are attached to entries only when the matching option is
        enabled.
        """
        if not self.buffer:
            comments = self.scanner.scanComments()
            if self.scanner.trackComment:
                for e in comments:
                    value = self.scanner.source[e.slice[0]:e.slice[1]]
                    comment = BufferEntry(
                        type='BlockComment' if e.multiLine else 'LineComment',
                        value=value
                    )
                    if self.trackRange:
                        comment.range = e.range
                    if self.trackLoc:
                        comment.loc = e.loc
                    self.buffer.append(comment)

            if not self.scanner.eof():
                if self.trackLoc:
                    # Capture the start now; the end is filled in once the
                    # token has been scanned.
                    loc = SourceLocation(
                        start=Position(
                            line=self.scanner.lineNumber,
                            column=self.scanner.index - self.scanner.lineStart
                        ),
                        end=Position(),
                    )

                maybeRegex = self.scanner.source[self.scanner.index] == '/' and self.reader.isRegexStart()
                if maybeRegex:
                    state = self.scanner.saveState()
                    try:
                        token = self.scanner.scanRegExp()
                    except Exception:
                        # Deliberate best effort: if the text does not scan
                        # as a regex, rewind and lex it as ordinary tokens.
                        self.scanner.restoreState(state)
                        token = self.scanner.lex()
                else:
                    token = self.scanner.lex()

                self.reader.append(token)

                entry = BufferEntry(
                    type=TokenName[token.type],
                    value=self.scanner.source[token.start:token.end]
                )
                if self.trackRange:
                    entry.range = [token.start, token.end]
                if self.trackLoc:
                    loc.end = Position(
                        line=self.scanner.lineNumber,
                        column=self.scanner.index - self.scanner.lineStart
                    )
                    entry.loc = loc
                if token.type is Token.RegularExpression:
                    entry.regex = RegExp(
                        pattern=token.pattern,
                        flags=token.flags,
                    )

                self.buffer.append(entry)

        return self.buffer.popleft() if self.buffer else None