# Regular expressions and character sets used by the lexer.
RE_WHITESPACE = re.compile(r"[ \n\r\t]+")
RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
RE_INDEX = re.compile(r"-?[0-9]+")
# An integer literal, optionally with a non-negative exponent (`1e3` is an int).
RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
# RE_FLOAT includes numbers with a negative exponent and no decimal point.
# NOTE: fixed `(:?` -> `(?:`; the typo made the leading group capture an
# optional literal colon, so strings like ":1.5" wrongly matched as floats.
RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")
RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
# Characters that may legally follow a backslash inside a quoted string
# literal (the quote character itself is handled separately by the lexer).
ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])
3025
3126
3227class Lexer :
@@ -77,13 +72,13 @@ def emit(self, t: TokenType) -> None:
7772
7873 def next (self ) -> str :
7974 """Return the next character, or the empty string if no more characters."""
80- if self .pos >= len (self .query ):
75+ try :
76+ c = self .query [self .pos ]
77+ self .pos += 1
78+ return c
79+ except IndexError :
8180 return ""
8281
83- c = self .query [self .pos ]
84- self .pos += 1
85- return c
86-
8782 def ignore (self ) -> None :
8883 """Ignore characters up to the pointer."""
8984 self .start = self .pos
@@ -100,18 +95,16 @@ def backup(self) -> None:
10095
10196 def peek (self ) -> str :
10297 """Return the next character without advancing the pointer."""
103- c = self . next ()
104- if c :
105- self . backup ()
106- return c
107-
108- def accept (self , pattern : Pattern [ str ] ) -> bool :
109- """Increment the pointer if the current character matches _pattern_ ."""
110- c = self .next ()
111- if pattern . match ( c ):
98+ try :
99+ return self . query [ self . pos ]
100+ except IndexError :
101+ return ""
102+
103+ def accept (self , s : str ) -> bool :
104+ """Increment the pointer if the current position starts with _s_ ."""
105+ if self .query . startswith ( s , self . pos ):
106+ self . pos += len ( s )
112107 return True
113- if c :
114- self .backup ()
115108 return False
116109
117110 def accept_match (self , pattern : Pattern [str ]) -> bool :
@@ -140,7 +133,16 @@ def ignore_whitespace(self) -> bool:
140133
141134 def error (self , msg : str ) -> None :
142135 """Emit an error token."""
143- self .tokens .append (Token (TokenType .ERROR , msg , self .pos , self .query ))
136+ # better error messages.
137+ self .tokens .append (
138+ Token (
139+ TokenType .ERROR ,
140+ self .query [self .start : self .pos ],
141+ self .start ,
142+ self .query ,
143+ msg ,
144+ )
145+ )
144146
145147
# A state function lexes one construct and returns the next state function,
# or None to stop the lexer.
StateFn = Callable[[Lexer], Optional["StateFn"]]
@@ -150,7 +152,6 @@ def lex_root(l: Lexer) -> Optional[StateFn]: # noqa: D103
150152 c = l .next ()
151153
152154 if c != "$" :
153- l .backup ()
154155 l .error (f"expected '$', found { c !r} " )
155156 return None
156157
@@ -180,9 +181,8 @@ def lex_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0911
180181 l .emit (TokenType .LBRACKET )
181182 return lex_inside_bracketed_segment
182183
183- # default
184- l .backup ()
185184 if l .filter_depth :
185+ l .backup ()
186186 return lex_inside_filter
187187
188188 l .error (f"expected '.', '..' or a bracketed selection, found { c !r} " )
@@ -204,21 +204,21 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103
204204 l .emit (TokenType .LBRACKET )
205205 return lex_inside_bracketed_segment
206206
207- # default
208207 l .backup ()
209208
210209 if l .accept_match (RE_PROPERTY ):
211210 l .emit (TokenType .PROPERTY )
212211 return lex_segment
213212
213+ l .next ()
214214 l .error (f"unexpected descendant selection token { c !r} " )
215215 return None
216216
217217
218218def lex_shorthand_selector (l : Lexer ) -> Optional [StateFn ]: # noqa: D103
219219 l .ignore () # ignore dot
220220
221- if l .ignore_whitespace ( ):
221+ if l .accept_match ( RE_WHITESPACE ):
222222 l .error ("unexpected whitespace after dot" )
223223 return None
224224
@@ -318,11 +318,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
318318 return lex_inside_bracketed_segment
319319
320320 if c == "'" :
321- # String literal
322321 return lex_single_quoted_string_inside_filter_expression
323322
324323 if c == '"' :
325- # String literal
326324 return lex_double_quoted_string_inside_filter_expression
327325
328326 if c == "(" :
@@ -388,61 +386,31 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
388386 l .emit (TokenType .GT )
389387 continue
390388
391- # default
392389 l .backup ()
393390
394- # numbers
395- if l .accept_match (RE_INT ):
396- if l .peek () == "." :
397- # A float
398- l .next ()
399- if not l .accept_match (RE_INT ):
400- l .error ("a fractional digit is required after a decimal point" )
401- return None
402-
403- l .accept_match (RE_EXPONENT )
404- l .emit (TokenType .FLOAT )
405- continue
406-
407- # An int, or float if exponent is negative
408- if l .accept_match (RE_NEGATIVE_EXPONENT ):
409- l .emit (TokenType .FLOAT )
410- else :
411- l .accept_match (RE_EXPONENT )
412- l .emit (TokenType .INT )
413- continue
414-
415- if l .accept_match (RE_AND ):
391+ if l .accept ("&&" ):
416392 l .emit (TokenType .AND )
417- continue
418-
419- if l .accept_match (RE_OR ):
393+ elif l .accept ("||" ):
420394 l .emit (TokenType .OR )
421- continue
422-
423- if l .accept_match (RE_TRUE ):
395+ elif l .accept ("true" ):
424396 l .emit (TokenType .TRUE )
425- continue
426-
427- if l .accept_match (RE_FALSE ):
397+ elif l .accept ("false" ):
428398 l .emit (TokenType .FALSE )
429- continue
430-
431- if l .accept_match (RE_NULL ):
399+ elif l .accept ("null" ):
432400 l .emit (TokenType .NULL )
433- continue
434-
435- # functions
436- if l .accept_match (RE_FUNCTION_NAME ) and l .peek () == "(" :
401+ elif l .accept_match (RE_FLOAT ):
402+ l .emit (TokenType .FLOAT )
403+ elif l .accept_match (RE_INT ):
404+ l .emit (TokenType .INT )
405+ elif l .accept_match (RE_FUNCTION_NAME ) and l .peek () == "(" :
437406 # Keep track of parentheses for this function call.
438407 l .paren_stack .append (1 )
439408 l .emit (TokenType .FUNCTION )
440409 l .next ()
441410 l .ignore () # ignore LPAREN
442- continue
443-
444- l .error (f"unexpected filter selector token { c !r} " )
445- return None
411+ else :
412+ l .error (f"unexpected filter selector token { c !r} " )
413+ return None
446414
447415
448416def lex_string_factory (quote : str , state : StateFn ) -> StateFn :
@@ -467,16 +435,15 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
467435 return state
468436
469437 while True :
470- head = l .query [l .pos : l .pos + 2 ]
471438 c = l .next ()
472439
473- if head in ( "\\ \\ " , f" \\ { quote } " ) :
474- l . next ()
475- continue
476-
477- if c == " \\ " and not RE_ESCAPE . match ( head ) :
478- l .error ("invalid escape" )
479- return None
440+ if c == "\\ " :
441+ peeked = l . peek ()
442+ if peeked in ESCAPES or peeked == quote :
443+ l . next ()
444+ else :
445+ l .error ("invalid escape" )
446+ return None
480447
481448 if not c :
482449 l .error (f"unclosed string starting at index { l .start } " )
@@ -522,6 +489,6 @@ def tokenize(query: str) -> List[Token]:
522489 lexer .run ()
523490
524491 if tokens and tokens [- 1 ].type_ == TokenType .ERROR :
525- raise JSONPathSyntaxError (tokens [- 1 ].value , token = tokens [- 1 ])
492+ raise JSONPathSyntaxError (tokens [- 1 ].message , token = tokens [- 1 ])
526493
527494 return tokens
0 commit comments