# Regular expressions and character sets used by the lexer.
RE_WHITESPACE = re.compile(r"[ \n\r\t]+")
RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
RE_INDEX = re.compile(r"-?[0-9]+")
# An integer literal, optionally with a non-negative exponent (`1e3` is an int).
RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
# RE_FLOAT includes numbers with a negative exponent and no decimal point.
# NOTE: fixed `(:?` -> `(?:`; the typo made the leading group capture an
# optional literal colon, so strings like ":1.5" wrongly matched as floats.
RE_FLOAT = re.compile(r"(?:-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")
RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
# Characters that may legally follow a backslash inside a quoted string
# literal (the quote character itself is handled separately by the lexer).
ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])
3025
3126
3227class Lexer :
@@ -77,13 +72,13 @@ def emit(self, t: TokenType) -> None:
7772
7873 def next (self ) -> str :
7974 """Return the next character, or the empty string if no more characters."""
80- if self .pos >= len (self .query ):
75+ try :
76+ c = self .query [self .pos ]
77+ self .pos += 1
78+ return c
79+ except IndexError :
8180 return ""
8281
83- c = self .query [self .pos ]
84- self .pos += 1
85- return c
86-
8782 def ignore (self ) -> None :
8883 """Ignore characters up to the pointer."""
8984 self .start = self .pos
@@ -100,18 +95,16 @@ def backup(self) -> None:
10095
10196 def peek (self ) -> str :
10297 """Return the next character without advancing the pointer."""
103- c = self . next ()
104- if c :
105- self . backup ()
106- return c
107-
108- def accept (self , pattern : Pattern [ str ] ) -> bool :
109- """Increment the pointer if the current character matches _pattern_ ."""
110- c = self .next ()
111- if pattern . match ( c ):
98+ try :
99+ return self . query [ self . pos ]
100+ except IndexError :
101+ return ""
102+
103+ def accept (self , s : str ) -> bool :
104+ """Increment the pointer if the current position starts with _s_ ."""
105+ if self .query . startswith ( s , self . pos ):
106+ self . pos += len ( s )
112107 return True
113- if c :
114- self .backup ()
115108 return False
116109
117110 def accept_match (self , pattern : Pattern [str ]) -> bool :
@@ -140,7 +133,16 @@ def ignore_whitespace(self) -> bool:
140133
141134 def error (self , msg : str ) -> None :
142135 """Emit an error token."""
143- self .tokens .append (Token (TokenType .ERROR , msg , self .pos , self .query ))
136+ # better error messages.
137+ self .tokens .append (
138+ Token (
139+ TokenType .ERROR ,
140+ self .query [self .start : self .pos ],
141+ self .start ,
142+ self .query ,
143+ msg ,
144+ )
145+ )
144146
145147
# A state function lexes one construct and returns the next state function,
# or None to stop the lexer.
StateFn = Callable[[Lexer], Optional["StateFn"]]
@@ -150,7 +152,6 @@ def lex_root(l: Lexer) -> Optional[StateFn]: # noqa: D103
150152 c = l .next ()
151153
152154 if c != "$" :
153- l .backup ()
154155 l .error (f"expected '$', found { c !r} " )
155156 return None
156157
@@ -180,9 +181,8 @@ def lex_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0911
180181 l .emit (TokenType .LBRACKET )
181182 return lex_inside_bracketed_segment
182183
183- # default
184- l .backup ()
185184 if l .filter_depth :
185+ l .backup ()
186186 return lex_inside_filter
187187
188188 l .error (f"expected '.', '..' or a bracketed selection, found { c !r} " )
@@ -204,21 +204,21 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]: # noqa: D103
204204 l .emit (TokenType .LBRACKET )
205205 return lex_inside_bracketed_segment
206206
207- # default
208207 l .backup ()
209208
210209 if l .accept_match (RE_PROPERTY ):
211210 l .emit (TokenType .PROPERTY )
212211 return lex_segment
213212
213+ l .next ()
214214 l .error (f"unexpected descendant selection token { c !r} " )
215215 return None
216216
217217
218218def lex_shorthand_selector (l : Lexer ) -> Optional [StateFn ]: # noqa: D103
219219 l .ignore () # ignore dot
220220
221- if l .ignore_whitespace ( ):
221+ if l .accept_match ( RE_WHITESPACE ):
222222 l .error ("unexpected whitespace after dot" )
223223 return None
224224
@@ -318,11 +318,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
318318 return lex_inside_bracketed_segment
319319
320320 if c == "'" :
321- # String literal
322321 return lex_single_quoted_string_inside_filter_expression
323322
324323 if c == '"' :
325- # String literal
326324 return lex_double_quoted_string_inside_filter_expression
327325
328326 if c == "(" :
@@ -388,61 +386,31 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]: # noqa: D103, PLR0915, PL
388386 l .emit (TokenType .GT )
389387 continue
390388
391- # default
392389 l .backup ()
393390
394- # numbers
395- if l .accept_match (RE_INT ):
396- if l .peek () == "." :
397- # A float
398- l .next ()
399- if not l .accept_match (RE_INT ):
400- l .error ("a fractional digit is required after a decimal point" )
401- return None
402-
403- l .accept_match (RE_EXPONENT )
404- l .emit (TokenType .FLOAT )
405- continue
406-
407- # An int, or float if exponent is negative
408- if l .accept_match (RE_NEGATIVE_EXPONENT ):
409- l .emit (TokenType .FLOAT )
410- else :
411- l .accept_match (RE_EXPONENT )
412- l .emit (TokenType .INT )
413- continue
414-
415- if l .accept_match (RE_AND ):
391+ if l .accept ("&&" ):
416392 l .emit (TokenType .AND )
417- continue
418-
419- if l .accept_match (RE_OR ):
393+ elif l .accept ("||" ):
420394 l .emit (TokenType .OR )
421- continue
422-
423- if l .accept_match (RE_TRUE ):
395+ elif l .accept ("true" ):
424396 l .emit (TokenType .TRUE )
425- continue
426-
427- if l .accept_match (RE_FALSE ):
397+ elif l .accept ("false" ):
428398 l .emit (TokenType .FALSE )
429- continue
430-
431- if l .accept_match (RE_NULL ):
399+ elif l .accept ("null" ):
432400 l .emit (TokenType .NULL )
433- continue
434-
435- # functions
436- if l .accept_match (RE_FUNCTION_NAME ) and l .peek () == "(" :
401+ elif l .accept_match (RE_FLOAT ):
402+ l .emit (TokenType .FLOAT )
403+ elif l .accept_match (RE_INT ):
404+ l .emit (TokenType .INT )
405+ elif l .accept_match (RE_FUNCTION_NAME ) and l .peek () == "(" :
437406 # Keep track of parentheses for this function call.
438407 l .paren_stack .append (1 )
439408 l .emit (TokenType .FUNCTION )
440409 l .next ()
441410 l .ignore () # ignore LPAREN
442- continue
443-
444- l .error (f"unexpected filter selector token { c !r} " )
445- return None
411+ else :
412+ l .error (f"unexpected filter selector token { c !r} " )
413+ return None
446414
447415
448416def lex_string_factory (quote : str , state : StateFn ) -> StateFn :
@@ -467,16 +435,15 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
467435 return state
468436
469437 while True :
470- head = l .query [l .pos : l .pos + 2 ]
471438 c = l .next ()
472439
473- if head in ( "\\ \\ " , f" \\ { quote } " ) :
474- l . next ()
475- continue
476-
477- if c == " \\ " and not RE_ESCAPE . match ( head ) :
478- l .error ("invalid escape" )
479- return None
440+ if c == "\\ " :
441+ peeked = l . peek ()
442+ if peeked in ESCAPES or peeked == quote :
443+ l . next ()
444+ else :
445+ l .error ("invalid escape" )
446+ return None
480447
481448 if not c :
482449 l .error (f"unclosed string starting at index { l .start } " )
@@ -522,6 +489,6 @@ def tokenize(query: str) -> List[Token]:
522489 lexer .run ()
523490
524491 if tokens and tokens [- 1 ].type_ == TokenType .ERROR :
525- raise JSONPathSyntaxError (tokens [- 1 ].value , token = tokens [- 1 ])
492+ raise JSONPathSyntaxError (tokens [- 1 ].message , token = tokens [- 1 ])
526493
527494 return tokens
0 commit comments