@@ -332,29 +332,42 @@ extension Source {
332332 /// Quantifier -> ('*' | '+' | '?' | '{' Range '}') QuantKind?
333333 /// QuantKind -> '?' | '+'
334334 ///
/// If no quantifier amount is present, `nil` is returned and nothing is
/// consumed, so any leading non-semantic whitespace stays available for the
/// caller to lex as ordinary trivia.
///
/// - Parameter context: The parsing context, forwarded to the whitespace and
///   range lexers.
/// - Returns: The quantifier amount, its kind (`.eager` when no kind suffix
///   follows), and any whitespace trivia lexed before the amount or between
///   the amount and the kind; `nil` if there is no quantifier here.
mutating func lexQuantifier(
  context: ParsingContext
) throws -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
  // Lex speculatively. Whitespace in front of the amount is eaten before we
  // know whether a quantifier follows; without the enclosing `tryEating` a
  // failed lex would leave that whitespace consumed and silently drop its
  // trivia. `tryEating` is relied on to rewind `src` when the closure yields
  // nil, exactly as the `{ Range }` case below already relies on it.
  return try tryEating { src in
    var trivia: [AST.Trivia] = []

    if let leading = try src.lexNonSemanticWhitespace(context: context) {
      trivia.append(leading)
    }

    let amt: Located<Quant.Amount>? = try src.recordLoc { src in
      if src.tryEat("*") { return .zeroOrMore }
      if src.tryEat("+") { return .oneOrMore }
      if src.tryEat("?") { return .zeroOrOne }

      // A braced range only counts as a quantifier if all three pieces are
      // present; otherwise back out so `{` can be treated as a literal.
      return try src.tryEating { src in
        guard src.tryEat("{"),
              let range = try src.lexRange(context: context),
              src.tryEat("}")
        else { return nil }
        return range.value
      }
    }
    guard let amt = amt else { return nil }

    // PCRE allows non-semantic whitespace here in extended syntax mode.
    if let inner = try src.lexNonSemanticWhitespace(context: context) {
      trivia.append(inner)
    }

    let kind: Located<Quant.Kind> = src.recordLoc { src in
      if src.tryEat("?") { return .reluctant }
      if src.tryEat("+") { return .possessive }
      return .eager
    }

    return (amt, kind, trivia)
  }
}
359372
360373 /// Try to consume a range, returning `nil` if unsuccessful.
@@ -363,7 +376,7 @@ extension Source {
363376 /// | ExpRange
364377 /// ExpRange -> '..<' <Int> | '...' <Int>
365378 /// | <Int> '..<' <Int> | <Int> '...' <Int>?
366- mutating func lexRange( ) throws -> Located < Quant . Amount > ? {
379+ mutating func lexRange( context : ParsingContext ) throws -> Located < Quant . Amount > ? {
367380 try recordLoc { src in
368381 try src. tryEating { src in
369382 let lowerOpt = try src. lexNumber ( )
@@ -375,7 +388,7 @@ extension Source {
375388 let closedRange : Bool ?
376389 if src. tryEat ( " , " ) {
377390 closedRange = true
378- } else if src . experimentalRanges && src. tryEat ( " . " ) {
391+ } else if context . experimentalRanges && src. tryEat ( " . " ) {
379392 try src. expect ( " . " )
380393 if src. tryEat ( " . " ) {
381394 closedRange = true
@@ -477,12 +490,12 @@ extension Source {
477490 ///
478491 /// TODO: Need to support some escapes
479492 ///
480- mutating func lexQuote( ) throws -> AST . Quote ? {
493+ mutating func lexQuote( context : ParsingContext ) throws -> AST . Quote ? {
481494 let str = try recordLoc { src -> String ? in
482495 if src. tryEat ( sequence: #"\Q"# ) {
483496 return try src. expectQuoted ( endingWith: #"\E"# ) . value
484497 }
485- if src . experimentalQuotes, src. tryEat ( " \" " ) {
498+ if context . experimentalQuotes, src. tryEat ( " \" " ) {
486499 return try src. expectQuoted ( endingWith: " \" " , ignoreEscaped: true ) . value
487500 }
488501 return nil
@@ -499,16 +512,27 @@ extension Source {
499512 ///
500513 /// ExpComment -> '/*' (!'*/' .)* '*/'
501514 ///
515+ /// With `SyntaxOptions.endOfLineComments`
516+ ///
517+ /// EndOfLineComment -> '#' .*
518+ ///
502519 /// TODO: Swift-style nested comments, line-ending comments, etc
503520 ///
504- mutating func lexComment( ) throws -> AST . Trivia ? {
521+ mutating func lexComment( context : ParsingContext ) throws -> AST . Trivia ? {
505522 let trivia : Located < String > ? = try recordLoc { src in
506523 if src. tryEat ( sequence: " (?# " ) {
507524 return try src. expectQuoted ( endingWith: " ) " ) . value
508525 }
509- if src . experimentalComments, src. tryEat ( sequence: " /*") {
526+ if context . experimentalComments, src. tryEat ( sequence: " /*") {
510527 return try src.expectQuoted(endingWith: "*/" ) . value
511528 }
529+ if context. endOfLineComments, src. tryEat ( " # " ) {
530+ // TODO: If we ever support multi-line regex literals, this will need
531+ // to be updated to stop at a newline. Note though that PCRE specifies
532+ // that the newline it matches against can be controlled by the global
533+ // matching options, e.g. `(*CR)`, `(*ANY)`, ...
534+ return src. lexUntil ( \. isEmpty) . value
535+ }
512536 return nil
513537 }
514538 guard let trivia = trivia else { return nil }
@@ -517,16 +541,55 @@ extension Source {
517541
518542 /// Try to consume non-semantic whitespace as trivia
519543 ///
544+ /// Whitespace -> WhitespaceChar+
545+ ///
520546 /// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set
///
/// - Parameter context: The parsing context; its `ignoreWhitespace` flag
///   gates this lexer.
/// - Returns: Trivia wrapping the located run of whitespace, or `nil` when
///   whitespace is semantic or `tryEatPrefix` matched nothing.
mutating func lexNonSemanticWhitespace(
  context: ParsingContext
) throws -> AST.Trivia? {
  guard context.ignoreWhitespace else { return nil }

  // The characters PCRE treats as whitespace when compiled with Unicode
  // support. This is a subset of the characters with the `.isWhitespace`
  // property. ICU appears to follow the same list, while Oniguruma and .NET
  // follow a subset of it.
  //
  // FIXME: PCRE only treats space and tab characters as whitespace when
  // inside a custom character class (and only treats whitespace as
  // non-semantic there for the extra-extended `(?xx)` mode). If we get a
  // strict-PCRE mode, we'll need to add a case for that.
  let isWhitespace: (Character) -> Bool = { candidate in
    // \t, \n, vertical tab, \f, \r
    let asciiControls: ClosedRange<Character> = "\u{9}" ... "\u{D}"
    if candidate == " " || asciiControls.contains(candidate) {
      return true
    }
    let unicodeExtras: [Character] = [
      "\u{85}",   // next line
      "\u{200E}", // left-to-right mark
      "\u{200F}", // right-to-left mark
      "\u{2028}", // line separator
      "\u{2029}", // paragraph separator
    ]
    return unicodeExtras.contains(candidate)
  }

  let located: Located<String>? = recordLoc { src in
    src.tryEatPrefix(isWhitespace)?.string
  }
  return located.map { AST.Trivia($0) }
}
529578
579+ /// Try to consume trivia.
580+ ///
581+ /// Trivia -> Comment | Whitespace
582+ ///
/// A comment is attempted first; non-semantic whitespace is only attempted
/// when no comment was lexed. Returns `nil` when neither is lexed.
mutating func lexTrivia(context: ParsingContext) throws -> AST.Trivia? {
  guard let comment = try lexComment(context: context) else {
    // No comment at this position, so fall back to whitespace (which may
    // itself yield nil).
    return try lexNonSemanticWhitespace(context: context)
  }
  return comment
}
592+
530593 /// Try to lex a matching option.
531594 ///
532595 /// MatchingOption -> 'i' | 'J' | 'm' | 'n' | 's' | 'U' | 'x' | 'xx' | 'w'
@@ -761,6 +824,7 @@ extension Source {
761824 /// comments, like quotes, cannot be quantified.
762825 ///
763826 mutating func lexGroupStart(
827+ context: ParsingContext
764828 ) throws -> Located < AST . Group . Kind > ? {
765829 try recordLoc { src in
766830 try src. tryEating { src in
@@ -825,7 +889,7 @@ extension Source {
825889 }
826890
827891 // (_:)
828- if src . experimentalCaptures && src. tryEat ( sequence: " _: " ) {
892+ if context . experimentalCaptures && src. tryEat ( sequence: " _: " ) {
829893 return . nonCapture
830894 }
831895 // TODO: (name:)
@@ -960,9 +1024,12 @@ extension Source {
9601024 ///
9611025 /// GroupConditionalStart -> '(?' GroupStart
9621026 ///
963- mutating func lexGroupConditionalStart( ) throws -> Located < AST . Group . Kind > ? {
1027+ mutating func lexGroupConditionalStart(
1028+ context: ParsingContext
1029+ ) throws -> Located < AST . Group . Kind > ? {
9641030 try tryEating { src in
965- guard src. tryEat ( sequence: " (? " ) , let group = try src. lexGroupStart ( )
1031+ guard src. tryEat ( sequence: " (? " ) ,
1032+ let group = try src. lexGroupStart ( context: context)
9661033 else { return nil }
9671034
9681035 // Implicitly scoped groups are not supported here.
@@ -1607,7 +1674,7 @@ extension Source {
16071674 var name : Located < String > ?
16081675 if src. tryEat ( " : " ) {
16091676 // TODO: PCRE allows escaped delimiters or '\Q...\E' sequences in the
1610- // name under PCRE2_ALT_VERBNAMES.
1677+ // name under PCRE2_ALT_VERBNAMES. It also allows whitespace under (?x).
16111678 name = try src. expectQuoted ( endingWith: " ) " , eatEnding: false )
16121679 }
16131680 try src. expect ( " ) " )
0 commit comments