@@ -63,71 +63,137 @@ struct DelimiterLexError: Error, CustomStringConvertible {
6363 }
6464}
6565
66- /// Attempt to lex a regex literal between `start` and `end`, returning either
67- /// the contents and pointer from which to resume lexing, or an error.
68- func lexRegex(
69- start: UnsafeRawPointer , end: UnsafeRawPointer
70- ) throws -> ( contents: String , Delimiter , end: UnsafeRawPointer ) {
71- precondition ( start <= end)
72- var current = start
/// A cursor-based lexer that scans the raw bytes between `start` and `end` for
/// a delimited regex literal.
fileprivate struct DelimiterLexer {
  /// The first byte of the buffer being lexed.
  let start: UnsafeRawPointer

  /// The position of the next byte to examine.
  var cursor: UnsafeRawPointer

  /// The position one past the last byte of the buffer.
  let end: UnsafeRawPointer

  /// Create a lexer whose cursor is positioned at `start`.
  ///
  /// - Precondition: `start <= end`.
  init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
    precondition(start <= end)
    self.end = end
    self.start = start
    self.cursor = start
  }

  /// Convert a scalar to a single byte, asserting that its value is within the
  /// ASCII range.
  func ascii(_ s: Unicode.Scalar) -> UInt8 {
    let codePoint = s.value
    assert(codePoint <= 0x7F)
    return UInt8(asserting: codePoint)
  }

  /// Read the byte under the cursor without consuming it.
  ///
  /// - Returns: The byte at `cursor`, or `nil` once the cursor has reached
  ///   `end`.
  func load() -> UInt8? {
    if cursor < end {
      return cursor.load(as: UInt8.self)
    }
    return nil
  }

  /// Form a view of `count` bytes beginning at an arbitrary position.
  ///
  /// - Returns: The requested bytes, or `nil` if fewer than `count` bytes
  ///   remain between that position and `end`.
  func slice(
    at position: UnsafeRawPointer, _ count: Int
  ) -> UnsafeRawBufferPointer? {
    if position + count <= end {
      return UnsafeRawBufferPointer(start: position, count: count)
    }
    return nil
  }

  /// Form a view of `count` bytes beginning at the current cursor.
  ///
  /// - Returns: The requested bytes, or `nil` if fewer than `count` bytes
  ///   remain before `end`.
  func slice(_ count: Int) -> UnsafeRawBufferPointer? {
    return slice(at: cursor, count)
  }

  /// Move the cursor forward by `n` bytes.
  ///
  /// - Precondition: The resulting cursor must not be past `end`.
  mutating func advanceCursor(_ n: Int = 1) {
    cursor = cursor.advanced(by: n)
    precondition(cursor <= end, "Cannot advance past end")
  }

  /// Report whether the bytes at the cursor exactly match `utf8`, without
  /// consuming anything.
  func canEat(_ utf8: String.UTF8View) -> Bool {
    return slice(utf8.count)?.elementsEqual(utf8) ?? false
  }

  /// Consume `utf8` if it appears at the cursor.
  ///
  /// - Returns: `true` if the sequence matched and the cursor was moved past
  ///   it; `false` (leaving the cursor untouched) otherwise.
  mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
    if canEat(utf8) {
      advanceCursor(utf8.count)
      return true
    }
    return false
  }

  /// Consume the closing form of `delimiter` if it appears at the cursor, and
  /// produce the literal's contents.
  ///
  /// - Parameters:
  ///   - delimiter: The delimiter whose closing sequence is expected.
  ///   - contentsStart: The position of the first byte of the contents.
  /// - Returns: The decoded contents and the position just past the closing
  ///   delimiter, or `nil` if the closing delimiter is not at the cursor.
  /// - Throws: `DelimiterLexError` with `.invalidUTF8` if the closing
  ///   delimiter was consumed but the contents are not valid UTF-8.
  mutating func tryEatEnding(
    _ delimiter: Delimiter, contentsStart: UnsafeRawPointer
  ) throws -> (contents: String, end: UnsafeRawPointer)? {
    // Remember where the contents stop before the delimiter is consumed.
    let contentsEnd = cursor
    if !tryEat(delimiter.closing.utf8) {
      return nil
    }

    let rawContents = UnsafeRawBufferPointer(
      start: contentsStart, count: contentsEnd - contentsStart)
    let decoded = String(decoding: rawContents, as: UTF8.self)

    // `String(decoding:as:)` repairs ill-formed input, so a byte-wise mismatch
    // with the source means the contents were not valid UTF-8.
    if decoded.utf8.elementsEqual(rawContents) {
      return (contents: decoded, end: cursor)
    }
    throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
  }

  /// Step over the next byte of the literal's contents.
  ///
  /// - Parameter escaped: `true` when the byte being consumed directly follows
  ///   a backslash, in which case a backslash is treated as an ordinary byte.
  /// - Throws: `DelimiterLexError` with `.endOfString` if the end of the
  ///   buffer, a line feed, or a carriage return is encountered.
  mutating func advance(escaped: Bool = false) throws {
    guard let byte = load() else {
      throw DelimiterLexError(.endOfString, resumeAt: cursor)
    }
    let scalar = UnicodeScalar(byte)

    // A non-ASCII byte belongs to a multi-byte UTF-8 sequence. Stepping
    // through such a sequence one byte at a time is fine because only ASCII is
    // ever matched against, and the contents are validated as a whole once the
    // closing delimiter is found. Handling this first leaves only ASCII below.
    guard scalar.isASCII else {
      advanceCursor()
      return
    }

    switch scalar {
    case "\n", "\r":
      throw DelimiterLexError(.endOfString, resumeAt: cursor)

    case "\\" where !escaped:
      // Consume the backslash, then the byte it escapes.
      advanceCursor()
      try advance(escaped: true)

    case "\0":
      // TODO: Warn to match the behavior of String literal lexer? Or should
      // we error as unprintable?
      advanceCursor()

    default:
      advanceCursor()
    }
  }

  /// Lex a complete delimited literal starting at the cursor.
  ///
  /// - Returns: The literal's contents, the delimiter that enclosed it, and
  ///   the position just past the closing delimiter.
  /// - Throws: `DelimiterLexError` if no known opening delimiter is present,
  ///   if the literal is not terminated before the end of the line or buffer,
  ///   or if the contents are not valid UTF-8.
  /*consuming*/ mutating func lex(
  ) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {

    // The first delimiter whose opening sequence can be eaten wins.
    let opened = Delimiter.allCases.first(
      where: { candidate in tryEat(candidate.opening.utf8) }
    )
    guard let delimiter = opened else {
      throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
    }

    let contentsStart = cursor
    while true {
      // Look for the closing delimiter before consuming another byte.
      guard let ending = try tryEatEnding(
        delimiter, contentsStart: contentsStart
      ) else {
        try advance()
        continue
      }
      return (ending.contents, delimiter, ending.end)
    }
  }
}
@@ -151,3 +217,12 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
151217 }
152218 fatalError ( " No valid delimiters " )
153219}
220+
/// Lex a regex literal from the bytes between `start` and `end`.
///
/// - Parameters:
///   - start: The first byte to lex.
///   - end: The position one past the last byte that may be read.
/// - Returns: The literal's contents, the delimiter that enclosed it, and the
///   pointer from which lexing should resume.
/// - Throws: Any error produced by `DelimiterLexer.lex()`.
func lexRegex(
  start: UnsafeRawPointer, end: UnsafeRawPointer
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
  var delimiterLexer = DelimiterLexer(start: start, end: end)
  let result = try delimiterLexer.lex()
  return result
}
0 commit comments