@@ -775,9 +775,127 @@ fileprivate extension Compiler.ByteCodeGen {
775775 builder. label ( exit)
776776 }
777777
/// Coalesce any adjacent scalar members in a custom character class together.
/// This is required in order to produce correct grapheme matching behavior.
///
/// Nested character classes (`.custom` members and both operands of
/// `.intersection`, `.subtraction` and `.symmetricDifference`) are rewritten
/// first via `coalescingCustomCharacterClass`. The resulting members are then
/// passed to `coalescing(with:into:)` together with an `Accumulator` that
/// merges literal atoms, quoted literals and ranges with literal bounds.
///
/// - Parameter members: The members of a custom character class.
/// - Returns: The rewritten members, with coalescable literal members merged
///   into quoted literals and ranges.
func coalescingCustomCharacterClassMembers(
  _ members: [DSLTree.CustomCharacterClass.Member]
) -> [DSLTree.CustomCharacterClass.Member] {
  struct Accumulator {
    /// A series of range operands. For example, in `[ab-cde-fg]`, this will
    /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting
    /// ranges will be created.
    ///
    /// This is seeded with a single *empty* operand so that `current` is
    /// always valid and no spurious character ends up in the first quoted
    /// literal or range bound.
    private var rangeOperands: [String] = [""]

    /// The current range operand, i.e. the last element of `rangeOperands`.
    private var current: String {
      _read { yield rangeOperands[rangeOperands.count - 1] }
      _modify { yield &rangeOperands[rangeOperands.count - 1] }
    }

    /// Try to accumulate a character class member, returning `true` if
    /// successful, `false` otherwise.
    ///
    /// Atoms and range bounds are only accepted when they have a
    /// `literalCharacterValue`; quoted literals are always accepted. Any other
    /// kind of member is rejected.
    mutating func tryAccumulate(
      _ member: DSLTree.CustomCharacterClass.Member
    ) -> Bool {
      switch member {
      case .atom(let a):
        guard let c = a.literalCharacterValue else { return false }
        current.append(c)
        return true
      case .quotedLiteral(let str):
        current += str
        return true
      case let .range(lhs, rhs):
        guard let lhs = lhs.literalCharacterValue,
              let rhs = rhs.literalCharacterValue
        else { return false }
        // The lower bound terminates the current operand, the upper bound
        // starts a new one. Appending to the strings lets adjacent scalars
        // combine with the bounds.
        current.append(lhs)
        rangeOperands.append(String(rhs))
        return true
      default:
        return false
      }
    }

    /// Convert the accumulated operands back into character class members.
    ///
    /// With a single operand the result is one quoted literal. Otherwise, for
    /// `[ab-cde-fg]` (operands `["ab", "cde", "fg"]`) the result is
    /// `"a"`, `b-c`, `"d"`, `e-f`, `"g"`.
    func finish() -> [DSLTree.CustomCharacterClass.Member] {
      if rangeOperands.count == 1 {
        // If we didn't have any additional range operands, this isn't a
        // range, we can just form a standard quoted literal.
        return [.quotedLiteral(current)]
      }
      var members: [DSLTree.CustomCharacterClass.Member] = []

      // We have other range operands, splice them together. For N operands
      // we have N - 1 ranges.
      for (i, lhs) in rangeOperands.dropLast().enumerated() {
        let rhs = rangeOperands[i + 1]

        // If this is the first operand we only need to drop the last
        // character for its quoted members, otherwise this is both an LHS
        // and RHS of a range, and as such needs both sides trimmed.
        let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast()
        if !leading.isEmpty {
          members.append(.quotedLiteral(String(leading)))
        }
        // Neither unwrap can fail: an operand is only followed by another one
        // after `tryAccumulate` appended a lower bound to it, and every
        // operand after the first is created from an upper bound.
        members.append(.range(.char(lhs.last!), .char(rhs.first!)))
      }
      // We've handled everything except the quoted portion of the last
      // operand, add it now.
      let trailing = rangeOperands.last!.dropFirst()
      if !trailing.isEmpty {
        members.append(.quotedLiteral(String(trailing)))
      }
      return members
    }
  }
  return members
    .map { m -> DSLTree.CustomCharacterClass.Member in
      // First we need to recursively coalesce any child character classes.
      switch m {
      case .custom(let ccc):
        return .custom(coalescingCustomCharacterClass(ccc))
      case .intersection(let lhs, let rhs):
        return .intersection(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .subtraction(let lhs, let rhs):
        return .subtraction(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .symmetricDifference(let lhs, let rhs):
        return .symmetricDifference(
          coalescingCustomCharacterClass(lhs),
          coalescingCustomCharacterClass(rhs))
      case .atom, .range, .quotedLiteral, .trivia:
        return m
      }
    }
    // NOTE(review): `coalescing(with:into:)` is defined elsewhere; presumably
    // it replaces each run of consecutive members accepted by the closure
    // with the output of `finish()` — confirm against the helper.
    .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in
      accum.tryAccumulate(member)
    }
}
877+
/// Returns `ccc` with its members coalesced when matching with grapheme
/// cluster semantics; for any other semantic level `ccc` is returned as is.
func coalescingCustomCharacterClass(
  _ ccc: DSLTree.CustomCharacterClass
) -> DSLTree.CustomCharacterClass {
  // Coalescing only applies to grapheme semantic mode. In scalar semantic
  // mode scalars must not be merged into a grapheme, so e.g.
  // `[e\u{301}-\u{302}]` stays a range between U+301 and U+302.
  if options.semanticLevel != .graphemeCluster {
    return ccc
  }

  // Rebuild the class from the coalesced members, keeping its inversion flag.
  return DSLTree.CustomCharacterClass(
    members: coalescingCustomCharacterClassMembers(ccc.members),
    isInverted: ccc.isInverted)
}
890+
778891 mutating func emitCustomCharacterClass(
779892 _ ccc: DSLTree . CustomCharacterClass
780893 ) throws {
894+ // Before emitting a custom character class in grapheme semantic mode, we
895+ // need to coalesce together any adjacent characters and scalars, over which
896+ // we can perform grapheme breaking. This includes e.g range bounds for
897+ // `[e\u{301}-\u{302}]`.
898+ let ccc = coalescingCustomCharacterClass ( ccc)
781899 if let asciiBitset = ccc. asAsciiBitset ( options) ,
782900 optimizationsEnabled {
783901 if options. semanticLevel == . unicodeScalar {
0 commit comments