diff --git a/grammars/html.cson b/grammars/html.cson
index 215afe1..273d5df 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -398,7 +398,7 @@
     ]
   }
   {
-    'include': '#entities'
+    'include': '#text-entities'
   }
   {
     'match': '<>'
@@ -415,21 +415,61 @@
         'include': '#python'
       }
     ]
-  'entities':
+  'text-entities':
+    # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
     'patterns': [
       {
-        'match': '(&)([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+)(;)'
-        'captures':
+        'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)'
+        'beginCaptures':
           '1':
             'name': 'punctuation.definition.entity.begin.html'
           '2':
             'name': 'entity.name.entity.other.html'
-          '3':
+        'end': ';'
+        'endCaptures':
+          '0':
+            'name': 'punctuation.definition.entity.end.html'
+        'name': 'constant.character.entity.html'
+      }
+      {
+        'match': '&(?!\\s|<|&)'
+        'name': 'invalid.illegal.bad-ampersand.html'
+      }
+    ]
+  'attribute-entities':
+    # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
+    # Because it would be infeasible to include the entire list of allowed entities,
+    # make sure that an equals sign or the end of a string does not follow a potential reference.
+    'patterns': [
+      {
+        'begin': '(&)(#\\d+|#[xX][0-9a-fA-F]+)'
+        'beginCaptures':
+          '1':
+            'name': 'punctuation.definition.entity.begin.html'
+          '2':
+            'name': 'entity.name.entity.other.html'
+        'end': ';'
+        'endCaptures':
+          '0':
+            'name': 'punctuation.definition.entity.end.html'
+        'name': 'constant.character.entity.html'
+      }
+      {
+        'begin': '(&)([a-zA-Z0-9]++)(?!["\'=])'
+        'beginCaptures':
+          '1':
+            'name': 'punctuation.definition.entity.begin.html'
+          '2':
+            'name': 'entity.name.entity.other.html'
+        'end': ';'
+        'endCaptures':
+          '0':
             'name': 'punctuation.definition.entity.end.html'
         'name': 'constant.character.entity.html'
       }
       {
-        'match': '&'
+        # In attributes, potential references that end with an equals sign are fine
+        'match': '&(?!\\s|<|&|[a-zA-Z0-9]+=)'
         'name': 'invalid.illegal.bad-ampersand.html'
       }
     ]
@@ -480,7 +520,7 @@
         'include': '#embedded-code'
       }
       {
-        'include': '#entities'
+        'include': '#attribute-entities'
       }
     ]
   'string-single-quoted':
@@ -498,7 +538,7 @@
         'include': '#embedded-code'
       }
       {
-        'include': '#entities'
+        'include': '#attribute-entities'
      }
     ]
   'tag-generic-attribute':
@@ -559,7 +599,7 @@
             'include': '#embedded-code'
           }
           {
-            'include': '#entities'
+            'include': '#attribute-entities'
           }
         ]
       }
@@ -579,7 +619,7 @@
             'include': '#embedded-code'
           }
           {
-            'include': '#entities'
+            'include': '#attribute-entities'
           }
         ]
       }
@@ -608,7 +648,7 @@
   'unquoted-attribute':
     'patterns': [
       {
-        'include': '#entities'
+        'include': '#attribute-entities'
       }
       {
         # https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state
diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee
index a6f7754..4e44b4d 100644
--- a/spec/html-spec.coffee
+++ b/spec/html-spec.coffee
@@ -376,14 +376,82 @@ describe 'HTML grammar', ->
       expect(lines[2][1]).toEqual value: 'disabled', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-without-value.html', 'entity.other.attribute-name.html']
       expect(lines[2][2]).toEqual value: '>', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'punctuation.definition.tag.end.html']
 
-  describe "entities", ->
+  describe "entities in text", ->
     it "tokenizes & and characters after it", ->
       {tokens} = grammar.tokenizeLine '& &amp; &a'
 
-      expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
-      expect(tokens[3]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
-      expect(tokens[4]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
-      expect(tokens[7]).toEqual value: 'a', scopes: ['text.html.basic']
+      expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']
+      expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[2]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[3]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+      expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+
+      lines = grammar.tokenizeLines '&\n'
+      expect(lines[0][0]).toEqual value: '&', scopes: ['text.html.basic']
+
+    it "tokenizes hexadecimal and digit entities", ->
+      {tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;'
+
+      expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[1]).toEqual value: '#x00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[2]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+      expect(tokens[4]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[5]).toEqual value: '#X00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[6]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+      expect(tokens[8]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[9]).toEqual value: '#34', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[10]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+
+    it "tokenizes invalid ampersands", ->
+      {tokens} = grammar.tokenizeLine 'PSE&>'
+      expect(tokens[0]).toEqual value: 'PSE', scopes: ['text.html.basic']
+      expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
+      expect(tokens[2]).toEqual value: '>', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine 'PSE&'
+      expect(tokens[0]).toEqual value: 'PSE&', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '&<'
+      expect(tokens[0]).toEqual value: '&<', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '& '
+      expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '&'
+      expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '&&'
+      expect(tokens[0]).toEqual value: '&&', scopes: ['text.html.basic']
+
+  describe "entities in attributes", ->
+    it "tokenizes entities", ->
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">'
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+
+    it "does not tokenize query parameters as entities", ->
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">'
+      expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+
+    it "tokenizes invalid ampersands", ->
+      # Note: in order to replicate the following tests' behaviors, make sure you have language-hyperlink disabled
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&">'
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&=">'
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
+      expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+
+      lines = grammar.tokenizeLines '<a href="http://example.com?&\n">'
+      expect(lines[0][6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
+      expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
 
   describe "firstLineMatch", ->
     it "recognises HTML5 doctypes", ->
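
Not part of the patch above: a small CoffeeScript sketch of how the attribute-level bad-ampersand pattern decides what to flag, handy for sanity-checking the lookahead outside of Atom. The regex literal is copied from the 'attribute-entities' rule in the diff; the badAmpersand variable name and the sample strings are only illustrative.

# Flags "&" only when it is NOT followed by whitespace, "<", another "&",
# or something shaped like a query parameter (alphanumerics ending in "=").
badAmpersand = /&(?!\s|<|&|[a-zA-Z0-9]+=)/

console.log badAmpersand.test 'one=1&type=json&topic=css'  # false: "&type=" stays plain string text
console.log badAmpersand.test 'http://example.com?& '      # false: "&" followed by whitespace is left alone
console.log badAmpersand.test 'http://example.com?&"'      # true:  "&" right before a closing quote is flagged invalid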