From f829ec56b7fbb9637af985c9f6c509bbaf92261f Mon Sep 17 00:00:00 2001 From: Wliu <50Wliu@users.noreply.github.com> Date: Fri, 28 Apr 2017 19:05:16 -0400 Subject: [PATCH 1/6] Improve entity scopes, take two --- grammars/html.cson | 60 ++++++++++++++++++++++++++++++------ spec/html-spec.coffee | 72 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 117 insertions(+), 15 deletions(-) diff --git a/grammars/html.cson b/grammars/html.cson index 1a21e67..f0dbeb3 100644 --- a/grammars/html.cson +++ b/grammars/html.cson @@ -343,7 +343,7 @@ ] } { - 'include': '#entities' + 'include': '#text-entities' } { 'match': '<>' @@ -360,21 +360,61 @@ 'include': '#python' } ] - 'entities': + 'text-entities': + # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference 'patterns': [ { - 'match': '(&)([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+)(;)' - 'captures': + 'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX]\\h+)' + 'beginCaptures': '1': 'name': 'punctuation.definition.entity.begin.html' '2': 'name': 'entity.name.entity.other.html' - '3': + 'end': ';' + 'endCaptures': + '0': + 'name': 'punctuation.definition.entity.end.html' + 'name': 'constant.character.entity.html' + } + { + 'match': '&(?!\\s|<|&)' + 'name': 'invalid.illegal.bad-ampersand.html' + } + ] + 'attribute-entities': + # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference + # Because it would be infeasible to include the entire list of allowed entities, + # make sure that an equals sign or the end of a string does not follow a potential reference. 
+ 'patterns': [ + { + 'begin': '(&)(#\\d+|#[xX]\\h+)' + 'beginCaptures': + '1': + 'name': 'punctuation.definition.entity.begin.html' + '2': + 'name': 'entity.name.entity.other.html' + 'end': ';' + 'endCaptures': + '0': + 'name': 'punctuation.definition.entity.end.html' + 'name': 'constant.character.entity.html' + } + { + 'begin': '(&)([a-zA-Z0-9]++)(?!["\'=])' + 'beginCaptures': + '1': + 'name': 'punctuation.definition.entity.begin.html' + '2': + 'name': 'entity.name.entity.other.html' + 'end': ';' + 'endCaptures': + '0': 'name': 'punctuation.definition.entity.end.html' 'name': 'constant.character.entity.html' } { - 'match': '&' + # In attributes, potential references that end with an equals sign are fine + 'match': '&(?!\\s|<|&|[a-zA-Z0-9]+=)' 'name': 'invalid.illegal.bad-ampersand.html' } ] @@ -425,7 +465,7 @@ 'include': '#embedded-code' } { - 'include': '#entities' + 'include': '#attribute-entities' } ] 'string-single-quoted': @@ -443,7 +483,7 @@ 'include': '#embedded-code' } { - 'include': '#entities' + 'include': '#attribute-entities' } ] 'tag-generic-attribute': @@ -475,7 +515,7 @@ 'include': '#embedded-code' } { - 'include': '#entities' + 'include': '#attribute-entities' } ] } @@ -495,7 +535,7 @@ 'include': '#embedded-code' } { - 'include': '#entities' + 'include': '#attribute-entities' } ] } diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee index ef91223..7ebe5b5 100644 --- a/spec/html-spec.coffee +++ b/spec/html-spec.coffee @@ -101,14 +101,76 @@ describe 'HTML grammar', -> grammarTest path.join(__dirname, 'fixtures/syntax_test_html.html') grammarTest path.join(__dirname, 'fixtures/syntax_test_html_template_fragments.html') - describe "entities", -> + describe "entities in text", -> it "tokenizes & and characters after it", -> {tokens} = grammar.tokenizeLine '& &amp; &a' - expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html'] - expect(tokens[3]).toEqual value: 'amp', scopes: ['text.html.basic',
'constant.character.entity.html', 'entity.name.entity.other.html'] - expect(tokens[4]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html'] - expect(tokens[7]).toEqual value: 'a', scopes: ['text.html.basic'] + expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic'] + expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] + expect(tokens[2]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html'] + expect(tokens[3]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html'] + expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] + expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html'] + + it "tokenizes hexadecimal and digit entities", -> + {tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;' + + expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] + expect(tokens[1]).toEqual value: '#x00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html'] + expect(tokens[2]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html'] + expect(tokens[4]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] + expect(tokens[5]).toEqual value: '#X00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html'] + expect(tokens[6]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html',
'punctuation.definition.entity.end.html'] + expect(tokens[8]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] + expect(tokens[9]).toEqual value: '#34', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html'] + expect(tokens[10]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html'] + + it "tokenizes invalid ampersands", -> + {tokens} = grammar.tokenizeLine 'PSE&>' + expect(tokens[0]).toEqual value: 'PSE', scopes: ['text.html.basic'] + expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html'] + expect(tokens[2]).toEqual value: '>', scopes: ['text.html.basic'] + + {tokens} = grammar.tokenizeLine 'PSE&' + expect(tokens[0]).toEqual value: 'PSE&', scopes: ['text.html.basic'] + + {tokens} = grammar.tokenizeLine '&<' + expect(tokens[0]).toEqual value: '&<', scopes: ['text.html.basic'] + + {tokens} = grammar.tokenizeLine '& ' + expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic'] + + {tokens} = grammar.tokenizeLine '&' + expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic'] + + {tokens} = grammar.tokenizeLine '&&' + expect(tokens[0]).toEqual value: '&&', scopes: ['text.html.basic'] + + describe "entities in attributes", -> + it "tokenizes entities", -> + {tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">' + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] + expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html'] + expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html',
'constant.character.entity.html', 'punctuation.definition.entity.end.html'] + + it "does not tokenize query parameters as entities", -> + {tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">' + expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html'] + + it "tokenizes invalid ampersands", -> + {tokens} = grammar.tokenizeLine '<a href="http://example.com?&">' + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] + + {tokens} = grammar.tokenizeLine '<a href="http://example.com?&=">' + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] + + {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">' + expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html'] + + # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled + {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">' + expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html'] + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] describe "firstLineMatch", -> it "recognises HTML5 doctypes", -> From 4af9fac9153c1f3fbcef5c657f443c0c93183cd1 Mon Sep 17 00:00:00 2001 From: Wliu <50Wliu@users.noreply.github.com> Date: Wed, 7 Jun 2017 23:22:53 -0400 Subject: [PATCH 2/6] PCRE does not support \h --- grammars/html.cson | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grammars/html.cson b/grammars/html.cson index f0dbeb3..e392c15 100644 --- a/grammars/html.cson +++ b/grammars/html.cson @@ -364,7 +364,7 @@ # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference 'patterns': [
{ - 'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX]\\h+)' + 'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)' 'beginCaptures': '1': 'name': 'punctuation.definition.entity.begin.html' @@ -387,7 +387,7 @@ # make sure that an equals sign or the end of a string does not follow a potential reference. 'patterns': [ { - 'begin': '(&)(#\\d+|#[xX]\\h+)' + 'begin': '(&)(#\\d+|#[xX][0-9a-fA-F]+)' 'beginCaptures': '1': 'name': 'punctuation.definition.entity.begin.html' From ced920926b8f92464fb5a51e717d12705d17f098 Mon Sep 17 00:00:00 2001 From: Wliu <50Wliu@users.noreply.github.com> Date: Tue, 19 Sep 2017 21:22:52 +0200 Subject: [PATCH 3/6] Update specs --- spec/html-spec.coffee | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee index e379676..3b69d6b 100644 --- a/spec/html-spec.coffee +++ b/spec/html-spec.coffee @@ -424,28 +424,28 @@ describe 'HTML grammar', -> describe "entities in attributes", -> it "tokenizes entities", -> {tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">' - expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] - expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html'] - expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html'] + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] + expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html',
'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html'] + expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html'] it "does not tokenize query parameters as entities", -> {tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">' - expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html'] + expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html'] it "tokenizes invalid ampersands", -> {tokens} = grammar.tokenizeLine '<a href="http://example.com?&">' - expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] {tokens} = grammar.tokenizeLine '<a href="http://example.com?&=">' - expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">' - expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html'] + expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html',
'string.quoted.double.html'] # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">' - expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html'] - expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] + expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html'] + expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] describe "firstLineMatch", -> it "recognises HTML5 doctypes", -> From 8e54d3a826f8ea8954d487266b3067367c3af8fa Mon Sep 17 00:00:00 2001 From: Wliu <50Wliu@users.noreply.github.com> Date: Wed, 20 Sep 2017 20:25:21 +0200 Subject: [PATCH 4/6] Add requested spec --- spec/html-spec.coffee | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee index 3b69d6b..4e44b4d 100644 --- a/spec/html-spec.coffee +++ b/spec/html-spec.coffee @@ -387,6 +387,9 @@ describe 'HTML grammar', -> expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html'] expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html'] + lines = grammar.tokenizeLines '&\n' + expect(lines[0][0]).toEqual value: '&', scopes: ['text.html.basic'] + it "tokenizes hexadecimal and digit entities", -> {tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;' @@ -433,6 +436,7 @@ describe 'HTML grammar', -> expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes:
['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html'] it "tokenizes invalid ampersands", -> + # Note: in order to replicate the following tests' behaviors, make sure you have language-hyperlink disabled {tokens} = grammar.tokenizeLine '<a href="http://example.com?&">' expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] @@ -442,7 +446,9 @@ describe 'HTML grammar', -> {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">' expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html'] - # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled + lines = grammar.tokenizeLines '<a href="http://example.com?&\n">' + expect(lines[0][6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html'] + {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">' expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html'] expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html'] From d78c555b78aeefca0af36640961c96de8b162548 Mon Sep 17 00:00:00 2001 From: Wliu <50Wliu@users.noreply.github.com> Date: Wed, 20 Sep 2017 20:26:20 +0200 Subject: [PATCH 5/6] :memo: Update comment to HTML 5.1 spec --- grammars/html.cson | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grammars/html.cson b/grammars/html.cson index ed0124b..9ea238f 100644 --- a/grammars/html.cson +++ b/grammars/html.cson @@ -416,7 +416,7 @@ } ] 'text-entities': - #
https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference + # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference 'patterns': [ { 'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)' @@ -437,7 +437,7 @@ } ] 'attribute-entities': - # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference + # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference # Because it would be infeasible to include the entire list of allowed entities, # make sure that an equals sign or the end of a string does not follow a potential reference. 'patterns': [ From adef4ad003738f3462bff56091348a5d4a41dbd7 Mon Sep 17 00:00:00 2001 From: Wliu <50Wliu@users.noreply.github.com> Date: Wed, 20 Sep 2017 20:27:36 +0200 Subject: [PATCH 6/6] Update old #entities reference to point to #attribute-entities --- grammars/html.cson | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grammars/html.cson b/grammars/html.cson index 9ea238f..273d5df 100644 --- a/grammars/html.cson +++ b/grammars/html.cson @@ -648,7 +648,7 @@ 'unquoted-attribute': 'patterns': [ { - 'include': '#entities' + 'include': '#attribute-entities' } { # https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state