From f829ec56b7fbb9637af985c9f6c509bbaf92261f Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Fri, 28 Apr 2017 19:05:16 -0400
Subject: [PATCH 1/6] Improve entity scopes, take two
---
grammars/html.cson | 60 ++++++++++++++++++++++++++++++------
spec/html-spec.coffee | 72 ++++++++++++++++++++++++++++++++++++++++---
2 files changed, 117 insertions(+), 15 deletions(-)
diff --git a/grammars/html.cson b/grammars/html.cson
index 1a21e67..f0dbeb3 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -343,7 +343,7 @@
]
}
{
- 'include': '#entities'
+ 'include': '#text-entities'
}
{
'match': '<>'
@@ -360,21 +360,61 @@
'include': '#python'
}
]
- 'entities':
+ 'text-entities':
+ # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
'patterns': [
{
- 'match': '(&)([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+)(;)'
- 'captures':
+ 'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX]\\h+)'
+ 'beginCaptures':
'1':
'name': 'punctuation.definition.entity.begin.html'
'2':
'name': 'entity.name.entity.other.html'
- '3':
+ 'end': ';'
+ 'endCaptures':
+ '0':
+ 'name': 'punctuation.definition.entity.end.html'
+ 'name': 'constant.character.entity.html'
+ }
+ {
+ 'match': '&(?!\\s|<|&)'
+ 'name': 'invalid.illegal.bad-ampersand.html'
+ }
+ ]
+ 'attribute-entities':
+ # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+ # Because it would be infeasible to include the entire list of allowed entities,
+ # make sure that an equals sign or the end of a string does not follow a potential reference.
+ 'patterns': [
+ {
+ 'begin': '(&)(#\\d+|#[xX]\\h+)'
+ 'beginCaptures':
+ '1':
+ 'name': 'punctuation.definition.entity.begin.html'
+ '2':
+ 'name': 'entity.name.entity.other.html'
+ 'end': ';'
+ 'endCaptures':
+ '0':
+ 'name': 'punctuation.definition.entity.end.html'
+ 'name': 'constant.character.entity.html'
+ }
+ {
+ 'begin': '(&)([a-zA-Z0-9]++)(?!["\'=])'
+ 'beginCaptures':
+ '1':
+ 'name': 'punctuation.definition.entity.begin.html'
+ '2':
+ 'name': 'entity.name.entity.other.html'
+ 'end': ';'
+ 'endCaptures':
+ '0':
'name': 'punctuation.definition.entity.end.html'
'name': 'constant.character.entity.html'
}
{
- 'match': '&'
+ # In attributes, potential references that end with an equals sign are fine
+ 'match': '&(?!\\s|<|&|[a-zA-Z0-9]+=)'
'name': 'invalid.illegal.bad-ampersand.html'
}
]
@@ -425,7 +465,7 @@
'include': '#embedded-code'
}
{
- 'include': '#entities'
+ 'include': '#attribute-entities'
}
]
'string-single-quoted':
@@ -443,7 +483,7 @@
'include': '#embedded-code'
}
{
- 'include': '#entities'
+ 'include': '#attribute-entities'
}
]
'tag-generic-attribute':
@@ -475,7 +515,7 @@
'include': '#embedded-code'
}
{
- 'include': '#entities'
+ 'include': '#attribute-entities'
}
]
}
@@ -495,7 +535,7 @@
'include': '#embedded-code'
}
{
- 'include': '#entities'
+ 'include': '#attribute-entities'
}
]
}
diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee
index ef91223..7ebe5b5 100644
--- a/spec/html-spec.coffee
+++ b/spec/html-spec.coffee
@@ -101,14 +101,76 @@ describe 'HTML grammar', ->
grammarTest path.join(__dirname, 'fixtures/syntax_test_html.html')
grammarTest path.join(__dirname, 'fixtures/syntax_test_html_template_fragments.html')
- describe "entities", ->
+ describe "entities in text", ->
it "tokenizes & and characters after it", ->
{tokens} = grammar.tokenizeLine '& &amp; &a'
- expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
- expect(tokens[3]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
- expect(tokens[4]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
- expect(tokens[7]).toEqual value: 'a', scopes: ['text.html.basic']
+ expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']
+ expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+ expect(tokens[2]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+ expect(tokens[3]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+ expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+ expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+
+ it "tokenizes hexadecimal and digit entities", ->
+ {tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;'
+
+ expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+ expect(tokens[1]).toEqual value: '#x00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+ expect(tokens[2]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+ expect(tokens[4]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+ expect(tokens[5]).toEqual value: '#X00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+ expect(tokens[6]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+ expect(tokens[8]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+ expect(tokens[9]).toEqual value: '#34', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+ expect(tokens[10]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+
+ it "tokenizes invalid ampersands", ->
+ {tokens} = grammar.tokenizeLine 'PSE&>'
+ expect(tokens[0]).toEqual value: 'PSE', scopes: ['text.html.basic']
+ expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
+ expect(tokens[2]).toEqual value: '>', scopes: ['text.html.basic']
+
+ {tokens} = grammar.tokenizeLine 'PSE&'
+ expect(tokens[0]).toEqual value: 'PSE&', scopes: ['text.html.basic']
+
+ {tokens} = grammar.tokenizeLine '&<'
+ expect(tokens[0]).toEqual value: '&<', scopes: ['text.html.basic']
+
+ {tokens} = grammar.tokenizeLine '& '
+ expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']
+
+ {tokens} = grammar.tokenizeLine '&'
+ expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic']
+
+ {tokens} = grammar.tokenizeLine '&&'
+ expect(tokens[0]).toEqual value: '&&', scopes: ['text.html.basic']
+
+ describe "entities in attributes", ->
+ it "tokenizes entities", ->
+ {tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">'
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+ expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
+ expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+
+ it "does not tokenize query parameters as entities", ->
+ {tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">'
+ expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+
+ it "tokenizes invalid ampersands", ->
+ {tokens} = grammar.tokenizeLine ''
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+
+ {tokens} = grammar.tokenizeLine ''
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+
+ {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
+ expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+
+ # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled
+ {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
+ expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
describe "firstLineMatch", ->
it "recognises HTML5 doctypes", ->
From 4af9fac9153c1f3fbcef5c657f443c0c93183cd1 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 7 Jun 2017 23:22:53 -0400
Subject: [PATCH 2/6] PCRE does not support \h as a hex-digit class
---
grammars/html.cson | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/grammars/html.cson b/grammars/html.cson
index f0dbeb3..e392c15 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -364,7 +364,7 @@
# https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
'patterns': [
{
- 'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX]\\h+)'
+ 'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)'
'beginCaptures':
'1':
'name': 'punctuation.definition.entity.begin.html'
@@ -387,7 +387,7 @@
# make sure that an equals sign or the end of a string does not follow a potential reference.
'patterns': [
{
- 'begin': '(&)(#\\d+|#[xX]\\h+)'
+ 'begin': '(&)(#\\d+|#[xX][0-9a-fA-F]+)'
'beginCaptures':
'1':
'name': 'punctuation.definition.entity.begin.html'
From ced920926b8f92464fb5a51e717d12705d17f098 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Tue, 19 Sep 2017 21:22:52 +0200
Subject: [PATCH 3/6] Update specs
---
spec/html-spec.coffee | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee
index e379676..3b69d6b 100644
--- a/spec/html-spec.coffee
+++ b/spec/html-spec.coffee
@@ -424,28 +424,28 @@ describe 'HTML grammar', ->
describe "entities in attributes", ->
it "tokenizes entities", ->
{tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">'
- expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
- expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
- expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+ expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
+ expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
it "does not tokenize query parameters as entities", ->
{tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">'
- expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+ expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
it "tokenizes invalid ampersands", ->
{tokens} = grammar.tokenizeLine ''
- expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
{tokens} = grammar.tokenizeLine ''
- expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
{tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
- expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+ expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
# Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled
{tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
- expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
- expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+ expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+ expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
describe "firstLineMatch", ->
it "recognises HTML5 doctypes", ->
From 8e54d3a826f8ea8954d487266b3067367c3af8fa Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 20 Sep 2017 20:25:21 +0200
Subject: [PATCH 4/6] Add requested spec
---
spec/html-spec.coffee | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee
index 3b69d6b..4e44b4d 100644
--- a/spec/html-spec.coffee
+++ b/spec/html-spec.coffee
@@ -387,6 +387,9 @@ describe 'HTML grammar', ->
expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+ lines = grammar.tokenizeLines '&\n'
+ expect(lines[0][0]).toEqual value: '&', scopes: ['text.html.basic']
+
it "tokenizes hexadecimal and digit entities", ->
{tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;'
@@ -433,6 +436,7 @@ describe 'HTML grammar', ->
expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
it "tokenizes invalid ampersands", ->
+ # Note: in order to replicate the following tests' behaviors, make sure you have language-hyperlink disabled
{tokens} = grammar.tokenizeLine ''
expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
@@ -442,7 +446,9 @@ describe 'HTML grammar', ->
{tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
- # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled
+ lines = grammar.tokenizeLines '<a href="http://example.com?&\n">'
+ expect(lines[0][6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+
{tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
From d78c555b78aeefca0af36640961c96de8b162548 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 20 Sep 2017 20:26:20 +0200
Subject: [PATCH 5/6] :memo: Update comment to HTML 5.1 spec
---
grammars/html.cson | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/grammars/html.cson b/grammars/html.cson
index ed0124b..9ea238f 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -416,7 +416,7 @@
}
]
'text-entities':
- # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+ # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
'patterns': [
{
'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)'
@@ -437,7 +437,7 @@
}
]
'attribute-entities':
- # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+ # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
# Because it would be infeasible to include the entire list of allowed entities,
# make sure that an equals sign or the end of a string does not follow a potential reference.
'patterns': [
From adef4ad003738f3462bff56091348a5d4a41dbd7 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 20 Sep 2017 20:27:36 +0200
Subject: [PATCH 6/6] Update old #entities reference to point to
#attribute-entities
---
grammars/html.cson | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/grammars/html.cson b/grammars/html.cson
index 9ea238f..273d5df 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -648,7 +648,7 @@
'unquoted-attribute':
'patterns': [
{
- 'include': '#entities'
+ 'include': '#attribute-entities'
}
{
# https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state