From f829ec56b7fbb9637af985c9f6c509bbaf92261f Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Fri, 28 Apr 2017 19:05:16 -0400
Subject: [PATCH 1/6] Improve entity scopes, take two

---
 grammars/html.cson    | 60 ++++++++++++++++++++++++++++++------
 spec/html-spec.coffee | 72 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 117 insertions(+), 15 deletions(-)

diff --git a/grammars/html.cson b/grammars/html.cson
index 1a21e67..f0dbeb3 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -343,7 +343,7 @@
     ]
   }
   {
-    'include': '#entities'
+    'include': '#text-entities'
   }
   {
     'match': '<>'
@@ -360,21 +360,61 @@
         'include': '#python'
       }
     ]
-  'entities':
+  'text-entities':
+    # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
     'patterns': [
       {
-        'match': '(&)([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+)(;)'
-        'captures':
+        'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX]\\h+)'
+        'beginCaptures':
           '1':
             'name': 'punctuation.definition.entity.begin.html'
           '2':
             'name': 'entity.name.entity.other.html'
-          '3':
+        'end': ';'
+        'endCaptures':
+          '0':
+            'name': 'punctuation.definition.entity.end.html'
+        'name': 'constant.character.entity.html'
+      }
+      {
+        'match': '&(?!\\s|<|&)'
+        'name': 'invalid.illegal.bad-ampersand.html'
+      }
+    ]
+  'attribute-entities':
+    # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+    # Because it would be infeasible to include the entire list of allowed entities,
+    # make sure that an equals sign or the end of a string does not follow a potential reference.
+    'patterns': [
+      {
+        'begin': '(&)(#\\d+|#[xX]\\h+)'
+        'beginCaptures':
+          '1':
+            'name': 'punctuation.definition.entity.begin.html'
+          '2':
+            'name': 'entity.name.entity.other.html'
+        'end': ';'
+        'endCaptures':
+          '0':
+            'name': 'punctuation.definition.entity.end.html'
+        'name': 'constant.character.entity.html'
+      }
+      {
+        'begin': '(&)([a-zA-Z0-9]++)(?!["\'=])'
+        'beginCaptures':
+          '1':
+            'name': 'punctuation.definition.entity.begin.html'
+          '2':
+            'name': 'entity.name.entity.other.html'
+        'end': ';'
+        'endCaptures':
+          '0':
             'name': 'punctuation.definition.entity.end.html'
         'name': 'constant.character.entity.html'
       }
       {
-        'match': '&'
+        # In attributes, potential references that end with an equals sign are fine
+        'match': '&(?!\\s|<|&|[a-zA-Z0-9]+=)'
         'name': 'invalid.illegal.bad-ampersand.html'
       }
     ]
@@ -425,7 +465,7 @@
         'include': '#embedded-code'
       }
       {
-        'include': '#entities'
+        'include': '#attribute-entities'
       }
     ]
   'string-single-quoted':
@@ -443,7 +483,7 @@
         'include': '#embedded-code'
       }
       {
-        'include': '#entities'
+        'include': '#attribute-entities'
       }
     ]
   'tag-generic-attribute':
@@ -475,7 +515,7 @@
             'include': '#embedded-code'
           }
           {
-            'include': '#entities'
+            'include': '#attribute-entities'
           }
         ]
       }
@@ -495,7 +535,7 @@
             'include': '#embedded-code'
           }
           {
-            'include': '#entities'
+            'include': '#attribute-entities'
           }
         ]
       }
diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee
index ef91223..7ebe5b5 100644
--- a/spec/html-spec.coffee
+++ b/spec/html-spec.coffee
@@ -101,14 +101,76 @@ describe 'HTML grammar', ->
   grammarTest path.join(__dirname, 'fixtures/syntax_test_html.html')
   grammarTest path.join(__dirname, 'fixtures/syntax_test_html_template_fragments.html')
 
-  describe "entities", ->
+  describe "entities in text", ->
     it "tokenizes & and characters after it", ->
       {tokens} = grammar.tokenizeLine '& &amp; &a'
 
-      expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
-      expect(tokens[3]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
-      expect(tokens[4]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
-      expect(tokens[7]).toEqual value: 'a', scopes: ['text.html.basic']
+      expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']
+      expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[2]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[3]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+      expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+
+    it "tokenizes hexadecimal and digit entities", ->
+      {tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;'
+
+      expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[1]).toEqual value: '#x00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[2]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+      expect(tokens[4]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[5]).toEqual value: '#X00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[6]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+      expect(tokens[8]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[9]).toEqual value: '#34', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[10]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+
+    it "tokenizes invalid ampersands", ->
+      {tokens} = grammar.tokenizeLine 'PSE&>'
+      expect(tokens[0]).toEqual value: 'PSE', scopes: ['text.html.basic']
+      expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
+      expect(tokens[2]).toEqual value: '>', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine 'PSE&'
+      expect(tokens[0]).toEqual value: 'PSE&', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '&<'
+      expect(tokens[0]).toEqual value: '&<', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '& '
+      expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '&'
+      expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic']
+
+      {tokens} = grammar.tokenizeLine '&&'
+      expect(tokens[0]).toEqual value: '&&', scopes: ['text.html.basic']
+
+  describe "entities in attributes", ->
+    it "tokenizes entities", ->
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">'
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+
+    it "does not tokenize query parameters as entities", ->
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">'
+      expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+
+    it "tokenizes invalid ampersands", ->
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&">'
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&=">'
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
+      expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+
+      # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled
+      {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
+      expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
 
   describe "firstLineMatch", ->
     it "recognises HTML5 doctypes", ->

From 4af9fac9153c1f3fbcef5c657f443c0c93183cd1 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 7 Jun 2017 23:22:53 -0400
Subject: [PATCH 2/6] PCRE does not support \h as a hex-digit class

---
 grammars/html.cson | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/grammars/html.cson b/grammars/html.cson
index f0dbeb3..e392c15 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -364,7 +364,7 @@
     # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
     'patterns': [
       {
-        'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX]\\h+)'
+        'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)'
         'beginCaptures':
           '1':
             'name': 'punctuation.definition.entity.begin.html'
@@ -387,7 +387,7 @@
     # make sure that an equals sign or the end of a string does not follow a potential reference.
     'patterns': [
       {
-        'begin': '(&)(#\\d+|#[xX]\\h+)'
+        'begin': '(&)(#\\d+|#[xX][0-9a-fA-F]+)'
         'beginCaptures':
           '1':
             'name': 'punctuation.definition.entity.begin.html'

From ced920926b8f92464fb5a51e717d12705d17f098 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Tue, 19 Sep 2017 21:22:52 +0200
Subject: [PATCH 3/6] Update specs

---
 spec/html-spec.coffee | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee
index e379676..3b69d6b 100644
--- a/spec/html-spec.coffee
+++ b/spec/html-spec.coffee
@@ -424,28 +424,28 @@ describe 'HTML grammar', ->
   describe "entities in attributes", ->
     it "tokenizes entities", ->
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">'
-      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
-      expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
-      expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
+      expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
+      expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
 
     it "does not tokenize query parameters as entities", ->
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">'
-      expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+      expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
 
     it "tokenizes invalid ampersands", ->
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?&">'
-      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
 
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?&=">'
-      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
 
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
-      expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
+      expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
 
       # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
-      expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html']
-      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
+      expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+      expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
 
   describe "firstLineMatch", ->
     it "recognises HTML5 doctypes", ->

From 8e54d3a826f8ea8954d487266b3067367c3af8fa Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 20 Sep 2017 20:25:21 +0200
Subject: [PATCH 4/6] Add requested spec

---
 spec/html-spec.coffee | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/spec/html-spec.coffee b/spec/html-spec.coffee
index 3b69d6b..4e44b4d 100644
--- a/spec/html-spec.coffee
+++ b/spec/html-spec.coffee
@@ -387,6 +387,9 @@ describe 'HTML grammar', ->
       expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
       expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
 
+      lines = grammar.tokenizeLines '&\n'
+      expect(lines[0][0]).toEqual value: '&', scopes: ['text.html.basic']
+
     it "tokenizes hexadecimal and digit entities", ->
       {tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;'
 
@@ -433,6 +436,7 @@ describe 'HTML grammar', ->
       expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
 
     it "tokenizes invalid ampersands", ->
+      # Note: in order to replicate the following tests' behaviors, make sure you have language-hyperlink disabled
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?&">'
       expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']
 
@@ -442,7 +446,9 @@ describe 'HTML grammar', ->
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
       expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
 
-      # Note: in order to replicate this test's behavior, make sure you have language-hyperlink disabled
+      lines = grammar.tokenizeLines '<a href="http://example.com?&\n">'
+      expect(lines[0][6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
+
       {tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
       expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
       expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']

From d78c555b78aeefca0af36640961c96de8b162548 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 20 Sep 2017 20:26:20 +0200
Subject: [PATCH 5/6] :memo: Update comment to HTML 5.1 spec

---
 grammars/html.cson | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/grammars/html.cson b/grammars/html.cson
index ed0124b..9ea238f 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -416,7 +416,7 @@
       }
     ]
   'text-entities':
-    # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+    # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
     'patterns': [
       {
         'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)'
@@ -437,7 +437,7 @@
       }
     ]
   'attribute-entities':
-    # https://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+    # https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
     # Because it would be infeasible to include the entire list of allowed entities,
     # make sure that an equals sign or the end of a string does not follow a potential reference.
     'patterns': [

From adef4ad003738f3462bff56091348a5d4a41dbd7 Mon Sep 17 00:00:00 2001
From: Wliu <50Wliu@users.noreply.github.com>
Date: Wed, 20 Sep 2017 20:27:36 +0200
Subject: [PATCH 6/6] Update old #entities reference to point to
 #attribute-entities

---
 grammars/html.cson | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/grammars/html.cson b/grammars/html.cson
index 9ea238f..273d5df 100644
--- a/grammars/html.cson
+++ b/grammars/html.cson
@@ -648,7 +648,7 @@
   'unquoted-attribute':
     'patterns': [
       {
-        'include': '#entities'
+        'include': '#attribute-entities'
       }
       {
         # https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state