Skip to content

Commit 124297c

Browse files
committed
'Clean up' a few aspects of the code
1 parent 55fe763 commit 124297c

File tree

17 files changed

+127
-190
lines changed

17 files changed

+127
-190
lines changed
Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,14 @@
11
import html
22
import re
3+
from typing import Callable
34
from urllib.parse import urlparse, urlunparse, quote, unquote # noqa: F401
45

5-
# TODO
6-
# replaced: parsed = mdurl.parse(url, True)
6+
# TODO below we port the use of the JS packages:
7+
# var mdurl = require('mdurl')
8+
# var punycode = require('punycode')
9+
10+
# e.g. mdurl: parsed = mdurl.parse(url, True)
11+
#
712
# but need to check these fixes from https://www.npmjs.com/package/mdurl:
813
#
914
# Parse url string. Similar to node's url.parse,
@@ -56,7 +61,15 @@ def normalize_uri(uri):
5661

5762

5863
def normalizeLink(url):
64+
"""Normalize destination URLs in links::
65+
66+
[label]: destination 'title'
67+
^^^^^^^^^^^
68+
"""
69+
url_unescaped = unescape_string(url)
70+
return normalize_uri(url_unescaped)
5971

72+
# markdown-it code:
6073
# parsed = urlparse(url)
6174

6275
# if parsed.hostname:
@@ -71,13 +84,18 @@ def normalizeLink(url):
7184
# parsed.hostname = punycode.toASCII(parsed.hostname)
7285
# except Exception:
7386
# pass
74-
# quote(urlunparse(parsed))
75-
return normalize_uri(unescape_string(url))
87+
# return quote(urlunparse(parsed))
7688

7789

7890
def normalizeLinkText(title):
79-
"""Normalize autolinks """
91+
"""Normalize autolink content::
8092
93+
<destination>
94+
~~~~~~~~~~~
95+
"""
96+
return unquote(unescape_string(title))
97+
98+
# markdown-it code:
8199
# parsed = urlparse(url)
82100

83101
# if parsed.hostname:
@@ -92,24 +110,26 @@ def normalizeLinkText(title):
92110
# parsed.hostname = punycode.toUnicode(parsed.hostname)
93111
# except Exception:
94112
# pass
95-
return unquote(unescape_string(title)) # unquote(urlunparse(parsed))
113+
# return unquote(urlunparse(parsed))
96114

97115

98-
################################################################################
99-
#
100-
# This validator can prohibit more than really needed to prevent XSS. It's a
101-
# tradeoff to keep code simple and to be secure by default.
102-
#
103-
# If you need different setup - override validator method as you wish. Or
104-
# replace it with dummy function and use external sanitizer.
105-
#
106-
107116
BAD_PROTO_RE = re.compile(r"^(vbscript|javascript|file|data):")
108117
GOOD_DATA_RE = re.compile(r"^data:image\/(gif|png|jpeg|webp);")
109118

110119

111-
def validateLink(url: str):
112-
"""url should be normalized at this point, and existing entities are decoded."""
120+
def validateLink(url: str, validator: Callable = None):
121+
"""Validate URL link is allowed in output.
122+
123+
This validator can prohibit more than really needed to prevent XSS.
124+
It's a tradeoff to keep code simple and to be secure by default.
125+
126+
If you need different setup - override validator method as you wish.
127+
Or replace it with dummy function and use external sanitizer.
128+
129+
Note: url should be normalized at this point, and existing entities decoded.
130+
"""
131+
if validator is not None:
132+
return validator(url)
113133
url = url.strip().lower()
114134
return (
115135
(True if GOOD_DATA_RE.search(url) else False)

markdown_it/common/utils.py

Lines changed: 8 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
1-
"""Utilities
1+
"""Utilities for parsing source text
22
"""
33
import html
44
import re
55

66
from .entities import entities
77

8+
# from .normalize_url import unescape_string
9+
810

911
def charCodeAt(src: str, pos: int):
1012
"""
@@ -117,7 +119,7 @@ def replaceEntityPattern(match, name):
117119
"""
118120
::
119121
In [2]: from markdown_it import MarkdownIt
120-
...: md = MarkdownIt("working")
122+
...: md = MarkdownIt()
121123
...: md.render("![](https://www.google.com)")
122124
Out[2]: '<p><img src="https%3A//www.google.com" alt=""></p>\n'
123125
"""
@@ -134,11 +136,10 @@ def replaceEntityPattern(match, name):
134136
return match
135137

136138

137-
# function replaceEntities(string) {
138-
# if (string.indexOf('&') < 0) { return string; }
139-
139+
# def replaceEntities(string):
140+
# if (string.indexOf('&') < 0):
141+
# return string
140142
# return string.replace(ENTITY_RE, replaceEntityPattern)
141-
# }
142143

143144

144145
def unescapeMd(string: str):
@@ -148,23 +149,7 @@ def unescapeMd(string: str):
148149

149150

150151
def unescapeAll(string: str):
151-
if "\\" in string and "&" in string:
152-
return string
153-
154-
# TODO here we use the built-in python method
155-
# check this is ok?
156-
return html.escape(string).replace("&#x27;", "'")
157-
158-
def func(match):
159-
# TODO how to get escaped?
160-
escaped = False
161-
entity = match.group()
162-
if escaped:
163-
return escaped
164-
return replaceEntityPattern(match, entity)
165-
166-
string, _ = UNESCAPE_ALL_RE.subn(func, string)
167-
return string
152+
return html.unescape(string)
168153

169154

170155
# //////////////////////////////////////////////////////////////////////////////
@@ -341,30 +326,3 @@ def normalizeReference(string: str) -> str:
341326
# most notably, `__proto__`)
342327
#
343328
return string.lower().upper()
344-
345-
346-
#########################################################################
347-
348-
# Re-export libraries commonly used in both markdown-it and its plugins,
349-
# so plugins won't have to depend on them explicitly, which reduces their
350-
# bundled size (e.g. a browser build).
351-
#
352-
# exports.lib = {}
353-
# exports.lib.mdurl = require('mdurl')
354-
# exports.lib.ucmicro = require('uc.micro')
355-
356-
# exports.assign = assign
357-
# exports.has = has
358-
# exports.unescapeMd = unescapeMd
359-
# exports.unescapeAll = unescapeAll
360-
# exports.isValidEntityCode = isValidEntityCode
361-
# exports.fromCodePoint = fromCodePoint
362-
# // exports.replaceEntities = replaceEntities
363-
# exports.escapeHtml = escapeHtml
364-
# exports.arrayReplaceAt = arrayReplaceAt
365-
# exports.isSpace = isSpace
366-
# exports.isWhiteSpace = isWhiteSpace
367-
# exports.isMdAsciiPunct = isMdAsciiPunct
368-
# exports.isPunctChar = isPunctChar
369-
# exports.escapeRE = escapeRE
370-
# exports.normalizeReference = normalizeReference

markdown_it/helpers/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
"""
2-
Just a shortcut for bulk export
1+
"""Functions for parsing Links
32
"""
43
from .parse_link_label import parseLinkLabel # noqa: F401
54
from .parse_link_destination import parseLinkDestination # noqa: F401

markdown_it/main.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import Callable, List, Optional, Union
22

33
from . import helpers, presets # noqa F401
4-
from .normalize_url import normalizeLink, normalizeLinkText, validateLink
54
from .common import utils # noqa F401
65
from .parser_core import ParserCore # noqa F401
76
from .parser_block import ParserBlock # noqa F401
@@ -10,16 +9,12 @@
109
from .renderer import RendererHTML
1110
from .utils import AttrDict
1211

13-
# var LinkifyIt = require('linkify-it')
14-
# var mdurl = require('mdurl')
15-
# var punycode = require('punycode')
1612

1713
config = AttrDict(
1814
{
1915
"default": presets.default.presets,
2016
"zero": presets.zero.presets,
2117
"commonmark": presets.commonmark.presets,
22-
"working": presets.working.presets,
2318
}
2419
)
2520

@@ -46,11 +41,9 @@ def __init__(
4641
self.block = ParserBlock()
4742
self.core = ParserCore()
4843
self.renderer = RendererHTML() if renderer is None else renderer
44+
# var LinkifyIt = require('linkify-it')
4945
# self.linkify = LinkifyIt() # TODO maybe see https://github.com/Suor/autolink
5046

51-
self.validateLink = validateLink
52-
self.normalizeLink = normalizeLink
53-
self.normalizeLinkText = normalizeLinkText
5447
self.utils = utils
5548
self.helpers = helpers
5649
self.options = {}

markdown_it/port.yaml

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,34 @@
22
commit: f798bea9623277bbf89b9621cf7fb283c693fcab
33
date: Mar 12, 2020
44
notes:
5-
- Rename variables that use python built-in names
5+
- Rename variables that use python built-in names, e.g.
66
- `max` -> `maximum`
77
- `len` -> `length`
8-
- Convert JS for loops -> while loops
8+
- `str` -> `string`
9+
- |
10+
Convert JS for loops to while loops
11+
this is generally the main difference between the two codebases,
12+
because in Python you can't write e.g. `for (i=1; i<x; i++) {}`
913
- Use python version of `charCodeAt`
10-
- allow custom renderer to be passed to `MarkdownIt`
1114
- |
12-
render method signatures
15+
Remove indirect references within `MarkdownIt`;
16+
17+
self.validateLink = validateLink
18+
self.normalizeLink = normalizeLink
19+
self.normalizeLinkText = normalizeLinkText
20+
21+
in favour of using them directly through:
22+
23+
from markdown_it.common.normalize_url import normalizeLinkText
24+
25+
- Allow custom renderer to be passed to `MarkdownIt`
26+
- |
27+
change render method signatures
1328
`func(tokens, idx, options, env, slf)` to
1429
`func(self, tokens, idx, options, env)`
1530
- |
16-
Extension add render methods by format
31+
Extensions add render methods by format
1732
`MarkdownIt.add_render_rule(name, function, fmt="html")`,
18-
and renderers should declare a class property `__output__ = "html"`
33+
rather than `MarkdownIt.renderer.rules[name] = function`
34+
and renderers should declare a class property `__output__ = "html"`.
35+
This allows for extensibility to more than just HTML renderers

markdown_it/presets/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
from . import commonmark, default, working, zero # noqa: F401
1+
from . import commonmark, default, zero # noqa: F401

markdown_it/presets/commonmark.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,32 @@
33

44
presets = {
55
"options": {
6-
"html": True, # Enable HTML tags in source
7-
"xhtmlOut": True, # Use '/' to close single tags (<br />)
8-
"breaks": False, # Convert '\n' in paragraphs into <br>
9-
"langPrefix": "language-", # CSS language prefix for fenced blocks
6+
"maxNesting": 20, # Internal protection, recursion limit
7+
"html": True, # Enable HTML tags in source,
8+
# this is just a shorthand for .enable(["html_inline", "html_block"])
9+
# used by the linkify rule:
1010
"linkify": False, # autoconvert URL-like texts to links
11+
# used by the replacements and smartquotes rules
1112
# Enable some language-neutral replacements + quotes beautification
1213
"typographer": False,
14+
# used by the smartquotes rule:
1315
# Double + single quotes replacement pairs, when typographer enabled,
1416
# and smartquotes on. Could be either a String or an Array.
1517
#
1618
# For example, you can use '«»„“' for Russian, '„“‚‘' for German,
1719
# and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
1820
"quotes": "\u201c\u201d\u2018\u2019", # /* “”‘’ */
21+
# Renderer specific; these options are used directly in the HTML renderer
22+
"xhtmlOut": True, # Use '/' to close single tags (<br />)
23+
"breaks": False, # Convert '\n' in paragraphs into <br>
24+
"langPrefix": "language-", # CSS language prefix for fenced blocks
1925
# Highlighter function. Should return escaped HTML,
2026
# or '' if the source string is not changed and should be escaped externally.
2127
# If result starts with <pre... internal wrapper is skipped.
2228
#
2329
# function (/*str, lang*/) { return ''; }
2430
#
2531
"highlight": None,
26-
"maxNesting": 20, # Internal protection, recursion limit
2732
},
2833
"components": {
2934
"core": {"rules": ["normalize", "block", "inline"]},

markdown_it/presets/default.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,27 +3,31 @@
33

44
presets = {
55
"options": {
6+
"maxNesting": 100, # Internal protection, recursion limit
67
"html": False, # Enable HTML tags in source
7-
"xhtmlOut": False, # Use '/' to close single tags (<br />)
8-
"breaks": False, # Convert '\n' in paragraphs into <br>
9-
"langPrefix": "language-", # CSS language prefix for fenced blocks
8+
# this is just a shorthand for .disable(["html_inline", "html_block"])
9+
# used by the linkify rule:
1010
"linkify": False, # autoconvert URL-like texts to links
11+
# used by the replacements and smartquotes rules:
1112
# Enable some language-neutral replacements + quotes beautification
1213
"typographer": False,
14+
# used by the smartquotes rule:
1315
# Double + single quotes replacement pairs, when typographer enabled,
1416
# and smartquotes on. Could be either a String or an Array.
15-
#
1617
# For example, you can use '«»„“' for Russian, '„“‚‘' for German,
1718
# and ['«\xA0', '\xA0»', '‹\xA0', '\xA0›'] for French (including nbsp).
1819
"quotes": "\u201c\u201d\u2018\u2019", # /* “”‘’ */
20+
# Renderer specific; these options are used directly in the HTML renderer
21+
"xhtmlOut": False, # Use '/' to close single tags (<br />)
22+
"breaks": False, # Convert '\n' in paragraphs into <br>
23+
"langPrefix": "language-", # CSS language prefix for fenced blocks
1924
# Highlighter function. Should return escaped HTML,
2025
# or '' if the source string is not changed and should be escaped externaly.
2126
# If result starts with <pre... internal wrapper is skipped.
2227
#
2328
# function (/*str, lang*/) { return ''; }
2429
#
2530
"highlight": None,
26-
"maxNesting": 100, # Internal protection, recursion limit
2731
},
2832
"components": {"core": {}, "block": {}, "inline": {}},
2933
}

0 commit comments

Comments
 (0)