Skip to content

Commit d7a7589

Browse files
authored
Optimizations for read_po (#1200)
* Avoid extra casts (`Message()` takes care of those)
* Optimize empty normalized strings
* Don't sort translations unless plural
* Optimize unescape()
* Optimize line processing
* Optimize keyword parsing
* Optimize comment parsing
* Avoid hot `isinstance`ing in PO file parse loop
* Add fast paths in `python_format` and `python_brace_format`
* Inline distincting in `catalog.py`
1 parent 3ce1e61 commit d7a7589

File tree

4 files changed

+105
-138
lines changed

4 files changed

+105
-138
lines changed

babel/messages/catalog.py

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from babel.core import Locale, UnknownLocaleError
2424
from babel.dates import format_datetime
2525
from babel.messages.plurals import get_plural
26-
from babel.util import LOCALTZ, _cmp, distinct
26+
from babel.util import LOCALTZ, _cmp
2727

2828
if TYPE_CHECKING:
2929
from typing_extensions import TypeAlias
@@ -164,7 +164,7 @@ def __init__(
164164
if not string and self.pluralizable:
165165
string = ('', '')
166166
self.string = string
167-
self.locations = list(distinct(locations))
167+
self.locations = list(dict.fromkeys(locations)) if locations else []
168168
self.flags = set(flags)
169169
if id and self.python_format:
170170
self.flags.add('python-format')
@@ -174,12 +174,15 @@ def __init__(
174174
self.flags.add('python-brace-format')
175175
else:
176176
self.flags.discard('python-brace-format')
177-
self.auto_comments = list(distinct(auto_comments))
178-
self.user_comments = list(distinct(user_comments))
179-
if isinstance(previous_id, str):
180-
self.previous_id = [previous_id]
177+
self.auto_comments = list(dict.fromkeys(auto_comments)) if auto_comments else []
178+
self.user_comments = list(dict.fromkeys(user_comments)) if user_comments else []
179+
if previous_id:
180+
if isinstance(previous_id, str):
181+
self.previous_id = [previous_id]
182+
else:
183+
self.previous_id = list(previous_id)
181184
else:
182-
self.previous_id = list(previous_id)
185+
self.previous_id = []
183186
self.lineno = lineno
184187
self.context = context
185188

@@ -289,9 +292,12 @@ def python_format(self) -> bool:
289292
290293
:type: `bool`"""
291294
ids = self.id
292-
if not isinstance(ids, (list, tuple)):
293-
ids = [ids]
294-
return any(PYTHON_FORMAT.search(id) for id in ids)
295+
if isinstance(ids, (list, tuple)):
296+
for id in ids: # Explicit loop for performance reasons.
297+
if PYTHON_FORMAT.search(id):
298+
return True
299+
return False
300+
return bool(PYTHON_FORMAT.search(ids))
295301

296302
@property
297303
def python_brace_format(self) -> bool:
@@ -304,9 +310,12 @@ def python_brace_format(self) -> bool:
304310
305311
:type: `bool`"""
306312
ids = self.id
307-
if not isinstance(ids, (list, tuple)):
308-
ids = [ids]
309-
return any(_has_python_brace_format(id) for id in ids)
313+
if isinstance(ids, (list, tuple)):
314+
for id in ids: # Explicit loop for performance reasons.
315+
if _has_python_brace_format(id):
316+
return True
317+
return False
318+
return _has_python_brace_format(ids)
310319

311320

312321
class TranslationError(Exception):
@@ -729,12 +738,9 @@ def __setitem__(self, id: _MessageID, message: Message) -> None:
729738
# The new message adds pluralization
730739
current.id = message.id
731740
current.string = message.string
732-
current.locations = list(distinct(current.locations +
733-
message.locations))
734-
current.auto_comments = list(distinct(current.auto_comments +
735-
message.auto_comments))
736-
current.user_comments = list(distinct(current.user_comments +
737-
message.user_comments))
741+
current.locations = list(dict.fromkeys([*current.locations, *message.locations]))
742+
current.auto_comments = list(dict.fromkeys([*current.auto_comments, *message.auto_comments]))
743+
current.user_comments = list(dict.fromkeys([*current.user_comments, *message.user_comments]))
738744
current.flags |= message.flags
739745
elif id == '':
740746
# special treatment for the header message
@@ -916,8 +922,8 @@ def _merge(message: Message, oldkey: tuple[str, str] | str, newkey: tuple[str, s
916922
assert oldmsg is not None
917923
message.string = oldmsg.string
918924

919-
if keep_user_comments:
920-
message.user_comments = list(distinct(oldmsg.user_comments))
925+
if keep_user_comments and oldmsg.user_comments:
926+
message.user_comments = list(dict.fromkeys(oldmsg.user_comments))
921927

922928
if isinstance(message.id, (list, tuple)):
923929
if not isinstance(message.string, (list, tuple)):

babel/messages/pofile.py

Lines changed: 70 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,17 @@
1717

1818
from babel.core import Locale
1919
from babel.messages.catalog import Catalog, Message
20-
from babel.util import TextWrapper, _cmp
20+
from babel.util import TextWrapper
2121

2222
if TYPE_CHECKING:
2323
from typing import IO, AnyStr
2424

2525
from _typeshed import SupportsWrite
2626

2727

28+
_unescape_re = re.compile(r'\\([\\trn"])')
29+
30+
2831
def unescape(string: str) -> str:
2932
r"""Reverse `escape` the given string.
3033
@@ -45,7 +48,10 @@ def replace_escapes(match):
4548
return '\r'
4649
# m is \ or "
4750
return m
48-
return re.compile(r'\\([\\trn"])').sub(replace_escapes, string[1:-1])
51+
52+
if "\\" not in string: # Fast path: there's nothing to unescape
53+
return string[1:-1]
54+
return _unescape_re.sub(replace_escapes, string[1:-1])
4955

5056

5157
def denormalize(string: str) -> str:
@@ -73,7 +79,7 @@ def denormalize(string: str) -> str:
7379
escaped_lines = string.splitlines()
7480
if string.startswith('""'):
7581
escaped_lines = escaped_lines[1:]
76-
return ''.join(unescape(line) for line in escaped_lines)
82+
return ''.join(map(unescape, escaped_lines))
7783
else:
7884
return unescape(string)
7985

@@ -132,48 +138,14 @@ def __init__(self, message: str, catalog: Catalog, line: str, lineno: int) -> No
132138
self.lineno = lineno
133139

134140

135-
class _NormalizedString:
136-
141+
class _NormalizedString(list):
137142
def __init__(self, *args: str) -> None:
138-
self._strs: list[str] = []
139-
for arg in args:
140-
self.append(arg)
141-
142-
def append(self, s: str) -> None:
143-
self._strs.append(s.strip())
143+
super().__init__(map(str.strip, args))
144144

145145
def denormalize(self) -> str:
146-
return ''.join(unescape(s) for s in self._strs)
147-
148-
def __bool__(self) -> bool:
149-
return bool(self._strs)
150-
151-
def __repr__(self) -> str:
152-
return os.linesep.join(self._strs)
153-
154-
def __cmp__(self, other: object) -> int:
155-
if not other:
156-
return 1
157-
158-
return _cmp(str(self), str(other))
159-
160-
def __gt__(self, other: object) -> bool:
161-
return self.__cmp__(other) > 0
162-
163-
def __lt__(self, other: object) -> bool:
164-
return self.__cmp__(other) < 0
165-
166-
def __ge__(self, other: object) -> bool:
167-
return self.__cmp__(other) >= 0
168-
169-
def __le__(self, other: object) -> bool:
170-
return self.__cmp__(other) <= 0
171-
172-
def __eq__(self, other: object) -> bool:
173-
return self.__cmp__(other) == 0
174-
175-
def __ne__(self, other: object) -> bool:
176-
return self.__cmp__(other) != 0
146+
if not self:
147+
return ""
148+
return ''.join(map(unescape, self))
177149

178150

179151
class PoFileParser:
@@ -183,13 +155,6 @@ class PoFileParser:
183155
See `read_po` for simple cases.
184156
"""
185157

186-
_keywords = [
187-
'msgid',
188-
'msgstr',
189-
'msgctxt',
190-
'msgid_plural',
191-
]
192-
193158
def __init__(self, catalog: Catalog, ignore_obsolete: bool = False, abort_invalid: bool = False) -> None:
194159
self.catalog = catalog
195160
self.ignore_obsolete = ignore_obsolete
@@ -216,23 +181,20 @@ def _add_message(self) -> None:
216181
Add a message to the catalog based on the current parser state and
217182
clear the state ready to process the next message.
218183
"""
219-
self.translations.sort()
220184
if len(self.messages) > 1:
221185
msgid = tuple(m.denormalize() for m in self.messages)
222-
else:
223-
msgid = self.messages[0].denormalize()
224-
if isinstance(msgid, (list, tuple)):
225186
string = ['' for _ in range(self.catalog.num_plurals)]
226-
for idx, translation in self.translations:
187+
for idx, translation in sorted(self.translations):
227188
if idx >= self.catalog.num_plurals:
228189
self._invalid_pofile("", self.offset, "msg has more translations than num_plurals of catalog")
229190
continue
230191
string[idx] = translation.denormalize()
231192
string = tuple(string)
232193
else:
194+
msgid = self.messages[0].denormalize()
233195
string = self.translations[0][1].denormalize()
234196
msgctxt = self.context.denormalize() if self.context else None
235-
message = Message(msgid, string, list(self.locations), set(self.flags),
197+
message = Message(msgid, string, self.locations, self.flags,
236198
self.auto_comments, self.user_comments, lineno=self.offset + 1,
237199
context=msgctxt)
238200
if self.obsolete:
@@ -247,27 +209,19 @@ def _finish_current_message(self) -> None:
247209
if self.messages:
248210
if not self.translations:
249211
self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'")
250-
self.translations.append([0, _NormalizedString("")])
212+
self.translations.append([0, _NormalizedString()])
251213
self._add_message()
252214

253215
def _process_message_line(self, lineno, line, obsolete=False) -> None:
254-
if line.startswith('"'):
216+
if not line:
217+
return
218+
if line[0] == '"':
255219
self._process_string_continuation_line(line, lineno)
256220
else:
257221
self._process_keyword_line(lineno, line, obsolete)
258222

259223
def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
260-
261-
for keyword in self._keywords:
262-
try:
263-
if line.startswith(keyword) and line[len(keyword)] in [' ', '[']:
264-
arg = line[len(keyword):]
265-
break
266-
except IndexError:
267-
self._invalid_pofile(line, lineno, "Keyword must be followed by a string")
268-
else:
269-
self._invalid_pofile(line, lineno, "Start of line didn't match any expected keyword.")
270-
return
224+
keyword, _, arg = line.partition(' ')
271225

272226
if keyword in ['msgid', 'msgctxt']:
273227
self._finish_current_message()
@@ -283,19 +237,23 @@ def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
283237
self.in_msgctxt = False
284238
self.in_msgid = True
285239
self.messages.append(_NormalizedString(arg))
240+
return
286241

287-
elif keyword == 'msgstr':
242+
if keyword == 'msgctxt':
243+
self.in_msgctxt = True
244+
self.context = _NormalizedString(arg)
245+
return
246+
247+
if keyword == 'msgstr' or keyword.startswith('msgstr['):
288248
self.in_msgid = False
289249
self.in_msgstr = True
290-
if arg.startswith('['):
291-
idx, msg = arg[1:].split(']', 1)
292-
self.translations.append([int(idx), _NormalizedString(msg)])
293-
else:
294-
self.translations.append([0, _NormalizedString(arg)])
250+
kwarg, has_bracket, idxarg = keyword.partition('[')
251+
idx = int(idxarg[:-1]) if has_bracket else 0
252+
s = _NormalizedString(arg) if arg != '""' else _NormalizedString()
253+
self.translations.append([idx, s])
254+
return
295255

296-
elif keyword == 'msgctxt':
297-
self.in_msgctxt = True
298-
self.context = _NormalizedString(arg)
256+
self._invalid_pofile(line, lineno, "Unknown or misformatted keyword")
299257

300258
def _process_string_continuation_line(self, line, lineno) -> None:
301259
if self.in_msgid:
@@ -307,49 +265,62 @@ def _process_string_continuation_line(self, line, lineno) -> None:
307265
else:
308266
self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
309267
return
310-
s.append(line)
268+
s.append(line.strip()) # For performance reasons, `NormalizedString` doesn't strip internally
311269

312270
def _process_comment(self, line) -> None:
313271

314272
self._finish_current_message()
315273

316-
if line[1:].startswith(':'):
274+
prefix = line[:2]
275+
if prefix == '#:':
317276
for location in _extract_locations(line[2:]):
318-
pos = location.rfind(':')
319-
if pos >= 0:
277+
a, colon, b = location.rpartition(':')
278+
if colon:
320279
try:
321-
lineno = int(location[pos + 1:])
280+
self.locations.append((a, int(b)))
322281
except ValueError:
323282
continue
324-
self.locations.append((location[:pos], lineno))
325-
else:
283+
else: # No line number specified
326284
self.locations.append((location, None))
327-
elif line[1:].startswith(','):
328-
for flag in line[2:].lstrip().split(','):
329-
self.flags.append(flag.strip())
330-
elif line[1:].startswith('.'):
285+
return
286+
287+
if prefix == '#,':
288+
self.flags.extend(flag.strip() for flag in line[2:].lstrip().split(','))
289+
return
290+
291+
if prefix == '#.':
331292
# These are called auto-comments
332293
comment = line[2:].strip()
333294
if comment: # Just check that we're not adding empty comments
334295
self.auto_comments.append(comment)
335-
else:
336-
# These are called user comments
337-
self.user_comments.append(line[1:].strip())
296+
return
297+
298+
# These are called user comments
299+
self.user_comments.append(line[1:].strip())
338300

339301
def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
340302
"""
341-
Reads from the file-like object `fileobj` and adds any po file
342-
units found in it to the `Catalog` supplied to the constructor.
303+
Reads from the file-like object (or iterable of string-likes) `fileobj`
304+
and adds any po file units found in it to the `Catalog`
305+
supplied to the constructor.
306+
307+
All of the items in the iterable must be the same type; either `str`
308+
or `bytes` (decoded with the catalog charset), but not a mixture.
343309
"""
310+
needs_decode = None
344311

345312
for lineno, line in enumerate(fileobj):
346313
line = line.strip()
347-
if not isinstance(line, str):
348-
line = line.decode(self.catalog.charset)
314+
if needs_decode is None:
315+
# If we don't yet know whether we need to decode,
316+
# let's find out now.
317+
needs_decode = not isinstance(line, str)
349318
if not line:
350319
continue
351-
if line.startswith('#'):
352-
if line[1:].startswith('~'):
320+
if needs_decode:
321+
line = line.decode(self.catalog.charset)
322+
if line[0] == '#':
323+
if line[:2] == '#~':
353324
self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
354325
else:
355326
try:
@@ -364,8 +335,8 @@ def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
364335
# No actual messages found, but there was some info in comments, from which
365336
# we'll construct an empty header message
366337
if not self.counter and (self.flags or self.user_comments or self.auto_comments):
367-
self.messages.append(_NormalizedString('""'))
368-
self.translations.append([0, _NormalizedString('""')])
338+
self.messages.append(_NormalizedString())
339+
self.translations.append([0, _NormalizedString()])
369340
self._add_message()
370341

371342
def _invalid_pofile(self, line, lineno, msg) -> None:

0 commit comments

Comments
 (0)