diff --git a/docxtpl/template.py b/docxtpl/template.py index f20280a..a757037 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -15,10 +15,47 @@ from docx.opc.oxml import parse_xml from docx.opc.part import XmlPart import docx.oxml.ns +from docx.oxml import OxmlElement +from docx.oxml.ns import qn from docx.opc.constants import RELATIONSHIP_TYPE as REL_TYPE from jinja2 import Environment, Template, meta from jinja2.exceptions import TemplateError + +def _create_optimized_env(**kwargs): + """Create an optimized Jinja2 environment for better performance. + + Optimizations applied: + - auto_reload=False: Skip checking if template source changed + - cache_size=400: Larger template cache for repeated renders + - enable_async=False: Disable async support (not needed, adds overhead) + """ + return Environment( + auto_reload=False, # Disable template auto-reload (faster) + cache_size=400, # Increase template cache size + enable_async=False, # Disable async (not needed, reduces overhead) + **kwargs + ) + + +# Module-level cached environments (created once, reused across all instances) +_CACHED_ENV = None +_CACHED_ENV_AUTOESCAPE = None + + +def _get_cached_env(autoescape=False): + """Get or create a cached Jinja2 environment for performance.""" + global _CACHED_ENV, _CACHED_ENV_AUTOESCAPE + + if autoescape: + if _CACHED_ENV_AUTOESCAPE is None: + _CACHED_ENV_AUTOESCAPE = _create_optimized_env(autoescape=True) + return _CACHED_ENV_AUTOESCAPE + else: + if _CACHED_ENV is None: + _CACHED_ENV = _create_optimized_env(autoescape=False) + return _CACHED_ENV + try: from html import escape # noqa: F401 except ImportError: @@ -43,6 +80,56 @@ class DocxTemplate(object): "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" ) + # Pre-compiled regex patterns for patch_xml() optimization + # These are compiled once at class load time, not on every render + _RE_JINJA_OPEN = re.compile( + r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}#])(<[^>]*>)+(?=\})", + re.DOTALL + ) + _RE_JINJA_CONTENT = re.compile( + r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*", + re.DOTALL + ) + _RE_COLSPAN = re.compile( + r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)", + re.DOTALL + ) + _RE_CELLBG = re.compile( + r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)", + re.DOTALL + ) + _RE_SPACE_PRESERVE = re.compile( + r"((?:(?!).)*)({{.*?}}|{%.*?%})", + re.DOTALL + ) + _RE_SPACE_PRESERVE_R = re.compile( + r"({{r\s.*?}}|{%r\s.*?%})", + re.DOTALL + ) + _RE_MERGE_PREV = re.compile(r"(?:(?!).)*?{%-", re.DOTALL) + _RE_MERGE_NEXT = re.compile(r"-%}(?:(?!]|{%|{{).)*?]*?>", re.DOTALL) + _RE_VMERGE = re.compile( + r"](?:(?!]).)*?{%\s*vm\s*%}.*?]", + re.DOTALL + ) + _RE_HMERGE = re.compile( + r"](?:(?!]).)*?{%\s*hm\s*%}.*?]", + re.DOTALL + ) + _RE_CLEAN_TAGS = re.compile(r"(?<=\{[\{%])(.*?)(?=[\}%]})") + _RE_PARAGRAPH_NEWLINE = re.compile(r"])") + _RE_PARAGRAPH_REMOVE_NEWLINE = re.compile(r"\n])") + _RE_STRIPTAGS = re.compile(r".*?(|]*>)", re.DOTALL) + _RE_COLSPAN_EMPTY = re.compile(r"](?:(?!]).)*.*?", re.DOTALL) + _RE_GRIDSPAN = re.compile(r"") + _RE_TCPR = re.compile(r"(]*>)") + _RE_SHD = re.compile(r"") + _RE_RESOLVE_PARAGRAPH = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RESOLVE_RUN = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RESOLVE_TEXT = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RUN_PROPS = re.compile(r".*?") + _RE_PARA_PROPS = re.compile(r".*?") + def __init__(self, template_file: Union[IO[bytes], str, PathLike]) -> None: self.template_file = template_file self.reset_replacements() @@ -88,94 +175,54 @@ def patch_xml(self, src_xml): unescape html entities, etc...""" # replace {{ by {{ ( works with {{ }} {% and %} {# and #}) - src_xml = re.sub( - r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}\#])(<[^>]*>)+(?=\})", - "", - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_JINJA_OPEN.sub("", src_xml) # replace {{jinja2 stuff}} by {{jinja2 stuff}} # same thing with {% ... %} and {# #} # "jinja2 stuff" could a variable, a 'if' etc... anything jinja2 will understand def striptags(m): - return re.sub( - ".*?(|]*>)", "", m.group(0), flags=re.DOTALL - ) + return self._RE_STRIPTAGS.sub("", m.group(0)) - src_xml = re.sub( - r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*", - striptags, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_JINJA_CONTENT.sub(striptags, src_xml) # manage table cell colspan def colspan(m): cell_xml = m.group(1) + m.group(3) - cell_xml = re.sub( - r"](?:(?!]).)*.*?", - "", - cell_xml, - flags=re.DOTALL, - ) - cell_xml = re.sub(r"", "", cell_xml, count=1) - return re.sub( - r"(]*>)", + cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml) + cell_xml = self._RE_GRIDSPAN.sub("", cell_xml, count=1) + return self._RE_TCPR.sub( r'\1' % m.group(2), cell_xml, ) - src_xml = re.sub( - r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)", - colspan, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_COLSPAN.sub(colspan, src_xml) # manage table cell background color def cellbg(m): cell_xml = m.group(1) + m.group(3) - cell_xml = re.sub( - r"](?:(?!]).)*.*?", - "", - cell_xml, - flags=re.DOTALL, - ) - cell_xml = re.sub(r"", "", cell_xml, count=1) - return re.sub( - r"(]*>)", + cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml) + cell_xml = self._RE_SHD.sub("", cell_xml, count=1) + return self._RE_TCPR.sub( r'\1' % m.group(2), cell_xml, ) - src_xml = re.sub( - r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)", - cellbg, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_CELLBG.sub(cellbg, src_xml) # ensure space preservation - src_xml = re.sub( - r"((?:(?!).)*)({{.*?}}|{%.*?%})", + src_xml = self._RE_SPACE_PRESERVE.sub( r'\1\2', src_xml, - flags=re.DOTALL, ) - src_xml = re.sub( - r"({{r\s.*?}}|{%r\s.*?%})", + src_xml = self._RE_SPACE_PRESERVE_R.sub( r'\1', src_xml, - flags=re.DOTALL, ) # {%- will merge with previous paragraph text - src_xml = re.sub(r"(?:(?!).)*?{%-", "{%", src_xml, flags=re.DOTALL) + src_xml = self._RE_MERGE_PREV.sub("{%", src_xml) # -%} will merge with next paragraph text - src_xml = re.sub( - r"-%}(?:(?!]|{%|{{).)*?]*?>", "%}", src_xml, flags=re.DOTALL - ) + src_xml = self._RE_MERGE_NEXT.sub("%}", src_xml) for y in ["tr", "tc", "p", "r"]: # replace into xml code the row/paragraph/run containing @@ -220,12 +267,7 @@ def v_merge(m1): flags=re.DOTALL, ) - src_xml = re.sub( - r"](?:(?!]).)*?{%\s*vm\s*%}.*?]", - v_merge_tc, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_VMERGE.sub(v_merge_tc, src_xml) # Use ``{% hm %}`` to make table cell become horizontally merged within # a ``{% for %}``. @@ -279,12 +321,7 @@ def without_gridspan(m2): # Discard every other cell generated in loop. return "{% if loop.first %}" + xml + "{% endif %}" - src_xml = re.sub( - r"](?:(?!]).)*?{%\s*hm\s*%}.*?]", - h_merge_tc, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_HMERGE.sub(h_merge_tc, src_xml) def clean_tags(m): return ( @@ -298,18 +335,17 @@ def clean_tags(m): .replace("’", "'") ) - src_xml = re.sub(r"(?<=\{[\{%])(.*?)(?=[\}%]})", clean_tags, src_xml) + src_xml = self._RE_CLEAN_TAGS.sub(clean_tags, src_xml) return src_xml def render_xml_part(self, src_xml, part, context, jinja_env=None): - src_xml = re.sub(r"])", r"\n])", r" None: if jinja_env is None: - jinja_env = Environment() + jinja_env = _get_cached_env() for section in self.docx.sections: for part in section.part.package.parts: @@ -403,30 +439,24 @@ def resolve_text(run_properties, paragraph_properties, m): return xml def resolve_run(paragraph_properties, m): - run_properties = re.search(r".*?", m.group(0)) + run_properties = self._RE_RUN_PROPS.search(m.group(0)) run_properties = run_properties.group(0) if run_properties else "" - return re.sub( - r"]*)?>.*?", + return self._RE_RESOLVE_TEXT.sub( lambda x: resolve_text(run_properties, paragraph_properties, x), m.group(0), - flags=re.DOTALL, ) def resolve_paragraph(m): - paragraph_properties = re.search(r".*?", m.group(0)) + paragraph_properties = self._RE_PARA_PROPS.search(m.group(0)) paragraph_properties = ( paragraph_properties.group(0) if paragraph_properties else "" ) - return re.sub( - r"]*)?>.*?", + return self._RE_RESOLVE_RUN.sub( lambda x: resolve_run(paragraph_properties, x), m.group(0), - flags=re.DOTALL, ) - xml = re.sub( - r"]*)?>.*?", resolve_paragraph, xml, flags=re.DOTALL - ) + xml = self._RE_RESOLVE_PARAGRAPH.sub(resolve_paragraph, xml) return xml @@ -437,9 +467,21 @@ def build_xml(self, context, jinja_env=None): return xml def map_tree(self, tree): - root = self.docx._element - body = root.body - root.replace(body, tree) + """Replace body content with rendered tree. + + Instead of replacing the entire element with replace() (which + triggers expensive reconciliation), we now mutate the body's children + directly. This is much cheaper for large trees. + """ + body = self.docx._element.body + + # Remove all existing children from body + for child in list(body): + body.remove(child) + + # Append all children from the new tree + for child in list(tree): + body.append(child) def get_headers_footers(self, uri): for relKey, val in self.docx._part.rels.items(): @@ -458,10 +500,17 @@ def get_headers_footers_encoding(self, xml): def build_headers_footers_xml(self, context, uri, jinja_env=None): for relKey, part in self.get_headers_footers(uri): xml = self.get_part_xml(part) - encoding = self.get_headers_footers_encoding(xml) - xml = self.patch_xml(xml) - xml = self.render_xml_part(xml, part, context, jinja_env) - yield relKey, xml.encode(encoding) + + # Skip rendering if no Jinja tags present + # Headers/footers are often static, so this avoids caching/parsing overhead + if self._RE_JINJA_OPEN.search(xml) or self._RE_JINJA_CONTENT.search(xml): + encoding = self.get_headers_footers_encoding(xml) + xml = self.patch_xml(xml) + xml = self.render_xml_part(xml, part, context, jinja_env) + yield relKey, xml.encode(encoding) + else: + encoding = self.get_headers_footers_encoding(xml) + yield relKey, xml.encode(encoding) def map_headers_footers_xml(self, relKey, xml): part = self.docx._part.rels[relKey].target_part @@ -479,11 +528,11 @@ def render( # init template working attributes self.render_init() - if autoescape: - if not jinja_env: - jinja_env = Environment(autoescape=autoescape) - else: - jinja_env.autoescape = autoescape + # Use cached environment by default + if not jinja_env: + jinja_env = _get_cached_env(autoescape=autoescape) + elif autoescape: + jinja_env.autoescape = autoescape # Body xml_src = self.build_xml(context, jinja_env) @@ -515,22 +564,52 @@ def render( self.is_rendered = True # using of TC tag in for cycle can cause that count of columns does not - # correspond to real count of columns in row. This function is able to fix it. + # correspond to real count of columns in row. def fix_tables(self, xml): - parser = etree.XMLParser(recover=True) - tree = etree.fromstring(xml, parser=parser) + # Use parse_xml with safe fallback for malformed XML + try: + tree = parse_xml(xml) # parse_xml() is significantly faster + except Exception: + # Fallback to permissive parser in the event of malformed XML + parser = etree.XMLParser(recover=True) + tree = etree.fromstring(xml, parser=parser) # get namespace ns = "{" + tree.nsmap["w"] + "}" # walk trough xml and find table for t in tree.iter(ns + "tbl"): tblGrid = t.find(ns + "tblGrid") + if tblGrid is None: + continue + columns = tblGrid.findall(ns + "gridCol") - to_add = 0 - # walk trough all rows and try to find if there is higher cell count + columns_len = len(columns) + + # Single pass row analysis with both counters + max_raw_cells = 0 # For ADD decision (raw tc count) + max_effective_cells = 0 # For REMOVE decision (with gridSpan) + for r in t.iter(ns + "tr"): cells = r.findall(ns + "tc") - if (len(columns) + to_add) < len(cells): - to_add = len(cells) - len(columns) + raw_count = len(cells) + effective_count = 0 + + for cell in cells: + tc_pr = cell.find(ns + "tcPr") + if tc_pr is not None: + grid_span = tc_pr.find(ns + "gridSpan") + if grid_span is not None: + effective_count += int(grid_span.get(ns + "val")) + continue + effective_count += 1 + + if raw_count > max_raw_cells: + max_raw_cells = raw_count + if effective_count > max_effective_cells: + max_effective_cells = effective_count + + # ADD columns based on RAW cell count (original behavior) + to_add = max_raw_cells - columns_len if max_raw_cells > columns_len else 0 + # is necessary to add columns? if to_add > 0: # at first, calculate width of table according to columns @@ -552,34 +631,16 @@ def fix_tables(self, xml): int(float(c.get(ns + "w")) * new_average / old_average) ), ) - # add new columns + # add new columns using OxmlElement for proper python-docx compatibility for i in range(to_add): - etree.SubElement( - tblGrid, ns + "gridCol", {ns + "w": str(int(new_average))} - ) + new_col = OxmlElement('w:gridCol') + new_col.set(qn('w:w'), str(int(new_average))) + tblGrid.append(new_col) - # Refetch columns after columns addition. + # REMOVE columns based on EFFECTIVE cell count (original behavior) columns = tblGrid.findall(ns + "gridCol") columns_len = len(columns) - - cells_len_max = 0 - - def get_cell_len(total, cell): - tc_pr = cell.find(ns + "tcPr") - grid_span = None if tc_pr is None else tc_pr.find(ns + "gridSpan") - - if grid_span is not None: - return total + int(grid_span.get(ns + "val")) - - return total + 1 - - # Calculate max of table cells to compare with `gridCol`. - for r in t.iter(ns + "tr"): - cells = r.findall(ns + "tc") - cells_len = functools.reduce(get_cell_len, cells, 0) - cells_len_max = max(cells_len_max, cells_len) - - to_remove = columns_len - cells_len_max + to_remove = columns_len - max_effective_cells if columns_len > max_effective_cells else 0 # If after the loop, there're less columns, than # originally was, remove extra `gridCol` declarations. @@ -608,8 +669,11 @@ def get_cell_len(total, cell): return tree def fix_docpr_ids(self, tree): - # some Ids may have some collisions : so renumbering all of them : - for elt in tree.xpath("//wp:docPr", namespaces=docx.oxml.ns.nsmap): + # Some Ids may have some collisions : so renumbering all of them + wp_ns = docx.oxml.ns.nsmap['wp'] + tag = "{%s}docPr" % wp_ns + + for elt in tree.iter(tag): self.docx_ids_index += 1 elt.attrib["id"] = str(self.docx_ids_index) @@ -913,7 +977,7 @@ def get_undeclared_template_variables( if jinja_env: env = jinja_env else: - env = Environment() + env = _get_cached_env() parse_content = env.parse(xml) all_variables = meta.find_undeclared_variables(parse_content) diff --git a/pyproject.toml b/pyproject.toml index 476fc0e..28533c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,10 @@ repository = "https://github.com/elapouya/python-docx-template.git" document = "https://docxtpl.readthedocs.org" [tool.poetry] +name = "docxtpl" version = "0.0.0" +description = "Python docx template engine" +authors = ["Eric Lapouyade "] [tool.poetry.requires-plugins] poetry-dynamic-versioning = { version = ">=1.0.0,<2.0.0", extras = ["plugin"] }