Split handling of HTML attributes & style CSS properties

py-pdf · Jun 24, 2024 · c38e2e0 · c38e2e0
1 parent d574b07
commit c38e2e0
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 44 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -23,9 +23,9 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 * support for quadratic and cubic Bézier curves with [`FPDF.bezier()`](https://py-pdf.github.io/fpdf2/fpdf/Shapes.html#fpdf.fpdf.FPDF.bezier) - thanks to @awmc000
 * feature to identify the Unicode script of the input text and break it into fragments when different scripts are used, improving [text shaping](https://py-pdf.github.io/fpdf2/TextShaping.html) results
 * [`FPDF.image()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.image): now handles `keep_aspect_ratio` in combination with an enum value provided to `x`
-* file names are mentioned in errors when `fpdf2` fails to parse a SVG image
 * [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): now supports CSS page breaks properties : [documentation](https://py-pdf.github.io/fpdf2/HTML.html#page-breaks)
 * [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): spacing before lists can now be adjusted via the `HTML2FPDF.list_vertical_margin` attribute - thanks to @lcgeneralprojects
+* file names are mentioned in errors when `fpdf2` fails to parse a SVG image
 ### Fixed
 * [`FPDF.local_context()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.local_context) used to leak styling during page breaks, when rendering `footer()` & `header()`
 * [`fpdf.drawing.DeviceCMYK`](https://py-pdf.github.io/fpdf2/fpdf/drawing.html#fpdf.drawing.DeviceCMYK) objects can now be passed to [`FPDF.set_draw_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_draw_color), [`FPDF.set_fill_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_fill_color) and [`FPDF.set_text_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_text_color) without raising a `ValueError`: [documentation](https://py-pdf.github.io/fpdf2/Text.html#text-formatting).
@@ -36,7 +36,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
 * [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): fixed incoherent indentation of long `<ul>` list entries - _cf._ [issue #1073](https://github.com/py-pdf/fpdf2/issues/1073) - thanks to @lcgeneralprojects
 * default values for `top_margin` and `bottom_margin` in `HTML2FPDF._new_paragraph()` calls are now correctly converted into chosen document units.
 ### Removed
-* an obscure and undocumented [feature](https://github.com/py-pdf/fpdf2/issues/1198) of [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html), which used to magically pass local variables as arguments.
+* an obscure and undocumented [feature](https://github.com/py-pdf/fpdf2/issues/1198) of [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html), which used to magically pass class & instance properties as arguments.
 ### Changed
 * [`FPDF.table()`](https://py-pdf.github.io/fpdf2/Tables.html) now raises an error when a single row is too high to be rendered on a single page
 * [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): `tag_indents` can now be non-integer. Indentation of HTML elements is now independent of font size and bullet strings.

diff --git a/fpdf/html.py b/fpdf/html.py
@@ -242,22 +242,17 @@ def color_as_decimal(color="#000000"):
     return color_from_hex_string(hexcolor).colors255
 
 
-def parse_style(elem_attrs):
-    """Parse `style="..."` making it's key-value pairs element's attributes"""
-    try:
-        style = elem_attrs["style"]
-    except KeyError:
-        pass
-    else:
-        for element in style.split(";"):
-            if not element:
-                continue
-
-            pair = element.split(":")
-            if len(pair) == 2 and pair[0] and pair[1]:
-                attr, value = pair
-
-                elem_attrs[attr.strip()] = value.strip()
+def parse_css_style(style_attr):
+    """Parse the value of a `style="..."` HTML attribute, and return a dict of its CSS property-value pairs"""
+    style = {}
+    for element in style_attr.split(";"):
+        if not element:
+            continue
+        pair = element.split(":")
+        if len(pair) == 2 and pair[0] and pair[1]:
+            attr, value = pair
+            style[attr.strip()] = value.strip()
+    return style
 
 
 class HTML2FPDF(HTMLParser):
@@ -281,7 +276,8 @@ def __init__(
         tag_indents=None,
         tag_styles=None,
         list_vertical_margin=None,
-        **_,
+        heading_above=0.2,
+        heading_below=0.4,
     ):
         """
         Args:
@@ -302,6 +298,8 @@ def __init__(
             tag_styles (dict): mapping of HTML tag names to colors
             list_vertical_margin (float): size of margins that precede lists.
                 The margin value is in the chosen pdf document units.
+            heading_above (float): extra space above heading, relative to font size
+            heading_below (float): extra space below heading, relative to font size
         """
         super().__init__()
         self.pdf = pdf
@@ -347,8 +345,8 @@ def __init__(
         self.list_vertical_margin = list_vertical_margin
         self.font_color = pdf.text_color.colors255
         self.heading_level = None
-        self.heading_above = 0.2  # extra space above heading, relative to font size
-        self.heading_below = 0.4  # extra space below heading, relative to font size
+        self.heading_above = heading_above
+        self.heading_below = heading_below
         self._tags_stack = []
         self._column = self.pdf.text_columns(skip_leading_spaces=True)
         self._paragraph = self._column.paragraph()
@@ -511,13 +509,17 @@ def handle_data(self, data):
                 emphasis |= TextEmphasis.I
             if self.td_th.get("U"):
                 emphasis |= TextEmphasis.U
-            style = None
+            font_style = None
             if bgcolor or emphasis:
-                style = FontFace(
+                font_style = FontFace(
                     emphasis=emphasis, fill_color=bgcolor, color=self.pdf.text_color
                 )
             self.table_row.cell(
-                text=data, align=align, style=style, colspan=colspan, rowspan=rowspan
+                text=data,
+                align=align,
+                style=font_style,
+                colspan=colspan,
+                rowspan=rowspan,
             )
             self.td_th["inserted"] = True
         elif self.table is not None:
@@ -561,9 +563,9 @@ def handle_starttag(self, tag, attrs):
         self._pre_started = False
         attrs = dict(attrs)
         LOGGER.debug("STARTTAG %s %s", tag, attrs)
-        parse_style(attrs)
+        css_style = parse_css_style(attrs.get("style", ""))
         self._tags_stack.append(tag)
-        if attrs.get("break-before") == "page":
+        if css_style.get("break-before") == "page":
             self._end_paragraph()
             # pylint: disable=protected-access
             self.pdf._perform_page_break()
@@ -606,13 +608,16 @@ def handle_starttag(self, tag, attrs):
                 align = attrs.get("align")[0].upper()
                 if not align in ["L", "R", "J", "C"]:
                     align = None
-            line_height = None
-            if "line-height" in attrs:
+            line_height = css_style.get("line-height", attrs.get("line-height"))
+            # "line-height" attributes are not valid in HTML,
+            # but we support them for backward compatibility,
+            # because fpdf2 has honored them since 2.6.1 (PR #629)
+            if line_height:
                 try:
                     # YYY parse and convert non-float line_height values
-                    line_height = float(attrs.get("line-height"))
+                    line_height = float(line_height)
                 except ValueError:
-                    pass
+                    line_height = None
             self._new_paragraph(align=align, line_height=line_height)
         if tag in HEADING_TAGS:
             prev_font_height = self.font_size / self.pdf.k
@@ -638,7 +643,11 @@ def handle_starttag(self, tag, attrs):
                 bottom_margin=self.heading_below * hsize,
             )
             color = None
-            if "color" in attrs:
+            if "color" in css_style:
+                color = color_as_decimal(css_style["color"])
+            elif "color" in attrs:
+                # "color" attributes are not valid in HTML,
+                # but we support them for backward compatibility:
                 color = color_as_decimal(attrs["color"])
             elif tag_style.color:
                 color = tag_style.color.colors255
@@ -650,7 +659,7 @@ def handle_starttag(self, tag, attrs):
             )
         if tag == "hr":
             self._end_paragraph()
-            width = attrs.get("width")
+            width = css_style.get("width", attrs.get("width"))
             if width:
                 if width[-1] == "%":
                     width = self.pdf.epw * int(width[:-1]) / 100
@@ -723,10 +732,14 @@ def handle_starttag(self, tag, attrs):
                 ul_prefix(attrs["type"]) if "type" in attrs else self.ul_bullet_char
             )
             self.bullet.append(bullet_char)
-            if "line-height" in attrs:
+            line_height = css_style.get("line-height", attrs.get("line-height"))
+            # "line-height" attributes are not valid in HTML,
+            # but we support them for backward compatibility,
+            # because fpdf2 has honored them since 2.6.1 (PR #629)
+            if line_height:
                 try:
                     # YYY parse and convert non-float line_height values
-                    self.line_height_stack.append(float(attrs.get("line-height")))
+                    self.line_height_stack.append(float(line_height))
                 except ValueError:
                     pass
             else:
@@ -740,10 +753,14 @@ def handle_starttag(self, tag, attrs):
             start = int(attrs["start"]) if "start" in attrs else 1
             self.bullet.append(start - 1)
             self.ol_type.append(attrs.get("type", "1"))
-            if "line-height" in attrs:
+            line_height = css_style.get("line-height", attrs.get("line-height"))
+            # "line-height" attributes are not valid in HTML,
+            # but we support them for backward compatibility,
+            # because fpdf2 has honored them since 2.6.1 (PR #629)
+            if line_height:
                 try:
                     # YYY parse and convert non-float line_height values
-                    self.line_height_stack.append(float(attrs.get("line-height")))
+                    self.line_height_stack.append(float(line_height))
                 except ValueError:
                     pass
             else:
@@ -792,12 +809,14 @@ def handle_starttag(self, tag, attrs):
                 # This may result in a FPDFException "font not found".
                 self.set_font(face)
                 self.font_family = face
-            if "size" in attrs:
+            if "font-size" in css_style:
+                self.font_size = int(css_style.get("font-size"))
+            elif "size" in attrs:
                 self.font_size = int(attrs.get("size"))
             self.set_font()
             self.set_text_color(*self.font_color)
         if tag == "table":
-            width = attrs.get("width")
+            width = css_style.get("width", attrs.get("width"))
             if width:
                 if width[-1] == "%":
                     width = self.pdf.epw * int(width[:-1]) / 100
@@ -908,7 +927,7 @@ def handle_starttag(self, tag, attrs):
             self.pdf.char_vpos = "SUP"
         if tag == "sub":
             self.pdf.char_vpos = "SUB"
-        if attrs.get("break-after") == "page":
+        if css_style.get("break-after") == "page":
             if tag in ("br", "hr", "img"):
                 self._end_paragraph()
                 # pylint: disable=protected-access

diff --git a/fpdf/svg.py b/fpdf/svg.py
@@ -303,17 +303,17 @@ def optional(value, converter=lambda noop: noop):
 @force_nodocument
 def apply_styles(stylable, svg_element):
     """Apply the known styles from `svg_element` to the pdf path/group `stylable`."""
-    html.parse_style(svg_element.attrib)
+    style = html.parse_css_style(svg_element.attrib.get("style", ""))
 
     stylable.style.auto_close = False
 
     for attr_name, converter in svg_attr_map.items():
-        value = svg_element.attrib.get(attr_name)
+        value = style.get(attr_name, svg_element.attrib.get(attr_name))
         if value:
             setattr(stylable.style, *converter(value))
 
     # handle this separately for now
-    opacity = svg_element.attrib.get("opacity")
+    opacity = style.get("opacity", svg_element.attrib.get("opacity"))
     if opacity:
         opacity = float(opacity)
         stylable.style.fill_opacity = opacity

diff --git a/test/html/html_heading_above_below.pdf b/test/html/html_heading_above_below.pdf
diff --git a/test/html/test_html.py b/test/html/test_html.py
@@ -735,15 +735,15 @@ def test_html_long_ol_bullets(tmp_path):
               <li>Item 3</li>
             </ol>
         """
+    pdf.write_html(html_arabic_indian)
     html_roman = f"""
             <ol start="{10**5}" type="i">
               <li>Item 1</li>
               <li>Item 2</li>
               <li>Item 3</li>
             </ol>
         """
-    pdf.write_html(html_arabic_indian)
-    pdf.write_html(html_roman, type="i")
+    pdf.write_html(html_roman)
     pdf.write_html(html_arabic_indian, tag_indents={"li": 50})
     pdf.write_html(html_roman, tag_indents={"li": 100})
     assert_pdf_equal(pdf, HERE / "html_long_ol_bullets.pdf", tmp_path)
@@ -871,3 +871,21 @@ def test_html_page_break_after(tmp_path):
         Content on third page."""
     )
     assert_pdf_equal(pdf, HERE / "html_page_break_after.pdf", tmp_path)
+
+
+def test_html_heading_above_below(tmp_path):
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.write_html(
+        """
+        <h1>Top heading</h1>
+        <p>Lorem ipsum</p>
+        <h2>First heading</h2>
+        <p>Lorem ipsum</p>
+        <h2>Second heading</h2>
+        <p>Lorem ipsum</p>
+        """,
+        heading_above=1,
+        heading_below=0.5,
+    )
+    assert_pdf_equal(pdf, HERE / "html_heading_above_below.pdf", tmp_path)