Skip to content

Commit

Permalink
Split handling of HTML attributes & style CSS properties
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucas-C committed Jun 24, 2024
1 parent d574b07 commit c38e2e0
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 44 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
* support for quadratic and cubic Bézier curves with [`FPDF.bezier()`](https://py-pdf.github.io/fpdf2/fpdf/Shapes.html#fpdf.fpdf.FPDF.bezier) - thanks to @awmc000
* feature to identify the Unicode script of the input text and break it into fragments when different scripts are used, improving [text shaping](https://py-pdf.github.io/fpdf2/TextShaping.html) results
* [`FPDF.image()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.image): now handles `keep_aspect_ratio` in combination with an enum value provided to `x`
* file names are mentioned in errors when `fpdf2` fails to parse an SVG image
* [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): now supports CSS page break properties: [documentation](https://py-pdf.github.io/fpdf2/HTML.html#page-breaks)
* [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): spacing before lists can now be adjusted via the `HTML2FPDF.list_vertical_margin` attribute - thanks to @lcgeneralprojects
* file names are mentioned in errors when `fpdf2` fails to parse an SVG image
### Fixed
* [`FPDF.local_context()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.local_context) used to leak styling during page breaks, when rendering `footer()` & `header()`
* [`fpdf.drawing.DeviceCMYK`](https://py-pdf.github.io/fpdf2/fpdf/drawing.html#fpdf.drawing.DeviceCMYK) objects can now be passed to [`FPDF.set_draw_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_draw_color), [`FPDF.set_fill_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_fill_color) and [`FPDF.set_text_color()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.set_text_color) without raising a `ValueError`: [documentation](https://py-pdf.github.io/fpdf2/Text.html#text-formatting).
Expand All @@ -36,7 +36,7 @@ This can also be enabled programmatically with `warnings.simplefilter('default',
* [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): fixed incoherent indentation of long `<ul>` list entries - _cf._ [issue #1073](https://github.com/py-pdf/fpdf2/issues/1073) - thanks to @lcgeneralprojects
* default values for `top_margin` and `bottom_margin` in `HTML2FPDF._new_paragraph()` calls are now correctly converted into the chosen document units.
### Removed
* an obscure and undocumented [feature](https://github.com/py-pdf/fpdf2/issues/1198) of [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html), which used to magically pass local variables as arguments.
* an obscure and undocumented [feature](https://github.com/py-pdf/fpdf2/issues/1198) of [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html), which used to magically pass class & instance properties as arguments.
### Changed
* [`FPDF.table()`](https://py-pdf.github.io/fpdf2/Tables.html) now raises an error when a single row is too high to be rendered on a single page
* [`FPDF.write_html()`](https://py-pdf.github.io/fpdf2/fpdf/fpdf.html#fpdf.fpdf.FPDF.write_html): `tag_indents` can now be non-integer. Indentation of HTML elements is now independent of font size and bullet strings.
Expand Down
93 changes: 56 additions & 37 deletions fpdf/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,22 +242,17 @@ def color_as_decimal(color="#000000"):
return color_from_hex_string(hexcolor).colors255


def parse_style(elem_attrs):
    """Copy the CSS declarations of the `style` entry into `elem_attrs` itself.

    Every `name: value` declaration found in `elem_attrs["style"]` is stored
    as `elem_attrs[name] = value`, with surrounding whitespace stripped,
    overwriting any existing entry of the same name.
    Nothing happens when there is no `style` entry.

    Args:
        elem_attrs (dict): element attributes, modified in place
    """
    try:
        style = elem_attrs["style"]
    except KeyError:
        return
    for declaration in style.split(";"):
        # Split on the first colon only, so that values containing colons,
        # like `url(http://...)`, are kept whole instead of being dropped:
        name, separator, value = declaration.partition(":")
        name, value = name.strip(), value.strip()
        # Ignore declarations lacking a colon, a name or a value:
        if separator and name and value:
            elem_attrs[name] = value
def parse_css_style(style_attr):
    """Parse the content of a `style="..."` HTML attribute into a dict.

    Args:
        style_attr (str): raw attribute value, e.g. `"color: red; width: 50%"`

    Returns:
        dict: mapping of CSS property names to their values, both stripped
            of surrounding whitespace. Declarations lacking a colon,
            a name or a value are ignored.
    """
    style = {}
    for declaration in style_attr.split(";"):
        # Split on the first colon only, so that values containing colons,
        # like `url(http://...)`, are kept whole instead of being dropped:
        name, separator, value = declaration.partition(":")
        name, value = name.strip(), value.strip()
        if separator and name and value:
            style[name] = value
    return style


class HTML2FPDF(HTMLParser):
Expand All @@ -281,7 +276,8 @@ def __init__(
tag_indents=None,
tag_styles=None,
list_vertical_margin=None,
**_,
heading_above=0.2,
heading_below=0.4,
):
"""
Args:
Expand All @@ -302,6 +298,8 @@ def __init__(
tag_styles (dict): mapping of HTML tag names to colors
list_vertical_margin (float): size of margins that precede lists.
The margin value is in the chosen pdf document units.
heading_above (float): extra space above heading, relative to font size
heading_below (float): extra space below heading, relative to font size
"""
super().__init__()
self.pdf = pdf
Expand Down Expand Up @@ -347,8 +345,8 @@ def __init__(
self.list_vertical_margin = list_vertical_margin
self.font_color = pdf.text_color.colors255
self.heading_level = None
self.heading_above = 0.2 # extra space above heading, relative to font size
self.heading_below = 0.4 # extra space below heading, relative to font size
self.heading_above = heading_above
self.heading_below = heading_below
self._tags_stack = []
self._column = self.pdf.text_columns(skip_leading_spaces=True)
self._paragraph = self._column.paragraph()
Expand Down Expand Up @@ -511,13 +509,17 @@ def handle_data(self, data):
emphasis |= TextEmphasis.I
if self.td_th.get("U"):
emphasis |= TextEmphasis.U
style = None
font_style = None
if bgcolor or emphasis:
style = FontFace(
font_style = FontFace(
emphasis=emphasis, fill_color=bgcolor, color=self.pdf.text_color
)
self.table_row.cell(
text=data, align=align, style=style, colspan=colspan, rowspan=rowspan
text=data,
align=align,
style=font_style,
colspan=colspan,
rowspan=rowspan,
)
self.td_th["inserted"] = True
elif self.table is not None:
Expand Down Expand Up @@ -561,9 +563,9 @@ def handle_starttag(self, tag, attrs):
self._pre_started = False
attrs = dict(attrs)
LOGGER.debug("STARTTAG %s %s", tag, attrs)
parse_style(attrs)
css_style = parse_css_style(attrs.get("style", ""))
self._tags_stack.append(tag)
if attrs.get("break-before") == "page":
if css_style.get("break-before") == "page":
self._end_paragraph()
# pylint: disable=protected-access
self.pdf._perform_page_break()
Expand Down Expand Up @@ -606,13 +608,16 @@ def handle_starttag(self, tag, attrs):
align = attrs.get("align")[0].upper()
if not align in ["L", "R", "J", "C"]:
align = None
line_height = None
if "line-height" in attrs:
line_height = css_style.get("line-height", attrs.get("line-height"))
# "line-height" attributes are not valid in HTML,
# but we support it for backward compatibility,
# because fpdf2 honors it since 2.6.1 and PR #629
if line_height:
try:
# YYY parse and convert non-float line_height values
line_height = float(attrs.get("line-height"))
line_height = float(line_height)
except ValueError:
pass
line_height = None
self._new_paragraph(align=align, line_height=line_height)
if tag in HEADING_TAGS:
prev_font_height = self.font_size / self.pdf.k
Expand All @@ -638,7 +643,11 @@ def handle_starttag(self, tag, attrs):
bottom_margin=self.heading_below * hsize,
)
color = None
if "color" in attrs:
if "color" in css_style:
color = color_as_decimal(css_style["color"])
elif "color" in attrs:
# "color" attributes are not valid in HTML,
# but we support it for backward compatibility:
color = color_as_decimal(attrs["color"])
elif tag_style.color:
color = tag_style.color.colors255
Expand All @@ -650,7 +659,7 @@ def handle_starttag(self, tag, attrs):
)
if tag == "hr":
self._end_paragraph()
width = attrs.get("width")
width = css_style.get("width", attrs.get("width"))
if width:
if width[-1] == "%":
width = self.pdf.epw * int(width[:-1]) / 100
Expand Down Expand Up @@ -723,10 +732,14 @@ def handle_starttag(self, tag, attrs):
ul_prefix(attrs["type"]) if "type" in attrs else self.ul_bullet_char
)
self.bullet.append(bullet_char)
if "line-height" in attrs:
line_height = css_style.get("line-height", attrs.get("line-height"))
# "line-height" attributes are not valid in HTML,
# but we support it for backward compatibility,
# because fpdf2 honors it since 2.6.1 and PR #629
if line_height:
try:
# YYY parse and convert non-float line_height values
self.line_height_stack.append(float(attrs.get("line-height")))
self.line_height_stack.append(float(line_height))
except ValueError:
pass
else:
Expand All @@ -740,10 +753,14 @@ def handle_starttag(self, tag, attrs):
start = int(attrs["start"]) if "start" in attrs else 1
self.bullet.append(start - 1)
self.ol_type.append(attrs.get("type", "1"))
if "line-height" in attrs:
line_height = css_style.get("line-height", attrs.get("line-height"))
# "line-height" attributes are not valid in HTML,
# but we support it for backward compatibility,
# because fpdf2 honors it since 2.6.1 and PR #629
if line_height:
try:
# YYY parse and convert non-float line_height values
self.line_height_stack.append(float(attrs.get("line-height")))
self.line_height_stack.append(float(line_height))
except ValueError:
pass
else:
Expand Down Expand Up @@ -792,12 +809,14 @@ def handle_starttag(self, tag, attrs):
# This may result in a FPDFException "font not found".
self.set_font(face)
self.font_family = face
if "size" in attrs:
if "font-size" in css_style:
self.font_size = int(css_style.get("font-size"))
elif "size" in attrs:
self.font_size = int(attrs.get("size"))
self.set_font()
self.set_text_color(*self.font_color)
if tag == "table":
width = attrs.get("width")
width = css_style.get("width", attrs.get("width"))
if width:
if width[-1] == "%":
width = self.pdf.epw * int(width[:-1]) / 100
Expand Down Expand Up @@ -908,7 +927,7 @@ def handle_starttag(self, tag, attrs):
self.pdf.char_vpos = "SUP"
if tag == "sub":
self.pdf.char_vpos = "SUB"
if attrs.get("break-after") == "page":
if css_style.get("break-after") == "page":
if tag in ("br", "hr", "img"):
self._end_paragraph()
# pylint: disable=protected-access
Expand Down
6 changes: 3 additions & 3 deletions fpdf/svg.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,17 +303,17 @@ def optional(value, converter=lambda noop: noop):
@force_nodocument
def apply_styles(stylable, svg_element):
"""Apply the known styles from `svg_element` to the pdf path/group `stylable`."""
html.parse_style(svg_element.attrib)
style = html.parse_css_style(svg_element.attrib.get("style", ""))

stylable.style.auto_close = False

for attr_name, converter in svg_attr_map.items():
value = svg_element.attrib.get(attr_name)
value = style.get(attr_name, svg_element.attrib.get(attr_name))
if value:
setattr(stylable.style, *converter(value))

# handle this separately for now
opacity = svg_element.attrib.get("opacity")
opacity = style.get("opacity", svg_element.attrib.get("opacity"))
if opacity:
opacity = float(opacity)
stylable.style.fill_opacity = opacity
Expand Down
Binary file added test/html/html_heading_above_below.pdf
Binary file not shown.
22 changes: 20 additions & 2 deletions test/html/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,15 +735,15 @@ def test_html_long_ol_bullets(tmp_path):
<li>Item 3</li>
</ol>
"""
pdf.write_html(html_arabic_indian)
html_roman = f"""
<ol start="{10**5}" type="i">
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ol>
"""
pdf.write_html(html_arabic_indian)
pdf.write_html(html_roman, type="i")
pdf.write_html(html_roman)
pdf.write_html(html_arabic_indian, tag_indents={"li": 50})
pdf.write_html(html_roman, tag_indents={"li": 100})
assert_pdf_equal(pdf, HERE / "html_long_ol_bullets.pdf", tmp_path)
Expand Down Expand Up @@ -871,3 +871,21 @@ def test_html_page_break_after(tmp_path):
Content on third page."""
)
assert_pdf_equal(pdf, HERE / "html_page_break_after.pdf", tmp_path)


def test_html_heading_above_below(tmp_path):
    """Render headings with custom `heading_above` / `heading_below` values
    passed to `write_html()` and compare the result with the reference PDF."""
    markup = """
<h1>Top heading</h1>
<p>Lorem ipsum</p>
<h2>First heading</h2>
<p>Lorem ipsum</p>
<h2>Second heading</h2>
<p>Lorem ipsum</p>
"""
    doc = FPDF()
    doc.add_page()
    doc.write_html(markup, heading_above=1, heading_below=0.5)
    reference = HERE / "html_heading_above_below.pdf"
    assert_pdf_equal(doc, reference, tmp_path)

0 comments on commit c38e2e0

Please sign in to comment.