Adding test_table_extraction.py & related documentation

py-pdf · Mar 27, 2023 · ad28b22 · ad28b22
1 parent cbfb6e9
commit ad28b22
Show file tree

Hide file tree

Showing 3 changed files with 139 additions and 0 deletions.
diff --git a/docs/Tables.md b/docs/Tables.md
@@ -247,3 +247,15 @@ pdf.output('table_html.pdf')
 ```
 
 Note that `write_html` has [some limitations, notably regarding multi-lines cells](HTML.html#supported-html-features).
+
+## "Parsability" of the tables generated
+
+The PDF file format is not designed to embed structured tables.
+Hence, it can be tricky to extract table data from PDF documents.
+
+In our test suite, we ensure that several Python libraries for parsing PDF tables can successfully extract tables from documents generated with `fpdf2`.
+Namely, we test [camelot-py](https://camelot-py.readthedocs.io) & [tabula-py](https://tabula-py.readthedocs.io): [test/table/test_table_extraction.py](https://github.com/PyFPDF/fpdf2/blob/master/test/table/test_table_extraction.py).
+
+Based on those tests, if you want to ease table extraction from the documents you produce, we recommend the following guidelines:
+* avoid splitting tables across several pages
+* avoid the `INTERNAL` / `MINIMAL` / `SINGLE_TOP_LINE` borders layouts
diff --git a/test/requirements.txt b/test/requirements.txt
@@ -1,9 +1,11 @@
 bandit
 black
+camelot-py[base]
 endesive
 pylint
 pytest
 pytest-cov
 pytest-timeout
 qrcode
 semgrep
+tabula-py
diff --git a/test/table/test_table_extraction.py b/test/table/test_table_extraction.py
@@ -0,0 +1,125 @@
+"""
+Tests that ensure tables generated by fpdf2
+can be extracted by well-known PDF table extraction tools
+"""
+from pathlib import Path
+
+import camelot
+from pandas import DataFrame
+from pandas.util.testing import assert_frame_equal
+import pytest
+import tabula
+
+# pylint: disable=import-error,no-name-in-module
+from test.table.test_table import TABLE_DATA
+
+HERE = Path(__file__).resolve().parent  # directory of this module; PDF paths are built from it
+TABLE_DATA_AS_DF = DataFrame(TABLE_DATA)  # raw frame: the header row is still a data row here
+TABLE_DATA_AS_DF = DataFrame(
+    TABLE_DATA_AS_DF.values[1:], columns=TABLE_DATA_AS_DF.iloc[0]  # row 0 becomes the labels
+).astype({"Age": int})  # NOTE(review): presumably matches the dtype tabula infers - confirm
+
+
+###############################################################################
+################################### camelot ###################################
+###############################################################################
+
+
+@pytest.mark.parametrize("flavor", ("lattice", "stream"))  # stacked: each file runs with both
+@pytest.mark.parametrize(
+    "filename",
+    (
+        "table_simple.pdf",
+        "table_with_images.pdf",
+        "table_with_images_and_img_fill_width.pdf",
+        "table_with_headings_styled.pdf",
+        "table_with_internal_layout.pdf",
+    ),
+)
+def test_camelot_extract_simple_table(flavor, filename):
+    _test_camelot_parse(HERE / filename, flavor, 4, 5)  # expects 1 table, 4 columns x 5 rows
+
+
+@pytest.mark.parametrize(
+    "filename",
+    (
+        "table_with_minimal_layout.pdf",
+        "table_with_single_top_line_layout.pdf",
+    ),
+)
+def test_camelot_extract_table_ok_with_only_stream_flavor(filename):
+    _test_camelot_parse(HERE / filename, "stream", 4, 5)  # "stream" only; 1 table, 4 cols x 5 rows
+
+
+@pytest.mark.parametrize(
+    "filename",
+    (
+        "table_align.pdf",
+        # "table_with_cell_fill.pdf",  # NOTE(review): disabled, reason not recorded; tabula test covers it
+    ),
+)
+def test_camelot_extract_two_tables(filename):
+    _test_camelot_parse(HERE / filename, "lattice", 4, 5, table_count=2)  # both tables: 4 x 5
+
+
+@pytest.mark.xfail(  # a failing run is reported as xfail rather than as a failure
+    reason="camelot does not successfully parse tables splitted on several pages"
+)
+@pytest.mark.parametrize("flavor", ("lattice", "stream"))
+def test_camelot_extract_two_pages_table(flavor):
+    _test_camelot_parse(HERE / "table_with_multiline_cells.pdf", flavor, 2, 5)  # 2 cols, 5 rows
+
+
+def _test_camelot_parse(pdf_path, flavor, col_count, row_count, table_count=1):  # shared helper
+    tables = camelot.read_pdf(str(pdf_path), flavor=flavor)  # path is converted to str first
+    assert tables.n == table_count  # compared with the expected number of tables
+    for table in tables:  # only dimensions are checked, never cell contents
+        assert len(table.cols) == col_count
+        assert len(table.rows) == row_count
+
+
+###############################################################################
+################################### tabula ####################################
+###############################################################################
+
+
+@pytest.mark.parametrize(
+    "filename",
+    (
+        "table_simple.pdf",
+        "table_with_headings_styled.pdf",
+        # "table_with_internal_layout.pdf",  # tabula only parses the internal cells
+        "table_with_minimal_layout.pdf",
+        "table_with_single_top_line_layout.pdf",
+    ),
+)
+def test_tabula_extract_simple_table(filename):
+    dataframes = tabula.read_pdf(HERE / filename, pages="all")  # sized and iterated below
+    assert len(dataframes) == 1  # exactly one table expected in the document
+    for df in dataframes:
+        assert_frame_equal(df, TABLE_DATA_AS_DF, check_names=False)  # contents must match
+
+
+@pytest.mark.parametrize(
+    "filename",
+    (
+        "table_align.pdf",
+        "table_with_cell_fill.pdf",
+    ),
+)
+def test_tabula_extract_two_tables(filename):
+    dataframes = tabula.read_pdf(HERE / filename, pages="all")  # sized and iterated below
+    assert len(dataframes) == 2  # two tables expected in the document
+    for df in dataframes:  # each extracted table must equal the same reference frame
+        assert_frame_equal(df, TABLE_DATA_AS_DF, check_names=False)
+
+
+@pytest.mark.xfail(  # a failing run is reported as xfail rather than as a failure
+    reason="tabula does not successfully parse tables splitted on several pages"
+)
+def test_tabula_extract_two_pages_table():
+    dataframes = tabula.read_pdf(HERE / "table_with_multiline_cells.pdf", pages="all")
+    assert len(dataframes) == 2  # two frames expected for this document
+    for df in dataframes:
+        _rows_count, cols_count = df.shape  # row count is unpacked but deliberately unused
+        assert cols_count == 2