Skip to content

Commit

Permalink
Adding test_table_extraction.py & related documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
Lucas-C committed Mar 27, 2023
1 parent cbfb6e9 commit ad28b22
Show file tree
Hide file tree
Showing 3 changed files with 139 additions and 0 deletions.
12 changes: 12 additions & 0 deletions docs/Tables.md
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,15 @@ pdf.output('table_html.pdf')
```

Note that `write_html` has [some limitations, notably regarding multi-lines cells](HTML.html#supported-html-features).

## "Parsability" of the tables generated

The PDF file format is not designed to embed structured tables.
Hence, it can be tricky to extract table data from PDF documents.

In our test suite, we ensure that several Python libraries for parsing PDF tables can successfully extract tables from documents generated with `fpdf2`.
Namely, we test [camelot-py](https://camelot-py.readthedocs.io) & [tabula-py](https://tabula-py.readthedocs.io): [test/table/test_table_extraction.py](https://github.com/PyFPDF/fpdf2/blob/master/test/table/test_table_extraction.py).

Based on those tests, if you want to ease table extraction from the documents you produce, we recommend the following guidelines:
* avoid splitting tables on several pages
* avoid the `INTERNAL` / `MINIMAL` / `SINGLE_TOP_LINE` borders layouts
2 changes: 2 additions & 0 deletions test/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
bandit
black
camelot-py[base]
endesive
pylint
pytest
pytest-cov
pytest-timeout
qrcode
semgrep
tabula-py
125 changes: 125 additions & 0 deletions test/table/test_table_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""
Tests that ensure tables generated by fpdf2
can be extracted by well-known PDF tables extraction tools
"""
from pathlib import Path

import camelot
from pandas import DataFrame
from pandas.testing import assert_frame_equal
import pytest
import tabula

# pylint: disable=import-error,no-name-in-module
from test.table.test_table import TABLE_DATA

# Directory containing this test module; the PDF files parsed below are
# resolved relative to it.
HERE = Path(__file__).resolve().parent
# Reference frame the tabula extractions are compared against:
# the first row of TABLE_DATA supplies the column labels, the remaining rows
# supply the values, and the "Age" column is cast to int.
_RAW_TABLE_DF = DataFrame(TABLE_DATA)
TABLE_DATA_AS_DF = DataFrame(
    _RAW_TABLE_DF.values[1:], columns=_RAW_TABLE_DF.iloc[0]
).astype({"Age": int})


###############################################################################
################################### camelot ###################################
###############################################################################


@pytest.mark.parametrize("flavor", ["lattice", "stream"])
@pytest.mark.parametrize(
    "filename",
    [
        "table_simple.pdf",
        "table_with_images.pdf",
        "table_with_images_and_img_fill_width.pdf",
        "table_with_headings_styled.pdf",
        "table_with_internal_layout.pdf",
    ],
)
def test_camelot_extract_simple_table(flavor, filename):
    """Expect camelot to find one 4-column, 5-row table with either flavor."""
    pdf_path = HERE / filename
    _test_camelot_parse(pdf_path, flavor, col_count=4, row_count=5)


@pytest.mark.parametrize(
    "filename",
    [
        "table_with_minimal_layout.pdf",
        "table_with_single_top_line_layout.pdf",
    ],
)
def test_camelot_extract_table_ok_with_only_stream_flavor(filename):
    """Expect one 4-column, 5-row table; only the "stream" flavor is exercised."""
    pdf_path = HERE / filename
    _test_camelot_parse(pdf_path, "stream", col_count=4, row_count=5)


@pytest.mark.parametrize(
    "filename",
    [
        "table_align.pdf",
        # "table_with_cell_fill.pdf",
    ],
)
def test_camelot_extract_two_tables(filename):
    """Expect camelot's "lattice" flavor to find two 4-column, 5-row tables."""
    pdf_path = HERE / filename
    _test_camelot_parse(
        pdf_path, "lattice", col_count=4, row_count=5, table_count=2
    )


@pytest.mark.xfail(
    reason="camelot does not successfully parse tables splitted on several pages"
)
@pytest.mark.parametrize("flavor", ["lattice", "stream"])
def test_camelot_extract_two_pages_table(flavor):
    """Expected failure: one 2-column, 5-row table in the multi-line cells PDF."""
    pdf_path = HERE / "table_with_multiline_cells.pdf"
    _test_camelot_parse(pdf_path, flavor, col_count=2, row_count=5)


def _test_camelot_parse(pdf_path, flavor, col_count, row_count, table_count=1):
    """
    Parse a PDF with camelot and check the shape of every table it extracts.

    The file at `pdf_path` is read with the given camelot `flavor`.
    Exactly `table_count` tables must be found, and each of them must have
    `col_count` columns and `row_count` rows.
    """
    # The path is handed to camelot as a plain string.
    pdf_file = str(pdf_path)
    extracted = camelot.read_pdf(pdf_file, flavor=flavor)
    assert extracted.n == table_count
    for parsed_table in extracted:
        assert len(parsed_table.cols) == col_count
        assert len(parsed_table.rows) == row_count


###############################################################################
################################### tabula ####################################
###############################################################################


@pytest.mark.parametrize(
    "filename",
    [
        "table_simple.pdf",
        "table_with_headings_styled.pdf",
        # "table_with_internal_layout.pdf",  # tabula only parses the internal cells
        "table_with_minimal_layout.pdf",
        "table_with_single_top_line_layout.pdf",
    ],
)
def test_tabula_extract_simple_table(filename):
    """Expect tabula to extract a single table equal to the reference data."""
    pdf_path = HERE / filename
    extracted = tabula.read_pdf(pdf_path, pages="all")
    assert len(extracted) == 1
    for frame in extracted:
        # Index/column names are not compared, only labels and values.
        assert_frame_equal(frame, TABLE_DATA_AS_DF, check_names=False)


@pytest.mark.parametrize(
    "filename",
    [
        "table_align.pdf",
        "table_with_cell_fill.pdf",
    ],
)
def test_tabula_extract_two_tables(filename):
    """Expect tabula to extract two tables, each equal to the reference data."""
    pdf_path = HERE / filename
    extracted = tabula.read_pdf(pdf_path, pages="all")
    assert len(extracted) == 2
    for frame in extracted:
        # Index/column names are not compared, only labels and values.
        assert_frame_equal(frame, TABLE_DATA_AS_DF, check_names=False)


@pytest.mark.xfail(
    reason="tabula does not successfully parse tables splitted on several pages"
)
def test_tabula_extract_two_pages_table():
    """Expected failure: two extracted frames, each with exactly 2 columns."""
    pdf_path = HERE / "table_with_multiline_cells.pdf"
    extracted = tabula.read_pdf(pdf_path, pages="all")
    assert len(extracted) == 2
    for frame in extracted:
        # shape is (rows, columns); only the column count is checked.
        assert frame.shape[1] == 2

0 comments on commit ad28b22

Please sign in to comment.