Remove LocalDiskArrowTableCache and use latest pickle protocol for local caching (#777)

Removing the last class that was using the deprecated pyarrow serialization API (#612).
Bumping the pickle protocol up to the highest available version to improve the performance of local persistent caching.
selitvin authored Sep 14, 2022
1 parent 4e80f2b commit 170b22a
Showing 5 changed files with 5 additions and 59 deletions.
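The performance claim in the commit message rests on pickle protocol versioning: newer protocols serialize faster and more compactly, and pickle.HIGHEST_PROTOCOL always selects the newest protocol the running interpreter supports, while pickle.DEFAULT_PROTOCOL stays older for backward compatibility. A minimal standalone illustration (the payload is made up, not petastorm code):

import pickle

payload = {'rows': list(range(100_000))}

# DEFAULT_PROTOCOL is kept conservative for backward compatibility;
# HIGHEST_PROTOCOL picks the newest one available in this interpreter.
default_bytes = pickle.dumps(payload)
fast_bytes = pickle.dumps(payload, protocol=pickle.HIGHEST_PROTOCOL)

print('protocols:', pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL)
print('sizes:', len(default_bytes), len(fast_bytes))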
2 changes: 1 addition & 1 deletion docs/release-notes.rst
@@ -7,7 +7,7 @@ Release notes
 Release 0.12.1 (unreleased)
 ===========================
 
+- `PR 777 <https://github.com/uber/petastorm/pull/777>`_: Remove ``LocalDiskArrowTableCache`` class as it was using deprecated pyarrow serialization API. Speed up ``LocalDiskCache`` by using the highest pickle protocol in cache serialization.
 
 Release 0.12.0
 ===========================
40 changes: 0 additions & 40 deletions petastorm/local_disk_arrow_table_cache.py

This file was deleted.

2 changes: 2 additions & 0 deletions petastorm/local_disk_cache.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import division
 
+import pickle
 import shutil
 from diskcache import FanoutCache
 
@@ -43,6 +44,7 @@ def __init__(self, path, size_limit_bytes, expected_row_size_bytes, shards=6, cl
         default_settings = {
             'size_limit': size_limit_bytes,
             'eviction_policy': 'least-recently-stored',
+            'disk_pickle_protocol': pickle.HIGHEST_PROTOCOL,
         }
         default_settings.update(settings)
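For reference, the settings assembled above are forwarded by diskcache's FanoutCache to its on-disk backend, which pickles stored values with the configured protocol. A minimal sketch of the equivalent direct diskcache usage, assuming a throwaway directory and a dummy payload:

import pickle
import tempfile

from diskcache import FanoutCache

# Mirrors the defaults LocalDiskCache.__init__ builds up above;
# disk_pickle_protocol controls the protocol used to pickle values.
cache = FanoutCache(tempfile.mkdtemp(),
                    shards=6,
                    size_limit=10 * 2 ** 20,
                    eviction_policy='least-recently-stored',
                    disk_pickle_protocol=pickle.HIGHEST_PROTOCOL)

cache.set('row_group_0', [1.0, 2.0, 3.0])
assert cache.get('row_group_0') == [1.0, 2.0, 3.0]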
5 changes: 2 additions & 3 deletions petastorm/reader.py
@@ -25,7 +25,6 @@
 from petastorm.etl import dataset_metadata, rowgroup_indexing
 from petastorm.etl.dataset_metadata import PetastormMetadataError, infer_or_load_unischema
 from petastorm.fs_utils import get_filesystem_and_path_or_paths, normalize_dir_url
-from petastorm.local_disk_arrow_table_cache import LocalDiskArrowTableCache
 from petastorm.local_disk_cache import LocalDiskCache
 from petastorm.ngram import NGram
 from petastorm.predicates import PredicateBase
@@ -307,8 +306,8 @@ def make_batch_reader(dataset_url_or_urls,
     if cache_type is None or cache_type == NULL_CACHE:
         cache = NullCache()
     elif cache_type == LOCAL_DISK_CACHE:
-        cache = LocalDiskArrowTableCache(cache_location, cache_size_limit, cache_row_size_estimate,
-                                         **cache_extra_settings or {})
+        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate,
+                               **cache_extra_settings or {})
     else:
         raise ValueError('Unknown cache_type: {}'.format(cache_type))
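From the caller's perspective the swap is transparent: the local-disk cache_type of make_batch_reader now constructs a pickle-backed LocalDiskCache instead of the removed class. A usage sketch, assuming 'local-disk' is the string behind the LOCAL_DISK_CACHE constant and using placeholder URLs and sizes:

from petastorm import make_batch_reader

with make_batch_reader('file:///tmp/my_parquet_dataset',
                       cache_type='local-disk',
                       cache_location='/tmp/petastorm_cache',
                       cache_size_limit=10 * 2 ** 30,      # ~10 GB cap
                       cache_row_size_estimate=256) as reader:
    for batch in reader:
        pass  # consume batches; repeat runs hit the on-disk cache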
15 changes: 0 additions & 15 deletions petastorm/tests/test_disk_cache.py
@@ -15,10 +15,7 @@
 import os
 
 import numpy as np
-import pandas as pd
-import pyarrow as pa
 
-from petastorm.local_disk_arrow_table_cache import LocalDiskArrowTableCache
 from petastorm.local_disk_cache import LocalDiskCache
 
 MB = 2 ** 20
@@ -59,15 +56,3 @@ def test_size_limit_constraint(tmpdir):
 
 def _should_never_be_called():
     assert False, 'Should not be called'
-
-
-def test_arrow_table_caching(tmpdir):
-    cache = LocalDiskArrowTableCache(tmpdir.strpath, 10 * MB, 4)
-
-    df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD'))
-    dummy_table = pa.Table.from_pandas(df)
-
-    table_from_cache = cache.get('my_key', lambda: dummy_table)
-    assert table_from_cache == dummy_table
-
-    cache.get('my_key', _should_never_be_called)
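The deleted test verified fill-on-miss semantics: get(key, fill_func) invokes fill_func only when the key is absent, and the surviving _should_never_be_called helper guards the second lookup. A hypothetical equivalent test against the pickle-backed LocalDiskCache, assuming the same get signature and reusing that helper from the same module:

import numpy as np

from petastorm.local_disk_cache import LocalDiskCache

MB = 2 ** 20


def test_numpy_array_caching(tmpdir):
    cache = LocalDiskCache(tmpdir.strpath, 10 * MB, 4)

    value = np.arange(10)
    # First lookup misses, so the lambda runs and its result is cached.
    from_cache = cache.get('my_key', lambda: value)
    np.testing.assert_array_equal(from_cache, value)

    # Second lookup is served from disk; the filler must not run
    # (_should_never_be_called is the helper defined earlier in the file).
    cache.get('my_key', _should_never_be_called)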
