Remove LocalDiskArrowTableCache and use latest pickle protocol for local caching (#777)

Removing the last class that was using the deprecated pyarrow serialization API (#612).
Bumping the pickle protocol up to the highest available version to improve the performance of local persistent caching.
selitvin authored Sep 14, 2022
1 parent 4e80f2b commit 170b22a
Showing 5 changed files with 5 additions and 59 deletions.
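The performance claim in the commit message rests on pickle protocol versioning: newer protocols serialize faster and more compactly, and pickle.HIGHEST_PROTOCOL always selects the newest protocol the running interpreter supports, while pickle.DEFAULT_PROTOCOL stays older for backward compatibility. A minimal standalone illustration (the payload is made up, not petastorm code):

import pickle

payload = {'rows': list(range(100_000))}

# DEFAULT_PROTOCOL is kept conservative for backward compatibility;
# HIGHEST_PROTOCOL picks the newest one available in this interpreter.
default_bytes = pickle.dumps(payload)
fast_bytes = pickle.dumps(payload, protocol=pickle.HIGHEST_PROTOCOL)

print('protocols:', pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL)
print('sizes:', len(default_bytes), len(fast_bytes))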
2 changes: 1 addition & 1 deletion docs/release-notes.rst
@@ -7,7 +7,7 @@ Release notes
 Release 0.12.1 (unreleased)
 ===========================
 
+- `PR 777 <https://github.com/uber/petastorm/pull/777>`_: Remove ``LocalDiskArrowTableCache`` class as it was using deprecated pyarrow serialization API. Speed up ``LocalDiskCache`` by using the highest pickle protocol in cache serialization.
 
 Release 0.12.0
 ===========================
40 changes: 0 additions & 40 deletions petastorm/local_disk_arrow_table_cache.py

This file was deleted.

2 changes: 2 additions & 0 deletions petastorm/local_disk_cache.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from __future__ import division
 
+import pickle
 import shutil
 from diskcache import FanoutCache
 
@@ -43,6 +44,7 @@ def __init__(self, path, size_limit_bytes, expected_row_size_bytes, shards=6, cl
         default_settings = {
             'size_limit': size_limit_bytes,
             'eviction_policy': 'least-recently-stored',
+            'disk_pickle_protocol': pickle.HIGHEST_PROTOCOL,
         }
         default_settings.update(settings)
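For reference, the settings assembled above are forwarded by diskcache's FanoutCache to its on-disk backend, which pickles stored values with the configured protocol. A minimal sketch of the equivalent direct diskcache usage, assuming a throwaway directory and a dummy payload:

import pickle
import tempfile

from diskcache import FanoutCache

# Mirrors the defaults LocalDiskCache.__init__ builds up above;
# disk_pickle_protocol controls the protocol used to pickle values.
cache = FanoutCache(tempfile.mkdtemp(),
                    shards=6,
                    size_limit=10 * 2 ** 20,
                    eviction_policy='least-recently-stored',
                    disk_pickle_protocol=pickle.HIGHEST_PROTOCOL)

cache.set('row_group_0', [1.0, 2.0, 3.0])
assert cache.get('row_group_0') == [1.0, 2.0, 3.0]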
5 changes: 2 additions & 3 deletions petastorm/reader.py
@@ -25,7 +25,6 @@
 from petastorm.etl import dataset_metadata, rowgroup_indexing
 from petastorm.etl.dataset_metadata import PetastormMetadataError, infer_or_load_unischema
 from petastorm.fs_utils import get_filesystem_and_path_or_paths, normalize_dir_url
-from petastorm.local_disk_arrow_table_cache import LocalDiskArrowTableCache
 from petastorm.local_disk_cache import LocalDiskCache
 from petastorm.ngram import NGram
 from petastorm.predicates import PredicateBase
@@ -307,8 +306,8 @@ def make_batch_reader(dataset_url_or_urls,
     if cache_type is None or cache_type == NULL_CACHE:
         cache = NullCache()
     elif cache_type == LOCAL_DISK_CACHE:
-        cache = LocalDiskArrowTableCache(cache_location, cache_size_limit, cache_row_size_estimate,
-                                         **cache_extra_settings or {})
+        cache = LocalDiskCache(cache_location, cache_size_limit, cache_row_size_estimate,
+                               **cache_extra_settings or {})
     else:
         raise ValueError('Unknown cache_type: {}'.format(cache_type))
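From the caller's perspective the swap is transparent: the local-disk cache_type of make_batch_reader now constructs a pickle-backed LocalDiskCache instead of the removed class. A usage sketch, assuming 'local-disk' is the string behind the LOCAL_DISK_CACHE constant and using placeholder URLs and sizes:

from petastorm import make_batch_reader

with make_batch_reader('file:///tmp/my_parquet_dataset',
                       cache_type='local-disk',
                       cache_location='/tmp/petastorm_cache',
                       cache_size_limit=10 * 2 ** 30,      # ~10 GB cap
                       cache_row_size_estimate=256) as reader:
    for batch in reader:
        pass  # consume batches; repeat runs hit the on-disk cache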
15 changes: 0 additions & 15 deletions petastorm/tests/test_disk_cache.py
@@ -15,10 +15,7 @@
 import os
 
 import numpy as np
-import pandas as pd
-import pyarrow as pa
 
-from petastorm.local_disk_arrow_table_cache import LocalDiskArrowTableCache
 from petastorm.local_disk_cache import LocalDiskCache
 
 MB = 2 ** 20
@@ -59,15 +56,3 @@ def test_size_limit_constraint(tmpdir):
 
 def _should_never_be_called():
     assert False, 'Should not be called'
-
-
-def test_arrow_table_caching(tmpdir):
-    cache = LocalDiskArrowTableCache(tmpdir.strpath, 10 * MB, 4)
-
-    df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD'))
-    dummy_table = pa.Table.from_pandas(df)
-
-    table_from_cache = cache.get('my_key', lambda: dummy_table)
-    assert table_from_cache == dummy_table
-
-    cache.get('my_key', _should_never_be_called)
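The deleted test verified fill-on-miss semantics: get(key, fill_func) invokes fill_func only when the key is absent, and the surviving _should_never_be_called helper guards the second lookup. A hypothetical equivalent test against the pickle-backed LocalDiskCache, assuming the same get signature and reusing that helper from the same module:

import numpy as np

from petastorm.local_disk_cache import LocalDiskCache

MB = 2 ** 20


def test_numpy_array_caching(tmpdir):
    cache = LocalDiskCache(tmpdir.strpath, 10 * MB, 4)

    value = np.arange(10)
    # First lookup misses, so the lambda runs and its result is cached.
    from_cache = cache.get('my_key', lambda: value)
    np.testing.assert_array_equal(from_cache, value)

    # Second lookup is served from disk; the filler must not run
    # (_should_never_be_called is the helper defined earlier in the file).
    cache.get('my_key', _should_never_be_called)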
