Skip to content

Commit

Permalink
analyze: changed chunks per directory
Browse files Browse the repository at this point in the history
  • Loading branch information
ThomasWaldmann committed Sep 29, 2024
1 parent 1700c7a commit 6a314ed
Show file tree
Hide file tree
Showing 3 changed files with 160 additions and 0 deletions.
3 changes: 3 additions & 0 deletions src/borg/archiver/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def get_func(args):
raise Exception("expected func attributes not found")


from .analyze_cmd import AnalyzeMixIn
from .benchmark_cmd import BenchmarkMixIn
from .check_cmd import CheckMixIn
from .compact_cmd import CompactMixIn
Expand Down Expand Up @@ -94,6 +95,7 @@ def get_func(args):


class Archiver(
AnalyzeMixIn,
BenchmarkMixIn,
CheckMixIn,
CompactMixIn,
Expand Down Expand Up @@ -332,6 +334,7 @@ def build_parser(self):

subparsers = parser.add_subparsers(title="required arguments", metavar="<command>")

self.build_parser_analyze(subparsers, common_parser, mid_common_parser)
self.build_parser_benchmarks(subparsers, common_parser, mid_common_parser)
self.build_parser_check(subparsers, common_parser, mid_common_parser)
self.build_parser_compact(subparsers, common_parser, mid_common_parser)
Expand Down
116 changes: 116 additions & 0 deletions src/borg/archiver/analyze_cmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import argparse
from collections import defaultdict
import os

from ._common import with_repository, define_archive_filters_group
from ..archive import Archive
from ..constants import * # NOQA
from ..helpers import bin_to_hex, Error
from ..helpers import ProgressIndicatorPercent
from ..manifest import Manifest
from ..remote import RemoteRepository
from ..repository import Repository

from ..logger import create_logger

# Module-level logger; ArchiveAnalyzer uses it for its start/finish and per-archive status messages.
logger = create_logger()


class ArchiveAnalyzer:
    """Count, per directory path, the chunks added or removed between consecutive archives.

    The archives to look at are selected via ``manifest.archives.list_considering(args)``.
    Each archive is compared against the one listed directly before it, and the number of
    differing chunk ids is accumulated per directory path in ``difference_by_path``.
    """

    def __init__(self, args, repository, manifest):
        """Remember the command line args, the repository and its manifest.

        :param args: parsed command line arguments (handed to ``list_considering``)
        :param repository: must be a ``Repository`` or ``RemoteRepository`` instance
        :param manifest: manifest used to list and open the archives
        """
        self.args = args
        self.repository = repository
        assert isinstance(repository, (Repository, RemoteRepository))
        self.manifest = manifest
        self.difference_by_path = defaultdict(int)  # directory path -> count of chunks changed

    def analyze(self):
        """Run the complete analysis: compare the archives, then print the report."""
        logger.info("Starting archives analysis...")
        self.analyze_archives()
        self.report_and_delete()
        logger.info("Finished archives analysis.")

    def analyze_archives(self) -> None:
        """Analyze all archives matching the given selection criteria.

        Every archive is compared against its predecessor in the listing; the first one
        only serves as the initial comparison base.

        :raises Error: if fewer than 2 archives match the selection criteria
        """
        archive_infos = self.manifest.archives.list_considering(self.args)
        num_archives = len(archive_infos)
        if num_archives < 2:
            # NOTE: according to the Codecov annotation on the introducing commit,
            # this error path was not covered by tests.
            raise Error("Need at least 2 archives to analyze.")

        pi = ProgressIndicatorPercent(
            total=num_archives, msg="Analyzing archives %3.1f%%", step=0.1, msgid="analyze.analyze_archives"
        )
        base = None  # chunks-by-path of the previously analyzed archive (None before the first one)
        for i, info in enumerate(archive_infos):
            pi.show(i)
            logger.info(f"Analyzing archive {info.name} {info.ts} {bin_to_hex(info.id)} ({i + 1}/{num_archives})")
            new = self.analyze_archive(info.id)
            if base is not None:
                self.analyze_change(base, new)
            base = new
        pi.finish()

    def analyze_archive(self, id):
        """Compute the set of chunks for each directory in this archive.

        :param id: id of the archive to open
        :return: ``defaultdict(set)`` mapping a directory path to the ids of all chunks
                 referenced by items located directly in that directory
        """
        archive = Archive(self.manifest, id)
        chunks_by_path = defaultdict(set)  # collect all chunk IDs generated from files in this directory path
        for item in archive.iter_items():
            if "chunks" in item:
                directory_path = os.path.dirname(item.path)
                # item.chunks yields (id, size) pairs; only the ids are of interest here.
                chunks_by_path[directory_path].update(chunk_id for chunk_id, _size in item.chunks)
        return chunks_by_path

    def analyze_change(self, base, new):
        """For each directory path, count the chunks changed (removed or added) between base and new.

        The counts are added to ``self.difference_by_path``. Neither ``base`` nor ``new``
        is modified: a path missing on one side is treated as having no chunks, without
        inserting an empty entry into the mapping.

        :param base: mapping directory path -> set of chunk ids (older archive)
        :param new: mapping directory path -> set of chunk ids (newer archive)
        """
        no_chunks = frozenset()

        def analyze_path_change(path):
            # .get() instead of [] so a defaultdict argument does not grow as a side effect.
            base_chunks = base.get(path, no_chunks)
            new_chunks = new.get(path, no_chunks)
            different_chunks = base_chunks.symmetric_difference(new_chunks)  # removed or added chunks
            self.difference_by_path[path] += len(different_chunks)

        for directory_path in base:
            analyze_path_change(directory_path)
        for directory_path in new:
            if directory_path not in base:
                # NOTE: according to the Codecov annotation on the introducing commit,
                # this line (path only present in the newer archive) was not covered by tests.
                analyze_path_change(directory_path)

    def report_and_delete(self):
        """Print the per-directory change counts, highest count first.

        Paths with a count of 0 are omitted. As written, this method only prints;
        it does not delete anything.
        """
        print()
        print("chunks added or removed by directory path")
        print("=========================================")
        # The sort is stable, so paths with equal counts keep their insertion order.
        ranked = sorted(self.difference_by_path.items(), key=lambda entry: entry[1], reverse=True)
        for directory_path, difference in ranked:
            if difference > 0:
                print(f"{directory_path}: {difference}")


class AnalyzeMixIn:
    """Archiver mix-in that contributes the ``analyze`` subcommand."""

    @with_repository(compatibility=(Manifest.Operation.READ,))
    def do_analyze(self, args, repository, manifest):
        """Analyze archives"""
        analyzer = ArchiveAnalyzer(args, repository, manifest)
        analyzer.analyze()

    def build_parser_analyze(self, subparsers, common_parser, mid_common_parser):
        # Registers the "analyze" subcommand on ``subparsers`` with ``common_parser`` as
        # parent and attaches the archive filter options. ``mid_common_parser`` is accepted
        # but not used here.
        from ._common import process_epilog

        parser_kwargs = {
            "parents": [common_parser],
            "add_help": False,
            "description": self.do_analyze.__doc__,
            "epilog": process_epilog("\nAnalyze archives.\n"),
            "formatter_class": argparse.RawDescriptionHelpFormatter,
            "help": "analyze archives",
        }
        analyze_parser = subparsers.add_parser("analyze", **parser_kwargs)
        analyze_parser.set_defaults(func=self.do_analyze)
        define_archive_filters_group(analyze_parser)
41 changes: 41 additions & 0 deletions src/borg/testsuite/archiver/analyze_cmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pathlib

from ...constants import * # NOQA
from . import cmd, generate_archiver_tests, RK_ENCRYPTION

# pytest parametrization hook: passes the metafunc on to generate_archiver_tests, requesting only the "local" kind.
pytest_generate_tests = lambda metafunc: generate_archiver_tests(metafunc, kinds="local") # NOQA


def test_analyze(archivers, request):
    """Create a series of archives and check the chunk change count reported for the input directory."""
    archiver = request.getfixturevalue(archivers)

    def snapshot():
        # every archive gets the same name, so "-a archive" below selects all of them
        cmd(archiver, "create", "archive", archiver.input_path)

    def run_analyze():
        output = cmd(archiver, "analyze", "-a", "archive")
        return output

    cmd(archiver, "repo-create", RK_ENCRYPTION)
    input_dir = pathlib.Path(archiver.input_path)

    # archive 1: only file1
    input_dir.joinpath("file1").write_text("foo")
    snapshot()

    # archive 2: file2 appears
    input_dir.joinpath("file2").write_text("bar")
    snapshot()

    # 2nd archive added 1 chunk for input path
    assert "/input: 1" in run_analyze()

    # archive 3: file3 appears
    input_dir.joinpath("file3").write_text("baz")
    snapshot()

    # 2nd/3rd archives added 2 chunks for input path
    assert "/input: 2" in run_analyze()

    # archive 4: file2 is gone again
    input_dir.joinpath("file2").unlink()
    snapshot()

    # 2nd/3rd archives added 2, 4th archive removed 1
    assert "/input: 3" in run_analyze()

0 comments on commit 6a314ed

Please sign in to comment.