From ec1937dc4c2928522505d30bba8881cb0a6d9f50 Mon Sep 17 00:00:00 2001
From: Thomas Waldmann
Date: Mon, 27 Mar 2023 20:02:47 +0200
Subject: [PATCH] extract: --skip-errors ignores corrupted chunks (w/ log message), see #840

Forward port of a change implemented by @enkore back in 2016:
https://github.com/enkore/borg/commit/09b21b117c6d34032a9483dd82086a4fae532cb6
---
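Reviewer note (text here, between the `---` separator and the diff, is ignored
by `git am`): the crux of the archive.py change is that a generator which has
raised an exception is finished and cannot be resumed, so after a corrupted
chunk the pipeline iterator must be rebuilt from the remaining chunk ids while
the output fd is seeked forward over the gap, using the size recorded in the
chunk list. A minimal self-contained sketch of that technique follows; Chunk,
fetch_chunks() and extract_file() are illustrative stand-ins for borg's
item.chunks and pipeline.fetch_many(), not actual borg APIs:

    import io
    from collections import namedtuple

    class IntegrityError(Exception):
        pass

    # stand-in for one entry of item.chunks: id and recorded size
    Chunk = namedtuple("Chunk", "id size data")

    def fetch_chunks(chunks):
        # stand-in for pipeline.fetch_many(): yields chunk data and raises
        # IntegrityError for a corrupted chunk (the generator is dead after that)
        for c in chunks:
            if c.data is None:
                raise IntegrityError("checksum mismatch")
            yield c.data

    def extract_file(fd, chunks, skip_integrity_errors=False):
        chunk_index = -1
        chunk_iterator = fetch_chunks(chunks)
        skipped_errors = False
        while True:
            try:
                chunk_index += 1
                data = next(chunk_iterator)
            except StopIteration:
                break
            except IntegrityError:
                if not skip_integrity_errors:
                    raise
                # seek over the bad chunk using its recorded size, so all
                # later data keeps its correct offset
                fd.seek(chunks[chunk_index].size, 1)
                skipped_errors = True
                # restart the chunk data generator after the bad chunk
                chunk_iterator = fetch_chunks(chunks[chunk_index + 1 :])
            else:
                fd.write(data)
        # mirrors fd.truncate(pos) in the patch (on real files this
        # zero-extends a trailing gap to the expected size)
        fd.truncate(fd.tell())
        return not skipped_errors

    fd = io.BytesIO()
    chunks = [Chunk(b"\x01", 4, b"aaaa"), Chunk(b"\x02", 4, None), Chunk(b"\x03", 4, b"bbbb")]
    assert extract_file(fd, chunks, skip_integrity_errors=True) is False
    assert fd.getvalue() == b"aaaa\x00\x00\x00\x00bbbb"

The asserts encode the expected outcome: the bad chunk becomes a zero-filled
gap and the failure is reported via the return value, which do_extract() below
maps to EXIT_WARNING.
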
 src/borg/archive.py                        | 57 ++++++++++++++++------
 src/borg/archiver/extract_cmd.py           | 21 ++++++--
 src/borg/testsuite/archiver/extract_cmd.py | 22 +++++++++
 3 files changed, 82 insertions(+), 18 deletions(-)

diff --git a/src/borg/archive.py b/src/borg/archive.py
index b048d5c160..4b1b9148d5 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -788,6 +788,7 @@ def extract_item(
         hlm=None,
         pi=None,
         continue_extraction=False,
+        skip_integrity_errors=False,
     ):
         """
         Extract archive item.
@@ -800,6 +801,8 @@
         :param hlm: maps hlid to link_target for extracting subtrees with hardlinks correctly
         :param pi: ProgressIndicatorPercent (or similar) for file extraction progress (in bytes)
         :param continue_extraction: continue a previously interrupted extraction of same archive
+        :param skip_integrity_errors: skip over corrupted chunks instead of raising IntegrityError
+            (ignored for dry_run and stdout)
         """

         def same_item(item, st):
@@ -849,7 +852,7 @@ def same_item(item, st):
                         )
             if has_damaged_chunks:
                 raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
-            return
+            return True

         dest = self.cwd
         path = os.path.join(dest, item.path)
@@ -857,7 +860,7 @@ def same_item(item, st):
         try:
             st = os.stat(path, follow_symlinks=False)
             if continue_extraction and same_item(item, st):
-                return  # done! we already have fully extracted this file in a previous run.
+                return True  # done! we already have fully extracted this file in a previous run.
             elif stat.S_ISDIR(st.st_mode):
                 os.rmdir(path)
             else:
@@ -878,20 +881,43 @@ def make_parent(path):
             make_parent(path)
             with self.extract_helper(item, path, hlm) as hardlink_set:
                 if hardlink_set:
-                    return
+                    return True
                 with backup_io("open"):
                     fd = open(path, "wb")
                 with fd:
                     ids = [c.id for c in item.chunks]
-                    for data in self.pipeline.fetch_many(ids, is_preloaded=True, ro_type=ROBJ_FILE_STREAM):
+                    chunk_index = -1
+                    chunk_iterator = self.pipeline.fetch_many(ids, is_preloaded=True, ro_type=ROBJ_FILE_STREAM)
+                    skipped_errors = False
+                    while True:
+                        try:
+                            chunk_index += 1
+                            data = next(chunk_iterator)
+                        except StopIteration:
+                            break
+                        except IntegrityError as err:
+                            if not skip_integrity_errors:
+                                raise
+                            c = item.chunks[chunk_index]
+                            size = c.size
+                            logger.warning("%s: chunk %s: %s", remove_surrogates(item.path), bin_to_hex(c.id), err)
+                            with backup_io("seek"):
+                                fd.seek(size, 1)
+                            skipped_errors = True
+                            # restart chunk data generator
+                            ids = [c.id for c in item.chunks[chunk_index + 1 :]]
+                            chunk_iterator = self.pipeline.fetch_many(ids, is_preloaded=True, ro_type=ROBJ_FILE_STREAM)
+                        else:
+                            with backup_io("write"):
+                                size = len(data)
+                                if sparse and zeros.startswith(data):
+                                    # all-zero chunk: create a hole in a sparse file
+                                    fd.seek(size, 1)
+                                else:
+                                    fd.write(data)
                         if pi:
-                            pi.show(increase=len(data), info=[remove_surrogates(item.path)])
-                        with backup_io("write"):
-                            if sparse and zeros.startswith(data):
-                                # all-zero chunk: create a hole in a sparse file
-                                fd.seek(len(data), 1)
-                            else:
-                                fd.write(data)
+                            pi.show(increase=size, info=[remove_surrogates(item.path)])
+
                     with backup_io("truncate_and_attrs"):
                         pos = item_chunks_size = fd.tell()
                         fd.truncate(pos)
@@ -905,7 +931,7 @@ def make_parent(path):
                     )
                 if has_damaged_chunks:
                     raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.")
-            return
+            return not skipped_errors
         with backup_io:
             # No repository access beyond this point.
             if stat.S_ISDIR(mode):
@@ -919,7 +945,7 @@ def make_parent(path):
             with self.extract_helper(item, path, hlm) as hardlink_set:
                 if hardlink_set:
                     # unusual, but possible: this is a hardlinked symlink.
-                    return
+                    return True
                 target = item.target
                 try:
                     os.symlink(target, path)
@@ -930,18 +956,19 @@ def make_parent(path):
             make_parent(path)
             with self.extract_helper(item, path, hlm) as hardlink_set:
                 if hardlink_set:
-                    return
+                    return True
                 os.mkfifo(path)
                 self.restore_attrs(path, item)
         elif stat.S_ISCHR(mode) or stat.S_ISBLK(mode):
             make_parent(path)
             with self.extract_helper(item, path, hlm) as hardlink_set:
                 if hardlink_set:
-                    return
+                    return True
                 os.mknod(path, item.mode, item.rdev)
                 self.restore_attrs(path, item)
         else:
             raise Exception("Unknown archive item type %r" % item.mode)
+        return True

     def restore_attrs(self, path, item, symlink=False, fd=None):
         """
diff --git a/src/borg/archiver/extract_cmd.py b/src/borg/archiver/extract_cmd.py
index 452b9a9a56..f1d3e4e6dc 100644
--- a/src/borg/archiver/extract_cmd.py
+++ b/src/borg/archiver/extract_cmd.py
@@ -39,6 +39,7 @@ def do_extract(self, args, repository, manifest, archive):
         progress = args.progress
         output_list = args.output_list
         dry_run = args.dry_run
+        skip_errors = args.skip_errors
         stdout = args.stdout
         sparse = args.sparse
         strip_components = args.strip_components
@@ -76,9 +77,16 @@ def do_extract(self, args, repository, manifest, archive):
                     dirs.append(item)
                     archive.extract_item(item, stdout=stdout, restore_attrs=False)
                 else:
-                    archive.extract_item(
-                        item, stdout=stdout, sparse=sparse, hlm=hlm, pi=pi, continue_extraction=continue_extraction
-                    )
+                    if not archive.extract_item(
+                        item,
+                        stdout=stdout,
+                        sparse=sparse,
+                        hlm=hlm,
+                        pi=pi,
+                        continue_extraction=continue_extraction,
+                        skip_integrity_errors=skip_errors,
+                    ):
+                        self.exit_code = EXIT_WARNING
             except (BackupOSError, BackupError) as e:
                 self.print_warning("%s: %s", remove_surrogates(orig_path), e)

@@ -175,6 +183,13 @@
             action="store_true",
             help="continue a previously interrupted extraction of same archive",
         )
+        subparser.add_argument(
+            "--skip-errors",
+            dest="skip_errors",
+            action="store_true",
+            help="skip corrupted chunks with a log message (exit 1) instead of aborting "
+            "(no effect for --dry-run and --stdout)",
+        )
         subparser.add_argument("name", metavar="NAME", type=archivename_validator, help="specify the archive name")
         subparser.add_argument(
             "paths", metavar="PATH", nargs="*", type=str, help="paths to extract; patterns are supported"
diff --git a/src/borg/testsuite/archiver/extract_cmd.py b/src/borg/testsuite/archiver/extract_cmd.py
index 46d4d26dba..fef7ff5ed0 100644
--- a/src/borg/testsuite/archiver/extract_cmd.py
+++ b/src/borg/testsuite/archiver/extract_cmd.py
@@ -625,6 +625,28 @@ def test_overwrite(archivers, request):
     cmd(archiver, "extract", "test", exit_code=1)


+def test_extract_skip_errors(archivers, request):
+    archiver = request.getfixturevalue(archivers)
+    create_regular_file(archiver.input_path, "file1", contents=b"a" * 280 + b"b" * 280)
+    cmd(archiver, "rcreate", "-e", "none")
+    cmd(archiver, "create", "--chunker-params", "7,9,8,128", "test", "input")
+    segment_files = sorted(os.listdir(os.path.join(archiver.repository_path, "data", "0")), reverse=True)
+    print(
+        ", ".join(
+            f"{fn}: {os.stat(os.path.join(archiver.repository_path, 'data', '0', fn)).st_size}b" for fn in segment_files
+        )
+    )
+    name = segment_files[3]  # must be the segment file that has the file's chunks
+    with open(os.path.join(archiver.repository_path, "data", "0", name), "r+b") as fd:
+        fd.seek(100)
+        fd.write(b"XXXX")
+    with changedir("output"):
+        output = cmd(archiver, "extract", "--skip-errors", "test", exit_code=1)
assert "input/file1: chunk" in output + assert os.stat("input/file1").st_size == 560 + cmd(archiver, "check", exit_code=1) + + # derived from test_extract_xattrs_errors() @pytest.mark.skipif(not xattr.XATTR_FAKEROOT, reason="xattr not supported on this system, or this version of fakeroot") def test_do_not_fail_when_percent_is_in_xattr_name(archivers, request):