From 735e7275c97d32c9e0dfe14e495cc13941d39310 Mon Sep 17 00:00:00 2001
From: Michael Dayan <michael.dayan@fcbg.ch>
Date: Wed, 7 Jun 2023 10:30:27 +0200
Subject: [PATCH 1/6] Fix inconsistent behavior of existing session when using
 -d compared to --files option

---
 heudiconv/parser.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/heudiconv/parser.py b/heudiconv/parser.py
index f8a11977..c4818fa4 100644
--- a/heudiconv/parser.py
+++ b/heudiconv/parser.py
@@ -278,15 +278,10 @@ def infotoids(
             )
             lgr.info("Study session for %r", study_session_info)
 
-            if study_session_info in study_sessions:
-                if grouping != "all":
-                    # MG - should this blow up to mimic -d invocation?
-                    lgr.warning(
-                        "Existing study session with the same values (%r)."
-                        " Skipping DICOMS %s",
-                        study_session_info,
-                        seqinfo.values(),
-                    )
-                    continue
+            if grouping != "all":
+                assert (
+                        study_session_info not in study_sessions
+                    ), f"Existing study session {study_session_info} already in analyzed sessions {study_sessions.keys()}"
+                
             study_sessions[study_session_info] = seqinfo
     return study_sessions

From a7d4af101c714f960df46305282ac57c5595529f Mon Sep 17 00:00:00 2001
From: Michael Dayan <michael.dayan@fcbg.ch>
Date: Wed, 7 Jun 2023 11:03:37 +0200
Subject: [PATCH 2/6] Fix linting

---
 heudiconv/parser.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/heudiconv/parser.py b/heudiconv/parser.py
index c4818fa4..bb8b73e1 100644
--- a/heudiconv/parser.py
+++ b/heudiconv/parser.py
@@ -279,9 +279,8 @@ def infotoids(
             lgr.info("Study session for %r", study_session_info)
 
             if grouping != "all":
-                assert (
-                        study_session_info not in study_sessions
-                    ), f"Existing study session {study_session_info} already in analyzed sessions {study_sessions.keys()}"
-                
+                assert (study_session_info not in study_sessions), (
+                    f"Existing study session {study_session_info} "
+                    f"already in analyzed sessions {study_sessions.keys()}")
             study_sessions[study_session_info] = seqinfo
     return study_sessions

From c6c71061e9635d79dcb59c1c3b6819af76955448 Mon Sep 17 00:00:00 2001
From: Michael Dayan <michael.dayan@fcbg.ch>
Date: Wed, 7 Jun 2023 15:26:40 +0200
Subject: [PATCH 3/6] Correct wrong series fusion when grouping across study
 UIDs

---
 heudiconv/dicoms.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/heudiconv/dicoms.py b/heudiconv/dicoms.py
index ef51086a..7f16b088 100644
--- a/heudiconv/dicoms.py
+++ b/heudiconv/dicoms.py
@@ -12,6 +12,7 @@
 from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Union, overload
 from unittest.mock import patch
 import warnings
+import hashlib
 
 import pydicom as dcm
 
@@ -114,7 +115,7 @@ def create_seqinfo(mw: dw.Wrapper, series_files: list[str], series_id: str) -> S
 
 def validate_dicom(
     fl: str, dcmfilter: Optional[Callable[[dcm.dataset.Dataset], Any]]
-) -> Optional[tuple[dw.Wrapper, tuple[int, str], Optional[str]]]:
+) -> Optional[tuple[dw.Wrapper, tuple[int, str, str], Optional[str]]]:
     """
     Parse DICOM attributes. Returns None if not valid.
     """
@@ -135,7 +136,7 @@ def validate_dicom(
     try:
         protocol_name = mw.dcm_data.ProtocolName
         assert isinstance(protocol_name, str)
-        series_id = (int(mw.dcm_data.SeriesNumber), protocol_name)
+        series_id = (int(mw.dcm_data.SeriesNumber), protocol_name, mw.dcm_data.SeriesInstanceUID)
     except AttributeError as e:
         lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', fl, e)
         return None
@@ -159,10 +160,12 @@ def validate_dicom(
 class SeriesID(NamedTuple):
     series_number: int
     protocol_name: str
+    series_uid: str
     file_studyUID: Optional[str] = None
 
     def __str__(self) -> str:
-        s = f"{self.series_number}-{self.protocol_name}"
+        suid_hex = hashlib.md5(self.series_uid.encode('utf-8')).hexdigest()[:16]
+        s = f"{self.series_number}-{self.protocol_name}-{suid_hex}"
         if self.file_studyUID is not None:
             s += f"-{self.file_studyUID}"
         return s
@@ -285,7 +288,7 @@ def group_dicoms_into_seqinfos(
             removeidx.append(idx)
             continue
         mw, series_id_, file_studyUID = mwinfo
-        series_id = SeriesID(series_id_[0], series_id_[1])
+        series_id = SeriesID(series_id_[0], series_id_[1], series_id_[2])
         if per_studyUID:
             series_id = series_id._replace(file_studyUID=file_studyUID)
 
@@ -320,6 +323,7 @@ def group_dicoms_into_seqinfos(
                 series_id = SeriesID(
                     mwgroup[idx].dcm_data.SeriesNumber,
                     mwgroup[idx].dcm_data.ProtocolName,
+                    mwgroup[idx].dcm_data.SeriesInstanceUID,
                 )
                 if per_studyUID:
                     series_id = series_id._replace(file_studyUID=file_studyUID)

From 7e75a8a4fc03dfe15982a1218189c0971058826d Mon Sep 17 00:00:00 2001
From: Michael Dayan <michael.dayan@fcbg.ch>
Date: Fri, 16 Jun 2023 17:49:13 +0200
Subject: [PATCH 4/6] Add check that conversion table using deprecated series
 UID is not used

---
 heudiconv/convert.py |  5 ++++-
 heudiconv/dicoms.py  |  3 ++-
 heudiconv/utils.py   | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/heudiconv/convert.py b/heudiconv/convert.py
index 8e149bb3..3608f540 100644
--- a/heudiconv/convert.py
+++ b/heudiconv/convert.py
@@ -46,6 +46,7 @@
     set_readonly,
     treat_infofile,
     write_config,
+    has_deprecated_seriesid,
 )
 
 if TYPE_CHECKING:
@@ -172,7 +173,8 @@ def prep_conversion(
     # if conversion table(s) do not exist -- we need to prepare them
     # (the *prepare* stage in https://github.com/nipy/heudiconv/issues/134)
     # if overwrite - recalculate this anyways
-    reuse_conversion_table = op.exists(edit_file)
+    # MD: a check for the deprecated series_id is added here to avoid reusing the conversion table
+    reuse_conversion_table = op.exists(edit_file) and not has_deprecated_seriesid(edit_file)
     # We also might need to redo it if changes in the heuristic file
     # detected
     # ref: https://github.com/nipy/heudiconv/issues/84#issuecomment-330048609
@@ -209,6 +211,7 @@ def prep_conversion(
         # DICOMs dumped with SOP UUIDs thus differing across runs etc
         # So either it would need to be brought back or reconsidered altogether
         # (since no sample data to test on etc)
+
     else:
         assure_no_file_exists(target_heuristic_filename)
         safe_copyfile(heuristic.filename, target_heuristic_filename)
diff --git a/heudiconv/dicoms.py b/heudiconv/dicoms.py
index 7f16b088..a4212380 100644
--- a/heudiconv/dicoms.py
+++ b/heudiconv/dicoms.py
@@ -164,7 +164,8 @@ class SeriesID(NamedTuple):
     file_studyUID: Optional[str] = None
 
     def __str__(self) -> str:
-        suid_hex = hashlib.md5(self.series_uid.encode('utf-8')).hexdigest()[:16]
+        len_hash_hex = 8
+        suid_hex = hashlib.md5(self.series_uid.encode('utf-8')).hexdigest()[:len_hash_hex]
         s = f"{self.series_number}-{self.protocol_name}-{suid_hex}"
         if self.file_studyUID is not None:
             s += f"-{self.file_studyUID}"
diff --git a/heudiconv/utils.py b/heudiconv/utils.py
index 9f988267..01e1760d 100644
--- a/heudiconv/utils.py
+++ b/heudiconv/utils.py
@@ -18,6 +18,7 @@
 import stat
 from subprocess import check_output
 import sys
+import string
 import tempfile
 from time import sleep
 from types import ModuleType
@@ -222,6 +223,41 @@ def load_json(filename: str | Path, retry: int = 0) -> Any:
     return data
 
 
+def is_deprecated_seriesid(series_id: str, len_hash_hex: int = 8) -> bool:
+    """Return True if series_id is in deprecated format.
+    A deprecated series ID is in the format "series_number-protocol_name".
+    A non-deprecated series ID is in the format "series_number-protocol_name-hash8hex(series_UID)"
+    """
+    # Check at least two '-' in the series_id
+    if series_id.count('-') <= 1:
+        return True
+    # Check if the first part of the series_id is a number
+    series_number = series_id.split('-')[0]
+    if not series_number.isdigit():
+        return True
+    # Check if the last part of the series_id is a hash in hexadecimal format of length 8
+    hash_hex = series_id.split('-')[-1]
+    hex_digits = set(string.hexdigits)
+    if len(hash_hex) != len_hash_hex or not all(c in hex_digits for c in hash_hex):
+        return True
+    # If all previous tests passed, then the series_id is not deprecated
+    return False
+
+
+def has_deprecated_seriesid(info_file: str) -> bool:
+    """Return True if any series_id in the info_file is in deprecated format."""
+    info = read_config(info_file)
+    series_ids = [series_id for sublist in info.values() for series_id in sublist]
+    if any(is_deprecated_seriesid(series_id) for series_id in series_ids):
+        lgr.warning(
+            "Found deprecated series identifier in info file in the format"
+            "<series num>-<protocol name> instead of <series num>-<protocol name>-<UID hash len 8>"
+            "The existing conversion table will be ignored."
+        )
+        return True
+    return False
+
+
 def assure_no_file_exists(path: str | Path) -> None:
     """Check if file or symlink (git-annex?) exists, and if so -- remove"""
     if os.path.lexists(path):

From a667acaf9be022b9737a3748ea3d1328da1cab42 Mon Sep 17 00:00:00 2001
From: Michael Dayan <michael.dayan@fcbg.ch>
Date: Fri, 16 Jun 2023 17:54:32 +0200
Subject: [PATCH 5/6] Add minor formatting and documentation precision

---
 heudiconv/convert.py | 1 -
 heudiconv/utils.py   | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/heudiconv/convert.py b/heudiconv/convert.py
index 3608f540..92f1c9b9 100644
--- a/heudiconv/convert.py
+++ b/heudiconv/convert.py
@@ -211,7 +211,6 @@ def prep_conversion(
         # DICOMs dumped with SOP UUIDs thus differing across runs etc
         # So either it would need to be brought back or reconsidered altogether
         # (since no sample data to test on etc)
-
     else:
         assure_no_file_exists(target_heuristic_filename)
         safe_copyfile(heuristic.filename, target_heuristic_filename)
diff --git a/heudiconv/utils.py b/heudiconv/utils.py
index 01e1760d..f7e92ecc 100644
--- a/heudiconv/utils.py
+++ b/heudiconv/utils.py
@@ -231,11 +231,11 @@ def is_deprecated_seriesid(series_id: str, len_hash_hex: int = 8) -> bool:
     # Check at least two '-' in the series_id
     if series_id.count('-') <= 1:
         return True
-    # Check if the first part of the series_id is a number
+    # Check the first part of the series_id is a number
     series_number = series_id.split('-')[0]
     if not series_number.isdigit():
         return True
-    # Check if the last part of the series_id is a hash in hexadecimal format of length 8
+    # Check the last part of the series_id is a hash in hexadecimal format of length len_hash_hex
     hash_hex = series_id.split('-')[-1]
     hex_digits = set(string.hexdigits)
     if len(hash_hex) != len_hash_hex or not all(c in hex_digits for c in hash_hex):

From e9c92892f7b8cabdc1a77f55636443697e683424 Mon Sep 17 00:00:00 2001
From: Michael Dayan <michael.dayan@fcbg.ch>
Date: Fri, 16 Jun 2023 18:03:15 +0200
Subject: [PATCH 6/6] Remove hardcoded reference to hash length of 8 in doc

---
 heudiconv/utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/heudiconv/utils.py b/heudiconv/utils.py
index f7e92ecc..17948703 100644
--- a/heudiconv/utils.py
+++ b/heudiconv/utils.py
@@ -226,7 +226,7 @@ def load_json(filename: str | Path, retry: int = 0) -> Any:
 def is_deprecated_seriesid(series_id: str, len_hash_hex: int = 8) -> bool:
     """Return True if series_id is in deprecated format.
     A deprecated series ID is in the format "series_number-protocol_name".
-    A non-deprecated series ID is in the format "series_number-protocol_name-hash8hex(series_UID)"
+    A non-deprecated series ID is in the format "series_number-protocol_name-seriesUID_hash_hex"
     """
     # Check at least two '-' in the series_id
     if series_id.count('-') <= 1:
@@ -251,7 +251,7 @@ def has_deprecated_seriesid(info_file: str) -> bool:
     if any(is_deprecated_seriesid(series_id) for series_id in series_ids):
         lgr.warning(
             "Found deprecated series identifier in info file in the format"
-            "<series num>-<protocol name> instead of <series num>-<protocol name>-<UID hash len 8>"
+            "<series num>-<protocol name> instead of <series num>-<protocol name>-<UID hash>"
             "The existing conversion table will be ignored."
         )
         return True