From 735e7275c97d32c9e0dfe14e495cc13941d39310 Mon Sep 17 00:00:00 2001 From: Michael Dayan Date: Wed, 7 Jun 2023 10:30:27 +0200 Subject: [PATCH 1/6] Fix inconsistent behavior of existing session when using -d compared to --files option --- heudiconv/parser.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/heudiconv/parser.py b/heudiconv/parser.py index f8a11977..c4818fa4 100644 --- a/heudiconv/parser.py +++ b/heudiconv/parser.py @@ -278,15 +278,10 @@ def infotoids( ) lgr.info("Study session for %r", study_session_info) - if study_session_info in study_sessions: - if grouping != "all": - # MG - should this blow up to mimic -d invocation? - lgr.warning( - "Existing study session with the same values (%r)." - " Skipping DICOMS %s", - study_session_info, - seqinfo.values(), - ) - continue + if grouping != "all": + assert ( + study_session_info not in study_sessions + ), f"Existing study session {study_session_info} already in analyzed sessions {study_sessions.keys()}" + study_sessions[study_session_info] = seqinfo return study_sessions From a7d4af101c714f960df46305282ac57c5595529f Mon Sep 17 00:00:00 2001 From: Michael Dayan Date: Wed, 7 Jun 2023 11:03:37 +0200 Subject: [PATCH 2/6] Fix linting --- heudiconv/parser.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/heudiconv/parser.py b/heudiconv/parser.py index c4818fa4..bb8b73e1 100644 --- a/heudiconv/parser.py +++ b/heudiconv/parser.py @@ -279,9 +279,8 @@ def infotoids( lgr.info("Study session for %r", study_session_info) if grouping != "all": - assert ( - study_session_info not in study_sessions - ), f"Existing study session {study_session_info} already in analyzed sessions {study_sessions.keys()}" - + assert (study_session_info not in study_sessions), ( + f"Existing study session {study_session_info} " + f"already in analyzed sessions {study_sessions.keys()}") study_sessions[study_session_info] = seqinfo return study_sessions From c6c71061e9635d79dcb59c1c3b6819af76955448 Mon Sep 17 00:00:00 2001 From: Michael Dayan Date: Wed, 7 Jun 2023 15:26:40 +0200 Subject: [PATCH 3/6] Correct wrong series fusion when grouping across study UIDs --- heudiconv/dicoms.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/heudiconv/dicoms.py b/heudiconv/dicoms.py index ef51086a..7f16b088 100644 --- a/heudiconv/dicoms.py +++ b/heudiconv/dicoms.py @@ -12,6 +12,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Union, overload from unittest.mock import patch import warnings +import hashlib import pydicom as dcm @@ -114,7 +115,7 @@ def create_seqinfo(mw: dw.Wrapper, series_files: list[str], series_id: str) -> S def validate_dicom( fl: str, dcmfilter: Optional[Callable[[dcm.dataset.Dataset], Any]] -) -> Optional[tuple[dw.Wrapper, tuple[int, str], Optional[str]]]: +) -> Optional[tuple[dw.Wrapper, tuple[int, str, str], Optional[str]]]: """ Parse DICOM attributes. Returns None if not valid. """ @@ -135,7 +136,7 @@ def validate_dicom( try: protocol_name = mw.dcm_data.ProtocolName assert isinstance(protocol_name, str) - series_id = (int(mw.dcm_data.SeriesNumber), protocol_name) + series_id = (int(mw.dcm_data.SeriesNumber), protocol_name, mw.dcm_data.SeriesInstanceUID) except AttributeError as e: lgr.warning('Ignoring %s since not quite a "normal" DICOM: %s', fl, e) return None @@ -159,10 +160,12 @@ def validate_dicom( class SeriesID(NamedTuple): series_number: int protocol_name: str + series_uid: str file_studyUID: Optional[str] = None def __str__(self) -> str: - s = f"{self.series_number}-{self.protocol_name}" + suid_hex = hashlib.md5(self.series_uid.encode('utf-8')).hexdigest()[:16] + s = f"{self.series_number}-{self.protocol_name}-{suid_hex}" if self.file_studyUID is not None: s += f"-{self.file_studyUID}" return s @@ -285,7 +288,7 @@ def group_dicoms_into_seqinfos( removeidx.append(idx) continue mw, series_id_, file_studyUID = mwinfo - series_id = SeriesID(series_id_[0], series_id_[1]) + series_id = SeriesID(series_id_[0], series_id_[1], series_id_[2]) if per_studyUID: series_id = series_id._replace(file_studyUID=file_studyUID) @@ -320,6 +323,7 @@ def group_dicoms_into_seqinfos( series_id = SeriesID( mwgroup[idx].dcm_data.SeriesNumber, mwgroup[idx].dcm_data.ProtocolName, + mwgroup[idx].dcm_data.SeriesInstanceUID, ) if per_studyUID: series_id = series_id._replace(file_studyUID=file_studyUID) From 7e75a8a4fc03dfe15982a1218189c0971058826d Mon Sep 17 00:00:00 2001 From: Michael Dayan Date: Fri, 16 Jun 2023 17:49:13 +0200 Subject: [PATCH 4/6] Add check that conversion table using deprecated series UID is not used --- heudiconv/convert.py | 5 ++++- heudiconv/dicoms.py | 3 ++- heudiconv/utils.py | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/heudiconv/convert.py b/heudiconv/convert.py index 8e149bb3..3608f540 100644 --- a/heudiconv/convert.py +++ b/heudiconv/convert.py @@ -46,6 +46,7 @@ set_readonly, treat_infofile, write_config, + has_deprecated_seriesid, ) if TYPE_CHECKING: @@ -172,7 +173,8 @@ def prep_conversion( # if conversion table(s) do not exist -- we need to prepare them # (the *prepare* stage in https://github.com/nipy/heudiconv/issues/134) # if overwrite - recalculate this anyways - reuse_conversion_table = op.exists(edit_file) + # MD: a check for the deprecated series_id is added here to avoid reusing the conversion table + reuse_conversion_table = op.exists(edit_file) and not has_deprecated_seriesid(edit_file) # We also might need to redo it if changes in the heuristic file # detected # ref: https://github.com/nipy/heudiconv/issues/84#issuecomment-330048609 @@ -209,6 +211,7 @@ def prep_conversion( # DICOMs dumped with SOP UUIDs thus differing across runs etc # So either it would need to be brought back or reconsidered altogether # (since no sample data to test on etc) + else: assure_no_file_exists(target_heuristic_filename) safe_copyfile(heuristic.filename, target_heuristic_filename) diff --git a/heudiconv/dicoms.py b/heudiconv/dicoms.py index 7f16b088..a4212380 100644 --- a/heudiconv/dicoms.py +++ b/heudiconv/dicoms.py @@ -164,7 +164,8 @@ class SeriesID(NamedTuple): file_studyUID: Optional[str] = None def __str__(self) -> str: - suid_hex = hashlib.md5(self.series_uid.encode('utf-8')).hexdigest()[:16] + len_hash_hex = 8 + suid_hex = hashlib.md5(self.series_uid.encode('utf-8')).hexdigest()[:len_hash_hex] s = f"{self.series_number}-{self.protocol_name}-{suid_hex}" if self.file_studyUID is not None: s += f"-{self.file_studyUID}" diff --git a/heudiconv/utils.py b/heudiconv/utils.py index 9f988267..01e1760d 100644 --- a/heudiconv/utils.py +++ b/heudiconv/utils.py @@ -18,6 +18,7 @@ import stat from subprocess import check_output import sys +import string import tempfile from time import sleep from types import ModuleType @@ -222,6 +223,41 @@ def load_json(filename: str | Path, retry: int = 0) -> Any: return data +def is_deprecated_seriesid(series_id: str, len_hash_hex: int = 8) -> bool: + """Return True if series_id is in deprecated format. + A deprecated series ID is in the format "series_number-protocol_name". + A non-deprecated series ID is in the format "series_number-protocol_name-hash8hex(series_UID)" + """ + # Check at least two '-' in the series_id + if series_id.count('-') <= 1: + return True + # Check if the first part of the series_id is a number + series_number = series_id.split('-')[0] + if not series_number.isdigit(): + return True + # Check if the last part of the series_id is a hash in hexadecimal format of length 8 + hash_hex = series_id.split('-')[-1] + hex_digits = set(string.hexdigits) + if len(hash_hex) != len_hash_hex or not all(c in hex_digits for c in hash_hex): + return True + # If all previous tests passed, then the series_id is not deprecated + return False + + +def has_deprecated_seriesid(info_file: str) -> bool: + """Return True if any series_id in the info_file is in deprecated format.""" + info = read_config(info_file) + series_ids = [series_id for sublist in info.values() for series_id in sublist] + if any(is_deprecated_seriesid(series_id) for series_id in series_ids): + lgr.warning( + "Found deprecated series identifier in info file in the format" + "- instead of --" + "The existing conversion table will be ignored." + ) + return True + return False + + def assure_no_file_exists(path: str | Path) -> None: """Check if file or symlink (git-annex?) exists, and if so -- remove""" if os.path.lexists(path): From a667acaf9be022b9737a3748ea3d1328da1cab42 Mon Sep 17 00:00:00 2001 From: Michael Dayan Date: Fri, 16 Jun 2023 17:54:32 +0200 Subject: [PATCH 5/6] Add minor formatting and documentation precision --- heudiconv/convert.py | 1 - heudiconv/utils.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/heudiconv/convert.py b/heudiconv/convert.py index 3608f540..92f1c9b9 100644 --- a/heudiconv/convert.py +++ b/heudiconv/convert.py @@ -211,7 +211,6 @@ def prep_conversion( # DICOMs dumped with SOP UUIDs thus differing across runs etc # So either it would need to be brought back or reconsidered altogether # (since no sample data to test on etc) - else: assure_no_file_exists(target_heuristic_filename) safe_copyfile(heuristic.filename, target_heuristic_filename) diff --git a/heudiconv/utils.py b/heudiconv/utils.py index 01e1760d..f7e92ecc 100644 --- a/heudiconv/utils.py +++ b/heudiconv/utils.py @@ -231,11 +231,11 @@ def is_deprecated_seriesid(series_id: str, len_hash_hex: int = 8) -> bool: # Check at least two '-' in the series_id if series_id.count('-') <= 1: return True - # Check if the first part of the series_id is a number + # Check the first part of the series_id is a number series_number = series_id.split('-')[0] if not series_number.isdigit(): return True - # Check if the last part of the series_id is a hash in hexadecimal format of length 8 + # Check the last part of the series_id is a hash in hexadecimal format of length len_hash_hex hash_hex = series_id.split('-')[-1] hex_digits = set(string.hexdigits) if len(hash_hex) != len_hash_hex or not all(c in hex_digits for c in hash_hex): From e9c92892f7b8cabdc1a77f55636443697e683424 Mon Sep 17 00:00:00 2001 From: Michael Dayan Date: Fri, 16 Jun 2023 18:03:15 +0200 Subject: [PATCH 6/6] Remove hardcoded reference to hash length of 8 in doc --- heudiconv/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heudiconv/utils.py b/heudiconv/utils.py index f7e92ecc..17948703 100644 --- a/heudiconv/utils.py +++ b/heudiconv/utils.py @@ -226,7 +226,7 @@ def load_json(filename: str | Path, retry: int = 0) -> Any: def is_deprecated_seriesid(series_id: str, len_hash_hex: int = 8) -> bool: """Return True if series_id is in deprecated format. A deprecated series ID is in the format "series_number-protocol_name". - A non-deprecated series ID is in the format "series_number-protocol_name-hash8hex(series_UID)" + A non-deprecated series ID is in the format "series_number-protocol_name-seriesUID_hash_hex" """ # Check at least two '-' in the series_id if series_id.count('-') <= 1: @@ -251,7 +251,7 @@ def has_deprecated_seriesid(info_file: str) -> bool: if any(is_deprecated_seriesid(series_id) for series_id in series_ids): lgr.warning( "Found deprecated series identifier in info file in the format" - "- instead of --" + "- instead of --" "The existing conversion table will be ignored." ) return True