diff --git a/MANIFEST.in b/MANIFEST.in index 4d2eeb43..2c5098f4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -6,17 +6,18 @@ exclude *.yaml exclude *.yml exclude Dockerfile exclude *.ini -exclude asv.conf.json -exclude brainglobe_workflows/cellfinder/default_config.json recursive-include brainglobe_workflows *.py +recursive-include brainglobe_workflows/configs *.json +recursive-include brainglobe_benchmarks *.py +recursive-exclude brainglobe_benchmarks/results * +include asv.conf.json recursive-exclude * __pycache__ recursive-exclude * *.py[co] global-include *.pxd -prune benchmarks prune docs prune tests prune resources diff --git a/asv.conf.json b/asv.conf.json index d620a545..8ce9490d 100644 --- a/asv.conf.json +++ b/asv.conf.json @@ -11,7 +11,8 @@ // The URL or local path of the source code repository for the // project being benchmarked - "repo": ".", + // "repo": ".", + "repo": "https://github.com/brainglobe/brainglobe-workflows", // The Python project's subdirectory in your repo. If missing or // the empty string, the project is assumed to be located at the root @@ -39,14 +40,14 @@ // List of branches to benchmark. If not provided, defaults to "master" // (for git) or "default" (for mercurial). - "branches": ["main"], // for git + "branches": ["smg/tests-refactor"], // for git // "branches": ["default"], // for mercurial // The DVCS being used. If not set, it will be automatically // determined from "repo" by looking at the protocol in the URL // (if remote), or by looking for special directories, such as // ".git" (if local). - // "dvcs": "git", + "dvcs": "git", // The tool to use to create environments. May be "conda", // "virtualenv", "mamba" (above 3.8) @@ -146,7 +147,7 @@ // The directory (relative to the current directory) that benchmarks are // stored in. 
If not provided, defaults to "benchmarks" - // "benchmark_dir": "benchmarks", + "benchmark_dir": "brainglobe_benchmarks", // The directory (relative to the current directory) to cache the Python // environments in. If not provided, defaults to "env" @@ -154,11 +155,11 @@ // The directory (relative to the current directory) that raw benchmark // results are stored in. If not provided, defaults to "results". - "results_dir": "benchmarks/results", + "results_dir": "brainglobe_benchmarks/results", // The directory (relative to the current directory) that the html tree // should be written to. If not provided, defaults to "html". - "html_dir": "benchmarks/html", + "html_dir": "brainglobe_benchmarks/html", // The number of characters to retain in the commit hashes. // "hash_length": 8, diff --git a/benchmarks/__init__.py b/brainglobe_benchmarks/__init__.py similarity index 100% rename from benchmarks/__init__.py rename to brainglobe_benchmarks/__init__.py diff --git a/benchmarks/cellfinder.py b/brainglobe_benchmarks/cellfinder.py similarity index 95% rename from benchmarks/cellfinder.py rename to brainglobe_benchmarks/cellfinder.py index 76d364bc..e471aad7 100644 --- a/benchmarks/cellfinder.py +++ b/brainglobe_benchmarks/cellfinder.py @@ -7,14 +7,12 @@ from cellfinder_core.main import main as cellfinder_run from cellfinder_core.tools.IO import read_with_dask -from brainglobe_workflows.cellfinder.cellfinder_main import ( - DEFAULT_JSON_CONFIG_PATH, +from brainglobe_workflows.cellfinder import ( CellfinderConfig, run_workflow_from_cellfinder_run, ) -from brainglobe_workflows.cellfinder.cellfinder_main import ( - setup as setup_cellfinder_workflow, -) +from brainglobe_workflows.cellfinder import setup as setup_cellfinder_workflow +from brainglobe_workflows.utils import DEFAULT_JSON_CONFIG_PATH_CELLFINDER class TimeBenchmarkPrepGIN: @@ -79,7 +77,7 @@ class TimeBenchmarkPrepGIN: min_run_count = 2 # default:2 # Custom attributes - input_config_path = 
str(DEFAULT_JSON_CONFIG_PATH) + input_config_path = str(DEFAULT_JSON_CONFIG_PATH_CELLFINDER) def setup_cache( self, @@ -114,7 +112,7 @@ def setup_cache( known_hash=config.data_hash, path=config.install_path, progressbar=True, - processor=pooch.Unzip(extract_dir=config.extract_dir_relative), + processor=pooch.Unzip(extract_dir=config.data_dir_relative), ) # Check paths to input data should now exist in config diff --git a/brainglobe_workflows/cellfinder.py b/brainglobe_workflows/cellfinder.py new file mode 100644 index 00000000..ec6dfa60 --- /dev/null +++ b/brainglobe_workflows/cellfinder.py @@ -0,0 +1,420 @@ +"""This script reproduces the most common cellfinder workflow + +It receives as an (optional) command line input the path to a configuration +json file, that holds the values of the required parameters for the workflow. + +If no input json file is passed as a configuration, the default +configuration defined at brainglobe_workflows/configs/cellfinder.json +is used. + +Example usage: + - to pass a custom configuration, run (from the cellfinder.py + parent directory): + python cellfinder.py --config path/to/input/config.json + - to use the default configuration, run + python cellfinder.py + + +""" + + +import datetime +import json +import logging +import os +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Tuple, Union + +import pooch +from brainglobe_utils.IO.cells import save_cells +from cellfinder_core.main import main as cellfinder_run +from cellfinder_core.tools.IO import read_with_dask +from cellfinder_core.train.train_yml import depth_type + +from brainglobe_workflows.utils import ( + DEFAULT_JSON_CONFIG_PATH_CELLFINDER, + config_parser, + setup_logger, +) +from brainglobe_workflows.utils import __name__ as LOGGER_NAME + +Pathlike = Union[str, os.PathLike] + + +@dataclass +class CellfinderConfig: + """ + Define input and output data locations, and the parameters for + the 
cellfinder preprocessing steps. + """ + + # input data + # data_dir_relative: parent directory to signal and background, + # relative to install path + data_dir_relative: Pathlike + signal_subdir: str + background_subdir: str + + # output + output_path_basename_relative: Pathlike + detected_cells_filename: Pathlike + + # preprocessing parameters + voxel_sizes: Tuple[float, float, float] + start_plane: int + end_plane: int + trained_model: Optional[ + os.PathLike + ] # if None, it will use a default model + model_weights: Optional[os.PathLike] + model: str + batch_size: int + n_free_cpus: int + network_voxel_sizes: Tuple[int, int, int] + soma_diameter: int + ball_xy_size: int + ball_z_size: int + ball_overlap_fraction: float + log_sigma_size: float + n_sds_above_mean_thresh: int + soma_spread_factor: float + max_cluster_size: int + cube_width: int + cube_height: int + cube_depth: int + network_depth: depth_type + + # install path (root for all inputs and outputs) + install_path: Pathlike = ".cellfinder_workflows" + + # origin of data to download (if required) + data_url: Optional[str] = None + data_hash: Optional[str] = None + + # The following attributes are added + # during the setup phase of the workflow + list_signal_files: Optional[list] = None + list_background_files: Optional[list] = None + output_path: Pathlike = "" + detected_cells_path: Pathlike = "" + signal_dir_path: Pathlike = "" + background_dir_path: Pathlike = "" + + +def read_cellfinder_config(input_config_path: Path): + """Instantiate a CellfinderConfig from the input json file + (assumes config is json serializable) + + + Parameters + ---------- + input_config_path : Path + Absolute path to a cellfinder config file + + Returns + ------- + CellfinderConfig: + The cellfinder config object, populated with data from the input + """ + # read input config + with open(input_config_path) as cfg: + config_dict = json.load(cfg) + config = CellfinderConfig(**config_dict) + + return config + + +def 
add_signal_and_background_files( + config: CellfinderConfig, +) -> CellfinderConfig: + """ + Adds the lists of input data files (signal and background) + to the config. + + These files are first searched locally. If not found, we + attempt to download them from GIN. + + Specifically: + - If both parent data directories (signal and background) exist locally, + the lists of signal and background files are added to the config. + - If exactly one of the parent data directories is missing, an error + message is logged. + - If neither of them exists, the data is retrieved from the provided GIN + repository. If no URL or hash to GIN is provided, an error is logged. + + Parameters + ---------- + config : CellfinderConfig + a cellfinder config with input data files to be validated + + Returns + ------- + config : CellfinderConfig + a cellfinder config with updated input data lists. + """ + # Fetch logger + logger = logging.getLogger(LOGGER_NAME) + + # Check if input data directories (signal and background) exist locally. 
+ # If both directories exist, get list of signal and background files + if ( + Path(config.signal_dir_path).exists() + and Path(config.background_dir_path).exists() + ): + logger.info("Fetching input data from the local directories") + + config.list_signal_files = [ + f + for f in Path(config.signal_dir_path).resolve().iterdir() + if f.is_file() + ] + config.list_background_files = [ + f + for f in Path(config.background_dir_path).resolve().iterdir() + if f.is_file() + ] + + # If exactly one of the input data directories is missing, print error + elif ( + Path(config.signal_dir_path).resolve().exists() + or Path(config.background_dir_path).resolve().exists() + ): + if not Path(config.signal_dir_path).resolve().exists(): + logger.error( + f"The directory {config.signal_dir_path} does not exist" + ) + else: + logger.error( + f"The directory {config.background_dir_path} " "does not exist" + ) + + # If neither of the input data directories exist, + # retrieve data from GIN repository and add list of files to config + else: + # Check if GIN URL and hash are defined (log error otherwise) + if config.data_url and config.data_hash: + # get list of files in GIN archive with pooch.retrieve + list_files_archive = pooch.retrieve( + url=config.data_url, + known_hash=config.data_hash, + path=config.install_path, # zip will be downloaded here + progressbar=True, + processor=pooch.Unzip( + extract_dir=config.data_dir_relative + # path to unzipped dir, + # *relative* to the path set in 'path' + ), + ) + logger.info("Fetching input data from the provided GIN repository") + + # Check signal and background parent directories exist now + assert Path(config.signal_dir_path).resolve().exists() + assert Path(config.background_dir_path).resolve().exists() + + # Add signal files to config + config.list_signal_files = [ + f + for f in list_files_archive + if f.startswith( + str(Path(config.signal_dir_path).resolve()) + ) # if str(config.signal_dir_path) in f + ] + + # Add background files 
to config + config.list_background_files = [ + f + for f in list_files_archive + if f.startswith( + str(Path(config.background_dir_path).resolve()) + ) + ] + # If URL or hash to GIN repo is not defined, log an error + else: + logger.error( + "Input data not found locally, and URL/hash to " + "GIN repository not provided" + ) + + return config + + +def setup_workflow(input_config_path: Path) -> CellfinderConfig: + """Run setup steps prior to executing the workflow + + These setup steps include: + - instantiating a CellfinderConfig object with the required parameters, + - checking if the input data exists locally, and fetching from + GIN repository otherwise, + - adding the path to the input data files to the config, and + - creating a timestamped directory for the output of the workflow if + it doesn't exist and adding its path to the config + + Parameters + ---------- + input_config_path : Path + path to the input config file + + Returns + ------- + config : CellfinderConfig + a dataclass whose attributes are the parameters + for running cellfinder. 
+ """ + + # Fetch logger + logger = logging.getLogger(LOGGER_NAME) + + # Check config file exists + assert input_config_path.exists() + + # Instantiate a CellfinderConfig from the input json file + # (assumes config is json serializable) + config = read_cellfinder_config(input_config_path) + + # Print info logs for status + logger.info(f"Input config read from {input_config_path}") + if input_config_path == DEFAULT_JSON_CONFIG_PATH_CELLFINDER: + logger.info("Using default config file") + + # Add lists of input data files to the config, + # if these are not defined yet + if not (config.list_signal_files and config.list_background_files): + # build fullpaths to input directories + config.signal_dir_path = str( + Path(config.install_path) + / config.data_dir_relative + / config.signal_subdir + ) + config.background_dir_path = str( + Path(config.install_path) + / config.data_dir_relative + / config.background_subdir + ) + + # add signal and background files to config + config = add_signal_and_background_files(config) + + # Create timestamped output directory if it doesn't exist + timestamp = datetime.datetime.now() + timestamp_formatted = timestamp.strftime("%Y%m%d_%H%M%S") + output_path_timestamped = Path(config.install_path) / ( + str(config.output_path_basename_relative) + timestamp_formatted + ) + output_path_timestamped.mkdir( + parents=True, # create any missing parents + exist_ok=True, # ignore FileExistsError exceptions + ) + + # Add output path and output file path to config + config.output_path = output_path_timestamped + config.detected_cells_path = ( + config.output_path / config.detected_cells_filename + ) + + return config + + +def setup(input_config_path: str) -> CellfinderConfig: + # setup logger + _ = setup_logger() + + # run setup steps and return config + cfg = setup_workflow(Path(input_config_path)) + + return cfg + + +def run_workflow_from_cellfinder_run(cfg: CellfinderConfig): + """ + Run workflow based on the cellfinder_core.main.main() + 
function. + + The steps are: + 1. Read the input signal and background data as two separate + Dask arrays. + 2. Run the main cellfinder pipeline on the input Dask arrays, + with the parameters defined in the input configuration (cfg). + 3. Save the detected cells as an xml file to the location specified in + the input configuration (cfg). + + Parameters + ---------- + cfg : CellfinderConfig + a dataclass whose attributes are the parameters for + running the cellfinder workflow + """ + # Read input data as Dask arrays + signal_array = read_with_dask(cfg.signal_dir_path) + background_array = read_with_dask(cfg.background_dir_path) + + # Run main analysis using `cellfinder_run` + detected_cells = cellfinder_run( + signal_array, background_array, cfg.voxel_sizes + ) + + # Save results to xml file + save_cells( + detected_cells, + cfg.detected_cells_path, + ) + + +def main( + input_config: str = str(DEFAULT_JSON_CONFIG_PATH_CELLFINDER), +) -> CellfinderConfig: + """ + Setup and run cellfinder workflow. + + This function runs the setup steps required + to run the cellfinder workflow, and the + workflow itself. Note that only the workflow + will be benchmarked. + + Parameters + ---------- + input_config : str, optional + Absolute path to input config file, + by default str(DEFAULT_JSON_CONFIG_PATH_CELLFINDER) + + Returns + ------- + cfg : CellfinderConfig + a dataclass whose attributes are the parameters for + running the cellfinder workflow + """ + # run setup + cfg = setup(input_config) + + # run workflow + run_workflow_from_cellfinder_run(cfg) # only this will be benchmarked + + return cfg + + +def main_app_wrapper(): + """ + Parse command line arguments and + run cellfinder setup and workflow + + This function is used to define an entry-point, + that allows the user to run the cellfinder workflow + for a given input config file as: + `cellfinder-workflow --config path/to/config.json`. + + If no input config file is provided, the default is used. 
+ + """ + # parse CLI arguments + args = config_parser( + sys.argv[1:], # sys.argv[0] is the script name + str(DEFAULT_JSON_CONFIG_PATH_CELLFINDER), + ) + + # run setup and workflow + _ = main(args.config) + + +if __name__ == "__main__": + main_app_wrapper() diff --git a/brainglobe_workflows/cellfinder/cellfinder_main.py b/brainglobe_workflows/cellfinder/cellfinder_main.py deleted file mode 100644 index fd19db34..00000000 --- a/brainglobe_workflows/cellfinder/cellfinder_main.py +++ /dev/null @@ -1,404 +0,0 @@ -"""This script reproduces the most common cellfinder workflow - -It receives as an (optional) command line input the path to a configuration -json file, that holds the values of the required parameters for the workflow. - -If no input json file is passed as a configuration, the default -configuration defined at brainglobe_workflows/cellfinder/default_config.json -is used. - -Example usage: - - to pass a custom configuration, run (from the cellfinder_main.py - parent directory): - python cellfinder_main.py --config path/to/input/config.json - - to use the default configuration, run - python cellfinder_main.py - - -""" - -import argparse -import datetime -import json -import logging -import os -import sys -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, Tuple, Union - -import pooch -from brainglobe_utils.IO.cells import save_cells -from cellfinder_core.main import main as cellfinder_run -from cellfinder_core.tools.IO import read_with_dask -from cellfinder_core.train.train_yml import depth_type - -Pathlike = Union[str, os.PathLike] - -DEFAULT_JSON_CONFIG_PATH = ( - Path(__file__).resolve().parent / "default_config.json" -) - - -@dataclass -class CellfinderConfig: - """ - Define input and output data locations, and the parameters for - the cellfinder preprocessing steps. 
- """ - - # cellfinder workflows cache directory - install_path: Pathlike - - # cached subdirectory to save data to - extract_dir_relative: Pathlike - signal_subdir: str - background_subdir: str - output_path_basename_relative: Pathlike - detected_cells_filename: Pathlike - - # preprocessing parameters - voxel_sizes: Tuple[float, float, float] - start_plane: int - end_plane: int - trained_model: Optional[ - os.PathLike - ] # if None, it will use a default model - model_weights: Optional[os.PathLike] - model: str - batch_size: int - n_free_cpus: int - network_voxel_sizes: Tuple[int, int, int] - soma_diameter: int - ball_xy_size: int - ball_z_size: int - ball_overlap_fraction: float - log_sigma_size: float - n_sds_above_mean_thresh: int - soma_spread_factor: float - max_cluster_size: int - cube_width: int - cube_height: int - cube_depth: int - network_depth: depth_type - - # origin of data to download (if required) - data_url: Optional[str] = None - data_hash: Optional[str] = None - - # The following attributes are added - # during the setup phase of the workflow - list_signal_files: Optional[list] = None - list_background_files: Optional[list] = None - output_path: Pathlike = "" - signal_dir_path: Pathlike = "" - background_dir_path: Pathlike = "" - detected_cells_path: Pathlike = "" - - -def setup(argv=None) -> CellfinderConfig: - def parse_cli_arguments(argv_) -> argparse.Namespace: - """Define argument parser for cellfinder - workflow script. - - It expects a path to a json file with the - parameters required to run the workflow. 
- If none is provided, the default - - Returns - ------- - args : argparse.Namespace - command line input arguments parsed - """ - # initialise argument parser - parser = argparse.ArgumentParser( - description=( - "To launch the workflow with " - "a specific set of input parameters, run: " - "`python cellfinder_main.py --config path/to/config.json`" - "where path/to/input/config.json is the json file " - "containing the workflow parameters." - ) - ) - # add arguments - parser.add_argument( - "-c", - "--config", - default=str(DEFAULT_JSON_CONFIG_PATH), - type=str, - metavar="CONFIG", # a name for usage messages - help="", - ) - - # build parser object - args = parser.parse_args(argv_) - - # print error if required arguments not provided - if not args.config: - logger.error("Paths to input config not provided.") - parser.print_help() - - return args - - def setup_logger() -> logging.Logger: - """Setup a logger for this script - - The logger's level is set to DEBUG, and it - is linked to a handler that writes to the - console and whose level is - - Returns - ------- - logging.Logger - a logger object - """ - # define handler that writes to stdout - console_handler = logging.StreamHandler(sys.stdout) - console_format = logging.Formatter( - "%(name)s %(levelname)s: %(message)s" - ) - console_handler.setFormatter(console_format) - - # define logger and link to handler - logger = logging.getLogger( - __name__ - ) # if imported as a module, the logger is named after the module - logger.setLevel(logging.DEBUG) - logger.addHandler(console_handler) - return logger - - def setup_workflow(input_config_path: Path) -> CellfinderConfig: - """Run setup steps prior to executing the workflow - - These setup steps include: - - instantiating a CellfinderConfig object with the required parameters, - - checking if the input data exists locally, and fetching from - GIN repository otherwise, - - adding the path to the input data files to the config, and - - creating a timestamped directory 
for the output of the workflow if - it doesn't exist and adding its path to the config - - Parameters - ---------- - input_config_path : Path - path to the input config file - - Returns - ------- - config : CellfinderConfig - a dataclass whose attributes are the parameters - for running cellfinder. - """ - - # Check config file exists - assert input_config_path.exists() - - # Instantiate a CellfinderConfig from the input json file - # (assumes config is json serializable) - with open(input_config_path) as cfg: - config_dict = json.load(cfg) - config = CellfinderConfig(**config_dict) - - # Print info logs for status - logger.info(f"Input config read from {input_config_path}") - if input_config_path == DEFAULT_JSON_CONFIG_PATH: - logger.info("Using default config file") - - # Retrieve and add lists of input data to the config, - # if these are defined yet - if not (config.list_signal_files and config.list_signal_files): - # build fullpaths to inputs - config.signal_dir_path = str( - Path(config.install_path) - / config.extract_dir_relative - / config.signal_subdir - ) - config.background_dir_path = str( - Path(config.install_path) - / config.extract_dir_relative - / config.background_subdir - ) - # retrieve data - config = retrieve_input_data(config) - - # Create timestamped output directory if it doesn't exist - timestamp = datetime.datetime.now() - timestamp_formatted = timestamp.strftime("%Y%m%d_%H%M%S") - output_path_timestamped = Path(config.install_path) / ( - str(config.output_path_basename_relative) + timestamp_formatted - ) - output_path_timestamped.mkdir(parents=True, exist_ok=True) - - # Add output path and output file path to config - config.output_path = output_path_timestamped - config.detected_cells_path = ( - config.output_path / config.detected_cells_filename - ) - - return config - - def retrieve_input_data(config: CellfinderConfig) -> CellfinderConfig: - """ - Adds the lists of input data files (signal and background) - to the config. 
- - It first checks if the input data exists locally. - - If both directories (signal and background) exist, the lists of - signal and background files are added to the config. - - If exactly one of the input data directories is missing, an error - message is logged. - - If neither of them exist, the data is retrieved from the provided GIN - repository. If no URL or hash to GIN is provided, an error is shown. - - Parameters - ---------- - config : CellfinderConfig - a dataclass whose attributes are the parameters - for running cellfinder. - - Returns - ------- - config : CellfinderConfig - a dataclass whose attributes are the parameters - for running cellfinder. - """ - # Check if input data (signal and background) exist locally. - # If both directories exist, get list of signal and background files - if ( - Path(config.signal_dir_path).exists() - and Path(config.background_dir_path).exists() - ): - logger.info("Fetching input data from the local directories") - - config.list_signal_files = [ - f - for f in Path(config.signal_dir_path).resolve().iterdir() - if f.is_file() - ] - config.list_background_files = [ - f - for f in Path(config.background_dir_path).resolve().iterdir() - if f.is_file() - ] - - # If exactly one of the input data directories is missing, print error - elif ( - Path(config.signal_dir_path).resolve().exists() - or Path(config.background_dir_path).resolve().exists() - ): - if not Path(config.signal_dir_path).resolve().exists(): - logger.error( - f"The directory {config.signal_dir_path} does not exist" - ) - else: - logger.error( - f"The directory {config.background_dir_path} " - "does not exist" - ) - - # If neither of them exist, retrieve data from GIN repository - else: - # check if GIN URL and hash are defined (log error otherwise) - if (not config.data_url) or (not config.data_hash): - logger.error( - "Input data not found locally, and URL/hash to " - "GIN repository not provided" - ) - - else: - # get list of files in GIN archive with 
pooch.retrieve - list_files_archive = pooch.retrieve( - url=config.data_url, - known_hash=config.data_hash, - path=config.install_path, # zip will be downloaded here - progressbar=True, - processor=pooch.Unzip( - extract_dir=config.extract_dir_relative - # path to unzipped dir, - # *relative* to the path set in 'path' - ), - ) - logger.info( - "Fetching input data from the provided GIN repository" - ) - - # Check signal and background parent directories exist now - assert Path(config.signal_dir_path).resolve().exists() - assert Path(config.background_dir_path).resolve().exists() - - # Add signal files to config - config.list_signal_files = [ - f - for f in list_files_archive - if f.startswith( - str(Path(config.signal_dir_path).resolve()) - ) # if str(config.signal_dir_path) in f - ] - - # Add background files to config - config.list_background_files = [ - f - for f in list_files_archive - if f.startswith( - str(Path(config.background_dir_path).resolve()) - ) # if str(config.background_dir_path) in f - ] - - return config - - # parse command line input arguments: - # sys.argv in most cases except for testing - # see https://paiml.com/docs/home/books/testing-in-python/chapter08-monkeypatching/#the-simplest-monkeypatching - argv = argv or sys.argv[1:] - args = parse_cli_arguments(argv) - - # setup logger - logger = setup_logger() - - # run setup steps and return config - cfg = setup_workflow(Path(args.config)) - - return cfg - - -def run_workflow_from_cellfinder_run(cfg: CellfinderConfig): - """ - Run workflow based on the cellfinder_core.main.main() - function. - - The steps are: - 1. Read the input signal and background data as two separate - Dask arrays. - 2. Run the main cellfinder pipeline on the input Dask arrays, - with the parameters defined in the input configuration (cfg). - 3. Save the detected cells as an xml file to the location specified in - the input configuration (cfg). 
- - Parameters - ---------- - cfg : CellfinderConfig - a class with the required setup methods and parameters for - the cellfinder workflow - """ - # Read input data as Dask arrays - signal_array = read_with_dask(cfg.signal_dir_path) - background_array = read_with_dask(cfg.background_dir_path) - - # Run main analysis using `cellfinder_run` - detected_cells = cellfinder_run( - signal_array, background_array, cfg.voxel_sizes - ) - - # Save results to xml file - save_cells( - detected_cells, - cfg.detected_cells_path, - ) - - -if __name__ == "__main__": - # run setup - cfg = setup() - - # run workflow - run_workflow_from_cellfinder_run(cfg) # only this will be benchmarked diff --git a/brainglobe_workflows/cellfinder/default_config.json b/brainglobe_workflows/configs/cellfinder.json similarity index 95% rename from brainglobe_workflows/cellfinder/default_config.json rename to brainglobe_workflows/configs/cellfinder.json index a80a4ba4..daf056a5 100644 --- a/brainglobe_workflows/cellfinder/default_config.json +++ b/brainglobe_workflows/configs/cellfinder.json @@ -2,7 +2,7 @@ "install_path": ".cellfinder_workflows", "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip", "data_hash": "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914", - "extract_dir_relative": "cellfinder_test_data", + "data_dir_relative": "cellfinder_test_data", "signal_subdir": "signal", "background_subdir": "background", "output_path_basename_relative": "cellfinder_output_", diff --git a/brainglobe_workflows/utils.py b/brainglobe_workflows/utils.py new file mode 100644 index 00000000..4b3bdac3 --- /dev/null +++ b/brainglobe_workflows/utils.py @@ -0,0 +1,94 @@ +import argparse +import logging +import sys +from pathlib import Path +from typing import List + +DEFAULT_JSON_CONFIGS_PATH = Path(__file__).resolve().parent / "configs" + +DEFAULT_JSON_CONFIG_PATH_CELLFINDER = ( + DEFAULT_JSON_CONFIGS_PATH / "cellfinder.json" +) + + +def 
setup_logger() -> logging.Logger: + """Setup a logger for workflow runs + + The logger's level is set to DEBUG, and it + is linked to a handler that writes to the + console. This utility function helps run + workflows, and test their logs, in a + consistent way. + + Returns + ------- + logging.Logger + a logger object configured for workflow runs + """ + # define handler that writes to stdout + console_handler = logging.StreamHandler(sys.stdout) + console_format = logging.Formatter("%(name)s %(levelname)s: %(message)s") + console_handler.setFormatter(console_format) + console_handler.set_name("console_handler") + + # define logger and link to handler + logger = logging.getLogger( + __name__ + ) # if imported as a module, the logger is named after the module + logger.setLevel(logging.DEBUG) + logger.addHandler(console_handler) + return logger + + +def config_parser( + argv: List[str], + default_config: str, +) -> argparse.Namespace: + """Define argument parser for a workflow script. + + The only CLI argument defined in the parser is + the input config file. The list of input arguments + `argv` can be an empty list. + + Both the list of input arguments and the default config to use if + no config is specified must be passed as an input to this + function. + + Parameters + ---------- + argv : List[str] + list of command line arguments to parse (can be an empty list) + default_config : str + path to the config file to use if `--config` is not passed + + Returns + ------- + args : argparse.Namespace + command line input arguments parsed + """ + + # initialise argument parser + parser = argparse.ArgumentParser( + description=( + "To launch the workflow with " + "a specific set of input parameters, run: " + "`python brainglobe_workflows/cellfinder.py " + "--config path/to/config.json`" + "where path/to/input/config.json is the json file " + "containing the workflow parameters." 
+ ) + ) + # add arguments + parser.add_argument( + "-c", + "--config", + default=default_config, + type=str, + metavar="CONFIG", # a name for usage messages + help="", + ) + + # build parser object + args = parser.parse_args(argv) + + return args diff --git a/pyproject.toml b/pyproject.toml index 9defcc92..6689ae2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,8 +8,10 @@ authors = [ ] description = "A collection of end-to-end data analysis workflows executed using BrainGlobe tools." readme = "README.md" -license = { file = "LICENSE" } +license = { file = "LICENSE" } #{text = "BSD-3-Clause"} requires-python = ">=3.9" +dynamic = ["version"] + classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", @@ -21,6 +23,7 @@ classifiers = [ "Programming Language :: Python", "Topic :: Scientific/Engineering :: Image Recognition", ] + dependencies = [ "brainreg>=1.0.0", "cellfinder-core>=0.2.4,<1.0.0", @@ -37,11 +40,9 @@ dependencies = [ "scikit-image", "tifffile", "tqdm", + "asv", + "pooch", ] -dynamic = ["version"] - -[project.scripts] -cellfinder = "brainglobe_workflows.main:main" [project.optional-dependencies] dev = [ @@ -65,10 +66,22 @@ napari = [ "Homepage" = "https://brainglobe.info" "Source Code" = "https://github.com/brainglobe/brainglobe-workflows" +[project.scripts] +cellfinder-workflow = "brainglobe_workflows.cellfinder:main_app_wrapper" +cellfinder = "brainglobe_workflows.main:main" + [build-system] requires = ["setuptools>=45", "wheel", "setuptools_scm[toml]>=6.2"] build-backend = "setuptools.build_meta" +[tool.setuptools] +include-package-data = true +zip-safe = false + +[tool.setuptools.packages.find] +include = ["brainglobe_workflows"] +exclude = ["tests", "resources"] + [tool.black] target-version = ["py39", "py310"] skip-string-normalization = false @@ -92,14 +105,6 @@ exclude = ["__init__.py", "build", ".eggs"] select = ["I", "E", "F"] fix = true -[tool.setuptools] -include-package-data = true -zip-safe = false - 
-[tool.setuptools.packages.find] -include = ["brainglobe_workflows"] -exclude = ["benchmarks", "tests", "resources"] - [tool.setuptools_scm] [tool.tox] diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..83e05553 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,98 @@ +"""Pytest fixtures shared across unit and integration tests""" + +from pathlib import Path + +import pooch +import pytest + +from brainglobe_workflows.cellfinder import read_cellfinder_config + + +@pytest.fixture() +def input_configs_dir() -> Path: + """Return the directory path to the input configs + used for testing + + Returns + ------- + Path + Test data directory path + """ + return Path(__file__).parent / "data" + + +@pytest.fixture(scope="session") +def cellfinder_GIN_data() -> dict: + """Return the URL and hash to the GIN repository with the input data + + Returns + ------- + dict + URL and hash of the GIN repository with the cellfinder test data + """ + return { + "url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip", + "hash": "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914", # noqa + } + + +@pytest.fixture() +def input_config_fetch_GIN(input_configs_dir: Path) -> Path: + """ + Return the cellfinder config json file that is configured to fetch from GIN + + Parameters + ---------- + input_configs_dir : Path + Path to the directory holding the test config files. + + Returns + ------- + Path + Path to the config json file for fetching data from GIN + """ + return input_configs_dir / "input_data_GIN.json" + + +@pytest.fixture() +def input_config_fetch_local( + input_configs_dir: Path, + cellfinder_GIN_data: dict, +) -> Path: + """ + Download the cellfinder data locally and return the config json + file configured to fetch local data. + + The data is downloaded to a directory under the current working + directory (that is, to a directory under the directory from where + pytest is launched). 
+ + Parameters + ---------- + input_configs_dir : Path + Path to the directory holding the test config files. + cellfinder_GIN_data : dict + URL and hash of the GIN repository with the cellfinder test data + + Returns + ------- + Path + Path to the config json file for fetching data locally + """ + # read local config + input_config_path = input_configs_dir / "input_data_locally.json" + config = read_cellfinder_config(input_config_path) + + # fetch data from GIN and download locally + pooch.retrieve( + url=cellfinder_GIN_data["url"], + known_hash=cellfinder_GIN_data["hash"], + path=config.install_path, # path to download zip to + progressbar=True, + processor=pooch.Unzip( + extract_dir=config.data_dir_relative + # path to unzipped dir, *relative* to 'path' + ), + ) + + return input_config_path diff --git a/tests/data/input_data_GIN.json b/tests/data/input_data_GIN.json new file mode 100644 index 00000000..daf056a5 --- /dev/null +++ b/tests/data/input_data_GIN.json @@ -0,0 +1,39 @@ +{ + "install_path": ".cellfinder_workflows", + "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip", + "data_hash": "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914", + "data_dir_relative": "cellfinder_test_data", + "signal_subdir": "signal", + "background_subdir": "background", + "output_path_basename_relative": "cellfinder_output_", + "detected_cells_filename": "detected_cells.xml", + "voxel_sizes": [ + 5, + 2, + 2 + ], + "start_plane": 0, + "end_plane": -1, + "trained_model": null, + "model_weights": null, + "model": "resnet50_tv", + "batch_size": 32, + "n_free_cpus": 2, + "network_voxel_sizes": [ + 5, + 1, + 1 + ], + "soma_diameter": 16, + "ball_xy_size": 6, + "ball_z_size": 15, + "ball_overlap_fraction": 0.6, + "log_sigma_size": 0.2, + "n_sds_above_mean_thresh": 10, + "soma_spread_factor": 1.4, + "max_cluster_size": 100000, + "cube_width": 50, + "cube_height": 50, + "cube_depth": 20, + "network_depth": "50" +} diff 
--git a/tests/data/input_data_locally.json b/tests/data/input_data_locally.json new file mode 100644 index 00000000..e3761543 --- /dev/null +++ b/tests/data/input_data_locally.json @@ -0,0 +1,37 @@ +{ + "install_path": ".cellfinder_workflows", + "data_dir_relative": "cellfinder_test_data", + "signal_subdir": "signal", + "background_subdir": "background", + "output_path_basename_relative": "cellfinder_output_", + "detected_cells_filename": "detected_cells.xml", + "voxel_sizes": [ + 5, + 2, + 2 + ], + "start_plane": 0, + "end_plane": -1, + "trained_model": null, + "model_weights": null, + "model": "resnet50_tv", + "batch_size": 32, + "n_free_cpus": 2, + "network_voxel_sizes": [ + 5, + 1, + 1 + ], + "soma_diameter": 16, + "ball_xy_size": 6, + "ball_z_size": 15, + "ball_overlap_fraction": 0.6, + "log_sigma_size": 0.2, + "n_sds_above_mean_thresh": 10, + "soma_spread_factor": 1.4, + "max_cluster_size": 100000, + "cube_width": 50, + "cube_height": 50, + "cube_depth": 20, + "network_depth": "50" +} diff --git a/tests/data/input_data_missing_background.json b/tests/data/input_data_missing_background.json new file mode 100644 index 00000000..52454f9b --- /dev/null +++ b/tests/data/input_data_missing_background.json @@ -0,0 +1,37 @@ +{ + "install_path": ".cellfinder_workflows", + "data_dir_relative": "cellfinder_test_data", + "signal_subdir": "signal", + "background_subdir": "__", + "output_path_basename_relative": "cellfinder_output_", + "detected_cells_filename": "detected_cells.xml", + "voxel_sizes": [ + 5, + 2, + 2 + ], + "start_plane": 0, + "end_plane": -1, + "trained_model": null, + "model_weights": null, + "model": "resnet50_tv", + "batch_size": 32, + "n_free_cpus": 2, + "network_voxel_sizes": [ + 5, + 1, + 1 + ], + "soma_diameter": 16, + "ball_xy_size": 6, + "ball_z_size": 15, + "ball_overlap_fraction": 0.6, + "log_sigma_size": 0.2, + "n_sds_above_mean_thresh": 10, + "soma_spread_factor": 1.4, + "max_cluster_size": 100000, + "cube_width": 50, + "cube_height": 50, + 
"cube_depth": 20, + "network_depth": "50" +} diff --git a/tests/data/input_data_missing_signal.json b/tests/data/input_data_missing_signal.json new file mode 100644 index 00000000..22c5247b --- /dev/null +++ b/tests/data/input_data_missing_signal.json @@ -0,0 +1,37 @@ +{ + "install_path": ".cellfinder_workflows", + "data_dir_relative": "cellfinder_test_data", + "signal_subdir": "__", + "background_subdir": "background", + "output_path_basename_relative": "cellfinder_output_", + "detected_cells_filename": "detected_cells.xml", + "voxel_sizes": [ + 5, + 2, + 2 + ], + "start_plane": 0, + "end_plane": -1, + "trained_model": null, + "model_weights": null, + "model": "resnet50_tv", + "batch_size": 32, + "n_free_cpus": 2, + "network_voxel_sizes": [ + 5, + 1, + 1 + ], + "soma_diameter": 16, + "ball_xy_size": 6, + "ball_z_size": 15, + "ball_overlap_fraction": 0.6, + "log_sigma_size": 0.2, + "n_sds_above_mean_thresh": 10, + "soma_spread_factor": 1.4, + "max_cluster_size": 100000, + "cube_width": 50, + "cube_height": 50, + "cube_depth": 20, + "network_depth": "50" +} diff --git a/tests/data/input_data_not_locally_or_GIN.json b/tests/data/input_data_not_locally_or_GIN.json new file mode 100644 index 00000000..e3761543 --- /dev/null +++ b/tests/data/input_data_not_locally_or_GIN.json @@ -0,0 +1,37 @@ +{ + "install_path": ".cellfinder_workflows", + "data_dir_relative": "cellfinder_test_data", + "signal_subdir": "signal", + "background_subdir": "background", + "output_path_basename_relative": "cellfinder_output_", + "detected_cells_filename": "detected_cells.xml", + "voxel_sizes": [ + 5, + 2, + 2 + ], + "start_plane": 0, + "end_plane": -1, + "trained_model": null, + "model_weights": null, + "model": "resnet50_tv", + "batch_size": 32, + "n_free_cpus": 2, + "network_voxel_sizes": [ + 5, + 1, + 1 + ], + "soma_diameter": 16, + "ball_xy_size": 6, + "ball_z_size": 15, + "ball_overlap_fraction": 0.6, + "log_sigma_size": 0.2, + "n_sds_above_mean_thresh": 10, + "soma_spread_factor": 1.4, 
+ "max_cluster_size": 100000, + "cube_width": 50, + "cube_height": 50, + "cube_depth": 20, + "network_depth": "50" +} diff --git a/brainglobe_workflows/cellfinder/__init__.py b/tests/test_integration/brainglobe_benchmarks/__init__.py similarity index 100% rename from brainglobe_workflows/cellfinder/__init__.py rename to tests/test_integration/brainglobe_benchmarks/__init__.py diff --git a/tests/test_integration/brainglobe_benchmarks/test_cellfinder.py b/tests/test_integration/brainglobe_benchmarks/test_cellfinder.py new file mode 100644 index 00000000..44d031cc --- /dev/null +++ b/tests/test_integration/brainglobe_benchmarks/test_cellfinder.py @@ -0,0 +1,100 @@ +import json +import subprocess +from pathlib import Path + +import pytest +from asv import util + + +@pytest.fixture() +def asv_config_monkeypatched_path(tmp_path: Path) -> str: + """ + Create a monkeypatched asv.conf.json file + in a Pytest-generated temporary directory + and return its path + + Parameters + ---------- + tmp_path : Path + path to pytest-generated temporary directory + + Returns + ------- + str + Path to monkeypatched asv config file + """ + # read reference asv config + asv_original_path = Path(__file__).resolve().parents[3] / "asv.conf.json" + asv_monkeypatched_dict = util.load_json( + asv_original_path, js_comments=True + ) + + # change directories + for ky in ["env_dir", "results_dir", "html_dir"]: + asv_monkeypatched_dict[ky] = str( + Path(tmp_path) / asv_monkeypatched_dict[ky] + ) + + # change repo to URL rather than local + asv_monkeypatched_dict[ + "repo" + ] = "https://github.com/brainglobe/brainglobe-workflows.git" + + # define path to a temp json file to dump config data + asv_monkeypatched_path = tmp_path / "asv.conf.json" + + # save monkeypatched config data to json file + with open(asv_monkeypatched_path, "w") as js: + json.dump(asv_monkeypatched_dict, js) + + # check json file exists + assert asv_monkeypatched_path.is_file() + + return str(asv_monkeypatched_path) + + 
+@pytest.mark.skip(reason="will be worked on a separate PR") +def test_run_benchmarks(asv_config_monkeypatched_path): + # --- ideally monkeypatch an asv config so that results are in tmp_dir? + + # set up machine (env_dir, results_dir, html_dir) + asv_machine_output = subprocess.run( + [ + "asv", + "machine", + "--yes", + "--config", + asv_config_monkeypatched_path, + ] + ) + assert asv_machine_output.returncode == 0 + + # run benchmarks + asv_benchmark_output = subprocess.run( + [ + "asv", + "run", + "--config", + asv_config_monkeypatched_path, + # "--dry-run" + # # Do not save any results to disk? not truly testing then + ], + cwd=str( + Path(asv_config_monkeypatched_path).parent + ), # run from where asv config is + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + encoding="utf-8", + ) + # STDOUT: "· Cloning project\n· Fetching recent changes\n· + # Creating environments\n· No __init__.py file in 'benchmarks'\n" + + # check returncode + assert asv_benchmark_output.returncode == 0 + + # check logs? + + # delete directories? 
+ # check teardown after yield: + # https://docs.pytest.org/en/6.2.x/fixture.html#yield-fixtures-recommended diff --git a/tests/test_integration/brainglobe_workflows/__init__.py b/tests/test_integration/brainglobe_workflows/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_integration/brainglobe_workflows/test_cellfinder.py b/tests/test_integration/brainglobe_workflows/test_cellfinder.py new file mode 100644 index 00000000..1f179b7b --- /dev/null +++ b/tests/test_integration/brainglobe_workflows/test_cellfinder.py @@ -0,0 +1,154 @@ +import subprocess +import sys +from pathlib import Path +from typing import Optional + +import pytest + +from brainglobe_workflows.cellfinder import main + + +@pytest.mark.parametrize( + "input_config", + [ + None, + "input_config_fetch_GIN", + "input_config_fetch_local", + ], +) +def test_main( + input_config: Optional[str], + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + request: pytest.FixtureRequest, +): + """Test main function for setting up and running cellfinder workflow + + Parameters + ---------- + input_config : Optional[str] + Path to input config json file + monkeypatch : pytest.MonkeyPatch + Pytest fixture to use monkeypatching utils + tmp_path : Path + Pytest fixture providing a temporary path for each test + request : pytest.FixtureRequest + Pytest fixture to enable requesting fixtures by name + """ + # monkeypatch to change current directory to + # pytest temporary directory + # (cellfinder cache directory is created in cwd) + monkeypatch.chdir(tmp_path) + + # run main + if not input_config: + cfg = main() + else: + cfg = main(str(request.getfixturevalue(input_config))) + + # check output files exist + assert Path(cfg.detected_cells_path).is_file() + + +@pytest.mark.parametrize( + "input_config", + [ + None, + "input_config_fetch_GIN", + "input_config_fetch_local", + ], +) +def test_script( + input_config: Optional[str], + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + 
request: pytest.FixtureRequest, +): + """Test running the cellfinder workflow from the command line + + Parameters + ---------- + input_config : Optional[str] + Path to input config json file + monkeypatch : pytest.MonkeyPatch + Pytest fixture to use monkeypatching utils + tmp_path : Path + Pytest fixture providing a temporary path for each test + request : pytest.FixtureRequest + Pytest fixture to enable requesting fixtures by name + """ + # monkeypatch to change current directory to + # pytest temporary directory + # (cellfinder cache directory is created in cwd) + monkeypatch.chdir(tmp_path) + + # define CLI input + script_path = ( + Path(__file__).resolve().parents[3] + / "brainglobe_workflows" + / "cellfinder.py" + ) + subprocess_input = [ + sys.executable, + str(script_path), + ] + # append config if required + if input_config: + subprocess_input.append("--config") + subprocess_input.append(str(request.getfixturevalue(input_config))) + + # run workflow script from the CLI + subprocess_output = subprocess.run( + subprocess_input, + ) + + # check returncode + assert subprocess_output.returncode == 0 + + +@pytest.mark.parametrize( + "input_config", + [ + None, + "input_config_fetch_GIN", + "input_config_fetch_local", + ], +) +def test_entry_point( + input_config: Optional[str], + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + request: pytest.FixtureRequest, +): + """Test running the cellfinder workflow via the predefined entry point + + Parameters + ---------- + input_config : Optional[str] + Path to input config json file + monkeypatch : pytest.MonkeyPatch + Pytest fixture to use monkeypatching utils + tmp_path : Path + Pytest fixture providing a temporary path for each test + request : pytest.FixtureRequest + Pytest fixture to enable requesting fixtures by name + """ + # monkeypatch to change current directory to + # pytest temporary directory + # (cellfinder cache directory is created in cwd) + monkeypatch.chdir(tmp_path) + + # define CLI input +
subprocess_input = ["cellfinder-workflow"] + # append config if required + if input_config: + subprocess_input.append("--config") + subprocess_input.append(str(request.getfixturevalue(input_config))) + + # run workflow via the entry point + subprocess_output = subprocess.run( + subprocess_input, + ) + + # check returncode + assert subprocess_output.returncode == 0 diff --git a/tests/test_integration/conftest.py b/tests/test_integration/conftest.py deleted file mode 100644 index d9207917..00000000 --- a/tests/test_integration/conftest.py +++ /dev/null @@ -1,290 +0,0 @@ -import json -from pathlib import Path -from typing import Any - -import pooch -import pytest - -from brainglobe_workflows.cellfinder.cellfinder_main import CellfinderConfig - - -def make_config_dict_fetch_from_local(cellfinder_cache_dir: Path) -> dict: - """Generate a config dictionary with the required parameters - for the workflow - - The input data is assumed to be locally at cellfinder_cache_dir. - The results are saved in a timestamped output subdirectory under - cellfinder_cache_dir - - Parameters - ---------- - cellfinder_cache_dir : Path - Path to the directory where the downloaded input data will be unzipped, - and the output will be saved - - Returns - ------- - dict - dictionary with the required parameters for the workflow - """ - return { - "install_path": cellfinder_cache_dir, - "extract_dir_relative": "cellfinder_test_data", # relative path - "signal_subdir": "signal", - "background_subdir": "background", - "output_path_basename_relative": "cellfinder_output_", - "detected_cells_filename": "detected_cells.xml", - "voxel_sizes": [5, 2, 2], # microns - "start_plane": 0, - "end_plane": -1, - "trained_model": None, # if None, it will use a default model - "model_weights": None, - "model": "resnet50_tv", - "batch_size": 32, - "n_free_cpus": 2, - "network_voxel_sizes": [5, 1, 1], - "soma_diameter": 16, - "ball_xy_size": 6, - "ball_z_size": 15, - "ball_overlap_fraction": 0.6, -
"log_sigma_size": 0.2, - "n_sds_above_mean_thresh": 10, - "soma_spread_factor": 1.4, - "max_cluster_size": 100000, - "cube_width": 50, - "cube_height": 50, - "cube_depth": 20, - "network_depth": "50", - } - - -def make_config_dict_fetch_from_GIN( - cellfinder_cache_dir: Path, - data_url: str, - data_hash: str, -) -> dict: - """Generate a config dictionary with the required parameters - for the workflow - - The input data is fetched from GIN and downloaded to cellfinder_cache_dir. - The results are also saved in a timestamped output subdirectory under - cellfinder_cache_dir - - Parameters - ---------- - cellfinder_cache_dir : Path - Path to the directory where the downloaded input data will be unzipped, - and the output will be saved - data_url: str - URL to the GIN repository with the data to download - data_hash: str - Hash of the data to download - - Returns - ------- - dict - dictionary with the required parameters for the workflow - """ - - config = make_config_dict_fetch_from_local(cellfinder_cache_dir) - config["data_url"] = data_url - config["data_hash"] = data_hash - - return config - - -def prep_json(obj: Any) -> Any: - """ - Returns a JSON encodable version of the input object. - - It uses the JSON default encoder for all objects - except those of type `Path`. - - - Parameters - ---------- - obj : Any - _description_ - - Returns - ------- - Any - JSON serializable version of input object - """ - if isinstance(obj, Path): - return str(obj) - else: - json_decoder = json.JSONEncoder() - return json_decoder.default(obj) - - -@pytest.fixture(autouse=True) -def cellfinder_cache_dir(tmp_path: Path) -> Path: - """Create a .cellfinder_workflows directory - under a temporary pytest directory and return - its path. - - The temporary directory is available via pytest's tmp_path - fixture. 
A new temporary directory is created every function call - (i.e., scope="function") - - Parameters - ---------- - tmp_path : Path - path to pytest-generated temporary directory - - Returns - ------- - Path - path to the created cellfinder_workflows cache directory - """ - - return Path(tmp_path) / ".cellfinder_workflows" - - -@pytest.fixture(scope="session") -def data_url() -> str: - """Return the URL to the GIN repository with the input data - - Returns - ------- - str - URL to the GIN repository with the input data - """ - return "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip" - - -@pytest.fixture(scope="session") -def data_hash() -> str: - """Return the hash of the GIN input data - - Returns - ------- - str - Hash to the GIN input data - """ - return "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914" - - -@pytest.fixture(scope="session") -def default_json_config_path() -> Path: - """Return the path to the json file - with the default config parameters - - Returns - ------- - Path - path to the json file with the default config parameters - """ - from brainglobe_workflows.cellfinder.cellfinder_main import ( - DEFAULT_JSON_CONFIG_PATH, - ) - - return DEFAULT_JSON_CONFIG_PATH - - -@pytest.fixture() -def path_to_config_fetch_GIN( - tmp_path: Path, cellfinder_cache_dir: Path, data_url: str, data_hash: str -) -> Path: - """Create an input config that fetches data from GIN and - return its path - - Parameters - ---------- - tmp_path : Path - path to a fresh pytest-generated temporary directory. The - generated config is saved here. - - cellfinder_cache_dir : Path - path to the cellfinder cache directory, where the paths - in the config should point to. 
- - data_url: str - URL to the GIN repository with the input data - - data_hash: str - hash to the GIN input data - - Returns - ------- - input_config_path : Path - path to config file that fetches data from GIN - """ - # create config dict - config_dict = make_config_dict_fetch_from_GIN( - cellfinder_cache_dir, data_url, data_hash - ) - - # create a temp json file to dump config data - input_config_path = ( - tmp_path / "input_config.json" - ) # save it in a temp dir separate from cellfinder_cache_dir - - # save config data to json file - with open(input_config_path, "w") as js: - json.dump(config_dict, js, default=prep_json) - - # check json file exists - assert Path(input_config_path).is_file() - - return input_config_path - - -@pytest.fixture() -def path_to_config_fetch_local( - tmp_path: Path, cellfinder_cache_dir: Path, data_url: str, data_hash: str -) -> Path: - """Create an input config that points to local data and - return its path. - - The local data is downloaded from GIN, but no reference - to the GIN repository is included in the config. - - Parameters - ---------- - tmp_path : Path - path to a fresh pytest-generated temporary directory. The - generated config is saved here. - - cellfinder_cache_dir : Path - path to the cellfinder cache directory, where the paths - in the config should point to. 
- - data_url: str - URL to the GIN repository with the input data - - data_hash: str - hash to the GIN input data - - Returns - ------- - path_to_config_fetch_GIN : Path - path to a config file that fetches data from GIN - """ - - # instantiate basic config (assumes data is local) - config_dict = make_config_dict_fetch_from_local(cellfinder_cache_dir) - config = CellfinderConfig(**config_dict) - - # download GIN data to specified local directory - pooch.retrieve( - url=data_url, - known_hash=data_hash, - path=config.install_path, # path to download zip to - progressbar=True, - processor=pooch.Unzip( - extract_dir=config.extract_dir_relative - # path to unzipped dir, *relative* to 'path' - ), - ) - - # save config to json - input_config_path = tmp_path / "input_config.json" - with open(input_config_path, "w") as js: - json.dump(config_dict, js, default=prep_json) - - # check json file exists - assert Path(input_config_path).is_file() - - return input_config_path diff --git a/tests/test_integration/test_cellfinder_workflow.py b/tests/test_integration/test_cellfinder_workflow.py deleted file mode 100644 index e55d0a46..00000000 --- a/tests/test_integration/test_cellfinder_workflow.py +++ /dev/null @@ -1,211 +0,0 @@ -import json -import subprocess -import sys -from pathlib import Path - -from brainglobe_workflows.cellfinder.cellfinder_main import CellfinderConfig - - -def test_run_with_default_config(tmp_path, default_json_config_path): - """Test workflow run with no command line arguments - - If no command line arguments are provided, the default - config at brainglobe_workflows/cellfinder/default_config.json - should be used. - - After the workflow is run we check that: - - there are no errors (via returncode), - - the logs reflect the default config file was used, and - - a single output directory exists with the expected - output file inside it - - Parameters - ---------- - tmp_path : Path - path to a pytest-generated temporary directory. 
- """ - - # run workflow with no CLI arguments, - # with cwd=tmp_path - subprocess_output = subprocess.run( - [ - sys.executable, - Path(__file__).resolve().parents[2] - / "brainglobe_workflows" - / "cellfinder" - / "cellfinder_main.py", - ], - cwd=tmp_path, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - encoding="utf-8", - ) - - # check returncode - assert subprocess_output.returncode == 0 - - # check logs - assert "Using default config file" in subprocess_output.stdout - - # Check one output directory exists and has expected - # output file inside it - assert_outputs(default_json_config_path, tmp_path) - - -def test_run_with_GIN_data( - path_to_config_fetch_GIN, -): - """Test workflow runs when passing a config that fetches data - from the GIN repository - - After the workflow is run we check that: - - there are no errors (via returncode), - - the logs reflect the input config file was used, - - the logs reflect the data was downloaded from GIN, and - - a single output directory exists with the expected - output file inside it - - Parameters - ---------- - tmp_path : Path - path to a pytest-generated temporary directory. 
- """ - # run workflow with CLI and capture log - subprocess_output = subprocess.run( - [ - sys.executable, - Path(__file__).resolve().parents[2] - / "brainglobe_workflows" - / "cellfinder" - / "cellfinder_main.py", - "--config", - str(path_to_config_fetch_GIN), - ], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - encoding="utf-8", - ) - - # check returncode - assert subprocess_output.returncode == 0 - - # check logs - assert ( - f"Input config read from {str(path_to_config_fetch_GIN)}" - in subprocess_output.stdout - ) - assert ( - "Fetching input data from the provided GIN repository" - in subprocess_output.stdout - ) - - # check one output directory exists and - # has expected output file inside it - assert_outputs(path_to_config_fetch_GIN) - - -def test_run_with_local_data( - path_to_config_fetch_local, -): - """Test workflow runs when passing a config that uses - local data - - After the workflow is run we check that: - - there are no errors (via returncode), - - the logs reflect the input config file was used, - - the logs reflect the data was found locally, and - - a single output directory exists with the expected - output file inside it - - Parameters - ---------- - tmp_path : Path - path to a pytest-generated temporary directory. 
- """ - - # run workflow with CLI - subprocess_output = subprocess.run( - [ - sys.executable, - Path(__file__).resolve().parents[2] - / "brainglobe_workflows" - / "cellfinder" - / "cellfinder_main.py", - "--config", - str(path_to_config_fetch_local), - ], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - encoding="utf-8", - ) - - # check returncode - assert subprocess_output.returncode == 0 - - # check logs - assert ( - f"Input config read from {str(path_to_config_fetch_local)}" - in subprocess_output.stdout - ) - assert ( - "Fetching input data from the local directories" - in subprocess_output.stdout - ) - - # check one output directory exists and - # has expected output file inside it - assert_outputs(path_to_config_fetch_local) - - -def assert_outputs(path_to_config, parent_dir_of_install_path=""): - """Helper function to determine whether the output is - as expected. - - It checks that: - - a single output directory exists, and - - the expected output file exists inside it - - Note that config.output_path is only defined after the workflow - setup is run, because its name is timestamped. Therefore, - we search for an output directory based on config.output_path_basename. - - Parameters - ---------- - path_to_config : Path - path to the input config used to generate the - output. - - parent_dir_of_install_path : str, optional - If the install_path in the input config is relative to the - directory the script is launched from (as is the case in the - default_config.json file), the absolute path to its parent_dir - must be specified here. If the paths to install_path is - absolute, this input is not required. By default "". 
- """ - - # load input config - with open(path_to_config) as config: - config_dict = json.load(config) - config = CellfinderConfig(**config_dict) - - # check one output directory exists and - # it has expected output file inside it - output_path_without_timestamp = ( - Path(parent_dir_of_install_path) - / config.install_path - / config.output_path_basename_relative - ) - output_path_timestamped = [ - x - for x in output_path_without_timestamp.parent.glob("*") - if x.is_dir() and x.name.startswith(output_path_without_timestamp.name) - ] - - assert len(output_path_timestamped) == 1 - assert (output_path_timestamped[0]).exists() - assert ( - output_path_timestamped[0] / config.detected_cells_filename - ).is_file() diff --git a/tests/test_unit/brainglobe_benchmarks/__init__.py b/tests/test_unit/brainglobe_benchmarks/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_unit/brainglobe_workflows/__init__.py b/tests/test_unit/brainglobe_workflows/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_unit/brainglobe_workflows/conftest.py b/tests/test_unit/brainglobe_workflows/conftest.py new file mode 100644 index 00000000..ae85bc53 --- /dev/null +++ b/tests/test_unit/brainglobe_workflows/conftest.py @@ -0,0 +1,15 @@ +import pytest + + +@pytest.fixture() +def custom_logger_name() -> str: + """Return name of custom logger created in workflow utils + + Returns + ------- + str + Name of custom logger + """ + from brainglobe_workflows.utils import __name__ as logger_name + + return logger_name diff --git a/tests/test_unit/brainglobe_workflows/test_cellfinder.py b/tests/test_unit/brainglobe_workflows/test_cellfinder.py new file mode 100644 index 00000000..ddb4c706 --- /dev/null +++ b/tests/test_unit/brainglobe_workflows/test_cellfinder.py @@ -0,0 +1,343 @@ +import json +import logging +import re +from pathlib import Path + +import pooch +import pytest + +from brainglobe_workflows.cellfinder import ( + CellfinderConfig, + 
add_signal_and_background_files, + read_cellfinder_config, + run_workflow_from_cellfinder_run, + setup_workflow, +) +from brainglobe_workflows.cellfinder import setup as setup_full +from brainglobe_workflows.utils import setup_logger + + +@pytest.fixture() +def default_input_config_cellfinder() -> Path: + """Return path to default input config for cellfinder workflow + + Returns + ------- + Path + Path to default input config + + """ + from brainglobe_workflows.utils import DEFAULT_JSON_CONFIG_PATH_CELLFINDER + + return DEFAULT_JSON_CONFIG_PATH_CELLFINDER + + +@pytest.mark.parametrize( + "input_config", + [ + "input_data_GIN.json", + "input_data_locally.json", + "input_data_missing_background.json", + "input_data_missing_signal.json", + "input_data_not_locally_or_GIN.json", + ], +) +def test_read_cellfinder_config(input_config: str, input_configs_dir: Path): + """Test for reading a cellfinder config file + + Parameters + ---------- + input_config : str + Name of input config json file + input_configs_dir : Path + Test data directory path + """ + # path to config json file + input_config_path = input_configs_dir / input_config + + # read json as Cellfinder config + config = read_cellfinder_config(input_config_path) + + # read json as dict + with open(input_config_path) as cfg: + config_dict = json.load(cfg) + + # check keys of dictionary are a subset of Cellfinder config attributes + assert all( + [ky in config.__dataclass_fields__.keys() for ky in config_dict.keys()] + ) + + +@pytest.mark.parametrize( + "input_config, message_pattern", + [ + ( + "input_data_GIN.json", + "Fetching input data from the provided GIN repository", + ), + ( + "input_data_locally.json", + "Fetching input data from the local directories", + ), + ( + "input_data_missing_background.json", + "The directory .+ does not exist$", + ), + ("input_data_missing_signal.json", "The directory .+ does not exist$"), + ( + "input_data_not_locally_or_GIN.json", + "Input data not found locally, and URL/hash 
to " + "GIN repository not provided", + ), + ], +) +def test_add_signal_and_background_files( + caplog: pytest.LogCaptureFixture, + tmp_path: Path, + cellfinder_GIN_data: dict, + input_configs_dir: Path, + input_config: str, + message_pattern: str, +): + """Test signal and background files addition to the cellfinder config + + Parameters + ---------- + caplog : pytest.LogCaptureFixture + Pytest fixture to capture the logs during testing + tmp_path : Path + Pytest fixture providing a temporary path for each test + cellfinder_GIN_data : dict + Dict holding the URL and hash of the cellfinder test data in GIN + input_configs_dir : Path + Test data directory path + input_config : str + Name of input config json file + message_pattern : str + Expected pattern in the log + """ + # instantiate our custom logger + _ = setup_logger() + + # read json as Cellfinder config + config = read_cellfinder_config(input_configs_dir / input_config) + + # monkeypatch cellfinder config: + # set install_path to pytest temporary directory + config.install_path = tmp_path / config.install_path + + # check lists of signal and background files are not defined + assert not (config.list_signal_files and config.list_background_files) + + # build fullpaths to input data directories + config.signal_dir_path = str( + Path(config.install_path) + / config.data_dir_relative + / config.signal_subdir + ) + config.background_dir_path = str( + Path(config.install_path) + / config.data_dir_relative + / config.background_subdir + ) + + # monkeypatch cellfinder config: + # if config is "local" or "signal/background missing": + # ensure signal and background data from GIN are downloaded locally + if input_config in [ + "input_data_locally.json", + "input_data_missing_signal.json", + "input_data_missing_background.json", + ]: + # fetch data from GIN and download locally + pooch.retrieve( + url=cellfinder_GIN_data["url"], + known_hash=cellfinder_GIN_data["hash"], + path=config.install_path, # path to download 
zip to + progressbar=True, + processor=pooch.Unzip( + extract_dir=config.data_dir_relative + # path to unzipped dir, *relative* to 'path' + ), + ) + + # add signal and background files lists to config + add_signal_and_background_files(config) + + # check log messages + assert len(caplog.messages) > 0 + out = re.fullmatch(message_pattern, caplog.messages[-1]) + assert out is not None + assert out.group() is not None + + +@pytest.mark.parametrize( + "input_config, message", + [ + ("default_input_config_cellfinder", "Using default config file"), + ("input_config_fetch_GIN", "Input config read from"), + ], +) +def test_setup_workflow( + input_config: str, + message: str, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + caplog: pytest.LogCaptureFixture, + request: pytest.FixtureRequest, +): + """Test setup steps for the cellfinder workflow, using the default config + and passing a specific config file. + + These setup steps include: + - instantiating a CellfinderConfig object using the input json file, + - add the signal and background files to the config if these are not + defined, + - create a timestamped directory for the output of the workflow if + it doesn't exist and add its path to the config + + Parameters + ---------- + input_config : str + Name of input config json file + message : str + Expected log message + monkeypatch : pytest.MonkeyPatch + Pytest fixture to use monkeypatching utils + tmp_path : Path + Pytest fixture providing a temporary path for each test + caplog : pytest.LogCaptureFixture + Pytest fixture to capture the logs during testing + request : pytest.FixtureRequest + Pytest fixture to enable requesting fixtures by name + """ + + # setup logger + _ = setup_logger() + + # monkeypatch to change current directory to + # pytest temporary directory + # (cellfinder cache directory is created in cwd) + monkeypatch.chdir(tmp_path) + + # setup workflow + config = setup_workflow(request.getfixturevalue(input_config)) + + # check logs + assert 
message in caplog.text + + # check all signal files exist + assert config.list_signal_files + assert all([Path(f).is_file() for f in config.list_signal_files]) + + # check all background files exist + assert config.list_background_files + assert all([Path(f).is_file() for f in config.list_background_files]) + + # check output directory exists + assert Path(config.output_path).resolve().is_dir() + + # check output directory name has correct format + out = re.fullmatch( + str(config.output_path_basename_relative) + "\\d{8}_\\d{6}$", + Path(config.output_path).stem, + ) + assert out is not None + assert out.group() is not None + + # check output file path + assert ( + Path(config.detected_cells_path) + == Path(config.output_path) / config.detected_cells_filename + ) + + +@pytest.mark.parametrize( + "input_config", + [ + "default_input_config_cellfinder", + "input_config_fetch_GIN", + "input_config_fetch_local", + ], +) +def test_setup( + input_config: str, + custom_logger_name: str, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + request: pytest.FixtureRequest, +): + """Test full setup for cellfinder workflow, using the default config + and passing a specific config file. 
+ + Parameters + ---------- + input_config : str + Path to input config file + custom_logger_name : str + Name of custom logger + monkeypatch : MonkeyPatch + Pytest fixture to use monkeypatching utils + tmp_path : Path + Pytest fixture providing a temporary path for each test + request : pytest.FixtureRequest + Pytest fixture to enable requesting fixtures by name + """ + # Monkeypatch to change current directory to + # pytest temporary directory + # (cellfinder cache directory is created in cwd) + monkeypatch.chdir(tmp_path) + + # run setup on default configuration + cfg = setup_full(request.getfixturevalue(input_config)) + + # check logger exists + logger = logging.getLogger(custom_logger_name) + assert logger.level == logging.DEBUG + assert logger.hasHandlers() + + # check config is CellfinderConfig + assert isinstance(cfg, CellfinderConfig) + + +@pytest.mark.parametrize( + "input_config", + [ + "default_input_config_cellfinder", + "input_config_fetch_GIN", + "input_config_fetch_local", + ], +) +def test_run_workflow_from_cellfinder_run( + input_config: str, + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + request: pytest.FixtureRequest, +): + """Test running cellfinder workflow with default input config + (fetches data from GIN) and local input config + + Parameters + ---------- + input_config : str + Path to input config json file + monkeypatch : MonkeyPatch + Pytest fixture to use monkeypatching utils + tmp_path : Path + Pytest fixture providing a temporary path for each test + request : pytest.FixtureRequest + Pytest fixture to enable requesting fixtures by name + """ + # monkeypatch to change current directory to + # pytest temporary directory + # (cellfinder cache directory is created in cwd) + monkeypatch.chdir(tmp_path) + + # run setup + cfg = setup_full(str(request.getfixturevalue(input_config))) + + # run workflow + run_workflow_from_cellfinder_run(cfg) + + # check output files are those expected? 
+ assert Path(cfg.detected_cells_path).is_file() diff --git a/tests/test_unit/brainglobe_workflows/test_utils.py b/tests/test_unit/brainglobe_workflows/test_utils.py new file mode 100644 index 00000000..2ec8d19e --- /dev/null +++ b/tests/test_unit/brainglobe_workflows/test_utils.py @@ -0,0 +1,39 @@ +import logging +from typing import List + +import pytest + +from brainglobe_workflows.utils import ( + DEFAULT_JSON_CONFIG_PATH_CELLFINDER, + config_parser, + setup_logger, +) + + +def test_setup_logger(custom_logger_name: str): + """Test custom logger is correctly created + + Parameters + ---------- + custom_logger_name : str + Pytest fixture for the custom logger name + """ + logger = setup_logger() + + assert logger.level == logging.DEBUG + assert logger.name == custom_logger_name + assert logger.hasHandlers() + assert logger.handlers[0].name == "console_handler" + + +@pytest.mark.parametrize( + "list_input_args", + [[], ["--config", str(DEFAULT_JSON_CONFIG_PATH_CELLFINDER)]], +) +def test_config_parser(list_input_args: List[str]): + args = config_parser( + list_input_args, + str(DEFAULT_JSON_CONFIG_PATH_CELLFINDER), + ) + + assert args.config diff --git a/tests/test_unit/test_placeholder.py b/tests/test_unit/test_placeholder.py deleted file mode 100644 index 3ada1ee4..00000000 --- a/tests/test_unit/test_placeholder.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_placeholder(): - assert True