From 7c72fb68dfc0991a24d1a098df4cdfda5ffbf898 Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 2 Oct 2024 15:43:59 +1000 Subject: [PATCH 1/7] Move the main validate function to entrypoint.py and change validate.py to validator.py --- pyshacl/__init__.py | 3 +- pyshacl/entrypoints.py | 236 +++++++++++++++++++ pyshacl/{validate.py => validator.py} | 311 ++++---------------------- pyshacl/validator_conformance.py | 2 +- 4 files changed, 283 insertions(+), 269 deletions(-) create mode 100644 pyshacl/entrypoints.py rename pyshacl/{validate.py => validator.py} (65%) diff --git a/pyshacl/__init__.py b/pyshacl/__init__.py index 7db1108..031c094 100644 --- a/pyshacl/__init__.py +++ b/pyshacl/__init__.py @@ -1,8 +1,9 @@ # -*- coding: latin-1 -*- # +from .entrypoints import validate from .shape import Shape from .shapes_graph import ShapesGraph -from .validate import Validator, validate +from .validator import Validator # version compliant with https://www.python.org/dev/peps/pep-0440/ __version__ = '0.26.0' diff --git a/pyshacl/entrypoints.py b/pyshacl/entrypoints.py new file mode 100644 index 0000000..6a354d1 --- /dev/null +++ b/pyshacl/entrypoints.py @@ -0,0 +1,236 @@ +import logging +import os +import sys +from functools import wraps +from io import BufferedIOBase, TextIOBase +from sys import stderr +from typing import List, Optional, Tuple, Union + +from rdflib import Dataset, Graph, URIRef + +from pyshacl.errors import ReportableRuntimeError, ValidationFailure +from pyshacl.pytypes import GraphLike + +from .monkey import apply_patches, rdflib_bool_patch, rdflib_bool_unpatch +from .rdfutil import load_from_source +from .validator import Validator, assign_baked_in +from .validator_conformance import check_dash_result + + +def validate( + data_graph: Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes], + *args, + shacl_graph: Optional[Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes]] = None, + ont_graph: Optional[Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes]] = None, + advanced: Optional[bool] = False, + inference: Optional[str] = None, + inplace: Optional[bool] = False, + abort_on_first: Optional[bool] = False, + allow_infos: Optional[bool] = False, + allow_warnings: Optional[bool] = False, + max_validation_depth: Optional[int] = None, + sparql_mode: Optional[bool] = False, + focus_nodes: Optional[List[Union[str, URIRef]]] = None, + use_shapes: Optional[List[Union[str, URIRef]]] = None, + **kwargs, +): + """ + :param data_graph: rdflib.Graph or file path or web url of the data to validate + :type data_graph: rdflib.Graph | str | bytes + :param args: + :type args: list + :param shacl_graph: rdflib.Graph or file path or web url of the SHACL Shapes graph to use to + validate the data graph + :type shacl_graph: rdflib.Graph | str | bytes + :param ont_graph: rdflib.Graph or file path or web url of an extra ontology document to mix into the data graph + :type ont_graph: rdflib.Graph | str | bytes + :param advanced: Enable advanced SHACL features, default=False + :type advanced: bool | None + :param inference: One of "rdfs", "owlrl", "both", "none", or None + :type inference: str | None + :param inplace: If this is enabled, do not clone the datagraph, manipulate it in-place + :type inplace: bool + :param abort_on_first: Stop evaluating constraints after first violation is found + :type abort_on_first: bool | None + :param allow_infos: Shapes marked with severity of sh:Info will not cause result to be invalid. + :type allow_infos: bool | None + :param allow_warnings: Shapes marked with severity of sh:Warning or sh:Info will not cause result to be invalid. + :type allow_warnings: bool | None + :param max_validation_depth: The maximum number of SHACL shapes "deep" that the validator can go before reaching an "endpoint" constraint. + :type max_validation_depth: int | None + :param sparql_mode: Treat the DataGraph as a SPARQL endpoint, validate the graph at the SPARQL endpoint. + :type sparql_mode: bool | None + :param focus_nodes: A list of IRIs to validate only those nodes. + :type focus_nodes: list | None + :param use_shapes: A list of IRIs to use only those shapes from the SHACL ShapesGraph. + :type use_shapes: list | None + :param kwargs: + :return: + """ + + do_debug = kwargs.get('debug', False) + log = make_default_logger(name="pyshacl-validate", debug=do_debug) + apply_patches() + assign_baked_in() + do_check_dash_result = kwargs.pop('check_dash_result', False) # type: bool + if kwargs.get('meta_shacl', False): + to_meta_val = shacl_graph or data_graph + conforms, v_r, v_t = meta_validate(to_meta_val, inference=inference, **kwargs) + if not conforms: + msg = f"SHACL File does not validate against the SHACL Shapes SHACL (MetaSHACL) file.\n{v_t}" + log.error(msg) + raise ReportableRuntimeError(msg) + do_owl_imports = kwargs.pop('do_owl_imports', False) + data_graph_format = kwargs.pop('data_graph_format', None) + + if isinstance(data_graph, (str, bytes, BufferedIOBase, TextIOBase)): + # DataGraph is passed in as Text. It is not an rdflib.Graph + # That means we load it into an ephemeral graph at runtime + # that means we don't need to make a copy to prevent polluting it. + ephemeral = True + else: + ephemeral = False + use_js = kwargs.pop('js', None) + if sparql_mode: + if use_js: + raise ReportableRuntimeError("Cannot use SHACL-JS in SPARQL Remote Graph Mode.") + if inplace: + raise ReportableRuntimeError("Cannot use inplace mode in SPARQL Remote Graph Mode.") + if ont_graph is not None: + raise ReportableRuntimeError("Cannot use SPARQL Remote Graph Mode with extra Ontology Graph inoculation.") + if isinstance(data_graph, bytes): + data_graph = data_graph.decode('utf-8') + else: + data_graph = data_graph + ephemeral = False + inplace = True + if ( + sparql_mode + and isinstance(data_graph, str) + and (data_graph.lower().startswith("http:") or data_graph.lower().startswith("https:")) + ): + from rdflib.plugins.stores.sparqlstore import SPARQLStore + + query_endpoint: str = data_graph + username = os.getenv("PYSHACL_SPARQL_USERNAME", "") + method = os.getenv("PYSHACL_SPARQL_METHOD", "GET") + auth: Optional[Tuple[str, str]] + if username: + password: str = os.getenv("PYSHACL_SPARQL_PASSWORD", "") + auth = (username, password) + else: + auth = None + store = SPARQLStore(query_endpoint=query_endpoint, auth=auth, method=method) + loaded_dg = Dataset(store=store, default_union=True) + else: + # force no owl imports on data_graph + loaded_dg = load_from_source( + data_graph, rdf_format=data_graph_format, multigraph=True, do_owl_imports=False, logger=log + ) + ont_graph_format = kwargs.pop('ont_graph_format', None) + if ont_graph is not None: + loaded_og = load_from_source( + ont_graph, rdf_format=ont_graph_format, multigraph=True, do_owl_imports=do_owl_imports, logger=log + ) + else: + loaded_og = None + shacl_graph_format = kwargs.pop('shacl_graph_format', None) + if shacl_graph is not None: + rdflib_bool_patch() + loaded_sg = load_from_source( + shacl_graph, rdf_format=shacl_graph_format, multigraph=True, do_owl_imports=do_owl_imports, logger=log + ) + rdflib_bool_unpatch() + else: + loaded_sg = None + iterate_rules = kwargs.pop('iterate_rules', False) + if "abort_on_error" in kwargs: + log.warning("Usage of abort_on_error is deprecated. Use abort_on_first instead.") + ae = kwargs.pop("abort_on_error") + abort_on_first = bool(abort_on_first) or bool(ae) + validator_options_dict = { + 'debug': do_debug or False, + 'inference': inference, + 'inplace': inplace or ephemeral, + 'abort_on_first': abort_on_first, + 'allow_infos': allow_infos, + 'allow_warnings': allow_warnings, + 'advanced': advanced, + 'iterate_rules': iterate_rules, + 'use_js': use_js, + 'sparql_mode': sparql_mode, + 'logger': log, + 'focus_nodes': focus_nodes, + 'use_shapes': use_shapes, + } + if max_validation_depth is not None: + validator_options_dict['max_validation_depth'] = max_validation_depth + validator = None + try: + validator = Validator( + loaded_dg, + shacl_graph=loaded_sg, + ont_graph=loaded_og, + options=validator_options_dict, + ) + conforms, report_graph, report_text = validator.run() + except ValidationFailure as e: + conforms = False + report_graph = e + report_text = "Validation Failure - {}".format(e.message) + if do_check_dash_result and validator is not None: + passes = check_dash_result(validator, report_graph, loaded_sg or loaded_dg) + return passes, report_graph, report_text + do_serialize_report_graph = kwargs.pop('serialize_report_graph', False) + if do_serialize_report_graph and isinstance(report_graph, Graph): + if not (isinstance(do_serialize_report_graph, str)): + do_serialize_report_graph = 'turtle' + report_graph = report_graph.serialize(None, encoding='utf-8', format=do_serialize_report_graph) + return conforms, report_graph, report_text + + +def with_metashacl_shacl_graph_cache(f): + # noinspection PyPep8Naming + EMPTY = object() + + @wraps(f) + def wrapped(*args, **kwargs): + graph_cache = getattr(wrapped, "graph_cache", None) + assert graph_cache is not None + if graph_cache is EMPTY: + import pickle + + if getattr(sys, 'frozen', False): + # runs in a pyinstaller bundle + here_dir = sys._MEIPASS + else: + here_dir = os.path.dirname(__file__) + pickle_file = os.path.join(here_dir, "assets", "shacl-shacl.pickle") + with open(pickle_file, 'rb') as shacl_pickle: + u = pickle.Unpickler(shacl_pickle, fix_imports=False) + shacl_shacl_store, identifier = u.load() + shacl_shacl_graph = Graph(store=shacl_shacl_store, identifier=identifier) + setattr(wrapped, "graph_cache", shacl_shacl_graph) + return f(*args, **kwargs) + + setattr(wrapped, "graph_cache", EMPTY) + return wrapped + + +@with_metashacl_shacl_graph_cache +def meta_validate(shacl_graph: Union[GraphLike, str], inference: Optional[str] = 'rdfs', **kwargs): + shacl_shacl_graph = meta_validate.graph_cache + shacl_graph = load_from_source(shacl_graph, rdf_format=kwargs.pop('shacl_graph_format', None), multigraph=True) + _ = kwargs.pop('meta_shacl', None) + return validate(shacl_graph, shacl_graph=shacl_shacl_graph, inference=inference, **kwargs) + + +def make_default_logger(name: Union[str, None] = None, debug: bool = False) -> logging.Logger: + log_handler = logging.StreamHandler(stderr) + log = logging.getLogger() + for h in log.handlers: + log.removeHandler(h) # pragma:no cover + log.addHandler(log_handler) + log.setLevel(logging.INFO if not debug else logging.DEBUG) + log_handler.setLevel(logging.INFO if not debug else logging.DEBUG) + return log diff --git a/pyshacl/validate.py b/pyshacl/validator.py similarity index 65% rename from pyshacl/validate.py rename to pyshacl/validator.py index 20c4793..a259c0c 100644 --- a/pyshacl/validate.py +++ b/pyshacl/validator.py @@ -1,13 +1,9 @@ # -*- coding: utf-8 -*- # import logging -import os import sys -from functools import wraps -from io import BufferedIOBase, TextIOBase from os import getenv, path -from sys import stderr -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import rdflib from rdflib import BNode, Literal, URIRef @@ -19,10 +15,9 @@ SH_ValidationReport, env_truths, ) -from .errors import ReportableRuntimeError, ValidationFailure +from .errors import ReportableRuntimeError from .extras import check_extra_installed from .functions import apply_functions, gather_functions, unapply_functions -from .monkey import apply_patches, rdflib_bool_patch, rdflib_bool_unpatch from .pytypes import GraphLike, SHACLExecutor from .rdfutil import ( add_baked_in, @@ -30,27 +25,59 @@ clone_graph, inoculate, inoculate_dataset, - load_from_source, mix_datasets, mix_graphs, ) from .rules import apply_rules, gather_rules from .shapes_graph import ShapesGraph from .target import apply_target_types, gather_target_types -from .validator_conformance import check_dash_result USE_FULL_MIXIN = getenv("PYSHACL_USE_FULL_MIXIN") in env_truths -log_handler = logging.StreamHandler(stderr) -log = logging.getLogger(__name__) -for h in log.handlers: - log.removeHandler(h) # pragma:no cover -log.addHandler(log_handler) -log.setLevel(logging.INFO) -log_handler.setLevel(logging.INFO) - class Validator(object): + def __init__( + self, + data_graph: GraphLike, + *args, + shacl_graph: Optional[GraphLike] = None, + ont_graph: Optional[GraphLike] = None, + options: Optional[Dict[str, Any]] = None, + **kwargs, + ): + options = options or {} + self._load_default_options(options) + self.options = options # type: dict + self.logger = options['logger'] # type: logging.Logger + self.debug = options['debug'] + self.pre_inferenced = kwargs.pop('pre_inferenced', False) + self.inplace = options['inplace'] + if not isinstance(data_graph, rdflib.Graph): + raise RuntimeError("data_graph must be a rdflib Graph object") + self.data_graph = data_graph # type: GraphLike + self._target_graph = None + self.ont_graph = ont_graph # type: Optional[GraphLike] + self.data_graph_is_multigraph = isinstance(self.data_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)) + if self.ont_graph is not None and isinstance(self.ont_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): + self.ont_graph.default_union = True + if self.ont_graph is not None and options['sparql_mode']: + raise ReportableRuntimeError("Cannot use SPARQL Remote Graph Mode with extra Ontology Graph inoculation.") + if shacl_graph is None: + if options['sparql_mode']: + raise ReportableRuntimeError( + "SHACL Shapes Graph must be a separate local graph or file when in SPARQL Remote Graph Mode." + ) + shacl_graph = clone_graph(data_graph, identifier='shacl') + assert isinstance(shacl_graph, rdflib.Graph), "shacl_graph must be a rdflib Graph object" + self.shacl_graph = ShapesGraph(shacl_graph, self.debug, self.logger) # type: ShapesGraph + + if options['use_js']: + if options['sparql_mode']: + raise ReportableRuntimeError("Cannot use SHACL-JS in SPARQL Remote Graph Mode.") + is_js_installed = check_extra_installed('js') + if is_js_installed: + self.shacl_graph.enable_js() + @classmethod def _load_default_options(cls, options_dict: dict): options_dict.setdefault('debug', False) @@ -163,48 +190,6 @@ def create_validation_report(cls, sg, conforms: bool, results: List[Tuple]): vg.add((s, p, o)) return vg, v_text - def __init__( - self, - data_graph: GraphLike, - *args, - shacl_graph: Optional[GraphLike] = None, - ont_graph: Optional[GraphLike] = None, - options: Optional[dict] = None, - **kwargs, - ): - options = options or {} - self._load_default_options(options) - self.options = options # type: dict - self.logger = options['logger'] # type: logging.Logger - self.debug = options['debug'] - self.pre_inferenced = kwargs.pop('pre_inferenced', False) - self.inplace = options['inplace'] - if not isinstance(data_graph, rdflib.Graph): - raise RuntimeError("data_graph must be a rdflib Graph object") - self.data_graph = data_graph # type: GraphLike - self._target_graph = None - self.ont_graph = ont_graph # type: Optional[GraphLike] - self.data_graph_is_multigraph = isinstance(self.data_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)) - if self.ont_graph is not None and isinstance(self.ont_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): - self.ont_graph.default_union = True - if self.ont_graph is not None and options['sparql_mode']: - raise ReportableRuntimeError("Cannot use SPARQL Remote Graph Mode with extra Ontology Graph inoculation.") - if shacl_graph is None: - if options['sparql_mode']: - raise ReportableRuntimeError( - "SHACL Shapes Graph must be a separate local graph or file when in SPARQL Remote Graph Mode." - ) - shacl_graph = clone_graph(data_graph, identifier='shacl') - assert isinstance(shacl_graph, rdflib.Graph), "shacl_graph must be a rdflib Graph object" - self.shacl_graph = ShapesGraph(shacl_graph, self.debug, self.logger) # type: ShapesGraph - - if options['use_js']: - if options['sparql_mode']: - raise ReportableRuntimeError("Cannot use SHACL-JS in SPARQL Remote Graph Mode.") - is_js_installed = check_extra_installed('js') - if is_js_installed: - self.shacl_graph.enable_js() - @property def target_graph(self): return self._target_graph @@ -412,211 +397,3 @@ def assign_baked_in(): add_baked_in("http://datashapes.org/schema", schema_file) add_baked_in("https://datashapes.org/schema", schema_file) add_baked_in("http://datashapes.org/schema.ttl", schema_file) - - -def with_metashacl_shacl_graph_cache(f): - # noinspection PyPep8Naming - EMPTY = object() - - @wraps(f) - def wrapped(*args, **kwargs): - graph_cache = getattr(wrapped, "graph_cache", None) - assert graph_cache is not None - if graph_cache is EMPTY: - import pickle - - if getattr(sys, 'frozen', False): - # runs in a pyinstaller bundle - here_dir = sys._MEIPASS - else: - here_dir = path.dirname(__file__) - pickle_file = path.join(here_dir, "assets", "shacl-shacl.pickle") - with open(pickle_file, 'rb') as shacl_pickle: - u = pickle.Unpickler(shacl_pickle, fix_imports=False) - shacl_shacl_store, identifier = u.load() - shacl_shacl_graph = rdflib.Graph(store=shacl_shacl_store, identifier=identifier) - setattr(wrapped, "graph_cache", shacl_shacl_graph) - return f(*args, **kwargs) - - setattr(wrapped, "graph_cache", EMPTY) - return wrapped - - -@with_metashacl_shacl_graph_cache -def meta_validate(shacl_graph: Union[GraphLike, str], inference: Optional[str] = 'rdfs', **kwargs): - shacl_shacl_graph = meta_validate.graph_cache - shacl_graph = load_from_source(shacl_graph, rdf_format=kwargs.pop('shacl_graph_format', None), multigraph=True) - _ = kwargs.pop('meta_shacl', None) - return validate(shacl_graph, shacl_graph=shacl_shacl_graph, inference=inference, **kwargs) - - -def validate( - data_graph: Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes], - *args, - shacl_graph: Optional[Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes]] = None, - ont_graph: Optional[Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes]] = None, - advanced: Optional[bool] = False, - inference: Optional[str] = None, - inplace: Optional[bool] = False, - abort_on_first: Optional[bool] = False, - allow_infos: Optional[bool] = False, - allow_warnings: Optional[bool] = False, - max_validation_depth: Optional[int] = None, - sparql_mode: Optional[bool] = False, - focus_nodes: Optional[List[Union[str, URIRef]]] = None, - use_shapes: Optional[List[Union[str, URIRef]]] = None, - **kwargs, -): - """ - :param data_graph: rdflib.Graph or file path or web url of the data to validate - :type data_graph: rdflib.Graph | str | bytes - :param args: - :type args: list - :param shacl_graph: rdflib.Graph or file path or web url of the SHACL Shapes graph to use to - validate the data graph - :type shacl_graph: rdflib.Graph | str | bytes - :param ont_graph: rdflib.Graph or file path or web url of an extra ontology document to mix into the data graph - :type ont_graph: rdflib.Graph | str | bytes - :param advanced: Enable advanced SHACL features, default=False - :type advanced: bool | None - :param inference: One of "rdfs", "owlrl", "both", "none", or None - :type inference: str | None - :param inplace: If this is enabled, do not clone the datagraph, manipulate it in-place - :type inplace: bool - :param abort_on_first: Stop evaluating constraints after first violation is found - :type abort_on_first: bool | None - :param allow_infos: Shapes marked with severity of sh:Info will not cause result to be invalid. - :type allow_infos: bool | None - :param allow_warnings: Shapes marked with severity of sh:Warning or sh:Info will not cause result to be invalid. - :type allow_warnings: bool | None - :param max_validation_depth: The maximum number of SHACL shapes "deep" that the validator can go before reaching an "endpoint" constraint. - :type max_validation_depth: int | None - :param sparql_mode: Treat the DataGraph as a SPARQL endpoint, validate the graph at the SPARQL endpoint. - :type sparql_mode: bool | None - :param focus_nodes: A list of IRIs to validate only those nodes. - :type focus_nodes: list | None - :param kwargs: - :return: - """ - - do_debug = kwargs.get('debug', False) - if do_debug: - log_handler.setLevel(logging.DEBUG) - log.setLevel(logging.DEBUG) - apply_patches() - assign_baked_in() - do_check_dash_result = kwargs.pop('check_dash_result', False) # type: bool - if kwargs.get('meta_shacl', False): - to_meta_val = shacl_graph or data_graph - conforms, v_r, v_t = meta_validate(to_meta_val, inference=inference, **kwargs) - if not conforms: - msg = f"SHACL File does not validate against the SHACL Shapes SHACL (MetaSHACL) file.\n{v_t}" - log.error(msg) - raise ReportableRuntimeError(msg) - do_owl_imports = kwargs.pop('do_owl_imports', False) - data_graph_format = kwargs.pop('data_graph_format', None) - - if isinstance(data_graph, (str, bytes, BufferedIOBase, TextIOBase)): - # DataGraph is passed in as Text. It is not an rdflib.Graph - # That means we load it into an ephemeral graph at runtime - # that means we don't need to make a copy to prevent polluting it. - ephemeral = True - else: - ephemeral = False - use_js = kwargs.pop('js', None) - if sparql_mode: - if use_js: - raise ReportableRuntimeError("Cannot use SHACL-JS in SPARQL Remote Graph Mode.") - if inplace: - raise ReportableRuntimeError("Cannot use inplace mode in SPARQL Remote Graph Mode.") - if ont_graph is not None: - raise ReportableRuntimeError("Cannot use SPARQL Remote Graph Mode with extra Ontology Graph inoculation.") - if isinstance(data_graph, bytes): - data_graph = data_graph.decode('utf-8') - else: - data_graph = data_graph - ephemeral = False - inplace = True - if ( - sparql_mode - and isinstance(data_graph, str) - and (data_graph.lower().startswith("http:") or data_graph.lower().startswith("https:")) - ): - from rdflib.plugins.stores.sparqlstore import SPARQLStore - - query_endpoint: str = data_graph - username = os.getenv("PYSHACL_SPARQL_USERNAME", "") - method = os.getenv("PYSHACL_SPARQL_METHOD", "GET") - auth: Optional[Tuple[str, str]] - if username: - password: str = os.getenv("PYSHACL_SPARQL_PASSWORD", "") - auth = (username, password) - else: - auth = None - store = SPARQLStore(query_endpoint=query_endpoint, auth=auth, method=method) - loaded_dg = rdflib.Dataset(store=store, default_union=True) - else: - # force no owl imports on data_graph - loaded_dg = load_from_source( - data_graph, rdf_format=data_graph_format, multigraph=True, do_owl_imports=False, logger=log - ) - ont_graph_format = kwargs.pop('ont_graph_format', None) - if ont_graph is not None: - loaded_og = load_from_source( - ont_graph, rdf_format=ont_graph_format, multigraph=True, do_owl_imports=do_owl_imports, logger=log - ) - else: - loaded_og = None - shacl_graph_format = kwargs.pop('shacl_graph_format', None) - if shacl_graph is not None: - rdflib_bool_patch() - loaded_sg = load_from_source( - shacl_graph, rdf_format=shacl_graph_format, multigraph=True, do_owl_imports=do_owl_imports, logger=log - ) - rdflib_bool_unpatch() - else: - loaded_sg = None - iterate_rules = kwargs.pop('iterate_rules', False) - if "abort_on_error" in kwargs: - log.warning("Usage of abort_on_error is deprecated. Use abort_on_first instead.") - ae = kwargs.pop("abort_on_error") - abort_on_first = bool(abort_on_first) or bool(ae) - validator_options_dict = { - 'debug': do_debug or False, - 'inference': inference, - 'inplace': inplace or ephemeral, - 'abort_on_first': abort_on_first, - 'allow_infos': allow_infos, - 'allow_warnings': allow_warnings, - 'advanced': advanced, - 'iterate_rules': iterate_rules, - 'use_js': use_js, - 'sparql_mode': sparql_mode, - 'logger': log, - 'focus_nodes': focus_nodes, - 'use_shapes': use_shapes, - } - if max_validation_depth is not None: - validator_options_dict['max_validation_depth'] = max_validation_depth - validator = None - try: - validator = Validator( - loaded_dg, - shacl_graph=loaded_sg, - ont_graph=loaded_og, - options=validator_options_dict, - ) - conforms, report_graph, report_text = validator.run() - except ValidationFailure as e: - conforms = False - report_graph = e - report_text = "Validation Failure - {}".format(e.message) - if do_check_dash_result and validator is not None: - passes = check_dash_result(validator, report_graph, loaded_sg or loaded_dg) - return passes, report_graph, report_text - do_serialize_report_graph = kwargs.pop('serialize_report_graph', False) - if do_serialize_report_graph and isinstance(report_graph, rdflib.Graph): - if not (isinstance(do_serialize_report_graph, str)): - do_serialize_report_graph = 'turtle' - report_graph = report_graph.serialize(None, encoding='utf-8', format=do_serialize_report_graph) - return conforms, report_graph, report_text diff --git a/pyshacl/validator_conformance.py b/pyshacl/validator_conformance.py index d788b20..6d230e2 100644 --- a/pyshacl/validator_conformance.py +++ b/pyshacl/validator_conformance.py @@ -26,7 +26,7 @@ from pyshacl.rdfutil import compare_blank_node, compare_node, order_graph_literal, stringify_node if TYPE_CHECKING: - from pyshacl.validate import Validator + from pyshacl.validator import Validator def clean_validation_reports(actual_graph, actual_report, expected_graph, expected_report): From 5e1237284f5d018a60332154d120f757d05b0e72 Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 2 Oct 2024 18:59:03 +1000 Subject: [PATCH 2/7] SHACL Rules Expander mode. Refactor Validator as a PySHACLRunnerType Add Rules Expander mode as a new PySHACLRunnerType Add a CLI script pyshacl_rules for Rules Expander mode Add tests for SHACL Rules Expander mode Add tests for pyshacl_rules cli executable --- Dockerfile | 2 +- pyproject.toml | 1 + pyshacl/__init__.py | 5 +- pyshacl/__main__.py | 13 +- pyshacl/cli.py | 12 + pyshacl/cli_rules.py | 316 +++++++++++++++++++++++ pyshacl/entrypoints.py | 109 +++++++- pyshacl/rule_expand_runner.py | 291 +++++++++++++++++++++ pyshacl/run_type.py | 9 + pyshacl/validator.py | 3 +- test/resources/cmdline_tests/rules_d.ttl | 20 ++ test/resources/cmdline_tests/rules_s.ttl | 96 +++++++ test/test_cmdline.py | 12 +- test/test_cmdline_rules.py | 109 ++++++++ test/test_shacl_rules_runner.py | 147 +++++++++++ 15 files changed, 1129 insertions(+), 16 deletions(-) create mode 100644 pyshacl/cli_rules.py create mode 100644 pyshacl/rule_expand_runner.py create mode 100644 pyshacl/run_type.py create mode 100644 test/resources/cmdline_tests/rules_d.ttl create mode 100644 test/resources/cmdline_tests/rules_s.ttl create mode 100644 test/test_cmdline_rules.py create mode 100644 test/test_shacl_rules_runner.py diff --git a/Dockerfile b/Dockerfile index 0c74b19..246d826 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ COPY . . RUN chown -R pyshacl:pyshacl /home/pyshacl /app && chmod -R 775 /home/pyshacl /app USER pyshacl ENV PATH="/home/pyshacl/.local/bin:$PATH" -RUN pip3 install "poetry>=1.5.0,<2.0" +RUN pip3 install "poetry>=1.8.3,<2.0" RUN poetry install --no-dev --extras "js http" USER root RUN apk del build-dependencies diff --git a/pyproject.toml b/pyproject.toml index daa0be8..29176a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,6 +109,7 @@ dev-coverage = ["pytest-cov", "coverage", "platformdirs"] [tool.poetry.scripts] pyshacl = "pyshacl.cli:main" +pyshacl_rules = "pyshacl.cli_rules:main" pyshacl_validate = "pyshacl.cli:main" pyshacl_server = "pyshacl.http:cli" diff --git a/pyshacl/__init__.py b/pyshacl/__init__.py index 031c094..78aa013 100644 --- a/pyshacl/__init__.py +++ b/pyshacl/__init__.py @@ -1,6 +1,7 @@ # -*- coding: latin-1 -*- # -from .entrypoints import validate +from .entrypoints import shacl_rules, validate +from .rule_expand_runner import RuleExpandRunner from .shape import Shape from .shapes_graph import ShapesGraph from .validator import Validator @@ -9,4 +10,4 @@ __version__ = '0.26.0' # Don't forget to change the version number in pyproject.toml, Dockerfile, and CITATION.cff along with this one -__all__ = ['validate', 'Validator', '__version__', 'Shape', 'ShapesGraph'] +__all__ = ['validate', 'shacl_rules', 'Validator', 'RuleExpandRunner', '__version__', 'Shape', 'ShapesGraph'] diff --git a/pyshacl/__main__.py b/pyshacl/__main__.py index 1dd0dd9..0c53f60 100644 --- a/pyshacl/__main__.py +++ b/pyshacl/__main__.py @@ -3,7 +3,8 @@ import os import sys -from pyshacl.cli import main +from pyshacl.cli import main as validate_main +from pyshacl.cli_rules import main as rules_main def str_is_true(s_var: str): @@ -16,11 +17,15 @@ def str_is_true(s_var: str): do_server = os.getenv("PYSHACL_HTTP", "") do_server = os.getenv("PYSHACL_SERVER", do_server) -if (len(sys.argv) > 1 and str(sys.argv[1]).lower() in ('serve', 'server', '--server')) or ( +first_arg = None if len(sys.argv) < 2 else sys.argv[1] + +if first_arg is not None and str(first_arg).lower() in ('rules', '--rules'): + rules_main(prog="python3 -m pyshacl") +elif (first_arg is not None and str(first_arg).lower() in ('serve', 'server', '--server')) or ( do_server and str_is_true(do_server) ): from pyshacl.sh_http import main as http_main http_main() - -main(prog="python3 -m pyshacl") +else: + validate_main(prog="python3 -m pyshacl") diff --git a/pyshacl/cli.py b/pyshacl/cli.py index c82dc55..c02f07d 100644 --- a/pyshacl/cli.py +++ b/pyshacl/cli.py @@ -216,6 +216,13 @@ def str_is_true(s_var: str): help='Send output to a file (defaults to stdout).', default=sys.stdout, ) +parser.add_argument( + '--rules', + help='Ignore validation options, run PySHACL in Rules Expansion mode. Same as `pyshacl_rules`.', + action='store_true', + dest='do_rules', + default=False, +) parser.add_argument( '--server', help='Ignore all the rest of the options, start the HTTP Server. Same as `pyshacl_server`.', @@ -240,6 +247,11 @@ def main(prog: Union[str, None] = None) -> None: # http_main calls sys.exit(0) and never returns http_main() + if args.do_rules: + from pyshacl.cli_rules import main as rules_main + + # rules_main calls sys.exit(0) and never returns + rules_main() if not args.data: # No datafile give, and not starting in server mode. sys.stderr.write('Input Error. No DataGraph file or endpoint supplied.\n') diff --git a/pyshacl/cli_rules.py b/pyshacl/cli_rules.py new file mode 100644 index 0000000..37d380a --- /dev/null +++ b/pyshacl/cli_rules.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import sys +from io import BufferedReader +from typing import Union, cast + +from pyshacl import __version__, shacl_rules +from pyshacl.cli import ShowVersion +from pyshacl.errors import ( + ConstraintLoadError, + ReportableRuntimeError, + RuleLoadError, + ShapeLoadError, + ValidationFailure, +) + +parser = argparse.ArgumentParser( + description='PySHACL {} SHACL Rules Expander command line tool.'.format(str(__version__)) +) +parser.add_argument( + 'data', + metavar='DataGraph', + help='The file or endpoint containing the Target Data Graph.', + default=None, + nargs='?', +) +parser.add_argument( + '-s', + '--shapes', + '--shacl', + dest='shacl', + action='store', + nargs='?', + help='A file containing the SHACL Shapes Graph.', +) +parser.add_argument( + '-e', + '--ont-graph', + dest='ont', + action='store', + nargs='?', + help='A file path or URL to a document containing extra ontological information. ' + 'RDFS and OWL definitions from this are used to inoculate the DataGraph.', +) +parser.add_argument( + '-i', + '--inference', + dest='inference', + action='store', + default='none', + choices=('none', 'rdfs', 'owlrl', 'both'), + help='Choose a type of inferencing to run against the Data Graph before validating.', +) +parser.add_argument( + '-m', + '--metashacl', + dest='metashacl', + action='store_true', + default=False, + help='Validate the SHACL Shapes graph against the shacl-shacl Shapes Graph before validating the Data Graph.', +) +parser.add_argument( + '-im', + '--imports', + dest='imports', + action='store_true', + default=False, + help='Allow import of sub-graphs defined in statements with owl:imports.', +) +parser.add_argument( + '-a', + '--advanced', + dest='advanced', + action='store_true', + default=False, + help='Enable features from the SHACL Advanced Features specification.', +) +parser.add_argument( + '-j', + '--js', + dest='js', + action='store_true', + default=False, + help='Enable features from the SHACL-JS Specification.', +) +parser.add_argument( + '-it', + '--iterate-rules', + dest='iterate_rules', + action='store_true', + default=False, + help="Run Shape's SHACL Rules iteratively until the data_graph reaches a steady state.", +) +parser.add_argument('--abort', dest='abort', action='store_true', default=False, help='Abort on first invalid data.') +parser.add_argument( + '--allow-info', + '--allow-infos', + dest='allow_infos', + action='store_true', + default=False, + help='Shapes marked with severity of Info will not cause result to be invalid.', +) +parser.add_argument( + '-w', + '--allow-warning', + '--allow-warnings', + dest='allow_warnings', + action='store_true', + default=False, + help='Shapes marked with severity of Warning or Info will not cause result to be invalid.', +) +parser.add_argument( + '--max-depth', + dest='max_depth', + action='store', + nargs='?', + type=int, + help="The maximum number of SHACL shapes \"deep\" that the validator can go before reaching an \"endpoint\" constraint.", +) +parser.add_argument( + '-d', + '--debug', + dest='debug', + action='store_true', + default=False, + help='Output additional verbose runtime messages.', +) +parser.add_argument( + '--focus', + dest='focus', + action='store', + help='Optional IRIs of focus nodes from the DataGraph, the shapes will validate only these node. Comma-separated list.', + nargs="?", + default=None, +) +parser.add_argument( + '--shape', + dest='shape', + action='store', + help='Optional IRIs of a NodeShape or PropertyShape from the SHACL ShapesGraph, only these shapes will be used to validate the DataGraph. Comma-separated list.', + nargs="?", + default=None, +) +parser.add_argument( + '-f', + '--format', + dest='format', + action='store', + help='Choose an output format. Default is \"trig\" for Datasets and \"turtle\" for Graphs.', + default='auto', + choices=('auto', 'turtle', 'xml', 'trig', 'json-ld', 'nt', 'n3', 'nquads'), +) +parser.add_argument( + '-df', + '--data-file-format', + dest='data_file_format', + action='store', + help='Explicitly state the RDF File format of the input DataGraph file. Default=\"auto\".', + default='auto', + choices=('auto', 'turtle', 'xml', 'trig', 'json-ld', 'nt', 'n3', 'nquads'), +) +parser.add_argument( + '-sf', + '--shacl-file-format', + dest='shacl_file_format', + action='store', + help='Explicitly state the RDF File format of the input SHACL file. Default=\"auto\".', + default='auto', + choices=('auto', 'turtle', 'xml', 'trig', 'json-ld', 'nt', 'n3', 'nquads'), +) +parser.add_argument( + '-ef', + '--ont-file-format', + dest='ont_file_format', + action='store', + help='Explicitly state the RDF File format of the extra ontology file. Default=\"auto\".', + default='auto', + choices=('auto', 'turtle', 'xml', 'trig', 'json-ld', 'nt', 'n3', 'nquads'), +) +parser.add_argument('-V', '--version', action=ShowVersion, help='Show PySHACL version and exit.') +parser.add_argument( + '-o', + '--output', + dest='output', + nargs='?', + type=argparse.FileType('w'), + help='Send output to a file (defaults to stdout).', + default=sys.stdout, +) +# parser.add_argument('-h', '--help', action="help", help='Show this help text.') + + +def main(prog: Union[str, None] = None) -> None: + if prog is not None and len(prog) > 0: + parser.prog = prog + + args = parser.parse_args() + if not args.data: + # No datafile give, and not starting in server mode. + sys.stderr.write('Input Error. No DataGraph file or endpoint supplied.\n') + parser.print_usage(sys.stderr) + sys.exit(1) + runner_kwargs = { + 'debug': args.debug, + 'serialize_expanded_graph': True, + } + data_file = None + data_graph: Union[BufferedReader, str] + + try: + data_file = open(args.data, 'rb') + except FileNotFoundError: + sys.stderr.write('Input Error. DataGraph file not found.\n') + sys.exit(1) + except PermissionError: + sys.stderr.write('Input Error. DataGraph file not readable.\n') + sys.exit(1) + else: + # NOTE: This cast is not necessary in Python >= 3.10. + data_graph = cast(BufferedReader, data_file) + if args.shacl is not None: + runner_kwargs['shacl_graph'] = args.shacl + if args.ont is not None: + runner_kwargs['ont_graph'] = args.ont + if args.inference != 'none': + runner_kwargs['inference'] = args.inference + if args.imports: + runner_kwargs['do_owl_imports'] = True + if args.js: + runner_kwargs['js'] = True + if args.focus: + runner_kwargs['focus_nodes'] = [_f.strip() for _f in args.focus.split(',')] + if args.shape: + runner_kwargs['use_shapes'] = [_s.strip() for _s in args.shape.split(',')] + if args.iterate_rules: + runner_kwargs['iterate_rules'] = True + if args.shacl_file_format: + _f: str = args.shacl_file_format + if _f != "auto": + runner_kwargs['shacl_graph_format'] = _f + if args.ont_file_format: + _f = args.ont_file_format + if _f != "auto": + runner_kwargs['ont_graph_format'] = _f + if args.data_file_format: + _f = args.data_file_format + if _f != "auto": + runner_kwargs['data_graph_format'] = _f + if args.format != "auto": + runner_kwargs['serialize_expanded_graph_format'] = args.format + exit_code: Union[int, None] = None + try: + output_txt = shacl_rules(data_graph, **runner_kwargs) + if isinstance(output_txt, BaseException): + raise output_txt + except ValidationFailure as vf: + args.output.write("Rules Runner generated a Validation Failure result:\n") + args.output.write(str(vf.message)) + args.output.write("\n") + exit_code = 1 + except ShapeLoadError as sle: + sys.stderr.write("Rules Runner encountered a Shape Load Error:\n") + sys.stderr.write(str(sle)) + exit_code = 2 + except ConstraintLoadError as cle: + sys.stderr.write("Rules Runner encountered a Constraint Load Error:\n") + sys.stderr.write(str(cle)) + exit_code = 2 + except RuleLoadError as rle: + sys.stderr.write("Rules Runner encountered a Rule Load Error:\n") + sys.stderr.write(str(rle)) + exit_code = 2 + except ReportableRuntimeError as rre: + sys.stderr.write("Rules Runner encountered a Runtime Error:\n") + sys.stderr.write(str(rre.message)) + sys.stderr.write("\nIf you believe this is a bug in pyshacl, open an Issue on the pyshacl github page.\n") + exit_code = 2 + except NotImplementedError as nie: + sys.stderr.write("Rules Runner feature is not implemented:\n") + if len(nie.args) > 0: + sys.stderr.write(str(nie.args[0])) + else: + sys.stderr.write("No message provided.") + sys.stderr.write("\nIf your use-case requires this feature, open an Issue on the pyshacl github page.\n") + exit_code = 3 + except RuntimeError as re: + import traceback + + traceback.print_tb(re.__traceback__) + sys.stderr.write( + "\n\nRules Runner encountered a Runtime Error. Please report this to the PySHACL issue tracker.\n" + ) + exit_code = 2 + finally: + if data_file is not None: + try: + data_file.close() + except Exception as e: + sys.stderr.write("Error closing data file:\n") + sys.stderr.write(str(e)) + if exit_code is not None: + sys.exit(exit_code) + + if isinstance(output_txt, bytes): + output_unicode = output_txt.decode('utf-8') + else: + output_unicode = output_txt + args.output.write(output_unicode) + args.output.close() + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/pyshacl/entrypoints.py b/pyshacl/entrypoints.py index 6a354d1..6ad8290 100644 --- a/pyshacl/entrypoints.py +++ b/pyshacl/entrypoints.py @@ -6,13 +6,15 @@ from sys import stderr from typing import List, Optional, Tuple, Union -from rdflib import Dataset, Graph, URIRef +from rdflib import ConjunctiveGraph, Dataset, Graph, Literal, URIRef from pyshacl.errors import ReportableRuntimeError, ValidationFailure from pyshacl.pytypes import GraphLike +from .consts import RDF, SH, RDF_type from .monkey import apply_patches, rdflib_bool_patch, rdflib_bool_unpatch from .rdfutil import load_from_source +from .rule_expand_runner import RuleExpandRunner from .validator import Validator, assign_baked_in from .validator_conformance import check_dash_result @@ -234,3 +236,108 @@ def make_default_logger(name: Union[str, None] = None, debug: bool = False) -> l log.setLevel(logging.INFO if not debug else logging.DEBUG) log_handler.setLevel(logging.INFO if not debug else logging.DEBUG) return log + + +def shacl_rules( + data_graph: Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes], + *args, + shacl_graph: Optional[Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes]] = None, + ont_graph: Optional[Union[GraphLike, BufferedIOBase, TextIOBase, str, bytes]] = None, + inference: Optional[str] = None, + inplace: Optional[bool] = False, + focus_nodes: Optional[List[Union[str, URIRef]]] = None, + use_shapes: Optional[List[Union[str, URIRef]]] = None, + **kwargs, +) -> Union[str, GraphLike]: + """ + :param data_graph: rdflib.Graph or file path or web url of the data to validate + :type data_graph: rdflib.Graph | str | bytes + :param args: + :type args: list + :param shacl_graph: rdflib.Graph or file path or web url of the SHACL Shapes graph to use to + validate the data graph + :type shacl_graph: rdflib.Graph | str | bytes + :param ont_graph: rdflib.Graph or file path or web url of an extra ontology document to mix into the data graph + :type ont_graph: rdflib.Graph | str | bytes + :param inference: One of "rdfs", "owlrl", "both", "none", or None + :type inference: str | None + :param inplace: If this is enabled, do not clone the datagraph, manipulate it in-place + :type inplace: bool + :param focus_nodes: A list of IRIs to validate only those nodes. + :type focus_nodes: list | None + :param use_shapes: A list of IRIs to use only those shapes from the SHACL ShapesGraph. + :type use_shapes: list | None + :param kwargs: + :return: + """ + + do_debug = kwargs.get('debug', False) + log = make_default_logger(name="pyshacl-rules", debug=do_debug) + apply_patches() + assign_baked_in() + do_owl_imports = kwargs.pop('do_owl_imports', False) + data_graph_format = kwargs.pop('data_graph_format', None) + if kwargs.get('sparql_mode', None): + raise ReportableRuntimeError("The SHACL Rules expander cannot be used in SPARQL Remote Graph Mode.") + if isinstance(data_graph, (str, bytes, BufferedIOBase, TextIOBase)): + # DataGraph is passed in as Text. It is not a rdflib.Graph + # That means we load it into an ephemeral graph at runtime + # that means we don't need to make a copy to prevent polluting it. + ephemeral = True + else: + ephemeral = False + use_js = kwargs.pop('js', None) + # force no owl imports on data_graph + loaded_dg = load_from_source( + data_graph, rdf_format=data_graph_format, multigraph=True, do_owl_imports=False, logger=log + ) + ont_graph_format = kwargs.pop('ont_graph_format', None) + if ont_graph is not None: + loaded_og = load_from_source( + ont_graph, rdf_format=ont_graph_format, multigraph=True, do_owl_imports=do_owl_imports, logger=log + ) + else: + loaded_og = None + shacl_graph_format = kwargs.pop('shacl_graph_format', None) + if shacl_graph is not None: + rdflib_bool_patch() + loaded_sg = load_from_source( + shacl_graph, rdf_format=shacl_graph_format, multigraph=True, do_owl_imports=do_owl_imports, logger=log + ) + rdflib_bool_unpatch() + else: + loaded_sg = None + iterate_rules = kwargs.pop('iterate_rules', False) + runner_options_dict = { + 'debug': do_debug or False, + 'inference': inference, + 'inplace': inplace or ephemeral, + 'iterate_rules': iterate_rules, + 'use_js': use_js, + 'logger': log, + 'focus_nodes': focus_nodes, + 'use_shapes': use_shapes, + } + serialize_expanded_graph = kwargs.get('serialize_expanded_graph', None) + try: + runner = RuleExpandRunner( + loaded_dg, + shacl_graph=loaded_sg, + ont_graph=loaded_og, + options=runner_options_dict, + ) + expanded_graph = runner.run() + except ValidationFailure as e: + error = "SHACL Rules Expansion Failure - {}".format(e.message) + if serialize_expanded_graph: + return error + else: + g = Graph() + g.add((URIRef(""), RDF_type, SH.ValidationFailure)) + g.add((URIRef(""), SH.message, Literal(error))) + return g + if serialize_expanded_graph: + guess_format = "trig" if isinstance(expanded_graph, (Dataset, ConjunctiveGraph)) else "turtle" + serialize_format = kwargs.get('serialize_expanded_graph_format', guess_format) + return expanded_graph.serialize(format=serialize_format) + return expanded_graph diff --git a/pyshacl/rule_expand_runner.py b/pyshacl/rule_expand_runner.py new file mode 100644 index 0000000..13a20da --- /dev/null +++ b/pyshacl/rule_expand_runner.py @@ -0,0 +1,291 @@ +# -*- coding: utf-8 -*- +# +import logging +from os import getenv +from typing import Any, Dict, List, Optional, Union + +import rdflib +from rdflib import URIRef + +from .consts import ( + env_truths, +) +from .errors import ReportableRuntimeError +from .extras import check_extra_installed +from .functions import apply_functions, gather_functions, unapply_functions +from .pytypes import GraphLike, SHACLExecutor +from .rdfutil import ( + clone_graph, + inoculate, + inoculate_dataset, + mix_datasets, + mix_graphs, +) +from .rules import apply_rules, gather_rules +from .run_type import PySHACLRunType +from .shapes_graph import ShapesGraph +from .target import apply_target_types, gather_target_types + +USE_FULL_MIXIN = getenv("PYSHACL_USE_FULL_MIXIN") in env_truths + + +class RuleExpandRunner(PySHACLRunType): + def __init__( + self, + data_graph: GraphLike, + *args, + shacl_graph: Optional[GraphLike] = None, + ont_graph: Optional[GraphLike] = None, + options: Optional[Dict[str, Any]] = None, + **kwargs, + ): + options = options or {} + self._load_default_options(options) + self.options = options # type: dict + self.logger = options['logger'] # type: logging.Logger + self.debug = options['debug'] + self.pre_inferenced = kwargs.pop('pre_inferenced', False) + self.inplace = options['inplace'] + if not isinstance(data_graph, rdflib.Graph): + raise RuntimeError("data_graph must be a rdflib Graph-like object") + self.data_graph = data_graph # type: GraphLike + self._target_graph = None + self.ont_graph = ont_graph # type: Optional[GraphLike] + self.data_graph_is_multigraph = isinstance(self.data_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)) + if self.ont_graph is not None and isinstance(self.ont_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): + self.ont_graph.default_union = True + if shacl_graph is None: + shacl_graph = clone_graph(data_graph, identifier='shacl') + assert isinstance(shacl_graph, rdflib.Graph), "shacl_graph must be a rdflib Graph object" + self.shacl_graph = ShapesGraph(shacl_graph, self.debug, self.logger) # type: ShapesGraph + + if options['use_js']: + is_js_installed = check_extra_installed('js') + if is_js_installed: + self.shacl_graph.enable_js() + + @classmethod + def _load_default_options(cls, options_dict: dict): + options_dict.setdefault('debug', False) + options_dict.setdefault('inference', 'none') + options_dict.setdefault('inplace', False) + options_dict.setdefault('use_js', False) + options_dict.setdefault('iterate_rules', False) + options_dict.setdefault('focus_nodes', None) + options_dict.setdefault('use_shapes', None) + if 'logger' not in options_dict: + options_dict['logger'] = logging.getLogger(__name__) + if options_dict['debug']: + options_dict['logger'].setLevel(logging.DEBUG) + + @classmethod + def _run_pre_inference( + cls, target_graph: GraphLike, inference_option: str, logger: Optional[logging.Logger] = None + ): + """ + Note, this is the OWL/RDFS pre-inference, + it is not the Advanced Spec SHACL-Rule inferencing step. + :param target_graph: + :type target_graph: rdflib.Graph|rdflib.ConjunctiveGraph|rdflib.Dataset + :param inference_option: + :type inference_option: str + :return: + :rtype: NoneType + """ + # Lazy import owlrl + import owlrl + + from .inference import CustomRDFSOWLRLSemantics, CustomRDFSSemantics + + if logger is None: + logger = logging.getLogger(__name__) + try: + if inference_option == 'rdfs': + inferencer = owlrl.DeductiveClosure(CustomRDFSSemantics) + elif inference_option == 'owlrl': + inferencer = owlrl.DeductiveClosure(owlrl.OWLRL_Semantics) + elif inference_option == 'both' or inference_option == 'all' or inference_option == 'rdfsowlrl': + inferencer = owlrl.DeductiveClosure(CustomRDFSOWLRLSemantics) + else: + raise ReportableRuntimeError("Don't know how to do '{}' type inferencing.".format(inference_option)) + except Exception as e: # pragma: no cover + logger.error("Error during creation of OWL-RL Deductive Closure") + if isinstance(e, ReportableRuntimeError): + raise e + raise ReportableRuntimeError( + "Error during creation of OWL-RL Deductive Closure\n{}".format(str(e.args[0])) + ) + if isinstance(target_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): + named_graphs = [] + for i in target_graph.store.contexts(None): + if isinstance(i, rdflib.Graph): + named_graphs.append(i) + else: + named_graphs.append( + rdflib.Graph(target_graph.store, i, namespace_manager=target_graph.namespace_manager) + ) + else: + named_graphs = [target_graph] + try: + for g in named_graphs: + inferencer.expand(g) + except Exception as e: # pragma: no cover + logger.error("Error while running OWL-RL Deductive Closure") + raise ReportableRuntimeError("Error while running OWL-RL Deductive Closure\n{}".format(str(e.args[0]))) + + @property + def target_graph(self): + return self._target_graph + + def mix_in_ontology(self): + if USE_FULL_MIXIN: + if not self.data_graph_is_multigraph: + return mix_graphs(self.data_graph, self.ont_graph, "inplace" if self.inplace else None) + return mix_datasets(self.data_graph, self.ont_graph, "inplace" if self.inplace else None) + if not self.data_graph_is_multigraph: + if self.inplace: + to_graph = self.data_graph + else: + to_graph = clone_graph(self.data_graph, identifier=self.data_graph.identifier) + return inoculate(to_graph, self.ont_graph) + return inoculate_dataset(self.data_graph, self.ont_graph, self.data_graph if self.inplace else None) + + def make_executor(self) -> SHACLExecutor: + return SHACLExecutor( + validator=self, + advanced_mode=True, + abort_on_first=False, + allow_infos=False, + allow_warnings=False, + iterate_rules=bool(self.options.get("iterate_rules", False)), + sparql_mode=False, + max_validation_depth=999, + focus_nodes=self.options.get("focus_nodes", None), + debug=self.debug, + ) + + def run(self) -> GraphLike: + if self.target_graph is not None: + # Target graph is already set up with pre-inferenced and pre-cloned data_graph + the_target_graph = self.target_graph + else: + has_cloned = False + if self.ont_graph is not None: + if self.inplace: + self.logger.debug("Adding ontology definitions to DataGraph") + else: + self.logger.debug("Cloning DataGraph to temporary memory graph, to add ontology definitions.") + # creates a copy of self.data_graph, doesn't modify it + the_target_graph = self.mix_in_ontology() + has_cloned = True + else: + the_target_graph = self.data_graph + inference_option = self.options.get('inference', 'none') + if self.inplace and self.debug: + self.logger.debug("Skipping DataGraph clone because PySHACL is operating in inplace mode.") + if inference_option and not self.pre_inferenced and str(inference_option) != "none": + if not has_cloned and not self.inplace: + self.logger.debug("Cloning DataGraph to temporary memory graph before pre-inferencing.") + the_target_graph = clone_graph(the_target_graph) + has_cloned = True + self.logger.debug(f"Running pre-inferencing with option='{inference_option}'.") + self._run_pre_inference(the_target_graph, inference_option, logger=self.logger) + self.pre_inferenced = True + if not has_cloned and not self.inplace: + # We still need to clone in advanced mode, because of triple rules + self.logger.debug( + "Forcing clone of DataGraph because expanding rules cannot modify the input datagraph." + ) + the_target_graph = clone_graph(the_target_graph) + has_cloned = True + self._target_graph = the_target_graph + + if self.options.get("use_shapes", None) is not None and len(self.options["use_shapes"]) > 0: + using_manually_specified_shapes = True + expanded_use_shapes = [] + for s in self.options["use_shapes"]: + s_lower = s.lower() + if ( + s_lower.startswith("http:") + or s_lower.startswith("https:") + or s_lower.startswith("urn:") + or s_lower.startswith("file:") + ): + expanded_use_shapes.append(URIRef(s)) + else: + try: + expanded_use_shape = self.shacl_graph.graph.namespace_manager.expand_curie(s) + except ValueError: + expanded_use_shape = URIRef(s) + expanded_use_shapes.append(expanded_use_shape) + shapes = self.shacl_graph.shapes_from_uris(expanded_use_shapes) + else: + using_manually_specified_shapes = False + shapes = self.shacl_graph.shapes # This property getter triggers shapes harvest. + option_focus_nodes = self.options.get("focus_nodes", None) + if option_focus_nodes is not None and len(option_focus_nodes) > 0: + # Expand any CURIEs in the focus_nodes list + expanded_focus_nodes: List[URIRef] = [] + for f in option_focus_nodes: + f_lower = f.lower() + if ( + f_lower.startswith("http:") + or f_lower.startswith("https:") + or f_lower.startswith("urn:") + or f_lower.startswith("file:") + ): + expanded_focus_nodes.append(URIRef(f)) + else: + try: + expanded_focus_node = self.target_graph.namespace_manager.expand_curie(f) + except ValueError: + expanded_focus_node = URIRef(f) + expanded_focus_nodes.append(expanded_focus_node) + self.options["focus_nodes"] = expanded_focus_nodes + specified_focus_nodes: Union[None, List[URIRef]] = expanded_focus_nodes + else: + specified_focus_nodes = None + executor = self.make_executor() + + # Special hack, if we are using manually specified shapes, and have + # manually specified focus nodes, then we need to disable the + # focus_nodes in the executor, because we apply the specified focus + # nodes directly to the specified shapes. + if using_manually_specified_shapes and specified_focus_nodes is not None: + executor.focus_nodes = None + + self.logger.debug("Activating SHACL-AF Features.") + target_types = gather_target_types(self.shacl_graph) + advanced = { + 'functions': gather_functions(executor, self.shacl_graph), + 'rules': gather_rules(executor, self.shacl_graph), + } + for s in shapes: + s.set_advanced(True) + apply_target_types(target_types) + if isinstance(the_target_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): + named_graphs = [ + ( + rdflib.Graph(the_target_graph.store, i, namespace_manager=the_target_graph.namespace_manager) + if not isinstance(i, rdflib.Graph) + else i + ) + for i in the_target_graph.store.contexts(None) + ] + else: + named_graphs = [the_target_graph] + if self.debug: + self.logger.debug(f"Will run SHACL Rules expansion on {len(named_graphs)} named graph/s.") + for g in named_graphs: + if self.debug: + self.logger.debug(f"Running SHACL Rules on DataGraph named {g.identifier}") + if advanced['functions']: + apply_functions(executor, advanced['functions'], g) + try: + if advanced['rules']: + apply_rules(executor, advanced['rules'], g) + finally: + if advanced: + unapply_functions(advanced['functions'], g) + + return the_target_graph diff --git a/pyshacl/run_type.py b/pyshacl/run_type.py new file mode 100644 index 0000000..08b376b --- /dev/null +++ b/pyshacl/run_type.py @@ -0,0 +1,9 @@ +from abc import ABC, ABCMeta, abstractmethod + + +class PySHACLRunType(metaclass=ABCMeta): + __slots__ = () + + @abstractmethod + def run(self): + raise NotImplementedError() # pragma: no cover diff --git a/pyshacl/validator.py b/pyshacl/validator.py index a259c0c..66fca54 100644 --- a/pyshacl/validator.py +++ b/pyshacl/validator.py @@ -29,13 +29,14 @@ mix_graphs, ) from .rules import apply_rules, gather_rules +from .run_type import PySHACLRunType from .shapes_graph import ShapesGraph from .target import apply_target_types, gather_target_types USE_FULL_MIXIN = getenv("PYSHACL_USE_FULL_MIXIN") in env_truths -class Validator(object): +class Validator(PySHACLRunType): def __init__( self, data_graph: GraphLike, diff --git a/test/resources/cmdline_tests/rules_d.ttl b/test/resources/cmdline_tests/rules_d.ttl new file mode 100644 index 0000000..d125f97 --- /dev/null +++ b/test/resources/cmdline_tests/rules_d.ttl @@ -0,0 +1,20 @@ +# prefix: ex + +@prefix ex: . +@prefix exOnt: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix xsd: . + +ex:Kate + rdf:type exOnt:Person ; + exOnt:firstName "Kate" ; + exOnt:lastName "Jones" ; +. + +ex:Jenny + rdf:type exOnt:Administrator ; + exOnt:firstName "Jennifer" ; + exOnt:lastName "Wolfeschlegelsteinhausenbergerdorff" ; +. diff --git a/test/resources/cmdline_tests/rules_s.ttl b/test/resources/cmdline_tests/rules_s.ttl new file mode 100644 index 0000000..acfc1a4 --- /dev/null +++ b/test/resources/cmdline_tests/rules_s.ttl @@ -0,0 +1,96 @@ +# prefix: ex + +@prefix ex: . +@prefix exOnt: . +@prefix exData: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix sh: . +@prefix xsd: . + + + rdf:type owl:Ontology ; + rdfs:label "Test of SHACL Rules expander mode" ; +. + +ex:concat + a sh:SPARQLFunction ; + rdfs:comment "Concatenates strings $op1 and $op2." ; + sh:parameter [ + sh:path ex:op1 ; + sh:datatype xsd:string ; + sh:description "The first string" ; + ] ; + sh:parameter [ + sh:path ex:op2 ; + sh:datatype xsd:string ; + sh:description "The second string" ; + ] ; + sh:returnType xsd:string ; + sh:select """ + SELECT ?result + WHERE { + BIND(CONCAT(STR(?op1),STR(?op2)) AS ?result) . + } + """ . + +ex:strlen + a sh:SPARQLFunction ; + rdfs:comment "Returns length of the given string." ; + sh:parameter [ + sh:path ex:op1 ; + sh:datatype xsd:string ; + sh:description "The string" ; + ] ; + sh:returnType xsd:integer ; + sh:select """ + SELECT ?result + WHERE { + BIND(STRLEN(?op1) AS ?result) . + } + """ . + +ex:lessThan + a sh:SPARQLFunction ; + rdfs:comment "Returns True if op1 < op2." ; + sh:parameter [ + sh:path ex:op1 ; + sh:datatype xsd:integer ; + sh:description "The first int" ; + ] ; + sh:parameter [ + sh:path ex:op2 ; + sh:datatype xsd:integer ; + sh:description "The second int" ; + ] ; + sh:returnType xsd:boolean ; + sh:select """ + SELECT ?result + WHERE { + BIND(IF(?op1 < ?op2, true, false) AS ?result) . + } + """ . + +ex:PersonExpressionShape + a sh:NodeShape ; + sh:targetClass exOnt:Person ; + sh:expression [ + sh:message "Person's firstName and lastName together should be less than 35 chars long." ; + ex:lessThan ( + [ ex:strlen ( + [ ex:concat ( [ sh:path exOnt:firstName] [ sh:path exOnt:lastName ] ) ] ) + ] + 35 ); + ] . + +ex:PersonRuleShape + a sh:NodeShape ; + sh:targetClass exOnt:Administrator ; + sh:message "An administrator is a person too." ; + sh:rule [ + a sh:TripleRule ; + sh:subject sh:this ; + sh:predicate rdf:type ; + sh:object exOnt:Person ; + ] . diff --git a/test/test_cmdline.py b/test/test_cmdline.py index 69d05d3..d228c05 100644 --- a/test/test_cmdline.py +++ b/test/test_cmdline.py @@ -4,12 +4,9 @@ import platform import subprocess import sys - from os import getenv, path from sys import stderr - -print(os.environ, file=stderr) PATH = getenv("PATH", "") PP = getenv('PYTHONPATH', "") here_dir = path.abspath(path.dirname(__file__)) @@ -37,7 +34,6 @@ ENV_VARS["PYTHONPATH"] = ':'.join((lib_dir, PP)) it = ENV_VARS["PYTHONPATH"].split(":") -print(it, file=stderr, flush=True) scr_dir = "scripts-{}.{}".format(sys.version_info[0], sys.version_info[1]) if in_test_dir: scr_dir = path.join('..', scr_dir) @@ -131,9 +127,11 @@ def test_cmdline_table(): args = [graph_file, '-s', shacl_file, '-f', 'table'] res = subprocess.run(pyshacl_command + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=ENV_VARS) output_table = res.stdout.decode('utf-8') - assert "+-----+-----------+---------------------------+---------------------------+" \ - "---------------------------+--------------------------+---------------------------+" \ - "---------------------------+" in output_table + assert ( + "+-----+-----------+---------------------------+---------------------------+" + "---------------------------+--------------------------+---------------------------+" + "---------------------------+" in output_table + ) assert "| 1 | Violation | http://example.com/ex#Hum | http://example.com/exOnt# " in output_table diff --git a/test/test_cmdline_rules.py b/test/test_cmdline_rules.py new file mode 100644 index 0000000..d3d3128 --- /dev/null +++ b/test/test_cmdline_rules.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- +# +import os +import platform +import subprocess +import sys +from os import getenv, path +from sys import stderr + +from rdflib import RDF, Graph, URIRef + +PATH = getenv("PATH", "") +PP = getenv('PYTHONPATH', "") +here_dir = path.abspath(path.dirname(__file__)) +ENV_VARS = {"PATH": PATH, "PYTHONPATH": ':'.join((here_dir, PP))} +PH = getenv('PYTHONHOME', "") +if PH: + ENV_VARS['PYTHONHOME'] = PH +VE = getenv('VIRTUAL_ENV', "") +if VE: + ENV_VARS['VIRTUAL_ENV'] = VE + virtual_bin = path.join(VE, "bin") + ENV_VARS['PATH'] = ':'.join((virtual_bin, PATH)) +abs_resources_dir = path.join(here_dir, 'resources') +cmdline_files_dir = path.join(abs_resources_dir, 'cmdline_tests') + +check_resources = path.join(path.abspath(os.getcwd()), 'resources') +in_test_dir = False +if path.exists(check_resources) and path.isdir(check_resources): + in_test_dir = True +else: + in_test_dir = False + +if in_test_dir: + lib_dir = os.path.abspath(os.path.join(here_dir, os.pardir)) + ENV_VARS["PYTHONPATH"] = ':'.join((lib_dir, PP)) + +it = ENV_VARS["PYTHONPATH"].split(":") +scr_dir = "scripts-{}.{}".format(sys.version_info[0], sys.version_info[1]) +if in_test_dir: + scr_dir = path.join('..', scr_dir) +check_scrdir = path.join(path.abspath(os.getcwd()), scr_dir) +if path.exists(check_scrdir) and path.isdir(check_scrdir): + has_scripts_dir = True +else: + has_scripts_dir = False + +bin_dir = "bin" +if in_test_dir: + bin_dir = path.join('..', bin_dir) +check_bindir = path.join(path.abspath(os.getcwd()), bin_dir) +if path.exists(check_bindir) and path.isdir(check_bindir): + has_bin_dir = True +else: + has_bin_dir = False + +cli_rules_script = "pyshacl/cli_rules.py" +if in_test_dir: + cli_rules_script = path.join('..', cli_rules_script) +check_cli_script = path.join(path.abspath(os.getcwd()), cli_rules_script) +if path.exists(check_cli_script) and path.isfile(check_cli_script): + has_cli_script = True +else: + has_cli_script = False + +if has_scripts_dir: + pyshacl_rules_command = ["{}/pyshacl_rules".format(scr_dir)] +elif has_bin_dir: + pyshacl_rules_command = ["{}/pyshacl_rules".format(bin_dir)] +elif has_cli_script: + pyshacl_rules_command = ["python3", cli_rules_script] +else: + pyshacl_rules_command = ["pyshacl_rules"] + + +def test_cmdline_rules(): + if not hasattr(subprocess, 'run'): + print("Subprocess.run() not available, skip this test") + assert True + return True + if platform.system() == "Windows": + print("Commandline tests cannot run on Windows.") + assert True + return True + if os.environ.get("PYBUILD_NAME", None) is not None: + print("We don't have access to scripts dir during pybuild process.") + assert True + return True + graph_file = path.join(cmdline_files_dir, 'rules_d.ttl') + shacl_file = path.join(cmdline_files_dir, 'rules_s.ttl') + cmd = pyshacl_rules_command + args = [graph_file, '-s', shacl_file, '-i', 'rdfs'] + res = subprocess.run(cmd + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=ENV_VARS) + print("result = {}".format(res.returncode)) + output_unicode = res.stdout.decode('utf-8') + print(res.stderr.decode('utf-8')) + assert res.returncode == 0 + output_g = Graph().parse(data=output_unicode, format='trig') + person_classes = set( + output_g.objects( + URIRef("http://datashapes.org/shasf/tests/expression/rules.test.data#Jenny"), predicate=RDF.type + ) + ) + assert URIRef("http://datashapes.org/shasf/tests/expression/rules.test.ont#Administrator") in person_classes + assert URIRef("http://datashapes.org/shasf/tests/expression/rules.test.ont#Person") in person_classes + + +if __name__ == "__main__": + test_cmdline_rules() diff --git a/test/test_shacl_rules_runner.py b/test/test_shacl_rules_runner.py new file mode 100644 index 0000000..f7c0d47 --- /dev/null +++ b/test/test_shacl_rules_runner.py @@ -0,0 +1,147 @@ +"""\ +A Test for the SHACL Rules Runner mode. +""" + +from rdflib import RDF, Graph, URIRef + +from pyshacl import shacl_rules + +shacl_file = '''\ +# prefix: ex + +@prefix ex: . +@prefix exOnt: . +@prefix exData: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix sh: . +@prefix xsd: . + + + rdf:type owl:Ontology ; + rdfs:label "Test of SHACL Rules expander mode" ; +. + +ex:concat + a sh:SPARQLFunction ; + rdfs:comment "Concatenates strings $op1 and $op2." ; + sh:parameter [ + sh:path ex:op1 ; + sh:datatype xsd:string ; + sh:description "The first string" ; + ] ; + sh:parameter [ + sh:path ex:op2 ; + sh:datatype xsd:string ; + sh:description "The second string" ; + ] ; + sh:returnType xsd:string ; + sh:select """ + SELECT ?result + WHERE { + BIND(CONCAT(STR(?op1),STR(?op2)) AS ?result) . + } + """ . + +ex:strlen + a sh:SPARQLFunction ; + rdfs:comment "Returns length of the given string." ; + sh:parameter [ + sh:path ex:op1 ; + sh:datatype xsd:string ; + sh:description "The string" ; + ] ; + sh:returnType xsd:integer ; + sh:select """ + SELECT ?result + WHERE { + BIND(STRLEN(?op1) AS ?result) . + } + """ . + +ex:lessThan + a sh:SPARQLFunction ; + rdfs:comment "Returns True if op1 < op2." ; + sh:parameter [ + sh:path ex:op1 ; + sh:datatype xsd:integer ; + sh:description "The first int" ; + ] ; + sh:parameter [ + sh:path ex:op2 ; + sh:datatype xsd:integer ; + sh:description "The second int" ; + ] ; + sh:returnType xsd:boolean ; + sh:select """ + SELECT ?result + WHERE { + BIND(IF(?op1 < ?op2, true, false) AS ?result) . + } + """ . + +ex:PersonExpressionShape + a sh:NodeShape ; + sh:targetClass exOnt:Person ; + sh:expression [ + sh:message "Person's firstName and lastName together should be less than 35 chars long." ; + ex:lessThan ( + [ ex:strlen ( + [ ex:concat ( [ sh:path exOnt:firstName] [ sh:path exOnt:lastName ] ) ] ) + ] + 35 ); + ] . + +ex:PersonRuleShape + a sh:NodeShape ; + sh:targetClass exOnt:Administrator ; + sh:message "An administrator is a person too." ; + sh:rule [ + a sh:TripleRule ; + sh:subject sh:this ; + sh:predicate rdf:type ; + sh:object exOnt:Person ; + ] . +''' + +data_graph = ''' +# prefix: ex + +@prefix ex: . +@prefix exOnt: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix xsd: . + +ex:Kate + rdf:type exOnt:Person ; + exOnt:firstName "Kate" ; + exOnt:lastName "Jones" ; +. + +ex:Jenny + rdf:type exOnt:Administrator ; + exOnt:firstName "Jennifer" ; + exOnt:lastName "Wolfeschlegelsteinhausenbergerdorff" ; +. +''' + + +def test_rules_runner(): + d = Graph().parse(data=data_graph, format="turtle") + s = Graph().parse(data=shacl_file, format="turtle") + output_g = shacl_rules(d, shacl_graph=s, advanced=True, debug=False) + person_classes = set( + output_g.objects( + URIRef("http://datashapes.org/shasf/tests/expression/rules.test.data#Jenny"), predicate=RDF.type + ) + ) + assert URIRef("http://datashapes.org/shasf/tests/expression/rules.test.ont#Administrator") in person_classes + assert URIRef("http://datashapes.org/shasf/tests/expression/rules.test.ont#Person") in person_classes + print(output_g.serialize(format="turtle")) + + +if __name__ == "__main__": + exit(test_rules_runner()) From d4c77117903e5996f0c897b2c9580a6b1e3eaf5f Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 2 Oct 2024 19:04:31 +1000 Subject: [PATCH 3/7] Add Changelog entries for Shacl Rules Expander mode --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36a7b57..2541ed1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Python PEP 440 Versioning](https://www.python.org/d ## [Unreleased] ### Added +- SHACL Rules Expander Mode + - A new alternative Run Mode for PySHACL + - PySHACL will not validate the DataGraph against Shapes and Constraints, instead it will simply run all SHACL-AF Rules to expand the DataGraph. + - By default it will output a new graph containing the existing DataGraph Triples plus the expanded triples + - Run with inplace mode to expand the new triples directly into the input DataGraph - Focus Node Filtering - You can now pass in a list of focus nodes to the validator, and it will only validate those focus nodes. - Note, you still need to pass in a SHACL Shapes Graph, and the shapes still need to target the focus nodes. @@ -19,6 +24,9 @@ and this project adheres to [Python PEP 440 Versioning](https://www.python.org/d - If you give the validator a list of Shapes to use, and a list of focus nodes, the validator will operate in a highly-targeted mode, it feeds those focus nodes directly into those given Shapes for validation. - In this mode, the selected SHACL Shape does not need to specify any focus-targeting mechanisms of its own. +- Combined Rules Expander Mode with Shape Selection + - The combination of SHACL Rules Expander Mode and Shape Selection will allow specialised workflows. + - For example, you can run specific expansion rules from a SHACL Shapes File, based on the new triples required. ### Changed - Don't make a clone of the DataGraph if the input data graph is ephemeral. From c1f743d1912042064f69febfb2bc0cc9b4e6ac3a Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 2 Oct 2024 19:57:10 +1000 Subject: [PATCH 4/7] Don't initialize unused variable --- pyshacl/shape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyshacl/shape.py b/pyshacl/shape.py index 8c885b5..b689005 100644 --- a/pyshacl/shape.py +++ b/pyshacl/shape.py @@ -632,7 +632,7 @@ def validate( if executor.debug: self.logger.debug(f"Skipping shape because it is deactivated: {str(self)}") return True, [] - focus_list: Sequence[RDFNode] = [] + focus_list: Sequence[RDFNode] if focus is not None: lh_shape = False rh_shape = True From ac88ca266879c7f8f46132ddef3079754b01b47e Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 2 Oct 2024 20:07:03 +1000 Subject: [PATCH 5/7] Allow SHACL Rules to be gathered only from specified Shapes, allow SHACL Rules to be applied only to specified focus nodes. --- pyshacl/extras/js/rules.py | 29 +++++++++++++++++++++++++---- pyshacl/rules/__init__.py | 24 +++++++++++++++++++----- pyshacl/rules/shacl_rule.py | 11 ++++++++--- pyshacl/rules/sparql/__init__.py | 27 ++++++++++++++++++++++----- pyshacl/rules/triple/__init__.py | 29 +++++++++++++++++++++++------ 5 files changed, 97 insertions(+), 23 deletions(-) diff --git a/pyshacl/extras/js/rules.py b/pyshacl/extras/js/rules.py index 5d2548c..bd16557 100644 --- a/pyshacl/extras/js/rules.py +++ b/pyshacl/extras/js/rules.py @@ -1,6 +1,9 @@ # # import typing +from typing import List, Sequence, Union + +import rdflib from pyshacl.consts import SH from pyshacl.errors import ReportableRuntimeError @@ -9,7 +12,8 @@ from .js_executable import JSExecutable if typing.TYPE_CHECKING: - from pyshacl.pytypes import GraphLike, SHACLExecutor + + from pyshacl.pytypes import GraphLike, RDFNode, SHACLExecutor from pyshacl.shape import Shape from pyshacl.shapes_graph import ShapesGraph @@ -24,8 +28,25 @@ def __init__(self, executor: 'SHACLExecutor', shape: 'Shape', rule_node, **kwarg shapes_graph: 'ShapesGraph' = shape.sg self.js_exe = JSExecutable(shapes_graph, rule_node) - def apply(self, data_graph: 'GraphLike') -> int: - focus_nodes = self.shape.focus_nodes(data_graph) # uses target nodes to find focus nodes + def apply( + self, + data_graph: 'GraphLike', + focus_nodes: Union[Sequence['RDFNode'], None] = None, + ) -> int: + focus_list: Sequence['RDFNode'] + if focus_nodes is not None: + focus_list = list(focus_nodes) + else: + focus_list = list(self.shape.focus_nodes(data_graph)) + if self.executor.focus_nodes is not None and len(self.executor.focus_nodes) > 0: + filtered_focus_nodes: List[Union[rdflib.URIRef]] = [] + for _fo in focus_list: # type: RDFNode + if isinstance(_fo, rdflib.URIRef) and _fo in self.executor.focus_nodes: + filtered_focus_nodes.append(_fo) + len_filtered_focus = len(filtered_focus_nodes) + if len_filtered_focus < 1: + return 0 + focus_list = filtered_focus_nodes all_added = 0 iterate_limit = 100 while True: @@ -33,7 +54,7 @@ def apply(self, data_graph: 'GraphLike') -> int: raise ReportableRuntimeError("Local rule iteration exceeded iteration limit of 100.") iterate_limit -= 1 added = 0 - applicable_nodes = self.filter_conditions(focus_nodes, data_graph) + applicable_nodes = self.filter_conditions(focus_list, data_graph) sets_to_add = [] for a in applicable_nodes: args_map = {"this": a} diff --git a/pyshacl/rules/__init__.py b/pyshacl/rules/__init__.py index f25fd3e..3eae4cc 100644 --- a/pyshacl/rules/__init__.py +++ b/pyshacl/rules/__init__.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- from collections import defaultdict -from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, Type, Union + +from rdflib import BNode, URIRef from pyshacl.consts import RDF_type, SH_rule, SH_SPARQLRule, SH_TripleRule from pyshacl.errors import ReportableRuntimeError, RuleLoadError -from pyshacl.pytypes import GraphLike, SHACLExecutor +from pyshacl.pytypes import GraphLike, RDFNode, SHACLExecutor from pyshacl.rules.sparql import SPARQLRule from pyshacl.rules.triple import TripleRule @@ -15,7 +17,11 @@ from .shacl_rule import SHACLRule -def gather_rules(executor: SHACLExecutor, shacl_graph: 'ShapesGraph') -> Dict['Shape', List['SHACLRule']]: +def gather_rules( + executor: SHACLExecutor, + shacl_graph: 'ShapesGraph', + from_shapes: Union[Sequence[Union[URIRef, BNode]], None] = None, +) -> Dict['Shape', List['SHACLRule']]: """ :param executor: :type executor: SHACLExecutor @@ -55,6 +61,9 @@ def gather_rules(executor: SHACLExecutor, shacl_graph: 'ShapesGraph') -> Dict['S used_rules = shacl_graph.subject_objects(SH_rule) ret_rules = defaultdict(list) for sub, obj in used_rules: + if from_shapes is not None and sub not in from_shapes: + # Skipping rule that is not in the whitelist of Shapes to use + continue try: shape: Shape = shacl_graph.lookup_shape_from_node(sub) except (AttributeError, KeyError): @@ -77,7 +86,12 @@ def gather_rules(executor: SHACLExecutor, shacl_graph: 'ShapesGraph') -> Dict['S return ret_rules -def apply_rules(executor: SHACLExecutor, shapes_rules: Dict, data_graph: GraphLike) -> int: +def apply_rules( + executor: SHACLExecutor, + shapes_rules: Dict, + data_graph: GraphLike, + focus_nodes: Union[Sequence[RDFNode], None] = None, +) -> int: # short the shapes dict by shapes sh:order before execution sorted_shapes_rules: List[Tuple[Any, Any]] = sorted(shapes_rules.items(), key=lambda x: x[0].order) total_modified = 0 @@ -93,7 +107,7 @@ def apply_rules(executor: SHACLExecutor, shapes_rules: Dict, data_graph: GraphLi for r in rules: if r.deactivated: continue - n_modified = r.apply(data_graph) + n_modified = r.apply(data_graph, focus_nodes=focus_nodes) this_modified += n_modified if this_modified > 0: total_modified += this_modified diff --git a/pyshacl/rules/shacl_rule.py b/pyshacl/rules/shacl_rule.py index 2d8c69c..0014823 100644 --- a/pyshacl/rules/shacl_rule.py +++ b/pyshacl/rules/shacl_rule.py @@ -1,11 +1,12 @@ # -*- coding: utf-8 -*- from decimal import Decimal +from typing import Sequence, Union from rdflib import RDF, Literal from pyshacl.consts import SH_condition, SH_deactivated, SH_order from pyshacl.errors import RuleLoadError -from pyshacl.pytypes import SHACLExecutor +from pyshacl.pytypes import RDFNode, SHACLExecutor RDF_first = RDF.first @@ -96,7 +97,7 @@ def get_conditions(self): conditions.append(condition) return conditions - def filter_conditions(self, focus_nodes, data_graph): + def filter_conditions(self, focus_nodes: Sequence[RDFNode], data_graph): conditions = self.get_conditions() applicable_focus_nodes = [] for f in focus_nodes: @@ -108,5 +109,9 @@ def filter_conditions(self, focus_nodes, data_graph): applicable_focus_nodes.append(f) return applicable_focus_nodes - def apply(self, data_graph): + def apply( + self, + data_graph, + focus_nodes: Union[Sequence[RDFNode], None] = None, + ): raise NotImplementedError() diff --git a/pyshacl/rules/sparql/__init__.py b/pyshacl/rules/sparql/__init__.py index 0d9f23c..9942a9a 100644 --- a/pyshacl/rules/sparql/__init__.py +++ b/pyshacl/rules/sparql/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, List, Sequence, Union import rdflib from rdflib import Literal @@ -13,7 +13,7 @@ from ..shacl_rule import SHACLRule if TYPE_CHECKING: - from pyshacl.pytypes import GraphLike, SHACLExecutor + from pyshacl.pytypes import GraphLike, RDFNode, SHACLExecutor from pyshacl.shape import Shape XSD_string = XSD.string @@ -49,8 +49,25 @@ def __init__(self, executor: 'SHACLExecutor', shape: 'Shape', rule_node: 'rdflib query_helper.collect_prefixes() self._qh = query_helper - def apply(self, data_graph: 'GraphLike') -> int: - focus_nodes = self.shape.focus_nodes(data_graph) # uses target nodes to find focus nodes + def apply( + self, + data_graph: 'GraphLike', + focus_nodes: Union[Sequence['RDFNode'], None] = None, + ) -> int: + focus_list: Sequence['RDFNode'] + if focus_nodes is not None: + focus_list = list(focus_nodes) + else: + focus_list = list(self.shape.focus_nodes(data_graph)) + if self.executor.focus_nodes is not None and len(self.executor.focus_nodes) > 0: + filtered_focus_nodes: List[Union[rdflib.URIRef]] = [] + for _fo in focus_list: # type: RDFNode + if isinstance(_fo, rdflib.URIRef) and _fo in self.executor.focus_nodes: + filtered_focus_nodes.append(_fo) + len_filtered_focus = len(filtered_focus_nodes) + if len_filtered_focus < 1: + return 0 + focus_list = filtered_focus_nodes all_added = 0 SPARQLQueryHelper = get_query_helper_cls() iterate_limit = 100 @@ -59,7 +76,7 @@ def apply(self, data_graph: 'GraphLike') -> int: raise ReportableRuntimeError("Local SPARQLRule iteration exceeded iteration limit of 100.") iterate_limit -= 1 added = 0 - applicable_nodes = self.filter_conditions(focus_nodes, data_graph) + applicable_nodes = self.filter_conditions(focus_list, data_graph) construct_graphs = set() for a in applicable_nodes: for c in self._constructs: diff --git a/pyshacl/rules/triple/__init__.py b/pyshacl/rules/triple/__init__.py index 88979e8..cbc51c3 100644 --- a/pyshacl/rules/triple/__init__.py +++ b/pyshacl/rules/triple/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- import itertools -from typing import TYPE_CHECKING, Tuple, cast +from typing import TYPE_CHECKING, List, Sequence, Tuple, Union, cast import rdflib @@ -10,9 +10,8 @@ from pyshacl.rules.shacl_rule import SHACLRule if TYPE_CHECKING: - from rdflib.term import Node - from pyshacl.pytypes import GraphLike, SHACLExecutor + from pyshacl.pytypes import GraphLike, RDFNode, SHACLExecutor from pyshacl.shape import Shape @@ -50,9 +49,27 @@ def __init__(self, executor: 'SHACLExecutor', shape: 'Shape', rule_node: 'rdflib raise RuntimeError("Too many sh:object") self.o = next(iter(my_object_nodes)) - def apply(self, data_graph: 'GraphLike') -> int: - focus_nodes = self.shape.focus_nodes(data_graph) # uses target nodes to find focus nodes - applicable_nodes = self.filter_conditions(focus_nodes, data_graph) + def apply( + self, + data_graph: 'GraphLike', + focus_nodes: Union[Sequence['RDFNode'], None] = None, + ) -> int: + focus_list: Sequence['RDFNode'] + if focus_nodes is not None: + focus_list = list(focus_nodes) + else: + focus_list = list(self.shape.focus_nodes(data_graph)) + if self.executor.focus_nodes is not None and len(self.executor.focus_nodes) > 0: + filtered_focus_nodes: List[Union[rdflib.URIRef]] = [] + for _fo in focus_list: # type: RDFNode + if isinstance(_fo, rdflib.URIRef) and _fo in self.executor.focus_nodes: + filtered_focus_nodes.append(_fo) + len_filtered_focus = len(filtered_focus_nodes) + if len_filtered_focus < 1: + return 0 + focus_list = filtered_focus_nodes + # uses target nodes to find focus nodes + applicable_nodes = self.filter_conditions(focus_list, data_graph) all_added = 0 iterate_limit = 100 while True: From d300e4a4beaa8351ae1f4e92a0612ce8d5a3b773 Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 2 Oct 2024 20:08:19 +1000 Subject: [PATCH 6/7] In Validator Mode and SHACL Rule Expander mode, allow SHACLRules to be gathered only from the specified SHACL Shapes. Allow SHACL Rules to be applied only to the passed in Focus nodes. --- pyshacl/rule_expand_runner.py | 26 ++++++++++++++------------ pyshacl/validator.py | 18 ++++++++++-------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/pyshacl/rule_expand_runner.py b/pyshacl/rule_expand_runner.py index 13a20da..1cce053 100644 --- a/pyshacl/rule_expand_runner.py +++ b/pyshacl/rule_expand_runner.py @@ -246,20 +246,18 @@ def run(self) -> GraphLike: else: specified_focus_nodes = None executor = self.make_executor() - # Special hack, if we are using manually specified shapes, and have # manually specified focus nodes, then we need to disable the # focus_nodes in the executor, because we apply the specified focus # nodes directly to the specified shapes. if using_manually_specified_shapes and specified_focus_nodes is not None: executor.focus_nodes = None - self.logger.debug("Activating SHACL-AF Features.") target_types = gather_target_types(self.shacl_graph) - advanced = { - 'functions': gather_functions(executor, self.shacl_graph), - 'rules': gather_rules(executor, self.shacl_graph), - } + gather_from_shapes = None if not using_manually_specified_shapes else [s.node for s in shapes] + gathered_functions = gather_functions(executor, self.shacl_graph) + gathered_rules = gather_rules(executor, self.shacl_graph, from_shapes=gather_from_shapes) + for s in shapes: s.set_advanced(True) apply_target_types(target_types) @@ -274,18 +272,22 @@ def run(self) -> GraphLike: ] else: named_graphs = [the_target_graph] + if specified_focus_nodes is not None and using_manually_specified_shapes: + on_focus_nodes = specified_focus_nodes + else: + on_focus_nodes = None if self.debug: self.logger.debug(f"Will run SHACL Rules expansion on {len(named_graphs)} named graph/s.") for g in named_graphs: if self.debug: self.logger.debug(f"Running SHACL Rules on DataGraph named {g.identifier}") - if advanced['functions']: - apply_functions(executor, advanced['functions'], g) + if gathered_functions: + apply_functions(executor, gathered_functions, g) try: - if advanced['rules']: - apply_rules(executor, advanced['rules'], g) + if gathered_rules: + apply_rules(executor, gathered_rules, g, focus_nodes=on_focus_nodes) finally: - if advanced: - unapply_functions(advanced['functions'], g) + if gathered_functions: + unapply_functions(gathered_functions, g) return the_target_graph diff --git a/pyshacl/validator.py b/pyshacl/validator.py index 66fca54..df9d9b9 100644 --- a/pyshacl/validator.py +++ b/pyshacl/validator.py @@ -319,15 +319,17 @@ def run(self): if executor.advanced_mode: self.logger.debug("Activating SHACL-AF Features.") target_types = gather_target_types(self.shacl_graph) + gather_from_shapes = None if not using_manually_specified_shapes else [s.node for s in shapes] advanced = { 'functions': gather_functions(executor, self.shacl_graph), - 'rules': gather_rules(executor, self.shacl_graph), + 'rules': gather_rules(executor, self.shacl_graph, from_shapes=gather_from_shapes), } for s in shapes: s.set_advanced(True) apply_target_types(target_types) else: advanced = {} + if isinstance(the_target_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): named_graphs = [ ( @@ -339,8 +341,11 @@ def run(self): ] else: named_graphs = [the_target_graph] + if specified_focus_nodes is not None and using_manually_specified_shapes: + on_focus_nodes = specified_focus_nodes + else: + on_focus_nodes = None reports = [] - non_conformant = False aborted = False if executor.abort_on_first and self.debug: @@ -359,13 +364,10 @@ def run(self): if executor.sparql_mode: self.logger.warning("Skipping SHACL Rules because operating in SPARQL Remote Graph Mode.") else: - apply_rules(executor, advanced['rules'], g) + apply_rules(executor, advanced['rules'], g, focus_nodes=on_focus_nodes) try: for s in shapes: - if using_manually_specified_shapes and specified_focus_nodes is not None: - _is_conform, _reports = s.validate(executor, g, focus=specified_focus_nodes) - else: - _is_conform, _reports = s.validate(executor, g) + _is_conform, _reports = s.validate(executor, g, focus=on_focus_nodes) non_conformant = non_conformant or (not _is_conform) reports.extend(_reports) if executor.abort_on_first and non_conformant: @@ -374,7 +376,7 @@ def run(self): if aborted: break finally: - if advanced: + if advanced and advanced['functions']: unapply_functions(advanced['functions'], g) v_report, v_text = self.create_validation_report(self.shacl_graph, not non_conformant, reports) return (not non_conformant), v_report, v_text From acc2aa8675824abf6e6323287867048027ed2cd5 Mon Sep 17 00:00:00 2001 From: Ashley Sommer Date: Wed, 2 Oct 2024 20:19:30 +1000 Subject: [PATCH 7/7] Fixes for Type-checking, linting and formatting --- pyshacl/entrypoints.py | 2 +- pyshacl/rule_expand_runner.py | 8 ++++---- pyshacl/rules/triple/__init__.py | 2 +- pyshacl/run_type.py | 2 +- pyshacl/validator.py | 8 ++++---- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pyshacl/entrypoints.py b/pyshacl/entrypoints.py index 6ad8290..bb43222 100644 --- a/pyshacl/entrypoints.py +++ b/pyshacl/entrypoints.py @@ -11,7 +11,7 @@ from pyshacl.errors import ReportableRuntimeError, ValidationFailure from pyshacl.pytypes import GraphLike -from .consts import RDF, SH, RDF_type +from .consts import SH, RDF_type from .monkey import apply_patches, rdflib_bool_patch, rdflib_bool_unpatch from .rdfutil import load_from_source from .rule_expand_runner import RuleExpandRunner diff --git a/pyshacl/rule_expand_runner.py b/pyshacl/rule_expand_runner.py index 1cce053..269deec 100644 --- a/pyshacl/rule_expand_runner.py +++ b/pyshacl/rule_expand_runner.py @@ -2,7 +2,7 @@ # import logging from os import getenv -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Sequence, Union import rdflib from rdflib import URIRef @@ -242,7 +242,7 @@ def run(self) -> GraphLike: expanded_focus_node = URIRef(f) expanded_focus_nodes.append(expanded_focus_node) self.options["focus_nodes"] = expanded_focus_nodes - specified_focus_nodes: Union[None, List[URIRef]] = expanded_focus_nodes + specified_focus_nodes: Union[None, Sequence[URIRef]] = expanded_focus_nodes else: specified_focus_nodes = None executor = self.make_executor() @@ -264,7 +264,7 @@ def run(self) -> GraphLike: if isinstance(the_target_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): named_graphs = [ ( - rdflib.Graph(the_target_graph.store, i, namespace_manager=the_target_graph.namespace_manager) + rdflib.Graph(the_target_graph.store, i, namespace_manager=the_target_graph.namespace_manager) # type: ignore[arg-type] if not isinstance(i, rdflib.Graph) else i ) @@ -273,7 +273,7 @@ def run(self) -> GraphLike: else: named_graphs = [the_target_graph] if specified_focus_nodes is not None and using_manually_specified_shapes: - on_focus_nodes = specified_focus_nodes + on_focus_nodes: Union[Sequence[URIRef], None] = specified_focus_nodes else: on_focus_nodes = None if self.debug: diff --git a/pyshacl/rules/triple/__init__.py b/pyshacl/rules/triple/__init__.py index cbc51c3..1dd25f2 100644 --- a/pyshacl/rules/triple/__init__.py +++ b/pyshacl/rules/triple/__init__.py @@ -92,7 +92,7 @@ def apply( added += 1 if added > 0: for i in to_add: - data_graph.add(cast(Tuple['Node', 'Node', 'Node'], i)) + data_graph.add(cast(Tuple['RDFNode', 'RDFNode', 'RDFNode'], i)) all_added += added if self.iterate: continue # Jump up to iterate diff --git a/pyshacl/run_type.py b/pyshacl/run_type.py index 08b376b..89b4b7b 100644 --- a/pyshacl/run_type.py +++ b/pyshacl/run_type.py @@ -1,4 +1,4 @@ -from abc import ABC, ABCMeta, abstractmethod +from abc import ABCMeta, abstractmethod class PySHACLRunType(metaclass=ABCMeta): diff --git a/pyshacl/validator.py b/pyshacl/validator.py index df9d9b9..762bb22 100644 --- a/pyshacl/validator.py +++ b/pyshacl/validator.py @@ -3,7 +3,7 @@ import logging import sys from os import getenv, path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import rdflib from rdflib import BNode, Literal, URIRef @@ -304,7 +304,7 @@ def run(self): expanded_focus_node = URIRef(f) expanded_focus_nodes.append(expanded_focus_node) self.options["focus_nodes"] = expanded_focus_nodes - specified_focus_nodes: Union[None, List[URIRef]] = expanded_focus_nodes + specified_focus_nodes: Union[None, Sequence[URIRef]] = expanded_focus_nodes else: specified_focus_nodes = None executor = self.make_executor() @@ -333,7 +333,7 @@ def run(self): if isinstance(the_target_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)): named_graphs = [ ( - rdflib.Graph(the_target_graph.store, i, namespace_manager=the_target_graph.namespace_manager) + rdflib.Graph(the_target_graph.store, i, namespace_manager=the_target_graph.namespace_manager) # type: ignore[arg-type] if not isinstance(i, rdflib.Graph) else i ) @@ -342,7 +342,7 @@ def run(self): else: named_graphs = [the_target_graph] if specified_focus_nodes is not None and using_manually_specified_shapes: - on_focus_nodes = specified_focus_nodes + on_focus_nodes: Union[Sequence[URIRef], None] = specified_focus_nodes else: on_focus_nodes = None reports = []