From 3d5a20f854a02ff6e32b5c05cb18d1ecef12a3ea Mon Sep 17 00:00:00 2001 From: Sam Xiao Date: Fri, 27 Sep 2024 12:24:24 -0400 Subject: [PATCH] Tag redacted document in ingest pipeline (#113552) Adds a new option trace_redact in redact processor to indicate a document has been redacted in the ingest pipeline. If a document is processed by a redact processor AND any field is redacted, ingest metadata _ingest._redact._is_redacted = true will be set. Closes #94633 --- docs/changelog/113552.yaml | 5 + .../ingest/processors/redact.asciidoc | 1 + .../xpack/redact/RedactProcessor.java | 34 +++++- .../redact/RedactProcessorFactoryTests.java | 1 + .../xpack/redact/RedactProcessorTests.java | 115 +++++++++++++++++- .../test/redact/10_redact_processor.yml | 24 +++- 6 files changed, 176 insertions(+), 4 deletions(-) create mode 100644 docs/changelog/113552.yaml diff --git a/docs/changelog/113552.yaml b/docs/changelog/113552.yaml new file mode 100644 index 0000000000000..48f7da309e82e --- /dev/null +++ b/docs/changelog/113552.yaml @@ -0,0 +1,5 @@ +pr: 113552 +summary: Tag redacted document in ingest metadata +area: Ingest Node +type: enhancement +issues: [] diff --git a/docs/reference/ingest/processors/redact.asciidoc b/docs/reference/ingest/processors/redact.asciidoc index 6706106e92655..9b8ac1e15d1a8 100644 --- a/docs/reference/ingest/processors/redact.asciidoc +++ b/docs/reference/ingest/processors/redact.asciidoc @@ -39,6 +39,7 @@ patterns. Legacy Grok patterns are not supported. 
| `ignore_missing` | no | `true` | If `true` and `field` does not exist or is `null`, the processor quietly exits without modifying the document include::common-options.asciidoc[] | `skip_if_unlicensed` | no | `false` | If `true` and the current license does not support running redact processors, then the processor quietly exits without modifying the document +| `trace_redact` | no | `false` | If `true`, the ingest metadata field `_ingest._redact._is_redacted` is set to `true` when the processor redacts the document |====== In this example the predefined `IP` Grok pattern is used to match diff --git a/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java b/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java index 04a423c7ea330..187126fb31e3e 100644 --- a/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java +++ b/x-pack/plugin/redact/src/main/java/org/elasticsearch/xpack/redact/RedactProcessor.java @@ -55,6 +55,12 @@ public class RedactProcessor extends AbstractProcessor { private static final String DEFAULT_REDACTED_START = "<"; private static final String DEFAULT_REDACTED_END = ">"; + protected static final String REDACT_KEY = "_redact"; + protected static final String IS_REDACTED_KEY = "_is_redacted"; + protected static final String METADATA_PATH_REDACT = IngestDocument.INGEST_KEY + "." + REDACT_KEY; + // indicates if document has been redacted, path: _ingest._redact._is_redacted + protected static final String METADATA_PATH_REDACT_IS_REDACTED = METADATA_PATH_REDACT + "." 
+ IS_REDACTED_KEY; + private final String redactField; private final List<Grok> groks; private final boolean ignoreMissing; @@ -65,6 +71,8 @@ public class RedactProcessor extends AbstractProcessor { private final XPackLicenseState licenseState; private final boolean skipIfUnlicensed; + private final boolean traceRedact; + RedactProcessor( String tag, String description, @@ -76,7 +84,8 @@ public class RedactProcessor extends AbstractProcessor { String redactedEndToken, MatcherWatchdog matcherWatchdog, XPackLicenseState licenseState, - boolean skipIfUnlicensed + boolean skipIfUnlicensed, + boolean traceRedact ) { super(tag, description); this.redactField = redactField; @@ -94,6 +103,7 @@ public class RedactProcessor extends AbstractProcessor { } this.licenseState = licenseState; this.skipIfUnlicensed = skipIfUnlicensed; + this.traceRedact = traceRedact; } @Override @@ -128,6 +138,8 @@ public IngestDocument execute(IngestDocument ingestDocument) { try { String redacted = matchRedact(fieldValue, groks, redactedStartToken, redactedEndToken); ingestDocument.setFieldValue(redactField, redacted); + updateMetadataIfNecessary(ingestDocument, fieldValue, redacted); + return ingestDocument; } catch (RuntimeException e) { // grok throws a RuntimeException when the watchdog interrupts the match @@ -203,6 +215,21 @@ private static void matchRepeat(Grok grok, byte[] utf8Bytes, RegionTrackingMatch } while (offset != length); } + private void updateMetadataIfNecessary(IngestDocument ingestDocument, String fieldValue, String redacted) { + if (traceRedact == false || fieldValue == null) { + return; + } + + Boolean isRedactedMetadata = ingestDocument.getFieldValue(METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true); + boolean alreadyRedacted = Boolean.TRUE.equals(isRedactedMetadata); + boolean isRedacted = fieldValue.equals(redacted) == false; + + // document newly redacted + if (alreadyRedacted == false && isRedacted) { + ingestDocument.setFieldValue(METADATA_PATH_REDACT_IS_REDACTED, 
true); + } + } + /** * A Grok capture extractor which tracks matched regions * and the Grok pattern name for redaction later. @@ -389,6 +416,8 @@ public RedactProcessor create( String redactStart = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "prefix", DEFAULT_REDACTED_START); String redactEnd = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "suffix", DEFAULT_REDACTED_END); + boolean traceRedact = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_redact", false); + if (matchPatterns == null || matchPatterns.isEmpty()) { throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty"); } @@ -406,7 +435,8 @@ public RedactProcessor create( redactEnd, matcherWatchdog, licenseState, - skipIfUnlicensed + skipIfUnlicensed, + traceRedact ); } catch (Exception e) { throw newConfigurationException( diff --git a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java index 376e7caa8137d..affcc72614aa8 100644 --- a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java +++ b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorFactoryTests.java @@ -68,6 +68,7 @@ public void testConfigKeysRemoved() throws Exception { config.put("patterns", List.of("%{MY_PATTERN:name}!")); config.put("pattern_definitions", Map.of("MY_PATTERN", "foo")); config.put("ignore_missing", true); + config.put("trace_redact", true); config.put("extra", "unused"); factory.create(null, null, null, config); diff --git a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java index a775adb7a4c15..3f44957201ef0 100644 --- 
a/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java +++ b/x-pack/plugin/redact/src/test/java/org/elasticsearch/xpack/redact/RedactProcessorTests.java @@ -259,7 +259,8 @@ public void testLicenseChecks() throws Exception { ">", MatcherWatchdog.noop(), notAllowed, - false // set skip_if_unlicensed to false, we do not want to skip, we do want to fail + false, // set skip_if_unlicensed to false, we do not want to skip, we do want to fail + false ); assertThat(processor.getSkipIfUnlicensed(), equalTo(false)); var ingestDoc = createIngestDoc(Map.of("not_the_field", "fieldValue")); @@ -314,6 +315,118 @@ public void testLicenseChanges() throws Exception { } } + @SuppressWarnings("unchecked") + public void testTraceRedact() throws Exception { + var config = new HashMap<String, Object>(); + config.put("field", "to_redact"); + config.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); + config.put("trace_redact", true); + { + var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t", + "d", + new HashMap<>(config) + ); + var message = "this should not be redacted"; + var ingestDoc = createIngestDoc(Map.of("to_redact", message)); + var redactedDoc = processor.execute(ingestDoc); + + assertEquals(message, redactedDoc.getFieldValue("to_redact", String.class)); + assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); + } + { + var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t", + "d", + new HashMap<>(config) + ); + var ingestDoc = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + var redactedDoc = processor.execute(ingestDoc); + + assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class)); + // validate ingest metadata path correctly resolved + 
assertTrue(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + // validate ingest metadata structure correct + var ingestMeta = redactedDoc.getIngestMetadata(); + assertTrue(ingestMeta.containsKey(RedactProcessor.REDACT_KEY)); + var redactMetadata = (HashMap<String, Object>) ingestMeta.get(RedactProcessor.REDACT_KEY); + assertTrue(redactMetadata.containsKey(RedactProcessor.IS_REDACTED_KEY)); + assertTrue((Boolean) redactMetadata.get(RedactProcessor.IS_REDACTED_KEY)); + } + { + var configNoTrace = new HashMap<String, Object>(); + configNoTrace.put("field", "to_redact"); + configNoTrace.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); + + var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(null, "t", "d", configNoTrace); + var ingestDoc = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + var redactedDoc = processor.execute(ingestDoc); + + assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class)); + assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); + } + } + + public void testTraceRedactMultipleProcessors() throws Exception { + var configRedact = new HashMap<String, Object>(); + configRedact.put("field", "to_redact"); + configRedact.put("patterns", List.of("%{EMAILADDRESS:REDACTED}")); + configRedact.put("trace_redact", true); + + var configNoRedact = new HashMap<String, Object>(); + configNoRedact.put("field", "to_redact"); + configNoRedact.put("patterns", List.of("%{IP:REDACTED}")); // not in the doc + configNoRedact.put("trace_redact", true); + + // first processor does not redact doc, second one does + { + var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t1", + "d", + new HashMap<>(configRedact) + ); + var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t2", + "d", + new 
HashMap<>(configNoRedact) + ); + var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + + var docNotRedacted = processorNoRedact.execute(ingestDocWithEmail); + assertNull(docNotRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true)); + + var docRedacted = processorRedact.execute(docNotRedacted); + assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + } + // first processor redacts doc, second one does not + { + var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t1", + "d", + new HashMap<>(configRedact) + ); + var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create( + null, + "t2", + "d", + new HashMap<>(configNoRedact) + ); + var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "thisisanemail@address.com will be redacted")); + + var docRedacted = processorRedact.execute(ingestDocWithEmail); + assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + + // validate does not override already redacted doc metadata + var docRedactedAlready = processorNoRedact.execute(docRedacted); + assertTrue(docRedactedAlready.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class)); + } + } + public void testMergeLongestRegion() { var r = List.of( new RedactProcessor.RegionTrackingMatchExtractor.Replacement(10, 20, "first"), diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml index 559d87879faad..e864d191a3ec1 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/redact/10_redact_processor.yml @@ 
-24,7 +24,7 @@ index: test id: "1" pipeline: "pipeline-using-a-redact-processor" - body: {to_redact: "0.0.0.1 is my secret IP to redact"} + body: { to_redact: "0.0.0.1 is my secret IP to redact" } - do: get: @@ -96,3 +96,25 @@ } - length: { docs: 1 } - match: { docs.0.doc._source.to_redact: "==*EMAIL*== will be redacted" } +--- +"Test redact with trace_redact": + - do: + ingest.simulate: + body: > + { + "pipeline": { + "processors": [ + { + "redact": { + "field": "to_redact", + "patterns": ["%{EMAILADDRESS:EMAIL}", "%{IP:IP_ADDRESS}"], + "trace_redact": true + } + } + ] + }, + "docs": [{"_source": {"to_redact": "this-email@address.com will be redacted"}}] + } + - length: { docs: 1 } + - match: { docs.0.doc._source.to_redact: "<EMAIL> will be redacted" } + - match: { docs.0.doc._ingest._redact._is_redacted: true }