Skip to content

Commit

Permalink
Tag redacted document in ingest pipeline (#113552)
Browse files Browse the repository at this point in the history
Adds a new option trace_redact in redact processor to indicate a document has been redacted in the ingest pipeline. If a document is processed by a redact processor AND any field is redacted, ingest metadata _ingest._redact._is_redacted = true will be set.

Closes #94633
  • Loading branch information
samxbr committed Sep 29, 2024
1 parent 2fc2d05 commit 3d5a20f
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 4 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/113552.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 113552
summary: Tag redacted document in ingest metadata
area: Ingest Node
type: enhancement
issues: []
1 change: 1 addition & 0 deletions docs/reference/ingest/processors/redact.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ patterns. Legacy Grok patterns are not supported.
| `ignore_missing` | no | `true` | If `true` and `field` does not exist or is `null`, the processor quietly exits without modifying the document
include::common-options.asciidoc[]
| `skip_if_unlicensed` | no | `false` | If `true` and the current license does not support running redact processors, then the processor quietly exits without modifying the document
| `trace_redact` | no | `false` | If `true` then ingest metadata `_ingest._redact._is_redacted` is set to `true` if the document has been redacted
|======

In this example the predefined `IP` Grok pattern is used to match
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ public class RedactProcessor extends AbstractProcessor {
private static final String DEFAULT_REDACTED_START = "<";
private static final String DEFAULT_REDACTED_END = ">";

protected static final String REDACT_KEY = "_redact";
protected static final String IS_REDACTED_KEY = "_is_redacted";
protected static final String METADATA_PATH_REDACT = IngestDocument.INGEST_KEY + "." + REDACT_KEY;
// indicates if document has been redacted, path: _ingest._redact._is_redacted
protected static final String METADATA_PATH_REDACT_IS_REDACTED = METADATA_PATH_REDACT + "." + IS_REDACTED_KEY;

private final String redactField;
private final List<Grok> groks;
private final boolean ignoreMissing;
Expand All @@ -65,6 +71,8 @@ public class RedactProcessor extends AbstractProcessor {
private final XPackLicenseState licenseState;
private final boolean skipIfUnlicensed;

private final boolean traceRedact;

RedactProcessor(
String tag,
String description,
Expand All @@ -76,7 +84,8 @@ public class RedactProcessor extends AbstractProcessor {
String redactedEndToken,
MatcherWatchdog matcherWatchdog,
XPackLicenseState licenseState,
boolean skipIfUnlicensed
boolean skipIfUnlicensed,
boolean traceRedact
) {
super(tag, description);
this.redactField = redactField;
Expand All @@ -94,6 +103,7 @@ public class RedactProcessor extends AbstractProcessor {
}
this.licenseState = licenseState;
this.skipIfUnlicensed = skipIfUnlicensed;
this.traceRedact = traceRedact;
}

@Override
Expand Down Expand Up @@ -128,6 +138,8 @@ public IngestDocument execute(IngestDocument ingestDocument) {
try {
String redacted = matchRedact(fieldValue, groks, redactedStartToken, redactedEndToken);
ingestDocument.setFieldValue(redactField, redacted);
updateMetadataIfNecessary(ingestDocument, fieldValue, redacted);

return ingestDocument;
} catch (RuntimeException e) {
// grok throws a RuntimeException when the watchdog interrupts the match
Expand Down Expand Up @@ -203,6 +215,21 @@ private static void matchRepeat(Grok grok, byte[] utf8Bytes, RegionTrackingMatch
} while (offset != length);
}

private void updateMetadataIfNecessary(IngestDocument ingestDocument, String fieldValue, String redacted) {
if (traceRedact == false || fieldValue == null) {
return;
}

Boolean isRedactedMetadata = ingestDocument.getFieldValue(METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true);
boolean alreadyRedacted = Boolean.TRUE.equals(isRedactedMetadata);
boolean isRedacted = fieldValue.equals(redacted) == false;

// document newly redacted
if (alreadyRedacted == false && isRedacted) {
ingestDocument.setFieldValue(METADATA_PATH_REDACT_IS_REDACTED, true);
}
}

/**
* A Grok capture extractor which tracks matched regions
* and the Grok pattern name for redaction later.
Expand Down Expand Up @@ -389,6 +416,8 @@ public RedactProcessor create(
String redactStart = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "prefix", DEFAULT_REDACTED_START);
String redactEnd = ConfigurationUtils.readStringProperty(TYPE, processorTag, config, "suffix", DEFAULT_REDACTED_END);

boolean traceRedact = ConfigurationUtils.readBooleanProperty(TYPE, processorTag, config, "trace_redact", false);

if (matchPatterns == null || matchPatterns.isEmpty()) {
throw newConfigurationException(TYPE, processorTag, "patterns", "List of patterns must not be empty");
}
Expand All @@ -406,7 +435,8 @@ public RedactProcessor create(
redactEnd,
matcherWatchdog,
licenseState,
skipIfUnlicensed
skipIfUnlicensed,
traceRedact
);
} catch (Exception e) {
throw newConfigurationException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ public void testConfigKeysRemoved() throws Exception {
config.put("patterns", List.of("%{MY_PATTERN:name}!"));
config.put("pattern_definitions", Map.of("MY_PATTERN", "foo"));
config.put("ignore_missing", true);
config.put("trace_redact", true);
config.put("extra", "unused");

factory.create(null, null, null, config);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ public void testLicenseChecks() throws Exception {
">",
MatcherWatchdog.noop(),
notAllowed,
false // set skip_if_unlicensed to false, we do not want to skip, we do want to fail
false, // set skip_if_unlicensed to false, we do not want to skip, we do want to fail
false
);
assertThat(processor.getSkipIfUnlicensed(), equalTo(false));
var ingestDoc = createIngestDoc(Map.of("not_the_field", "fieldValue"));
Expand Down Expand Up @@ -314,6 +315,118 @@ public void testLicenseChanges() throws Exception {
}
}

@SuppressWarnings("unchecked")
public void testTraceRedact() throws Exception {
var config = new HashMap<String, Object>();
config.put("field", "to_redact");
config.put("patterns", List.of("%{EMAILADDRESS:REDACTED}"));
config.put("trace_redact", true);
{
var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
null,
"t",
"d",
new HashMap<>(config)
);
var message = "this should not be redacted";
var ingestDoc = createIngestDoc(Map.of("to_redact", message));
var redactedDoc = processor.execute(ingestDoc);

assertEquals(message, redactedDoc.getFieldValue("to_redact", String.class));
assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true));
}
{
var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
null,
"t",
"d",
new HashMap<>(config)
);
var ingestDoc = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted"));
var redactedDoc = processor.execute(ingestDoc);

assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class));
// validate ingest metadata path correctly resolved
assertTrue(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));
// validate ingest metadata structure correct
var ingestMeta = redactedDoc.getIngestMetadata();
assertTrue(ingestMeta.containsKey(RedactProcessor.REDACT_KEY));
var redactMetadata = (HashMap<String, Object>) ingestMeta.get(RedactProcessor.REDACT_KEY);
assertTrue(redactMetadata.containsKey(RedactProcessor.IS_REDACTED_KEY));
assertTrue((Boolean) redactMetadata.get(RedactProcessor.IS_REDACTED_KEY));
}
{
var configNoTrace = new HashMap<String, Object>();
configNoTrace.put("field", "to_redact");
configNoTrace.put("patterns", List.of("%{EMAILADDRESS:REDACTED}"));

var processor = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(null, "t", "d", configNoTrace);
var ingestDoc = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted"));
var redactedDoc = processor.execute(ingestDoc);

assertEquals("<REDACTED> will be redacted", redactedDoc.getFieldValue("to_redact", String.class));
assertNull(redactedDoc.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true));
}
}

public void testTraceRedactMultipleProcessors() throws Exception {
var configRedact = new HashMap<String, Object>();
configRedact.put("field", "to_redact");
configRedact.put("patterns", List.of("%{EMAILADDRESS:REDACTED}"));
configRedact.put("trace_redact", true);

var configNoRedact = new HashMap<String, Object>();
configNoRedact.put("field", "to_redact");
configNoRedact.put("patterns", List.of("%{IP:REDACTED}")); // not in the doc
configNoRedact.put("trace_redact", true);

// first processor does not redact doc, second one does
{
var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
null,
"t1",
"d",
new HashMap<>(configRedact)
);
var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
null,
"t2",
"d",
new HashMap<>(configNoRedact)
);
var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted"));

var docNotRedacted = processorNoRedact.execute(ingestDocWithEmail);
assertNull(docNotRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class, true));

var docRedacted = processorRedact.execute(docNotRedacted);
assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));
}
// first processor redacts doc, second one does not
{
var processorRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
null,
"t1",
"d",
new HashMap<>(configRedact)
);
var processorNoRedact = new RedactProcessor.Factory(mockLicenseState(), MatcherWatchdog.noop()).create(
null,
"t2",
"d",
new HashMap<>(configNoRedact)
);
var ingestDocWithEmail = createIngestDoc(Map.of("to_redact", "[email protected] will be redacted"));

var docRedacted = processorRedact.execute(ingestDocWithEmail);
assertTrue(docRedacted.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));

// validate does not override already redacted doc metadata
var docRedactedAlready = processorNoRedact.execute(docRedacted);
assertTrue(docRedactedAlready.getFieldValue(RedactProcessor.METADATA_PATH_REDACT_IS_REDACTED, Boolean.class));
}
}

public void testMergeLongestRegion() {
var r = List.of(
new RedactProcessor.RegionTrackingMatchExtractor.Replacement(10, 20, "first"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
index: test
id: "1"
pipeline: "pipeline-using-a-redact-processor"
body: {to_redact: "0.0.0.1 is my secret IP to redact"}
body: { to_redact: "0.0.0.1 is my secret IP to redact" }

- do:
get:
Expand Down Expand Up @@ -96,3 +96,25 @@
}
- length: { docs: 1 }
- match: { docs.0.doc._source.to_redact: "==*EMAIL*== will be redacted" }
---
"Test redact with trace_redact":
- do:
ingest.simulate:
body: >
{
"pipeline": {
"processors": [
{
"redact": {
"field": "to_redact",
"patterns": ["%{EMAILADDRESS:EMAIL}", "%{IP:IP_ADDRESS}"],
"trace_redact": true
}
}
]
},
"docs": [{"_source": {"to_redact": "[email protected] will be redacted"}}]
}
- length: { docs: 1 }
- match: { docs.0.doc._source.to_redact: "<EMAIL> will be redacted" }
- match: { docs.0.doc._ingest._redact._is_redacted: true }

0 comments on commit 3d5a20f

Please sign in to comment.