Merge branch 'main' into lucene_snapshot_9_12
ChrisHegarty committed Sep 28, 2024
2 parents 0765efb + 2f192e8 commit 4390ea1
Showing 166 changed files with 3,398 additions and 845 deletions.
15 changes: 0 additions & 15 deletions .buildkite/pull-requests.json
Original file line number Diff line number Diff line change
@@ -14,21 +14,6 @@
"trigger_comment_regex": "(run\\W+elasticsearch-ci.+)|(^\\s*((buildkite|@elastic(search)?machine)\\s*)?test\\s+this(\\s+please)?)",
"cancel_intermediate_builds": true,
"cancel_intermediate_builds_on_comment": false
},
-{
-"enabled": true,
-"pipeline_slug": "elasticsearch-pull-request-check-serverless-submodule",
-"allow_org_users": true,
-"allowed_repo_permissions": [
-"admin",
-"write"
-],
-"set_commit_status": false,
-"build_on_commit": true,
-"build_on_comment": false,
-"labels": [
-"test-update-serverless"
-]
-}
]
}
4 changes: 2 additions & 2 deletions build-tools-internal/gradle/wrapper/gradle-wrapper.properties
@@ -1,7 +1,7 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionSha256Sum=fdfca5dbc2834f0ece5020465737538e5ba679deeff5ab6c09621d67f8bb1a15
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.1-all.zip
+distributionSha256Sum=2ab88d6de2c23e6adae7363ae6e29cbdd2a709e992929b48b6530fd0c7133bd6
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.2-all.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
@@ -169,6 +169,7 @@ if (providers.systemProperty('idea.active').getOrNull() == 'true') {
'-ea',
'-Djava.security.manager=allow',
'-Djava.locale.providers=CLDR',
+'-Dtests.testfeatures.enabled=true',
'-Des.nativelibs.path="' + testLibraryPath + '"',
// TODO: only open these for mockito when it is modularized
'--add-opens=java.base/java.security.cert=ALL-UNNAMED',
@@ -108,6 +108,7 @@ public void execute(Task t) {
"-Xmx" + System.getProperty("tests.heap.size", "512m"),
"-Xms" + System.getProperty("tests.heap.size", "512m"),
"-Djava.security.manager=allow",
+"-Dtests.testfeatures.enabled=true",
"--add-opens=java.base/java.util=ALL-UNNAMED",
// TODO: only open these for mockito when it is modularized
"--add-opens=java.base/java.security.cert=ALL-UNNAMED",
@@ -1 +1 @@
-8.10.1
+8.10.2
5 changes: 5 additions & 0 deletions docs/changelog/111834.yaml
@@ -0,0 +1,5 @@
pr: 111834
summary: Add inner hits support to semantic query
area: Search
type: enhancement
issues: []
6 changes: 6 additions & 0 deletions docs/changelog/113413.yaml
@@ -0,0 +1,6 @@
pr: 113413
summary: Fixed a `NullPointerException` in `_capabilities` API when the `path` parameter is null.
area: Infra/REST API
type: bug
issues:
- 113413
5 changes: 5 additions & 0 deletions docs/changelog/113552.yaml
@@ -0,0 +1,5 @@
pr: 113552
summary: Tag redacted document in ingest metadata
area: Ingest Node
type: enhancement
issues: []
5 changes: 5 additions & 0 deletions docs/changelog/113699.yaml
@@ -0,0 +1,5 @@
pr: 113699
summary: "[ESQL] Fix init value in max float aggregation"
area: ES|QL
type: bug
issues: []
4 changes: 4 additions & 0 deletions docs/reference/cluster/nodes-stats.asciidoc
@@ -1716,6 +1716,10 @@ See <<disk-based-shard-allocation>> for more information about disk watermarks a
`io_stats` (Linux only)::
(objects) Contains I/O statistics for the node.
NOTE: These statistics are derived from the `/proc/diskstats` kernel interface.
This interface accounts for IO performed by all processes on the system, even
if you are running {es} within a container.
+
.Properties of `io_stats`
[%collapsible%open]
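
The `io_stats` object documented above can be inspected directly via the nodes stats API; for example, the following request (illustrative, restricted to the filesystem section) returns the I/O statistics for all nodes:

[source,console]
------------------------------------------------------------
GET _nodes/stats/fs
------------------------------------------------------------

On Linux, the response includes the `fs.io_stats` section derived from `/proc/diskstats` as described in the note above.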
1 change: 1 addition & 0 deletions docs/reference/ingest/processors/redact.asciidoc
@@ -39,6 +39,7 @@ patterns. Legacy Grok patterns are not supported.
| `ignore_missing` | no | `true` | If `true` and `field` does not exist or is `null`, the processor quietly exits without modifying the document
include::common-options.asciidoc[]
| `skip_if_unlicensed` | no | `false` | If `true` and the current license does not support running redact processors, then the processor quietly exits without modifying the document
+| `trace_redact` | no | `false` | If `true` then ingest metadata `_ingest._redact._is_redacted` is set to `true` if the document has been redacted
|======

In this example the predefined `IP` Grok pattern is used to match
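
A minimal sketch of a pipeline that enables the new `trace_redact` option (the pipeline name, field, and pattern here are illustrative, not from the commit):

[source,console]
------------------------------------------------------------
PUT _ingest/pipeline/redact-client-ip
{
  "processors": [
    {
      "redact": {
        "field": "message",
        "patterns": ["%{IP:client}"],
        "trace_redact": true
      }
    }
  ]
}
------------------------------------------------------------
// TEST[skip: Illustrative sketch]

With this configuration, a document whose `message` field is redacted would have `_ingest._redact._is_redacted` set to `true` in its ingest metadata.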
216 changes: 208 additions & 8 deletions docs/reference/query-dsl/semantic-query.asciidoc
@@ -25,7 +25,7 @@ GET my-index-000001/_search
}
}
------------------------------------------------------------
-// TEST[skip:TBD]
+// TEST[skip: Requires inference endpoints]


[discrete]
@@ -40,9 +40,209 @@ The `semantic_text` field to perform the query on.
(Required, string)
The query text to be searched for on the field.

`inner_hits`::
(Optional, object)
Retrieves the specific passages that match the query.
See <<semantic-query-passage-ranking, passage ranking with the `semantic` query>> for more information.
+
.Properties of `inner_hits`
[%collapsible%open]
====
`from`::
(Optional, integer)
The offset from the first matching passage to fetch.
Used to paginate through the passages.
Defaults to `0`.
`size`::
(Optional, integer)
The maximum number of matching passages to return.
Defaults to `3`.
====

Refer to <<semantic-search-semantic-text,this tutorial>> to learn more about semantic search using `semantic_text` and `semantic` query.

[discrete]
[[semantic-query-passage-ranking]]
==== Passage ranking with the `semantic` query
The `inner_hits` parameter can be used for _passage ranking_, which allows you to determine which passages in the document best match the query.
For example, if you have a document that covers varying topics:

[source,console]
------------------------------------------------------------
POST my-index/_doc/lake_tahoe
{
"inference_field": [
"Lake Tahoe is the largest alpine lake in North America",
"When hiking in the area, please be on alert for bears"
]
}
------------------------------------------------------------
// TEST[skip: Requires inference endpoints]

You can use passage ranking to find the passage that best matches your query:

[source,console]
------------------------------------------------------------
GET my-index/_search
{
"query": {
"semantic": {
"field": "inference_field",
"query": "mountain lake",
"inner_hits": { }
}
}
}
------------------------------------------------------------
// TEST[skip: Requires inference endpoints]

[source,console-result]
------------------------------------------------------------
{
"took": 67,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 10.844536,
"hits": [
{
"_index": "my-index",
"_id": "lake_tahoe",
"_score": 10.844536,
"_source": {
...
},
"inner_hits": { <1>
"inference_field": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 10.844536,
"hits": [
{
"_index": "my-index",
"_id": "lake_tahoe",
"_nested": {
"field": "inference_field.inference.chunks",
"offset": 0
},
"_score": 10.844536,
"_source": {
"text": "Lake Tahoe is the largest alpine lake in North America"
}
},
{
"_index": "my-index",
"_id": "lake_tahoe",
"_nested": {
"field": "inference_field.inference.chunks",
"offset": 1
},
"_score": 3.2726858,
"_source": {
"text": "When hiking in the area, please be on alert for bears"
}
}
]
}
}
}
}
]
}
}
------------------------------------------------------------
<1> Ranked passages will be returned using the <<inner-hits,`inner_hits` response format>>, with `<inner_hits_name>` set to the `semantic_text` field name.

By default, the top three matching passages will be returned.
You can use the `size` parameter to control the number of passages returned and the `from` parameter to page through the matching passages:

[source,console]
------------------------------------------------------------
GET my-index/_search
{
"query": {
"semantic": {
"field": "inference_field",
"query": "mountain lake",
"inner_hits": {
"from": 1,
"size": 1
}
}
}
}
------------------------------------------------------------
// TEST[skip: Requires inference endpoints]

[source,console-result]
------------------------------------------------------------
{
"took": 42,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": 10.844536,
"hits": [
{
"_index": "my-index",
"_id": "lake_tahoe",
"_score": 10.844536,
"_source": {
...
},
"inner_hits": {
"inference_field": {
"hits": {
"total": {
"value": 2,
"relation": "eq"
},
"max_score": 10.844536,
"hits": [
{
"_index": "my-index",
"_id": "lake_tahoe",
"_nested": {
"field": "inference_field.inference.chunks",
"offset": 1
},
"_score": 3.2726858,
"_source": {
"text": "When hiking in the area, please be on alert for bears"
}
}
]
}
}
}
}
]
}
}
------------------------------------------------------------

[discrete]
[[hybrid-search-semantic]]
==== Hybrid search with the `semantic` query
@@ -79,7 +79,7 @@ POST my-index/_search
}
}
------------------------------------------------------------
-// TEST[skip:TBD]
+// TEST[skip: Requires inference endpoints]

You can also use semantic_text as part of <<rrf,Reciprocal Rank Fusion>> to make ranking relevant results easier:

@@ -116,12 +316,12 @@ GET my-index/_search
}
}
------------------------------------------------------------
-// TEST[skip:TBD]
+// TEST[skip: Requires inference endpoints]


[discrete]
[[advanced-search]]
-=== Advanced search on `semantic_text` fields
+==== Advanced search on `semantic_text` fields

The `semantic` query uses default settings for searching on `semantic_text` fields for ease of use.
If you want to fine-tune a search on a `semantic_text` field, you need to know the task type used by the `inference_id` configured in `semantic_text`.
@@ -135,7 +335,7 @@ on a `semantic_text` field, it is not supported to use the `semantic_query` on a

[discrete]
[[search-sparse-inference]]
-==== Search with `sparse_embedding` inference
+===== Search with `sparse_embedding` inference

When the {infer} endpoint uses a `sparse_embedding` model, you can use a <<query-dsl-sparse-vector-query,`sparse_vector` query>> on a <<semantic-text,`semantic_text`>> field in the following way:

@@ -157,14 +357,14 @@ GET test-index/_search
}
}
------------------------------------------------------------
-// TEST[skip:TBD]
+// TEST[skip: Requires inference endpoints]

You can customize the `sparse_vector` query to include specific settings, like <<sparse-vector-query-with-pruning-config-and-rescore-example,pruning configuration>>.
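
As a sketch, pruning might be configured on the `sparse_vector` query like this (the index, field, endpoint name, and threshold values are illustrative; consult the pruning configuration documentation for the exact parameters):

[source,console]
------------------------------------------------------------
GET test-index/_search
{
  "query": {
    "sparse_vector": {
      "field": "inference_field",
      "inference_id": "my-inference-endpoint",
      "query": "mountain lake",
      "prune": true,
      "pruning_config": {
        "tokens_freq_ratio_threshold": 5,
        "tokens_weight_threshold": 0.4,
        "only_score_pruned_tokens": false
      }
    }
  }
}
------------------------------------------------------------
// TEST[skip: Illustrative sketch]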


[discrete]
[[search-text-inferece]]
-==== Search with `text_embedding` inference
+===== Search with `text_embedding` inference

When the {infer} endpoint uses a `text_embedding` model, you can use a <<query-dsl-knn-query,`knn` query>> on a `semantic_text` field in the following way:

@@ -190,6 +390,6 @@ GET test-index/_search
}
}
------------------------------------------------------------
-// TEST[skip:TBD]
+// TEST[skip: Requires inference endpoints]

You can customize the `knn` query to include specific settings, like `num_candidates` and `k`.
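
For example, a sketch of a customized `knn` query with explicit `k` and `num_candidates` (the nested field path, model ID, and values here are illustrative assumptions, not from the commit):

[source,console]
------------------------------------------------------------
GET test-index/_search
{
  "query": {
    "nested": {
      "path": "inference_field.inference.chunks",
      "query": {
        "knn": {
          "field": "inference_field.inference.chunks.embeddings",
          "k": 10,
          "num_candidates": 100,
          "query_vector_builder": {
            "text_embedding": {
              "model_id": "my-text-embedding-model",
              "model_text": "mountain lake"
            }
          }
        }
      }
    }
  }
}
------------------------------------------------------------
// TEST[skip: Illustrative sketch]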
2 changes: 1 addition & 1 deletion docs/reference/snapshot-restore/repository-s3.asciidoc
@@ -378,7 +378,7 @@ If you use a Glacier storage class, or another unsupported storage class, or
object expiry, then you may permanently lose access to your repository
contents.

-You may use the `intellligent_tiering` storage class to automatically manage
+You may use the `intelligent_tiering` storage class to automatically manage
the class of objects, but you must not enable the optional Archive Access or
Deep Archive Access tiers. If you use these tiers then you may permanently lose
access to your repository contents.
4 changes: 2 additions & 2 deletions gradle/wrapper/gradle-wrapper.properties
@@ -1,7 +1,7 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionSha256Sum=fdfca5dbc2834f0ece5020465737538e5ba679deeff5ab6c09621d67f8bb1a15
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.1-all.zip
+distributionSha256Sum=2ab88d6de2c23e6adae7363ae6e29cbdd2a709e992929b48b6530fd0c7133bd6
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.2-all.zip
networkTimeout=10000
validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME