elastic · quux00 · Sep 30, 2024 · Sep 6, 2024 · Sep 6, 2024 · Sep 6, 2024
diff --git a/docs/changelog/112595.yaml b/docs/changelog/112595.yaml
@@ -0,0 +1,6 @@
+pr: 112595
+summary: Collect and display execution metadata for ES|QL cross cluster searches
+area: ES|QL
+type: enhancement
+issues:
+ - 112402
diff --git a/docs/reference/esql/esql-across-clusters.asciidoc b/docs/reference/esql/esql-across-clusters.asciidoc
@@ -85,7 +85,7 @@ POST /_security/role/remote1
       "privileges": [ "read","read_cross_cluster" ], <4>
       "clusters" : ["my_remote_cluster"] <5>
     }
-  ], 
+  ],
    "remote_cluster": [ <6>
         {
             "privileges": [
@@ -174,6 +174,184 @@ FROM *:my-index-000001
 | LIMIT 10
 ----
 
+[discrete]
+[[ccq-cluster-details]]
+==== Cross-cluster metadata
+
+ES|QL {ccs} responses include metadata about the search on each cluster when the response format is JSON.
+Here we show an example using the async search endpoint. {ccs-cap} metadata is also present in the synchronous
+search endpoint.
+
+[source,esql]
+----
+POST /_query/async?format=json
+{
+  "query": """
+    FROM my-index-000001,cluster_one:my-index-000001,cluster_two:my-index*
+    | KEEP author, name, page_count
+    | SORT page_count DESC
+    | LIMIT 50
+  """
+}
+----
+
+Which returns:
+
+[source,console-result]
+----
+{
+  "is_running": false,
+  "took": 42,  <1>
+  "columns": [
+     ... // not shown
+  ],
+  "values": [
+     ... // not shown
+  ],
+  "_clusters": {  <2>
+    "total": 3,
+    "successful": 3,
+    "running": 0,
+    "skipped": 0,
+    "partial": 0,
+    "failed": 0,
+    "details": { <3>
+      "(local)": { <4>
+        "status": "successful",
+        "indices": "my-index-000001",
+        "took": 36,  <5>
+        "_shards": { <6>
+          "total": 13,
+          "successful": 13,
+          "skipped": 0,
+          "failed": 0
+        }
+      },
+      "cluster_one": {
+        "status": "successful",
+        "indices": "cluster_one:my-index-000001",
+        "took": 38,
+        "_shards": {
+          "total": 4,
+          "successful": 4,
+          "skipped": 0,
+          "failed": 0
+        }
+      },
+      "cluster_two": {
+        "status": "successful",
+        "indices": "cluster_two:my-index-000001", <7>
+        "took": 41,
+        "_shards": {
+          "total": 18,
+          "successful": 18,
+          "skipped": 1,
+          "failed": 0
+        }
+      }
+    }
+  }
+}
+----
+// TEST[skip: cross-cluster testing env not set up]
+
+<1> How long the entire search (across all clusters) took, in milliseconds.
+<2> This section of counters shows all possible cluster search states and how many cluster
+searches are currently in each state. A cluster can have one of the following statuses: *running*,
+*successful* (searches on all shards were successful), *partial* (the search on at least
+one shard of the cluster was successful and the search on at least one shard failed), *skipped* (the search
+failed on a cluster marked with `skip_unavailable`=`true`) or *failed* (the search
+failed on a cluster marked with `skip_unavailable`=`false`).
+<3> The `_clusters/details` section shows metadata about the search on each cluster.
+<4> If you included indices from the local cluster you sent the request to in your {ccs},
+it is identified as "(local)".
+<5> How long (in milliseconds) the search took on each cluster. This can be useful to determine
+which clusters have slower response times than others.
+<6> The shard details for the search on that cluster, including a count of shards that were
+skipped because the can-match phase indicated that they had no matching data, so they did not
+need to be included in the full {esql} query.
+<7> The index expression supplied by the user. If you provide a wildcard such as `my-index*`,
+this section shows the resolved index name(s), unless no matching indices could
+be found on that cluster, in which case the wildcard expression is retained.
+
+
+The cross-cluster metadata can be used to determine whether any data came back from a cluster.
+For instance, in the query below, the wildcard expression for `cluster_two` did not
+resolve to a concrete index (or indices) and the total number of shards searched is
+zero. This indicates that no matching index was found on that cluster. Because the other
+cluster did have a matching index, the search did not return an error, but instead
+returned all the matching data it could find.
+
+[source,esql]
+----
+POST /_query/async?format=json
+{
+  "query": """
+    FROM cluster_one:my-index*,cluster_two:logs*
+    | KEEP author, name, page_count
+    | SORT page_count DESC
+    | LIMIT 5
+  """
+}
+----
+
+Which returns:
+
+[source,console-result]
+----
+{
+  "is_running": false,
+  "took": 55,
+  "columns": [
+     ... // not shown
+  ],
+  "values": [
+     ... // not shown
+  ],
+  "_clusters": {
+    "total": 2,
+    "successful": 2,
+    "running": 0,
+    "skipped": 0,
+    "partial": 0,
+    "failed": 0,
+    "details": {
+      "cluster_one": {
+        "status": "successful",
+        "indices": "cluster_one:my-index-000001",
+        "took": 38,
+        "_shards": {
+          "total": 4,
+          "successful": 4,
+          "skipped": 0,
+          "failed": 0
+        }
+      },
+      "cluster_two": {
+        "status": "successful", <1>
+        "indices": "cluster_two:logs*", <2>
+        "took": 0,
+        "_shards": {
+          "total": 0, <3>
+          "successful": 0,
+          "skipped": 0,
+          "failed": 0
+        }
+      }
+    }
+  }
+}
+----
+// TEST[skip: cross-cluster testing env not set up]
+
+<1> This search is still marked as successful, even though no data was searched.
+<2> Since there were no matching indices for the wildcard pattern provided, the original
+index expression provided by the user is retained here.
+<3> Indicates that no shards were searched (due to not having any matching indices).
+
+
+
+
 [discrete]
 [[ccq-enrich]]
 ==== Enrich across clusters
@@ -331,8 +509,7 @@ setting. As a result, if a remote cluster specified in the request is
 unavailable or failed, {ccs} for {esql} queries will fail regardless of the setting.
 
 We are actively working to align the behavior of {ccs} for {esql} with other
-{ccs} APIs. This includes providing detailed execution information for each cluster
-in the response, such as execution time, selected target indices, and shards.
+{ccs} APIs.
 
 [discrete]
 [[ccq-during-upgrade]]

diff --git a/docs/reference/esql/esql-rest.asciidoc b/docs/reference/esql/esql-rest.asciidoc
@@ -192,6 +192,7 @@ Which returns:
 [source,console-result]
 ----
 {
+  "took": 28,
   "columns": [
     {"name": "author", "type": "text"},
     {"name": "name", "type": "text"},
@@ -206,6 +207,7 @@ Which returns:
   ]
 }
 ----
+// TESTRESPONSE[s/"took": 28/"took": "$body.took"/]
 
 [discrete]
 [[esql-locale-param]]
@@ -384,12 +386,13 @@ GET /_query/async/FmNJRUZ1YWZCU3dHY1BIOUhaenVSRkEaaXFlZ3h4c1RTWFNocDdnY2FSaERnUT
 // TEST[skip: no access to query ID - may return response values]
 
 If the response's `is_running` value is `false`, the query has finished
-and the results are returned.
+and the results are returned, along with the `took` time for the query.
 
 [source,console-result]
 ----
 {
   "is_running": false,
+  "took": 48,
   "columns": ...
 }
 ----

diff --git a/docs/reference/esql/multivalued-fields.asciidoc b/docs/reference/esql/multivalued-fields.asciidoc
@@ -26,6 +26,7 @@ Multivalued fields come back as a JSON array:
 [source,console-result]
 ----
 {
+  "took": 28,
   "columns": [
     { "name": "a", "type": "long"},
     { "name": "b", "type": "long"}
@@ -36,6 +37,8 @@ Multivalued fields come back as a JSON array:
   ]
 }
 ----
+// TESTRESPONSE[s/"took": 28/"took": "$body.took"/]
+
 
 The relative order of values in a multivalued field is undefined. They'll frequently be in
 ascending order but don't rely on that.
@@ -74,6 +77,7 @@ And {esql} sees that removal:
 [source,console-result]
 ----
 {
+  "took": 28,
   "columns": [
     { "name": "a", "type": "long"},
     { "name": "b", "type": "keyword"}
@@ -84,6 +88,8 @@ And {esql} sees that removal:
   ]
 }
 ----
+// TESTRESPONSE[s/"took": 28/"took": "$body.took"/]
+
 
 But other types, like `long` don't remove duplicates.
 
@@ -115,6 +121,7 @@ And {esql} also sees that:
 [source,console-result]
 ----
 {
+  "took": 28,
   "columns": [
     { "name": "a", "type": "long"},
     { "name": "b", "type": "long"}
@@ -125,6 +132,8 @@ And {esql} also sees that:
   ]
 }
 ----
+// TESTRESPONSE[s/"took": 28/"took": "$body.took"/]
+
 
 This is all at the storage layer. If you store duplicate `long`s and then
 convert them to strings the duplicates will stay:
@@ -155,6 +164,7 @@ POST /_query
 [source,console-result]
 ----
 {
+  "took": 28,
   "columns": [
     { "name": "a", "type": "long"},
     { "name": "b", "type": "keyword"}
@@ -165,6 +175,7 @@ POST /_query
   ]
 }
 ----
+// TESTRESPONSE[s/"took": 28/"took": "$body.took"/]
 
 [discrete]
 [[esql-multivalued-fields-functions]]
@@ -198,6 +209,7 @@ POST /_query
 [source,console-result]
 ----
 {
+  "took": 28,
   "columns": [
     { "name": "a",   "type": "long"},
     { "name": "b",   "type": "long"},
@@ -210,6 +222,7 @@ POST /_query
   ]
 }
 ----
+// TESTRESPONSE[s/"took": 28/"took": "$body.took"/]
 
 Work around this limitation by converting the field to single value with one of:
 
@@ -233,6 +246,7 @@ POST /_query
 [source,console-result]
 ----
 {
+  "took": 28,
   "columns": [
     { "name": "a",   "type": "long"},
     { "name": "b",   "type": "long"},
@@ -245,4 +259,4 @@ POST /_query
   ]
 }
 ----
-
+// TESTRESPONSE[s/"took": 28/"took": "$body.took"/]
diff --git a/server/src/main/java/org/elasticsearch/ExceptionsHelper.java b/server/src/main/java/org/elasticsearch/ExceptionsHelper.java
@@ -18,6 +18,9 @@
 import org.elasticsearch.core.Nullable;
 import org.elasticsearch.index.Index;
 import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.transport.ConnectTransportException;
+import org.elasticsearch.transport.NoSeedNodeLeftException;
+import org.elasticsearch.transport.NoSuchRemoteClusterException;
 import org.elasticsearch.xcontent.XContentParseException;
 
 import java.io.IOException;
@@ -471,7 +474,7 @@ public static ShardOperationFailedException[] groupBy(ShardOperationFailedExcept
     }
 
     /**
-     * Utility method useful for determine whether to log an Exception or perhaps
+     * Utility method useful for determining whether to log an Exception or perhaps
      * avoid logging a stacktrace if the caller/logger is not interested in these
      * types of node/shard issues.
      *
@@ -489,6 +492,27 @@ public static boolean isNodeOrShardUnavailableTypeException(Throwable t) {
             || t instanceof org.elasticsearch.cluster.block.ClusterBlockException);
     }
 
+    /**
+     * Checks the exception against a known list of exceptions that indicate a remote cluster
+     * cannot be connected to. An IllegalStateException or IllegalArgumentException only counts
+     * when its message contains one of the known phrases; a null message is never a match.
+     *
+     * @param e Exception to inspect
+     * @return true if the Exception is known to indicate that a remote cluster
+     *         is unavailable (cannot be connected to by the transport layer)
+     */
+    public static boolean isRemoteUnavailableException(Exception e) {
+        Throwable unwrap = unwrap(e, ConnectTransportException.class, NoSuchRemoteClusterException.class, NoSeedNodeLeftException.class);
+        if (unwrap != null) {
+            return true;
+        }
+        Throwable ill = unwrap(e, IllegalStateException.class, IllegalArgumentException.class);
+        // getMessage() is null for exceptions built without a message, so guard before matching on it
+        String message = ill == null ? null : ill.getMessage();
+        // anything else doesn't look like any of the known remote exceptions
+        return message != null && (message.contains("Unable to open any connections") || message.contains("unknown host"));
+    }
+
     private static class GroupBy {
         final String reason;
         final String index;

diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java
@@ -209,6 +209,7 @@ static TransportVersion def(int id) {
     public static final TransportVersion CCS_TELEMETRY_STATS = def(8_739_00_0);
     public static final TransportVersion GLOBAL_RETENTION_TELEMETRY = def(8_740_00_0);
     public static final TransportVersion ROUTING_TABLE_VERSION_REMOVED = def(8_741_00_0);
+    public static final TransportVersion ESQL_CCS_COMPUTE_RESPONSE = def(8_742_00_0);
 
     /*
      * STOP! READ THIS FIRST! No, really,