From 74905eabe73fa8f0a33ae0a8467a75420a8a79e9 Mon Sep 17 00:00:00 2001 From: matthewabbott Date: Fri, 4 Oct 2024 13:38:26 -0700 Subject: [PATCH 1/6] Added max_retry docs explanation and linked to the docs in allocation explanation. --- .../cluster/allocation-explain.asciidoc | 17 ++++++++++------- .../decider/MaxRetryAllocationDecider.java | 5 ++++- .../org/elasticsearch/common/ReferenceDocs.java | 1 + .../common/reference-docs-links.txt | 3 ++- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/docs/reference/cluster/allocation-explain.asciidoc b/docs/reference/cluster/allocation-explain.asciidoc index 7547dd74c5ecd..8ad93772ec1d2 100644 --- a/docs/reference/cluster/allocation-explain.asciidoc +++ b/docs/reference/cluster/allocation-explain.asciidoc @@ -159,10 +159,11 @@ node. <5> The decider which led to the `no` decision for the node. <6> An explanation as to why the decider returned a `no` decision, with a helpful hint pointing to the setting that led to the decision. In this example, a newly created index has <<indices-get-settings,an index setting>> that requires that it only be allocated to a node named `nonexistent_node`, which does not exist, so the index is unable to allocate. +[[maximum-number-of-retries-exceeded]] ====== Maximum number of retries exceeded The following response contains an allocation explanation for an unassigned -primary shard that has reached the maximum number of allocation retry attempts. +primary shard that has reached the maximum number of allocation retry attempts. [source,js] ---- @@ -195,7 +196,7 @@ primary shard that has reached the maximum number of allocation retry attempts. 
{ "decider": "max_retry", "decision" : "NO", - "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]" + "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed&metric=none] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]" } ] } @@ -203,9 +204,11 @@ primary shard that has reached the maximum number of allocation retry attempts. } ---- // NOTCONSOLE - -If decider message indicates a transient allocation issue, use -<> to retry allocation. +Elasticsearch queues shard allocation retries in batches. If there are long-running shard +recoveries or a high quantity of shard recoveries occurring within the cluster, this +process may time out for some shards, resulting in `max_retry`. This surfaces infrequently +but is expected to prevent infinite retries which may impact cluster performance. When +encountered, run <> to retry allocation. ====== No valid shard copy @@ -334,7 +337,7 @@ queued to allocate but currently waiting on other queued shards. ---- // NOTCONSOLE -This is a transient message that might appear when a large amount of shards are allocating. +This is a transient message that might appear when a large amount of shards are allocating. ===== Assigned shard @@ -437,7 +440,7 @@ cluster balance. 
===== No arguments If you call the API with no arguments, {es} retrieves an allocation explanation -for an arbitrary unassigned primary or replica shard, returning any unassigned primary shards first. +for an arbitrary unassigned primary or replica shard, returning any unassigned primary shards first. [source,console] ---- diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java index b20cd3ecaf992..7004c892b1588 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java @@ -14,6 +14,7 @@ import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.common.ReferenceDocs; import org.elasticsearch.common.settings.Setting; /** @@ -72,7 +73,9 @@ private static Decision debugDecision(Decision decision, UnassignedInfo info, in return Decision.single( Decision.Type.NO, NAME, - "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [%s] to retry, [%s]", + "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - " + + "manually call [%s] to retry, and for more information, see [%s] [%s]", + ReferenceDocs.ALLOCATION_EXPLAIN_MAX_RETRY, maxRetries, RETRY_FAILED_API, info.toString() diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java index b059113b4098c..2f431e358ed29 100644 --- a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java +++ b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java @@ -81,6 +81,7 @@ 
public enum ReferenceDocs { X_OPAQUE_ID, FORMING_SINGLE_NODE_CLUSTERS, CIRCUIT_BREAKER_ERRORS, + ALLOCATION_EXPLAIN_MAX_RETRY, // this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner ; diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt index ab9a6b253be7a..e4d4a70f3d817 100644 --- a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt +++ b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt @@ -42,4 +42,5 @@ MAX_SHARDS_PER_NODE size-your-shards FLOOD_STAGE_WATERMARK fix-watermark-errors.html X_OPAQUE_ID api-conventions.html#x-opaque-id FORMING_SINGLE_NODE_CLUSTERS modules-discovery-bootstrap-cluster.html#modules-discovery-bootstrap-cluster-joining -CIRCUIT_BREAKER_ERRORS circuit-breaker-errors.html \ No newline at end of file +CIRCUIT_BREAKER_ERRORS circuit-breaker-errors.html +ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded From 0c4d80ebcf67a2858d3afb39e0a7c07403e846a5 Mon Sep 17 00:00:00 2001 From: matthewabbott Date: Fri, 4 Oct 2024 13:45:59 -0700 Subject: [PATCH 2/6] ran spotlessapply --- .../routing/allocation/decider/MaxRetryAllocationDecider.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java index 7004c892b1588..05d780f227df6 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java @@ -73,8 +73,8 @@ private static Decision debugDecision(Decision decision, UnassignedInfo 
info, in return Decision.single( Decision.Type.NO, NAME, - "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - " + - "manually call [%s] to retry, and for more information, see [%s] [%s]", + "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - " + + "manually call [%s] to retry, and for more information, see [%s] [%s]", ReferenceDocs.ALLOCATION_EXPLAIN_MAX_RETRY, maxRetries, RETRY_FAILED_API, From 7ab3606d85ae32c8970be6e3b7e522c59b15b93e Mon Sep 17 00:00:00 2001 From: matthewabbott Date: Thu, 10 Oct 2024 16:51:18 -0700 Subject: [PATCH 3/6] fixed max retry doc link --- .../resources/org/elasticsearch/common/reference-docs-links.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt index b2a4c7992d1fe..3b0816aabf4aa 100644 --- a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt +++ b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt @@ -44,3 +44,4 @@ X_OPAQUE_ID api-conventions. 
FORMING_SINGLE_NODE_CLUSTERS modules-discovery-bootstrap-cluster.html#modules-discovery-bootstrap-cluster-joining CIRCUIT_BREAKER_ERRORS circuit-breaker-errors.html ALLOCATION_EXPLAIN_NO_COPIES cluster-allocation-explain.html#no-valid-shard-copy +ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded From 95ddbc89ba02d27252b8e47e633f85e945de2b47 Mon Sep 17 00:00:00 2001 From: matthewabbott Date: Fri, 11 Oct 2024 15:16:54 -0700 Subject: [PATCH 4/6] fix retry explanation message string --- .../routing/allocation/decider/MaxRetryAllocationDecider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java index f1515bb972f22..0ab842276efc4 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java @@ -75,9 +75,9 @@ private static Decision debugDecision(Decision decision, UnassignedInfo info, in NAME, "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - " + "manually call [%s] to retry, and for more information, see [%s] [%s]", - ReferenceDocs.ALLOCATION_EXPLAIN_MAX_RETRY, maxRetries, RETRY_FAILED_API, + ReferenceDocs.ALLOCATION_EXPLAIN_MAX_RETRY, info.toString() ); } else { From 7d77f8d1743164c35fb67cb2970329bbc3d2152d Mon Sep 17 00:00:00 2001 From: matthewabbott Date: Fri, 11 Oct 2024 15:29:36 -0700 Subject: [PATCH 5/6] ran spotlessapply/precommit --- .../function/scalar/math/HypotEvaluator.java | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/math/HypotEvaluator.java 
b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/math/HypotEvaluator.java index f5684bcb4be18..22094f7e623e6 100644 --- a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/math/HypotEvaluator.java +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/math/HypotEvaluator.java @@ -13,16 +13,16 @@ import org.elasticsearch.compute.data.Page; import org.elasticsearch.compute.operator.DriverContext; import org.elasticsearch.compute.operator.EvalOperator; +import org.elasticsearch.compute.operator.Warnings; import org.elasticsearch.core.Releasables; import org.elasticsearch.xpack.esql.core.tree.Source; -import org.elasticsearch.xpack.esql.expression.function.Warnings; /** * {@link EvalOperator.ExpressionEvaluator} implementation for {@link Hypot}. * This class is generated. Do not edit it. */ public final class HypotEvaluator implements EvalOperator.ExpressionEvaluator { - private final Warnings warnings; + private final Source source; private final EvalOperator.ExpressionEvaluator n1; @@ -30,12 +30,14 @@ public final class HypotEvaluator implements EvalOperator.ExpressionEvaluator { private final DriverContext driverContext; + private Warnings warnings; + public HypotEvaluator(Source source, EvalOperator.ExpressionEvaluator n1, EvalOperator.ExpressionEvaluator n2, DriverContext driverContext) { + this.source = source; this.n1 = n1; this.n2 = n2; this.driverContext = driverContext; - this.warnings = Warnings.createWarnings(driverContext.warningsMode(), source); } @Override @@ -64,7 +66,7 @@ public DoubleBlock eval(int positionCount, DoubleBlock n1Block, DoubleBlock n2Bl } if (n1Block.getValueCount(p) != 1) { if (n1Block.getValueCount(p) > 1) { - warnings.registerException(new IllegalArgumentException("single-value function encountered multi-value")); + warnings().registerException(new IllegalArgumentException("single-value function 
encountered multi-value")); } result.appendNull(); continue position; @@ -75,7 +77,7 @@ public DoubleBlock eval(int positionCount, DoubleBlock n1Block, DoubleBlock n2Bl } if (n2Block.getValueCount(p) != 1) { if (n2Block.getValueCount(p) > 1) { - warnings.registerException(new IllegalArgumentException("single-value function encountered multi-value")); + warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); } result.appendNull(); continue position; @@ -105,6 +107,18 @@ public void close() { Releasables.closeExpectNoException(n1, n2); } + private Warnings warnings() { + if (warnings == null) { + this.warnings = Warnings.createWarnings( + driverContext.warningsMode(), + source.source().getLineNumber(), + source.source().getColumnNumber(), + source.text() + ); + } + return warnings; + } + static class Factory implements EvalOperator.ExpressionEvaluator.Factory { private final Source source; From b56252a8abc380d9ffda356e2a3cbef3880cb4a2 Mon Sep 17 00:00:00 2001 From: matthewabbott Date: Thu, 17 Oct 2024 12:57:13 -0700 Subject: [PATCH 6/6] Tweak MAX_RETRY docs message and API example --- docs/reference/cluster/allocation-explain.asciidoc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/reference/cluster/allocation-explain.asciidoc b/docs/reference/cluster/allocation-explain.asciidoc index 63f992fd7d793..bbbea192f0f86 100644 --- a/docs/reference/cluster/allocation-explain.asciidoc +++ b/docs/reference/cluster/allocation-explain.asciidoc @@ -196,7 +196,7 @@ primary shard that has reached the maximum number of allocation retry attempts. 
{ "decider": "max_retry", "decision" : "NO", - "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed&metric=none] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]" + "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]" } ] } @@ -204,11 +204,11 @@ primary shard that has reached the maximum number of allocation retry attempts. } ---- // NOTCONSOLE -Elasticsearch queues shard allocation retries in batches. If there are long-running shard -recoveries or a high quantity of shard recoveries occurring within the cluster, this -process may time out for some shards, resulting in `max_retry`. This surfaces infrequently -but is expected to prevent infinite retries which may impact cluster performance. When -encountered, run the <<cluster-reroute,cluster reroute>> API to retry allocation. +When Elasticsearch is unable to allocate a shard, it will attempt to retry allocation up to +the maximum number of retries allowed. After this, Elasticsearch will stop attempting to +allocate the shard in order to prevent infinite retries which may impact cluster +performance. Run the <<cluster-reroute,cluster reroute>> API to retry allocation, which +will allocate the shard if the issue preventing allocation has been resolved. [[no-valid-shard-copy]] ====== No valid shard copy