elastic · matthewabbott · Sep 27, 2024 · Sep 27, 2024 · DaveCTurner · Sep 27, 2024
diff --git a/docs/reference/cluster/allocation-explain.asciidoc b/docs/reference/cluster/allocation-explain.asciidoc
@@ -162,7 +162,7 @@ node.
 ====== Maximum number of retries exceeded
 
 The following response contains an allocation explanation for an unassigned
-primary shard that has reached the maximum number of allocation retry attempts. 
+primary shard that has reached the maximum number of allocation retry attempts.
 
 [source,js]
 ----
@@ -195,17 +195,20 @@ primary shard that has reached the maximum number of allocation retry attempts.
         {
           "decider": "max_retry",
           "decision" : "NO",
-          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [/_cluster/reroute?retry_failed=true] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
+          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed=true] to retry, and for more information, see https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-allocation-explain.html#_maximum_number_of_retries_exceeded, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
-          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed=true] to retry, and for more information, see https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-allocation-explain.html#_maximum_number_of_retries_exceeded, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
+          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed&metric=none] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
-          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed=true] to retry, and for more information, see https://www.elastic.co/guide/en/elasticsearch/reference/current/cluster-allocation-explain.html#_maximum_number_of_retries_exceeded, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
+          "explanation": "shard has exceeded the maximum number of retries [5] on failed allocation attempts - manually call [POST /_cluster/reroute?retry_failed&metric=none] to retry, [unassigned_info[[reason=ALLOCATION_FAILED], at[2024-07-30T21:04:12.166Z], failed_attempts[5], failed_nodes[[mEKjwwzLT1yJVb8UxT6anw]], delayed=false, details[failed shard on node [mEKjwwzLT1yJVb8UxT6anw]: failed recovery, failure RecoveryFailedException], allocation_status[deciders_no]]]"
         }
       ]
     }
   ]
 }
 ----
 // NOTCONSOLE
-
-If decider message indicates a transient allocation issue, use 
-<<cluster-reroute,the cluster reroute API>> to retry allocation. 
+This message indicates that the cluster was previously unable to
+allocate this shard and chose to put a hold on further attempts.
+This is done to avoid burdening the cluster with repeated requests that will fail.
+If no other `no` decisions are present, then the transient allocation issue
+that caused these failures has most likely been resolved, and you can use
+<<cluster-reroute,the cluster reroute API>> to retry allocation.
 
 ====== No valid shard copy
 
@@ -334,7 +337,7 @@ queued to allocate but currently waiting on other queued shards.
 ----
 // NOTCONSOLE
 
-This is a transient message that might appear when a large amount of shards are allocating. 
+This is a transient message that might appear when a large number of shards are allocating.
 
 ===== Assigned shard
 
@@ -437,7 +440,7 @@ cluster balance.
 ===== No arguments
 
 If you call the API with no arguments, {es} retrieves an allocation explanation
-for an arbitrary unassigned primary or replica shard, returning any unassigned primary shards first. 
+for an arbitrary unassigned primary or replica shard, returning any unassigned primary shards first.
 
 [source,console]
 ----

diff --git a/.../java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java b/.../java/org/elasticsearch/cluster/routing/allocation/decider/MaxRetryAllocationDecider.java
@@ -15,6 +15,7 @@
 import org.elasticsearch.cluster.routing.UnassignedInfo;
 import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.common.settings.Setting;
+import org.elasticsearch.common.ReferenceDocs;
 
 /**
  * An allocation decider that prevents shards from being allocated on any node if the shards allocation has been retried N times without
@@ -72,9 +73,10 @@ private static Decision debugDecision(Decision decision, UnassignedInfo info, in
             return Decision.single(
                 Decision.Type.NO,
                 NAME,
-                "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [%s] to retry, [%s]",
+                "shard has exceeded the maximum number of retries [%d] on failed allocation attempts - manually call [%s] to retry, and for more information, see [%s], [%s]",
                 maxRetries,
                 RETRY_FAILED_API,
+                ReferenceDocs.ALLOCATION_EXPLAIN_MAX_RETRY,
                 info.toString()
             );
         } else {

diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
@@ -84,6 +84,7 @@ public enum ReferenceDocs {
     FLOOD_STAGE_WATERMARK,
     X_OPAQUE_ID,
     FORMING_SINGLE_NODE_CLUSTERS,
+    ALLOCATION_EXPLAIN_MAX_RETRY,
     // this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
     ;
 

diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
@@ -43,5 +43,6 @@
   "MAX_SHARDS_PER_NODE": "size-your-shards.html#troubleshooting-max-shards-open",
   "FLOOD_STAGE_WATERMARK": "fix-watermark-errors.html",
   "X_OPAQUE_ID": "api-conventions.html#x-opaque-id",
-  "FORMING_SINGLE_NODE_CLUSTERS": "modules-discovery-bootstrap-cluster.html#modules-discovery-bootstrap-cluster-joining"
+  "FORMING_SINGLE_NODE_CLUSTERS": "modules-discovery-bootstrap-cluster.html#modules-discovery-bootstrap-cluster-joining",
+  "ALLOCATION_EXPLAIN_MAX_RETRY": "cluster-allocation-explain.html#_maximum_number_of_retries_exceeded"
 }