From 21b8d5a0661fbbf5dd48e9b73ac2e8abcfc1967f Mon Sep 17 00:00:00 2001 From: Nhat Nguyen Date: Mon, 30 Sep 2024 13:11:30 -0700 Subject: [PATCH] [8.x] Add indices metrics for each index mode (#113737) (#113813) * Add indices metrics for each index mode (#113737) This change introduces index metrics per node, grouped by by index mode. For each index mode, we track the number of indices, document count, and store size. These metrics will help compare the usage of logsdb and time_series indices to standard indices. Other metrics, such as index longevity and newly created indices, could be added in a follow-up. Here is the list of 9 metrics introduced in this PR: es.indices.standard.total es.indices.standard.docs.total es.indices.standard.bytes.total es.indices.time_series.total es.indices.time_series.docs.total es.indices.time_series.bytes.total es.indices.logsdb.total es.indices.logsdb.docs.total es.indices.logsdb.bytes.total * Fix compile --- .../monitor/metrics/IndicesMetricsIT.java | 245 ++++++++++++++++++ .../monitor/metrics/IndicesMetrics.java | 177 +++++++++++++ .../java/org/elasticsearch/node/Node.java | 4 + .../elasticsearch/node/NodeConstruction.java | 3 + 4 files changed, 429 insertions(+) create mode 100644 server/src/internalClusterTest/java/org/elasticsearch/monitor/metrics/IndicesMetricsIT.java create mode 100644 server/src/main/java/org/elasticsearch/monitor/metrics/IndicesMetrics.java diff --git a/server/src/internalClusterTest/java/org/elasticsearch/monitor/metrics/IndicesMetricsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/monitor/metrics/IndicesMetricsIT.java new file mode 100644 index 0000000000000..c9cff473353d5 --- /dev/null +++ b/server/src/internalClusterTest/java/org/elasticsearch/monitor/metrics/IndicesMetricsIT.java @@ -0,0 +1,245 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.monitor.metrics; + +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.plugins.PluginsService; +import org.elasticsearch.telemetry.Measurement; +import org.elasticsearch.telemetry.TestTelemetryPlugin; +import org.elasticsearch.test.ESIntegTestCase; +import org.hamcrest.Matcher; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import static org.elasticsearch.index.mapper.DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.hasSize; + +@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, numClientNodes = 0) +public class IndicesMetricsIT extends ESIntegTestCase { + + public static class TestAPMInternalSettings extends Plugin { + @Override + public List> getSettings() { + return List.of( + Setting.timeSetting("telemetry.agent.metrics_interval", TimeValue.timeValueSeconds(0), Setting.Property.NodeScope) + ); + } + } + + @Override + protected Collection> nodePlugins() { + return List.of(TestTelemetryPlugin.class, TestAPMInternalSettings.class); + } + + @Override + protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { + return Settings.builder() + .put(super.nodeSettings(nodeOrdinal, otherSettings)) + .put("telemetry.agent.metrics_interval", TimeValue.timeValueSeconds(0)) // disable metrics cache refresh delay + .build(); + } + + static final String STANDARD_INDEX_COUNT = "es.indices.standard.total"; + static final String STANDARD_DOCS_COUNT = "es.indices.standard.docs.total"; + static final String STANDARD_BYTES_SIZE = "es.indices.standard.bytes.total"; + + static final String TIME_SERIES_INDEX_COUNT = "es.indices.time_series.total"; + static final String TIME_SERIES_DOCS_COUNT = "es.indices.time_series.docs.total"; + static final String TIME_SERIES_BYTES_SIZE = "es.indices.time_series.bytes.total"; + + static final String LOGSDB_INDEX_COUNT = "es.indices.logsdb.total"; + static final String LOGSDB_DOCS_COUNT = "es.indices.logsdb.docs.total"; + static final String LOGSDB_BYTES_SIZE = "es.indices.logsdb.bytes.total"; + + public void testIndicesMetrics() { + String node = internalCluster().startNode(); + ensureStableCluster(1); + final TestTelemetryPlugin telemetry = internalCluster().getInstance(PluginsService.class, node) + .filterPlugins(TestTelemetryPlugin.class) + .findFirst() + .orElseThrow(); + telemetry.resetMeter(); + long numStandardIndices = randomIntBetween(1, 5); + long numStandardDocs = populateStandardIndices(numStandardIndices); + collectThenAssertMetrics( + telemetry, + 1, + Map.of( + STANDARD_INDEX_COUNT, + equalTo(numStandardIndices), + STANDARD_DOCS_COUNT, + equalTo(numStandardDocs), + STANDARD_BYTES_SIZE, + greaterThan(0L), + + TIME_SERIES_INDEX_COUNT, + equalTo(0L), + TIME_SERIES_DOCS_COUNT, + equalTo(0L), + TIME_SERIES_BYTES_SIZE, + equalTo(0L), + + LOGSDB_INDEX_COUNT, + equalTo(0L), + LOGSDB_DOCS_COUNT, + equalTo(0L), + LOGSDB_BYTES_SIZE, + equalTo(0L) + ) + ); + + long numTimeSeriesIndices = randomIntBetween(1, 5); + long numTimeSeriesDocs = populateTimeSeriesIndices(numTimeSeriesIndices); + collectThenAssertMetrics( + telemetry, + 2, + Map.of( + STANDARD_INDEX_COUNT, + equalTo(numStandardIndices), + STANDARD_DOCS_COUNT, + equalTo(numStandardDocs), + STANDARD_BYTES_SIZE, + greaterThan(0L), + + TIME_SERIES_INDEX_COUNT, + equalTo(numTimeSeriesIndices), + TIME_SERIES_DOCS_COUNT, + equalTo(numTimeSeriesDocs), + TIME_SERIES_BYTES_SIZE, + greaterThan(20L), + + LOGSDB_INDEX_COUNT, + equalTo(0L), + LOGSDB_DOCS_COUNT, + equalTo(0L), + LOGSDB_BYTES_SIZE, + equalTo(0L) + ) + ); + + long numLogsdbIndices = randomIntBetween(1, 5); + long numLogsdbDocs = populateLogsdbIndices(numLogsdbIndices); + collectThenAssertMetrics( + telemetry, + 3, + Map.of( + STANDARD_INDEX_COUNT, + equalTo(numStandardIndices), + STANDARD_DOCS_COUNT, + equalTo(numStandardDocs), + STANDARD_BYTES_SIZE, + greaterThan(0L), + + TIME_SERIES_INDEX_COUNT, + equalTo(numTimeSeriesIndices), + TIME_SERIES_DOCS_COUNT, + equalTo(numTimeSeriesDocs), + TIME_SERIES_BYTES_SIZE, + greaterThan(20L), + + LOGSDB_INDEX_COUNT, + equalTo(numLogsdbIndices), + LOGSDB_DOCS_COUNT, + equalTo(numLogsdbDocs), + LOGSDB_BYTES_SIZE, + greaterThan(0L) + ) + ); + } + + void collectThenAssertMetrics(TestTelemetryPlugin telemetry, int times, Map> matchers) { + telemetry.collect(); + for (Map.Entry> e : matchers.entrySet()) { + String name = e.getKey(); + List measurements = telemetry.getLongGaugeMeasurement(name); + assertThat(name, measurements, hasSize(times)); + assertThat(name, measurements.get(measurements.size() - 1).getLong(), e.getValue()); + } + } + + int populateStandardIndices(long numIndices) { + int totalDocs = 0; + for (int i = 0; i < numIndices; i++) { + String indexName = "standard-" + i; + createIndex(indexName); + int numDocs = between(1, 5); + for (int d = 0; d < numDocs; d++) { + indexDoc(indexName, Integer.toString(d), "f", Integer.toString(d)); + } + totalDocs += numDocs; + flush(indexName); + } + return totalDocs; + } + + int populateTimeSeriesIndices(long numIndices) { + int totalDocs = 0; + for (int i = 0; i < numIndices; i++) { + String indexName = "time_series-" + i; + Settings settings = Settings.builder().put("mode", "time_series").putList("routing_path", List.of("host")).build(); + client().admin() + .indices() + .prepareCreate(indexName) + .setSettings(settings) + .setMapping( + "@timestamp", + "type=date", + "host", + "type=keyword,time_series_dimension=true", + "cpu", + "type=long,time_series_metric=gauge" + ) + .get(); + long timestamp = DEFAULT_DATE_TIME_FORMATTER.parseMillis("2024-04-15T00:00:00Z"); + int numDocs = between(1, 5); + for (int d = 0; d < numDocs; d++) { + timestamp += between(1, 5) * 1000L; + client().prepareIndex(indexName) + .setSource("@timestamp", timestamp, "host", randomFrom("prod", "qa"), "cpu", randomIntBetween(1, 100)) + .get(); + } + totalDocs += numDocs; + flush(indexName); + } + return totalDocs; + } + + int populateLogsdbIndices(long numIndices) { + int totalDocs = 0; + for (int i = 0; i < numIndices; i++) { + String indexName = "logsdb-" + i; + Settings settings = Settings.builder().put("mode", "logsdb").build(); + client().admin() + .indices() + .prepareCreate(indexName) + .setSettings(settings) + .setMapping("@timestamp", "type=date", "host.name", "type=keyword", "cpu", "type=long") + .get(); + long timestamp = DEFAULT_DATE_TIME_FORMATTER.parseMillis("2024-04-15T00:00:00Z"); + int numDocs = between(1, 5); + for (int d = 0; d < numDocs; d++) { + timestamp += between(1, 5) * 1000L; + client().prepareIndex(indexName) + .setSource("@timestamp", timestamp, "host.name", randomFrom("prod", "qa"), "cpu", randomIntBetween(1, 100)) + .get(); + } + totalDocs += numDocs; + flush(indexName); + } + return totalDocs; + } +} diff --git a/server/src/main/java/org/elasticsearch/monitor/metrics/IndicesMetrics.java b/server/src/main/java/org/elasticsearch/monitor/metrics/IndicesMetrics.java new file mode 100644 index 0000000000000..17e290283d5e0 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/monitor/metrics/IndicesMetrics.java @@ -0,0 +1,177 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.monitor.metrics; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.lucene.store.AlreadyClosedException; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.common.component.AbstractLifecycleComponent; +import org.elasticsearch.common.util.SingleObjectCache; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.index.IndexMode; +import org.elasticsearch.index.IndexService; +import org.elasticsearch.index.shard.IllegalIndexShardStateException; +import org.elasticsearch.index.shard.IndexShard; +import org.elasticsearch.indices.IndicesService; +import org.elasticsearch.telemetry.metric.LongWithAttributes; +import org.elasticsearch.telemetry.metric.MeterRegistry; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.Map; + +/** + * {@link IndicesMetrics} monitors index statistics on an Elasticsearch node and exposes them as metrics + * through the provided {@link MeterRegistry}. It tracks the current total number of indices, document count, and + * store size (in bytes) for each index mode. + */ +public class IndicesMetrics extends AbstractLifecycleComponent { + private final Logger logger = LogManager.getLogger(IndicesMetrics.class); + private final MeterRegistry registry; + private final List metrics = new ArrayList<>(); + private final IndicesStatsCache stateCache; + + public IndicesMetrics(MeterRegistry meterRegistry, IndicesService indicesService, TimeValue metricsInterval) { + this.registry = meterRegistry; + // Use half of the update interval to ensure that results aren't cached across updates, + // while preventing the cache from expiring when reading different gauges within the same update. + var cacheExpiry = new TimeValue(metricsInterval.getMillis() / 2); + this.stateCache = new IndicesStatsCache(indicesService, cacheExpiry); + } + + private static List registerAsyncMetrics(MeterRegistry registry, IndicesStatsCache cache) { + List metrics = new ArrayList<>(IndexMode.values().length * 3); + assert IndexMode.values().length == 3 : "index modes have changed"; + for (IndexMode indexMode : IndexMode.values()) { + String name = indexMode.getName(); + metrics.add( + registry.registerLongGauge( + "es.indices." + name + ".total", + "total number of " + name + " indices", + "unit", + () -> new LongWithAttributes(cache.getOrRefresh().get(indexMode).numIndices) + ) + ); + metrics.add( + registry.registerLongGauge( + "es.indices." + name + ".docs.total", + "total documents of " + name + " indices", + "unit", + () -> new LongWithAttributes(cache.getOrRefresh().get(indexMode).numDocs) + ) + ); + metrics.add( + registry.registerLongGauge( + "es.indices." + name + ".bytes.total", + "total size in bytes of " + name + " indices", + "unit", + () -> new LongWithAttributes(cache.getOrRefresh().get(indexMode).numBytes) + ) + ); + } + return metrics; + } + + @Override + protected void doStart() { + metrics.addAll(registerAsyncMetrics(registry, stateCache)); + } + + @Override + protected void doStop() { + stateCache.stopRefreshing(); + } + + @Override + protected void doClose() throws IOException { + metrics.forEach(metric -> { + try { + metric.close(); + } catch (Exception e) { + logger.warn("metrics close() method should not throw Exception", e); + } + }); + } + + static class IndexStats { + int numIndices = 0; + long numDocs = 0; + long numBytes = 0; + } + + private static class IndicesStatsCache extends SingleObjectCache> { + private static final Map MISSING_STATS; + static { + MISSING_STATS = new EnumMap<>(IndexMode.class); + for (IndexMode value : IndexMode.values()) { + MISSING_STATS.put(value, new IndexStats()); + } + } + + private boolean refresh; + private final IndicesService indicesService; + + IndicesStatsCache(IndicesService indicesService, TimeValue interval) { + super(interval, MISSING_STATS); + this.indicesService = indicesService; + this.refresh = true; + } + + private Map internalGetIndicesStats() { + Map stats = new EnumMap<>(IndexMode.class); + for (IndexMode mode : IndexMode.values()) { + stats.put(mode, new IndexStats()); + } + for (IndexService indexService : indicesService) { + for (IndexShard indexShard : indexService) { + if (indexShard.isSystem()) { + continue; // skip system indices + } + final ShardRouting shardRouting = indexShard.routingEntry(); + if (shardRouting.primary() == false) { + continue; // count primaries only + } + if (shardRouting.recoverySource() != null) { + continue; // exclude relocating shards + } + final IndexMode indexMode = indexShard.indexSettings().getMode(); + final IndexStats indexStats = stats.get(indexMode); + if (shardRouting.shardId().id() == 0) { + indexStats.numIndices++; + } + try { + indexStats.numDocs += indexShard.commitStats().getNumDocs(); + indexStats.numBytes += indexShard.storeStats().sizeInBytes(); + } catch (IllegalIndexShardStateException | AlreadyClosedException ignored) { + // ignored + } + } + } + return stats; + } + + @Override + protected Map refresh() { + return refresh ? internalGetIndicesStats() : getNoRefresh(); + } + + @Override + protected boolean needsRefresh() { + return getNoRefresh() == MISSING_STATS || super.needsRefresh(); + } + + void stopRefreshing() { + this.refresh = false; + } + } +} diff --git a/server/src/main/java/org/elasticsearch/node/Node.java b/server/src/main/java/org/elasticsearch/node/Node.java index 1447ac1c5b59b..5024cc5468866 100644 --- a/server/src/main/java/org/elasticsearch/node/Node.java +++ b/server/src/main/java/org/elasticsearch/node/Node.java @@ -66,6 +66,7 @@ import org.elasticsearch.injection.guice.Injector; import org.elasticsearch.monitor.fs.FsHealthService; import org.elasticsearch.monitor.jvm.JvmInfo; +import org.elasticsearch.monitor.metrics.IndicesMetrics; import org.elasticsearch.monitor.metrics.NodeMetrics; import org.elasticsearch.node.internal.TerminationHandler; import org.elasticsearch.plugins.ClusterCoordinationPlugin; @@ -441,6 +442,7 @@ public void onTimeout(TimeValue timeout) { } injector.getInstance(NodeMetrics.class).start(); + injector.getInstance(IndicesMetrics.class).start(); injector.getInstance(HealthPeriodicLogger.class).start(); logger.info("started {}", transportService.getLocalNode()); @@ -489,6 +491,7 @@ private void stop() { stopIfStarted(SearchService.class); stopIfStarted(TransportService.class); stopIfStarted(NodeMetrics.class); + stopIfStarted(IndicesMetrics.class); pluginLifecycleComponents.forEach(Node::stopIfStarted); // we should stop this last since it waits for resources to get released @@ -558,6 +561,7 @@ public synchronized void close() throws IOException { toClose.add(() -> stopWatch.stop().start("transport")); toClose.add(injector.getInstance(TransportService.class)); toClose.add(injector.getInstance(NodeMetrics.class)); + toClose.add(injector.getInstance(IndicesService.class)); if (ReadinessService.enabled(environment)) { toClose.add(injector.getInstance(ReadinessService.class)); } diff --git a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java index c4816b440f568..b3c95186b6037 100644 --- a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java +++ b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java @@ -141,6 +141,7 @@ import org.elasticsearch.monitor.MonitorService; import org.elasticsearch.monitor.fs.FsHealthService; import org.elasticsearch.monitor.jvm.JvmInfo; +import org.elasticsearch.monitor.metrics.IndicesMetrics; import org.elasticsearch.monitor.metrics.NodeMetrics; import org.elasticsearch.node.internal.TerminationHandler; import org.elasticsearch.node.internal.TerminationHandlerProvider; @@ -1063,6 +1064,7 @@ private void construct( final TimeValue metricsInterval = settings.getAsTime("telemetry.agent.metrics_interval", TimeValue.timeValueSeconds(10)); final NodeMetrics nodeMetrics = new NodeMetrics(telemetryProvider.getMeterRegistry(), nodeService, metricsInterval); + final IndicesMetrics indicesMetrics = new IndicesMetrics(telemetryProvider.getMeterRegistry(), indicesService, metricsInterval); final SearchService searchService = serviceProvider.newSearchService( pluginsService, @@ -1162,6 +1164,7 @@ private void construct( b.bind(Transport.class).toInstance(transport); b.bind(TransportService.class).toInstance(transportService); b.bind(NodeMetrics.class).toInstance(nodeMetrics); + b.bind(IndicesMetrics.class).toInstance(indicesMetrics); b.bind(NetworkService.class).toInstance(networkService); b.bind(IndexMetadataVerifier.class).toInstance(indexMetadataVerifier); b.bind(ClusterInfoService.class).toInstance(clusterInfoService);