Conditional stemming for 'persian' analyzer #113482

Merged on Oct 2, 2024 (19 commits)
docs/changelog/113482.yaml (27 additions, 0 deletions)
@@ -0,0 +1,27 @@
pr: 113482
summary: The 'persian' analyzer has a stemmer by default
area: Analysis
type: breaking
issues:
  - 113050
breaking:
  title: The 'persian' analyzer has a stemmer by default
  area: Analysis
  details: >-
    Lucene 10 added a final stemming step to its PersianAnalyzer, which we
    expose as the 'persian' analyzer. Existing indices keep the old
    non-stemming behaviour, while newly created indices get the updated
    behaviour with added stemming.
    Users who want to keep the non-stemming behaviour need to define their
    own analyzer, as outlined in
    https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer.
    Users who want the new stemming behaviour for existing indices have to
    reindex their data.
  impact: >-
    Indexing with the 'persian' analyzer will produce slightly different
    tokens. Users should check whether this impacts their search results.
    If they want to keep the legacy non-stemming behaviour, they can define
    their own equivalent analyzer as explained in
    https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer.
  notable: false
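For reference, the workaround mentioned above rebuilds the legacy, non-stemming 'persian' analyzer from its individual parts. Below is a sketch of such a custom analyzer definition, mirroring the token filter chain in PersianAnalyzerProvider further down; the index and analyzer names are placeholders, and the linked 8.15 documentation remains the authoritative version:

PUT /persian_legacy_example
{
  "settings": {
    "analysis": {
      "char_filter": {
        "zero_width_spaces": {
          "type": "mapping",
          "mappings": [ "\\u200C=>\\u0020" ]
        }
      },
      "filter": {
        "persian_stop": {
          "type": "stop",
          "stopwords": "_persian_"
        }
      },
      "analyzer": {
        "rebuilt_persian": {
          "tokenizer": "standard",
          "char_filter": [ "zero_width_spaces" ],
          "filter": [
            "lowercase",
            "decimal_digit",
            "arabic_normalization",
            "persian_normalization",
            "persian_stop"
          ]
        }
      }
    }
  }
}

The mapping char filter stands in for Lucene's PersianCharFilter, which folds zero-width non-joiners (U+200C) into spaces. Existing indices that should instead pick up the new stemming behaviour have to be recreated and reindexed.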

@@ -9,24 +9,72 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianCharFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

+import java.io.Reader;
+
-public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider<PersianAnalyzer> {
+public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider<StopwordAnalyzerBase> {

-    private final PersianAnalyzer analyzer;
+    private final StopwordAnalyzerBase analyzer;

    PersianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(name, settings);
-        analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
+        if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) {
+            // since Lucene 10 this analyzer contains stemming by default
+            analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
+        } else {
+            // for older index versions we need the old analyzer behaviour without stemming
+            analyzer = new StopwordAnalyzerBase(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())) {
+
+                protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
+                    final Tokenizer source = new StandardTokenizer();
+                    TokenStream result = new LowerCaseFilter(source);
+                    result = new DecimalDigitFilter(result);
+                    result = new ArabicNormalizationFilter(result);
+                    /* additional persian-specific normalization */
+                    result = new PersianNormalizationFilter(result);
+                    /*
+                     * the order here is important: the stopword list is normalized with the
+                     * above!
+                     */
+                    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
+                }
+
+                protected TokenStream normalize(String fieldName, TokenStream in) {
+                    TokenStream result = new LowerCaseFilter(in);
+                    result = new DecimalDigitFilter(result);
+                    result = new ArabicNormalizationFilter(result);
+                    /* additional persian-specific normalization */
+                    result = new PersianNormalizationFilter(result);
+                    return result;
+                }
+
+                protected Reader initReader(String fieldName, Reader reader) {
+                    return new PersianCharFilter(reader);
+                }
+            };
+        }
    }

    @Override
-    public PersianAnalyzer get() {
+    public StopwordAnalyzerBase get() {
        return this.analyzer;
    }
}
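Not part of the diff, but to illustrate the behavioural difference in isolation: a minimal sketch that runs Lucene 10's PersianAnalyzer directly (the demo class name is made up; assumes lucene-analysis-common 10.x on the classpath):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PersianStemmingDemo {
    public static void main(String[] args) throws Exception {
        // Lucene 10's PersianAnalyzer applies stemming by default
        try (Analyzer analyzer = new PersianAnalyzer(); TokenStream ts = analyzer.tokenStream("field", "كتابها")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // prints the stemmed form "كتاب"
            }
            ts.end();
        }
    }
}

Against Lucene 9 the same program prints the unstemmed كتابها, which is the behaviour the version check above preserves for indices created before the upgrade.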
@@ -0,0 +1,78 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.index.IndexVersionUtils;

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

/**
 * Tests the Persian analyzer factory and its behavioural changes with Lucene 10
 */
public class PersianAnalyzerProviderTests extends ESTokenStreamTestCase {

    public void testPersianAnalyzerPostLucene10() throws IOException {
        IndexVersion postLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
            IndexVersion.current()
        );
        Settings settings = ESTestCase.indexSettings(1, 1)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetadata.SETTING_VERSION_CREATED, postLucene10Version)
            .build();
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
        Environment environment = new Environment(settings, null);

        PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider(
            idxSettings,
            environment,
            "my-analyzer",
            Settings.EMPTY
        );
        Analyzer analyzer = persianAnalyzerProvider.get();
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زياد", "خوانده" });
    }

    public void testPersianAnalyzerPreLucene10() throws IOException {
        IndexVersion preLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersionUtils.getFirstVersion(),
            IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)
        );
        Settings settings = ESTestCase.indexSettings(1, 1)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetadata.SETTING_VERSION_CREATED, preLucene10Version)
            .build();
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
        Environment environment = new Environment(settings, null);

        PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider(
            idxSettings,
            environment,
            "my-analyzer",
            Settings.EMPTY
        );
        Analyzer analyzer = persianAnalyzerProvider.get();
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زيادي", "خوانده" });
    }
}
@@ -901,6 +901,31 @@
  - length: { tokens: 1 }
  - match: { tokens.0.token: خورد }

---
"persian stemming":
  - requires:
      cluster_features: ["lucene_10_upgrade"]
      reason: "test requires persian analyzer stemming capabilities that come with Lucene 10"

  - do:
      indices.create:
        index: test
        body:
          settings:
            analysis:
              analyzer:
                my_analyzer:
                  type: persian

  - do:
      indices.analyze:
        index: test
        body:
          text: كتابها
          analyzer: my_analyzer
  - length: { tokens: 1 }
  - match: { tokens.0.token: كتاب }

---
"portuguese":
  - do:
@@ -33,6 +33,7 @@
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.mapper.DateFieldMapper;
import org.elasticsearch.rest.action.admin.indices.RestPutIndexTemplateAction;
+import org.elasticsearch.search.SearchFeatures;
import org.elasticsearch.test.NotEqualMessageBuilder;
import org.elasticsearch.test.XContentTestUtils;
import org.elasticsearch.test.cluster.ElasticsearchCluster;
@@ -1726,6 +1727,106 @@ public void testSystemIndexMetadataIsUpgraded() throws Exception {
}
}

    /**
     * This test ensures that search results on old indices using the "persian" analyzer don't change
     * after we introduce Lucene 10
     */
    public void testPersianAnalyzerBWC() throws Exception {
        var originalClusterLegacyPersianAnalyzer = oldClusterHasFeature(SearchFeatures.LUCENE_10_0_0_UPGRADE) == false;
        assumeTrue("Don't run this test if both versions already support stemming", originalClusterLegacyPersianAnalyzer);
        final String indexName = "test_persian_stemmer";
        Settings idxSettings = indexSettings(1, 1).build();
        String mapping = """
            {
              "properties": {
                "textfield" : {
                  "type": "text",
                  "analyzer": "persian"
                }
              }
            }
            """;

        String query = """
            {
              "query": {
                "match": {
                  "textfield": "كتابها"
                }
              }
            }
            """;

        if (isRunningAgainstOldCluster()) {
            createIndex(client(), indexName, idxSettings, mapping);
            ensureGreen(indexName);

            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + indexName + "/" + "_doc/1",
                        (builder, params) -> builder.field("textfield", "كتابها")
                    )
                )
            );
            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + indexName + "/" + "_doc/2",
                        (builder, params) -> builder.field("textfield", "كتاب")
                    )
                )
            );
            refresh(indexName);

            assertNumHits(indexName, 2, 1);

            Request searchRequest = new Request("POST", "/" + indexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(1, entityAsMap(client().performRequest(searchRequest)));
        } else {
            // the old index should still return only one doc
            Request searchRequest = new Request("POST", "/" + indexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(1, entityAsMap(client().performRequest(searchRequest)));

            String newIndexName = indexName + "_new";
            createIndex(client(), newIndexName, idxSettings, mapping);
            ensureGreen(newIndexName);

            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + newIndexName + "/" + "_doc/1",
                        (builder, params) -> builder.field("textfield", "كتابها")
                    )
                )
            );
            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + newIndexName + "/" + "_doc/2",
                        (builder, params) -> builder.field("textfield", "كتاب")
                    )
                )
            );
            refresh(newIndexName);

            searchRequest = new Request("POST", "/" + newIndexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(2, entityAsMap(client().performRequest(searchRequest)));

            // searching across both indices (old and new analysis versions) should yield one hit from the old index and two from the new one
            searchRequest = new Request("POST", "/" + indexName + "," + newIndexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(3, entityAsMap(client().performRequest(searchRequest)));
        }
    }

/**
* This test ensures that soft deletes are enabled when upgrading a pre-8 cluster to 8.0+
*/