Conditional stemming for 'persian' analyzer #113482

Merged on Oct 2, 2024 (19 commits)
docs/changelog/113482.yaml (27 additions, 0 deletions)
@@ -0,0 +1,27 @@
pr: 113482
summary: The 'persian' analyzer has a stemmer by default
area: Analysis
type: breaking
issues:
  - 113050
breaking:
  title: The 'persian' analyzer has a stemmer by default
  area: Analysis
  details: >-
    Lucene 10 added a final stemming step to its PersianAnalyzer, which we
    expose as the 'persian' analyzer. Existing indices keep the old
    non-stemming behaviour, while newly created indices get the updated
    behaviour with added stemming.
    Users who want to keep the non-stemming behaviour need to define their
    own analyzer, as outlined in
    https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer.
    Users who want the new stemming behaviour for existing indices have to
    reindex their data.
  impact: >-
    Indexing with the 'persian' analyzer will produce slightly different
    tokens. Users should check whether this impacts their search results.
    If they want to keep the legacy non-stemming behaviour, they can define
    their own equivalent analyzer as explained in
    https://www.elastic.co/guide/en/elasticsearch/reference/8.15/analysis-lang-analyzer.html#persian-analyzer.
  notable: false
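For reference, the workaround mentioned above rebuilds the legacy, non-stemming 'persian' analyzer from its individual parts. Below is a sketch of such a custom analyzer definition, mirroring the token filter chain in PersianAnalyzerProvider further down; the index and analyzer names are placeholders, and the linked 8.15 documentation remains the authoritative version:

PUT /persian_legacy_example
{
  "settings": {
    "analysis": {
      "char_filter": {
        "zero_width_spaces": {
          "type": "mapping",
          "mappings": [ "\\u200C=>\\u0020" ]
        }
      },
      "filter": {
        "persian_stop": {
          "type": "stop",
          "stopwords": "_persian_"
        }
      },
      "analyzer": {
        "rebuilt_persian": {
          "tokenizer": "standard",
          "char_filter": [ "zero_width_spaces" ],
          "filter": [
            "lowercase",
            "decimal_digit",
            "arabic_normalization",
            "persian_normalization",
            "persian_stop"
          ]
        }
      }
    }
  }
}

The mapping char filter stands in for Lucene's PersianCharFilter, which folds zero-width non-joiners (U+200C) into spaces. Existing indices that should instead pick up the new stemming behaviour have to be recreated and reindexed.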

@@ -9,24 +9,72 @@

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.DecimalDigitFilter;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.fa.PersianCharFilter;
import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
import org.elasticsearch.index.analysis.Analysis;

+import java.io.Reader;
+
-public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider<PersianAnalyzer> {
+public class PersianAnalyzerProvider extends AbstractIndexAnalyzerProvider<StopwordAnalyzerBase> {

-    private final PersianAnalyzer analyzer;
+    private final StopwordAnalyzerBase analyzer;

    PersianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(name, settings);
-        analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
+        if (indexSettings.getIndexVersionCreated().onOrAfter(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)) {
+            // since Lucene 10 this analyzer contains stemming by default
+            analyzer = new PersianAnalyzer(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet()));
+        } else {
+            // for older index versions we need the old analyzer behaviour without stemming
+            analyzer = new StopwordAnalyzerBase(Analysis.parseStopWords(env, settings, PersianAnalyzer.getDefaultStopSet())) {
+
+                protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
+                    final Tokenizer source = new StandardTokenizer();
+                    TokenStream result = new LowerCaseFilter(source);
+                    result = new DecimalDigitFilter(result);
+                    result = new ArabicNormalizationFilter(result);
+                    /* additional persian-specific normalization */
+                    result = new PersianNormalizationFilter(result);
+                    /*
+                     * the order here is important: the stopword list is normalized with the
+                     * above!
+                     */
+                    return new TokenStreamComponents(source, new StopFilter(result, stopwords));
+                }
+
+                protected TokenStream normalize(String fieldName, TokenStream in) {
+                    TokenStream result = new LowerCaseFilter(in);
+                    result = new DecimalDigitFilter(result);
+                    result = new ArabicNormalizationFilter(result);
+                    /* additional persian-specific normalization */
+                    result = new PersianNormalizationFilter(result);
+                    return result;
+                }
+
+                protected Reader initReader(String fieldName, Reader reader) {
+                    return new PersianCharFilter(reader);
+                }
+            };
+        }
    }

    @Override
-    public PersianAnalyzer get() {
+    public StopwordAnalyzerBase get() {
        return this.analyzer;
    }
}
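Not part of the diff, but to illustrate the behavioural difference in isolation: a minimal sketch that runs Lucene 10's PersianAnalyzer directly (the demo class name is made up; assumes lucene-analysis-common 10.x on the classpath):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PersianStemmingDemo {
    public static void main(String[] args) throws Exception {
        // Lucene 10's PersianAnalyzer applies stemming by default
        try (Analyzer analyzer = new PersianAnalyzer(); TokenStream ts = analyzer.tokenStream("field", "كتابها")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term); // prints the stemmed form "كتاب"
            }
            ts.end();
        }
    }
}

Against Lucene 9 the same program prints the unstemmed كتابها, which is the behaviour the version check above preserves for indices created before the upgrade.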
@@ -0,0 +1,78 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.IndexVersion;
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.ESTokenStreamTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.elasticsearch.test.index.IndexVersionUtils;

import java.io.IOException;

import static org.apache.lucene.tests.analysis.BaseTokenStreamTestCase.assertAnalyzesTo;

/**
 * Tests the Persian analyzer factory and its behavioural changes with Lucene 10
 */
public class PersianAnalyzerProviderTests extends ESTokenStreamTestCase {

    public void testPersianAnalyzerPostLucene10() throws IOException {
        IndexVersion postLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersions.UPGRADE_TO_LUCENE_10_0_0,
            IndexVersion.current()
        );
        Settings settings = ESTestCase.indexSettings(1, 1)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetadata.SETTING_VERSION_CREATED, postLucene10Version)
            .build();
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
        Environment environment = new Environment(settings, null);

        PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider(
            idxSettings,
            environment,
            "my-analyzer",
            Settings.EMPTY
        );
        Analyzer analyzer = persianAnalyzerProvider.get();
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زياد", "خوانده" });
    }

    public void testPersianAnalyzerPreLucene10() throws IOException {
        IndexVersion preLucene10Version = IndexVersionUtils.randomVersionBetween(
            random(),
            IndexVersionUtils.getFirstVersion(),
            IndexVersionUtils.getPreviousVersion(IndexVersions.UPGRADE_TO_LUCENE_10_0_0)
        );
        Settings settings = ESTestCase.indexSettings(1, 1)
            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
            .put(IndexMetadata.SETTING_VERSION_CREATED, preLucene10Version)
            .build();
        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
        Environment environment = new Environment(settings, null);

        PersianAnalyzerProvider persianAnalyzerProvider = new PersianAnalyzerProvider(
            idxSettings,
            environment,
            "my-analyzer",
            Settings.EMPTY
        );
        Analyzer analyzer = persianAnalyzerProvider.get();
        assertAnalyzesTo(analyzer, "من کتاب های زیادی خوانده ام", new String[] { "كتاب", "زيادي", "خوانده" });
    }
}
@@ -901,6 +901,31 @@
  - length: { tokens: 1 }
  - match: { tokens.0.token: خورد }

---
"persian stemming":
  - requires:
      cluster_features: ["lucene_10_upgrade"]
      reason: "test requires persian analyzer stemming capabilities that come with Lucene 10"

  - do:
      indices.create:
        index: test
        body:
          settings:
            analysis:
              analyzer:
                my_analyzer:
                  type: persian

  - do:
      indices.analyze:
        index: test
        body:
          text: كتابها
          analyzer: my_analyzer
  - length: { tokens: 1 }
  - match: { tokens.0.token: كتاب }

---
"portuguese":
  - do:
@@ -33,6 +33,7 @@
import org.elasticsearch.index.IndexVersions;
import org.elasticsearch.index.mapper.DateFieldMapper;
import org.elasticsearch.rest.action.admin.indices.RestPutIndexTemplateAction;
+import org.elasticsearch.search.SearchFeatures;
import org.elasticsearch.test.NotEqualMessageBuilder;
import org.elasticsearch.test.XContentTestUtils;
import org.elasticsearch.test.cluster.ElasticsearchCluster;
@@ -1726,6 +1727,106 @@ public void testSystemIndexMetadataIsUpgraded() throws Exception {
}
}

    /**
     * This test ensures that search results on old indices using the "persian" analyzer don't change
     * after we introduce Lucene 10
     */
    public void testPersianAnalyzerBWC() throws Exception {
        var originalClusterLegacyPersianAnalyzer = oldClusterHasFeature(SearchFeatures.LUCENE_10_0_0_UPGRADE) == false;
        assumeTrue("Don't run this test if both versions already support stemming", originalClusterLegacyPersianAnalyzer);
        final String indexName = "test_persian_stemmer";
        Settings idxSettings = indexSettings(1, 1).build();
        String mapping = """
            {
              "properties": {
                "textfield" : {
                  "type": "text",
                  "analyzer": "persian"
                }
              }
            }
            """;

        String query = """
            {
              "query": {
                "match": {
                  "textfield": "كتابها"
                }
              }
            }
            """;

        if (isRunningAgainstOldCluster()) {
            createIndex(client(), indexName, idxSettings, mapping);
            ensureGreen(indexName);

            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + indexName + "/" + "_doc/1",
                        (builder, params) -> builder.field("textfield", "كتابها")
                    )
                )
            );
            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + indexName + "/" + "_doc/2",
                        (builder, params) -> builder.field("textfield", "كتاب")
                    )
                )
            );
            refresh(indexName);

            assertNumHits(indexName, 2, 1);

            Request searchRequest = new Request("POST", "/" + indexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(1, entityAsMap(client().performRequest(searchRequest)));
        } else {
            // the old index should still return only one doc
            Request searchRequest = new Request("POST", "/" + indexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(1, entityAsMap(client().performRequest(searchRequest)));

            String newIndexName = indexName + "_new";
            createIndex(client(), newIndexName, idxSettings, mapping);
            ensureGreen(newIndexName);

            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + newIndexName + "/" + "_doc/1",
                        (builder, params) -> builder.field("textfield", "كتابها")
                    )
                )
            );
            assertOK(
                client().performRequest(
                    newXContentRequest(
                        HttpMethod.POST,
                        "/" + newIndexName + "/" + "_doc/2",
                        (builder, params) -> builder.field("textfield", "كتاب")
                    )
                )
            );
            refresh(newIndexName);

            searchRequest = new Request("POST", "/" + newIndexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(2, entityAsMap(client().performRequest(searchRequest)));

            // searching across both indices (old and new analysis versions) should yield one hit from the old index and two from the new one
            searchRequest = new Request("POST", "/" + indexName + "," + newIndexName + "/_search");
            searchRequest.setJsonEntity(query);
            assertTotalHits(3, entityAsMap(client().performRequest(searchRequest)));
        }
    }

/**
* This test ensures that soft deletes are enabled when upgrading a pre-8 cluster to 8.0+
*/