From 5c840f72b7d01a5e15c535f5edaf974e45c1e847 Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Mon, 30 Sep 2024 14:03:44 -0400 Subject: [PATCH] Deprecate dutch_kp and lovins stemmer as they are removed in Lucene 10 (#113143) Lucene 10 has upgraded its Snowball stemming support. As part of those upgrades, two stemmers that are no longer supported were removed: `KpStemmer` and `LovinsStemmer`. These are `dutch_kp` and `lovins`, respectively. We will deprecate in 8.16 and will remove support for these in a future version. --- docs/changelog/113143.yaml | 10 +++++++ .../snowball-tokenfilter.asciidoc | 4 ++- .../tokenfilters/stemmer-tokenfilter.asciidoc | 4 +-- .../common/StemmerTokenFilterFactory.java | 18 +++++++++++++ .../StemmerTokenFilterFactoryTests.java | 27 ++++++++++++++++++- 5 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 docs/changelog/113143.yaml diff --git a/docs/changelog/113143.yaml b/docs/changelog/113143.yaml new file mode 100644 index 0000000000000..4a2044cca0ce4 --- /dev/null +++ b/docs/changelog/113143.yaml @@ -0,0 +1,10 @@ +pr: 113143 +summary: Deprecate dutch_kp and lovins stemmer as they are removed in Lucene 10 +area: Analysis +type: deprecation +issues: [] +deprecation: + title: Deprecate dutch_kp and lovins stemmer as they are removed in Lucene 10 + area: Analysis + details: kp, dutch_kp, dutchKp and lovins stemmers are deprecated and will be removed. + impact: These stemmers will be removed and will no longer be supported. 
diff --git a/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc index 57e402988cc5a..d8300288c9f4b 100644 --- a/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/snowball-tokenfilter.asciidoc @@ -11,6 +11,8 @@ values: `Arabic`, `Armenian`, `Basque`, `Catalan`, `Danish`, `Dutch`, `English`, `Lithuanian`, `Lovins`, `Norwegian`, `Porter`, `Portuguese`, `Romanian`, `Russian`, `Serbian`, `Spanish`, `Swedish`, `Turkish`. +deprecated:[8.16.0, `Kp` and `Lovins` support will be removed in a future version] + For example: [source,console] @@ -28,7 +30,7 @@ PUT /my-index-000001 "filter": { "my_snow": { "type": "snowball", - "language": "Lovins" + "language": "English" } } } diff --git a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc index 42ac594fca3bf..4cd088935af19 100644 --- a/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc +++ b/docs/reference/analysis/tokenfilters/stemmer-tokenfilter.asciidoc @@ -144,12 +144,12 @@ https://snowballstem.org/algorithms/danish/stemmer.html[*`danish`*] Dutch:: https://snowballstem.org/algorithms/dutch/stemmer.html[*`dutch`*], -https://snowballstem.org/algorithms/kraaij_pohlmann/stemmer.html[`dutch_kp`] +https://snowballstem.org/algorithms/kraaij_pohlmann/stemmer.html[`dutch_kp`] deprecated:[8.16.0, `dutch_kp` will be removed in a future version] English:: https://snowballstem.org/algorithms/porter/stemmer.html[*`english`*], https://ciir.cs.umass.edu/pubfiles/ir-35.pdf[`light_english`], -https://snowballstem.org/algorithms/lovins/stemmer.html[`lovins`], +https://snowballstem.org/algorithms/lovins/stemmer.html[`lovins`] deprecated:[8.16.0, `lovins` will be removed in a future version], https://www.researchgate.net/publication/220433848_How_effective_is_suffixing[`minimal_english`], 
https://snowballstem.org/algorithms/english/stemmer.html[`porter2`], {lucene-analysis-docs}/en/EnglishPossessiveFilter.html[`possessive_english`] diff --git a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java index afb3d69733d02..1c71c64311517 100644 --- a/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java +++ b/modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactory.java @@ -47,6 +47,8 @@ import org.apache.lucene.analysis.snowball.SnowballFilter; import org.apache.lucene.analysis.sv.SwedishLightStemFilter; import org.elasticsearch.common.Strings; +import org.elasticsearch.common.logging.DeprecationCategory; +import org.elasticsearch.common.logging.DeprecationLogger; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; import org.elasticsearch.index.IndexSettings; @@ -81,6 +83,8 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory { + private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(StemmerTokenFilterFactory.class); + private static final TokenStream EMPTY_TOKEN_STREAM = new EmptyTokenStream(); private String language; @@ -90,6 +94,20 @@ public class StemmerTokenFilterFactory extends AbstractTokenFilterFactory { this.language = Strings.capitalize(settings.get("language", settings.get("name", "porter"))); // check that we have a valid language by trying to create a TokenStream create(EMPTY_TOKEN_STREAM).close(); + if ("lovins".equalsIgnoreCase(language)) { + deprecationLogger.critical( + DeprecationCategory.ANALYSIS, + "lovins_deprecation", + "The [lovins] stemmer is deprecated and will be removed in a future version." 
+ ); + } + if ("dutch_kp".equalsIgnoreCase(language) || "dutchKp".equalsIgnoreCase(language) || "kp".equalsIgnoreCase(language)) { + deprecationLogger.critical( + DeprecationCategory.ANALYSIS, + "dutch_kp_deprecation", + "The [dutch_kp] stemmer is deprecated and will be removed in a future version." + ); + } } @Override diff --git a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java index a1c95deb65a52..8f3d52f0174c6 100644 --- a/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java +++ b/modules/analysis-common/src/test/java/org/elasticsearch/analysis/common/StemmerTokenFilterFactoryTests.java @@ -32,7 +32,6 @@ import static org.hamcrest.Matchers.instanceOf; public class StemmerTokenFilterFactoryTests extends ESTokenStreamTestCase { - private static final CommonAnalysisPlugin PLUGIN = new CommonAnalysisPlugin(); public void testEnglishFilterFactory() throws IOException { @@ -103,4 +102,30 @@ public void testMultipleLanguagesThrowsException() throws IOException { ); assertEquals("Invalid stemmer class specified: [english, light_english]", e.getMessage()); } + + public void testKpDeprecation() throws IOException { + IndexVersion v = IndexVersionUtils.randomVersion(random()); + Settings settings = Settings.builder() + .put("index.analysis.filter.my_kp.type", "stemmer") + .put("index.analysis.filter.my_kp.language", "kp") + .put(SETTING_VERSION_CREATED, v) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + + AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN); + assertCriticalWarnings("The [dutch_kp] stemmer is deprecated and will be removed in a future version."); + } + + public void testLovinsDeprecation() throws IOException { + IndexVersion v = IndexVersionUtils.randomVersion(random()); + 
Settings settings = Settings.builder() + .put("index.analysis.filter.my_lovins.type", "stemmer") + .put("index.analysis.filter.my_lovins.language", "lovins") + .put(SETTING_VERSION_CREATED, v) + .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()) + .build(); + + AnalysisTestsHelper.createTestAnalysisFromSettings(settings, PLUGIN); + assertCriticalWarnings("The [lovins] stemmer is deprecated and will be removed in a future version."); + } }