diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index f99553b963f5..cdd47e4e1de5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,6 +7,9 @@ http://s.apache.org/luceneversions New Features +* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple + facet ordinals at once (Gautam Worah, Mike McCandless) + * LUCENE-9322: Vector-valued fields, Lucene90 Codec (Mike Sokolov, Julie Tibshirani, Tomoko Uchida) * LUCENE-9004: Approximate nearest vector search via NSW graphs diff --git a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java index 45b9a714b582..75339e24748f 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java @@ -18,10 +18,12 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.function.IntUnaryOperator; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.lucene.document.Document; @@ -31,7 +33,7 @@ import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays; import org.apache.lucene.facet.taxonomy.TaxonomyReader; import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.CorruptIndexException; // javadocs +import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LeafReader; @@ -46,6 +48,7 @@ import org.apache.lucene.util.Accountables; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.InPlaceMergeSorter; import org.apache.lucene.util.RamUsageEstimator; /** @@ -320,23 +323,16 @@ public FacetLabel getPath(int ordinal) throws IOException { // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR // instance recognizes. Therefore we do this check up front, before we hit // the cache. - if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { - return null; - } + int indexReaderMaxDoc = indexReader.maxDoc(); + checkOrdinalBounds(ordinal, indexReaderMaxDoc); - // TODO: can we use an int-based hash impl, such as IntToObjectMap, - // wrapped as LRU? - Integer catIDInteger = Integer.valueOf(ordinal); - synchronized (categoryCache) { - FacetLabel res = categoryCache.get(catIDInteger); - if (res != null) { - return res; - } + FacetLabel ordinalPath = getPathFromCache(ordinal); + if (ordinalPath != null) { + return ordinalPath; } int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves()); LeafReader leafReader = indexReader.leaves().get(readerIndex).reader(); - // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL); FacetLabel ret; @@ -353,12 +349,137 @@ public FacetLabel getPath(int ordinal) throws IOException { } synchronized (categoryCache) { - categoryCache.put(catIDInteger, ret); + categoryCache.put(ordinal, ret); } return ret; } + private FacetLabel getPathFromCache(int ordinal) { + // TODO: can we use an int-based hash impl, such as IntToObjectMap, + // wrapped as LRU? + synchronized (categoryCache) { + return categoryCache.get(ordinal); + } + } + + private void checkOrdinalBounds(int ordinal, int indexReaderMaxDoc) + throws IllegalArgumentException { + if (ordinal < 0 || ordinal >= indexReaderMaxDoc) { + throw new IllegalArgumentException( + "ordinal " + + ordinal + + " is out of the range of the indexReader " + + indexReader.toString()); + } + } + + /** + * Returns an array of FacetLabels for a given array of ordinals. + * + *

This API is generally faster than iteratively calling {@link #getPath(int)} over an array of + * ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index + * was created using StoredFields (with no performance gains) and uses DocValues based iteration + * when the index is based on DocValues. + * + * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy + * index + */ + public FacetLabel[] getBulkPath(int... ordinals) throws IOException { + ensureOpen(); + + int ordinalsLength = ordinals.length; + FacetLabel[] bulkPath = new FacetLabel[ordinalsLength]; + // remember the original positions of ordinals before they are sorted + int originalPosition[] = new int[ordinalsLength]; + Arrays.setAll(originalPosition, IntUnaryOperator.identity()); + int indexReaderMaxDoc = indexReader.maxDoc(); + + for (int i = 0; i < ordinalsLength; i++) { + // check whether the ordinal is valid before accessing the cache + checkOrdinalBounds(ordinals[i], indexReaderMaxDoc); + // check the cache before trying to find it in the index + FacetLabel ordinalPath = getPathFromCache(ordinals[i]); + if (ordinalPath != null) { + bulkPath[i] = ordinalPath; + } + } + + // parallel sort the ordinals and originalPosition array based on the values in the ordinals + // array + new InPlaceMergeSorter() { + @Override + protected void swap(int i, int j) { + int x = ordinals[i]; + ordinals[i] = ordinals[j]; + ordinals[j] = x; + + x = originalPosition[i]; + originalPosition[i] = originalPosition[j]; + originalPosition[j] = x; + } + ; + + @Override + public int compare(int i, int j) { + return Integer.compare(ordinals[i], ordinals[j]); + } + }.sort(0, ordinalsLength); + + int readerIndex; + int leafReaderMaxDoc = 0; + int leafReaderDocBase = 0; + LeafReader leafReader; + LeafReaderContext leafReaderContext; + BinaryDocValues values = null; + + for (int i = 0; i < ordinalsLength; i++) { + if (bulkPath[originalPosition[i]] == null) { + if (values == null || ordinals[i] >= leafReaderMaxDoc) { + + readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves()); + leafReaderContext = indexReader.leaves().get(readerIndex); + leafReader = leafReaderContext.reader(); + leafReaderMaxDoc = leafReader.maxDoc(); + leafReaderDocBase = leafReaderContext.docBase; + values = leafReader.getBinaryDocValues(Consts.FULL); + + // this check is only needed once to confirm that the index uses BinaryDocValues + boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase); + if (success == false) { + return getBulkPathForOlderIndexes(ordinals); + } + } + boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase); + assert success; + bulkPath[originalPosition[i]] = + new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString())); + } + } + + for (int i = 0; i < ordinalsLength; i++) { + synchronized (categoryCache) { + categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]); + } + } + + return bulkPath; + } + + /** + * This function is only used when the underlying taxonomy index was constructed using an older + * (slower) StoredFields based codec (< 8.7). The {@link #getBulkPath(int...)} function calls it + * internally when it realizes that the index uses StoredFields. + */ + private FacetLabel[] getBulkPathForOlderIndexes(int... ordinals) throws IOException { + FacetLabel[] bulkPath = new FacetLabel[ordinals.length]; + for (int i = 0; i < ordinals.length; i++) { + bulkPath[i] = getPath(ordinals[i]); + } + + return bulkPath; + } + @Override public int getSize() { ensureOpen(); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java index 2c1e7087a7ea..1376a29ee9b0 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/TestTaxonomyCombined.java @@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception { } } // (also test invalid ordinals:) - assertNull(tr.getPath(-1)); - assertNull(tr.getPath(tr.getSize())); - assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL)); + expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1)); + expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize())); + expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL)); // test TaxonomyReader.getOrdinal(): for (int i = 1; i < expectedCategories.length; i++) { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java index d62040f8c12d..b4894cce9968 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestBackwardsCompatibility.java @@ -84,6 +84,31 @@ private void createNewTaxonomyIndex(String dirName) throws IOException { dir.close(); } + // Opens up a pre-existing index and tries to run getBulkPath on it + public void testGetBulkPathOnOlderCodec() throws Exception { + Path indexDir = createTempDir(oldTaxonomyIndexName); + TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir); + Directory dir = newFSDirectory(indexDir); + + DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir); + + FacetLabel cp_b = new FacetLabel("b"); + writer.addCategory(cp_b); + writer.getInternalIndexWriter().forceMerge(1); + writer.commit(); + + DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer); + + FacetLabel[] facetLabels = {new FacetLabel("a"), cp_b}; + int[] ordinals = + new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])}; + // The zip file already contains a category "a" stored with the older StoredFields codec + assertArrayEquals(facetLabels, reader.getBulkPath(ordinals)); + reader.close(); + writer.close(); + dir.close(); + } + // Used to create a fresh taxonomy index with StoredFields @Ignore public void testCreateOldTaxonomy() throws IOException { diff --git a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java index 47e42dc8e238..39c19304c2bf 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/taxonomy/directory/TestDirectoryTaxonomyReader.java @@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception { // check that r1 doesn't see cp_b assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b)); - assertNull(r1.getPath(2)); + expectThrows(IllegalArgumentException.class, () -> r1.getPath(2)); r1.close(); r2.close(); @@ -569,4 +569,39 @@ public void testAccountable() throws Exception { taxoReader.close(); dir.close(); } + + public void testCallingBulkPathReturnsCorrectResult() throws Exception { + float PROBABILITY_OF_COMMIT = 0.5f; + Directory src = newDirectory(); + DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src); + String randomArray[] = new String[random().nextInt(1000)]; + // adding a smaller bound on ints ensures that we will have some duplicate ordinals in random + // test cases + Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt(500))); + + FacetLabel allPaths[] = new FacetLabel[randomArray.length]; + int allOrdinals[] = new int[randomArray.length]; + + for (int i = 0; i < randomArray.length; i++) { + allPaths[i] = new FacetLabel(randomArray[i]); + w.addCategory(allPaths[i]); + // add random commits to create multiple segments in the index + if (random().nextFloat() < PROBABILITY_OF_COMMIT) { + w.commit(); + } + } + w.commit(); + w.close(); + + DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src); + + for (int i = 0; i < allPaths.length; i++) { + allOrdinals[i] = r1.getOrdinal(allPaths[i]); + } + + FacetLabel allBulkPaths[] = r1.getBulkPath(allOrdinals); + assertArrayEquals(allPaths, allBulkPaths); + r1.close(); + src.close(); + } }