From 1875ed65503a2ea9cc4010581bb0fe2a4c40b5bc Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 20 Sep 2023 10:13:47 -0400 Subject: [PATCH] Version bump. --- CMakeLists.txt | 2 +- Doxyfile | 2 +- include/simdjson/simdjson_version.h | 6 +- singleheader/simdjson.cpp | 904 +++++++++++++--------------- singleheader/simdjson.h | 88 ++- 5 files changed, 446 insertions(+), 556 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 31e114a2ef..c7722a661b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.14) project( simdjson # The version number is modified by tools/release.py - VERSION 3.2.3 + VERSION 3.3.0 DESCRIPTION "Parsing gigabytes of JSON per second" HOMEPAGE_URL "https://simdjson.org/" LANGUAGES CXX C diff --git a/Doxyfile b/Doxyfile index 1bea93fa8a..24bb536578 100644 --- a/Doxyfile +++ b/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = simdjson # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "3.2.3" +PROJECT_NUMBER = "3.3.0" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/include/simdjson/simdjson_version.h b/include/simdjson/simdjson_version.h index 88ca8c1735..29db5b7c66 100644 --- a/include/simdjson/simdjson_version.h +++ b/include/simdjson/simdjson_version.h @@ -4,7 +4,7 @@ #define SIMDJSON_SIMDJSON_VERSION_H /** The version of simdjson being used (major.minor.revision) */ -#define SIMDJSON_VERSION "3.2.3" +#define SIMDJSON_VERSION "3.3.0" namespace simdjson { enum { @@ -15,11 +15,11 @@ enum { /** * The minor version (major.MINOR.revision) of simdjson being used. */ - SIMDJSON_VERSION_MINOR = 2, + SIMDJSON_VERSION_MINOR = 3, /** * The revision (major.minor.REVISION) of simdjson being used. */ - SIMDJSON_VERSION_REVISION = 3 + SIMDJSON_VERSION_REVISION = 0 }; } // namespace simdjson diff --git a/singleheader/simdjson.cpp b/singleheader/simdjson.cpp index 5e8e8b8f6f..291df4d930 100644 --- a/singleheader/simdjson.cpp +++ b/singleheader/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2023-08-18 14:37:10 -0400. Do not edit! */ +/* auto-generated on . Do not edit! */ /* including simdjson.cpp: */ /* begin file simdjson.cpp */ #define SIMDJSON_SRC_SIMDJSON_CPP @@ -59,6 +59,14 @@ #error simdjson requires a compiler compliant with the C++11 standard #endif +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + #endif // SIMDJSON_COMPILER_CHECK_H /* end file simdjson/compiler_check.h */ /* including simdjson/portability.h: #include "simdjson/portability.h" */ @@ -8787,8 +8795,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -8803,12 +8810,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
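The carry-propagation idiom above is easy to miss, so here is a minimal standalone sketch of what the hunk computes. This is not simdjson's exact API: `value128` and `full_multiplication` mirror the names in the patch, but `five_hi`/`five_lo` are hypothetical stand-ins for `power_of_five_128[index]` and `power_of_five_128[index + 1]`, and `__uint128_t` assumes GCC or Clang.

```cpp
#include <cstdint>

struct value128 { uint64_t low; uint64_t high; };

// 64x64 -> 128-bit multiplication, assuming a compiler with __uint128_t.
static inline value128 full_multiplication(uint64_t a, uint64_t b) {
  unsigned __int128 p = static_cast<unsigned __int128>(a) * b;
  return { static_cast<uint64_t>(p), static_cast<uint64_t>(p >> 64) };
}

// Approximate i * 5^q where 5^q is represented by its two most significant
// 64-bit words (five_hi, five_lo). One multiplication usually suffices; the
// second refines the low word, as in compute_float_64 above.
static inline value128 truncated_product(uint64_t i, uint64_t five_hi, uint64_t five_lo) {
  value128 firstproduct  = full_multiplication(i, five_hi);
  value128 secondproduct = full_multiplication(i, five_lo);
  firstproduct.low += secondproduct.high;
  // Unsigned addition wrapped around exactly when the sum ended up smaller
  // than the addend; that wraparound is the carry into the high word.
  if (secondproduct.high > firstproduct.low) { firstproduct.high++; }
  return firstproduct;
}
```

The Mushtak-Lemire result cited in the new comment is what licenses deleting the `0xFFFFFFFFFFFFFFFF` escape hatch: the two-word product is provably always accurate enough, so the slow fallback can never be reached.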
@@ -11544,14 +11548,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -12009,6 +12005,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -12026,91 +12074,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
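The core extraction step is the same on every kernel: peel set bits off a 64-bit structural mask one at a time. Below is a standalone sketch of the x64 flavor, not simdjson's exact API: `trailing_zeroes`, `clear_lowest_bit`, and the output pointer playing the role of `this->tail` are written here with GCC/Clang builtins.

```cpp
#include <cstdint>

// Hypothetical helpers mirroring simdjson's portability layer.
static inline int trailing_zeroes(uint64_t x) {
  return x ? __builtin_ctzll(x) : 64;   // assumes GCC/Clang builtins
}
static inline uint64_t clear_lowest_bit(uint64_t x) { return x & (x - 1); }

// Decode every set bit of 'bits' into the absolute index idx + position.
// 'out' must have room for spill slots past the true count, because the
// unconditional first batch may store garbage that the final 'out + cnt'
// adjustment simply ignores (simdjson over-allocates its index buffer too).
static inline uint32_t *write_bit_positions(uint32_t *out, uint32_t idx, uint64_t bits) {
  int cnt = __builtin_popcountll(bits);
  for (int i = 0; i < 8; i++) {          // first batch: no branch on cnt
    out[i] = idx + trailing_zeroes(bits);
    bits = clear_lowest_bit(bits);
  }
  for (int i = 8; i < cnt; i++) {        // rare dense tail; mispredicts are fine here
    out[i] = idx + trailing_zeroes(bits);
    bits = clear_lowest_bit(bits);
  }
  return out + cnt;                      // slots beyond cnt are never read
}
```

Trading a few wasted stores for fewer data-dependent branches is the whole point of the unconditional batch; the patch generalizes exactly this shape into configurable compile-time batches.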
@@ -12959,15 +12945,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
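The rewritten dispatch leans on a classic trick: `c - '0'` maps the ten digit bytes onto 0..9, so one comparison replaces ten `case` labels. A minimal sketch follows; the enum and function names are illustrative, not simdjson's. Note that the patch writes the test on the promoted signed difference, so stray bytes below `'0'` are also routed to the number parser, which then rejects them; the sketch uses the unsigned-wrap form of the same range check.

```cpp
#include <cstdint>

// '0'..'9' land on 0..9; anything below '0' wraps around to a large
// value and fails the test, as do bytes above '9'.
static inline bool is_ascii_digit(uint8_t c) {
  return static_cast<uint8_t>(c - '0') < 10;
}

enum class scalar_kind { string, number, atom_or_error };

// Mirrors the dispatch order above: strings first, then numbers, then
// the rarer atoms, so the hot cases exit after one or two comparisons.
static inline scalar_kind classify(uint8_t c) {
  if (c == '"') return scalar_kind::string;
  if (is_ascii_digit(c) || c == '-') return scalar_kind::number;
  return scalar_kind::atom_or_error;  // 't', 'f', 'n', or garbage
}
```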
@@ -14645,8 +14633,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -14661,12 +14648,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -16840,15 +16824,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -19054,8 +19040,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -19070,12 +19055,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -21652,14 +21634,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -22117,6 +22091,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -22134,91 +22160,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
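The fixed `8 / 8 / remainder` schedule of the old flatten code is replaced by batches of `STEP` (default 4, tunable via `SIMDJSON_STRUCTURAL_INDEXER_STEP`) unrolled at compile time up to `STEP_UNTIL` (24), with a scalar loop for rarer, denser blocks. A minimal C++17 sketch of the recursion pattern follows; names mirror the patch, but `write_one` and `write_batch`/`write_stepped` are illustrative stand-ins, not simdjson's API.

```cpp
#include <cstdint>

// One extraction step; stands in for the patch's write_index().
static inline void write_one(uint32_t *out, uint32_t idx, uint64_t &bits, int i) {
  out[i] = idx + (bits ? __builtin_ctzll(bits) : 64); // assumes GCC/Clang builtins
  bits &= bits - 1;                                   // clear lowest set bit
}

// Emit N indexes unconditionally, fully unrolled at compile time.
template <int START, int N>
inline void write_batch(uint32_t *out, uint32_t idx, uint64_t &bits) {
  write_one(out, idx, bits, START);
  if constexpr (N > 1) {
    write_batch<START + 1, N - 1>(out, idx, bits);
  }
}

// Cover up to END set bits in batches of STEP. The first batch is
// unconditional; each further batch costs one branch on the popcount,
// which stays well predicted because high counts are rare.
template <int START, int END, int STEP>
inline void write_stepped(uint32_t *out, uint32_t idx, uint64_t &bits, int cnt) {
  write_batch<START, STEP>(out, idx, bits);
  if constexpr (START + STEP < END) {
    if (START + STEP < cnt) {
      write_stepped<START + STEP, END, STEP>(out, idx, bits, cnt);
    }
  }
}
```

The patch cannot assume C++17, so `SIMDJSON_IF_CONSTEXPR` may expand to a plain `if`; the not-taken branch is then still instantiated, which is why the real code clamps its template arguments, as in `write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>`, to keep the dead instantiation legal on C++11.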
@@ -23067,15 +23031,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -25242,8 +25208,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -25258,12 +25223,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -27838,14 +27800,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -28303,6 +28257,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -28320,91 +28326,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
@@ -29253,15 +29197,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -31586,8 +31532,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -31602,12 +31547,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -34295,14 +34237,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -34760,6 +34694,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -34777,91 +34763,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
@@ -35710,15 +35634,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -38296,8 +38222,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -38312,12 +38237,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -41326,14 +41248,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -41791,6 +41705,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
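For readers unfamiliar with the `SIMDJSON_PREFER_REVERSE_BITS` branch just above, here is a self-contained sketch of the idea. The helpers are assumptions written with GCC/Clang builtins and AArch64 inline assembly, not simdjson's wrappers: after one RBIT, the lowest set bit of `bits` becomes the highest set bit of `rev`, so its position is simply the leading-zero count.

```cpp
#include <cstdint>

// Bit reversal: a single RBIT instruction on AArch64, with a portable
// (slow) fallback so the sketch stays self-contained.
static inline uint64_t reverse_bits(uint64_t x) {
#if defined(__aarch64__)
  uint64_t r;
  __asm__("rbit %0, %1" : "=r"(r) : "r"(x));
  return r;
#else
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) { r = (r << 1) | ((x >> i) & 1); }
  return r;
#endif
}

static inline int leading_zeroes(uint64_t x) { return x ? __builtin_clzll(x) : 64; }

// Clear the most significant set bit given its leading-zero count.
static inline uint64_t zero_leading_bit(uint64_t rev, int lz) {
  return lz >= 64 ? rev : rev ^ (uint64_t(1) << (63 - lz));
}

static inline uint32_t *write_bit_positions_rev(uint32_t *out, uint32_t idx, uint64_t bits) {
  int cnt = __builtin_popcountll(bits);
  uint64_t rev = reverse_bits(bits);
  for (int i = 0; i < cnt; i++) {
    int lz = leading_zeroes(rev);  // position of the lowest set bit of 'bits'
    out[i] = idx + lz;
    rev = zero_leading_bit(rev, lz);
  }
  return out + cnt;
}
```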
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -41808,91 +41774,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
@@ -42741,15 +42645,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
diff --git a/singleheader/simdjson.h b/singleheader/simdjson.h
index 18795965b2..7264bc8561 100644
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2023-08-18 14:37:10 -0400. Do not edit! */
+/* auto-generated on . Do not edit!
*/ /* including simdjson.h: */ /* begin file simdjson.h */ #ifndef SIMDJSON_H @@ -79,6 +79,14 @@ #error simdjson requires a compiler compliant with the C++11 standard #endif +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + #endif // SIMDJSON_COMPILER_CHECK_H /* end file simdjson/compiler_check.h */ /* including simdjson/portability.h: #include "simdjson/portability.h" */ @@ -2314,7 +2322,7 @@ namespace std { #define SIMDJSON_SIMDJSON_VERSION_H /** The version of simdjson being used (major.minor.revision) */ -#define SIMDJSON_VERSION "3.2.3" +#define SIMDJSON_VERSION "3.3.0" namespace simdjson { enum { @@ -2325,11 +2333,11 @@ enum { /** * The minor version (major.MINOR.revision) of simdjson being used. */ - SIMDJSON_VERSION_MINOR = 2, + SIMDJSON_VERSION_MINOR = 3, /** * The revision (major.minor.REVISION) of simdjson being used. */ - SIMDJSON_VERSION_REVISION = 3 + SIMDJSON_VERSION_REVISION = 0 }; } // namespace simdjson @@ -11813,8 +11821,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. @@ -11829,12 +11836,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -13869,8 +13873,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -13885,12 +13888,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -16417,8 +16417,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. @@ -16433,12 +16432,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -18964,8 +18960,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -18980,12 +18975,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -21626,8 +21618,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. @@ -21642,12 +21633,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -24611,8 +24599,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -24627,12 +24614,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high;