From 1875ed65503a2ea9cc4010581bb0fe2a4c40b5bc Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Wed, 20 Sep 2023 10:13:47 -0400 Subject: [PATCH] Version bump. --- CMakeLists.txt | 2 +- Doxyfile | 2 +- include/simdjson/simdjson_version.h | 6 +- singleheader/simdjson.cpp | 904 +++++++++++++--------------- singleheader/simdjson.h | 88 ++- 5 files changed, 446 insertions(+), 556 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 31e114a2ef..c7722a661b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.14) project( simdjson # The version number is modified by tools/release.py - VERSION 3.2.3 + VERSION 3.3.0 DESCRIPTION "Parsing gigabytes of JSON per second" HOMEPAGE_URL "https://simdjson.org/" LANGUAGES CXX C diff --git a/Doxyfile b/Doxyfile index 1bea93fa8a..24bb536578 100644 --- a/Doxyfile +++ b/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = simdjson # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "3.2.3" +PROJECT_NUMBER = "3.3.0" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/include/simdjson/simdjson_version.h b/include/simdjson/simdjson_version.h index 88ca8c1735..29db5b7c66 100644 --- a/include/simdjson/simdjson_version.h +++ b/include/simdjson/simdjson_version.h @@ -4,7 +4,7 @@ #define SIMDJSON_SIMDJSON_VERSION_H /** The version of simdjson being used (major.minor.revision) */ -#define SIMDJSON_VERSION "3.2.3" +#define SIMDJSON_VERSION "3.3.0" namespace simdjson { enum { @@ -15,11 +15,11 @@ enum { /** * The minor version (major.MINOR.revision) of simdjson being used. */ - SIMDJSON_VERSION_MINOR = 2, + SIMDJSON_VERSION_MINOR = 3, /** * The revision (major.minor.REVISION) of simdjson being used. */ - SIMDJSON_VERSION_REVISION = 3 + SIMDJSON_VERSION_REVISION = 0 }; } // namespace simdjson diff --git a/singleheader/simdjson.cpp b/singleheader/simdjson.cpp index 5e8e8b8f6f..291df4d930 100644 --- a/singleheader/simdjson.cpp +++ b/singleheader/simdjson.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2023-08-18 14:37:10 -0400. Do not edit! */ +/* auto-generated on . Do not edit! */ /* including simdjson.cpp: */ /* begin file simdjson.cpp */ #define SIMDJSON_SRC_SIMDJSON_CPP @@ -59,6 +59,14 @@ #error simdjson requires a compiler compliant with the C++11 standard #endif +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + #endif // SIMDJSON_COMPILER_CHECK_H /* end file simdjson/compiler_check.h */ /* including simdjson/portability.h: #include "simdjson/portability.h" */ @@ -8787,8 +8795,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -8803,12 +8810,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
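The carry-propagation idiom above is easy to miss, so here is a minimal standalone sketch of what the hunk computes. This is not simdjson's exact API: `value128` and `full_multiplication` mirror the names in the patch, but `five_hi`/`five_lo` are hypothetical stand-ins for `power_of_five_128[index]` and `power_of_five_128[index + 1]`, and `__uint128_t` assumes GCC or Clang.

```cpp
#include <cstdint>

struct value128 { uint64_t low; uint64_t high; };

// 64x64 -> 128-bit multiplication, assuming a compiler with __uint128_t.
static inline value128 full_multiplication(uint64_t a, uint64_t b) {
  unsigned __int128 p = static_cast<unsigned __int128>(a) * b;
  return { static_cast<uint64_t>(p), static_cast<uint64_t>(p >> 64) };
}

// Approximate i * 5^q where 5^q is represented by its two most significant
// 64-bit words (five_hi, five_lo). One multiplication usually suffices; the
// second refines the low word, as in compute_float_64 above.
static inline value128 truncated_product(uint64_t i, uint64_t five_hi, uint64_t five_lo) {
  value128 firstproduct  = full_multiplication(i, five_hi);
  value128 secondproduct = full_multiplication(i, five_lo);
  firstproduct.low += secondproduct.high;
  // Unsigned addition wrapped around exactly when the sum ended up smaller
  // than the addend; that wraparound is the carry into the high word.
  if (secondproduct.high > firstproduct.low) { firstproduct.high++; }
  return firstproduct;
}
```

The Mushtak-Lemire result cited in the new comment is what licenses deleting the `0xFFFFFFFFFFFFFFFF` escape hatch: the two-word product is provably always accurate enough, so the slow fallback can never be reached.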
@@ -11544,14 +11548,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -12009,6 +12005,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -12026,91 +12074,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
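The core extraction step is the same on every kernel: peel set bits off a 64-bit structural mask one at a time. Below is a standalone sketch of the x64 flavor, not simdjson's exact API: `trailing_zeroes`, `clear_lowest_bit`, and the output pointer playing the role of `this->tail` are written here with GCC/Clang builtins.

```cpp
#include <cstdint>

// Hypothetical helpers mirroring simdjson's portability layer.
static inline int trailing_zeroes(uint64_t x) {
  return x ? __builtin_ctzll(x) : 64;   // assumes GCC/Clang builtins
}
static inline uint64_t clear_lowest_bit(uint64_t x) { return x & (x - 1); }

// Decode every set bit of 'bits' into the absolute index idx + position.
// 'out' must have room for spill slots past the true count, because the
// unconditional first batch may store garbage that the final 'out + cnt'
// adjustment simply ignores (simdjson over-allocates its index buffer too).
static inline uint32_t *write_bit_positions(uint32_t *out, uint32_t idx, uint64_t bits) {
  int cnt = __builtin_popcountll(bits);
  for (int i = 0; i < 8; i++) {          // first batch: no branch on cnt
    out[i] = idx + trailing_zeroes(bits);
    bits = clear_lowest_bit(bits);
  }
  for (int i = 8; i < cnt; i++) {        // rare dense tail; mispredicts are fine here
    out[i] = idx + trailing_zeroes(bits);
    bits = clear_lowest_bit(bits);
  }
  return out + cnt;                      // slots beyond cnt are never read
}
```

Trading a few wasted stores for fewer data-dependent branches is the whole point of the unconditional batch; the patch generalizes exactly this shape into configurable compile-time batches.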
@@ -12959,15 +12945,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
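The rewritten dispatch leans on a classic trick: `c - '0'` maps the ten digit bytes onto 0..9, so one comparison replaces ten `case` labels. A minimal sketch follows; the enum and function names are illustrative, not simdjson's. Note that the patch writes the test on the promoted signed difference, so stray bytes below `'0'` are also routed to the number parser, which then rejects them; the sketch uses the unsigned-wrap form of the same range check.

```cpp
#include <cstdint>

// '0'..'9' land on 0..9; anything below '0' wraps around to a large
// value and fails the test, as do bytes above '9'.
static inline bool is_ascii_digit(uint8_t c) {
  return static_cast<uint8_t>(c - '0') < 10;
}

enum class scalar_kind { string, number, atom_or_error };

// Mirrors the dispatch order above: strings first, then numbers, then
// the rarer atoms, so the hot cases exit after one or two comparisons.
static inline scalar_kind classify(uint8_t c) {
  if (c == '"') return scalar_kind::string;
  if (is_ascii_digit(c) || c == '-') return scalar_kind::number;
  return scalar_kind::atom_or_error;  // 't', 'f', 'n', or garbage
}
```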
@@ -14645,8 +14633,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -14661,12 +14648,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -16840,15 +16824,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -19054,8 +19040,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -19070,12 +19055,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -21652,14 +21634,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -22117,6 +22091,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -22134,91 +22160,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
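The fixed `8 / 8 / remainder` schedule of the old flatten code is replaced by batches of `STEP` (default 4, tunable via `SIMDJSON_STRUCTURAL_INDEXER_STEP`) unrolled at compile time up to `STEP_UNTIL` (24), with a scalar loop for rarer, denser blocks. A minimal C++17 sketch of the recursion pattern follows; names mirror the patch, but `write_one` and `write_batch`/`write_stepped` are illustrative stand-ins, not simdjson's API.

```cpp
#include <cstdint>

// One extraction step; stands in for the patch's write_index().
static inline void write_one(uint32_t *out, uint32_t idx, uint64_t &bits, int i) {
  out[i] = idx + (bits ? __builtin_ctzll(bits) : 64); // assumes GCC/Clang builtins
  bits &= bits - 1;                                   // clear lowest set bit
}

// Emit N indexes unconditionally, fully unrolled at compile time.
template <int START, int N>
inline void write_batch(uint32_t *out, uint32_t idx, uint64_t &bits) {
  write_one(out, idx, bits, START);
  if constexpr (N > 1) {
    write_batch<START + 1, N - 1>(out, idx, bits);
  }
}

// Cover up to END set bits in batches of STEP. The first batch is
// unconditional; each further batch costs one branch on the popcount,
// which stays well predicted because high counts are rare.
template <int START, int END, int STEP>
inline void write_stepped(uint32_t *out, uint32_t idx, uint64_t &bits, int cnt) {
  write_batch<START, STEP>(out, idx, bits);
  if constexpr (START + STEP < END) {
    if (START + STEP < cnt) {
      write_stepped<START + STEP, END, STEP>(out, idx, bits, cnt);
    }
  }
}
```

The patch cannot assume C++17, so `SIMDJSON_IF_CONSTEXPR` may expand to a plain `if`; the not-taken branch is then still instantiated, which is why the real code clamps its template arguments, as in `write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>`, to keep the dead instantiation legal on C++11.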
@@ -23067,15 +23031,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -25242,8 +25208,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -25258,12 +25223,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -27838,14 +27800,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -28303,6 +28257,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -28320,91 +28326,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
@@ -29253,15 +29197,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -31586,8 +31532,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -31602,12 +31547,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -34295,14 +34237,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -34760,6 +34694,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -34777,91 +34763,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
@@ -35710,15 +35634,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
@@ -38296,8 +38222,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
   // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q
   // to the desired approximation using one multiplication. Sometimes it does not suffice.
   // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and
-  // then we get a better approximation to i * 5^q. In very rare cases, even that
-  // will not suffice, though it is seemingly very hard to find such a scenario.
+  // then we get a better approximation to i * 5^q.
   //
   // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat
   // more complicated.
@@ -38312,12 +38237,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative,
     simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]);
     firstproduct.low += secondproduct.high;
     if(secondproduct.high > firstproduct.low) { firstproduct.high++; }
-    // At this point, we might need to add at most one to firstproduct, but this
-    // can only change the value of firstproduct.high if firstproduct.low is maximal.
-    if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) {
-      // This is very unlikely, but if so, we need to do much more work!
-      return false;
-    }
+    // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without
+    // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product
+    // is sufficiently accurate, and more computation is not needed.
   }
   uint64_t lower = firstproduct.low;
   uint64_t upper = firstproduct.high;
@@ -41326,14 +41248,6 @@ using namespace simd;
     this->error |= this->prev_incomplete;
   }
 
-#ifndef SIMDJSON_IF_CONSTEXPR
-#if SIMDJSON_CPLUSPLUS17
-#define SIMDJSON_IF_CONSTEXPR if constexpr
-#else
-#define SIMDJSON_IF_CONSTEXPR if
-#endif
-#endif
-
   simdjson_inline void check_next_input(const simd8x64<uint8_t>& input) {
     if(simdjson_likely(is_ascii(input))) {
       this->error |= this->prev_incomplete;
@@ -41791,6 +41705,58 @@ class bit_indexer {
 
   simdjson_inline bit_indexer(uint32_t *index_buf) : tail(index_buf) {}
 
+#if SIMDJSON_PREFER_REVERSE_BITS
+  /**
+   * ARM lacks a fast trailing zero instruction, but it has a fast
+   * bit reversal instruction and a fast leading zero instruction.
+   * Thus it may be profitable to reverse the bits (once) and then
+   * to rely on a sequence of instructions that call the leading
+   * zero instruction.
+   *
+   * Performance notes:
+   * The chosen routine is not optimal in terms of data dependency
+   * since zero_leading_bit might require two instructions. However,
+   * it tends to minimize the total number of instructions which is
+   * beneficial.
+   */
+  simdjson_inline void write_index(uint32_t idx, uint64_t& rev_bits, int i) {
+    int lz = leading_zeroes(rev_bits);
+    this->tail[i] = static_cast<uint32_t>(idx) + lz;
+    rev_bits = zero_leading_bit(rev_bits, lz);
+  }
+#else
+  /**
+   * Under recent x64 systems, we often have both a fast trailing zero
+   * instruction and a fast 'clear-lower-bit' instruction so the following
+   * algorithm can be competitive.
+   */
+
+  simdjson_inline void write_index(uint32_t idx, uint64_t& bits, int i) {
+    this->tail[i] = idx + trailing_zeroes(bits);
+    bits = clear_lowest_bit(bits);
+  }
+#endif // SIMDJSON_PREFER_REVERSE_BITS
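For readers unfamiliar with the `SIMDJSON_PREFER_REVERSE_BITS` branch just above, here is a self-contained sketch of the idea. The helpers are assumptions written with GCC/Clang builtins and AArch64 inline assembly, not simdjson's wrappers: after one RBIT, the lowest set bit of `bits` becomes the highest set bit of `rev`, so its position is simply the leading-zero count.

```cpp
#include <cstdint>

// Bit reversal: a single RBIT instruction on AArch64, with a portable
// (slow) fallback so the sketch stays self-contained.
static inline uint64_t reverse_bits(uint64_t x) {
#if defined(__aarch64__)
  uint64_t r;
  __asm__("rbit %0, %1" : "=r"(r) : "r"(x));
  return r;
#else
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) { r = (r << 1) | ((x >> i) & 1); }
  return r;
#endif
}

static inline int leading_zeroes(uint64_t x) { return x ? __builtin_clzll(x) : 64; }

// Clear the most significant set bit given its leading-zero count.
static inline uint64_t zero_leading_bit(uint64_t rev, int lz) {
  return lz >= 64 ? rev : rev ^ (uint64_t(1) << (63 - lz));
}

static inline uint32_t *write_bit_positions_rev(uint32_t *out, uint32_t idx, uint64_t bits) {
  int cnt = __builtin_popcountll(bits);
  uint64_t rev = reverse_bits(bits);
  for (int i = 0; i < cnt; i++) {
    int lz = leading_zeroes(rev);  // position of the lowest set bit of 'bits'
    out[i] = idx + lz;
    rev = zero_leading_bit(rev, lz);
  }
  return out + cnt;
}
```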
+
+  template <int START, int N>
+  simdjson_inline int write_indexes(uint32_t idx, uint64_t& bits) {
+    write_index(idx, bits, START);
+    SIMDJSON_IF_CONSTEXPR (N > 1) {
+      write_indexes<(N-1>0?START+1:START), (N-1>=0?N-1:1)>(idx, bits);
+    }
+    return START+N;
+  }
+
+  template <int START, int END, int STEP>
+  simdjson_inline int write_indexes_stepped(uint32_t idx, uint64_t& bits, int cnt) {
+    write_indexes<START, STEP>(idx, bits);
+    SIMDJSON_IF_CONSTEXPR ((START+STEP) < END) {
+      if (simdjson_unlikely((START+STEP) < cnt)) {
+        write_indexes_stepped<(START+STEP)<END?(START+STEP):END, END, STEP>(idx, bits, cnt);
+      }
+    }
+    return ((END-START) % STEP) == 0 ? END : (END-START) - ((END-START) % STEP) + STEP;
+  }
+
   // flatten out values in 'bits' assuming that they are are to have values of idx
   // plus their position in the bitvector, and store these indexes at
   // base_ptr[base] incrementing base as we go
@@ -41808,91 +41774,29 @@ class bit_indexer {
   // it helps tremendously.
     if (bits == 0) return;
-#if SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * ARM lacks a fast trailing zero instruction, but it has a fast
-     * bit reversal instruction and a fast leading zero instruction.
-     * Thus it may be profitable to reverse the bits (once) and then
-     * to rely on a sequence of instructions that call the leading
-     * zero instruction.
-     *
-     * Performance notes:
-     * The chosen routine is not optimal in terms of data dependency
-     * since zero_leading_bit might require two instructions. However,
-     * it tends to minimize the total number of instructions which is
-     * beneficial.
-     */
-    uint64_t rev_bits = reverse_bits(bits);
     int cnt = static_cast<int>(count_ones(bits));
-    int i = 0;
-    // Do the first 8 all together
-    for (; i<8; i++) {
-      int lz = leading_zeroes(rev_bits);
-      this->tail[i] = static_cast<uint32_t>(idx) + lz;
-      rev_bits = zero_leading_bit(rev_bits, lz);
-    }
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      i = 8;
-      for (; i<16; i++) {
-        int lz = leading_zeroes(rev_bits);
-        this->tail[i] = static_cast<uint32_t>(idx) + lz;
-        rev_bits = zero_leading_bit(rev_bits, lz);
-      }
+#if SIMDJSON_PREFER_REVERSE_BITS
+    bits = reverse_bits(bits);
+#endif
+#ifdef SIMDJSON_STRUCTURAL_INDEXER_STEP
+    static constexpr const int STEP = SIMDJSON_STRUCTURAL_INDEXER_STEP;
+#else
+    static constexpr const int STEP = 4;
+#endif
+    static constexpr const int STEP_UNTIL = 24;
 
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        i = 16;
-        while (rev_bits != 0) {
-          int lz = leading_zeroes(rev_bits);
-          this->tail[i++] = static_cast<uint32_t>(idx) + lz;
-          rev_bits = zero_leading_bit(rev_bits, lz);
+    write_indexes_stepped<0, STEP_UNTIL, STEP>(idx, bits, cnt);
+    SIMDJSON_IF_CONSTEXPR (STEP_UNTIL < 64) {
+      if (simdjson_unlikely(STEP_UNTIL < cnt)) {
+        for (int i=STEP_UNTIL; i<cnt; i++) {
+          write_index(idx, bits, i);
+        }
+      }
+    }
-        }
-      }
-    }
-    this->tail += cnt;
-#else // SIMDJSON_PREFER_REVERSE_BITS
-    /**
-     * Under recent x64 systems, we often have both a fast trailing zero
-     * instruction and a fast 'clear-lower-bit' instruction so the following
-     * algorithm can be competitive.
-     */
-
-    int cnt = static_cast<int>(count_ones(bits));
-    // Do the first 8 all together
-    for (int i=0; i<8; i++) {
-      this->tail[i] = idx + trailing_zeroes(bits);
-      bits = clear_lowest_bit(bits);
-    }
-
-    // Do the next 8 all together (we hope in most cases it won't happen at all
-    // and the branch is easily predicted).
-    if (simdjson_unlikely(cnt > 8)) {
-      for (int i=8; i<16; i++) {
-        this->tail[i] = idx + trailing_zeroes(bits);
-        bits = clear_lowest_bit(bits);
-      }
-
-      // Most files don't have 16+ structurals per block, so we take several basically guaranteed
-      // branch mispredictions here. 16+ structurals per block means either punctuation ({} [] , :)
-      // or the start of a value ("abc" true 123) every four characters.
-      if (simdjson_unlikely(cnt > 16)) {
-        int i = 16;
-        do {
-          this->tail[i] = idx + trailing_zeroes(bits);
-          bits = clear_lowest_bit(bits);
-          i++;
-        } while (i < cnt);
-      }
-    }
+
     this->tail += cnt;
-#endif
   }
 
 #endif // SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
@@ -42741,15 +42645,17 @@ simdjson_warn_unused simdjson_inline error_code json_iterator::visit_root_primit
 }
 template<typename V>
 simdjson_warn_unused simdjson_inline error_code json_iterator::visit_primitive(V &visitor, const uint8_t *value) noexcept {
+  // Use the fact that most scalars are going to be either strings or numbers.
+  if(*value == '"') {
+    return visitor.visit_string(*this, value);
+  } else if (((*value - '0') < 10) || (*value == '-')) {
+    return visitor.visit_number(*this, value);
+  }
+  // true, false, null are uncommon.
   switch (*value) {
-    case '"': return visitor.visit_string(*this, value);
     case 't': return visitor.visit_true_atom(*this, value);
     case 'f': return visitor.visit_false_atom(*this, value);
     case 'n': return visitor.visit_null_atom(*this, value);
-    case '-':
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
-      return visitor.visit_number(*this, value);
    default:
      log_error("Non-value found when value was expected!");
      return TAPE_ERROR;
diff --git a/singleheader/simdjson.h b/singleheader/simdjson.h
index 18795965b2..7264bc8561 100644
--- a/singleheader/simdjson.h
+++ b/singleheader/simdjson.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2023-08-18 14:37:10 -0400. Do not edit! */
+/* auto-generated on . Do not edit!
*/ /* including simdjson.h: */ /* begin file simdjson.h */ #ifndef SIMDJSON_H @@ -79,6 +79,14 @@ #error simdjson requires a compiler compliant with the C++11 standard #endif +#ifndef SIMDJSON_IF_CONSTEXPR +#if SIMDJSON_CPLUSPLUS17 +#define SIMDJSON_IF_CONSTEXPR if constexpr +#else +#define SIMDJSON_IF_CONSTEXPR if +#endif +#endif + #endif // SIMDJSON_COMPILER_CHECK_H /* end file simdjson/compiler_check.h */ /* including simdjson/portability.h: #include "simdjson/portability.h" */ @@ -2314,7 +2322,7 @@ namespace std { #define SIMDJSON_SIMDJSON_VERSION_H /** The version of simdjson being used (major.minor.revision) */ -#define SIMDJSON_VERSION "3.2.3" +#define SIMDJSON_VERSION "3.3.0" namespace simdjson { enum { @@ -2325,11 +2333,11 @@ enum { /** * The minor version (major.MINOR.revision) of simdjson being used. */ - SIMDJSON_VERSION_MINOR = 2, + SIMDJSON_VERSION_MINOR = 3, /** * The revision (major.minor.REVISION) of simdjson being used. */ - SIMDJSON_VERSION_REVISION = 3 + SIMDJSON_VERSION_REVISION = 0 }; } // namespace simdjson @@ -11813,8 +11821,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. @@ -11829,12 +11836,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -13869,8 +13873,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -13885,12 +13888,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -16417,8 +16417,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. @@ -16433,12 +16432,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -18964,8 +18960,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -18980,12 +18975,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -21626,8 +21618,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. @@ -21642,12 +21633,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high; @@ -24611,8 +24599,7 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, // power_of_five_128[index]. Usually, that's good enough to approximate i * 5^q // to the desired approximation using one multiplication. Sometimes it does not suffice. // Then we store the next most significant 64 bits in power_of_five_128[index + 1], and - // then we get a better approximation to i * 5^q. In very rare cases, even that - // will not suffice, though it is seemingly very hard to find such a scenario. + // then we get a better approximation to i * 5^q. // // That's for when q>=0. The logic for q<0 is somewhat similar but it is somewhat // more complicated. 
@@ -24627,12 +24614,9 @@ simdjson_inline bool compute_float_64(int64_t power, uint64_t i, bool negative, simdjson::internal::value128 secondproduct = full_multiplication(i, simdjson::internal::power_of_five_128[index + 1]); firstproduct.low += secondproduct.high; if(secondproduct.high > firstproduct.low) { firstproduct.high++; } - // At this point, we might need to add at most one to firstproduct, but this - // can only change the value of firstproduct.high if firstproduct.low is maximal. - if(simdjson_unlikely(firstproduct.low == 0xFFFFFFFFFFFFFFFF)) { - // This is very unlikely, but if so, we need to do much more work! - return false; - } + // As it has been proven by Noble Mushtak and Daniel Lemire in "Fast Number Parsing Without + // Fallback" (https://arxiv.org/abs/2212.06644), at this point we are sure that the product + // is sufficiently accurate, and more computation is not needed. } uint64_t lower = firstproduct.low; uint64_t upper = firstproduct.high;