diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ab44366..f70840c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -69,6 +69,23 @@ jobs: command: fmt args: -- --check + examples: + name: Rust examples + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run datafusion example + uses: actions-rs/cargo@v1 + with: + command: run + args: --example datafusion --features=datafusion + + - name: Run declare example + uses: actions-rs/cargo@v1 + with: + command: run + args: --example declare --all-features + benchmark: name: Rust benchmark runs-on: self-hosted diff --git a/Cargo.toml b/Cargo.toml index 8eaf57a..c429ce7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,21 +49,23 @@ path = "benches/criterion/writes.rs" required-features = ["sled"] [dependencies] -arrow = "52" +arrow = "53" async-lock = "3" async-stream = "0.3" async-trait = { version = "0.1", optional = true } bytes = { version = "1.7", optional = true } crc32fast = "1" crossbeam-skiplist = "0.1" -datafusion = { version = "41", optional = true } +datafusion = { version = "42", optional = true } flume = { version = "0.11", features = ["async"] } +fusio = { git = "https://github.com/tonbo-io/fusio.git", package = "fusio", rev = "317b1b0621b297f52145b41b90506632f2dc7a1d", features = ["tokio", "dyn"] } +fusio-parquet = { git = "https://github.com/tonbo-io/fusio.git", package = "fusio-parquet", rev = "317b1b0621b297f52145b41b90506632f2dc7a1d" } futures-core = "0.3" futures-io = "0.3" futures-util = "0.3" lockable = "0.0.8" once_cell = "1" -parquet = { version = "52", features = ["async"] } +parquet = { version = "53", features = ["async"] } pin-project-lite = "0.2" regex = "1" thiserror = "1" @@ -74,6 +76,7 @@ tracing = "0.1" ulid = "1" # Only used for benchmarks +log = "0.4.22" redb = { version = "2", optional = true } rocksdb = { version = "0.22", optional = true } sled = { version = "0.34", optional = true } diff --git a/benches/common.rs b/benches/common.rs index b3f6d82..d7ed5d3 100644 --- a/benches/common.rs +++ b/benches/common.rs @@ -5,16 +5,19 @@ use std::{ fs::File, io::{BufRead, BufReader}, path::{Path, PathBuf}, + sync::Arc, }; use async_stream::stream; +use fusio::local::TokioFs; use futures_core::Stream; use futures_util::StreamExt; use parquet::data_type::AsBytes; use redb::TableDefinition; use rocksdb::{Direction, IteratorMode, TransactionDB}; use tonbo::{ - executor::tokio::TokioExecutor, stream, transaction::TransactionEntry, DbOption, Projection, + executor::tokio::TokioExecutor, fs::manager::StoreManager, stream, + transaction::TransactionEntry, DbOption, Projection, }; use tonbo_macros::Record; @@ -196,8 +199,14 @@ impl TonboBenchDataBase { } impl BenchDatabase for TonboBenchDataBase { - type W<'db> = TonboBenchWriteTransaction<'db> where Self: 'db; - type R<'db> = TonboBenchReadTransaction<'db> where Self: 'db; + type W<'db> + = TonboBenchWriteTransaction<'db> + where + Self: 'db; + type R<'db> + = TonboBenchReadTransaction<'db> + where + Self: 'db; fn db_type_name() -> &'static str { "tonbo" } @@ -216,19 +225,27 @@ impl BenchDatabase for TonboBenchDataBase { } async fn build(path: impl AsRef) -> Self { - let option = DbOption::from(path.as_ref()).disable_wal(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = + DbOption::from(fusio::path::Path::from_filesystem_path(path.as_ref()).unwrap()) + .disable_wal(); - let db = tonbo::DB::new(option, TokioExecutor::new()).await.unwrap(); + let db = tonbo::DB::new(option,
TokioExecutor::new(), manager) + .await + .unwrap(); TonboBenchDataBase::new(db) } } pub struct TonboBenchReadTransaction<'a> { - txn: tonbo::transaction::Transaction<'a, Customer, TokioExecutor>, + txn: tonbo::transaction::Transaction<'a, Customer>, } impl<'db> BenchReadTransaction for TonboBenchReadTransaction<'db> { - type T<'txn> = TonboBenchReader<'db, 'txn> where Self: 'txn; + type T<'txn> + = TonboBenchReader<'db, 'txn> + where + Self: 'txn; fn get_reader(&self) -> Self::T<'_> { TonboBenchReader { txn: &self.txn } @@ -236,7 +253,7 @@ impl<'db> BenchReadTransaction for TonboBenchReadTransaction<'db> { } pub struct TonboBenchReader<'db, 'txn> { - txn: &'txn tonbo::transaction::Transaction<'db, Customer, TokioExecutor>, + txn: &'txn tonbo::transaction::Transaction<'db, Customer>, } impl BenchReader for TonboBenchReader<'_, '_> { @@ -276,11 +293,14 @@ impl BenchReader for TonboBenchReader<'_, '_> { } pub struct TonboBenchWriteTransaction<'a> { - txn: tonbo::transaction::Transaction<'a, Customer, TokioExecutor>, + txn: tonbo::transaction::Transaction<'a, Customer>, } impl<'db> BenchWriteTransaction for TonboBenchWriteTransaction<'db> { - type W<'txn> = TonboBenchInserter<'db, 'txn> where Self: 'txn; + type W<'txn> + = TonboBenchInserter<'db, 'txn> + where + Self: 'txn; fn get_inserter(&mut self) -> Self::W<'_> { TonboBenchInserter { txn: &mut self.txn } @@ -293,7 +313,7 @@ impl<'db> BenchWriteTransaction for TonboBenchWriteTransaction<'db> { } pub struct TonboBenchInserter<'db, 'txn> { - txn: &'txn mut tonbo::transaction::Transaction<'db, Customer, TokioExecutor>, + txn: &'txn mut tonbo::transaction::Transaction<'db, Customer>, } impl BenchInserter for TonboBenchInserter<'_, '_> { @@ -320,8 +340,14 @@ impl RedbBenchDatabase { } impl BenchDatabase for RedbBenchDatabase { - type W<'db> = RedbBenchWriteTransaction where Self: 'db; - type R<'db> = RedbBenchReadTransaction where Self: 'db; + type W<'db> + = RedbBenchWriteTransaction + where + Self: 'db; + type R<'db> + = RedbBenchReadTransaction + where + Self: 'db; fn db_type_name() -> &'static str { "redb" @@ -351,7 +377,10 @@ pub struct RedbBenchReadTransaction { } impl BenchReadTransaction for RedbBenchReadTransaction { - type T<'txn> = RedbBenchReader where Self: 'txn; + type T<'txn> + = RedbBenchReader + where + Self: 'txn; fn get_reader(&self) -> Self::T<'_> { let table = self.txn.open_table(X).unwrap(); @@ -416,7 +445,10 @@ pub struct RedbBenchWriteTransaction { } impl BenchWriteTransaction for RedbBenchWriteTransaction { - type W<'txn> = RedbBenchInserter<'txn> where Self: 'txn; + type W<'txn> + = RedbBenchInserter<'txn> + where + Self: 'txn; fn get_inserter(&mut self) -> Self::W<'_> { let table = self.txn.open_table(X).unwrap(); @@ -464,11 +496,11 @@ impl SledBenchDatabase { impl BenchDatabase for SledBenchDatabase { type W<'db> - = SledBenchWriteTransaction<'db> + = SledBenchWriteTransaction<'db> where Self: 'db; type R<'db> - = SledBenchReadTransaction<'db> + = SledBenchReadTransaction<'db> where Self: 'db; @@ -500,7 +532,7 @@ pub struct SledBenchReadTransaction<'db> { impl BenchReadTransaction for SledBenchReadTransaction<'_> { type T<'txn> - = SledBenchReader<'txn> + = SledBenchReader<'txn> where Self: 'txn; @@ -568,7 +600,7 @@ pub struct SledBenchWriteTransaction<'a> { impl BenchWriteTransaction for SledBenchWriteTransaction<'_> { type W<'txn> - = SledBenchInserter<'txn> + = SledBenchInserter<'txn> where Self: 'txn; @@ -624,11 +656,11 @@ impl RocksdbBenchDatabase { impl BenchDatabase for RocksdbBenchDatabase { type W<'db> - = 
RocksdbBenchWriteTransaction<'db> + = RocksdbBenchWriteTransaction<'db> where Self: 'db; type R<'db> - = RocksdbBenchReadTransaction<'db> + = RocksdbBenchReadTransaction<'db> where Self: 'db; @@ -667,7 +699,7 @@ pub struct RocksdbBenchWriteTransaction<'a> { impl<'a> BenchWriteTransaction for RocksdbBenchWriteTransaction<'a> { type W<'txn> - = RocksdbBenchInserter<'txn> + = RocksdbBenchInserter<'txn> where Self: 'txn; @@ -706,7 +738,7 @@ pub struct RocksdbBenchReadTransaction<'db> { impl<'db> BenchReadTransaction for RocksdbBenchReadTransaction<'db> { type T<'txn> - = RocksdbBenchReader<'db, 'txn> + = RocksdbBenchReader<'db, 'txn> where Self: 'txn; diff --git a/benches/criterion/writes.rs b/benches/criterion/writes.rs index d30eb0f..8d3fcce 100644 --- a/benches/criterion/writes.rs +++ b/benches/criterion/writes.rs @@ -1,8 +1,9 @@ use std::{iter::repeat_with, sync::Arc}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use fusio::local::TokioFs; use mimalloc::MiMalloc; -use tonbo::{executor::tokio::TokioExecutor, DbOption, Record, DB}; +use tonbo::{executor::tokio::TokioExecutor, fs::manager::StoreManager, DbOption, Record, DB}; #[global_allocator] static GLOBAL: MiMalloc = MiMalloc; @@ -55,10 +56,14 @@ fn single_write(c: &mut Criterion) { let batches = [1, 16, 128]; let _ = std::fs::remove_dir_all("/tmp/tonbo"); + let _ = std::fs::create_dir_all("/tmp/tonbo"); + for batch in batches { - let option = DbOption::from("/tmp/tonbo").disable_wal(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = DbOption::from(fusio::path::Path::from_filesystem_path("/tmp/tonbo").unwrap()) + .disable_wal(); let db = runtime - .block_on(async { DB::new(option, TokioExecutor::default()).await }) + .block_on(async { DB::new(option, TokioExecutor::default(), manager).await }) .unwrap(); group.bench_with_input(BenchmarkId::new("Tonbo", batch), &batch, |b, batch| { @@ -67,9 +72,12 @@ fn single_write(c: &mut Criterion) { .iter(|| async { tonbo_write(&db, *batch).await }); }); let _ = std::fs::remove_dir_all("/tmp/tonbo"); + let _ = std::fs::create_dir_all("/tmp/tonbo"); } let _ = std::fs::remove_dir_all("/tmp/sled"); + let _ = std::fs::create_dir_all("/tmp/sled"); + for batch in batches { let sled = sled::open("/tmp/sled").unwrap(); group.bench_with_input(BenchmarkId::new("Sled", batch), &batch, |b, batch| { @@ -78,6 +86,7 @@ fn single_write(c: &mut Criterion) { .iter(|| async { sled_write(&sled, *batch).await }); }); let _ = std::fs::remove_dir_all("/tmp/sled"); + let _ = std::fs::create_dir_all("/tmp/sled"); } group.finish(); diff --git a/benches/read_bench.rs b/benches/read_bench.rs index 7eb69b7..c849fcb 100644 --- a/benches/read_bench.rs +++ b/benches/read_bench.rs @@ -2,15 +2,13 @@ mod common; use std::{ collections::Bound, - env::current_dir, path::{Path, PathBuf}, sync::Arc, time::{Duration, Instant}, }; use futures_util::{future::join_all, StreamExt}; -use tokio::io::AsyncWriteExt; -use tonbo::{executor::tokio::TokioExecutor, fs::FileProvider}; +use tokio::{fs, io::AsyncWriteExt}; use crate::common::{ read_tbl, BenchDatabase, BenchReadTransaction, BenchReader, RedbBenchDatabase, @@ -181,7 +179,7 @@ async fn main() { println!(); println!("{table}"); - let mut file = TokioExecutor::open("read_benchmark.md").await.unwrap(); + let mut file = fs::File::create("read_benchmark.md").await.unwrap(); file.write_all(b"Read: \n```shell\n").await.unwrap(); for line in table.lines() { file.write_all(line.as_bytes()).await.unwrap(); diff --git a/benches/write_bench.rs 
b/benches/write_bench.rs index cabecef..f2f4335 100644 --- a/benches/write_bench.rs +++ b/benches/write_bench.rs @@ -12,7 +12,6 @@ use common::*; use futures_util::future::join_all; use tempfile::TempDir; use tokio::io::AsyncWriteExt; -use tonbo::{executor::tokio::TokioExecutor, fs::FileProvider}; const WRITE_TIMES: usize = 500_000; const WRITE_BATCH_TIMES: usize = 5000; @@ -227,7 +226,7 @@ async fn main() { println!(); println!("{table}"); - let mut file = TokioExecutor::open("write_benchmark.md").await.unwrap(); + let mut file = tokio::fs::File::create("write_benchmark.md").await.unwrap(); file.write_all(b"Write: \n```shell\n").await.unwrap(); for line in table.lines() { file.write_all(line.as_bytes()).await.unwrap(); diff --git a/examples/datafusion.rs b/examples/datafusion.rs index 82a44f4..6f0a101 100644 --- a/examples/datafusion.rs +++ b/examples/datafusion.rs @@ -17,12 +17,20 @@ use datafusion::{ error::{DataFusionError, Result}, execution::{RecordBatchStream, SendableRecordBatchStream, TaskContext}, physical_expr::EquivalenceProperties, - physical_plan::{DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanProperties}, + physical_plan::{ + execute_stream, DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, PlanProperties, + }, prelude::*, + sql::parser::DFParser, }; +use fusio::{local::TokioFs, path::Path}; use futures_core::Stream; use futures_util::StreamExt; -use tonbo::{executor::tokio::TokioExecutor, inmem::immutable::ArrowArrays, record::Record, DB}; +use tokio::fs; +use tonbo::{ + executor::tokio::TokioExecutor, fs::manager::StoreManager, inmem::immutable::ArrowArrays, + record::Record, DbOption, DB, +}; use tonbo_macros::Record; #[derive(Record, Debug)] @@ -198,7 +206,13 @@ impl ExecutionPlan for MusicExec { #[tokio::main] async fn main() -> Result<()> { - let db = DB::new("./db_path/music".into(), TokioExecutor::default()) + // make sure the path exists + let _ = fs::create_dir_all("./db_path/music").await; + + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let options = DbOption::from(Path::from_filesystem_path("./db_path/music").unwrap()); + + let db = DB::new(options, TokioExecutor::default(), manager) .await .unwrap(); for (id, name, like) in [ @@ -214,9 +228,29 @@ async fn main() -> Result<()> { let provider = MusicProvider { db: Arc::new(db) }; ctx.register_table("music", Arc::new(provider))?; - let df = ctx.table("music").await?; - let df = df.select(vec![col("name")])?; - let batches = df.collect().await?; - pretty::print_batches(&batches).unwrap(); + { + let df = ctx.table("music").await?; + let df = df.select(vec![col("name")])?; + let batches = df.collect().await?; + pretty::print_batches(&batches).unwrap(); + } + + { + // support sql query for tonbo + let statements = DFParser::parse_sql("select * from music")?; + let plan = ctx + .state() + .statement_to_plan(statements.front().cloned().unwrap()) + .await?; + ctx.execute_logical_plan(plan).await?; + let df = ctx.table("music").await?; + let physical_plan = df.create_physical_plan().await?; + let mut stream = execute_stream(physical_plan, ctx.task_ctx())?; + while let Some(maybe_batch) = stream.next().await { + let batch = maybe_batch?; + pretty::print_batches(&[batch]).unwrap(); + } + } + Ok(()) } diff --git a/examples/declare.rs b/examples/declare.rs index b6ffe62..7906ee1 100644 --- a/examples/declare.rs +++ b/examples/declare.rs @@ -1,8 +1,12 @@ -use std::ops::Bound; +use std::{ops::Bound, sync::Arc}; use bytes::Bytes; +use fusio::{local::TokioFs, path::Path}; use 
futures_util::stream::StreamExt; -use tonbo::{executor::tokio::TokioExecutor, Projection, Record, DB}; +use tokio::fs; +use tonbo::{ + executor::tokio::TokioExecutor, fs::manager::StoreManager, DbOption, Projection, Record, DB, +}; /// Use macro to define schema of column family just like ORM /// It provides type-safe read & write API @@ -17,8 +21,13 @@ pub struct User { #[tokio::main] async fn main() { + // make sure the path exists + let _ = fs::create_dir_all("./db_path/users").await; + + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let options = DbOption::from(Path::from_filesystem_path("./db_path/users").unwrap()); // pluggable async runtime and I/O - let db = DB::new("./db_path/users".into(), TokioExecutor::default()) + let db = DB::new(options, TokioExecutor::default(), manager) .await .unwrap(); diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index 888ab25..db1c39a 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -1,6 +1,8 @@ use std::{cmp, collections::Bound, mem, pin::Pin, sync::Arc}; use async_lock::{RwLock, RwLockUpgradableReadGuard}; +use fusio::DynFs; +use fusio_parquet::writer::AsyncWriter; use futures_util::StreamExt; use parquet::arrow::{AsyncArrowWriter, ProjectionMask}; use thiserror::Error; @@ -8,13 +10,13 @@ use tokio::sync::oneshot; use ulid::Ulid; use crate::{ - fs::{FileId, FileProvider}, + fs::{default_open_options, manager::StoreManager, FileId}, inmem::{ immutable::{ArrowArrays, Builder, Immutable}, mutable::Mutable, }, ondisk::sstable::SsTable, - record::{KeyRef, Record}, + record::{KeyRef, Record, RecordInstance}, scope::Scope, stream::{level::LevelStream, merge::MergeStream, ScanStream}, transaction::CommitError, @@ -30,30 +32,31 @@ pub enum CompactTask { Flush(Option>), } -pub(crate) struct Compactor +pub(crate) struct Compactor where R: Record, - FP: FileProvider, { pub(crate) option: Arc>, - pub(crate) schema: Arc>>, - pub(crate) version_set: VersionSet, + pub(crate) schema: Arc>>, + pub(crate) version_set: VersionSet, + pub(crate) manager: Arc, } -impl Compactor +impl Compactor where R: Record, - FP: FileProvider, { pub(crate) fn new( - schema: Arc>>, + schema: Arc>>, option: Arc>, - version_set: VersionSet, + version_set: VersionSet, + manager: Arc, ) -> Self { - Compactor:: { + Compactor:: { option, schema, version_set, + manager, } } @@ -75,9 +78,9 @@ where let trigger_clone = guard.trigger.clone(); let mutable = mem::replace( &mut guard.mutable, - Mutable::new(&self.option, trigger_clone).await?, + Mutable::new(&self.option, trigger_clone, self.manager.base_fs()).await?, ); - let (file_id, immutable) = mutable.into_immutable().await?; + let (file_id, immutable) = mutable.into_immutable(&guard.record_instance).await?; guard.immutables.push((file_id, immutable)); if guard.immutables.len() > self.option.immutable_chunk_max_num { @@ -88,8 +91,14 @@ where let chunk_num = self.option.immutable_chunk_num; let excess = &guard.immutables[0..chunk_num]; - if let Some(scope) = - Self::minor_compaction(&self.option, recover_wal_ids, excess).await? + if let Some(scope) = Self::minor_compaction( + &self.option, + recover_wal_ids, + excess, + &guard.record_instance, + &self.manager, + ) + .await? 
{ let version_ref = self.version_set.current().await; let mut version_edits = vec![]; @@ -103,6 +112,8 @@ where &scope.max, &mut version_edits, &mut delete_gens, + &guard.record_instance, + &self.manager, ) .await?; } @@ -129,8 +140,13 @@ where option: &DbOption, recover_wal_ids: Option>, batches: &[(Option, Immutable)], + instance: &RecordInstance, + manager: &StoreManager, ) -> Result>, CompactionError> { if !batches.is_empty() { + let level_0_path = option.level_fs_path(0).unwrap_or(&option.base_path); + let level_0_fs = manager.get_fs(level_0_path); + let mut min = None; let mut max = None; @@ -138,8 +154,12 @@ where let mut wal_ids = Vec::with_capacity(batches.len()); let mut writer = AsyncArrowWriter::try_new( - FP::open(option.table_path(&gen)).await?, - R::arrow_schema().clone(), + AsyncWriter::new( + level_0_fs + .open_options(&option.table_path(&gen), default_open_options()) + .await?, + ), + instance.arrow_schema::().clone(), Some(option.write_parquet_properties.clone()), )?; @@ -171,13 +191,16 @@ where Ok(None) } + #[allow(clippy::too_many_arguments)] pub(crate) async fn major_compaction( - version: &Version, + version: &Version, option: &DbOption, mut min: &R::Key, mut max: &R::Key, version_edits: &mut Vec>, - delete_gens: &mut Vec, + delete_gens: &mut Vec<(FileId, usize)>, + instance: &RecordInstance, + manager: &StoreManager, ) -> Result<(), CompactionError> { let mut level = 0; @@ -189,14 +212,19 @@ where let (meet_scopes_ll, start_ll, end_ll) = Self::next_level_scopes(version, &mut min, &mut max, level, &meet_scopes_l)?; + let level_path = option.level_fs_path(level).unwrap_or(&option.base_path); + let level_fs = manager.get_fs(level_path); let mut streams = Vec::with_capacity(meet_scopes_l.len() + meet_scopes_ll.len()); // This Level if level == 0 { for scope in meet_scopes_l.iter() { - let file = FP::open(option.table_path(&scope.gen)).await?; + let file = level_fs + .open_options(&option.table_path(&scope.gen), default_open_options()) + .await?; streams.push(ScanStream::SsTable { inner: SsTable::open(file) + .await? 
.scan( (Bound::Unbounded, Bound::Unbounded), u32::MAX.into(), @@ -217,6 +245,7 @@ where u32::MAX.into(), None, ProjectionMask::all(), + level_fs.clone(), ) .ok_or(CompactionError::EmptyLevel)?; @@ -236,6 +265,7 @@ where u32::MAX.into(), None, ProjectionMask::all(), + level_fs.clone(), ) .ok_or(CompactionError::EmptyLevel)?; @@ -243,21 +273,21 @@ where inner: level_scan_ll, }); } - Self::build_tables(option, version_edits, level, streams).await?; + Self::build_tables(option, version_edits, level, streams, instance, level_fs).await?; for scope in meet_scopes_l { version_edits.push(VersionEdit::Remove { level: level as u8, gen: scope.gen, }); - delete_gens.push(scope.gen); + delete_gens.push((scope.gen, level)); } for scope in meet_scopes_ll { version_edits.push(VersionEdit::Remove { level: (level + 1) as u8, gen: scope.gen, }); - delete_gens.push(scope.gen); + delete_gens.push((scope.gen, level)); } level += 1; } @@ -266,7 +296,7 @@ where } fn next_level_scopes<'a>( - version: &'a Version, + version: &'a Version, min: &mut &'a ::Key, max: &mut &'a ::Key, level: usize, @@ -289,8 +319,8 @@ where .max() .ok_or(CompactionError::EmptyLevel)?; - start_ll = Version::::scope_search(min, &version.level_slice[level + 1]); - end_ll = Version::::scope_search(max, &version.level_slice[level + 1]); + start_ll = Version::::scope_search(min, &version.level_slice[level + 1]); + end_ll = Version::::scope_search(max, &version.level_slice[level + 1]); let next_level_len = version.level_slice[level + 1].len(); for scope in version.level_slice[level + 1] @@ -306,13 +336,13 @@ where } fn this_level_scopes<'a>( - version: &'a Version, + version: &'a Version, min: &::Key, max: &::Key, level: usize, ) -> (Vec<&'a Scope<::Key>>, usize, usize) { let mut meet_scopes_l = Vec::new(); - let mut start_l = Version::::scope_search(min, &version.level_slice[level]); + let mut start_l = Version::::scope_search(min, &version.level_slice[level]); let mut end_l = start_l; let option = version.option(); @@ -347,15 +377,14 @@ where option: &DbOption, version_edits: &mut Vec::Key>>, level: usize, - streams: Vec>, - ) -> Result<(), CompactionError> - where - FP: 'scan, - { - let mut stream = MergeStream::::from_vec(streams, u32::MAX.into()).await?; + streams: Vec>, + instance: &RecordInstance, + fs: &Arc, + ) -> Result<(), CompactionError> { + let mut stream = MergeStream::::from_vec(streams, u32::MAX.into()).await?; // Kould: is the capacity parameter necessary? 
- let mut builder = R::Columns::builder(8192); + let mut builder = R::Columns::builder(&instance.arrow_schema::(), 8192); let mut min = None; let mut max = None; @@ -377,6 +406,8 @@ where &mut builder, &mut min, &mut max, + instance, + fs, ) .await?; } @@ -389,6 +420,8 @@ where &mut builder, &mut min, &mut max, + instance, + fs, ) .await?; } @@ -403,6 +436,7 @@ where Ok((lower, upper)) } + #[allow(clippy::too_many_arguments)] async fn build_table( option: &DbOption, version_edits: &mut Vec>, @@ -410,6 +444,8 @@ where builder: &mut ::Builder, min: &mut Option, max: &mut Option, + instance: &RecordInstance, + fs: &Arc, ) -> Result<(), CompactionError> { debug_assert!(min.is_some()); debug_assert!(max.is_some()); @@ -417,8 +453,11 @@ where let gen = Ulid::new(); let columns = builder.finish(None); let mut writer = AsyncArrowWriter::try_new( - FP::open(option.table_path(&gen)).await?, - R::arrow_schema().clone(), + AsyncWriter::new( + fs.open_options(&option.table_path(&gen), default_open_options()) + .await?, + ), + instance.arrow_schema::().clone(), Some(option.write_parquet_properties.clone()), )?; writer.write(columns.as_record_batch()).await?; @@ -445,6 +484,8 @@ where Io(#[from] std::io::Error), #[error("compaction parquet error: {0}")] Parquet(#[from] parquet::errors::ParquetError), + #[error("compaction fusio error: {0}")] + Fusio(#[from] fusio::Error), #[error("compaction version error: {0}")] Version(#[from] VersionError), #[error("database error: {0}")] @@ -458,15 +499,17 @@ pub(crate) mod tests { use std::sync::{atomic::AtomicU32, Arc}; use flume::bounded; - use parquet::{arrow::AsyncArrowWriter, errors::ParquetError}; + use fusio::{local::TokioFs, path::Path, DynFs}; + use fusio_parquet::writer::AsyncWriter; + use parquet::arrow::AsyncArrowWriter; use tempfile::TempDir; use crate::{ compaction::Compactor, - executor::{tokio::TokioExecutor, Executor}, - fs::{FileId, FileProvider}, + executor::tokio::TokioExecutor, + fs::{default_open_options, manager::StoreManager, FileId}, inmem::{immutable::Immutable, mutable::Mutable}, - record::Record, + record::{Column, ColumnDesc, Datatype, DynRecord, Record, RecordInstance}, scope::Scope, tests::Test, timestamp::Timestamp, @@ -476,38 +519,41 @@ pub(crate) mod tests { DbError, DbOption, DB, }; - async fn build_immutable( + async fn build_immutable( option: &DbOption, records: Vec<(LogType, R, Timestamp)>, + instance: &RecordInstance, + fs: &Arc, ) -> Result, DbError> where R: Record + Send, - FP: FileProvider, { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable: Mutable = Mutable::new(option, trigger).await?; + let mutable: Mutable = Mutable::new(option, trigger, fs).await?; for (log_ty, record, ts) in records { let _ = mutable.insert(log_ty, record, ts).await?; } - Ok(Immutable::from(mutable.data)) + Ok(Immutable::from((mutable.data, instance))) } - pub(crate) async fn build_parquet_table( + pub(crate) async fn build_parquet_table( option: &DbOption, gen: FileId, records: Vec<(LogType, R, Timestamp)>, + instance: &RecordInstance, + fs: &Arc, ) -> Result<(), DbError> where R: Record + Send, - FP: Executor, { - let immutable = build_immutable::(option, records).await?; + let immutable = build_immutable::(option, records, instance, fs).await?; let mut writer = AsyncArrowWriter::try_new( - FP::open(option.table_path(&gen)) - .await - .map_err(ParquetError::from)?, + AsyncWriter::new( + fs.open_options(&option.table_path(&gen), default_open_options()) + .await?, + ), R::arrow_schema().clone(), None, )?; @@ -520,12 
+566,15 @@ pub(crate) mod tests { #[tokio::test] async fn minor_compaction() { let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(&option.wal_dir_path()) + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + + manager + .create_dir_all(&option.wal_dir_path()) .await .unwrap(); - let batch_1 = build_immutable::( + let batch_1 = build_immutable::( &option, vec![ ( @@ -556,11 +605,13 @@ pub(crate) mod tests { 0.into(), ), ], + &RecordInstance::Normal, + manager.base_fs(), ) .await .unwrap(); - let batch_2 = build_immutable::( + let batch_2 = build_immutable::( &option, vec![ ( @@ -591,17 +642,21 @@ pub(crate) mod tests { 0.into(), ), ], + &RecordInstance::Normal, + manager.base_fs(), ) .await .unwrap(); - let scope = Compactor::::minor_compaction( - &DbOption::from(temp_dir.path()), + let scope = Compactor::::minor_compaction( + &DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()), None, &vec![ (Some(FileId::new()), batch_1), (Some(FileId::new()), batch_2), ], + &RecordInstance::Normal, + &manager, ) .await .unwrap() @@ -610,31 +665,113 @@ pub(crate) mod tests { assert_eq!(scope.max, 6.to_string()); } + #[tokio::test] + async fn dyn_minor_compaction() { + let temp_dir = tempfile::tempdir().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = DbOption::with_path( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + "id".to_string(), + 0, + ); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); + + let empty_record = DynRecord::empty_record( + vec![ColumnDesc::new("id".to_owned(), Datatype::Int32, false)], + 0, + ); + let instance = RecordInstance::Runtime(empty_record); + + let mut batch1_data = vec![]; + let mut batch2_data = vec![]; + for i in 0..40 { + let col = Column::new(Datatype::Int32, "id".to_owned(), Arc::new(i), false); + if i % 4 == 0 { + continue; + } + if i < 35 && (i % 2 == 0 || i % 5 == 0) { + batch1_data.push((LogType::Full, DynRecord::new(vec![col], 0), 0.into())); + } else if i >= 7 { + batch2_data.push((LogType::Full, DynRecord::new(vec![col], 0), 0.into())); + } + } + + // data range: [2, 34] + let batch_1 = + build_immutable::(&option, batch1_data, &instance, manager.base_fs()) + .await + .unwrap(); + + // data range: [7, 39] + let batch_2 = + build_immutable::(&option, batch2_data, &instance, manager.base_fs()) + .await + .unwrap(); + + let scope = Compactor::::minor_compaction( + &option, + None, + &vec![ + (Some(FileId::new()), batch_1), + (Some(FileId::new()), batch_2), + ], + &instance, + &manager, + ) + .await + .unwrap() + .unwrap(); + assert_eq!( + scope.min, + Column::new(Datatype::Int32, "id".to_owned(), Arc::new(2), false) + ); + assert_eq!( + scope.max, + Column::new(Datatype::Int32, "id".to_owned(), Arc::new(39), false) + ); + } + #[tokio::test] async fn major_compaction() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); - let mut option = DbOption::from(temp_dir.path()); + let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); option.major_threshold_with_sst_size = 2; let option = Arc::new(option); + manager + .create_dir_all(&option.version_log_dir_path()) + .await + .unwrap(); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); + let ((table_gen_1, table_gen_2, table_gen_3, table_gen_4, _), version) = - 
build_version(&option).await; + build_version(&option, &manager).await; let min = 2.to_string(); let max = 5.to_string(); let mut version_edits = Vec::new(); - Compactor::::major_compaction( + Compactor::::major_compaction( &version, &option, &min, &max, &mut version_edits, &mut vec![], + &RecordInstance::Normal, + &manager, ) .await .unwrap(); + if let VersionEdit::Add { level, scope } = &version_edits[0] { assert_eq!(*level, 1); assert_eq!(scope.min, 1.to_string()); @@ -665,18 +802,21 @@ pub(crate) mod tests { pub(crate) async fn build_version( option: &Arc>, - ) -> ( - (FileId, FileId, FileId, FileId, FileId), - Version, - ) { - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + manager: &StoreManager, + ) -> ((FileId, FileId, FileId, FileId, FileId), Version) { + let level_0_fs = option + .level_fs_path(0) + .map(|path| manager.get_fs(path)) + .unwrap_or(manager.base_fs()); + let level_1_fs = option + .level_fs_path(1) + .map(|path| manager.get_fs(path)) + .unwrap_or(manager.base_fs()); // level 0 let table_gen_1 = FileId::new(); let table_gen_2 = FileId::new(); - build_parquet_table::( + build_parquet_table::( option, table_gen_1, vec![ @@ -708,10 +848,12 @@ pub(crate) mod tests { 0.into(), ), ], + &RecordInstance::Normal, + level_0_fs, ) .await .unwrap(); - build_parquet_table::( + build_parquet_table::( option, table_gen_2, vec![ @@ -743,6 +885,8 @@ pub(crate) mod tests { 0.into(), ), ], + &RecordInstance::Normal, + level_0_fs, ) .await .unwrap(); @@ -751,7 +895,7 @@ pub(crate) mod tests { let table_gen_3 = FileId::new(); let table_gen_4 = FileId::new(); let table_gen_5 = FileId::new(); - build_parquet_table::( + build_parquet_table::( option, table_gen_3, vec![ @@ -783,10 +927,12 @@ pub(crate) mod tests { 0.into(), ), ], + &RecordInstance::Normal, + level_1_fs, ) .await .unwrap(); - build_parquet_table::( + build_parquet_table::( option, table_gen_4, vec![ @@ -818,10 +964,12 @@ pub(crate) mod tests { 0.into(), ), ], + &RecordInstance::Normal, + level_1_fs, ) .await .unwrap(); - build_parquet_table::( + build_parquet_table::( option, table_gen_5, vec![ @@ -853,16 +1001,15 @@ pub(crate) mod tests { 0.into(), ), ], + &RecordInstance::Normal, + level_1_fs, ) .await .unwrap(); let (sender, _) = bounded(1); - let mut version = Version::::new( - option.clone(), - sender, - Arc::new(AtomicU32::default()), - ); + let mut version = + Version::::new(option.clone(), sender, Arc::new(AtomicU32::default())); version.level_slice[0].push(Scope { min: 1.to_string(), max: 3.to_string(), @@ -909,13 +1056,29 @@ pub(crate) mod tests { #[tokio::test] pub(crate) async fn major_panic() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); - let mut option = DbOption::from(temp_dir.path()); + let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); option.major_threshold_with_sst_size = 1; option.level_sst_magnification = 1; - TokioExecutor::create_dir_all(&option.wal_dir_path()) + + manager + .create_dir_all(&option.version_log_dir_path()) .await .unwrap(); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); + + let level_0_fs = option + .level_fs_path(0) + .map(|path| manager.get_fs(path)) + .unwrap_or(manager.base_fs()); + let level_1_fs = option + .level_fs_path(1) + .map(|path| manager.get_fs(path)) + .unwrap_or(manager.base_fs()); let table_gen0 = FileId::new(); let table_gen1 = FileId::new(); @@ -937,20 +1100,29 @@ pub(crate) mod tests { records1.push(record); } } - 
build_parquet_table::(&option, table_gen0, records0) - .await - .unwrap(); - build_parquet_table::(&option, table_gen1, records1) - .await - .unwrap(); + build_parquet_table::( + &option, + table_gen0, + records0, + &RecordInstance::Normal, + level_0_fs, + ) + .await + .unwrap(); + build_parquet_table::( + &option, + table_gen1, + records1, + &RecordInstance::Normal, + level_1_fs, + ) + .await + .unwrap(); let option = Arc::new(option); let (sender, _) = bounded(1); - let mut version = Version::::new( - option.clone(), - sender, - Arc::new(AtomicU32::default()), - ); + let mut version = + Version::::new(option.clone(), sender, Arc::new(AtomicU32::default())); version.level_slice[0].push(Scope { min: 0.to_string(), max: 4.to_string(), @@ -968,13 +1140,15 @@ pub(crate) mod tests { let min = 6.to_string(); let max = 9.to_string(); - Compactor::::major_compaction( + Compactor::::major_compaction( &version, &option, &min, &max, &mut version_edits, &mut vec![], + &RecordInstance::Normal, + &manager, ) .await .unwrap(); @@ -984,8 +1158,9 @@ pub(crate) mod tests { #[tokio::test] async fn test_flush_major_level_sort() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); - let mut option = DbOption::from(temp_dir.path()); + let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 0; option.major_threshold_with_sst_size = 2; @@ -995,7 +1170,9 @@ pub(crate) mod tests { option.major_default_oldest_table_num = 1; option.trigger_type = TriggerType::Length(5); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), manager) + .await + .unwrap(); for i in 5..9 { let item = Test { diff --git a/src/executor.rs b/src/executor.rs index 1ee88db..79cc6da 100644 --- a/src/executor.rs +++ b/src/executor.rs @@ -1,8 +1,6 @@ use std::future::Future; -use crate::fs::FileProvider; - -pub trait Executor: FileProvider { +pub trait Executor { fn spawn(&self, future: F) where F: Future + Send + 'static; diff --git a/src/fs/manager.rs b/src/fs/manager.rs new file mode 100644 index 0000000..0418880 --- /dev/null +++ b/src/fs/manager.rs @@ -0,0 +1,44 @@ +use std::{collections::HashMap, sync::Arc}; + +use fusio::{dynamic::DynFs, path::Path, Error}; + +pub struct StoreManager { + base_fs: Arc, + fs_map: HashMap>>, +} + +impl StoreManager { + pub fn new(base_fs: Arc, levels_fs: Vec<(Path, Option>)>) -> Self { + let mut fs_map = HashMap::with_capacity(levels_fs.len()); + + for (path, fs) in levels_fs { + fs_map.entry(path).or_insert(fs); + } + + StoreManager { base_fs, fs_map } + } + + pub async fn create_dir_all(&self, path: &Path) -> Result<(), Error> { + self.base_fs.create_dir_all(path).await?; + for (_, fs) in self.fs_map.iter() { + if let Some(fs) = fs { + fs.create_dir_all(path).await?; + } + } + + Ok(()) + } + + pub fn base_fs(&self) -> &Arc { + &self.base_fs + } + + pub fn get_fs(&self, path: &Path) -> &Arc { + self.fs_map + .get(path) + .and_then(Option::as_ref) + .unwrap_or(&self.base_fs) + } +} + +// TODO: TestCases diff --git a/src/fs/mod.rs b/src/fs/mod.rs index e3f0e90..8bd0f16 100644 --- a/src/fs/mod.rs +++ b/src/fs/mod.rs @@ -1,16 +1,12 @@ -#[cfg(any(test, feature = "tokio"))] -pub mod tokio_fs; +pub mod manager; use std::{ fmt::{Display, Formatter}, - future::Future, - io, - path::Path, + str::FromStr, }; -use futures_core::Stream; -use tokio::io::{AsyncRead, AsyncSeek, AsyncWrite}; -use ulid::Ulid; +use 
fusio::{fs::OpenOptions, path::Path}; +use ulid::{DecodeError, Ulid}; pub(crate) type FileId = Ulid; @@ -20,26 +16,6 @@ pub enum FileType { Log, } -pub trait AsyncFile: AsyncRead + AsyncWrite + AsyncSeek + Send + Sync + Unpin + 'static {} - -impl AsyncFile for T where T: AsyncRead + AsyncWrite + AsyncSeek + Send + Sync + Unpin + 'static {} - -pub trait FileProvider { - type File: AsyncFile; - - fn create_dir_all(path: impl AsRef) -> impl Future>; - - fn open(path: impl AsRef + Send) -> impl Future> + Send; - - fn remove(path: impl AsRef + Send) -> impl Future> + Send; - - fn list( - dir_path: impl AsRef + Send, - file_type: FileType, - is_reverse: bool, - ) -> io::Result>>; -} - impl Display for FileType { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { match self { @@ -49,3 +25,18 @@ impl Display for FileType { } } } + +pub(crate) fn default_open_options() -> OpenOptions { + OpenOptions::default().create().append().read() +} + +pub(crate) fn parse_file_id(path: &Path, suffix: FileType) -> Result, DecodeError> { + path.filename() + .map(|file_name| { + let file_id = file_name + .strip_suffix(&format!(".{}", suffix)) + .unwrap_or(file_name); + FileId::from_str(file_id) + }) + .transpose() +} diff --git a/src/fs/tokio_fs.rs b/src/fs/tokio_fs.rs deleted file mode 100644 index 1774e69..0000000 --- a/src/fs/tokio_fs.rs +++ /dev/null @@ -1,79 +0,0 @@ -use std::{fs, fs::DirEntry, io, path::Path}; - -use async_stream::stream; -use futures_core::Stream; -use regex::Regex; -use tokio::fs::{create_dir_all, remove_file, File, OpenOptions}; - -use super::{FileId, FileProvider, FileType}; -use crate::executor::tokio::TokioExecutor; - -impl FileProvider for TokioExecutor { - type File = File; - - async fn create_dir_all(path: impl AsRef) -> io::Result<()> { - create_dir_all(path).await - } - - async fn open(path: impl AsRef + Send) -> io::Result { - OpenOptions::new() - .truncate(false) - .create(true) - .write(true) - .read(true) - .open(path) - .await - } - - async fn remove(path: impl AsRef + Send) -> io::Result<()> { - remove_file(path).await - } - - fn list( - dir_path: impl AsRef + Send, - file_type: FileType, - is_reverse: bool, - ) -> io::Result>> { - let dir_path = dir_path.as_ref().to_path_buf(); - let mut entries: Vec = - fs::read_dir(&dir_path)?.collect::, io::Error>>()?; - entries.sort_by_key(|entry| entry.file_name()); - - if is_reverse { - entries.reverse(); - } - Ok(stream! 
{ - for entry in entries { - let path = entry.path(); - if path.is_file() { - if let Some(filename) = path.file_name().and_then(|s| s.to_str()) { - if Regex::new(format!("^[0123456789ABCDEFGHJKMNPQRSTVWXYZ]{{26}}.{}$", file_type).as_str()).unwrap().is_match(filename) { - // SAFETY: Checked on WAL_REGEX - let file_id = FileId::from_string(filename - .split('.') - .next() - .unwrap()).unwrap(); - yield Ok((Self::open(dir_path.join(filename)).await?, file_id)) - } - } - } - } - }) - } -} - -#[cfg(test)] -impl TokioExecutor { - pub(crate) async fn file_exist(path: impl AsRef + Send) -> io::Result { - match tokio::fs::metadata(path).await { - Ok(_) => Ok(true), - Err(err) => { - if err.kind() == io::ErrorKind::NotFound { - Ok(false) - } else { - Err(err) - } - } - } - } -} diff --git a/src/inmem/immutable.rs b/src/inmem/immutable.rs index 8f29c4b..aad9f7e 100644 --- a/src/inmem/immutable.rs +++ b/src/inmem/immutable.rs @@ -2,24 +2,25 @@ use std::{ collections::{btree_map::Range, BTreeMap}, mem::transmute, ops::Bound, + sync::Arc, }; -use arrow::array::RecordBatch; +use arrow::{array::RecordBatch, datatypes::Schema}; use crossbeam_skiplist::SkipMap; use parquet::arrow::ProjectionMask; use crate::{ - record::{internal::InternalRecordRef, Key, Record, RecordRef}, + record::{internal::InternalRecordRef, Key, Record, RecordInstance, RecordRef}, stream::record_batch::RecordBatchEntry, timestamp::{Timestamp, Timestamped, TimestampedRef, EPOCH}, }; -pub trait ArrowArrays: Sized { +pub trait ArrowArrays: Sized + Sync { type Record: Record; type Builder: Builder; - fn builder(capacity: usize) -> Self::Builder; + fn builder(schema: &Arc, capacity: usize) -> Self::Builder; fn get( &self, @@ -53,14 +54,23 @@ where index: BTreeMap::Key>, u32>, } -impl From::Key>, Option>> for Immutable +impl + From<( + SkipMap::Key>, Option>, + &RecordInstance, + )> for Immutable where A: ArrowArrays, A::Record: Send, { - fn from(mutable: SkipMap::Key>, Option>) -> Self { + fn from( + (mutable, instance): ( + SkipMap::Key>, Option>, + &RecordInstance, + ), + ) -> Self { let mut index = BTreeMap::new(); - let mut builder = A::builder(mutable.len()); + let mut builder = A::builder(&instance.arrow_schema::(), mutable.len()); for (offset, (key, value)) in mutable.into_iter().enumerate() { builder.push( @@ -182,10 +192,12 @@ where fn next(&mut self) -> Option { self.range.next().map(|(_, &offset)| { + let schema = self.record_batch.schema(); let record_ref = R::Ref::from_record_batch( self.record_batch, offset as usize, &self.projection_mask, + &schema, ); // TODO: remove cloning record batch RecordBatchEntry::new(self.record_batch.clone(), { @@ -209,7 +221,7 @@ pub(crate) mod tests { Array, BooleanArray, BooleanBufferBuilder, BooleanBuilder, PrimitiveBuilder, RecordBatch, StringArray, StringBuilder, UInt32Array, UInt32Builder, }, - datatypes::{ArrowPrimitiveType, UInt32Type}, + datatypes::{ArrowPrimitiveType, Schema, UInt32Type}, }; use parquet::arrow::ProjectionMask; @@ -236,7 +248,7 @@ pub(crate) mod tests { type Builder = TestBuilder; - fn builder(capacity: usize) -> Self::Builder { + fn builder(_schema: &Arc, capacity: usize) -> Self::Builder { TestBuilder { vstring: StringBuilder::with_capacity(capacity, 0), vu32: PrimitiveBuilder::::with_capacity(capacity), diff --git a/src/inmem/mutable.rs b/src/inmem/mutable.rs index fa81cee..69b44b3 100644 --- a/src/inmem/mutable.rs +++ b/src/inmem/mutable.rs @@ -5,13 +5,13 @@ use crossbeam_skiplist::{ map::{Entry, Range}, SkipMap, }; -use futures_util::io; +use 
fusio::{dynamic::DynFile, DynFs}; use ulid::Ulid; use crate::{ - fs::{FileId, FileProvider}, + fs::{default_open_options, FileId}, inmem::immutable::Immutable, - record::{Key, KeyRef, Record}, + record::{Key, KeyRef, Record, RecordInstance}, timestamp::{ timestamped::{Timestamped, TimestampedRef}, Timestamp, EPOCH, @@ -32,30 +32,30 @@ pub(crate) type MutableScan<'scan, R> = Range< Option, >; -#[derive(Debug)] -pub struct Mutable +pub struct Mutable where R: Record, - FP: FileProvider, { pub(crate) data: SkipMap, Option>, - wal: Option>>, + wal: Option, R>>>, pub(crate) trigger: Arc + Send + Sync>>, } -impl Mutable +impl Mutable where - FP: FileProvider, R: Record, { pub async fn new( option: &DbOption, trigger: Arc + Send + Sync>>, - ) -> io::Result { + fs: &Arc, + ) -> Result { let mut wal = None; if option.use_wal { let file_id = Ulid::new(); - let file = FP::open(option.wal_path(&file_id)).await?; + let file = fs + .open_options(&option.wal_path(&file_id), default_open_options()) + .await?; wal = Some(Mutex::new(WalFile::new(file, file_id))); }; @@ -68,10 +68,9 @@ where } } -impl Mutable +impl Mutable where R: Record + Send, - FP: FileProvider, { pub(crate) async fn insert( &self, @@ -168,7 +167,8 @@ where pub(crate) async fn into_immutable( self, - ) -> io::Result<(Option, Immutable)> { + instance: &RecordInstance, + ) -> Result<(Option, Immutable), fusio::Error> { let mut file_id = None; if let Some(wal) = self.wal { @@ -177,14 +177,13 @@ where file_id = Some(wal_guard.file_id()); } - Ok((file_id, Immutable::from(self.data))) + Ok((file_id, Immutable::from((self.data, instance)))) } } -impl Mutable +impl Mutable where R: Record, - FP: FileProvider, { #[allow(unused)] pub(crate) fn len(&self) -> usize { @@ -196,11 +195,11 @@ where mod tests { use std::{ops::Bound, sync::Arc}; + use fusio::{local::TokioFs, path::Path, DynFs}; + use super::Mutable; use crate::{ - executor::tokio::TokioExecutor, - fs::FileProvider, - record::Record, + record::{Column, Datatype, DynRecord, Record}, tests::{Test, TestRef}, timestamp::Timestamped, trigger::TriggerFactory, @@ -214,15 +213,12 @@ mod tests { let key_2 = "key_2".to_owned(); let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mem_table = Mutable::::new(&option, trigger) - .await - .unwrap(); + let mem_table = Mutable::::new(&option, trigger, &fs).await.unwrap(); mem_table .insert( @@ -265,16 +261,13 @@ mod tests { #[tokio::test] async fn range() { let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable = Mutable::::new(&option, trigger) - .await - .unwrap(); + let mutable = Mutable::::new(&option, trigger, &fs).await.unwrap(); mutable .insert(LogType::Full, "1".into(), 0_u32.into()) @@ -348,4 +341,55 @@ mod tests { &Timestamped::new("4".into(), 0_u32.into()) ); } + + #[tokio::test] + async fn 
test_dyn_read() { + let temp_dir = tempfile::tempdir().unwrap(); + let option = DbOption::with_path( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + "age".to_string(), + 0, + ); + let fs = Arc::new(TokioFs) as Arc; + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); + + let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); + + let mutable = Mutable::::new(&option, trigger, &fs) + .await + .unwrap(); + + mutable + .insert( + LogType::Full, + DynRecord::new( + vec![ + Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), + Column::new( + Datatype::Int16, + "height".to_string(), + Arc::new(1236_i16), + true, + ), + ], + 0, + ), + 0_u32.into(), + ) + .await + .unwrap(); + + { + let mut scan = mutable.scan((Bound::Unbounded, Bound::Unbounded), 0_u32.into()); + let entry = scan.next().unwrap(); + assert_eq!( + entry.key(), + &Timestamped::new( + Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), + 0_u32.into() + ) + ); + dbg!(entry.clone().value().as_ref().unwrap()); + } + } } diff --git a/src/lib.rs b/src/lib.rs index 12adc61..0bd56e1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,10 +31,15 @@ //! # Examples //! //! ```no_run -//! use std::ops::Bound; +//! use std::{ops::Bound, sync::Arc}; //! +//! use fusio::{local::TokioFs, path::Path}; //! use futures_util::stream::StreamExt; -//! use tonbo::{executor::tokio::TokioExecutor, Projection, Record, DB}; +//! use tokio::fs; +//! use tokio_util::bytes::Bytes; +//! use tonbo::{ +//! executor::tokio::TokioExecutor, fs::manager::StoreManager, DbOption, Projection, Record, DB, +//! }; //! //! // use macro to define schema of column family just like ORM //! // it provides type safety read & write API @@ -48,11 +53,15 @@ //! //! #[tokio::main] //! async fn main() { +//! // make sure the path exists +//! let _ = fs::create_dir_all("./db_path/users").await; +//! +//! let manager = StoreManager::new(Arc::new(TokioFs), vec![]); +//! let options = DbOption::from(Path::from_filesystem_path("./db_path/users").unwrap()); //! // pluggable async runtime and I/O -//! let db = DB::new("./db_path/users".into(), TokioExecutor::default()) +//! let db = DB::new(options, TokioExecutor::default(), manager) //! .await //! .unwrap(); -//! //! // insert with owned value //! db.insert(User { //! 
name: "Alice".into(), @@ -130,7 +139,7 @@ pub use arrow; use async_lock::RwLock; use async_stream::stream; use flume::{bounded, Sender}; -use fs::FileProvider; +use fusio::dynamic::DynFile; use futures_core::Stream; use futures_util::StreamExt; use inmem::{immutable::Immutable, mutable::Mutable}; @@ -141,7 +150,7 @@ use parquet::{ arrow::{arrow_to_parquet_schema, ProjectionMask}, errors::ParquetError, }; -use record::Record; +use record::{ColumnDesc, DynRecord, Record, RecordInstance}; use thiserror::Error; use timestamp::{Timestamp, TimestampedRef}; use tokio::sync::oneshot; @@ -153,7 +162,7 @@ pub use crate::option::*; use crate::{ compaction::{CompactTask, Compactor}, executor::Executor, - fs::{FileId, FileType}, + fs::{default_open_options, manager::StoreManager, parse_file_id, FileId, FileType}, serdes::Decode, stream::{ mem_projection::MemProjectionStream, merge::MergeStream, package::PackageStream, Entry, @@ -170,12 +179,48 @@ where R: Record, E: Executor, { - schema: Arc>>, - version_set: VersionSet, + schema: Arc>>, + version_set: VersionSet, lock_map: LockMap, + manager: Arc, _p: PhantomData, } +impl DB +where + E: Executor + Send + Sync + 'static, +{ + /// Open [`DB`] with schema which determined by [`ColumnDesc`]. + pub async fn with_schema( + option: DbOption, + executor: E, + manager: StoreManager, + column_descs: Vec, + primary_index: usize, + ) -> Result> { + let option = Arc::new(option); + let manager = Arc::new(manager); + + { + let base_fs = manager.base_fs(); + + base_fs + .create_dir_all(&option.wal_dir_path()) + .await + .map_err(DbError::Fusio)?; + base_fs + .create_dir_all(&option.version_log_dir_path()) + .await + .map_err(DbError::Fusio)?; + } + + let instance = + RecordInstance::Runtime(DynRecord::empty_record(column_descs, primary_index)); + + Self::build(option, executor, instance, manager).await + } +} + impl DB where R: Record + Send + Sync, @@ -187,22 +232,45 @@ where /// according to the configuration of [`DbOption`]. /// /// For more configurable options, please refer to [`DbOption`]. 
- pub async fn new(option: DbOption, executor: E) -> Result> { + pub async fn new( + option: DbOption, + executor: E, + manager: StoreManager, + ) -> Result> { let option = Arc::new(option); - E::create_dir_all(&option.path).await?; - E::create_dir_all(&option.wal_dir_path()).await?; - E::create_dir_all(&option.version_log_dir_path()).await?; + let manager = Arc::new(manager); + + { + let base_fs = manager.base_fs(); + + // FIXME: error handle + let _ = base_fs.create_dir_all(&option.wal_dir_path()).await; + let _ = base_fs.create_dir_all(&option.version_log_dir_path()).await; + } + + Self::build(option, executor, RecordInstance::Normal, manager).await + } + async fn build( + option: Arc>, + executor: E, + instance: RecordInstance, + manager: Arc, + ) -> Result> { let (task_tx, task_rx) = bounded(1); - let (mut cleaner, clean_sender) = Cleaner::::new(option.clone()); + let (mut cleaner, clean_sender) = Cleaner::::new(option.clone(), manager.clone()); - let version_set = VersionSet::new(clean_sender, option.clone()).await?; + let version_set = VersionSet::new(clean_sender, option.clone(), manager.clone()).await?; let schema = Arc::new(RwLock::new( - Schema::new(option.clone(), task_tx, &version_set).await?, + Schema::new(option.clone(), task_tx, &version_set, instance, &manager).await?, )); - let mut compactor = - Compactor::::new(schema.clone(), option.clone(), version_set.clone()); + let mut compactor = Compactor::::new( + schema.clone(), + option.clone(), + version_set.clone(), + manager.clone(), + ); executor.spawn(async move { if let Err(err) = cleaner.listen().await { @@ -221,21 +289,22 @@ where } } }); - Ok(Self { schema, version_set, lock_map: Arc::new(Default::default()), + manager, _p: Default::default(), }) } /// open an optimistic ACID transaction - pub async fn transaction(&self) -> Transaction<'_, R, E> { + pub async fn transaction(&self) -> Transaction<'_, R> { Transaction::new( self.version_set.current().await, self.schema.read().await, self.lock_map.clone(), + self.manager.clone(), ) } @@ -286,6 +355,7 @@ where .await .get( &*self.version_set.current().await, + &self.manager, key, self.version_set.load_ts(), Projection::All, @@ -302,9 +372,11 @@ where ) -> impl Stream>> + 'scan { stream! 
{ let schema = self.schema.read().await; + let manager = &self.manager; let current = self.version_set.current().await; let mut scan = Scan::new( &schema, + manager, range, self.version_set.load_ts(), &*current, @@ -358,44 +430,64 @@ where } } -pub(crate) struct Schema +pub(crate) struct Schema where R: Record, - FP: FileProvider, { - mutable: Mutable, - immutables: Vec<(Option, Immutable)>, + pub mutable: Mutable, + pub immutables: Vec<(Option, Immutable)>, compaction_tx: Sender, recover_wal_ids: Option>, trigger: Arc + Send + Sync>>, + record_instance: RecordInstance, } -impl Schema +impl Schema where R: Record + Send, - FP: FileProvider, { async fn new( option: Arc>, compaction_tx: Sender, - version_set: &VersionSet, + version_set: &VersionSet, + record_instance: RecordInstance, + manager: &StoreManager, ) -> Result> { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); let mut schema = Schema { - mutable: Mutable::new(&option, trigger.clone()).await?, + mutable: Mutable::new(&option, trigger.clone(), manager.base_fs()).await?, immutables: Default::default(), compaction_tx, recover_wal_ids: None, trigger, + record_instance, }; + let base_fs = manager.base_fs(); + let wal_dir_path = option.wal_dir_path(); let mut transaction_map = HashMap::new(); - let mut wal_stream = pin!(FP::list(option.wal_dir_path(), FileType::Wal, false)?); let mut wal_ids = Vec::new(); - while let Some(wal) = wal_stream.next().await { - let (file, wal_id) = wal?; - let mut wal = WalFile::::new(file, wal_id); + let wal_metas = { + let mut wal_metas = Vec::new(); + let mut wal_stream = base_fs.list(&wal_dir_path).await?; + + while let Some(file_meta) = wal_stream.next().await { + wal_metas.push(file_meta?); + } + wal_metas.sort_by(|meta_a, meta_b| meta_a.path.cmp(&meta_b.path)); + wal_metas + }; + + for wal_meta in wal_metas { + let wal_path = wal_meta.path; + + let file = base_fs + .open_options(&wal_path, default_open_options()) + .await?; + // SAFETY: wal_stream return only file name + let wal_id = parse_file_id(&wal_path, FileType::Wal)?.unwrap(); + let mut wal = WalFile::, R>::new(file, wal_id); wal_ids.push(wal_id); let mut recover_stream = pin!(wal.recover()); @@ -465,29 +557,28 @@ where async fn get<'get>( &'get self, - version: &'get Version, + version: &'get Version, + manager: &StoreManager, key: &'get R::Key, ts: Timestamp, projection: Projection, - ) -> Result>, DbError> - where - FP: FileProvider, - { + ) -> Result>, DbError> { if let Some(entry) = self.mutable.get(key, ts) { return Ok(Some(Entry::Mutable(entry))); } + let primary_key_index = self.record_instance.primary_key_index::(); let projection = match projection { Projection::All => ProjectionMask::all(), Projection::Parts(projection) => { - let mut fixed_projection: Vec = [0, 1, R::primary_key_index()] + let mut fixed_projection: Vec = [0, 1, primary_key_index] .into_iter() .chain(projection.into_iter().map(|p| p + 2)) .collect(); fixed_projection.dedup(); ProjectionMask::roots( - &arrow_to_parquet_schema(R::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(&self.record_instance.arrow_schema::()).unwrap(), fixed_projection, ) } @@ -500,7 +591,7 @@ where } Ok(version - .query(TimestampedRef::new(key, ts), projection) + .query(manager, TimestampedRef::new(key, ts), projection) .await? 
.map(|entry| Entry::RecordBatch(entry))) } @@ -516,41 +607,42 @@ where } /// scan configuration intermediate structure -pub struct Scan<'scan, R, FP> +pub struct Scan<'scan, R> where R: Record, - FP: FileProvider, { - schema: &'scan Schema, + schema: &'scan Schema, + manager: &'scan StoreManager, lower: Bound<&'scan R::Key>, upper: Bound<&'scan R::Key>, ts: Timestamp, - version: &'scan Version, + version: &'scan Version, fn_pre_stream: - Box) -> Option> + 'scan>, + Box) -> Option> + Send + 'scan>, limit: Option, projection_indices: Option>, projection: ProjectionMask, } -impl<'scan, R, FP> Scan<'scan, R, FP> +impl<'scan, R> Scan<'scan, R> where R: Record + Send, - FP: FileProvider, { fn new( - schema: &'scan Schema, + schema: &'scan Schema, + manager: &'scan StoreManager, (lower, upper): (Bound<&'scan R::Key>, Bound<&'scan R::Key>), ts: Timestamp, - version: &'scan Version, + version: &'scan Version, fn_pre_stream: Box< - dyn FnOnce(Option) -> Option> + 'scan, + dyn FnOnce(Option) -> Option> + Send + 'scan, >, ) -> Self { Self { schema, + manager, lower, upper, ts, @@ -576,12 +668,13 @@ where for p in &mut projection { *p += 2; } - let mut fixed_projection = vec![0, 1, R::primary_key_index()]; + let primary_key_index = self.schema.record_instance.primary_key_index::(); + let mut fixed_projection = vec![0, 1, primary_key_index]; fixed_projection.append(&mut projection); fixed_projection.dedup(); let mask = ProjectionMask::roots( - &arrow_to_parquet_schema(R::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(&self.schema.record_instance.arrow_schema::()).unwrap(), fixed_projection.clone(), ); @@ -627,6 +720,7 @@ where } self.version .streams( + self.manager, &mut streams, (self.lower, self.upper), self.ts, @@ -678,6 +772,7 @@ where } self.version .streams( + self.manager, &mut streams, (self.lower, self.upper), self.ts, @@ -691,6 +786,7 @@ where batch_size, merge_stream, self.projection_indices, + &self.schema.record_instance, )) } } @@ -706,12 +802,18 @@ where Version(#[from] VersionError), #[error("write parquet error: {0}")] Parquet(#[from] ParquetError), + #[error("write ulid decode error: {0}")] + UlidDecode(#[from] ulid::DecodeError), + #[error("write fusio error: {0}")] + Fusio(#[from] fusio::Error), // #[error("write encode error: {0}")] // Encode(<::Ref as Encode>::Error), #[error("write recover error: {0}")] Recover(#[from] RecoverError<::Error>), #[error("wal write error: {0}")] WalWrite(Box), + #[error("exceeds the maximum level(0-6)")] + ExceedsMaxLevel, } type LockMap = Arc>; @@ -735,18 +837,24 @@ pub(crate) mod tests { }; use async_lock::RwLock; use flume::{bounded, Receiver}; + use fusio::{local::TokioFs, path::Path, DynFs, Read, Write}; + use futures::StreamExt; use once_cell::sync::Lazy; use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath}; use tempfile::TempDir; - use tokio::io; use tracing::error; use crate::{ compaction::{CompactTask, Compactor}, executor::{tokio::TokioExecutor, Executor}, - fs::{FileId, FileProvider}, + fs::{manager::StoreManager, FileId}, inmem::{immutable::tests::TestImmutableArrays, mutable::Mutable}, - record::{internal::InternalRecordRef, RecordDecodeError, RecordEncodeError, RecordRef}, + record::{ + internal::InternalRecordRef, + test::{test_dyn_item_schema, test_dyn_items}, + Column, Datatype, DynRecord, RecordDecodeError, RecordEncodeError, RecordInstance, + RecordRef, + }, serdes::{Decode, Encode}, trigger::{TriggerFactory, TriggerType}, version::{cleaner::Cleaner, set::tests::build_version_set, Version}, 
@@ -766,7 +874,7 @@ pub(crate) mod tests { async fn decode(reader: &mut R) -> Result where - R: tokio::io::AsyncRead + Unpin, + R: Read + Unpin, { let vstring = String::decode(reader) @@ -803,7 +911,8 @@ pub(crate) mod tests { type Key = String; - type Ref<'r> = TestRef<'r> + type Ref<'r> + = TestRef<'r> where Self: 'r; @@ -867,7 +976,7 @@ pub(crate) mod tests { async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: io::AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { self.vstring .encode(writer) @@ -919,6 +1028,7 @@ pub(crate) mod tests { record_batch: &'r RecordBatch, offset: usize, projection_mask: &'r ProjectionMask, + _: &Arc, ) -> InternalRecordRef<'r, Self> { let mut column_i = 2; let null = record_batch.column(0).as_boolean().value(offset); @@ -969,8 +1079,10 @@ pub(crate) mod tests { pub(crate) async fn get_test_record_batch( option: DbOption, executor: E, + manager: StoreManager, ) -> RecordBatch { - let db: DB = DB::new(option.clone(), executor).await.unwrap(); + let base_fs = manager.base_fs().clone(); + let db: DB = DB::new(option.clone(), executor, manager).await.unwrap(); db.write( Test { @@ -998,20 +1110,21 @@ pub(crate) mod tests { let trigger = schema.trigger.clone(); let mutable = mem::replace( &mut schema.mutable, - Mutable::new(&option, trigger).await.unwrap(), + Mutable::new(&option, trigger, &base_fs).await.unwrap(), ); - Immutable::<::Columns>::from(mutable.data) + Immutable::<::Columns>::from((mutable.data, &RecordInstance::Normal)) .as_record_batch() .clone() } pub(crate) async fn build_schema( option: Arc>, - ) -> io::Result<(crate::Schema, Receiver)> { + fs: &Arc, + ) -> Result<(crate::Schema, Receiver), fusio::Error> { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable = Mutable::new(&option, trigger.clone()).await?; + let mutable = Mutable::new(&option, trigger.clone(), fs).await?; mutable .insert( @@ -1053,8 +1166,7 @@ pub(crate) mod tests { let immutables = { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable: Mutable = - Mutable::new(&option, trigger.clone()).await?; + let mutable: Mutable = Mutable::new(&option, trigger.clone(), fs).await?; mutable .insert( @@ -1093,7 +1205,10 @@ pub(crate) mod tests { .await .unwrap(); - vec![(Some(FileId::new()), Immutable::from(mutable.data))] + vec![( + Some(FileId::new()), + Immutable::from((mutable.data, &RecordInstance::Normal)), + )] }; let (compaction_tx, compaction_rx) = bounded(1); @@ -1105,6 +1220,7 @@ pub(crate) mod tests { compaction_tx, recover_wal_ids: None, trigger, + record_instance: RecordInstance::Normal, }, compaction_rx, )) @@ -1114,25 +1230,33 @@ pub(crate) mod tests { option: Arc>, compaction_rx: Receiver, executor: E, - schema: crate::Schema, - version: Version, + schema: crate::Schema, + version: Version, + manager: Arc, ) -> Result, DbError> where R: Record + Send + Sync, R::Columns: Send + Sync, E: Executor + Send + Sync + 'static, { - E::create_dir_all(&option.path).await?; - E::create_dir_all(&option.version_log_dir_path()) - .await - .unwrap(); + { + let base_fs = manager.base_fs(); + + let _ = base_fs.create_dir_all(&option.wal_dir_path()).await; + let _ = base_fs.create_dir_all(&option.version_log_dir_path()).await; + } let schema = Arc::new(RwLock::new(schema)); - let (mut cleaner, clean_sender) = Cleaner::::new(option.clone()); - let version_set = build_version_set(version, clean_sender, option.clone()).await?; - let mut compactor = - Compactor::::new(schema.clone(), option.clone(), 
version_set.clone()); + let (mut cleaner, clean_sender) = Cleaner::::new(option.clone(), manager.clone()); + let version_set = + build_version_set(version, clean_sender, option.clone(), manager.clone()).await?; + let mut compactor = Compactor::::new( + schema.clone(), + option.clone(), + version_set.clone(), + manager.clone(), + ); executor.spawn(async move { if let Err(err) = cleaner.listen().await { @@ -1156,6 +1280,7 @@ pub(crate) mod tests { schema, version_set, lock_map: Arc::new(Default::default()), + manager, _p: Default::default(), }) } @@ -1368,8 +1493,9 @@ pub(crate) mod tests { #[tokio::test(flavor = "multi_thread")] async fn read_from_disk() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); - let mut option = DbOption::from(temp_dir.path()); + let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; option.major_threshold_with_sst_size = 3; @@ -1378,12 +1504,16 @@ pub(crate) mod tests { option.major_default_oldest_table_num = 1; option.trigger_type = TriggerType::Length(/* max_mutable_len */ 5); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), manager) + .await + .unwrap(); - for item in test_items() { + for (i, item) in test_items().into_iter().enumerate() { db.write(item, 0.into()).await.unwrap(); + if i % 5 == 0 { + db.flush().await.unwrap(); + } } - let _ = db.flush().await; let tx = db.transaction().await; let key = 20.to_string(); @@ -1402,8 +1532,9 @@ pub(crate) mod tests { #[tokio::test] async fn test_flush() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); - let mut option = DbOption::from(temp_dir.path()); + let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; option.major_threshold_with_sst_size = 3; @@ -1412,7 +1543,9 @@ pub(crate) mod tests { option.major_default_oldest_table_num = 1; option.trigger_type = TriggerType::Length(/* max_mutable_len */ 5); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), manager) + .await + .unwrap(); for item in &test_items()[0..10] { db.write(item.clone(), 0.into()).await.unwrap(); @@ -1429,20 +1562,23 @@ pub(crate) mod tests { #[tokio::test] async fn schema_recover() { let temp_dir = TempDir::new().unwrap(); - let option = Arc::new(DbOption::from(temp_dir.path())); - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let (task_tx, _task_rx) = bounded(1); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let schema: crate::Schema = crate::Schema { - mutable: Mutable::new(&option, trigger.clone()).await.unwrap(), + let schema: crate::Schema = crate::Schema { + mutable: Mutable::new(&option, trigger.clone(), &fs).await.unwrap(), immutables: Default::default(), compaction_tx: task_tx.clone(), recover_wal_ids: None, trigger, + record_instance: RecordInstance::Normal, }; for (i, item) in test_items().into_iter().enumerate() { @@ -1455,12 +1591,13 @@ pub(crate) mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let 
schema: crate::Schema = crate::Schema { - mutable: Mutable::new(&option, trigger.clone()).await.unwrap(), + let schema: crate::Schema = crate::Schema { + mutable: Mutable::new(&option, trigger.clone(), &fs).await.unwrap(), immutables: Default::default(), compaction_tx: task_tx, recover_wal_ids: None, trigger, + record_instance: RecordInstance::Normal, }; let range = schema .mutable @@ -1480,6 +1617,432 @@ pub(crate) mod tests { } } + #[tokio::test] + async fn dyn_schema_recover() { + let temp_dir = TempDir::new().unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + + let (desc, primary_key_index) = test_dyn_item_schema(); + let option = Arc::new(DbOption::with_path( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + "age".to_owned(), + primary_key_index, + )); + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); + + let (task_tx, _task_rx) = bounded(1); + + let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); + let schema: crate::Schema = crate::Schema { + mutable: Mutable::new(&option, trigger.clone(), &fs).await.unwrap(), + immutables: Default::default(), + compaction_tx: task_tx.clone(), + recover_wal_ids: None, + trigger, + record_instance: RecordInstance::Normal, + }; + + for item in test_dyn_items().into_iter() { + schema + .write(LogType::Full, item, 0_u32.into()) + .await + .unwrap(); + } + drop(schema); + + let option = DbOption::with_path( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + "age".to_owned(), + primary_key_index, + ); + let db: DB = DB::with_schema( + option, + TokioExecutor::new(), + manager, + desc, + primary_key_index, + ) + .await + .unwrap(); + + let mut sort_items = BTreeMap::new(); + for item in test_dyn_items() { + sort_items.insert(item.key(), item); + } + + { + let tx = db.transaction().await; + let mut scan = tx + .scan((Bound::Unbounded, Bound::Unbounded)) + .projection(vec![0, 1, 2]) + .take() + .await + .unwrap(); + + while let Some(entry) = scan.next().await.transpose().unwrap() { + let columns1 = entry.value().unwrap().columns; + let (_, record) = sort_items.pop_first().unwrap(); + let columns2 = record.as_record_ref().columns; + + assert_eq!(columns1.get(1), columns2.get(1)); + assert_eq!(columns1.get(2), columns2.get(2)); + } + } + } + + #[tokio::test] + async fn test_read_write_dyn() { + let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + + let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let mut option = DbOption::with_path( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + "age".to_string(), + primary_key_index, + ); + option.immutable_chunk_num = 1; + option.immutable_chunk_max_num = 1; + option.major_threshold_with_sst_size = 3; + option.level_sst_magnification = 10; + option.max_sst_file_size = 2 * 1024 * 1024; + option.major_default_oldest_table_num = 1; + option.trigger_type = TriggerType::Length(5); + + let db: DB = DB::with_schema( + option, + TokioExecutor::new(), + manager, + cols_desc, + primary_key_index, + ) + .await + .unwrap(); + + for (i, item) in test_dyn_items().into_iter().enumerate() { + if i == 28 { + db.remove(item.key()).await.unwrap(); + } else { + db.write(item, 0.into()).await.unwrap(); + } + } + + dbg!(db.version_set.current().await); + // test get + { + let tx = db.transaction().await; + + for i in 0..50 { + let key = Column::new(Datatype::Int8, "age".to_string(), Arc::new(i as i8), false); + let option1 = tx.get(&key, Projection::All).await.unwrap(); 
+ if i == 28 { + assert!(option1.is_none()); + continue; + } + let entry = option1.unwrap(); + let record_ref = entry.get(); + + assert_eq!( + *record_ref + .columns + .first() + .unwrap() + .value + .as_ref() + .downcast_ref::() + .unwrap(), + i as i8 + ); + let height = record_ref + .columns + .get(1) + .unwrap() + .value + .as_ref() + .downcast_ref::>() + .unwrap(); + if i < 45 { + assert_eq!(*height, Some(20 * i as i16),); + } else { + assert!(height.is_none()); + } + assert_eq!( + *record_ref + .columns + .get(2) + .unwrap() + .value + .as_ref() + .downcast_ref::>() + .unwrap(), + Some(200 * i), + ); + } + tx.commit().await.unwrap(); + } + // test scan + { + let tx = db.transaction().await; + let lower = Column::new(Datatype::Int8, "age".to_owned(), Arc::new(0_i8), false); + let upper = Column::new(Datatype::Int8, "age".to_owned(), Arc::new(49_i8), false); + let mut scan = tx + .scan((Bound::Included(&lower), Bound::Included(&upper))) + .projection(vec![0, 1]) + .take() + .await + .unwrap(); + + let mut i = 0_i8; + while let Some(entry) = scan.next().await.transpose().unwrap() { + if i == 28 { + assert!(entry.value().is_none()); + i += 1; + continue; + } + let columns = entry.value().unwrap().columns; + + let primary_key_col = columns.first().unwrap(); + assert_eq!(primary_key_col.datatype, Datatype::Int8); + assert_eq!(primary_key_col.name, "age".to_string()); + assert_eq!( + *primary_key_col.value.as_ref().downcast_ref::().unwrap(), + i + ); + + let col = columns.get(1).unwrap(); + assert_eq!(col.datatype, Datatype::Int16); + assert_eq!(col.name, "height".to_string()); + let height = *col.value.as_ref().downcast_ref::>().unwrap(); + if i < 45 { + assert_eq!(height, Some(i as i16 * 20)); + } else { + assert!(col + .value + .as_ref() + .downcast_ref::>() + .unwrap() + .is_none(),); + } + + let col = columns.get(2).unwrap(); + assert_eq!(col.datatype, Datatype::Int32); + assert_eq!(col.name, "weight".to_string()); + let weight = col.value.as_ref().downcast_ref::>(); + assert!(weight.is_some()); + assert_eq!(*weight.unwrap(), None); + i += 1 + } + } + } + + #[tokio::test] + async fn test_dyn_multiple_db() { + let manager1 = StoreManager::new(Arc::new(TokioFs), vec![]); + let manager2 = StoreManager::new(Arc::new(TokioFs), vec![]); + let manager3 = StoreManager::new(Arc::new(TokioFs), vec![]); + let temp_dir1 = TempDir::with_prefix("db1").unwrap(); + + let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let mut option = DbOption::with_path( + Path::from_filesystem_path(temp_dir1.path()).unwrap(), + "age".to_string(), + primary_key_index, + ); + option.immutable_chunk_num = 1; + option.immutable_chunk_max_num = 1; + option.major_threshold_with_sst_size = 3; + option.major_default_oldest_table_num = 1; + option.trigger_type = TriggerType::Length(5); + + let temp_dir2 = TempDir::with_prefix("db2").unwrap(); + let mut option2 = DbOption::with_path( + Path::from_filesystem_path(temp_dir2.path()).unwrap(), + "age".to_string(), + primary_key_index, + ); + option2.immutable_chunk_num = 1; + option2.immutable_chunk_max_num = 1; + option2.major_threshold_with_sst_size = 3; + option2.major_default_oldest_table_num = 1; + option2.trigger_type = TriggerType::Length(5); + + let temp_dir3 = TempDir::with_prefix("db3").unwrap(); + let mut option3 = DbOption::with_path( + Path::from_filesystem_path(temp_dir3.path()).unwrap(), + "age".to_string(), + primary_key_index, + ); + option3.immutable_chunk_num = 1; + option3.immutable_chunk_max_num = 1; + option3.major_threshold_with_sst_size = 3; + 
option3.major_default_oldest_table_num = 1; + option3.trigger_type = TriggerType::Length(5); + + let db1: DB = DB::with_schema( + option, + TokioExecutor::new(), + manager1, + cols_desc.clone(), + primary_key_index, + ) + .await + .unwrap(); + let db2: DB = DB::with_schema( + option2, + TokioExecutor::new(), + manager2, + cols_desc.clone(), + primary_key_index, + ) + .await + .unwrap(); + let db3: DB = DB::with_schema( + option3, + TokioExecutor::new(), + manager3, + cols_desc, + primary_key_index, + ) + .await + .unwrap(); + + for (i, item) in test_dyn_items().into_iter().enumerate() { + if i >= 40 { + db3.write(item, 0.into()).await.unwrap(); + } else if i % 2 == 0 { + db1.write(item, 0.into()).await.unwrap(); + } else { + db2.write(item, 0.into()).await.unwrap(); + } + } + + // test get + { + let tx1 = db1.transaction().await; + let tx2 = db2.transaction().await; + let tx3 = db3.transaction().await; + + for i in 0..50 { + let key = Column::new(Datatype::Int8, "age".to_string(), Arc::new(i as i8), false); + let option1 = tx1.get(&key, Projection::All).await.unwrap(); + let option2 = tx2.get(&key, Projection::All).await.unwrap(); + let option3 = tx3.get(&key, Projection::All).await.unwrap(); + let entry = if i >= 40 { + assert!(option2.is_none()); + assert!(option1.is_none()); + option3.unwrap() + } else if i % 2 == 0 { + assert!(option2.is_none()); + assert!(option3.is_none()); + option1.unwrap() + } else { + assert!(option1.is_none()); + assert!(option3.is_none()); + option2.unwrap() + }; + let record_ref = entry.get(); + + assert_eq!( + *record_ref + .columns + .first() + .unwrap() + .value + .as_ref() + .downcast_ref::() + .unwrap(), + i as i8 + ); + assert_eq!( + *record_ref + .columns + .get(2) + .unwrap() + .value + .as_ref() + .downcast_ref::>() + .unwrap(), + Some(200 * i), + ); + } + tx1.commit().await.unwrap(); + } + // test scan + { + let tx1 = db1.transaction().await; + let lower = Column::new(Datatype::Int8, "age".to_owned(), Arc::new(8_i8), false); + let upper = Column::new(Datatype::Int8, "age".to_owned(), Arc::new(43_i8), false); + let mut scan = tx1 + .scan((Bound::Included(&lower), Bound::Included(&upper))) + .projection(vec![0, 1]) + .take() + .await + .unwrap(); + + let mut i = 8_i8; + while let Some(entry) = scan.next().await.transpose().unwrap() { + let columns = entry.value().unwrap().columns; + + let primary_key_col = columns.first().unwrap(); + assert_eq!(primary_key_col.datatype, Datatype::Int8); + assert_eq!(primary_key_col.name, "age".to_string()); + assert_eq!( + *primary_key_col.value.as_ref().downcast_ref::().unwrap(), + i + ); + + i += 2 + } + assert_eq!(i, 40); + let tx2 = db2.transaction().await; + let mut scan = tx2 + .scan((Bound::Included(&lower), Bound::Included(&upper))) + .projection(vec![0, 1]) + .take() + .await + .unwrap(); + + let mut i = 9_i8; + while let Some(entry) = scan.next().await.transpose().unwrap() { + let columns = entry.value().unwrap().columns; + + let primary_key_col = columns.first().unwrap(); + assert_eq!(primary_key_col.datatype, Datatype::Int8); + assert_eq!(primary_key_col.name, "age".to_string()); + assert_eq!( + *primary_key_col.value.as_ref().downcast_ref::().unwrap(), + i + ); + + i += 2 + } + assert_eq!(i, 41); + let tx3 = db3.transaction().await; + let mut scan = tx3 + .scan((Bound::Included(&lower), Bound::Included(&upper))) + .projection(vec![0, 1]) + .take() + .await + .unwrap(); + + let mut i = 40_i8; + while let Some(entry) = scan.next().await.transpose().unwrap() { + let columns = entry.value().unwrap().columns; + 
+ let primary_key_col = columns.first().unwrap(); + assert_eq!(primary_key_col.datatype, Datatype::Int8); + assert_eq!(primary_key_col.name, "age".to_string()); + assert_eq!( + *primary_key_col.value.as_ref().downcast_ref::().unwrap(), + i + ); + + i += 1 + } + } + } + #[test] fn build_test() { let t = trybuild::TestCases::new(); diff --git a/src/ondisk/arrows.rs b/src/ondisk/arrows.rs index d257019..fdca1b1 100644 --- a/src/ondisk/arrows.rs +++ b/src/ondisk/arrows.rs @@ -71,10 +71,7 @@ where predictions.push(Box::new(ArrowPredicateFn::new( ProjectionMask::roots(schema_descriptor, [2]), move |record_batch| { - lower_cmp( - record_batch.column(0), - &lower_key.to_arrow_datum() as &dyn Datum, - ) + lower_cmp(record_batch.column(0), lower_key.to_arrow_datum().as_ref()) }, ))); } @@ -82,10 +79,7 @@ where predictions.push(Box::new(ArrowPredicateFn::new( ProjectionMask::roots(schema_descriptor, [2]), move |record_batch| { - upper_cmp( - &upper_key.to_arrow_datum() as &dyn Datum, - record_batch.column(0), - ) + upper_cmp(upper_key.to_arrow_datum().as_ref(), record_batch.column(0)) }, ))); } diff --git a/src/ondisk/scan.rs b/src/ondisk/scan.rs index cc693f3..5a8ce66 100644 --- a/src/ondisk/scan.rs +++ b/src/ondisk/scan.rs @@ -1,54 +1,52 @@ use std::{ marker::PhantomData, pin::Pin, + sync::Arc, task::{Context, Poll}, }; +use arrow::datatypes::Schema; +use fusio_parquet::reader::AsyncReader; use futures_core::{ready, Stream}; use parquet::arrow::{async_reader::ParquetRecordBatchStream, ProjectionMask}; use pin_project_lite::pin_project; use crate::{ - fs::FileProvider, record::Record, stream::record_batch::{RecordBatchEntry, RecordBatchIterator}, }; pin_project! { #[derive(Debug)] - pub struct SsTableScan<'scan, R, FP> - where - FP: FileProvider, - { + pub struct SsTableScan<'scan, R>{ #[pin] - stream: ParquetRecordBatchStream, + stream: ParquetRecordBatchStream, iter: Option>, projection_mask: ProjectionMask, + full_schema: Arc, _marker: PhantomData<&'scan ()> } } -impl SsTableScan<'_, R, FP> -where - FP: FileProvider, -{ +impl SsTableScan<'_, R> { pub fn new( - stream: ParquetRecordBatchStream, + stream: ParquetRecordBatchStream, projection_mask: ProjectionMask, + full_schema: Arc, ) -> Self { SsTableScan { stream, iter: None, projection_mask, + full_schema, _marker: PhantomData, } } } -impl<'scan, R, FP> Stream for SsTableScan<'scan, R, FP> +impl<'scan, R> Stream for SsTableScan<'scan, R> where R: Record, - FP: FileProvider, { type Item = Result, parquet::errors::ParquetError>; @@ -71,6 +69,7 @@ where *this.iter = Some(RecordBatchIterator::new( record_batch, this.projection_mask.clone(), + this.full_schema.clone(), )); } } diff --git a/src/ondisk/sstable.rs b/src/ondisk/sstable.rs index 6338dc8..2f681c2 100644 --- a/src/ondisk/sstable.rs +++ b/src/ondisk/sstable.rs @@ -1,84 +1,48 @@ use std::{marker::PhantomData, ops::Bound}; +use fusio::{dynamic::DynFile, DynRead}; +use fusio_parquet::reader::AsyncReader; use futures_util::StreamExt; -use parquet::{ - arrow::{ - arrow_reader::{ArrowReaderBuilder, ArrowReaderOptions}, - arrow_writer::ArrowWriterOptions, - async_reader::AsyncReader, - AsyncArrowWriter, ParquetRecordBatchStreamBuilder, ProjectionMask, - }, - basic::{Compression, ZstdLevel}, - file::properties::WriterProperties, +use parquet::arrow::{ + arrow_reader::{ArrowReaderBuilder, ArrowReaderOptions}, + ParquetRecordBatchStreamBuilder, ProjectionMask, }; use super::{arrows::get_range_filter, scan::SsTableScan}; use crate::{ - fs::{AsyncFile, FileProvider}, record::Record, 
stream::record_batch::RecordBatchEntry, timestamp::{Timestamp, TimestampedRef}, }; -pub(crate) struct SsTable +pub(crate) struct SsTable where R: Record, - FP: FileProvider, { - reader: FP::File, + reader: AsyncReader, _marker: PhantomData, } -impl SsTable +impl SsTable where R: Record, - FP: FileProvider, { - pub(crate) fn open(file: FP::File) -> Self { - SsTable { - reader: file, - _marker: PhantomData, - } - } - - #[allow(unused)] - fn create_writer(&mut self) -> AsyncArrowWriter<&mut dyn AsyncFile> { - // TODO: expose writer options - let options = ArrowWriterOptions::new().with_properties( - WriterProperties::builder() - .set_created_by(concat!("tonbo version ", env!("CARGO_PKG_VERSION")).to_owned()) - .set_compression(Compression::ZSTD(ZstdLevel::try_new(3).unwrap())) - .build(), - ); - AsyncArrowWriter::try_new_with_options( - (&mut self.reader as &mut dyn AsyncFile), - R::arrow_schema().clone(), - options, - ) - .expect("Failed to create writer") - } + pub(crate) async fn open(file: Box) -> Result { + let size = DynRead::size(&file).await?; - #[cfg(test)] - async fn write( - &mut self, - record_batch: arrow::array::RecordBatch, - ) -> parquet::errors::Result<()> { - let mut writer = self.create_writer(); - writer.write(&record_batch).await?; - - if writer.in_progress_size() > (1 << 20) - 1 { - writer.flush().await?; - } - - writer.close().await?; - Ok(()) + Ok(SsTable { + reader: AsyncReader::new(file, size), + _marker: PhantomData, + }) } async fn into_parquet_builder( self, limit: Option, projection_mask: ProjectionMask, - ) -> parquet::errors::Result>> { + ) -> parquet::errors::Result< + ArrowReaderBuilder>, + > { let mut builder = ParquetRecordBatchStreamBuilder::new_with_options( self.reader, ArrowReaderOptions::default().with_page_index(true), @@ -96,7 +60,7 @@ where projection_mask: ProjectionMask, ) -> parquet::errors::Result>> { self.scan( - (Bound::Included(key.value()), Bound::Unbounded), + (Bound::Included(key.value()), Bound::Included(key.value())), key.ts(), Some(1), projection_mask, @@ -113,12 +77,13 @@ where ts: Timestamp, limit: Option, projection_mask: ProjectionMask, - ) -> Result, parquet::errors::ParquetError> { + ) -> Result, parquet::errors::ParquetError> { let builder = self .into_parquet_builder(limit, projection_mask.clone()) .await?; let schema_descriptor = builder.metadata().file_metadata().schema_descr(); + let full_schema = builder.schema().clone(); // Safety: filter's lifetime relies on range's lifetime, sstable must not live longer than // it @@ -127,89 +92,104 @@ where Ok(SsTableScan::new( builder.with_row_filter(filter).build()?, projection_mask, + full_schema, )) } } #[cfg(test)] pub(crate) mod tests { - use std::{borrow::Borrow, ops::Bound, path::PathBuf}; + use std::{borrow::Borrow, fs::File, ops::Bound, sync::Arc}; + use arrow::array::RecordBatch; + use fusio::{dynamic::DynFile, local::TokioFs, path::Path, DynFs}; + use fusio_parquet::writer::AsyncWriter; use futures_util::StreamExt; - use parquet::arrow::{arrow_to_parquet_schema, ProjectionMask}; + use parquet::{ + arrow::{ + arrow_to_parquet_schema, arrow_writer::ArrowWriterOptions, AsyncArrowWriter, + ProjectionMask, + }, + basic::{Compression, ZstdLevel}, + file::properties::WriterProperties, + }; use super::SsTable; use crate::{ executor::tokio::TokioExecutor, - fs::FileProvider, + fs::{default_open_options, manager::StoreManager}, record::Record, - tests::{get_test_record_batch, Test, TestRef}, + tests::{get_test_record_batch, Test}, timestamp::Timestamped, DbOption, }; - pub(crate) async 
fn open_sstable(path: &PathBuf) -> SsTable - where - R: Record, - FP: FileProvider, - { - SsTable::open(FP::open(path).await.unwrap()) - } - - #[tokio::test(flavor = "multi_thread", worker_threads = 2)] - async fn write_sstable() { - let temp_dir = tempfile::tempdir().unwrap(); - let record_batch = get_test_record_batch::( - DbOption::from(temp_dir.path()), - TokioExecutor::new(), + async fn write_record_batch( + file: Box, + record_batch: &RecordBatch, + ) -> Result<(), parquet::errors::ParquetError> { + // TODO: expose writer options + let options = ArrowWriterOptions::new().with_properties( + WriterProperties::builder() + .set_created_by(concat!("tonbo version ", env!("CARGO_PKG_VERSION")).to_owned()) + .set_compression(Compression::ZSTD(ZstdLevel::try_new(3).unwrap())) + .build(), + ); + let mut writer = AsyncArrowWriter::try_new_with_options( + AsyncWriter::new(file), + Test::arrow_schema().clone(), + options, ) - .await; - let table_path = temp_dir.path().join("write_test.parquet"); + .expect("Failed to create writer"); + writer.write(record_batch).await?; - open_sstable::(&table_path) - .await - .write(record_batch) - .await - .unwrap(); + if writer.in_progress_size() > (1 << 20) - 1 { + writer.flush().await?; + } - let key = Timestamped::new("hello".to_owned(), 1.into()); + writer.close().await?; + Ok(()) + } - assert_eq!( - open_sstable::(&table_path) - .await - .get(key.borrow(), ProjectionMask::all()) + pub(crate) async fn open_sstable(store: &Arc, path: &Path) -> SsTable + where + R: Record, + { + SsTable::open( + store + .open_options(path, default_open_options()) .await - .unwrap() - .unwrap() - .get(), - Some(TestRef { - vstring: "hello", - vu32: Some(12), - vbool: Some(true), - }) - ); + .unwrap(), + ) + .await + .unwrap() } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn projection_query() { let temp_dir = tempfile::tempdir().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let base_fs = manager.base_fs().clone(); let record_batch = get_test_record_batch::( - DbOption::from(temp_dir.path()), + DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()), TokioExecutor::new(), + manager, ) .await; let table_path = temp_dir.path().join("projection_query_test.parquet"); + let _ = File::create(&table_path).unwrap(); + let table_path = Path::from_filesystem_path(table_path).unwrap(); - open_sstable::(&table_path) - .await - .write(record_batch) + let file = base_fs + .open_options(&table_path, default_open_options()) .await .unwrap(); + write_record_batch(file, &record_batch).await.unwrap(); let key = Timestamped::new("hello".to_owned(), 1.into()); { - let test_ref_1 = open_sstable::(&table_path) + let test_ref_1 = open_sstable::(&base_fs, &table_path) .await .get( key.borrow(), @@ -226,7 +206,7 @@ pub(crate) mod tests { assert_eq!(test_ref_1.get().unwrap().vbool, None); } { - let test_ref_2 = open_sstable::(&table_path) + let test_ref_2 = open_sstable::(&base_fs, &table_path) .await .get( key.borrow(), @@ -243,7 +223,7 @@ pub(crate) mod tests { assert_eq!(test_ref_2.get().unwrap().vbool, Some(true)); } { - let test_ref_3 = open_sstable::(&table_path) + let test_ref_3 = open_sstable::(&base_fs, &table_path) .await .get( key.borrow(), @@ -264,21 +244,26 @@ pub(crate) mod tests { #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn projection_scan() { let temp_dir = tempfile::tempdir().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let base_fs = manager.base_fs().clone(); let 
record_batch = get_test_record_batch::( - DbOption::from(temp_dir.path()), + DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()), TokioExecutor::new(), + manager, ) .await; let table_path = temp_dir.path().join("projection_scan_test.parquet"); + let _ = File::create(&table_path).unwrap(); + let table_path = Path::from_filesystem_path(table_path).unwrap(); - open_sstable::(&table_path) - .await - .write(record_batch) + let file = base_fs + .open_options(&table_path, default_open_options()) .await .unwrap(); + write_record_batch(file, &record_batch).await.unwrap(); { - let mut test_ref_1 = open_sstable::(&table_path) + let mut test_ref_1 = open_sstable::(&base_fs, &table_path) .await .scan( (Bound::Unbounded, Bound::Unbounded), @@ -303,7 +288,7 @@ pub(crate) mod tests { assert_eq!(entry_1.get().unwrap().vbool, None); } { - let mut test_ref_2 = open_sstable::(&table_path) + let mut test_ref_2 = open_sstable::(&base_fs, &table_path) .await .scan( (Bound::Unbounded, Bound::Unbounded), @@ -328,7 +313,7 @@ pub(crate) mod tests { assert_eq!(entry_1.get().unwrap().vbool, None); } { - let mut test_ref_3 = open_sstable::(&table_path) + let mut test_ref_3 = open_sstable::(&base_fs, &table_path) .await .scan( (Bound::Unbounded, Bound::Unbounded), diff --git a/src/option.rs b/src/option.rs index afd132b..e10d825 100644 --- a/src/option.rs +++ b/src/option.rs @@ -1,21 +1,32 @@ -use std::{marker::PhantomData, path::PathBuf}; +use std::{ + fmt::{Debug, Formatter}, + marker::PhantomData, + sync::Arc, +}; +use fusio::{path::Path, DynFs}; use parquet::{ basic::Compression, file::properties::{EnabledStatistics, WriterProperties}, + format::SortingColumn, + schema::types::ColumnPath, }; use crate::{ - fs::{FileId, FileProvider, FileType}, + fs::{FileId, FileType}, record::Record, trigger::TriggerType, - version::Version, + version::{Version, MAX_LEVEL}, + DbError, }; /// configure the operating parameters of each component in the [`DB`](crate::DB) -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct DbOption { pub(crate) clean_channel_buffer: usize, + pub(crate) base_path: Path, + // TODO: DEBUG + pub(crate) level_paths: Vec)>>, pub(crate) immutable_chunk_num: usize, pub(crate) immutable_chunk_max_num: usize, pub(crate) level_sst_magnification: usize, @@ -23,7 +34,6 @@ pub struct DbOption { pub(crate) major_l_selection_table_max_num: usize, pub(crate) major_threshold_with_sst_size: usize, pub(crate) max_sst_file_size: usize, - pub(crate) path: PathBuf, pub(crate) version_log_snapshot_threshold: u32, pub(crate) trigger_type: TriggerType, pub(crate) use_wal: bool, @@ -31,22 +41,70 @@ pub struct DbOption { _p: PhantomData, } -impl From
for DbOption +impl DbOption +where + R: Record, +{ + /// build the default configured [`DbOption`] with base path and primary key + pub fn with_path(base_path: Path, primary_key_name: String, primary_key_index: usize) -> Self { + let (column_paths, sorting_columns) = + Self::primary_key_path(primary_key_name, primary_key_index); + + DbOption { + immutable_chunk_num: 3, + immutable_chunk_max_num: 5, + major_threshold_with_sst_size: 4, + level_sst_magnification: 10, + max_sst_file_size: 256 * 1024 * 1024, + clean_channel_buffer: 10, + base_path, + write_parquet_properties: WriterProperties::builder() + .set_compression(Compression::LZ4) + .set_column_statistics_enabled(column_paths.clone(), EnabledStatistics::Page) + .set_column_bloom_filter_enabled(column_paths.clone(), true) + .set_sorting_columns(Some(sorting_columns)) + .set_created_by(concat!("tonbo version ", env!("CARGO_PKG_VERSION")).to_owned()) + .build(), + + use_wal: true, + major_default_oldest_table_num: 3, + major_l_selection_table_max_num: 4, + trigger_type: TriggerType::SizeOfMem(64 * 1024 * 1024), + _p: Default::default(), + version_log_snapshot_threshold: 200, + level_paths: vec![None; MAX_LEVEL], + } + } + + fn primary_key_path( + primary_key_name: String, + primary_key_index: usize, + ) -> (ColumnPath, Vec) { + ( + ColumnPath::new(vec!["_ts".to_string(), primary_key_name]), + vec![ + SortingColumn::new(1_i32, true, true), + SortingColumn::new(primary_key_index as i32, false, true), + ], + ) + } +} + +impl From for DbOption where - P: Into, R: Record, { /// build the default configured [`DbOption`] based on the passed path - fn from(path: P) -> Self { + fn from(base_path: Path) -> Self { let (column_paths, sorting_columns) = R::primary_key_path(); DbOption { - path: path.into(), immutable_chunk_num: 3, immutable_chunk_max_num: 5, major_threshold_with_sst_size: 4, level_sst_magnification: 10, max_sst_file_size: 256 * 1024 * 1024, clean_channel_buffer: 10, + base_path, write_parquet_properties: WriterProperties::builder() .set_compression(Compression::LZ4) .set_column_statistics_enabled(column_paths.clone(), EnabledStatistics::Page) @@ -61,6 +119,7 @@ where trigger_type: TriggerType::SizeOfMem(64 * 1024 * 1024), _p: Default::default(), version_log_snapshot_threshold: 200, + level_paths: vec![None; MAX_LEVEL], } } } @@ -70,9 +129,9 @@ where R: Record, { /// build the [`DB`](crate::DB) storage directory based on the passed path - pub fn path(self, path: impl Into) -> Self { + pub fn path(self, path: impl Into) -> Self { DbOption { - path: path.into(), + base_path: path.into(), ..self } } @@ -152,43 +211,88 @@ where ..self } } + + pub fn level_path( + mut self, + level: usize, + path: Path, + store: Arc, + ) -> Result> { + if level >= MAX_LEVEL { + Err(DbError::ExceedsMaxLevel)?; + } + self.level_paths[level] = Some((path, store)); + Ok(self) + } } impl DbOption where R: Record, { - pub(crate) fn table_path(&self, gen: &FileId) -> PathBuf { - self.path.join(format!("{}.{}", gen, FileType::Parquet)) + pub(crate) fn table_path(&self, gen: &FileId) -> Path { + self.base_path + .child(format!("{}.{}", gen, FileType::Parquet)) } - pub(crate) fn wal_dir_path(&self) -> PathBuf { - self.path.join("wal") + pub(crate) fn wal_dir_path(&self) -> Path { + self.base_path.child("wal") } - pub(crate) fn wal_path(&self, gen: &FileId) -> PathBuf { + pub(crate) fn wal_path(&self, gen: &FileId) -> Path { self.wal_dir_path() - .join(format!("{}.{}", gen, FileType::Wal)) + .child(format!("{}.{}", gen, FileType::Wal)) } - pub(crate) fn 
version_log_dir_path(&self) -> PathBuf { - self.path.join("version") + pub(crate) fn version_log_dir_path(&self) -> Path { + self.base_path.child("version") } - pub(crate) fn version_log_path(&self, gen: &FileId) -> PathBuf { + pub(crate) fn version_log_path(&self, gen: &FileId) -> Path { self.version_log_dir_path() - .join(format!("{}.{}", gen, FileType::Log)) + .child(format!("{}.{}", gen, FileType::Log)) } - pub(crate) fn is_threshold_exceeded_major( - &self, - version: &Version, - level: usize, - ) -> bool - where - E: FileProvider, - { - Version::::tables_len(version, level) + pub(crate) fn level_fs_path(&self, level: usize) -> Option<&Path> { + self.level_paths[level].as_ref().map(|(path, _)| path) + } + + pub(crate) fn is_threshold_exceeded_major(&self, version: &Version, level: usize) -> bool { + Version::::tables_len(version, level) >= (self.major_threshold_with_sst_size * self.level_sst_magnification.pow(level as u32)) } } + +impl Debug for DbOption { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DbOption") + .field("clean_channel_buffer", &self.clean_channel_buffer) + .field("base_path", &self.base_path) + // TODO + // .field("level_paths", &self.level_paths) + .field("immutable_chunk_num", &self.immutable_chunk_num) + .field("immutable_chunk_max_num", &self.immutable_chunk_max_num) + .field("level_sst_magnification", &self.level_sst_magnification) + .field( + "major_default_oldest_table_num", + &self.major_default_oldest_table_num, + ) + .field( + "major_l_selection_table_max_num", + &self.major_l_selection_table_max_num, + ) + .field( + "major_threshold_with_sst_size", + &self.major_threshold_with_sst_size, + ) + .field("max_sst_file_size", &self.max_sst_file_size) + .field( + "version_log_snapshot_threshold", + &self.version_log_snapshot_threshold, + ) + .field("trigger_type", &self.trigger_type) + .field("use_wal", &self.use_wal) + .field("write_parquet_properties", &self.write_parquet_properties) + .finish() + } +} diff --git a/src/record/key/mod.rs b/src/record/key/mod.rs index 8b08a90..e69051f 100644 --- a/src/record/key/mod.rs +++ b/src/record/key/mod.rs @@ -1,7 +1,7 @@ mod num; mod str; -use std::hash::Hash; +use std::{hash::Hash, sync::Arc}; use arrow::array::Datum; @@ -16,7 +16,7 @@ pub trait Key: fn as_key_ref(&self) -> Self::Ref<'_>; - fn to_arrow_datum(&self) -> impl Datum; + fn to_arrow_datum(&self) -> Arc; } pub trait KeyRef<'r>: Clone + Encode + Send + Sync + Ord + std::fmt::Debug { diff --git a/src/record/key/num.rs b/src/record/key/num.rs index 83300a5..8f75405 100644 --- a/src/record/key/num.rs +++ b/src/record/key/num.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow::array::{ Datum, Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array, UInt8Array, @@ -15,8 +17,8 @@ macro_rules! 
implement_key { *self } - fn to_arrow_datum(&self) -> impl Datum { - $array_name::new_scalar(*self) + fn to_arrow_datum(&self) -> Arc { + Arc::new($array_name::new_scalar(*self)) } } diff --git a/src/record/key/str.rs b/src/record/key/str.rs index b96c33a..dc86c33 100644 --- a/src/record/key/str.rs +++ b/src/record/key/str.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow::array::{Datum, StringArray}; use super::{Key, KeyRef}; @@ -9,8 +11,8 @@ impl Key for String { self } - fn to_arrow_datum(&self) -> impl Datum { - StringArray::new_scalar(self) + fn to_arrow_datum(&self) -> Arc { + Arc::new(StringArray::new_scalar(self)) } } diff --git a/src/record/mod.rs b/src/record/mod.rs index 1651167..9df59ec 100644 --- a/src/record/mod.rs +++ b/src/record/mod.rs @@ -1,5 +1,6 @@ pub mod internal; mod key; +pub mod runtime; #[cfg(test)] mod str; @@ -9,6 +10,7 @@ use arrow::{array::RecordBatch, datatypes::Schema}; use internal::InternalRecordRef; pub use key::{Key, KeyRef}; use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath}; +pub use runtime::*; use thiserror::Error; use crate::{ @@ -16,6 +18,35 @@ use crate::{ serdes::{Decode, Encode}, }; +#[allow(unused)] +pub(crate) enum RecordInstance { + Normal, + Runtime(DynRecord), +} + +#[allow(unused)] +impl RecordInstance { + pub(crate) fn primary_key_index(&self) -> usize + where + R: Record, + { + match self { + RecordInstance::Normal => R::primary_key_index(), + RecordInstance::Runtime(record) => record.primary_key_index(), + } + } + + pub(crate) fn arrow_schema(&self) -> Arc + where + R: Record, + { + match self { + RecordInstance::Normal => R::arrow_schema().clone(), + RecordInstance::Runtime(record) => record.arrow_schema(), + } + } +} + pub trait Record: 'static + Sized + Decode + Debug + Send + Sync { type Columns: ArrowArrays; @@ -51,6 +82,7 @@ pub trait RecordRef<'r>: Clone + Sized + Encode + Send + Sync { record_batch: &'r RecordBatch, offset: usize, projection_mask: &'r ProjectionMask, + full_schema: &'r Arc, ) -> InternalRecordRef<'r, Self>; } @@ -63,6 +95,8 @@ pub enum RecordEncodeError { }, #[error("record io error: {0}")] Io(#[from] io::Error), + #[error("record fusio error: {0}")] + Fusio(#[from] fusio::Error), } #[derive(Debug, Error)] @@ -74,4 +108,6 @@ pub enum RecordDecodeError { }, #[error("record io error: {0}")] Io(#[from] io::Error), + #[error("record fusio error: {0}")] + Fusio(#[from] fusio::Error), } diff --git a/src/record/runtime/array.rs b/src/record/runtime/array.rs new file mode 100644 index 0000000..4bb3772 --- /dev/null +++ b/src/record/runtime/array.rs @@ -0,0 +1,373 @@ +use std::{any::Any, sync::Arc}; + +use arrow::{ + array::{ + Array, ArrayBuilder, ArrayRef, BooleanArray, BooleanBufferBuilder, PrimitiveBuilder, + UInt32Builder, + }, + datatypes::{Int16Type, Int32Type, Int8Type, Schema}, +}; + +use super::{column::Column, record::DynRecord, record_ref::DynRecordRef, Datatype}; +use crate::{ + inmem::immutable::{ArrowArrays, Builder}, + record::{Key, Record}, + timestamp::Timestamped, +}; + +#[allow(unused)] +pub struct DynRecordImmutableArrays { + _null: Arc, + _ts: Arc, + columns: Vec, + record_batch: arrow::record_batch::RecordBatch, +} + +impl ArrowArrays for DynRecordImmutableArrays { + type Record = DynRecord; + + type Builder = DynRecordBuilder; + + fn builder(schema: &Arc, capacity: usize) -> Self::Builder { + let mut builders: Vec> = vec![]; + let mut datatypes = vec![]; + for field in schema.fields().iter().skip(2) { + let datatype = Datatype::from(field.data_type()); + match 
datatype { + Datatype::Int8 => { + builders.push(Box::new(PrimitiveBuilder::::with_capacity( + capacity, + ))); + } + Datatype::Int16 => { + builders.push(Box::new(PrimitiveBuilder::::with_capacity( + capacity, + ))); + } + Datatype::Int32 => { + builders.push(Box::new(PrimitiveBuilder::::with_capacity( + capacity, + ))); + } + } + datatypes.push(datatype); + } + DynRecordBuilder { + builders, + datatypes, + _null: arrow::array::BooleanBufferBuilder::new(capacity), + _ts: arrow::array::UInt32Builder::with_capacity(capacity), + schema: schema.clone(), + } + } + + fn get( + &self, + offset: u32, + projection_mask: &parquet::arrow::ProjectionMask, + ) -> Option::Ref<'_>>> { + let offset = offset as usize; + + if offset >= Array::len(self._null.as_ref()) { + return None; + } + if self._null.value(offset) { + return Some(None); + } + + let mut columns = vec![]; + for (idx, col) in self.columns.iter().enumerate() { + if projection_mask.leaf_included(idx + 2) && !col.is_nullable { + let datatype = col.datatype; + let name = col.name.to_string(); + let value = match datatype { + Datatype::Int8 => { + Arc::new(col.value.as_ref().downcast_ref::().copied()) as Arc + } + Datatype::Int16 => { + Arc::new(col.value.as_ref().downcast_ref::().copied()) as Arc + } + Datatype::Int32 => { + Arc::new(col.value.as_ref().downcast_ref::().copied()) as Arc + } + }; + columns.push(Column { + datatype, + name, + value, + is_nullable: true, + }); + } + + columns.push(col.clone()); + } + Some(Some(DynRecordRef::new(columns, 2))) + } + + fn as_record_batch(&self) -> &arrow::array::RecordBatch { + &self.record_batch + } +} + +pub struct DynRecordBuilder { + builders: Vec>, + datatypes: Vec, + _null: BooleanBufferBuilder, + _ts: UInt32Builder, + schema: Arc, +} + +impl Builder for DynRecordBuilder { + fn push( + &mut self, + key: Timestamped<<::Key as Key>::Ref<'_>>, + row: Option, + ) { + self._null.append(row.is_none()); + self._ts.append_value(key.ts.into()); + let metadata = self.schema.metadata(); + let primary_key_index = metadata + .get("primary_key_index") + .unwrap() + .parse::() + .unwrap(); + self.push_primary_key(key, primary_key_index); + match row { + Some(record_ref) => { + for (idx, (builder, col)) in self + .builders + .iter_mut() + .zip(record_ref.columns.iter()) + .enumerate() + { + if idx == primary_key_index { + continue; + } + let datatype = col.datatype; + match datatype { + Datatype::Int8 => { + let bd = builder + .as_any_mut() + .downcast_mut::>() + .unwrap(); + + let value = col.value.as_ref().downcast_ref::>().unwrap(); + match value { + Some(value) => bd.append_value(*value), + None => bd.append_null(), + } + } + Datatype::Int16 => { + let bd = builder + .as_any_mut() + .downcast_mut::>() + .unwrap(); + let value = col.value.as_ref().downcast_ref::>().unwrap(); + match value { + Some(value) => bd.append_value(*value), + None => bd.append_null(), + } + } + Datatype::Int32 => { + let bd = builder + .as_any_mut() + .downcast_mut::>() + .unwrap(); + let value = col.value.as_ref().downcast_ref::>().unwrap(); + match value { + Some(value) => bd.append_value(*value), + None => bd.append_null(), + } + } + } + } + } + None => { + for (idx, (builder, datatype)) in self + .builders + .iter_mut() + .zip(self.datatypes.iter_mut()) + .enumerate() + { + if idx == primary_key_index { + continue; + } + match datatype { + Datatype::Int8 => { + builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .append_value(i8::default()); + } + Datatype::Int16 => { + builder + .as_any_mut() + .downcast_mut::>() + 
.unwrap() + .append_value(i16::default()); + } + Datatype::Int32 => { + builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .append_value(i32::default()); + } + } + } + } + } + } + + fn written_size(&self) -> usize { + let size = self._null.as_slice().len() + std::mem::size_of_val(self._ts.values_slice()); + self.builders + .iter() + .zip(self.datatypes.iter()) + .fold(size, |acc, (builder, datatype)| { + acc + match datatype { + Datatype::Int8 => std::mem::size_of_val( + builder + .as_any() + .downcast_ref::>() + .unwrap() + .values_slice(), + ), + Datatype::Int16 => std::mem::size_of_val( + builder + .as_any() + .downcast_ref::>() + .unwrap() + .values_slice(), + ), + Datatype::Int32 => std::mem::size_of_val( + builder + .as_any() + .downcast_ref::>() + .unwrap() + .values_slice(), + ), + } + }) + } + + fn finish(&mut self, indices: Option<&[usize]>) -> DynRecordImmutableArrays { + let mut columns = vec![]; + let _null = Arc::new(BooleanArray::new(self._null.finish(), None)); + let _ts = Arc::new(self._ts.finish()); + + let mut array_refs = vec![Arc::clone(&_null) as ArrayRef, Arc::clone(&_ts) as ArrayRef]; + for (idx, (builder, datatype)) in self + .builders + .iter_mut() + .zip(self.datatypes.iter()) + .enumerate() + { + let field = self.schema.field(idx + 2); + let is_nullable = field.is_nullable(); + match datatype { + Datatype::Int8 => { + let value = Arc::new( + builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .finish(), + ); + columns.push(Column { + datatype: Datatype::Int8, + name: field.name().to_owned(), + value: value.clone(), + is_nullable, + }); + array_refs.push(value); + } + Datatype::Int16 => { + let value = Arc::new( + builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .finish(), + ); + columns.push(Column { + datatype: Datatype::Int16, + name: field.name().to_owned(), + value: value.clone(), + is_nullable, + }); + array_refs.push(value); + } + Datatype::Int32 => { + let value = Arc::new( + builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .finish(), + ); + columns.push(Column { + datatype: Datatype::Int32, + name: field.name().to_owned(), + value: value.clone(), + is_nullable, + }); + array_refs.push(value); + } + }; + } + + let mut record_batch = + arrow::record_batch::RecordBatch::try_new(self.schema.clone(), array_refs) + .expect("create record batch must be successful"); + if let Some(indices) = indices { + record_batch = record_batch + .project(indices) + .expect("projection indices must be successful"); + } + + DynRecordImmutableArrays { + _null, + _ts, + columns, + record_batch, + } + } +} + +impl DynRecordBuilder { + fn push_primary_key( + &mut self, + key: Timestamped<<::Key as Key>::Ref<'_>>, + primary_key_index: usize, + ) { + let builder = self.builders.get_mut(primary_key_index).unwrap(); + let datatype = self.datatypes.get_mut(primary_key_index).unwrap(); + let col = key.value; + // *col.value.as_ref().downcast_ref::().unwrap() + match datatype { + Datatype::Int8 => builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .append_value(*col.value.as_ref().downcast_ref::().unwrap()), + Datatype::Int16 => builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .append_value(*col.value.as_ref().downcast_ref::().unwrap()), + Datatype::Int32 => builder + .as_any_mut() + .downcast_mut::>() + .unwrap() + .append_value(*col.value.as_ref().downcast_ref::().unwrap()), + }; + } +} + +unsafe impl Send for DynRecordBuilder {} +unsafe impl Sync for DynRecordBuilder {} + +unsafe impl Send for DynRecordImmutableArrays {} +unsafe 
impl Sync for DynRecordImmutableArrays {} diff --git a/src/record/runtime/column.rs b/src/record/runtime/column.rs new file mode 100644 index 0000000..afca02f --- /dev/null +++ b/src/record/runtime/column.rs @@ -0,0 +1,386 @@ +use std::{any::Any, fmt::Debug, hash::Hash, sync::Arc}; + +use arrow::{ + array::{Int16Array, Int32Array, Int8Array}, + datatypes::{DataType, Field}, +}; +use fusio::{Read, Write}; + +use super::Datatype; +use crate::{ + record::{Key, KeyRef}, + serdes::{option::DecodeError, Decode, Encode}, +}; + +#[derive(Debug, Clone)] +pub struct ColumnDesc { + pub datatype: Datatype, + pub is_nullable: bool, + pub name: String, +} + +impl ColumnDesc { + pub fn new(name: String, datatype: Datatype, is_nullable: bool) -> Self { + Self { + name, + datatype, + is_nullable, + } + } +} + +#[derive(Clone)] +pub struct Column { + pub datatype: Datatype, + pub value: Arc, + pub is_nullable: bool, + pub name: String, +} + +unsafe impl Send for Column {} +unsafe impl Sync for Column {} + +impl Column { + pub fn new(datatype: Datatype, name: String, value: Arc, is_nullable: bool) -> Self { + Self { + datatype, + name, + value, + is_nullable, + } + } + + pub fn with_none_value(datatype: Datatype, name: String, is_nullable: bool) -> Self { + match datatype { + Datatype::Int8 => Self::new(datatype, name, Arc::>::new(None), is_nullable), + Datatype::Int16 => { + Self::new(datatype, name, Arc::>::new(None), is_nullable) + } + Datatype::Int32 => { + Self::new(datatype, name, Arc::>::new(None), is_nullable) + } + } + } +} + +impl Ord for Column { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + match self.datatype { + Datatype::Int8 => self + .value + .downcast_ref::() + .cmp(&other.value.downcast_ref::()), + Datatype::Int16 => self + .value + .downcast_ref::() + .cmp(&other.value.downcast_ref::()), + Datatype::Int32 => self + .value + .downcast_ref::() + .cmp(&other.value.downcast_ref::()), + } + } +} + +impl Eq for Column {} + +impl PartialOrd for Column { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for Column { + fn eq(&self, other: &Self) -> bool { + self.datatype == other.datatype + && self.is_nullable == other.is_nullable + && match self.datatype { + Datatype::Int8 => self + .value + .downcast_ref::() + .eq(&other.value.downcast_ref::()), + Datatype::Int16 => self + .value + .downcast_ref::() + .eq(&other.value.downcast_ref::()), + Datatype::Int32 => self + .value + .downcast_ref::() + .eq(&other.value.downcast_ref::()), + } + } +} + +impl Debug for Column { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let mut debug_struct = f.debug_struct("Column"); + match self.datatype { + Datatype::Int8 => { + debug_struct.field("datatype", &"i8".to_string()); + if let Some(value) = self.value.as_ref().downcast_ref::() { + debug_struct.field("value", value); + } else { + debug_struct.field( + "value", + self.value.as_ref().downcast_ref::>().unwrap(), + ); + } + } + Datatype::Int16 => { + debug_struct.field("datatype", &"i16".to_string()); + if let Some(value) = self.value.as_ref().downcast_ref::() { + debug_struct.field("value", value); + } else { + debug_struct.field( + "value", + self.value.as_ref().downcast_ref::>().unwrap(), + ); + } + } + Datatype::Int32 => { + debug_struct.field("datatype", &"i32".to_string()); + if let Some(value) = self.value.as_ref().downcast_ref::() { + debug_struct.field("value", value); + } else { + debug_struct.field( + "value", + self.value.as_ref().downcast_ref::>().unwrap(), + ); + } 
+ } + } + debug_struct.field("nullable", &self.is_nullable).finish() + } +} + +impl Hash for Column { + fn hash(&self, state: &mut H) { + match self.datatype { + Datatype::Int8 => self.value.downcast_ref::().hash(state), + Datatype::Int16 => self.value.downcast_ref::().hash(state), + Datatype::Int32 => self.value.downcast_ref::().hash(state), + } + } +} + +impl Key for Column { + type Ref<'a> = Column; + + fn as_key_ref(&self) -> Self::Ref<'_> { + self.clone() + } + + fn to_arrow_datum(&self) -> Arc { + match self.datatype { + Datatype::Int8 => Arc::new(Int8Array::new_scalar( + *self + .value + .as_ref() + .downcast_ref::() + .expect("unexpected datatype, expected: i8"), + )), + Datatype::Int16 => Arc::new(Int16Array::new_scalar( + *self + .value + .as_ref() + .downcast_ref::() + .expect("unexpected datatype, expected: i16"), + )), + Datatype::Int32 => Arc::new(Int32Array::new_scalar( + *self + .value + .as_ref() + .downcast_ref::() + .expect("unexpected datatype, expected: i32"), + )), + } + } +} + +impl<'r> KeyRef<'r> for Column { + type Key = Column; + + fn to_key(self) -> Self::Key { + self + } +} + +impl Decode for Column { + type Error = fusio::Error; + + async fn decode(reader: &mut R) -> Result + where + R: Read + Unpin, + { + let tag = u8::decode(reader).await?; + let datatype = Self::tag_to_datatype(tag); + let is_nullable = bool::decode(reader).await?; + let is_some = !bool::decode(reader).await?; + let value = + match datatype { + Datatype::Int8 => match is_some { + true => Arc::new(Option::::decode(reader).await.map_err( + |err| match err { + DecodeError::Io(error) => fusio::Error::Io(error), + DecodeError::Fusio(error) => error, + DecodeError::Inner(error) => fusio::Error::Other(Box::new(error)), + }, + )?) as Arc, + false => Arc::new(i8::decode(reader).await?) as Arc, + }, + Datatype::Int16 => match is_some { + true => Arc::new(Option::::decode(reader).await.map_err( + |err| match err { + DecodeError::Io(error) => fusio::Error::Io(error), + DecodeError::Fusio(error) => error, + DecodeError::Inner(error) => fusio::Error::Other(Box::new(error)), + }, + )?) as Arc, + false => Arc::new(i16::decode(reader).await?) as Arc, + }, + Datatype::Int32 => match is_some { + true => Arc::new(Option::::decode(reader).await.map_err( + |err| match err { + DecodeError::Io(error) => fusio::Error::Io(error), + DecodeError::Fusio(error) => error, + DecodeError::Inner(error) => fusio::Error::Other(Box::new(error)), + }, + )?) as Arc, + false => Arc::new(i32::decode(reader).await?) as Arc, + }, + }; + Ok(Column { + datatype, + is_nullable, + name: "".to_owned(), + value, + }) + } +} + +impl Encode for Column { + type Error = fusio::Error; + + async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> + where + W: Write + Unpin + Send, + { + Self::tag(self.datatype).encode(writer).await?; + self.is_nullable.encode(writer).await?; + match self.datatype { + Datatype::Int8 => { + if let Some(value) = self.value.as_ref().downcast_ref::() { + true.encode(writer).await?; + value.encode(writer).await? + } else { + false.encode(writer).await?; + self.value + .as_ref() + .downcast_ref::>() + .unwrap() + .encode(writer) + .await + .map_err(|err| fusio::Error::Other(Box::new(err)))?; + } + } + Datatype::Int16 => { + if let Some(value) = self.value.as_ref().downcast_ref::() { + true.encode(writer).await?; + value.encode(writer).await? 
+ } else { + false.encode(writer).await?; + self.value + .as_ref() + .downcast_ref::>() + .unwrap() + .encode(writer) + .await + .map_err(|err| fusio::Error::Other(Box::new(err)))?; + } + } + Datatype::Int32 => { + if let Some(value) = self.value.as_ref().downcast_ref::() { + true.encode(writer).await?; + value.encode(writer).await? + } else { + false.encode(writer).await?; + self.value + .as_ref() + .downcast_ref::>() + .unwrap() + .encode(writer) + .await + .map_err(|err| fusio::Error::Other(Box::new(err)))?; + } + } + }; + Ok(()) + } + + fn size(&self) -> usize { + 3 + match self.datatype { + Datatype::Int8 => { + if let Some(value) = self.value.as_ref().downcast_ref::() { + value.size() + } else { + self.value + .as_ref() + .downcast_ref::>() + .unwrap() + .size() + } + } + Datatype::Int16 => { + if let Some(value) = self.value.as_ref().downcast_ref::() { + value.size() + } else { + self.value + .as_ref() + .downcast_ref::>() + .unwrap() + .size() + } + } + Datatype::Int32 => { + if let Some(value) = self.value.as_ref().downcast_ref::() { + value.size() + } else { + self.value + .as_ref() + .downcast_ref::>() + .unwrap() + .size() + } + } + } + } +} + +impl Column { + fn tag(datatype: Datatype) -> u8 { + match datatype { + Datatype::Int8 => 0, + Datatype::Int16 => 1, + Datatype::Int32 => 2, + } + } + + fn tag_to_datatype(tag: u8) -> Datatype { + match tag { + 0 => Datatype::Int8, + 1 => Datatype::Int16, + 2 => Datatype::Int32, + _ => panic!("invalid datatype tag"), + } + } +} + +impl From<&Column> for Field { + fn from(col: &Column) -> Self { + match col.datatype { + Datatype::Int8 => Field::new(&col.name, DataType::Int8, col.is_nullable), + Datatype::Int16 => Field::new(&col.name, DataType::Int16, col.is_nullable), + Datatype::Int32 => Field::new(&col.name, DataType::Int32, col.is_nullable), + } + } +} diff --git a/src/record/runtime/mod.rs b/src/record/runtime/mod.rs new file mode 100644 index 0000000..7fadcb7 --- /dev/null +++ b/src/record/runtime/mod.rs @@ -0,0 +1,29 @@ +mod array; +mod column; +mod record; +mod record_ref; + +use arrow::datatypes::DataType; +pub use column::*; +pub use record::*; +pub use record_ref::*; + +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub enum Datatype { + Int8, + Int16, + Int32, + // String, +} + +impl From<&DataType> for Datatype { + fn from(datatype: &DataType) -> Self { + match datatype { + DataType::Int8 => Datatype::Int8, + DataType::Int16 => Datatype::Int16, + DataType::Int32 => Datatype::Int32, + // DataType::Utf8 => Datatype::String, + _ => todo!(), + } + } +} diff --git a/src/record/runtime/record.rs b/src/record/runtime/record.rs new file mode 100644 index 0000000..d52f8b2 --- /dev/null +++ b/src/record/runtime/record.rs @@ -0,0 +1,251 @@ +use std::{any::Any, collections::HashMap, sync::Arc}; + +use arrow::datatypes::{DataType, Field, Schema}; +use fusio::Read; +use parquet::{format::SortingColumn, schema::types::ColumnPath}; + +use super::{array::DynRecordImmutableArrays, Column, ColumnDesc, Datatype, DynRecordRef}; +use crate::{ + record::{Record, RecordDecodeError}, + serdes::{Decode, Encode}, +}; + +#[derive(Debug)] +pub struct DynRecord { + columns: Vec, + primary_index: usize, +} + +#[allow(unused)] +impl DynRecord { + pub fn new(columns: Vec, primary_index: usize) -> Self { + Self { + columns, + primary_index, + } + } + + pub(crate) fn primary_key_index(&self) -> usize { + self.primary_index + 2 + } + + pub(crate) fn arrow_schema(&self) -> Arc { + let mut fields = vec![ + Field::new("_null", 
DataType::Boolean, false), + Field::new("_ts", DataType::UInt32, false), + ]; + + for (idx, col) in self.columns.iter().enumerate() { + if idx == self.primary_index && col.is_nullable { + panic!("Primary key must not be nullable") + } + let mut field = Field::from(col); + fields.push(field); + } + let mut metadata = HashMap::new(); + metadata.insert( + "primary_key_index".to_string(), + self.primary_index.to_string(), + ); + Arc::new(Schema::new_with_metadata(fields, metadata)) + } +} + +impl DynRecord { + pub(crate) fn empty_record(column_descs: Vec, primary_index: usize) -> DynRecord { + let mut columns = vec![]; + for desc in column_descs.iter() { + match desc.datatype { + Datatype::Int8 => match desc.is_nullable { + true => columns.push(Column::new( + desc.datatype, + desc.name.to_owned(), + Arc::>::new(None), + desc.is_nullable, + )), + false => columns.push(Column::new( + desc.datatype, + desc.name.to_owned(), + Arc::new(0_i8), + desc.is_nullable, + )), + }, + Datatype::Int16 => match desc.is_nullable { + true => columns.push(Column::new( + desc.datatype, + desc.name.to_owned(), + Arc::>::new(None), + desc.is_nullable, + )), + false => columns.push(Column::new( + desc.datatype, + desc.name.to_owned(), + Arc::new(0_i16), + desc.is_nullable, + )), + }, + Datatype::Int32 => match desc.is_nullable { + true => columns.push(Column::new( + desc.datatype, + desc.name.to_owned(), + Arc::>::new(None), + desc.is_nullable, + )), + false => columns.push(Column::new( + desc.datatype, + desc.name.to_owned(), + Arc::new(0_i32), + desc.is_nullable, + )), + }, + } + } + + DynRecord::new(columns, primary_index) + } +} + +impl Decode for DynRecord { + type Error = RecordDecodeError; + + async fn decode(reader: &mut R) -> Result + where + R: Read + Unpin, + { + let len = u32::decode(reader).await? as usize; + let primary_index = u32::decode(reader).await? 
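+ // Log layout written by `DynRecordRef::encode`: column count (u32), primary key index
+ // (u32), then each `Column` (type tag, nullable flag, and value bytes).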
as usize; + let mut columns = vec![]; + // keep invariant for record: nullable --> Some(v); non-nullable --> v + for i in 0..len { + let mut col = Column::decode(reader).await?; + if i != primary_index && !col.is_nullable { + match col.datatype { + Datatype::Int8 => { + let value = col.value.as_ref().downcast_ref::>().unwrap(); + col.value = Arc::new(value.unwrap()); + } + Datatype::Int16 => { + let value = col.value.as_ref().downcast_ref::>().unwrap(); + col.value = Arc::new(value.unwrap()); + } + Datatype::Int32 => { + let value = col.value.as_ref().downcast_ref::>().unwrap(); + col.value = Arc::new(value.unwrap()); + } + } + } + columns.push(col); + } + + Ok(DynRecord { + columns, + primary_index, + }) + } +} + +impl Record for DynRecord { + type Columns = DynRecordImmutableArrays; + + type Key = Column; + + type Ref<'r> = DynRecordRef<'r>; + + fn primary_key_index() -> usize { + unreachable!("This method is not used.") + } + + fn primary_key_path() -> (ColumnPath, Vec) { + unreachable!("This method is not used.") + } + + fn as_record_ref(&self) -> Self::Ref<'_> { + let mut columns = vec![]; + for (idx, col) in self.columns.iter().enumerate() { + let datatype = col.datatype; + let is_nullable = col.is_nullable; + let mut value = col.value.clone(); + if idx != self.primary_index { + value = match datatype { + super::Datatype::Int8 if !is_nullable => { + let v = *col.value.as_ref().downcast_ref::().unwrap(); + Arc::new(Some(v)) as Arc + } + super::Datatype::Int16 if !is_nullable => { + let v = *col.value.as_ref().downcast_ref::().unwrap(); + Arc::new(Some(v)) as Arc + } + super::Datatype::Int32 if !is_nullable => { + let v = *col.value.as_ref().downcast_ref::().unwrap(); + Arc::new(Some(v)) as Arc + } + _ => col.value.clone() as Arc, + }; + } + + columns.push(Column::new( + datatype, + col.name.to_owned(), + value, + is_nullable, + )); + } + DynRecordRef::new(columns, self.primary_index) + } + + fn arrow_schema() -> &'static std::sync::Arc { + unreachable!("This method is not used.") + } + + fn size(&self) -> usize { + self.columns.iter().fold(0, |acc, col| acc + col.size()) + } +} + +unsafe impl Send for DynRecord {} +unsafe impl Sync for DynRecord {} + +#[cfg(test)] +pub(crate) mod test { + use std::sync::Arc; + + use super::DynRecord; + use crate::record::{Column, ColumnDesc, Datatype}; + + pub(crate) fn test_dyn_item_schema() -> (Vec, usize) { + let descs = vec![ + ColumnDesc::new("age".to_string(), Datatype::Int8, false), + ColumnDesc::new("height".to_string(), Datatype::Int16, true), + ColumnDesc::new("weight".to_string(), Datatype::Int32, false), + ]; + (descs, 0) + } + + pub(crate) fn test_dyn_items() -> Vec { + let mut items = vec![]; + for i in 0..50 { + let mut columns = vec![ + Column::new(Datatype::Int8, "age".to_string(), Arc::new(i as i8), false), + Column::new( + Datatype::Int16, + "height".to_string(), + Arc::new(Some(i as i16 * 20)), + true, + ), + Column::new( + Datatype::Int32, + "weight".to_string(), + Arc::new(i * 200_i32), + false, + ), + ]; + if i >= 45 { + columns[1].value = Arc::>::new(None); + } + + let record = DynRecord::new(columns, 0); + items.push(record); + } + items + } +} diff --git a/src/record/runtime/record_ref.rs b/src/record/runtime/record_ref.rs new file mode 100644 index 0000000..8467e6c --- /dev/null +++ b/src/record/runtime/record_ref.rs @@ -0,0 +1,172 @@ +use std::{any::Any, marker::PhantomData, sync::Arc}; + +use arrow::{ + array::{Array, AsArray}, + datatypes::Schema, +}; +use fusio::Write; + +use super::{Column, Datatype, DynRecord}; +use 
crate::{ + record::{internal::InternalRecordRef, Key, Record, RecordEncodeError, RecordRef}, + serdes::Encode, +}; + +#[derive(Clone)] +pub struct DynRecordRef<'r> { + pub columns: Vec, + // XXX: log encode should keep the same behavior + pub primary_index: usize, + _marker: PhantomData<&'r ()>, +} + +impl<'r> DynRecordRef<'r> { + pub(crate) fn new(columns: Vec, primary_index: usize) -> Self { + Self { + columns, + primary_index, + _marker: PhantomData, + } + } +} + +impl<'r> Encode for DynRecordRef<'r> { + type Error = RecordEncodeError; + + async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> + where + W: Write + Unpin + Send, + { + (self.columns.len() as u32).encode(writer).await?; + (self.primary_index as u32).encode(writer).await?; + for col in self.columns.iter() { + col.encode(writer).await.map_err(RecordEncodeError::Fusio)?; + } + Ok(()) + } + + fn size(&self) -> usize { + let mut size = 2 * size_of::(); + for col in self.columns.iter() { + size += col.size(); + } + size + } +} + +impl<'r> RecordRef<'r> for DynRecordRef<'r> { + type Record = DynRecord; + + fn key(self) -> <::Key as Key>::Ref<'r> { + self.columns + .get(self.primary_index) + .cloned() + .expect("The primary key must exist") + } + + fn from_record_batch( + record_batch: &'r arrow::array::RecordBatch, + offset: usize, + projection_mask: &'r parquet::arrow::ProjectionMask, + full_schema: &'r Arc, + ) -> InternalRecordRef<'r, Self> { + let null = record_batch.column(0).as_boolean().value(offset); + let metadata = full_schema.metadata(); + + let primary_index = metadata + .get("primary_key_index") + .unwrap() + .parse::() + .unwrap(); + let ts = record_batch + .column(1) + .as_primitive::() + .value(offset) + .into(); + + let mut columns = vec![]; + + for (idx, field) in full_schema.flattened_fields().iter().enumerate().skip(2) { + let datatype = Datatype::from(field.data_type()); + let schema = record_batch.schema(); + let flattened_fields = schema.flattened_fields(); + let batch_field = flattened_fields + .iter() + .enumerate() + .find(|(_idx, f)| field.contains(f)); + if batch_field.is_none() { + columns.push(Column::with_none_value( + datatype, + field.name().to_owned(), + field.is_nullable(), + )); + continue; + } + let col = record_batch.column(batch_field.unwrap().0); + let is_nullable = field.is_nullable(); + let value = match datatype { + Datatype::Int8 => { + let v = col.as_primitive::(); + + if primary_index == idx - 2 { + Arc::new(v.value(offset)) as Arc + } else { + let value = (!v.is_null(offset) && projection_mask.leaf_included(idx)) + .then_some(v.value(offset)); + Arc::new(value) as Arc + } + } + Datatype::Int16 => { + let v = col.as_primitive::(); + + if primary_index == idx - 2 { + Arc::new(v.value(offset)) as Arc + } else { + let value = (!v.is_null(offset) && projection_mask.leaf_included(idx)) + .then_some(v.value(offset)); + Arc::new(value) as Arc + } + } + Datatype::Int32 => { + let v = col.as_primitive::(); + + if primary_index == idx - 2 { + Arc::new(v.value(offset)) as Arc + } else { + let value = (!v.is_null(offset) && projection_mask.leaf_included(idx)) + .then_some(v.value(offset)); + Arc::new(value) as Arc + } + } + }; + columns.push(Column::new( + datatype, + field.name().to_owned(), + value, + is_nullable, + )); + } + + let record = DynRecordRef { + columns, + primary_index, + _marker: PhantomData, + }; + InternalRecordRef::new(ts, record, null) + } + + fn projection(&mut self, projection_mask: &parquet::arrow::ProjectionMask) { + for (idx, col) in 
self.columns.iter_mut().enumerate() { + if idx != self.primary_index && !projection_mask.leaf_included(idx + 2) { + match col.datatype { + Datatype::Int8 => col.value = Arc::>::new(None), + Datatype::Int16 => col.value = Arc::>::new(None), + Datatype::Int32 => col.value = Arc::>::new(None), + }; + } + } + } +} + +unsafe impl<'r> Send for DynRecordRef<'r> {} +unsafe impl<'r> Sync for DynRecordRef<'r> {} diff --git a/src/record/str.rs b/src/record/str.rs index 88b4f88..c981695 100644 --- a/src/record/str.rs +++ b/src/record/str.rs @@ -23,7 +23,8 @@ impl Record for String { type Key = Self; - type Ref<'r> = &'r str + type Ref<'r> + = &'r str where Self: 'r; @@ -79,6 +80,7 @@ impl<'r> RecordRef<'r> for &'r str { record_batch: &'r RecordBatch, offset: usize, _: &'r ProjectionMask, + _: &'r Arc, ) -> InternalRecordRef<'r, Self> { let ts = record_batch .column(1) @@ -106,7 +108,7 @@ impl ArrowArrays for StringColumns { type Builder = StringColumnsBuilder; - fn builder(capacity: usize) -> Self::Builder { + fn builder(_schema: &Arc, capacity: usize) -> Self::Builder { StringColumnsBuilder { _null: BooleanBufferBuilder::new(capacity), _ts: UInt32Builder::with_capacity(capacity), diff --git a/src/scope.rs b/src/scope.rs index f970d05..247af07 100644 --- a/src/scope.rs +++ b/src/scope.rs @@ -1,6 +1,6 @@ use std::ops::Bound; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use fusio::{Read, Write}; use crate::{ fs::FileId, @@ -44,19 +44,25 @@ where pub(crate) fn meets_range(&self, range: (Bound<&K>, Bound<&K>)) -> bool { let excluded_contains = |key| -> bool { &self.min < key && key < &self.max }; + let included_by = |min, max| -> bool { min <= &self.min && &self.max <= max }; match (range.0, range.1) { (Bound::Included(start), Bound::Included(end)) => { - self.contains(start) || self.contains(end) + self.contains(start) || self.contains(end) || included_by(start, end) } (Bound::Included(start), Bound::Excluded(end)) => { - start != end && (self.contains(start) || excluded_contains(end)) + start != end + && (self.contains(start) || excluded_contains(end) || included_by(start, end)) } (Bound::Excluded(start), Bound::Included(end)) => { - start != end && (excluded_contains(start) || self.contains(end)) + start != end + && (excluded_contains(start) || self.contains(end) || included_by(start, end)) } (Bound::Excluded(start), Bound::Excluded(end)) => { - start != end && (excluded_contains(start) || excluded_contains(end)) + start != end + && (excluded_contains(start) + || excluded_contains(end) + || included_by(start, end)) } (Bound::Included(start), Bound::Unbounded) => start <= &self.max, (Bound::Excluded(start), Bound::Unbounded) => start < &self.max, @@ -79,12 +85,13 @@ where async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { self.min.encode(writer).await?; self.max.encode(writer).await?; - writer.write_all(&self.gen.to_bytes()).await?; + let (result, _) = writer.write_all(&self.gen.to_bytes()[..]).await; + result?; match &self.wal_ids { None => { @@ -94,7 +101,8 @@ where 1u8.encode(writer).await?; (ids.len() as u32).encode(writer).await?; for id in ids { - writer.write_all(&id.to_bytes()).await?; + let (result, _) = writer.write_all(&id.to_bytes()[..]).await; + result?; } } } @@ -113,14 +121,15 @@ where { type Error = ::Error; - async fn decode(reader: &mut R) -> Result { + async fn decode(reader: &mut R) -> Result { + let mut buf = vec![0u8; 16]; let min = K::decode(reader).await?; let max = 
K::decode(reader).await?; let gen = { - let mut slice = [0; 16]; - reader.read_exact(&mut slice).await?; - FileId::from_bytes(slice) + buf = reader.read_exact(buf).await?; + // SAFETY + FileId::from_bytes(buf.as_slice().try_into().unwrap()) }; let wal_ids = match u8::decode(reader).await? { 0 => None, @@ -129,9 +138,9 @@ where let mut ids = Vec::with_capacity(len); for _ in 0..len { - let mut slice = [0; 16]; - reader.read_exact(&mut slice).await?; - ids.push(FileId::from_bytes(slice)); + buf = reader.read_exact(buf).await?; + // SAFETY + ids.push(FileId::from_bytes(buf.as_slice().try_into().unwrap())); } Some(ids) } @@ -188,8 +197,6 @@ mod test { assert!(!scope.meets_range((Bound::Included(&99), Bound::Excluded(&100)))); assert!(!scope.meets_range((Bound::Excluded(&99), Bound::Excluded(&100)))); - assert!(!scope.meets_range((Bound::Included(&99), Bound::Excluded(&201)))); - assert!(!scope.meets_range((Bound::Excluded(&99), Bound::Included(&201)))); } // test in range { @@ -210,7 +217,12 @@ mod test { assert!(scope.meets_range((Bound::Excluded(&99), Bound::Included(&100)))); assert!(scope.meets_range((Bound::Included(&150), Bound::Included(&150)))); assert!(scope.meets_range((Bound::Included(&100), Bound::Included(&200)))); - assert!(!scope.meets_range((Bound::Excluded(&99), Bound::Excluded(&201)))); + assert!(scope.meets_range((Bound::Included(&99), Bound::Included(&150)))); + assert!(scope.meets_range((Bound::Included(&99), Bound::Included(&201)))); + assert!(scope.meets_range((Bound::Included(&99), Bound::Excluded(&201)))); + assert!(scope.meets_range((Bound::Excluded(&99), Bound::Included(&201)))); + assert!(scope.meets_range((Bound::Excluded(&99), Bound::Excluded(&201)))); + assert!(scope.meets_range((Bound::Excluded(&100), Bound::Excluded(&200)))); } } } diff --git a/src/serdes/arc.rs b/src/serdes/arc.rs index 55b06ee..7cd94c4 100644 --- a/src/serdes/arc.rs +++ b/src/serdes/arc.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; +use fusio::{Read, Write}; use super::{Decode, Encode}; @@ -12,7 +12,7 @@ where async fn decode(reader: &mut R) -> Result where - R: AsyncRead + Unpin, + R: Read + Unpin, { Ok(Arc::from(T::decode(reader).await?)) } @@ -26,7 +26,7 @@ where async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { self.as_ref().encode(writer).await } @@ -35,3 +35,31 @@ where Encode::size(self.as_ref()) } } + +#[cfg(test)] +mod tests { + use std::{io::Cursor, sync::Arc}; + + use fusio::Seek; + + use crate::serdes::{Decode, Encode}; + + #[tokio::test] + async fn test_encode_decode() { + let source_0 = Arc::new(1u64); + let source_1 = Arc::new("Hello! 
Tonbo".to_string()); + + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + + source_0.encode(&mut cursor).await.unwrap(); + source_1.encode(&mut cursor).await.unwrap(); + + cursor.seek(0).await.unwrap(); + let decoded_0 = Arc::::decode(&mut cursor).await.unwrap(); + let decoded_1 = Arc::::decode(&mut cursor).await.unwrap(); + + assert_eq!(source_0, decoded_0); + assert_eq!(source_1, decoded_1); + } +} diff --git a/src/serdes/boolean.rs b/src/serdes/boolean.rs index c332307..5be3aa8 100644 --- a/src/serdes/boolean.rs +++ b/src/serdes/boolean.rs @@ -1,16 +1,14 @@ -use std::{io, mem::size_of}; +use std::mem::size_of; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use fusio::{Read, Write}; use crate::serdes::{Decode, Encode}; impl Encode for bool { - type Error = io::Error; + type Error = fusio::Error; - async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> { - writer - .write_all(&if *self { 1u8 } else { 0u8 }.to_le_bytes()) - .await + async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> { + if *self { 1u8 } else { 0u8 }.encode(writer).await } fn size(&self) -> usize { @@ -19,15 +17,41 @@ impl Encode for bool { } impl Decode for bool { - type Error = io::Error; + type Error = fusio::Error; - async fn decode(reader: &mut R) -> Result { - let buf = { - let mut buf = [0; size_of::()]; - reader.read_exact(&mut buf).await?; - buf - }; + async fn decode(reader: &mut R) -> Result { + Ok(u8::decode(reader).await? == 1u8) + } +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use fusio::Seek; + + use crate::serdes::{Decode, Encode}; + + #[tokio::test] + async fn test_encode_decode() { + let source_0 = true; + let source_1 = false; + let source_2 = true; + + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + + source_0.encode(&mut cursor).await.unwrap(); + source_1.encode(&mut cursor).await.unwrap(); + source_2.encode(&mut cursor).await.unwrap(); + + cursor.seek(0).await.unwrap(); + let decoded_0 = bool::decode(&mut cursor).await.unwrap(); + let decoded_1 = bool::decode(&mut cursor).await.unwrap(); + let decoded_2 = bool::decode(&mut cursor).await.unwrap(); - Ok(u8::from_le_bytes(buf) == 1u8) + assert_eq!(source_0, decoded_0); + assert_eq!(source_1, decoded_1); + assert_eq!(source_2, decoded_2); } } diff --git a/src/serdes/bytes.rs b/src/serdes/bytes.rs index 27bb4ee..7b0085d 100644 --- a/src/serdes/bytes.rs +++ b/src/serdes/bytes.rs @@ -1,15 +1,17 @@ -use std::io; - use bytes::Bytes; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use fusio::{IoBuf, Read, Write}; use crate::serdes::{Decode, Encode}; impl Encode for &[u8] { - type Error = io::Error; + type Error = fusio::Error; + + async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> { + (self.len() as u32).encode(writer).await?; + let (result, _) = writer.write_all(*self).await; + result?; - async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> { - writer.write_all(self).await + Ok(()) } fn size(&self) -> usize { @@ -18,10 +20,14 @@ impl Encode for &[u8] { } impl Encode for Bytes { - type Error = io::Error; + type Error = fusio::Error; - async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> { - writer.write_all(self).await + async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> { + (self.len() as u32).encode(writer).await?; + let (result, _) = writer.write_all(self.as_slice()).await; + result?; + + Ok(()) } fn size(&self) -> usize { @@ -30,12 +36,37 @@ impl Encode for Bytes { } 
impl Decode for Bytes { - type Error = io::Error; + type Error = fusio::Error; + + async fn decode(reader: &mut R) -> Result { + let len = u32::decode(reader).await?; + let buf = reader.read_exact(vec![0u8; len as usize]).await?; + + Ok(buf.as_bytes()) + } +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use bytes::Bytes; + use fusio::Seek; + + use crate::serdes::{Decode, Encode}; + + #[tokio::test] + async fn test_encode_decode() { + let source = Bytes::from_static(b"hello! Tonbo"); + + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + + source.encode(&mut cursor).await.unwrap(); - async fn decode(reader: &mut R) -> Result { - let mut buf = Vec::new(); - reader.read_exact(&mut buf).await?; + cursor.seek(0).await.unwrap(); + let decoded = Bytes::decode(&mut cursor).await.unwrap(); - Ok(Bytes::from(buf)) + assert_eq!(source, decoded); } } diff --git a/src/serdes/mod.rs b/src/serdes/mod.rs index 69b9dd3..a88790f 100644 --- a/src/serdes/mod.rs +++ b/src/serdes/mod.rs @@ -6,16 +6,16 @@ mod num; pub(crate) mod option; mod string; -use std::{future::Future, io}; +use std::future::Future; -use tokio::io::{AsyncRead, AsyncWrite}; +use fusio::{Read, Write}; pub trait Encode { - type Error: From + std::error::Error + Send + Sync + 'static; + type Error: From + std::error::Error + Send + Sync + 'static; fn encode(&self, writer: &mut W) -> impl Future> + Send where - W: AsyncWrite + Unpin + Send; + W: Write + Unpin + Send; fn size(&self) -> usize; } @@ -25,7 +25,7 @@ impl Encode for &T { async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { Encode::encode(*self, writer).await } @@ -36,16 +36,18 @@ impl Encode for &T { } pub trait Decode: Sized { - type Error: From + std::error::Error + Send + Sync + 'static; + type Error: From + std::error::Error + Send + Sync + 'static; fn decode(reader: &mut R) -> impl Future> where - R: AsyncRead + Unpin; + R: Read + Unpin; } #[cfg(test)] mod tests { - use tokio::io::AsyncWriteExt; + use std::io; + + use fusio::{Read, Seek}; use super::*; @@ -55,13 +57,14 @@ mod tests { struct TestStruct(u32); impl Encode for TestStruct { - type Error = io::Error; + type Error = fusio::Error; async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { - writer.write_u32(self.0).await?; + self.0.encode(writer).await?; + Ok(()) } @@ -71,25 +74,23 @@ mod tests { } impl Decode for TestStruct { - type Error = io::Error; + type Error = fusio::Error; async fn decode(reader: &mut R) -> Result where - R: AsyncRead + Unpin, + R: Read + Unpin, { - let value = tokio::io::AsyncReadExt::read_u32(reader).await?; - Ok(TestStruct(value)) + Ok(TestStruct(u32::decode(reader).await?)) } } // Test encoding and decoding let original = TestStruct(42); - let mut buffer = Vec::new(); - - original.encode(&mut buffer).await.unwrap(); - assert_eq!(buffer.len(), original.size()); + let mut buf = Vec::new(); + let mut cursor = io::Cursor::new(&mut buf); + original.encode(&mut cursor).await.unwrap(); - let mut cursor = std::io::Cursor::new(buffer); + cursor.seek(0).await.unwrap(); let decoded = TestStruct::decode(&mut cursor).await.unwrap(); assert_eq!(original.0, decoded.0); diff --git a/src/serdes/num.rs b/src/serdes/num.rs index 274b488..c240c3a 100644 --- a/src/serdes/num.rs +++ b/src/serdes/num.rs @@ -1,6 +1,6 @@ -use std::{io, mem::size_of}; +use std::mem::size_of; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, 
AsyncWriteExt}; +use fusio::{Read, Write}; use super::{Decode, Encode}; @@ -8,13 +8,13 @@ use super::{Decode, Encode}; macro_rules! implement_encode_decode { ($struct_name:ident) => { impl Encode for $struct_name { - type Error = io::Error; + type Error = fusio::Error; - async fn encode( - &self, - writer: &mut W, - ) -> Result<(), Self::Error> { - writer.write_all(&self.to_le_bytes()).await + async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> { + let (result, _) = writer.write_all(&self.to_le_bytes()[..]).await; + result?; + + Ok(()) } fn size(&self) -> usize { @@ -23,16 +23,13 @@ macro_rules! implement_encode_decode { } impl Decode for $struct_name { - type Error = io::Error; + type Error = fusio::Error; - async fn decode(reader: &mut R) -> Result { - let buf = { - let mut buf = [0; size_of::()]; - reader.read_exact(&mut buf).await?; - buf - }; + async fn decode(reader: &mut R) -> Result { + let mut bytes = [0u8; size_of::()]; + let _ = reader.read_exact(&mut bytes[..]).await?; - Ok(Self::from_le_bytes(buf)) + Ok(Self::from_le_bytes(bytes)) } } }; @@ -46,3 +43,55 @@ implement_encode_decode!(u8); implement_encode_decode!(u16); implement_encode_decode!(u32); implement_encode_decode!(u64); + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use fusio::Seek; + + use crate::serdes::{Decode, Encode}; + + #[tokio::test] + async fn test_encode_decode() { + let source_0 = 8u8; + let source_1 = 16u16; + let source_2 = 32u32; + let source_3 = 64u64; + let source_4 = 8i8; + let source_5 = 16i16; + let source_6 = 32i32; + let source_7 = 64i64; + + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + + source_0.encode(&mut cursor).await.unwrap(); + source_1.encode(&mut cursor).await.unwrap(); + source_2.encode(&mut cursor).await.unwrap(); + source_3.encode(&mut cursor).await.unwrap(); + source_4.encode(&mut cursor).await.unwrap(); + source_5.encode(&mut cursor).await.unwrap(); + source_6.encode(&mut cursor).await.unwrap(); + source_7.encode(&mut cursor).await.unwrap(); + + cursor.seek(0).await.unwrap(); + let decoded_0 = u8::decode(&mut cursor).await.unwrap(); + let decoded_1 = u16::decode(&mut cursor).await.unwrap(); + let decoded_2 = u32::decode(&mut cursor).await.unwrap(); + let decoded_3 = u64::decode(&mut cursor).await.unwrap(); + let decoded_4 = i8::decode(&mut cursor).await.unwrap(); + let decoded_5 = i16::decode(&mut cursor).await.unwrap(); + let decoded_6 = i32::decode(&mut cursor).await.unwrap(); + let decoded_7 = i64::decode(&mut cursor).await.unwrap(); + + assert_eq!(source_0, decoded_0); + assert_eq!(source_1, decoded_1); + assert_eq!(source_2, decoded_2); + assert_eq!(source_3, decoded_3); + assert_eq!(source_4, decoded_4); + assert_eq!(source_5, decoded_5); + assert_eq!(source_6, decoded_6); + assert_eq!(source_7, decoded_7); + } +} diff --git a/src/serdes/option.rs b/src/serdes/option.rs index 74d8f11..f540d99 100644 --- a/src/serdes/option.rs +++ b/src/serdes/option.rs @@ -1,7 +1,7 @@ use std::io; +use fusio::{Read, Write}; use thiserror::Error; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use super::{Decode, Encode}; @@ -13,6 +13,8 @@ where { #[error("io error: {0}")] Io(#[from] io::Error), + #[error("fusio error: {0}")] + Fusio(#[from] fusio::Error), #[error("inner error: {0}")] Inner(#[source] E), } @@ -25,6 +27,8 @@ where { #[error("io error: {0}")] Io(#[from] io::Error), + #[error("fusio error: {0}")] + Fusio(#[from] fusio::Error), #[error("inner error: {0}")] Inner(#[source] E), } @@ -37,12 +41,12 @@ where async fn 
encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { match self { - None => writer.write_all(&[0]).await?, + None => 0u8.encode(writer).await?, Some(v) => { - writer.write_all(&[1]).await?; + 1u8.encode(writer).await?; v.encode(writer).await.map_err(EncodeError::Inner)?; } } @@ -63,13 +67,43 @@ where { type Error = DecodeError; - async fn decode(reader: &mut R) -> Result { - let mut o = [0]; - reader.read_exact(&mut o).await?; - match o[0] { + async fn decode(reader: &mut R) -> Result { + match u8::decode(reader).await? { 0 => Ok(None), 1 => Ok(Some(V::decode(reader).await.map_err(DecodeError::Inner)?)), _ => panic!("invalid option tag"), } } } + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use fusio::Seek; + + use crate::serdes::{Decode, Encode}; + + #[tokio::test] + async fn test_encode_decode() { + let source_0 = Some(1u64); + let source_1 = None; + let source_2 = Some("Hello! Tonbo".to_string()); + + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + + source_0.encode(&mut cursor).await.unwrap(); + source_1.encode(&mut cursor).await.unwrap(); + source_2.encode(&mut cursor).await.unwrap(); + + cursor.seek(0).await.unwrap(); + let decoded_0 = Option::::decode(&mut cursor).await.unwrap(); + let decoded_1 = Option::::decode(&mut cursor).await.unwrap(); + let decoded_2 = Option::::decode(&mut cursor).await.unwrap(); + + assert_eq!(source_0, decoded_0); + assert_eq!(source_1, decoded_1); + assert_eq!(source_2, decoded_2); + } +} diff --git a/src/serdes/string.rs b/src/serdes/string.rs index bc5ff0b..10e3e9c 100644 --- a/src/serdes/string.rs +++ b/src/serdes/string.rs @@ -1,18 +1,21 @@ -use std::{io, mem::size_of}; +use std::mem::size_of; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use fusio::{Read, Write}; use super::{Decode, Encode}; impl<'r> Encode for &'r str { - type Error = io::Error; + type Error = fusio::Error; async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin, + W: Write + Unpin, { - writer.write_all(&(self.len() as u16).to_le_bytes()).await?; - writer.write_all(self.as_bytes()).await + (self.len() as u16).encode(writer).await?; + let (result, _) = writer.write_all(self.as_bytes()).await; + result?; + + Ok(()) } fn size(&self) -> usize { @@ -21,11 +24,11 @@ impl<'r> Encode for &'r str { } impl Encode for String { - type Error = io::Error; + type Error = fusio::Error; async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { self.as_str().encode(writer).await } @@ -36,21 +39,40 @@ impl Encode for String { } impl Decode for String { - type Error = io::Error; - - async fn decode(reader: &mut R) -> Result { - let len = { - let mut len = [0; size_of::()]; - reader.read_exact(&mut len).await?; - u16::from_le_bytes(len) as usize - }; - - let vec = { - let mut vec = vec![0; len]; - reader.read_exact(&mut vec).await?; - vec - }; - - Ok(unsafe { String::from_utf8_unchecked(vec) }) + type Error = fusio::Error; + + async fn decode(reader: &mut R) -> Result { + let len = u16::decode(reader).await?; + let buf = reader.read_exact(vec![0u8; len as usize]).await?; + + Ok(unsafe { String::from_utf8_unchecked(buf.as_slice().to_vec()) }) + } +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use fusio::Seek; + + use crate::serdes::{Decode, Encode}; + + #[tokio::test] + async fn test_encode_decode() { + let source_0 = "Hello! 
World"; + let source_1 = "Hello! Tonbo".to_string(); + + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + + source_0.encode(&mut cursor).await.unwrap(); + source_1.encode(&mut cursor).await.unwrap(); + + cursor.seek(0).await.unwrap(); + let decoded_0 = String::decode(&mut cursor).await.unwrap(); + let decoded_1 = String::decode(&mut cursor).await.unwrap(); + + assert_eq!(source_0, decoded_0); + assert_eq!(source_1, decoded_1); } } diff --git a/src/stream/level.rs b/src/stream/level.rs index c6867d6..655a78e 100644 --- a/src/stream/level.rs +++ b/src/stream/level.rs @@ -1,17 +1,21 @@ use std::{ collections::{Bound, VecDeque}, future::Future, - io, pin::Pin, sync::Arc, task::{Context, Poll}, }; +use fusio::{ + dynamic::{DynFile, MaybeSendFuture}, + path::Path, + DynFs, Error, +}; use futures_core::Stream; use parquet::{arrow::ProjectionMask, errors::ParquetError}; use crate::{ - fs::{FileId, FileProvider}, + fs::{default_open_options, FileId}, ondisk::{scan::SsTableScan, sstable::SsTable}, record::Record, scope::Scope, @@ -21,29 +25,22 @@ use crate::{ DbOption, }; -enum FutureStatus<'level, R, FP> +enum FutureStatus<'level, R> where R: Record, - FP: FileProvider, { Init(FileId), - Ready(SsTableScan<'level, R, FP>), - OpenFile(Pin> + Send + 'level>>), + Ready(SsTableScan<'level, R>), + OpenFile(Pin, Error>> + 'level>>), + OpenSst(Pin, Error>> + Send + 'level>>), LoadStream( - Pin< - Box< - dyn Future, ParquetError>> - + Send - + 'level, - >, - >, + Pin, ParquetError>> + Send + 'level>>, ), } -pub(crate) struct LevelStream<'level, R, FP> +pub(crate) struct LevelStream<'level, R> where R: Record, - FP: FileProvider, { lower: Bound<&'level R::Key>, upper: Bound<&'level R::Key>, @@ -52,18 +49,19 @@ where gens: VecDeque, limit: Option, projection_mask: ProjectionMask, - status: FutureStatus<'level, R, FP>, + status: FutureStatus<'level, R>, + fs: Arc, + path: Option, } -impl<'level, R, FP> LevelStream<'level, R, FP> +impl<'level, R> LevelStream<'level, R> where R: Record, - FP: FileProvider, { // Kould: only used by Compaction now, and the start and end of the sstables range are known #[allow(clippy::too_many_arguments)] pub(crate) fn new( - version: &Version, + version: &Version, level: usize, start: usize, end: usize, @@ -71,6 +69,7 @@ where ts: Timestamp, limit: Option, projection_mask: ProjectionMask, + fs: Arc, ) -> Option { let (lower, upper) = range; let mut gens: VecDeque = version.level_slice[level][start..end + 1] @@ -89,14 +88,15 @@ where limit, projection_mask, status, + fs, + path: None, }) } } -impl<'level, R, FP> Stream for LevelStream<'level, R, FP> +impl<'level, R> Stream for LevelStream<'level, R> where R: Record, - FP: FileProvider + 'level, { type Item = Result, ParquetError>; @@ -105,17 +105,49 @@ where return match &mut self.status { FutureStatus::Init(gen) => { let gen = *gen; - self.status = - FutureStatus::OpenFile(Box::pin(FP::open(self.option.table_path(&gen)))); + self.path = Some(self.option.table_path(&gen)); + + let reader = self + .fs + .open_options(self.path.as_ref().unwrap(), default_open_options()); + #[allow(clippy::missing_transmute_annotations)] + let reader = unsafe { + std::mem::transmute::< + _, + Pin< + Box< + dyn MaybeSendFuture, Error>> + + 'static, + >, + >, + >(reader) + }; + self.status = FutureStatus::OpenFile(reader); continue; } FutureStatus::Ready(stream) => match Pin::new(stream).poll_next(cx) { Poll::Ready(None) => match self.gens.pop_front() { None => Poll::Ready(None), Some(gen) => { - self.status = 
FutureStatus::OpenFile(Box::pin(FP::open( - self.option.table_path(&gen), - ))); + self.path = Some(self.option.table_path(&gen)); + + let reader = self + .fs + .open_options(self.path.as_ref().unwrap(), default_open_options()); + #[allow(clippy::missing_transmute_annotations)] + let reader = unsafe { + std::mem::transmute::< + _, + Pin< + Box< + dyn MaybeSendFuture< + Output = Result, Error>, + > + 'static, + >, + >, + >(reader) + }; + self.status = FutureStatus::OpenFile(reader); continue; } }, @@ -129,7 +161,17 @@ where }, FutureStatus::OpenFile(file_future) => match Pin::new(file_future).poll(cx) { Poll::Ready(Ok(file)) => { - self.status = FutureStatus::LoadStream(Box::pin(SsTable::open(file).scan( + self.status = FutureStatus::OpenSst(Box::pin(SsTable::open(file))); + continue; + } + Poll::Ready(Err(err)) => { + Poll::Ready(Some(Err(ParquetError::External(Box::new(err))))) + } + Poll::Pending => Poll::Pending, + }, + FutureStatus::OpenSst(sst_future) => match Pin::new(sst_future).poll(cx) { + Poll::Ready(Ok(sst)) => { + self.status = FutureStatus::LoadStream(Box::pin(sst.scan( (self.lower, self.upper), self.ts, self.limit, @@ -137,7 +179,9 @@ where ))); continue; } - Poll::Ready(Err(err)) => Poll::Ready(Some(Err(ParquetError::from(err)))), + Poll::Ready(Err(err)) => { + Poll::Ready(Some(Err(ParquetError::External(Box::new(err))))) + } Poll::Pending => Poll::Pending, }, FutureStatus::LoadStream(stream_future) => match Pin::new(stream_future).poll(cx) { @@ -157,21 +201,34 @@ where mod tests { use std::{collections::Bound, sync::Arc}; + use fusio::{local::TokioFs, path::Path}; use futures_util::StreamExt; use parquet::arrow::{arrow_to_parquet_schema, ProjectionMask}; use tempfile::TempDir; use crate::{ - compaction::tests::build_version, record::Record, stream::level::LevelStream, tests::Test, - DbOption, + compaction::tests::build_version, fs::manager::StoreManager, record::Record, + stream::level::LevelStream, tests::Test, DbOption, }; #[tokio::test] async fn projection_scan() { let temp_dir = TempDir::new().unwrap(); - let option = Arc::new(DbOption::from(temp_dir.path())); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); + + manager + .create_dir_all(&option.version_log_dir_path()) + .await + .unwrap(); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); - let (_, version) = build_version(&option).await; + let (_, version) = build_version(&option, &manager).await; { let mut level_stream_1 = LevelStream::new( @@ -186,6 +243,7 @@ mod tests { &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(), [0, 1, 2, 3], ), + manager.base_fs().clone(), ) .unwrap(); @@ -221,6 +279,7 @@ mod tests { &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(), [0, 1, 2, 4], ), + manager.base_fs().clone(), ) .unwrap(); @@ -256,6 +315,7 @@ mod tests { &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(), [0, 1, 2], ), + manager.base_fs().clone(), ) .unwrap(); diff --git a/src/stream/mem_projection.rs b/src/stream/mem_projection.rs index 8702424..02f5371 100644 --- a/src/stream/mem_projection.rs +++ b/src/stream/mem_projection.rs @@ -9,31 +9,25 @@ use parquet::{arrow::ProjectionMask, errors::ParquetError}; use pin_project_lite::pin_project; use crate::{ - fs::FileProvider, record::Record, stream::{Entry, ScanStream}, }; pin_project! 
{ - pub struct MemProjectionStream<'projection, R, FP> + pub struct MemProjectionStream<'projection, R> where R: Record, - FP: FileProvider, { - stream: Box>, + stream: Box>, projection_mask: Arc, } } -impl<'projection, R, FP> MemProjectionStream<'projection, R, FP> +impl<'projection, R> MemProjectionStream<'projection, R> where R: Record, - FP: FileProvider + 'projection, { - pub(crate) fn new( - stream: ScanStream<'projection, R, FP>, - projection_mask: ProjectionMask, - ) -> Self { + pub(crate) fn new(stream: ScanStream<'projection, R>, projection_mask: ProjectionMask) -> Self { Self { stream: Box::new(stream), projection_mask: Arc::new(projection_mask), @@ -41,10 +35,9 @@ where } } -impl<'projection, R, FP> Stream for MemProjectionStream<'projection, R, FP> +impl<'projection, R> Stream for MemProjectionStream<'projection, R> where R: Record, - FP: FileProvider + 'projection, { type Item = Result, ParquetError>; @@ -65,28 +58,26 @@ where mod tests { use std::{ops::Bound, sync::Arc}; + use fusio::{local::TokioFs, path::Path, DynFs}; use futures_util::StreamExt; use parquet::arrow::{arrow_to_parquet_schema, ProjectionMask}; use crate::{ - executor::tokio::TokioExecutor, fs::FileProvider, inmem::mutable::Mutable, record::Record, - stream::mem_projection::MemProjectionStream, tests::Test, trigger::TriggerFactory, - wal::log::LogType, DbOption, + inmem::mutable::Mutable, record::Record, stream::mem_projection::MemProjectionStream, + tests::Test, trigger::TriggerFactory, wal::log::LogType, DbOption, }; #[tokio::test] async fn merge_mutable() { let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable = Mutable::::new(&option, trigger) - .await - .unwrap(); + let mutable = Mutable::::new(&option, trigger, &fs).await.unwrap(); mutable .insert( @@ -130,7 +121,7 @@ mod tests { vec![0, 1, 2, 4], ); - let mut stream = MemProjectionStream::::new( + let mut stream = MemProjectionStream::::new( mutable .scan((Bound::Unbounded, Bound::Unbounded), 6.into()) .into(), diff --git a/src/stream/merge.rs b/src/stream/merge.rs index acf89ce..4fbb953 100644 --- a/src/stream/merge.rs +++ b/src/stream/merge.rs @@ -10,15 +10,14 @@ use futures_util::stream::StreamExt; use pin_project_lite::pin_project; use super::{Entry, ScanStream}; -use crate::{fs::FileProvider, record::Record, timestamp::Timestamp}; +use crate::{record::Record, timestamp::Timestamp}; pin_project! { - pub struct MergeStream<'merge, R, FP> + pub struct MergeStream<'merge, R> where R: Record, - FP: FileProvider, { - streams: Vec>, + streams: Vec>, peeked: BinaryHeap>, buf: Option>, ts: Timestamp, @@ -26,13 +25,12 @@ pin_project! 
{ } } -impl<'merge, R, FP> MergeStream<'merge, R, FP> +impl<'merge, R> MergeStream<'merge, R> where R: Record, - FP: FileProvider + 'merge, { pub(crate) async fn from_vec( - mut streams: Vec>, + mut streams: Vec>, ts: Timestamp, ) -> Result { let mut peeked = BinaryHeap::with_capacity(streams.len()); @@ -64,10 +62,9 @@ where } } -impl<'merge, R, FP> Stream for MergeStream<'merge, R, FP> +impl<'merge, R> Stream for MergeStream<'merge, R> where R: Record, - FP: FileProvider + 'merge, { type Item = Result, parquet::errors::ParquetError>; @@ -161,27 +158,26 @@ where mod tests { use std::{ops::Bound, sync::Arc}; + use fusio::{local::TokioFs, path::Path, DynFs}; use futures_util::StreamExt; use super::MergeStream; use crate::{ - executor::tokio::TokioExecutor, fs::FileProvider, inmem::mutable::Mutable, stream::Entry, - trigger::TriggerFactory, wal::log::LogType, DbOption, + inmem::mutable::Mutable, stream::Entry, trigger::TriggerFactory, wal::log::LogType, + DbOption, }; #[tokio::test] async fn merge_mutable() { let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger) - .await - .unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); m1.remove(LogType::Full, "b".into(), 3.into()) .await @@ -195,9 +191,7 @@ mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m2 = Mutable::::new(&option, trigger) - .await - .unwrap(); + let m2 = Mutable::::new(&option, trigger, &fs).await.unwrap(); m2.insert(LogType::Full, "a".into(), 1.into()) .await .unwrap(); @@ -210,9 +204,7 @@ mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m3 = Mutable::::new(&option, trigger) - .await - .unwrap(); + let m3 = Mutable::::new(&option, trigger, &fs).await.unwrap(); m3.insert(LogType::Full, "e".into(), 4.into()) .await .unwrap(); @@ -220,7 +212,7 @@ mod tests { let lower = "a".to_string(); let upper = "e".to_string(); let bound = (Bound::Included(&lower), Bound::Included(&upper)); - let mut merge = MergeStream::::from_vec( + let mut merge = MergeStream::::from_vec( vec![ m1.scan(bound, 6.into()).into(), m2.scan(bound, 6.into()).into(), @@ -272,16 +264,14 @@ mod tests { #[tokio::test] async fn merge_mutable_remove_duplicates() { let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger) - .await - .unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); m1.insert(LogType::Full, "1".into(), 0_u32.into()) .await .unwrap(); @@ -301,12 +291,10 @@ mod tests { let lower = "1".to_string(); let upper = "4".to_string(); let bound = (Bound::Included(&lower), Bound::Included(&upper)); - let mut merge = MergeStream::::from_vec( - vec![m1.scan(bound, 0.into()).into()], - 0.into(), - ) - .await - .unwrap(); + let mut merge = + 
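+ // Build a MergeStream over a single scan of the mutable memtable at timestamp 0; the
+ // test repeats the same range scan at timestamp 1 further below.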
MergeStream::::from_vec(vec![m1.scan(bound, 0.into()).into()], 0.into()) + .await + .unwrap(); if let Some(Ok(Entry::Mutable(entry))) = merge.next().await { assert_eq!(entry.key().value, "1"); @@ -331,12 +319,10 @@ mod tests { let lower = "1".to_string(); let upper = "4".to_string(); let bound = (Bound::Included(&lower), Bound::Included(&upper)); - let mut merge = MergeStream::::from_vec( - vec![m1.scan(bound, 1.into()).into()], - 1.into(), - ) - .await - .unwrap(); + let mut merge = + MergeStream::::from_vec(vec![m1.scan(bound, 1.into()).into()], 1.into()) + .await + .unwrap(); if let Some(Ok(Entry::Mutable(entry))) = merge.next().await { assert_eq!(entry.key().value, "1"); @@ -364,16 +350,14 @@ mod tests { #[tokio::test] async fn merge_mutable_limit() { let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(&option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger) - .await - .unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); m1.insert(LogType::Full, "1".into(), 0_u32.into()) .await .unwrap(); @@ -387,7 +371,7 @@ mod tests { let lower = "1".to_string(); let upper = "3".to_string(); { - let mut merge = MergeStream::::from_vec( + let mut merge = MergeStream::::from_vec( vec![m1 .scan((Bound::Included(&lower), Bound::Included(&upper)), 0.into()) .into()], @@ -407,7 +391,7 @@ mod tests { assert!(merge.next().await.is_none()); } { - let mut merge = MergeStream::::from_vec( + let mut merge = MergeStream::::from_vec( vec![m1 .scan((Bound::Included(&lower), Bound::Included(&upper)), 0.into()) .into()], diff --git a/src/stream/mod.rs b/src/stream/mod.rs index cac1c40..fa0b5af 100644 --- a/src/stream/mod.rs +++ b/src/stream/mod.rs @@ -19,7 +19,6 @@ use pin_project_lite::pin_project; use record_batch::RecordBatchEntry; use crate::{ - fs::FileProvider, inmem::{immutable::ImmutableScan, mutable::MutableScan}, ondisk::scan::SsTableScan, record::{Key, Record, RecordRef}, @@ -101,10 +100,9 @@ where pin_project! { #[project = ScanStreamProject] - pub enum ScanStream<'scan, R, FP> + pub enum ScanStream<'scan, R> where R: Record, - FP: FileProvider, { Transaction { #[pin] @@ -120,23 +118,22 @@ pin_project! 
{ }, SsTable { #[pin] - inner: SsTableScan<'scan, R, FP>, + inner: SsTableScan<'scan, R>, }, Level { #[pin] - inner: LevelStream<'scan, R, FP>, + inner: LevelStream<'scan, R>, }, MemProjection { #[pin] - inner: MemProjectionStream<'scan, R, FP>, + inner: MemProjectionStream<'scan, R>, } } } -impl<'scan, R, FP> From> for ScanStream<'scan, R, FP> +impl<'scan, R> From> for ScanStream<'scan, R> where R: Record, - FP: FileProvider, { fn from(inner: TransactionScan<'scan, R>) -> Self { ScanStream::Transaction { @@ -145,10 +142,9 @@ where } } -impl<'scan, R, FP> From> for ScanStream<'scan, R, FP> +impl<'scan, R> From> for ScanStream<'scan, R> where R: Record, - FP: FileProvider, { fn from(inner: MutableScan<'scan, R>) -> Self { ScanStream::Mutable { @@ -157,10 +153,9 @@ where } } -impl<'scan, R, FP> From> for ScanStream<'scan, R, FP> +impl<'scan, R> From> for ScanStream<'scan, R> where R: Record, - FP: FileProvider, { fn from(inner: ImmutableScan<'scan, R>) -> Self { ScanStream::Immutable { @@ -169,30 +164,27 @@ where } } -impl<'scan, R, FP> From> for ScanStream<'scan, R, FP> +impl<'scan, R> From> for ScanStream<'scan, R> where R: Record, - FP: FileProvider, { - fn from(inner: SsTableScan<'scan, R, FP>) -> Self { + fn from(inner: SsTableScan<'scan, R>) -> Self { ScanStream::SsTable { inner } } } -impl<'scan, R, FP> From> for ScanStream<'scan, R, FP> +impl<'scan, R> From> for ScanStream<'scan, R> where R: Record, - FP: FileProvider, { - fn from(inner: MemProjectionStream<'scan, R, FP>) -> Self { + fn from(inner: MemProjectionStream<'scan, R>) -> Self { ScanStream::MemProjection { inner } } } -impl fmt::Debug for ScanStream<'_, R, FP> +impl fmt::Debug for ScanStream<'_, R> where R: Record, - FP: FileProvider, { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { @@ -206,10 +198,9 @@ where } } -impl<'scan, R, FP> Stream for ScanStream<'scan, R, FP> +impl<'scan, R> Stream for ScanStream<'scan, R> where R: Record, - FP: FileProvider + 'scan, { type Item = Result, parquet::errors::ParquetError>; diff --git a/src/stream/package.rs b/src/stream/package.rs index 314c4eb..90fe42e 100644 --- a/src/stream/package.rs +++ b/src/stream/package.rs @@ -7,50 +7,47 @@ use futures_core::Stream; use pin_project_lite::pin_project; use crate::{ - fs::FileProvider, inmem::immutable::{ArrowArrays, Builder}, - record::Record, + record::{Record, RecordInstance}, stream::merge::MergeStream, }; pin_project! 
{ - pub struct PackageStream<'package, R, FP> + pub struct PackageStream<'package, R> where R: Record, - FP: FileProvider, { row_count: usize, batch_size: usize, - inner: MergeStream<'package, R, FP>, + inner: MergeStream<'package, R>, builder: ::Builder, projection_indices: Option>, } } -impl<'package, R, FP> PackageStream<'package, R, FP> +impl<'package, R> PackageStream<'package, R> where R: Record, - FP: FileProvider + 'package, { pub(crate) fn new( batch_size: usize, - merge: MergeStream<'package, R, FP>, + merge: MergeStream<'package, R>, projection_indices: Option>, + instance: &RecordInstance, ) -> Self { Self { row_count: 0, batch_size, inner: merge, - builder: R::Columns::builder(batch_size), + builder: R::Columns::builder(&instance.arrow_schema::(), batch_size), projection_indices, } } } -impl<'package, R, FP> Stream for PackageStream<'package, R, FP> +impl<'package, R> Stream for PackageStream<'package, R> where R: Record, - FP: FileProvider + 'package, { type Item = Result; @@ -85,12 +82,11 @@ mod tests { use std::{collections::Bound, sync::Arc}; use arrow::array::{BooleanArray, RecordBatch, StringArray, UInt32Array}; + use fusio::{local::TokioFs, path::Path, DynFs}; use futures_util::StreamExt; use tempfile::TempDir; use crate::{ - executor::tokio::TokioExecutor, - fs::FileProvider, inmem::{ immutable::{tests::TestImmutableArrays, ArrowArrays}, mutable::Mutable, @@ -106,16 +102,14 @@ mod tests { #[tokio::test] async fn iter() { let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(temp_dir.path()); - TokioExecutor::create_dir_all(option.wal_dir_path()) - .await - .unwrap(); + let fs = Arc::new(TokioFs) as Arc; + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + + fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger) - .await - .unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); m1.insert( LogType::Full, Test { @@ -183,7 +177,7 @@ mod tests { .await .unwrap(); - let merge = MergeStream::::from_vec( + let merge = MergeStream::::from_vec( vec![m1 .scan((Bound::Unbounded, Bound::Unbounded), 6.into()) .into()], @@ -197,7 +191,7 @@ mod tests { row_count: 0, batch_size: 8192, inner: merge, - builder: TestImmutableArrays::builder(8192), + builder: TestImmutableArrays::builder(Test::arrow_schema(), 8192), projection_indices: Some(projection_indices.clone()), }; diff --git a/src/stream/record_batch.rs b/src/stream/record_batch.rs index 321b8ba..c974f8e 100644 --- a/src/stream/record_batch.rs +++ b/src/stream/record_batch.rs @@ -2,9 +2,10 @@ use std::{ fmt::{self, Debug, Formatter}, marker::PhantomData, mem::transmute, + sync::Arc, }; -use arrow::array::RecordBatch; +use arrow::{array::RecordBatch, datatypes::Schema}; use parquet::arrow::ProjectionMask; use crate::{ @@ -62,6 +63,7 @@ pub struct RecordBatchIterator { record_batch: RecordBatch, offset: usize, projection_mask: ProjectionMask, + full_schema: Arc, _marker: PhantomData, } @@ -69,11 +71,16 @@ impl RecordBatchIterator where R: Record, { - pub(crate) fn new(record_batch: RecordBatch, projection_mask: ProjectionMask) -> Self { + pub(crate) fn new( + record_batch: RecordBatch, + projection_mask: ProjectionMask, + full_schema: Arc, + ) -> Self { Self { record_batch, offset: 0, projection_mask, + full_schema, _marker: PhantomData, } } @@ -91,8 +98,12 @@ where } let record_batch = self.record_batch.clone(); - let record = - 
R::Ref::from_record_batch(&self.record_batch, self.offset, &self.projection_mask); + let record = R::Ref::from_record_batch( + &self.record_batch, + self.offset, + &self.projection_mask, + &self.full_schema, + ); let entry = RecordBatchEntry::new(record_batch, unsafe { // Safety: self-referring lifetime is safe transmute::< diff --git a/src/timestamp/mod.rs b/src/timestamp/mod.rs index 1a14109..26d265e 100644 --- a/src/timestamp/mod.rs +++ b/src/timestamp/mod.rs @@ -1,12 +1,10 @@ pub mod timestamped; -use std::io; - use arrow::{ array::{PrimitiveArray, Scalar}, datatypes::UInt32Type, }; -use tokio::io::{AsyncRead, AsyncWrite}; +use fusio::{Read, Write}; pub(crate) use self::timestamped::*; use crate::serdes::{Decode, Encode}; @@ -36,10 +34,10 @@ impl Timestamp { } impl Encode for Timestamp { - type Error = io::Error; + type Error = fusio::Error; async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { self.0.encode(writer).await } @@ -48,10 +46,10 @@ impl Encode for Timestamp { } } impl Decode for Timestamp { - type Error = io::Error; + type Error = fusio::Error; async fn decode(reader: &mut R) -> Result where - R: AsyncRead + Unpin, + R: Read + Unpin, { u32::decode(reader).await.map(Timestamp) } diff --git a/src/timestamp/timestamped.rs b/src/timestamp/timestamped.rs index bed001f..9ef18c0 100644 --- a/src/timestamp/timestamped.rs +++ b/src/timestamp/timestamped.rs @@ -1,6 +1,6 @@ use std::{borrow::Borrow, cmp::Ordering, marker::PhantomData, mem::size_of, ptr}; -use tokio::io::{AsyncRead, AsyncWrite}; +use fusio::{Read, Write}; use crate::{ serdes::{Decode, Encode}, @@ -150,7 +150,7 @@ where async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { self.ts.encode(writer).await?; self.value.encode(writer).await @@ -169,7 +169,7 @@ where async fn decode(reader: &mut R) -> Result where - R: AsyncRead + Unpin, + R: Read + Unpin, { let ts = Timestamp::decode(reader).await?; let value = V::decode(reader).await?; diff --git a/src/transaction.rs b/src/transaction.rs index 1d7cb28..2ed1500 100644 --- a/src/transaction.rs +++ b/src/transaction.rs @@ -5,6 +5,7 @@ use std::{ }, io, mem::transmute, + sync::Arc, }; use async_lock::RwLockReadGuard; @@ -15,7 +16,7 @@ use thiserror::Error; use crate::{ compaction::CompactTask, - fs::FileProvider, + fs::manager::StoreManager, record::{Key, KeyRef}, stream, stream::mem_projection::MemProjectionStream, @@ -44,27 +45,27 @@ where } /// optimistic ACID transaction, open with /// [`DB::transaction`](crate::DB::transaction) method -pub struct Transaction<'txn, R, FP> +pub struct Transaction<'txn, R> where R: Record, - FP: FileProvider, { ts: Timestamp, local: BTreeMap>, - share: RwLockReadGuard<'txn, Schema>, - version: VersionRef, + share: RwLockReadGuard<'txn, Schema>, + version: VersionRef, lock_map: LockMap, + manager: Arc, } -impl<'txn, R, FP> Transaction<'txn, R, FP> +impl<'txn, R> Transaction<'txn, R> where R: Record + Send, - FP: FileProvider, { pub(crate) fn new( - version: VersionRef, - share: RwLockReadGuard<'txn, Schema>, + version: VersionRef, + share: RwLockReadGuard<'txn, Schema>, lock_map: LockMap, + manager: Arc, ) -> Self { Self { ts: version.load_ts(), @@ -72,6 +73,7 @@ where share, version, lock_map, + manager, } } @@ -86,7 +88,7 @@ where Some(v) => Some(TransactionEntry::Local(v.as_record_ref())), None => self .share - .get(&self.version, key, self.ts, projection) + .get(&self.version, 
&self.manager, key, self.ts, projection) .await? .and_then(|entry| { if entry.value().is_none() { @@ -102,9 +104,10 @@ where pub fn scan<'scan>( &'scan self, range: (Bound<&'scan R::Key>, Bound<&'scan R::Key>), - ) -> Scan<'scan, R, FP> { + ) -> Scan<'scan, R> { Scan::new( &self.share, + &self.manager, range, self.ts, &self.version, @@ -191,7 +194,7 @@ where } async fn append( - schema: &Schema, + schema: &Schema, log_ty: LogType, key: ::Key, record: Option, @@ -250,12 +253,18 @@ where mod tests { use std::{collections::Bound, sync::Arc}; + use fusio::{local::TokioFs, path::Path}; use futures_util::StreamExt; use tempfile::TempDir; use crate::{ compaction::tests::build_version, executor::tokio::TokioExecutor, + fs::manager::StoreManager, + record::{ + runtime::{Column, Datatype, DynRecord}, + ColumnDesc, + }, tests::{build_db, build_schema, Test}, transaction::CommitError, version::TransactionTs, @@ -265,11 +274,15 @@ mod tests { #[tokio::test] async fn transaction_read_write() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); - let db = - DB::::new(DbOption::from(temp_dir.path()), TokioExecutor::new()) - .await - .unwrap(); + let db = DB::::new( + DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()), + TokioExecutor::new(), + manager, + ) + .await + .unwrap(); { let mut txn1 = db.transaction().await; txn1.insert("foo".to_string()); @@ -299,13 +312,34 @@ mod tests { #[tokio::test] async fn transaction_get() { let temp_dir = TempDir::new().unwrap(); - let option = Arc::new(DbOption::from(temp_dir.path())); + let manager = Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); + + manager + .create_dir_all(&option.version_log_dir_path()) + .await + .unwrap(); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); - let (_, version) = build_version(&option).await; - let (schema, compaction_rx) = build_schema(option.clone()).await.unwrap(); - let db = build_db(option, compaction_rx, TokioExecutor::new(), schema, version) + let (_, version) = build_version(&option, &manager).await; + let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); + let db = build_db( + option, + compaction_rx, + TokioExecutor::new(), + schema, + version, + manager, + ) + .await + .unwrap(); { let _ = db.version_set.increase_ts(); @@ -367,11 +401,12 @@ mod tests { #[tokio::test] async fn write_conflicts() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); - let db = - DB::::new(DbOption::from(temp_dir.path()), TokioExecutor::new()) - .await - .unwrap(); + let db = DB::::new(option, TokioExecutor::new(), manager) + .await + .unwrap(); let mut txn = db.transaction().await; txn.insert(0.to_string()); @@ -400,11 +435,12 @@ mod tests { #[tokio::test] async fn transaction_projection() { let temp_dir = TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); - let db = - DB::::new(DbOption::from(temp_dir.path()), TokioExecutor::new()) - .await - .unwrap(); + let db = DB::::new(option, TokioExecutor::new(), manager) + .await + .unwrap(); let mut txn1 = db.transaction().await; txn1.insert(Test { @@ -427,13 +463,34 @@ mod tests { #[tokio::test] async fn 
transaction_scan() { let temp_dir = TempDir::new().unwrap(); - let option = Arc::new(DbOption::from(temp_dir.path())); + let manager = Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); + + manager + .create_dir_all(&option.version_log_dir_path()) + .await + .unwrap(); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); - let (_, version) = build_version(&option).await; - let (schema, compaction_rx) = build_schema(option.clone()).await.unwrap(); - let db = build_db(option, compaction_rx, TokioExecutor::new(), schema, version) + let (_, version) = build_version(&option, &manager).await; + let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); + let db = build_db( + option, + compaction_rx, + TokioExecutor::new(), + schema, + version, + manager, + ) + .await + .unwrap(); { // to increase timestamps to 1 because the data ts built in advance is 1 @@ -499,13 +556,34 @@ mod tests { #[tokio::test] async fn test_transaction_scan_bound() { let temp_dir = TempDir::new().unwrap(); - let option = Arc::new(DbOption::from(temp_dir.path())); + let manager = Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); + + manager + .create_dir_all(&option.version_log_dir_path()) + .await + .unwrap(); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); - let (_, version) = build_version(&option).await; - let (schema, compaction_rx) = build_schema(option.clone()).await.unwrap(); - let db = build_db(option, compaction_rx, TokioExecutor::new(), schema, version) + let (_, version) = build_version(&option, &manager).await; + let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); + let db = build_db( + option, + compaction_rx, + TokioExecutor::new(), + schema, + version, + manager, + ) + .await + .unwrap(); { // to increase timestamps to 1 because the data ts built in advance is 1 db.version_set.increase_ts(); @@ -652,13 +730,34 @@ mod tests { #[tokio::test] async fn test_transaction_scan_limit() { let temp_dir = TempDir::new().unwrap(); - let option = Arc::new(DbOption::from(temp_dir.path())); + let manager = Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); + + manager + .create_dir_all(&option.version_log_dir_path()) + .await + .unwrap(); + manager + .create_dir_all(&option.wal_dir_path()) + .await + .unwrap(); - let (_, version) = build_version(&option).await; - let (schema, compaction_rx) = build_schema(option.clone()).await.unwrap(); - let db = build_db(option, compaction_rx, TokioExecutor::new(), schema, version) + let (_, version) = build_version(&option, &manager).await; + let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); + let db = build_db( + option, + compaction_rx, + TokioExecutor::new(), + schema, + version, + manager, + ) + .await + .unwrap(); let txn = db.transaction().await; txn.commit().await.unwrap(); @@ -688,4 +787,106 @@ mod tests { } } } + + #[tokio::test] + async fn test_dyn_record() { + let descs = vec![ + ColumnDesc::new("age".to_string(), Datatype::Int8, false), + ColumnDesc::new("height".to_string(), Datatype::Int16, true), + ColumnDesc::new("weight".to_string(), Datatype::Int32, false), + ]; + + let temp_dir = 
TempDir::new().unwrap(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); + let option = DbOption::with_path( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + "age".to_string(), + 0, + ); + let db = DB::with_schema(option, TokioExecutor::default(), manager, descs, 0) + .await + .unwrap(); + + db.insert(DynRecord::new( + vec![ + Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), + Column::new( + Datatype::Int16, + "height".to_string(), + Arc::new(Some(180_i16)), + true, + ), + Column::new( + Datatype::Int32, + "weight".to_string(), + Arc::new(56_i32), + false, + ), + ], + 0, + )) + .await + .unwrap(); + + let txn = db.transaction().await; + { + let key = Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false); + + let record_ref = txn.get(&key, Projection::All).await.unwrap(); + assert!(record_ref.is_some()); + let res = record_ref.unwrap(); + let record_ref = res.get(); + + assert_eq!(record_ref.columns.len(), 3); + let col = record_ref.columns.first().unwrap(); + assert_eq!(col.datatype, Datatype::Int8); + let name = col.value.as_ref().downcast_ref::(); + assert!(name.is_some()); + assert_eq!(*name.unwrap(), 1); + + let col = record_ref.columns.get(1).unwrap(); + let height = col.value.as_ref().downcast_ref::>(); + assert!(height.is_some()); + assert_eq!(*height.unwrap(), Some(180_i16)); + + let col = record_ref.columns.get(2).unwrap(); + let weight = col.value.as_ref().downcast_ref::>(); + assert!(weight.is_some()); + assert_eq!(*weight.unwrap(), Some(56_i32)); + } + { + let mut scan = txn + .scan((Bound::Unbounded, Bound::Unbounded)) + .projection(vec![0, 1, 2]) + .take() + .await + .unwrap(); + while let Some(entry) = scan.next().await.transpose().unwrap() { + assert_eq!(entry.value().unwrap().primary_index, 0); + assert_eq!(entry.value().unwrap().columns.len(), 3); + let columns = entry.value().unwrap().columns; + dbg!(columns.clone()); + + let primary_key_col = columns.first().unwrap(); + assert_eq!(primary_key_col.datatype, Datatype::Int8); + assert_eq!( + *primary_key_col.value.as_ref().downcast_ref::().unwrap(), + 1 + ); + + let col = columns.get(1).unwrap(); + assert_eq!(col.datatype, Datatype::Int16); + assert_eq!( + *col.value.as_ref().downcast_ref::>().unwrap(), + Some(180) + ); + + let col = columns.get(2).unwrap(); + assert_eq!(col.datatype, Datatype::Int32); + let weight = col.value.as_ref().downcast_ref::>(); + assert!(weight.is_some()); + assert_eq!(*weight.unwrap(), Some(56_i32)); + } + } + } } diff --git a/src/version/cleaner.rs b/src/version/cleaner.rs index fc84d5e..5fa157e 100644 --- a/src/version/cleaner.rs +++ b/src/version/cleaner.rs @@ -1,37 +1,45 @@ -use std::{collections::BTreeMap, io, marker::PhantomData, sync::Arc}; +use std::{collections::BTreeMap, sync::Arc}; use flume::{Receiver, Sender}; use crate::{ - fs::{FileId, FileProvider}, + fs::{manager::StoreManager, FileId}, record::Record, timestamp::Timestamp, - DbOption, + DbError, DbOption, }; pub enum CleanTag { - Add { ts: Timestamp, gens: Vec }, - Clean { ts: Timestamp }, - RecoverClean { gen: FileId }, + Add { + ts: Timestamp, + gens: Vec<(FileId, usize)>, + }, + Clean { + ts: Timestamp, + }, + RecoverClean { + wal_id: FileId, + }, } -pub(crate) struct Cleaner +pub(crate) struct Cleaner where R: Record, - FP: FileProvider, { tag_recv: Receiver, - gens_map: BTreeMap, bool)>, + gens_map: BTreeMap, bool)>, option: Arc>, - _p: PhantomData, + manager: Arc, } -impl Cleaner +impl Cleaner where R: Record, - FP: FileProvider, { - pub(crate) fn new(option: 
Arc>) -> (Self, Sender) { + pub(crate) fn new( + option: Arc>, + manager: Arc, + ) -> (Self, Sender) { let (tag_send, tag_recv) = flume::bounded(option.clean_channel_buffer); ( @@ -39,13 +47,13 @@ where tag_recv, gens_map: Default::default(), option, - _p: Default::default(), + manager, }, tag_send, ) } - pub(crate) async fn listen(&mut self) -> Result<(), io::Error> { + pub(crate) async fn listen(&mut self) -> Result<(), DbError> { while let Ok(tag) = self.tag_recv.recv_async().await { match tag { CleanTag::Add { ts, gens } => { @@ -60,13 +68,23 @@ where let _ = self.gens_map.insert(first_version, (gens, false)); break; } - for gen in gens { - FP::remove(self.option.table_path(&gen)).await?; + for (gen, level) in gens { + let fs = self + .option + .level_fs_path(level) + .map(|path| self.manager.get_fs(path)) + .unwrap_or(self.manager.base_fs()); + fs.remove(&self.option.table_path(&gen)).await?; } } } - CleanTag::RecoverClean { gen } => { - FP::remove(self.option.table_path(&gen)).await?; + CleanTag::RecoverClean { wal_id: gen } => { + let fs = self + .option + .level_fs_path(0) + .map(|path| self.manager.get_fs(path)) + .unwrap_or(self.manager.base_fs()); + fs.remove(&self.option.table_path(&gen)).await?; } } } @@ -79,13 +97,17 @@ where pub(crate) mod tests { use std::{sync::Arc, time::Duration}; + use fusio::{ + local::TokioFs, + path::{path_to_local, Path}, + }; use tempfile::TempDir; use tokio::time::sleep; use tracing::error; use crate::{ executor::{tokio::TokioExecutor, Executor}, - fs::{FileId, FileProvider}, + fs::{default_open_options, manager::StoreManager, FileId}, tests::Test, version::cleaner::{CleanTag, Cleaner}, DbOption, @@ -94,28 +116,35 @@ pub(crate) mod tests { #[tokio::test] async fn test_cleaner() { let temp_dir = TempDir::new().unwrap(); - let option = Arc::new(DbOption::from(temp_dir.path())); + let manager = Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); let gen_0 = FileId::new(); let gen_1 = FileId::new(); let gen_2 = FileId::new(); let gen_3 = FileId::new(); + let fs = option + .level_fs_path(0) + .map(|path| manager.get_fs(path)) + .unwrap_or(manager.base_fs()); { - TokioExecutor::open(option.table_path(&gen_0)) + fs.open_options(&option.table_path(&gen_0), default_open_options()) .await .unwrap(); - TokioExecutor::open(option.table_path(&gen_1)) + fs.open_options(&option.table_path(&gen_1), default_open_options()) .await .unwrap(); - TokioExecutor::open(option.table_path(&gen_2)) + fs.open_options(&option.table_path(&gen_2), default_open_options()) .await .unwrap(); - TokioExecutor::open(option.table_path(&gen_3)) + fs.open_options(&option.table_path(&gen_3), default_open_options()) .await .unwrap(); } - let (mut cleaner, tx) = Cleaner::::new(option.clone()); + let (mut cleaner, tx) = Cleaner::::new(option.clone(), manager.clone()); let executor = TokioExecutor::new(); @@ -127,19 +156,19 @@ pub(crate) mod tests { tx.send_async(CleanTag::Add { ts: 1.into(), - gens: vec![gen_1], + gens: vec![(gen_1, 0)], }) .await .unwrap(); tx.send_async(CleanTag::Add { ts: 0.into(), - gens: vec![gen_0], + gens: vec![(gen_0, 0)], }) .await .unwrap(); tx.send_async(CleanTag::Add { ts: 2.into(), - gens: vec![gen_2], + gens: vec![(gen_2, 0)], }) .await .unwrap(); @@ -147,56 +176,34 @@ pub(crate) mod tests { tx.send_async(CleanTag::Clean { ts: 2.into() }) .await .unwrap(); - assert!(TokioExecutor::file_exist(option.table_path(&gen_0)) - .await - .unwrap()); - 
assert!(TokioExecutor::file_exist(option.table_path(&gen_1)) - .await - .unwrap()); - assert!(TokioExecutor::file_exist(option.table_path(&gen_2)) - .await - .unwrap()); - assert!(TokioExecutor::file_exist(option.table_path(&gen_3)) - .await - .unwrap()); + + // FIXME + assert!(path_to_local(&option.table_path(&gen_0)).unwrap().exists()); + assert!(path_to_local(&option.table_path(&gen_1)).unwrap().exists()); + assert!(path_to_local(&option.table_path(&gen_2)).unwrap().exists()); + assert!(path_to_local(&option.table_path(&gen_3)).unwrap().exists()); tx.send_async(CleanTag::Clean { ts: 0.into() }) .await .unwrap(); - sleep(Duration::from_millis(1)).await; - assert!(!TokioExecutor::file_exist(option.table_path(&gen_0)) - .await - .unwrap()); - assert!(TokioExecutor::file_exist(option.table_path(&gen_1)) - .await - .unwrap()); - assert!(TokioExecutor::file_exist(option.table_path(&gen_2)) - .await - .unwrap()); - assert!(TokioExecutor::file_exist(option.table_path(&gen_3)) - .await - .unwrap()); + sleep(Duration::from_millis(10)).await; + assert!(!path_to_local(&option.table_path(&gen_0)).unwrap().exists()); + assert!(path_to_local(&option.table_path(&gen_1)).unwrap().exists()); + assert!(path_to_local(&option.table_path(&gen_2)).unwrap().exists()); + assert!(path_to_local(&option.table_path(&gen_3)).unwrap().exists()); tx.send_async(CleanTag::Clean { ts: 1.into() }) .await .unwrap(); - sleep(Duration::from_millis(1)).await; - assert!(!TokioExecutor::file_exist(option.table_path(&gen_1)) - .await - .unwrap()); - assert!(!TokioExecutor::file_exist(option.table_path(&gen_2)) - .await - .unwrap()); - assert!(TokioExecutor::file_exist(option.table_path(&gen_3)) - .await - .unwrap()); + sleep(Duration::from_millis(10)).await; + assert!(!path_to_local(&option.table_path(&gen_1)).unwrap().exists()); + assert!(!path_to_local(&option.table_path(&gen_2)).unwrap().exists()); + assert!(path_to_local(&option.table_path(&gen_3)).unwrap().exists()); - tx.send_async(CleanTag::RecoverClean { gen: gen_3 }) + tx.send_async(CleanTag::RecoverClean { wal_id: gen_3 }) .await .unwrap(); - sleep(Duration::from_millis(1)).await; - assert!(!TokioExecutor::file_exist(option.table_path(&gen_3)) - .await - .unwrap()); + sleep(Duration::from_millis(10)).await; + assert!(!path_to_local(&option.table_path(&gen_3)).unwrap().exists()); } } diff --git a/src/version/edit.rs b/src/version/edit.rs index 93cce83..d591eb1 100644 --- a/src/version/edit.rs +++ b/src/version/edit.rs @@ -1,6 +1,6 @@ use std::mem::size_of; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use fusio::{Read, Write}; use crate::{ fs::FileId, @@ -21,7 +21,7 @@ impl VersionEdit where K: Decode, { - pub(crate) async fn recover(reader: &mut R) -> Vec> { + pub(crate) async fn recover(reader: &mut R) -> Vec> { let mut edits = Vec::new(); while let Ok(edit) = VersionEdit::decode(reader).await { @@ -39,25 +39,26 @@ where async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { match self { VersionEdit::Add { scope, level } => { - writer.write_all(&0u8.to_le_bytes()).await?; - writer.write_all(&level.to_le_bytes()).await?; + 0u8.encode(writer).await?; + level.encode(writer).await?; scope.encode(writer).await?; } VersionEdit::Remove { gen, level } => { - writer.write_all(&1u8.to_le_bytes()).await?; - writer.write_all(&level.to_le_bytes()).await?; - writer.write_all(&gen.to_bytes()).await?; + 1u8.encode(writer).await?; + level.encode(writer).await?; + let (result, _) = 
writer.write_all(&gen.to_bytes()[..]).await; + result?; } VersionEdit::LatestTimeStamp { ts } => { - writer.write_all(&2u8.to_le_bytes()).await?; + 2u8.encode(writer).await?; ts.encode(writer).await?; } VersionEdit::NewLogLength { len } => { - writer.write_all(&3u8.to_le_bytes()).await?; + 3u8.encode(writer).await?; len.encode(writer).await?; } } @@ -83,34 +84,22 @@ where { type Error = ::Error; - async fn decode(reader: &mut R) -> Result { - let edit_type = { - let mut len = [0; size_of::()]; - reader.read_exact(&mut len).await?; - u8::from_le_bytes(len) as usize - }; + async fn decode(reader: &mut R) -> Result { + let edit_type = u8::decode(reader).await?; Ok(match edit_type { 0 => { - let level = { - let mut level = [0; size_of::()]; - reader.read_exact(&mut level).await?; - u8::from_le_bytes(level) - }; + let level = u8::decode(reader).await?; let scope = Scope::::decode(reader).await?; VersionEdit::Add { level, scope } } 1 => { - let level = { - let mut level = [0; size_of::()]; - reader.read_exact(&mut level).await?; - u8::from_le_bytes(level) - }; + let level = u8::decode(reader).await?; let gen = { - let mut slice = [0; 16]; - reader.read_exact(&mut slice).await?; - FileId::from_bytes(slice) + let buf = reader.read_exact(vec![0u8; 16]).await?; + // SAFETY + FileId::from_bytes(buf.as_slice().try_into().unwrap()) }; VersionEdit::Remove { level, gen } } @@ -131,6 +120,8 @@ where mod tests { use std::io::Cursor; + use fusio::Seek; + use crate::{fs::FileId, scope::Scope, serdes::Encode, version::edit::VersionEdit}; #[tokio::test] @@ -153,17 +144,15 @@ mod tests { VersionEdit::NewLogLength { len: 233 }, ]; - let bytes = { - let mut cursor = Cursor::new(vec![]); + let mut buf = Vec::new(); + let mut cursor = Cursor::new(&mut buf); - for edit in edits.clone() { - edit.encode(&mut cursor).await.unwrap(); - } - cursor.into_inner() - }; + for edit in edits.clone() { + edit.encode(&mut cursor).await.unwrap(); + } let decode_edits = { - let mut cursor = Cursor::new(bytes); + cursor.seek(0).await.unwrap(); VersionEdit::::recover(&mut cursor).await }; diff --git a/src/version/mod.rs b/src/version/mod.rs index fdc2c94..95d319b 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -3,7 +3,6 @@ pub(crate) mod edit; pub(crate) mod set; use std::{ - marker::PhantomData, ops::Bound, sync::{ atomic::{AtomicU32, Ordering}, @@ -12,12 +11,13 @@ use std::{ }; use flume::{SendError, Sender}; +use fusio::DynFs; use parquet::arrow::ProjectionMask; use thiserror::Error; use tracing::error; use crate::{ - fs::{FileId, FileProvider}, + fs::{default_open_options, manager::StoreManager, FileId}, ondisk::sstable::SsTable, record::Record, scope::Scope, @@ -30,7 +30,7 @@ use crate::{ pub(crate) const MAX_LEVEL: usize = 7; -pub(crate) type VersionRef = Arc>; +pub(crate) type VersionRef = Arc>; pub(crate) trait TransactionTs { fn load_ts(&self) -> Timestamp; @@ -39,10 +39,9 @@ pub(crate) trait TransactionTs { } #[derive(Debug)] -pub(crate) struct Version +pub(crate) struct Version where R: Record, - FP: FileProvider, { ts: Timestamp, pub(crate) level_slice: [Vec>; MAX_LEVEL], @@ -50,13 +49,11 @@ where option: Arc>, timestamp: Arc, log_length: u32, - _p: PhantomData, } -impl Version +impl Version where R: Record, - FP: FileProvider, { #[cfg(test)] pub(crate) fn new( @@ -71,7 +68,6 @@ where option: option.clone(), timestamp, log_length: 0, - _p: Default::default(), } } @@ -80,10 +76,9 @@ where } } -impl TransactionTs for Version +impl TransactionTs for Version where R: Record, - FP: FileProvider, { fn load_ts(&self) 
-> Timestamp { self.timestamp.load(Ordering::Acquire).into() @@ -94,10 +89,9 @@ where } } -impl Clone for Version +impl Clone for Version where R: Record, - FP: FileProvider, { fn clone(&self) -> Self { let mut level_slice = [const { Vec::new() }; MAX_LEVEL]; @@ -113,42 +107,56 @@ where option: self.option.clone(), timestamp: self.timestamp.clone(), log_length: self.log_length, - _p: Default::default(), } } } -impl Version +impl Version where R: Record, - FP: FileProvider, { pub(crate) async fn query( &self, + manager: &StoreManager, key: &TimestampedRef, projection_mask: ProjectionMask, ) -> Result>, VersionError> { + let level_0_path = self + .option + .level_fs_path(0) + .unwrap_or(&self.option.base_path); + let level_0_fs = manager.get_fs(level_0_path); for scope in self.level_slice[0].iter().rev() { if !scope.contains(key.value()) { continue; } if let Some(entry) = self - .table_query(key, &scope.gen, projection_mask.clone()) + .table_query(level_0_fs, key, &scope.gen, projection_mask.clone()) .await? { return Ok(Some(entry)); } } - for level in self.level_slice[1..6].iter() { - if level.is_empty() { + for (i, sort_runs) in self.level_slice[1..MAX_LEVEL].iter().enumerate() { + let level_path = self + .option + .level_fs_path(i + 1) + .unwrap_or(&self.option.base_path); + let level_fs = manager.get_fs(level_path); + if sort_runs.is_empty() { continue; } - let index = Self::scope_search(key.value(), level); - if !level[index].contains(key.value()) { + let index = Self::scope_search(key.value(), sort_runs); + if !sort_runs[index].contains(key.value()) { continue; } if let Some(entry) = self - .table_query(key, &level[index].gen, projection_mask.clone()) + .table_query( + level_fs, + key, + &sort_runs[index].gen, + projection_mask.clone(), + ) .await? { return Ok(Some(entry)); @@ -160,14 +168,17 @@ where async fn table_query( &self, + store: &Arc, key: &TimestampedRef<::Key>, gen: &FileId, projection_mask: ProjectionMask, ) -> Result>, VersionError> { - let file = FP::open(self.option.table_path(gen)) + let file = store + .open_options(&self.option.table_path(gen), default_open_options()) .await - .map_err(VersionError::Io)?; - SsTable::::open(file) + .map_err(VersionError::Fusio)?; + SsTable::::open(file) + .await? 
.get(key, projection_mask) .await .map_err(VersionError::Parquet) @@ -185,20 +196,27 @@ where pub(crate) async fn streams<'streams>( &self, - streams: &mut Vec>, + manager: &StoreManager, + streams: &mut Vec>, range: (Bound<&'streams R::Key>, Bound<&'streams R::Key>), ts: Timestamp, limit: Option, projection_mask: ProjectionMask, ) -> Result<(), VersionError> { + let level_0_path = self + .option + .level_fs_path(0) + .unwrap_or(&self.option.base_path); + let level_0_fs = manager.get_fs(level_0_path); for scope in self.level_slice[0].iter() { if !scope.meets_range(range) { continue; } - let file = FP::open(self.option.table_path(&scope.gen)) + let file = level_0_fs + .open_options(&self.option.table_path(&scope.gen), default_open_options()) .await - .map_err(VersionError::Io)?; - let table = SsTable::open(file); + .map_err(VersionError::Fusio)?; + let table = SsTable::open(file).await?; streams.push(ScanStream::SsTable { inner: table @@ -211,6 +229,11 @@ where if scopes.is_empty() { continue; } + let level_path = self + .option + .level_fs_path(i + 1) + .unwrap_or(&self.option.base_path); + let level_fs = manager.get_fs(level_path); let (mut start, mut end) = (None, None); @@ -237,6 +260,7 @@ where ts, limit, projection_mask.clone(), + level_fs.clone(), ) .unwrap(), }); @@ -261,10 +285,9 @@ where } } -impl Drop for Version +impl Drop for Version where R: Record, - FP: FileProvider, { fn drop(&mut self) { if let Err(err) = self.clean_sender.send(CleanTag::Clean { ts: self.ts }) { @@ -284,6 +307,10 @@ where Io(#[from] std::io::Error), #[error("version parquet error: {0}")] Parquet(#[from] parquet::errors::ParquetError), + #[error("version fusio error: {0}")] + Fusio(#[from] fusio::Error), + #[error("version ulid decode error: {0}")] + UlidDecode(#[from] ulid::DecodeError), #[error("version send error: {0}")] Send(#[from] SendError), } diff --git a/src/version/set.rs b/src/version/set.rs index 3d5f65a..3c3b2bc 100644 --- a/src/version/set.rs +++ b/src/version/set.rs @@ -1,7 +1,6 @@ use std::{ - io::SeekFrom, + collections::BinaryHeap, mem, - pin::pin, sync::{ atomic::{AtomicU32, Ordering}, Arc, @@ -10,12 +9,12 @@ use std::{ use async_lock::RwLock; use flume::Sender; +use fusio::{dynamic::DynFile, fs::FileMeta, Seek}; use futures_util::StreamExt; -use tokio::io::{AsyncSeekExt, AsyncWriteExt}; use super::{TransactionTs, MAX_LEVEL}; use crate::{ - fs::{FileId, FileProvider, FileType}, + fs::{default_open_options, manager::StoreManager, parse_file_id, FileId, FileType}, record::Record, serdes::Encode, timestamp::Timestamp, @@ -23,30 +22,50 @@ use crate::{ DbOption, }; -pub(crate) struct VersionSetInner +struct CmpMeta(FileMeta); + +impl Eq for CmpMeta {} + +impl PartialEq for CmpMeta { + fn eq(&self, other: &Self) -> bool { + self.0.path.eq(&other.0.path) + } +} + +impl PartialOrd for CmpMeta { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for CmpMeta { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.0.path.cmp(&other.0.path) + } +} + +pub(crate) struct VersionSetInner where R: Record, - FP: FileProvider, { - current: VersionRef, - log_with_id: (FP::File, FileId), + current: VersionRef, + log_with_id: (Box, FileId), } -pub(crate) struct VersionSet +pub(crate) struct VersionSet where R: Record, - FP: FileProvider, { - inner: Arc>>, + inner: Arc>>, clean_sender: Sender, timestamp: Arc, option: Arc>, + manager: Arc, } -impl Clone for VersionSet +impl Clone for VersionSet where R: Record, - FP: FileProvider, { fn clone(&self) -> Self { 
VersionSet { @@ -54,14 +73,14 @@ where clean_sender: self.clean_sender.clone(), timestamp: self.timestamp.clone(), option: self.option.clone(), + manager: self.manager.clone(), } } } -impl TransactionTs for VersionSet +impl TransactionTs for VersionSet where R: Record, - FP: FileProvider, { fn load_ts(&self) -> Timestamp { self.timestamp.load(Ordering::Acquire).into() @@ -72,88 +91,90 @@ where } } -impl VersionSet +impl VersionSet where R: Record, - FP: FileProvider, { pub(crate) async fn new( clean_sender: Sender, option: Arc>, + manager: Arc, ) -> Result> { - let mut log_stream = pin!(FP::list( - option.version_log_dir_path(), - FileType::Log, - true - )?); - let mut first_log_id = None; - let mut version_log_id = None; - let mut version_log = None; + let fs = manager.base_fs(); + let version_dir = option.version_log_dir_path(); + let mut log_stream = fs.list(&version_dir).await?; + let mut log_binary_heap = BinaryHeap::with_capacity(3); // when there are multiple logs, this means that a downtime occurred during the // `version_log_snap_shot` process, the second newest file has the highest data // integrity, so it is used as the version log, and the older log is deleted first // to avoid midway downtime, which will cause the second newest file to become the // first newest after restart. - let mut i = 0; while let Some(result) = log_stream.next().await { - let (log, log_id) = result?; + let file_meta = result?; - if i <= 1 { - version_log = Some(log); - first_log_id = mem::replace(&mut version_log_id, Some(log_id)); - } else { - FP::remove(option.version_log_path(&log_id)).await?; - } + log_binary_heap.push(CmpMeta(file_meta)); - i += 1; + if log_binary_heap.len() > 2 { + if let Some(old_meta) = log_binary_heap.pop() { + fs.remove(&old_meta.0.path).await?; + } + } } - if let Some(log_id) = first_log_id { - FP::remove(option.version_log_path(&log_id)).await?; + + let second_log_id = log_binary_heap.pop(); + let latest_log_id = log_binary_heap.pop(); + + if let (Some(log_id), Some(_)) = (&latest_log_id, &second_log_id) { + fs.remove(&log_id.0.path).await?; } - let (mut log, log_id) = if let (Some(log), Some(log_id)) = (version_log, version_log_id) { - (log, log_id) - } else { - let log_id = FileId::new(); - let log = FP::open(option.version_log_path(&log_id)).await?; - (log, log_id) - }; + let log_id = second_log_id + .or(latest_log_id) + .map(|file_meta| parse_file_id(&file_meta.0.path, FileType::Log)) + .transpose()? 
+ .flatten() + .unwrap_or_else(FileId::new); + + let mut log = fs + .open_options(&option.version_log_path(&log_id), default_open_options()) + .await?; + log.seek(0).await.unwrap(); let edits = VersionEdit::recover(&mut log).await; - log.seek(SeekFrom::End(0)).await?; let timestamp = Arc::new(AtomicU32::default()); - let set = VersionSet:: { + drop(log_stream); + let set = VersionSet:: { inner: Arc::new(RwLock::new(VersionSetInner { - current: Arc::new(Version:: { + current: Arc::new(Version:: { ts: Timestamp::from(0), level_slice: [const { Vec::new() }; MAX_LEVEL], clean_sender: clean_sender.clone(), option: option.clone(), timestamp: timestamp.clone(), log_length: 0, - _p: Default::default(), }), log_with_id: (log, log_id), })), clean_sender, timestamp, option, + manager, }; set.apply_edits(edits, None, true).await?; Ok(set) } - pub(crate) async fn current(&self) -> VersionRef { + pub(crate) async fn current(&self) -> VersionRef { self.inner.read().await.current.clone() } pub(crate) async fn apply_edits( &self, mut version_edits: Vec>, - delete_gens: Option>, + delete_gens: Option>, is_recover: bool, ) -> Result<(), VersionError> { let timestamp = &self.timestamp; @@ -178,7 +199,11 @@ where if let Some(wal_ids) = scope.wal_ids.take() { for wal_id in wal_ids { // may have been removed after multiple starts - let _ = FP::remove(option.wal_path(&wal_id)).await; + let _ = self + .manager + .base_fs() + .remove(&option.wal_path(&wal_id)) + .await; } } if level == 0 { @@ -203,7 +228,7 @@ where // issue: https://github.com/tonbo-io/tonbo/issues/123 new_version .clean_sender - .send_async(CleanTag::RecoverClean { gen }) + .send_async(CleanTag::RecoverClean { wal_id: gen }) .await .map_err(VersionError::Send)?; } @@ -229,17 +254,22 @@ where .await .map_err(VersionError::Send)?; } - log.flush().await?; + log.sync_all().await?; if edit_len >= option.version_log_snapshot_threshold { + let fs = self.manager.base_fs(); let old_log_id = mem::replace(log_id, FileId::new()); - let _ = mem::replace(log, FP::open(option.version_log_path(log_id)).await?); + let new_log = fs + .open_options(&option.version_log_path(log_id), default_open_options()) + .await?; + let mut old_log = mem::replace(log, new_log); + old_log.close().await?; new_version.log_length = 0; for new_edit in new_version.to_edits() { new_edit.encode(log).await.map_err(VersionError::Encode)?; } - log.flush().await?; - FP::remove(option.version_log_path(&old_log_id)).await?; + log.sync_all().await?; + fs.remove(&option.version_log_path(&old_log_id)).await?; } guard.current = Arc::new(new_version); Ok(()) @@ -248,17 +278,16 @@ where #[cfg(test)] pub(crate) mod tests { - use std::{io::SeekFrom, pin::pin, sync::Arc}; + use std::sync::Arc; use async_lock::RwLock; use flume::{bounded, Sender}; + use fusio::{local::TokioFs, path::Path}; use futures_util::StreamExt; use tempfile::TempDir; - use tokio::io::AsyncSeekExt; use crate::{ - executor::tokio::TokioExecutor, - fs::{FileId, FileProvider, FileType}, + fs::{default_open_options, manager::StoreManager, FileId}, record::Record, scope::Scope, version::{ @@ -270,21 +299,23 @@ pub(crate) mod tests { DbOption, }; - pub(crate) async fn build_version_set( - version: Version, + pub(crate) async fn build_version_set( + version: Version, clean_sender: Sender, option: Arc>, - ) -> Result, VersionError> + manager: Arc, + ) -> Result, VersionError> where R: Record, - FP: FileProvider, { let log_id = FileId::new(); - let mut log = FP::open(option.version_log_path(&log_id)).await?; - 
log.seek(SeekFrom::End(0)).await?; - + let log = manager + .base_fs() + .open_options(&option.version_log_path(&log_id), default_open_options()) + .await?; let timestamp = version.timestamp.clone(); - Ok(VersionSet:: { + + Ok(VersionSet:: { inner: Arc::new(RwLock::new(VersionSetInner { current: Arc::new(version), log_with_id: (log, log_id), @@ -292,20 +323,26 @@ pub(crate) mod tests { clean_sender, timestamp, option, + manager, }) } #[tokio::test] async fn timestamp_persistence() { let temp_dir = TempDir::new().unwrap(); + let manager = Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); let (sender, _) = bounded(1); - let option = Arc::new(DbOption::from(temp_dir.path())); - TokioExecutor::create_dir_all(&option.version_log_dir_path()) + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); + manager + .base_fs() + .create_dir_all(&option.version_log_dir_path()) .await .unwrap(); - let version_set: VersionSet = - VersionSet::new(sender.clone(), option.clone()) + let version_set: VersionSet = + VersionSet::new(sender.clone(), option.clone(), manager.clone()) .await .unwrap(); @@ -320,8 +357,8 @@ pub(crate) mod tests { drop(version_set); - let version_set: VersionSet = - VersionSet::new(sender.clone(), option.clone()) + let version_set: VersionSet = + VersionSet::new(sender.clone(), option.clone(), manager) .await .unwrap(); assert_eq!(version_set.load_ts(), 20_u32.into()); @@ -330,17 +367,19 @@ pub(crate) mod tests { #[tokio::test] async fn version_log_snap_shot() { let temp_dir = TempDir::new().unwrap(); + let manager = Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); let (sender, _) = bounded(1); - let mut option = DbOption::from(temp_dir.path()); + let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); option.version_log_snapshot_threshold = 4; let option = Arc::new(option); - TokioExecutor::create_dir_all(&option.version_log_dir_path()) + manager + .create_dir_all(&option.version_log_dir_path()) .await .unwrap(); - let version_set: VersionSet = - VersionSet::new(sender.clone(), option.clone()) + let version_set: VersionSet = + VersionSet::new(sender.clone(), option.clone(), manager.clone()) .await .unwrap(); let gen_0 = FileId::new(); @@ -395,7 +434,7 @@ pub(crate) mod tests { let mut guard = version_set.inner.write().await; let log = &mut guard.log_with_id.0; - log.seek(SeekFrom::Start(0)).await.unwrap(); + log.seek(0).await.unwrap(); let edits = VersionEdit::::recover(log).await; assert_eq!(edits.len(), 3); @@ -418,12 +457,20 @@ pub(crate) mod tests { drop(guard); drop(version_set); - let (mut log, _) = - pin!(TokioExecutor::list(option.version_log_dir_path(), FileType::Log, true).unwrap()) - .next() - .await - .unwrap() - .unwrap(); + let version_dir_path = option.version_log_dir_path(); + let mut stream = manager.base_fs().list(&version_dir_path).await.unwrap(); + let mut logs = Vec::new(); + + while let Some(log) = stream.next().await { + logs.push(log.unwrap()); + } + logs.sort_by(|meta_a, meta_b| meta_a.path.cmp(&meta_b.path)); + + let mut log = manager + .base_fs() + .open_options(&logs.pop().unwrap().path, default_open_options()) + .await + .unwrap(); let edits = VersionEdit::::recover(&mut log).await; assert_eq!(edits.len(), 3); @@ -448,16 +495,20 @@ pub(crate) mod tests { #[tokio::test] async fn version_level_sort() { let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(temp_dir.path()); - let option = Arc::new(option); + let manager = 
Arc::new(StoreManager::new(Arc::new(TokioFs), vec![])); + let option = Arc::new(DbOption::from( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + )); let (sender, _) = bounded(1); - TokioExecutor::create_dir_all(&option.version_log_dir_path()) + manager + .base_fs() + .create_dir_all(&option.version_log_dir_path()) .await .unwrap(); - let version_set: VersionSet = - VersionSet::new(sender.clone(), option.clone()) + let version_set: VersionSet = + VersionSet::new(sender.clone(), option.clone(), manager) .await .unwrap(); let gen_0 = FileId::new(); diff --git a/src/wal/checksum.rs b/src/wal/checksum.rs index c3a1191..2dabbd8 100644 --- a/src/wal/checksum.rs +++ b/src/wal/checksum.rs @@ -1,24 +1,15 @@ -use std::{ - hash::Hasher, - io, - io::Error, - pin::Pin, - task::{Context, Poll}, -}; - -use futures_core::ready; -use pin_project_lite::pin_project; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt, ReadBuf}; - -pin_project! { - pub(crate) struct HashWriter { - hasher: crc32fast::Hasher, - #[pin] - writer: W, - } +use std::{future::Future, hash::Hasher}; + +use fusio::{Error, IoBuf, IoBufMut, MaybeSend, Read, Write}; + +use crate::serdes::{Decode, Encode}; + +pub(crate) struct HashWriter { + hasher: crc32fast::Hasher, + writer: W, } -impl HashWriter { +impl HashWriter { pub(crate) fn new(writer: W) -> Self { Self { hasher: crc32fast::Hasher::new(), @@ -26,46 +17,39 @@ impl HashWriter { } } - pub(crate) async fn eol(mut self) -> io::Result { - self.writer.write(&self.hasher.finish().to_le_bytes()).await + pub(crate) async fn eol(mut self) -> Result<(), fusio::Error> { + let i = self.hasher.finish(); + i.encode(&mut self.writer).await } } -impl AsyncWrite for HashWriter { - fn poll_write( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &[u8], - ) -> Poll> { - let this = self.project(); - - Poll::Ready(match ready!(this.writer.poll_write(cx, buf)) { - Ok(n) => { - this.hasher.write(&buf[..n]); - Ok(n) - } - e => e, - }) +impl Write for HashWriter { + async fn write_all(&mut self, buf: B) -> (Result<(), Error>, B) { + let (result, buf) = self.writer.write_all(buf).await; + self.hasher.write(buf.as_slice()); + + (result, buf) } - fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().writer.poll_flush(cx) + fn sync_data(&self) -> impl Future> + MaybeSend { + self.writer.sync_data() } - fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - self.project().writer.poll_shutdown(cx) + fn sync_all(&self) -> impl Future> + MaybeSend { + self.writer.sync_all() } -} -pin_project! 
{ - pub(crate) struct HashReader { - hasher: crc32fast::Hasher, - #[pin] - reader: R, + fn close(&mut self) -> impl Future> + MaybeSend { + self.writer.close() } } -impl HashReader { +pub(crate) struct HashReader { + hasher: crc32fast::Hasher, + reader: R, +} + +impl HashReader { pub(crate) fn new(reader: R) -> Self { Self { hasher: crc32fast::Hasher::new(), @@ -73,28 +57,55 @@ impl HashReader { } } - pub(crate) async fn checksum(mut self) -> io::Result { - let mut hash = [0; 8]; - self.reader.read_exact(&mut hash).await?; - let checksum = u64::from_le_bytes(hash); + pub(crate) async fn checksum(mut self) -> Result { + let checksum = u64::decode(&mut self.reader).await?; Ok(self.hasher.finish() == checksum) } } -impl AsyncRead for HashReader { - fn poll_read( - self: Pin<&mut Self>, - cx: &mut Context<'_>, - buf: &mut ReadBuf<'_>, - ) -> Poll> { - let this = self.project(); - Poll::Ready(match ready!(this.reader.poll_read(cx, buf)) { - Ok(()) => { - this.hasher.write(buf.filled()); - Ok(()) - } - e => e, - }) +impl Read for HashReader { + async fn read_exact(&mut self, buf: B) -> Result { + let bytes = self.reader.read_exact(buf).await?; + self.hasher.write(bytes.as_slice()); + + Ok(bytes) + } + + async fn size(&self) -> Result { + self.reader.size().await + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::io::Cursor; + + use fusio::Seek; + + use crate::{ + serdes::{Decode, Encode}, + wal::checksum::{HashReader, HashWriter}, + }; + + #[tokio::test] + async fn test_encode_decode() { + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + + let mut writer = HashWriter::new(&mut cursor); + 4_u64.encode(&mut writer).await.unwrap(); + 3_u32.encode(&mut writer).await.unwrap(); + 2_u16.encode(&mut writer).await.unwrap(); + 1_u8.encode(&mut writer).await.unwrap(); + writer.eol().await.unwrap(); + + cursor.seek(0).await.unwrap(); + let mut reader = HashReader::new(&mut cursor); + assert_eq!(u64::decode(&mut reader).await.unwrap(), 4); + assert_eq!(u32::decode(&mut reader).await.unwrap(), 3); + assert_eq!(u16::decode(&mut reader).await.unwrap(), 2); + assert_eq!(u8::decode(&mut reader).await.unwrap(), 1); + assert!(reader.checksum().await.unwrap()); } } diff --git a/src/wal/log.rs b/src/wal/log.rs index c27285c..10a816b 100644 --- a/src/wal/log.rs +++ b/src/wal/log.rs @@ -1,6 +1,6 @@ use std::mem::size_of; -use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; +use fusio::{Read, Write}; use crate::serdes::{Decode, Encode}; @@ -45,9 +45,9 @@ where async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { - writer.write_all(&[self.log_type as u8]).await?; + (self.log_type as u8).encode(writer).await?; self.record.encode(writer).await } @@ -64,12 +64,9 @@ where async fn decode(reader: &mut R) -> Result where - R: AsyncRead + Unpin, + R: Read + Unpin, { - let mut log_type = [0]; - reader.read_exact(&mut log_type).await?; - let log_type = LogType::from(log_type[0]); - + let log_type = LogType::from(u8::decode(reader).await?); let log = Re::decode(reader).await?; Ok(Self { diff --git a/src/wal/mod.rs b/src/wal/mod.rs index 0c59d25..fd5dcf6 100644 --- a/src/wal/mod.rs +++ b/src/wal/mod.rs @@ -2,14 +2,14 @@ mod checksum; pub(crate) mod log; pub(crate) mod record_entry; -use std::{io, marker::PhantomData}; +use std::marker::PhantomData; use async_stream::stream; use checksum::{HashReader, HashWriter}; +use fusio::{Read, Write}; use futures_core::Stream; use log::Log; use thiserror::Error; -use 
tokio::io::{AsyncBufReadExt, AsyncRead, AsyncWrite, AsyncWriteExt, BufReader}; use crate::{ fs::FileId, @@ -42,7 +42,7 @@ impl WalFile { impl WalFile where - F: AsyncWrite + Unpin + Send, + F: Write + Unpin + Send, R: Record, { pub(crate) async fn write<'r>( @@ -59,14 +59,14 @@ where Ok(()) } - pub(crate) async fn flush(&mut self) -> io::Result<()> { - self.file.flush().await + pub(crate) async fn flush(&mut self) -> Result<(), fusio::Error> { + self.file.close().await } } impl WalFile where - F: AsyncRead + Unpin, + F: Read + Unpin, R: Record, { pub(crate) fn recover( @@ -78,17 +78,13 @@ where >, > + '_ { stream! { - let mut file = BufReader::new(&mut self.file); - loop { - if file.buffer().is_empty() && file.fill_buf().await?.is_empty() { - return; - } - - let mut reader = HashReader::new(&mut file); - - let record = Log::>::decode(&mut reader).await.map_err(RecoverError::Io)?; + let mut reader = HashReader::new(&mut self.file); + let record = match Log::>::decode(&mut reader).await { + Ok(record) => record, + Err(_) => return, + }; if !reader.checksum().await? { yield Err(RecoverError::Checksum); return; @@ -111,12 +107,15 @@ pub enum RecoverError { Checksum, #[error("wal recover io error")] Io(#[from] std::io::Error), + #[error("wal recover fusio error")] + Fusio(#[from] fusio::Error), } #[cfg(test)] mod tests { use std::{io::Cursor, pin::pin}; + use fusio::Seek; use futures_util::StreamExt; use super::{log::LogType, FileId, WalFile}; @@ -124,9 +123,10 @@ mod tests { #[tokio::test] async fn write_and_recover() { - let mut file = Vec::new(); + let mut bytes = Vec::new(); + let mut file = Cursor::new(&mut bytes); { - let mut wal = WalFile::<_, String>::new(Cursor::new(&mut file), FileId::new()); + let mut wal = WalFile::<_, String>::new(&mut file, FileId::new()); wal.write( LogType::Full, Timestamped::new("hello", 0.into()), @@ -137,7 +137,8 @@ mod tests { wal.flush().await.unwrap(); } { - let mut wal = WalFile::<_, String>::new(Cursor::new(&mut file), FileId::new()); + file.seek(0).await.unwrap(); + let mut wal = WalFile::<_, String>::new(&mut file, FileId::new()); { let mut stream = pin!(wal.recover()); @@ -146,6 +147,8 @@ mod tests { assert_eq!(value, Some("hello".to_string())); } + let mut wal = WalFile::<_, String>::new(&mut file, FileId::new()); + wal.write( LogType::Full, Timestamped::new("world", 1.into()), @@ -156,7 +159,8 @@ mod tests { } { - let mut wal = WalFile::<_, String>::new(Cursor::new(&mut file), FileId::new()); + file.seek(0).await.unwrap(); + let mut wal = WalFile::<_, String>::new(&mut file, FileId::new()); { let mut stream = pin!(wal.recover()); diff --git a/src/wal/record_entry.rs b/src/wal/record_entry.rs index 47517c4..1bbf142 100644 --- a/src/wal/record_entry.rs +++ b/src/wal/record_entry.rs @@ -1,6 +1,4 @@ -use std::io; - -use tokio::io::{AsyncRead, AsyncWrite}; +use fusio::{Read, Write}; use crate::{ record::{Key, Record}, @@ -20,11 +18,11 @@ impl Encode for RecordEntry<'_, R> where R: Record, { - type Error = io::Error; + type Error = fusio::Error; async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: AsyncWrite + Unpin + Send, + W: Write + Unpin + Send, { if let RecordEntry::Encode((key, recode_ref)) = self { key.encode(writer).await.unwrap(); @@ -47,11 +45,11 @@ impl Decode for RecordEntry<'_, Re> where Re: Record, { - type Error = io::Error; + type Error = fusio::Error; async fn decode(reader: &mut R) -> Result where - R: AsyncRead + Unpin, + R: Read + Unpin, { let key = Timestamped::::decode(reader).await.unwrap(); let record = 
Option::::decode(reader).await.unwrap(); @@ -64,6 +62,8 @@ where mod tests { use std::io::Cursor; + use fusio::Seek; + use crate::{ serdes::{Decode, Encode}, timestamp::Timestamped, @@ -74,15 +74,12 @@ mod tests { async fn encode_and_decode() { let entry: RecordEntry<'static, String> = RecordEntry::Encode((Timestamped::new("hello", 0.into()), Some("hello"))); - let bytes = { - let mut cursor = Cursor::new(vec![]); - - entry.encode(&mut cursor).await.unwrap(); - cursor.into_inner() - }; + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); + entry.encode(&mut cursor).await.unwrap(); let decode_entry = { - let mut cursor = Cursor::new(bytes); + cursor.seek(0).await.unwrap(); RecordEntry::<'static, String>::decode(&mut cursor) .await diff --git a/tests/data_integrity.rs b/tests/data_integrity.rs index 4343c22..2e4449f 100644 --- a/tests/data_integrity.rs +++ b/tests/data_integrity.rs @@ -1,10 +1,11 @@ #[cfg(test)] mod tests { - use std::{hash::Hasher, ops::Bound}; + use std::{hash::Hasher, ops::Bound, sync::Arc}; + use fusio::{local::TokioFs, path::Path}; use futures_util::StreamExt; use tempfile::TempDir; - use tonbo::{executor::tokio::TokioExecutor, DbOption, Record, DB}; + use tonbo::{executor::tokio::TokioExecutor, fs::manager::StoreManager, DbOption, Record, DB}; const WRITE_TIMES: usize = 500_000; const STRING_SIZE: usize = 50; @@ -68,10 +69,13 @@ mod tests { let mut primary_key_count = 0; let mut write_hasher = crc32fast::Hasher::new(); + let manager = StoreManager::new(Arc::new(TokioFs), vec![]); let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(temp_dir.path()); + let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), manager) + .await + .unwrap(); for _ in 0..WRITE_TIMES { let customer = gen_record(&mut rng, &mut primary_key_count); diff --git a/tests/macros_correctness.rs b/tests/macros_correctness.rs index 135ab55..e3c77bf 100644 --- a/tests/macros_correctness.rs +++ b/tests/macros_correctness.rs @@ -10,9 +10,10 @@ pub struct User { #[cfg(test)] mod tests { - use std::sync::Arc; + use std::{io::Cursor, sync::Arc}; use arrow::array::{BooleanArray, RecordBatch, StringArray, UInt32Array, UInt8Array}; + use fusio::Seek; use parquet::{ arrow::{arrow_to_parquet_schema, ProjectionMask}, format::SortingColumn, @@ -126,7 +127,8 @@ mod tests { &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), vec![0, 1, 2, 3, 4], ); - let record_ref = UserRef::from_record_batch(&record_batch, 0, &project_mask); + let record_ref = + UserRef::from_record_batch(&record_batch, 0, &project_mask, User::arrow_schema()); assert_eq!( record_ref.value(), Timestamped { @@ -158,7 +160,8 @@ mod tests { &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), vec![0, 1, 3, 4], ); - let record_ref = UserRef::from_record_batch(&record_batch, 0, &project_mask); + let record_ref = + UserRef::from_record_batch(&record_batch, 0, &project_mask, User::arrow_schema()); assert_eq!( record_ref.value(), Timestamped { @@ -184,20 +187,20 @@ mod tests { age: 32, }; let original_ref = original.as_record_ref(); - let mut buffer = Vec::new(); + let mut bytes = Vec::new(); + let mut cursor = Cursor::new(&mut bytes); assert_eq!(original_ref.size(), 26); - original_ref.encode(&mut buffer).await.unwrap(); + original_ref.encode(&mut cursor).await.unwrap(); - let mut cursor = std::io::Cursor::new(buffer); + cursor.seek(0).await.unwrap(); let decoded = 
User::decode(&mut cursor).await.unwrap(); - assert_eq!(original, decoded); } #[tokio::test] async fn test_record_arrays() { - let mut builder = UserImmutableArrays::builder(10); + let mut builder = UserImmutableArrays::builder(User::arrow_schema(), 10); let cat = User { email: Some("cat@example.com".to_string()), @@ -258,7 +261,7 @@ mod tests { #[tokio::test] async fn test_record_arrays_projection() { - let mut builder = UserImmutableArrays::builder(10); + let mut builder = UserImmutableArrays::builder(User::arrow_schema(), 10); let cat = User { email: Some("cat@example.com".to_string()), diff --git a/tonbo_macros/src/record.rs b/tonbo_macros/src/record.rs index 8b0e024..5ffb887 100644 --- a/tonbo_macros/src/record.rs +++ b/tonbo_macros/src/record.rs @@ -310,7 +310,7 @@ fn trait_decode_codegen(struct_name: &Ident, fields: &[RecordStructFieldOpt]) -> async fn decode(reader: &mut R) -> Result where - R: ::tokio::io::AsyncRead + Unpin, + R: ::fusio::Read + Unpin, { #(#decode_method_fields)* @@ -449,6 +449,7 @@ fn trait_decode_ref_codegen( record_batch: &'r ::tonbo::arrow::record_batch::RecordBatch, offset: usize, projection_mask: &'r ::tonbo::parquet::arrow::ProjectionMask, + _: &::std::sync::Arc<::tonbo::arrow::datatypes::Schema>, ) -> ::tonbo::record::internal::InternalRecordRef<'r, Self> { use ::tonbo::arrow::array::AsArray; @@ -498,7 +499,7 @@ fn trait_encode_codegen(struct_name: &Ident, fields: &[RecordStructFieldOpt]) -> async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> where - W: ::tokio::io::AsyncWrite + Unpin + Send, + W: ::fusio::Write + Unpin + Send, { #(#encode_method_fields)* @@ -598,7 +599,7 @@ fn trait_arrow_array_codegen( type Builder = #struct_builder_name; - fn builder(capacity: usize) -> Self::Builder { + fn builder(schema: &::std::sync::Arc<::tonbo::arrow::datatypes::Schema>, capacity: usize) -> Self::Builder { #struct_builder_name { #(#builder_init_fields)* @@ -691,15 +692,21 @@ fn struct_builder_codegen( self.#field_name.append_null(); }); } else { - builder_push_some_fields.push(quote! { - self.#field_name.append_value(row.#field_name.unwrap()); - }); - builder_push_none_fields.push(if is_string { - quote!(self.#field_name.append_value("");) + let append_default = if is_string { + quote!(self.#field_name.append_value("")) } else if is_bytes { - quote!(self.#field_name.append_value(&[]);) + quote!(self.#field_name.append_value(&[])) } else { - quote!(self.#field_name.append_value(Default::default());) + quote!(self.#field_name.append_value(Default::default())) + }; + builder_push_some_fields.push(quote! { + match row.#field_name { + Some(#field_name) => self.#field_name.append_value(#field_name), + None => #append_default, + } + }); + builder_push_none_fields.push(quote! { + #append_default; }); } }
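
Note on the version-edit encoding touched above: `VersionEdit` serializes each variant as a one-byte tag (0 = Add, 1 = Remove, 2 = LatestTimeStamp, 3 = NewLogLength) followed by the variant payload, and `recover` decodes entries until the reader is exhausted. The sketch below mirrors that round-trip for the two simplest variants, written against `std::io::Cursor` instead of fusio's `Read`/`Write` so it compiles on its own; the `Edit`, `encode`, and `recover` names are illustrative, not tonbo's.

```rust
use std::io::{Cursor, Read, Write};

#[derive(Debug, PartialEq)]
enum Edit {
    LatestTimeStamp { ts: u32 },
    NewLogLength { len: u32 },
}

fn encode(edit: &Edit, w: &mut impl Write) -> std::io::Result<()> {
    match edit {
        Edit::LatestTimeStamp { ts } => {
            w.write_all(&2u8.to_le_bytes())?; // tag byte, as in VersionEdit::encode
            w.write_all(&ts.to_le_bytes())?;
        }
        Edit::NewLogLength { len } => {
            w.write_all(&3u8.to_le_bytes())?;
            w.write_all(&len.to_le_bytes())?;
        }
    }
    Ok(())
}

fn recover(r: &mut impl Read) -> Vec<Edit> {
    let mut edits = Vec::new();
    loop {
        let mut tag = [0u8; 1];
        if r.read_exact(&mut tag).is_err() {
            break; // EOF ends recovery, mirroring `while let Ok(edit) = decode(..)`
        }
        let mut buf = [0u8; 4];
        if r.read_exact(&mut buf).is_err() {
            break;
        }
        let v = u32::from_le_bytes(buf);
        edits.push(match tag[0] {
            2 => Edit::LatestTimeStamp { ts: v },
            3 => Edit::NewLogLength { len: v },
            _ => break,
        });
    }
    edits
}

fn main() {
    let edits = vec![Edit::LatestTimeStamp { ts: 10 }, Edit::NewLogLength { len: 233 }];
    let mut buf = Vec::new();
    for e in &edits {
        encode(e, &mut buf).unwrap();
    }
    let mut cursor = Cursor::new(buf);
    assert_eq!(recover(&mut cursor), edits);
}
```

This also explains why the reworked `edit.rs` test now writes into a `Cursor`, seeks back to 0, and recovers from the same handle: fusio exposes seeking through its own `Seek` trait rather than `std::io::SeekFrom`.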
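
Note on `src/wal/checksum.rs`: the rewritten `HashWriter`/`HashReader` keep the same framing idea as before, hashing every byte that passes through and appending (or verifying) a trailing little-endian `u64` CRC32, only now expressed through fusio's owned-buffer `Write`/`Read` traits. A minimal sketch of that framing, using synchronous `std::io` so it stays self-contained; `frame_write` and `frame_verify` are hypothetical helpers, not part of tonbo's API.

```rust
use std::hash::Hasher;
use std::io::{self, Read, Write};

fn frame_write<W: Write>(mut w: W, payload: &[u8]) -> io::Result<()> {
    let mut hasher = crc32fast::Hasher::new();
    hasher.write(payload); // hash exactly the bytes written to the log
    w.write_all(payload)?;
    w.write_all(&hasher.finish().to_le_bytes())?; // trailing u64 checksum
    Ok(())
}

fn frame_verify<R: Read>(mut r: R, payload_len: usize) -> io::Result<bool> {
    let mut payload = vec![0u8; payload_len];
    r.read_exact(&mut payload)?;
    let mut trailer = [0u8; 8];
    r.read_exact(&mut trailer)?;
    let mut hasher = crc32fast::Hasher::new();
    hasher.write(&payload);
    Ok(hasher.finish() == u64::from_le_bytes(trailer))
}

fn main() -> io::Result<()> {
    let mut buf = Vec::new();
    frame_write(&mut buf, b"hello wal")?;
    assert!(frame_verify(&buf[..], b"hello wal".len())?);
    Ok(())
}
```

The new `test_encode_decode` in the patch exercises the same property through the fusio traits: encode a few integers through `HashWriter`, call `eol()` to append the checksum, then decode them back through `HashReader` and assert `checksum()` returns true.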
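
Note on the recurring `level_fs_path(..)` / `get_fs(..)` / `base_fs()` pattern in `version/mod.rs` and `cleaner.rs`: each LSM level may be routed to its own store (for example, deeper levels on object storage), and anything without an explicit mapping falls back to the base filesystem. The sketch below only illustrates that routing idea; `StoreRouter`, `Store`, `Local`, `S3`, and `fs_for_level` are hypothetical stand-ins and do not reflect fusio's or tonbo's actual `StoreManager` API.

```rust
use std::{collections::HashMap, sync::Arc};

trait Store {
    fn name(&self) -> &str;
}

struct Local;
impl Store for Local {
    fn name(&self) -> &str { "local" }
}

struct S3;
impl Store for S3 {
    fn name(&self) -> &str { "s3" }
}

struct StoreRouter {
    base: Arc<dyn Store>,
    by_path: HashMap<String, Arc<dyn Store>>,
    // index = level; None means "use the base store", like level_fs_path returning None
    level_paths: Vec<Option<String>>,
}

impl StoreRouter {
    fn fs_for_level(&self, level: usize) -> &Arc<dyn Store> {
        self.level_paths
            .get(level)
            .and_then(|p| p.as_ref())
            .and_then(|p| self.by_path.get(p))
            .unwrap_or(&self.base)
    }
}

fn main() {
    let cold: Arc<dyn Store> = Arc::new(S3);
    let router = StoreRouter {
        base: Arc::new(Local),
        by_path: HashMap::from([("s3://cold".to_string(), cold)]),
        level_paths: vec![None, None, Some("s3://cold".to_string())],
    };
    assert_eq!(router.fs_for_level(0).name(), "local"); // L0 stays on the base store
    assert_eq!(router.fs_for_level(2).name(), "s3");    // deeper level routed elsewhere
    assert_eq!(router.fs_for_level(6).name(), "local"); // unmapped levels fall back
}
```

This is also why `CleanTag::Add` now carries `(FileId, usize)` pairs instead of bare file ids: the cleaner needs the level to look up which store actually holds the SSTable before removing it.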