diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cc0f3e4b..e85c69e7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -147,3 +147,23 @@ jobs: - name: Run native examples run: make native-examples + # 7 + python-tests: + name: Python bindings tests + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Install stable toolchain + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Run python binding tests + run: make test-python diff --git a/Cargo.lock b/Cargo.lock index 062751a5..284ef84b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1215,6 +1215,15 @@ dependencies = [ "web-time", ] +[[package]] +name = "indoc" +version = "2.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" +dependencies = [ + "rustversion", +] + [[package]] name = "inferno" version = "0.11.21" @@ -1380,6 +1389,7 @@ dependencies = [ "paste", "pgwire", "pprof", + "pyo3", "recursive", "regex", "rocksdb", @@ -1593,6 +1603,15 @@ dependencies = [ "libc", ] +[[package]] +name = "memoffset" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" +dependencies = [ + "autocfg", +] + [[package]] name = "minicov" version = "0.3.8" @@ -2030,6 +2049,69 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "pyo3" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7778bffd85cf38175ac1f545509665d0b9b92a198ca7941f131f85f7a4f9a872" +dependencies = [ + "cfg-if", + "indoc", + "libc", + "memoffset", + "once_cell", + "portable-atomic", + "pyo3-build-config", + "pyo3-ffi", + "pyo3-macros", + "unindent", +] + +[[package]] +name = "pyo3-build-config" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94f6cbe86ef3bf18998d9df6e0f3fc1050a8c5efa409bf712e661a4366e010fb" +dependencies = [ + "once_cell", + "target-lexicon", +] + +[[package]] +name = "pyo3-ffi" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9f1b4c431c0bb1c8fb0a338709859eed0d030ff6daa34368d3b152a63dfdd8d" +dependencies = [ + "libc", + "pyo3-build-config", +] + +[[package]] +name = "pyo3-macros" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc2201328f63c4710f68abdf653c89d8dbc2858b88c5d88b0ff38a75288a9da" +dependencies = [ + "proc-macro2", + "pyo3-macros-backend", + "quote", + "syn 2.0.104", +] + +[[package]] +name = "pyo3-macros-backend" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca6726ad0f3da9c9de093d6f116a93c1a38e417ed73bf138472cf4064f72028" +dependencies = [ + "heck", + "proc-macro2", + "pyo3-build-config", + "quote", + "syn 2.0.104", +] + [[package]] name = "quick-xml" version = "0.26.0" @@ -2722,6 +2804,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "target-lexicon" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" + [[package]] name = "tempfile" version = "3.20.0" @@ -3022,6 +3110,12 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" +[[package]] +name = "unindent" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" + [[package]] name = "untrusted" version = "0.7.1" diff --git a/Cargo.toml b/Cargo.toml index 92d5d0e3..15c93b57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ default = ["macros"] macros = [] net = ["dep:pgwire", "dep:async-trait", "dep:clap", "dep:env_logger", "dep:futures", "dep:log", "dep:tokio"] pprof = ["pprof/criterion", "pprof/flamegraph"] +python = ["dep:pyo3"] [[bench]] name = "query_bench" @@ -48,6 +49,7 @@ itertools = { version = "0.12" } ordered-float = { version = "4", features = ["serde"] } paste = { version = "1" } parking_lot = { version = "0.12", features = ["arc_lock"] } +pyo3 = { version = "0.23", features = ["auto-initialize"], optional = true } recursive = { version = "0.1" } regex = { version = "1" } rust_decimal = { version = "1" } diff --git a/Makefile b/Makefile index dc834df2..48514d3c 100644 --- a/Makefile +++ b/Makefile @@ -3,12 +3,16 @@ CARGO ?= cargo WASM_PACK ?= wasm-pack SQLLOGIC_PATH ?= tests/slt/**/*.slt -.PHONY: test test-wasm test-slt test-all wasm-build check tpcc tpcc-dual cargo-check build wasm-examples native-examples fmt clippy +.PHONY: test test-python test-wasm test-slt test-all wasm-build check tpcc tpcc-dual cargo-check build wasm-examples native-examples fmt clippy ## Run default Rust tests in the current environment (non-WASM). test: $(CARGO) test --all +## Run Python binding API tests implemented with pyo3. +test-python: + $(CARGO) test --features python test_python_ + ## Perform a `cargo check` across the workspace. cargo-check: $(CARGO) check @@ -30,7 +34,7 @@ test-slt: $(CARGO) run -p sqllogictest-test -- --path '$(SQLLOGIC_PATH)' ## Convenience target to run every suite in sequence. -test-all: test test-wasm test-slt +test-all: test test-wasm test-slt test-python ## Run formatting (check mode) across the workspace. fmt: diff --git a/README.md b/README.md index 18a953fc..c0e33475 100755 --- a/README.md +++ b/README.md @@ -52,6 +52,20 @@ console.log(rows.map((r) => r.values.map((v) => v.Int32 ?? v))); ``` - In Node.js, provide a small `localStorage` shim if you enable statistics-related features (see `examples/wasm_index_usage.test.mjs`). +## Python (PyO3) +- Enable bindings with Cargo feature `python`. +- Constructor is explicit: `Database(path)`; in-memory usage is `Database.in_memory()`. +- Minimal usage: +```python +import kite_sql + +db = kite_sql.Database.in_memory() +db.execute("create table demo(id int primary key, v int)") +db.execute("insert into demo values (1, 2), (2, 4)") +for row in db.run("select * from demo"): + print(row["values"]) +``` + ## Examples ```rust diff --git a/src/lib.rs b/src/lib.rs index bd36d60f..0bf5554c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -115,6 +115,8 @@ mod optimizer; pub mod parser; pub mod paths; pub mod planner; +#[cfg(all(not(target_arch = "wasm32"), feature = "python"))] +pub mod python; pub mod serdes; pub mod storage; pub mod types; diff --git a/src/python.rs b/src/python.rs new file mode 100644 index 00000000..a9f83040 --- /dev/null +++ b/src/python.rs @@ -0,0 +1,422 @@ +// Copyright 2024 KipData/KiteSQL +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#![cfg(all(not(target_arch = "wasm32"), feature = "python"))] + +use crate::db::{DataBaseBuilder, Database, DatabaseIter, ResultIter}; +use crate::errors::DatabaseError; +use crate::storage::memory::MemoryStorage; +use crate::storage::rocksdb::RocksStorage; +use crate::types::tuple::{SchemaRef, Tuple}; +use crate::types::value::DataValue; +use pyo3::exceptions::{PyRuntimeError, PyValueError}; +use pyo3::prelude::*; +use pyo3::types::{PyDict, PyList, PyModule}; + +fn to_py_err(err: impl ToString) -> PyErr { + PyRuntimeError::new_err(err.to_string()) +} + +#[allow(deprecated)] +fn data_value_to_py(py: Python<'_>, value: &DataValue) -> PyResult { + let object = match value { + DataValue::Null => py.None(), + DataValue::Boolean(value) => value.into_py(py), + DataValue::Float32(value) => value.0.into_py(py), + DataValue::Float64(value) => value.0.into_py(py), + DataValue::Int8(value) => value.into_py(py), + DataValue::Int16(value) => value.into_py(py), + DataValue::Int32(value) => value.into_py(py), + DataValue::Int64(value) => value.into_py(py), + DataValue::UInt8(value) => value.into_py(py), + DataValue::UInt16(value) => value.into_py(py), + DataValue::UInt32(value) => value.into_py(py), + DataValue::UInt64(value) => value.into_py(py), + DataValue::Utf8 { value, .. } => value.clone().into_py(py), + DataValue::Date32(_) + | DataValue::Date64(_) + | DataValue::Time32(_, _) + | DataValue::Time64(_, _, _) + | DataValue::Decimal(_) => value.to_string().into_py(py), + DataValue::Tuple(values, _is_upper) => { + let py_values = values + .iter() + .map(|value| data_value_to_py(py, value)) + .collect::>>()?; + PyList::new(py, py_values)?.into_any().unbind() + } + }; + + Ok(object) +} + +fn tuple_to_python_row(py: Python<'_>, tuple: Tuple) -> PyResult { + let row = PyDict::new(py); + + match tuple.pk.as_ref() { + Some(pk) => row.set_item("pk", data_value_to_py(py, pk)?)?, + None => row.set_item("pk", py.None())?, + } + + let values = tuple + .values + .iter() + .map(|value| data_value_to_py(py, value)) + .collect::>>()?; + row.set_item("values", PyList::new(py, values)?)?; + + Ok(row.into_any().unbind()) +} + +fn schema_to_python(py: Python<'_>, schema: &SchemaRef) -> PyResult> { + schema + .iter() + .map(|col| { + let column = PyDict::new(py); + column.set_item("name", col.name())?; + column.set_item("datatype", col.datatype().to_string())?; + column.set_item("nullable", col.nullable())?; + Ok(column.into_any().unbind()) + }) + .collect() +} + +enum PythonDatabaseInner { + Memory(Database), + Rocks(Database), +} + +impl PythonDatabaseInner { + fn run(&self, sql: &str) -> Result { + match self { + PythonDatabaseInner::Memory(db) => { + let iter = db.run(sql)?; + // DatabaseIter owns state internally; only the type carries the lifetime. + let iter_static: DatabaseIter<'static, MemoryStorage> = + unsafe { std::mem::transmute(iter) }; + Ok(PythonResultIterInner::Memory(iter_static)) + } + PythonDatabaseInner::Rocks(db) => { + let iter = db.run(sql)?; + // DatabaseIter owns state internally; only the type carries the lifetime. + let iter_static: DatabaseIter<'static, RocksStorage> = + unsafe { std::mem::transmute(iter) }; + Ok(PythonResultIterInner::Rocks(iter_static)) + } + } + } +} + +enum PythonResultIterInner { + Memory(DatabaseIter<'static, MemoryStorage>), + Rocks(DatabaseIter<'static, RocksStorage>), +} + +impl PythonResultIterInner { + fn next_tuple(&mut self) -> Option> { + match self { + PythonResultIterInner::Memory(iter) => iter.next(), + PythonResultIterInner::Rocks(iter) => iter.next(), + } + } + + fn schema(&self) -> &SchemaRef { + match self { + PythonResultIterInner::Memory(iter) => iter.schema(), + PythonResultIterInner::Rocks(iter) => iter.schema(), + } + } + + fn done(self) -> Result<(), DatabaseError> { + match self { + PythonResultIterInner::Memory(iter) => iter.done(), + PythonResultIterInner::Rocks(iter) => iter.done(), + } + } +} + +#[pyclass(name = "Database", unsendable)] +pub struct PythonDatabase { + inner: PythonDatabaseInner, +} + +#[pymethods] +impl PythonDatabase { + #[new] + pub fn new(path: String) -> PyResult { + let inner = + PythonDatabaseInner::Rocks(DataBaseBuilder::path(path).build().map_err(to_py_err)?); + + Ok(PythonDatabase { inner }) + } + + #[staticmethod] + pub fn in_memory() -> PyResult { + let inner = PythonDatabaseInner::Memory( + DataBaseBuilder::path(".") + .build_in_memory() + .map_err(to_py_err)?, + ); + Ok(PythonDatabase { inner }) + } + + pub fn run(&self, sql: &str) -> PyResult { + let iter = self.inner.run(sql).map_err(to_py_err)?; + Ok(PythonResultIter { inner: Some(iter) }) + } + + pub fn execute(&self, sql: &str) -> PyResult<()> { + let mut iter = self.inner.run(sql).map_err(to_py_err)?; + while let Some(tuple) = iter.next_tuple() { + tuple.map_err(to_py_err)?; + } + iter.done().map_err(to_py_err)?; + + Ok(()) + } +} + +#[pyclass(name = "ResultIter", unsendable)] +pub struct PythonResultIter { + inner: Option, +} + +impl PythonResultIter { + fn inner_ref(&self) -> PyResult<&PythonResultIterInner> { + self.inner + .as_ref() + .ok_or_else(|| PyValueError::new_err("iterator already consumed")) + } + + fn inner_mut(&mut self) -> PyResult<&mut PythonResultIterInner> { + self.inner + .as_mut() + .ok_or_else(|| PyValueError::new_err("iterator already consumed")) + } +} + +#[pymethods] +impl PythonResultIter { + pub fn next(&mut self, py: Python<'_>) -> PyResult> { + let iter = self.inner_mut()?; + + match iter.next_tuple() { + Some(Ok(tuple)) => tuple_to_python_row(py, tuple).map(Some), + Some(Err(err)) => Err(to_py_err(err)), + None => Ok(None), + } + } + + pub fn schema(&self, py: Python<'_>) -> PyResult> { + let iter = self.inner_ref()?; + schema_to_python(py, iter.schema()) + } + + pub fn rows(&mut self, py: Python<'_>) -> PyResult> { + let mut iter = self + .inner + .take() + .ok_or_else(|| PyValueError::new_err("iterator already consumed"))?; + + let mut rows = Vec::new(); + while let Some(tuple) = iter.next_tuple() { + rows.push(tuple_to_python_row(py, tuple.map_err(to_py_err)?)?); + } + iter.done().map_err(to_py_err)?; + + Ok(rows) + } + + pub fn finish(&mut self) -> PyResult<()> { + if let Some(iter) = self.inner.take() { + iter.done().map_err(to_py_err)?; + } + Ok(()) + } + + fn __iter__(slf: PyRefMut<'_, Self>) -> PyRefMut<'_, Self> { + slf + } + + fn __next__(&mut self, py: Python<'_>) -> PyResult> { + self.next(py) + } +} + +#[pymodule] +fn kite_sql(_py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::kite_sql; + use pyo3::exceptions::PyRuntimeError; + use pyo3::ffi::c_str; + use pyo3::prelude::*; + use pyo3::types::{PyDict, PyModule}; + use std::ffi::CStr; + use tempfile::TempDir; + + fn register_module<'py>(py: Python<'py>) -> PyResult> { + let module = PyModule::new(py, "kite_sql")?; + kite_sql(py, &module)?; + Ok(module) + } + + fn run_script( + py: Python<'_>, + module: &Bound<'_, PyModule>, + script: &'static CStr, + use_memory: bool, + db_path: &str, + ) -> PyResult<()> { + let locals = PyDict::new(py); + locals.set_item("kite_sql", module)?; + locals.set_item("use_memory", use_memory)?; + locals.set_item("db_path", db_path)?; + py.run(script, None, Some(&locals)) + } + + fn run_script_on_all_backends( + py: Python<'_>, + module: &Bound<'_, PyModule>, + script: &'static CStr, + ) -> PyResult<()> { + run_script(py, module, script, true, "")?; + + let temp_dir = + TempDir::new().map_err(|e| PyRuntimeError::new_err(format!("create tempdir: {e}")))?; + let path = temp_dir.path().to_string_lossy().to_string(); + run_script(py, module, script, false, &path)?; + + Ok(()) + } + + #[test] + fn test_python_hello_world_api() -> PyResult<()> { + Python::with_gil(|py| { + let module = register_module(py)?; + run_script_on_all_backends( + py, + &module, + c_str!( + r#" +db = kite_sql.Database.in_memory() if use_memory else kite_sql.Database(db_path) +db.execute("drop table if exists my_struct") +db.execute("create table my_struct (c1 int primary key, c2 int)") +db.execute("insert into my_struct values(0, 0), (1, 1)") + +iter_obj = db.run("select * from my_struct") +schema = iter_obj.schema() +assert schema == [ + {"name": "c1", "datatype": "Integer", "nullable": False}, + {"name": "c2", "datatype": "Integer", "nullable": True}, +] +rows = iter_obj.rows() +assert len(rows) == 2 +assert rows[0]["values"] == [0, 0] +assert rows[1]["values"] == [1, 1] + +db.execute("update my_struct set c2 = c2 + 10 where c1 = 1") +after = db.run("select c2 from my_struct where c1 = 1").rows() +assert after[0]["values"] == [11] + +stream = db.run("select * from my_struct") +streamed = [] +row = stream.next() +while row is not None: + streamed.append(row["values"]) + row = stream.next() +stream.finish() +assert streamed == [[0, 0], [1, 11]] + +db.execute("drop table my_struct") +"# + ), + )?; + Ok(()) + }) + } + + #[test] + fn test_python_index_usage_api() -> PyResult<()> { + Python::with_gil(|py| { + let module = register_module(py)?; + run_script_on_all_backends( + py, + &module, + c_str!( + r#" +db = kite_sql.Database.in_memory() if use_memory else kite_sql.Database(db_path) +db.execute("drop table if exists t1") +db.execute("create table t1(id int primary key, c1 int, c2 int)") + +for i in range(2000): + id_v = i * 3 + c1_v = id_v + 1 + c2_v = id_v + 2 + db.execute(f"insert into t1 values({id_v}, {c1_v}, {c2_v})") + +db.execute("create unique index u_c1_index on t1 (c1)") +db.execute("create index c2_index on t1 (c2)") +db.execute("create index p_index on t1 (c1, c2)") +db.execute("analyze table t1") + +def row_vals(row): + ints = row["values"] + pk = row["pk"] if row["pk"] is not None else ints[0] + return [pk] + ints[1:] + +first10 = db.run("select * from t1 limit 10").rows() +assert len(first10) == 10 + +pk_row = [row_vals(r) for r in db.run("select * from t1 where id = 0").rows()] +assert pk_row == [[0, 1, 2]] + +range_pk = [row_vals(r) for r in db.run("select * from t1 where id >= 9 and id <= 15").rows()] +assert range_pk == [ + [9, 10, 11], + [12, 13, 14], + [15, 16, 17], +] + +c1_eq = [row_vals(r) for r in db.run("select * from t1 where c1 = 7 and c2 = 8").rows()] +assert c1_eq == [[6, 7, 8]] + +c2_range = [row_vals(r) for r in db.run("select * from t1 where c2 > 100 and c2 < 110").rows()] +assert c2_range == [ + [99, 100, 101], + [102, 103, 104], + [105, 106, 107], +] + +db.execute("update t1 set c2 = 123456 where c1 = 7") +after_update = [row_vals(r) for r in db.run("select * from t1 where c2 = 123456").rows()] +assert after_update == [[6, 7, 123456]] + +db.execute("delete from t1 where c1 = 7") +after_delete = db.run("select * from t1 where c2 = 123456").rows() +assert len(after_delete) == 0 + +db.execute("drop table t1") +"# + ), + )?; + Ok(()) + }) + } +}