diff --git a/scripts/cache_data.json b/scripts/cache_data.json index 3e3b7ee1..fea6034d 100644 --- a/scripts/cache_data.json +++ b/scripts/cache_data.json @@ -530,7 +530,9 @@ "name": "polars", "children": [ "polars.DataFrame", - "polars.LazyFrame" + "polars.LazyFrame", + "polars.col", + "polars.lit" ], "required": false }, @@ -808,5 +810,17 @@ "full_path": "typing.Union", "name": "Union", "children": [] + }, + "polars.col": { + "type": "attribute", + "full_path": "polars.col", + "name": "col", + "children": [] + }, + "polars.lit": { + "type": "attribute", + "full_path": "polars.lit", + "name": "lit", + "children": [] } } \ No newline at end of file diff --git a/scripts/imports.py b/scripts/imports.py index ee4c4597..26e0394b 100644 --- a/scripts/imports.py +++ b/scripts/imports.py @@ -109,6 +109,8 @@ polars.DataFrame polars.LazyFrame +polars.col +polars.lit import duckdb import duckdb.filesystem diff --git a/src/duckdb_py/arrow/CMakeLists.txt b/src/duckdb_py/arrow/CMakeLists.txt index 9a9188b8..2f92f09b 100644 --- a/src/duckdb_py/arrow/CMakeLists.txt +++ b/src/duckdb_py/arrow/CMakeLists.txt @@ -1,5 +1,6 @@ # this is used for clang-tidy checks -add_library(python_arrow OBJECT arrow_array_stream.cpp arrow_export_utils.cpp - pyarrow_filter_pushdown.cpp) +add_library( + python_arrow OBJECT arrow_array_stream.cpp arrow_export_utils.cpp + polars_filter_pushdown.cpp pyarrow_filter_pushdown.cpp) target_link_libraries(python_arrow PRIVATE _duckdb_dependencies) diff --git a/src/duckdb_py/arrow/arrow_array_stream.cpp b/src/duckdb_py/arrow/arrow_array_stream.cpp index 3284d6e1..4f438dec 100644 --- a/src/duckdb_py/arrow/arrow_array_stream.cpp +++ b/src/duckdb_py/arrow/arrow_array_stream.cpp @@ -1,4 +1,5 @@ #include "duckdb_python/arrow/arrow_array_stream.hpp" +#include "duckdb_python/arrow/polars_filter_pushdown.hpp" #include "duckdb_python/arrow/pyarrow_filter_pushdown.hpp" #include "duckdb_python/pyconnection/pyconnection.hpp" @@ -66,6 +67,55 @@ unique_ptr PythonTableArrowArrayStreamFactory::Produce( py::handle arrow_obj_handle(factory->arrow_object); auto arrow_object_type = factory->cached_arrow_type; + if (arrow_object_type == PyArrowObjectType::PolarsLazyFrame) { + py::object lf = py::reinterpret_borrow(arrow_obj_handle); + + auto filters = parameters.filters; + bool filters_pushed = false; + + // Translate DuckDB filters to Polars expressions and push into the lazy plan + if (filters && !filters->filters.empty()) { + try { + auto filter_expr = PolarsFilterPushdown::TransformFilter( + *filters, parameters.projected_columns.projection_map, parameters.projected_columns.filter_to_col, + factory->client_properties); + if (!filter_expr.is(py::none())) { + lf = lf.attr("filter")(filter_expr); + filters_pushed = true; + } + } catch (...) { + // Fallback: DuckDB handles filtering post-scan + } + } + + // If no filters were pushed and we have a cached Arrow table, reuse it. This avoids re-reading from source and + // re-converting on repeated unfiltered scans. + py::object arrow_table; + if (!filters_pushed && factory->cached_arrow_table.ptr() != nullptr) { + arrow_table = factory->cached_arrow_table; + } else { + arrow_table = lf.attr("collect")().attr("to_arrow")(); + // Cache only unfiltered results (filtered results are partial) + if (!filters_pushed) { + factory->cached_arrow_table = arrow_table; + } + } + + // Apply column projection + auto &column_list = parameters.projected_columns.columns; + if (!column_list.empty()) { + arrow_table = arrow_table.attr("select")(py::cast(column_list)); + } + + auto capsule_obj = arrow_table.attr("__arrow_c_stream__")(); + auto capsule = py::reinterpret_borrow(capsule_obj); + auto stream = capsule.get_pointer(); + auto res = make_uniq(); + res->arrow_array_stream = *stream; + stream->release = nullptr; + return res; + } + if (arrow_object_type == PyArrowObjectType::PyCapsuleInterface || arrow_object_type == PyArrowObjectType::Table) { py::object capsule_obj = arrow_obj_handle.attr("__arrow_c_stream__")(); auto capsule = py::reinterpret_borrow(capsule_obj); @@ -190,6 +240,20 @@ void PythonTableArrowArrayStreamFactory::GetSchema(uintptr_t factory_ptr, ArrowS py::handle arrow_obj_handle(factory->arrow_object); auto type = factory->cached_arrow_type; + if (type == PyArrowObjectType::PolarsLazyFrame) { + // head(0).collect().to_arrow() gives the Arrow-exported schema (e.g. large_string) without materializing data. + // collect_schema() would give Polars-native types (e.g. string_view) that don't match the actual export. + const auto empty_arrow = arrow_obj_handle.attr("head")(0).attr("collect")().attr("to_arrow")(); + const auto schema_capsule = empty_arrow.attr("schema").attr("__arrow_c_schema__")(); + const auto capsule = py::reinterpret_borrow(schema_capsule); + const auto arrow_schema = capsule.get_pointer(); + factory->cached_schema = *arrow_schema; + arrow_schema->release = nullptr; + factory->schema_cached = true; + schema.arrow_schema = factory->cached_schema; + schema.arrow_schema.release = nullptr; + return; + } if (type == PyArrowObjectType::PyCapsuleInterface || type == PyArrowObjectType::Table) { // Get __arrow_c_schema__ if it exists if (py::hasattr(arrow_obj_handle, "__arrow_c_schema__")) { diff --git a/src/duckdb_py/arrow/polars_filter_pushdown.cpp b/src/duckdb_py/arrow/polars_filter_pushdown.cpp new file mode 100644 index 00000000..493189a3 --- /dev/null +++ b/src/duckdb_py/arrow/polars_filter_pushdown.cpp @@ -0,0 +1,161 @@ +#include "duckdb_python/arrow/polars_filter_pushdown.hpp" + +#include "duckdb/planner/filter/in_filter.hpp" +#include "duckdb/planner/filter/optional_filter.hpp" +#include "duckdb/planner/filter/conjunction_filter.hpp" +#include "duckdb/planner/filter/constant_filter.hpp" +#include "duckdb/planner/filter/struct_filter.hpp" +#include "duckdb/planner/table_filter.hpp" + +#include "duckdb_python/pyconnection/pyconnection.hpp" +#include "duckdb_python/python_objects.hpp" + +namespace duckdb { + +static py::object TransformFilterRecursive(TableFilter &filter, py::object col_expr, + const ClientProperties &client_properties) { + auto &import_cache = *DuckDBPyConnection::ImportCache(); + + switch (filter.filter_type) { + case TableFilterType::CONSTANT_COMPARISON: { + auto &constant_filter = filter.Cast(); + auto &constant = constant_filter.constant; + auto &constant_type = constant.type(); + + // Check for NaN + bool is_nan = false; + if (constant_type.id() == LogicalTypeId::FLOAT) { + is_nan = Value::IsNan(constant.GetValue()); + } else if (constant_type.id() == LogicalTypeId::DOUBLE) { + is_nan = Value::IsNan(constant.GetValue()); + } + + if (is_nan) { + switch (constant_filter.comparison_type) { + case ExpressionType::COMPARE_EQUAL: + case ExpressionType::COMPARE_GREATERTHANOREQUALTO: + return col_expr.attr("is_nan")(); + case ExpressionType::COMPARE_LESSTHAN: + case ExpressionType::COMPARE_NOTEQUAL: + return col_expr.attr("is_nan")().attr("__invert__")(); + case ExpressionType::COMPARE_GREATERTHAN: + return import_cache.polars.lit()(false); + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + return import_cache.polars.lit()(true); + default: + return py::none(); + } + } + + // Convert DuckDB Value to Python object + auto py_value = PythonObject::FromValue(constant, constant_type, client_properties); + + switch (constant_filter.comparison_type) { + case ExpressionType::COMPARE_EQUAL: + return col_expr.attr("__eq__")(py_value); + case ExpressionType::COMPARE_LESSTHAN: + return col_expr.attr("__lt__")(py_value); + case ExpressionType::COMPARE_GREATERTHAN: + return col_expr.attr("__gt__")(py_value); + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + return col_expr.attr("__le__")(py_value); + case ExpressionType::COMPARE_GREATERTHANOREQUALTO: + return col_expr.attr("__ge__")(py_value); + case ExpressionType::COMPARE_NOTEQUAL: + return col_expr.attr("__ne__")(py_value); + default: + return py::none(); + } + } + case TableFilterType::IS_NULL: { + return col_expr.attr("is_null")(); + } + case TableFilterType::IS_NOT_NULL: { + return col_expr.attr("is_not_null")(); + } + case TableFilterType::CONJUNCTION_AND: { + auto &and_filter = filter.Cast(); + py::object expression = py::none(); + for (idx_t i = 0; i < and_filter.child_filters.size(); i++) { + auto child_expression = TransformFilterRecursive(*and_filter.child_filters[i], col_expr, client_properties); + if (child_expression.is(py::none())) { + continue; + } + if (expression.is(py::none())) { + expression = std::move(child_expression); + } else { + expression = expression.attr("__and__")(child_expression); + } + } + return expression; + } + case TableFilterType::CONJUNCTION_OR: { + auto &or_filter = filter.Cast(); + py::object expression = py::none(); + for (idx_t i = 0; i < or_filter.child_filters.size(); i++) { + auto child_expression = TransformFilterRecursive(*or_filter.child_filters[i], col_expr, client_properties); + if (child_expression.is(py::none())) { + // Can't skip children in OR + return py::none(); + } + if (expression.is(py::none())) { + expression = std::move(child_expression); + } else { + expression = expression.attr("__or__")(child_expression); + } + } + return expression; + } + case TableFilterType::STRUCT_EXTRACT: { + auto &struct_filter = filter.Cast(); + auto child_col = col_expr.attr("struct").attr("field")(struct_filter.child_name); + return TransformFilterRecursive(*struct_filter.child_filter, child_col, client_properties); + } + case TableFilterType::IN_FILTER: { + auto &in_filter = filter.Cast(); + py::list py_values; + for (const auto &value : in_filter.values) { + py_values.append(PythonObject::FromValue(value, value.type(), client_properties)); + } + return col_expr.attr("is_in")(py_values); + } + case TableFilterType::OPTIONAL_FILTER: { + auto &optional_filter = filter.Cast(); + if (!optional_filter.child_filter) { + return py::none(); + } + return TransformFilterRecursive(*optional_filter.child_filter, col_expr, client_properties); + } + default: + // We skip DYNAMIC_FILTER, EXPRESSION_FILTER, BLOOM_FILTER + return py::none(); + } +} + +py::object PolarsFilterPushdown::TransformFilter(const TableFilterSet &filter_collection, + unordered_map &columns, + const unordered_map &filter_to_col, + const ClientProperties &client_properties) { + auto &import_cache = *DuckDBPyConnection::ImportCache(); + auto &filters_map = filter_collection.filters; + + py::object expression = py::none(); + for (auto &it : filters_map) { + auto column_idx = it.first; + auto &column_name = columns[column_idx]; + auto col_expr = import_cache.polars.col()(column_name); + + auto child_expression = TransformFilterRecursive(*it.second, col_expr, client_properties); + if (child_expression.is(py::none())) { + continue; + } + if (expression.is(py::none())) { + expression = std::move(child_expression); + } else { + expression = expression.attr("__and__")(child_expression); + } + } + return expression; +} + +} // namespace duckdb diff --git a/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp b/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp index 69b72708..90974ff7 100644 --- a/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp +++ b/src/duckdb_py/include/duckdb_python/arrow/arrow_array_stream.hpp @@ -51,7 +51,16 @@ class Table : public py::object { } // namespace pyarrow -enum class PyArrowObjectType { Invalid, Table, Scanner, Dataset, PyCapsule, PyCapsuleInterface, MessageReader }; +enum class PyArrowObjectType { + Invalid, + Table, + Scanner, + Dataset, + PyCapsule, + PyCapsuleInterface, + MessageReader, + PolarsLazyFrame +}; void TransformDuckToArrowChunk(ArrowSchema &arrow_schema, ArrowArray &data, py::list &batches); @@ -66,6 +75,10 @@ class PythonTableArrowArrayStreamFactory { } ~PythonTableArrowArrayStreamFactory() { + if (cached_arrow_table.ptr() != nullptr) { + py::gil_scoped_acquire acquire; + cached_arrow_table = py::object(); + } if (cached_schema.release) { cached_schema.release(&cached_schema); } @@ -84,6 +97,10 @@ class PythonTableArrowArrayStreamFactory { const ClientProperties client_properties; const PyArrowObjectType cached_arrow_type; + //! Cached Arrow table from an unfiltered .collect().to_arrow() on a LazyFrame. + //! Avoids re-reading from source and re-converting on repeated scans without filters. + py::object cached_arrow_table; + private: ArrowSchema cached_schema; bool schema_cached = false; diff --git a/src/duckdb_py/include/duckdb_python/arrow/polars_filter_pushdown.hpp b/src/duckdb_py/include/duckdb_python/arrow/polars_filter_pushdown.hpp new file mode 100644 index 00000000..adf485c9 --- /dev/null +++ b/src/duckdb_py/include/duckdb_python/arrow/polars_filter_pushdown.hpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb_python/arrow/polars_filter_pushdown.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/common/unordered_map.hpp" +#include "duckdb/planner/table_filter.hpp" +#include "duckdb/main/client_properties.hpp" +#include "duckdb_python/pybind11/pybind_wrapper.hpp" + +namespace duckdb { + +struct PolarsFilterPushdown { + static py::object TransformFilter(const TableFilterSet &filter_collection, unordered_map &columns, + const unordered_map &filter_to_col, + const ClientProperties &client_properties); +}; + +} // namespace duckdb diff --git a/src/duckdb_py/include/duckdb_python/import_cache/modules/polars_module.hpp b/src/duckdb_py/include/duckdb_python/import_cache/modules/polars_module.hpp index c22173b4..17f746fb 100644 --- a/src/duckdb_py/include/duckdb_python/import_cache/modules/polars_module.hpp +++ b/src/duckdb_py/include/duckdb_python/import_cache/modules/polars_module.hpp @@ -26,13 +26,17 @@ struct PolarsCacheItem : public PythonImportCacheItem { static constexpr const char *Name = "polars"; public: - PolarsCacheItem() : PythonImportCacheItem("polars"), DataFrame("DataFrame", this), LazyFrame("LazyFrame", this) { + PolarsCacheItem() + : PythonImportCacheItem("polars"), DataFrame("DataFrame", this), LazyFrame("LazyFrame", this), col("col", this), + lit("lit", this) { } ~PolarsCacheItem() override { } PythonImportCacheItem DataFrame; PythonImportCacheItem LazyFrame; + PythonImportCacheItem col; + PythonImportCacheItem lit; protected: bool IsRequired() const override final { diff --git a/src/duckdb_py/python_replacement_scan.cpp b/src/duckdb_py/python_replacement_scan.cpp index 87979ece..8bff9e8f 100644 --- a/src/duckdb_py/python_replacement_scan.cpp +++ b/src/duckdb_py/python_replacement_scan.cpp @@ -152,9 +152,7 @@ unique_ptr PythonReplacementScan::TryReplacementObject(const py::objec CreateArrowScan(name, arrow_dataset, *table_function, children, client_properties, PyArrowObjectType::Table, *context.db); } else if (PolarsDataFrame::IsLazyFrame(entry)) { - auto materialized = entry.attr("collect")(); - auto arrow_dataset = materialized.attr("to_arrow")(); - CreateArrowScan(name, arrow_dataset, *table_function, children, client_properties, PyArrowObjectType::Table, + CreateArrowScan(name, entry, *table_function, children, client_properties, PyArrowObjectType::PolarsLazyFrame, *context.db); } else if ((arrow_type = DuckDBPyConnection::GetArrowType(entry)) != PyArrowObjectType::Invalid && !(arrow_type == PyArrowObjectType::MessageReader && !relation)) { diff --git a/tests/fast/arrow/test_polars_filter_pushdown.py b/tests/fast/arrow/test_polars_filter_pushdown.py new file mode 100644 index 00000000..320dffae --- /dev/null +++ b/tests/fast/arrow/test_polars_filter_pushdown.py @@ -0,0 +1,192 @@ +# ruff: noqa: F841 +import math + +import pytest + +import duckdb + +pl = pytest.importorskip("polars") +pytest.importorskip("pyarrow") + + +class TestPolarsLazyFrameFilterPushdown: + """Tests for filter pushdown on LazyFrames. + + All tests use pl.LazyFrame (the target of this change). DuckDB pushes filters and projections into the Polars lazy + plan before collection, so only surviving rows are ever materialized. + """ + + ##### CONSTANT_COMPARISON: all six comparison operators + + def test_comparison_equal(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + assert duckdb.sql("SELECT * FROM lf WHERE a = 3").fetchall() == [(3,)] + + def test_comparison_not_equal(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + assert duckdb.sql("SELECT * FROM lf WHERE a != 3").fetchall() == [(1,), (2,), (4,), (5,)] + + def test_comparison_less_than(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + assert duckdb.sql("SELECT * FROM lf WHERE a < 3").fetchall() == [(1,), (2,)] + + def test_comparison_less_than_or_equal(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + assert duckdb.sql("SELECT * FROM lf WHERE a <= 3").fetchall() == [(1,), (2,), (3,)] + + def test_comparison_greater_than(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + assert duckdb.sql("SELECT * FROM lf WHERE a > 3").fetchall() == [(4,), (5,)] + + def test_comparison_greater_than_or_equal(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + assert duckdb.sql("SELECT * FROM lf WHERE a >= 3").fetchall() == [(3,), (4,), (5,)] + + def test_string_comparison(self): + lf = pl.LazyFrame({"name": ["alice", "bob", "charlie"], "val": [1, 2, 3]}) + assert duckdb.sql("SELECT * FROM lf WHERE name = 'bob'").fetchall() == [("bob", 2)] + + ##### NaN comparisons (CONSTANT_COMPARISON with is_nan path) + + def test_nan_equal(self): + """NaN = NaN is true in DuckDB; pushes is_nan().""" + lf = pl.LazyFrame({"a": [1.0, float("nan"), 3.0]}) + result = duckdb.sql("SELECT * FROM lf WHERE a = 'NaN'::DOUBLE").fetchall() + assert len(result) == 1 + assert math.isnan(result[0][0]) + + def test_nan_greater_than_or_equal(self): + """NaN >= NaN is true; pushes is_nan().""" + lf = pl.LazyFrame({"a": [1.0, float("nan"), 3.0]}) + result = duckdb.sql("SELECT * FROM lf WHERE a >= 'NaN'::DOUBLE").fetchall() + assert len(result) == 1 + assert math.isnan(result[0][0]) + + def test_nan_less_than(self): + """X < NaN is true for non-NaN values; pushes is_nan().__invert__().""" + lf = pl.LazyFrame({"a": [1.0, float("nan"), 3.0]}) + result = duckdb.sql("SELECT * FROM lf WHERE a < 'NaN'::DOUBLE").fetchall() + assert sorted(result) == [(1.0,), (3.0,)] + + def test_nan_not_equal(self): + """X != NaN is true for non-NaN values; pushes is_nan().__invert__().""" + lf = pl.LazyFrame({"a": [1.0, float("nan"), 3.0]}) + result = duckdb.sql("SELECT * FROM lf WHERE a != 'NaN'::DOUBLE").fetchall() + assert sorted(result) == [(1.0,), (3.0,)] + + def test_nan_greater_than(self): + """X > NaN is always false; pushes lit(false).""" + lf = pl.LazyFrame({"a": [1.0, float("nan"), 3.0]}) + result = duckdb.sql("SELECT * FROM lf WHERE a > 'NaN'::DOUBLE").fetchall() + assert result == [] + + def test_nan_less_than_or_equal(self): + """X <= NaN is always true; pushes lit(true).""" + lf = pl.LazyFrame({"a": [1.0, float("nan"), 3.0]}) + result = duckdb.sql("SELECT * FROM lf WHERE a <= 'NaN'::DOUBLE").fetchall() + assert len(result) == 3 + + ##### IS_NULL / IS_NOT_NULL (triggered via DISTINCT FROM NULL inside OR) + + def test_is_null_filter(self): + """IS NOT DISTINCT FROM NULL inside an OR pushes IS_NULL as a child of CONJUNCTION_OR.""" + lf = pl.LazyFrame({"a": [1, None, 3, None, 5]}) + result = duckdb.sql("SELECT * FROM lf WHERE a = 1 OR a IS NOT DISTINCT FROM NULL").fetchall() + values = [row[0] for row in result] + assert values.count(None) == 2 + assert 1 in values + assert len(values) == 3 + + def test_is_not_null_filter(self): + """IS DISTINCT FROM NULL inside an OR pushes IS_NOT_NULL as a child of CONJUNCTION_OR.""" + lf = pl.LazyFrame({"a": [1, None, 3, None, 5]}) + result = duckdb.sql("SELECT * FROM lf WHERE a = 1 OR a IS DISTINCT FROM NULL").fetchall() + assert sorted(result) == [(1,), (3,), (5,)] + + # ── CONJUNCTION_AND ── + + def test_conjunction_and_range(self): + """BETWEEN on a single column pushes a CONJUNCTION_AND with GTE + LTE children.""" + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + result = duckdb.sql("SELECT * FROM lf WHERE a BETWEEN 2 AND 4").fetchall() + assert result == [(2,), (3,), (4,)] + + def test_conjunction_and_multi_column(self): + """Filters on two different columns combine via AND in TransformFilter.""" + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5], "b": ["x", "y", "x", "y", "x"]}) + result = duckdb.sql("SELECT * FROM lf WHERE a > 2 AND b = 'x'").fetchall() + assert result == [(3, "x"), (5, "x")] + + ##### CONJUNCTION_OR + + def test_conjunction_or(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + result = duckdb.sql("SELECT * FROM lf WHERE a = 1 OR a = 5").fetchall() + assert sorted(result) == [(1,), (5,)] + + ##### IN_FILTER + + def test_in_filter(self): + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + result = duckdb.sql("SELECT * FROM lf WHERE a IN (2, 4)").fetchall() + assert sorted(result) == [(2,), (4,)] + + ##### STRUCT_EXTRACT + + def test_struct_extract(self): + lf = pl.LazyFrame({"s": [{"x": 1, "y": "a"}, {"x": 2, "y": "b"}, {"x": 3, "y": "c"}]}) + result = duckdb.sql("SELECT * FROM lf WHERE s.x > 1").fetchall() + assert len(result) == 2 + assert all(row[0]["x"] > 1 for row in result) + + ##### OPTIONAL_FILTER + + def test_optional_filter(self): + """OR filters are wrapped in OPTIONAL_FILTER by DuckDB's optimizer.""" + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + result = duckdb.sql("SELECT * FROM lf WHERE a = 1 OR a = 3").fetchall() + assert sorted(result) == [(1,), (3,)] + + ##### Produce path, no filters + + def test_unfiltered_scan(self): + lf = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = duckdb.sql("SELECT * FROM lf").fetchall() + assert result == [(1, 4), (2, 5), (3, 6)] + + ##### Produce path, column projection + + def test_column_projection(self): + lf = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + result = duckdb.sql("SELECT a, c FROM lf").fetchall() + assert result == [(1, 7), (2, 8), (3, 9)] + + ##### Produce path, cached DataFrame reuse + + def test_cached_dataframe_reuse(self): + """Repeated unfiltered scans on a registered LazyFrame reuse the cached DataFrame.""" + con = duckdb.connect() + lf = pl.LazyFrame({"a": [1, 2, 3]}) + con.register("my_lf", lf) + r1 = con.sql("SELECT * FROM my_lf").fetchall() + r2 = con.sql("SELECT * FROM my_lf").fetchall() + assert r1 == r2 == [(1,), (2,), (3,)] + + ##### Produce path, filter + collect (no cache) + + def test_filtered_scan_not_cached(self): + """Filtered scans collect a new DataFrame each time (not cached).""" + con = duckdb.connect() + lf = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}) + con.register("my_lf", lf) + r1 = con.sql("SELECT * FROM my_lf WHERE a > 3").fetchall() + r2 = con.sql("SELECT * FROM my_lf WHERE a < 3").fetchall() + assert sorted(r1) == [(4,), (5,)] + assert sorted(r2) == [(1,), (2,)] + + ##### Empty result + + def test_empty_result(self): + lf = pl.LazyFrame({"a": [1, 2, 3]}) + result = duckdb.sql("SELECT * FROM lf WHERE a > 100").fetchall() + assert result == []