From f241957e8f1cf8cd535c9ccfb8f8348a769a1a41 Mon Sep 17 00:00:00 2001 From: Michael Coady Date: Sat, 14 Feb 2026 22:43:10 +0000 Subject: [PATCH 1/5] fix out of bounds exception when calling `aggregate` and `distinct` on a dataframe with no rows; split out `decodeSeparated` from `readSeparated` --- src/DataFrame/IO/CSV.hs | 8 +++-- src/DataFrame/Operations/Aggregation.hs | 43 +++++++++++++++---------- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/src/DataFrame/IO/CSV.hs b/src/DataFrame/IO/CSV.hs index 23cca806..c4bcbd50 100644 --- a/src/DataFrame/IO/CSV.hs +++ b/src/DataFrame/IO/CSV.hs @@ -237,10 +237,14 @@ ghci> D.readSeparated (D.defaultReadOptions { columnSeparator = ';' }) ".\/data\ @ -} readSeparated :: ReadOptions -> FilePath -> IO DataFrame -readSeparated !opts !path = do - let sep = columnSeparator opts +readSeparated opts !path = do let stripUtf8Bom bs = fromMaybe bs (BL.stripPrefix "\xEF\xBB\xBF" bs) csvData <- stripUtf8Bom <$> BL.readFile path + decodeSeparated opts csvData + +decodeSeparated :: ReadOptions -> BL.ByteString -> IO DataFrame +decodeSeparated !opts csvData = do + let sep = columnSeparator opts let decodeOpts = Csv.defaultDecodeOptions{Csv.decDelimiter = fromIntegral (ord sep)} let stream = CsvStream.decodeWith decodeOpts Csv.NoHeader csvData diff --git a/src/DataFrame/Operations/Aggregation.hs b/src/DataFrame/Operations/Aggregation.hs index 68e32161..37a68500 100644 --- a/src/DataFrame/Operations/Aggregation.hs +++ b/src/DataFrame/Operations/Aggregation.hs @@ -254,23 +254,30 @@ computeRowHashes indices df = runST $ do All ungrouped columns will be dropped. -} aggregate :: [NamedExpr] -> GroupedDataFrame -> DataFrame -aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) = - let - df' = - selectIndices - (VU.map (valueIndices VU.!) (VU.init offsets)) - (select groupingColumns df) +aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) + | VU.null valueIndices = + let + df' = exclude (M.keys (columnIndices df) L.\\ groupingColumns) df + f (name, UExpr (expr :: Expr a)) = insert name ([] :: [a]) + in + fold f aggs df' + | otherwise = + let + df' = + selectIndices + (VU.map (valueIndices VU.!) (VU.init offsets)) + (select groupingColumns df) - f (name, UExpr (expr :: Expr a)) d = - let - value = case interpretAggregation @a gdf expr of - Left e -> throw e - Right (UnAggregated _) -> throw $ UnaggregatedException (T.pack $ show expr) - Right (Aggregated (TColumn col)) -> col - in - insertColumn name value d - in - fold f aggs df' + f (name, UExpr (expr :: Expr a)) d = + let + value = case interpretAggregation @a gdf expr of + Left e -> throw e + Right (UnAggregated _) -> throw $ UnaggregatedException (T.pack $ show expr) + Right (Aggregated (TColumn col)) -> col + in + insertColumn name value d + in + fold f aggs df' selectIndices :: VU.Vector Int -> DataFrame -> DataFrame selectIndices xs df = @@ -281,6 +288,8 @@ selectIndices xs df = -- | Filter out all non-unique values in a dataframe. distinct :: DataFrame -> DataFrame -distinct df = selectIndices (VU.map (indices VU.!) (VU.init os)) df +distinct df + | nRows df == 0 = df + | otherwise = selectIndices (VU.map (indices VU.!) (VU.init os)) df where (Grouped _ _ indices os) = groupBy (columnNames df) df From f3e18c7741d2cfec0324054aa2333f54aa3ba865 Mon Sep 17 00:00:00 2001 From: Michael Coady Date: Sat, 14 Feb 2026 23:46:19 +0000 Subject: [PATCH 2/5] doing what ghc-9.12.2 tells me to do... --- src/DataFrame/Operations/Aggregation.hs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/DataFrame/Operations/Aggregation.hs b/src/DataFrame/Operations/Aggregation.hs index 37a68500..7fb214c4 100644 --- a/src/DataFrame/Operations/Aggregation.hs +++ b/src/DataFrame/Operations/Aggregation.hs @@ -255,10 +255,12 @@ All ungrouped columns will be dropped. -} aggregate :: [NamedExpr] -> GroupedDataFrame -> DataFrame aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) - | VU.null valueIndices = + | VU.null valueIndices = let df' = exclude (M.keys (columnIndices df) L.\\ groupingColumns) df - f (name, UExpr (expr :: Expr a)) = insert name ([] :: [a]) + + f :: NamedExpr -> DataFrame -> DataFrame + f (name, UExpr (_ :: Expr a)) = insert name ([] :: [a]) in fold f aggs df' | otherwise = @@ -289,7 +291,7 @@ selectIndices xs df = -- | Filter out all non-unique values in a dataframe. distinct :: DataFrame -> DataFrame distinct df - | nRows df == 0 = df + | nRows df == 0 = df | otherwise = selectIndices (VU.map (indices VU.!) (VU.init os)) df where (Grouped _ _ indices os) = groupBy (columnNames df) df From b6af77444689e1c125315d95b54bf38f4968c13f Mon Sep 17 00:00:00 2001 From: Michael Coady Date: Sun, 15 Feb 2026 00:04:14 +0000 Subject: [PATCH 3/5] formatting --- src/DataFrame/Operations/Aggregation.hs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/DataFrame/Operations/Aggregation.hs b/src/DataFrame/Operations/Aggregation.hs index 7fb214c4..5259fad5 100644 --- a/src/DataFrame/Operations/Aggregation.hs +++ b/src/DataFrame/Operations/Aggregation.hs @@ -258,10 +258,10 @@ aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) | VU.null valueIndices = let df' = exclude (M.keys (columnIndices df) L.\\ groupingColumns) df - + f :: NamedExpr -> DataFrame -> DataFrame f (name, UExpr (_ :: Expr a)) = insert name ([] :: [a]) - in + in fold f aggs df' | otherwise = let @@ -276,9 +276,9 @@ aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) Left e -> throw e Right (UnAggregated _) -> throw $ UnaggregatedException (T.pack $ show expr) Right (Aggregated (TColumn col)) -> col - in + in insertColumn name value d - in + in fold f aggs df' selectIndices :: VU.Vector Int -> DataFrame -> DataFrame From 634313c27718625b4f69a5f6a753d0495485d9e2 Mon Sep 17 00:00:00 2001 From: Michael Coady Date: Mon, 16 Feb 2026 10:13:24 +0000 Subject: [PATCH 4/5] simplify handling of `aggregate` on no rows; add test case `aggregationOnNoRows`; --- src/DataFrame/Operations/Aggregation.hs | 51 +++++++++++-------------- tests/Operations/Aggregations.hs | 21 ++++++++++ 2 files changed, 44 insertions(+), 28 deletions(-) diff --git a/src/DataFrame/Operations/Aggregation.hs b/src/DataFrame/Operations/Aggregation.hs index 5259fad5..c6381b5a 100644 --- a/src/DataFrame/Operations/Aggregation.hs +++ b/src/DataFrame/Operations/Aggregation.hs @@ -53,6 +53,12 @@ groupBy names df (T.pack $ show $ names L.\\ columnNames df) "groupBy" (columnNames df) + | nRows df == 0 = + Grouped + df + names + VU.empty + (VU.fromList [0]) | otherwise = Grouped df @@ -254,32 +260,23 @@ computeRowHashes indices df = runST $ do All ungrouped columns will be dropped. -} aggregate :: [NamedExpr] -> GroupedDataFrame -> DataFrame -aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) - | VU.null valueIndices = - let - df' = exclude (M.keys (columnIndices df) L.\\ groupingColumns) df - - f :: NamedExpr -> DataFrame -> DataFrame - f (name, UExpr (_ :: Expr a)) = insert name ([] :: [a]) - in - fold f aggs df' - | otherwise = - let - df' = - selectIndices - (VU.map (valueIndices VU.!) (VU.init offsets)) - (select groupingColumns df) +aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) = + let + df' = + selectIndices + (VU.map (valueIndices VU.!) (VU.init offsets)) + (select groupingColumns df) - f (name, UExpr (expr :: Expr a)) d = - let - value = case interpretAggregation @a gdf expr of - Left e -> throw e - Right (UnAggregated _) -> throw $ UnaggregatedException (T.pack $ show expr) - Right (Aggregated (TColumn col)) -> col - in - insertColumn name value d - in - fold f aggs df' + f (name, UExpr (expr :: Expr a)) d = + let + value = case interpretAggregation @a gdf expr of + Left e -> throw e + Right (UnAggregated _) -> throw $ UnaggregatedException (T.pack $ show expr) + Right (Aggregated (TColumn col)) -> col + in + insertColumn name value d + in + fold f aggs df' selectIndices :: VU.Vector Int -> DataFrame -> DataFrame selectIndices xs df = @@ -290,8 +287,6 @@ selectIndices xs df = -- | Filter out all non-unique values in a dataframe. distinct :: DataFrame -> DataFrame -distinct df - | nRows df == 0 = df - | otherwise = selectIndices (VU.map (indices VU.!) (VU.init os)) df +distinct df = selectIndices (VU.map (indices VU.!) (VU.init os)) df where (Grouped _ _ indices os) = groupBy (columnNames df) df diff --git a/tests/Operations/Aggregations.hs b/tests/Operations/Aggregations.hs index 6e9a2df0..d43b0ae8 100644 --- a/tests/Operations/Aggregations.hs +++ b/tests/Operations/Aggregations.hs @@ -133,6 +133,24 @@ reduceAggregationOfUnaggregatedBinaryOp = ) ) +aggregationOnNoRows :: Test +aggregationOnNoRows = + TestCase + ( assertEqual + "Aggregation on DataFrame with no rows" + ( D.fromNamedColumns + [ ("test1", DI.fromList ([] :: [Int])) + , ("sum(test2)", DI.fromList ([] :: [Int])) + ] + ) + ( testData + & D.drop 12 + & D.groupBy ["test1"] + & D.aggregate + [F.sum (F.col @Int "test2") `as` "sum(test2)"] + ) + ) + tests :: [Test] tests = [ TestLabel "foldAggregation" foldAggregation @@ -149,4 +167,7 @@ tests = , TestLabel "reduceAggregationOfUnaggregatedBinaryOp" reduceAggregationOfUnaggregatedBinaryOp + , TestLabel + "aggregationOnNoRows" + aggregationOnNoRows ] From 59cc9f9cc831456ba0c52d3140c690852384d028 Mon Sep 17 00:00:00 2001 From: Michael Coady Date: Mon, 16 Feb 2026 10:20:11 +0000 Subject: [PATCH 5/5] formatting --- src/DataFrame/Operations/Aggregation.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DataFrame/Operations/Aggregation.hs b/src/DataFrame/Operations/Aggregation.hs index c6381b5a..6d9fdb2d 100644 --- a/src/DataFrame/Operations/Aggregation.hs +++ b/src/DataFrame/Operations/Aggregation.hs @@ -273,9 +273,9 @@ aggregate aggs gdf@(Grouped df groupingColumns valueIndices offsets) = Left e -> throw e Right (UnAggregated _) -> throw $ UnaggregatedException (T.pack $ show expr) Right (Aggregated (TColumn col)) -> col - in + in insertColumn name value d - in + in fold f aggs df' selectIndices :: VU.Vector Int -> DataFrame -> DataFrame