DataHaskell · Anamika1608 · Mar 20, 2026 · Mar 21, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/src/DataFrame/Errors.hs b/src/DataFrame/Errors.hs
@@ -11,6 +11,7 @@ import qualified Data.Vector.Unboxed as VU
 
 import Control.Exception
 import Data.Array
+import qualified Data.List as L
 import Data.Typeable (Typeable)
 import DataFrame.Display.Terminal.Colours
 import Type.Reflection (TypeRep)
@@ -30,6 +31,7 @@ data DataFrameException where
         DataFrameException
     AggregatedAndNonAggregatedException :: T.Text -> T.Text -> DataFrameException
     ColumnNotFoundException :: T.Text -> T.Text -> [T.Text] -> DataFrameException
+    ColumnsNotFoundException :: [T.Text] -> T.Text -> [T.Text] -> DataFrameException
     EmptyDataSetException :: T.Text -> DataFrameException
     InternalException :: T.Text -> DataFrameException
     NonColumnReferenceException :: T.Text -> DataFrameException
@@ -52,6 +54,7 @@ instance Show DataFrameException where
                 (callingFunctionName context)
                 errorString
     show (ColumnNotFoundException columnName callPoint availableColumns) = columnNotFound columnName callPoint availableColumns
+    show (ColumnsNotFoundException columnNames callPoint availableColumns) = columnsNotFound columnNames callPoint availableColumns
     show (EmptyDataSetException callPoint) = emptyDataSetError callPoint
     show (WrongQuantileNumberException q) = wrongQuantileNumberError q
     show (WrongQuantileIndexException qs q) = wrongQuantileIndexError qs q
@@ -65,15 +68,46 @@ instance Show DataFrameException where
             ++ T.unpack expr2
 
 columnNotFound :: T.Text -> T.Text -> [T.Text] -> String
-columnNotFound name callPoint columns =
+columnNotFound name = columnsNotFound [name] -- single-name case of columnsNotFound
+
+columnsNotFound :: [T.Text] -> T.Text -> [T.Text] -> String -- error text for missing column(s)
+columnsNotFound missingColumns callPoint availableColumns =
     red "\n\n[ERROR] "
-        ++ "Column not found: "
-        ++ T.unpack name
+        ++ missingColumnsLabel missingColumns
+        ++ ": "
+        ++ T.unpack (T.intercalate ", " missingColumns)
         ++ " for operation "
         ++ T.unpack callPoint
-        ++ "\n\tDid you mean "
-        ++ T.unpack (guessColumnName name columns)
-        ++ "?\n\n"
+        ++ formatSuggestions missingColumns availableColumns
+        ++ "\n\n"
+  where
+    missingColumnsLabel [_] = "Column not found"
+    missingColumnsLabel _ = "Columns not found"
+    -- A lone name gets a bare guess; several get a quoted list, shown only if each has a guess.
+    formatSuggestions [missingColumn] columns =
+        case guessColumnName missingColumn columns of
+            "" -> ""
+            guessed ->
+                "\n\tDid you mean "
+                    ++ T.unpack guessed
+                    ++ "?"
+    formatSuggestions names columns =
+        case traverse (`suggestColumnName` columns) names of
+            Just guessedColumns
+                | not (null guessedColumns) ->
+                    "\n\tDid you mean "
+                        ++ formatColumnSuggestions guessedColumns
+                        ++ "?"
+            _ -> ""
+    -- An empty result from guessColumnName is treated as "no suggestion".
+    suggestColumnName missingColumn columns = case guessColumnName missingColumn columns of
+        "" -> Nothing
+        guessed -> Just guessed
+    -- Renders the guesses as a bracketed, comma-separated list of quoted names.
+    formatColumnSuggestions guessedColumns =
+        "["
+            ++ L.intercalate ", " (map (show . T.unpack) guessedColumns)
+            ++ "]"
 
 typeMismatchError :: String -> String -> String
 typeMismatchError givenType expectedType =

diff --git a/src/DataFrame/Operations/Join.hs b/src/DataFrame/Operations/Join.hs
@@ -10,6 +10,7 @@
 module DataFrame.Operations.Join where
 
 import Control.Applicative ((<|>))
+import Control.Exception (throw)
 import Control.Monad (forM_, when)
 import Control.Monad.ST (ST, runST)
 import qualified Data.HashMap.Strict as HM
@@ -24,6 +25,9 @@ import qualified Data.Vector as VB
 import qualified Data.Vector.Algorithms.Merge as VA
 import qualified Data.Vector.Unboxed as VU
 import qualified Data.Vector.Unboxed.Mutable as VUM
+import DataFrame.Errors (
+    DataFrameException (ColumnNotFoundException, ColumnsNotFoundException),
+ )
 import DataFrame.Internal.Column as D
 import DataFrame.Internal.DataFrame as D
 import DataFrame.Operations.Aggregation as D
@@ -145,6 +149,16 @@ fillCrossProduct !leftSI !rightSI !lStart !lEnd !rStart !rEnd !lv !rv !pos = goL
 keyColIndices :: S.Set T.Text -> DataFrame -> [Int]
 keyColIndices csSet df = M.elems $ M.restrictKeys (D.columnIndices df) csSet
 
+-- | Indices of the requested join keys in @df@; throws if any key is not a column.
+validatedKeyColIndices :: T.Text -> S.Set T.Text -> DataFrame -> [Int]
+validatedKeyColIndices callPoint csSet df = case S.toAscList absent of
+    [] -> M.elems (M.restrictKeys indexByName csSet)
+    [key] -> throw (ColumnNotFoundException key callPoint (M.keys indexByName))
+    keys -> throw (ColumnsNotFoundException keys callPoint (M.keys indexByName))
+  where
+    indexByName = D.columnIndices df
+    absent = csSet `S.difference` M.keysSet indexByName
+
 -- ============================================================
 -- Inner Join
 -- ============================================================
@@ -172,36 +186,39 @@ ghci> D.innerJoin ["key"] df other
 innerJoin :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
 innerJoin cs left right
     | D.null right || D.null left = D.empty
-    | otherwise =
-        let
-            csSet = S.fromList cs
-            leftRows = fst (D.dimensions left)
-            rightRows = fst (D.dimensions right)
-
-            leftKeyIdxs = keyColIndices csSet left
-            rightKeyIdxs = keyColIndices csSet right
-            leftHashes = D.computeRowHashes leftKeyIdxs left
-            rightHashes = D.computeRowHashes rightKeyIdxs right
-
-            buildRows = min leftRows rightRows
-            (leftIxs, rightIxs)
-                | buildRows > joinStrategyThreshold =
-                    sortMergeInnerKernel leftHashes rightHashes
-                | rightRows <= leftRows =
-                    -- Build on right (smaller or equal), probe with left
-                    hashInnerKernel leftHashes rightHashes
-                | otherwise =
-                    -- Build on left (smaller), probe with right, swap result
-                    let (!rIxs, !lIxs) = hashInnerKernel rightHashes leftHashes
-                     in (lIxs, rIxs)
-         in
-            assembleInner csSet left right leftIxs rightIxs
+    | otherwise = innerJoinNonEmpty cs left right -- neither frame is D.null here
+
+-- | Inner join of two non-empty frames. Key columns are resolved through
+-- 'validatedKeyColIndices', so a key absent from either frame throws.
+innerJoinNonEmpty :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
+innerJoinNonEmpty cs left right =
+    assembleInner keySet left right leftIxs rightIxs
+  where
+    keySet = S.fromList cs
+    nLeft = fst (D.dimensions left)
+    nRight = fst (D.dimensions right)
+
+    -- Row hashes over the key columns of one side.
+    hashKeys df = D.computeRowHashes (validatedKeyColIndices "innerJoin" keySet df) df
+    leftHashes = hashKeys left
+    rightHashes = hashKeys right
+
+    (leftIxs, rightIxs)
+        | min nLeft nRight > joinStrategyThreshold =
+            sortMergeInnerKernel leftHashes rightHashes
+        | nRight <= nLeft =
+            -- Right side is no larger: build on it and probe with the left.
+            hashInnerKernel leftHashes rightHashes
+        | otherwise =
+            -- Left side is smaller: build on it, probe with the right, swap back.
+            let (!rIxs, !lIxs) = hashInnerKernel rightHashes leftHashes
+             in (lIxs, rIxs)
 
 -- | Compute hashes for the given key column names in a DataFrame.
 buildHashColumn :: [T.Text] -> DataFrame -> VU.Vector Int
 buildHashColumn keys df =
     let csSet = S.fromList keys
-        keyIdxs = keyColIndices csSet df
+        keyIdxs = validatedKeyColIndices "buildHashColumn" csSet df -- throws on a missing key
      in D.computeRowHashes keyIdxs df
 
 {- | Probe one batch of rows against a pre-built 'CompactIndex'.
@@ -527,28 +544,35 @@ ghci> D.leftJoin ["key"] df other
 @
 -}
 leftJoin :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
-leftJoin cs left right
+leftJoin = leftJoinWithCallPoint "leftJoin" -- name reported if a join key is missing
+
+leftJoinWithCallPoint ::
+    T.Text -> [T.Text] -> DataFrame -> DataFrame -> DataFrame
+leftJoinWithCallPoint callPoint cs left right -- callPoint: operation name passed on for errors
     | D.null right || D.nRows right == 0 = left
     | D.null left || D.nRows left == 0 = D.empty
-    | otherwise =
-        let
-            csSet = S.fromList cs
-            rightRows = fst (D.dimensions right)
-
-            leftKeyIdxs = keyColIndices csSet left
-            rightKeyIdxs = keyColIndices csSet right
-            leftHashes = D.computeRowHashes leftKeyIdxs left
-            rightHashes = D.computeRowHashes rightKeyIdxs right
-
-            -- Right is always the build side for left join
-            (leftIxs, rightIxs)
-                | rightRows > joinStrategyThreshold =
-                    sortMergeLeftKernel leftHashes rightHashes
-                | otherwise =
-                    hashLeftKernel leftHashes rightHashes
-         in
-            -- rightIxs uses -1 as sentinel for "no match"
-            assembleLeft csSet left right leftIxs rightIxs
+    | otherwise = leftJoinNonEmpty callPoint cs left right -- both sides have rows here
+
+-- | Left join of two non-empty frames. @callPoint@ is the operation name handed
+-- to 'validatedKeyColIndices', which throws when a join key is missing.
+leftJoinNonEmpty :: T.Text -> [T.Text] -> DataFrame -> DataFrame -> DataFrame
+leftJoinNonEmpty callPoint cs left right =
+    -- rightIxs uses -1 as the sentinel for "no match"
+    assembleLeft keySet left right leftIxs rightIxs
+  where
+    keySet = S.fromList cs
+    nRight = fst (D.dimensions right)
+
+    hashKeys df = D.computeRowHashes (validatedKeyColIndices callPoint keySet df) df
+    leftHashes = hashKeys left
+    rightHashes = hashKeys right
+
+    -- The right frame is always the build side of a left join.
+    (leftIxs, rightIxs)
+        | nRight > joinStrategyThreshold =
+            sortMergeLeftKernel leftHashes rightHashes
+        | otherwise =
+            hashLeftKernel leftHashes rightHashes
 
 {- | Hash-based left join kernel.
 Returns @(leftExpandedIndices, rightExpandedIndices)@ where
@@ -798,33 +822,36 @@ ghci> D.rightJoin ["key"] df other
 -}
 rightJoin ::
     [T.Text] -> DataFrame -> DataFrame -> DataFrame
-rightJoin cs left right = leftJoin cs right left
+rightJoin cs left right = leftJoinWithCallPoint "rightJoin" cs right left -- sides swapped
 
 fullOuterJoin ::
     [T.Text] -> DataFrame -> DataFrame -> DataFrame
 fullOuterJoin cs left right
     | D.null right || D.nRows right == 0 = left
     | D.null left || D.nRows left == 0 = right
-    | otherwise =
-        let
-            csSet = S.fromList cs
-            leftRows = fst (D.dimensions left)
-            rightRows = fst (D.dimensions right)
-
-            leftKeyIdxs = keyColIndices csSet left
-            rightKeyIdxs = keyColIndices csSet right
-            leftHashes = D.computeRowHashes leftKeyIdxs left
-            rightHashes = D.computeRowHashes rightKeyIdxs right
-
-            -- Both sides can have nulls in full outer
-            (leftIxs, rightIxs)
-                | max leftRows rightRows > joinStrategyThreshold =
-                    sortMergeFullOuterKernel leftHashes rightHashes
-                | otherwise =
-                    hashFullOuterKernel leftHashes rightHashes
-         in
-            -- Both index vectors use -1 as sentinel
-            assembleFullOuter csSet left right leftIxs rightIxs
+    | otherwise = fullOuterJoinNonEmpty cs left right -- both sides have rows here
+
+-- | Full outer join of two non-empty frames. Join keys are resolved with
+-- 'validatedKeyColIndices', so a key missing from either frame throws.
+fullOuterJoinNonEmpty :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
+fullOuterJoinNonEmpty cs left right =
+    -- Both index vectors use -1 as the sentinel
+    assembleFullOuter keySet left right leftIxs rightIxs
+  where
+    keySet = S.fromList cs
+    nLeft = fst (D.dimensions left)
+    nRight = fst (D.dimensions right)
+
+    hashKeys df = D.computeRowHashes (validatedKeyColIndices "fullOuterJoin" keySet df) df
+    leftHashes = hashKeys left
+    rightHashes = hashKeys right
+
+    -- Either side can contribute unmatched rows in a full outer join.
+    (leftIxs, rightIxs)
+        | max nLeft nRight > joinStrategyThreshold =
+            sortMergeFullOuterKernel leftHashes rightHashes
+        | otherwise =
+            hashFullOuterKernel leftHashes rightHashes
 
 {- | Hash-based full outer join kernel.
 Builds compact indices on both sides.

diff --git a/src/DataFrame/Operations/Subset.hs b/src/DataFrame/Operations/Subset.hs
@@ -1,4 +1,6 @@
 {-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE ConstraintKinds #-}
 {-# LANGUAGE ExplicitNamespaces #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE GADTs #-}
@@ -48,6 +50,18 @@ import System.Random
 import Type.Reflection
 import Prelude hiding (filter, take)
 
+#if MIN_VERSION_random(1,3,0)
+type SplittableGen g = (SplitGen g, RandomGen g)
+-- With random >= 1.3.0 the generator is split through 'splitGen'.
+splitForStratified :: SplittableGen g => g -> (g, g)
+splitForStratified = splitGen
+#else
+type SplittableGen g = RandomGen g
+-- With older random versions the generator is split through 'split'.
+splitForStratified :: SplittableGen g => g -> (g, g)
+splitForStratified = split
+#endif
+
 -- | O(k * n) Take the first n rows of a DataFrame.
 take :: Int -> DataFrame -> DataFrame
 take n d = d{columns = V.map (takeColumn n') (columns d), dataframeDimensions = (n', c)}
@@ -513,7 +527,7 @@ ghci> D.stratifiedSample (mkStdGen 42) 0.8 "label" df
 -}
 stratifiedSample ::
     forall a g.
-    (SplitGen g, RandomGen g, Columnable a) =>
+    (SplittableGen g, Columnable a) =>
     g -> Double -> Expr a -> DataFrame -> DataFrame
 stratifiedSample gen p strataCol df =
     let col = case strataCol of
@@ -523,7 +537,7 @@ stratifiedSample gen p strataCol df =
         go _ [] = mempty
         go g (ixs : rest) =
             let stratum = rowsAtIndices ixs df
-                (g1, g2) = splitGen g
+                (g1, g2) = splitForStratified g
              in sample g1 p stratum <> go g2 rest
      in go gen groups
 
@@ -537,7 +551,7 @@ ghci> D.stratifiedSplit (mkStdGen 42) 0.8 "label" df
 -}
 stratifiedSplit ::
     forall a g.
-    (SplitGen g, RandomGen g, Columnable a) =>
+    (SplittableGen g, Columnable a) =>
     g -> Double -> Expr a -> DataFrame -> (DataFrame, DataFrame)
 stratifiedSplit gen p strataCol df =
     let col = case strataCol of
@@ -547,7 +561,7 @@ stratifiedSplit gen p strataCol df =
         go _ [] = (mempty, mempty)
         go g (ixs : rest) =
             let stratum = rowsAtIndices ixs df
-                (g1, g2) = splitGen g
+                (g1, g2) = splitForStratified g
                 (tr, va) = randomSplit g1 p stratum
                 (trAcc, vaAcc) = go g2 rest
              in (tr <> trAcc, va <> vaAcc)