DataHaskell
diff --git a/app/Synthesis.hs b/app/Synthesis.hs
Lines changed: 3 additions & 3 deletions
diff --git a/dataframe.cabal b/dataframe.cabal
Lines changed: 1 addition & 0 deletions
diff --git a/src/DataFrame.hs b/src/DataFrame.hs
Lines changed: 2 additions & 0 deletions
diff --git a/src/DataFrame/DecisionTree.hs b/src/DataFrame/DecisionTree.hs
Lines changed: 85 additions & 47 deletions
diff --git a/src/DataFrame/Operations/Subset.hs b/src/DataFrame/Operations/Subset.hs
Lines changed: 81 additions & 0 deletions
@@ -53,13 +53,13 @@ main = do
             fitDecisionTree
                 ( defaultTreeConfig
                     { maxTreeDepth = 5
-                    , minSamplesSplit = 10
+                    , minSamplesSplit = 5
                     , minLeafSize = 3
                     , taoIterations = 100
                     , synthConfig =
                         defaultSynthConfig
-                            { complexityPenalty = 0.00
-                            , maxExprDepth = 2
+                            { complexityPenalty = 0.1
+                            , maxExprDepth = 3
                             , disallowedCombinations =
                                 [ (F.name age, F.name fare)
                                 , ("passenger_class", "number_of_siblings_and_spouses")
 
@@ -249,6 +249,7 @@ test-suite tests
     type: exitcode-stdio-1.0
     main-is: Main.hs
     other-modules: Assertions,
+                   DecisionTree,
                    Functions,
                    GenDataFrame,
                    Internal.Parsing,
 
@@ -362,6 +362,8 @@ import DataFrame.Operations.Subset as Subset (
     sample,
     select,
     selectBy,
+    stratifiedSample,
+    stratifiedSplit,
     take,
     takeLast,
  )
 
@@ -548,38 +548,108 @@ findBestGreedySplit cfg target conds df =
                             (boolExpansion (synthConfig cfg))
                         )
 
+-- | Unifies non-nullable and nullable Double expressions for feature generation.
+--
+-- Both fields are strict, so constructing a 'NumExpr' forces the wrapped
+-- expression to weak head normal form.
+data NumExpr
+    -- Expression whose values are plain 'Double's.
+    = NDouble !(Expr Double)
+    -- Expression whose values are @Maybe Double@.
+    | NMaybeDouble !(Expr (Maybe Double))
+
+-- | Column names referenced by the wrapped expression, as reported by
+-- 'getColumns', regardless of whether the expression is nullable.
+numExprCols :: NumExpr -> [T.Text]
+numExprCols wrapped = case wrapped of
+    NDouble plain -> getColumns plain
+    NMaybeDouble nullable -> getColumns nullable
+
+-- | Equality on 'NumExpr': two values are equal only when they use the same
+-- constructor and the wrapped expressions compare equal with '=='. A plain
+-- expression is never equal to a nullable one.
+numExprEq :: NumExpr -> NumExpr -> Bool
+numExprEq lhs rhs = case (lhs, rhs) of
+    (NDouble a, NDouble b) -> a == b
+    (NMaybeDouble a, NMaybeDouble b) -> a == b
+    _ -> False
+
+-- | Combine two numeric expressions with the four arithmetic operators.
+--
+-- The result always holds, in order: sum, difference, product, and a guarded
+-- quotient. When both inputs are plain the results are 'NDouble'; as soon as
+-- either side is nullable every result is 'NMaybeDouble'.
+--
+-- The quotient is wrapped in 'F.ifThenElse' on @divisor ./= 0@:
+--
+-- * plain / plain falls back to the literal @0@;
+-- * any combination involving a nullable side falls back to the literal
+--   @Nothing@. When the divisor itself is nullable, the guard is passed
+--   through @F.fromMaybe False@ before being used as the condition.
+combineNumExprs :: NumExpr -> NumExpr -> [NumExpr]
+combineNumExprs lhs rhs = case (lhs, rhs) of
+    (NDouble x, NDouble y) ->
+        map
+            NDouble
+            [ x .+ y
+            , x .- y
+            , x .* y
+            , F.ifThenElse (y ./= zero) (x ./ y) zero
+            ]
+    (NDouble x, NMaybeDouble y) ->
+        map
+            NMaybeDouble
+            [ x .+ y
+            , x .- y
+            , x .* y
+            , F.ifThenElse (F.fromMaybe False (y ./= zero)) (x ./ y) missing
+            ]
+    (NMaybeDouble x, NDouble y) ->
+        map
+            NMaybeDouble
+            [ x .+ y
+            , x .- y
+            , x .* y
+            , F.ifThenElse (y ./= zero) (x ./ y) missing
+            ]
+    (NMaybeDouble x, NMaybeDouble y) ->
+        map
+            NMaybeDouble
+            [ x .+ y
+            , x .- y
+            , x .* y
+            , F.ifThenElse (F.fromMaybe False (y ./= zero)) (x ./ y) missing
+            ]
+  where
+    -- Literal used both as the zero-divisor sentinel and the plain fallback.
+    zero = F.lit (0 :: Double)
+    -- Fallback quotient whenever a nullable operand is involved.
+    missing = F.lit (Nothing :: Maybe Double)
+
+-- | Synonym for 'generateNumericConds'; it forwards both arguments unchanged.
 numericConditions :: TreeConfig -> DataFrame -> [Expr Bool]
 numericConditions = generateNumericConds
 
+-- | Enumerate threshold comparisons over every synthesized numeric expression.
+--
+-- For each expression produced by 'numericExprsWithTerms' and each entry of
+-- @percentiles cfg@, a threshold is computed with 'percentile' and four
+-- comparisons against it (@<=@, @>=@, @<@, @>@) are emitted, in that order.
 generateNumericConds :: TreeConfig -> DataFrame -> [Expr Bool]
 generateNumericConds cfg df = do
     expr <- numericExprsWithTerms (synthConfig cfg) df
-    let thresholds = map (\p -> percentile p expr df) (percentiles cfg)
+    let thresholds = numericThresholds expr
     threshold <- thresholds
-    [ expr .<= F.lit threshold
-        , expr .>= F.lit threshold
-        , expr .< F.lit threshold
-        , expr .> F.lit threshold
+    numericCondsFromExpr expr threshold
+  where
+    -- One threshold per configured percentile of the expression over df.
+    numericThresholds (NDouble e) = map (\p -> percentile p e df) (percentiles cfg)
+    -- NOTE(review): the nullable expression goes through @F.fromMaybe 0@ before
+    -- 'percentile'. Presumably missing rows then count as 0 and can pull the
+    -- thresholds toward zero; confirm this is intended rather than computing
+    -- percentiles over present values only.
+    numericThresholds (NMaybeDouble e) = map (\p -> percentile p (F.fromMaybe 0 e) df) (percentiles cfg)
+
+    -- Plain expressions compare directly against the threshold literal.
+    numericCondsFromExpr (NDouble e) t =
+        [e .<= F.lit t, e .>= F.lit t, e .< F.lit t, e .> F.lit t]
+    -- Nullable expressions wrap each comparison in @F.fromMaybe False@,
+    -- presumably so that rows with a missing value fail every condition.
+    numericCondsFromExpr (NMaybeDouble e) t =
+        [ F.fromMaybe False (e .<= F.lit t)
+        , F.fromMaybe False (e .>= F.lit t)
+        , F.fromMaybe False (e .< F.lit t)
+        , F.fromMaybe False (e .> F.lit t)
         ]
 
+-- | All candidate numeric expressions for depths @0 .. maxExprDepth cfg@.
+--
+-- Calls 'numericExprs' once per maximum depth, each time starting from depth 0
+-- with no previous expressions, and concatenates the results.
+--
+-- NOTE(review): every call restarts at depth 0, and the depth-0 case of
+-- 'numericExprs' always emits the base expressions, so the concatenated list
+-- appears to contain the same expressions several times. Confirm whether
+-- de-duplication happens downstream.
-numericExprsWithTerms :: SynthConfig -> DataFrame -> [Expr Double]
+numericExprsWithTerms :: SynthConfig -> DataFrame -> [NumExpr]
 numericExprsWithTerms cfg df =
     concatMap (numericExprs cfg df [] 0) [0 .. maxExprDepth cfg]
 
+-- | Base numeric expressions: one per dataframe column that can be read as a
+-- 'Double'.
+--
+-- Unboxed columns of 'Double' or of an integral type become 'NDouble'
+-- expressions; optional columns of the same element types become
+-- 'NMaybeDouble' expressions. Every other column contributes nothing.
-numericCols :: DataFrame -> [Expr Double]
+numericCols :: DataFrame -> [NumExpr]
 numericCols df = concatMap extract (columnNames df)
   where
     extract col = case unsafeGetColumn col df of
         UnboxedColumn (_ :: VU.Vector b) ->
             case testEquality (typeRep @b) (typeRep @Double) of
-                Just Refl -> [Col col]
+                Just Refl -> [NDouble (Col col)]
                 Nothing -> case sIntegral @b of
-                    STrue -> [F.toDouble (Col @b col)]
+                    -- Integral columns are converted with 'F.toDouble'.
+                    STrue -> [NDouble (F.toDouble (Col @b col))]
+                    SFalse -> []
+        OptionalColumn (_ :: V.Vector (Maybe b)) ->
+            case testEquality (typeRep @b) (typeRep @Double) of
+                Just Refl -> [NMaybeDouble (Col @(Maybe b) col)]
+                Nothing -> case sIntegral @b of
+                    -- Integral optionals are converted via 'realToFrac' under
+                    -- 'F.whenPresent' (presumably applied only to present values).
+                    STrue -> [NMaybeDouble (F.whenPresent (realToFrac @b @Double) (Col @(Maybe b) col))]
                     SFalse -> []
         _ -> []
 
 numericExprs ::
-    SynthConfig -> DataFrame -> [Expr Double] -> Int -> Int -> [Expr Double]
+    SynthConfig -> DataFrame -> [NumExpr] -> Int -> Int -> [NumExpr]
 numericExprs cfg df prevExprs depth maxDepth
     | depth == 0 = baseExprs ++ numericExprs cfg df baseExprs (depth + 1) maxDepth
     | depth >= maxDepth = []
@@ -592,20 +662,16 @@ numericExprs cfg df prevExprs depth maxDepth
         | otherwise = do
             e1 <- prevExprs
             e2 <- baseExprs
-            let cols = getColumns e1 <> getColumns e2
+            let cols = numExprCols e1 <> numExprCols e2
             guard
-                ( e1 /= e2
+                ( not (numExprEq e1 e2)
                     && not
                         ( any
                             (\(l, r) -> l `elem` cols && r `elem` cols)
                             (disallowedCombinations cfg)
                         )
                 )
-            [ e1 + e2
-                , e1 - e2
-                , e1 * e2
-                , F.ifThenElse (e2 ./= (0 :: Expr Double)) (e1 / e2) 0
-                ]
+            combineNumExprs e1 e2
 
 boolExprs ::
     DataFrame -> [Expr Bool] -> [Expr Bool] -> Int -> Int -> [Expr Bool]
@@ -631,37 +697,9 @@ generateConditionsOld cfg df =
                 let ps = map (Lit . (`percentileOrd'` col)) [1, 25, 75, 99]
                  in map (F.lift2 (==) (Col @a colName)) ps
             (OptionalColumn (col :: V.Vector (Maybe a))) -> case sFloating @a of
-                STrue ->
-                    let doubleCol =
-                            VU.convert
-                                (V.map fromJust (V.filter isJust (V.map (fmap (realToFrac @a @Double)) col)))
-                     in zipWith
-                            ($)
-                            [ F.lift2 (==) (Col @(Maybe a) colName)
-                            , F.lift2 (<=) (Col @(Maybe a) colName)
-                            , F.lift2 (>=) (Col @(Maybe a) colName)
-                            ]
-                            ( Lit Nothing
-                                : map
-                                    (Lit . Just . realToFrac . (`percentile'` doubleCol))
-                                    (percentiles cfg)
-                            )
+                STrue -> [] -- handled by numericCols / numericExprs
                 SFalse -> case sIntegral @a of
-                    STrue ->
-                        let doubleCol =
-                                VU.convert
-                                    (V.map fromJust (V.filter isJust (V.map (fmap (fromIntegral @a @Double)) col)))
-                         in zipWith
-                                ($)
-                                [ F.lift2 (==) (Col @(Maybe a) colName)
-                                , F.lift2 (<=) (Col @(Maybe a) colName)
-                                , F.lift2 (>=) (Col @(Maybe a) colName)
-                                ]
-                                ( Lit Nothing
-                                    : map
-                                        (Lit . Just . round . (`percentile'` doubleCol))
-                                        (percentiles cfg)
-                                )
+                    STrue -> [] -- handled by numericCols / numericExprs
                     SFalse ->
                         map
                             (F.lift2 (==) (Col @(Maybe a) colName) . Lit . (`percentileOrd'` col))
 
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE ExplicitNamespaces #-}
 {-# LANGUAGE FlexibleContexts #-}
 {-# LANGUAGE GADTs #-}
@@ -36,10 +37,12 @@ import DataFrame.Internal.DataFrame (
     derivingExpressions,
     empty,
     getColumn,
+    unsafeGetColumn,
  )
 import DataFrame.Internal.Expression
 import DataFrame.Internal.Interpreter
 import DataFrame.Operations.Core
+import DataFrame.Operations.Merge ()
 import DataFrame.Operations.Transformations (apply)
 import System.Random
 import Type.Reflection
@@ -471,3 +474,81 @@ generateRandomVector pureGen k = VU.fromList $ go pureGen k
             (v, g') = uniformR (0 :: Double, 1 :: Double) g
          in
             v : go g' (n - 1)
+
+-- | Render every row of a column as a 'T.Text' label.
+--
+-- A boxed column that already holds 'T.Text' is returned as is. Every other
+-- value is rendered with 'show', so optional values keep their @Just@ /
+-- @Nothing@ wrapper in the label.
+columnToTextVec :: Column -> V.Vector T.Text
+columnToTextVec column = case column of
+    BoxedColumn (boxed :: V.Vector a) ->
+        case testEquality (typeRep @a) (typeRep @T.Text) of
+            Just Refl -> boxed
+            Nothing -> V.map render boxed
+    -- Unboxed vectors are converted to boxed ones before rendering.
+    UnboxedColumn unboxed -> V.map render (V.convert unboxed)
+    OptionalColumn optional -> V.map render optional
+  where
+    render :: (Show x) => x -> T.Text
+    render = T.pack . show
+
+-- | Group row positions by the textual label of their value in the column.
+--
+-- Labels come from 'columnToTextVec'. Each map entry holds the 0-based row
+-- indices carrying that label, in ascending order.
+groupByIndices :: Column -> M.Map T.Text (VU.Vector Int)
+groupByIndices col =
+    M.map (VU.fromList . L.reverse) newestFirst
+  where
+    -- Indices are prepended while folding left to right, so each bucket is
+    -- in descending order until it is reversed above.
+    newestFirst = V.ifoldl' record M.empty (columnToTextVec col)
+    record buckets rowIx label = M.insertWith (++) label [rowIx] buckets
+
+-- | Restrict every column of a dataframe to the rows at the given indices.
+--
+-- Each column is passed through 'atIndicesStable'. The row count in the
+-- dimensions becomes the number of indices; the column count is unchanged.
+rowsAtIndices :: VU.Vector Int -> DataFrame -> DataFrame
+rowsAtIndices ixs df =
+    df{columns = pickedColumns, dataframeDimensions = (rowCount, columnCount)}
+  where
+    pickedColumns = V.map (atIndicesStable ixs) (columns df)
+    rowCount = VU.length ixs
+    columnCount = snd (dataframeDimensions df)
+
+{- | Sample each stratum of a dataframe separately and append the results.
+
+Rows are grouped by the label of the strata expression (see 'groupByIndices').
+A bare column reference is looked up directly in the dataframe; any other
+expression is interpreted first, and an interpretation error is thrown. Each
+group is sampled with 'sample' at fraction @p@, using its own generator
+obtained by repeatedly splitting the one supplied.
+
+==== __Example__
+@
+ghci> import System.Random
+ghci> D.stratifiedSample (mkStdGen 42) 0.8 "label" df
+@
+-}
+stratifiedSample ::
+    forall a g.
+    (SplitGen g, RandomGen g, Columnable a) =>
+    g -> Double -> Expr a -> DataFrame -> DataFrame
+stratifiedSample gen p strataCol df =
+    L.foldr (<>) mempty sampledStrata
+  where
+    strataColumn = case strataCol of
+        Col name -> unsafeGetColumn name df
+        _ -> unwrapTypedColumn (either throw id (interpret @a df strataCol))
+    strata = M.elems (groupByIndices strataColumn)
+    sampledStrata = snd (L.mapAccumL sampleStratum gen strata)
+    -- Use one half of the split for this stratum, pass the other half on.
+    sampleStratum g ixs =
+        let (here, later) = splitGen g
+         in (later, sample here p (rowsAtIndices ixs df))
+
+{- | Split each stratum of a dataframe separately and append the two halves.
+
+Rows are grouped by the label of the strata expression (see 'groupByIndices').
+A bare column reference is looked up directly in the dataframe; any other
+expression is interpreted first, and an interpretation error is thrown. Each
+group is divided with 'randomSplit' at fraction @p@, using its own generator
+obtained by repeatedly splitting the one supplied. The first components are
+appended into the first result and the second components into the second.
+
+==== __Example__
+@
+ghci> import System.Random
+ghci> D.stratifiedSplit (mkStdGen 42) 0.8 "label" df
+@
+-}
+stratifiedSplit ::
+    forall a g.
+    (SplitGen g, RandomGen g, Columnable a) =>
+    g -> Double -> Expr a -> DataFrame -> (DataFrame, DataFrame)
+stratifiedSplit gen p strataCol df =
+    (L.foldr (<>) mempty firstParts, L.foldr (<>) mempty secondParts)
+  where
+    strataColumn = case strataCol of
+        Col name -> unsafeGetColumn name df
+        _ -> unwrapTypedColumn (either throw id (interpret @a df strataCol))
+    strata = M.elems (groupByIndices strataColumn)
+    (firstParts, secondParts) = L.unzip (snd (L.mapAccumL splitStratum gen strata))
+    -- Use one half of the split for this stratum, pass the other half on.
+    splitStratum g ixs =
+        let (here, later) = splitGen g
+         in (later, randomSplit here p (rowsAtIndices ixs df))
Original file line number	Diff line number	Diff line change
`@@ -362,6 +362,8 @@ import DataFrame.Operations.Subset as Subset (`
`362`	`362`	`sample,`
`363`	`363`	`select,`
`364`	`364`	`selectBy,`
	`365`	`+ stratifiedSample,`
	`366`	`+ stratifiedSplit,`
`365`	`367`	`take,`
`366`	`368`	`takeLast,`
`367`	`369`	`)`