daft-examples/pipelines/context_engineering/lambda_mapreduce.py at main · Eventual-Inc/daft-examples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
# /// script
# description = "Lambda MapReduce: 6 long-context reasoning patterns expressed as native Daft query plans"
# requires-python = ">=3.12, <3.13"
# dependencies = ["daft[openai]>=0.7.10", "pymupdf", "python-dotenv"]
# ///

"""
Lambda MapReduce — Long-Context Reasoning via Daft
===================================================

Implements the core insight from lambda-RLM (Roy et al., 2026):
arbitrary recursive LLM reasoning reduces to MAP + REDUCE over
bounded chunks, with the LLM as a leaf oracle.

The key realization: REDUCE is a Daft aggregation, not a Python loop.

    SPLIT  = from_glob_path → extract_pdf → unnest      (pages are chunks)
    FILTER = where(prompt(...).startswith("Y"))      (LLM as predicate)
    MAP    = with_column("result", prompt(col(...)))     (LLM as leaf oracle)
    REDUCE = groupby().agg() → list_join() [→ prompt()]  (fold via Daft expressions)

Six patterns, one skeleton. Every operation stays in the query plan.

Usage:
    uv run pipelines/context_engineering/lambda_mapreduce.py
    uv run pipelines/context_engineering/lambda_mapreduce.py --pattern summarize
    uv run pipelines/context_engineering/lambda_mapreduce.py --pattern qa --query "What methods are proposed?"
"""

from __future__ import annotations

import argparse
import time
from collections.abc import Iterator
from typing import TypedDict

import pymupdf

import daft
from daft import DataFrame, col, lit
from daft.functions import format, prompt, unnest

# ==============================================================================
# SPLIT: PDF → pages (document structure IS the split)
# ==============================================================================


class PdfPage(TypedDict):
    """One extracted PDF page: the record type yielded by ``extract_pdf``."""

    # Zero-based position of the page within its PDF (produced by ``enumerate``).
    page_number: int
    # Plain text of the page as returned by ``page.get_text("text")``.
    page_text: str


@daft.func
def extract_pdf(file: daft.File) -> Iterator[PdfPage]:
    """Extract text from each page of a PDF. Generator UDF: 1 file → N pages.

    Args:
        file: Daft file handle pointing at a PDF.

    Yields:
        A ``PdfPage`` for every page whose stripped text is longer than
        50 characters; ``page_number`` is the zero-based page index.
    """
    # Silence MuPDF's own diagnostics output for malformed PDFs.
    pymupdf.TOOLS.mupdf_display_errors(False)
    with file.to_tempfile() as tmp:
        # Open the document as a context manager so it is closed before the
        # temporary file goes away, including when the consumer stops
        # iterating early or a page raises while being read.
        with pymupdf.Document(filename=str(tmp.name), filetype="pdf") as doc:
            for pno, page in enumerate(doc):
                text = page.get_text("text")
                # Skip pages with (almost) no text — presumably blank or
                # image-only pages that would waste an LLM call.
                if len(text.strip()) > 50:
                    yield PdfPage(page_number=pno, page_text=text)


def load_papers(source: str, max_papers: int = 2, max_pages: int | None = None) -> DataFrame:
    """SPLIT: turn a glob of PDFs into a flat, one-row-per-page DataFrame.

    Args:
        source: Glob path handed to ``daft.from_glob_path``.
        max_papers: Upper bound on the number of files taken from the glob.
        max_pages: Optional upper bound on the total number of page rows;
            ``None`` leaves the page count unbounded.

    Returns:
        A lazy DataFrame with ``path``, ``size`` and the unnested page fields.
    """
    # Cap the number of files first so only those are opened and parsed.
    files = daft.from_glob_path(source).limit(max_papers)
    with_handles = files.with_column("pdf_file", daft.functions.file(col("path")))
    with_pages = with_handles.with_column("page", extract_pdf(col("pdf_file")))
    # Unnest the page struct so its fields become top-level columns.
    pages = with_pages.select("path", "size", unnest(col("page")))
    return pages if max_pages is None else pages.limit(max_pages)


# ==============================================================================
# The six lambda-RLM patterns — all MAP + REDUCE via Daft expressions
# ==============================================================================


def pattern_search(df: DataFrame, query: str, model: str) -> DataFrame:
    """
    Search: PEEK → FILTER → MAP → SELECT_BEST

    Find a needle in a haystack. Filter cheap (preview), answer expensive (full page).

    Args:
        df: Page-level DataFrame with ``path``, ``page_number`` and ``page_text``.
        query: The question to answer.
        model: Model name forwarded to every ``prompt`` call.

    Returns:
        A lazy DataFrame with at most one row: ``path``, ``page_number``, ``result``.
    """
    return (
        df
        # PEEK: first 200 chars
        .with_column("preview", col("page_text").substr(0, 200))
        # FILTER: LLM relevance check on preview
        .with_column(
            "is_relevant",
            prompt(
                format(
                    "Question: {}\n\nDoes this excerpt likely contain the answer? Reply YES or NO only.\n\nExcerpt:\n{}",
                    lit(query),
                    col("preview"),
                ),
                model=model,
            ),
        )
        # Strip leading whitespace before the prefix test so a reply such as
        # "\nYES" or " Yes." is not silently treated as a NO.
        .where(col("is_relevant").lstrip().upper().startswith("Y"))
        # MAP: answer from relevant pages
        .with_column(
            "result",
            prompt(
                format("Question: {}\n\nAnswer using ONLY this context:\n\n{}", lit(query), col("page_text")),
                model=model,
            ),
        )
        # REDUCE: select best — filter noise, take earliest page
        .where(~col("result").lower().contains("not found"))
        .where(~col("result").lower().contains("no information"))
        .sort("page_number")
        .limit(1)
        .select("path", "page_number", "result")
    )


def pattern_summarize(df: DataFrame, model: str) -> DataFrame:
    """
    Summarize: MAP → AGG → LLM_REDUCE

    Produce one summary per page, gather them per document, and ask the
    model once more to merge the gathered summaries.

    Returns a lazy DataFrame with one row per ``path``: ``path``, ``result``.
    """
    # MAP: one LLM call per page.
    page_prompt = lit("Summarize the following text concisely:\n\n") + col("page_text")
    per_page = df.with_column("result", prompt(page_prompt, model=model))

    # REDUCE (fold): collect the page summaries of each document into one string.
    per_document = (
        per_page.groupby("path")
        .agg(col("result").list_agg().alias("partial_summaries"))
        .with_column("context", col("partial_summaries").list_join("\n\n---\n\n"))
    )

    # REDUCE (merge): a single LLM call per document over the joined summaries.
    merge_prompt = format(
        "Merge these partial summaries into one concise, coherent summary. Preserve all key facts:\n\n{}",
        col("context"),
    )
    merged = per_document.with_column("result", prompt(merge_prompt, model=model))
    return merged.select("path", "result")


def pattern_classify(df: DataFrame, model: str) -> DataFrame:
    """
    Classify: MAP → MAJORITY_VOTE

    Classify each page, then majority vote via groupby + count.

    Args:
        df: Page-level DataFrame with a ``page_text`` column.
        model: Model name forwarded to ``prompt``.

    Returns:
        A lazy DataFrame with at most one row: ``result`` (the winning
        normalized label) and ``votes`` (how many pages received it).
        The vote is taken over all rows of ``df``, not per document.
    """
    return (
        df
        # MAP: classify each page
        .with_column(
            "label",
            prompt(
                lit(
                    "Classify this text into exactly one category "
                    "(Methods, Results, Introduction, Discussion, Related Work). "
                    "Reply with ONLY the category name:\n\n"
                )
                + col("page_text"),
                model=model,
            ),
        )
        # REDUCE: majority vote — normalize, count, take most frequent
        .with_column("label_norm", col("label").lower().lstrip().rstrip())
        .groupby("label_norm")
        .agg(col("label_norm").count().alias("votes"))
        # Break ties on the label itself so the winner is deterministic
        # instead of depending on the engine's group ordering.
        .sort(["votes", "label_norm"], desc=[True, False])
        .limit(1)
        .select(col("label_norm").alias("result"), "votes")
    )


def pattern_extract(df: DataFrame, model: str) -> DataFrame:
    """
    Extract: MAP → SPLIT_LINES → DEDUP

    Extract facts from each page, split into lines, deduplicate.

    Args:
        df: Page-level DataFrame with a ``page_text`` column.
        model: Model name forwarded to ``prompt``.

    Returns:
        A lazy single-column DataFrame (``result``) holding each distinct,
        non-empty, whitespace-trimmed fact line once.
    """
    return (
        df
        # MAP: extract facts, one per line
        .with_column(
            "result",
            prompt(
                lit("Extract all key facts, entities, numbers, and findings. One fact per line:\n\n")
                + col("page_text"),
                model=model,
            ),
        )
        # REDUCE: split lines, explode, dedup
        .with_column("facts", col("result").split("\n"))
        .explode("facts")
        .with_column("fact", col("facts").lstrip().rstrip())
        .where(col("fact").length() > 0)
        # Project down to the fact text BEFORE distinct(): distinct() compares
        # whole rows, and rows that still carry per-page columns such as
        # page_text differ from page to page, so the same fact found on two
        # pages would otherwise never be collapsed.
        .select(col("fact").alias("result"))
        .distinct()
    )


def pattern_qa(df: DataFrame, query: str, model: str) -> DataFrame:
    """
    QA: FILTER → MAP → AGG → LLM_SYNTHESIZE

    Filter relevant pages, answer from each, aggregate, synthesize.

    Args:
        df: Page-level DataFrame with ``path`` and ``page_text`` columns.
        query: The question to answer.
        model: Model name forwarded to every ``prompt`` call.

    Returns:
        A lazy DataFrame with one row per ``path`` that had at least one
        usable partial answer: ``path``, ``result``.
    """
    return (
        df
        # FILTER: LLM relevance check on preview
        .with_column("preview", col("page_text").substr(0, 200))
        .with_column(
            "is_relevant",
            prompt(
                format(
                    "Question: {}\n\nDoes this excerpt contain relevant information? Reply YES or NO only.\n\nExcerpt:\n{}",
                    lit(query),
                    col("preview"),
                ),
                model=model,
            ),
        )
        # Strip leading whitespace before the prefix test so a reply such as
        # "\nYES" or " Yes." is not silently treated as a NO.
        .where(col("is_relevant").lstrip().upper().startswith("Y"))
        # MAP: answer from each relevant page
        .with_column(
            "result",
            prompt(
                format("Question: {}\n\nAnswer based on this context only:\n\n{}", lit(query), col("page_text")),
                model=model,
            ),
        )
        # REDUCE: aggregate partial answers, then synthesize via LLM.
        # Drop non-answers first (same two noise phrases the search pattern
        # filters) so they do not dilute the synthesis context.
        .where(~col("result").lower().contains("not found"))
        .where(~col("result").lower().contains("no information"))
        .groupby("path")
        .agg(col("result").list_agg().alias("partial_answers"))
        .with_column("context", col("partial_answers").list_join("\n\n---\n\n"))
        .with_column(
            "result",
            prompt(
                format(
                    "Question: {}\n\nSynthesise these partial answers into one complete, accurate answer:\n\n{}",
                    lit(query),
                    col("context"),
                ),
                model=model,
            ),
        )
        .select("path", "result")
    )


def pattern_analyze(df: DataFrame, model: str) -> DataFrame:
    """
    Analyze: MAP → AGG → LLM_COMBINE

    Run a per-page analysis, gather the analyses per document, and ask the
    model once more to combine them.

    Returns a lazy DataFrame with one row per ``path``: ``path``, ``result``.
    """
    # MAP: one LLM call per page.
    page_prompt = (
        lit("Analyze the following text. Identify key themes, arguments, and implications:\n\n")
        + col("page_text")
    )
    per_page = df.with_column("result", prompt(page_prompt, model=model))

    # REDUCE (fold): collect the page analyses of each document into one string.
    per_document = (
        per_page.groupby("path")
        .agg(col("result").list_agg().alias("partial_analyses"))
        .with_column("context", col("partial_analyses").list_join("\n\n---\n\n"))
    )

    # REDUCE (combine): a single LLM call per document over the joined analyses.
    combine_prompt = format(
        "Combine these partial analyses into one comprehensive, well-structured analysis:\n\n{}",
        col("context"),
    )
    combined = per_document.with_column("result", prompt(combine_prompt, model=model))
    return combined.select("path", "result")


# ==============================================================================
# Pattern registry
# ==============================================================================

# CLI pattern name → function that builds the query plan for that pattern.
# Insertion order is the order in which `--pattern all` runs them.
PATTERNS = dict(
    search=pattern_search,
    summarize=pattern_summarize,
    classify=pattern_classify,
    extract=pattern_extract,
    qa=pattern_qa,
    analyze=pattern_analyze,
)

# Patterns whose builder additionally takes a `query` keyword argument.
QUERY_PATTERNS = {"search", "qa"}
# Default value of the `--source` CLI option.
DEFAULT_SOURCE = "hf://datasets/Eventual-Inc/sample-files/papers/*.pdf"


# ==============================================================================
# Runner
# ==============================================================================


def run_pattern(name: str, pages_df: DataFrame, model: str, query: str | None) -> tuple[DataFrame, float]:
    """Build and execute one pattern's plan.

    Args:
        name: Key into ``PATTERNS``.
        pages_df: Page-level DataFrame to run the pattern over.
        model: Model name forwarded to the pattern.
        query: Question for query-driven patterns; a built-in default is
            used when it is ``None`` or empty. Ignored by other patterns.

    Returns:
        ``(materialized_result, elapsed_seconds)`` where the elapsed time
        covers plan construction plus execution.
    """
    build_plan = PATTERNS[name]
    started = time.perf_counter()

    # Only the query-driven patterns accept a `query` keyword.
    kwargs = {"model": model}
    if name in QUERY_PATTERNS:
        kwargs["query"] = query or "What are the main contributions of this paper?"

    # collect() is what actually runs the lazy plan.
    materialized = build_plan(pages_df, **kwargs).collect()
    return materialized, time.perf_counter() - started


def main() -> None:
    """CLI entry point: parse options, load pages once, run the chosen pattern(s).

    Reads ``OPENAI_API_KEY`` from the environment (after loading a ``.env``
    file if present), prints a preview of the loaded pages, then prints each
    pattern's result together with its wall-clock time.
    """
    parser = argparse.ArgumentParser(description="Lambda MapReduce — Long-Context Reasoning via Daft")
    parser.add_argument("--pattern", choices=list(PATTERNS.keys()) + ["all"], default="all")
    parser.add_argument("--model", default="gpt-5-mini")
    parser.add_argument("--source", default=DEFAULT_SOURCE)
    parser.add_argument("--max-papers", type=int, default=1)
    parser.add_argument("--max-pages", type=int, default=None)
    parser.add_argument("--query", default=None)
    args = parser.parse_args()

    import os

    from dotenv import load_dotenv

    load_dotenv()
    daft.set_provider("openai", api_key=os.environ.get("OPENAI_API_KEY"))

    print("=" * 60)
    print("Lambda MapReduce — Long-Context Reasoning via Daft")
    print("=" * 60)

    # SPLIT
    # Materialize the pages once. The plan is lazy, so without collect() the
    # preview below and every pattern would each re-download and re-parse the
    # PDFs, and an un-ordered limit() could hand different pages to each run.
    pages_df = load_papers(args.source, args.max_papers, args.max_pages).collect()
    print("\nPages loaded:")
    pages_df.select("path", "page_number", col("page_text").length().alias("chars")).show(10)

    patterns_to_run = list(PATTERNS.keys()) if args.pattern == "all" else [args.pattern]

    # Resolve the effective query once so the printed query is exactly the
    # one handed to the pattern.
    query = args.query or "What are the main contributions of this paper?"

    for name in patterns_to_run:
        print(f"\n{'─' * 60}")
        print(f"Pattern: {name}")
        if name in QUERY_PATTERNS:
            print(f"Query:   {query}")
        print(f"{'─' * 60}")

        result_df, elapsed = run_pattern(name, pages_df, args.model, query)
        result_df.show()
        print(f"({elapsed:.1f}s)")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()