kagent-dev · skashmeri · May 27, 2026 · May 28, 2026 · Jun 1, 2026
@@ -304,6 +304,7 @@ func CreateLLM(ctx context.Context, m adk.Model, log logr.Logger) (adkmodel.LLM,
 			Model:                        modelName,
 			Region:                       region,
 			AdditionalModelRequestFields: m.AdditionalModelRequestFields,
+			PromptCaching:                m.PromptCaching,
 		}
 		return models.NewBedrockModelWithLogger(ctx, cfg, log)
 

@@ -77,6 +77,13 @@ type BedrockConfig struct {
 	Temperature                  *float64
 	TopP                         *float64
 	AdditionalModelRequestFields map[string]any
+	// PromptCaching, when true, appends a default CachePoint block at the
+	// end of the Converse request's system content array and the end of
+	// the tools array. Bedrock caches up to and including those markers
+	// across requests in the same region; cached prefix is billed at a
+	// reduced rate. The marker is silently ignored by Bedrock for models
+	// that do not support prompt caching.
+	PromptCaching bool
 }
 
 // BedrockModel implements model.LLM for Amazon Bedrock using the Converse API.
@@ -151,7 +158,7 @@ func (m *BedrockModel) GenerateContent(ctx context.Context, req *model.LLMReques
 		var toolConfig *types.ToolConfiguration
 		nameMap := make(map[string]string)
 		if req.Config != nil && len(req.Config.Tools) > 0 {
-			tools, nm := convertGenaiToolsToBedrock(req.Config.Tools)
+			tools, nm := convertGenaiToolsToBedrock(req.Config.Tools, m.Config.PromptCaching)
 			nameMap = nm
 			if len(tools) > 0 {
 				toolConfig = &types.ToolConfiguration{
@@ -193,6 +200,16 @@ func (m *BedrockModel) GenerateContent(ctx context.Context, req *model.LLMReques
 				Value: systemInstruction,
 			})
 		}
+		// If prompt caching is enabled, mark the end of the system content
+		// as a cache breakpoint. Bedrock caches everything up to and including
+		// this point for ~5 minutes; subsequent requests with the same prefix
+		// hit the cache. Skipped for empty systems — caching nothing is a no-op
+		// that wastes a marker.
+		if m.Config.PromptCaching && len(systemPrompt) > 0 {
+			systemPrompt = append(systemPrompt, &types.SystemContentBlockMemberCachePoint{
+				Value: types.CachePointBlock{Type: types.CachePointTypeDefault},
+			})
+		}
 
 		additionalFields := m.buildAdditionalModelRequestFields()
 
@@ -568,7 +585,12 @@ func convertGenaiContentsToBedrockMessages(contents []*genai.Content, nameMap ma
 // It sanitizes tool names to satisfy Bedrock's [a-zA-Z0-9_-]+ constraint and
 // returns the original->sanitized name mapping so callers can apply it to
 // conversation history and reverse it when restoring names from responses.
-func convertGenaiToolsToBedrock(tools []*genai.Tool) ([]types.Tool, map[string]string) {
+//
+// When promptCaching is true, a CachePoint marker is appended after the
+// last tool spec — Bedrock then caches the entire (typically large) tool
+// definitions array for ~5 minutes, billing the prefix at a reduced rate
+// on cache hits.
+func convertGenaiToolsToBedrock(tools []*genai.Tool, promptCaching bool) ([]types.Tool, map[string]string) {
 	if len(tools) == 0 {
 		return nil, nil
 	}
@@ -625,6 +647,17 @@ func convertGenaiToolsToBedrock(tools []*genai.Tool) ([]types.Tool, map[string]s
 		}
 	}
 
+	// If prompt caching is enabled, append a CachePoint at the END of the
+	// tool list. Bedrock caches the entire tool definitions array up to
+	// this marker; this is usually the biggest single chunk of static
+	// prefix in an agent conversation and benefits most from caching.
+	// Skipped when there are no tools — a cache marker by itself is a no-op.
+	if promptCaching && len(bedrockTools) > 0 {
+		bedrockTools = append(bedrockTools, &types.ToolMemberCachePoint{
+			Value: types.CachePointBlock{Type: types.CachePointTypeDefault},
+		})
+	}
+
 	return bedrockTools, nameMap
 }
 

@@ -162,7 +162,7 @@ func TestConvertGenaiToolsToBedrock(t *testing.T) {
 			},
 		}}}}
 
-		bt1, nm1 := convertGenaiToolsToBedrock(tools)
+		bt1, nm1 := convertGenaiToolsToBedrock(tools, false)
 		schema := extractSchema(t, bt1, nm1)
 
 		props := schema["properties"].(map[string]any)
@@ -190,7 +190,7 @@ func TestConvertGenaiToolsToBedrock(t *testing.T) {
 			},
 		}}}}
 
-		bt2, nm2 := convertGenaiToolsToBedrock(tools)
+		bt2, nm2 := convertGenaiToolsToBedrock(tools, false)
 		schema := extractSchema(t, bt2, nm2)
 		props, ok := schema["properties"].(map[string]any)
 		if !ok || len(props) == 0 {
@@ -211,7 +211,7 @@ func TestConvertGenaiToolsToBedrock(t *testing.T) {
 			ParametersJsonSchema: s,
 		}}}}
 
-		bt3, nm3 := convertGenaiToolsToBedrock(tools)
+		bt3, nm3 := convertGenaiToolsToBedrock(tools, false)
 		schema := extractSchema(t, bt3, nm3)
 		props, ok := schema["properties"].(map[string]any)
 		if !ok || len(props) == 0 {
@@ -366,7 +366,7 @@ func TestConvertGenaiToolsToBedrockSanitizesNames(t *testing.T) {
 		{Name: "filesystem:read_file", Description: "Read a file"},
 	}}}
 
-	bedrockTools, nameMap := convertGenaiToolsToBedrock(tools)
+	bedrockTools, nameMap := convertGenaiToolsToBedrock(tools, false)
 	if len(bedrockTools) != 2 {
 		t.Fatalf("expected 2 tools, got %d", len(bedrockTools))
 	}
@@ -424,3 +424,50 @@ func TestStreamingToolCallParseArgs(t *testing.T) {
 		})
 	}
 }
+
+func TestConvertGenaiToolsToBedrockPromptCaching(t *testing.T) {
+	tools := []*genai.Tool{{FunctionDeclarations: []*genai.FunctionDeclaration{
+		{Name: "get_weather", Description: "lookup weather"},
+		{Name: "list_pods", Description: "list pods"},
+	}}}
+
+	t.Run("disabled: no cache marker appended", func(t *testing.T) {
+		out, _ := convertGenaiToolsToBedrock(tools, false)
+		if len(out) != 2 {
+			t.Fatalf("expected 2 tools, got %d", len(out))
+		}
+		for i, tool := range out {
+			if _, ok := tool.(*types.ToolMemberCachePoint); ok {
+				t.Fatalf("did not expect a CachePoint at index %d when caching disabled", i)
+			}
+		}
+	})
+
+	t.Run("enabled: cache marker appended at the END of the tool list", func(t *testing.T) {
+		out, _ := convertGenaiToolsToBedrock(tools, true)
+		if len(out) != 3 {
+			t.Fatalf("expected 3 entries (2 tools + 1 CachePoint), got %d", len(out))
+		}
+		// The first two must remain ToolSpec entries (order preserved).
+		for i := 0; i < 2; i++ {
+			if _, ok := out[i].(*types.ToolMemberToolSpec); !ok {
+				t.Fatalf("entry %d: expected ToolMemberToolSpec, got %T", i, out[i])
+			}
+		}
+		// The trailing entry must be a CachePoint with type=default.
+		cp, ok := out[2].(*types.ToolMemberCachePoint)
+		if !ok {
+			t.Fatalf("trailing entry: expected ToolMemberCachePoint, got %T", out[2])
+		}
+		if cp.Value.Type != types.CachePointTypeDefault {
+			t.Errorf("expected CachePointType=default, got %v", cp.Value.Type)
+		}
+	})
+
+	t.Run("enabled but no tools: no cache marker (skipped)", func(t *testing.T) {
+		out, _ := convertGenaiToolsToBedrock(nil, true)
+		if len(out) != 0 {
+			t.Fatalf("expected empty slice for no tools, got %d entries", len(out))
+		}
+	})
+}
@@ -251,6 +251,11 @@ type Bedrock struct {
 	// additionalModelRequestFields in the Converse API. Use this for provider-specific
 	// options outside the standard InferenceConfiguration block.
 	AdditionalModelRequestFields map[string]any `json:"additional_model_request_fields,omitempty"`
+	// PromptCaching enables Bedrock prompt caching by appending a CachePoint
+	// block to the end of the system content array and the end of the tools
+	// array in the Converse request. See the v1alpha2.BedrockConfig CRD doc
+	// for context.
+	PromptCaching bool `json:"prompt_caching,omitempty"`
 }
 
 func (b *Bedrock) MarshalJSON() ([]byte, error) {

@@ -483,6 +483,24 @@ spec:
                       Claude extended thinking or top_k. Values are forwarded as-is to the API.
                       Example: {"top_k": 5, "thinking": {"type": "enabled", "budget_tokens": 16000}}
                     x-kubernetes-preserve-unknown-fields: true
+                  promptCaching:
+                    default: false
+                    description: |-
+                      PromptCaching enables Bedrock prompt caching by appending a CachePoint
+                      block at the end of the Converse request's `system` content array and
+                      the end of the `tools` array. Bedrock will cache the prefix up to and
+                      including those cache points across requests in the same region for
+                      roughly 5 minutes after first use, billing the cached portion at a
+                      reduced rate on cache hits.
+
+                      Recommended for tool-using agents that make many Converse calls per
+                      task with a stable system prompt and tool set — the per-call input
+                      token count can drop by 70-90% on hit. Has no effect on models that
+                      don't support caching; the marker is ignored by Bedrock for those.
+
+                      See https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html
+                      for the current list of supported models and minimum prefix sizes.
+                    type: boolean
                   region:
                     description: AWS region where the Bedrock model is available (e.g.,
                       us-east-1, us-west-2)

@@ -256,6 +256,24 @@ type BedrockConfig struct {
 	// +optional
 	// +kubebuilder:pruning:PreserveUnknownFields
 	AdditionalModelRequestFields *apiextensionsv1.JSON `json:"additionalModelRequestFields,omitempty"`
+
+	// PromptCaching enables Bedrock prompt caching by appending a CachePoint
+	// block at the end of the Converse request's `system` content array and
+	// the end of the `tools` array. Bedrock will cache the prefix up to and
+	// including those cache points across requests in the same region for
+	// roughly 5 minutes after first use, billing the cached portion at a
+	// reduced rate on cache hits.
+	//
+	// Recommended for tool-using agents that make many Converse calls per
+	// task with a stable system prompt and tool set — the per-call input
+	// token count can drop by 70-90% on hit. Has no effect on models that
+	// don't support caching; the marker is ignored by Bedrock for those.
+	//
+	// See https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html
+	// for the current list of supported models and minimum prefix sizes.
+	// +optional
+	// +kubebuilder:default=false
+	PromptCaching bool `json:"promptCaching,omitempty"`
 }
 
 // SAPAICoreConfig contains SAP AI Core-specific configuration options.

@@ -698,6 +698,7 @@ func (a *adkApiTranslator) translateModel(ctx context.Context, namespace, modelC
 			},
 			Region:                       model.Spec.Bedrock.Region,
 			AdditionalModelRequestFields: additionalFields,
+			PromptCaching:                model.Spec.Bedrock.PromptCaching,
 		}
 
 		// Populate TLS fields in BaseModel

@@ -483,6 +483,24 @@ spec:
                       Claude extended thinking or top_k. Values are forwarded as-is to the API.
                       Example: {"top_k": 5, "thinking": {"type": "enabled", "budget_tokens": 16000}}
                     x-kubernetes-preserve-unknown-fields: true
+                  promptCaching:
+                    default: false
+                    description: |-
+                      PromptCaching enables Bedrock prompt caching by appending a CachePoint
+                      block at the end of the Converse request's `system` content array and
+                      the end of the `tools` array. Bedrock will cache the prefix up to and
+                      including those cache points across requests in the same region for
+                      roughly 5 minutes after first use, billing the cached portion at a
+                      reduced rate on cache hits.
+
+                      Recommended for tool-using agents that make many Converse calls per
+                      task with a stable system prompt and tool set — the per-call input
+                      token count can drop by 70-90% on hit. Has no effect on models that
+                      don't support caching; the marker is ignored by Bedrock for those.
+
+                      See https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-caching.html
+                      for the current list of supported models and minimum prefix sizes.
+                    type: boolean
                   region:
                     description: AWS region where the Bedrock model is available (e.g.,
                       us-east-1, us-west-2)

@@ -251,6 +251,12 @@ class KAgentBedrockLlm(KAgentTLSMixin, BaseLlm):
 
     extra_headers: Optional[dict[str, str]] = None
     additional_model_request_fields: Optional[dict[str, Any]] = None
+    # When True, append a CachePoint block to the end of the Converse
+    # request's `system` content array and the end of the `toolConfig.tools`
+    # array. Bedrock caches the prefix up to and including those markers
+    # across requests in the same region; cached portion is billed at a
+    # reduced rate on hit. See AWS docs for supported models / minimums.
+    prompt_caching: bool = False
 
     model_config = {"arbitrary_types_allowed": True}
 
@@ -288,12 +294,23 @@ async def generate_content_async(
                 text = "\n".join(p.text for p in si.parts or [] if p.text)
                 if text:
                     kwargs["system"] = [{"text": text}]
+            # If prompt caching is on, mark the end of the system content as
+            # a cache breakpoint. Bedrock caches everything up to and including
+            # this point for ~5 minutes; subsequent requests with the same
+            # prefix hit the cache. No-op if we didn't produce any system text.
+            if self.prompt_caching and kwargs.get("system"):
+                kwargs["system"].append({"cachePoint": {"type": "default"}})
 
         if llm_request.config and llm_request.config.tools:
             genai_tools = [t for t in llm_request.config.tools if hasattr(t, "function_declarations")]
             if genai_tools:
                 converse_tools = _convert_tools_to_converse(genai_tools, tool_name_map, tool_name_counter)
                 if converse_tools:
+                    # CachePoint at the END of the tool list: tool definitions
+                    # are usually the biggest static chunk of an agent request
+                    # and benefit most from caching.
+                    if self.prompt_caching:
+                        converse_tools.append({"cachePoint": {"type": "default"}})
                     kwargs["toolConfig"] = {"tools": converse_tools}
 
         # Reverse map lets us restore original tool names from sanitized names in Bedrock responses.

@@ -240,6 +240,11 @@ class Bedrock(BaseLLM):
     # additionalModelRequestFields in the Converse API. Use this for provider-specific
     # options outside the standard InferenceConfiguration block.
     additional_model_request_fields: dict | None = None
+    # prompt_caching enables Bedrock prompt caching: a CachePoint marker is
+    # appended to the end of the Converse request's system content array and
+    # toolConfig.tools array. Bedrock caches the prefix across requests in the
+    # same region; cached portion is billed at a reduced rate on hit.
+    prompt_caching: bool = False
     type: Literal["bedrock"]
 
 
@@ -600,6 +605,7 @@ def _create_llm_from_model_config(model_config: ModelUnion):
             model=model_config.model,
             extra_headers=extra_headers,
             additional_model_request_fields=model_config.additional_model_request_fields,
+            prompt_caching=model_config.prompt_caching,
             **_transport_kwargs(model_config),
         )
     if model_config.type == "sap_ai_core":