提示词缓存通过缓存令牌来降低 API 成本，但不提供对话记忆。要在多次调用之间持久化对话历史，请使用 MemorySaver 之类的检查点器。
from langchain.agents import create_agent
from langchain.messages import HumanMessage
from langchain.tools import tool
from langchain_aws import ChatBedrockConverse
from langchain_aws.middleware.prompt_caching import BedrockPromptCachingMiddleware
from langchain_core.runnables import RunnableConfig
from langgraph.checkpoint.memory import MemorySaver


@tool
def get_weather(city: str) -> str:
    """Get the current weather for a city."""
    return f"The weather in {city} is sunny and 72F."


# Caching only takes effect once the system prompt exceeds 1,024 tokens, so
# the introduction is followed by one sentence repeated 85 times.
_PROMPT_INTRO = (
    "You are a helpful weather assistant with deep expertise in meteorology, "
    "climate science, and atmospheric phenomena. When answering questions about "
    "weather, provide accurate and up-to-date information. "
)
_PROMPT_FILLER = "You should always strive to give the most helpful response possible. "
LONG_PROMPT = _PROMPT_INTRO + _PROMPT_FILLER * 85

bedrock_model = ChatBedrockConverse(
    model="us.anthropic.claude-sonnet-4-5-20250929-v1:0"
)
caching_middleware = BedrockPromptCachingMiddleware(ttl="5m")

agent = create_agent(
    model=bedrock_model,
    system_prompt=LONG_PROMPT,
    tools=[get_weather],
    middleware=[caching_middleware],
    checkpointer=MemorySaver(),  # Keeps the conversation history between calls
)

# Both invocations share this thread_id so conversation state carries over.
config: RunnableConfig = {"configurable": {"thread_id": "user-123"}}

# First call: the cache is created from the system prompt, tools, and user message.
miami_question = HumanMessage("What is the weather in Miami?")
response = agent.invoke({"messages": [miami_question]}, config=config)
last_msg = response["messages"][-1]
print(last_msg.content)

# Report the cache token counts attached to the final message, if any.
usage = last_msg.usage_metadata
if usage:
    token_details = usage.get("input_token_details", {})
    # `or 0` turns a present-but-None value into 0.
    read_tokens = token_details.get("cache_read", 0) or 0
    write_tokens = token_details.get("cache_creation", 0) or 0
    print(f"Cache read: {read_tokens}, Cache write: {write_tokens}")

# Second call: the cached system prompt, tools, and earlier messages are reused.
seattle_question = HumanMessage("How about Seattle?")
response = agent.invoke({"messages": [seattle_question]}, config=config)
print(response["messages"][-1].content)