# Structured Output.py - 乘风的人

# 核心概念
# Structured Output = 将 LLM 的自然语言输出转为结构化 Python 对象
# 在 LangChain 1.0 中，使用 with_structured_output() 方法结合 Pydantic 模型，可以确保 LLM 返回符合预定义模式的数据。

# 基本用法

# 定义 Pydantic 模型

from typing import List, Optional

from pydantic import BaseModel, Field, model_serializer


class Person(BaseModel):
    # Schema for one person extracted from free text. Attribute names are
    # English identifiers; the Chinese descriptions passed to Field() are
    # hints meant to help the model understand each field.
    name: str = Field(description="人的姓名")
    age: int = Field(description="人的年龄")
    # The only optional field: it falls back to None when no job is given.
    job: Optional[str] = Field(None, description="人的职业")

    # Custom whole-model serializer; the original author added it to silence
    # a serialization warning. It emits the three fields in declaration order.
    @model_serializer
    def serialize_model(self):
        field_names = ("name", "age", "job")
        return {field: getattr(self, field) for field in field_names}

    # email: Optional[str] = Field(default=None, description="人的邮箱")


import os
import sys
from dotenv import load_dotenv
from langchain.chat_models import init_chat_model
from langchain.agents import create_agent
from langchain_core.tools import tool
from langchain.agents.middleware import AgentMiddleware
from langgraph.checkpoint.memory import InMemorySaver

# Append the parent of this file's directory to the import path. This must
# run before the `init_model` import below (presumably `init_model.py` lives
# in that parent directory — confirm against the project layout).
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from init_model import get_chat_model

# Module-level chat model, created at import time and used by every demo
# function in this file.
chat_model = get_chat_model()

# 使用 with_structured_output()


# 核心组件
# 1. Pydantic BaseModel
# 所有结构化输出的数据模型都必须继承 BaseModel：

# from pydantic import BaseModel

# class MyModel(BaseModel):
#     field1: str
#     field2: int

# 2. Field 描述
# 使用 Field() 添加字段描述，帮助 LLM 理解：
from pydantic import Field


# 高级特性
# 可选字段
class Person(BaseModel):
    # NOTE: this definition rebinds the name `Person`, so it (not the earlier
    # class of the same name) is the schema that demo1() passes to
    # with_structured_output(). The field descriptions are therefore repeated
    # here so the hints for the model are not lost by the redefinition.
    #
    # Only `name` is required; `age` and `job` default to None when the text
    # does not mention them.
    # NOTE(review): documented with `#` comments rather than a class
    # docstring, since a docstring may be picked up as the schema description
    # sent to the model — confirm before adding one.
    name: str = Field(description="人的姓名")
    age: Optional[int] = Field(default=None, description="人的年龄")
    job: Optional[str] = Field(default=None, description="人的职业")  # may be None


def demo1():
    """Extract a ``Person`` from one Chinese sentence and print its fields.

    Wraps the module-level ``chat_model`` with ``with_structured_output`` and
    prints the ``name``, ``age`` and ``job`` attributes of the result on a
    single line.
    """
    prompt = "张三是一名 30 岁的软件工程师。请提取信息。按照要求返回JSON格式数据，字段名必须为英文（name/age/job）"

    # include_raw=False is passed explicitly, as in the original call; the
    # result is then read as an object exposing name/age/job.
    person_extractor = chat_model.with_structured_output(Person, include_raw=False)
    person = person_extractor.invoke(prompt)

    summary = " ".join(
        f"{attr}={getattr(person, attr)}" for attr in ("name", "age", "job")
    )
    print(summary)


# 工作原理
# 传统方式 vs 结构化输出
# 传统方式（繁琐）：

# 1. 提示词要求 JSON
# prompt = "以JSON格式返回：{name, age, job}"
# response = chat_model.invoke(prompt)

# # 2. 手动解析
# import json

# data = json.loads(response.content)
# # 3. 手动验证类型
# if not isinstance(data["age"], int):
#     raise ValueError("age must be int")
# # 4. 手动创建对象
# person = Person(**data)


# 结构化输出（简洁）：
# 一步到位
# structured_llm = chat_model.with_structured_output(Person)
# person = structured_llm.invoke("张三是一名 30 岁的软件工程师")
# ✅ 自动解析、验证、创建对象

# 幕后流程
# 1. Pydantic 模型 → JSON Schema
#    Person → {
#      "type": "object",
#      "properties": {
#        "name": {"type": "string", "description": "姓名"},
#        "age": {"type": "integer", "description": "年龄"}
#      }
#    }
# 2. JSON Schema → LLM (函数调用)
#    LLM 被强制返回符合 schema 的 JSON
# 3. JSON → Pydantic 对象
#    自动验证类型并创建 Person 实例


# 创建结构化输出的 LLM


# 实际应用
# 1. 客户信息提取
class CustomerInfo(BaseModel):
    # Contact details and complaint pulled out of a customer message;
    # demo2() passes this class to with_structured_output().
    # Every field is required except `email`.
    name: str = Field(description="客户姓名")
    phone: str = Field(description="电话号码")
    # Customers may not give an e-mail address, so this defaults to None.
    email: Optional[str] = Field(default=None, description="邮箱")
    issue: str = Field(description="问题描述")


def demo2():
    """Extract ``CustomerInfo`` from a short support message and print it."""
    customer_message = "客户: 我是李明，电话 138-1234-5678，问题是：订单没发货.仅返回JSON数据，不添加任何额外说明文字"
    # The instruction prefix and the message are sent as one prompt string.
    request = "提取客户信息：" + customer_message

    customer_extractor = chat_model.with_structured_output(CustomerInfo)
    print(customer_extractor.invoke(request))


# 示例输出（第一行是运行时 Pydantic 序列化警告残留的片段，第二行才是提取结果）：
#       return self.__pydantic_serializer__.to_python(
# name='李明' phone='138-1234-5678' email=None issue='订单没发货'


# 应用：
# 自动填充 CRM 系统
# 工单自动分类
# 客服辅助


# 2. 产品评论分析
class Review(BaseModel):
    # Structured form of one product review; demo3() passes this class to
    # with_structured_output(). All four fields are required.
    # Product being reviewed (the only field without a description hint).
    product: str
    # The description states a 1-5 scale; nothing here enforces that range.
    rating: int = Field(description="评分 1-5")
    # Positive points mentioned in the review, one string per point.
    pros: List[str] = Field(description="优点列表")
    # Negative points mentioned in the review, one string per point.
    cons: List[str] = Field(description="缺点列表")


def demo3():
    """Turn a free-text iPhone 15 review into a ``Review`` object and print it."""
    # The literal is kept exactly as written, including its leading/trailing
    # newlines and indentation, because it is passed to the model unchanged.
    review_text = """
       iPhone 15 很棒！摄像头强大，手感好。但是价格贵，没有充电器。4分。仅返回JSON数据，不添加任何额外说明文字
    """

    review_extractor = chat_model.with_structured_output(Review)
    parsed_review = review_extractor.invoke(review_text)
    print(parsed_review)


# 示例输出（第一行是运行时 Pydantic 序列化警告残留的片段，第二行才是提取结果）：
# return self.__pydantic_serializer__.to_python(
# product='iPhone 15' rating=4 pros=['摄像头强大', '手感好'] cons=['价格贵', '没有充电器']
# 应用：

# 批量处理用户评论
# 自动生成分析报告
# 发现产品改进点


# 3. 文档信息提取 嵌套模型
# 1. 定义子模型：商品明细
class InvoiceItem(BaseModel):
    # Child model: one line item of an invoice. Used as the element type of
    # Invoice.items.
    # Item name.
    name: str = Field(description="商品名称")
    # Per the description, either the unit price or the subtotal of this item.
    amount: float = Field(description="商品单价或该项小计金额")
    # Optional extension, currently disabled (quantity defaulting to 1):
    # quantity: int = Field(default=1, description="数量，如果未提及默认为1")


# 2. 定义主模型：发票
class Invoice(BaseModel):
    # Parent model: invoice header plus its line items; demo4() passes this
    # class to with_structured_output(). All fields are required.
    # Invoice number, kept as a string.
    invoice_number: str = Field(description="发票号")
    # Date as a plain string; the description asks for YYYY-MM-DD, but the
    # format is not validated here.
    date: str = Field(description="日期，格式务必为 YYYY-MM-DD")
    # Invoice total as stated on the invoice.
    total_amount: float = Field(description="发票总金额")
    # Nested list of InvoiceItem objects (name + amount per item).
    items: List[InvoiceItem] = Field(description="商品明细列表，包含名称和价格")


def demo4():
    """Extract a nested ``Invoice`` (header plus line items) from sample text and print it."""
    # Sample invoice; the three listed amounts add up to the stated total.
    invoice_text = """
    发票号：20230801001
    日期：2023-08-01
    总金额：1234.56
    项目：
    1. 商品A - 100.00
    2. 商品B - 200.00
    3. 服务费 - 934.56
    """

    # The prompt embeds the sample text, asks for the result as JSON, and adds
    # three rules: date format, one name/amount pair per item, and trusting
    # the stated total over the sum of the items.
    prompt = f"""
    请从以下文本中提取发票信息，并**以 JSON 格式返回**结果：
    {invoice_text}
    注意：
    1. 确保日期格式为 YYYY-MM-DD。
    2. 将每个项目拆分为名称和对应的金额。
    3. 如果总金额与各项之和不符，以文档中明确写出的"总金额"为准。
    """

    invoice_extractor = chat_model.with_structured_output(Invoice)
    invoice: Invoice = invoice_extractor.invoke(prompt)
    print(invoice)


# 应用：

# 自动化财务处理
# OCR 后结构化
# 数据录入

# 最佳实践
# # 1. 使用清晰的字段描述
# class Good(BaseModel):
#     created_at: str = Field(description="创建时间，格式 YYYY-MM-DD")

# class Bad(BaseModel):
#     created_at: str  # 没有描述，LLM 可能格式错误

# # 2. 合理使用 Optional
# class Good(BaseModel):
#     email: Optional[str] = None  # 邮箱可能没有

# class Bad(BaseModel):
#     email: str  # 强制必填，可能导致提取失败

# # 3. 使用枚举限制值
# class Good(BaseModel):
#     status: Status  # 枚举

# class Bad(BaseModel):
#     status: str  # 可能返回任意字符串

# # 4. 列表设置合理的描述
# class Good(BaseModel):
#     tags: List[str] = Field(description="产品标签，如 '电子产品', '手机'")

# class Bad(BaseModel):
#     tags: List[str]  # LLM 不知道该提取什么

# # 5. 嵌套模型保持简单
# class Good(BaseModel):
#     user: User      # 1 层嵌套
#     settings: dict  # 复杂数据用 dict

# class Bad(BaseModel):
#     user: User
#         company: Company
#             address: Address
#                 country: Country  # 4 层嵌套，容易出错


if __name__ == "__main__":
    # Select the demo by name on the command line (e.g. `demo2`) instead of
    # editing commented-out calls. With no argument, demo4 runs, which is the
    # behaviour this script had when the choice was hard-coded.
    _demos = {"demo1": demo1, "demo2": demo2, "demo3": demo3, "demo4": demo4}
    _choice = sys.argv[1] if len(sys.argv) > 1 else "demo4"
    if _choice not in _demos:
        # Exit with a readable message rather than a KeyError traceback.
        raise SystemExit(
            f"unknown demo {_choice!r}; choose from: {', '.join(_demos)}"
        )
    _demos[_choice]()