การใช้ตัวอย่างอ้างอิง

คุณสามารถปรับปรุงคุณภาพของการสกัดข้อมูลได้โดยการให้ตัวอย่างอ้างอิงแก่ LLM

เคล็ดลับ: แม้ว่าบทแนะนำนี้จะเน้นการใช้ตัวอย่างกับโมเดลที่รองรับการเรียกใช้เครื่องมือ (tool calling) แต่เทคนิคนี้นำไปปรับใช้ได้ทั่วไป และใช้ได้กับโหมด JSON หรือเทคนิคที่อาศัยพรอมต์ (prompt) เช่นกัน

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# Prompt shared by every extraction call in this guide: a fixed system
# instruction, then a slot for reference examples, then the text to process.
# The variable names must stay "examples" and "text" because those are the keys
# every `invoke` call in this guide passes in.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "คุณเป็นอัลกอริทึมในการสกัดข้อมูลที่เชี่ยวชาญ เฉพาะการสกัดข้อมูลที่เกี่ยวข้องจากเนื้อหา หากคุณไม่ทราบค่าของแอตทริบิวต์ที่ถามให้สกัดข้อมูล ให้ส่งค่า null สำหรับค่าของแอตทริบิวต์",
        ),
        MessagesPlaceholder("examples"),  # <-- EXAMPLES!
        ("human", "{text}"),
    ]
)

ลองใช้เทมเพลต:

from langchain_core.messages import (
    HumanMessage,
)

# Render the template with a sample text and a single stand-in example message
# to inspect the message list the prompt produces.
prompt.invoke(
    {"text": "นี่คือข้อความบางอย่าง", "examples": [HumanMessage(content="ทดสอบ 1 2 3")]}
)

ChatPromptValue(messages=[SystemMessage(content="คุณเป็นอัลกอริทึมในการสกัดข้อมูลที่เชี่ยวชาญ เฉพาะการสกัดข้อมูลที่เกี่ยวข้องจากเนื้อหา หากคุณไม่ทราบค่าของแอตทริบิวต์ที่ถามให้สกัดข้อมูล ให้ส่งค่า null สำหรับค่าของแอตทริบิวต์"), HumanMessage(content='ทดสอบ 1 2 3'), HumanMessage(content='นี่คือข้อความบางอย่าง')])

กำหนดสกีมา

เราจะนำสกีมาของบุคคล (Person) จากบทเริ่มต้นฉบับย่อ (quickstart) กลับมาใช้อีกครั้ง

from typing import List, Optional

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI

class Person(BaseModel):
    """ข้อมูลเกี่ยวกับบุคคล."""

    # The docstring and the field descriptions are kept verbatim: pydantic puts
    # them into the generated schema, so they are runtime text, not comments.
    #
    # Every field is typed Optional but declared with `...` (no default), so a
    # value must always be passed explicitly while None remains acceptable.
    name: Optional[str] = Field(default=..., description="ชื่อของบุคคล")
    hair_color: Optional[str] = Field(
        default=...,
        description="สีของเส้นผมของบุคคลหากทราบ",
    )
    height_in_meters: Optional[str] = Field(
        default=...,
        description="ส่วนสูงเป็นเมตร",
    )

class Data(BaseModel):
    """ข้อมูลที่สกัดเกี่ยวกับบุคคล."""

    # Top-level extraction result. The docstring is left in its original
    # language because pydantic exposes it as the schema description.

    # One entry per person extracted from the text.
    people: List[Person]

นิยามตัวอย่างอ้างอิง

ตัวอย่างสามารถนิยามได้เป็นรายการของคู่ระหว่างข้อมูลเข้า-ข้อมูลออก

แต่ละตัวอย่างประกอบด้วยข้อความ input ตัวอย่าง และ output ตัวอย่างซึ่งแสดงว่าควรสกัดข้อมูลใดออกจากข้อความนั้น

ข้อมูลสารบัญ

รูปแบบของตัวอย่างต้องตรงกับรูปแบบที่ API ที่ใช้งานคาดหวัง (เช่น การเรียกใช้เครื่องมือ (tool calling) หรือโหมด JSON เป็นต้น)

ในที่นี้ ตัวอย่างที่จัดรูปแบบแล้วจะตรงกับรูปแบบที่ API การเรียกใช้เครื่องมือคาดหวัง เนื่องจากนั่นคือสิ่งที่เรากำลังใช้

import uuid
from typing import Dict, List, TypedDict

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from langchain_core.pydantic_v1 import BaseModel, Field


class _ExampleRequired(TypedDict):
    """Keys that every reference example must provide."""

    input: str  # The example text to extract from.
    tool_calls: List[BaseModel]  # Pydantic instances that should be extracted from it.


class Example(_ExampleRequired, total=False):
    """A reference example: an input text plus the tool calls expected for it.

    Keys:
        input: The example text.
        tool_calls: Pydantic instances that should be extracted from the text.
        tool_outputs: Optional. One reply string per tool call; it is read by
            ``tool_example_to_messages``, which substitutes a generic
            confirmation when the key is missing or empty.
    """

    # Declared on a `total=False` subclass so the key stays optional without
    # requiring `typing.NotRequired`.
    tool_outputs: List[str]


def tool_example_to_messages(example: Example) -> List[BaseMessage]:
    """Turn one reference example into the chat messages that represent it.

    This is an adapter from the ``Example`` dict to a message sequence that can
    be placed in a chat prompt. The sequence for one example is:

    1) HumanMessage: the text that information should be extracted from.
    2) AIMessage: the expected extraction, expressed as tool calls.
    3) ToolMessage: one per tool call, confirming to the model that the tool
       was called correctly.

    The ToolMessage entries are included because some chat models are tuned
    for agent use rather than for extraction.

    Args:
        example: Mapping with ``input`` (the text) and ``tool_calls`` (pydantic
            instances to extract). An optional ``tool_outputs`` list supplies
            the ToolMessage contents; when absent or empty, a generic
            confirmation is used for every call.

    Returns:
        The messages for this example, in the order described above.
    """
    result: List[BaseMessage] = [HumanMessage(content=example["input"])]

    # Each pydantic instance becomes one tool call with a fresh random id; the
    # tool name is the instance's class name and the arguments are its JSON dump.
    call_payloads = [
        {
            "id": str(uuid.uuid4()),
            "type": "function",
            "function": {
                "name": extracted.__class__.__name__,
                "arguments": extracted.json(),
            },
        }
        for extracted in example["tool_calls"]
    ]
    result.append(
        AIMessage(content="", additional_kwargs={"tool_calls": call_payloads})
    )

    replies = example.get("tool_outputs")
    if not replies:
        replies = ["You have correctly called this tool."] * len(call_payloads)

    # zip pairs replies with calls positionally and stops at the shorter one.
    result.extend(
        ToolMessage(content=reply, tool_call_id=payload["id"])
        for reply, payload in zip(replies, call_payloads)
    )
    return result

ถัดไปเราจะนิยามตัวอย่างแล้วแปลงมันไปสู่รูปแบบข้อความ

# Reference examples as (input text, expected extraction) pairs.
examples = [
    # A text that mentions no person: every field of the expected Person is None.
    (
        "ทะเลกว้างและสีคราม มีความลึกเกิน 20,000 ฟุต มีปลามากมายอยู่ในนั้น",
        Person(name=None, height_in_meters=None, hair_color=None),
    ),
    # A text that names a person but gives no hair color or height.
    (
        "ฟิโอน่าเดินทางไกลจากฝรั่งเศสไปสเปน",
        Person(name="Fiona", height_in_meters=None, hair_color=None),
    ),
]


# Flatten every (text, expected extraction) pair into one message sequence.
messages = []

for text, tool_call in examples:
    messages += tool_example_to_messages(
        Example(input=text, tool_calls=[tool_call])
    )

มาลองทดสอบพรอมต์ดู

# Render the prompt with the converted reference examples and a sample text.
prompt.invoke({"text": "นี่คือข้อความบางส่วน", "examples": messages})

ChatPromptValue(messages=[SystemMessage(content="คุณเป็นอัลกอริทึมในการสกัดข้อมูลที่เชี่ยวชาญ เฉพาะการสกัดข้อมูลที่เกี่ยวข้องจากเนื้อหา หากคุณไม่ทราบค่าของแอตทริบิวต์ที่ถามให้สกัดข้อมูล ให้ส่งค่า null สำหรับค่าของแอตทริบิวต์"), HumanMessage(content="ทะเลกว้างและสีคราม มีความลึกเกิน 20,000 ฟุต มีปลามากมายอยู่ในนั้น"), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'c75e57cc-8212-4959-81e9-9477b0b79126', 'type': 'function', 'function': {'name': 'Person', 'arguments': '{"name": null, "hair_color": null, "height_in_meters": null}'}}]}), ToolMessage(content='You have correctly called this tool.', tool_call_id='c75e57cc-8212-4959-81e9-9477b0b79126'), HumanMessage(content='ฟิโอน่าเดินทางไกลจากฝรั่งเศสไปสเปน'), AIMessage(content='', additional_kwargs={'tool_calls': [{'id': '69da50b5-e427-44be-b396-1e56d821c6b0', 'type': 'function', 'function': {'name': 'Person', 'arguments': '{"name": "Fiona", "hair_color": null, "height_in_meters": null}'}}]}), ToolMessage(content='You have correctly called this tool.', tool_call_id='69da50b5-e427-44be-b396-1e56d821c6b0'), HumanMessage(content='นี่คือข้อความบางส่วน')])

สร้างตัวสกัด

ที่นี่เราจะสร้างตัวสกัดโดยใช้ gpt-4.

# Chat model used for extraction, configured with temperature 0.
llm = ChatOpenAI(
    model="gpt-4-0125-preview",
    temperature=0,
)


# Extraction chain: the prompt's output is piped into the model, which is asked
# to return output following the `Data` schema via function calling.
# NOTE(review): include_raw=False presumably means only the parsed object is
# returned, which matches the `Data(...)` results shown in this guide — confirm.
runnable = prompt | llm.with_structured_output(
    schema=Data,
    method="function_calling",
    include_raw=False,
)

/Users/harrisonchase/workplace/langchain/libs/core/langchain_core/_api/beta_decorator.py:86: LangChainBetaWarning: ฟังก์ชัน `with_structured_output` อยู่ในสถานะเบต้า และกำลังอยู่ระหว่างการพัฒนา ดังนั้น API อาจเปลี่ยนแปลงได้
  warn_beta(

โดยไม่มีตัวอย่าง

สังเกตว่าแม้เราจะใช้ gpt-4 แต่โมเดลก็ยังล้มเหลวกับกรณีทดสอบที่ง่ายมาก!

# Run the same input five times with no reference examples to see how much
# the extraction result varies between calls.
text = "ระบบสุริยะใหญ่มาก แต่โลกมีแค่ 1 ดวงจันทร์"
for _ in range(5):
    print(runnable.invoke({"text": text, "examples": []}))

people=[]
people=[Person(name='earth', hair_color=None, height_in_meters=None)]
people=[Person(name='earth', hair_color=None, height_in_meters=None)]
people=[]
people=[]

พร้อมตัวอย่าง

ตัวอย่างอ้างอิงช่วยในการแก้ไขความล้มเหลว!

# Run the same input five times, this time supplying the reference examples.
text = "ระบบสุริยะใหญ่มาก แต่โลกมีแค่ 1 ดวงจันทร์"
for _ in range(5):
    print(runnable.invoke({"text": text, "examples": messages}))

people=[]
people=[]
people=[]
people=[]
people=[]

# Extract from a text that does describe a person, with the reference examples
# still supplied.
runnable.invoke(
    {
        "text": "ชื่อของฉันคือ Harrison. ผมสีดำ",
        "examples": messages,
    }
)

Data(people=[Person(name='Harrison', hair_color='black', height_in_meters=None)])

การใช้ตัวอย่างอ้างอิง