การจัดการข้อความยาว

การจัดการกับข้อความยาว

เมื่อทำงานกับไฟล์ เช่น PDF คุณมักจะพบข้อความที่ยาวเกินหน้าต่างบริบท (context window) ของโมเดลภาษาของคุณ ในการประมวลผลข้อความเช่นนี้ คุณสามารถพิจารณากลยุทธ์ต่อไปนี้:

เปลี่ยน LLM: เลือก LLM ที่รองรับหน้าต่างบริบท (context window) ที่ใหญ่กว่า
บรูทฟอร์ซ แบ่งเอกสารเป็นชิ้นเล็ก และสกัดเนื้อหาจากแต่ละชิ้น
RAG: แบ่งเอกสารเป็นชิ้น ทำดัชนีของแต่ละชิ้น และสกัดเนื้อหาเฉพาะจากชิ้นที่ดูเกี่ยวข้อง

โปรดจำไว้ว่ากลยุทธ์เหล่านี้มีข้อแลกเปลี่ยน (trade-offs) ที่แตกต่างกัน และกลยุทธ์ที่ดีที่สุดมักขึ้นอยู่กับแอปพลิเคชันที่คุณกำลังออกแบบ!

การตั้งค่า

เราต้องใช้ข้อมูลตัวอย่างบางส่วน! เรามาดาวน์โหลดบทความเกี่ยวกับ รถยนต์จากวิกิพีเดีย และโหลดมันเป็น เอกสาร ใน LangChain.

import re

import requests
from langchain_community.document_loaders import BSHTMLLoader

# Download the Wikipedia "Car" article and save it locally so it can be parsed
# by BSHTMLLoader. A timeout keeps the call from hanging indefinitely on a
# stalled connection.
response = requests.get("https://en.wikipedia.org/wiki/Car", timeout=30)
# Fail loudly on 4xx/5xx responses instead of silently writing an error page
# to disk and treating it as the article.
response.raise_for_status()
with open("car.html", "w", encoding="utf-8") as f:
    f.write(response.text)
loader = BSHTMLLoader("car.html")
# Only the first element returned by the loader is used.
document = loader.load()[0]
# Collapse every run of two or more newlines into a single newline.
document.page_content = re.sub(r"\n\n+", "\n", document.page_content)

print(len(document.page_content))

กำหนดโครงสร้าง

ที่นี่เราจะกำหนดโครงสร้างเพื่อสกัดข้อมูลสำคัญจากข้อความ

from typing import List, Optional

from langchain.chains import create_structured_output_runnable
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI


# Schema for one extracted event: a year, what happened, and the supporting
# sentence(s) from the source text. All three fields are required (`...`).
#
# NOTE(review): the class docstring and the Field descriptions below are
# presumably forwarded to the model as part of the function-calling schema, so
# they are runtime text and are kept verbatim (in Thai) — confirm before
# translating them.
class KeyDevelopment(BaseModel):
    """ข้อมูลเกี่ยวกับการพัฒนาที่สำคัญในประวัติศาสตร์ของรถยนต์"""

    # Thai description, roughly: "The year in which an important historical
    # development took place."
    year: int = Field(
        ..., description="ปีที่มีการพัฒนาที่สำคัญในประวัติศาสตร์"
    )
    # Thai description, roughly: "What happened in this year? What was the
    # development?"
    description: str = Field(
        ..., description="เกิดอะไรขึ้นในปีนี้? มีการพัฒนาอะไรบ้าง?"
    )
    # NOTE(review): the Thai wording of this description reads as garbled; the
    # intent appears to be "repeat verbatim the sentence(s) from which the year
    # and description were extracted" — consider correcting the prompt text.
    evidence: str = Field(
        ...,
        description="เหมือนเดิมให้ทวนทุกรายละเอียดของประโยคที่ใช้สกัดข้อมูลว่าปีและรายละเอียด",
    )

# Top-level extraction result: a list of KeyDevelopment records.
#
# NOTE(review): the Thai docstring (roughly "Extracted information about key
# developments in the history of cars") is presumably sent to the model as the
# schema description, so it is kept verbatim — confirm before translating.
class ExtractionData(BaseModel):
    """ข้อมูลที่สกัดได้เกี่ยวกับการพัฒนาที่สำคัญในประวัติศาสตร์ของรถยนต์"""

    # Every development found in one piece of input text.
    key_developments: List[KeyDevelopment]


# Two-message chat prompt: fixed system instructions followed by a human turn
# whose content is filled from the "text" input variable.
system_message = (
    "system",
    "คุณเป็นผู้เชี่ยวชาญที่การระบุพัฒนาการที่สำคัญในประวัติศาสตร์ของข้อความ สกัดเฉพาะข้อมูลที่สำคัญเท่านั้น อย่าสกัดข้อมูลถ้าไม่พบข้อมูลที่สำคัญในข้อความ",
)
human_message = ("human", "{text}")
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# Chat model used for the extraction, run at temperature 0.
llm = ChatOpenAI(model="gpt-4-0125-preview", temperature=0)

# Ask the model for output matching the ExtractionData schema via function
# calling, returning only the parsed result (include_raw=False).
structured_llm = llm.with_structured_output(
    schema=ExtractionData,
    method="function_calling",
    include_raw=False,
)

# Full pipeline: fill the prompt, then call the schema-constrained model.
extractor = prompt | structured_llm

/home/eugene/.pyenv/versions/3.11.2/envs/langchain_3_11/lib/python3.11/site-packages/langchain_core/_api/beta_decorator.py:86: LangChainBetaWarning: The function `with_structured_output` is in beta. It is actively being worked on, so the API may change.
  warn_beta(

วิธีการบรูทฟอร์ซ

แบ่งเอกสารออกเป็นชิ้น ๆ โดยให้แต่ละชิ้นมีขนาดพอดีกับหน้าต่างบริบท (context window) ของ LLM

from langchain_text_splitters import TokenTextSplitter

# Splitter configured with a chunk size of 2000 and an overlap of 20 between
# neighbouring chunks.
# NOTE(review): both values are presumably token counts — confirm against the
# TokenTextSplitter documentation.
text_splitter = TokenTextSplitter(chunk_size=2000, chunk_overlap=20)

# Split the whole article text into a list of chunks.
page_text = document.page_content
texts = text_splitter.split_text(page_text)

ใช้เมธอด .batch() เพื่อรันการสกัดข้อมูลจากแต่ละชิ้นแบบขนานพร้อมกัน!

เคล็ดลับ

คุณสามารถใช้ .batch() เพื่อทำการสกัดข้อมูลพร้อมกัน! batch ใช้ threadpool ภายในเพื่อช่วยโปรแกรมของคุณทำงานพร้อมกัน

ถ้าโมเดลของคุณให้บริการผ่าน API วิธีนี้น่าจะช่วยเพิ่มความเร็วของกระบวนการสกัดข้อมูลของคุณ!

# Only the first three chunks are processed here.
first_few = texts[:3]

# One prompt input per chunk, keyed by the "text" variable the prompt expects.
batch_inputs = [{"text": chunk} for chunk in first_few]

# Limit the concurrency by passing max_concurrency in the batch config!
batch_config = {"max_concurrency": 5}
extractions = extractor.batch(batch_inputs, batch_config)

การรวมผลลัพธ์

หลังจากสกัดข้อมูลจากแต่ละชิ้นแล้ว เราจะต้องรวมผลการสกัดทั้งหมดเข้าด้วยกัน

# Flatten the per-chunk results into one list, preserving chunk order and the
# order of developments within each chunk.
key_developments = [
    development
    for chunk_result in extractions
    for development in chunk_result.key_developments
]

# Show at most the first 20 merged records.
key_developments[:20]

[KeyDevelopment(year=1966, description="The Toyota Corolla began production, recognized as the world's best-selling automobile.", evidence="The Toyota Corolla has been in production since 1966 and is recognized as the world's best-selling automobile."),
 KeyDevelopment(year=1769, description='Nicolas-Joseph Cugnot built the first steam-powered road vehicle.', evidence='French inventor Nicolas-Joseph Cugnot built the first steam-powered road vehicle in 1769.'),
 KeyDevelopment(year=1808, description='François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile.', evidence='French-born Swiss inventor François Isaac de Rivaz designed and constructed the first internal combustion-powered automobile in 1808.'),
 KeyDevelopment(year=1886, description='Carl Benz patented his Benz Patent-Motorwagen, inventing the modern car.', evidence='The modern car—a practical, marketable automobile for everyday use—was invented in 1886, when German inventor Carl Benz patented his Benz Patent-Motorwagen.'),
 KeyDevelopment(year=1908, description='The 1908 Model T, an affordable car for the masses, was manufactured by the Ford Motor Company.', evidence='One of the first cars affordable by the masses was the 1908 Model T, an American car manufactured by the Ford Motor Company.'),
 KeyDevelopment(year=1881, description='Gustave Trouvé demonstrated a three-wheeled car powered by electricity.', evidence='In November 1881, French inventor Gustave Trouvé demonstrated a three-wheeled car powered by electricity at the International Exposition of Electricity.'),
 KeyDevelopment(year=1888, description="Bertha Benz undertook the first road trip by car to prove the road-worthiness of her husband's invention.", evidence="In August 1888, Bertha Benz, the wife of Carl Benz, undertook the first road trip by car, to prove the road-worthiness of her husband's invention."),
 KeyDevelopment(year=1896, description='Benz designed and patented the first internal-combustion flat engine, called boxermotor.', evidence='In 1896, Benz designed and patented the first internal-combustion flat engine, called boxermotor.'),
 KeyDevelopment(year=1897, description='Nesselsdorfer Wagenbau produced the Präsident automobil, one of the first factory-made cars in the world.', evidence='The first motor car in central Europe and one of the first factory-made cars in the world, was produced by Czech company Nesselsdorfer Wagenbau (later renamed to Tatra) in 1897, the Präsident automobil.'),
 KeyDevelopment(year=1890, description='Daimler Motoren Gesellschaft (DMG) was founded by Daimler and Maybach in Cannstatt.', evidence='Daimler and Maybach founded Daimler Motoren Gesellschaft (DMG) in Cannstatt in 1890.'),
 KeyDevelopment(year=1902, description='A new model DMG car was produced and named Mercedes after the Maybach engine.', evidence='Two years later, in 1902, a new model DMG car was produced and the model was named Mercedes after the Maybach engine, which generated 35 hp.'),
 KeyDevelopment(year=1891, description='Auguste Doriot and Louis Rigoulot completed the longest trip by a petrol-driven vehicle using a Daimler powered Peugeot Type 3.', evidence='In 1891, Auguste Doriot and his Peugeot colleague Louis Rigoulot completed the longest trip by a petrol-driven vehicle when their self-designed and built Daimler powered Peugeot Type 3 completed 2,100 kilometres (1,300 mi) from Valentigney to Paris and Brest and back again.'),
 KeyDevelopment(year=1895, description='George Selden was granted a US patent for a two-stroke car engine.', evidence='After a delay of 16 years and a series of attachments to his application, on 5 November 1895, Selden was granted a US patent (U.S. patent 549,160) for a two-stroke car engine.'),
 KeyDevelopment(year=1893, description='The first running, petrol-driven American car was built and road-tested by the Duryea brothers.', evidence='In 1893, the first running, petrol-driven American car was built and road-tested by the Duryea brothers of Springfield, Massachusetts.'),
 KeyDevelopment(year=1897, description='Rudolf Diesel built the first diesel engine.', evidence='In 1897, he built the first diesel engine.'),
 KeyDevelopment(year=1901, description='Ransom Olds started large-scale, production-line manufacturing of affordable cars at his Oldsmobile factory.', evidence='Large-scale, production-line manufacturing of affordable cars was started by Ransom Olds in 1901 at his Oldsmobile factory in Lansing, Michigan.'),
 KeyDevelopment(year=1913, description="Henry Ford began the world's first moving assembly line for cars at the Highland Park Ford Plant.", evidence="This concept was greatly expanded by Henry Ford, beginning in 1913 with the world's first moving assembly line for cars at the Highland Park Ford Plant."),
 KeyDevelopment(year=1914, description="Ford's assembly line worker could buy a Model T with four months' pay.", evidence="In 1914, an assembly line worker could buy a Model T with four months' pay."),
 KeyDevelopment(year=1926, description='Fast-drying Duco lacquer was developed, allowing for a variety of car colors.', evidence='Only Japan black would dry fast enough, forcing the company to drop the variety of colours available before 1913, until fast-drying Duco lacquer was developed in 1926.')]

วิธีการใช้ RAG

อีกแนวคิดหนึ่งที่เรียบง่ายคือการแบ่งข้อความออกเป็นชิ้นเล็ก ๆ แต่แทนที่จะสกัดข้อมูลจากทุกชิ้น ให้มุ่งเน้นเฉพาะชิ้นที่เกี่ยวข้องมากที่สุด

คำเตือน

การระบุว่าชิ้นไหนเกี่ยวข้องนั้นอาจเป็นเรื่องยาก

ตัวอย่างเช่น ในบทความเกี่ยวกับรถยนต์ (car) ที่เราใช้ที่นี่ เนื้อหาส่วนใหญ่ของบทความประกอบด้วยข้อมูลพัฒนาการสำคัญ ดังนั้นเมื่อใช้ RAG เราจึงมีโอกาสที่จะสูญเสียข้อมูลที่เกี่ยวข้องเป็นจำนวนมาก

เราขอแนะนำให้ลองทดสอบกับกรณีการใช้ของคุณและกำหนดว่าวิธีการนี้เหมาะสมหรือไม่

นี่คือตัวอย่างง่าย ๆ ที่ใช้ FAISS vectorstore

from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Split the article again and build a FAISS vector store from the chunks,
# embedding them with OpenAIEmbeddings.
texts = text_splitter.split_text(document.page_content)
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_texts(texts, embedding=embeddings)

# Only extract from the first document: k=1 asks the retriever for one result.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

ในกรณีนี้ตัว RAG extractor จะดูเฉพาะเอกสารบนสุด

# Retrieve documents for the query and fetch the content of the top document.
top_document_text = retriever | (lambda docs: docs[0].page_content)

# Feed that content to the extractor as its "text" input.
rag_extractor = {"text": top_document_text} | extractor

results = rag_extractor.invoke("Key developments associated with cars")

# Print each extracted development on its own line.
for development in results.key_developments:
    print(development)

year=1924 description="Germany's first mass-manufactured car, the Opel 4PS Laubfrosch, was produced, making Opel the top car builder in Germany with 37.5% of the market." evidence="Germany's first mass-manufactured car, the Opel 4PS Laubfrosch (Tree Frog), came off the line at Rüsselsheim in 1924, soon making Opel the top car builder in Germany, with 37.5 per cent of the market."
year=1925 description='Morris had 41% of total British car production, dominating the market.' evidence='in 1925, Morris had 41 per cent of total British car production.'
year=1925 description='Citroën, Renault, and Peugeot produced 550,000 cars in France, dominating the market.' evidence="Citroën did the same in France, coming to cars in 1919; between them and other cheap cars in reply such as Renault's 10CV and Peugeot's 5CV, they produced 550,000 cars in 1925."
year=2017 description='Production of petrol-fuelled cars peaked.' evidence='Production of petrol-fuelled cars peaked in 2017.'

ปัญหาที่พบบ่อย

วิธีการต่าง ๆ มีข้อดีและข้อเสียของตัวเองเกี่ยวกับค่าใช้จ่าย ความเร็ว และความแม่นยำ

ระวังปัญหาเหล่านี้:

การแบ่งเนื้อหาเป็นชิ้นอาจทำให้ LLM ไม่สามารถสกัดข้อมูลได้ หากข้อมูลนั้นกระจายอยู่ในหลายชิ้น
การกำหนดให้ชิ้นทับซ้อนกันมากอาจทำให้ข้อมูลเดียวกันถูกสกัดซ้ำ ดังนั้นควรเตรียมลบข้อมูลที่ซ้ำกัน!
LLM อาจสร้างข้อมูลขึ้นมาเอง หากคุณมองหาข้อเท็จจริงเพียงข้อเดียวในข้อความขนาดใหญ่ด้วยวิธีบรูทฟอร์ซ คุณอาจได้ข้อมูลที่ถูกสร้างขึ้นมามากขึ้น