[LangChain] RAG

Woong·2025년 2월 21일
0

Python / Machine Learning

목록 보기
24/27
  • RAG
# Example: loading a web page with WebBaseLoader.
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Restrict HTML parsing to the post title, headers, and body content.
post_only = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content")
)
page_loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={"parse_only": post_only},
)
docs = page_loader.load()

# A single URL should yield exactly one Document.
assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")
  • RecursiveCharacterTextSplitter 로 텍스트를 청크화
    • 임베딩하기 적합하도록 chunk 로 분할
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded document into overlapping chunks sized for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # maximum characters per chunk
    chunk_overlap=200,     # characters shared between neighbouring chunks
    add_start_index=True,  # record each chunk's offset in the source document
)
all_splits = splitter.split_documents(docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")
  • vector store 에 저장 (milvus 등)
    • vector store 인스턴스화할 때 어떤 임베딩 쓸 지 정해서 전달
# Example: using Azure OpenAI as the embedding model.
import getpass
import os

# Prompt for the API key only when it is not already set in the environment.
# (Original used 2-space indentation here, inconsistent with the file's 4-space style.)
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")

from langchain_openai import AzureOpenAIEmbeddings

# NOTE(review): the three variables below are read with os.environ[...] and will
# raise KeyError if unset — unlike the API key, they are not prompted for.
embeddings = AzureOpenAIEmbeddings(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)
# Instantiate the Milvus vector store, passing in which embedding model to use.
from langchain_milvus import Milvus

vector_store = Milvus(embedding_function=embeddings)
# Insert the chunked documents; returns one id per stored document.
document_ids = vector_store.add_documents(documents=all_splits)

print(document_ids[:3])
  • 검색
    • ex) 채팅 모듈로 사용자 질문에 답할 경우
from langchain import hub

# Pull a ready-made RAG prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Fill the template with placeholder values, then inspect the rendered message.
filled_prompt = prompt.invoke(
    {"context": "(context goes here)", "question": "(question goes here)"}
)
example_messages = filled_prompt.to_messages()

# This template renders to a single message.
assert len(example_messages) == 1
print(example_messages[0].content)
# Fetch documents relevant to a user query.
# NOTE(review): `retriever` and `query` are not defined in this snippet, and
# this rebinds `docs` from the loader example above — presumably intentional
# in the original article's flow.
docs = retriever.invoke(query)
def vector_query(search_query: str, k: int = 5, num_candidates: int = 10) -> dict:
    """Build an Elasticsearch kNN query body for *search_query*.

    The query text is embedded with the same embedding model used at
    indexing time, then wrapped in a ``knn`` clause.

    Args:
        search_query: Natural-language query to embed.
        k: Number of nearest neighbours to return (was hard-coded to 5).
        num_candidates: Candidates considered before final ranking
            (was hard-coded to 10).

    Returns:
        A dict usable as the body of an Elasticsearch search request.
    """
    # Fix: the original annotated the return as ``Dict``, which is never
    # imported in this file and would raise NameError; use builtin ``dict``.
    vector = embeddings.embed_query(search_query)  # same embeddings as for indexing
    return {
        "knn": {
            "field": dense_vector_field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
        }
    }


# Wire the query builder into a retriever backed by Elasticsearch.
# NOTE(review): index_name, text_field and es_url are assumed to be defined
# elsewhere in the surrounding example — not visible here.
vector_retriever = ElasticsearchRetriever.from_es_params(
    index_name=index_name,
    body_func=vector_query,
    content_field=text_field,
    url=es_url,
)

# Smoke-test the retriever with a dummy query.
vector_retriever.invoke("foo")
# Any vector store exposes the retriever interface via as_retriever().
# NOTE(review): MyVectorStore is a placeholder class not defined in this snippet.
vectorstore = MyVectorStore()
retriever = vectorstore.as_retriever()
  • retriever 가 여럿일 경우 앙상블 리트리버(EnsembleRetriever)로 결합도 가능
# Initialize the ensemble retriever: combines a BM25 (keyword) retriever with a
# vector-store retriever, weighting their relevance scores equally (0.5 each).
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_store_retriever], weights=[0.5, 0.5]
)

reference

0개의 댓글

관련 채용 정보