- document loader 를 통해 Document 타입 객체로 데이터 로딩
import bs4
from langchain_community.document_loaders import WebBaseLoader
bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
loader = WebBaseLoader(
web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
bs_kwargs={"parse_only": bs4_strainer},
)
docs = loader.load()
assert len(docs) == 1
print(f"Total characters: {len(docs[0].page_content)}")
RecursiveCharacterTextSplitter
로 텍스트를 청크화
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200,
add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)
print(f"Split blog post into {len(all_splits)} sub-documents.")
- vector store 에 저장 (milvus 등)
- vector store 인스턴스화할 때 어떤 임베딩 쓸 지 정해서 전달
import getpass
import os
if not os.environ.get("AZURE_OPENAI_API_KEY"):
os.environ["AZURE_OPENAI_API_KEY"] = getpass.getpass("Enter API key for Azure: ")
from langchain_openai import AzureOpenAIEmbeddings
embeddings = AzureOpenAIEmbeddings(
azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)
# milvus vector store 인스턴스화하면서 어떤 임베딩 쓸지 전달
from langchain_milvus import Milvus
vector_store = Milvus(embedding_function=embeddings)
document_ids = vector_store.add_documents(documents=all_splits)
print(document_ids[:3])
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")
example_messages = prompt.invoke(
{"context": "(context goes here)", "question": "(question goes here)"}
).to_messages()
assert len(example_messages) == 1
print(example_messages[0].content)
docs = retriever.invoke(query)
- ex) elasticsearch 도 retriever 로 사용 가능
def vector_query(search_query: str) -> Dict:
vector = embeddings.embed_query(search_query)
return {
"knn": {
"field": dense_vector_field,
"query_vector": vector,
"k": 5,
"num_candidates": 10,
}
}
vector_retriever = ElasticsearchRetriever.from_es_params(
index_name=index_name,
body_func=vector_query,
content_field=text_field,
url=es_url,
)
vector_retriever.invoke("foo")
- vector store 를
as_retriever
메소드를 통해 retriever 로 사용 가능
vectorstore = MyVectorStore()
retriever = vectorstore.as_retriever()
- retriever 가 여럿일 경우 앙상블 리시버로 결합도 가능
ensemble_retriever = EnsembleRetriever(
retrievers=[bm25_retriever, vector_store_retriever], weights=[0.5, 0.5]
)
reference