from llama_index import SimpleDirectoryReader, Document
from llama_index.node_parser import SentenceWindowNodeParser
# Andrew Ng's eBook "How to Build a Career in AI"
documents = SimpleDirectoryReader(
    input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()
# Merge all pages into a single Document so sentence windows can span page boundaries
document = Document(text="\n\n".join([doc.text for doc in documents]))
# create the sentence window node parser w/ default settings
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
text = " How was your day? I was the best. What are you going to do? I will read a book. Im not busy. Take your time, please. I really don’t know what to say. I’m sorry. Did you clean this place? It’s really clean."
nodes = node_parser.get_nodes_from_documents([Document(text=text)])
print([x.text for x in nodes])
[' How was your day? ', 'I was the best. ', 'What are you going to do? ', 'I will read a book. ', 'Im not busy. ', 'Take your time, please. ', 'I really don’t know what to say. ', 'I’m sorry. ', 'Did you clean this place? ', 'It’s really clean.']
print(nodes[0].text)
print(nodes[4].text)
How was your day?
Im not busy.
print(nodes[0].metadata["window"])
print(nodes[4].metadata["window"])
How was your day? I was the best. What are you going to do?
What are you going to do? I will read a book. Im not busy. Take your time, please. I really don’t know what to say. I’m sorry.
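Each node also keeps its own single sentence under the original_text metadata key configured above. A quick sanity check (not part of the original notebook) should print just the bare sentence:
print(nodes[4].metadata["original_text"])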
from llama_index.llms import OpenAI
from llama_index import ServiceContext
from llama_index import VectorStoreIndex
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    # the "local:" prefix runs the HuggingFace embedding model locally
    embed_model="local:BAAI/bge-small-en-v1.5",
    # embed_model="local:BAAI/bge-large-en-v1.5",
    node_parser=node_parser,
)
sentence_index = VectorStoreIndex.from_documents(
    [document], service_context=sentence_context
)
# Persist the generated index to disk.
sentence_index.storage_context.persist(persist_dir="./sentence_index")
# This block is optional: if an index file already exists, it is loaded;
# if not, the index is rebuilt and persisted.
import os
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage
if not os.path.exists("./sentence_index"):
    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )
    sentence_index.storage_context.persist(persist_dir="./sentence_index")
else:
    sentence_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./sentence_index"),
        service_context=sentence_context,
    )
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.schema import NodeWithScore
from copy import deepcopy
postproc = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)
# The nodes produced by the sentence-window parsing above
scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
# Keep copies, since the postprocessor replaces each node's text in place
nodes_old = [deepcopy(n) for n in nodes]
# The original single sentence
nodes_old[4].text
'Im not busy. '
replaced_nodes = postproc.postprocess_nodes(scored_nodes)
print(replaced_nodes[4].text)
I was the best. What are you going to do? I will read a book. Im not busy. Take your time, please. I really don’t know what to say.
from llama_index.indices.postprocessor import SentenceTransformerRerank
# BAAI/bge-reranker-base
# link: https://huggingface.co/BAAI/bge-reranker-base
rerank = SentenceTransformerRerank(
    top_n=2, model="BAAI/bge-reranker-base"
)
from llama_index import QueryBundle
from llama_index.schema import TextNode, NodeWithScore
query = QueryBundle("I want a dog.")
# The scores are intentionally swapped so the query-relevant text scores lower.
# This is exactly the case (relevant text with a low retrieval score) where a reranker is needed.
scored_nodes = [
    NodeWithScore(node=TextNode(text="This is a cat"), score=0.6),
    NodeWithScore(node=TextNode(text="This is a dog"), score=0.4),
]
reranked_nodes = rerank.postprocess_nodes(
    scored_nodes, query_bundle=query
)
print([(x.text, x.score) for x in reranked_nodes])
[('This is a dog', 0.91827345), ('This is a cat', 0.0014040739)]
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postproc, rerank]
)
window_response = sentence_window_engine.query(
    "What are the keys to building a career in AI?"
)
from llama_index.response.notebook_utils import display_response
display_response(window_response)
Final Response:
The keys to building a career in AI are learning foundational technical skills, working on projects, and finding a job, all of which is supported by being part of a community.
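To see which chunks the answer was grounded in, the response's source_nodes can be inspected; after the two postprocessors these are the reranked, window-expanded chunks. A small check, not part of the original notebook:
# Inspect the reranked, window-replaced source chunks behind the answer
for source_node in window_response.source_nodes:
    print(source_node.score, source_node.text[:80])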
import os
from llama_index import (
    ServiceContext,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import (
    MetadataReplacementPostProcessor,
    SentenceTransformerRerank,
)
def build_sentence_window_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index",
):
    # Create the sentence window node parser with default settings
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )
    # Build and persist the index on first run; load it from disk afterwards
    if not os.path.exists(save_dir):
        sentence_index = VectorStoreIndex.from_documents(
            documents, service_context=sentence_context
        )
        sentence_index.storage_context.persist(persist_dir=save_dir)
    else:
        sentence_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )
    return sentence_index
def get_sentence_window_query_engine(
    sentence_index, similarity_top_k=6, rerank_top_n=2
):
    # Define postprocessors: window replacement first, then reranking
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )
    sentence_window_engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return sentence_window_engine
from llama_index.llms import OpenAI
index = build_sentence_window_index(
    [document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    save_dir="./sentence_index",
)
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)
Experiment with window sizes of 1, 3, and 5.
Evaluate each configuration on the RAG Triad.
Note the trade-off between token usage/cost and context relevance.
Note the trade-off between window size and groundedness (also assessed indirectly through context relevance): if the retrieval step fails to gather enough relevant context, the LLM fills the gaps in the completion step from its pre-trained knowledge instead of the retrieved context, which lowers groundedness.
Note the relationship between context relevance and groundedness: for the reason above, when context relevance falls, groundedness falls with it. As context relevance rises, groundedness rises with it up to a point; but once the context grows too large, groundedness can drop even with a high context relevance score, because the LLM is overwhelmed by the sheer amount of context and falls back on its pre-trained knowledge.
# Pre-written evaluation questions
eval_questions = []
with open('generated_questions.text', 'r') as file:
    for line in file:
        # Strip the trailing newline and collect the question
        item = line.strip()
        eval_questions.append(item)
from trulens_eval import Tru
def run_evals(eval_questions, tru_recorder, query_engine):
    # Query the engine on each question while TruLens records the run
    for question in eval_questions:
        with tru_recorder as recording:
            response = query_engine.query(question)
from utils import get_prebuilt_trulens_recorder  # helper function predefined for the course
Tru().reset_database()  # reset previously stored eval results
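For reference, get_prebuilt_trulens_recorder wires up the three RAG Triad feedbacks (Answer Relevance, Context Relevance, Groundedness) and wraps the query engine in a TruLlama recorder. A minimal sketch under the trulens_eval 0.x API; the exact names in the course's utils.py may differ:
import numpy as np
from trulens_eval import Feedback, TruLlama
from trulens_eval.feedback import Groundedness
from trulens_eval.feedback.provider.openai import OpenAI as fOpenAI

def get_prebuilt_trulens_recorder_sketch(query_engine, app_id):
    provider = fOpenAI()
    # Answer Relevance: is the final answer relevant to the question?
    qa_relevance = Feedback(
        provider.relevance_with_cot_reasons, name="Answer Relevance"
    ).on_input_output()
    # Context Relevance: are the retrieved chunks relevant to the question?
    context_relevance = (
        Feedback(provider.qs_relevance_with_cot_reasons, name="Context Relevance")
        .on_input()
        .on(TruLlama.select_source_nodes().node.text)
        .aggregate(np.mean)
    )
    # Groundedness: is the answer supported by the retrieved context?
    grounded = Groundedness(groundedness_provider=provider)
    groundedness = (
        Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
        .on(TruLlama.select_source_nodes().node.text)
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
    )
    return TruLlama(
        query_engine,
        app_id=app_id,
        feedbacks=[qa_relevance, context_relevance, groundedness],
    )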
sentence_index_1 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    save_dir="sentence_index_1",
)
sentence_window_engine_1 = get_sentence_window_query_engine(
    sentence_index_1
)
tru_recorder_1 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1,
    app_id='sentence window engine 1'
)
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)
sentence_index_3 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index_3",
)
sentence_window_engine_3 = get_sentence_window_query_engine(
    sentence_index_3
)
tru_recorder_3 = get_prebuilt_trulens_recorder(
    sentence_window_engine_3,
    app_id='sentence window engine 3'
)
run_evals(eval_questions, tru_recorder_3, sentence_window_engine_3)
sentence_index_5 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=5,
    save_dir="sentence_index_5",
)
sentence_window_engine_5 = get_sentence_window_query_engine(
    sentence_index_5
)
tru_recorder_5 = get_prebuilt_trulens_recorder(
    sentence_window_engine_5,
    app_id='sentence window engine 5'
)
run_evals(eval_questions, tru_recorder_5, sentence_window_engine_5)
Tru().run_dashboard()
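The same comparison can also be pulled directly into the notebook; passing app_ids=[] to get_leaderboard returns aggregate feedback scores across all recorded apps:
# Aggregate feedback scores for all three engines, without the dashboard
Tru().get_leaderboard(app_ids=[])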
Source: https://learn.deeplearning.ai/building-evaluating-advanced-rag/lesson/4/sentence-window-retrieval