# Install dependencies from a shell, NOT inside this Python file — a bare
# `pip install ...` line at module level is a SyntaxError. In a notebook,
# prefix it with `!`; in a terminal, run:
#   pip install chromadb tiktoken transformers sentence_transformers openai langchain pypdf
# Core imports: os for environment configuration, openai for the API client,
# tiktoken for model-accurate token counting.
import os
import openai
import tiktoken
# SECURITY: 'API_KEY' is a placeholder — never hard-code a real secret in
# source. Load it from the environment or a secrets manager instead; note this
# assignment also clobbers any OPENAI_API_KEY already set in the environment.
os.environ["OPENAI_API_KEY"] = 'API_KEY'
# cl100k_base is the encoding used by the gpt-3.5-turbo / gpt-4 model family.
tokenizer = tiktoken.get_encoding("cl100k_base")
def tiktoken_len(text: str) -> int:
    """Return the number of cl100k_base tokens in *text*.

    Passed as ``length_function`` to the text splitter so that chunk sizes
    are measured in model tokens rather than raw characters.
    (Original body lines had lost their indentation — a SyntaxError as pasted;
    restored here.)
    """
    return len(tokenizer.encode(text))
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader

# Source document. Hoisted out of the constructor call into a named constant:
# an absolute user-home path makes the script non-portable, so keep it in one
# obvious place to edit (TODO: consider reading it from argv or an env var).
PDF_PATH = "/Users/kimjehyeon/study/강의대화1.pdf"

# Load the PDF and split it into one Document per page.
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

# Chunking: ~500 tokens per chunk with a 50-token overlap. Lengths are
# measured with the tiktoken-based counter so chunk sizes match what the
# model actually sees, not character counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=tiktoken_len,
)
texts = text_splitter.split_documents(pages)
# --- Text embedding ---------------------------------------------------------
# Korean SBERT model (jhgan/ko-sbert-nli) served through LangChain's
# HuggingFaceEmbeddings wrapper.
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "jhgan/ko-sbert-nli"
model_kwargs = {'device': 'cpu'}                 # run on CPU
encode_kwargs = {'normalize_embeddings': True}   # unit vectors → cosine-friendly
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Embed every chunk with `hf` and index the vectors in an in-memory Chroma
# store; `docsearch` is queried later as the QA chain's retriever.
docsearch = Chroma.from_documents(texts, hf)
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# NOTE(fix): the original bound this instance to the name `openai`, shadowing
# the `openai` module imported at the top of the file and making it unusable
# from here on. Renamed to `llm`.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    streaming=True,                                # print tokens as they arrive
    callbacks=[StreamingStdOutCallbackHandler()],
    temperature=0,                                 # deterministic answers
)

# Retrieval-augmented QA: "stuff" packs all retrieved chunks into one prompt;
# MMR retrieval fetches 10 candidates and keeps the 3 most diverse ones.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(
        search_type="mmr",
        search_kwargs={'k': 3, 'fetch_k': 10},
    ),
    return_source_documents=True,
)

query = "1대1 대화하는 법에 대해 설명해줘"
result = qa(query)