pip install langchain pypdf chromadb tiktoken transformers sentence-transformers
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
# cl100k_base is the tokenizer used by OpenAI's recent chat models
tokenizer = tiktoken.get_encoding("cl100k_base")

# Token-based length function so that chunk_size is measured in tokens rather than characters
def tiktoken_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("/Users/kimjehyeon/study/강의대화1.pdf")
pages = loader.load_and_split()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, length_function=tiktoken_len)
docs = text_splitter.split_documents(pages)
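A quick sanity check on the split (reusing the docs and tiktoken_len defined above) shows how many chunks were produced and how long the first chunk is in tokens:
print(len(docs))                            # number of chunks
print(tiktoken_len(docs[0].page_content))   # token length of the first chunk, typically at most 500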
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "jhgan/ko-sbert-nli"   # Korean sentence-BERT model
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
# Store the split documents in a Chroma vector store
db = Chroma.from_documents(docs, hf)
# query
query = "1대1 대화 방법이 뭔가요?"
# Retrieve the chunks most relevant to the query
docs = db.similarity_search(query)
print(docs[0].page_content)
The documents are embedded with the HuggingFace embedding model hf, the embeddings are stored as vectors through a Chroma object, and the chunk most similar to the query is printed.
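To make the embedding step itself visible, here is a minimal sketch that reuses the hf and db objects defined above; it embeds the query directly and asks Chroma for the top three chunks:
vec = hf.embed_query(query)                   # the query text becomes a dense vector
print(len(vec))                               # embedding dimension of the model
top_docs = db.similarity_search(query, k=3)   # top-3 most similar chunks
for d in top_docs:
    print(d.page_content[:100])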
In most cases, you will want to save the vector store of your documents to your own disk and load it whenever it is needed. The persist() method writes the vector store to local storage, and by passing that local directory when constructing the Chroma object you can load it back later.
# save to disk
db2 = Chroma.from_documents(docs, hf, persist_directory="./chroma_db")
db2.persist()   # write the vector store to ./chroma_db
docs = db2.similarity_search(query)
# load from disk
db3 = Chroma(persist_directory="./chroma_db", embedding_function=hf)
docs = db3.similarity_search(query)
print(docs[0].page_content)
docs = db3.similarity_search_with_relevance_scores(query, k=3)
print("가장 유사한 문서:\n\n {}\n\n".format(docs[0][0].page_content))
print("문서 유사도: \n {}".format(docs[0][1]))
pip install faiss-cpu # For CPU Installation
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
tokenizer = tiktoken.get_encoding("cl100k_base")
def tiktoken_len(text):
    tokens = tokenizer.encode(text)
    return len(tokens)
loader = PyPDFLoader("/Users/kimjehyeon/study/강의대화1.pdf")
pages = loader.load_and_split()
# split it into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0, length_function=tiktoken_len)
docs = text_splitter.split_documents(pages)
from langchain.embeddings import HuggingFaceEmbeddings
model_name = "jhgan/ko-sbert-nli"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
ko = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
# Build a FAISS index over the document embeddings
db = FAISS.from_documents(docs, ko)
query = "1대1 대화 하는 방법이 뭐야?"
docs = db.similarity_search(query)
print(docs[0].page_content)
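A FAISS index can also be saved to disk and reloaded, analogous to the Chroma example above. This is a minimal sketch using LangChain's save_local/load_local; the ./faiss_db path is just an example, and depending on your LangChain version load_local may additionally require allow_dangerous_deserialization=True:
db.save_local("./faiss_db")                 # write the index and docstore to disk
db4 = FAISS.load_local("./faiss_db", ko)    # reload with the same embedding model
docs = db4.similarity_search(query)
print(docs[0].page_content)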