[Langchain] Document Loader
[Langchain] Text Splitter
from langchain_community.document_loaders import PyPDFLoader
# Initialize the PDF loader
pdf_loader = PyPDFLoader('./data/transformer.pdf')
# Synchronous loading
pdf_docs = pdf_loader.load()
print(f'Number of PDF documents: {len(pdf_docs)}')
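For large PDFs, the pages can also be consumed lazily instead of all at once; a minimal sketch using the loader's lazy_load iterator:
# Lazy loading: pages are yielded one at a time instead of being loaded all at once
first_page = next(pdf_loader.lazy_load())
print(first_page.metadata)  # e.g. {'source': './data/transformer.pdf', 'page': 0, ...}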
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
# Create a Hugging Face embedding model
embeddings_huggingface = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
# Access the underlying tokenizer directly
tokenizer = embeddings_huggingface._client.tokenizer
# Example of using the tokenizer
text = "테스트 텍스트입니다."  # Korean for "This is test text."
tokens = tokenizer(text)
print(tokens)
# Check the tokenizer configuration
print(tokenizer.model_max_length)  # maximum token length
print(tokenizer.vocab_size)        # vocabulary size
- Output
{'input_ids': [0, 153924, 239355, 5826, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1]}
8192
250002
# Helper function that counts the number of tokens in a text
def count_tokens(text):
    return len(tokenizer(text)['input_ids'])
# Count the tokens
text = "테스트 텍스트입니다."
print(count_tokens(text))
- Output
6
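The 6 ids printed earlier include the special tokens that the underlying XLM-RoBERTa tokenizer adds around the sentence (id 0 for <s>, id 2 for </s>). If only the content tokens matter, a variant with add_special_tokens=False can be used; a small sketch (count_content_tokens is a hypothetical helper name):
# Count tokens without the automatically added special tokens
def count_content_tokens(text):
    return len(tokenizer(text, add_special_tokens=False)['input_ids'])

print(count_content_tokens(text))  # 4 for the sample sentence above (6 minus BOS/EOS)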
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Create the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=count_tokens,        # split based on token count
    separators=["\n\n", "\n", " ", ""],  # separators, applied recursively in this order
)
# Split the documents
chunks = text_splitter.split_documents(pdf_docs)
print(f"์์ฑ๋ ํ
์คํธ ์ฒญํฌ ์: {len(chunks)}")
print(f"๊ฐ ์ฒญํฌ์ ๊ธธ์ด: {list(len(chunk.page_content) for chunk in chunks)}")
print(f"๊ฐ ์ฒญํฌ์ ํ ํฐ ์: {list(count_tokens(chunk.page_content) for chunk in chunks)}")
- Output
Number of text chunks created: 38
Length of each chunk: [1378, 1796, 1831, 1857, 1292, 1609, 503, 1554, 1278, 1362, 1608, 833, 1418, 1680, 999, 1764, 1604, 539, 1219, 1645, 926, 1213, 1688, 716, 1409, 1626, 624, 1411, 1437, 913, 1493, 1337, 845, 812, 470, 438, 470, 441]
Token count of each chunk: [336, 415, 405, 419, 327, 424, 127, 388, 294, 384, 411, 204, 419, 417, 226, 419, 395, 149, 390, 400, 221, 356, 411, 181, 394, 405, 188, 424, 399, 277, 420, 409, 250, 190, 131, 117, 131, 113]
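As an aside, instead of wiring count_tokens in as length_function, the splitter can also be built directly from the Hugging Face tokenizer; a sketch that should give comparable token-based chunking:
text_splitter_hf = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,                           # the BGE-M3 tokenizer obtained above
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)
chunks_hf = text_splitter_hf.split_documents(pdf_docs)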
# Inspect the text of one chunk
print(chunks[2].page_content)
- Output
1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural networks
in particular, ...
The Transformer allows for significantly more parallelization and can reach a new state of the art in
from langchain_chroma import Chroma
# Create a Chroma vector store
chroma_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings_huggingface,              # use the Hugging Face embeddings
    collection_name="db_transformer",              # collection name
    persist_directory="./chroma_db",
    collection_metadata={'hnsw:space': 'cosine'},  # choose among l2, ip, cosine
)
# Inspect the data currently stored in the collection
chroma_db.get()
- Output
{'ids': ['...' ,
...],
'embeddings': None,
'documents': ['Provided ...',
...],
'uris': None,
'data': None,
'metadatas': [
{'page': 0, 'page_label': 1, 'source': './data/transformer.pdf'},
...],
...
}
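Since persist_directory was given, the same collection can be reopened later without re-embedding; a minimal sketch, assuming the same directory and collection name:
# Reopen the persisted collection in a new session
chroma_db = Chroma(
    collection_name="db_transformer",
    embedding_function=embeddings_huggingface,
    persist_directory="./chroma_db",
)
print(len(chroma_db.get()['ids']))  # should match the number of stored chunks (38 here)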
- k: the number of documents to return
chroma_k_retriever = chroma_db.as_retriever(
    search_kwargs={"k": 2},
)
query = "๋ํ์ ์ธ ์ํ์ค ๋ชจ๋ธ์ ์ด๋ค ๊ฒ๋ค์ด ์๋์?"
retrieved_docs = chroma_k_retriever.invoke(query)
print(f"์ฟผ๋ฆฌ: {query}")
print("๊ฒ์ ๊ฒฐ๊ณผ:")
for i, doc in enumerate(retrieved_docs, 1):
print(f"-{i}-\n{doc.page_content[:100]}...{doc.page_content[-100:]} [์ถ์ฒ: {doc.metadata['source']}]")
print("-" * 100)
- Output
Query: 대표적인 시퀀스 모델은 어떤 것들이 있나요?
Search results:
-1-
1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural...he Transformer allows for significantly more parallelization and can reach a new state of the art in [Source: ./data/transformer.pdf]
----------------------------------------------------------------------------------------------------
-2-
In contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-
Parse... 2016.
[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly [Source: ./data/transformer.pdf]
----------------------------------------------------------------------------------------------------
- search_type: the search strategy (similarity, mmr, similarity_score_threshold)
- search_kwargs={'score_threshold': 0.5}: sets the threshold score (only documents scoring 0.5 or higher are retrieved)
from langchain_community.utils.math import cosine_similarity
chroma_threshold_retriever = chroma_db.as_retriever(
    search_type='similarity_score_threshold',        # cosine similarity
    search_kwargs={'score_threshold': 0.5, 'k': 2},  # only return documents scoring 0.5 or higher
)
query = "๋ํ์ ์ธ ์ํ์ค ๋ชจ๋ธ์ ์ด๋ค ๊ฒ๋ค์ด ์๋์?"
retrieved_docs = chroma_threshold_retriever.invoke(query)
print(f"์ฟผ๋ฆฌ: {query}")
print("๊ฒ์ ๊ฒฐ๊ณผ:")
for i, doc in enumerate(retrieved_docs, 1):
score = cosine_similarity(
[embeddings_huggingface.embed_query(query)],
[embeddings_huggingface.embed_query(doc.page_content)]
)[0][0]
print(f"-{i}-\n[์ ์ฌ๋: {score}]\n{doc.page_content[:100]}...{doc.page_content[-100:]}")
print("-" * 100)
- Output
Query: 대표적인 시퀀스 모델은 어떤 것들이 있나요?
Search results:
-1-
[Similarity: 0.5069071561705342]
1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural...he Transformer allows for significantly more parallelization and can reach a new state of the art in
----------------------------------------------------------------------------------------------------
-2-
[Similarity: 0.5020665604450864]
In contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-
Parse... 2016.
[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly
----------------------------------------------------------------------------------------------------
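Re-embedding each result just to display a score is a roundabout check; the vector store can also return relevance scores directly. With the cosine space configured above, the relevance score is 1 - cosine distance, so the numbers should be close to those printed above. A small sketch:
# Scores straight from the vector store
docs_with_scores = chroma_db.similarity_search_with_relevance_scores(query, k=2)
for doc, score in docs_with_scores:
    print(f"[Similarity: {score:.4f}] {doc.page_content[:80]}...")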
- fetch_k: the number of documents passed to the MMR algorithm
- lambda_mult: diversity of the results (1: minimum diversity, 0: maximum diversity)
# MMR - considers diversity (the smaller lambda_mult is, the more diverse the results)
chroma_mmr = chroma_db.as_retriever(
    search_type='mmr',
    search_kwargs={
        'k': 3,              # number of documents to return
        'fetch_k': 8,        # number of documents passed to the MMR algorithm (fetch_k > k)
        'lambda_mult': 0.5,  # degree of diversity (1 means minimum, 0 maximum diversity; default is 0.5)
    },
)
query = "๋ํ์ ์ธ ์ํ์ค ๋ชจ๋ธ์ ์ด๋ค ๊ฒ๋ค์ด ์๋์?"
retrieved_docs = chroma_mmr.invoke(query)
print(f"์ฟผ๋ฆฌ: {query}")
print("๊ฒ์ ๊ฒฐ๊ณผ:")
for i, doc in enumerate(retrieved_docs, 1):
score = cosine_similarity(
[embeddings_huggingface.embed_query(query)],
[embeddings_huggingface.embed_query(doc.page_content)]
)[0][0]
print(f"-{i}-\n[์ ์ฌ๋: {score}]\n{doc.page_content[:100]}...{doc.page_content[-100:]}")
print("-" * 100)
- Output
Query: 대표적인 시퀀스 모델은 어떤 것들이 있나요?
Search results:
-1-
[Similarity: 0.5069071561705342]
1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural...he Transformer allows for significantly more parallelization and can reach a new state of the art in
----------------------------------------------------------------------------------------------------
-2-
[Similarity: 0.47915489021788504]
Table 1: Maximum path lengths, per-layer complexity and minimum number of sequential operations
for ...ng
corresponds to a sinusoid. The wavelengths form a geometric progression from 2π to 10000 · 2π. We
----------------------------------------------------------------------------------------------------
-3-
[Similarity: 0.4709169133567156]
from our models and present and discuss examples in the appendix. Not only do individual attention
h..., according to the formula:
lrate = d_model^(-0.5) · min(step_num^(-0.5), step_num · warmup_steps^(-1.5)) (3)
----------------------------------------------------------------------------------------------------
# Check the metadata
chunks[0].metadata
- Output
{'source': './data/transformer.pdf', 'page': 0, 'page_label': '1'}
# Filtering on the Document objects' metadata
chroma_metadata = chroma_db.as_retriever(
    search_kwargs={
        'filter': {'source': './data/transformer.pdf'},
        'k': 5,
    }
)
query = "๋ํ์ ์ธ ์ํ์ค ๋ชจ๋ธ์ ์ด๋ค ๊ฒ๋ค์ด ์๋์?"
retrieved_docs = chrom_metadata.invoke(query)
print(f"์ฟผ๋ฆฌ: {query}")
print("๊ฒ์ ๊ฒฐ๊ณผ:")
for i, doc in enumerate(retrieved_docs, 1):
print(f"-{i}-\n{doc.page_content[:100]}...{doc.page_content[-100:]}\n[์ถ์ฒ: {doc.metadata['source']}]")
print("-" * 100)
- Output
Query: 대표적인 시퀀스 모델은 어떤 것들이 있나요?
Search results:
-1-
1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural...he Transformer allows for significantly more parallelization and can reach a new state of the art in
[Source: ./data/transformer.pdf]
----------------------------------------------------------------------------------------------------
-2-
In contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-
Parse... 2016.
[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly
[Source: ./data/transformer.pdf]
----------------------------------------------------------------------------------------------------
...
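The metadata filter is not limited to exact matches; Chroma's where clause also supports comparison operators. A sketch assuming the $gte operator on the integer page field:
# Only search among chunks that come from page 2 or later
chroma_page_filter = chroma_db.as_retriever(
    search_kwargs={
        'filter': {'page': {'$gte': 2}},
        'k': 3,
    }
)
retrieved_docs = chroma_page_filter.invoke(query)
print([doc.metadata['page'] for doc in retrieved_docs])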
- 'where_document': {'$contains': 'recurrent'}: search only among documents whose page_content contains 'recurrent'
# Filtering on page_content
chroma_content = chroma_db.as_retriever(
    search_kwargs={
        'k': 2,
        'where_document': {'$contains': 'recurrent'},
    }
)
query = "๋ํ์ ์ธ ์ํ์ค ๋ชจ๋ธ์ ์ด๋ค ๊ฒ๋ค์ด ์๋์?"
retrieved_docs = chroma_content.invoke(query)
print(f"์ฟผ๋ฆฌ: {query}")
print("๊ฒ์ ๊ฒฐ๊ณผ:")
for i, doc in enumerate(retrieved_docs, 1):
print(f"-{i}-\n{doc.page_content} [์ถ์ฒ: {doc.metadata['source']}]")
print("-" * 100)
- Output
Query: 대표적인 시퀀스 모델은 어떤 것들이 있나요?
Search results:
-1-
1 Introduction
Recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural...he Transformer allows for significantly more parallelization and can reach a new state of the art in
[Source: ./data/transformer.pdf]
----------------------------------------------------------------------------------------------------
-2-
In contrast to RNN sequence-to-sequence models [37], the Transformer outperforms the Berkeley-
Parse... 2016.
[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly
[Source: ./data/transformer.pdf]
----------------------------------------------------------------------------------------------------