RAG에서 문서를 불러오고 텍스트(Text)를 분할하는 방법 위주로 정리하였다.
from langchain_community.document_loaders import TextLoader
# Single-file case: build a TextLoader for sample.txt with
# autodetect_encoding switched on, then load the result into `docs`.
loader = TextLoader(
    "sample.txt",
    autodetect_encoding=True,
)
docs = loader.load()
from langchain_community.document_loaders import DirectoryLoader
# Directory case: point a DirectoryLoader at data/ with the glob pattern
# **/*.txt and multithreading enabled, then load the result into `docs`
# (this rebinds the `loader` and `docs` names).
loader = DirectoryLoader(
    "data/",
    glob="**/*.txt",
    use_multithreading=True,
)
docs = loader.load()
from langchain_community.document_loaders import CSVLoader
# CSV case: load data/articles.csv, passing "source" as the loader's
# source_column argument, and store the result in `docs`.
loader = CSVLoader(
    "data/articles.csv",
    source_column="source",
)
docs = loader.load()
from langchain_community.document_loaders import UnstructuredMarkdownLoader
# Markdown case: load notes.md through UnstructuredMarkdownLoader in
# "elements" mode and store the result in `docs`.
loader = UnstructuredMarkdownLoader(
    "notes.md",
    mode="elements",
)
docs = loader.load()
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Recursive character splitter configured with chunk_size=1000 and
# chunk_overlap=150, applied to whatever `docs` currently holds.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)
chunks = splitter.split_documents(docs)
from langchain_text_splitters import TokenTextSplitter
# Token-based splitter using the "cl100k_base" encoding name with
# chunk_size=400 and chunk_overlap=40, applied to whatever `docs`
# currently holds (this rebinds `splitter` and `chunks`).
splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=400,
    chunk_overlap=40,
)
chunks = splitter.split_documents(docs)
from langchain_text_splitters import MarkdownHeaderTextSplitter
# (header marker, label) pairs handed to the splitter as headers_to_split_on.
headers = [("#", "h1"), ("##", "h2"), ("###", "h3")]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers)
# Read notes.md inside a `with` block so the file handle is closed as soon as
# the text has been read, instead of leaving it open until garbage collection.
# The encoding is pinned so decoding does not depend on the machine's locale.
# NOTE(review): assumes notes.md is stored as UTF-8 -- confirm.
with open("notes.md", encoding="utf-8") as md_file:
    sections = splitter.split_text(md_file.read())
from langchain_text_splitters import PythonCodeTextSplitter
# Splitter for Python source text, configured with chunk_size=800 and
# chunk_overlap=80.
splitter = PythonCodeTextSplitter(chunk_size=800, chunk_overlap=80)
# Read app.py inside a `with` block so the file handle is closed right after
# reading, and decode it explicitly as UTF-8 (Python's default source
# encoding) rather than relying on the locale-dependent default of open().
with open("app.py", encoding="utf-8") as source_file:
    chunks = splitter.split_text(source_file.read())