# Lecture link (link)
import datasets
code_dataset = datasets.Dataset.from_list(code_dataset)
import datasets
dataset = datasets.concatenate_datasets(
[pretraining_dataset, code_dataset]
)
import heapq
import datasets
def paragraph_length_filter(x):
"""Returns False iff a page has too few lines or lines are too short."""
lines = x['text'].split('\n')
if (
len(lines) < 3 # 최소 3줄 이상
or min(heapq.nlargest(3, [len(line) for line in lines])) < 3 # 각 줄에 최소 단어 3개 이상
):
return False
return True
dataset = dataset.filter(
paragraph_length_filter,
load_from_cache_file=False
)
import re
import datasets
def find_duplicates(paragraphs):
"""
Use this function to find the number of repetitions
in the paragraphs.
"""
unique_x = set()
duplicate_chars = 0
duplicate_elements = 0
for element in paragraphs:
if element in unique_x:
duplicate_chars += len(element)
duplicate_elements += 1
else:
unique_x.add(element)
return duplicate_elements, duplicate_chars
def paragraph_repetition_filter(x):
"""
Returns False iff a page has too many repetitions.
"""
text = x['text']
paragraphs = re.compile(r"\n{2,}").split(text.strip()) # Split by paragraphs (2 or more newlines)
paragraphs_duplicates, char_duplicates = find_duplicates(paragraphs) # Find number of duplicates in paragraphs
if paragraphs_duplicates / len(paragraphs) > 0.3:
return False
if char_duplicates / len(text) > 0.2:
return False
return True
dataset = dataset.filter(
paragraph_repetition_filter,
load_from_cache_file=False
)
def deduplication(ds):
def dedup_func(x):
"""Use this function to remove duplicate entries"""
if x['text'] in unique_text:
return False
else:
unique_text.add(x['text'])
return True
unique_text = set()
ds = ds.filter(dedup_func, load_from_cache_file=False, num_proc=1)
return ds
dataset = deduplication(dataset)
import urllib
from fasttext.FastText import _FastText
def english_language_filter(ds):
# load language detection model
model = _FastText('./models/upstage/L2_language_model.bin')
def is_english(x):
# Predict language of the text and probability
language, score = model.predict(x['text'].replace("\n", ""))
language = language[0].split("__")[2]
return score > 0.4 and language == "en" # change code here if building a model in another language
ds = ds.filter(is_english, load_from_cache_file=False, num_proc=1)
return ds
dataset = english_language_filter(dataset)