텍스트 데이터는 구조적 데이터(숫자, 날짜)와 달리 가공이 필요하다.
Pandas의 문자열 처리 기능(.str)과 기본 파이썬 문법을 활용하면 정제·분석이 가능하다.
# Load the raw text dataset and take a first look at its structure.
import pandas as pd
df = pd.read_csv('text_data.csv')  # assumes text_data.csv is in the working directory
df.head()   # first rows (displayed when run in a notebook/REPL)
df.info()   # column dtypes and non-null counts
df['text'].head()      # sample of the raw text column
df['text'].describe()  # count / unique / top / freq summary for the text column
핵심 탐색 지표

- len(df['text']): 문장 수
- df['text'].str.len().mean(): 평균 글자 수
- df['text'].isnull().sum(): 결측값 확인

df['text_lower'] = df['text'].str.lower()
df['text_upper'] = df['text'].str.upper()  # upper-cased copy, counterpart of the lower-cased column
→ 같은 단어라도 "Apple" vs "apple"처럼 다른 값으로 인식되는 문제를 방지.
# Simple text-derived feature columns.
df['word_count'] = df['text'].str.split().str.len()  # whitespace-token count per row
# case=False matches 'AI', 'ai', 'Ai'; na=False makes missing text count as
# "no match" instead of propagating NaN, so the column stays cleanly boolean.
df['contains_ai'] = df['text'].str.contains('AI', case=False, na=False)
# regex=False: 'data' is a literal substring, not a regular-expression pattern.
df['replaced'] = df['text'].str.replace('data', 'information', regex=False)
| 함수 | 기능 |
|---|---|
| .str.len() | 문자열 길이 |
| .str.split() | 단어 분할 |
| .str.contains() | 특정 패턴 포함 여부 |
| .str.replace() | 특정 단어 대체 |
텍스트 전처리의 기본은 불필요한 기호 제거다.
import string
def remove_punct(text):
    """Strip all ASCII punctuation (string.punctuation) from *text*.

    Non-string values (e.g. NaN coming from a pandas column with missing
    entries) are returned unchanged, so the function is safe to use with
    Series.apply without first dropping or filling NaNs.
    """
    if not isinstance(text, str):  # e.g. float('nan') from pandas
        return text
    # One C-level pass: maketrans with a deletion table removes every punctuation char.
    return text.translate(str.maketrans('', '', string.punctuation))
df['clean_text'] = df['text'].apply(remove_punct)  # punctuation-free version of each row
예시:

- "Hello, world!" → "Hello world"
- "AI-driven, data-based." → "AIdriven databased"

의미 없는 단어(예: the, and, is)는 제거해야 통계적 왜곡이 줄어든다.
# Stopword removal over the punctuation-cleaned text.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# NLTK's stopword list is all lowercase, but clean_text keeps its original casing;
# compare with word.lower() so capitalized stopwords ("The", "Is") are removed too.
# Surviving words are joined back with their original casing intact.
df['no_stopwords'] = df['clean_text'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_words)
)
문장을 단어 또는 형태소 단위로 분할한다.
# Tokenization: split each cleaned sentence into a list of word tokens.
from nltk.tokenize import word_tokenize
df['tokens'] = df['clean_text'].apply(word_tokenize)  # NOTE: requires the NLTK 'punkt' data to be downloaded
출력 예시
["Artificial", "Intelligence", "drives", "future", "innovation"]
한국어의 경우 konlpy 패키지의 Okt, Mecab 형태소 분석기를 활용한다.
# Word-frequency analysis over the stopword-filtered text.
from collections import Counter
word_counts = Counter(" ".join(df['no_stopwords']).split())  # corpus-wide token counts
pd.DataFrame(word_counts.most_common(10), columns=['word', 'count'])  # top-10 table (notebook display)
# Word-cloud visualization of the stopword-filtered corpus.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text = " ".join(df['no_stopwords'])  # whole corpus as a single string
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
plt.figure(figsize=(10,5))
plt.imshow(wordcloud, interpolation='bilinear')  # bilinear smoothing avoids blocky pixels
plt.axis('off')  # hide ticks/frame for a clean image
plt.show()
WordCloud Tips

- 불용어 제거 후 생성해야 시각적 노이즈 감소
- colormap='coolwarm', max_words=100 등으로 커스터마이징 가능
# Basic variable assignment: one example per core scalar type.
is_active = True   # bool
name = "Python"    # str
x = 10             # int
| 자료형 | 예시 |
|---|---|
| 정수(int) | 5 |
| 실수(float) | 3.14 |
| 문자열(str) | "text" |
| 불리언(bool) | True, False |
a, b = 5, 3  # tuple unpacking: both names assigned at once
a + b, a - b, a * b, a / b  # arithmetic operators (shown as a tuple in a REPL)
a > b, a == b               # comparison operators -> booleans
a > 2 and b < 5             # boolean combination with `and`
score = 85
# Grade banding: 90+ -> "A", 80-89 -> "B", below 80 -> "C".
grade = "A" if score >= 90 else ("B" if score >= 80 else "C")
print(grade)
# Same two counting loops, written with the opposite loop constructs.
count = 0
while count < 5:        # prints 0..4, like for i in range(5)
    print(count)
    count += 1

n = 0
while True:
    if n >= 3:          # stop after three "Loop k" lines; n ends at 3
        break
    print("Loop", n)
    n += 1
리스트 컴프리헨션
# Squares of 0..4 via a comprehension (plain multiplication instead of **).
squares = [value * value for value in range(5)]
def greet(name):
    """Return a greeting string for *name*."""
    return "Hello, " + name + "!"
def add(x, y):
    """Return x + y.

    Replaces ``add = lambda x, y: x + y`` — PEP 8 (E731) prefers ``def`` for
    named functions; callers still invoke ``add(x, y)`` exactly as before.
    """
    return x + y
| 함수 | 기능 |
|---|---|
| len() | 길이 계산 |
| sum() | 합계 |
| sorted() | 정렬 |
| map(), filter() | 함수형 데이터 처리 |
| 유형 | 예시 | 특징 |
|---|---|---|
| 리스트 | [1, 2, 3] | 순서 O, 수정 가능 |
| 튜플 | (1, 2, 3) | 순서 O, 수정 불가 |
| 사전 | {'a': 1, 'b': 2} | 키-값 쌍 |
| 집합 | {1, 2, 3} | 중복 불가 |
# Text file: write a small sample file using a context manager (auto-close).
with open('sample.txt', 'w') as fh:
    fh.write('Hello World')
# CSV file: persist the DataFrame (index=False omits the row-index column).
import pandas as pd
df.to_csv('output.csv', index=False)
# NumPy basics: build an integer array, inspect it, and broadcast a scalar add.
import numpy as np
arr = np.arange(1, 5)        # same values as np.array([1, 2, 3, 4])
print(arr.shape, arr.dtype)  # (4,) and the platform default int dtype
print(arr + 10)              # broadcasting: 10 is added to every element
| 기능 | 예시 |
|---|---|
| 배열 생성 | np.array([1,2,3]) |
| 슬라이싱 | arr[1:3] |
| 브로드캐스팅 | arr * 2 |
| 수학 연산 | np.mean(arr) |
| 구분 | 주요 학습 내용 | 핵심 코드 |
|---|---|---|
| 텍스트 정규화 | 대소문자, 구두점, 불용어 처리 | .str.lower(), translate(), stopwords |
| 토큰화 | 단어 단위 분할 | word_tokenize() |
| 시각화 | 단어 빈도, 워드클라우드 | Counter, WordCloud |
| 파이썬 기초 | 변수, 조건, 반복, 함수 | if, for, def, lambda |
| 데이터 타입 복습 | 리스트·딕셔너리·튜플·집합 | [ ], { }, ( ), set() |
| NumPy 활용 | 수치형 배열 연산 | np.array, np.mean, np.shape |
.str 메서드는 Pandas 내에서 벡터화 연산 지원 → 대규모 데이터에도 효율적