0) Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
!pip install seaborn   # notebook shell command; only needed if seaborn is not already installed
import seaborn as sns
1) Data collection
df = pd.read_csv('filename.csv')
df.head()                         # first 5 rows
df.tail()                         # last 5 rows
df.shape                          # (number of rows, number of columns)
df.info()                         # column dtypes and non-null counts
df.index
df.columns
df.values
df.describe()                     # summary statistics for numeric columns
df.isnull().sum()                 # missing-value count per column
df['column_name'].value_counts()  # frequency of each unique value
df.select_dtypes(include=['object'])               # select columns by dtype, e.g. 'object' or 'number'
df.groupby(by=['group_col'])['target_col'].mean()  # or .sum(), .count(), .max(), ...
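A minimal sketch with made-up data showing the groupby pattern above (the 'city' and 'sales' columns are placeholders, not from the original):
sample = pd.DataFrame({'city': ['A', 'A', 'B'], 'sales': [10, 20, 30]})
print(sample.groupby(by=['city'])['sales'].mean())   # A -> 15.0, B -> 30.0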
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2) Data preprocessing
df.drop(columns=['col1', 'col2', ..., 'coln'], inplace=True)   # columns= already implies axis=1
df.replace('old_value', 'new_value', inplace=True)
df['column_name'] = df['column_name'].replace('old_value', 'new_value')  # assign back rather than chaining inplace
df['column_name'] = df['column_name'].fillna('fill_value')               # same pattern for filling missing values
df['column_name'].dtype
df['column_name'] = df['column_name'].astype(int)
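A minimal sketch with made-up data showing the cleaning calls above:
tmp = pd.DataFrame({'grade': ['A', 'B', None], 'score': ['1', '2', '3']})
tmp['grade'] = tmp['grade'].fillna('C')    # fill the missing value
tmp['score'] = tmp['score'].astype(int)    # string -> integer dtype
print(tmp.dtypes)                          # grade: object, score: int64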
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
3-1) Label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['categorical_col'] = le.fit_transform(df['categorical_col'])  # each category becomes an integer code
3-2) One-hot encoding
dummies = ['col1', 'col2', ..., 'coln']
df = pd.get_dummies(data=df, columns=dummies, drop_first=True)
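A toy example (made-up data) of what get_dummies produces; with drop_first=True the first category is dropped because it is implied by the remaining columns:
toy = pd.DataFrame({'color': ['red', 'blue', 'red']})
print(pd.get_dummies(data=toy, columns=['color'], drop_first=True))
# only 'color_red' remains; a row where it is 0/False means 'blue'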
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
4) Data splitting
from sklearn.model_selection import train_test_split
X = df.drop('target', axis=1)   # features (everything except the target column)
y = df['target']                # label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
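For classification it is often worth keeping class ratios consistent across the splits; an optional variant (not in the original) uses the stratify argument:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)   # preserve class balance in both splits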
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
5) Normalization
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)   # fit the scaler on the training set only
X_test = scaler.transform(X_test)         # apply the training-set statistics to the test set
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
6) Model training (Random Forest)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
7) Model optimization (GridSearchCV)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
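The chosen hyperparameters and cross-validated score can be inspected on the fitted search object:
print(grid_search.best_params_)   # e.g. {'max_depth': None, 'n_estimators': 200}
print(grid_search.best_score_)    # mean CV accuracy of the best parameter setting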
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
8) Performance evaluation
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
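Beyond plain accuracy, a per-class breakdown is often useful; a small optional addition:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))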
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
9) Save files
test_predictions = best_model.predict(X_test)
test_df = pd.DataFrame({'Actual': y_test, 'Predicted': test_predictions})
test_df.to_csv('test_predictions.csv', index=False)
import joblib
joblib.dump(best_model, 'best_model.joblib')   # use a .joblib/.pkl extension; .h5 is a Keras format
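To verify the saved file, it can be loaded back and re-scored (a minimal sanity check):
loaded_model = joblib.load('best_model.joblib')
print(loaded_model.score(X_test, y_test))   # should reproduce the accuracy from step 8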
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
import pandas as pd
import re   # imported for optional regex-based cleaning; not used in the function below
0) Load the DataFrame
df = pd.read_csv('/content/drive/MyDrive/KT에이블/AICE/IM.csv')
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
1) Text preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
# Download the required NLTK resources once, outside the function
nltk.download('punkt')      # newer NLTK versions may also need nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    tokens = word_tokenize(text)                                                         # tokenize
    normalized_tokens = [stemmer.stem(token.lower()) for token in tokens]                # lowercase + stem
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in normalized_tokens]     # lemmatize
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]  # drop stopwords
    return ' '.join(filtered_tokens)
df['preprocessed_text'] = df['text_column_name'].apply(preprocess_text)
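A quick sanity check on a made-up sentence (hypothetical input; output shown is approximate):
print(preprocess_text('The cats are running quickly!'))
# roughly: 'cat run quickli !' -- stemming distorts word forms, and punctuation is not removed by this pipeline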
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2) Data splitting
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['preprocessed_text'], df['label_column_name'], test_size=0.2, random_state=42)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
3) TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)
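Optionally, the fitted vocabulary can be inspected (get_feature_names_out requires scikit-learn 1.0+):
print(len(vectorizer.get_feature_names_out()))    # vocabulary size
print(vectorizer.get_feature_names_out()[:10])    # first few learned terms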
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
4) Model training (Naive Bayes classifier)
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(train_vectors, train_labels)
predictions = model.predict(test_vectors)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
5) Performance evaluation
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels, predictions)
print(f"Model accuracy: {accuracy}")
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
6) Save files
results_df = pd.DataFrame({'Actual': test_labels, 'Predicted': predictions})   # separate name so df is not overwritten
results_df.to_csv('predictions.csv', index=False)
from joblib import dump, load
dump(model, 'optimized_model.joblib')
model = load('optimized_model.joblib')
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator
0) Load and preprocess the data
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
1) Normalize the images to the [0, 1] range.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2) Reshape the images to (batch size, height, width, channels).
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
3) One-hot encode the labels.
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
4) Initialize data augmentation
datagen = ImageDataGenerator(
rotation_range=10,
width_shift_range=0.1,
height_shift_range=0.1,
shear_range=0.1,
zoom_range=0.1,
horizontal_flip=True
)
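To eyeball what the augmentation does, a minimal preview sketch (assumes steps 1-3 have already run so X_train is normalized and 4-D; uses the matplotlib import at the top):
batch_images = next(datagen.flow(X_train, y_train, batch_size=9))[0]   # one augmented batch of images
fig, axes = plt.subplots(3, 3, figsize=(6, 6))
for img, ax in zip(batch_images, axes.ravel()):
    ax.imshow(img.squeeze(), cmap='gray')   # drop the channel axis for display
    ax.axis('off')
plt.show()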
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
5) Define the model (CNN)
model = Sequential([
Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(28, 28, 1)),
MaxPooling2D((2, 2)),
Dropout(0.25),
Conv2D(64, (3, 3), padding='same', activation='relu'),
MaxPooling2D((2, 2)),
Dropout(0.25),
Conv2D(128, (3, 3), padding='same', activation='relu'),
Flatten(),
Dropout(0.5),
Dense(128, activation='relu'),
Dense(10, activation='softmax')
])
model.summary()
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
6) Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
7) Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint_cb = ModelCheckpoint('best_model.h5', save_best_only=True)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
8) Train the model
history = model.fit(datagen.flow(X_train, y_train, batch_size=64),
epochs=5,
validation_data=(X_test, y_test),
callbacks=[early_stopping_cb, model_checkpoint_cb])
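Optionally, the learning curves stored in history.history can be plotted (the key names follow from metrics=['accuracy']):
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()
plt.show()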
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
9) Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
10) Save the model and results
model.save('final_model.h5')
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
np.savetxt("predictions.csv", predicted_classes, fmt='%d', delimiter=",")   # fmt='%d' keeps labels as integers
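As an optional round-trip check, the saved model can be reloaded and re-evaluated:
reloaded = tf.keras.models.load_model('final_model.h5')
print(reloaded.evaluate(X_test, y_test, verbose=0))   # [loss, accuracy], should match step 9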