ex010

정승원 · January 23, 2024

0. GPU Allocation

import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), 'Physical GPUs,', len(logical_gpus), 'Logical GPUs')
  except RuntimeError as e:
    print(e)

1. Tabular Data

0) Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Install any library that is not already available: !pip install <library-name>
!pip install seaborn
import seaborn as sns

1) Data collection
# Load tabular data
df = pd.read_csv('filename.csv')

# Inspect the first and last rows
df.head()
df.tail()

# Check the number of rows and columns
df.shape

# Print DataFrame info (column names, null counts, dtypes, etc.)
df.info()

# Check the DataFrame index
df.index

# Check the DataFrame columns
df.columns

# Check the DataFrame values
df.values

# Summary statistics for the numeric columns
df.describe()

# Count null values per column
df.isnull().sum()

# Count occurrences of each value in a column (check the value distribution)
df['column_name'].value_counts()

# Select only the columns of a given dtype as a DataFrame
df.select_dtypes(include=['int64'])  # e.g. 'int64', 'float64', 'object'

# Group-by aggregation
df.groupby(by=['group_column'])['aggregation_column'].mean()  # or .max(), .min(), .sum(), etc.
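For reference, here is a minimal groupby sketch on a made-up DataFrame (the column names and values are illustrative, not from the original notes):

import pandas as pd

# Hypothetical example data
sales = pd.DataFrame({
    'region': ['A', 'A', 'B', 'B'],
    'amount': [100, 150, 200, 50],
})

# Average amount per region
print(sales.groupby(by=['region'])['amount'].mean())
# region
# A    125.0
# B    125.0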
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2) Data preprocessing
# Drop columns
df.drop(columns=['col1', 'col2', ..., 'coln'], inplace=True)

# Replace values
df.replace('old_value', 'new_value', inplace=True)  # across the whole DataFrame
df['column_name'] = df['column_name'].replace('old_value', 'new_value')  # only in one column

# Fill missing values
df['column_name'] = df['column_name'].fillna('fill_value')

# Check a column's dtype (is it object?)
df['column_name'].dtypes  # dtype('O') means object

# Change a column's dtype
df['column_name'] = df['column_name'].astype(int)  # int, float, object, ...
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
3-1) Label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['categorical_column'] = le.fit_transform(df['categorical_column'])

3-2) One-hot encoding
dummies = ['col1', 'col2', ..., 'coln']
df = pd.get_dummies(data=df, columns=dummies, drop_first=True)
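A tiny illustration of what get_dummies with drop_first=True does, on hypothetical data that is not from the original post:

import pandas as pd

# Hypothetical example: a single categorical column
demo = pd.DataFrame({'color': ['red', 'blue', 'green']})

# drop_first=True drops the first category ('blue') to avoid a redundant column
encoded = pd.get_dummies(data=demo, columns=['color'], drop_first=True)
print(encoded.columns.tolist())  # ['color_green', 'color_red']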
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
4) Data split
from sklearn.model_selection import train_test_split
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
5) Normalization (scaling)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
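The scaler is fit on the training set only and then reused on the test set, which avoids leaking test statistics into training. If standardization is preferred over min-max scaling, a StandardScaler can be swapped in the same way (a sketch, not part of the original notes):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit on the training data only
X_test = scaler.transform(X_test)        # reuse the training statistics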
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
6) Model training (Random Forest)
from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import RandomForestRegressor
model = RandomForestClassifier()
model.fit(X_train, y_train)
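Not in the original notes, but after fitting, the forest's feature importances can be inspected through the standard feature_importances_ attribute (assumes X is still the unscaled DataFrame so its column names are available):

import pandas as pd

# Rank features by their importance in the fitted forest
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances.head())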
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
7) Model optimization (GridSearchCV)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score  #, r2_score
param_grid = {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10]}
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
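After the search finishes, the chosen hyperparameters and cross-validation score can be inspected through the standard GridSearchCV attributes:

print(grid_search.best_params_)  # e.g. {'max_depth': 10, 'n_estimators': 200}
print(grid_search.best_score_)   # mean cross-validated score of the best model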
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
8) Performance evaluation
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)  # use r2_score instead for regression problems
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
9) Saving files
# Save test predictions (.csv)
test_predictions = best_model.predict(X_test)
test_df = pd.DataFrame({'Actual': y_test, 'Predicted': test_predictions})
test_df.to_csv('test_predictions.csv', index=False)

# Save the optimized model with joblib (.pkl or .joblib is the usual extension)
import joblib
joblib.dump(best_model, 'best_model.pkl')
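To reuse the saved model later, it can be loaded back with joblib (the filename matches the dump call above):

import joblib

loaded_model = joblib.load('best_model.pkl')
y_pred = loaded_model.predict(X_test)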

# Save the full notebook (.ipynb), typically from the notebook UI (e.g. File > Download)

2. Text Analysis (English)

import pandas as pd
import re

0) Load the DataFrame
df = pd.read_csv('/content/drive/MyDrive/KT에이블/AICE/IM.csv')  # adjust to your data file path
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
1) Text preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Download the required NLTK resources once, up front (not on every call)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text)

    # Normalize (lowercase) and stem
    stemmer = PorterStemmer()
    normalized_tokens = [stemmer.stem(token.lower()) for token in tokens]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in normalized_tokens]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))  # change the language as needed
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]

    # Return the preprocessed text
    return ' '.join(filtered_tokens)

# Apply the text preprocessing
df['preprocessed_text'] = df['text_column'].apply(preprocess_text)  # replace 'text_column' with the name of the column that holds the text
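A quick sanity check of the function on a made-up sentence (the exact output depends on the NLTK version, but stemming and stopword removal behave roughly like this):

sample = "The cats are running quickly through the gardens"
print(preprocess_text(sample))
# stopwords ('the', 'are', 'through') are dropped and the remaining tokens
# are lowercased and stemmed, giving something like 'cat run quickli garden'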
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2) Data split
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_labels, test_labels = train_test_split(df['preprocessed_text'], df['label_column'], test_size=0.2, random_state=42)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
3) TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_texts)
test_vectors = vectorizer.transform(test_texts)
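An optional check of what the vectorizer produced; get_feature_names_out assumes scikit-learn 1.0 or newer:

print(train_vectors.shape)                      # (number of training documents, vocabulary size)
print(vectorizer.get_feature_names_out()[:10])  # first few vocabulary terms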
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
4) Model training (Naive Bayes classifier)
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(train_vectors, train_labels)

predictions = model.predict(test_vectors)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
5) Performance evaluation
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(test_labels, predictions)
print(f"Model accuracy: {accuracy}")
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
6) Saving files
results_df = pd.DataFrame({'Actual': test_labels, 'Predicted': predictions})
results_df.to_csv('predictions.csv', index=False)

from joblib import dump, load
# Save the model
dump(model, 'optimized_model.joblib')
# Load the model
model = load('optimized_model.joblib')

3. Image Analysis

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing.image import ImageDataGenerator

0) Load and preprocess the data
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
1) Normalize the images to the [0, 1] range.
X_train = X_train.astype('float32') / 255.0
X_test = X_test.astype('float32') / 255.0
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
2) Reshape the images to (batch size, height, width, channels).
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
3) One-hot encode the labels.
y_train = to_categorical(y_train, 10)
y_test = to_categorical(y_test, 10)
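For Fashion-MNIST this turns the label vectors into 10-column one-hot matrices; a quick optional shape check confirms the preprocessing:

print(X_train.shape, y_train.shape)  # (60000, 28, 28, 1) (60000, 10)
print(X_test.shape, y_test.shape)    # (10000, 28, 28, 1) (10000, 10)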
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
4) Initialize data augmentation
datagen = ImageDataGenerator(
   rotation_range=10,
   width_shift_range=0.1,
   height_shift_range=0.1,
   shear_range=0.1,
   zoom_range=0.1,
   horizontal_flip=True
)
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
5) Define the model (CNN)
model = Sequential([
   Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=(28, 28, 1)),
   MaxPooling2D((2, 2)),
   Dropout(0.25),

   Conv2D(64, (3, 3), padding='same', activation='relu'),
   MaxPooling2D((2, 2)),
   Dropout(0.25),

   Conv2D(128, (3, 3), padding='same', activation='relu'),
   Flatten(),
   Dropout(0.5),
   Dense(128, activation='relu'),
   Dense(10, activation='softmax')
])

model.summary()
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
6) Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
7) Define callbacks
early_stopping_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model_checkpoint_cb = ModelCheckpoint('best_model.h5', save_best_only=True)
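If a later epoch overfits or training is interrupted, the checkpointed weights can be reloaded with the standard Keras API (the filename matches the ModelCheckpoint above):

from tensorflow.keras.models import load_model

best_model = load_model('best_model.h5')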
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
8) Train the model
history = model.fit(datagen.flow(X_train, y_train, batch_size=64),
                   epochs=5,
                   validation_data=(X_test, y_test),
                   callbacks=[early_stopping_cb, model_checkpoint_cb])
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
9) Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_accuracy:.4f}')
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Visualize the results
# plt.figure(figsize=(12, 5))
# plt.subplot(1, 2, 1)
# plt.plot(history.history['accuracy'], label='Accuracy')
# plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
# plt.legend()
# plt.title('Accuracy over epochs')

# plt.subplot(1, 2, 2)
# plt.plot(history.history['loss'], label='Loss')
# plt.plot(history.history['val_loss'], label='Validation Loss')
# plt.legend()
# plt.title('Loss over epochs')
# plt.show()
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
10) Save the model and results
model.save('final_model.h5')

predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
np.savetxt("predictions.csv", predicted_classes, delimiter=",", fmt="%d")  # save class indices as integers