: 머신 러닝 시스템은 전처리기, 분류기, 특징 선택기 등 여러 세부 모듈로 구성되어 있다. 이러한 여러 모듈을 연결해 학습하는 프로세스를 학습 파이프라인이라고 한다.
# Feature selector that keeps the top K features by univariate regression score
k_best_selector = SelectKBest(f_regression, k=9)
# Extremely Random Forests classifier used as the final estimator
classifier = ExtraTreesClassifier(n_estimators=60, max_depth=4)
# Chain selector -> classifier into a single training pipeline
processor_pipeline = Pipeline([('selector', k_best_selector), ('erf', classifier)])
# Override step parameters through the pipeline's <step>__<param> syntax
processor_pipeline.set_params(selector__k=7, erf__n_estimators=30)
# Fit the whole pipeline on the training data
processor_pipeline.fit(X, y)
# Run predictions on the same input data
output = processor_pipeline.predict(X)
print("\nPredicted output:\n", output)
# Report the pipeline's score on the training set
print("\nScore:", processor_pipeline.score(X, y))
# Boolean mask of the features kept by the selector step
status = processor_pipeline.named_steps['selector'].get_support()
# Convert the mask into the indices of the selected features
selected = [index for index, kept in enumerate(status) if kept]
print("\nIndices of selected features:", ', '.join(str(index) for index in selected))
: 최근접 이웃 알고리즘은 주어진 데이터셋에서 입력 데이터 포인트에 가장 가까운 포인트를 찾는 알고리즘으로 데이터 분류 시스템에서 자주 사용한다.
# Fit a K Nearest Neighbors model on the data using a ball tree index
knn_model = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(X)
# Query the model for the neighbors of the test point
distances, indices = knn_model.kneighbors(test_datapoint)
# Print the 'k' nearest neighbors, ranked from closest to farthest
print("\nK Nearest Neighbors:")
neighbor_ids = indices[0][:k]
for position, neighbor in enumerate(neighbor_ids):
    print(str(position + 1) + " ==>", X[neighbor])
: k-최근접 이웃 분류기는 최근접 이웃 알고리즘을 사용해 다수결에 따라 데이터를 분류하는 분류 모델이다.
# Create a K Nearest Neighbors classifier model
# (distance weighting: closer neighbors get a larger vote)
classifier = neighbors.KNeighborsClassifier(num_neighbors, weights='distance')
# Train the K Nearest Neighbours model
classifier.fit(X, y)
# Evaluate the classifier on all the points on the grid
output = classifier.predict(np.c_[x_values.ravel(), y_values.ravel()])
# Extract the K nearest neighbors of the test datapoint
_, indices = classifier.kneighbors([test_datapoint])
# FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
# cast with the builtin int instead so this runs on modern NumPy.
indices = indices.astype(int)[0]
$ python3 compute_scores.py --user1 "David Smith" --user2 "Bill Duffy" --score-type Euclidean
: 두 데이터 포인트 사이의 유클리드 거리를 계산해 0부터 1까지의 범위로 변환하여 활용하는 점수를 말한다.
# Compute the Euclidean distance score between user1 and user2
def euclidean_score(dataset, user1, user2):
    """Compute the Euclidean distance score between user1 and user2.

    The score is 1 / (1 + euclidean_distance) over the movies both
    users rated, which maps the distance into the range (0, 1].

    Args:
        dataset: mapping of user name -> {movie title: rating}.
        user1: name of the first user (must be a key of dataset).
        user2: name of the second user (must be a key of dataset).

    Returns:
        The similarity score, or 0 when the users share no rated movies.

    Raises:
        TypeError: if either user is missing from the dataset.
    """
    if user1 not in dataset:
        raise TypeError('Cannot find ' + user1 + ' in the dataset')
    if user2 not in dataset:
        raise TypeError('Cannot find ' + user2 + ' in the dataset')
    # Movies rated by both user1 and user2
    common_movies = [item for item in dataset[user1] if item in dataset[user2]]
    # If there are no common movies between the users, then the score is 0
    if not common_movies:
        return 0
    # FIX: reuse the common-movie list instead of re-scanning both
    # rating dicts a second time (the original repeated the membership test)
    squared_diff = [np.square(dataset[user1][item] - dataset[user2][item])
                    for item in common_movies]
    return 1 / (1 + np.sqrt(np.sum(squared_diff)))
: 두 객체 간의 상관관계를 측정한 값으로 공분산과 개별 표준편차를 사용해 -1부터 +1까지의 범위로 표현한 점수를 말한다.
# Compute the Pearson correlation score between user1 and user2
def pearson_score(dataset, user1, user2):
    """Compute the Pearson correlation score between user1 and user2.

    The correlation is computed only over the movies both users rated
    and lies in the range [-1, +1].

    Args:
        dataset: mapping of user name -> {movie title: rating}.
        user1: name of the first user (must be a key of dataset).
        user2: name of the second user (must be a key of dataset).

    Returns:
        The Pearson correlation, or 0 when there are no common movies
        or when either user's common ratings have zero variance.

    Raises:
        TypeError: if either user is missing from the dataset.
    """
    for u in (user1, user2):
        if u not in dataset:
            raise TypeError('Cannot find ' + u + ' in the dataset')
    # Movies rated by both user1 and user2
    common_movies = {title: 1 for title in dataset[user1]
                     if title in dataset[user2]}
    num_ratings = len(common_movies)
    # If there are no common movies between user1 and user2, then the score is 0
    if num_ratings == 0:
        return 0
    # Gather both users' ratings for the shared movies, in the same order
    ratings1 = [dataset[user1][title] for title in common_movies]
    ratings2 = [dataset[user2][title] for title in common_movies]
    # Sums, sums of squares, and sum of cross-products of the ratings
    user1_sum = np.sum(ratings1)
    user2_sum = np.sum(ratings2)
    user1_squared_sum = np.sum([np.square(r) for r in ratings1])
    user2_squared_sum = np.sum([np.square(r) for r in ratings2])
    sum_of_products = np.sum([a * b for a, b in zip(ratings1, ratings2)])
    # Covariance and variance terms of the Pearson formula
    Sxy = sum_of_products - (user1_sum * user2_sum / num_ratings)
    Sxx = user1_squared_sum - np.square(user1_sum) / num_ratings
    Syy = user2_squared_sum - np.square(user2_sum) / num_ratings
    # Degenerate case: one of the users has constant ratings
    if Sxx * Syy == 0:
        return 0
    return Sxy / np.sqrt(Sxx * Syy)
: 협업 필터링은 새로운 객체를 분류하기 위해 데이터셋에 포함된 객체 간의 패턴을 식별하는 프로세스를 말한다.
# Finds users in the dataset that are similar to the input user
def find_similar_users(dataset, user, num_users):
    """Find the users in the dataset most similar to the input user.

    Args:
        dataset: mapping of user name -> {movie title: rating}.
        user: name of the reference user (must be a key of dataset).
        num_users: number of top-scoring users to return.

    Returns:
        Array of [user_name, score] rows (string dtype, as built by
        np.array on mixed data), sorted by decreasing Pearson score.

    Raises:
        TypeError: if the user is missing from the dataset.
    """
    if user not in dataset:
        raise TypeError('Cannot find ' + user + ' in the dataset')
    # Compute Pearson score between input user
    # and all the users in the dataset
    scores = np.array([[x, pearson_score(dataset, user, x)]
                       for x in dataset if x != user])
    # Edge case: no other users to compare against
    if scores.size == 0:
        return scores
    # FIX: np.array([[name, score], ...]) yields a *string* array, so
    # sorting the raw column compared scores lexicographically (wrong
    # for negative correlations). Cast to float before sorting.
    scores_sorted = np.argsort(scores[:, 1].astype(float))[::-1]
    # Extract the top 'num_users' scores
    top_users = scores_sorted[:num_users]
    return scores[top_users]