from pyspark.sql import SparkSession
import numpy as np
# Local Spark session used only to read the CSV inputs; keep Spark's own
# logging down to errors.
spark = (
    SparkSession.builder
    .master("local")
    .appName("SparkSQL")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Selects the target customer; consumed further down the script.
cus_id = 7364

# Order history: read with a header row, convert to pandas, and discard the
# two columns ('_c0', 'allergy_list') the rest of the script does not use.
Order_path = 'C:/Users/r2com/Desktop/Python/Cralling/total_Custommer.csv'
Order_df = (
    spark.read.option("header", True)
    .csv(Order_path)
    .toPandas()
    .drop(columns=['_c0', 'allergy_list'])
)
def bar_to_comma(x):
    """Return *x* with every ``'|'`` separator rewritten as ``', '``."""
    pieces = x.split('|')
    return ', '.join(pieces)
# Item catalogue: read with a header row and convert to pandas, then rewrite
# the '|' separators of the 'Ingredient' column as ', '.
item_path = 'C:/Users/r2com/Desktop/Python/Cralling/Item.csv'
item_df = spark.read.option("header", True).csv(item_path).toPandas()
item_df['Ingredient'] = item_df['Ingredient'].apply(bar_to_comma)

# Customer master list, loaded the same way.
Cus_path = 'C:/Users/r2com/Desktop/Python/Cralling/Custommer_list.csv'
Cus_df = spark.read.option("header", True).csv(Cus_path).toPandas()
import pandas as pd
def text_cleaning(text):
    """Split a stringified list such as ``"['a', 'b']"`` into bare names.

    The text is cut on ``', '`` and each piece has surrounding ``[``, then
    ``]``, then single quotes stripped, in that order.

    Returns:
        list[str]: One cleaned entry per ``', '``-separated piece.
    """
    return [
        piece.strip('[').strip(']').strip('\'')
        for piece in text.split(', ')
    ]
def get_Items(cus_id):
    """Return the distinct items a customer has ordered as one string.

    Args:
        cus_id: Value compared for equality against ``Order_df['customer_id']``.

    Returns:
        str: The unique item names from all matching orders joined with
        ``', '``, in first-seen order. Empty string when no order matches.
    """
    cus_order_df = Order_df[Order_df['customer_id'] == cus_id]

    # A dict keeps insertion order, so duplicates are removed without the
    # run-to-run reordering that iterating a set of strings produces.
    unique_items = {}
    for raw_items in cus_order_df['items']:
        for name in text_cleaning(raw_items):
            unique_items.setdefault(name, None)

    return ', '.join(unique_items)
def Search_Custommer(cus_id):
    """Build a one-row DataFrame describing a customer.

    Columns are ``Custommer_ID``, ``Custommer_Name``, ``Allergy_List`` and
    ``items`` (the string returned by ``get_Items``).
    """
    purchased = get_Items(cus_id)

    # NOTE(review): the row is picked by position (iloc) using the numeric
    # value of cus_id, not by matching the 'Custommer_ID' column -- confirm
    # that IDs coincide with row positions in Cus_df.
    customer_row = Cus_df.iloc[int(cus_id)]

    labels = ['Custommer_ID', 'Custommer_Name', 'Allergy_List', 'items']
    values = [
        cus_id,
        customer_row['Custommer_Name'],
        customer_row['Allergy_List'],
        purchased,
    ]
    # Built as a single column and transposed so the labels become columns.
    return pd.DataFrame(data=values, index=labels).T
# Resolve the target customer's ID from its row position, then gather that
# customer's profile (including the joined list of ordered items).
custommer_id = Cus_df.iloc[cus_id]['Custommer_ID']
custommer_df = Search_Custommer(custommer_id)

# One text per row: the customer's ordered items first (row 0), followed by
# every catalogue item's ingredient string, on a fresh 0..n index.
item_custommer_items = pd.concat(
    [custommer_df['items'], item_df['Ingredient']],
    ignore_index=True,
).to_frame(name='items')

# Row 0 is not a catalogue item, so it carries the placeholder key -1.
item_custommer_items['Item_key'] = [-1] + item_df['Item_key'].tolist()
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with default settings.
vect = TfidfVectorizer()
# Learn the vocabulary from, and vectorize, the 'items' text column: the
# resulting matrix has one row per row of item_custommer_items.
item_custommer_matrix = vect.fit_transform(item_custommer_items['items'])
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of rows of the TF-IDF matrix.
item_custommer_similarity = cosine_similarity(item_custommer_matrix, item_custommer_matrix)
# Per row: column indices ordered from most to least similar.
item_custommer_sorted = item_custommer_similarity.argsort()[:, ::-1]

item_df['point'] = 0
point_list = [0] * len(item_df)

# Row 0 of the matrix is the customer; rows 1..n are the catalogue items.
# Remove the customer's own index by value rather than assuming it is ranked
# first: on a similarity tie (or an all-zero customer vector) argsort may put
# index 0 elsewhere, and slicing off position 0 would then drop a real item
# and later write a score to point_list[-1].
item_custommer_reverse = item_custommer_sorted[0]
item_custommer_reverse = item_custommer_reverse[item_custommer_reverse != 0][::-1].copy()

# Walk from least to most similar so the closest item gets the highest score.
# Matrix index i corresponds to item_df row i - 1.
for point, i in enumerate(item_custommer_reverse):
    point_list[i - 1] = point + 1
def get_Allergy_List(x):
    """Flatten allergy cells into a list of individual allergen names.

    Args:
        x: Object exposing ``tolist()`` (e.g. a pandas Series) whose entries
            are strings of allergen names separated by ``', '``.

    Returns:
        list[str]: Every allergen name in encounter order. Empty when ``x``
        has no usable entries. A cell holding a single name yields that name
        unchanged.
    """
    item_list = []
    for entry in x.tolist():
        # Missing (None/NaN) and empty cells carry no allergens; testing for
        # a non-empty string replaces the old `== True` comparison, which a
        # string never satisfies.
        if isinstance(entry, str) and entry:
            item_list.extend(entry.split(', '))
    return item_list
# Allergy cells of the customer the scores above were computed for. Keyed on
# custommer_id (the ID resolved from the customer's row) so the filter and the
# similarity scores always refer to the same customer; evaluated once instead
# of twice.
allergy_cells = Cus_df[Cus_df['Custommer_ID'] == custommer_id]['Allergy_List']

# A None cell means no allergy data, in which case no item is filtered out.
if None not in allergy_cells.tolist():
    allergy_list = get_Allergy_List(allergy_cells)
    for allergy in allergy_list:
        # Each allergen names a column of item_df holding an integer-like
        # flag; a non-zero flag marks the item as containing that allergen.
        flags = item_df[allergy].astype(int).tolist()
        for i, flagged in enumerate(flags):
            if flagged:
                point_list[i] = 0

# Publish the final scores and print item keys from best to worst.
item_df['point'] = point_list
print(item_df.sort_values(by='point', ascending=False)['Item_key'].tolist())