# Full source code
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Read the shared-articles CSV and show its raw shape plus a peek at the
# first five timestamps.
articles_df = pd.read_csv('shared_articles.csv')
print(articles_df.shape)
print(articles_df['timestamp'].head(5))

# Keep only the rows whose eventType is 'CONTENT SHARED' and whose lang is
# 'en', then report how many rows survive the filter.
articles_df = articles_df.loc[
    (articles_df['eventType'] == 'CONTENT SHARED')
    & (articles_df['lang'] == 'en')
]
print(articles_df.shape)

# Project the frame onto the columns used further down.
# NOTE(review): a listed column that is absent from the CSV is created and
# filled with NaN instead of raising -- confirm that 'content' really exists.
articles_df = pd.DataFrame(
    articles_df,
    columns=['contentId', 'authorPersonId', 'content', 'title', 'text'],
)
def create_soup(x):
    """Build the "soup" string for a single article row.

    Args:
        x: A row-like mapping with a ``'text'`` entry, e.g. the Series that
            ``DataFrame.apply(..., axis=1)`` passes for each row.

    Returns:
        str: The row's text as one string. A plain string is returned
        unchanged, an iterable of tokens is joined with single spaces, and
        a missing value (``None`` or NaN) yields ``''``.
    """
    text = x['text']
    if isinstance(text, str):
        # A str is itself iterable, so ' '.join(text) would insert a space
        # between every character; the text is already the soup.
        return text
    if text is None or (isinstance(text, float) and text != text):
        # NaN is the only float that is not equal to itself.
        return ''
    return ' '.join(text)


# The function must be defined before it is applied row by row.
articles_df['soup'] = articles_df.apply(create_soup, axis=1)
# Turn each article's text into a TF-IDF vector, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# Missing text is replaced by '' so a NaN entry cannot abort vectorization.
tfidf_matrix = tfidf.fit_transform(articles_df['text'].fillna(''))

# Similarity of every article against every other article. dense_output is
# passed by keyword so the meaning of the flag is visible at the call site.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix, dense_output=True)
display(cosine_sim.shape)
display(cosine_sim)

# reset_index() renumbers rows 0..n-1 so that row positions in `metadata`
# line up with the rows/columns of `cosine_sim`.
metadata = articles_df.reset_index()

# Reverse lookup: article title -> row position.
indices = pd.Series(metadata.index, index=metadata['title'])
# Series.drop_duplicates() compares the *values* (row positions, which are
# always unique) and therefore never removed anything. Duplicates have to be
# dropped on the title index so that indices[title] is always one scalar.
indices = indices[~indices.index.duplicated(keep='first')]
display(indices[:10])
def get_recommendations(title, indices, cosine_sim, data, top_n=10):
    """Return the titles of the articles most similar to a given article.

    Args:
        title: Title of the article to find recommendations for.
        indices: Series mapping article title -> row position.
        cosine_sim: 2-D array-like; row ``i`` holds the similarity scores of
            article ``i`` against every article.
        data: DataFrame with a ``'title'`` column, positionally aligned with
            ``cosine_sim``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        pandas.Series: Titles of up to ``top_n`` articles, most similar
        first, never including the queried article itself.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get the row position of the article that matches the title.
    idx = indices[title]
    if hasattr(idx, 'iloc'):
        # A title that occurs more than once makes the lookup return a
        # Series; fall back to its first occurrence.
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every article by its similarity to the queried one.
    ranked = sorted(
        enumerate(cosine_sim[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Drop the queried article explicitly instead of assuming it is ranked
    # first: ties (e.g. identical texts) or an all-zero similarity row can
    # place a different article at position 0.
    article_indices = [pos for pos, _ in ranked if pos != idx][:top_n]

    # Return the titles of the most similar articles.
    return data['title'].iloc[article_indices]


# Demo call; it has to come after the function definition.
print(get_recommendations(
    'The Rise And Growth of Ethereum Gets Mainstream Coverage',
    indices, cosine_sim, metadata))
# Authored by:
# https://medium.com/web-mining-is688-spring-2021/article-recommendation-system-using-python-8b0fec6e6de8
# Dataset (shared_articles.csv) download link:
# https://www.kaggle.com/datasets/gspmoreira/articles-sharing-reading-from-cit-deskdrop?select=shared_articles.csv