import numpy as np
import pandas as pd
from recommender import remove_stop_words, recommend_top_three

# Load the exported posts table.
posts = pd.read_csv("data/post_production_2024_06_21_12_54.csv")

# Clean the description text in a single pipeline:
#   1. replace HTML tags with a space,
#   2. replace runs of carriage returns / newlines / tabs with a space,
#   3. replace the literal '&nbsp;' entity with a space.
cleaned_description = (
    posts['post_description']
    .str.replace(r'<.*?>', ' ', regex=True)
    .str.replace(r'\r+|\n+|\t+', ' ', regex=True)
    .str.replace('&nbsp;', ' ')
)
posts['post_description'] = cleaned_description

# 'tags' starts out as the cleaned description; later steps normalise it.
posts['tags'] = cleaned_description

# Keep only the columns used downstream, dropping the rest. The explicit
# .copy() makes new_dataset an independent frame, so the column assignments
# below do not raise pandas' SettingWithCopyWarning or depend on whether the
# selection is a view of `posts`.
new_dataset = posts[['post_id', 'post_title', 'categories', 'tags']].copy()

# Convert to lower case. Rows with a missing description hold NaN (a float),
# which has no .lower(); replace them with an empty string first so a single
# blank row cannot abort the whole run.
new_dataset['tags'] = new_dataset['tags'].fillna('').str.lower()

# Apply the project's remove_stop_words helper (imported from `recommender`)
# to every row; presumably it strips stop words from the text - see that
# module for the exact behaviour.
new_dataset['tags'] = new_dataset['tags'].apply(remove_stop_words)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words model over the tag text: English stop words are ignored and
# the vocabulary is capped at 5000 features.
cv = CountVectorizer(stop_words='english', max_features=5000)

# fit_transform returns a sparse document-term matrix; densify it so `vector`
# is a plain NumPy array with one row per post.
term_counts = cv.fit_transform(new_dataset['tags'])
vector = term_counts.toarray()

# Pairwise cosine similarity between every pair of rows of `vector`.
similarity = cosine_similarity(vector)

import pickle

# Persist the cleaned post table and the similarity matrix. The context
# managers guarantee each file is flushed and closed as soon as it has been
# written, instead of leaving the handles open until garbage collection.
with open('clymb_resource_list.pkl', 'wb') as resource_file:
    pickle.dump(new_dataset, resource_file)

with open('clymb_resource_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# Smoke test of the recommender with two hard-coded comma-separated strings.
# NOTE(review): the first looks like a list of categories and the second like
# free-text responses - confirm against recommend_top_three's signature.
sample_first_argument = "Social Awareness yes,Self-Awareness,Responsible Decision Making"
sample_second_argument = "Angry, I don’t, Highly Motivated, I am open to accepting and listening to new people., Trying to communicate, Trying to work together, Thinking about asking for help, Trying to ignore peer pressure"

top_three_post = recommend_top_three(sample_first_argument, sample_second_argument)
print(top_three_post)