diff --git a/.gitignore b/.gitignore
index ca2b75a..3fa3a99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,9 @@
 __pycache__/
 *.pyc
 *.jsonl
+*.json
 *.code-workspace
 .env
-topic_buckets.txt
 
 # React App Vite
 node_modules/
diff --git a/server/app.py b/server/app.py
index 48e757a..7e062b9 100644
--- a/server/app.py
+++ b/server/app.py
@@ -4,6 +4,7 @@
 from server.stat_gen import StatGen
 import pandas as pd
 import traceback
+import json
 
 app = Flask(__name__)
 
@@ -13,7 +14,8 @@ CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
 # Global State
 posts_df = pd.read_json('posts.jsonl', lines=True)
 comments_df = pd.read_json('comments.jsonl', lines=True)
-domain_topics = open("topic_buckets.txt").read().splitlines()
+with open("topic_buckets.json", "r", encoding="utf-8") as f:
+    domain_topics = json.load(f)
 stat_obj = StatGen(posts_df, comments_df, domain_topics)
 
 @app.route('/upload', methods=['POST'])
@@ -28,15 +30,15 @@ def upload_data():
     if post_file.filename == "" or comment_file.filename == "" or topic_file == "":
         return jsonify({"error": "Empty filename"}), 400
 
-    if not post_file.filename.endswith('.jsonl') or not comment_file.filename.endswith('.jsonl') or not topic_file.endswith('.txt'):
-        return jsonify({"error": "Invalid file type. Only .jsonl and .txt files are allowed."}), 400
+    if not post_file.filename.endswith('.jsonl') or not comment_file.filename.endswith('.jsonl') or not topic_file.endswith('.json'):
+        return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400
 
 
     try:
         global stat_obj
         posts_df = pd.read_json(post_file, lines=True)
         comments_df = pd.read_json(comment_file, lines=True)
-        stat_obj = StatGen(posts_df, comments_df, topic_file.splitlines())
+        stat_obj = StatGen(posts_df, comments_df, json.load(topic_file))
         return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
     except ValueError as e:
         return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
diff --git a/server/nlp.py b/server/nlp.py
index d5a04f3..dbc9021 100644
--- a/server/nlp.py
+++ b/server/nlp.py
@@ -1,13 +1,12 @@
 import torch
 import pandas as pd
-import numpy as np
 from transformers import pipeline
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 
 
-model = SentenceTransformer("all-MiniLM-L6-v2", device=0 if torch.cuda.is_available() else 1)
+model = SentenceTransformer("all-mpnet-base-v2", device=0 if torch.cuda.is_available() else 1)
 
 
 def add_emotion_cols(
     df: pd.DataFrame,
@@ -38,16 +37,28 @@
 def add_topic_col(
     df: pd.DataFrame,
+    title_col: str,
     content_col: str,
-    domain_topics: list[str],
-    confidence_threshold: float = 0.15
+    domain_topics: dict,
+    confidence_threshold: float = 0.20
 ) -> None:
+
+    topic_labels = list(domain_topics.keys())
+    topic_texts = list(domain_topics.values())
+
     topic_embeddings = model.encode(
-        domain_topics,
+        topic_texts,
         normalize_embeddings=True,
     )
 
-    texts = df[content_col].astype(str).tolist()
+    titles = df[title_col].fillna("").astype(str)
+    contents = df[content_col].fillna("").astype(str)
+
+    texts = [
+        f"{title}. {content}" if title else content
+        for title, content in zip(titles, contents)
+    ]
+
     text_embeddings = model.encode(
         texts,
         normalize_embeddings=True,
@@ -60,8 +71,9 @@ def add_topic_col(
 
     best_idx = sims.argmax(axis=1)
     best_score = sims.max(axis=1)
 
-    df["topic"] = [domain_topics[i] for i in best_idx]
+    df["topic"] = [topic_labels[i] for i in best_idx]
     df["topic_confidence"] = best_score
+    df.loc[df["topic_confidence"] < confidence_threshold, "topic"] = "Misc"
 
     return df
\ No newline at end of file
diff --git a/server/stat_gen.py b/server/stat_gen.py
index d2beed6..35abb08 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -42,7 +42,7 @@ class StatGen:
         df["weekday"] = df["dt"].dt.day_name()
 
         add_emotion_cols(df, "content")
-        add_topic_col(df, "content", self.domain_topics)
+        add_topic_col(df, "title", "content", self.domain_topics)
 
     def _tokenize(self, text: str):
         tokens = re.findall(r"\b[a-z]{3,}\b", text)