Storage of user data and datasets in PostGreSQL #2
2
.gitignore
vendored
2
.gitignore
vendored
@@ -9,3 +9,5 @@ __pycache__/
|
|||||||
# React App Vite
|
# React App Vite
|
||||||
node_modules/
|
node_modules/
|
||||||
dist/
|
dist/
|
||||||
|
|
||||||
|
*.sh
|
||||||
138
db/database.py
Normal file
138
db/database.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
import os
|
||||||
|
import psycopg2
|
||||||
|
import pandas as pd
|
||||||
|
from psycopg2.extras import RealDictCursor
|
||||||
|
from psycopg2.extras import execute_batch, Json
|
||||||
|
|
||||||
|
|
||||||
|
class PostgresConnector:
    """PostgreSQL connector holding a single shared connection.

    Connection parameters are read from the POSTGRES_* environment
    variables, falling back to localhost defaults suitable for local
    development.  Autocommit is disabled; every write helper commits
    explicitly and rolls back on failure so the connection never stays
    in an aborted-transaction state.
    """

    def __init__(self):
        # NOTE(review): the "postgres"/"postgres" fallbacks are for local
        # development only — production must supply real POSTGRES_* vars.
        self.connection = psycopg2.connect(
            host=os.getenv("POSTGRES_HOST", "localhost"),
            port=os.getenv("POSTGRES_PORT", 5432),
            user=os.getenv("POSTGRES_USER", "postgres"),
            password=os.getenv("POSTGRES_PASSWORD", "postgres"),
            database=os.getenv("POSTGRES_DB", "postgres"),
        )
        self.connection.autocommit = False

    def execute(self, query, params=None, fetch=False) -> list:
        """Run a single statement and commit.

        When ``fetch`` is True, return the rows as a list of dicts
        (RealDictCursor); otherwise return None.

        Bug fix: the original committed only on the non-fetch path, so
        ``INSERT ... RETURNING`` used with ``fetch=True`` (see
        ``save_dataset_info``) was never committed and the row was lost
        because autocommit is off.  We now commit on both paths (a commit
        after a plain SELECT is harmless — it just ends the transaction).
        On error the transaction is rolled back and the exception re-raised.
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.execute(query, params)
                rows = cursor.fetchall() if fetch else None
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise
        return rows

    def executemany(self, query, param_list) -> None:
        """Run ``query`` once per parameter tuple and commit.

        Returns None (the original ``-> list`` annotation was wrong — no
        value was ever produced).  Rolls back on failure.
        """
        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                cursor.executemany(query, param_list)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    def _fetch_one(self, query, params) -> dict:
        """Run a fetching query and return the first row, or None."""
        rows = self.execute(query, params, fetch=True)
        return rows[0] if rows else None

    ## User Management Methods

    def save_user(self, username, email, password_hash):
        """Insert a new user row (the UNIQUE constraints raise on duplicates)."""
        query = """
            INSERT INTO users (username, email, password_hash)
            VALUES (%s, %s, %s)
        """
        self.execute(query, (username, email, password_hash))

    def get_user_by_username(self, username) -> dict:
        """Return the user row for ``username``, or None if absent."""
        return self._fetch_one(
            "SELECT id, username, email, password_hash FROM users WHERE username = %s",
            (username,),
        )

    def get_user_by_email(self, email) -> dict:
        """Return the user row for ``email``, or None if absent."""
        return self._fetch_one(
            "SELECT id, username, email, password_hash FROM users WHERE email = %s",
            (email,),
        )

    # Dataset Management Methods

    def save_dataset_info(self, user_id: int, dataset_name: str, topics: dict) -> int:
        """Create a dataset row and return its generated id (None on no row)."""
        query = """
            INSERT INTO datasets (user_id, name, topics)
            VALUES (%s, %s, %s)
            RETURNING id
        """
        row = self._fetch_one(query, (user_id, dataset_name, Json(topics)))
        return row["id"] if row else None

    def save_dataset_content(self, dataset_id: int, event_data: pd.DataFrame):
        """Bulk-insert the enriched events of one dataset.

        Uses ``execute_batch`` for efficiency, commits once at the end, and
        rolls back on failure so a partially written dataset is never left
        behind.  Optional columns are read with ``row.get`` so missing
        values become NULL.
        """
        query = """
            INSERT INTO events (
                dataset_id, type, parent_id, author, content,
                timestamp, date, dt, hour, weekday,
                reply_to, source, topic, topic_confidence, ner_entities,
                emotion_anger, emotion_disgust, emotion_fear,
                emotion_joy, emotion_sadness
            )
            VALUES (
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s,
                %s, %s, %s, %s, %s
            )
        """

        values = [
            (
                dataset_id,
                row["type"],
                row["parent_id"],
                row["author"],
                row["content"],
                row["timestamp"],
                row["date"],
                row["dt"],
                row["hour"],
                row["weekday"],
                row.get("reply_to"),
                row["source"],
                row.get("topic"),
                row.get("topic_confidence"),
                # JSONB column: wrap in Json only when a value is present.
                Json(row["ner_entities"]) if row.get("ner_entities") else None,
                row.get("emotion_anger"),
                row.get("emotion_disgust"),
                row.get("emotion_fear"),
                row.get("emotion_joy"),
                row.get("emotion_sadness"),
            )
            for _, row in event_data.iterrows()
        ]

        try:
            with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
                execute_batch(cursor, query, values)
            self.connection.commit()
        except Exception:
            self.connection.rollback()
            raise

    def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
        """Return all events of a dataset as a DataFrame (empty if none)."""
        query = "SELECT * FROM events WHERE dataset_id = %s"
        return pd.DataFrame(self.execute(query, (dataset_id,), fetch=True))

    def get_dataset_info(self, dataset_id: int) -> dict:
        """Return the datasets row for ``dataset_id``, or None if absent."""
        return self._fetch_one("SELECT * FROM datasets WHERE id = %s", (dataset_id,))

    def close(self):
        """Close the underlying connection."""
        if self.connection:
            self.connection.close()
||||||
51
db/schema.sql
Normal file
51
db/schema.sql
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
-- Application users.  username and email are unique login identifiers;
-- password_hash stores a hash, never the plain-text password.
CREATE TABLE users (
    id SERIAL PRIMARY KEY,
    username VARCHAR(255) NOT NULL UNIQUE,
    email VARCHAR(255) NOT NULL UNIQUE,
    password_hash VARCHAR(255) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

-- One uploaded dataset per row, owned by a user.  topics holds the
-- topic-bucket configuration as JSONB.  Deleting a user cascades here.
CREATE TABLE datasets (
    id SERIAL PRIMARY KEY,
    user_id INTEGER NOT NULL,
    name VARCHAR(255) NOT NULL,
    description TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    topics JSONB,
    FOREIGN KEY (user_id) REFERENCES users(id) ON DELETE CASCADE
);

-- Individual posts/comments belonging to a dataset.  Deleting a dataset
-- cascades to its events.
CREATE TABLE events (
    /* Required Fields */
    id SERIAL PRIMARY KEY,
    dataset_id INTEGER NOT NULL,
    type VARCHAR(255) NOT NULL,

    author VARCHAR(255) NOT NULL,
    content TEXT NOT NULL,
    timestamp BIGINT NOT NULL,
    date DATE NOT NULL,
    dt TIMESTAMP NOT NULL,
    hour INTEGER NOT NULL,
    weekday VARCHAR(255) NOT NULL,

    /* Comments and Replies */
    parent_id VARCHAR(255),
    reply_to VARCHAR(255),
    source VARCHAR(255) NOT NULL,

    /* NLP Fields */
    topic VARCHAR(255),
    topic_confidence FLOAT,

    ner_entities JSONB,

    emotion_anger FLOAT,
    emotion_disgust FLOAT,
    emotion_fear FLOAT,
    emotion_joy FLOAT,
    emotion_sadness FLOAT,

    FOREIGN KEY (dataset_id) REFERENCES datasets(id) ON DELETE CASCADE
);
||||||
15
docker-compose.yml
Normal file
15
docker-compose.yml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
services:
  postgres:
    image: postgres:16
    container_name: postgres_db
    restart: unless-stopped
    # Credentials (POSTGRES_USER/PASSWORD/DB) come from .env.
    env_file:
      - .env
    ports:
      - "5432:5432"
    volumes:
      # Bind mounts: data directory persisted under ./db/postgres_vol;
      # schema.sql is applied automatically by the image entrypoint on
      # FIRST start only (an existing data dir skips init scripts).
      - ./db/postgres_vol:/var/lib/postgresql/data
      - ./db/schema.sql:/docker-entrypoint-initdb.d/schema.sql

# Fix: removed the top-level named volume "postgres_data" — it was declared
# but never referenced (the service persists via the bind mount above).
||||||
@@ -1,10 +1,12 @@
|
|||||||
beautifulsoup4==4.14.3
|
beautifulsoup4==4.14.3
|
||||||
Flask==3.1.2
|
Flask==3.1.3
|
||||||
flask_cors==6.0.2
|
flask_cors==6.0.2
|
||||||
google_api_python_client==2.188.0
|
google_api_python_client==2.188.0
|
||||||
keybert==0.9.0
|
|
||||||
nltk==3.9.2
|
nltk==3.9.2
|
||||||
pandas==3.0.0
|
numpy==2.4.2
|
||||||
|
pandas==3.0.1
|
||||||
|
psycopg2==2.9.11
|
||||||
|
psycopg2_binary==2.9.11
|
||||||
python-dotenv==1.2.1
|
python-dotenv==1.2.1
|
||||||
Requests==2.32.5
|
Requests==2.32.5
|
||||||
sentence_transformers==5.2.2
|
sentence_transformers==5.2.2
|
||||||
|
|||||||
@@ -6,13 +6,12 @@ from typing import Any
|
|||||||
|
|
||||||
|
|
||||||
class CulturalAnalysis:
|
class CulturalAnalysis:
|
||||||
def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"):
|
def __init__(self, content_col: str = "content", topic_col: str = "topic"):
|
||||||
self.df = df
|
|
||||||
self.content_col = content_col
|
self.content_col = content_col
|
||||||
self.topic_col = topic_col
|
self.topic_col = topic_col
|
||||||
|
|
||||||
def get_identity_markers(self):
|
def get_identity_markers(self, original_df: pd.DataFrame) -> dict[str, Any]:
|
||||||
df = self.df.copy()
|
df = original_df.copy()
|
||||||
s = df[self.content_col].fillna("").astype(str).str.lower()
|
s = df[self.content_col].fillna("").astype(str).str.lower()
|
||||||
|
|
||||||
in_group_words = {"we", "us", "our", "ourselves"}
|
in_group_words = {"we", "us", "our", "ourselves"}
|
||||||
@@ -60,8 +59,8 @@ class CulturalAnalysis:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def get_stance_markers(self) -> dict[str, Any]:
|
def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
|
||||||
s = self.df[self.content_col].fillna("").astype(str)
|
s = df[self.content_col].fillna("").astype(str)
|
||||||
|
|
||||||
hedges = {
|
hedges = {
|
||||||
"maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
|
"maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
|
||||||
@@ -104,13 +103,11 @@ class CulturalAnalysis:
|
|||||||
"permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
|
"permission_per_1k_tokens": round(1000 * perm_counts.sum() / token_counts.sum(), 3),
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
|
def get_avg_emotions_per_entity(self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
|
||||||
if "entities" not in self.df.columns:
|
if "entities" not in df.columns:
|
||||||
return {"entity_emotion_avg": {}}
|
return {"entity_emotion_avg": {}}
|
||||||
|
|
||||||
df = self.df
|
|
||||||
emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
|
emotion_cols = [c for c in df.columns if c.startswith("emotion_")]
|
||||||
|
|
||||||
entity_counter = Counter()
|
entity_counter = Counter()
|
||||||
|
|
||||||
for row in df["entities"].dropna():
|
for row in df["entities"].dropna():
|
||||||
|
|||||||
@@ -1,18 +1,15 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
class EmotionalAnalysis:
|
class EmotionalAnalysis:
|
||||||
def __init__(self, df: pd.DataFrame):
|
def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
|
||||||
self.df = df
|
|
||||||
|
|
||||||
def avg_emotion_by_topic(self) -> dict:
|
|
||||||
emotion_cols = [
|
emotion_cols = [
|
||||||
col for col in self.df.columns
|
col for col in df.columns
|
||||||
if col.startswith("emotion_")
|
if col.startswith("emotion_")
|
||||||
]
|
]
|
||||||
|
|
||||||
counts = (
|
counts = (
|
||||||
self.df[
|
df[
|
||||||
(self.df["topic"] != "Misc")
|
(df["topic"] != "Misc")
|
||||||
]
|
]
|
||||||
.groupby("topic")
|
.groupby("topic")
|
||||||
.size()
|
.size()
|
||||||
@@ -20,8 +17,8 @@ class EmotionalAnalysis:
|
|||||||
)
|
)
|
||||||
|
|
||||||
avg_emotion_by_topic = (
|
avg_emotion_by_topic = (
|
||||||
self.df[
|
df[
|
||||||
(self.df["topic"] != "Misc")
|
(df["topic"] != "Misc")
|
||||||
]
|
]
|
||||||
.groupby("topic")[emotion_cols]
|
.groupby("topic")[emotion_cols]
|
||||||
.mean()
|
.mean()
|
||||||
|
|||||||
@@ -5,8 +5,7 @@ from collections import Counter
|
|||||||
|
|
||||||
|
|
||||||
class InteractionAnalysis:
|
class InteractionAnalysis:
|
||||||
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
|
def __init__(self, word_exclusions: set[str]):
|
||||||
self.df = df
|
|
||||||
self.word_exclusions = word_exclusions
|
self.word_exclusions = word_exclusions
|
||||||
|
|
||||||
def _tokenize(self, text: str):
|
def _tokenize(self, text: str):
|
||||||
@@ -14,9 +13,9 @@ class InteractionAnalysis:
|
|||||||
return [t for t in tokens if t not in self.word_exclusions]
|
return [t for t in tokens if t not in self.word_exclusions]
|
||||||
|
|
||||||
def _vocab_richness_per_user(
|
def _vocab_richness_per_user(
|
||||||
self, min_words: int = 20, top_most_used_words: int = 100
|
self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
|
||||||
) -> list:
|
) -> list:
|
||||||
df = self.df.copy()
|
df = df.copy()
|
||||||
df["content"] = df["content"].fillna("").astype(str).str.lower()
|
df["content"] = df["content"].fillna("").astype(str).str.lower()
|
||||||
df["tokens"] = df["content"].apply(self._tokenize)
|
df["tokens"] = df["content"].apply(self._tokenize)
|
||||||
|
|
||||||
@@ -58,10 +57,8 @@ class InteractionAnalysis:
|
|||||||
|
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
def top_users(self) -> list:
|
def top_users(self, df: pd.DataFrame) -> list:
|
||||||
counts = (
|
counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
|
||||||
self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
|
|
||||||
)
|
|
||||||
|
|
||||||
top_users = [
|
top_users = [
|
||||||
{"author": author, "source": source, "count": int(count)}
|
{"author": author, "source": source, "count": int(count)}
|
||||||
@@ -70,14 +67,14 @@ class InteractionAnalysis:
|
|||||||
|
|
||||||
return top_users
|
return top_users
|
||||||
|
|
||||||
def per_user_analysis(self) -> dict:
|
def per_user_analysis(self, df: pd.DataFrame) -> dict:
|
||||||
per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0)
|
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
|
||||||
|
|
||||||
emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")]
|
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
|
||||||
|
|
||||||
avg_emotions_by_author = {}
|
avg_emotions_by_author = {}
|
||||||
if emotion_cols:
|
if emotion_cols:
|
||||||
avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0)
|
avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
|
||||||
avg_emotions_by_author = {
|
avg_emotions_by_author = {
|
||||||
author: {emotion: float(score) for emotion, score in row.items()}
|
author: {emotion: float(score) for emotion, score in row.items()}
|
||||||
for author, row in avg_emotions.iterrows()
|
for author, row in avg_emotions.iterrows()
|
||||||
@@ -97,7 +94,7 @@ class InteractionAnalysis:
|
|||||||
per_user = per_user.sort_values("comment_post_ratio", ascending=True)
|
per_user = per_user.sort_values("comment_post_ratio", ascending=True)
|
||||||
per_user_records = per_user.reset_index().to_dict(orient="records")
|
per_user_records = per_user.reset_index().to_dict(orient="records")
|
||||||
|
|
||||||
vocab_rows = self._vocab_richness_per_user()
|
vocab_rows = self._vocab_richness_per_user(df)
|
||||||
vocab_by_author = {row["author"]: row for row in vocab_rows}
|
vocab_by_author = {row["author"]: row for row in vocab_rows}
|
||||||
|
|
||||||
# merge vocab richness + per_user information
|
# merge vocab richness + per_user information
|
||||||
@@ -112,7 +109,14 @@ class InteractionAnalysis:
|
|||||||
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
|
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
|
||||||
"comment_share": float(row.get("comment_share", 0)),
|
"comment_share": float(row.get("comment_share", 0)),
|
||||||
"avg_emotions": avg_emotions_by_author.get(author, {}),
|
"avg_emotions": avg_emotions_by_author.get(author, {}),
|
||||||
"vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}),
|
"vocab": vocab_by_author.get(
|
||||||
|
author,
|
||||||
|
{
|
||||||
|
"vocab_richness": 0,
|
||||||
|
"avg_words_per_event": 0,
|
||||||
|
"top_words": [],
|
||||||
|
},
|
||||||
|
),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -120,13 +124,13 @@ class InteractionAnalysis:
|
|||||||
|
|
||||||
return merged_users
|
return merged_users
|
||||||
|
|
||||||
def interaction_graph(self):
|
def interaction_graph(self, df: pd.DataFrame):
|
||||||
interactions = {a: {} for a in self.df["author"].dropna().unique()}
|
interactions = {a: {} for a in df["author"].dropna().unique()}
|
||||||
|
|
||||||
# reply_to refers to the comment id, this allows us to map comment ids to usernames
|
# reply_to refers to the comment id, this allows us to map comment ids to usernames
|
||||||
id_to_author = self.df.set_index("id")["author"].to_dict()
|
id_to_author = df.set_index("id")["author"].to_dict()
|
||||||
|
|
||||||
for _, row in self.df.iterrows():
|
for _, row in df.iterrows():
|
||||||
a = row["author"]
|
a = row["author"]
|
||||||
reply_id = row["reply_to"]
|
reply_id = row["reply_to"]
|
||||||
|
|
||||||
@@ -141,10 +145,10 @@ class InteractionAnalysis:
|
|||||||
|
|
||||||
return interactions
|
return interactions
|
||||||
|
|
||||||
def average_thread_depth(self):
|
def average_thread_depth(self, df: pd.DataFrame):
|
||||||
depths = []
|
depths = []
|
||||||
id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
|
id_to_reply = df.set_index("id")["reply_to"].to_dict()
|
||||||
for _, row in self.df.iterrows():
|
for _, row in df.iterrows():
|
||||||
depth = 0
|
depth = 0
|
||||||
current_id = row["id"]
|
current_id = row["id"]
|
||||||
|
|
||||||
@@ -163,16 +167,16 @@ class InteractionAnalysis:
|
|||||||
|
|
||||||
return round(sum(depths) / len(depths), 2)
|
return round(sum(depths) / len(depths), 2)
|
||||||
|
|
||||||
def average_thread_length_by_emotion(self):
|
def average_thread_length_by_emotion(self, df: pd.DataFrame):
|
||||||
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
|
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
|
||||||
|
|
||||||
emotion_cols = [
|
emotion_cols = [
|
||||||
c
|
c
|
||||||
for c in self.df.columns
|
for c in df.columns
|
||||||
if c.startswith("emotion_") and c not in emotion_exclusions
|
if c.startswith("emotion_") and c not in emotion_exclusions
|
||||||
]
|
]
|
||||||
|
|
||||||
id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
|
id_to_reply = df.set_index("id")["reply_to"].to_dict()
|
||||||
length_cache = {}
|
length_cache = {}
|
||||||
|
|
||||||
def thread_length_from(start_id):
|
def thread_length_from(start_id):
|
||||||
@@ -211,7 +215,7 @@ class InteractionAnalysis:
|
|||||||
emotion_to_lengths = {}
|
emotion_to_lengths = {}
|
||||||
|
|
||||||
# Fill NaNs in emotion cols to avoid max() issues
|
# Fill NaNs in emotion cols to avoid max() issues
|
||||||
emo_df = self.df[["id"] + emotion_cols].copy()
|
emo_df = df[["id"] + emotion_cols].copy()
|
||||||
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
|
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
|
||||||
|
|
||||||
for _, row in emo_df.iterrows():
|
for _, row in emo_df.iterrows():
|
||||||
|
|||||||
@@ -4,9 +4,9 @@ import re
|
|||||||
from collections import Counter
|
from collections import Counter
|
||||||
from itertools import islice
|
from itertools import islice
|
||||||
|
|
||||||
|
|
||||||
class LinguisticAnalysis:
|
class LinguisticAnalysis:
|
||||||
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
|
def __init__(self, word_exclusions: set[str]):
|
||||||
self.df = df
|
|
||||||
self.word_exclusions = word_exclusions
|
self.word_exclusions = word_exclusions
|
||||||
|
|
||||||
def _tokenize(self, text: str):
|
def _tokenize(self, text: str):
|
||||||
@@ -21,22 +21,13 @@ class LinguisticAnalysis:
|
|||||||
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
|
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def word_frequencies(self, limit: int = 100) -> dict:
|
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
|
||||||
texts = (
|
texts = df["content"].dropna().astype(str).str.lower()
|
||||||
self.df["content"]
|
|
||||||
.dropna()
|
|
||||||
.astype(str)
|
|
||||||
.str.lower()
|
|
||||||
)
|
|
||||||
|
|
||||||
words = []
|
words = []
|
||||||
for text in texts:
|
for text in texts:
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
||||||
words.extend(
|
words.extend(w for w in tokens if w not in self.word_exclusions)
|
||||||
w for w in tokens
|
|
||||||
if w not in self.word_exclusions
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
counts = Counter(words)
|
counts = Counter(words)
|
||||||
|
|
||||||
@@ -49,8 +40,8 @@ class LinguisticAnalysis:
|
|||||||
|
|
||||||
return word_frequencies.to_dict(orient="records")
|
return word_frequencies.to_dict(orient="records")
|
||||||
|
|
||||||
def ngrams(self, n=2, limit=100):
|
def ngrams(self, df: pd.DataFrame, n=2, limit=100):
|
||||||
texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
|
texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
|
||||||
all_ngrams = []
|
all_ngrams = []
|
||||||
|
|
||||||
for text in texts:
|
for text in texts:
|
||||||
|
|||||||
@@ -1,16 +1,14 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
class TemporalAnalysis:
|
|
||||||
def __init__(self, df: pd.DataFrame):
|
|
||||||
self.df = df
|
|
||||||
|
|
||||||
def avg_reply_time_per_emotion(self) -> dict:
|
class TemporalAnalysis:
|
||||||
df = self.df.copy()
|
def avg_reply_time_per_emotion(self, df: pd.DataFrame) -> list[dict]:
|
||||||
|
df = df.copy()
|
||||||
|
|
||||||
replies = df[
|
replies = df[
|
||||||
(df["type"] == "comment") &
|
(df["type"] == "comment")
|
||||||
(df["reply_to"].notna()) &
|
& (df["reply_to"].notna())
|
||||||
(df["reply_to"] != "")
|
& (df["reply_to"] != "")
|
||||||
]
|
]
|
||||||
|
|
||||||
id_to_time = df.set_index("id")["dt"].to_dict()
|
id_to_time = df.set_index("id")["dt"].to_dict()
|
||||||
@@ -25,42 +23,45 @@ class TemporalAnalysis:
|
|||||||
return (row["dt"] - parent_time).total_seconds()
|
return (row["dt"] - parent_time).total_seconds()
|
||||||
|
|
||||||
replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
|
replies["reply_time"] = replies.apply(compute_reply_time, axis=1)
|
||||||
emotion_cols = [col for col in df.columns if col.startswith("emotion_") and col not in ("emotion_neutral", "emotion_surprise")]
|
emotion_cols = [
|
||||||
|
col
|
||||||
|
for col in df.columns
|
||||||
|
if col.startswith("emotion_")
|
||||||
|
and col not in ("emotion_neutral", "emotion_surprise")
|
||||||
|
]
|
||||||
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
|
replies["dominant_emotion"] = replies[emotion_cols].idxmax(axis=1)
|
||||||
|
|
||||||
grouped = (
|
grouped = (
|
||||||
replies
|
replies.groupby("dominant_emotion")["reply_time"]
|
||||||
.groupby("dominant_emotion")["reply_time"]
|
|
||||||
.agg(["mean", "count"])
|
.agg(["mean", "count"])
|
||||||
.reset_index()
|
.reset_index()
|
||||||
)
|
)
|
||||||
|
|
||||||
return grouped.to_dict(orient="records")
|
return grouped.to_dict(orient="records")
|
||||||
|
|
||||||
def posts_per_day(self) -> dict:
|
def posts_per_day(self, df: pd.DataFrame) -> list[dict]:
|
||||||
per_day = (
|
per_day = df.groupby("date").size().reset_index(name="count")
|
||||||
self.df.groupby("date")
|
|
||||||
.size()
|
|
||||||
.reset_index(name="count")
|
|
||||||
)
|
|
||||||
|
|
||||||
return per_day.to_dict(orient="records")
|
return per_day.to_dict(orient="records")
|
||||||
|
|
||||||
def heatmap(self) -> dict:
|
def heatmap(self, df: pd.DataFrame) -> list[dict]:
|
||||||
weekday_order = [
|
weekday_order = [
|
||||||
"Monday", "Tuesday", "Wednesday",
|
"Monday",
|
||||||
"Thursday", "Friday", "Saturday", "Sunday"
|
"Tuesday",
|
||||||
|
"Wednesday",
|
||||||
|
"Thursday",
|
||||||
|
"Friday",
|
||||||
|
"Saturday",
|
||||||
|
"Sunday",
|
||||||
]
|
]
|
||||||
|
|
||||||
self.df["weekday"] = pd.Categorical(
|
df = df.copy()
|
||||||
self.df["weekday"],
|
df["weekday"] = pd.Categorical(
|
||||||
categories=weekday_order,
|
df["weekday"], categories=weekday_order, ordered=True
|
||||||
ordered=True
|
|
||||||
)
|
)
|
||||||
|
|
||||||
heatmap = (
|
heatmap = (
|
||||||
self.df
|
df.groupby(["weekday", "hour"], observed=True)
|
||||||
.groupby(["weekday", "hour"], observed=True)
|
|
||||||
.size()
|
.size()
|
||||||
.unstack(fill_value=0)
|
.unstack(fill_value=0)
|
||||||
.reindex(columns=range(24), fill_value=0)
|
.reindex(columns=range(24), fill_value=0)
|
||||||
|
|||||||
386
server/app.py
386
server/app.py
@@ -1,23 +1,110 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
from flask import Flask, jsonify, request
|
from flask import Flask, jsonify, request
|
||||||
from flask_cors import CORS
|
from flask_cors import CORS
|
||||||
|
from flask_bcrypt import Bcrypt
|
||||||
|
from flask_jwt_extended import (
|
||||||
|
JWTManager,
|
||||||
|
create_access_token,
|
||||||
|
jwt_required,
|
||||||
|
get_jwt_identity,
|
||||||
|
)
|
||||||
|
|
||||||
from server.stat_gen import StatGen
|
from server.stat_gen import StatGen
|
||||||
|
from server.dataset_processor import DatasetProcessor
|
||||||
|
from db.database import PostgresConnector
|
||||||
|
from server.auth import AuthManager
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import traceback
|
import traceback
|
||||||
import json
|
import json
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
db = PostgresConnector()
|
||||||
|
|
||||||
# Allow for CORS from localhost:5173
|
# Env Variables
|
||||||
CORS(app, resources={r"/*": {"origins": "http://localhost:5173"}})
|
load_dotenv()
|
||||||
|
frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")
|
||||||
|
jwt_secret_key = os.getenv("JWT_SECRET_KEY", "super-secret-change-this")
|
||||||
|
jwt_access_token_expires = int(
|
||||||
|
os.getenv("JWT_ACCESS_TOKEN_EXPIRES", 1200)
|
||||||
|
) # Default to 20 minutes
|
||||||
|
|
||||||
# Global State
|
# Flask Configuration
|
||||||
posts_df = pd.read_json('small.jsonl', lines=True)
|
CORS(app, resources={r"/*": {"origins": frontend_url}})
|
||||||
with open("topic_buckets.json", "r", encoding="utf-8") as f:
|
app.config["JWT_SECRET_KEY"] = jwt_secret_key
|
||||||
domain_topics = json.load(f)
|
app.config["JWT_ACCESS_TOKEN_EXPIRES"] = jwt_access_token_expires
|
||||||
stat_obj = StatGen(posts_df, domain_topics)
|
|
||||||
|
|
||||||
@app.route('/upload', methods=['POST'])
|
bcrypt = Bcrypt(app)
|
||||||
|
jwt = JWTManager(app)
|
||||||
|
auth_manager = AuthManager(db, bcrypt)
|
||||||
|
|
||||||
|
stat_gen = StatGen()
|
||||||
|
|
||||||
|
|
||||||
|
@app.route("/register", methods=["POST"])
def register_user():
    """Create a new account from a JSON body with username/email/password.

    Returns 400 for a missing field or a validation failure raised by the
    auth manager (ValueError), 500 for anything unexpected, 200 on success.
    """
    payload = request.get_json()

    required = ("username", "email", "password")
    if not payload or any(field not in payload for field in required):
        return jsonify({"error": "Missing username, email, or password"}), 400

    username = payload["username"]

    try:
        auth_manager.register_user(username, payload["email"], payload["password"])
    except ValueError as e:
        # Validation errors (e.g. duplicate user) surface as a client error.
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500

    print(f"Registered new user: {username}")
    return jsonify({"message": f"User '{username}' registered successfully"}), 200
||||||
|
|
||||||
|
|
||||||
|
@app.route("/login", methods=["POST"])
def login_user():
    """Verify credentials and issue a JWT access token.

    Returns 400 for a missing field, 401 for bad credentials,
    500 for anything unexpected.
    """
    payload = request.get_json()

    if not payload or "username" not in payload or "password" not in payload:
        return jsonify({"error": "Missing username or password"}), 400

    try:
        user = auth_manager.authenticate_user(payload["username"], payload["password"])
        if not user:
            return jsonify({"error": "Invalid username or password"}), 401
        # Token identity is the user's id, stringified for the JWT subject.
        token = create_access_token(identity=str(user["id"]))
        return jsonify({"access_token": token}), 200
    except Exception as e:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
||||||
|
|
||||||
|
|
||||||
|
@app.route("/profile", methods=["GET"])
@jwt_required()
def profile():
    """Return the authenticated user's profile (valid JWT required)."""
    current_user = get_jwt_identity()
    user = auth_manager.get_user_by_id(current_user)
    return jsonify(message="Access granted", user=user), 200
||||||
|
|
||||||
|
|
||||||
|
@app.route("/upload", methods=["POST"])
|
||||||
|
@jwt_required()
|
||||||
def upload_data():
|
def upload_data():
|
||||||
if "posts" not in request.files or "topics" not in request.files:
|
if "posts" not in request.files or "topics" not in request.files:
|
||||||
return jsonify({"error": "Missing required files or form data"}), 400
|
return jsonify({"error": "Missing required files or form data"}), 400
|
||||||
@@ -28,172 +115,241 @@ def upload_data():
|
|||||||
if post_file.filename == "" or topic_file == "":
|
if post_file.filename == "" or topic_file == "":
|
||||||
return jsonify({"error": "Empty filename"}), 400
|
return jsonify({"error": "Empty filename"}), 400
|
||||||
|
|
||||||
if not post_file.filename.endswith('.jsonl') or not topic_file.filename.endswith('.json'):
|
if not post_file.filename.endswith(".jsonl") or not topic_file.filename.endswith(
|
||||||
return jsonify({"error": "Invalid file type. Only .jsonl and .json files are allowed."}), 400
|
".json"
|
||||||
|
):
|
||||||
|
return jsonify(
|
||||||
|
{"error": "Invalid file type. Only .jsonl and .json files are allowed."}
|
||||||
|
), 400
|
||||||
|
|
||||||
try:
|
try:
|
||||||
global stat_obj
|
current_user = get_jwt_identity()
|
||||||
|
|
||||||
posts_df = pd.read_json(post_file, lines=True)
|
posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
|
||||||
stat_obj = StatGen(posts_df, json.load(topic_file))
|
topics = json.load(topic_file)
|
||||||
return jsonify({"message": "File uploaded successfully", "event_count": len(stat_obj.df)}), 200
|
|
||||||
|
processor = DatasetProcessor(posts_df, topics)
|
||||||
|
enriched_df = processor.enrich()
|
||||||
|
dataset_id = db.save_dataset_info(
|
||||||
|
current_user, f"dataset_{current_user}", topics
|
||||||
|
)
|
||||||
|
db.save_dataset_content(dataset_id, enriched_df)
|
||||||
|
|
||||||
|
return jsonify(
|
||||||
|
{"message": "File uploaded successfully", "event_count": len(enriched_df), "dataset_id": dataset_id}
|
||||||
|
), 200
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
|
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
@app.route('/dataset', methods=['GET'])
|
|
||||||
def get_dataset():
|
|
||||||
if stat_obj is None:
|
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
|
||||||
|
|
||||||
return stat_obj.df.to_json(orient="records"), 200, {"Content-Type": "application/json"}
|
@app.route("/dataset/<int:dataset_id>", methods=["GET"])
@jwt_required()
def get_dataset(dataset_id):
    """Return all events of one dataset as JSON (owner only).

    Responses: 404 dataset unknown or empty, 403 not the owner,
    200 with the event records otherwise.

    Bug fix: ``db.get_dataset_info`` returns None for a nonexistent id;
    the original immediately called ``dataset.get(...)`` and crashed with
    AttributeError (an unhandled 500).  Now returns 404.
    """
    current_user = get_jwt_identity()
    dataset = db.get_dataset_info(dataset_id)

    if dataset is None:
        return jsonify({"error": "Dataset not found"}), 404
    if dataset.get("user_id") != int(current_user):
        return jsonify({"error": "Unauthorized access to dataset"}), 403

    dataset_content = db.get_dataset_content(dataset_id)

    if dataset_content.empty:
        return jsonify({"error": "Dataset content not found"}), 404

    return jsonify(dataset_content.to_dict(orient="records")), 200
||||||
|
|
||||||
|
|
||||||
|
@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
|
||||||
|
@jwt_required()
|
||||||
|
def content_endpoint(dataset_id):
|
||||||
|
current_user = get_jwt_identity()
|
||||||
|
dataset = db.get_dataset_info(dataset_id)
|
||||||
|
|
||||||
|
if dataset.get("user_id") != int(current_user):
|
||||||
|
return jsonify({"error": "Unauthorized access to dataset"}), 403
|
||||||
|
|
||||||
|
dataset_content = db.get_dataset_content(dataset_id)
|
||||||
try:
|
try:
|
||||||
return jsonify(stat_obj.get_content_analysis()), 200
|
return jsonify(stat_gen.get_content_analysis(dataset_content)), 200
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
@app.route('/stats/summary', methods=["GET"])
|
|
||||||
def get_summary():
|
@app.route("/dataset/<int:dataset_id>/summary", methods=["GET"])
|
||||||
if stat_obj is None:
|
@jwt_required()
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
def get_summary(dataset_id):
|
||||||
|
current_user = get_jwt_identity()
|
||||||
|
dataset = db.get_dataset_info(dataset_id)
|
||||||
|
|
||||||
|
if dataset.get("user_id") != int(current_user):
|
||||||
|
return jsonify({"error": "Unauthorized access to dataset"}), 403
|
||||||
|
|
||||||
|
dataset_content = db.get_dataset_content(dataset_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return jsonify(stat_obj.summary()), 200
|
return jsonify(stat_gen.summary(dataset_content)), 200
|
||||||
except ValueError as e:
|
|
||||||
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
|
||||||
except Exception as e:
|
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
|
||||||
|
|
||||||
@app.route("/stats/time", methods=["GET"])
|
|
||||||
def get_time_analysis():
|
|
||||||
if stat_obj is None:
|
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
|
||||||
|
|
||||||
try:
|
|
||||||
return jsonify(stat_obj.get_time_analysis()), 200
|
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
@app.route("/stats/user", methods=["GET"])
|
|
||||||
def get_user_analysis():
|
@app.route("/dataset/<int:dataset_id>/time", methods=["GET"])
|
||||||
if stat_obj is None:
|
@jwt_required()
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
def get_time_analysis(dataset_id):
|
||||||
|
current_user = get_jwt_identity()
|
||||||
|
dataset = db.get_dataset_info(dataset_id)
|
||||||
|
|
||||||
|
if dataset.get("user_id") != int(current_user):
|
||||||
|
return jsonify({"error": "Unauthorized access to dataset"}), 403
|
||||||
|
|
||||||
|
dataset_content = db.get_dataset_content(dataset_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return jsonify(stat_obj.get_user_analysis()), 200
|
return jsonify(stat_gen.get_time_analysis(dataset_content)), 200
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
@app.route("/stats/cultural", methods=["GET"])
|
|
||||||
def get_cultural_analysis():
|
@app.route("/dataset/<int:dataset_id>/user", methods=["GET"])
|
||||||
if stat_obj is None:
|
@jwt_required()
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
def get_user_analysis(dataset_id):
|
||||||
|
current_user = get_jwt_identity()
|
||||||
|
dataset = db.get_dataset_info(dataset_id)
|
||||||
|
|
||||||
|
if dataset.get("user_id") != int(current_user):
|
||||||
|
return jsonify({"error": "Unauthorized access to dataset"}), 403
|
||||||
|
|
||||||
|
dataset_content = db.get_dataset_content(dataset_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return jsonify(stat_obj.get_cultural_analysis()), 200
|
return jsonify(stat_gen.get_user_analysis(dataset_content)), 200
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
@app.route("/stats/interaction", methods=["GET"])
|
|
||||||
def get_interaction_analysis():
|
@app.route("/dataset/<int:dataset_id>/cultural", methods=["GET"])
|
||||||
if stat_obj is None:
|
@jwt_required()
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
def get_cultural_analysis(dataset_id):
|
||||||
|
current_user = get_jwt_identity()
|
||||||
|
dataset = db.get_dataset_info(dataset_id)
|
||||||
|
|
||||||
|
if dataset.get("user_id") != int(current_user):
|
||||||
|
return jsonify({"error": "Unauthorized access to dataset"}), 403
|
||||||
|
|
||||||
|
dataset_content = db.get_dataset_content(dataset_id)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return jsonify(stat_obj.get_interactional_analysis()), 200
|
return jsonify(stat_gen.get_cultural_analysis(dataset_content)), 200
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
@app.route('/filter/query', methods=["POST"])
|
|
||||||
def filter_query():
|
|
||||||
if stat_obj is None:
|
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
|
||||||
|
|
||||||
data = request.get_json(silent=True) or {}
|
@app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"])
|
||||||
|
@jwt_required()
|
||||||
|
def get_interaction_analysis(dataset_id):
|
||||||
|
current_user = get_jwt_identity()
|
||||||
|
dataset = db.get_dataset_info(dataset_id)
|
||||||
|
|
||||||
if "query" not in data:
|
if dataset.get("user_id") != int(current_user):
|
||||||
return jsonify(stat_obj.df.to_dict(orient="records")), 200
|
return jsonify({"error": "Unauthorized access to dataset"}), 403
|
||||||
|
|
||||||
query = data["query"]
|
dataset_content = db.get_dataset_content(dataset_id)
|
||||||
filtered_df = stat_obj.filter_by_query(query)
|
|
||||||
|
|
||||||
return jsonify(filtered_df), 200
|
|
||||||
|
|
||||||
@app.route('/filter/time', methods=["POST"])
|
|
||||||
def filter_time():
|
|
||||||
if stat_obj is None:
|
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
|
||||||
|
|
||||||
data = request.get_json(silent=True)
|
|
||||||
if not data:
|
|
||||||
return jsonify({"error": "Invalid or missing JSON body"}), 400
|
|
||||||
|
|
||||||
if "start" not in data or "end" not in data:
|
|
||||||
return jsonify({"error": "Please include both start and end dates"}), 400
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
start = pd.to_datetime(data["start"], utc=True)
|
return jsonify(stat_gen.get_interactional_analysis(dataset_content)), 200
|
||||||
end = pd.to_datetime(data["end"], utc=True)
|
except ValueError as e:
|
||||||
filtered_df = stat_obj.set_time_range(start, end)
|
return jsonify({"error": f"Malformed or missing data: {str(e)}"}), 400
|
||||||
return jsonify(filtered_df), 200
|
|
||||||
except Exception:
|
|
||||||
return jsonify({"error": "Invalid datetime format"}), 400
|
|
||||||
|
|
||||||
@app.route('/filter/sources', methods=["POST"])
|
|
||||||
def filter_sources():
|
|
||||||
if stat_obj is None:
|
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
|
||||||
|
|
||||||
data = request.get_json(silent=True)
|
|
||||||
if not data:
|
|
||||||
return jsonify({"error": "Invalid or missing JSON body"}), 400
|
|
||||||
|
|
||||||
if "sources" not in data:
|
|
||||||
return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
|
|
||||||
|
|
||||||
try:
|
|
||||||
filtered_df = stat_obj.filter_data_sources(data["sources"])
|
|
||||||
return jsonify(filtered_df), 200
|
|
||||||
except ValueError:
|
|
||||||
return jsonify({"error": "Please enable at least one data source"}), 400
|
|
||||||
except Exception as e:
|
|
||||||
return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
|
|
||||||
|
|
||||||
|
|
||||||
@app.route('/filter/reset', methods=["GET"])
|
|
||||||
def reset_dataset():
|
|
||||||
if stat_obj is None:
|
|
||||||
return jsonify({"error": "No data uploaded"}), 400
|
|
||||||
|
|
||||||
try:
|
|
||||||
stat_obj.reset_dataset()
|
|
||||||
return jsonify({"success": "Dataset successfully reset"})
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
|
|
||||||
|
# @app.route("/filter/query", methods=["POST"])
|
||||||
|
# def filter_query():
|
||||||
|
# if stat_obj is None:
|
||||||
|
# return jsonify({"error": "No data uploaded"}), 400
|
||||||
|
|
||||||
|
# data = request.get_json(silent=True) or {}
|
||||||
|
|
||||||
|
# if "query" not in data:
|
||||||
|
# return jsonify(stat_obj.df.to_dict(orient="records")), 200
|
||||||
|
|
||||||
|
# query = data["query"]
|
||||||
|
# filtered_df = stat_obj.filter_by_query(query)
|
||||||
|
|
||||||
|
# return jsonify(filtered_df), 200
|
||||||
|
|
||||||
|
|
||||||
|
# @app.route("/filter/time", methods=["POST"])
|
||||||
|
# def filter_time():
|
||||||
|
# if stat_obj is None:
|
||||||
|
# return jsonify({"error": "No data uploaded"}), 400
|
||||||
|
|
||||||
|
# data = request.get_json(silent=True)
|
||||||
|
# if not data:
|
||||||
|
# return jsonify({"error": "Invalid or missing JSON body"}), 400
|
||||||
|
|
||||||
|
# if "start" not in data or "end" not in data:
|
||||||
|
# return jsonify({"error": "Please include both start and end dates"}), 400
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# start = pd.to_datetime(data["start"], utc=True)
|
||||||
|
# end = pd.to_datetime(data["end"], utc=True)
|
||||||
|
# filtered_df = stat_obj.set_time_range(start, end)
|
||||||
|
# return jsonify(filtered_df), 200
|
||||||
|
# except Exception:
|
||||||
|
# return jsonify({"error": "Invalid datetime format"}), 400
|
||||||
|
|
||||||
|
|
||||||
|
# @app.route("/filter/sources", methods=["POST"])
|
||||||
|
# def filter_sources():
|
||||||
|
# if stat_obj is None:
|
||||||
|
# return jsonify({"error": "No data uploaded"}), 400
|
||||||
|
|
||||||
|
# data = request.get_json(silent=True)
|
||||||
|
# if not data:
|
||||||
|
# return jsonify({"error": "Invalid or missing JSON body"}), 400
|
||||||
|
|
||||||
|
# if "sources" not in data:
|
||||||
|
# return jsonify({"error": "Ensure sources hash map is in 'sources' key"}), 400
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# filtered_df = stat_obj.filter_data_sources(data["sources"])
|
||||||
|
# return jsonify(filtered_df), 200
|
||||||
|
# except ValueError:
|
||||||
|
# return jsonify({"error": "Please enable at least one data source"}), 400
|
||||||
|
# except Exception as e:
|
||||||
|
# return jsonify({"error": "An unexpected server error occured: " + str(e)}), 500
|
||||||
|
|
||||||
|
|
||||||
|
# @app.route("/filter/reset", methods=["GET"])
|
||||||
|
# def reset_dataset():
|
||||||
|
# if stat_obj is None:
|
||||||
|
# return jsonify({"error": "No data uploaded"}), 400
|
||||||
|
|
||||||
|
# try:
|
||||||
|
# stat_obj.reset_dataset()
|
||||||
|
# return jsonify({"success": "Dataset successfully reset"})
|
||||||
|
# except Exception as e:
|
||||||
|
# print(traceback.format_exc())
|
||||||
|
# return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run(debug=True)
|
app.run(debug=True)
|
||||||
29
server/auth.py
Normal file
29
server/auth.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
from db.database import PostgresConnector
|
||||||
|
from flask_bcrypt import Bcrypt
|
||||||
|
|
||||||
|
class AuthManager:
|
||||||
|
def __init__(self, db: PostgresConnector, bcrypt: Bcrypt):
|
||||||
|
self.db = db
|
||||||
|
self.bcrypt = bcrypt
|
||||||
|
|
||||||
|
def register_user(self, username, email, password):
|
||||||
|
hashed_password = self.bcrypt.generate_password_hash(password).decode("utf-8")
|
||||||
|
|
||||||
|
if self.db.get_user_by_email(email):
|
||||||
|
raise ValueError("Email already registered")
|
||||||
|
|
||||||
|
if self.db.get_user_by_username(username):
|
||||||
|
raise ValueError("Username already taken")
|
||||||
|
|
||||||
|
self.db.save_user(username, email, hashed_password)
|
||||||
|
|
||||||
|
def authenticate_user(self, username, password):
|
||||||
|
user = self.db.get_user_by_username(username)
|
||||||
|
if user and self.bcrypt.check_password_hash(user['password_hash'], password):
|
||||||
|
return user
|
||||||
|
return None
|
||||||
|
|
||||||
|
def get_user_by_id(self, user_id):
|
||||||
|
query = "SELECT id, username, email FROM users WHERE id = %s"
|
||||||
|
result = self.db.execute(query, (user_id,), fetch=True)
|
||||||
|
return result[0] if result else None
|
||||||
39
server/dataset_processor.py
Normal file
39
server/dataset_processor.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from server.analysis.nlp import NLP
|
||||||
|
|
||||||
|
class DatasetProcessor:
|
||||||
|
def __init__(self, df, topics):
|
||||||
|
self.df = self._explode_comments(df)
|
||||||
|
self.topics = topics
|
||||||
|
self.nlp = NLP(self.df, "title", "content", self.topics)
|
||||||
|
|
||||||
|
def _explode_comments(self, df) -> pd.DataFrame:
|
||||||
|
comments_df = df[["id", "comments"]].explode("comments")
|
||||||
|
comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))]
|
||||||
|
comments_df = pd.json_normalize(comments_df["comments"])
|
||||||
|
|
||||||
|
posts_df = df.drop(columns=["comments"])
|
||||||
|
posts_df["type"] = "post"
|
||||||
|
posts_df["parent_id"] = None
|
||||||
|
|
||||||
|
comments_df["type"] = "comment"
|
||||||
|
comments_df["parent_id"] = comments_df.get("post_id")
|
||||||
|
|
||||||
|
df = pd.concat([posts_df, comments_df])
|
||||||
|
df.drop(columns=["post_id"], inplace=True, errors="ignore")
|
||||||
|
|
||||||
|
return df
|
||||||
|
|
||||||
|
def enrich(self) -> pd.DataFrame:
|
||||||
|
self.df['timestamp'] = pd.to_numeric(self.df['timestamp'], errors='raise')
|
||||||
|
self.df['date'] = pd.to_datetime(self.df['timestamp'], unit='s').dt.date
|
||||||
|
self.df["dt"] = pd.to_datetime(self.df["timestamp"], unit="s", utc=True)
|
||||||
|
self.df["hour"] = self.df["dt"].dt.hour
|
||||||
|
self.df["weekday"] = self.df["dt"].dt.day_name()
|
||||||
|
|
||||||
|
self.nlp.add_emotion_cols()
|
||||||
|
self.nlp.add_topic_col()
|
||||||
|
self.nlp.add_ner_cols()
|
||||||
|
|
||||||
|
return self.df
|
||||||
@@ -1,170 +1,135 @@
|
|||||||
import pandas as pd
|
|
||||||
import datetime
|
import datetime
|
||||||
import nltk
|
|
||||||
|
|
||||||
|
import nltk
|
||||||
|
import pandas as pd
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
from server.analysis.nlp import NLP
|
|
||||||
from server.analysis.temporal import TemporalAnalysis
|
from server.analysis.cultural import CulturalAnalysis
|
||||||
from server.analysis.emotional import EmotionalAnalysis
|
from server.analysis.emotional import EmotionalAnalysis
|
||||||
from server.analysis.interactional import InteractionAnalysis
|
from server.analysis.interactional import InteractionAnalysis
|
||||||
from server.analysis.linguistic import LinguisticAnalysis
|
from server.analysis.linguistic import LinguisticAnalysis
|
||||||
from server.analysis.cultural import CulturalAnalysis
|
from server.analysis.temporal import TemporalAnalysis
|
||||||
|
|
||||||
DOMAIN_STOPWORDS = {
|
DOMAIN_STOPWORDS = {
|
||||||
"www", "https", "http",
|
"www",
|
||||||
"boards", "boardsie",
|
"https",
|
||||||
"comment", "comments",
|
"http",
|
||||||
"discussion", "thread",
|
"boards",
|
||||||
"post", "posts",
|
"boardsie",
|
||||||
"would", "get", "one"
|
"comment",
|
||||||
|
"comments",
|
||||||
|
"discussion",
|
||||||
|
"thread",
|
||||||
|
"post",
|
||||||
|
"posts",
|
||||||
|
"would",
|
||||||
|
"get",
|
||||||
|
"one",
|
||||||
}
|
}
|
||||||
|
|
||||||
nltk.download('stopwords')
|
nltk.download("stopwords")
|
||||||
EXCLUDE_WORDS = set(stopwords.words('english')) | DOMAIN_STOPWORDS
|
EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
|
||||||
|
|
||||||
|
|
||||||
class StatGen:
|
class StatGen:
|
||||||
def __init__(self, df: pd.DataFrame, domain_topics: dict) -> None:
|
def __init__(self) -> None:
|
||||||
comments_df = df[["id", "comments"]].explode("comments")
|
self.temporal_analysis = TemporalAnalysis()
|
||||||
comments_df = comments_df[comments_df["comments"].apply(lambda x: isinstance(x, dict))]
|
self.emotional_analysis = EmotionalAnalysis()
|
||||||
comments_df = pd.json_normalize(comments_df["comments"])
|
self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
|
||||||
|
self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
|
||||||
|
self.cultural_analysis = CulturalAnalysis()
|
||||||
|
|
||||||
posts_df = df.drop(columns=["comments"])
|
def get_time_analysis(self, df: pd.DataFrame) -> dict:
|
||||||
posts_df["type"] = "post"
|
|
||||||
posts_df["parent_id"] = None
|
|
||||||
|
|
||||||
comments_df["type"] = "comment"
|
|
||||||
comments_df["parent_id"] = comments_df.get("post_id")
|
|
||||||
self.domain_topics = domain_topics
|
|
||||||
|
|
||||||
self.df = pd.concat([posts_df, comments_df])
|
|
||||||
self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
|
|
||||||
|
|
||||||
self.nlp = NLP(self.df, "title", "content", domain_topics)
|
|
||||||
self.nlp.add_emotion_cols()
|
|
||||||
self.nlp.add_topic_col()
|
|
||||||
self.nlp.add_ner_cols()
|
|
||||||
self._add_time_cols(self.df)
|
|
||||||
|
|
||||||
self.temporal_analysis = TemporalAnalysis(self.df)
|
|
||||||
self.emotional_analysis = EmotionalAnalysis(self.df)
|
|
||||||
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
|
|
||||||
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
|
|
||||||
self.cultural_analysis = CulturalAnalysis(self.df)
|
|
||||||
|
|
||||||
self.original_df = self.df.copy(deep=True)
|
|
||||||
|
|
||||||
## Private Methods
|
|
||||||
def _add_time_cols(self, df: pd.DataFrame) -> None:
|
|
||||||
df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
|
|
||||||
df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
|
|
||||||
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
|
|
||||||
df["hour"] = df["dt"].dt.hour
|
|
||||||
df["weekday"] = df["dt"].dt.day_name()
|
|
||||||
|
|
||||||
## Public
|
|
||||||
|
|
||||||
# topics over time
|
|
||||||
# emotions over time
|
|
||||||
def get_time_analysis(self) -> dict:
|
|
||||||
return {
|
return {
|
||||||
"events_per_day": self.temporal_analysis.posts_per_day(),
|
"events_per_day": self.temporal_analysis.posts_per_day(df),
|
||||||
"weekday_hour_heatmap": self.temporal_analysis.heatmap()
|
"weekday_hour_heatmap": self.temporal_analysis.heatmap(df),
|
||||||
}
|
}
|
||||||
|
|
||||||
# average topic duration
|
def get_content_analysis(self, df: pd.DataFrame) -> dict:
|
||||||
def get_content_analysis(self) -> dict:
|
|
||||||
return {
|
return {
|
||||||
"word_frequencies": self.linguistic_analysis.word_frequencies(),
|
"word_frequencies": self.linguistic_analysis.word_frequencies(df),
|
||||||
"common_two_phrases": self.linguistic_analysis.ngrams(),
|
"common_two_phrases": self.linguistic_analysis.ngrams(df),
|
||||||
"common_three_phrases": self.linguistic_analysis.ngrams(n=3),
|
"common_three_phrases": self.linguistic_analysis.ngrams(df, n=3),
|
||||||
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(),
|
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(df),
|
||||||
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
|
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion(df),
|
||||||
}
|
}
|
||||||
|
|
||||||
# average emotion per user
|
def get_user_analysis(self, df: pd.DataFrame) -> dict:
|
||||||
# average chain length
|
|
||||||
def get_user_analysis(self) -> dict:
|
|
||||||
return {
|
return {
|
||||||
"top_users": self.interaction_analysis.top_users(),
|
"top_users": self.interaction_analysis.top_users(df),
|
||||||
"users": self.interaction_analysis.per_user_analysis()
|
"users": self.interaction_analysis.per_user_analysis(df),
|
||||||
|
"interaction_graph": self.interaction_analysis.interaction_graph(df),
|
||||||
}
|
}
|
||||||
|
|
||||||
# average / max thread depth
|
def get_interactional_analysis(self, df: pd.DataFrame) -> dict:
|
||||||
# high engagment threads based on volume
|
|
||||||
def get_interactional_analysis(self) -> dict:
|
|
||||||
return {
|
return {
|
||||||
"average_thread_depth": self.interaction_analysis.average_thread_depth(),
|
"average_thread_depth": self.interaction_analysis.average_thread_depth(df),
|
||||||
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
|
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(df),
|
||||||
"interaction_graph": self.interaction_analysis.interaction_graph()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# detect community jargon
|
def get_cultural_analysis(self, df: pd.DataFrame) -> dict:
|
||||||
# in-group and out-group linguistic markers
|
|
||||||
def get_cultural_analysis(self) -> dict:
|
|
||||||
return {
|
return {
|
||||||
"identity_markers": self.cultural_analysis.get_identity_markers(),
|
"identity_markers": self.cultural_analysis.get_identity_markers(df),
|
||||||
"stance_markers": self.cultural_analysis.get_stance_markers(),
|
"stance_markers": self.cultural_analysis.get_stance_markers(df),
|
||||||
"entity_salience": self.cultural_analysis.get_avg_emotions_per_entity()
|
"entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(df),
|
||||||
}
|
}
|
||||||
|
|
||||||
def summary(self) -> dict:
|
def summary(self, df: pd.DataFrame) -> dict:
|
||||||
total_posts = (self.df["type"] == "post").sum()
|
total_posts = (df["type"] == "post").sum()
|
||||||
total_comments = (self.df["type"] == "comment").sum()
|
total_comments = (df["type"] == "comment").sum()
|
||||||
|
events_per_user = df.groupby("author").size()
|
||||||
events_per_user = self.df.groupby("author").size()
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"total_events": int(len(self.df)),
|
"total_events": int(len(df)),
|
||||||
"total_posts": int(total_posts),
|
"total_posts": int(total_posts),
|
||||||
"total_comments": int(total_comments),
|
"total_comments": int(total_comments),
|
||||||
"unique_users": int(events_per_user.count()),
|
"unique_users": int(events_per_user.count()),
|
||||||
"comments_per_post": round(total_comments / max(total_posts, 1), 2),
|
"comments_per_post": round(total_comments / max(total_posts, 1), 2),
|
||||||
"lurker_ratio": round((events_per_user == 1).mean(), 2),
|
"lurker_ratio": round((events_per_user == 1).mean(), 2),
|
||||||
"time_range": {
|
"time_range": {
|
||||||
"start": int(self.df["dt"].min().timestamp()),
|
"start": int(df["dt"].min().timestamp()),
|
||||||
"end": int(self.df["dt"].max().timestamp())
|
"end": int(df["dt"].max().timestamp()),
|
||||||
},
|
},
|
||||||
"sources": self.df["source"].dropna().unique().tolist()
|
"sources": df["source"].dropna().unique().tolist(),
|
||||||
}
|
}
|
||||||
|
|
||||||
def filter_by_query(self, search_query: str) -> dict:
|
# def filter_by_query(self, df: pd.DataFrame, search_query: str) -> dict:
|
||||||
self.df = self.df[
|
# filtered_df = df[df["content"].str.contains(search_query, na=False)]
|
||||||
self.df["content"].str.contains(search_query)
|
|
||||||
]
|
|
||||||
|
|
||||||
return {
|
# return {
|
||||||
"rows": len(self.df),
|
# "rows": len(filtered_df),
|
||||||
"data": self.df.to_dict(orient="records")
|
# "data": filtered_df.to_dict(orient="records"),
|
||||||
}
|
# }
|
||||||
|
|
||||||
def set_time_range(self, start: datetime.datetime, end: datetime.datetime) -> dict:
|
# def set_time_range(
|
||||||
self.df = self.df[
|
# self,
|
||||||
(self.df["dt"] >= start) &
|
# original_df: pd.DataFrame,
|
||||||
(self.df["dt"] <= end)
|
# start: datetime.datetime,
|
||||||
]
|
# end: datetime.datetime,
|
||||||
|
# ) -> dict:
|
||||||
|
# df = self._prepare_df(original_df)
|
||||||
|
# filtered_df = df[(df["dt"] >= start) & (df["dt"] <= end)]
|
||||||
|
|
||||||
return {
|
# return {
|
||||||
"rows": len(self.df),
|
# "rows": len(filtered_df),
|
||||||
"data": self.df.to_dict(orient="records")
|
# "data": filtered_df.to_dict(orient="records"),
|
||||||
}
|
# }
|
||||||
|
|
||||||
"""
|
# def filter_data_sources(
|
||||||
Input is a hash map (source_name: str -> enabled: bool)
|
# self, original_df: pd.DataFrame, data_sources: dict
|
||||||
"""
|
# ) -> dict:
|
||||||
def filter_data_sources(self, data_sources: dict) -> dict:
|
# df = self._prepare_df(original_df)
|
||||||
enabled_sources = [src for src, enabled in data_sources.items() if enabled]
|
# enabled_sources = [src for src, enabled in data_sources.items() if enabled]
|
||||||
|
|
||||||
if not enabled_sources:
|
# if not enabled_sources:
|
||||||
raise ValueError("Please choose at least one data source")
|
# raise ValueError("Please choose at least one data source")
|
||||||
|
|
||||||
self.df = self.df[self.df["source"].isin(enabled_sources)]
|
# filtered_df = df[df["source"].isin(enabled_sources)]
|
||||||
|
|
||||||
return {
|
# return {
|
||||||
"rows": len(self.df),
|
# "rows": len(filtered_df),
|
||||||
"data": self.df.to_dict(orient="records")
|
# "data": filtered_df.to_dict(orient="records"),
|
||||||
}
|
# }
|
||||||
|
|
||||||
|
|
||||||
def reset_dataset(self) -> None:
|
|
||||||
self.df = self.original_df.copy(deep=True)
|
|
||||||
|
|
||||||
|
# def reset_dataset(self, original_df: pd.DataFrame) -> pd.DataFrame:
|
||||||
|
# return self._prepare_df(original_df)
|
||||||
|
|||||||
Reference in New Issue
Block a user