feat(frontend): implement corpus explorer

The corpus explorer lets you view the posts and comments associated with a specific aggregate.
This commit is contained in:
2026-04-01 00:04:25 +01:00
parent 1dde5f7b08
commit b270ed03ae
11 changed files with 1064 additions and 179 deletions

View File

@@ -1,4 +1,5 @@
import nltk
import json
import pandas as pd
from nltk.corpus import stopwords
@@ -27,6 +28,8 @@ DOMAIN_STOPWORDS = {
"one",
}
# Author names whose rows are filtered out. The comparison is made against
# author values that have been stripped and lower-cased, so every entry here
# must itself be lower-case with no surrounding whitespace.
EXCLUDED_AUTHORS = {"[deleted]", "automoderator"}
# Executed at import time, before the stopword list is read on the next line.
# NOTE(review): presumably this fetches the corpus that stopwords.words()
# needs; confirm whether it is a no-op when the corpus is already installed.
nltk.download("stopwords")
# Union of NLTK's English stopwords and the project-specific DOMAIN_STOPWORDS.
EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
@@ -46,6 +49,12 @@ class StatGen:
filters = filters or {}
filtered_df = df.copy()
if "author" in filtered_df.columns:
normalized_authors = (
filtered_df["author"].fillna("").astype(str).str.strip().str.lower()
)
filtered_df = filtered_df[~normalized_authors.isin(EXCLUDED_AUTHORS)]
search_query = filters.get("search_query", None)
start_date_filter = filters.get("start_date", None)
end_date_filter = filters.get("end_date", None)
@@ -75,9 +84,15 @@ class StatGen:
return filtered_df
def _json_ready_records(self, df: pd.DataFrame) -> list[dict]:
return json.loads(
df.to_json(orient="records", date_format="iso", date_unit="s")
)
## Public Methods
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
    """Return the filtered rows of ``df`` as JSON-ready records.

    Args:
        df: Source dataset to filter.
        filters: Optional filter mapping, forwarded unchanged to
            ``_prepare_filtered_df``.

    Returns:
        One dict per remaining row, as produced by ``_json_ready_records``.
    """
    filtered_df = self._prepare_filtered_df(df, filters)
    # Use the JSON round-trip helper instead of ``to_dict(orient="records")``:
    # the earlier direct ``to_dict`` return ran first and made this path
    # unreachable, handing callers raw pandas values rather than JSON-safe ones.
    return self._json_ready_records(filtered_df)
def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)