diff --git a/frontend/src/pages/Stats.tsx b/frontend/src/pages/Stats.tsx index d520ca4..0d4c10e 100644 --- a/frontend/src/pages/Stats.tsx +++ b/frontend/src/pages/Stats.tsx @@ -66,6 +66,38 @@ const EMPTY_EXPLORER_STATE: ExplorerState = { error: "", }; +const getExplorerRecordIdentity = (record: DatasetRecord) => + JSON.stringify({ + post_id: record.post_id ?? null, + parent_id: record.parent_id ?? null, + reply_to: record.reply_to ?? null, + author: record.author ?? null, + type: record.type ?? null, + timestamp: record.timestamp ?? null, + dt: record.dt ?? null, + title: record.title ?? null, + content: record.content ?? null, + source: record.source ?? null, + topic: record.topic ?? null, + }); + +const dedupeExplorerRecords = (records: DatasetRecord[]) => { + const uniqueRecords: DatasetRecord[] = []; + const seen = new Set(); + + for (const record of records) { + const identity = getExplorerRecordIdentity(record); + if (seen.has(identity)) { + continue; + } + + seen.add(identity); + uniqueRecords.push(record); + } + + return uniqueRecords; +}; + const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => { if (typeof payload === "string") { try { @@ -233,7 +265,9 @@ const StatPage = () => { }, ); - const normalizedRecords = normalizeRecordPayload(response.data); + const normalizedRecords = dedupeExplorerRecords( + normalizeRecordPayload(response.data), + ); setAllRecords(normalizedRecords); setAllRecordsKey(filterKey); @@ -254,7 +288,9 @@ const StatPage = () => { try { const records = await ensureFilteredRecords(); const context = buildExplorerContext(records); - const matched = records.filter((record) => spec.matcher(record, context)); + const matched = dedupeExplorerRecords( + records.filter((record) => spec.matcher(record, context)), + ); matched.sort((a, b) => { const aValue = String(a.dt ?? a.date ?? a.timestamp ?? ""); const bValue = String(b.dt ?? b.date ?? b.timestamp ?? ""); @@ -662,7 +698,7 @@ const StatPage = () => { )} {activeView === "interactional" && interactionData && ( - + )} {activeView === "interactional" && !interactionData && ( diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index d45ab1d..8eecb7f 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -89,10 +89,37 @@ class StatGen: df.to_json(orient="records", date_format="iso", date_unit="s") ) + def _dedupe_records(self, records: list[dict]) -> list[dict]: + unique_records = [] + seen = set() + + for record in records: + key_data = { + "post_id": record.get("post_id"), + "parent_id": record.get("parent_id"), + "reply_to": record.get("reply_to"), + "author": record.get("author"), + "type": record.get("type"), + "timestamp": record.get("timestamp"), + "dt": record.get("dt"), + "title": record.get("title"), + "content": record.get("content"), + "source": record.get("source"), + "topic": record.get("topic"), + } + key = json.dumps(key_data, sort_keys=True, separators=(",", ":")) + if key in seen: + continue + + seen.add(key) + unique_records.append(record) + + return unique_records + ## Public Methods def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]: filtered_df = self._prepare_filtered_df(df, filters) - return self._json_ready_records(filtered_df) + return self._dedupe_records(self._json_ready_records(filtered_df)) def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters)