fix(stats): remove duplicated entries in corpus explorer

This commit is contained in:
2026-04-01 00:22:29 +01:00
parent 430793cd09
commit 6378015726
2 changed files with 67 additions and 4 deletions

View File

@@ -66,6 +66,38 @@ const EMPTY_EXPLORER_STATE: ExplorerState = {
error: "", error: "",
}; };
/**
 * Record fields that together form a record's identity in the explorer.
 * The order here is the order in which the fields are written into the key.
 */
const EXPLORER_IDENTITY_FIELDS = [
  "post_id",
  "parent_id",
  "reply_to",
  "author",
  "type",
  "timestamp",
  "dt",
  "title",
  "content",
  "source",
  "topic",
] as const;

/**
 * Builds the string key used to decide whether two records are the same entry.
 *
 * The key is the JSON encoding of the identity fields in a fixed order. A
 * field that is missing (`undefined`) or `null` is written as `null`, so both
 * cases yield the same key. Fields outside the identity list are ignored:
 * records that differ only in those fields share a key.
 *
 * @param record - The record to derive a key for.
 * @returns A JSON string suitable for use as a `Set`/`Map` key.
 */
const getExplorerRecordIdentity = (record: DatasetRecord) => {
  const identity: Record<string, unknown> = {};
  for (const field of EXPLORER_IDENTITY_FIELDS) {
    // `??` folds both `undefined` and `null` into an explicit `null`.
    identity[field] = record[field] ?? null;
  }
  return JSON.stringify(identity);
};
/**
 * Returns a new array containing the first record seen for each identity key.
 *
 * Two records count as the same entry when `getExplorerRecordIdentity` returns
 * the same string for both. The first occurrence is kept, later ones are
 * dropped, and the relative order of the kept records is unchanged. The input
 * array is not modified.
 *
 * @param records - Records to de-duplicate.
 * @returns A new array of unique records in their original order.
 */
const dedupeExplorerRecords = (records: DatasetRecord[]) => {
  // A Map iterates in insertion order, so it tracks the seen keys and the
  // output order at the same time.
  const firstByIdentity = new Map<string, DatasetRecord>();
  for (const candidate of records) {
    const key = getExplorerRecordIdentity(candidate);
    if (!firstByIdentity.has(key)) {
      firstByIdentity.set(key, candidate);
    }
  }
  return [...firstByIdentity.values()];
};
const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => { const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => {
if (typeof payload === "string") { if (typeof payload === "string") {
try { try {
@@ -233,7 +265,9 @@ const StatPage = () => {
}, },
); );
const normalizedRecords = normalizeRecordPayload(response.data); const normalizedRecords = dedupeExplorerRecords(
normalizeRecordPayload(response.data),
);
setAllRecords(normalizedRecords); setAllRecords(normalizedRecords);
setAllRecordsKey(filterKey); setAllRecordsKey(filterKey);
@@ -254,7 +288,9 @@ const StatPage = () => {
try { try {
const records = await ensureFilteredRecords(); const records = await ensureFilteredRecords();
const context = buildExplorerContext(records); const context = buildExplorerContext(records);
const matched = records.filter((record) => spec.matcher(record, context)); const matched = dedupeExplorerRecords(
records.filter((record) => spec.matcher(record, context)),
);
matched.sort((a, b) => { matched.sort((a, b) => {
const aValue = String(a.dt ?? a.date ?? a.timestamp ?? ""); const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
const bValue = String(b.dt ?? b.date ?? b.timestamp ?? ""); const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
@@ -662,7 +698,7 @@ const StatPage = () => {
)} )}
{activeView === "interactional" && interactionData && ( {activeView === "interactional" && interactionData && (
<InteractionalStats data={interactionData} onExplore={openExplorer} /> <InteractionalStats data={interactionData} />
)} )}
{activeView === "interactional" && !interactionData && ( {activeView === "interactional" && !interactionData && (

View File

@@ -89,10 +89,37 @@ class StatGen:
df.to_json(orient="records", date_format="iso", date_unit="s") df.to_json(orient="records", date_format="iso", date_unit="s")
) )
def _dedupe_records(self, records: list[dict]) -> list[dict]:
unique_records = []
seen = set()
for record in records:
key_data = {
"post_id": record.get("post_id"),
"parent_id": record.get("parent_id"),
"reply_to": record.get("reply_to"),
"author": record.get("author"),
"type": record.get("type"),
"timestamp": record.get("timestamp"),
"dt": record.get("dt"),
"title": record.get("title"),
"content": record.get("content"),
"source": record.get("source"),
"topic": record.get("topic"),
}
key = json.dumps(key_data, sort_keys=True, separators=(",", ":"))
if key in seen:
continue
seen.add(key)
unique_records.append(record)
return unique_records
## Public Methods ## Public Methods
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]: def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return self._json_ready_records(filtered_df) return self._dedupe_records(self._json_ready_records(filtered_df))
def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)