fix(stats): remove duplicated entries in corpus explorer
This commit is contained in:
@@ -89,10 +89,37 @@ class StatGen:
|
||||
df.to_json(orient="records", date_format="iso", date_unit="s")
|
||||
)
|
||||
|
||||
def _dedupe_records(self, records: list[dict]) -> list[dict]:
|
||||
unique_records = []
|
||||
seen = set()
|
||||
|
||||
for record in records:
|
||||
key_data = {
|
||||
"post_id": record.get("post_id"),
|
||||
"parent_id": record.get("parent_id"),
|
||||
"reply_to": record.get("reply_to"),
|
||||
"author": record.get("author"),
|
||||
"type": record.get("type"),
|
||||
"timestamp": record.get("timestamp"),
|
||||
"dt": record.get("dt"),
|
||||
"title": record.get("title"),
|
||||
"content": record.get("content"),
|
||||
"source": record.get("source"),
|
||||
"topic": record.get("topic"),
|
||||
}
|
||||
key = json.dumps(key_data, sort_keys=True, separators=(",", ":"))
|
||||
if key in seen:
|
||||
continue
|
||||
|
||||
seen.add(key)
|
||||
unique_records.append(record)
|
||||
|
||||
return unique_records
|
||||
|
||||
## Public Methods
|
||||
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
|
||||
filtered_df = self._prepare_filtered_df(df, filters)
|
||||
return self._json_ready_records(filtered_df)
|
||||
return self._dedupe_records(self._json_ready_records(filtered_df))
|
||||
|
||||
def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
||||
filtered_df = self._prepare_filtered_df(df, filters)
|
||||
|
||||
Reference in New Issue
Block a user