From b270ed03aeb9395e08bcd6993ddf6b5c02368191 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Wed, 1 Apr 2026 00:04:25 +0100 Subject: [PATCH] feat(frontend): implement corpus explorer This allows you to view the posts & comments associated with a specific aggregate. --- frontend/src/components/CorpusExplorer.tsx | 175 +++++++++ frontend/src/components/CulturalStats.tsx | 85 +++- frontend/src/components/EmotionalStats.tsx | 30 +- frontend/src/components/LinguisticStats.tsx | 26 +- frontend/src/components/SummaryStats.tsx | 122 +++--- frontend/src/components/UserStats.tsx | 69 +++- frontend/src/pages/DatasetEdit.tsx | 11 - frontend/src/pages/Stats.tsx | 300 +++++++++++---- frontend/src/utils/corpusExplorer.ts | 405 ++++++++++++++++++++ server/analysis/stat_gen.py | 17 +- server/app.py | 3 +- 11 files changed, 1064 insertions(+), 179 deletions(-) create mode 100644 frontend/src/components/CorpusExplorer.tsx create mode 100644 frontend/src/utils/corpusExplorer.ts diff --git a/frontend/src/components/CorpusExplorer.tsx b/frontend/src/components/CorpusExplorer.tsx new file mode 100644 index 0000000..e382b51 --- /dev/null +++ b/frontend/src/components/CorpusExplorer.tsx @@ -0,0 +1,175 @@ +import { Dialog, DialogPanel, DialogTitle } from "@headlessui/react"; + +import StatsStyling from "../styles/stats_styling"; +import type { DatasetRecord } from "../utils/corpusExplorer"; + +const styles = StatsStyling; + +const cleanText = (value: unknown) => { + if (typeof value !== "string") { + return ""; + } + + const trimmed = value.trim(); + if (!trimmed) { + return ""; + } + + const lowered = trimmed.toLowerCase(); + if (lowered === "nan" || lowered === "null" || lowered === "undefined") { + return ""; + } + + return trimmed; +}; + +const displayText = (value: unknown, fallback: string) => { + const cleaned = cleanText(value); + return cleaned || fallback; +}; + +type CorpusExplorerProps = { + open: boolean; + onClose: () => void; + title: string; + description: string; + 
records: DatasetRecord[]; + loading: boolean; + error: string; + emptyMessage: string; +}; + +const formatRecordDate = (record: DatasetRecord) => { + if (typeof record.dt === "string" && record.dt) { + const date = new Date(record.dt); + if (!Number.isNaN(date.getTime())) { + return date.toLocaleString(); + } + } + + if (typeof record.date === "string" && record.date) { + return record.date; + } + + if (typeof record.timestamp === "number") { + return new Date(record.timestamp * 1000).toLocaleString(); + } + + return "Unknown time"; +}; + +const getRecordKey = (record: DatasetRecord, index: number) => + String(record.id ?? record.post_id ?? `${record.author ?? "record"}-${index}`); + +const getRecordTitle = (record: DatasetRecord) => { + if (record.type === "comment") { + return ""; + } + + const title = cleanText(record.title); + if (title) { + return title; + } + + const content = cleanText(record.content); + if (!content) { + return "Untitled record"; + } + + return content.length > 120 ? `${content.slice(0, 117)}...` : content; +}; + +const getRecordExcerpt = (record: DatasetRecord) => { + const content = cleanText(record.content); + if (!content) { + return "No content available."; + } + + return content.length > 320 ? `${content.slice(0, 317)}...` : content; +}; + +const CorpusExplorer = ({ + open, + onClose, + title, + description, + records, + loading, + error, + emptyMessage, +}: CorpusExplorerProps) => ( + +
+ +
+ +
+
+ {title} +

+ {description} {loading ? "Loading records..." : `${records.length.toLocaleString()} records.`} +

+
+ + +
+ + {error ?

{error}

: null} + + {!loading && !error && !records.length ? ( +

{emptyMessage}

+ ) : null} + + {loading ? ( +
Preparing corpus slice...
+ ) : null} + + {!loading && !error && records.length ? ( +
+ {records.map((record, index) => ( +
+
+
+ {getRecordTitle(record) ? ( +
{getRecordTitle(record)}
+ ) : null} +
+ {displayText(record.author, "Unknown author")} • {displayText(record.source, "Unknown source")} • {displayText(record.type, "record")} • {formatRecordDate(record)} +
+
+
+ {cleanText(record.topic) ? `Topic: ${cleanText(record.topic)}` : ""} +
+
+ +
+ {getRecordExcerpt(record)} +
+
+ ))} +
+ ) : null} +
+
+
+); + +export default CorpusExplorer; diff --git a/frontend/src/components/CulturalStats.tsx b/frontend/src/components/CulturalStats.tsx index e62b956..81e059d 100644 --- a/frontend/src/components/CulturalStats.tsx +++ b/frontend/src/components/CulturalStats.tsx @@ -1,14 +1,34 @@ import Card from "./Card"; import StatsStyling from "../styles/stats_styling"; import type { CulturalAnalysisResponse } from "../types/ApiTypes"; +import { + buildCertaintySpec, + buildDeonticSpec, + buildEntitySpec, + buildHedgeSpec, + buildIdentityBucketSpec, + buildPermissionSpec, + getExplorerButtonStyle, + type CorpusExplorerSpec, +} from "../utils/corpusExplorer"; const styles = StatsStyling; type CulturalStatsProps = { data: CulturalAnalysisResponse; + onExplore: (spec: CorpusExplorerSpec) => void; }; -const CulturalStats = ({ data }: CulturalStatsProps) => { +const renderExploreButton = (onClick: () => void) => ( + +); + +const CulturalStats = ({ data, onExplore }: CulturalStatsProps) => { const identity = data.identity_markers; const stance = data.stance_markers; const inGroupWords = identity?.in_group_usage ?? 0; @@ -30,7 +50,7 @@ const CulturalStats = ({ data }: CulturalStatsProps) => { const topEmotion = (emotionAvg: Record | undefined) => { const entries = Object.entries(emotionAvg ?? 
{}); if (!entries.length) { - return "—"; + return "-"; } entries.sort((a, b) => b[1] - a[1]); @@ -64,21 +84,30 @@ const CulturalStats = ({ data }: CulturalStatsProps) => { /> + onExplore(buildIdentityBucketSpec("in")), + )} style={{ gridColumn: "span 3" }} /> + onExplore(buildIdentityBucketSpec("out")), + )} style={{ gridColumn: "span 3" }} /> + onExplore(buildIdentityBucketSpec("tie")), + )} style={{ gridColumn: "span 3" }} /> { { { onExplore(buildHedgeSpec()))} style={{ gridColumn: "span 3" }} /> onExplore(buildCertaintySpec()))} style={{ gridColumn: "span 3" }} /> onExplore(buildDeonticSpec()))} style={{ gridColumn: "span 3" }} /> onExplore(buildPermissionSpec()))} style={{ gridColumn: "span 3" }} /> @@ -150,8 +183,14 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {

Most likely emotion when in-group wording is stronger.

-
- {topEmotion(identity?.in_group_emotion_avg)} +
{topEmotion(identity?.in_group_emotion_avg)}
+
+
@@ -160,8 +199,14 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {

Most likely emotion when out-group wording is stronger.

-
- {topEmotion(identity?.out_group_emotion_avg)} +
{topEmotion(identity?.out_group_emotion_avg)}
+
+
@@ -171,9 +216,7 @@ const CulturalStats = ({ data }: CulturalStatsProps) => { Most mentioned entities and the mood that appears most with each.

{!entities.length ? ( -
- No entity-level cultural data available. -
+
No entity-level cultural data available.
) : (
{ }} > {entities.map(([entity, aggregate]) => ( -
+
onExplore(buildEntitySpec(entity))} + >
{entity}
{aggregate.post_count.toLocaleString()} posts • Likely mood:{" "} diff --git a/frontend/src/components/EmotionalStats.tsx b/frontend/src/components/EmotionalStats.tsx index 6350e0c..0e22b3f 100644 --- a/frontend/src/components/EmotionalStats.tsx +++ b/frontend/src/components/EmotionalStats.tsx @@ -1,13 +1,20 @@ import type { EmotionalAnalysisResponse } from "../types/ApiTypes"; import StatsStyling from "../styles/stats_styling"; +import { + buildDominantEmotionSpec, + buildSourceSpec, + buildTopicSpec, + type CorpusExplorerSpec, +} from "../utils/corpusExplorer"; const styles = StatsStyling; type EmotionalStatsProps = { emotionalData: EmotionalAnalysisResponse; + onExplore: (spec: CorpusExplorerSpec) => void; }; -const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => { +const EmotionalStats = ({ emotionalData, onExplore }: EmotionalStatsProps) => { const rows = emotionalData.average_emotion_by_topic ?? []; const overallEmotionAverage = emotionalData.overall_emotion_average ?? []; const dominantEmotionDistribution = @@ -126,7 +133,11 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => { {[...overallEmotionAverage] .sort((a, b) => b.score - a.score) .map((row) => ( -
+
onExplore(buildDominantEmotionSpec(row.emotion))} + >
{formatEmotion(row.emotion)}
@@ -157,7 +168,11 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => { {[...dominantEmotionDistribution] .sort((a, b) => b.ratio - a.ratio) .map((row) => ( -
+
onExplore(buildDominantEmotionSpec(row.emotion))} + >
{formatEmotion(row.emotion)}
@@ -189,7 +204,11 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => { {[...emotionBySource] .sort((a, b) => b.event_count - a.event_count) .map((row) => ( -
+
onExplore(buildSourceSpec(row.source))} + >
{row.source}
{formatEmotion(row.dominant_emotion)} •{" "} @@ -211,7 +230,8 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => { {strongestPerTopic.map((topic) => (
onExplore(buildTopicSpec(topic.topic))} >

{topic.topic} diff --git a/frontend/src/components/LinguisticStats.tsx b/frontend/src/components/LinguisticStats.tsx index 794d118..aaee748 100644 --- a/frontend/src/components/LinguisticStats.tsx +++ b/frontend/src/components/LinguisticStats.tsx @@ -1,14 +1,20 @@ import Card from "./Card"; import StatsStyling from "../styles/stats_styling"; import type { LinguisticAnalysisResponse } from "../types/ApiTypes"; +import { + buildNgramSpec, + buildWordSpec, + type CorpusExplorerSpec, +} from "../utils/corpusExplorer"; const styles = StatsStyling; type LinguisticStatsProps = { data: LinguisticAnalysisResponse; + onExplore: (spec: CorpusExplorerSpec) => void; }; -const LinguisticStats = ({ data }: LinguisticStatsProps) => { +const LinguisticStats = ({ data, onExplore }: LinguisticStatsProps) => { const lexical = data.lexical_diversity; const words = data.word_frequencies ?? []; const bigrams = data.common_two_phrases ?? []; @@ -60,7 +66,11 @@ const LinguisticStats = ({ data }: LinguisticStatsProps) => { }} > {topWords.map((item) => ( -
+
onExplore(buildWordSpec(item.word))} + >
{item.word}
{item.count.toLocaleString()} uses @@ -81,7 +91,11 @@ const LinguisticStats = ({ data }: LinguisticStatsProps) => { }} > {topBigrams.map((item) => ( -
+
onExplore(buildNgramSpec(item.ngram))} + >
{item.ngram}
{item.count.toLocaleString()} uses @@ -102,7 +116,11 @@ const LinguisticStats = ({ data }: LinguisticStatsProps) => { }} > {topTrigrams.map((item) => ( -
+
onExplore(buildNgramSpec(item.ngram))} + >
{item.ngram}
{item.count.toLocaleString()} uses diff --git a/frontend/src/components/SummaryStats.tsx b/frontend/src/components/SummaryStats.tsx index 01cedb3..d53d6da 100644 --- a/frontend/src/components/SummaryStats.tsx +++ b/frontend/src/components/SummaryStats.tsx @@ -1,4 +1,4 @@ -import { memo, useMemo, useState } from "react"; +import { memo, useMemo } from "react"; import { LineChart, Line, @@ -13,7 +13,6 @@ import ActivityHeatmap from "../stats/ActivityHeatmap"; import { ReactWordcloud } from "@cp949/react-wordcloud"; import StatsStyling from "../styles/stats_styling"; import Card from "../components/Card"; -import UserModal from "../components/UserModal"; import { type SummaryResponse, @@ -21,8 +20,15 @@ import { type UserEndpointResponse, type TimeAnalysisResponse, type LinguisticAnalysisResponse, - type User, } from "../types/ApiTypes"; +import { + buildAllRecordsSpec, + buildDateBucketSpec, + buildOneTimeUsersSpec, + buildUserSpec, + getExplorerButtonStyle, + type CorpusExplorerSpec, +} from "../utils/corpusExplorer"; const styles = StatsStyling; const MAX_WORDCLOUD_WORDS = 250; @@ -39,6 +45,7 @@ type SummaryStatsProps = { timeData: TimeAnalysisResponse | null; linguisticData: LinguisticAnalysisResponse | null; summary: SummaryResponse | null; + onExplore: (spec: CorpusExplorerSpec) => void; }; type WordCloudPanelProps = { @@ -60,7 +67,7 @@ function formatDateRange(startUnix: number, endUnix: number) { day: "2-digit", }); - return `${fmt(start)} → ${fmt(end)}`; + return `${fmt(start)} -> ${fmt(end)}`; } function convertFrequencyData(data: FrequencyWord[]) { @@ -70,25 +77,22 @@ function convertFrequencyData(data: FrequencyWord[]) { })); } +const renderExploreButton = (onClick: () => void) => ( + +); + const SummaryStats = ({ userData, timeData, linguisticData, summary, + onExplore, }: SummaryStatsProps) => { - const [selectedUser, setSelectedUser] = useState(null); - const usersByAuthor = useMemo(() => { - const nextMap = new Map(); - for (const user of 
userData?.users ?? []) { - nextMap.set(user.author, user); - } - return nextMap; - }, [userData?.users]); - - const selectedUserData: User | null = selectedUser - ? usersByAuthor.get(selectedUser) ?? null - : null; - const wordCloudWords = useMemo( () => convertFrequencyData( @@ -104,49 +108,41 @@ const SummaryStats = ({ return (
- {/* main grid*/}
onExplore(buildAllRecordsSpec()))} + style={{ gridColumn: "span 4" }} /> onExplore(buildAllRecordsSpec()))} + style={{ gridColumn: "span 4" }} /> onExplore(buildAllRecordsSpec()))} + style={{ gridColumn: "span 4" }} /> onExplore(buildAllRecordsSpec()))} + style={{ gridColumn: "span 4" }} /> onExplore(buildOneTimeUsersSpec()))} + style={{ gridColumn: "span 4" }} /> 3 ? "…" : "") - : "—" + (summary.sources.length > 3 ? "..." : "") + : "-" } - style={{ - gridColumn: "span 4", - }} + rightSlot={renderExploreButton(() => onExplore(buildAllRecordsSpec()))} + style={{ gridColumn: "span 4" }} /> - {/* events per day */}

Activity Over Time

-

- How much posting happened each day. -

+

How much posting happened each day.

- + { + const payload = (state as { activePayload?: Array<{ payload?: { date?: string } }> }) + ?.activePayload?.[0]?.payload as + | { date?: string } + | undefined; + if (payload?.date) { + onExplore(buildDateBucketSpec(String(payload.date))); + } + }} + > @@ -201,7 +203,6 @@ const SummaryStats = ({
- {/* Word Cloud */}

Common Words

@@ -213,7 +214,6 @@ const SummaryStats = ({

- {/* Top Users */}
@@ -225,7 +225,7 @@ const SummaryStats = ({
setSelectedUser(item.author)} + onClick={() => onExplore(buildUserSpec(item.author))} >
{item.author}
@@ -236,7 +236,6 @@ const SummaryStats = ({
- {/* Heatmap */}

Weekly Activity Pattern

@@ -248,13 +247,6 @@ const SummaryStats = ({

- - setSelectedUser(null)} - username={selectedUser ?? ""} - userData={selectedUserData} - />
); }; diff --git a/frontend/src/components/UserStats.tsx b/frontend/src/components/UserStats.tsx index fd38f77..50d96f3 100644 --- a/frontend/src/components/UserStats.tsx +++ b/frontend/src/components/UserStats.tsx @@ -5,6 +5,12 @@ import { type TopUser, type InteractionGraph } from "../types/ApiTypes"; import StatsStyling from "../styles/stats_styling"; import Card from "./Card"; +import { + buildReplyPairSpec, + toText, + buildUserSpec, + type CorpusExplorerSpec, +} from "../utils/corpusExplorer"; const styles = StatsStyling; @@ -39,6 +45,7 @@ type UserStatsProps = { interactionGraph: InteractionGraph; totalUsers: number; mostCommentHeavyUser: { author: string; commentShare: number } | null; + onExplore: (spec: CorpusExplorerSpec) => void; }; const UserStats = ({ @@ -46,6 +53,7 @@ const UserStats = ({ interactionGraph, totalUsers, mostCommentHeavyUser, + onExplore, }: UserStatsProps) => { const graphData = useMemo( () => ApiToGraphData(interactionGraph), @@ -87,9 +95,9 @@ const UserStats = ({ null, ); - const mostActiveUser = topUsers.find( - (u) => u.author !== "[deleted]", - ); + const mostActiveUser = topUsers.find((u) => u.author !== "[deleted]"); + const strongestLinkSource = strongestLink ? toText(strongestLink.source) : ""; + const strongestLinkTarget = strongestLink ? toText(strongestLink.target) : ""; return (
@@ -114,37 +122,69 @@ const UserStats = ({ /> onExplore(buildUserSpec(mostActiveUser.author))} + style={styles.buttonSecondary} + > + Explore + + ) : null + } style={{ gridColumn: "span 3" }} /> ${strongestLink.target}` - : "—" + strongestLinkSource && strongestLinkTarget + ? `${strongestLinkSource} -> ${strongestLinkTarget}` + : "-" } sublabel={ strongestLink ? `${strongestLink.value.toLocaleString()} replies` : "No graph links after filtering" } + rightSlot={ + strongestLinkSource && strongestLinkTarget ? ( + + ) : null + } style={{ gridColumn: "span 6" }} /> onExplore(buildUserSpec(mostCommentHeavyUser.author))} + style={styles.buttonSecondary} + > + Explore + + ) : null + } style={{ gridColumn: "span 6" }} /> @@ -166,6 +206,19 @@ const UserStats = ({ linkDirectionalParticleSpeed={0.004} linkWidth={(link) => Math.sqrt(Number(link.value))} nodeLabel={(node) => `${node.id}`} + onNodeClick={(node) => { + const userId = toText(node.id); + if (userId) { + onExplore(buildUserSpec(userId)); + } + }} + onLinkClick={(link) => { + const source = toText(link.source); + const target = toText(link.target); + if (source && target) { + onExplore(buildReplyPairSpec(source, target)); + } + }} />
diff --git a/frontend/src/pages/DatasetEdit.tsx b/frontend/src/pages/DatasetEdit.tsx index 798ec35..34c556d 100644 --- a/frontend/src/pages/DatasetEdit.tsx +++ b/frontend/src/pages/DatasetEdit.tsx @@ -22,12 +22,10 @@ const DatasetEditPage = () => { const [isSaving, setIsSaving] = useState(false); const [isDeleting, setIsDeleting] = useState(false); const [isDeleteModalOpen, setIsDeleteModalOpen] = useState(false); - const [hasError, setHasError] = useState(false); const [datasetName, setDatasetName] = useState(""); useEffect(() => { if (!Number.isInteger(parsedDatasetId) || parsedDatasetId <= 0) { - setHasError(true); setStatusMessage("Invalid dataset id."); setLoading(false); return; @@ -35,7 +33,6 @@ const DatasetEditPage = () => { const token = localStorage.getItem("access_token"); if (!token) { - setHasError(true); setStatusMessage("You must be signed in to edit datasets."); setLoading(false); return; @@ -49,7 +46,6 @@ const DatasetEditPage = () => { setDatasetName(response.data.name || ""); }) .catch((error: unknown) => { - setHasError(true); if (axios.isAxiosError(error)) { setStatusMessage( String(error.response?.data?.error || error.message), @@ -68,21 +64,18 @@ const DatasetEditPage = () => { const trimmedName = datasetName.trim(); if (!trimmedName) { - setHasError(true); setStatusMessage("Please enter a valid dataset name."); return; } const token = localStorage.getItem("access_token"); if (!token) { - setHasError(true); setStatusMessage("You must be signed in to save changes."); return; } try { setIsSaving(true); - setHasError(false); setStatusMessage(""); await axios.patch( @@ -93,7 +86,6 @@ const DatasetEditPage = () => { navigate("/datasets", { replace: true }); } catch (error: unknown) { - setHasError(true); if (axios.isAxiosError(error)) { setStatusMessage( String( @@ -111,7 +103,6 @@ const DatasetEditPage = () => { const deleteDataset = async () => { const deleteToken = localStorage.getItem("access_token"); if (!deleteToken) { - setHasError(true); 
setStatusMessage("You must be signed in to delete datasets."); setIsDeleteModalOpen(false); return; @@ -119,7 +110,6 @@ const DatasetEditPage = () => { try { setIsDeleting(true); - setHasError(false); setStatusMessage(""); await axios.delete(`${API_BASE_URL}/dataset/${parsedDatasetId}`, { @@ -129,7 +119,6 @@ const DatasetEditPage = () => { setIsDeleteModalOpen(false); navigate("/datasets", { replace: true }); } catch (error: unknown) { - setHasError(true); if (axios.isAxiosError(error)) { setStatusMessage( String( diff --git a/frontend/src/pages/Stats.tsx b/frontend/src/pages/Stats.tsx index 4327dcf..d520ca4 100644 --- a/frontend/src/pages/Stats.tsx +++ b/frontend/src/pages/Stats.tsx @@ -1,4 +1,4 @@ -import { useEffect, useState, useRef } from "react"; +import { useEffect, useRef, useState } from "react"; import axios from "axios"; import { useParams } from "react-router-dom"; import StatsStyling from "../styles/stats_styling"; @@ -8,6 +8,7 @@ import UserStats from "../components/UserStats"; import LinguisticStats from "../components/LinguisticStats"; import InteractionalStats from "../components/InteractionalStats"; import CulturalStats from "../components/CulturalStats"; +import CorpusExplorer from "../components/CorpusExplorer"; import { type SummaryResponse, @@ -19,10 +20,15 @@ import { type InteractionAnalysisResponse, type CulturalAnalysisResponse, } from "../types/ApiTypes"; +import { + buildExplorerContext, + type CorpusExplorerSpec, + type DatasetRecord, +} from "../utils/corpusExplorer"; const API_BASE_URL = import.meta.env.VITE_BACKEND_URL; const styles = StatsStyling; -const DELETED_USERS = ["[deleted]"]; +const DELETED_USERS = ["[deleted]", "automoderator"]; const isDeletedUser = (value: string | null | undefined) => DELETED_USERS.includes((value ?? 
"").trim().toLowerCase()); @@ -40,6 +46,97 @@ type UserStatsMeta = { mostCommentHeavyUser: { author: string; commentShare: number } | null; }; +type ExplorerState = { + open: boolean; + title: string; + description: string; + emptyMessage: string; + records: DatasetRecord[]; + loading: boolean; + error: string; +}; + +const EMPTY_EXPLORER_STATE: ExplorerState = { + open: false, + title: "Corpus Explorer", + description: "", + emptyMessage: "No records found.", + records: [], + loading: false, + error: "", +}; + +const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => { + if (typeof payload === "string") { + try { + return normalizeRecordPayload(JSON.parse(payload)); + } catch { + throw new Error("Corpus endpoint returned a non-JSON string payload."); + } + } + + if ( + payload && + typeof payload === "object" && + "error" in payload && + typeof (payload as { error?: unknown }).error === "string" + ) { + throw new Error((payload as { error: string }).error); + } + + if (Array.isArray(payload)) { + return payload as DatasetRecord[]; + } + + if ( + payload && + typeof payload === "object" && + "data" in payload && + Array.isArray((payload as { data?: unknown }).data) + ) { + return (payload as { data: DatasetRecord[] }).data; + } + + if ( + payload && + typeof payload === "object" && + "records" in payload && + Array.isArray((payload as { records?: unknown }).records) + ) { + return (payload as { records: DatasetRecord[] }).records; + } + + if ( + payload && + typeof payload === "object" && + "rows" in payload && + Array.isArray((payload as { rows?: unknown }).rows) + ) { + return (payload as { rows: DatasetRecord[] }).rows; + } + + if ( + payload && + typeof payload === "object" && + "result" in payload && + Array.isArray((payload as { result?: unknown }).result) + ) { + return (payload as { result: DatasetRecord[] }).result; + } + + if (payload && typeof payload === "object") { + const values = Object.values(payload); + if (values.length === 1 && 
Array.isArray(values[0])) { + return values[0] as DatasetRecord[]; + } + if (values.every((value) => value && typeof value === "object")) { + return values as DatasetRecord[]; + } + } + + throw new Error("Corpus endpoint returned an unexpected payload."); +}; + const StatPage = () => { const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>(); const [error, setError] = useState(""); @@ -61,6 +158,12 @@ const StatPage = () => { totalUsers: 0, mostCommentHeavyUser: null, }); + const [appliedFilters, setAppliedFilters] = useState>({}); + const [allRecords, setAllRecords] = useState(null); + const [allRecordsKey, setAllRecordsKey] = useState(""); + const [explorerState, setExplorerState] = useState( + EMPTY_EXPLORER_STATE, + ); const searchInputRef = useRef(null); const beforeDateRef = useRef(null); @@ -104,6 +207,82 @@ const StatPage = () => { }; }; + const getFilterKey = (params: Record) => + JSON.stringify(Object.entries(params).sort(([a], [b]) => a.localeCompare(b))); + + const ensureFilteredRecords = async () => { + if (!datasetId) { + throw new Error("Missing dataset id."); + } + + const authHeaders = getAuthHeaders(); + if (!authHeaders) { + throw new Error("You must be signed in to load corpus records."); + } + + const filterKey = getFilterKey(appliedFilters); + if (allRecords && allRecordsKey === filterKey) { + return allRecords; + } + + const response = await axios.get( + `${API_BASE_URL}/dataset/${datasetId}/all`, + { + params: appliedFilters, + headers: authHeaders, + }, + ); + + const normalizedRecords = normalizeRecordPayload(response.data); + + setAllRecords(normalizedRecords); + setAllRecordsKey(filterKey); + return normalizedRecords; + }; + + const openExplorer = async (spec: CorpusExplorerSpec) => { + setExplorerState({ + open: true, + title: spec.title, + description: spec.description, + emptyMessage: spec.emptyMessage ?? 
"No matching records found.", + records: [], + loading: true, + error: "", + }); + + try { + const records = await ensureFilteredRecords(); + const context = buildExplorerContext(records); + const matched = records.filter((record) => spec.matcher(record, context)); + matched.sort((a, b) => { + const aValue = String(a.dt ?? a.date ?? a.timestamp ?? ""); + const bValue = String(b.dt ?? b.date ?? b.timestamp ?? ""); + return bValue.localeCompare(aValue); + }); + + setExplorerState({ + open: true, + title: spec.title, + description: spec.description, + emptyMessage: spec.emptyMessage ?? "No matching records found.", + records: matched, + loading: false, + error: "", + }); + } catch (e) { + setExplorerState({ + open: true, + title: spec.title, + description: spec.description, + emptyMessage: spec.emptyMessage ?? "No matching records found.", + records: [], + loading: false, + error: `Failed to load corpus records: ${String(e)}`, + }); + } + }; + const getStats = (params: Record = {}) => { if (!datasetId) { setError("Missing dataset id. 
Open /dataset//stats."); @@ -118,22 +297,20 @@ const StatPage = () => { setError(""); setLoading(true); + setAppliedFilters(params); + setAllRecords(null); + setAllRecordsKey(""); + setExplorerState((current) => ({ ...current, open: false })); Promise.all([ - axios.get( - `${API_BASE_URL}/dataset/${datasetId}/temporal`, - { - params, - headers: authHeaders, - }, - ), - axios.get( - `${API_BASE_URL}/dataset/${datasetId}/user`, - { - params, - headers: authHeaders, - }, - ), + axios.get(`${API_BASE_URL}/dataset/${datasetId}/temporal`, { + params, + headers: authHeaders, + }), + axios.get(`${API_BASE_URL}/dataset/${datasetId}/user`, { + params, + headers: authHeaders, + }), axios.get( `${API_BASE_URL}/dataset/${datasetId}/linguistic`, { @@ -141,13 +318,10 @@ const StatPage = () => { headers: authHeaders, }, ), - axios.get( - `${API_BASE_URL}/dataset/${datasetId}/emotional`, - { - params, - headers: authHeaders, - }, - ), + axios.get(`${API_BASE_URL}/dataset/${datasetId}/emotional`, { + params, + headers: authHeaders, + }), axios.get( `${API_BASE_URL}/dataset/${datasetId}/interactional`, { @@ -155,20 +329,14 @@ const StatPage = () => { headers: authHeaders, }, ), - axios.get( - `${API_BASE_URL}/dataset/${datasetId}/summary`, - { - params, - headers: authHeaders, - }, - ), - axios.get( - `${API_BASE_URL}/dataset/${datasetId}/cultural`, - { - params, - headers: authHeaders, - }, - ), + axios.get(`${API_BASE_URL}/dataset/${datasetId}/summary`, { + params, + headers: authHeaders, + }), + axios.get(`${API_BASE_URL}/dataset/${datasetId}/cultural`, { + params, + headers: authHeaders, + }), ]) .then( ([ @@ -182,8 +350,7 @@ const StatPage = () => { ]) => { const usersList = userRes.data.users ?? []; const topUsersList = userRes.data.top_users ?? []; - const interactionGraphRaw = - interactionRes.data?.interaction_graph ?? {}; + const interactionGraphRaw = interactionRes.data?.interaction_graph ?? {}; const topPairsRaw = interactionRes.data?.top_interaction_pairs ?? 
[]; const filteredUsers: typeof usersList = []; @@ -194,18 +361,14 @@ const StatPage = () => { const filteredTopUsers: typeof topUsersList = []; for (const user of topUsersList) { - if (isDeletedUser(user.author)) continue; - filteredTopUsers.push(user); + if (isDeletedUser(user.author)) continue; + filteredTopUsers.push(user); } - let mostCommentHeavyUser: UserStatsMeta["mostCommentHeavyUser"] = - null; + let mostCommentHeavyUser: UserStatsMeta["mostCommentHeavyUser"] = null; for (const user of filteredUsers) { const currentShare = user.comment_share ?? 0; - if ( - !mostCommentHeavyUser || - currentShare > mostCommentHeavyUser.commentShare - ) { + if (!mostCommentHeavyUser || currentShare > mostCommentHeavyUser.commentShare) { mostCommentHeavyUser = { author: user.author, commentShare: currentShare, @@ -221,8 +384,7 @@ const StatPage = () => { } } - const filteredInteractionGraph: Record> = - {}; + const filteredInteractionGraph: Record> = {}; for (const [source, targets] of Object.entries(interactionGraphRaw)) { if (isDeletedUser(source)) { continue; @@ -279,7 +441,7 @@ const StatPage = () => { setSummary(filteredSummary || null); }, ) - .catch((e) => setError("Failed to load statistics: " + String(e))) + .catch((e) => setError(`Failed to load statistics: ${String(e)}`)) .finally(() => setLoading(false)); }; @@ -302,6 +464,9 @@ const StatPage = () => { useEffect(() => { setError(""); + setAllRecords(null); + setAllRecordsKey(""); + setExplorerState(EMPTY_EXPLORER_STATE); if (!datasetId) { setError("Missing dataset id. Open /dataset//stats."); return; @@ -398,9 +563,7 @@ const StatPage = () => { @@ -449,9 +608,7 @@ const StatPage = () => {
)} + + setExplorerState((current) => ({ ...current, open: false }))} + title={explorerState.title} + description={explorerState.description} + records={explorerState.records} + loading={explorerState.loading} + error={explorerState.error} + emptyMessage={explorerState.emptyMessage} + />
); }; diff --git a/frontend/src/utils/corpusExplorer.ts b/frontend/src/utils/corpusExplorer.ts new file mode 100644 index 0000000..e1ddb70 --- /dev/null +++ b/frontend/src/utils/corpusExplorer.ts @@ -0,0 +1,405 @@ +import type { CSSProperties } from "react"; + +type EntityRecord = { + text?: string; + [key: string]: unknown; +}; + +type DatasetRecord = { + id?: string | number; + post_id?: string | number | null; + parent_id?: string | number | null; + author?: string | null; + title?: string | null; + content?: string | null; + timestamp?: string | number | null; + date?: string | null; + dt?: string | null; + hour?: number | null; + weekday?: string | null; + reply_to?: string | number | null; + source?: string | null; + topic?: string | null; + topic_confidence?: number | null; + type?: string | null; + ner_entities?: EntityRecord[] | null; + emotion_anger?: number | null; + emotion_disgust?: number | null; + emotion_fear?: number | null; + emotion_joy?: number | null; + emotion_sadness?: number | null; + [key: string]: unknown; +}; + +type CorpusExplorerContext = { + authorByPostId: Map; + authorEventCounts: Map; + authorCommentCounts: Map; +}; + +type CorpusExplorerSpec = { + title: string; + description: string; + emptyMessage?: string; + matcher: (record: DatasetRecord, context: CorpusExplorerContext) => boolean; +}; + +const IN_GROUP_PATTERN = /\b(we|us|our|ourselves)\b/gi; +const OUT_GROUP_PATTERN = /\b(they|them|their|themselves)\b/gi; +const HEDGE_PATTERN = /\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b/i; +const CERTAINTY_PATTERN = /\b(definitely|certainly|clearly|obviously|undeniably|always|never)\b/i; +const DEONTIC_PATTERN = /\b(must|should|need|needs|have to|has to|ought|required|require)\b/i; +const PERMISSION_PATTERN = /\b(can|allowed|okay|ok|permitted)\b/i; +const EMOTION_KEYS = [ + "emotion_anger", + "emotion_disgust", + "emotion_fear", + "emotion_joy", + "emotion_sadness", +] as const; + 
/**
 * Compact padding / font-size overrides for small inline buttons.
 */
const shrinkButtonStyle: CSSProperties = {
  padding: "4px 8px",
  fontSize: 12,
};

/**
 * Coerces a loosely typed value to text.
 *
 * - strings are returned unchanged,
 * - numbers and booleans are stringified,
 * - objects carrying a string/number `id` property yield that id,
 * - everything else (null, undefined, other objects) yields `""`.
 */
const toText = (value: unknown): string => {
  if (typeof value === "string") {
    return value;
  }

  if (typeof value === "number" || typeof value === "boolean") {
    return String(value);
  }

  if (value && typeof value === "object" && "id" in value) {
    const id = (value as { id?: unknown }).id;
    if (typeof id === "string" || typeof id === "number") {
      return String(id);
    }
  }

  return "";
};

/** Trimmed, lower-cased text form of `value`, used for case-insensitive equality. */
const normalize = (value: unknown): string => toText(value).trim().toLowerCase();

/** Title and content of a record joined by a single space (missing parts become empty). */
const getRecordText = (record: DatasetRecord): string =>
  `${record.title ?? ""} ${record.content ?? ""}`.trim();

/** Escapes every regular-expression metacharacter in `value`. */
const escapeRegExp = (value: string): string =>
  value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/**
 * Builds a case-insensitive pattern matching `phrase` as a whole phrase, with
 * any run of whitespace allowed between its tokens.
 *
 * A `\b` anchor is only emitted on an edge that starts/ends with a word
 * character. Anchoring an edge such as the trailing "+" of "c++" or the "."
 * of "u.s." would require a word character right next to it, so such phrases
 * could never match ordinary text.
 *
 * @returns the compiled pattern, or `null` when the phrase has no tokens.
 */
const buildPhrasePattern = (phrase: string): RegExp | null => {
  const rawTokens = phrase.toLowerCase().trim().split(/\s+/).filter(Boolean);
  if (!rawTokens.length) {
    return null;
  }

  const leadingAnchor = /^\w/.test(rawTokens[0]) ? "\\b" : "";
  const trailingAnchor = /\w$/.test(rawTokens[rawTokens.length - 1]) ? "\\b" : "";
  const body = rawTokens.map(escapeRegExp).join("\\s+");

  return new RegExp(`${leadingAnchor}${body}${trailingAnchor}`, "i");
};

/**
 * Counts all case-insensitive matches of `pattern` in `text`.
 * A fresh global copy is compiled so the caller's regex state is never touched.
 */
const countMatches = (pattern: RegExp, text: string): number =>
  Array.from(text.matchAll(new RegExp(pattern.source, "gi"))).length;

/**
 * Converts epoch seconds to a `YYYY-MM-DD` (UTC) string.
 * Returns `""` for values that do not form a valid Date, because
 * `toISOString()` throws a RangeError on an invalid Date.
 */
const epochSecondsToDay = (seconds: number): string => {
  const parsed = new Date(seconds * 1000);
  return Number.isNaN(parsed.getTime()) ? "" : parsed.toISOString().slice(0, 10);
};

/**
 * Resolves the day bucket of a record.
 *
 * Preference order: first 10 characters of `date`, first 10 characters of
 * `dt`, then `timestamp` interpreted as epoch seconds (number or numeric
 * string). Returns `""` when no usable value is present; malformed
 * timestamps yield `""` instead of throwing.
 */
const getDateBucket = (record: DatasetRecord): string => {
  if (typeof record.date === "string" && record.date) {
    return record.date.slice(0, 10);
  }

  if (typeof record.dt === "string" && record.dt) {
    return record.dt.slice(0, 10);
  }

  if (typeof record.timestamp === "number") {
    return epochSecondsToDay(record.timestamp);
  }

  if (typeof record.timestamp === "string" && record.timestamp) {
    const numeric = Number(record.timestamp);
    if (Number.isFinite(numeric)) {
      return epochSecondsToDay(numeric);
    }
  }

  return "";
};

/**
 * Name (without the `emotion_` prefix) of the highest-scoring emotion field.
 * Returns `""` when no emotion field holds a comparable score; on ties the
 * earliest key in `EMOTION_KEYS` wins because the comparison is strict.
 */
const getDominantEmotion = (record: DatasetRecord): string => {
  let bestKey = "";
  let bestValue = Number.NEGATIVE_INFINITY;

  for (const key of EMOTION_KEYS) {
    // Missing scores become -Infinity and NaN never compares greater, so
    // neither can be selected.
    const value = Number(record[key] ?? Number.NEGATIVE_INFINITY);
    if (value > bestValue) {
      bestValue = value;
      bestKey = key;
    }
  }

  return bestKey.replace("emotion_", "");
};

/**
 * Compiles `phrase` once and returns a predicate testing a record's text.
 * The pattern has no global/sticky flag, so repeated `test` calls are stateless.
 */
const createPhraseMatcher = (phrase: string) => {
  const pattern = buildPhrasePattern(phrase);
  return (record: DatasetRecord): boolean =>
    pattern !== null && pattern.test(getRecordText(record).toLowerCase());
};

/** True when the record's title/content contains `phrase` as a whole phrase. */
const matchesPhrase = (record: DatasetRecord, phrase: string): boolean =>
  createPhraseMatcher(phrase)(record);

/**
 * Classifies a record by comparing in-group and out-group pronoun counts:
 * `"in"`, `"out"`, or `"tie"` when the counts are equal (including both zero).
 */
const recordIdentityBucket = (record: DatasetRecord): "in" | "out" | "tie" => {
  const text = getRecordText(record).toLowerCase();
  const inHits = countMatches(IN_GROUP_PATTERN, text);
  const outHits = countMatches(OUT_GROUP_PATTERN, text);

  if (inHits > outHits) {
    return "in";
  }

  if (outHits > inHits) {
    return "out";
  }

  return "tie";
};

/** Tallies records per trimmed author name, skipping blank authors and records rejected by `include`. */
const countRecordsByAuthor = (
  records: DatasetRecord[],
  include: (record: DatasetRecord) => boolean,
): Map<string, number> => {
  const counts = new Map<string, number>();
  for (const record of records) {
    const author = toText(record.author).trim();
    if (!author || !include(record)) {
      continue;
    }
    counts.set(author, (counts.get(author) ?? 0) + 1);
  }
  return counts;
};

/** Number of records (of any type) per author. */
const createAuthorEventCounts = (records: DatasetRecord[]): Map<string, number> =>
  countRecordsByAuthor(records, () => true);

/** Number of records with `type === "comment"` per author. */
const createAuthorCommentCounts = (records: DatasetRecord[]): Map<string, number> =>
  countRecordsByAuthor(records, (record) => record.type === "comment");

/**
 * Indexes author names by stringified `post_id`; later records overwrite
 * earlier ones that share a `post_id`.
 *
 * NOTE(review): reply lookups resolve `reply_to` through this index. If
 * comments carry their parent's `post_id` rather than an id of their own,
 * they will overwrite the parent's author here — confirm against the
 * server-side record schema.
 */
const createAuthorByPostId = (records: DatasetRecord[]): Map<string, string> => {
  const map = new Map<string, string>();
  for (const record of records) {
    const postId = record.post_id;
    const author = toText(record.author).trim();
    if (postId === null || postId === undefined || !author) {
      continue;
    }
    map.set(String(postId), author);
  }
  return map;
};

/** Precomputes the lookup tables that spec matchers receive as their second argument. */
const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => ({
  authorByPostId: createAuthorByPostId(records),
  authorEventCounts: createAuthorEventCounts(records),
  authorCommentCounts: createAuthorCommentCounts(records),
});

/**
 * Returns a predicate comparing one record field with `expected`, ignoring
 * case and surrounding whitespace. `expected` is normalised once, not per record.
 */
const matchNormalizedField = (
  field: "author" | "topic" | "source",
  expected: string,
) => {
  const wanted = normalize(expected);
  return (record: DatasetRecord): boolean => normalize(record[field]) === wanted;
};

/** Spec that matches every record. */
const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
  title: "Corpus Explorer",
  description: "All records in the current filtered dataset.",
  emptyMessage: "No records match the current filters.",
  matcher: () => true,
});

/** Spec matching records written by `author` (case-insensitive). */
const buildUserSpec = (author: string): CorpusExplorerSpec => ({
  title: `User: ${author}`,
  description: `All records authored by ${author}.`,
  emptyMessage: `No records found for ${author}.`,
  matcher: matchNormalizedField("author", author),
});

/** Spec matching records whose `topic` equals `topic` (case-insensitive). */
const buildTopicSpec = (topic: string): CorpusExplorerSpec => ({
  title: `Topic: ${topic}`,
  description: `Records assigned to the ${topic} topic bucket.`,
  emptyMessage: `No records found in the ${topic} topic bucket.`,
  matcher: matchNormalizedField("topic", topic),
});

/** Spec matching records whose day bucket (see `getDateBucket`) equals `date`. */
const buildDateBucketSpec = (date: string): CorpusExplorerSpec => ({
  title: `Date Bucket: ${date}`,
  description: `Records from the ${date} activity bucket.`,
  emptyMessage: `No records found on ${date}.`,
  matcher: (record) => getDateBucket(record) === date,
});

/** Spec matching records whose text contains `word` as a whole word. */
const buildWordSpec = (word: string): CorpusExplorerSpec => ({
  title: `Word: ${word}`,
  description: `Records containing the word ${word}.`,
  emptyMessage: `No records mention ${word}.`,
  matcher: createPhraseMatcher(word),
});

/** Spec matching records whose text contains the phrase `ngram`. */
const buildNgramSpec = (ngram: string): CorpusExplorerSpec => ({
  title: `N-gram: ${ngram}`,
  description: `Records containing the phrase ${ngram}.`,
  emptyMessage: `No records contain the phrase ${ngram}.`,
  matcher: createPhraseMatcher(ngram),
});

/**
 * Spec matching records that either list `entity` in `ner_entities`
 * (case-insensitive on `text`) or mention it verbatim in their text.
 */
const buildEntitySpec = (entity: string): CorpusExplorerSpec => {
  const target = normalize(entity);
  const mentionsEntity = createPhraseMatcher(entity);

  return {
    title: `Entity: ${entity}`,
    description: `Records mentioning the ${entity} entity.`,
    emptyMessage: `No records found for the ${entity} entity.`,
    matcher: (record) => {
      const entities = Array.isArray(record.ner_entities) ? record.ner_entities : [];
      return entities.some((item) => normalize(item?.text) === target) || mentionsEntity(record);
    },
  };
};

/** Spec matching records whose `source` equals `source` (case-insensitive). */
const buildSourceSpec = (source: string): CorpusExplorerSpec => ({
  title: `Source: ${source}`,
  description: `Records from the ${source} source.`,
  emptyMessage: `No records found for ${source}.`,
  matcher: matchNormalizedField("source", source),
});

/** Spec matching records whose highest emotion score is `emotion`. */
const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => {
  const wanted = normalize(emotion);

  return {
    title: `Dominant Emotion: ${emotion}`,
    description: `Records where ${emotion} is the strongest emotion score.`,
    emptyMessage: `No records found with dominant emotion ${emotion}.`,
    matcher: (record) => getDominantEmotion(record) === wanted,
  };
};

/**
 * Spec matching records authored by `source` whose `reply_to` resolves,
 * through `context.authorByPostId`, to a record authored by `target`.
 */
const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => {
  const wantedSource = normalize(source);
  const wantedTarget = normalize(target);

  return {
    title: `Reply Path: ${source} -> ${target}`,
    description: `Reply records authored by ${source} in response to ${target}.`,
    emptyMessage: `No reply records found for ${source} -> ${target}.`,
    matcher: (record, context) => {
      if (normalize(record.author) !== wantedSource) {
        return false;
      }

      const replyTo = record.reply_to;
      if (replyTo === null || replyTo === undefined || replyTo === "") {
        return false;
      }

      // A parent that is absent from the index must never count as a match,
      // even when `target` itself normalises to an empty string.
      const replyTarget = context.authorByPostId.get(String(replyTo));
      return replyTarget !== undefined && normalize(replyTarget) === wantedTarget;
    },
  };
};

+const buildOneTimeUsersSpec = (): CorpusExplorerSpec => ({ + title: "One-Time Users", + description: "Records written by authors who appear exactly once in the filtered corpus.", + emptyMessage: "No one-time-user records found.", + matcher: (record, context) => { + const author = toText(record.author).trim(); + return !!author && context.authorEventCounts.get(author) === 1; + }, +}); + +const buildTopCommentersSpec = (topAuthorCount: number): CorpusExplorerSpec => ({ + title: "Top Commenters", + description: `Comment records from the top ${topAuthorCount} commenters in the filtered corpus.`, + emptyMessage: "No top-commenter records found.", + matcher: (record, context) => { + if (record.type !== "comment") { + return false; + } + + const rankedAuthors = Array.from(context.authorCommentCounts.entries()) + .sort((a, b) => b[1] - a[1]) + .slice(0, topAuthorCount) + .map(([author]) => author); + + return rankedAuthors.includes(toText(record.author).trim()); + }, +}); + +const buildSingleCommentAuthorsSpec = (): CorpusExplorerSpec => ({ + title: "Single-Comment Authors", + description: "Comment records from authors who commented exactly once.", + emptyMessage: "No single-comment-author records found.", + matcher: (record, context) => { + const author = toText(record.author).trim(); + return record.type === "comment" && !!author && context.authorCommentCounts.get(author) === 1; + }, +}); + +const buildIdentityBucketSpec = (bucket: "in" | "out" | "tie"): CorpusExplorerSpec => { + const labels = { + in: "In-Group Posts", + out: "Out-Group Posts", + tie: "Balanced Posts", + } as const; + + return { + title: labels[bucket], + description: `Records in the ${labels[bucket].toLowerCase()} cultural bucket.`, + emptyMessage: `No records found for ${labels[bucket].toLowerCase()}.`, + matcher: (record) => recordIdentityBucket(record) === bucket, + }; +}; + +const buildPatternSpec = ( + title: string, + description: string, + pattern: RegExp, +): CorpusExplorerSpec => ({ + title, 
+ description, + emptyMessage: `No records found for ${title.toLowerCase()}.`, + matcher: (record) => pattern.test(getRecordText(record)), +}); + +const buildHedgeSpec = () => + buildPatternSpec("Hedging Words", "Records containing hedging language.", HEDGE_PATTERN); + +const buildCertaintySpec = () => + buildPatternSpec("Certainty Words", "Records containing certainty language.", CERTAINTY_PATTERN); + +const buildDeonticSpec = () => + buildPatternSpec("Need/Should Words", "Records containing deontic language.", DEONTIC_PATTERN); + +const buildPermissionSpec = () => + buildPatternSpec("Permission Words", "Records containing permission language.", PERMISSION_PATTERN); + +const getExplorerButtonStyle = () => shrinkButtonStyle; + +export type { DatasetRecord, CorpusExplorerContext, CorpusExplorerSpec }; +export { + buildAllRecordsSpec, + buildCertaintySpec, + buildDateBucketSpec, + buildDeonticSpec, + buildDominantEmotionSpec, + buildEntitySpec, + buildExplorerContext, + buildHedgeSpec, + buildIdentityBucketSpec, + buildNgramSpec, + buildOneTimeUsersSpec, + buildPermissionSpec, + buildReplyPairSpec, + buildSingleCommentAuthorsSpec, + buildSourceSpec, + buildTopicSpec, + buildTopCommentersSpec, + buildUserSpec, + buildWordSpec, + getDateBucket, + getExplorerButtonStyle, + toText, +}; diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index f60d809..d45ab1d 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -1,4 +1,5 @@ import nltk +import json import pandas as pd from nltk.corpus import stopwords @@ -27,6 +28,8 @@ DOMAIN_STOPWORDS = { "one", } +EXCLUDED_AUTHORS = {"[deleted]", "automoderator"} + nltk.download("stopwords") EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS @@ -46,6 +49,12 @@ class StatGen: filters = filters or {} filtered_df = df.copy() + if "author" in filtered_df.columns: + normalized_authors = ( + filtered_df["author"].fillna("").astype(str).str.strip().str.lower() + ) + filtered_df = 
filtered_df[~normalized_authors.isin(EXCLUDED_AUTHORS)] + search_query = filters.get("search_query", None) start_date_filter = filters.get("start_date", None) end_date_filter = filters.get("end_date", None) @@ -75,9 +84,15 @@ class StatGen: return filtered_df + def _json_ready_records(self, df: pd.DataFrame) -> list[dict]: + return json.loads( + df.to_json(orient="records", date_format="iso", date_unit="s") + ) + ## Public Methods def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]: - return self._prepare_filtered_df(df, filters).to_dict(orient="records") + filtered_df = self._prepare_filtered_df(df, filters) + return self._json_ready_records(filtered_df) def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) diff --git a/server/app.py b/server/app.py index ea59579..aa22a25 100644 --- a/server/app.py +++ b/server/app.py @@ -591,7 +591,8 @@ def get_full_dataset(dataset_id: int): ) dataset_content = dataset_manager.get_dataset_content(dataset_id) - return jsonify(dataset_content.to_dict(orient="records")), 200 + filters = get_request_filters() + return jsonify(stat_gen.filter_dataset(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: