Corpus Explorer Feature #11

Merged
dylan merged 14 commits from feat/corpus-explorer into main 2026-04-13 19:02:45 +01:00
6 changed files with 147 additions and 202 deletions
Showing only changes of commit c11434344a - Show all commits

View File

@@ -1,4 +1,4 @@
import { useEffect, useMemo, useState } from "react"; import { useEffect, useState } from "react";
import { Dialog, DialogPanel, DialogTitle } from "@headlessui/react"; import { Dialog, DialogPanel, DialogTitle } from "@headlessui/react";
import StatsStyling from "../styles/stats_styling"; import StatsStyling from "../styles/stats_styling";
@@ -103,11 +103,6 @@ const CorpusExplorer = ({
} }
}, [open, title, records.length]); }, [open, title, records.length]);
const visibleRecords = useMemo(
() => records.slice(0, visibleCount),
[records, visibleCount],
);
const hasMoreRecords = visibleCount < records.length; const hasMoreRecords = visibleCount < records.length;
return ( return (
@@ -158,7 +153,7 @@ const CorpusExplorer = ({
paddingRight: 4, paddingRight: 4,
}} }}
> >
{visibleRecords.map((record, index) => { {records.slice(0, visibleCount).map((record, index) => {
const recordKey = getRecordKey(record, index); const recordKey = getRecordKey(record, index);
const titleText = getRecordTitle(record); const titleText = getRecordTitle(record);
const content = cleanText(record.content); const content = cleanText(record.content);

View File

@@ -8,11 +8,11 @@ import {
buildHedgeSpec, buildHedgeSpec,
buildIdentityBucketSpec, buildIdentityBucketSpec,
buildPermissionSpec, buildPermissionSpec,
getExplorerButtonStyle,
type CorpusExplorerSpec, type CorpusExplorerSpec,
} from "../utils/corpusExplorer"; } from "../utils/corpusExplorer";
const styles = StatsStyling; const styles = StatsStyling;
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
type CulturalStatsProps = { type CulturalStatsProps = {
data: CulturalAnalysisResponse; data: CulturalAnalysisResponse;
@@ -22,7 +22,7 @@ type CulturalStatsProps = {
const renderExploreButton = (onClick: () => void) => ( const renderExploreButton = (onClick: () => void) => (
<button <button
onClick={onClick} onClick={onClick}
style={{ ...styles.buttonSecondary, ...getExplorerButtonStyle() }} style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
> >
Explore Explore
</button> </button>
@@ -59,21 +59,6 @@ const CulturalStats = ({ data, onExplore }: CulturalStatsProps) => {
return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`; return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`;
}; };
/**
 * Builds a sublabel string from a per-1k rate and a map of emotion scores.
 *
 * The rate is rendered with one decimal place when it is a number; otherwise
 * the generic "Word frequency" caption is used. The label produced by
 * `topEmotion` is appended unless it is the "—" placeholder.
 */
const stanceSublabel = (
  per1kTokens: number | undefined,
  emotionAvg: Record<string, number> | undefined,
) => {
  let rateLabel = "Word frequency";
  if (typeof per1kTokens === "number") {
    rateLabel = `${per1kTokens.toFixed(1)} per 1k words`;
  }

  const mood = topEmotion(emotionAvg);
  if (mood === "—") {
    // No usable emotion label: show the rate on its own.
    return rateLabel;
  }
  return `${rateLabel} • Avg mood: ${mood}`;
};
return ( return (
<div style={styles.page}> <div style={styles.page}>
<div style={{ ...styles.container, ...styles.grid }}> <div style={{ ...styles.container, ...styles.grid }}>

View File

@@ -26,12 +26,12 @@ import {
buildDateBucketSpec, buildDateBucketSpec,
buildOneTimeUsersSpec, buildOneTimeUsersSpec,
buildUserSpec, buildUserSpec,
getExplorerButtonStyle,
type CorpusExplorerSpec, type CorpusExplorerSpec,
} from "../utils/corpusExplorer"; } from "../utils/corpusExplorer";
const styles = StatsStyling; const styles = StatsStyling;
const MAX_WORDCLOUD_WORDS = 250; const MAX_WORDCLOUD_WORDS = 250;
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
const WORDCLOUD_OPTIONS = { const WORDCLOUD_OPTIONS = {
rotations: 2, rotations: 2,
@@ -80,7 +80,7 @@ function convertFrequencyData(data: FrequencyWord[]) {
const renderExploreButton = (onClick: () => void) => ( const renderExploreButton = (onClick: () => void) => (
<button <button
onClick={onClick} onClick={onClick}
style={{ ...styles.buttonSecondary, ...getExplorerButtonStyle() }} style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
> >
Explore Explore
</button> </button>

View File

@@ -20,7 +20,7 @@ type GraphLink = {
value: number; value: number;
}; };
function ApiToGraphData(apiData: InteractionGraph) { function toGraphData(apiData: InteractionGraph) {
const links: GraphLink[] = []; const links: GraphLink[] = [];
const connectedNodeIds = new Set<string>(); const connectedNodeIds = new Set<string>();
@@ -56,7 +56,7 @@ const UserStats = ({
onExplore, onExplore,
}: UserStatsProps) => { }: UserStatsProps) => {
const graphData = useMemo( const graphData = useMemo(
() => ApiToGraphData(interactionGraph), () => toGraphData(interactionGraph),
[interactionGraph], [interactionGraph],
); );
const graphContainerRef = useRef<HTMLDivElement | null>(null); const graphContainerRef = useRef<HTMLDivElement | null>(null);

View File

@@ -66,6 +66,26 @@ const EMPTY_EXPLORER_STATE: ExplorerState = {
error: "", error: "",
}; };
/**
 * Creates an open explorer state for the given spec.
 *
 * Title, description and empty-message come from `spec` (the empty message
 * falls back to "No matching records found."). Records start empty, loading
 * is off and the error is blank. Any field present in `patch` overrides these
 * defaults.
 */
const createExplorerState = (
  spec: CorpusExplorerSpec,
  patch: Partial<ExplorerState> = {},
): ExplorerState => {
  const { title, description, emptyMessage } = spec;

  const defaults: ExplorerState = {
    open: true,
    title,
    description,
    emptyMessage: emptyMessage ?? "No matching records found.",
    records: [],
    loading: false,
    error: "",
  };

  // Patch is spread last so caller-supplied fields win over the defaults.
  return { ...defaults, ...patch };
};
/**
 * Comparator that orders records in descending order of their date key.
 *
 * The key is the first of `dt`, `date`, `timestamp` that is neither null nor
 * undefined (empty string if none), converted to a string and compared with
 * `localeCompare`.
 * NOTE(review): this yields newest-first only if the stringified values sort
 * chronologically (e.g. ISO-8601 strings) — confirm against the data.
 */
const compareRecordsByNewest = (a: DatasetRecord, b: DatasetRecord) => {
  const sortKey = (record: DatasetRecord) =>
    String(record.dt ?? record.date ?? record.timestamp ?? "");

  const left = sortKey(a);
  const right = sortKey(b);
  // Arguments are swapped relative to an ascending comparison.
  return right.localeCompare(left);
};
const parseJsonLikePayload = (value: string): unknown => { const parseJsonLikePayload = (value: string): unknown => {
const normalized = value const normalized = value
.replace(/\uFEFF/g, "") .replace(/\uFEFF/g, "")
@@ -86,16 +106,23 @@ const parseJsonLikePayload = (value: string): unknown => {
return JSON.parse(normalized); return JSON.parse(normalized);
}; };
/**
 * Attempts to parse `value` with `parseJsonLikePayload` and normalize the
 * result with `normalizeRecordPayload`.
 *
 * Returns the normalized result, or `null` if either step throws, so callers
 * can move on to another parsing strategy.
 */
const tryParseRecords = (value: string) => {
  try {
    const decoded = parseJsonLikePayload(value);
    return normalizeRecordPayload(decoded);
  } catch {
    // Deliberate best-effort: any failure is reported as "no result".
    return null;
  }
};
const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => { const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
const trimmed = payload.trim(); const trimmed = payload.trim();
if (!trimmed) { if (!trimmed) {
return []; return [];
} }
try { const direct = tryParseRecords(trimmed);
return normalizeRecordPayload(parseJsonLikePayload(trimmed)); if (direct) {
} catch { return direct;
// Continue with additional fallback formats below.
} }
const ndjsonLines = trimmed const ndjsonLines = trimmed
@@ -106,29 +133,24 @@ const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
try { try {
return ndjsonLines.map((line) => parseJsonLikePayload(line)) as DatasetRecord[]; return ndjsonLines.map((line) => parseJsonLikePayload(line)) as DatasetRecord[];
} catch { } catch {
// Continue with wrapped JSON extraction.
} }
} }
const bracketStart = trimmed.indexOf("["); const bracketStart = trimmed.indexOf("[");
const bracketEnd = trimmed.lastIndexOf("]"); const bracketEnd = trimmed.lastIndexOf("]");
if (bracketStart !== -1 && bracketEnd > bracketStart) { if (bracketStart !== -1 && bracketEnd > bracketStart) {
const candidate = trimmed.slice(bracketStart, bracketEnd + 1); const parsed = tryParseRecords(trimmed.slice(bracketStart, bracketEnd + 1));
try { if (parsed) {
return normalizeRecordPayload(parseJsonLikePayload(candidate)); return parsed;
} catch {
// Continue with object extraction.
} }
} }
const braceStart = trimmed.indexOf("{"); const braceStart = trimmed.indexOf("{");
const braceEnd = trimmed.lastIndexOf("}"); const braceEnd = trimmed.lastIndexOf("}");
if (braceStart !== -1 && braceEnd > braceStart) { if (braceStart !== -1 && braceEnd > braceStart) {
const candidate = trimmed.slice(braceStart, braceEnd + 1); const parsed = tryParseRecords(trimmed.slice(braceStart, braceEnd + 1));
try { if (parsed) {
return normalizeRecordPayload(parseJsonLikePayload(candidate)); return parsed;
} catch {
return null;
} }
} }
@@ -316,45 +338,22 @@ const StatPage = () => {
}; };
const openExplorer = async (spec: CorpusExplorerSpec) => { const openExplorer = async (spec: CorpusExplorerSpec) => {
setExplorerState({ setExplorerState(createExplorerState(spec, { loading: true }));
open: true,
title: spec.title,
description: spec.description,
emptyMessage: spec.emptyMessage ?? "No matching records found.",
records: [],
loading: true,
error: "",
});
try { try {
const records = await ensureFilteredRecords(); const records = await ensureFilteredRecords();
const context = buildExplorerContext(records); const context = buildExplorerContext(records);
const matched = records.filter((record) => spec.matcher(record, context)); const matched = records
matched.sort((a, b) => { .filter((record) => spec.matcher(record, context))
const aValue = String(a.dt ?? a.date ?? a.timestamp ?? ""); .sort(compareRecordsByNewest);
const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
return bValue.localeCompare(aValue);
});
setExplorerState({ setExplorerState(createExplorerState(spec, { records: matched }));
open: true,
title: spec.title,
description: spec.description,
emptyMessage: spec.emptyMessage ?? "No matching records found.",
records: matched,
loading: false,
error: "",
});
} catch (e) { } catch (e) {
setExplorerState({ setExplorerState(
open: true, createExplorerState(spec, {
title: spec.title, error: `Failed to load corpus records: ${String(e)}`,
description: spec.description, }),
emptyMessage: spec.emptyMessage ?? "No matching records found.", );
records: [],
loading: false,
error: `Failed to load corpus records: ${String(e)}`,
});
} }
}; };

View File

@@ -1,5 +1,3 @@
import type { CSSProperties } from "react";
type EntityRecord = { type EntityRecord = {
text?: string; text?: string;
[key: string]: unknown; [key: string]: unknown;
@@ -58,11 +56,6 @@ const EMOTION_KEYS = [
"emotion_sadness", "emotion_sadness",
] as const; ] as const;
const shrinkButtonStyle: CSSProperties = {
padding: "4px 8px",
fontSize: 12,
};
const toText = (value: unknown) => { const toText = (value: unknown) => {
if (typeof value === "string") { if (typeof value === "string") {
return value; return value;
@@ -83,6 +76,7 @@ const toText = (value: unknown) => {
}; };
const normalize = (value: unknown) => toText(value).trim().toLowerCase(); const normalize = (value: unknown) => toText(value).trim().toLowerCase();
const getAuthor = (record: DatasetRecord) => toText(record.author).trim();
const getRecordText = (record: DatasetRecord) => const getRecordText = (record: DatasetRecord) =>
`${record.title ?? ""} ${record.content ?? ""}`.trim(); `${record.title ?? ""} ${record.content ?? ""}`.trim();
@@ -152,11 +146,11 @@ const matchesPhrase = (record: DatasetRecord, phrase: string) => {
return false; return false;
} }
return pattern.test(getRecordText(record).toLowerCase()); return pattern.test(getRecordText(record));
}; };
const recordIdentityBucket = (record: DatasetRecord) => { const recordIdentityBucket = (record: DatasetRecord) => {
const text = getRecordText(record).toLowerCase(); const text = getRecordText(record);
const inHits = countMatches(IN_GROUP_PATTERN, text); const inHits = countMatches(IN_GROUP_PATTERN, text);
const outHits = countMatches(OUT_GROUP_PATTERN, text); const outHits = countMatches(OUT_GROUP_PATTERN, text);
@@ -171,48 +165,30 @@ const recordIdentityBucket = (record: DatasetRecord) => {
return "tie"; return "tie";
}; };
const createAuthorEventCounts = (records: DatasetRecord[]) => { const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => {
const counts = new Map<string, number>(); const authorByPostId = new Map<string, string>();
const authorEventCounts = new Map<string, number>();
const authorCommentCounts = new Map<string, number>();
for (const record of records) { for (const record of records) {
const author = toText(record.author).trim(); const author = getAuthor(record);
if (!author) { if (!author) {
continue; continue;
} }
counts.set(author, (counts.get(author) ?? 0) + 1);
}
return counts;
};
const createAuthorCommentCounts = (records: DatasetRecord[]) => { authorEventCounts.set(author, (authorEventCounts.get(author) ?? 0) + 1);
const counts = new Map<string, number>();
for (const record of records) { if (record.type === "comment") {
const author = toText(record.author).trim(); authorCommentCounts.set(author, (authorCommentCounts.get(author) ?? 0) + 1);
if (!author || record.type !== "comment") {
continue;
} }
counts.set(author, (counts.get(author) ?? 0) + 1);
}
return counts;
};
const createAuthorByPostId = (records: DatasetRecord[]) => { if (record.post_id !== null && record.post_id !== undefined) {
const map = new Map<string, string>(); authorByPostId.set(String(record.post_id), author);
for (const record of records) {
const postId = record.post_id;
const author = toText(record.author).trim();
if (postId === null || postId === undefined || !author) {
continue;
} }
map.set(String(postId), author);
} }
return map;
};
const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => ({ return { authorByPostId, authorEventCounts, authorCommentCounts };
authorByPostId: createAuthorByPostId(records), };
authorEventCounts: createAuthorEventCounts(records),
authorCommentCounts: createAuthorCommentCounts(records),
});
const buildAllRecordsSpec = (): CorpusExplorerSpec => ({ const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
title: "Corpus Explorer", title: "Corpus Explorer",
@@ -221,19 +197,27 @@ const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
matcher: () => true, matcher: () => true,
}); });
const buildUserSpec = (author: string): CorpusExplorerSpec => ({ const buildUserSpec = (author: string): CorpusExplorerSpec => {
title: `User: ${author}`, const target = normalize(author);
description: `All records authored by ${author}.`,
emptyMessage: `No records found for ${author}.`,
matcher: (record) => normalize(record.author) === normalize(author),
});
const buildTopicSpec = (topic: string): CorpusExplorerSpec => ({ return {
title: `Topic: ${topic}`, title: `User: ${author}`,
description: `Records assigned to the ${topic} topic bucket.`, description: `All records authored by ${author}.`,
emptyMessage: `No records found in the ${topic} topic bucket.`, emptyMessage: `No records found for ${author}.`,
matcher: (record) => normalize(record.topic) === normalize(topic), matcher: (record) => normalize(record.author) === target,
}); };
};
const buildTopicSpec = (topic: string): CorpusExplorerSpec => {
const target = normalize(topic);
return {
title: `Topic: ${topic}`,
description: `Records assigned to the ${topic} topic bucket.`,
emptyMessage: `No records found in the ${topic} topic bucket.`,
matcher: (record) => normalize(record.topic) === target,
};
};
const buildDateBucketSpec = (date: string): CorpusExplorerSpec => ({ const buildDateBucketSpec = (date: string): CorpusExplorerSpec => ({
title: `Date Bucket: ${date}`, title: `Date Bucket: ${date}`,
@@ -256,88 +240,75 @@ const buildNgramSpec = (ngram: string): CorpusExplorerSpec => ({
matcher: (record) => matchesPhrase(record, ngram), matcher: (record) => matchesPhrase(record, ngram),
}); });
const buildEntitySpec = (entity: string): CorpusExplorerSpec => ({ const buildEntitySpec = (entity: string): CorpusExplorerSpec => {
title: `Entity: ${entity}`, const target = normalize(entity);
description: `Records mentioning the ${entity} entity.`,
emptyMessage: `No records found for the ${entity} entity.`,
matcher: (record) => {
const target = normalize(entity);
const entities = Array.isArray(record.ner_entities) ? record.ner_entities : [];
return entities.some((item) => normalize(item?.text) === target) || matchesPhrase(record, entity);
},
});
const buildSourceSpec = (source: string): CorpusExplorerSpec => ({ return {
title: `Source: ${source}`, title: `Entity: ${entity}`,
description: `Records from the ${source} source.`, description: `Records mentioning the ${entity} entity.`,
emptyMessage: `No records found for ${source}.`, emptyMessage: `No records found for the ${entity} entity.`,
matcher: (record) => normalize(record.source) === normalize(source), matcher: (record) => {
}); const entities = Array.isArray(record.ner_entities) ? record.ner_entities : [];
return entities.some((item) => normalize(item?.text) === target) || matchesPhrase(record, entity);
},
};
};
const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => ({ const buildSourceSpec = (source: string): CorpusExplorerSpec => {
title: `Dominant Emotion: ${emotion}`, const target = normalize(source);
description: `Records where ${emotion} is the strongest emotion score.`,
emptyMessage: `No records found with dominant emotion ${emotion}.`,
matcher: (record) => getDominantEmotion(record) === normalize(emotion),
});
const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => ({ return {
title: `Reply Path: ${source} -> ${target}`, title: `Source: ${source}`,
description: `Reply records authored by ${source} in response to ${target}.`, description: `Records from the ${source} source.`,
emptyMessage: `No reply records found for ${source} -> ${target}.`, emptyMessage: `No records found for ${source}.`,
matcher: (record, context) => { matcher: (record) => normalize(record.source) === target,
if (normalize(record.author) !== normalize(source)) { };
return false; };
}
const replyTo = record.reply_to; const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => {
if (replyTo === null || replyTo === undefined || replyTo === "") { const target = normalize(emotion);
return false;
}
const replyTarget = context.authorByPostId.get(String(replyTo)); return {
return normalize(replyTarget) === normalize(target); title: `Dominant Emotion: ${emotion}`,
}, description: `Records where ${emotion} is the strongest emotion score.`,
}); emptyMessage: `No records found with dominant emotion ${emotion}.`,
matcher: (record) => getDominantEmotion(record) === target,
};
};
const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => {
const sourceName = normalize(source);
const targetName = normalize(target);
return {
title: `Reply Path: ${source} -> ${target}`,
description: `Reply records authored by ${source} in response to ${target}.`,
emptyMessage: `No reply records found for ${source} -> ${target}.`,
matcher: (record, context) => {
if (normalize(record.author) !== sourceName) {
return false;
}
const replyTo = record.reply_to;
if (replyTo === null || replyTo === undefined || replyTo === "") {
return false;
}
return normalize(context.authorByPostId.get(String(replyTo))) === targetName;
},
};
};
const buildOneTimeUsersSpec = (): CorpusExplorerSpec => ({ const buildOneTimeUsersSpec = (): CorpusExplorerSpec => ({
title: "One-Time Users", title: "One-Time Users",
description: "Records written by authors who appear exactly once in the filtered corpus.", description: "Records written by authors who appear exactly once in the filtered corpus.",
emptyMessage: "No one-time-user records found.", emptyMessage: "No one-time-user records found.",
matcher: (record, context) => { matcher: (record, context) => {
const author = toText(record.author).trim(); const author = getAuthor(record);
return !!author && context.authorEventCounts.get(author) === 1; return !!author && context.authorEventCounts.get(author) === 1;
}, },
}); });
/**
 * Builds a spec that matches comment records written by the `topAuthorCount`
 * authors with the highest comment counts in the explorer context.
 *
 * The ranking depends only on the context, not on the record being tested,
 * so it is computed once per context object and cached. Previously the whole
 * count map was re-sorted and scanned for every record.
 *
 * @param topAuthorCount Number of top-ranked commenters to include.
 * @returns Spec whose matcher accepts comment records from those authors.
 */
const buildTopCommentersSpec = (topAuthorCount: number): CorpusExplorerSpec => {
  // Weakly keyed by context so cached rankings never keep an old context alive.
  // Assumes a context's count map is not mutated after its first use here.
  const topAuthorsByContext = new WeakMap<object, Set<string>>();

  return {
    title: "Top Commenters",
    description: `Comment records from the top ${topAuthorCount} commenters in the filtered corpus.`,
    emptyMessage: "No top-commenter records found.",
    matcher: (record, context) => {
      if (record.type !== "comment") {
        return false;
      }

      let topAuthors = topAuthorsByContext.get(context);
      if (!topAuthors) {
        // Rank authors by comment count (descending) and keep the first N.
        topAuthors = new Set(
          Array.from(context.authorCommentCounts.entries())
            .sort((a, b) => b[1] - a[1])
            .slice(0, topAuthorCount)
            .map(([author]) => author),
        );
        topAuthorsByContext.set(context, topAuthors);
      }

      return topAuthors.has(toText(record.author).trim());
    },
  };
};
/**
 * Builds a spec that matches comment records whose author has a comment count
 * of exactly one in the explorer context.
 *
 * Records that are not comments, or whose author text is empty after
 * trimming, never match.
 */
const buildSingleCommentAuthorsSpec = (): CorpusExplorerSpec => ({
  title: "Single-Comment Authors",
  description: "Comment records from authors who commented exactly once.",
  emptyMessage: "No single-comment-author records found.",
  matcher: (record, context) => {
    const name = toText(record.author).trim();
    if (record.type !== "comment" || !name) {
      return false;
    }
    return context.authorCommentCounts.get(name) === 1;
  },
});
const buildIdentityBucketSpec = (bucket: "in" | "out" | "tie"): CorpusExplorerSpec => { const buildIdentityBucketSpec = (bucket: "in" | "out" | "tie"): CorpusExplorerSpec => {
const labels = { const labels = {
in: "In-Group Posts", in: "In-Group Posts",
@@ -376,9 +347,7 @@ const buildDeonticSpec = () =>
const buildPermissionSpec = () => const buildPermissionSpec = () =>
buildPatternSpec("Permission Words", "Records containing permission language.", PERMISSION_PATTERN); buildPatternSpec("Permission Words", "Records containing permission language.", PERMISSION_PATTERN);
const getExplorerButtonStyle = () => shrinkButtonStyle; export type { DatasetRecord, CorpusExplorerSpec };
export type { DatasetRecord, CorpusExplorerContext, CorpusExplorerSpec };
export { export {
buildAllRecordsSpec, buildAllRecordsSpec,
buildCertaintySpec, buildCertaintySpec,
@@ -393,13 +362,10 @@ export {
buildOneTimeUsersSpec, buildOneTimeUsersSpec,
buildPermissionSpec, buildPermissionSpec,
buildReplyPairSpec, buildReplyPairSpec,
buildSingleCommentAuthorsSpec,
buildSourceSpec, buildSourceSpec,
buildTopicSpec, buildTopicSpec,
buildTopCommentersSpec,
buildUserSpec, buildUserSpec,
buildWordSpec, buildWordSpec,
getDateBucket, getDateBucket,
getExplorerButtonStyle,
toText, toText,
}; };