Corpus Explorer Feature #11
@@ -1,4 +1,4 @@
|
||||
import { useEffect, useMemo, useState } from "react";
|
||||
import { useEffect, useState } from "react";
|
||||
import { Dialog, DialogPanel, DialogTitle } from "@headlessui/react";
|
||||
|
||||
import StatsStyling from "../styles/stats_styling";
|
||||
@@ -103,11 +103,6 @@ const CorpusExplorer = ({
|
||||
}
|
||||
}, [open, title, records.length]);
|
||||
|
||||
const visibleRecords = useMemo(
|
||||
() => records.slice(0, visibleCount),
|
||||
[records, visibleCount],
|
||||
);
|
||||
|
||||
const hasMoreRecords = visibleCount < records.length;
|
||||
|
||||
return (
|
||||
@@ -158,7 +153,7 @@ const CorpusExplorer = ({
|
||||
paddingRight: 4,
|
||||
}}
|
||||
>
|
||||
{visibleRecords.map((record, index) => {
|
||||
{records.slice(0, visibleCount).map((record, index) => {
|
||||
const recordKey = getRecordKey(record, index);
|
||||
const titleText = getRecordTitle(record);
|
||||
const content = cleanText(record.content);
|
||||
|
||||
@@ -8,11 +8,11 @@ import {
|
||||
buildHedgeSpec,
|
||||
buildIdentityBucketSpec,
|
||||
buildPermissionSpec,
|
||||
getExplorerButtonStyle,
|
||||
type CorpusExplorerSpec,
|
||||
} from "../utils/corpusExplorer";
|
||||
|
||||
const styles = StatsStyling;
|
||||
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
|
||||
|
||||
type CulturalStatsProps = {
|
||||
data: CulturalAnalysisResponse;
|
||||
@@ -22,7 +22,7 @@ type CulturalStatsProps = {
|
||||
const renderExploreButton = (onClick: () => void) => (
|
||||
<button
|
||||
onClick={onClick}
|
||||
style={{ ...styles.buttonSecondary, ...getExplorerButtonStyle() }}
|
||||
style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
|
||||
>
|
||||
Explore
|
||||
</button>
|
||||
@@ -59,21 +59,6 @@ const CulturalStats = ({ data, onExplore }: CulturalStatsProps) => {
|
||||
return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`;
|
||||
};
|
||||
|
||||
const stanceSublabel = (
|
||||
per1kTokens: number | undefined,
|
||||
emotionAvg: Record<string, number> | undefined,
|
||||
) => {
|
||||
const rateLabel =
|
||||
typeof per1kTokens === "number"
|
||||
? `${per1kTokens.toFixed(1)} per 1k words`
|
||||
: "Word frequency";
|
||||
const emotionLabel = topEmotion(emotionAvg);
|
||||
|
||||
return emotionLabel === "—"
|
||||
? rateLabel
|
||||
: `${rateLabel} • Avg mood: ${emotionLabel}`;
|
||||
};
|
||||
|
||||
return (
|
||||
<div style={styles.page}>
|
||||
<div style={{ ...styles.container, ...styles.grid }}>
|
||||
|
||||
@@ -26,12 +26,12 @@ import {
|
||||
buildDateBucketSpec,
|
||||
buildOneTimeUsersSpec,
|
||||
buildUserSpec,
|
||||
getExplorerButtonStyle,
|
||||
type CorpusExplorerSpec,
|
||||
} from "../utils/corpusExplorer";
|
||||
|
||||
const styles = StatsStyling;
|
||||
const MAX_WORDCLOUD_WORDS = 250;
|
||||
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
|
||||
|
||||
const WORDCLOUD_OPTIONS = {
|
||||
rotations: 2,
|
||||
@@ -80,7 +80,7 @@ function convertFrequencyData(data: FrequencyWord[]) {
|
||||
const renderExploreButton = (onClick: () => void) => (
|
||||
<button
|
||||
onClick={onClick}
|
||||
style={{ ...styles.buttonSecondary, ...getExplorerButtonStyle() }}
|
||||
style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
|
||||
>
|
||||
Explore
|
||||
</button>
|
||||
|
||||
@@ -20,7 +20,7 @@ type GraphLink = {
|
||||
value: number;
|
||||
};
|
||||
|
||||
function ApiToGraphData(apiData: InteractionGraph) {
|
||||
function toGraphData(apiData: InteractionGraph) {
|
||||
const links: GraphLink[] = [];
|
||||
const connectedNodeIds = new Set<string>();
|
||||
|
||||
@@ -56,7 +56,7 @@ const UserStats = ({
|
||||
onExplore,
|
||||
}: UserStatsProps) => {
|
||||
const graphData = useMemo(
|
||||
() => ApiToGraphData(interactionGraph),
|
||||
() => toGraphData(interactionGraph),
|
||||
[interactionGraph],
|
||||
);
|
||||
const graphContainerRef = useRef<HTMLDivElement | null>(null);
|
||||
|
||||
@@ -66,6 +66,26 @@ const EMPTY_EXPLORER_STATE: ExplorerState = {
|
||||
error: "",
|
||||
};
|
||||
|
||||
const createExplorerState = (
|
||||
spec: CorpusExplorerSpec,
|
||||
patch: Partial<ExplorerState> = {},
|
||||
): ExplorerState => ({
|
||||
open: true,
|
||||
title: spec.title,
|
||||
description: spec.description,
|
||||
emptyMessage: spec.emptyMessage ?? "No matching records found.",
|
||||
records: [],
|
||||
loading: false,
|
||||
error: "",
|
||||
...patch,
|
||||
});
|
||||
|
||||
const compareRecordsByNewest = (a: DatasetRecord, b: DatasetRecord) => {
|
||||
const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
|
||||
const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
|
||||
return bValue.localeCompare(aValue);
|
||||
};
|
||||
|
||||
const parseJsonLikePayload = (value: string): unknown => {
|
||||
const normalized = value
|
||||
.replace(/\uFEFF/g, "")
|
||||
@@ -86,16 +106,23 @@ const parseJsonLikePayload = (value: string): unknown => {
|
||||
return JSON.parse(normalized);
|
||||
};
|
||||
|
||||
const tryParseRecords = (value: string) => {
|
||||
try {
|
||||
return normalizeRecordPayload(parseJsonLikePayload(value));
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
|
||||
const trimmed = payload.trim();
|
||||
if (!trimmed) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
return normalizeRecordPayload(parseJsonLikePayload(trimmed));
|
||||
} catch {
|
||||
// Continue with additional fallback formats below.
|
||||
const direct = tryParseRecords(trimmed);
|
||||
if (direct) {
|
||||
return direct;
|
||||
}
|
||||
|
||||
const ndjsonLines = trimmed
|
||||
@@ -106,29 +133,24 @@ const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
|
||||
try {
|
||||
return ndjsonLines.map((line) => parseJsonLikePayload(line)) as DatasetRecord[];
|
||||
} catch {
|
||||
// Continue with wrapped JSON extraction.
|
||||
}
|
||||
}
|
||||
|
||||
const bracketStart = trimmed.indexOf("[");
|
||||
const bracketEnd = trimmed.lastIndexOf("]");
|
||||
if (bracketStart !== -1 && bracketEnd > bracketStart) {
|
||||
const candidate = trimmed.slice(bracketStart, bracketEnd + 1);
|
||||
try {
|
||||
return normalizeRecordPayload(parseJsonLikePayload(candidate));
|
||||
} catch {
|
||||
// Continue with object extraction.
|
||||
const parsed = tryParseRecords(trimmed.slice(bracketStart, bracketEnd + 1));
|
||||
if (parsed) {
|
||||
return parsed;
|
||||
}
|
||||
}
|
||||
|
||||
const braceStart = trimmed.indexOf("{");
|
||||
const braceEnd = trimmed.lastIndexOf("}");
|
||||
if (braceStart !== -1 && braceEnd > braceStart) {
|
||||
const candidate = trimmed.slice(braceStart, braceEnd + 1);
|
||||
try {
|
||||
return normalizeRecordPayload(parseJsonLikePayload(candidate));
|
||||
} catch {
|
||||
return null;
|
||||
const parsed = tryParseRecords(trimmed.slice(braceStart, braceEnd + 1));
|
||||
if (parsed) {
|
||||
return parsed;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -316,45 +338,22 @@ const StatPage = () => {
|
||||
};
|
||||
|
||||
const openExplorer = async (spec: CorpusExplorerSpec) => {
|
||||
setExplorerState({
|
||||
open: true,
|
||||
title: spec.title,
|
||||
description: spec.description,
|
||||
emptyMessage: spec.emptyMessage ?? "No matching records found.",
|
||||
records: [],
|
||||
loading: true,
|
||||
error: "",
|
||||
});
|
||||
setExplorerState(createExplorerState(spec, { loading: true }));
|
||||
|
||||
try {
|
||||
const records = await ensureFilteredRecords();
|
||||
const context = buildExplorerContext(records);
|
||||
const matched = records.filter((record) => spec.matcher(record, context));
|
||||
matched.sort((a, b) => {
|
||||
const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
|
||||
const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
|
||||
return bValue.localeCompare(aValue);
|
||||
});
|
||||
const matched = records
|
||||
.filter((record) => spec.matcher(record, context))
|
||||
.sort(compareRecordsByNewest);
|
||||
|
||||
setExplorerState({
|
||||
open: true,
|
||||
title: spec.title,
|
||||
description: spec.description,
|
||||
emptyMessage: spec.emptyMessage ?? "No matching records found.",
|
||||
records: matched,
|
||||
loading: false,
|
||||
error: "",
|
||||
});
|
||||
setExplorerState(createExplorerState(spec, { records: matched }));
|
||||
} catch (e) {
|
||||
setExplorerState({
|
||||
open: true,
|
||||
title: spec.title,
|
||||
description: spec.description,
|
||||
emptyMessage: spec.emptyMessage ?? "No matching records found.",
|
||||
records: [],
|
||||
loading: false,
|
||||
setExplorerState(
|
||||
createExplorerState(spec, {
|
||||
error: `Failed to load corpus records: ${String(e)}`,
|
||||
});
|
||||
}),
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
import type { CSSProperties } from "react";
|
||||
|
||||
type EntityRecord = {
|
||||
text?: string;
|
||||
[key: string]: unknown;
|
||||
@@ -58,11 +56,6 @@ const EMOTION_KEYS = [
|
||||
"emotion_sadness",
|
||||
] as const;
|
||||
|
||||
const shrinkButtonStyle: CSSProperties = {
|
||||
padding: "4px 8px",
|
||||
fontSize: 12,
|
||||
};
|
||||
|
||||
const toText = (value: unknown) => {
|
||||
if (typeof value === "string") {
|
||||
return value;
|
||||
@@ -83,6 +76,7 @@ const toText = (value: unknown) => {
|
||||
};
|
||||
|
||||
const normalize = (value: unknown) => toText(value).trim().toLowerCase();
|
||||
const getAuthor = (record: DatasetRecord) => toText(record.author).trim();
|
||||
|
||||
const getRecordText = (record: DatasetRecord) =>
|
||||
`${record.title ?? ""} ${record.content ?? ""}`.trim();
|
||||
@@ -152,11 +146,11 @@ const matchesPhrase = (record: DatasetRecord, phrase: string) => {
|
||||
return false;
|
||||
}
|
||||
|
||||
return pattern.test(getRecordText(record).toLowerCase());
|
||||
return pattern.test(getRecordText(record));
|
||||
};
|
||||
|
||||
const recordIdentityBucket = (record: DatasetRecord) => {
|
||||
const text = getRecordText(record).toLowerCase();
|
||||
const text = getRecordText(record);
|
||||
const inHits = countMatches(IN_GROUP_PATTERN, text);
|
||||
const outHits = countMatches(OUT_GROUP_PATTERN, text);
|
||||
|
||||
@@ -171,48 +165,30 @@ const recordIdentityBucket = (record: DatasetRecord) => {
|
||||
return "tie";
|
||||
};
|
||||
|
||||
const createAuthorEventCounts = (records: DatasetRecord[]) => {
|
||||
const counts = new Map<string, number>();
|
||||
const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => {
|
||||
const authorByPostId = new Map<string, string>();
|
||||
const authorEventCounts = new Map<string, number>();
|
||||
const authorCommentCounts = new Map<string, number>();
|
||||
|
||||
for (const record of records) {
|
||||
const author = toText(record.author).trim();
|
||||
const author = getAuthor(record);
|
||||
if (!author) {
|
||||
continue;
|
||||
}
|
||||
counts.set(author, (counts.get(author) ?? 0) + 1);
|
||||
}
|
||||
return counts;
|
||||
};
|
||||
|
||||
const createAuthorCommentCounts = (records: DatasetRecord[]) => {
|
||||
const counts = new Map<string, number>();
|
||||
for (const record of records) {
|
||||
const author = toText(record.author).trim();
|
||||
if (!author || record.type !== "comment") {
|
||||
continue;
|
||||
}
|
||||
counts.set(author, (counts.get(author) ?? 0) + 1);
|
||||
}
|
||||
return counts;
|
||||
};
|
||||
authorEventCounts.set(author, (authorEventCounts.get(author) ?? 0) + 1);
|
||||
|
||||
const createAuthorByPostId = (records: DatasetRecord[]) => {
|
||||
const map = new Map<string, string>();
|
||||
for (const record of records) {
|
||||
const postId = record.post_id;
|
||||
const author = toText(record.author).trim();
|
||||
if (postId === null || postId === undefined || !author) {
|
||||
continue;
|
||||
if (record.type === "comment") {
|
||||
authorCommentCounts.set(author, (authorCommentCounts.get(author) ?? 0) + 1);
|
||||
}
|
||||
map.set(String(postId), author);
|
||||
}
|
||||
return map;
|
||||
};
|
||||
|
||||
const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => ({
|
||||
authorByPostId: createAuthorByPostId(records),
|
||||
authorEventCounts: createAuthorEventCounts(records),
|
||||
authorCommentCounts: createAuthorCommentCounts(records),
|
||||
});
|
||||
if (record.post_id !== null && record.post_id !== undefined) {
|
||||
authorByPostId.set(String(record.post_id), author);
|
||||
}
|
||||
}
|
||||
|
||||
return { authorByPostId, authorEventCounts, authorCommentCounts };
|
||||
};
|
||||
|
||||
const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
|
||||
title: "Corpus Explorer",
|
||||
@@ -221,19 +197,27 @@ const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
|
||||
matcher: () => true,
|
||||
});
|
||||
|
||||
const buildUserSpec = (author: string): CorpusExplorerSpec => ({
|
||||
const buildUserSpec = (author: string): CorpusExplorerSpec => {
|
||||
const target = normalize(author);
|
||||
|
||||
return {
|
||||
title: `User: ${author}`,
|
||||
description: `All records authored by ${author}.`,
|
||||
emptyMessage: `No records found for ${author}.`,
|
||||
matcher: (record) => normalize(record.author) === normalize(author),
|
||||
});
|
||||
matcher: (record) => normalize(record.author) === target,
|
||||
};
|
||||
};
|
||||
|
||||
const buildTopicSpec = (topic: string): CorpusExplorerSpec => ({
|
||||
const buildTopicSpec = (topic: string): CorpusExplorerSpec => {
|
||||
const target = normalize(topic);
|
||||
|
||||
return {
|
||||
title: `Topic: ${topic}`,
|
||||
description: `Records assigned to the ${topic} topic bucket.`,
|
||||
emptyMessage: `No records found in the ${topic} topic bucket.`,
|
||||
matcher: (record) => normalize(record.topic) === normalize(topic),
|
||||
});
|
||||
matcher: (record) => normalize(record.topic) === target,
|
||||
};
|
||||
};
|
||||
|
||||
const buildDateBucketSpec = (date: string): CorpusExplorerSpec => ({
|
||||
title: `Date Bucket: ${date}`,
|
||||
@@ -256,37 +240,52 @@ const buildNgramSpec = (ngram: string): CorpusExplorerSpec => ({
|
||||
matcher: (record) => matchesPhrase(record, ngram),
|
||||
});
|
||||
|
||||
const buildEntitySpec = (entity: string): CorpusExplorerSpec => ({
|
||||
const buildEntitySpec = (entity: string): CorpusExplorerSpec => {
|
||||
const target = normalize(entity);
|
||||
|
||||
return {
|
||||
title: `Entity: ${entity}`,
|
||||
description: `Records mentioning the ${entity} entity.`,
|
||||
emptyMessage: `No records found for the ${entity} entity.`,
|
||||
matcher: (record) => {
|
||||
const target = normalize(entity);
|
||||
const entities = Array.isArray(record.ner_entities) ? record.ner_entities : [];
|
||||
return entities.some((item) => normalize(item?.text) === target) || matchesPhrase(record, entity);
|
||||
},
|
||||
});
|
||||
};
|
||||
};
|
||||
|
||||
const buildSourceSpec = (source: string): CorpusExplorerSpec => ({
|
||||
const buildSourceSpec = (source: string): CorpusExplorerSpec => {
|
||||
const target = normalize(source);
|
||||
|
||||
return {
|
||||
title: `Source: ${source}`,
|
||||
description: `Records from the ${source} source.`,
|
||||
emptyMessage: `No records found for ${source}.`,
|
||||
matcher: (record) => normalize(record.source) === normalize(source),
|
||||
});
|
||||
matcher: (record) => normalize(record.source) === target,
|
||||
};
|
||||
};
|
||||
|
||||
const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => ({
|
||||
const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => {
|
||||
const target = normalize(emotion);
|
||||
|
||||
return {
|
||||
title: `Dominant Emotion: ${emotion}`,
|
||||
description: `Records where ${emotion} is the strongest emotion score.`,
|
||||
emptyMessage: `No records found with dominant emotion ${emotion}.`,
|
||||
matcher: (record) => getDominantEmotion(record) === normalize(emotion),
|
||||
});
|
||||
matcher: (record) => getDominantEmotion(record) === target,
|
||||
};
|
||||
};
|
||||
|
||||
const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => ({
|
||||
const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => {
|
||||
const sourceName = normalize(source);
|
||||
const targetName = normalize(target);
|
||||
|
||||
return {
|
||||
title: `Reply Path: ${source} -> ${target}`,
|
||||
description: `Reply records authored by ${source} in response to ${target}.`,
|
||||
emptyMessage: `No reply records found for ${source} -> ${target}.`,
|
||||
matcher: (record, context) => {
|
||||
if (normalize(record.author) !== normalize(source)) {
|
||||
if (normalize(record.author) !== sourceName) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -295,49 +294,21 @@ const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec
|
||||
return false;
|
||||
}
|
||||
|
||||
const replyTarget = context.authorByPostId.get(String(replyTo));
|
||||
return normalize(replyTarget) === normalize(target);
|
||||
return normalize(context.authorByPostId.get(String(replyTo))) === targetName;
|
||||
},
|
||||
});
|
||||
};
|
||||
};
|
||||
|
||||
const buildOneTimeUsersSpec = (): CorpusExplorerSpec => ({
|
||||
title: "One-Time Users",
|
||||
description: "Records written by authors who appear exactly once in the filtered corpus.",
|
||||
emptyMessage: "No one-time-user records found.",
|
||||
matcher: (record, context) => {
|
||||
const author = toText(record.author).trim();
|
||||
const author = getAuthor(record);
|
||||
return !!author && context.authorEventCounts.get(author) === 1;
|
||||
},
|
||||
});
|
||||
|
||||
const buildTopCommentersSpec = (topAuthorCount: number): CorpusExplorerSpec => ({
|
||||
title: "Top Commenters",
|
||||
description: `Comment records from the top ${topAuthorCount} commenters in the filtered corpus.`,
|
||||
emptyMessage: "No top-commenter records found.",
|
||||
matcher: (record, context) => {
|
||||
if (record.type !== "comment") {
|
||||
return false;
|
||||
}
|
||||
|
||||
const rankedAuthors = Array.from(context.authorCommentCounts.entries())
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.slice(0, topAuthorCount)
|
||||
.map(([author]) => author);
|
||||
|
||||
return rankedAuthors.includes(toText(record.author).trim());
|
||||
},
|
||||
});
|
||||
|
||||
const buildSingleCommentAuthorsSpec = (): CorpusExplorerSpec => ({
|
||||
title: "Single-Comment Authors",
|
||||
description: "Comment records from authors who commented exactly once.",
|
||||
emptyMessage: "No single-comment-author records found.",
|
||||
matcher: (record, context) => {
|
||||
const author = toText(record.author).trim();
|
||||
return record.type === "comment" && !!author && context.authorCommentCounts.get(author) === 1;
|
||||
},
|
||||
});
|
||||
|
||||
const buildIdentityBucketSpec = (bucket: "in" | "out" | "tie"): CorpusExplorerSpec => {
|
||||
const labels = {
|
||||
in: "In-Group Posts",
|
||||
@@ -376,9 +347,7 @@ const buildDeonticSpec = () =>
|
||||
const buildPermissionSpec = () =>
|
||||
buildPatternSpec("Permission Words", "Records containing permission language.", PERMISSION_PATTERN);
|
||||
|
||||
const getExplorerButtonStyle = () => shrinkButtonStyle;
|
||||
|
||||
export type { DatasetRecord, CorpusExplorerContext, CorpusExplorerSpec };
|
||||
export type { DatasetRecord, CorpusExplorerSpec };
|
||||
export {
|
||||
buildAllRecordsSpec,
|
||||
buildCertaintySpec,
|
||||
@@ -393,13 +362,10 @@ export {
|
||||
buildOneTimeUsersSpec,
|
||||
buildPermissionSpec,
|
||||
buildReplyPairSpec,
|
||||
buildSingleCommentAuthorsSpec,
|
||||
buildSourceSpec,
|
||||
buildTopicSpec,
|
||||
buildTopCommentersSpec,
|
||||
buildUserSpec,
|
||||
buildWordSpec,
|
||||
getDateBucket,
|
||||
getExplorerButtonStyle,
|
||||
toText,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user