Finish off the links between frontend and backend #10

Merged
dylan merged 24 commits from feat/add-frontend-pages into main 2026-03-18 20:30:19 +00:00
21 changed files with 1364 additions and 406 deletions

View File

@@ -2,7 +2,7 @@
<html lang="en"> <html lang="en">
<head> <head>
<meta charset="UTF-8" /> <meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" /> <link rel="icon" type="image/png" href="/icon.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>frontend</title> <title>frontend</title>
</head> </head>

BIN
frontend/public/icon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

View File

@@ -0,0 +1,158 @@
import Card from "./Card";
import StatsStyling from "../styles/stats_styling";
import type { CulturalAnalysisResponse } from "../types/ApiTypes";
const styles = StatsStyling;

// Props for the cultural-analysis dashboard section.
type CulturalStatsProps = {
  data: CulturalAnalysisResponse;
};

// Dashboard panel summarising "us vs them" (identity) language, stance-marker
// word counts, and the dominant emotion around the most-mentioned entities.
const CulturalStats = ({ data }: CulturalStatsProps) => {
  const identity = data.identity_markers;
  const stance = data.stance_markers;
  // Raw counts of in-group (we/us/our) and out-group (they/them/their) words;
  // default to 0 so the totals below stay numeric when markers are missing.
  const inGroupWords = identity?.in_group_usage ?? 0;
  const outGroupWords = identity?.out_group_usage ?? 0;
  const totalGroupWords = inGroupWords + outGroupWords;
  // Ratios arrive as fractions of all words; convert to percentages,
  // or null when the backend did not supply a numeric ratio.
  const inGroupWordRate = typeof identity?.in_group_ratio === "number"
    ? identity.in_group_ratio * 100
    : null;
  const outGroupWordRate = typeof identity?.out_group_ratio === "number"
    ? identity.out_group_ratio * 100
    : null;
  // Top 20 entities, most-mentioned (highest post_count) first.
  const rawEntities = data.avg_emotion_per_entity?.entity_emotion_avg ?? {};
  const entities = Object.entries(rawEntities)
    .sort((a, b) => (b[1].post_count - a[1].post_count))
    .slice(0, 20);
  // Formats the highest-scoring emotion in an average map as "label (xx.x%)".
  // Returns an em dash when no emotion data is present. Keys are expected to
  // be prefixed "emotion_" (stripped for display).
  const topEmotion = (emotionAvg: Record<string, number> | undefined) => {
    const entries = Object.entries(emotionAvg ?? {});
    if (!entries.length) {
      return "—";
    }
    entries.sort((a, b) => b[1] - a[1]);
    const dominant = entries[0] ?? ["emotion_unknown", 0];
    const dominantLabel = dominant[0].replace("emotion_", "");
    return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`;
  };
  return (
    <div style={styles.page}>
      <div style={{ ...styles.container, ...styles.grid }}>
        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Community Framing Overview</h2>
          <p style={styles.sectionSubtitle}>Simple view of how often people use "us" words vs "them" words, and the tone around that language.</p>
        </div>
        <Card
          label="In-Group Words"
          value={inGroupWords.toLocaleString()}
          sublabel="Times we/us/our appears"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Out-Group Words"
          value={outGroupWords.toLocaleString()}
          sublabel="Times they/them/their appears"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="In-Group Posts"
          value={identity?.in_group_posts?.toLocaleString() ?? "—"}
          sublabel='Posts leaning toward "us" language'
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Out-Group Posts"
          value={identity?.out_group_posts?.toLocaleString() ?? "—"}
          sublabel='Posts leaning toward "them" language'
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Balanced Posts"
          value={identity?.tie_posts?.toLocaleString() ?? "—"}
          sublabel="Posts with equal us/them signals"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Total Group Words"
          value={totalGroupWords.toLocaleString()}
          sublabel="In-group + out-group words"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="In-Group Share"
          value={inGroupWordRate === null ? "—" : `${inGroupWordRate.toFixed(2)}%`}
          sublabel="Share of all words"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Out-Group Share"
          value={outGroupWordRate === null ? "—" : `${outGroupWordRate.toFixed(2)}%`}
          sublabel="Share of all words"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Hedging Words"
          value={stance?.hedge_total?.toLocaleString() ?? "—"}
          sublabel={typeof stance?.hedge_per_1k_tokens === "number" ? `${stance.hedge_per_1k_tokens.toFixed(1)} per 1k words` : "Word frequency"}
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Certainty Words"
          value={stance?.certainty_total?.toLocaleString() ?? "—"}
          sublabel={typeof stance?.certainty_per_1k_tokens === "number" ? `${stance.certainty_per_1k_tokens.toFixed(1)} per 1k words` : "Word frequency"}
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Need/Should Words"
          value={stance?.deontic_total?.toLocaleString() ?? "—"}
          sublabel={typeof stance?.deontic_per_1k_tokens === "number" ? `${stance.deontic_per_1k_tokens.toFixed(1)} per 1k words` : "Word frequency"}
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Permission Words"
          value={stance?.permission_total?.toLocaleString() ?? "—"}
          sublabel={typeof stance?.permission_per_1k_tokens === "number" ? `${stance.permission_per_1k_tokens.toFixed(1)} per 1k words` : "Word frequency"}
          style={{ gridColumn: "span 3" }}
        />
        <div style={{ ...styles.card, gridColumn: "span 6" }}>
          <h2 style={styles.sectionTitle}>Mood in "Us" Posts</h2>
          <p style={styles.sectionSubtitle}>Most likely emotion when in-group wording is stronger.</p>
          <div style={styles.topUserName}>{topEmotion(identity?.in_group_emotion_avg)}</div>
        </div>
        <div style={{ ...styles.card, gridColumn: "span 6" }}>
          <h2 style={styles.sectionTitle}>Mood in "Them" Posts</h2>
          <p style={styles.sectionSubtitle}>Most likely emotion when out-group wording is stronger.</p>
          <div style={styles.topUserName}>{topEmotion(identity?.out_group_emotion_avg)}</div>
        </div>
        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Entity Mood Snapshot</h2>
          <p style={styles.sectionSubtitle}>Most mentioned entities and the mood that appears most with each.</p>
          {!entities.length ? (
            <div style={styles.topUserMeta}>No entity-level cultural data available.</div>
          ) : (
            <div style={{ ...styles.topUsersList, maxHeight: 420, overflowY: "auto" }}>
              {entities.map(([entity, aggregate]) => (
                <div key={entity} style={styles.topUserItem}>
                  <div style={styles.topUserName}>{entity}</div>
                  <div style={styles.topUserMeta}>
                    {aggregate.post_count.toLocaleString()} posts Likely mood: {topEmotion(aggregate.emotion_avg)}
                  </div>
                </div>
              ))}
            </div>
          )}
        </div>
      </div>
    </div>
  );
};

export default CulturalStats;

View File

@@ -9,6 +9,9 @@ type EmotionalStatsProps = {
const EmotionalStats = ({contentData}: EmotionalStatsProps) => { const EmotionalStats = ({contentData}: EmotionalStatsProps) => {
const rows = contentData.average_emotion_by_topic ?? []; const rows = contentData.average_emotion_by_topic ?? [];
const overallEmotionAverage = contentData.overall_emotion_average ?? [];
const dominantEmotionDistribution = contentData.dominant_emotion_distribution ?? [];
const emotionBySource = contentData.emotion_by_source ?? [];
const lowSampleThreshold = 20; const lowSampleThreshold = 20;
const stableSampleThreshold = 50; const stableSampleThreshold = 50;
const emotionKeys = rows.length const emotionKeys = rows.length
@@ -64,41 +67,106 @@ const EmotionalStats = ({contentData}: EmotionalStatsProps) => {
return ( return (
<div style={styles.page}> <div style={styles.page}>
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}> <div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
<h2 style={styles.sectionTitle}>Average Emotion by Topic</h2> <h2 style={styles.sectionTitle}>Topic Mood Overview</h2>
<p style={styles.sectionSubtitle}>Read confidence together with sample size. Topics with fewer than {lowSampleThreshold} events are usually noisy and less reliable.</p> <p style={styles.sectionSubtitle}>Use the strength score together with post count. Topics with fewer than {lowSampleThreshold} events are often noisy.</p>
<div style={styles.emotionalSummaryRow}> <div style={styles.emotionalSummaryRow}>
<span><strong style={{ color: "#24292f" }}>Topics:</strong> {strongestPerTopic.length}</span> <span><strong style={{ color: "#24292f" }}>Topics:</strong> {strongestPerTopic.length}</span>
<span><strong style={{ color: "#24292f" }}>Median Sample:</strong> {medianSampleSize} events</span> <span><strong style={{ color: "#24292f" }}>Median Posts:</strong> {medianSampleSize}</span>
<span><strong style={{ color: "#24292f" }}>Low Sample (&lt;{lowSampleThreshold}):</strong> {lowSampleTopics}</span> <span><strong style={{ color: "#24292f" }}>Small Topics (&lt;{lowSampleThreshold}):</strong> {lowSampleTopics}</span>
<span><strong style={{ color: "#24292f" }}>Stable Sample ({stableSampleThreshold}+):</strong> {stableSampleTopics}</span> <span><strong style={{ color: "#24292f" }}>Stable Topics ({stableSampleThreshold}+):</strong> {stableSampleTopics}</span>
</div> </div>
<p style={{ ...styles.sectionSubtitle, marginTop: 10, marginBottom: 0 }}> <p style={{ ...styles.sectionSubtitle, marginTop: 10, marginBottom: 0 }}>
Confidence reflects how strongly one emotion leads within a topic, not model accuracy. Use larger samples for stronger conclusions. Strength means how far the top emotion is ahead in that topic. It does not mean model accuracy.
</p> </p>
</div> </div>
<div style={{ ...styles.container, ...styles.grid }}> <div style={{ ...styles.container, ...styles.grid }}>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Mood Averages</h2>
<p style={styles.sectionSubtitle}>Average score for each emotion.</p>
{!overallEmotionAverage.length ? (
<div style={styles.topUserMeta}>No overall emotion averages available.</div>
) : (
<div style={{ ...styles.topUsersList, maxHeight: 260, overflowY: "auto" }}>
{[...overallEmotionAverage]
.sort((a, b) => b.score - a.score)
.map((row) => (
<div key={row.emotion} style={styles.topUserItem}>
<div style={styles.topUserName}>{formatEmotion(row.emotion)}</div>
<div style={styles.topUserMeta}>{row.score.toFixed(3)}</div>
</div>
))}
</div>
)}
</div>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Mood Split</h2>
<p style={styles.sectionSubtitle}>How often each emotion is dominant.</p>
{!dominantEmotionDistribution.length ? (
<div style={styles.topUserMeta}>No dominant-emotion split available.</div>
) : (
<div style={{ ...styles.topUsersList, maxHeight: 260, overflowY: "auto" }}>
{[...dominantEmotionDistribution]
.sort((a, b) => b.ratio - a.ratio)
.map((row) => (
<div key={row.emotion} style={styles.topUserItem}>
<div style={styles.topUserName}>{formatEmotion(row.emotion)}</div>
<div style={styles.topUserMeta}>{(row.ratio * 100).toFixed(1)}% {row.count.toLocaleString()} events</div>
</div>
))}
</div>
)}
</div>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Mood by Source</h2>
<p style={styles.sectionSubtitle}>Leading emotion in each source.</p>
{!emotionBySource.length ? (
<div style={styles.topUserMeta}>No source emotion profile available.</div>
) : (
<div style={{ ...styles.topUsersList, maxHeight: 260, overflowY: "auto" }}>
{[...emotionBySource]
.sort((a, b) => b.event_count - a.event_count)
.map((row) => (
<div key={row.source} style={styles.topUserItem}>
<div style={styles.topUserName}>{row.source}</div>
<div style={styles.topUserMeta}>
{formatEmotion(row.dominant_emotion)} {row.dominant_score.toFixed(3)} {row.event_count.toLocaleString()} events
</div>
</div>
))}
</div>
)}
</div>
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Topic Snapshots</h2>
<p style={styles.sectionSubtitle}>Per-topic mood with strength and post count.</p>
<div style={{ ...styles.grid, marginTop: 10 }}>
{strongestPerTopic.map((topic) => ( {strongestPerTopic.map((topic) => (
<div key={topic.topic} style={{ ...styles.card, gridColumn: "span 4" }}> <div key={topic.topic} style={{ ...styles.cardBase, gridColumn: "span 4" }}>
<h3 style={{ ...styles.sectionTitle, marginBottom: 6 }}>{topic.topic}</h3> <h3 style={{ ...styles.sectionTitle, marginBottom: 6 }}>{topic.topic}</h3>
<div style={styles.emotionalTopicLabel}> <div style={styles.emotionalTopicLabel}>
Top Emotion Likely Mood
</div> </div>
<div style={styles.emotionalTopicValue}> <div style={styles.emotionalTopicValue}>
{formatEmotion(topic.emotion)} {formatEmotion(topic.emotion)}
</div> </div>
<div style={styles.emotionalMetricRow}> <div style={styles.emotionalMetricRow}>
<span>Confidence</span> <span>Strength</span>
<span style={styles.emotionalMetricValue}>{topic.value.toFixed(3)}</span> <span style={styles.emotionalMetricValue}>{topic.value.toFixed(3)}</span>
</div> </div>
<div style={styles.emotionalMetricRowCompact}> <div style={styles.emotionalMetricRowCompact}>
<span>Sample Size</span> <span>Posts in Topic</span>
<span style={styles.emotionalMetricValue}>{topic.count} events</span> <span style={styles.emotionalMetricValue}>{topic.count}</span>
</div> </div>
</div> </div>
))} ))}
</div> </div>
</div> </div>
</div>
</div>
); );
} }

View File

@@ -0,0 +1,208 @@
import Card from "./Card";
import StatsStyling from "../styles/stats_styling";
import type { InteractionAnalysisResponse } from "../types/ApiTypes";
import {
ResponsiveContainer,
BarChart,
Bar,
XAxis,
YAxis,
CartesianGrid,
Tooltip,
PieChart,
Pie,
Cell,
Legend,
} from "recharts";
const styles = StatsStyling;
type InteractionalStatsProps = {
data: InteractionAnalysisResponse;
};
const InteractionalStats = ({ data }: InteractionalStatsProps) => {
const graph = data.interaction_graph ?? {};
const userCount = Object.keys(graph).length;
const edges = Object.values(graph).flatMap((targets) => Object.values(targets));
const edgeCount = edges.length;
const interactionVolume = edges.reduce((sum, value) => sum + value, 0);
const concentration = data.conversation_concentration;
const topTenCommentShare = typeof concentration?.top_10pct_comment_share === "number"
? concentration?.top_10pct_comment_share
: null;
const topTenAuthorCount = typeof concentration?.top_10pct_author_count === "number"
? concentration.top_10pct_author_count
: null;
const totalCommentingAuthors = typeof concentration?.total_commenting_authors === "number"
? concentration.total_commenting_authors
: null;
const singleCommentAuthorRatio = typeof concentration?.single_comment_author_ratio === "number"
? concentration.single_comment_author_ratio
: null;
const singleCommentAuthors = typeof concentration?.single_comment_authors === "number"
? concentration.single_comment_authors
: null;
const topPairs = (data.top_interaction_pairs ?? [])
.filter((item): item is [[string, string], number] => {
if (!Array.isArray(item) || item.length !== 2) {
return false;
}
const pair = item[0];
const count = item[1];
return Array.isArray(pair)
&& pair.length === 2
&& typeof pair[0] === "string"
&& typeof pair[1] === "string"
&& typeof count === "number";
})
.slice(0, 20);
const topPairChartData = topPairs.slice(0, 8).map(([[source, target], value], index) => ({
pair: `${source} -> ${target}`,
replies: value,
rank: index + 1,
}));
const topTenSharePercent = topTenCommentShare === null
? null
: topTenCommentShare * 100;
const nonTopTenSharePercent = topTenSharePercent === null
? null
: Math.max(0, 100 - topTenSharePercent);
let concentrationPieData: { name: string; value: number }[] = [];
if (topTenSharePercent !== null && nonTopTenSharePercent !== null) {
concentrationPieData = [
{ name: "Top 10% authors", value: topTenSharePercent },
{ name: "Other authors", value: nonTopTenSharePercent },
];
}
const PIE_COLORS = ["#2b6777", "#c8d8e4"];
return (
<div style={styles.page}>
<div style={{ ...styles.container, ...styles.grid }}>
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Conversation Overview</h2>
<p style={styles.sectionSubtitle}>Who talks to who, and how concentrated the replies are.</p>
</div>
<Card
label="Average Reply Depth"
value={typeof data.average_thread_depth === "number" ? data.average_thread_depth.toFixed(2) : "—"}
sublabel="How deep reply chains usually go"
style={{ gridColumn: "span 3" }}
/>
<Card
label="Users in Network"
value={userCount.toLocaleString()}
sublabel="Users in the reply graph"
style={{ gridColumn: "span 3" }}
/>
<Card
label="User-to-User Links"
value={edgeCount.toLocaleString()}
sublabel="Unique reply directions"
style={{ gridColumn: "span 3" }}
/>
<Card
label="Total Replies"
value={interactionVolume.toLocaleString()}
sublabel="All reply links combined"
style={{ gridColumn: "span 3" }}
/>
<Card
label="Concentrated Replies"
value={topTenSharePercent === null ? "-" : `${topTenSharePercent.toFixed(1)}%`}
sublabel={topTenAuthorCount === null || totalCommentingAuthors === null
? "Reply share from the top 10% commenters"
: `${topTenAuthorCount.toLocaleString()} of ${totalCommentingAuthors.toLocaleString()} authors`}
style={{ gridColumn: "span 6" }}
/>
<Card
label="Single-Comment Authors"
value={singleCommentAuthorRatio === null ? "-" : `${(singleCommentAuthorRatio * 100).toFixed(1)}%`}
sublabel={singleCommentAuthors === null
? "Authors who commented exactly once"
: `${singleCommentAuthors.toLocaleString()} authors commented exactly once`}
style={{ gridColumn: "span 6" }}
/>
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Conversation Visuals</h2>
<p style={styles.sectionSubtitle}>Main reply links and concentration split.</p>
<div style={{ ...styles.grid, marginTop: 12 }}>
<div style={{ ...styles.cardBase, gridColumn: "span 6" }}>
<h3 style={{ ...styles.sectionTitle, fontSize: "1rem" }}>Top Interaction Pairs</h3>
<div style={{ width: "100%", height: 300 }}>
<ResponsiveContainer>
<BarChart data={topPairChartData} layout="vertical" margin={{ top: 8, right: 16, left: 16, bottom: 8 }}>
<CartesianGrid strokeDasharray="3 3" stroke="#d9e2ec" />
<XAxis type="number" allowDecimals={false} />
<YAxis
type="category"
dataKey="rank"
tickFormatter={(value) => `#${value}`}
width={36}
/>
<Tooltip />
<Bar dataKey="replies" fill="#2b6777" radius={[0, 6, 6, 0]} />
</BarChart>
</ResponsiveContainer>
</div>
</div>
<div style={{ ...styles.cardBase, gridColumn: "span 6" }}>
<h3 style={{ ...styles.sectionTitle, fontSize: "1rem" }}>Top 10% vs Other Comment Share</h3>
<div style={{ width: "100%", height: 300 }}>
<ResponsiveContainer>
<PieChart>
<Pie
data={concentrationPieData}
dataKey="value"
nameKey="name"
innerRadius={56}
outerRadius={88}
paddingAngle={2}
>
{concentrationPieData.map((entry, index) => (
<Cell key={`${entry.name}-${index}`} fill={PIE_COLORS[index % PIE_COLORS.length]} />
))}
</Pie>
<Tooltip />
<Legend verticalAlign="bottom" height={36} />
</PieChart>
</ResponsiveContainer>
</div>
</div>
</div>
</div>
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Frequent Reply Paths</h2>
<p style={styles.sectionSubtitle}>Most common user-to-user reply paths.</p>
{!topPairs.length ? (
<div style={styles.topUserMeta}>No interaction pair data available.</div>
) : (
<div style={{ ...styles.topUsersList, maxHeight: 420, overflowY: "auto" }}>
{topPairs.map(([[source, target], value], index) => (
<div key={`${source}->${target}-${index}`} style={styles.topUserItem}>
<div style={styles.topUserName}>{source} -&gt; {target}</div>
<div style={styles.topUserMeta}>{value.toLocaleString()} replies</div>
</div>
))}
</div>
)}
</div>
</div>
</div>
);
};
export default InteractionalStats;

View File

@@ -0,0 +1,91 @@
import Card from "./Card";
import StatsStyling from "../styles/stats_styling";
import type { LinguisticAnalysisResponse } from "../types/ApiTypes";
const styles = StatsStyling;
type LinguisticStatsProps = {
data: LinguisticAnalysisResponse;
};
const LinguisticStats = ({ data }: LinguisticStatsProps) => {
const lexical = data.lexical_diversity;
const words = data.word_frequencies ?? [];
const bigrams = data.common_two_phrases ?? [];
const trigrams = data.common_three_phrases ?? [];
const topWords = words.slice(0, 20);
const topBigrams = bigrams.slice(0, 10);
const topTrigrams = trigrams.slice(0, 10);
return (
<div style={styles.page}>
<div style={{ ...styles.container, ...styles.grid }}>
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Language Overview</h2>
<p style={styles.sectionSubtitle}>Quick read on how broad and repetitive the wording is.</p>
</div>
<Card
label="Total Words"
value={lexical?.total_tokens?.toLocaleString() ?? "—"}
sublabel="Words after basic filtering"
style={{ gridColumn: "span 4" }}
/>
<Card
label="Unique Words"
value={lexical?.unique_tokens?.toLocaleString() ?? "—"}
sublabel="Different words used"
style={{ gridColumn: "span 4" }}
/>
<Card
label="Vocabulary Variety"
value={typeof lexical?.ttr === "number" ? lexical.ttr.toFixed(4) : "—"}
sublabel="Higher means less repetition"
style={{ gridColumn: "span 4" }}
/>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Top Words</h2>
<p style={styles.sectionSubtitle}>Most used single words.</p>
<div style={{ ...styles.topUsersList, maxHeight: 360, overflowY: "auto" }}>
{topWords.map((item) => (
<div key={item.word} style={styles.topUserItem}>
<div style={styles.topUserName}>{item.word}</div>
<div style={styles.topUserMeta}>{item.count.toLocaleString()} uses</div>
</div>
))}
</div>
</div>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Top Bigrams</h2>
<p style={styles.sectionSubtitle}>Most used 2-word phrases.</p>
<div style={{ ...styles.topUsersList, maxHeight: 360, overflowY: "auto" }}>
{topBigrams.map((item) => (
<div key={item.ngram} style={styles.topUserItem}>
<div style={styles.topUserName}>{item.ngram}</div>
<div style={styles.topUserMeta}>{item.count.toLocaleString()} uses</div>
</div>
))}
</div>
</div>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Top Trigrams</h2>
<p style={styles.sectionSubtitle}>Most used 3-word phrases.</p>
<div style={{ ...styles.topUsersList, maxHeight: 360, overflowY: "auto" }}>
{topTrigrams.map((item) => (
<div key={item.ngram} style={styles.topUserItem}>
<div style={styles.topUserName}>{item.ngram}</div>
<div style={styles.topUserMeta}>{item.count.toLocaleString()} uses</div>
</div>
))}
</div>
</div>
</div>
</div>
);
};
export default LinguisticStats;

View File

@@ -58,15 +58,13 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
const [selectedUser, setSelectedUser] = useState<string | null>(null); const [selectedUser, setSelectedUser] = useState<string | null>(null);
const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null; const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null;
console.log(summary)
return ( return (
<div style={styles.page}> <div style={styles.page}>
{/* main grid*/} {/* main grid*/}
<div style={{ ...styles.container, ...styles.grid}}> <div style={{ ...styles.container, ...styles.grid}}>
<Card <Card
label="Total Events" label="Total Activity"
value={summary?.total_events ?? "—"} value={summary?.total_events ?? "—"}
sublabel="Posts + comments" sublabel="Posts + comments"
style={{ style={{
@@ -74,15 +72,15 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
}} }}
/> />
<Card <Card
label="Unique Users" label="Active People"
value={summary?.unique_users ?? "—"} value={summary?.unique_users ?? "—"}
sublabel="Distinct authors" sublabel="Distinct users"
style={{ style={{
gridColumn: "span 4" gridColumn: "span 4"
}} }}
/> />
<Card <Card
label="Posts / Comments" label="Posts vs Comments"
value={ value={
summary summary
? `${summary.total_posts} / ${summary.total_comments}` ? `${summary.total_posts} / ${summary.total_comments}`
@@ -108,13 +106,13 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
/> />
<Card <Card
label="Lurker Ratio" label="One-Time Users"
value={ value={
typeof summary?.lurker_ratio === "number" typeof summary?.lurker_ratio === "number"
? `${Math.round(summary.lurker_ratio * 100)}%` ? `${Math.round(summary.lurker_ratio * 100)}%`
: "—" : "—"
} }
sublabel="Users with only 1 event" sublabel="Users with only one event"
style={{ style={{
gridColumn: "span 4" gridColumn: "span 4"
}} }}
@@ -136,12 +134,12 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* events per day */} {/* events per day */}
<div style={{ ...styles.card, gridColumn: "span 5" }}> <div style={{ ...styles.card, gridColumn: "span 5" }}>
<h2 style={styles.sectionTitle}>Events per Day</h2> <h2 style={styles.sectionTitle}>Activity Over Time</h2>
<p style={styles.sectionSubtitle}>Trend of activity over time</p> <p style={styles.sectionSubtitle}>How much posting happened each day.</p>
<div style={styles.chartWrapper}> <div style={styles.chartWrapper}>
<ResponsiveContainer width="100%" height="100%"> <ResponsiveContainer width="100%" height="100%">
<LineChart data={timeData?.events_per_day.filter((d) => new Date(d.date) >= new Date('2026-01-10'))}> <LineChart data={timeData?.events_per_day ?? []}>
<CartesianGrid strokeDasharray="3 3" /> <CartesianGrid strokeDasharray="3 3" />
<XAxis dataKey="date" /> <XAxis dataKey="date" />
<YAxis /> <YAxis />
@@ -154,8 +152,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* Word Cloud */} {/* Word Cloud */}
<div style={{ ...styles.card, gridColumn: "span 4" }}> <div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Word Cloud</h2> <h2 style={styles.sectionTitle}>Common Words</h2>
<p style={styles.sectionSubtitle}>Most common terms across events</p> <p style={styles.sectionSubtitle}>Frequently used words across the dataset.</p>
<div style={styles.chartWrapper}> <div style={styles.chartWrapper}>
<ReactWordcloud <ReactWordcloud
@@ -174,8 +172,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
<div style={{...styles.card, ...styles.scrollArea, gridColumn: "span 3", <div style={{...styles.card, ...styles.scrollArea, gridColumn: "span 3",
}} }}
> >
<h2 style={styles.sectionTitle}>Top Users</h2> <h2 style={styles.sectionTitle}>Most Active Users</h2>
<p style={styles.sectionSubtitle}>Most active authors</p> <p style={styles.sectionSubtitle}>Who posted the most events.</p>
<div style={styles.topUsersList}> <div style={styles.topUsersList}>
{userData?.top_users.slice(0, 100).map((item) => ( {userData?.top_users.slice(0, 100).map((item) => (
@@ -195,8 +193,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* Heatmap */} {/* Heatmap */}
<div style={{ ...styles.card, gridColumn: "span 12" }}> <div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Heatmap</h2> <h2 style={styles.sectionTitle}>Weekly Activity Pattern</h2>
<p style={styles.sectionSubtitle}>Activity density across time</p> <p style={styles.sectionSubtitle}>When activity tends to happen by weekday and hour.</p>
<div style={styles.heatmapWrapper}> <div style={styles.heatmapWrapper}>
<ActivityHeatmap data={timeData?.weekday_hour_heatmap ?? []} /> <ActivityHeatmap data={timeData?.weekday_hour_heatmap ?? []} />

View File

@@ -12,6 +12,9 @@ type Props = {
}; };
export default function UserModal({ open, onClose, userData, username }: Props) { export default function UserModal({ open, onClose, userData, username }: Props) {
const dominantEmotionEntry = Object.entries(userData?.avg_emotions ?? {})
.sort((a, b) => b[1] - a[1])[0];
return ( return (
<Dialog open={open} onClose={onClose} style={styles.modalRoot}> <Dialog open={open} onClose={onClose} style={styles.modalRoot}>
<div style={styles.modalBackdrop} /> <div style={styles.modalBackdrop} />
@@ -66,6 +69,15 @@ export default function UserModal({ open, onClose, userData, username }: Props)
</div> </div>
</div> </div>
) : null} ) : null}
{dominantEmotionEntry ? (
<div style={styles.topUserItem}>
<div style={styles.topUserName}>Dominant Avg Emotion</div>
<div style={styles.topUserMeta}>
{dominantEmotionEntry[0].replace("emotion_", "")} ({dominantEmotionEntry[1].toFixed(3)})
</div>
</div>
) : null}
</div> </div>
)} )}
</DialogPanel> </DialogPanel>

View File

@@ -87,15 +87,15 @@ const UserStats = (props: { data: UserAnalysisResponse }) => {
style={{ gridColumn: "span 3" }} style={{ gridColumn: "span 3" }}
/> />
<Card <Card
label="Interactions" label="Replies"
value={totalInteractions.toLocaleString()} value={totalInteractions.toLocaleString()}
sublabel="Filtered links (2+ interactions)" sublabel="Links with at least 2 replies"
style={{ gridColumn: "span 3" }} style={{ gridColumn: "span 3" }}
/> />
<Card <Card
label="Average Intensity" label="Replies per Connected User"
value={avgInteractionsPerConnectedUser.toFixed(1)} value={avgInteractionsPerConnectedUser.toFixed(1)}
sublabel="Interactions per connected user" sublabel="Average from visible graph links"
style={{ gridColumn: "span 3" }} style={{ gridColumn: "span 3" }}
/> />
<Card <Card
@@ -106,13 +106,13 @@ const UserStats = (props: { data: UserAnalysisResponse }) => {
/> />
<Card <Card
label="Strongest Connection" label="Strongest User Link"
value={strongestLink ? `${strongestLink.source} -> ${strongestLink.target}` : "—"} value={strongestLink ? `${strongestLink.source} -> ${strongestLink.target}` : "—"}
sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} interactions` : "No graph edges after filtering"} sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} replies` : "No graph links after filtering"}
style={{ gridColumn: "span 6" }} style={{ gridColumn: "span 6" }}
/> />
<Card <Card
label="Most Reply-Driven User" label="Most Comment-Heavy User"
value={highlyInteractiveUser?.author ?? "—"} value={highlyInteractiveUser?.author ?? "—"}
sublabel={ sublabel={
highlyInteractiveUser highlyInteractiveUser
@@ -125,7 +125,7 @@ const UserStats = (props: { data: UserAnalysisResponse }) => {
<div style={{ ...styles.card, gridColumn: "span 12" }}> <div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>User Interaction Graph</h2> <h2 style={styles.sectionTitle}>User Interaction Graph</h2>
<p style={styles.sectionSubtitle}> <p style={styles.sectionSubtitle}>
Nodes represent users and links represent conversation interactions. Each node is a user, and each link shows replies between them.
</p> </p>
<div ref={graphContainerRef} style={{ width: "100%", height: graphSize.height }}> <div ref={graphContainerRef} style={{ width: "100%", height: graphSize.height }}>
<ForceGraph3D <ForceGraph3D

View File

@@ -191,6 +191,9 @@ const AutoScrapePage = () => {
<p style={styles.sectionHeaderSubtitle}> <p style={styles.sectionHeaderSubtitle}>
Select sources and scrape settings, then queue processing automatically. Select sources and scrape settings, then queue processing automatically.
</p> </p>
<p style={{ ...styles.subtleBodyText, marginTop: 6, color: "#9a6700" }}>
Warning: Scraping more than 250 posts from any single site can take hours due to rate limits.
</p>
</div> </div>
<button <button
type="button" type="button"

View File

@@ -5,26 +5,42 @@ import StatsStyling from "../styles/stats_styling";
import SummaryStats from "../components/SummaryStats"; import SummaryStats from "../components/SummaryStats";
import EmotionalStats from "../components/EmotionalStats"; import EmotionalStats from "../components/EmotionalStats";
import UserStats from "../components/UserStats"; import UserStats from "../components/UserStats";
import LinguisticStats from "../components/LinguisticStats";
import InteractionalStats from "../components/InteractionalStats";
import CulturalStats from "../components/CulturalStats";
import { import {
type SummaryResponse, type SummaryResponse,
type UserAnalysisResponse, type UserAnalysisResponse,
type TimeAnalysisResponse, type TimeAnalysisResponse,
type ContentAnalysisResponse type ContentAnalysisResponse,
type UserEndpointResponse,
type LinguisticAnalysisResponse,
type EmotionalAnalysisResponse,
type InteractionAnalysisResponse,
type CulturalAnalysisResponse
} from '../types/ApiTypes' } from '../types/ApiTypes'
const API_BASE_URL = import.meta.env.VITE_BACKEND_URL const API_BASE_URL = import.meta.env.VITE_BACKEND_URL
const styles = StatsStyling; const styles = StatsStyling;
const DELETED_USERS = ["[deleted]"];
const isDeletedUser = (value: string | null | undefined) => (
DELETED_USERS.includes((value ?? "").trim().toLowerCase())
);
const StatPage = () => { const StatPage = () => {
const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>(); const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>();
const [error, setError] = useState(''); const [error, setError] = useState('');
const [loading, setLoading] = useState(false); const [loading, setLoading] = useState(false);
const [activeView, setActiveView] = useState<"summary" | "emotional" | "user">("summary"); const [activeView, setActiveView] = useState<"summary" | "emotional" | "user" | "linguistic" | "interactional" | "cultural">("summary");
const [userData, setUserData] = useState<UserAnalysisResponse | null>(null); const [userData, setUserData] = useState<UserAnalysisResponse | null>(null);
const [timeData, setTimeData] = useState<TimeAnalysisResponse | null>(null); const [timeData, setTimeData] = useState<TimeAnalysisResponse | null>(null);
const [contentData, setContentData] = useState<ContentAnalysisResponse | null>(null); const [contentData, setContentData] = useState<ContentAnalysisResponse | null>(null);
const [linguisticData, setLinguisticData] = useState<LinguisticAnalysisResponse | null>(null);
const [interactionData, setInteractionData] = useState<InteractionAnalysisResponse | null>(null);
const [culturalData, setCulturalData] = useState<CulturalAnalysisResponse | null>(null);
const [summary, setSummary] = useState<SummaryResponse | null>(null); const [summary, setSummary] = useState<SummaryResponse | null>(null);
@@ -83,15 +99,23 @@ const StatPage = () => {
setLoading(true); setLoading(true);
Promise.all([ Promise.all([
axios.get<TimeAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/time`, { axios.get<TimeAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/temporal`, {
params, params,
headers: authHeaders, headers: authHeaders,
}), }),
axios.get<UserAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/user`, { axios.get<UserEndpointResponse>(`${API_BASE_URL}/dataset/${datasetId}/user`, {
params, params,
headers: authHeaders, headers: authHeaders,
}), }),
axios.get<ContentAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/content`, { axios.get<LinguisticAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/linguistic`, {
params,
headers: authHeaders,
}),
axios.get<EmotionalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/emotional`, {
params,
headers: authHeaders,
}),
axios.get<InteractionAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/interactional`, {
params, params,
headers: authHeaders, headers: authHeaders,
}), }),
@@ -99,12 +123,87 @@ const StatPage = () => {
params, params,
headers: authHeaders, headers: authHeaders,
}), }),
axios.get<CulturalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/cultural`, {
params,
headers: authHeaders,
}),
]) ])
.then(([timeRes, userRes, contentRes, summaryRes]) => { .then(([timeRes, userRes, linguisticRes, emotionalRes, interactionRes, summaryRes, culturalRes]) => {
setUserData(userRes.data || null); const usersList = userRes.data.users ?? [];
const topUsersList = userRes.data.top_users ?? [];
const interactionGraphRaw = interactionRes.data?.interaction_graph ?? {};
const topPairsRaw = interactionRes.data?.top_interaction_pairs ?? [];
const filteredUsers: typeof usersList = [];
for (const user of usersList) {
if (isDeletedUser(user.author)) continue;
filteredUsers.push(user);
}
const filteredTopUsers: typeof topUsersList = [];
for (const user of topUsersList) {
if (isDeletedUser(user.author)) continue;
filteredTopUsers.push(user);
}
const filteredInteractionGraph: Record<string, Record<string, number>> = {};
for (const [source, targets] of Object.entries(interactionGraphRaw)) {
if (isDeletedUser(source)) {
continue;
}
const nextTargets: Record<string, number> = {};
for (const [target, count] of Object.entries(targets)) {
if (isDeletedUser(target)) {
continue;
}
nextTargets[target] = count;
}
filteredInteractionGraph[source] = nextTargets;
}
const filteredTopInteractionPairs: typeof topPairsRaw = [];
for (const pairEntry of topPairsRaw) {
const pair = pairEntry[0];
const source = pair[0];
const target = pair[1];
if (isDeletedUser(source) || isDeletedUser(target)) {
continue;
}
filteredTopInteractionPairs.push(pairEntry);
}
const combinedUserData: UserAnalysisResponse = {
...userRes.data,
users: filteredUsers,
top_users: filteredTopUsers,
interaction_graph: filteredInteractionGraph,
};
const combinedContentData: ContentAnalysisResponse = {
...linguisticRes.data,
...emotionalRes.data,
};
const filteredInteractionData: InteractionAnalysisResponse = {
...interactionRes.data,
interaction_graph: filteredInteractionGraph,
top_interaction_pairs: filteredTopInteractionPairs,
};
const filteredSummary: SummaryResponse = {
...summaryRes.data,
unique_users: filteredUsers.length,
};
setUserData(combinedUserData);
setTimeData(timeRes.data || null); setTimeData(timeRes.data || null);
setContentData(contentRes.data || null); setContentData(combinedContentData);
setSummary(summaryRes.data || null); setLinguisticData(linguisticRes.data || null);
setInteractionData(filteredInteractionData || null);
setCulturalData(culturalRes.data || null);
setSummary(filteredSummary || null);
}) })
.catch((e) => setError("Failed to load statistics: " + String(e))) .catch((e) => setError("Failed to load statistics: " + String(e)))
.finally(() => setLoading(false)); .finally(() => setLoading(false));
@@ -198,7 +297,7 @@ return (
<div style={styles.dashboardMeta}>Dataset #{datasetId ?? "-"}</div> <div style={styles.dashboardMeta}>Dataset #{datasetId ?? "-"}</div>
</div> </div>
<div style={{ ...styles.container, ...styles.tabsRow }}> <div style={{ ...styles.container, ...styles.tabsRow, justifyContent: "center" }}>
<button <button
onClick={() => setActiveView("summary")} onClick={() => setActiveView("summary")}
style={activeView === "summary" ? styles.buttonPrimary : styles.buttonSecondary} style={activeView === "summary" ? styles.buttonPrimary : styles.buttonSecondary}
@@ -218,6 +317,24 @@ return (
> >
Users Users
</button> </button>
<button
onClick={() => setActiveView("linguistic")}
style={activeView === "linguistic" ? styles.buttonPrimary : styles.buttonSecondary}
>
Linguistic
</button>
<button
onClick={() => setActiveView("interactional")}
style={activeView === "interactional" ? styles.buttonPrimary : styles.buttonSecondary}
>
Interactional
</button>
<button
onClick={() => setActiveView("cultural")}
style={activeView === "cultural" ? styles.buttonPrimary : styles.buttonSecondary}
>
Cultural
</button>
</div> </div>
{activeView === "summary" && ( {activeView === "summary" && (
@@ -243,6 +360,36 @@ return (
<UserStats data={userData} /> <UserStats data={userData} />
)} )}
{activeView === "linguistic" && linguisticData && (
<LinguisticStats data={linguisticData} />
)}
{activeView === "linguistic" && !linguisticData && (
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
No linguistic data available.
</div>
)}
{activeView === "interactional" && interactionData && (
<InteractionalStats data={interactionData} />
)}
{activeView === "interactional" && !interactionData && (
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
No interactional data available.
</div>
)}
{activeView === "cultural" && culturalData && (
<CulturalStats data={culturalData} />
)}
{activeView === "cultural" && !culturalData && (
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
No cultural data available.
</div>
)}
</div> </div>
); );
} }

View File

@@ -1,14 +1,28 @@
// User Responses // Shared types
type TopUser = {
author: string;
source: string;
count: number
};
type FrequencyWord = { type FrequencyWord = {
word: string; word: string;
count: number; count: number;
} };
type NGram = {
count: number;
ngram: string;
};
type Emotion = {
emotion_anger: number;
emotion_disgust: number;
emotion_fear: number;
emotion_joy: number;
emotion_sadness: number;
};
// User
type TopUser = {
author: string;
source: string;
count: number;
};
type Vocab = { type Vocab = {
author: string; author: string;
@@ -26,60 +40,145 @@ type User = {
comment: number; comment: number;
comment_post_ratio: number; comment_post_ratio: number;
comment_share: number; comment_share: number;
avg_emotions?: Record<string, number>;
vocab?: Vocab | null; vocab?: Vocab | null;
}; };
type InteractionGraph = Record<string, Record<string, number>>; type InteractionGraph = Record<string, Record<string, number>>;
type UserEndpointResponse = {
top_users: TopUser[];
users: User[];
};
type UserAnalysisResponse = { type UserAnalysisResponse = {
top_users: TopUser[]; top_users: TopUser[];
users: User[]; users: User[];
interaction_graph: InteractionGraph; interaction_graph: InteractionGraph;
}; };
// Time Analysis // Time
type EventsPerDay = { type EventsPerDay = {
date: Date; date: Date;
count: number; count: number;
} };
type HeatmapCell = { type HeatmapCell = {
date: Date; date: Date;
hour: number; hour: number;
count: number; count: number;
} };
type TimeAnalysisResponse = { type TimeAnalysisResponse = {
events_per_day: EventsPerDay[]; events_per_day: EventsPerDay[];
weekday_hour_heatmap: HeatmapCell[]; weekday_hour_heatmap: HeatmapCell[];
}
// Content Analysis
type Emotion = {
emotion_anger: number;
emotion_disgust: number;
emotion_fear: number;
emotion_joy: number;
emotion_sadness: number;
}; };
type NGram = { // Content (combines emotional and linguistic)
count: number;
ngram: string;
}
type AverageEmotionByTopic = Emotion & { type AverageEmotionByTopic = Emotion & {
n: number; n: number;
topic: string; topic: string;
[key: string]: string | number;
}; };
type OverallEmotionAverage = {
emotion: string;
score: number;
};
type DominantEmotionDistribution = {
emotion: string;
count: number;
ratio: number;
};
type EmotionBySource = {
source: string;
dominant_emotion: string;
dominant_score: number;
event_count: number;
};
type ContentAnalysisResponse = { type ContentAnalysisResponse = {
word_frequencies: FrequencyWord[]; word_frequencies: FrequencyWord[];
average_emotion_by_topic: AverageEmotionByTopic[]; average_emotion_by_topic: AverageEmotionByTopic[];
common_three_phrases: NGram[]; common_three_phrases: NGram[];
common_two_phrases: NGram[]; common_two_phrases: NGram[];
} overall_emotion_average?: OverallEmotionAverage[];
dominant_emotion_distribution?: DominantEmotionDistribution[];
emotion_by_source?: EmotionBySource[];
};
// Linguistic
type LinguisticAnalysisResponse = {
word_frequencies: FrequencyWord[];
common_two_phrases: NGram[];
common_three_phrases: NGram[];
lexical_diversity?: Record<string, number>;
};
// Emotional
type EmotionalAnalysisResponse = {
average_emotion_by_topic: AverageEmotionByTopic[];
overall_emotion_average?: OverallEmotionAverage[];
dominant_emotion_distribution?: DominantEmotionDistribution[];
emotion_by_source?: EmotionBySource[];
};
// Interactional
type ConversationConcentration = {
total_commenting_authors: number;
top_10pct_author_count: number;
top_10pct_comment_share: number;
single_comment_authors: number;
single_comment_author_ratio: number;
};
type InteractionAnalysisResponse = {
average_thread_depth?: number;
top_interaction_pairs?: [[string, string], number][];
conversation_concentration?: ConversationConcentration;
interaction_graph: InteractionGraph;
};
// Cultural
type IdentityMarkers = {
in_group_usage: number;
out_group_usage: number;
in_group_ratio: number;
out_group_ratio: number;
in_group_posts: number;
out_group_posts: number;
tie_posts: number;
in_group_emotion_avg?: Record<string, number>;
out_group_emotion_avg?: Record<string, number>;
};
type StanceMarkers = {
hedge_total: number;
certainty_total: number;
deontic_total: number;
permission_total: number;
hedge_per_1k_tokens: number;
certainty_per_1k_tokens: number;
deontic_per_1k_tokens: number;
permission_per_1k_tokens: number;
};
type EntityEmotionAggregate = {
post_count: number;
emotion_avg: Record<string, number>;
};
type AverageEmotionPerEntity = {
entity_emotion_avg: Record<string, EntityEmotionAggregate>;
};
type CulturalAnalysisResponse = {
identity_markers?: IdentityMarkers;
stance_markers?: StanceMarkers;
avg_emotion_per_entity?: AverageEmotionPerEntity;
};
// Summary // Summary
type SummaryResponse = { type SummaryResponse = {
@@ -96,22 +195,35 @@ type SummaryResponse = {
sources: string[]; sources: string[];
}; };
// Filtering Response // Filter
type FilterResponse = { type FilterResponse = {
rows: number rows: number;
data: any; data: any;
} };
export type { export type {
TopUser, TopUser,
Vocab, Vocab,
User, User,
InteractionGraph, InteractionGraph,
ConversationConcentration,
UserAnalysisResponse, UserAnalysisResponse,
UserEndpointResponse,
FrequencyWord, FrequencyWord,
AverageEmotionByTopic, AverageEmotionByTopic,
OverallEmotionAverage,
DominantEmotionDistribution,
EmotionBySource,
SummaryResponse, SummaryResponse,
TimeAnalysisResponse, TimeAnalysisResponse,
ContentAnalysisResponse, ContentAnalysisResponse,
FilterResponse LinguisticAnalysisResponse,
} EmotionalAnalysisResponse,
InteractionAnalysisResponse,
IdentityMarkers,
StanceMarkers,
EntityEmotionAggregate,
AverageEmotionPerEntity,
CulturalAnalysisResponse,
FilterResponse,
};

View File

@@ -1,33 +1,86 @@
import pandas as pd import pandas as pd
class EmotionalAnalysis: class EmotionalAnalysis:
def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict: def _emotion_cols(self, df: pd.DataFrame) -> list[str]:
emotion_cols = [ return [col for col in df.columns if col.startswith("emotion_")]
col for col in df.columns
if col.startswith("emotion_") def avg_emotion_by_topic(self, df: pd.DataFrame) -> list[dict]:
] emotion_cols = self._emotion_cols(df)
if not emotion_cols:
return []
counts = ( counts = (
df[ df[(df["topic"] != "Misc")].groupby("topic").size().reset_index(name="n")
(df["topic"] != "Misc")
]
.groupby("topic")
.size()
.rename("n")
) )
avg_emotion_by_topic = ( avg_emotion_by_topic = (
df[ df[(df["topic"] != "Misc")]
(df["topic"] != "Misc")
]
.groupby("topic")[emotion_cols] .groupby("topic")[emotion_cols]
.mean() .mean()
.reset_index() .reset_index()
) )
avg_emotion_by_topic = avg_emotion_by_topic.merge( avg_emotion_by_topic = avg_emotion_by_topic.merge(counts, on="topic")
counts,
on="topic" return avg_emotion_by_topic.to_dict(orient="records")
def overall_emotion_average(self, df: pd.DataFrame) -> list[dict]:
emotion_cols = self._emotion_cols(df)
if not emotion_cols:
return []
means = df[emotion_cols].mean()
return [
{
"emotion": col.replace("emotion_", ""),
"score": float(means[col]),
}
for col in emotion_cols
]
def dominant_emotion_distribution(self, df: pd.DataFrame) -> list[dict]:
emotion_cols = self._emotion_cols(df)
if not emotion_cols or df.empty:
return []
dominant_per_row = df[emotion_cols].idxmax(axis=1)
counts = dominant_per_row.value_counts()
total = max(len(dominant_per_row), 1)
return [
{
"emotion": col.replace("emotion_", ""),
"count": int(count),
"ratio": round(float(count / total), 4),
}
for col, count in counts.items()
]
def emotion_by_source(self, df: pd.DataFrame) -> list[dict]:
emotion_cols = self._emotion_cols(df)
if not emotion_cols or "source" not in df.columns or df.empty:
return []
source_counts = df.groupby("source").size()
source_means = df.groupby("source")[emotion_cols].mean().reset_index()
rows = source_means.to_dict(orient="records")
output = []
for row in rows:
source = row["source"]
dominant_col = max(emotion_cols, key=lambda col: float(row.get(col, 0)))
output.append(
{
"source": str(source),
"dominant_emotion": dominant_col.replace("emotion_", ""),
"dominant_score": round(float(row.get(dominant_col, 0)), 4),
"event_count": int(source_counts.get(source, 0)),
}
) )
return avg_emotion_by_topic.to_dict(orient='records') return output

View File

@@ -1,9 +1,6 @@
import pandas as pd import pandas as pd
import re import re
from collections import Counter
class InteractionAnalysis: class InteractionAnalysis:
def __init__(self, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
@@ -12,118 +9,6 @@ class InteractionAnalysis:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user(
self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
) -> list:
df = df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize)
rows = []
for author, group in df.groupby("author"):
all_tokens = [t for tokens in group["tokens"] for t in tokens]
total_words = len(all_tokens)
unique_words = len(set(all_tokens))
events = len(group)
# Min amount of words for a user, any less than this might give weird results
if total_words < min_words:
continue
# 100% = they never reused a word (excluding stop words)
vocab_richness = unique_words / total_words
avg_words = total_words / max(events, 1)
counts = Counter(all_tokens)
top_words = [
{"word": w, "count": int(c)}
for w, c in counts.most_common(top_most_used_words)
]
rows.append(
{
"author": author,
"events": int(events),
"total_words": int(total_words),
"unique_words": int(unique_words),
"vocab_richness": round(vocab_richness, 3),
"avg_words_per_event": round(avg_words, 2),
"top_words": top_words,
}
)
rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
return rows
def top_users(self, df: pd.DataFrame) -> list:
counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
top_users = [
{"author": author, "source": source, "count": int(count)}
for (author, source), count in counts.items()
]
return top_users
def per_user_analysis(self, df: pd.DataFrame) -> dict:
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
avg_emotions_by_author = {}
if emotion_cols:
avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows()
}
# ensure columns always exist
for col in ("post", "comment"):
if col not in per_user.columns:
per_user[col] = 0
per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
0, 1
)
per_user["comment_share"] = per_user["comment"] / (
per_user["post"] + per_user["comment"]
).replace(0, 1)
per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user(df)
vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information
merged_users = []
for row in per_user_records:
author = row["author"]
merged_users.append(
{
"author": author,
"post": int(row.get("post", 0)),
"comment": int(row.get("comment", 0)),
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}),
"vocab": vocab_by_author.get(
author,
{
"vocab_richness": 0,
"avg_words_per_event": 0,
"top_words": [],
},
),
}
)
merged_users.sort(key=lambda u: u["comment_post_ratio"])
return merged_users
def interaction_graph(self, df: pd.DataFrame): def interaction_graph(self, df: pd.DataFrame):
interactions = {a: {} for a in df["author"].dropna().unique()} interactions = {a: {} for a in df["author"].dropna().unique()}
@@ -167,67 +52,36 @@ class InteractionAnalysis:
return round(sum(depths) / len(depths), 2) return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self, df: pd.DataFrame): def top_interaction_pairs(self, df: pd.DataFrame, top_n=10):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"} graph = self.interaction_graph(df)
pairs = []
emotion_cols = [ for a, targets in graph.items():
c for b, count in targets.items():
for c in df.columns pairs.append(((a, b), count))
if c.startswith("emotion_") and c not in emotion_exclusions
]
id_to_reply = df.set_index("id")["reply_to"].to_dict() pairs.sort(key=lambda x: x[1], reverse=True)
length_cache = {} return pairs[:top_n]
def thread_length_from(start_id): def conversation_concentration(self, df: pd.DataFrame) -> dict:
if start_id in length_cache: if "type" not in df.columns:
return length_cache[start_id] return {}
seen = set() comments = df[df["type"] == "comment"]
length = 1 if comments.empty:
current = start_id return {}
while True: author_counts = comments["author"].value_counts()
if current in seen: total_comments = len(comments)
# infinite loop shouldn't happen, but just in case total_authors = len(author_counts)
break
seen.add(current)
reply_to = id_to_reply.get(current) top_10_pct_n = max(1, int(total_authors * 0.1))
top_10_pct_share = round(author_counts.head(top_10_pct_n).sum() / total_comments, 4)
if (
reply_to is None
or (isinstance(reply_to, float) and pd.isna(reply_to))
or reply_to == ""
):
break
length += 1
current = reply_to
if current in length_cache:
length += length_cache[current] - 1
break
length_cache[start_id] = length
return length
emotion_to_lengths = {}
# Fill NaNs in emotion cols to avoid max() issues
emo_df = df[["id"] + emotion_cols].copy()
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
for _, row in emo_df.iterrows():
msg_id = row["id"]
length = thread_length_from(msg_id)
emotions = {c: row[c] for c in emotion_cols}
dominant = max(emotions, key=emotions.get)
emotion_to_lengths.setdefault(dominant, []).append(length)
return { return {
emotion: round(sum(lengths) / len(lengths), 2) "total_commenting_authors": total_authors,
for emotion, lengths in emotion_to_lengths.items() "top_10pct_author_count": top_10_pct_n,
"top_10pct_comment_share": float(top_10_pct_share),
"single_comment_authors": int((author_counts == 1).sum()),
"single_comment_author_ratio": float(round((author_counts == 1).sum() / total_authors, 4)),
} }

View File

@@ -61,3 +61,19 @@ class LinguisticAnalysis:
.head(limit) .head(limit)
.to_dict(orient="records") .to_dict(orient="records")
) )
def lexical_diversity(self, df: pd.DataFrame) -> dict:
tokens = (
df["content"].fillna("").astype(str).str.lower()
.str.findall(r"\b[a-z]{2,}\b")
.explode()
)
tokens = tokens[~tokens.isin(self.word_exclusions)]
total = max(len(tokens), 1)
unique = int(tokens.nunique())
return {
"total_tokens": total,
"unique_tokens": unique,
"ttr": round(unique / total, 4),
}

View File

@@ -6,7 +6,9 @@ from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.summary import SummaryAnalysis
from server.analysis.temporal import TemporalAnalysis from server.analysis.temporal import TemporalAnalysis
from server.analysis.user import UserAnalysis
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "www",
@@ -36,12 +38,11 @@ class StatGen:
self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS) self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis() self.cultural_analysis = CulturalAnalysis()
self.summary_analysis = SummaryAnalysis()
self.user_analysis = UserAnalysis(EXCLUDE_WORDS)
## Private Methods ## Private Methods
def _prepare_filtered_df(self, def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame:
df: pd.DataFrame,
filters: dict | None = None
) -> pd.DataFrame:
filters = filters or {} filters = filters or {}
filtered_df = df.copy() filtered_df = df.copy()
@@ -51,10 +52,9 @@ class StatGen:
data_source_filter = filters.get("data_sources", None) data_source_filter = filters.get("data_sources", None)
if search_query: if search_query:
mask = ( mask = filtered_df["content"].str.contains(
filtered_df["content"].str.contains(search_query, case=False, na=False) search_query, case=False, na=False
| filtered_df["author"].str.contains(search_query, case=False, na=False) ) | filtered_df["author"].str.contains(search_query, case=False, na=False)
)
# Only include title if the column exists # Only include title if the column exists
if "title" in filtered_df.columns: if "title" in filtered_df.columns:
@@ -76,10 +76,10 @@ class StatGen:
return filtered_df return filtered_df
## Public Methods ## Public Methods
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
return self._prepare_filtered_df(df, filters).to_dict(orient="records") return self._prepare_filtered_df(df, filters).to_dict(orient="records")
def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
@@ -87,84 +87,54 @@ class StatGen:
"weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df), "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
} }
def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
"word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df), "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df),
"common_two_phrases": self.linguistic_analysis.ngrams(filtered_df), "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df),
"common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3), "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3),
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic( "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
filtered_df
)
} }
def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
"top_users": self.interaction_analysis.top_users(filtered_df), "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(filtered_df),
"users": self.interaction_analysis.per_user_analysis(filtered_df), "overall_emotion_average": self.emotional_analysis.overall_emotion_average(filtered_df),
"interaction_graph": self.interaction_analysis.interaction_graph(filtered_df) "dominant_emotion_distribution": self.emotional_analysis.dominant_emotion_distribution(filtered_df),
"emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
} }
def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
"average_thread_depth": self.interaction_analysis.average_thread_depth( "top_users": self.user_analysis.top_users(filtered_df),
filtered_df "users": self.user_analysis.per_user_analysis(filtered_df)
),
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(
filtered_df
),
} }
def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
"identity_markers": self.cultural_analysis.get_identity_markers( "average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df),
filtered_df "top_interaction_pairs": self.interaction_analysis.top_interaction_pairs(filtered_df, top_n=100),
), "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df),
"conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
}
def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
"identity_markers": self.cultural_analysis.get_identity_markers(filtered_df),
"stance_markers": self.cultural_analysis.get_stance_markers(filtered_df), "stance_markers": self.cultural_analysis.get_stance_markers(filtered_df),
"entity_salience": self.cultural_analysis.get_avg_emotions_per_entity( "avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
filtered_df
),
} }
def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
total_posts = (filtered_df["type"] == "post").sum() return self.summary_analysis.summary(filtered_df)
total_comments = (filtered_df["type"] == "comment").sum()
events_per_user = filtered_df.groupby("author").size()
if filtered_df.empty:
return {
"total_events": 0,
"total_posts": 0,
"total_comments": 0,
"unique_users": 0,
"comments_per_post": 0,
"lurker_ratio": 0,
"time_range": {
"start": None,
"end": None,
},
"sources": [],
}
return {
"total_events": int(len(filtered_df)),
"total_posts": int(total_posts),
"total_comments": int(total_comments),
"unique_users": int(events_per_user.count()),
"comments_per_post": round(total_comments / max(total_posts, 1), 2),
"lurker_ratio": round((events_per_user == 1).mean(), 2),
"time_range": {
"start": int(filtered_df["dt"].min().timestamp()),
"end": int(filtered_df["dt"].max().timestamp()),
},
"sources": filtered_df["source"].dropna().unique().tolist(),
}

View File

@@ -0,0 +1,64 @@
import pandas as pd
class SummaryAnalysis:
def total_events(self, df: pd.DataFrame) -> int:
return int(len(df))
def total_posts(self, df: pd.DataFrame) -> int:
return int(len(df[df["type"] == "post"]))
def total_comments(self, df: pd.DataFrame) -> int:
return int(len(df[df["type"] == "comment"]))
def unique_users(self, df: pd.DataFrame) -> int:
return int(len(df["author"].dropna().unique()))
def comments_per_post(self, total_comments: int, total_posts: int) -> float:
return round(total_comments / max(total_posts, 1), 2)
def lurker_ratio(self, df: pd.DataFrame) -> float:
events_per_user = df.groupby("author").size()
return round((events_per_user == 1).mean(), 2)
def time_range(self, df: pd.DataFrame) -> dict:
return {
"start": int(df["dt"].min().timestamp()),
"end": int(df["dt"].max().timestamp()),
}
def sources(self, df: pd.DataFrame) -> list:
return df["source"].dropna().unique().tolist()
def empty_summary(self) -> dict:
return {
"total_events": 0,
"total_posts": 0,
"total_comments": 0,
"unique_users": 0,
"comments_per_post": 0,
"lurker_ratio": 0,
"time_range": {
"start": None,
"end": None,
},
"sources": [],
}
def summary(self, df: pd.DataFrame) -> dict:
if df.empty:
return self.empty_summary()
total_posts = self.total_posts(df)
total_comments = self.total_comments(df)
return {
"total_events": self.total_events(df),
"total_posts": total_posts,
"total_comments": total_comments,
"unique_users": self.unique_users(df),
"comments_per_post": self.comments_per_post(total_comments, total_posts),
"lurker_ratio": self.lurker_ratio(df),
"time_range": self.time_range(df),
"sources": self.sources(df),
}

124
server/analysis/user.py Normal file
View File

@@ -0,0 +1,124 @@
import pandas as pd
import re
from collections import Counter
class UserAnalysis:
    """Per-user activity, emotion, and vocabulary statistics for an event DataFrame."""

    def __init__(self, word_exclusions: set[str]):
        # Stop words (assumed already lowercase) ignored by all vocabulary stats.
        self.word_exclusions = word_exclusions

    def _tokenize(self, text: str) -> list[str]:
        """Alphabetic tokens of 3+ chars, minus excluded words.

        Assumes ``text`` has already been lowercased by the caller.
        """
        tokens = re.findall(r"\b[a-z]{3,}\b", text)
        return [t for t in tokens if t not in self.word_exclusions]

    def _vocab_richness_per_user(
        self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
    ) -> list:
        """Vocabulary stats per author, sorted by vocab richness (descending).

        Authors with fewer than ``min_words`` usable tokens are skipped,
        since tiny samples produce unstable richness ratios.
        """
        df = df.copy()
        df["content"] = df["content"].fillna("").astype(str).str.lower()
        df["tokens"] = df["content"].apply(self._tokenize)
        rows = []
        for author, group in df.groupby("author"):
            all_tokens = [t for tokens in group["tokens"] for t in tokens]
            total_words = len(all_tokens)
            unique_words = len(set(all_tokens))
            events = len(group)
            # Min amount of words for a user, any less might give weird results.
            if total_words < min_words:
                continue
            # 1.0 = the author never reused a word (excluding stop words).
            vocab_richness = unique_words / total_words
            avg_words = total_words / max(events, 1)
            counts = Counter(all_tokens)
            top_words = [
                {"word": w, "count": int(c)}
                for w, c in counts.most_common(top_most_used_words)
            ]
            rows.append(
                {
                    "author": author,
                    "events": int(events),
                    "total_words": int(total_words),
                    "unique_words": int(unique_words),
                    "vocab_richness": round(vocab_richness, 3),
                    "avg_words_per_event": round(avg_words, 2),
                    "top_words": top_words,
                }
            )
        rows.sort(key=lambda x: x["vocab_richness"], reverse=True)
        return rows

    def top_users(self, df: pd.DataFrame) -> list:
        """Authors ranked by event count, split by source (descending)."""
        counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
        top_users = [
            {"author": author, "source": source, "count": int(count)}
            for (author, source), count in counts.items()
        ]
        return top_users

    def per_user_analysis(self, df: pd.DataFrame) -> list:
        """Merge per-author activity counts, emotion averages, and vocab stats.

        Returns a list of per-author dicts sorted by comment/post ratio
        (ascending). Emotion averages are included only if the DataFrame
        carries ``emotion_*`` columns.
        """
        per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
        emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
        avg_emotions_by_author = {}
        if emotion_cols:
            avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
            avg_emotions_by_author = {
                author: {emotion: float(score) for emotion, score in row.items()}
                for author, row in avg_emotions.iterrows()
            }
        # Ensure both activity columns exist even if the dataset lacks one type.
        for col in ("post", "comment"):
            if col not in per_user.columns:
                per_user[col] = 0
        # replace(0, 1) avoids division by zero for authors with no posts.
        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
            0, 1
        )
        per_user["comment_share"] = per_user["comment"] / (
            per_user["post"] + per_user["comment"]
        ).replace(0, 1)
        per_user = per_user.sort_values("comment_post_ratio", ascending=True)
        per_user_records = per_user.reset_index().to_dict(orient="records")
        vocab_rows = self._vocab_richness_per_user(df)
        vocab_by_author = {row["author"]: row for row in vocab_rows}
        # Merge activity, emotion, and vocabulary views into one record per author.
        merged_users = []
        for row in per_user_records:
            author = row["author"]
            merged_users.append(
                {
                    "author": author,
                    "post": int(row.get("post", 0)),
                    "comment": int(row.get("comment", 0)),
                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
                    "comment_share": float(row.get("comment_share", 0)),
                    "avg_emotions": avg_emotions_by_author.get(author, {}),
                    # Fallback mirrors the schema produced by
                    # _vocab_richness_per_user for authors below the
                    # min-word threshold, so consumers see uniform keys.
                    "vocab": vocab_by_author.get(
                        author,
                        {
                            "events": 0,
                            "total_words": 0,
                            "unique_words": 0,
                            "vocab_richness": 0,
                            "avg_words_per_event": 0,
                            "top_words": [],
                        },
                    ),
                }
            )
        merged_users.sort(key=lambda u: u["comment_post_ratio"])
        return merged_users

View File

@@ -186,7 +186,7 @@ def scrape_data():
dataset_manager.set_dataset_status( dataset_manager.set_dataset_status(
dataset_id, dataset_id,
"fetching", "fetching",
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
) )
fetch_and_process_dataset.delay( fetch_and_process_dataset.delay(
@@ -198,12 +198,14 @@ def scrape_data():
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": "Failed to queue dataset processing"}), 500 return jsonify({"error": "Failed to queue dataset processing"}), 500
return jsonify(
return jsonify({ {
"message": "Dataset queued for processing", "message": "Dataset queued for processing",
"dataset_id": dataset_id, "dataset_id": dataset_id,
"status": "processing" "status": "processing",
}), 202 }
), 202
@app.route("/datasets/upload", methods=["POST"]) @app.route("/datasets/upload", methods=["POST"])
@jwt_required() @jwt_required()
@@ -233,7 +235,9 @@ def upload_data():
posts_df = pd.read_json(post_file, lines=True, convert_dates=False) posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
topics = json.load(topic_file) topics = json.load(topic_file)
dataset_id = dataset_manager.save_dataset_info(current_user, dataset_name, topics) dataset_id = dataset_manager.save_dataset_info(
current_user, dataset_name, topics
)
process_dataset.delay(dataset_id, posts_df.to_dict(orient="records"), topics) process_dataset.delay(dataset_id, posts_df.to_dict(orient="records"), topics)
@@ -249,6 +253,7 @@ def upload_data():
except Exception as e: except Exception as e:
return jsonify({"error": f"An unexpected error occurred"}), 500 return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>", methods=["GET"]) @app.route("/dataset/<int:dataset_id>", methods=["GET"])
@jwt_required() @jwt_required()
def get_dataset(dataset_id): def get_dataset(dataset_id):
@@ -256,7 +261,9 @@ def get_dataset(dataset_id):
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_info = dataset_manager.get_dataset_info(dataset_id) dataset_info = dataset_manager.get_dataset_info(dataset_id)
included_cols = {"id", "name", "created_at"} included_cols = {"id", "name", "created_at"}
@@ -270,6 +277,7 @@ def get_dataset(dataset_id):
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500 return jsonify({"error": "An unexpected error occured"}), 500
@app.route("/dataset/<int:dataset_id>", methods=["PATCH"]) @app.route("/dataset/<int:dataset_id>", methods=["PATCH"])
@jwt_required() @jwt_required()
def update_dataset(dataset_id): def update_dataset(dataset_id):
@@ -277,7 +285,9 @@ def update_dataset(dataset_id):
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
body = request.get_json() body = request.get_json()
new_name = body.get("name") new_name = body.get("name")
@@ -286,7 +296,9 @@ def update_dataset(dataset_id):
return jsonify({"error": "A valid name must be provided"}), 400 return jsonify({"error": "A valid name must be provided"}), 400
dataset_manager.update_dataset_name(dataset_id, new_name.strip()) dataset_manager.update_dataset_name(dataset_id, new_name.strip())
return jsonify({"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}), 200 return jsonify(
{"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}
), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -295,6 +307,7 @@ def update_dataset(dataset_id):
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": "An unexpected error occurred"}), 500 return jsonify({"error": "An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>", methods=["DELETE"]) @app.route("/dataset/<int:dataset_id>", methods=["DELETE"])
@jwt_required() @jwt_required()
def delete_dataset(dataset_id): def delete_dataset(dataset_id):
@@ -302,11 +315,17 @@ def delete_dataset(dataset_id):
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_manager.delete_dataset_info(dataset_id) dataset_manager.delete_dataset_info(dataset_id)
dataset_manager.delete_dataset_content(dataset_id) dataset_manager.delete_dataset_content(dataset_id)
return jsonify({"message": f"Dataset {dataset_id} metadata and content successfully deleted"}), 200 return jsonify(
{
"message": f"Dataset {dataset_id} metadata and content successfully deleted"
}
), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -315,6 +334,7 @@ def delete_dataset(dataset_id):
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500 return jsonify({"error": "An unexpected error occured"}), 500
@app.route("/dataset/<int:dataset_id>/status", methods=["GET"]) @app.route("/dataset/<int:dataset_id>/status", methods=["GET"])
@jwt_required() @jwt_required()
def get_dataset_status(dataset_id): def get_dataset_status(dataset_id):
@@ -322,7 +342,9 @@ def get_dataset_status(dataset_id):
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_status = dataset_manager.get_dataset_status(dataset_id) dataset_status = dataset_manager.get_dataset_status(dataset_id)
return jsonify(dataset_status), 200 return jsonify(dataset_status), 200
@@ -334,17 +356,44 @@ def get_dataset_status(dataset_id):
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500 return jsonify({"error": "An unexpected error occured"}), 500
@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
@app.route("/dataset/<int:dataset_id>/linguistic", methods=["GET"])
@jwt_required() @jwt_required()
def content_endpoint(dataset_id): def get_linguistic_analysis(dataset_id):
try: try:
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200 return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
return jsonify({"error": "Dataset does not exist"}), 404
except ValueError as e:
return jsonify({"error": f"Malformed or missing data"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/emotional", methods=["GET"])
@jwt_required()
def get_emotional_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
return jsonify(stat_gen.emotional(dataset_content, filters)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -362,7 +411,9 @@ def get_summary(dataset_id):
try: try:
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
@@ -378,17 +429,19 @@ def get_summary(dataset_id):
return jsonify({"error": f"An unexpected error occurred"}), 500 return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/time", methods=["GET"]) @app.route("/dataset/<int:dataset_id>/temporal", methods=["GET"])
@jwt_required() @jwt_required()
def get_time_analysis(dataset_id): def get_temporal_analysis(dataset_id):
try: try:
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200 return jsonify(stat_gen.temporal(dataset_content, filters)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -406,11 +459,13 @@ def get_user_analysis(dataset_id):
try: try:
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200 return jsonify(stat_gen.user(dataset_content, filters)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -428,11 +483,13 @@ def get_cultural_analysis(dataset_id):
try: try:
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200 return jsonify(stat_gen.cultural(dataset_content, filters)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -444,17 +501,19 @@ def get_cultural_analysis(dataset_id):
return jsonify({"error": f"An unexpected error occurred"}), 500 return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"]) @app.route("/dataset/<int:dataset_id>/interactional", methods=["GET"])
@jwt_required() @jwt_required()
def get_interaction_analysis(dataset_id): def get_interaction_analysis(dataset_id):
try: try:
user_id = int(get_jwt_identity()) user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id): if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset") raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.get_interactional_analysis(dataset_content, filters)), 200 return jsonify(stat_gen.interactional(dataset_content, filters)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -465,6 +524,27 @@ def get_interaction_analysis(dataset_id):
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred"}), 500 return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/all", methods=["GET"])
@jwt_required()
def get_full_dataset(dataset_id: int):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
return jsonify(dataset_content.to_dict(orient="records")), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
return jsonify({"error": "Dataset does not exist"}), 404
except ValueError as e:
return jsonify({"error": f"Malformed or missing data"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred"}), 500
if __name__ == "__main__": if __name__ == "__main__":
app.run(debug=True) app.run(debug=True)

View File

@@ -101,7 +101,7 @@ class DatasetManager:
row["source"], row["source"],
row.get("topic"), row.get("topic"),
row.get("topic_confidence"), row.get("topic_confidence"),
Json(row["ner_entities"]) if row.get("ner_entities") else None, Json(row["entities"]) if row.get("entities") is not None else None,
row.get("emotion_anger"), row.get("emotion_anger"),
row.get("emotion_disgust"), row.get("emotion_disgust"),
row.get("emotion_fear"), row.get("emotion_fear"),

View File

@@ -43,7 +43,7 @@ CREATE TABLE events (
weekday VARCHAR(255) NOT NULL, weekday VARCHAR(255) NOT NULL,
/* Posts Only */ /* Posts Only */
title VARCHAR(255), title TEXT,
/* Comments Only*/ /* Comments Only*/
parent_id VARCHAR(255), parent_id VARCHAR(255),