Compare commits

..

25 Commits

Author SHA1 Message Date
acc591ff1e Merge pull request 'Finish off the links between frontend and backend' (#10) from feat/add-frontend-pages into main
Reviewed-on: #10
2026-03-18 20:30:19 +00:00
e054997bb1 feat(frontend): reword CulturalStats to improve understandability 2026-03-18 19:23:35 +00:00
e5414befa7 feat(frontend): add dominant emotion display to UserModal 2026-03-18 19:12:25 +00:00
86926898ce feat(frontend): improve labels to be more understandable 2026-03-18 19:12:11 +00:00
b1177540a1 feat(frontend): enhance EmotionalStats component with detailed mood analysis 2026-03-18 19:11:18 +00:00
f604fcc531 feat(frontend): add warning message for scraping limits 2026-03-18 19:02:11 +00:00
b7aec2b0ea feat(frontend): add favicon
Credit goes to `srip` on flaticon for the image.
2026-03-18 19:00:31 +00:00
1446dd176d feat(frontend): center page selection 2026-03-18 18:53:14 +00:00
c215024ef2 feat(frontend): add deleted user filter
Reddit often contains "[Deleted]" when a user is banned or deletes their post/comment. Keeping the backend faithful to the original dataset is important, so the filtering is done on the frontend.
2026-03-18 18:50:51 +00:00
17ef42e548 feat!(frontend): add cultural, interactional and linguistic stat pages 2026-03-18 18:43:49 +00:00
7e4a91bb5e style(frontend): style api types to be in order of the endpoint 2026-03-18 18:40:39 +00:00
436549641f chore(frontend): add api types for new backend data 2026-03-18 18:37:39 +00:00
3e78a54388 feat(stat): add conversation concentration metric
Remove the old `initiator_ratio` metric, which wasn't working due to every event having a `reply_to` value.

This metric was suggested by AI, and it turned out to be a surprisingly useful one that gave interesting insights.
2026-03-18 18:36:09 +00:00
71998c450e fix(db): change title type to text
Occasionally a Reddit post would have a long title, which would break the schema.
2026-03-17 19:49:03 +00:00
2a00384a55 feat(interaction): add top interaction pairs and initiator ratio methods 2026-03-17 19:03:56 +00:00
8372aa7278 feat(api): add endpoint to view entire dataset 2026-03-17 13:36:41 +00:00
7b5a939271 fix(stats): missing private methods in User obj 2026-03-17 13:36:10 +00:00
2fa1dff4b7 feat(stat): add lexical diversity stat 2026-03-17 13:27:49 +00:00
31fb275ee3 fix(db): incorrect NER column being inserted 2026-03-17 12:53:30 +00:00
8a0f6e71e8 chore(api): rename cultural entity emotion endpoint 2026-03-17 12:31:53 +00:00
9093059d05 refactor(stats): move user stats out of interactional into users 2026-03-17 12:23:03 +00:00
8a13444b16 chore(frontend): add new API types 2026-03-16 16:46:07 +00:00
3468fdc2ea feat(api): add new user and linguistic endpoints 2026-03-16 16:45:11 +00:00
09a4f9036f refactor(stats): add summary and user stat classes for consistency 2026-03-16 16:43:24 +00:00
97fccd073b feat(emotional): add average emotion & dominant emotion stats 2026-03-16 16:41:28 +00:00
21 changed files with 1364 additions and 406 deletions

View File

@@ -2,7 +2,7 @@
<html lang="en">
<head>
<meta charset="UTF-8" />
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<link rel="icon" type="image/png" href="/icon.png" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>frontend</title>
</head>

BIN
frontend/public/icon.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

View File

@@ -0,0 +1,158 @@
import Card from "./Card";
import StatsStyling from "../styles/stats_styling";
import type { CulturalAnalysisResponse } from "../types/ApiTypes";
const styles = StatsStyling;

type CulturalStatsProps = {
  data: CulturalAnalysisResponse;
};

/**
 * Picks the highest-scoring emotion from an emotion-average map and renders it
 * as `label (score%)`, with any `emotion_` prefix stripped from the label.
 * Returns an em dash when the map is missing or empty.
 */
const describeTopEmotion = (emotionAvg: Record<string, number> | undefined): string => {
  const ranked = Object.entries(emotionAvg ?? {}).sort((left, right) => right[1] - left[1]);
  if (ranked.length === 0) {
    return "—";
  }

  // The fallback is unreachable after the length check; it only satisfies index typing.
  const [rawLabel, score] = ranked[0] ?? ["emotion_unknown", 0];
  return `${rawLabel.replace("emotion_", "")} (${(score * 100).toFixed(1)}%)`;
};

/** Converts a 0..1 ratio to a percentage, or `null` when the value is not a number. */
const ratioToPercent = (ratio: unknown): number | null => {
  if (typeof ratio !== "number") {
    return null;
  }
  return ratio * 100;
};

/** Two-decimal percentage text for a share card, or an em dash when unavailable. */
const formatShare = (percent: number | null): string => {
  if (percent === null) {
    return "—";
  }
  return `${percent.toFixed(2)}%`;
};

/** Sublabel for a stance card: a per-1k rate when numeric, otherwise a generic caption. */
const perThousandSublabel = (rate: unknown): string => {
  if (typeof rate === "number") {
    return `${rate.toFixed(1)} per 1k words`;
  }
  return "Word frequency";
};

/**
 * Cultural analysis page: in-group/out-group word and post counts, stance-marker
 * totals, the leading emotion for "us" and "them" posts, and the 20 entities
 * with the highest post counts alongside their leading emotion.
 */
const CulturalStats = ({ data }: CulturalStatsProps) => {
  const { identity_markers: identityMarkers, stance_markers: stanceMarkers } = data;

  // Word counts default to 0 so the combined total can always be computed.
  const inGroupCount = identityMarkers?.in_group_usage ?? 0;
  const outGroupCount = identityMarkers?.out_group_usage ?? 0;
  const combinedGroupCount = inGroupCount + outGroupCount;

  const inGroupPercent = ratioToPercent(identityMarkers?.in_group_ratio);
  const outGroupPercent = ratioToPercent(identityMarkers?.out_group_ratio);

  // Entities ordered by post count, highest first, capped at 20 rows.
  const entityEmotionMap = data.avg_emotion_per_entity?.entity_emotion_avg ?? {};
  const topEntities = Object.entries(entityEmotionMap)
    .sort((left, right) => right[1].post_count - left[1].post_count)
    .slice(0, 20);

  return (
    <div style={styles.page}>
      <div style={{ ...styles.container, ...styles.grid }}>
        {/* Page heading */}
        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Community Framing Overview</h2>
          <p style={styles.sectionSubtitle}>Simple view of how often people use "us" words vs "them" words, and the tone around that language.</p>
        </div>

        {/* Identity markers: word-level and post-level counts */}
        <Card
          label="In-Group Words"
          value={inGroupCount.toLocaleString()}
          sublabel="Times we/us/our appears"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Out-Group Words"
          value={outGroupCount.toLocaleString()}
          sublabel="Times they/them/their appears"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="In-Group Posts"
          value={identityMarkers?.in_group_posts?.toLocaleString() ?? "—"}
          sublabel='Posts leaning toward "us" language'
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Out-Group Posts"
          value={identityMarkers?.out_group_posts?.toLocaleString() ?? "—"}
          sublabel='Posts leaning toward "them" language'
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Balanced Posts"
          value={identityMarkers?.tie_posts?.toLocaleString() ?? "—"}
          sublabel="Posts with equal us/them signals"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Total Group Words"
          value={combinedGroupCount.toLocaleString()}
          sublabel="In-group + out-group words"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="In-Group Share"
          value={formatShare(inGroupPercent)}
          sublabel="Share of all words"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Out-Group Share"
          value={formatShare(outGroupPercent)}
          sublabel="Share of all words"
          style={{ gridColumn: "span 3" }}
        />

        {/* Stance markers: totals with a per-1k rate when one is provided */}
        <Card
          label="Hedging Words"
          value={stanceMarkers?.hedge_total?.toLocaleString() ?? "—"}
          sublabel={perThousandSublabel(stanceMarkers?.hedge_per_1k_tokens)}
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Certainty Words"
          value={stanceMarkers?.certainty_total?.toLocaleString() ?? "—"}
          sublabel={perThousandSublabel(stanceMarkers?.certainty_per_1k_tokens)}
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Need/Should Words"
          value={stanceMarkers?.deontic_total?.toLocaleString() ?? "—"}
          sublabel={perThousandSublabel(stanceMarkers?.deontic_per_1k_tokens)}
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Permission Words"
          value={stanceMarkers?.permission_total?.toLocaleString() ?? "—"}
          sublabel={perThousandSublabel(stanceMarkers?.permission_per_1k_tokens)}
          style={{ gridColumn: "span 3" }}
        />

        {/* Leading emotion for in-group and out-group posts */}
        <div style={{ ...styles.card, gridColumn: "span 6" }}>
          <h2 style={styles.sectionTitle}>Mood in "Us" Posts</h2>
          <p style={styles.sectionSubtitle}>Most likely emotion when in-group wording is stronger.</p>
          <div style={styles.topUserName}>{describeTopEmotion(identityMarkers?.in_group_emotion_avg)}</div>
        </div>
        <div style={{ ...styles.card, gridColumn: "span 6" }}>
          <h2 style={styles.sectionTitle}>Mood in "Them" Posts</h2>
          <p style={styles.sectionSubtitle}>Most likely emotion when out-group wording is stronger.</p>
          <div style={styles.topUserName}>{describeTopEmotion(identityMarkers?.out_group_emotion_avg)}</div>
        </div>

        {/* Per-entity list, or an empty-state message when there are no entities */}
        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Entity Mood Snapshot</h2>
          <p style={styles.sectionSubtitle}>Most mentioned entities and the mood that appears most with each.</p>
          {topEntities.length === 0 ? (
            <div style={styles.topUserMeta}>No entity-level cultural data available.</div>
          ) : (
            <div style={{ ...styles.topUsersList, maxHeight: 420, overflowY: "auto" }}>
              {topEntities.map(([entityName, entityStats]) => (
                <div key={entityName} style={styles.topUserItem}>
                  <div style={styles.topUserName}>{entityName}</div>
                  <div style={styles.topUserMeta}>
                    {entityStats.post_count.toLocaleString()} posts Likely mood: {describeTopEmotion(entityStats.emotion_avg)}
                  </div>
                </div>
              ))}
            </div>
          )}
        </div>
      </div>
    </div>
  );
};

export default CulturalStats;

View File

@@ -9,6 +9,9 @@ type EmotionalStatsProps = {
const EmotionalStats = ({contentData}: EmotionalStatsProps) => {
const rows = contentData.average_emotion_by_topic ?? [];
const overallEmotionAverage = contentData.overall_emotion_average ?? [];
const dominantEmotionDistribution = contentData.dominant_emotion_distribution ?? [];
const emotionBySource = contentData.emotion_by_source ?? [];
const lowSampleThreshold = 20;
const stableSampleThreshold = 50;
const emotionKeys = rows.length
@@ -64,41 +67,106 @@ const EmotionalStats = ({contentData}: EmotionalStatsProps) => {
return (
<div style={styles.page}>
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
<h2 style={styles.sectionTitle}>Average Emotion by Topic</h2>
<p style={styles.sectionSubtitle}>Read confidence together with sample size. Topics with fewer than {lowSampleThreshold} events are usually noisy and less reliable.</p>
<h2 style={styles.sectionTitle}>Topic Mood Overview</h2>
<p style={styles.sectionSubtitle}>Use the strength score together with post count. Topics with fewer than {lowSampleThreshold} events are often noisy.</p>
<div style={styles.emotionalSummaryRow}>
<span><strong style={{ color: "#24292f" }}>Topics:</strong> {strongestPerTopic.length}</span>
<span><strong style={{ color: "#24292f" }}>Median Sample:</strong> {medianSampleSize} events</span>
<span><strong style={{ color: "#24292f" }}>Low Sample (&lt;{lowSampleThreshold}):</strong> {lowSampleTopics}</span>
<span><strong style={{ color: "#24292f" }}>Stable Sample ({stableSampleThreshold}+):</strong> {stableSampleTopics}</span>
<span><strong style={{ color: "#24292f" }}>Median Posts:</strong> {medianSampleSize}</span>
<span><strong style={{ color: "#24292f" }}>Small Topics (&lt;{lowSampleThreshold}):</strong> {lowSampleTopics}</span>
<span><strong style={{ color: "#24292f" }}>Stable Topics ({stableSampleThreshold}+):</strong> {stableSampleTopics}</span>
</div>
<p style={{ ...styles.sectionSubtitle, marginTop: 10, marginBottom: 0 }}>
Confidence reflects how strongly one emotion leads within a topic, not model accuracy. Use larger samples for stronger conclusions.
Strength means how far the top emotion is ahead in that topic. It does not mean model accuracy.
</p>
</div>
<div style={{ ...styles.container, ...styles.grid }}>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Mood Averages</h2>
<p style={styles.sectionSubtitle}>Average score for each emotion.</p>
{!overallEmotionAverage.length ? (
<div style={styles.topUserMeta}>No overall emotion averages available.</div>
) : (
<div style={{ ...styles.topUsersList, maxHeight: 260, overflowY: "auto" }}>
{[...overallEmotionAverage]
.sort((a, b) => b.score - a.score)
.map((row) => (
<div key={row.emotion} style={styles.topUserItem}>
<div style={styles.topUserName}>{formatEmotion(row.emotion)}</div>
<div style={styles.topUserMeta}>{row.score.toFixed(3)}</div>
</div>
))}
</div>
)}
</div>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Mood Split</h2>
<p style={styles.sectionSubtitle}>How often each emotion is dominant.</p>
{!dominantEmotionDistribution.length ? (
<div style={styles.topUserMeta}>No dominant-emotion split available.</div>
) : (
<div style={{ ...styles.topUsersList, maxHeight: 260, overflowY: "auto" }}>
{[...dominantEmotionDistribution]
.sort((a, b) => b.ratio - a.ratio)
.map((row) => (
<div key={row.emotion} style={styles.topUserItem}>
<div style={styles.topUserName}>{formatEmotion(row.emotion)}</div>
<div style={styles.topUserMeta}>{(row.ratio * 100).toFixed(1)}% {row.count.toLocaleString()} events</div>
</div>
))}
</div>
)}
</div>
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Mood by Source</h2>
<p style={styles.sectionSubtitle}>Leading emotion in each source.</p>
{!emotionBySource.length ? (
<div style={styles.topUserMeta}>No source emotion profile available.</div>
) : (
<div style={{ ...styles.topUsersList, maxHeight: 260, overflowY: "auto" }}>
{[...emotionBySource]
.sort((a, b) => b.event_count - a.event_count)
.map((row) => (
<div key={row.source} style={styles.topUserItem}>
<div style={styles.topUserName}>{row.source}</div>
<div style={styles.topUserMeta}>
{formatEmotion(row.dominant_emotion)} {row.dominant_score.toFixed(3)} {row.event_count.toLocaleString()} events
</div>
</div>
))}
</div>
)}
</div>
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Topic Snapshots</h2>
<p style={styles.sectionSubtitle}>Per-topic mood with strength and post count.</p>
<div style={{ ...styles.grid, marginTop: 10 }}>
{strongestPerTopic.map((topic) => (
<div key={topic.topic} style={{ ...styles.card, gridColumn: "span 4" }}>
<div key={topic.topic} style={{ ...styles.cardBase, gridColumn: "span 4" }}>
<h3 style={{ ...styles.sectionTitle, marginBottom: 6 }}>{topic.topic}</h3>
<div style={styles.emotionalTopicLabel}>
Top Emotion
Likely Mood
</div>
<div style={styles.emotionalTopicValue}>
{formatEmotion(topic.emotion)}
</div>
<div style={styles.emotionalMetricRow}>
<span>Confidence</span>
<span>Strength</span>
<span style={styles.emotionalMetricValue}>{topic.value.toFixed(3)}</span>
</div>
<div style={styles.emotionalMetricRowCompact}>
<span>Sample Size</span>
<span style={styles.emotionalMetricValue}>{topic.count} events</span>
<span>Posts in Topic</span>
<span style={styles.emotionalMetricValue}>{topic.count}</span>
</div>
</div>
))}
</div>
</div>
</div>
</div>
);
}

View File

@@ -0,0 +1,208 @@
import Card from "./Card";
import StatsStyling from "../styles/stats_styling";
import type { InteractionAnalysisResponse } from "../types/ApiTypes";
import {
ResponsiveContainer,
BarChart,
Bar,
XAxis,
YAxis,
CartesianGrid,
Tooltip,
PieChart,
Pie,
Cell,
Legend,
} from "recharts";
const styles = StatsStyling;

type InteractionalStatsProps = {
  data: InteractionAnalysisResponse;
};

// Fill colours for the concentration pie slices, applied in slice order.
const PIE_COLORS = ["#2b6777", "#c8d8e4"];

// Single placeholder for every unavailable metric so all cards read the same.
const MISSING_VALUE = "—";

/** Returns `value` when it is a number, otherwise `null`. */
const numberOrNull = (value: unknown): number | null => (
  typeof value === "number" ? value : null
);

/**
 * Interaction analysis page: reply-graph size cards, comment-concentration
 * cards, a bar chart of the top reply pairs, a pie chart of the top-10% comment
 * share, and a scrollable list of the most frequent reply paths.
 */
const InteractionalStats = ({ data }: InteractionalStatsProps) => {
  // Graph shape as used below: source user -> (target user -> reply count).
  const graph = data.interaction_graph ?? {};
  const userCount = Object.keys(graph).length;
  const edges = Object.values(graph).flatMap((targets) => Object.values(targets));
  const edgeCount = edges.length;
  const interactionVolume = edges.reduce((sum, value) => sum + value, 0);

  // Every concentration field is optional, so each is narrowed to number | null.
  const concentration = data.conversation_concentration;
  const topTenCommentShare = numberOrNull(concentration?.top_10pct_comment_share);
  const topTenAuthorCount = numberOrNull(concentration?.top_10pct_author_count);
  const totalCommentingAuthors = numberOrNull(concentration?.total_commenting_authors);
  const singleCommentAuthorRatio = numberOrNull(concentration?.single_comment_author_ratio);
  const singleCommentAuthors = numberOrNull(concentration?.single_comment_authors);

  // Keep only well-formed [[source, target], count] entries, capped at 20.
  const topPairs = (data.top_interaction_pairs ?? [])
    .filter((item): item is [[string, string], number] => {
      if (!Array.isArray(item) || item.length !== 2) {
        return false;
      }

      const pair = item[0];
      const count = item[1];

      return Array.isArray(pair)
        && pair.length === 2
        && typeof pair[0] === "string"
        && typeof pair[1] === "string"
        && typeof count === "number";
    })
    .slice(0, 20);

  // The bar chart plots only the first 8 pairs, labelled by rank on the axis.
  const topPairChartData = topPairs.slice(0, 8).map(([[source, target], value], index) => ({
    pair: `${source} -> ${target}`,
    replies: value,
    rank: index + 1,
  }));

  const topTenSharePercent = topTenCommentShare === null
    ? null
    : topTenCommentShare * 100;
  // Clamped at 0 so a share above 100% cannot yield a negative slice.
  const nonTopTenSharePercent = topTenSharePercent === null
    ? null
    : Math.max(0, 100 - topTenSharePercent);

  let concentrationPieData: { name: string; value: number }[] = [];
  if (topTenSharePercent !== null && nonTopTenSharePercent !== null) {
    concentrationPieData = [
      { name: "Top 10% authors", value: topTenSharePercent },
      { name: "Other authors", value: nonTopTenSharePercent },
    ];
  }

  return (
    <div style={styles.page}>
      <div style={{ ...styles.container, ...styles.grid }}>
        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Conversation Overview</h2>
          <p style={styles.sectionSubtitle}>Who talks to who, and how concentrated the replies are.</p>
        </div>

        <Card
          label="Average Reply Depth"
          value={typeof data.average_thread_depth === "number" ? data.average_thread_depth.toFixed(2) : MISSING_VALUE}
          sublabel="How deep reply chains usually go"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Users in Network"
          value={userCount.toLocaleString()}
          sublabel="Users in the reply graph"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="User-to-User Links"
          value={edgeCount.toLocaleString()}
          sublabel="Unique reply directions"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Total Replies"
          value={interactionVolume.toLocaleString()}
          sublabel="All reply links combined"
          style={{ gridColumn: "span 3" }}
        />
        <Card
          label="Concentrated Replies"
          value={topTenSharePercent === null ? MISSING_VALUE : `${topTenSharePercent.toFixed(1)}%`}
          sublabel={topTenAuthorCount === null || totalCommentingAuthors === null
            ? "Reply share from the top 10% commenters"
            : `${topTenAuthorCount.toLocaleString()} of ${totalCommentingAuthors.toLocaleString()} authors`}
          style={{ gridColumn: "span 6" }}
        />
        <Card
          label="Single-Comment Authors"
          value={singleCommentAuthorRatio === null ? MISSING_VALUE : `${(singleCommentAuthorRatio * 100).toFixed(1)}%`}
          sublabel={singleCommentAuthors === null
            ? "Authors who commented exactly once"
            : `${singleCommentAuthors.toLocaleString()} authors commented exactly once`}
          style={{ gridColumn: "span 6" }}
        />

        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Conversation Visuals</h2>
          <p style={styles.sectionSubtitle}>Main reply links and concentration split.</p>

          <div style={{ ...styles.grid, marginTop: 12 }}>
            <div style={{ ...styles.cardBase, gridColumn: "span 6" }}>
              <h3 style={{ ...styles.sectionTitle, fontSize: "1rem" }}>Top Interaction Pairs</h3>
              {/* Show a message instead of an empty set of axes when there are no pairs. */}
              {topPairChartData.length === 0 ? (
                <div style={styles.topUserMeta}>No interaction pair data available.</div>
              ) : (
                <div style={{ width: "100%", height: 300 }}>
                  <ResponsiveContainer>
                    <BarChart data={topPairChartData} layout="vertical" margin={{ top: 8, right: 16, left: 16, bottom: 8 }}>
                      <CartesianGrid strokeDasharray="3 3" stroke="#d9e2ec" />
                      <XAxis type="number" allowDecimals={false} />
                      <YAxis
                        type="category"
                        dataKey="rank"
                        tickFormatter={(value) => `#${value}`}
                        width={36}
                      />
                      <Tooltip />
                      <Bar dataKey="replies" fill="#2b6777" radius={[0, 6, 6, 0]} />
                    </BarChart>
                  </ResponsiveContainer>
                </div>
              )}
            </div>

            <div style={{ ...styles.cardBase, gridColumn: "span 6" }}>
              <h3 style={{ ...styles.sectionTitle, fontSize: "1rem" }}>Top 10% vs Other Comment Share</h3>
              {/* The pie has no slices when the comment share is unavailable. */}
              {concentrationPieData.length === 0 ? (
                <div style={styles.topUserMeta}>No concentration data available.</div>
              ) : (
                <div style={{ width: "100%", height: 300 }}>
                  <ResponsiveContainer>
                    <PieChart>
                      <Pie
                        data={concentrationPieData}
                        dataKey="value"
                        nameKey="name"
                        innerRadius={56}
                        outerRadius={88}
                        paddingAngle={2}
                      >
                        {concentrationPieData.map((entry, index) => (
                          <Cell key={`${entry.name}-${index}`} fill={PIE_COLORS[index % PIE_COLORS.length]} />
                        ))}
                      </Pie>
                      <Tooltip />
                      <Legend verticalAlign="bottom" height={36} />
                    </PieChart>
                  </ResponsiveContainer>
                </div>
              )}
            </div>
          </div>
        </div>

        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Frequent Reply Paths</h2>
          <p style={styles.sectionSubtitle}>Most common user-to-user reply paths.</p>
          {!topPairs.length ? (
            <div style={styles.topUserMeta}>No interaction pair data available.</div>
          ) : (
            <div style={{ ...styles.topUsersList, maxHeight: 420, overflowY: "auto" }}>
              {topPairs.map(([[source, target], value], index) => (
                <div key={`${source}->${target}-${index}`} style={styles.topUserItem}>
                  <div style={styles.topUserName}>{source} -&gt; {target}</div>
                  <div style={styles.topUserMeta}>{value.toLocaleString()} replies</div>
                </div>
              ))}
            </div>
          )}
        </div>
      </div>
    </div>
  );
};

export default InteractionalStats;

View File

@@ -0,0 +1,91 @@
import Card from "./Card";
import StatsStyling from "../styles/stats_styling";
import type { LinguisticAnalysisResponse } from "../types/ApiTypes";
const styles = StatsStyling;

type LinguisticStatsProps = {
  data: LinguisticAnalysisResponse;
};

// One row of a ranked list: the word or phrase and how many times it was used.
type RankedEntry = {
  label: string;
  count: number;
};

type RankedListProps = {
  title: string;
  subtitle: string;
  emptyMessage: string;
  entries: RankedEntry[];
};

/**
 * One-third-width card listing ranked words or phrases with their use counts.
 * Renders `emptyMessage` instead of a blank scroll area when `entries` is empty.
 */
const RankedList = ({ title, subtitle, emptyMessage, entries }: RankedListProps) => (
  <div style={{ ...styles.card, gridColumn: "span 4" }}>
    <h2 style={styles.sectionTitle}>{title}</h2>
    <p style={styles.sectionSubtitle}>{subtitle}</p>
    {entries.length === 0 ? (
      <div style={styles.topUserMeta}>{emptyMessage}</div>
    ) : (
      <div style={{ ...styles.topUsersList, maxHeight: 360, overflowY: "auto" }}>
        {entries.map((entry) => (
          <div key={entry.label} style={styles.topUserItem}>
            <div style={styles.topUserName}>{entry.label}</div>
            <div style={styles.topUserMeta}>{entry.count.toLocaleString()} uses</div>
          </div>
        ))}
      </div>
    )}
  </div>
);

/**
 * Linguistic analysis page: lexical-diversity summary cards followed by the
 * first 20 words, first 10 bigrams and first 10 trigrams from the response.
 */
const LinguisticStats = ({ data }: LinguisticStatsProps) => {
  const lexical = data.lexical_diversity;

  // Lists are truncated in the order the response supplies them; no re-sorting here.
  const topWords: RankedEntry[] = (data.word_frequencies ?? [])
    .slice(0, 20)
    .map((item) => ({ label: item.word, count: item.count }));
  const topBigrams: RankedEntry[] = (data.common_two_phrases ?? [])
    .slice(0, 10)
    .map((item) => ({ label: item.ngram, count: item.count }));
  const topTrigrams: RankedEntry[] = (data.common_three_phrases ?? [])
    .slice(0, 10)
    .map((item) => ({ label: item.ngram, count: item.count }));

  return (
    <div style={styles.page}>
      <div style={{ ...styles.container, ...styles.grid }}>
        <div style={{ ...styles.card, gridColumn: "span 12" }}>
          <h2 style={styles.sectionTitle}>Language Overview</h2>
          <p style={styles.sectionSubtitle}>Quick read on how broad and repetitive the wording is.</p>
        </div>

        <Card
          label="Total Words"
          value={lexical?.total_tokens?.toLocaleString() ?? "—"}
          sublabel="Words after basic filtering"
          style={{ gridColumn: "span 4" }}
        />
        <Card
          label="Unique Words"
          value={lexical?.unique_tokens?.toLocaleString() ?? "—"}
          sublabel="Different words used"
          style={{ gridColumn: "span 4" }}
        />
        <Card
          label="Vocabulary Variety"
          value={typeof lexical?.ttr === "number" ? lexical.ttr.toFixed(4) : "—"}
          sublabel="Higher means less repetition"
          style={{ gridColumn: "span 4" }}
        />

        <RankedList
          title="Top Words"
          subtitle="Most used single words."
          emptyMessage="No word frequency data available."
          entries={topWords}
        />
        <RankedList
          title="Top Bigrams"
          subtitle="Most used 2-word phrases."
          emptyMessage="No bigram data available."
          entries={topBigrams}
        />
        <RankedList
          title="Top Trigrams"
          subtitle="Most used 3-word phrases."
          emptyMessage="No trigram data available."
          entries={topTrigrams}
        />
      </div>
    </div>
  );
};

export default LinguisticStats;

View File

@@ -58,15 +58,13 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
const [selectedUser, setSelectedUser] = useState<string | null>(null);
const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null;
console.log(summary)
return (
<div style={styles.page}>
{/* main grid*/}
<div style={{ ...styles.container, ...styles.grid}}>
<Card
label="Total Events"
label="Total Activity"
value={summary?.total_events ?? "—"}
sublabel="Posts + comments"
style={{
@@ -74,15 +72,15 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
}}
/>
<Card
label="Unique Users"
label="Active People"
value={summary?.unique_users ?? "—"}
sublabel="Distinct authors"
sublabel="Distinct users"
style={{
gridColumn: "span 4"
}}
/>
<Card
label="Posts / Comments"
label="Posts vs Comments"
value={
summary
? `${summary.total_posts} / ${summary.total_comments}`
@@ -108,13 +106,13 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
/>
<Card
label="Lurker Ratio"
label="One-Time Users"
value={
typeof summary?.lurker_ratio === "number"
? `${Math.round(summary.lurker_ratio * 100)}%`
: "—"
}
sublabel="Users with only 1 event"
sublabel="Users with only one event"
style={{
gridColumn: "span 4"
}}
@@ -136,12 +134,12 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* events per day */}
<div style={{ ...styles.card, gridColumn: "span 5" }}>
<h2 style={styles.sectionTitle}>Events per Day</h2>
<p style={styles.sectionSubtitle}>Trend of activity over time</p>
<h2 style={styles.sectionTitle}>Activity Over Time</h2>
<p style={styles.sectionSubtitle}>How much posting happened each day.</p>
<div style={styles.chartWrapper}>
<ResponsiveContainer width="100%" height="100%">
<LineChart data={timeData?.events_per_day.filter((d) => new Date(d.date) >= new Date('2026-01-10'))}>
<LineChart data={timeData?.events_per_day ?? []}>
<CartesianGrid strokeDasharray="3 3" />
<XAxis dataKey="date" />
<YAxis />
@@ -154,8 +152,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* Word Cloud */}
<div style={{ ...styles.card, gridColumn: "span 4" }}>
<h2 style={styles.sectionTitle}>Word Cloud</h2>
<p style={styles.sectionSubtitle}>Most common terms across events</p>
<h2 style={styles.sectionTitle}>Common Words</h2>
<p style={styles.sectionSubtitle}>Frequently used words across the dataset.</p>
<div style={styles.chartWrapper}>
<ReactWordcloud
@@ -174,8 +172,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
<div style={{...styles.card, ...styles.scrollArea, gridColumn: "span 3",
}}
>
<h2 style={styles.sectionTitle}>Top Users</h2>
<p style={styles.sectionSubtitle}>Most active authors</p>
<h2 style={styles.sectionTitle}>Most Active Users</h2>
<p style={styles.sectionSubtitle}>Who posted the most events.</p>
<div style={styles.topUsersList}>
{userData?.top_users.slice(0, 100).map((item) => (
@@ -195,8 +193,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* Heatmap */}
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>Heatmap</h2>
<p style={styles.sectionSubtitle}>Activity density across time</p>
<h2 style={styles.sectionTitle}>Weekly Activity Pattern</h2>
<p style={styles.sectionSubtitle}>When activity tends to happen by weekday and hour.</p>
<div style={styles.heatmapWrapper}>
<ActivityHeatmap data={timeData?.weekday_hour_heatmap ?? []} />

View File

@@ -12,6 +12,9 @@ type Props = {
};
export default function UserModal({ open, onClose, userData, username }: Props) {
const dominantEmotionEntry = Object.entries(userData?.avg_emotions ?? {})
.sort((a, b) => b[1] - a[1])[0];
return (
<Dialog open={open} onClose={onClose} style={styles.modalRoot}>
<div style={styles.modalBackdrop} />
@@ -66,6 +69,15 @@ export default function UserModal({ open, onClose, userData, username }: Props)
</div>
</div>
) : null}
{dominantEmotionEntry ? (
<div style={styles.topUserItem}>
<div style={styles.topUserName}>Dominant Avg Emotion</div>
<div style={styles.topUserMeta}>
{dominantEmotionEntry[0].replace("emotion_", "")} ({dominantEmotionEntry[1].toFixed(3)})
</div>
</div>
) : null}
</div>
)}
</DialogPanel>

View File

@@ -87,15 +87,15 @@ const UserStats = (props: { data: UserAnalysisResponse }) => {
style={{ gridColumn: "span 3" }}
/>
<Card
label="Interactions"
label="Replies"
value={totalInteractions.toLocaleString()}
sublabel="Filtered links (2+ interactions)"
sublabel="Links with at least 2 replies"
style={{ gridColumn: "span 3" }}
/>
<Card
label="Average Intensity"
label="Replies per Connected User"
value={avgInteractionsPerConnectedUser.toFixed(1)}
sublabel="Interactions per connected user"
sublabel="Average from visible graph links"
style={{ gridColumn: "span 3" }}
/>
<Card
@@ -106,13 +106,13 @@ const UserStats = (props: { data: UserAnalysisResponse }) => {
/>
<Card
label="Strongest Connection"
label="Strongest User Link"
value={strongestLink ? `${strongestLink.source} -> ${strongestLink.target}` : "—"}
sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} interactions` : "No graph edges after filtering"}
sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} replies` : "No graph links after filtering"}
style={{ gridColumn: "span 6" }}
/>
<Card
label="Most Reply-Driven User"
label="Most Comment-Heavy User"
value={highlyInteractiveUser?.author ?? "—"}
sublabel={
highlyInteractiveUser
@@ -125,7 +125,7 @@ const UserStats = (props: { data: UserAnalysisResponse }) => {
<div style={{ ...styles.card, gridColumn: "span 12" }}>
<h2 style={styles.sectionTitle}>User Interaction Graph</h2>
<p style={styles.sectionSubtitle}>
Nodes represent users and links represent conversation interactions.
Each node is a user, and each link shows replies between them.
</p>
<div ref={graphContainerRef} style={{ width: "100%", height: graphSize.height }}>
<ForceGraph3D

View File

@@ -191,6 +191,9 @@ const AutoScrapePage = () => {
<p style={styles.sectionHeaderSubtitle}>
Select sources and scrape settings, then queue processing automatically.
</p>
<p style={{ ...styles.subtleBodyText, marginTop: 6, color: "#9a6700" }}>
Warning: Scraping more than 250 posts from any single site can take hours due to rate limits.
</p>
</div>
<button
type="button"

View File

@@ -5,26 +5,42 @@ import StatsStyling from "../styles/stats_styling";
import SummaryStats from "../components/SummaryStats";
import EmotionalStats from "../components/EmotionalStats";
import UserStats from "../components/UserStats";
import LinguisticStats from "../components/LinguisticStats";
import InteractionalStats from "../components/InteractionalStats";
import CulturalStats from "../components/CulturalStats";
import {
type SummaryResponse,
type UserAnalysisResponse,
type TimeAnalysisResponse,
type ContentAnalysisResponse
type ContentAnalysisResponse,
type UserEndpointResponse,
type LinguisticAnalysisResponse,
type EmotionalAnalysisResponse,
type InteractionAnalysisResponse,
type CulturalAnalysisResponse
} from '../types/ApiTypes'
const API_BASE_URL = import.meta.env.VITE_BACKEND_URL
const styles = StatsStyling;
const DELETED_USERS = ["[deleted]"];
const isDeletedUser = (value: string | null | undefined) => (
DELETED_USERS.includes((value ?? "").trim().toLowerCase())
);
const StatPage = () => {
const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>();
const [error, setError] = useState('');
const [loading, setLoading] = useState(false);
const [activeView, setActiveView] = useState<"summary" | "emotional" | "user">("summary");
const [activeView, setActiveView] = useState<"summary" | "emotional" | "user" | "linguistic" | "interactional" | "cultural">("summary");
const [userData, setUserData] = useState<UserAnalysisResponse | null>(null);
const [timeData, setTimeData] = useState<TimeAnalysisResponse | null>(null);
const [contentData, setContentData] = useState<ContentAnalysisResponse | null>(null);
const [linguisticData, setLinguisticData] = useState<LinguisticAnalysisResponse | null>(null);
const [interactionData, setInteractionData] = useState<InteractionAnalysisResponse | null>(null);
const [culturalData, setCulturalData] = useState<CulturalAnalysisResponse | null>(null);
const [summary, setSummary] = useState<SummaryResponse | null>(null);
@@ -83,15 +99,23 @@ const StatPage = () => {
setLoading(true);
Promise.all([
axios.get<TimeAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/time`, {
axios.get<TimeAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/temporal`, {
params,
headers: authHeaders,
}),
axios.get<UserAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/user`, {
axios.get<UserEndpointResponse>(`${API_BASE_URL}/dataset/${datasetId}/user`, {
params,
headers: authHeaders,
}),
axios.get<ContentAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/content`, {
axios.get<LinguisticAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/linguistic`, {
params,
headers: authHeaders,
}),
axios.get<EmotionalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/emotional`, {
params,
headers: authHeaders,
}),
axios.get<InteractionAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/interactional`, {
params,
headers: authHeaders,
}),
@@ -99,12 +123,87 @@ const StatPage = () => {
params,
headers: authHeaders,
}),
axios.get<CulturalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/cultural`, {
params,
headers: authHeaders,
}),
])
.then(([timeRes, userRes, contentRes, summaryRes]) => {
setUserData(userRes.data || null);
.then(([timeRes, userRes, linguisticRes, emotionalRes, interactionRes, summaryRes, culturalRes]) => {
const usersList = userRes.data.users ?? [];
const topUsersList = userRes.data.top_users ?? [];
const interactionGraphRaw = interactionRes.data?.interaction_graph ?? {};
const topPairsRaw = interactionRes.data?.top_interaction_pairs ?? [];
const filteredUsers: typeof usersList = [];
for (const user of usersList) {
if (isDeletedUser(user.author)) continue;
filteredUsers.push(user);
}
const filteredTopUsers: typeof topUsersList = [];
for (const user of topUsersList) {
if (isDeletedUser(user.author)) continue;
filteredTopUsers.push(user);
}
const filteredInteractionGraph: Record<string, Record<string, number>> = {};
for (const [source, targets] of Object.entries(interactionGraphRaw)) {
if (isDeletedUser(source)) {
continue;
}
const nextTargets: Record<string, number> = {};
for (const [target, count] of Object.entries(targets)) {
if (isDeletedUser(target)) {
continue;
}
nextTargets[target] = count;
}
filteredInteractionGraph[source] = nextTargets;
}
const filteredTopInteractionPairs: typeof topPairsRaw = [];
for (const pairEntry of topPairsRaw) {
const pair = pairEntry[0];
const source = pair[0];
const target = pair[1];
if (isDeletedUser(source) || isDeletedUser(target)) {
continue;
}
filteredTopInteractionPairs.push(pairEntry);
}
const combinedUserData: UserAnalysisResponse = {
...userRes.data,
users: filteredUsers,
top_users: filteredTopUsers,
interaction_graph: filteredInteractionGraph,
};
const combinedContentData: ContentAnalysisResponse = {
...linguisticRes.data,
...emotionalRes.data,
};
const filteredInteractionData: InteractionAnalysisResponse = {
...interactionRes.data,
interaction_graph: filteredInteractionGraph,
top_interaction_pairs: filteredTopInteractionPairs,
};
const filteredSummary: SummaryResponse = {
...summaryRes.data,
unique_users: filteredUsers.length,
};
setUserData(combinedUserData);
setTimeData(timeRes.data || null);
setContentData(contentRes.data || null);
setSummary(summaryRes.data || null);
setContentData(combinedContentData);
setLinguisticData(linguisticRes.data || null);
setInteractionData(filteredInteractionData || null);
setCulturalData(culturalRes.data || null);
setSummary(filteredSummary || null);
})
.catch((e) => setError("Failed to load statistics: " + String(e)))
.finally(() => setLoading(false));
@@ -198,7 +297,7 @@ return (
<div style={styles.dashboardMeta}>Dataset #{datasetId ?? "-"}</div>
</div>
<div style={{ ...styles.container, ...styles.tabsRow }}>
<div style={{ ...styles.container, ...styles.tabsRow, justifyContent: "center" }}>
<button
onClick={() => setActiveView("summary")}
style={activeView === "summary" ? styles.buttonPrimary : styles.buttonSecondary}
@@ -218,6 +317,24 @@ return (
>
Users
</button>
<button
onClick={() => setActiveView("linguistic")}
style={activeView === "linguistic" ? styles.buttonPrimary : styles.buttonSecondary}
>
Linguistic
</button>
<button
onClick={() => setActiveView("interactional")}
style={activeView === "interactional" ? styles.buttonPrimary : styles.buttonSecondary}
>
Interactional
</button>
<button
onClick={() => setActiveView("cultural")}
style={activeView === "cultural" ? styles.buttonPrimary : styles.buttonSecondary}
>
Cultural
</button>
</div>
{activeView === "summary" && (
@@ -243,6 +360,36 @@ return (
<UserStats data={userData} />
)}
{activeView === "linguistic" && linguisticData && (
<LinguisticStats data={linguisticData} />
)}
{activeView === "linguistic" && !linguisticData && (
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
No linguistic data available.
</div>
)}
{activeView === "interactional" && interactionData && (
<InteractionalStats data={interactionData} />
)}
{activeView === "interactional" && !interactionData && (
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
No interactional data available.
</div>
)}
{activeView === "cultural" && culturalData && (
<CulturalStats data={culturalData} />
)}
{activeView === "cultural" && !culturalData && (
<div style={{ ...styles.container, ...styles.card, marginTop: 16 }}>
No cultural data available.
</div>
)}
</div>
);
}

View File

@@ -1,14 +1,28 @@
// User Responses
type TopUser = {
author: string;
source: string;
count: number
};
// Shared types
type FrequencyWord = {
word: string;
count: number;
}
};
type NGram = {
count: number;
ngram: string;
};
type Emotion = {
emotion_anger: number;
emotion_disgust: number;
emotion_fear: number;
emotion_joy: number;
emotion_sadness: number;
};
// User
type TopUser = {
author: string;
source: string;
count: number;
};
type Vocab = {
author: string;
@@ -26,60 +40,145 @@ type User = {
comment: number;
comment_post_ratio: number;
comment_share: number;
avg_emotions?: Record<string, number>;
vocab?: Vocab | null;
};
type InteractionGraph = Record<string, Record<string, number>>;
type UserEndpointResponse = {
top_users: TopUser[];
users: User[];
};
type UserAnalysisResponse = {
top_users: TopUser[];
users: User[];
interaction_graph: InteractionGraph;
};
// Time Analysis
// Time
type EventsPerDay = {
date: Date;
count: number;
}
};
type HeatmapCell = {
date: Date;
hour: number;
count: number;
}
};
type TimeAnalysisResponse = {
events_per_day: EventsPerDay[];
weekday_hour_heatmap: HeatmapCell[];
}
// Content Analysis
type Emotion = {
emotion_anger: number;
emotion_disgust: number;
emotion_fear: number;
emotion_joy: number;
emotion_sadness: number;
};
type NGram = {
count: number;
ngram: string;
}
// Content (combines emotional and linguistic)
type AverageEmotionByTopic = Emotion & {
n: number;
topic: string;
[key: string]: string | number;
};
type OverallEmotionAverage = {
emotion: string;
score: number;
};
type DominantEmotionDistribution = {
emotion: string;
count: number;
ratio: number;
};
type EmotionBySource = {
source: string;
dominant_emotion: string;
dominant_score: number;
event_count: number;
};
type ContentAnalysisResponse = {
word_frequencies: FrequencyWord[];
average_emotion_by_topic: AverageEmotionByTopic[];
common_three_phrases: NGram[];
common_two_phrases: NGram[];
}
overall_emotion_average?: OverallEmotionAverage[];
dominant_emotion_distribution?: DominantEmotionDistribution[];
emotion_by_source?: EmotionBySource[];
};
// Linguistic
type LinguisticAnalysisResponse = {
word_frequencies: FrequencyWord[];
common_two_phrases: NGram[];
common_three_phrases: NGram[];
lexical_diversity?: Record<string, number>;
};
// Emotional
type EmotionalAnalysisResponse = {
average_emotion_by_topic: AverageEmotionByTopic[];
overall_emotion_average?: OverallEmotionAverage[];
dominant_emotion_distribution?: DominantEmotionDistribution[];
emotion_by_source?: EmotionBySource[];
};
// Interactional
// How unevenly commenting activity is spread across authors.
// NOTE(review): the backend returns an empty object when the dataset has no
// comments, so every field below may be missing at runtime even though the
// type marks them as required - guard before reading.
type ConversationConcentration = {
// Number of distinct authors with at least one comment.
total_commenting_authors: number;
// Size of the "top 10%" author group: 10% of the authors rounded down, minimum 1.
top_10pct_author_count: number;
// Fraction (0-1, 4 decimals) of all comments written by that top group.
top_10pct_comment_share: number;
// Number of authors who wrote exactly one comment.
single_comment_authors: number;
// single_comment_authors divided by total_commenting_authors (4 decimals).
single_comment_author_ratio: number;
};
// Payload of the `/dataset/:id/interactional` endpoint.
type InteractionAnalysisResponse = {
// Average thread depth as computed by the backend.
average_thread_depth?: number;
// Entries of the form [[authorA, authorB], count], sorted by count descending.
// The backend currently asks for at most 100 pairs.
top_interaction_pairs?: [[string, string], number][];
// Comment-concentration metrics; may arrive as an empty object.
conversation_concentration?: ConversationConcentration;
// Nested map: outer author -> inner author -> interaction count.
// NOTE(review): direction (who replied to whom) is not visible here - confirm against the backend.
interaction_graph: InteractionGraph;
};
// Cultural
type IdentityMarkers = {
in_group_usage: number;
out_group_usage: number;
in_group_ratio: number;
out_group_ratio: number;
in_group_posts: number;
out_group_posts: number;
tie_posts: number;
in_group_emotion_avg?: Record<string, number>;
out_group_emotion_avg?: Record<string, number>;
};
type StanceMarkers = {
hedge_total: number;
certainty_total: number;
deontic_total: number;
permission_total: number;
hedge_per_1k_tokens: number;
certainty_per_1k_tokens: number;
deontic_per_1k_tokens: number;
permission_per_1k_tokens: number;
};
type EntityEmotionAggregate = {
post_count: number;
emotion_avg: Record<string, number>;
};
type AverageEmotionPerEntity = {
entity_emotion_avg: Record<string, EntityEmotionAggregate>;
};
type CulturalAnalysisResponse = {
identity_markers?: IdentityMarkers;
stance_markers?: StanceMarkers;
avg_emotion_per_entity?: AverageEmotionPerEntity;
};
// Summary
type SummaryResponse = {
@@ -96,22 +195,35 @@ type SummaryResponse = {
sources: string[];
};
// Filtering Response
// Filter
type FilterResponse = {
rows: number
rows: number;
data: any;
}
};
export type {
TopUser,
Vocab,
User,
InteractionGraph,
ConversationConcentration,
UserAnalysisResponse,
UserEndpointResponse,
FrequencyWord,
AverageEmotionByTopic,
OverallEmotionAverage,
DominantEmotionDistribution,
EmotionBySource,
SummaryResponse,
TimeAnalysisResponse,
ContentAnalysisResponse,
FilterResponse
}
LinguisticAnalysisResponse,
EmotionalAnalysisResponse,
InteractionAnalysisResponse,
IdentityMarkers,
StanceMarkers,
EntityEmotionAggregate,
AverageEmotionPerEntity,
CulturalAnalysisResponse,
FilterResponse,
};

View File

@@ -1,33 +1,86 @@
import pandas as pd
class EmotionalAnalysis:
def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
emotion_cols = [
col for col in df.columns
if col.startswith("emotion_")
]
def _emotion_cols(self, df: pd.DataFrame) -> list[str]:
return [col for col in df.columns if col.startswith("emotion_")]
def avg_emotion_by_topic(self, df: pd.DataFrame) -> list[dict]:
emotion_cols = self._emotion_cols(df)
if not emotion_cols:
return []
counts = (
df[
(df["topic"] != "Misc")
]
.groupby("topic")
.size()
.rename("n")
df[(df["topic"] != "Misc")].groupby("topic").size().reset_index(name="n")
)
avg_emotion_by_topic = (
df[
(df["topic"] != "Misc")
]
df[(df["topic"] != "Misc")]
.groupby("topic")[emotion_cols]
.mean()
.reset_index()
)
avg_emotion_by_topic = avg_emotion_by_topic.merge(
counts,
on="topic"
avg_emotion_by_topic = avg_emotion_by_topic.merge(counts, on="topic")
return avg_emotion_by_topic.to_dict(orient="records")
def overall_emotion_average(self, df: pd.DataFrame) -> list[dict]:
    """Return the mean score of every ``emotion_*`` column.

    Args:
        df: Event DataFrame; emotion scores live in columns whose names
            start with ``emotion_``.

    Returns:
        One ``{"emotion": <name without prefix>, "score": <float>}`` dict
        per emotion column, in column order. An empty list is returned
        when there are no emotion columns or no rows.
    """
    emotion_cols = self._emotion_cols(df)
    # Same guard as the sibling methods: the mean of zero rows is NaN, and
    # NaN does not survive a round trip through strict JSON.
    if not emotion_cols or df.empty:
        return []

    # A column holding only missing values also averages to NaN; report 0.0
    # instead, matching how per-user emotion averages are filled.
    means = df[emotion_cols].mean().fillna(0.0)

    return [
        {
            "emotion": col.replace("emotion_", ""),
            "score": float(means[col]),
        }
        for col in emotion_cols
    ]
def dominant_emotion_distribution(self, df: pd.DataFrame) -> list[dict]:
    """Count how often each emotion is the strongest one in a row.

    Args:
        df: Event DataFrame with numeric ``emotion_*`` score columns.

    Returns:
        A list of ``{"emotion", "count", "ratio"}`` dicts ordered by
        ``count`` descending. ``ratio`` is the count divided by the number
        of rows that have at least one emotion score, rounded to 4
        decimals. Empty when there is nothing to score.
    """
    emotion_cols = self._emotion_cols(df)
    if not emotion_cols or df.empty:
        return []

    # A row without any score has no dominant emotion. Feeding it to idxmax
    # is deprecated in pandas, and counting it in the denominator would make
    # the ratios sum to less than 1.
    scored = df[emotion_cols].dropna(how="all")
    if scored.empty:
        return []

    dominant_per_row = scored.idxmax(axis=1)
    counts = dominant_per_row.value_counts()
    total = len(dominant_per_row)

    return [
        {
            "emotion": col.replace("emotion_", ""),
            "count": int(count),
            "ratio": round(float(count / total), 4),
        }
        for col, count in counts.items()
    ]
def emotion_by_source(self, df: pd.DataFrame) -> list[dict]:
emotion_cols = self._emotion_cols(df)
if not emotion_cols or "source" not in df.columns or df.empty:
return []
source_counts = df.groupby("source").size()
source_means = df.groupby("source")[emotion_cols].mean().reset_index()
rows = source_means.to_dict(orient="records")
output = []
for row in rows:
source = row["source"]
dominant_col = max(emotion_cols, key=lambda col: float(row.get(col, 0)))
output.append(
{
"source": str(source),
"dominant_emotion": dominant_col.replace("emotion_", ""),
"dominant_score": round(float(row.get(dominant_col, 0)), 4),
"event_count": int(source_counts.get(source, 0)),
}
)
return avg_emotion_by_topic.to_dict(orient='records')
return output

View File

@@ -1,9 +1,6 @@
import pandas as pd
import re
from collections import Counter
class InteractionAnalysis:
def __init__(self, word_exclusions: set[str]):
self.word_exclusions = word_exclusions
@@ -12,118 +9,6 @@ class InteractionAnalysis:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user(
self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
) -> list:
df = df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize)
rows = []
for author, group in df.groupby("author"):
all_tokens = [t for tokens in group["tokens"] for t in tokens]
total_words = len(all_tokens)
unique_words = len(set(all_tokens))
events = len(group)
# Min amount of words for a user, any less than this might give weird results
if total_words < min_words:
continue
# 100% = they never reused a word (excluding stop words)
vocab_richness = unique_words / total_words
avg_words = total_words / max(events, 1)
counts = Counter(all_tokens)
top_words = [
{"word": w, "count": int(c)}
for w, c in counts.most_common(top_most_used_words)
]
rows.append(
{
"author": author,
"events": int(events),
"total_words": int(total_words),
"unique_words": int(unique_words),
"vocab_richness": round(vocab_richness, 3),
"avg_words_per_event": round(avg_words, 2),
"top_words": top_words,
}
)
rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
return rows
def top_users(self, df: pd.DataFrame) -> list:
counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
top_users = [
{"author": author, "source": source, "count": int(count)}
for (author, source), count in counts.items()
]
return top_users
def per_user_analysis(self, df: pd.DataFrame) -> dict:
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
avg_emotions_by_author = {}
if emotion_cols:
avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows()
}
# ensure columns always exist
for col in ("post", "comment"):
if col not in per_user.columns:
per_user[col] = 0
per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
0, 1
)
per_user["comment_share"] = per_user["comment"] / (
per_user["post"] + per_user["comment"]
).replace(0, 1)
per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records")
vocab_rows = self._vocab_richness_per_user(df)
vocab_by_author = {row["author"]: row for row in vocab_rows}
# merge vocab richness + per_user information
merged_users = []
for row in per_user_records:
author = row["author"]
merged_users.append(
{
"author": author,
"post": int(row.get("post", 0)),
"comment": int(row.get("comment", 0)),
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}),
"vocab": vocab_by_author.get(
author,
{
"vocab_richness": 0,
"avg_words_per_event": 0,
"top_words": [],
},
),
}
)
merged_users.sort(key=lambda u: u["comment_post_ratio"])
return merged_users
def interaction_graph(self, df: pd.DataFrame):
interactions = {a: {} for a in df["author"].dropna().unique()}
@@ -167,67 +52,36 @@ class InteractionAnalysis:
return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self, df: pd.DataFrame):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
def top_interaction_pairs(self, df: pd.DataFrame, top_n=10):
graph = self.interaction_graph(df)
pairs = []
emotion_cols = [
c
for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions
]
for a, targets in graph.items():
for b, count in targets.items():
pairs.append(((a, b), count))
id_to_reply = df.set_index("id")["reply_to"].to_dict()
length_cache = {}
pairs.sort(key=lambda x: x[1], reverse=True)
return pairs[:top_n]
def thread_length_from(start_id):
if start_id in length_cache:
return length_cache[start_id]
def conversation_concentration(self, df: pd.DataFrame) -> dict:
if "type" not in df.columns:
return {}
seen = set()
length = 1
current = start_id
comments = df[df["type"] == "comment"]
if comments.empty:
return {}
while True:
if current in seen:
# infinite loop shouldn't happen, but just in case
break
seen.add(current)
author_counts = comments["author"].value_counts()
total_comments = len(comments)
total_authors = len(author_counts)
reply_to = id_to_reply.get(current)
if (
reply_to is None
or (isinstance(reply_to, float) and pd.isna(reply_to))
or reply_to == ""
):
break
length += 1
current = reply_to
if current in length_cache:
length += length_cache[current] - 1
break
length_cache[start_id] = length
return length
emotion_to_lengths = {}
# Fill NaNs in emotion cols to avoid max() issues
emo_df = df[["id"] + emotion_cols].copy()
emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
for _, row in emo_df.iterrows():
msg_id = row["id"]
length = thread_length_from(msg_id)
emotions = {c: row[c] for c in emotion_cols}
dominant = max(emotions, key=emotions.get)
emotion_to_lengths.setdefault(dominant, []).append(length)
top_10_pct_n = max(1, int(total_authors * 0.1))
top_10_pct_share = round(author_counts.head(top_10_pct_n).sum() / total_comments, 4)
return {
emotion: round(sum(lengths) / len(lengths), 2)
for emotion, lengths in emotion_to_lengths.items()
"total_commenting_authors": total_authors,
"top_10pct_author_count": top_10_pct_n,
"top_10pct_comment_share": float(top_10_pct_share),
"single_comment_authors": int((author_counts == 1).sum()),
"single_comment_author_ratio": float(round((author_counts == 1).sum() / total_authors, 4)),
}

View File

@@ -61,3 +61,19 @@ class LinguisticAnalysis:
.head(limit)
.to_dict(orient="records")
)
def lexical_diversity(self, df: pd.DataFrame) -> dict:
    """Compute the type-token ratio of the ``content`` column.

    Content is lower-cased and split into runs of two or more ASCII
    letters; tokens listed in ``self.word_exclusions`` are ignored.

    Args:
        df: Event DataFrame with a ``content`` column (missing values are
            treated as empty text).

    Returns:
        ``{"total_tokens", "unique_tokens", "ttr"}`` where ``ttr`` is
        ``unique_tokens / total_tokens`` rounded to 4 decimals, or ``0.0``
        when there are no tokens.
    """
    tokens = (
        df["content"].fillna("").astype(str).str.lower()
        .str.findall(r"\b[a-z]{2,}\b")
        .explode()
        # explode() emits one NaN per row whose token list is empty; those
        # placeholders are not tokens and must not be counted.
        .dropna()
    )
    tokens = tokens[~tokens.isin(self.word_exclusions)]

    total = int(len(tokens))
    unique = int(tokens.nunique())

    return {
        "total_tokens": total,
        "unique_tokens": unique,
        "ttr": round(unique / total, 4) if total else 0.0,
    }

View File

@@ -6,7 +6,9 @@ from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.summary import SummaryAnalysis
from server.analysis.temporal import TemporalAnalysis
from server.analysis.user import UserAnalysis
DOMAIN_STOPWORDS = {
"www",
@@ -36,12 +38,11 @@ class StatGen:
self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis()
self.summary_analysis = SummaryAnalysis()
self.user_analysis = UserAnalysis(EXCLUDE_WORDS)
## Private Methods
def _prepare_filtered_df(self,
df: pd.DataFrame,
filters: dict | None = None
) -> pd.DataFrame:
def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame:
filters = filters or {}
filtered_df = df.copy()
@@ -51,10 +52,9 @@ class StatGen:
data_source_filter = filters.get("data_sources", None)
if search_query:
mask = (
filtered_df["content"].str.contains(search_query, case=False, na=False)
| filtered_df["author"].str.contains(search_query, case=False, na=False)
)
mask = filtered_df["content"].str.contains(
search_query, case=False, na=False
) | filtered_df["author"].str.contains(search_query, case=False, na=False)
# Only include title if the column exists
if "title" in filtered_df.columns:
@@ -76,10 +76,10 @@ class StatGen:
return filtered_df
## Public Methods
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
return self._prepare_filtered_df(df, filters).to_dict(orient="records")
def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
@@ -87,84 +87,54 @@ class StatGen:
"weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
}
def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
"word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df),
"common_two_phrases": self.linguistic_analysis.ngrams(filtered_df),
"common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3),
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(
filtered_df
)
"lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
}
def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
"top_users": self.interaction_analysis.top_users(filtered_df),
"users": self.interaction_analysis.per_user_analysis(filtered_df),
"interaction_graph": self.interaction_analysis.interaction_graph(filtered_df)
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(filtered_df),
"overall_emotion_average": self.emotional_analysis.overall_emotion_average(filtered_df),
"dominant_emotion_distribution": self.emotional_analysis.dominant_emotion_distribution(filtered_df),
"emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
}
def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
"average_thread_depth": self.interaction_analysis.average_thread_depth(
filtered_df
),
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(
filtered_df
),
"top_users": self.user_analysis.top_users(filtered_df),
"users": self.user_analysis.per_user_analysis(filtered_df)
}
def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
"identity_markers": self.cultural_analysis.get_identity_markers(
filtered_df
),
"average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df),
"top_interaction_pairs": self.interaction_analysis.top_interaction_pairs(filtered_df, top_n=100),
"interaction_graph": self.interaction_analysis.interaction_graph(filtered_df),
"conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
}
def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
"identity_markers": self.cultural_analysis.get_identity_markers(filtered_df),
"stance_markers": self.cultural_analysis.get_stance_markers(filtered_df),
"entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(
filtered_df
),
"avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
}
def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
total_posts = (filtered_df["type"] == "post").sum()
total_comments = (filtered_df["type"] == "comment").sum()
events_per_user = filtered_df.groupby("author").size()
if filtered_df.empty:
return {
"total_events": 0,
"total_posts": 0,
"total_comments": 0,
"unique_users": 0,
"comments_per_post": 0,
"lurker_ratio": 0,
"time_range": {
"start": None,
"end": None,
},
"sources": [],
}
return {
"total_events": int(len(filtered_df)),
"total_posts": int(total_posts),
"total_comments": int(total_comments),
"unique_users": int(events_per_user.count()),
"comments_per_post": round(total_comments / max(total_posts, 1), 2),
"lurker_ratio": round((events_per_user == 1).mean(), 2),
"time_range": {
"start": int(filtered_df["dt"].min().timestamp()),
"end": int(filtered_df["dt"].max().timestamp()),
},
"sources": filtered_df["source"].dropna().unique().tolist(),
}
return self.summary_analysis.summary(filtered_df)

View File

@@ -0,0 +1,64 @@
import pandas as pd
class SummaryAnalysis:
    """Headline statistics for a DataFrame of posts and comments.

    The methods read the ``type``, ``author``, ``source`` and ``dt``
    columns. ``summary`` is the entry point; the other methods each produce
    one field of its result.
    """

    def total_events(self, df: pd.DataFrame) -> int:
        """Return the number of rows in ``df``."""
        return int(len(df))

    def total_posts(self, df: pd.DataFrame) -> int:
        """Return the number of rows whose ``type`` is ``"post"``."""
        return int((df["type"] == "post").sum())

    def total_comments(self, df: pd.DataFrame) -> int:
        """Return the number of rows whose ``type`` is ``"comment"``."""
        return int((df["type"] == "comment").sum())

    def unique_users(self, df: pd.DataFrame) -> int:
        """Return the number of distinct non-null authors."""
        return int(df["author"].nunique())

    def comments_per_post(self, total_comments: int, total_posts: int) -> float:
        """Return comments divided by posts (posts floored at 1), 2 decimals."""
        return round(total_comments / max(total_posts, 1), 2)

    def lurker_ratio(self, df: pd.DataFrame) -> float:
        """Return the share of authors with exactly one event, 2 decimals.

        Returns ``0.0`` when no row has an author; the mean of an empty
        group would otherwise be NaN, which is not valid JSON.
        """
        events_per_user = df.groupby("author").size()
        if events_per_user.empty:
            return 0.0
        return round(float((events_per_user == 1).mean()), 2)

    def time_range(self, df: pd.DataFrame) -> dict:
        """Return the earliest and latest ``dt`` as integer Unix timestamps.

        A bound is ``None`` when the column holds no valid timestamp, which
        matches the shape used by ``empty_summary``.
        """
        start = df["dt"].min()
        end = df["dt"].max()
        return {
            "start": None if pd.isna(start) else int(start.timestamp()),
            "end": None if pd.isna(end) else int(end.timestamp()),
        }

    def sources(self, df: pd.DataFrame) -> list:
        """Return the distinct non-null ``source`` values in first-seen order."""
        return df["source"].dropna().unique().tolist()

    def empty_summary(self) -> dict:
        """Return the zeroed summary used when there are no events."""
        return {
            "total_events": 0,
            "total_posts": 0,
            "total_comments": 0,
            "unique_users": 0,
            "comments_per_post": 0,
            "lurker_ratio": 0,
            "time_range": {
                "start": None,
                "end": None,
            },
            "sources": [],
        }

    def summary(self, df: pd.DataFrame) -> dict:
        """Assemble every summary field for ``df``.

        Returns ``empty_summary()`` for an empty frame so the column
        accesses below never run against missing columns.
        """
        if df.empty:
            return self.empty_summary()

        total_posts = self.total_posts(df)
        total_comments = self.total_comments(df)

        return {
            "total_events": self.total_events(df),
            "total_posts": total_posts,
            "total_comments": total_comments,
            "unique_users": self.unique_users(df),
            "comments_per_post": self.comments_per_post(total_comments, total_posts),
            "lurker_ratio": self.lurker_ratio(df),
            "time_range": self.time_range(df),
            "sources": self.sources(df),
        }

124
server/analysis/user.py Normal file
View File

@@ -0,0 +1,124 @@
import pandas as pd
import re
from collections import Counter
class UserAnalysis:
    """Per-author activity, emotion and vocabulary statistics."""

    def __init__(self, word_exclusions: set[str]):
        """Store the set of words that tokenisation should ignore."""
        self.word_exclusions = word_exclusions

    def _tokenize(self, text: str) -> list[str]:
        """Split lower-case text into words of 3+ letters, minus exclusions."""
        tokens = re.findall(r"\b[a-z]{3,}\b", text)
        return [t for t in tokens if t not in self.word_exclusions]

    def _vocab_richness_per_user(
        self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
    ) -> list[dict]:
        """Compute vocabulary statistics for each author.

        Args:
            df: Event DataFrame with ``author`` and ``content`` columns.
            min_words: Authors with fewer counted words are skipped.
            top_most_used_words: Length cap for each author's ``top_words``.

        Returns:
            One dict per remaining author, sorted by ``vocab_richness``
            (unique words / total words) descending.
        """
        # Work on the two needed columns only so the caller's frame is untouched.
        work = df[["author", "content"]].copy()
        work["content"] = work["content"].fillna("").astype(str).str.lower()
        work["tokens"] = work["content"].apply(self._tokenize)

        rows = []
        for author, group in work.groupby("author"):
            all_tokens = [t for tokens in group["tokens"] for t in tokens]

            total_words = len(all_tokens)
            unique_words = len(set(all_tokens))
            events = len(group)

            # Too few words gives unreliable results; the zero check also
            # keeps the division below safe when min_words is 0.
            if total_words < min_words or total_words == 0:
                continue

            # 1.0 means the author never reused a word (stop words excluded).
            vocab_richness = unique_words / total_words
            avg_words = total_words / max(events, 1)

            counts = Counter(all_tokens)
            top_words = [
                {"word": w, "count": int(c)}
                for w, c in counts.most_common(top_most_used_words)
            ]

            rows.append(
                {
                    "author": author,
                    "events": int(events),
                    "total_words": int(total_words),
                    "unique_words": int(unique_words),
                    "vocab_richness": round(vocab_richness, 3),
                    "avg_words_per_event": round(avg_words, 2),
                    "top_words": top_words,
                }
            )

        rows.sort(key=lambda x: x["vocab_richness"], reverse=True)
        return rows

    def top_users(self, df: pd.DataFrame) -> list:
        """Return event counts per (author, source) pair, largest first."""
        counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)

        return [
            {"author": author, "source": source, "count": int(count)}
            for (author, source), count in counts.items()
        ]

    def per_user_analysis(self, df: pd.DataFrame) -> list[dict]:
        """Build one record per author combining activity, emotion and vocab.

        Args:
            df: Event DataFrame with ``author``, ``type`` and ``content``
                columns and optional ``emotion_*`` score columns.

        Returns:
            A list of dicts with ``author``, ``post``, ``comment``,
            ``comment_post_ratio``, ``comment_share``, ``avg_emotions`` and
            ``vocab``, sorted by ``comment_post_ratio`` ascending. Authors
            with equal ratios stay in author order. Empty for an empty frame.
        """
        if df.empty:
            return []

        per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)

        emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
        avg_emotions_by_author = {}
        if emotion_cols:
            avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
            avg_emotions_by_author = {
                author: {emotion: float(score) for emotion, score in row.items()}
                for author, row in avg_emotions.iterrows()
            }

        # Ensure both count columns exist even if one event type is absent.
        for col in ("post", "comment"):
            if col not in per_user.columns:
                per_user[col] = 0

        # Zero denominators are replaced by 1 so the ratios stay finite.
        per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
            0, 1
        )
        per_user["comment_share"] = per_user["comment"] / (
            per_user["post"] + per_user["comment"]
        ).replace(0, 1)

        per_user_records = per_user.reset_index().to_dict(orient="records")

        vocab_by_author = {
            row["author"]: row for row in self._vocab_richness_per_user(df)
        }

        # Merge vocab richness with the per-user activity information.
        merged_users = []
        for row in per_user_records:
            author = row["author"]
            merged_users.append(
                {
                    "author": author,
                    "post": int(row.get("post", 0)),
                    "comment": int(row.get("comment", 0)),
                    "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
                    "comment_share": float(row.get("comment_share", 0)),
                    "avg_emotions": avg_emotions_by_author.get(author, {}),
                    "vocab": vocab_by_author.get(
                        author,
                        {
                            "vocab_richness": 0,
                            "avg_words_per_event": 0,
                            "top_words": [],
                        },
                    ),
                }
            )

        # A single stable sort here replaces the earlier frame sort plus re-sort.
        merged_users.sort(key=lambda u: u["comment_post_ratio"])
        return merged_users

View File

@@ -186,7 +186,7 @@ def scrape_data():
dataset_manager.set_dataset_status(
dataset_id,
"fetching",
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
)
fetch_and_process_dataset.delay(
@@ -198,12 +198,14 @@ def scrape_data():
print(traceback.format_exc())
return jsonify({"error": "Failed to queue dataset processing"}), 500
return jsonify({
return jsonify(
{
"message": "Dataset queued for processing",
"dataset_id": dataset_id,
"status": "processing"
}), 202
"status": "processing",
}
), 202
@app.route("/datasets/upload", methods=["POST"])
@jwt_required()
@@ -233,7 +235,9 @@ def upload_data():
posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
topics = json.load(topic_file)
dataset_id = dataset_manager.save_dataset_info(current_user, dataset_name, topics)
dataset_id = dataset_manager.save_dataset_info(
current_user, dataset_name, topics
)
process_dataset.delay(dataset_id, posts_df.to_dict(orient="records"), topics)
@@ -249,6 +253,7 @@ def upload_data():
except Exception as e:
return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>", methods=["GET"])
@jwt_required()
def get_dataset(dataset_id):
@@ -256,7 +261,9 @@ def get_dataset(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_info = dataset_manager.get_dataset_info(dataset_id)
included_cols = {"id", "name", "created_at"}
@@ -270,6 +277,7 @@ def get_dataset(dataset_id):
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500
@app.route("/dataset/<int:dataset_id>", methods=["PATCH"])
@jwt_required()
def update_dataset(dataset_id):
@@ -277,7 +285,9 @@ def update_dataset(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
body = request.get_json()
new_name = body.get("name")
@@ -286,7 +296,9 @@ def update_dataset(dataset_id):
return jsonify({"error": "A valid name must be provided"}), 400
dataset_manager.update_dataset_name(dataset_id, new_name.strip())
return jsonify({"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}), 200
return jsonify(
{"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}
), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -295,6 +307,7 @@ def update_dataset(dataset_id):
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>", methods=["DELETE"])
@jwt_required()
def delete_dataset(dataset_id):
@@ -302,11 +315,17 @@ def delete_dataset(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_manager.delete_dataset_info(dataset_id)
dataset_manager.delete_dataset_content(dataset_id)
return jsonify({"message": f"Dataset {dataset_id} metadata and content successfully deleted"}), 200
return jsonify(
{
"message": f"Dataset {dataset_id} metadata and content successfully deleted"
}
), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -315,6 +334,7 @@ def delete_dataset(dataset_id):
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500
@app.route("/dataset/<int:dataset_id>/status", methods=["GET"])
@jwt_required()
def get_dataset_status(dataset_id):
@@ -322,7 +342,9 @@ def get_dataset_status(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_status = dataset_manager.get_dataset_status(dataset_id)
return jsonify(dataset_status), 200
@@ -334,17 +356,44 @@ def get_dataset_status(dataset_id):
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500
@app.route("/dataset/<int:dataset_id>/content", methods=["GET"])
@app.route("/dataset/<int:dataset_id>/linguistic", methods=["GET"])
@jwt_required()
def content_endpoint(dataset_id):
def get_linguistic_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200
return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
return jsonify({"error": "Dataset does not exist"}), 404
except ValueError as e:
return jsonify({"error": f"Malformed or missing data"}), 400
except Exception as e:
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/emotional", methods=["GET"])
@jwt_required()
def get_emotional_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
return jsonify(stat_gen.emotional(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -362,7 +411,9 @@ def get_summary(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
@@ -378,17 +429,19 @@ def get_summary(dataset_id):
return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/time", methods=["GET"])
@app.route("/dataset/<int:dataset_id>/temporal", methods=["GET"])
@jwt_required()
def get_time_analysis(dataset_id):
def get_temporal_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200
return jsonify(stat_gen.temporal(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -406,11 +459,13 @@ def get_user_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200
return jsonify(stat_gen.user(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -428,11 +483,13 @@ def get_cultural_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200
return jsonify(stat_gen.cultural(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -444,17 +501,19 @@ def get_cultural_analysis(dataset_id):
return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/interaction", methods=["GET"])
@app.route("/dataset/<int:dataset_id>/interactional", methods=["GET"])
@jwt_required()
def get_interaction_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
raise NotAuthorisedException("This user is not authorised to access this dataset")
raise NotAuthorisedException(
"This user is not authorised to access this dataset"
)
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
return jsonify(stat_gen.get_interactional_analysis(dataset_content, filters)), 200
return jsonify(stat_gen.interactional(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -465,6 +524,27 @@ def get_interaction_analysis(dataset_id):
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred"}), 500
@app.route("/dataset/<int:dataset_id>/all", methods=["GET"])
@jwt_required()
def get_full_dataset(dataset_id: int):
    """Return every stored record of a dataset as a JSON array.

    The caller is identified from the JWT identity and must pass
    ``dataset_manager.authorize_user_dataset`` for the requested dataset.
    Unlike the analysis endpoints, no request filters are applied here.

    Args:
        dataset_id: Dataset identifier taken from the URL path.

    Returns:
        A ``(response, status)`` tuple:
        * 200 - list of records from ``to_dict(orient="records")``.
        * 403 - the user is not authorised for this dataset.
        * 404 - ``NonExistentDatasetException`` was raised.
        * 400 - a ``ValueError`` was raised (this includes a JWT identity
          that cannot be converted with ``int``).
        * 500 - any other exception; the traceback is printed.
    """
    try:
        user_id = int(get_jwt_identity())
        if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
            raise NotAuthorisedException(
                "This user is not authorised to access this dataset"
            )

        dataset_content = dataset_manager.get_dataset_content(dataset_id)
        # NOTE(review): presumably a pandas DataFrame (it exposes
        # ``to_dict(orient="records")``); if so, NaN cells would be emitted
        # as bare ``NaN`` in the JSON body - confirm clients tolerate that.
        return jsonify(dataset_content.to_dict(orient="records")), 200
    except NotAuthorisedException:
        return jsonify({"error": "User is not authorised to access this content"}), 403
    except NonExistentDatasetException:
        return jsonify({"error": "Dataset does not exist"}), 404
    except ValueError:
        return jsonify({"error": "Malformed or missing data"}), 400
    except Exception:
        # Last-resort handler: keep the details server-side and return a
        # generic message to the client.
        print(traceback.format_exc())
        return jsonify({"error": "An unexpected error occurred"}), 500
if __name__ == "__main__":
app.run(debug=True)

View File

@@ -101,7 +101,7 @@ class DatasetManager:
row["source"],
row.get("topic"),
row.get("topic_confidence"),
Json(row["ner_entities"]) if row.get("ner_entities") else None,
Json(row["entities"]) if row.get("entities") is not None else None,
row.get("emotion_anger"),
row.get("emotion_disgust"),
row.get("emotion_fear"),

View File

@@ -43,7 +43,7 @@ CREATE TABLE events (
weekday VARCHAR(255) NOT NULL,
/* Posts Only */
title VARCHAR(255),
title TEXT,
/* Comments Only*/
parent_id VARCHAR(255),