- Top Emotion
+
+
Mood Averages
+
Average score for each emotion.
+ {!overallEmotionAverage.length ? (
+
No overall emotion averages available.
+ ) : (
+
+ {[...overallEmotionAverage]
+ .sort((a, b) => b.score - a.score)
+ .map((row) => (
+
+
{formatEmotion(row.emotion)}
+
{row.score.toFixed(3)}
+
+ ))}
-
- {formatEmotion(topic.emotion)}
+ )}
+
+
+
+
Mood Split
+
How often each emotion is dominant.
+ {!dominantEmotionDistribution.length ? (
+
No dominant-emotion split available.
+ ) : (
+
+ {[...dominantEmotionDistribution]
+ .sort((a, b) => b.ratio - a.ratio)
+ .map((row) => (
+
+
{formatEmotion(row.emotion)}
+
{(row.ratio * 100).toFixed(1)}% • {row.count.toLocaleString()} events
+
+ ))}
-
- Confidence
- {topic.value.toFixed(3)}
-
-
- Sample Size
- {topic.count} events
+ )}
+
+
+
+
Mood by Source
+
Leading emotion in each source.
+ {!emotionBySource.length ? (
+
No source emotion profile available.
+ ) : (
+
+ {[...emotionBySource]
+ .sort((a, b) => b.event_count - a.event_count)
+ .map((row) => (
+
+
{row.source}
+
+ {formatEmotion(row.dominant_emotion)} • {row.dominant_score.toFixed(3)} • {row.event_count.toLocaleString()} events
+
+
+ ))}
+ )}
+
+
+
+
Topic Snapshots
+
Per-topic mood with strength and post count.
+
+ {strongestPerTopic.map((topic) => (
+
+
{topic.topic}
+
+ Likely Mood
+
+
+ {formatEmotion(topic.emotion)}
+
+
+ Strength
+ {topic.value.toFixed(3)}
+
+
+ Posts in Topic
+ {topic.count}
+
+
+ ))}
- ))}
+
);
diff --git a/frontend/src/components/InteractionalStats.tsx b/frontend/src/components/InteractionalStats.tsx
new file mode 100644
index 0000000..11ab3a2
--- /dev/null
+++ b/frontend/src/components/InteractionalStats.tsx
@@ -0,0 +1,208 @@
+import Card from "./Card";
+import StatsStyling from "../styles/stats_styling";
+import type { InteractionAnalysisResponse } from "../types/ApiTypes";
+import {
+ ResponsiveContainer,
+ BarChart,
+ Bar,
+ XAxis,
+ YAxis,
+ CartesianGrid,
+ Tooltip,
+ PieChart,
+ Pie,
+ Cell,
+ Legend,
+} from "recharts";
+
+const styles = StatsStyling;
+
+type InteractionalStatsProps = {
+ data: InteractionAnalysisResponse;
+};
+
+const InteractionalStats = ({ data }: InteractionalStatsProps) => {
+ const graph = data.interaction_graph ?? {};
+ const userCount = Object.keys(graph).length;
+ const edges = Object.values(graph).flatMap((targets) => Object.values(targets));
+ const edgeCount = edges.length;
+ const interactionVolume = edges.reduce((sum, value) => sum + value, 0);
+ const concentration = data.conversation_concentration;
+ const topTenCommentShare = typeof concentration?.top_10pct_comment_share === "number"
+ ? concentration.top_10pct_comment_share
+ : null;
+ const topTenAuthorCount = typeof concentration?.top_10pct_author_count === "number"
+ ? concentration.top_10pct_author_count
+ : null;
+ const totalCommentingAuthors = typeof concentration?.total_commenting_authors === "number"
+ ? concentration.total_commenting_authors
+ : null;
+ const singleCommentAuthorRatio = typeof concentration?.single_comment_author_ratio === "number"
+ ? concentration.single_comment_author_ratio
+ : null;
+ const singleCommentAuthors = typeof concentration?.single_comment_authors === "number"
+ ? concentration.single_comment_authors
+ : null;
+
+ const topPairs = (data.top_interaction_pairs ?? [])
+ .filter((item): item is [[string, string], number] => {
+ if (!Array.isArray(item) || item.length !== 2) {
+ return false;
+ }
+
+ const pair = item[0];
+ const count = item[1];
+
+ return Array.isArray(pair)
+ && pair.length === 2
+ && typeof pair[0] === "string"
+ && typeof pair[1] === "string"
+ && typeof count === "number";
+ })
+ .slice(0, 20);
+
+ const topPairChartData = topPairs.slice(0, 8).map(([[source, target], value], index) => ({
+ pair: `${source} -> ${target}`,
+ replies: value,
+ rank: index + 1,
+ }));
+
+ const topTenSharePercent = topTenCommentShare === null
+ ? null
+ : topTenCommentShare * 100;
+ const nonTopTenSharePercent = topTenSharePercent === null
+ ? null
+ : Math.max(0, 100 - topTenSharePercent);
+
+ let concentrationPieData: { name: string; value: number }[] = [];
+ if (topTenSharePercent !== null && nonTopTenSharePercent !== null) {
+ concentrationPieData = [
+ { name: "Top 10% authors", value: topTenSharePercent },
+ { name: "Other authors", value: nonTopTenSharePercent },
+ ];
+ }
+
+ const PIE_COLORS = ["#2b6777", "#c8d8e4"];
+
+ return (
+
+
+
+
Conversation Overview
+
Who talks to whom, and how concentrated the replies are.
+
+
+
+
+
+
+
+
+
+
+
Conversation Visuals
+
Main reply links and concentration split.
+
+
+
+
Top Interaction Pairs
+
+
+
+
+
+ `#${value}`}
+ width={36}
+ />
+
+
+
+
+
+
+
+
+
Top 10% vs Other Comment Share
+
+
+
+
+ {concentrationPieData.map((entry, index) => (
+ |
+ ))}
+
+
+
+
+
+
+
+
+
+
+
+
Frequent Reply Paths
+
Most common user-to-user reply paths.
+ {!topPairs.length ? (
+
No interaction pair data available.
+ ) : (
+
+ {topPairs.map(([[source, target], value], index) => (
+
${target}-${index}`} style={styles.topUserItem}>
+
{source} -> {target}
+
{value.toLocaleString()} replies
+
+ ))}
+
+ )}
+
+
+
+ );
+};
+
+export default InteractionalStats;
diff --git a/frontend/src/components/LinguisticStats.tsx b/frontend/src/components/LinguisticStats.tsx
new file mode 100644
index 0000000..34fdafd
--- /dev/null
+++ b/frontend/src/components/LinguisticStats.tsx
@@ -0,0 +1,91 @@
+import Card from "./Card";
+import StatsStyling from "../styles/stats_styling";
+import type { LinguisticAnalysisResponse } from "../types/ApiTypes";
+
+const styles = StatsStyling;
+
+type LinguisticStatsProps = {
+ data: LinguisticAnalysisResponse;
+};
+
+const LinguisticStats = ({ data }: LinguisticStatsProps) => {
+ const lexical = data.lexical_diversity;
+ const words = data.word_frequencies ?? [];
+ const bigrams = data.common_two_phrases ?? [];
+ const trigrams = data.common_three_phrases ?? [];
+
+ const topWords = words.slice(0, 20);
+ const topBigrams = bigrams.slice(0, 10);
+ const topTrigrams = trigrams.slice(0, 10);
+
+ return (
+
+
+
+
Language Overview
+
Quick read on how broad and repetitive the wording is.
+
+
+
+
+
+
+
+
Top Words
+
Most used single words.
+
+ {topWords.map((item) => (
+
+
{item.word}
+
{item.count.toLocaleString()} uses
+
+ ))}
+
+
+
+
+
Top Bigrams
+
Most used 2-word phrases.
+
+ {topBigrams.map((item) => (
+
+
{item.ngram}
+
{item.count.toLocaleString()} uses
+
+ ))}
+
+
+
+
+
Top Trigrams
+
Most used 3-word phrases.
+
+ {topTrigrams.map((item) => (
+
+
{item.ngram}
+
{item.count.toLocaleString()} uses
+
+ ))}
+
+
+
+
+ );
+};
+
+export default LinguisticStats;
diff --git a/frontend/src/components/SummaryStats.tsx b/frontend/src/components/SummaryStats.tsx
index 0ff46ac..98f54ad 100644
--- a/frontend/src/components/SummaryStats.tsx
+++ b/frontend/src/components/SummaryStats.tsx
@@ -58,15 +58,13 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
const [selectedUser, setSelectedUser] = useState<string | null>(null);
const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null;
- console.log(summary)
-
return (
{/* main grid*/}
- Events per Day
- Trend of activity over time
+ Activity Over Time
+ How much posting happened each day.
- new Date(d.date) >= new Date('2026-01-10'))}>
+
@@ -154,8 +152,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* Word Cloud */}
-
Word Cloud
-
Most common terms across events
+
Common Words
+
Frequently used words across the dataset.
- Top Users
- Most active authors
+ Most Active Users
+ Who posted the most events.
{userData?.top_users.slice(0, 100).map((item) => (
@@ -195,8 +193,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
{/* Heatmap */}
-
Heatmap
-
Activity density across time
+
Weekly Activity Pattern
+
When activity tends to happen by weekday and hour.
@@ -214,4 +212,4 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr
);
}
-export default SummaryStats;
\ No newline at end of file
+export default SummaryStats;
diff --git a/frontend/src/components/UserModal.tsx b/frontend/src/components/UserModal.tsx
index 54ee5fc..682b730 100644
--- a/frontend/src/components/UserModal.tsx
+++ b/frontend/src/components/UserModal.tsx
@@ -12,6 +12,9 @@ type Props = {
};
export default function UserModal({ open, onClose, userData, username }: Props) {
+ const dominantEmotionEntry = Object.entries(userData?.avg_emotions ?? {})
+ .sort((a, b) => b[1] - a[1])[0];
+
return (
@@ -66,6 +69,15 @@ export default function UserModal({ open, onClose, userData, username }: Props)
) : null}
+
+ {dominantEmotionEntry ? (
+
+
Dominant Avg Emotion
+
+ {dominantEmotionEntry[0].replace("emotion_", "")} ({dominantEmotionEntry[1].toFixed(3)})
+
+
+ ) : null}
)}
diff --git a/frontend/src/components/UserStats.tsx b/frontend/src/components/UserStats.tsx
index bb060cc..b467998 100644
--- a/frontend/src/components/UserStats.tsx
+++ b/frontend/src/components/UserStats.tsx
@@ -87,15 +87,15 @@ const UserStats = (props: { data: UserAnalysisResponse }) => {
style={{ gridColumn: "span 3" }}
/>
{
/>
${strongestLink.target}` : "—"}
- sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} interactions` : "No graph edges after filtering"}
+ sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} replies` : "No graph links after filtering"}
style={{ gridColumn: "span 6" }}
/>
{
User Interaction Graph
- Nodes represent users and links represent conversation interactions.
+ Each node is a user, and each link shows replies between them.
{
Select sources and scrape settings, then queue processing automatically.
+
+ Warning: Scraping more than 250 posts from any single site can take hours due to rate limits.
+
+const isDeletedUser = (value?: string | null) => (
+ DELETED_USERS.includes((value ?? "").trim().toLowerCase())
+);
const StatPage = () => {
const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>();
const [error, setError] = useState('');
const [loading, setLoading] = useState(false);
- const [activeView, setActiveView] = useState<"summary" | "emotional" | "user">("summary");
+ const [activeView, setActiveView] = useState<"summary" | "emotional" | "user" | "linguistic" | "interactional" | "cultural">("summary");
const [userData, setUserData] = useState<UserAnalysisResponse | null>(null);
const [timeData, setTimeData] = useState<TimeAnalysisResponse | null>(null);
const [contentData, setContentData] = useState<ContentAnalysisResponse | null>(null);
+ const [linguisticData, setLinguisticData] = useState<LinguisticAnalysisResponse | null>(null);
+ const [interactionData, setInteractionData] = useState<InteractionAnalysisResponse | null>(null);
+ const [culturalData, setCulturalData] = useState<CulturalAnalysisResponse | null>(null);
const [summary, setSummary] = useState<SummaryResponse | null>(null);
@@ -83,15 +99,23 @@ const StatPage = () => {
setLoading(true);
Promise.all([
- axios.get(`${API_BASE_URL}/dataset/${datasetId}/time`, {
+ axios.get(`${API_BASE_URL}/dataset/${datasetId}/temporal`, {
params,
headers: authHeaders,
}),
- axios.get(`${API_BASE_URL}/dataset/${datasetId}/user`, {
+ axios.get(`${API_BASE_URL}/dataset/${datasetId}/user`, {
params,
headers: authHeaders,
}),
- axios.get(`${API_BASE_URL}/dataset/${datasetId}/content`, {
+ axios.get(`${API_BASE_URL}/dataset/${datasetId}/linguistic`, {
+ params,
+ headers: authHeaders,
+ }),
+ axios.get(`${API_BASE_URL}/dataset/${datasetId}/emotional`, {
+ params,
+ headers: authHeaders,
+ }),
+ axios.get(`${API_BASE_URL}/dataset/${datasetId}/interactional`, {
params,
headers: authHeaders,
}),
@@ -99,12 +123,87 @@ const StatPage = () => {
params,
headers: authHeaders,
}),
+ axios.get(`${API_BASE_URL}/dataset/${datasetId}/cultural`, {
+ params,
+ headers: authHeaders,
+ }),
])
- .then(([timeRes, userRes, contentRes, summaryRes]) => {
- setUserData(userRes.data || null);
+ .then(([timeRes, userRes, linguisticRes, emotionalRes, interactionRes, summaryRes, culturalRes]) => {
+ const usersList = userRes.data.users ?? [];
+ const topUsersList = userRes.data.top_users ?? [];
+ const interactionGraphRaw = interactionRes.data?.interaction_graph ?? {};
+ const topPairsRaw = interactionRes.data?.top_interaction_pairs ?? [];
+
+ const filteredUsers: typeof usersList = [];
+ for (const user of usersList) {
+ if (isDeletedUser(user.author)) continue;
+ filteredUsers.push(user);
+ }
+
+ const filteredTopUsers: typeof topUsersList = [];
+ for (const user of topUsersList) {
+ if (isDeletedUser(user.author)) continue;
+ filteredTopUsers.push(user);
+ }
+
+ const filteredInteractionGraph: Record<string, Record<string, number>> = {};
+ for (const [source, targets] of Object.entries(interactionGraphRaw)) {
+ if (isDeletedUser(source)) {
+ continue;
+ }
+
+ const nextTargets: Record<string, number> = {};
+ for (const [target, count] of Object.entries(targets)) {
+ if (isDeletedUser(target)) {
+ continue;
+ }
+ nextTargets[target] = count;
+ }
+
+ filteredInteractionGraph[source] = nextTargets;
+ }
+
+ const filteredTopInteractionPairs: typeof topPairsRaw = [];
+ for (const pairEntry of topPairsRaw) {
+ const pair = pairEntry[0];
+ const source = pair[0];
+ const target = pair[1];
+ if (isDeletedUser(source) || isDeletedUser(target)) {
+ continue;
+ }
+ filteredTopInteractionPairs.push(pairEntry);
+ }
+
+ const combinedUserData: UserAnalysisResponse = {
+ ...userRes.data,
+ users: filteredUsers,
+ top_users: filteredTopUsers,
+ interaction_graph: filteredInteractionGraph,
+ };
+
+ const combinedContentData: ContentAnalysisResponse = {
+ ...linguisticRes.data,
+ ...emotionalRes.data,
+ };
+
+ const filteredInteractionData: InteractionAnalysisResponse = {
+ ...interactionRes.data,
+ interaction_graph: filteredInteractionGraph,
+ top_interaction_pairs: filteredTopInteractionPairs,
+ };
+
+ const filteredSummary: SummaryResponse = {
+ ...summaryRes.data,
+ unique_users: filteredUsers.length,
+ };
+
+ setUserData(combinedUserData);
setTimeData(timeRes.data || null);
- setContentData(contentRes.data || null);
- setSummary(summaryRes.data || null);
+ setContentData(combinedContentData);
+ setLinguisticData(linguisticRes.data || null);
+ setInteractionData(filteredInteractionData || null);
+ setCulturalData(culturalRes.data || null);
+ setSummary(filteredSummary || null);
})
.catch((e) => setError("Failed to load statistics: " + String(e)))
.finally(() => setLoading(false));
@@ -198,7 +297,7 @@ return (
Dataset #{datasetId ?? "-"}
-
+
setActiveView("summary")}
style={activeView === "summary" ? styles.buttonPrimary : styles.buttonSecondary}
@@ -218,6 +317,24 @@ return (
>
Users
+ setActiveView("linguistic")}
+ style={activeView === "linguistic" ? styles.buttonPrimary : styles.buttonSecondary}
+ >
+ Linguistic
+
+ setActiveView("interactional")}
+ style={activeView === "interactional" ? styles.buttonPrimary : styles.buttonSecondary}
+ >
+ Interactional
+
+ setActiveView("cultural")}
+ style={activeView === "cultural" ? styles.buttonPrimary : styles.buttonSecondary}
+ >
+ Cultural
+
{activeView === "summary" && (
@@ -243,6 +360,36 @@ return (
)}
+ {activeView === "linguistic" && linguisticData && (
+
+ )}
+
+ {activeView === "linguistic" && !linguisticData && (
+
+ No linguistic data available.
+
+ )}
+
+ {activeView === "interactional" && interactionData && (
+
+ )}
+
+ {activeView === "interactional" && !interactionData && (
+
+ No interactional data available.
+
+ )}
+
+ {activeView === "cultural" && culturalData && (
+
+ )}
+
+ {activeView === "cultural" && !culturalData && (
+
+ No cultural data available.
+
+ )}
+
);
}
diff --git a/frontend/src/types/ApiTypes.ts b/frontend/src/types/ApiTypes.ts
index 5feaddf..7a0b521 100644
--- a/frontend/src/types/ApiTypes.ts
+++ b/frontend/src/types/ApiTypes.ts
@@ -1,14 +1,28 @@
-// User Responses
-type TopUser = {
- author: string;
- source: string;
- count: number
+// Shared types
+type FrequencyWord = {
+ word: string;
+ count: number;
};
-type FrequencyWord = {
- word: string;
- count: number;
-}
+type NGram = {
+ count: number;
+ ngram: string;
+};
+
+type Emotion = {
+ emotion_anger: number;
+ emotion_disgust: number;
+ emotion_fear: number;
+ emotion_joy: number;
+ emotion_sadness: number;
+};
+
+// User
+type TopUser = {
+ author: string;
+ source: string;
+ count: number;
+};
type Vocab = {
author: string;
@@ -26,62 +40,147 @@ type User = {
comment: number;
comment_post_ratio: number;
comment_share: number;
+ avg_emotions?: Record<string, number>;
vocab?: Vocab | null;
};
type InteractionGraph = Record<string, Record<string, number>>;
+type UserEndpointResponse = {
+ top_users: TopUser[];
+ users: User[];
+};
+
type UserAnalysisResponse = {
top_users: TopUser[];
users: User[];
interaction_graph: InteractionGraph;
};
-// Time Analysis
+// Time
type EventsPerDay = {
- date: Date;
- count: number;
-}
-
-type HeatmapCell = {
- date: Date;
- hour: number;
- count: number;
-}
-
-type TimeAnalysisResponse = {
- events_per_day: EventsPerDay[];
- weekday_hour_heatmap: HeatmapCell[];
-}
-
-// Content Analysis
-type Emotion = {
- emotion_anger: number;
- emotion_disgust: number;
- emotion_fear: number;
- emotion_joy: number;
- emotion_sadness: number;
+ date: Date;
+ count: number;
};
-type NGram = {
- count: number;
- ngram: string;
-}
+type HeatmapCell = {
+ date: Date;
+ hour: number;
+ count: number;
+};
+type TimeAnalysisResponse = {
+ events_per_day: EventsPerDay[];
+ weekday_hour_heatmap: HeatmapCell[];
+};
+
+// Content (combines emotional and linguistic)
type AverageEmotionByTopic = Emotion & {
n: number;
topic: string;
+ [key: string]: string | number;
};
+type OverallEmotionAverage = {
+ emotion: string;
+ score: number;
+};
+
+type DominantEmotionDistribution = {
+ emotion: string;
+ count: number;
+ ratio: number;
+};
+
+type EmotionBySource = {
+ source: string;
+ dominant_emotion: string;
+ dominant_score: number;
+ event_count: number;
+};
type ContentAnalysisResponse = {
- word_frequencies: FrequencyWord[];
- average_emotion_by_topic: AverageEmotionByTopic[];
- common_three_phrases: NGram[];
- common_two_phrases: NGram[];
-}
+ word_frequencies: FrequencyWord[];
+ average_emotion_by_topic: AverageEmotionByTopic[];
+ common_three_phrases: NGram[];
+ common_two_phrases: NGram[];
+ overall_emotion_average?: OverallEmotionAverage[];
+ dominant_emotion_distribution?: DominantEmotionDistribution[];
+ emotion_by_source?: EmotionBySource[];
+};
-// Summary
+// Linguistic
+type LinguisticAnalysisResponse = {
+ word_frequencies: FrequencyWord[];
+ common_two_phrases: NGram[];
+ common_three_phrases: NGram[];
+ lexical_diversity?: Record<string, number>;
+};
+
+// Emotional
+type EmotionalAnalysisResponse = {
+ average_emotion_by_topic: AverageEmotionByTopic[];
+ overall_emotion_average?: OverallEmotionAverage[];
+ dominant_emotion_distribution?: DominantEmotionDistribution[];
+ emotion_by_source?: EmotionBySource[];
+};
+
+// Interactional
+type ConversationConcentration = {
+ total_commenting_authors: number;
+ top_10pct_author_count: number;
+ top_10pct_comment_share: number;
+ single_comment_authors: number;
+ single_comment_author_ratio: number;
+};
+
+type InteractionAnalysisResponse = {
+ average_thread_depth?: number;
+ top_interaction_pairs?: [[string, string], number][];
+ conversation_concentration?: ConversationConcentration;
+ interaction_graph: InteractionGraph;
+};
+
+// Cultural
+type IdentityMarkers = {
+ in_group_usage: number;
+ out_group_usage: number;
+ in_group_ratio: number;
+ out_group_ratio: number;
+ in_group_posts: number;
+ out_group_posts: number;
+ tie_posts: number;
+ in_group_emotion_avg?: Record<string, number>;
+ out_group_emotion_avg?: Record<string, number>;
+};
+
+type StanceMarkers = {
+ hedge_total: number;
+ certainty_total: number;
+ deontic_total: number;
+ permission_total: number;
+ hedge_per_1k_tokens: number;
+ certainty_per_1k_tokens: number;
+ deontic_per_1k_tokens: number;
+ permission_per_1k_tokens: number;
+};
+
+type EntityEmotionAggregate = {
+ post_count: number;
+ emotion_avg: Record<string, number>;
+};
+
+type AverageEmotionPerEntity = {
+ entity_emotion_avg: Record<string, EntityEmotionAggregate>;
+};
+
+type CulturalAnalysisResponse = {
+ identity_markers?: IdentityMarkers;
+ stance_markers?: StanceMarkers;
+ avg_emotion_per_entity?: AverageEmotionPerEntity;
+};
+
+// Summary
type SummaryResponse = {
total_events: number;
total_posts: number;
@@ -96,22 +195,35 @@ type SummaryResponse = {
sources: string[];
};
-// Filtering Response
+// Filter
type FilterResponse = {
- rows: number
- data: any;
-}
+ rows: number;
+ data: any;
+};
export type {
- TopUser,
- Vocab,
- User,
- InteractionGraph,
- UserAnalysisResponse,
- FrequencyWord,
- AverageEmotionByTopic,
- SummaryResponse,
- TimeAnalysisResponse,
- ContentAnalysisResponse,
- FilterResponse
-}
+ TopUser,
+ Vocab,
+ User,
+ InteractionGraph,
+ ConversationConcentration,
+ UserAnalysisResponse,
+ UserEndpointResponse,
+ FrequencyWord,
+ AverageEmotionByTopic,
+ OverallEmotionAverage,
+ DominantEmotionDistribution,
+ EmotionBySource,
+ SummaryResponse,
+ TimeAnalysisResponse,
+ ContentAnalysisResponse,
+ LinguisticAnalysisResponse,
+ EmotionalAnalysisResponse,
+ InteractionAnalysisResponse,
+ IdentityMarkers,
+ StanceMarkers,
+ EntityEmotionAggregate,
+ AverageEmotionPerEntity,
+ CulturalAnalysisResponse,
+ FilterResponse,
+};
diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py
index 150aa20..8f78809 100644
--- a/server/analysis/emotional.py
+++ b/server/analysis/emotional.py
@@ -1,33 +1,86 @@
import pandas as pd
+
class EmotionalAnalysis:
- def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict:
- emotion_cols = [
- col for col in df.columns
- if col.startswith("emotion_")
- ]
+ def _emotion_cols(self, df: pd.DataFrame) -> list[str]:
+ return [col for col in df.columns if col.startswith("emotion_")]
+
+ def avg_emotion_by_topic(self, df: pd.DataFrame) -> list[dict]:
+ emotion_cols = self._emotion_cols(df)
+
+ if not emotion_cols:
+ return []
counts = (
- df[
- (df["topic"] != "Misc")
- ]
- .groupby("topic")
- .size()
- .rename("n")
+ df[(df["topic"] != "Misc")].groupby("topic").size().reset_index(name="n")
)
avg_emotion_by_topic = (
- df[
- (df["topic"] != "Misc")
- ]
+ df[(df["topic"] != "Misc")]
.groupby("topic")[emotion_cols]
.mean()
.reset_index()
)
- avg_emotion_by_topic = avg_emotion_by_topic.merge(
- counts,
- on="topic"
- )
+ avg_emotion_by_topic = avg_emotion_by_topic.merge(counts, on="topic")
- return avg_emotion_by_topic.to_dict(orient='records')
\ No newline at end of file
+ return avg_emotion_by_topic.to_dict(orient="records")
+
+ def overall_emotion_average(self, df: pd.DataFrame) -> list[dict]:
+ emotion_cols = self._emotion_cols(df)
+
+ if not emotion_cols:
+ return []
+
+ means = df[emotion_cols].mean()
+ return [
+ {
+ "emotion": col.replace("emotion_", ""),
+ "score": float(means[col]),
+ }
+ for col in emotion_cols
+ ]
+
+ def dominant_emotion_distribution(self, df: pd.DataFrame) -> list[dict]:
+ emotion_cols = self._emotion_cols(df)
+
+ if not emotion_cols or df.empty:
+ return []
+
+ dominant_per_row = df[emotion_cols].idxmax(axis=1)
+ counts = dominant_per_row.value_counts()
+ total = max(len(dominant_per_row), 1)
+
+ return [
+ {
+ "emotion": col.replace("emotion_", ""),
+ "count": int(count),
+ "ratio": round(float(count / total), 4),
+ }
+ for col, count in counts.items()
+ ]
+
+ def emotion_by_source(self, df: pd.DataFrame) -> list[dict]:
+ emotion_cols = self._emotion_cols(df)
+
+ if not emotion_cols or "source" not in df.columns or df.empty:
+ return []
+
+ source_counts = df.groupby("source").size()
+ source_means = df.groupby("source")[emotion_cols].mean().reset_index()
+ rows = source_means.to_dict(orient="records")
+ output = []
+
+ for row in rows:
+ source = row["source"]
+ dominant_col = max(emotion_cols, key=lambda col: float(row.get(col, 0)))
+ output.append(
+ {
+ "source": str(source),
+ "dominant_emotion": dominant_col.replace("emotion_", ""),
+ "dominant_score": round(float(row.get(dominant_col, 0)), 4),
+ "event_count": int(source_counts.get(source, 0)),
+ }
+ )
+
+ return output
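A minimal sketch (not part of the diff) of what the new EmotionalAnalysis methods return on a toy frame, using the emotion_* column convention and the import path shown in stat_gen.py:

import pandas as pd

from server.analysis.emotional import EmotionalAnalysis

# Toy frame following the emotion_* column convention the class scans for.
df = pd.DataFrame(
    {
        "source": ["reddit", "reddit", "forum"],
        "topic": ["cats", "cats", "dogs"],
        "emotion_joy": [0.8, 0.2, 0.1],
        "emotion_anger": [0.1, 0.7, 0.6],
    }
)

ea = EmotionalAnalysis()
print(ea.overall_emotion_average(df))
# [{'emotion': 'joy', 'score': 0.3667}, {'emotion': 'anger', 'score': 0.4667}] (approx.)
print(ea.dominant_emotion_distribution(df))
# anger is the row-wise max twice, joy once -> ratios 0.6667 and 0.3333
print(ea.emotion_by_source(df))
# reddit averages joy=0.5 vs anger=0.4 -> dominant 'joy' with event_count 2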
diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py
index 864980d..e15940e 100644
--- a/server/analysis/interactional.py
+++ b/server/analysis/interactional.py
@@ -1,9 +1,6 @@
import pandas as pd
import re
-from collections import Counter
-
-
class InteractionAnalysis:
def __init__(self, word_exclusions: set[str]):
self.word_exclusions = word_exclusions
@@ -12,118 +9,6 @@ class InteractionAnalysis:
tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions]
- def _vocab_richness_per_user(
- self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
- ) -> list:
- df = df.copy()
- df["content"] = df["content"].fillna("").astype(str).str.lower()
- df["tokens"] = df["content"].apply(self._tokenize)
-
- rows = []
- for author, group in df.groupby("author"):
- all_tokens = [t for tokens in group["tokens"] for t in tokens]
-
- total_words = len(all_tokens)
- unique_words = len(set(all_tokens))
- events = len(group)
-
- # Min amount of words for a user, any less than this might give weird results
- if total_words < min_words:
- continue
-
- # 100% = they never reused a word (excluding stop words)
- vocab_richness = unique_words / total_words
- avg_words = total_words / max(events, 1)
-
- counts = Counter(all_tokens)
- top_words = [
- {"word": w, "count": int(c)}
- for w, c in counts.most_common(top_most_used_words)
- ]
-
- rows.append(
- {
- "author": author,
- "events": int(events),
- "total_words": int(total_words),
- "unique_words": int(unique_words),
- "vocab_richness": round(vocab_richness, 3),
- "avg_words_per_event": round(avg_words, 2),
- "top_words": top_words,
- }
- )
-
- rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
-
- return rows
-
- def top_users(self, df: pd.DataFrame) -> list:
- counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
-
- top_users = [
- {"author": author, "source": source, "count": int(count)}
- for (author, source), count in counts.items()
- ]
-
- return top_users
-
- def per_user_analysis(self, df: pd.DataFrame) -> dict:
- per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
-
- emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
-
- avg_emotions_by_author = {}
- if emotion_cols:
- avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
- avg_emotions_by_author = {
- author: {emotion: float(score) for emotion, score in row.items()}
- for author, row in avg_emotions.iterrows()
- }
-
- # ensure columns always exist
- for col in ("post", "comment"):
- if col not in per_user.columns:
- per_user[col] = 0
-
- per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
- 0, 1
- )
- per_user["comment_share"] = per_user["comment"] / (
- per_user["post"] + per_user["comment"]
- ).replace(0, 1)
- per_user = per_user.sort_values("comment_post_ratio", ascending=True)
- per_user_records = per_user.reset_index().to_dict(orient="records")
-
- vocab_rows = self._vocab_richness_per_user(df)
- vocab_by_author = {row["author"]: row for row in vocab_rows}
-
- # merge vocab richness + per_user information
- merged_users = []
- for row in per_user_records:
- author = row["author"]
- merged_users.append(
- {
- "author": author,
- "post": int(row.get("post", 0)),
- "comment": int(row.get("comment", 0)),
- "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
- "comment_share": float(row.get("comment_share", 0)),
- "avg_emotions": avg_emotions_by_author.get(author, {}),
- "vocab": vocab_by_author.get(
- author,
- {
- "vocab_richness": 0,
- "avg_words_per_event": 0,
- "top_words": [],
- },
- ),
- }
- )
-
- merged_users.sort(key=lambda u: u["comment_post_ratio"])
-
- return merged_users
-
def interaction_graph(self, df: pd.DataFrame):
interactions = {a: {} for a in df["author"].dropna().unique()}
@@ -166,68 +51,37 @@ class InteractionAnalysis:
return 0
return round(sum(depths) / len(depths), 2)
+
+ def top_interaction_pairs(self, df: pd.DataFrame, top_n=10):
+ graph = self.interaction_graph(df)
+ pairs = []
- def average_thread_length_by_emotion(self, df: pd.DataFrame):
- emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
+ for a, targets in graph.items():
+ for b, count in targets.items():
+ pairs.append(((a, b), count))
- emotion_cols = [
- c
- for c in df.columns
- if c.startswith("emotion_") and c not in emotion_exclusions
- ]
+ pairs.sort(key=lambda x: x[1], reverse=True)
+ return pairs[:top_n]
+
+ def conversation_concentration(self, df: pd.DataFrame) -> dict:
+ if "type" not in df.columns:
+ return {}
- id_to_reply = df.set_index("id")["reply_to"].to_dict()
- length_cache = {}
+ comments = df[df["type"] == "comment"]
+ if comments.empty:
+ return {}
- def thread_length_from(start_id):
- if start_id in length_cache:
- return length_cache[start_id]
+ author_counts = comments["author"].value_counts()
+ total_comments = len(comments)
+ total_authors = len(author_counts)
- seen = set()
- length = 1
- current = start_id
-
- while True:
- if current in seen:
- # infinite loop shouldn't happen, but just in case
- break
- seen.add(current)
-
- reply_to = id_to_reply.get(current)
-
- if (
- reply_to is None
- or (isinstance(reply_to, float) and pd.isna(reply_to))
- or reply_to == ""
- ):
- break
-
- length += 1
- current = reply_to
-
- if current in length_cache:
- length += length_cache[current] - 1
- break
-
- length_cache[start_id] = length
- return length
-
- emotion_to_lengths = {}
-
- # Fill NaNs in emotion cols to avoid max() issues
- emo_df = df[["id"] + emotion_cols].copy()
- emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0)
-
- for _, row in emo_df.iterrows():
- msg_id = row["id"]
- length = thread_length_from(msg_id)
-
- emotions = {c: row[c] for c in emotion_cols}
- dominant = max(emotions, key=emotions.get)
-
- emotion_to_lengths.setdefault(dominant, []).append(length)
+ top_10_pct_n = max(1, int(total_authors * 0.1))
+ top_10_pct_share = round(author_counts.head(top_10_pct_n).sum() / total_comments, 4)
return {
- emotion: round(sum(lengths) / len(lengths), 2)
- for emotion, lengths in emotion_to_lengths.items()
- }
+ "total_commenting_authors": total_authors,
+ "top_10pct_author_count": top_10_pct_n,
+ "top_10pct_comment_share": float(top_10_pct_share),
+ "single_comment_authors": int((author_counts == 1).sum()),
+ "single_comment_author_ratio": float(round((author_counts == 1).sum() / total_authors, 4)),
+ }
\ No newline at end of file
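A standalone sketch of the concentration math in conversation_concentration: the top-10% bucket is max(1, int(total_authors * 0.1)) authors, and since value_counts() is already sorted descending, head(n) picks the heaviest commenters.

import pandas as pd

# Comment counts per author, descending (as value_counts() would return them).
author_counts = pd.Series([50, 30, 5, 5, 5] + [1] * 15)  # 20 authors, 110 comments

total_comments = int(author_counts.sum())
total_authors = len(author_counts)

top_n = max(1, int(total_authors * 0.1))            # 2 authors
share = author_counts.head(top_n).sum() / total_comments
print(top_n, round(share, 4))                       # 2 0.7273
print((author_counts == 1).sum() / total_authors)   # 0.75 single-comment author ratio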
diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py
index dc91faf..7546bbf 100644
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -61,3 +61,19 @@ class LinguisticAnalysis:
.head(limit)
.to_dict(orient="records")
)
+
+ def lexical_diversity(self, df: pd.DataFrame) -> dict:
+ tokens = (
+ df["content"].fillna("").astype(str).str.lower()
+ .str.findall(r"\b[a-z]{2,}\b")
+ .explode()
+ )
+ tokens = tokens[~tokens.isin(self.word_exclusions)]
+ total = int(len(tokens))
+ unique = int(tokens.nunique())
+
+ return {
+ "total_tokens": total,
+ "unique_tokens": unique,
+ "ttr": round(unique / max(total, 1), 4),
+ }
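A quick check of the type-token ratio lexical_diversity computes: lowercase tokens of two or more letters, exclusions dropped, unique count over total count.

import re

text = "The cat sat on the mat and the cat slept"
exclusions = {"the", "and", "on"}

tokens = [t for t in re.findall(r"\b[a-z]{2,}\b", text.lower()) if t not in exclusions]
# tokens == ['cat', 'sat', 'mat', 'cat', 'slept']
print(round(len(set(tokens)) / max(len(tokens), 1), 4))  # 0.8 (4 unique / 5 total)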
diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py
index a9e9289..4368841 100644
--- a/server/analysis/stat_gen.py
+++ b/server/analysis/stat_gen.py
@@ -6,7 +6,9 @@ from server.analysis.cultural import CulturalAnalysis
from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis
+from server.analysis.summary import SummaryAnalysis
from server.analysis.temporal import TemporalAnalysis
+from server.analysis.user import UserAnalysis
DOMAIN_STOPWORDS = {
"www",
@@ -36,12 +38,11 @@ class StatGen:
self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis()
+ self.summary_analysis = SummaryAnalysis()
+ self.user_analysis = UserAnalysis(EXCLUDE_WORDS)
## Private Methods
- def _prepare_filtered_df(self,
- df: pd.DataFrame,
- filters: dict | None = None
- ) -> pd.DataFrame:
+ def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame:
filters = filters or {}
filtered_df = df.copy()
@@ -51,10 +52,9 @@ class StatGen:
data_source_filter = filters.get("data_sources", None)
if search_query:
- mask = (
- filtered_df["content"].str.contains(search_query, case=False, na=False)
- | filtered_df["author"].str.contains(search_query, case=False, na=False)
- )
+ mask = filtered_df["content"].str.contains(
+ search_query, case=False, na=False
+ ) | filtered_df["author"].str.contains(search_query, case=False, na=False)
# Only include title if the column exists
if "title" in filtered_df.columns:
@@ -76,10 +76,10 @@ class StatGen:
return filtered_df
## Public Methods
- def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+ def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
return self._prepare_filtered_df(df, filters).to_dict(orient="records")
- def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+ def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
@@ -87,84 +87,54 @@ class StatGen:
"weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
}
- def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+ def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
"word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df),
"common_two_phrases": self.linguistic_analysis.ngrams(filtered_df),
"common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3),
- "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(
- filtered_df
- )
+ "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
}
- def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+ def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
- "top_users": self.interaction_analysis.top_users(filtered_df),
- "users": self.interaction_analysis.per_user_analysis(filtered_df),
- "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df)
+ "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(filtered_df),
+ "overall_emotion_average": self.emotional_analysis.overall_emotion_average(filtered_df),
+ "dominant_emotion_distribution": self.emotional_analysis.dominant_emotion_distribution(filtered_df),
+ "emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
}
- def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+ def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
- "average_thread_depth": self.interaction_analysis.average_thread_depth(
- filtered_df
- ),
- "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(
- filtered_df
- ),
+ "top_users": self.user_analysis.top_users(filtered_df),
+ "users": self.user_analysis.per_user_analysis(filtered_df)
}
- def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+ def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
return {
- "identity_markers": self.cultural_analysis.get_identity_markers(
- filtered_df
- ),
+ "average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df),
+ "top_interaction_pairs": self.interaction_analysis.top_interaction_pairs(filtered_df, top_n=100),
+ "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df),
+ "conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
+ }
+
+ def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
+ filtered_df = self._prepare_filtered_df(df, filters)
+
+ return {
+ "identity_markers": self.cultural_analysis.get_identity_markers(filtered_df),
"stance_markers": self.cultural_analysis.get_stance_markers(filtered_df),
- "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity(
- filtered_df
- ),
+ "avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
}
def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
filtered_df = self._prepare_filtered_df(df, filters)
- total_posts = (filtered_df["type"] == "post").sum()
- total_comments = (filtered_df["type"] == "comment").sum()
- events_per_user = filtered_df.groupby("author").size()
-
- if filtered_df.empty:
- return {
- "total_events": 0,
- "total_posts": 0,
- "total_comments": 0,
- "unique_users": 0,
- "comments_per_post": 0,
- "lurker_ratio": 0,
- "time_range": {
- "start": None,
- "end": None,
- },
- "sources": [],
- }
-
- return {
- "total_events": int(len(filtered_df)),
- "total_posts": int(total_posts),
- "total_comments": int(total_comments),
- "unique_users": int(events_per_user.count()),
- "comments_per_post": round(total_comments / max(total_posts, 1), 2),
- "lurker_ratio": round((events_per_user == 1).mean(), 2),
- "time_range": {
- "start": int(filtered_df["dt"].min().timestamp()),
- "end": int(filtered_df["dt"].max().timestamp()),
- },
- "sources": filtered_df["source"].dropna().unique().tolist(),
- }
+ return self.summary_analysis.summary(filtered_df)
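A minimal sketch of the search mask _prepare_filtered_df builds: case-insensitive substring matches over content and author, OR-ed together (title joins the mask only when that column exists).

import pandas as pd

df = pd.DataFrame(
    {
        "content": ["I love pandas", "hello world", None],
        "author": ["alice", "pandafan", "bob"],
    }
)
query = "panda"

# na=False keeps NaN content from matching.
mask = df["content"].str.contains(query, case=False, na=False) | df[
    "author"
].str.contains(query, case=False, na=False)
print(df[mask]["author"].tolist())  # ['alice', 'pandafan']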
diff --git a/server/analysis/summary.py b/server/analysis/summary.py
new file mode 100644
index 0000000..14cc8ca
--- /dev/null
+++ b/server/analysis/summary.py
@@ -0,0 +1,64 @@
+import pandas as pd
+
+
+class SummaryAnalysis:
+ def total_events(self, df: pd.DataFrame) -> int:
+ return int(len(df))
+
+ def total_posts(self, df: pd.DataFrame) -> int:
+ return int(len(df[df["type"] == "post"]))
+
+ def total_comments(self, df: pd.DataFrame) -> int:
+ return int(len(df[df["type"] == "comment"]))
+
+ def unique_users(self, df: pd.DataFrame) -> int:
+ return int(len(df["author"].dropna().unique()))
+
+ def comments_per_post(self, total_comments: int, total_posts: int) -> float:
+ return round(total_comments / max(total_posts, 1), 2)
+
+ def lurker_ratio(self, df: pd.DataFrame) -> float:
+ events_per_user = df.groupby("author").size()
+ return round((events_per_user == 1).mean(), 2)
+
+ def time_range(self, df: pd.DataFrame) -> dict:
+ return {
+ "start": int(df["dt"].min().timestamp()),
+ "end": int(df["dt"].max().timestamp()),
+ }
+
+ def sources(self, df: pd.DataFrame) -> list:
+ return df["source"].dropna().unique().tolist()
+
+ def empty_summary(self) -> dict:
+ return {
+ "total_events": 0,
+ "total_posts": 0,
+ "total_comments": 0,
+ "unique_users": 0,
+ "comments_per_post": 0,
+ "lurker_ratio": 0,
+ "time_range": {
+ "start": None,
+ "end": None,
+ },
+ "sources": [],
+ }
+
+ def summary(self, df: pd.DataFrame) -> dict:
+ if df.empty:
+ return self.empty_summary()
+
+ total_posts = self.total_posts(df)
+ total_comments = self.total_comments(df)
+
+ return {
+ "total_events": self.total_events(df),
+ "total_posts": total_posts,
+ "total_comments": total_comments,
+ "unique_users": self.unique_users(df),
+ "comments_per_post": self.comments_per_post(total_comments, total_posts),
+ "lurker_ratio": self.lurker_ratio(df),
+ "time_range": self.time_range(df),
+ "sources": self.sources(df),
+ }
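A small usage sketch of the extracted SummaryAnalysis on a toy frame; note that dt must already be datetime for time_range() to call .timestamp().

import pandas as pd

from server.analysis.summary import SummaryAnalysis

df = pd.DataFrame(
    {
        "type": ["post", "comment", "comment"],
        "author": ["alice", "bob", "alice"],
        "source": ["reddit", "reddit", "forum"],
        "dt": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03"]),
    }
)

print(SummaryAnalysis().summary(df))
# total_events=3, total_posts=1, total_comments=2, unique_users=2,
# comments_per_post=2.0, lurker_ratio=0.5 (only bob has a single event)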
diff --git a/server/analysis/user.py b/server/analysis/user.py
new file mode 100644
index 0000000..fc8e618
--- /dev/null
+++ b/server/analysis/user.py
@@ -0,0 +1,124 @@
+import pandas as pd
+import re
+
+from collections import Counter
+
+class UserAnalysis:
+ def __init__(self, word_exclusions: set[str]):
+ self.word_exclusions = word_exclusions
+
+ def _tokenize(self, text: str):
+ tokens = re.findall(r"\b[a-z]{3,}\b", text)
+ return [t for t in tokens if t not in self.word_exclusions]
+
+ def _vocab_richness_per_user(
+ self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100
+ ) -> list:
+ df = df.copy()
+ df["content"] = df["content"].fillna("").astype(str).str.lower()
+ df["tokens"] = df["content"].apply(self._tokenize)
+
+ rows = []
+ for author, group in df.groupby("author"):
+ all_tokens = [t for tokens in group["tokens"] for t in tokens]
+
+ total_words = len(all_tokens)
+ unique_words = len(set(all_tokens))
+ events = len(group)
+
+ # Minimum word count per author; fewer tokens than this can give unstable results
+ if total_words < min_words:
+ continue
+
+ # A ratio of 1.0 means the author never reused a word (stop words excluded)
+ vocab_richness = unique_words / total_words
+ avg_words = total_words / max(events, 1)
+
+ counts = Counter(all_tokens)
+ top_words = [
+ {"word": w, "count": int(c)}
+ for w, c in counts.most_common(top_most_used_words)
+ ]
+
+ rows.append(
+ {
+ "author": author,
+ "events": int(events),
+ "total_words": int(total_words),
+ "unique_words": int(unique_words),
+ "vocab_richness": round(vocab_richness, 3),
+ "avg_words_per_event": round(avg_words, 2),
+ "top_words": top_words,
+ }
+ )
+
+ rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
+
+ return rows
+
+ def top_users(self, df: pd.DataFrame) -> list:
+ counts = df.groupby(["author", "source"]).size().sort_values(ascending=False)
+
+ top_users = [
+ {"author": author, "source": source, "count": int(count)}
+ for (author, source), count in counts.items()
+ ]
+
+ return top_users
+
+ def per_user_analysis(self, df: pd.DataFrame) -> list[dict]:
+ per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
+
+ emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
+
+ avg_emotions_by_author = {}
+ if emotion_cols:
+ avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0)
+ avg_emotions_by_author = {
+ author: {emotion: float(score) for emotion, score in row.items()}
+ for author, row in avg_emotions.iterrows()
+ }
+
+ # ensure columns always exist
+ for col in ("post", "comment"):
+ if col not in per_user.columns:
+ per_user[col] = 0
+
+ per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
+ 0, 1
+ )
+ per_user["comment_share"] = per_user["comment"] / (
+ per_user["post"] + per_user["comment"]
+ ).replace(0, 1)
+ per_user = per_user.sort_values("comment_post_ratio", ascending=True)
+ per_user_records = per_user.reset_index().to_dict(orient="records")
+
+ vocab_rows = self._vocab_richness_per_user(df)
+ vocab_by_author = {row["author"]: row for row in vocab_rows}
+
+ # merge vocab richness + per_user information
+ merged_users = []
+ for row in per_user_records:
+ author = row["author"]
+ merged_users.append(
+ {
+ "author": author,
+ "post": int(row.get("post", 0)),
+ "comment": int(row.get("comment", 0)),
+ "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
+ "comment_share": float(row.get("comment_share", 0)),
+ "avg_emotions": avg_emotions_by_author.get(author, {}),
+ "vocab": vocab_by_author.get(
+ author,
+ {
+ "vocab_richness": 0,
+ "avg_words_per_event": 0,
+ "top_words": [],
+ },
+ ),
+ }
+ )
+
+ merged_users.sort(key=lambda u: u["comment_post_ratio"])
+
+ return merged_users
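A toy illustration of the vocab_richness metric above: unique tokens over total tokens for a single author after stop-word filtering, plus the Counter-based top words.

from collections import Counter

tokens = ["cat", "dog", "cat", "fish", "dog", "cat"]  # one author's filtered tokens

vocab_richness = len(set(tokens)) / len(tokens)
print(round(vocab_richness, 3))        # 0.5 -> half of all uses are first uses
print(Counter(tokens).most_common(2))  # [('cat', 3), ('dog', 2)]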
diff --git a/server/app.py b/server/app.py
index f373843..7a5dea0 100644
--- a/server/app.py
+++ b/server/app.py
@@ -186,7 +186,7 @@ def scrape_data():
dataset_manager.set_dataset_status(
dataset_id,
"fetching",
- f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
+ f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
)
fetch_and_process_dataset.delay(
@@ -198,12 +198,14 @@ def scrape_data():
print(traceback.format_exc())
return jsonify({"error": "Failed to queue dataset processing"}), 500
+ return jsonify(
+ {
+ "message": "Dataset queued for processing",
+ "dataset_id": dataset_id,
+ "status": "processing",
+ }
+ ), 202
- return jsonify({
- "message": "Dataset queued for processing",
- "dataset_id": dataset_id,
- "status": "processing"
- }), 202
@app.route("/datasets/upload", methods=["POST"])
@jwt_required()
@@ -233,7 +235,9 @@ def upload_data():
posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
topics = json.load(topic_file)
- dataset_id = dataset_manager.save_dataset_info(current_user, dataset_name, topics)
+ dataset_id = dataset_manager.save_dataset_info(
+ current_user, dataset_name, topics
+ )
process_dataset.delay(dataset_id, posts_df.to_dict(orient="records"), topics)
@@ -249,6 +253,7 @@ def upload_data():
except Exception as e:
return jsonify({"error": f"An unexpected error occurred"}), 500
+
@app.route("/dataset/", methods=["GET"])
@jwt_required()
def get_dataset(dataset_id):
@@ -256,7 +261,9 @@ def get_dataset(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_info = dataset_manager.get_dataset_info(dataset_id)
included_cols = {"id", "name", "created_at"}
@@ -269,7 +276,8 @@ def get_dataset(dataset_id):
except Exception:
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500
-
+
+
@app.route("/dataset/", methods=["PATCH"])
@jwt_required()
def update_dataset(dataset_id):
@@ -277,7 +285,9 @@ def update_dataset(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
body = request.get_json()
new_name = body.get("name")
@@ -286,7 +296,9 @@ def update_dataset(dataset_id):
return jsonify({"error": "A valid name must be provided"}), 400
dataset_manager.update_dataset_name(dataset_id, new_name.strip())
- return jsonify({"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}), 200
+ return jsonify(
+ {"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}
+ ), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -294,7 +306,8 @@ def update_dataset(dataset_id):
except Exception:
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occurred"}), 500
-
+
+
@app.route("/dataset/", methods=["DELETE"])
@jwt_required()
def delete_dataset(dataset_id):
@@ -302,11 +315,17 @@ def delete_dataset(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_manager.delete_dataset_info(dataset_id)
dataset_manager.delete_dataset_content(dataset_id)
- return jsonify({"message": f"Dataset {dataset_id} metadata and content successfully deleted"}), 200
+ return jsonify(
+ {
+ "message": f"Dataset {dataset_id} metadata and content successfully deleted"
+ }
+ ), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -315,6 +334,7 @@ def delete_dataset(dataset_id):
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500
+
@app.route("/dataset//status", methods=["GET"])
@jwt_required()
def get_dataset_status(dataset_id):
@@ -322,7 +342,9 @@ def get_dataset_status(dataset_id):
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_status = dataset_manager.get_dataset_status(dataset_id)
return jsonify(dataset_status), 200
@@ -334,17 +356,44 @@ def get_dataset_status(dataset_id):
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occured"}), 500
-@app.route("/dataset//content", methods=["GET"])
+
+@app.route("/dataset//linguistic", methods=["GET"])
@jwt_required()
-def content_endpoint(dataset_id):
+def get_linguistic_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
- return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200
+ return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
+ except NotAuthorisedException:
+ return jsonify({"error": "User is not authorised to access this content"}), 403
+ except NonExistentDatasetException:
+ return jsonify({"error": "Dataset does not exist"}), 404
+ except ValueError:
+ return jsonify({"error": "Malformed or missing data"}), 400
+ except Exception:
+ print(traceback.format_exc())
+ return jsonify({"error": "An unexpected error occurred"}), 500
+
+
+@app.route("/dataset//emotional", methods=["GET"])
+@jwt_required()
+def get_emotional_analysis(dataset_id):
+ try:
+ user_id = int(get_jwt_identity())
+ if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
+
+ dataset_content = dataset_manager.get_dataset_content(dataset_id)
+ filters = get_request_filters()
+ return jsonify(stat_gen.emotional(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -362,7 +411,9 @@ def get_summary(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
@@ -378,17 +429,19 @@ def get_summary(dataset_id):
return jsonify({"error": f"An unexpected error occurred"}), 500
-@app.route("/dataset//time", methods=["GET"])
+@app.route("/dataset//temporal", methods=["GET"])
@jwt_required()
-def get_time_analysis(dataset_id):
+def get_temporal_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
- return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200
+ return jsonify(stat_gen.temporal(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -406,11 +459,13 @@ def get_user_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
- return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200
+ return jsonify(stat_gen.user(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -428,11 +483,13 @@ def get_cultural_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
- return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200
+ return jsonify(stat_gen.cultural(dataset_content, filters)), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -444,17 +501,41 @@ def get_cultural_analysis(dataset_id):
return jsonify({"error": f"An unexpected error occurred"}), 500
-@app.route("/dataset//interaction", methods=["GET"])
+@app.route("/dataset//interactional", methods=["GET"])
@jwt_required()
def get_interaction_analysis(dataset_id):
try:
user_id = int(get_jwt_identity())
if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
- raise NotAuthorisedException("This user is not authorised to access this dataset")
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters()
- return jsonify(stat_gen.get_interactional_analysis(dataset_content, filters)), 200
+ return jsonify(stat_gen.interactional(dataset_content, filters)), 200
+ except NotAuthorisedException:
+ return jsonify({"error": "User is not authorised to access this content"}), 403
+ except NonExistentDatasetException:
+ return jsonify({"error": "Dataset does not exist"}), 404
+ except ValueError:
+ return jsonify({"error": "Malformed or missing data"}), 400
+ except Exception:
+ print(traceback.format_exc())
+ return jsonify({"error": "An unexpected error occurred"}), 500
+
+@app.route("/dataset//all", methods=["GET"])
+@jwt_required()
+def get_full_dataset(dataset_id: int):
+ try:
+ user_id = int(get_jwt_identity())
+ if not dataset_manager.authorize_user_dataset(dataset_id, user_id):
+ raise NotAuthorisedException(
+ "This user is not authorised to access this dataset"
+ )
+
+ dataset_content = dataset_manager.get_dataset_content(dataset_id)
+ return jsonify(dataset_content.to_dict(orient="records")), 200
except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException:
@@ -465,6 +546,5 @@ def get_interaction_analysis(dataset_id):
print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred"}), 500
-
if __name__ == "__main__":
app.run(debug=True)
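A hedged client-side sketch of the renamed analysis routes (base URL, dataset id, and the bearer token are placeholders; the paths match the axios calls in StatPage above):

import requests

BASE = "http://localhost:5000"               # assumed dev server address
headers = {"Authorization": "Bearer <jwt>"}  # placeholder token

sections = ("temporal", "user", "linguistic", "emotional", "interactional", "cultural", "summary")
for section in sections:
    resp = requests.get(f"{BASE}/dataset/1/{section}", headers=headers)
    print(section, resp.status_code)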
diff --git a/server/core/datasets.py b/server/core/datasets.py
index 4690454..a55445d 100644
--- a/server/core/datasets.py
+++ b/server/core/datasets.py
@@ -101,7 +101,7 @@ class DatasetManager:
row["source"],
row.get("topic"),
row.get("topic_confidence"),
- Json(row["ner_entities"]) if row.get("ner_entities") else None,
+ Json(row["entities"]) if row.get("entities") is not None else None,
row.get("emotion_anger"),
row.get("emotion_disgust"),
row.get("emotion_fear"),
diff --git a/server/db/schema.sql b/server/db/schema.sql
index 4550633..5bec116 100644
--- a/server/db/schema.sql
+++ b/server/db/schema.sql
@@ -43,7 +43,7 @@ CREATE TABLE events (
weekday VARCHAR(255) NOT NULL,
/* Posts Only */
- title VARCHAR(255),
+ title TEXT,
/* Comments Only*/
parent_id VARCHAR(255),