diff --git a/frontend/index.html b/frontend/index.html index 072a57e..8a54e69 100644 --- a/frontend/index.html +++ b/frontend/index.html @@ -2,7 +2,7 @@ - + frontend diff --git a/frontend/public/icon.png b/frontend/public/icon.png new file mode 100644 index 0000000..a9fc02b Binary files /dev/null and b/frontend/public/icon.png differ diff --git a/frontend/src/components/CulturalStats.tsx b/frontend/src/components/CulturalStats.tsx new file mode 100644 index 0000000..c46e0c3 --- /dev/null +++ b/frontend/src/components/CulturalStats.tsx @@ -0,0 +1,158 @@ +import Card from "./Card"; +import StatsStyling from "../styles/stats_styling"; +import type { CulturalAnalysisResponse } from "../types/ApiTypes"; + +const styles = StatsStyling; + +type CulturalStatsProps = { + data: CulturalAnalysisResponse; +}; + +const CulturalStats = ({ data }: CulturalStatsProps) => { + const identity = data.identity_markers; + const stance = data.stance_markers; + const inGroupWords = identity?.in_group_usage ?? 0; + const outGroupWords = identity?.out_group_usage ?? 0; + const totalGroupWords = inGroupWords + outGroupWords; + const inGroupWordRate = typeof identity?.in_group_ratio === "number" + ? identity.in_group_ratio * 100 + : null; + const outGroupWordRate = typeof identity?.out_group_ratio === "number" + ? identity.out_group_ratio * 100 + : null; + const rawEntities = data.avg_emotion_per_entity?.entity_emotion_avg ?? {}; + const entities = Object.entries(rawEntities) + .sort((a, b) => (b[1].post_count - a[1].post_count)) + .slice(0, 20); + + const topEmotion = (emotionAvg: Record | undefined) => { + const entries = Object.entries(emotionAvg ?? {}); + if (!entries.length) { + return "—"; + } + + entries.sort((a, b) => b[1] - a[1]); + const dominant = entries[0] ?? ["emotion_unknown", 0]; + const dominantLabel = dominant[0].replace("emotion_", ""); + return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`; + }; + + return ( +
+
+
+

Community Framing Overview

+

Simple view of how often people use "us" words vs "them" words, and the tone around that language.

+
+ + + + + + + + + + + + + + + + +
+

Mood in "Us" Posts

+

Most likely emotion when in-group wording is stronger.

+
{topEmotion(identity?.in_group_emotion_avg)}
+
+ +
+

Mood in "Them" Posts

+

Most likely emotion when out-group wording is stronger.

+
{topEmotion(identity?.out_group_emotion_avg)}
+
+ +
+

Entity Mood Snapshot

+

Most mentioned entities and the mood that appears most with each.

+ {!entities.length ? ( +
No entity-level cultural data available.
+ ) : ( +
+ {entities.map(([entity, aggregate]) => ( +
+
{entity}
+
+ {aggregate.post_count.toLocaleString()} posts • Likely mood: {topEmotion(aggregate.emotion_avg)} +
+
+ ))} +
+ )} +
+
+
+ ); +}; + +export default CulturalStats; diff --git a/frontend/src/components/EmotionalStats.tsx b/frontend/src/components/EmotionalStats.tsx index ecc588f..a0a66f3 100644 --- a/frontend/src/components/EmotionalStats.tsx +++ b/frontend/src/components/EmotionalStats.tsx @@ -9,6 +9,9 @@ type EmotionalStatsProps = { const EmotionalStats = ({contentData}: EmotionalStatsProps) => { const rows = contentData.average_emotion_by_topic ?? []; + const overallEmotionAverage = contentData.overall_emotion_average ?? []; + const dominantEmotionDistribution = contentData.dominant_emotion_distribution ?? []; + const emotionBySource = contentData.emotion_by_source ?? []; const lowSampleThreshold = 20; const stableSampleThreshold = 50; const emotionKeys = rows.length @@ -64,39 +67,104 @@ const EmotionalStats = ({contentData}: EmotionalStatsProps) => { return (
-

Average Emotion by Topic

-

Read confidence together with sample size. Topics with fewer than {lowSampleThreshold} events are usually noisy and less reliable.

+

Topic Mood Overview

+

Use the strength score together with post count. Topics with fewer than {lowSampleThreshold} events are often noisy.

Topics: {strongestPerTopic.length} - Median Sample: {medianSampleSize} events - Low Sample (<{lowSampleThreshold}): {lowSampleTopics} - Stable Sample ({stableSampleThreshold}+): {stableSampleTopics} + Median Posts: {medianSampleSize} + Small Topics (<{lowSampleThreshold}): {lowSampleTopics} + Stable Topics ({stableSampleThreshold}+): {stableSampleTopics}

- Confidence reflects how strongly one emotion leads within a topic, not model accuracy. Use larger samples for stronger conclusions. + Strength means how far the top emotion is ahead in that topic. It does not mean model accuracy.

- {strongestPerTopic.map((topic) => ( -
-

{topic.topic}

-
- Top Emotion +
+

Mood Averages

+

Average score for each emotion.

+ {!overallEmotionAverage.length ? ( +
No overall emotion averages available.
+ ) : ( +
+ {[...overallEmotionAverage] + .sort((a, b) => b.score - a.score) + .map((row) => ( +
+
{formatEmotion(row.emotion)}
+
{row.score.toFixed(3)}
+
+ ))}
-
- {formatEmotion(topic.emotion)} + )} +
+ +
+

Mood Split

+

How often each emotion is dominant.

+ {!dominantEmotionDistribution.length ? ( +
No dominant-emotion split available.
+ ) : ( +
+ {[...dominantEmotionDistribution] + .sort((a, b) => b.ratio - a.ratio) + .map((row) => ( +
+
{formatEmotion(row.emotion)}
+
{(row.ratio * 100).toFixed(1)}% • {row.count.toLocaleString()} events
+
+ ))}
-
- Confidence - {topic.value.toFixed(3)} -
-
- Sample Size - {topic.count} events + )} +
+ +
+

Mood by Source

+

Leading emotion in each source.

+ {!emotionBySource.length ? ( +
No source emotion profile available.
+ ) : ( +
+ {[...emotionBySource] + .sort((a, b) => b.event_count - a.event_count) + .map((row) => ( +
+
{row.source}
+
+ {formatEmotion(row.dominant_emotion)} • {row.dominant_score.toFixed(3)} • {row.event_count.toLocaleString()} events +
+
+ ))}
+ )} +
+ +
+

Topic Snapshots

+

Per-topic mood with strength and post count.

+
+ {strongestPerTopic.map((topic) => ( +
+

{topic.topic}

+
+ Likely Mood +
+
+ {formatEmotion(topic.emotion)} +
+
+ Strength + {topic.value.toFixed(3)} +
+
+ Posts in Topic + {topic.count} +
+
+ ))}
- ))} +
); diff --git a/frontend/src/components/InteractionalStats.tsx b/frontend/src/components/InteractionalStats.tsx new file mode 100644 index 0000000..11ab3a2 --- /dev/null +++ b/frontend/src/components/InteractionalStats.tsx @@ -0,0 +1,208 @@ +import Card from "./Card"; +import StatsStyling from "../styles/stats_styling"; +import type { InteractionAnalysisResponse } from "../types/ApiTypes"; +import { + ResponsiveContainer, + BarChart, + Bar, + XAxis, + YAxis, + CartesianGrid, + Tooltip, + PieChart, + Pie, + Cell, + Legend, +} from "recharts"; + +const styles = StatsStyling; + +type InteractionalStatsProps = { + data: InteractionAnalysisResponse; +}; + +const InteractionalStats = ({ data }: InteractionalStatsProps) => { + const graph = data.interaction_graph ?? {}; + const userCount = Object.keys(graph).length; + const edges = Object.values(graph).flatMap((targets) => Object.values(targets)); + const edgeCount = edges.length; + const interactionVolume = edges.reduce((sum, value) => sum + value, 0); + const concentration = data.conversation_concentration; + const topTenCommentShare = typeof concentration?.top_10pct_comment_share === "number" + ? concentration?.top_10pct_comment_share + : null; + const topTenAuthorCount = typeof concentration?.top_10pct_author_count === "number" + ? concentration.top_10pct_author_count + : null; + const totalCommentingAuthors = typeof concentration?.total_commenting_authors === "number" + ? concentration.total_commenting_authors + : null; + const singleCommentAuthorRatio = typeof concentration?.single_comment_author_ratio === "number" + ? concentration.single_comment_author_ratio + : null; + const singleCommentAuthors = typeof concentration?.single_comment_authors === "number" + ? concentration.single_comment_authors + : null; + + const topPairs = (data.top_interaction_pairs ?? []) + .filter((item): item is [[string, string], number] => { + if (!Array.isArray(item) || item.length !== 2) { + return false; + } + + const pair = item[0]; + const count = item[1]; + + return Array.isArray(pair) + && pair.length === 2 + && typeof pair[0] === "string" + && typeof pair[1] === "string" + && typeof count === "number"; + }) + .slice(0, 20); + + const topPairChartData = topPairs.slice(0, 8).map(([[source, target], value], index) => ({ + pair: `${source} -> ${target}`, + replies: value, + rank: index + 1, + })); + + const topTenSharePercent = topTenCommentShare === null + ? null + : topTenCommentShare * 100; + const nonTopTenSharePercent = topTenSharePercent === null + ? null + : Math.max(0, 100 - topTenSharePercent); + + let concentrationPieData: { name: string; value: number }[] = []; + if (topTenSharePercent !== null && nonTopTenSharePercent !== null) { + concentrationPieData = [ + { name: "Top 10% authors", value: topTenSharePercent }, + { name: "Other authors", value: nonTopTenSharePercent }, + ]; + } + + const PIE_COLORS = ["#2b6777", "#c8d8e4"]; + + return ( +
+
+
+

Conversation Overview

+

Who talks to who, and how concentrated the replies are.

+
+ + + + + + + + +
+

Conversation Visuals

+

Main reply links and concentration split.

+ +
+
+

Top Interaction Pairs

+
+ + + + + `#${value}`} + width={36} + /> + + + + +
+
+ +
+

Top 10% vs Other Comment Share

+
+ + + + {concentrationPieData.map((entry, index) => ( + + ))} + + + + + +
+
+
+
+ +
+

Frequent Reply Paths

+

Most common user-to-user reply paths.

+ {!topPairs.length ? ( +
No interaction pair data available.
+ ) : ( +
+ {topPairs.map(([[source, target], value], index) => ( +
${target}-${index}`} style={styles.topUserItem}> +
{source} -> {target}
+
{value.toLocaleString()} replies
+
+ ))} +
+ )} +
+
+
+ ); +}; + +export default InteractionalStats; diff --git a/frontend/src/components/LinguisticStats.tsx b/frontend/src/components/LinguisticStats.tsx new file mode 100644 index 0000000..34fdafd --- /dev/null +++ b/frontend/src/components/LinguisticStats.tsx @@ -0,0 +1,91 @@ +import Card from "./Card"; +import StatsStyling from "../styles/stats_styling"; +import type { LinguisticAnalysisResponse } from "../types/ApiTypes"; + +const styles = StatsStyling; + +type LinguisticStatsProps = { + data: LinguisticAnalysisResponse; +}; + +const LinguisticStats = ({ data }: LinguisticStatsProps) => { + const lexical = data.lexical_diversity; + const words = data.word_frequencies ?? []; + const bigrams = data.common_two_phrases ?? []; + const trigrams = data.common_three_phrases ?? []; + + const topWords = words.slice(0, 20); + const topBigrams = bigrams.slice(0, 10); + const topTrigrams = trigrams.slice(0, 10); + + return ( +
+
+
+

Language Overview

+

Quick read on how broad and repetitive the wording is.

+
+ + + + + +
+

Top Words

+

Most used single words.

+
+ {topWords.map((item) => ( +
+
{item.word}
+
{item.count.toLocaleString()} uses
+
+ ))} +
+
+ +
+

Top Bigrams

+

Most used 2-word phrases.

+
+ {topBigrams.map((item) => ( +
+
{item.ngram}
+
{item.count.toLocaleString()} uses
+
+ ))} +
+
+ +
+

Top Trigrams

+

Most used 3-word phrases.

+
+ {topTrigrams.map((item) => ( +
+
{item.ngram}
+
{item.count.toLocaleString()} uses
+
+ ))} +
+
+
+
+ ); +}; + +export default LinguisticStats; diff --git a/frontend/src/components/SummaryStats.tsx b/frontend/src/components/SummaryStats.tsx index 0ff46ac..98f54ad 100644 --- a/frontend/src/components/SummaryStats.tsx +++ b/frontend/src/components/SummaryStats.tsx @@ -58,15 +58,13 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr const [selectedUser, setSelectedUser] = useState(null); const selectedUserData: User | null = userData?.users.find((u) => u.author === selectedUser) ?? null; - console.log(summary) - return (
{/* main grid*/}
-

Events per Day

-

Trend of activity over time

+

Activity Over Time

+

How much posting happened each day.

- new Date(d.date) >= new Date('2026-01-10'))}> + @@ -154,8 +152,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr {/* Word Cloud */}
-

Word Cloud

-

Most common terms across events

+

Common Words

+

Frequently used words across the dataset.

-

Top Users

-

Most active authors

+

Most Active Users

+

Who posted the most events.

{userData?.top_users.slice(0, 100).map((item) => ( @@ -195,8 +193,8 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr {/* Heatmap */}
-

Heatmap

-

Activity density across time

+

Weekly Activity Pattern

+

When activity tends to happen by weekday and hour.

@@ -214,4 +212,4 @@ const SummaryStats = ({userData, timeData, contentData, summary}: SummaryStatsPr ); } -export default SummaryStats; \ No newline at end of file +export default SummaryStats; diff --git a/frontend/src/components/UserModal.tsx b/frontend/src/components/UserModal.tsx index 54ee5fc..682b730 100644 --- a/frontend/src/components/UserModal.tsx +++ b/frontend/src/components/UserModal.tsx @@ -12,6 +12,9 @@ type Props = { }; export default function UserModal({ open, onClose, userData, username }: Props) { + const dominantEmotionEntry = Object.entries(userData?.avg_emotions ?? {}) + .sort((a, b) => b[1] - a[1])[0]; + return (
@@ -66,6 +69,15 @@ export default function UserModal({ open, onClose, userData, username }: Props)
) : null} + + {dominantEmotionEntry ? ( +
+
Dominant Avg Emotion
+
+ {dominantEmotionEntry[0].replace("emotion_", "")} ({dominantEmotionEntry[1].toFixed(3)}) +
+
+ ) : null}
)} diff --git a/frontend/src/components/UserStats.tsx b/frontend/src/components/UserStats.tsx index bb060cc..b467998 100644 --- a/frontend/src/components/UserStats.tsx +++ b/frontend/src/components/UserStats.tsx @@ -87,15 +87,15 @@ const UserStats = (props: { data: UserAnalysisResponse }) => { style={{ gridColumn: "span 3" }} /> { /> ${strongestLink.target}` : "—"} - sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} interactions` : "No graph edges after filtering"} + sublabel={strongestLink ? `${strongestLink.value.toLocaleString()} replies` : "No graph links after filtering"} style={{ gridColumn: "span 6" }} /> {

User Interaction Graph

- Nodes represent users and links represent conversation interactions. + Each node is a user, and each link shows replies between them.

{

Select sources and scrape settings, then queue processing automatically.

+

+ Warning: Scraping more than 250 posts from any single site can take hours due to rate limits. +

-
+
+ + +
{activeView === "summary" && ( @@ -243,6 +360,36 @@ return ( )} + {activeView === "linguistic" && linguisticData && ( + + )} + + {activeView === "linguistic" && !linguisticData && ( +
+ No linguistic data available. +
+ )} + + {activeView === "interactional" && interactionData && ( + + )} + + {activeView === "interactional" && !interactionData && ( +
+ No interactional data available. +
+ )} + + {activeView === "cultural" && culturalData && ( + + )} + + {activeView === "cultural" && !culturalData && ( +
+ No cultural data available. +
+ )} +
); } diff --git a/frontend/src/types/ApiTypes.ts b/frontend/src/types/ApiTypes.ts index 5feaddf..7a0b521 100644 --- a/frontend/src/types/ApiTypes.ts +++ b/frontend/src/types/ApiTypes.ts @@ -1,14 +1,28 @@ -// User Responses -type TopUser = { - author: string; - source: string; - count: number +// Shared types +type FrequencyWord = { + word: string; + count: number; }; -type FrequencyWord = { - word: string; - count: number; -} +type NGram = { + count: number; + ngram: string; +}; + +type Emotion = { + emotion_anger: number; + emotion_disgust: number; + emotion_fear: number; + emotion_joy: number; + emotion_sadness: number; +}; + +// User +type TopUser = { + author: string; + source: string; + count: number; +}; type Vocab = { author: string; @@ -26,62 +40,147 @@ type User = { comment: number; comment_post_ratio: number; comment_share: number; + avg_emotions?: Record; vocab?: Vocab | null; }; type InteractionGraph = Record>; +type UserEndpointResponse = { + top_users: TopUser[]; + users: User[]; +}; + type UserAnalysisResponse = { top_users: TopUser[]; users: User[]; interaction_graph: InteractionGraph; }; -// Time Analysis +// Time type EventsPerDay = { - date: Date; - count: number; -} - -type HeatmapCell = { - date: Date; - hour: number; - count: number; -} - -type TimeAnalysisResponse = { - events_per_day: EventsPerDay[]; - weekday_hour_heatmap: HeatmapCell[]; -} - -// Content Analysis -type Emotion = { - emotion_anger: number; - emotion_disgust: number; - emotion_fear: number; - emotion_joy: number; - emotion_sadness: number; + date: Date; + count: number; }; -type NGram = { - count: number; - ngram: string; -} +type HeatmapCell = { + date: Date; + hour: number; + count: number; +}; +type TimeAnalysisResponse = { + events_per_day: EventsPerDay[]; + weekday_hour_heatmap: HeatmapCell[]; +}; + +// Content (combines emotional and linguistic) type AverageEmotionByTopic = Emotion & { n: number; topic: string; + [key: string]: string | number; }; +type OverallEmotionAverage = { + emotion: string; + score: number; +}; + +type DominantEmotionDistribution = { + emotion: string; + count: number; + ratio: number; +}; + +type EmotionBySource = { + source: string; + dominant_emotion: string; + dominant_score: number; + event_count: number; +}; type ContentAnalysisResponse = { - word_frequencies: FrequencyWord[]; - average_emotion_by_topic: AverageEmotionByTopic[]; - common_three_phrases: NGram[]; - common_two_phrases: NGram[]; -} + word_frequencies: FrequencyWord[]; + average_emotion_by_topic: AverageEmotionByTopic[]; + common_three_phrases: NGram[]; + common_two_phrases: NGram[]; + overall_emotion_average?: OverallEmotionAverage[]; + dominant_emotion_distribution?: DominantEmotionDistribution[]; + emotion_by_source?: EmotionBySource[]; +}; -// Summary +// Linguistic +type LinguisticAnalysisResponse = { + word_frequencies: FrequencyWord[]; + common_two_phrases: NGram[]; + common_three_phrases: NGram[]; + lexical_diversity?: Record; +}; + +// Emotional +type EmotionalAnalysisResponse = { + average_emotion_by_topic: AverageEmotionByTopic[]; + overall_emotion_average?: OverallEmotionAverage[]; + dominant_emotion_distribution?: DominantEmotionDistribution[]; + emotion_by_source?: EmotionBySource[]; +}; + +// Interactional +type ConversationConcentration = { + total_commenting_authors: number; + top_10pct_author_count: number; + top_10pct_comment_share: number; + single_comment_authors: number; + single_comment_author_ratio: number; +}; + +type InteractionAnalysisResponse = { + average_thread_depth?: number; + top_interaction_pairs?: [[string, string], number][]; + conversation_concentration?: ConversationConcentration; + interaction_graph: InteractionGraph; +}; + +// Cultural +type IdentityMarkers = { + in_group_usage: number; + out_group_usage: number; + in_group_ratio: number; + out_group_ratio: number; + in_group_posts: number; + out_group_posts: number; + tie_posts: number; + in_group_emotion_avg?: Record; + out_group_emotion_avg?: Record; +}; + +type StanceMarkers = { + hedge_total: number; + certainty_total: number; + deontic_total: number; + permission_total: number; + hedge_per_1k_tokens: number; + certainty_per_1k_tokens: number; + deontic_per_1k_tokens: number; + permission_per_1k_tokens: number; +}; + +type EntityEmotionAggregate = { + post_count: number; + emotion_avg: Record; +}; + +type AverageEmotionPerEntity = { + entity_emotion_avg: Record; +}; + +type CulturalAnalysisResponse = { + identity_markers?: IdentityMarkers; + stance_markers?: StanceMarkers; + avg_emotion_per_entity?: AverageEmotionPerEntity; +}; + +// Summary type SummaryResponse = { total_events: number; total_posts: number; @@ -96,22 +195,35 @@ type SummaryResponse = { sources: string[]; }; -// Filtering Response +// Filter type FilterResponse = { - rows: number - data: any; -} + rows: number; + data: any; +}; export type { - TopUser, - Vocab, - User, - InteractionGraph, - UserAnalysisResponse, - FrequencyWord, - AverageEmotionByTopic, - SummaryResponse, - TimeAnalysisResponse, - ContentAnalysisResponse, - FilterResponse -} + TopUser, + Vocab, + User, + InteractionGraph, + ConversationConcentration, + UserAnalysisResponse, + UserEndpointResponse, + FrequencyWord, + AverageEmotionByTopic, + OverallEmotionAverage, + DominantEmotionDistribution, + EmotionBySource, + SummaryResponse, + TimeAnalysisResponse, + ContentAnalysisResponse, + LinguisticAnalysisResponse, + EmotionalAnalysisResponse, + InteractionAnalysisResponse, + IdentityMarkers, + StanceMarkers, + EntityEmotionAggregate, + AverageEmotionPerEntity, + CulturalAnalysisResponse, + FilterResponse, +}; diff --git a/server/analysis/emotional.py b/server/analysis/emotional.py index 150aa20..8f78809 100644 --- a/server/analysis/emotional.py +++ b/server/analysis/emotional.py @@ -1,33 +1,86 @@ import pandas as pd + class EmotionalAnalysis: - def avg_emotion_by_topic(self, df: pd.DataFrame) -> dict: - emotion_cols = [ - col for col in df.columns - if col.startswith("emotion_") - ] + def _emotion_cols(self, df: pd.DataFrame) -> list[str]: + return [col for col in df.columns if col.startswith("emotion_")] + + def avg_emotion_by_topic(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols: + return [] counts = ( - df[ - (df["topic"] != "Misc") - ] - .groupby("topic") - .size() - .rename("n") + df[(df["topic"] != "Misc")].groupby("topic").size().reset_index(name="n") ) avg_emotion_by_topic = ( - df[ - (df["topic"] != "Misc") - ] + df[(df["topic"] != "Misc")] .groupby("topic")[emotion_cols] .mean() .reset_index() ) - avg_emotion_by_topic = avg_emotion_by_topic.merge( - counts, - on="topic" - ) + avg_emotion_by_topic = avg_emotion_by_topic.merge(counts, on="topic") - return avg_emotion_by_topic.to_dict(orient='records') \ No newline at end of file + return avg_emotion_by_topic.to_dict(orient="records") + + def overall_emotion_average(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols: + return [] + + means = df[emotion_cols].mean() + return [ + { + "emotion": col.replace("emotion_", ""), + "score": float(means[col]), + } + for col in emotion_cols + ] + + def dominant_emotion_distribution(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols or df.empty: + return [] + + dominant_per_row = df[emotion_cols].idxmax(axis=1) + counts = dominant_per_row.value_counts() + total = max(len(dominant_per_row), 1) + + return [ + { + "emotion": col.replace("emotion_", ""), + "count": int(count), + "ratio": round(float(count / total), 4), + } + for col, count in counts.items() + ] + + def emotion_by_source(self, df: pd.DataFrame) -> list[dict]: + emotion_cols = self._emotion_cols(df) + + if not emotion_cols or "source" not in df.columns or df.empty: + return [] + + source_counts = df.groupby("source").size() + source_means = df.groupby("source")[emotion_cols].mean().reset_index() + rows = source_means.to_dict(orient="records") + output = [] + + for row in rows: + source = row["source"] + dominant_col = max(emotion_cols, key=lambda col: float(row.get(col, 0))) + output.append( + { + "source": str(source), + "dominant_emotion": dominant_col.replace("emotion_", ""), + "dominant_score": round(float(row.get(dominant_col, 0)), 4), + "event_count": int(source_counts.get(source, 0)), + } + ) + + return output diff --git a/server/analysis/interactional.py b/server/analysis/interactional.py index 864980d..e15940e 100644 --- a/server/analysis/interactional.py +++ b/server/analysis/interactional.py @@ -1,9 +1,6 @@ import pandas as pd import re -from collections import Counter - - class InteractionAnalysis: def __init__(self, word_exclusions: set[str]): self.word_exclusions = word_exclusions @@ -12,118 +9,6 @@ class InteractionAnalysis: tokens = re.findall(r"\b[a-z]{3,}\b", text) return [t for t in tokens if t not in self.word_exclusions] - def _vocab_richness_per_user( - self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 - ) -> list: - df = df.copy() - df["content"] = df["content"].fillna("").astype(str).str.lower() - df["tokens"] = df["content"].apply(self._tokenize) - - rows = [] - for author, group in df.groupby("author"): - all_tokens = [t for tokens in group["tokens"] for t in tokens] - - total_words = len(all_tokens) - unique_words = len(set(all_tokens)) - events = len(group) - - # Min amount of words for a user, any less than this might give weird results - if total_words < min_words: - continue - - # 100% = they never reused a word (excluding stop words) - vocab_richness = unique_words / total_words - avg_words = total_words / max(events, 1) - - counts = Counter(all_tokens) - top_words = [ - {"word": w, "count": int(c)} - for w, c in counts.most_common(top_most_used_words) - ] - - rows.append( - { - "author": author, - "events": int(events), - "total_words": int(total_words), - "unique_words": int(unique_words), - "vocab_richness": round(vocab_richness, 3), - "avg_words_per_event": round(avg_words, 2), - "top_words": top_words, - } - ) - - rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) - - return rows - - def top_users(self, df: pd.DataFrame) -> list: - counts = df.groupby(["author", "source"]).size().sort_values(ascending=False) - - top_users = [ - {"author": author, "source": source, "count": int(count)} - for (author, source), count in counts.items() - ] - - return top_users - - def per_user_analysis(self, df: pd.DataFrame) -> dict: - per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0) - - emotion_cols = [col for col in df.columns if col.startswith("emotion_")] - - avg_emotions_by_author = {} - if emotion_cols: - avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0) - avg_emotions_by_author = { - author: {emotion: float(score) for emotion, score in row.items()} - for author, row in avg_emotions.iterrows() - } - - # ensure columns always exist - for col in ("post", "comment"): - if col not in per_user.columns: - per_user[col] = 0 - - per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace( - 0, 1 - ) - per_user["comment_share"] = per_user["comment"] / ( - per_user["post"] + per_user["comment"] - ).replace(0, 1) - per_user = per_user.sort_values("comment_post_ratio", ascending=True) - per_user_records = per_user.reset_index().to_dict(orient="records") - - vocab_rows = self._vocab_richness_per_user(df) - vocab_by_author = {row["author"]: row for row in vocab_rows} - - # merge vocab richness + per_user information - merged_users = [] - for row in per_user_records: - author = row["author"] - merged_users.append( - { - "author": author, - "post": int(row.get("post", 0)), - "comment": int(row.get("comment", 0)), - "comment_post_ratio": float(row.get("comment_post_ratio", 0)), - "comment_share": float(row.get("comment_share", 0)), - "avg_emotions": avg_emotions_by_author.get(author, {}), - "vocab": vocab_by_author.get( - author, - { - "vocab_richness": 0, - "avg_words_per_event": 0, - "top_words": [], - }, - ), - } - ) - - merged_users.sort(key=lambda u: u["comment_post_ratio"]) - - return merged_users - def interaction_graph(self, df: pd.DataFrame): interactions = {a: {} for a in df["author"].dropna().unique()} @@ -166,68 +51,37 @@ class InteractionAnalysis: return 0 return round(sum(depths) / len(depths), 2) + + def top_interaction_pairs(self, df: pd.DataFrame, top_n=10): + graph = self.interaction_graph(df) + pairs = [] - def average_thread_length_by_emotion(self, df: pd.DataFrame): - emotion_exclusions = {"emotion_neutral", "emotion_surprise"} + for a, targets in graph.items(): + for b, count in targets.items(): + pairs.append(((a, b), count)) - emotion_cols = [ - c - for c in df.columns - if c.startswith("emotion_") and c not in emotion_exclusions - ] + pairs.sort(key=lambda x: x[1], reverse=True) + return pairs[:top_n] + + def conversation_concentration(self, df: pd.DataFrame) -> dict: + if "type" not in df.columns: + return {} - id_to_reply = df.set_index("id")["reply_to"].to_dict() - length_cache = {} + comments = df[df["type"] == "comment"] + if comments.empty: + return {} - def thread_length_from(start_id): - if start_id in length_cache: - return length_cache[start_id] + author_counts = comments["author"].value_counts() + total_comments = len(comments) + total_authors = len(author_counts) - seen = set() - length = 1 - current = start_id - - while True: - if current in seen: - # infinite loop shouldn't happen, but just in case - break - seen.add(current) - - reply_to = id_to_reply.get(current) - - if ( - reply_to is None - or (isinstance(reply_to, float) and pd.isna(reply_to)) - or reply_to == "" - ): - break - - length += 1 - current = reply_to - - if current in length_cache: - length += length_cache[current] - 1 - break - - length_cache[start_id] = length - return length - - emotion_to_lengths = {} - - # Fill NaNs in emotion cols to avoid max() issues - emo_df = df[["id"] + emotion_cols].copy() - emo_df[emotion_cols] = emo_df[emotion_cols].fillna(0) - - for _, row in emo_df.iterrows(): - msg_id = row["id"] - length = thread_length_from(msg_id) - - emotions = {c: row[c] for c in emotion_cols} - dominant = max(emotions, key=emotions.get) - - emotion_to_lengths.setdefault(dominant, []).append(length) + top_10_pct_n = max(1, int(total_authors * 0.1)) + top_10_pct_share = round(author_counts.head(top_10_pct_n).sum() / total_comments, 4) return { - emotion: round(sum(lengths) / len(lengths), 2) - for emotion, lengths in emotion_to_lengths.items() - } + "total_commenting_authors": total_authors, + "top_10pct_author_count": top_10_pct_n, + "top_10pct_comment_share": float(top_10_pct_share), + "single_comment_authors": int((author_counts == 1).sum()), + "single_comment_author_ratio": float(round((author_counts == 1).sum() / total_authors, 4)), + } \ No newline at end of file diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py index dc91faf..7546bbf 100644 --- a/server/analysis/linguistic.py +++ b/server/analysis/linguistic.py @@ -61,3 +61,19 @@ class LinguisticAnalysis: .head(limit) .to_dict(orient="records") ) + + def lexical_diversity(self, df: pd.DataFrame) -> dict: + tokens = ( + df["content"].fillna("").astype(str).str.lower() + .str.findall(r"\b[a-z]{2,}\b") + .explode() + ) + tokens = tokens[~tokens.isin(self.word_exclusions)] + total = max(len(tokens), 1) + unique = int(tokens.nunique()) + + return { + "total_tokens": total, + "unique_tokens": unique, + "ttr": round(unique / total, 4), + } diff --git a/server/analysis/stat_gen.py b/server/analysis/stat_gen.py index a9e9289..4368841 100644 --- a/server/analysis/stat_gen.py +++ b/server/analysis/stat_gen.py @@ -6,7 +6,9 @@ from server.analysis.cultural import CulturalAnalysis from server.analysis.emotional import EmotionalAnalysis from server.analysis.interactional import InteractionAnalysis from server.analysis.linguistic import LinguisticAnalysis +from server.analysis.summary import SummaryAnalysis from server.analysis.temporal import TemporalAnalysis +from server.analysis.user import UserAnalysis DOMAIN_STOPWORDS = { "www", @@ -36,12 +38,11 @@ class StatGen: self.interaction_analysis = InteractionAnalysis(EXCLUDE_WORDS) self.linguistic_analysis = LinguisticAnalysis(EXCLUDE_WORDS) self.cultural_analysis = CulturalAnalysis() + self.summary_analysis = SummaryAnalysis() + self.user_analysis = UserAnalysis(EXCLUDE_WORDS) ## Private Methods - def _prepare_filtered_df(self, - df: pd.DataFrame, - filters: dict | None = None - ) -> pd.DataFrame: + def _prepare_filtered_df(self, df: pd.DataFrame, filters: dict | None = None) -> pd.DataFrame: filters = filters or {} filtered_df = df.copy() @@ -51,10 +52,9 @@ class StatGen: data_source_filter = filters.get("data_sources", None) if search_query: - mask = ( - filtered_df["content"].str.contains(search_query, case=False, na=False) - | filtered_df["author"].str.contains(search_query, case=False, na=False) - ) + mask = filtered_df["content"].str.contains( + search_query, case=False, na=False + ) | filtered_df["author"].str.contains(search_query, case=False, na=False) # Only include title if the column exists if "title" in filtered_df.columns: @@ -76,10 +76,10 @@ class StatGen: return filtered_df ## Public Methods - def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]: return self._prepare_filtered_df(df, filters).to_dict(orient="records") - def get_time_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { @@ -87,84 +87,54 @@ class StatGen: "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df), } - def get_content_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { "word_frequencies": self.linguistic_analysis.word_frequencies(filtered_df), "common_two_phrases": self.linguistic_analysis.ngrams(filtered_df), "common_three_phrases": self.linguistic_analysis.ngrams(filtered_df, n=3), - "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic( - filtered_df - ) + "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df) } - def get_user_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { - "top_users": self.interaction_analysis.top_users(filtered_df), - "users": self.interaction_analysis.per_user_analysis(filtered_df), - "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df) + "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(filtered_df), + "overall_emotion_average": self.emotional_analysis.overall_emotion_average(filtered_df), + "dominant_emotion_distribution": self.emotional_analysis.dominant_emotion_distribution(filtered_df), + "emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df) } - def get_interactional_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { - "average_thread_depth": self.interaction_analysis.average_thread_depth( - filtered_df - ), - "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion( - filtered_df - ), + "top_users": self.user_analysis.top_users(filtered_df), + "users": self.user_analysis.per_user_analysis(filtered_df) } - def get_cultural_analysis(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) return { - "identity_markers": self.cultural_analysis.get_identity_markers( - filtered_df - ), + "average_thread_depth": self.interaction_analysis.average_thread_depth(filtered_df), + "top_interaction_pairs": self.interaction_analysis.top_interaction_pairs(filtered_df, top_n=100), + "interaction_graph": self.interaction_analysis.interaction_graph(filtered_df), + "conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df) + } + + def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict: + filtered_df = self._prepare_filtered_df(df, filters) + + return { + "identity_markers": self.cultural_analysis.get_identity_markers(filtered_df), "stance_markers": self.cultural_analysis.get_stance_markers(filtered_df), - "entity_salience": self.cultural_analysis.get_avg_emotions_per_entity( - filtered_df - ), + "avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df) } def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict: filtered_df = self._prepare_filtered_df(df, filters) - total_posts = (filtered_df["type"] == "post").sum() - total_comments = (filtered_df["type"] == "comment").sum() - events_per_user = filtered_df.groupby("author").size() - - if filtered_df.empty: - return { - "total_events": 0, - "total_posts": 0, - "total_comments": 0, - "unique_users": 0, - "comments_per_post": 0, - "lurker_ratio": 0, - "time_range": { - "start": None, - "end": None, - }, - "sources": [], - } - - return { - "total_events": int(len(filtered_df)), - "total_posts": int(total_posts), - "total_comments": int(total_comments), - "unique_users": int(events_per_user.count()), - "comments_per_post": round(total_comments / max(total_posts, 1), 2), - "lurker_ratio": round((events_per_user == 1).mean(), 2), - "time_range": { - "start": int(filtered_df["dt"].min().timestamp()), - "end": int(filtered_df["dt"].max().timestamp()), - }, - "sources": filtered_df["source"].dropna().unique().tolist(), - } + return self.summary_analysis.summary(filtered_df) diff --git a/server/analysis/summary.py b/server/analysis/summary.py new file mode 100644 index 0000000..14cc8ca --- /dev/null +++ b/server/analysis/summary.py @@ -0,0 +1,64 @@ +import pandas as pd + + +class SummaryAnalysis: + def total_events(self, df: pd.DataFrame) -> int: + return int(len(df)) + + def total_posts(self, df: pd.DataFrame) -> int: + return int(len(df[df["type"] == "post"])) + + def total_comments(self, df: pd.DataFrame) -> int: + return int(len(df[df["type"] == "comment"])) + + def unique_users(self, df: pd.DataFrame) -> int: + return int(len(df["author"].dropna().unique())) + + def comments_per_post(self, total_comments: int, total_posts: int) -> float: + return round(total_comments / max(total_posts, 1), 2) + + def lurker_ratio(self, df: pd.DataFrame) -> float: + events_per_user = df.groupby("author").size() + return round((events_per_user == 1).mean(), 2) + + def time_range(self, df: pd.DataFrame) -> dict: + return { + "start": int(df["dt"].min().timestamp()), + "end": int(df["dt"].max().timestamp()), + } + + def sources(self, df: pd.DataFrame) -> list: + return df["source"].dropna().unique().tolist() + + def empty_summary(self) -> dict: + return { + "total_events": 0, + "total_posts": 0, + "total_comments": 0, + "unique_users": 0, + "comments_per_post": 0, + "lurker_ratio": 0, + "time_range": { + "start": None, + "end": None, + }, + "sources": [], + } + + def summary(self, df: pd.DataFrame) -> dict: + if df.empty: + return self.empty_summary() + + total_posts = self.total_posts(df) + total_comments = self.total_comments(df) + + return { + "total_events": self.total_events(df), + "total_posts": total_posts, + "total_comments": total_comments, + "unique_users": self.unique_users(df), + "comments_per_post": self.comments_per_post(total_comments, total_posts), + "lurker_ratio": self.lurker_ratio(df), + "time_range": self.time_range(df), + "sources": self.sources(df), + } diff --git a/server/analysis/user.py b/server/analysis/user.py new file mode 100644 index 0000000..fc8e618 --- /dev/null +++ b/server/analysis/user.py @@ -0,0 +1,124 @@ +import pandas as pd +import re + +from collections import Counter + +class UserAnalysis: + def __init__(self, word_exclusions: set[str]): + self.word_exclusions = word_exclusions + + def _tokenize(self, text: str): + tokens = re.findall(r"\b[a-z]{3,}\b", text) + return [t for t in tokens if t not in self.word_exclusions] + + def _vocab_richness_per_user( + self, df: pd.DataFrame, min_words: int = 20, top_most_used_words: int = 100 + ) -> list: + df = df.copy() + df["content"] = df["content"].fillna("").astype(str).str.lower() + df["tokens"] = df["content"].apply(self._tokenize) + + rows = [] + for author, group in df.groupby("author"): + all_tokens = [t for tokens in group["tokens"] for t in tokens] + + total_words = len(all_tokens) + unique_words = len(set(all_tokens)) + events = len(group) + + # Min amount of words for a user, any less than this might give weird results + if total_words < min_words: + continue + + # 100% = they never reused a word (excluding stop words) + vocab_richness = unique_words / total_words + avg_words = total_words / max(events, 1) + + counts = Counter(all_tokens) + top_words = [ + {"word": w, "count": int(c)} + for w, c in counts.most_common(top_most_used_words) + ] + + rows.append( + { + "author": author, + "events": int(events), + "total_words": int(total_words), + "unique_words": int(unique_words), + "vocab_richness": round(vocab_richness, 3), + "avg_words_per_event": round(avg_words, 2), + "top_words": top_words, + } + ) + + rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) + + return rows + + def top_users(self, df: pd.DataFrame) -> list: + counts = df.groupby(["author", "source"]).size().sort_values(ascending=False) + + top_users = [ + {"author": author, "source": source, "count": int(count)} + for (author, source), count in counts.items() + ] + + return top_users + + def per_user_analysis(self, df: pd.DataFrame) -> dict: + per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0) + + emotion_cols = [col for col in df.columns if col.startswith("emotion_")] + + avg_emotions_by_author = {} + if emotion_cols: + avg_emotions = df.groupby("author")[emotion_cols].mean().fillna(0.0) + avg_emotions_by_author = { + author: {emotion: float(score) for emotion, score in row.items()} + for author, row in avg_emotions.iterrows() + } + + # ensure columns always exist + for col in ("post", "comment"): + if col not in per_user.columns: + per_user[col] = 0 + + per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace( + 0, 1 + ) + per_user["comment_share"] = per_user["comment"] / ( + per_user["post"] + per_user["comment"] + ).replace(0, 1) + per_user = per_user.sort_values("comment_post_ratio", ascending=True) + per_user_records = per_user.reset_index().to_dict(orient="records") + + vocab_rows = self._vocab_richness_per_user(df) + vocab_by_author = {row["author"]: row for row in vocab_rows} + + # merge vocab richness + per_user information + merged_users = [] + for row in per_user_records: + author = row["author"] + merged_users.append( + { + "author": author, + "post": int(row.get("post", 0)), + "comment": int(row.get("comment", 0)), + "comment_post_ratio": float(row.get("comment_post_ratio", 0)), + "comment_share": float(row.get("comment_share", 0)), + "avg_emotions": avg_emotions_by_author.get(author, {}), + "vocab": vocab_by_author.get( + author, + { + "vocab_richness": 0, + "avg_words_per_event": 0, + "top_words": [], + }, + ), + } + ) + + merged_users.sort(key=lambda u: u["comment_post_ratio"]) + + return merged_users diff --git a/server/app.py b/server/app.py index f373843..7a5dea0 100644 --- a/server/app.py +++ b/server/app.py @@ -186,7 +186,7 @@ def scrape_data(): dataset_manager.set_dataset_status( dataset_id, "fetching", - f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" + f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}", ) fetch_and_process_dataset.delay( @@ -198,12 +198,14 @@ def scrape_data(): print(traceback.format_exc()) return jsonify({"error": "Failed to queue dataset processing"}), 500 + return jsonify( + { + "message": "Dataset queued for processing", + "dataset_id": dataset_id, + "status": "processing", + } + ), 202 - return jsonify({ - "message": "Dataset queued for processing", - "dataset_id": dataset_id, - "status": "processing" - }), 202 @app.route("/datasets/upload", methods=["POST"]) @jwt_required() @@ -233,7 +235,9 @@ def upload_data(): posts_df = pd.read_json(post_file, lines=True, convert_dates=False) topics = json.load(topic_file) - dataset_id = dataset_manager.save_dataset_info(current_user, dataset_name, topics) + dataset_id = dataset_manager.save_dataset_info( + current_user, dataset_name, topics + ) process_dataset.delay(dataset_id, posts_df.to_dict(orient="records"), topics) @@ -249,6 +253,7 @@ def upload_data(): except Exception as e: return jsonify({"error": f"An unexpected error occurred"}), 500 + @app.route("/dataset/", methods=["GET"]) @jwt_required() def get_dataset(dataset_id): @@ -256,7 +261,9 @@ def get_dataset(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_info = dataset_manager.get_dataset_info(dataset_id) included_cols = {"id", "name", "created_at"} @@ -269,7 +276,8 @@ def get_dataset(dataset_id): except Exception: print(traceback.format_exc()) return jsonify({"error": "An unexpected error occured"}), 500 - + + @app.route("/dataset/", methods=["PATCH"]) @jwt_required() def update_dataset(dataset_id): @@ -277,7 +285,9 @@ def update_dataset(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) body = request.get_json() new_name = body.get("name") @@ -286,7 +296,9 @@ def update_dataset(dataset_id): return jsonify({"error": "A valid name must be provided"}), 400 dataset_manager.update_dataset_name(dataset_id, new_name.strip()) - return jsonify({"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"}), 200 + return jsonify( + {"message": f"Dataset {dataset_id} renamed to '{new_name.strip()}'"} + ), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -294,7 +306,8 @@ def update_dataset(dataset_id): except Exception: print(traceback.format_exc()) return jsonify({"error": "An unexpected error occurred"}), 500 - + + @app.route("/dataset/", methods=["DELETE"]) @jwt_required() def delete_dataset(dataset_id): @@ -302,11 +315,17 @@ def delete_dataset(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_manager.delete_dataset_info(dataset_id) dataset_manager.delete_dataset_content(dataset_id) - return jsonify({"message": f"Dataset {dataset_id} metadata and content successfully deleted"}), 200 + return jsonify( + { + "message": f"Dataset {dataset_id} metadata and content successfully deleted" + } + ), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -315,6 +334,7 @@ def delete_dataset(dataset_id): print(traceback.format_exc()) return jsonify({"error": "An unexpected error occured"}), 500 + @app.route("/dataset//status", methods=["GET"]) @jwt_required() def get_dataset_status(dataset_id): @@ -322,7 +342,9 @@ def get_dataset_status(dataset_id): user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_status = dataset_manager.get_dataset_status(dataset_id) return jsonify(dataset_status), 200 @@ -334,17 +356,44 @@ def get_dataset_status(dataset_id): print(traceback.format_exc()) return jsonify({"error": "An unexpected error occured"}), 500 -@app.route("/dataset//content", methods=["GET"]) + +@app.route("/dataset//linguistic", methods=["GET"]) @jwt_required() -def content_endpoint(dataset_id): +def get_linguistic_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_content_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.linguistic(dataset_content, filters)), 200 + except NotAuthorisedException: + return jsonify({"error": "User is not authorised to access this content"}), 403 + except NonExistentDatasetException: + return jsonify({"error": "Dataset does not exist"}), 404 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred"}), 500 + + +@app.route("/dataset//emotional", methods=["GET"]) +@jwt_required() +def get_emotional_analysis(dataset_id): + try: + user_id = int(get_jwt_identity()) + if not dataset_manager.authorize_user_dataset(dataset_id, user_id): + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) + + dataset_content = dataset_manager.get_dataset_content(dataset_id) + filters = get_request_filters() + return jsonify(stat_gen.emotional(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -362,7 +411,9 @@ def get_summary(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() @@ -378,17 +429,19 @@ def get_summary(dataset_id): return jsonify({"error": f"An unexpected error occurred"}), 500 -@app.route("/dataset//time", methods=["GET"]) +@app.route("/dataset//temporal", methods=["GET"]) @jwt_required() -def get_time_analysis(dataset_id): +def get_temporal_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_time_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.temporal(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -406,11 +459,13 @@ def get_user_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_user_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.user(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -428,11 +483,13 @@ def get_cultural_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_cultural_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.cultural(dataset_content, filters)), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -444,17 +501,41 @@ def get_cultural_analysis(dataset_id): return jsonify({"error": f"An unexpected error occurred"}), 500 -@app.route("/dataset//interaction", methods=["GET"]) +@app.route("/dataset//interactional", methods=["GET"]) @jwt_required() def get_interaction_analysis(dataset_id): try: user_id = int(get_jwt_identity()) if not dataset_manager.authorize_user_dataset(dataset_id, user_id): - raise NotAuthorisedException("This user is not authorised to access this dataset") + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) dataset_content = dataset_manager.get_dataset_content(dataset_id) filters = get_request_filters() - return jsonify(stat_gen.get_interactional_analysis(dataset_content, filters)), 200 + return jsonify(stat_gen.interactional(dataset_content, filters)), 200 + except NotAuthorisedException: + return jsonify({"error": "User is not authorised to access this content"}), 403 + except NonExistentDatasetException: + return jsonify({"error": "Dataset does not exist"}), 404 + except ValueError as e: + return jsonify({"error": f"Malformed or missing data"}), 400 + except Exception as e: + print(traceback.format_exc()) + return jsonify({"error": f"An unexpected error occurred"}), 500 + +@app.route("/dataset//all", methods=["GET"]) +@jwt_required() +def get_full_dataset(dataset_id: int): + try: + user_id = int(get_jwt_identity()) + if not dataset_manager.authorize_user_dataset(dataset_id, user_id): + raise NotAuthorisedException( + "This user is not authorised to access this dataset" + ) + + dataset_content = dataset_manager.get_dataset_content(dataset_id) + return jsonify(dataset_content.to_dict(orient="records")), 200 except NotAuthorisedException: return jsonify({"error": "User is not authorised to access this content"}), 403 except NonExistentDatasetException: @@ -465,6 +546,5 @@ def get_interaction_analysis(dataset_id): print(traceback.format_exc()) return jsonify({"error": f"An unexpected error occurred"}), 500 - if __name__ == "__main__": app.run(debug=True) diff --git a/server/core/datasets.py b/server/core/datasets.py index 4690454..a55445d 100644 --- a/server/core/datasets.py +++ b/server/core/datasets.py @@ -101,7 +101,7 @@ class DatasetManager: row["source"], row.get("topic"), row.get("topic_confidence"), - Json(row["ner_entities"]) if row.get("ner_entities") else None, + Json(row["entities"]) if row.get("entities") is not None else None, row.get("emotion_anger"), row.get("emotion_disgust"), row.get("emotion_fear"), diff --git a/server/db/schema.sql b/server/db/schema.sql index 4550633..5bec116 100644 --- a/server/db/schema.sql +++ b/server/db/schema.sql @@ -43,7 +43,7 @@ CREATE TABLE events ( weekday VARCHAR(255) NOT NULL, /* Posts Only */ - title VARCHAR(255), + title TEXT, /* Comments Only*/ parent_id VARCHAR(255),