Corpus Explorer Feature #11
@@ -5,7 +5,7 @@ import DatasetsPage from "./pages/Datasets";
|
|||||||
import DatasetStatusPage from "./pages/DatasetStatus";
|
import DatasetStatusPage from "./pages/DatasetStatus";
|
||||||
import LoginPage from "./pages/Login";
|
import LoginPage from "./pages/Login";
|
||||||
import UploadPage from "./pages/Upload";
|
import UploadPage from "./pages/Upload";
|
||||||
import AutoScrapePage from "./pages/AutoScrape";
|
import AutoFetchPage from "./pages/AutoFetch";
|
||||||
import StatPage from "./pages/Stats";
|
import StatPage from "./pages/Stats";
|
||||||
import { getDocumentTitle } from "./utils/documentTitle";
|
import { getDocumentTitle } from "./utils/documentTitle";
|
||||||
import DatasetEditPage from "./pages/DatasetEdit";
|
import DatasetEditPage from "./pages/DatasetEdit";
|
||||||
@@ -23,7 +23,7 @@ function App() {
|
|||||||
<Route path="/" element={<Navigate to="/login" replace />} />
|
<Route path="/" element={<Navigate to="/login" replace />} />
|
||||||
<Route path="/login" element={<LoginPage />} />
|
<Route path="/login" element={<LoginPage />} />
|
||||||
<Route path="/upload" element={<UploadPage />} />
|
<Route path="/upload" element={<UploadPage />} />
|
||||||
<Route path="/auto-scrape" element={<AutoScrapePage />} />
|
<Route path="/auto-fetch" element={<AutoFetchPage />} />
|
||||||
<Route path="/datasets" element={<DatasetsPage />} />
|
<Route path="/datasets" element={<DatasetsPage />} />
|
||||||
<Route path="/dataset/:datasetId/status" element={<DatasetStatusPage />} />
|
<Route path="/dataset/:datasetId/status" element={<DatasetStatusPage />} />
|
||||||
<Route path="/dataset/:datasetId/stats" element={<StatPage />} />
|
<Route path="/dataset/:datasetId/stats" element={<StatPage />} />
|
||||||
|
|||||||
247
frontend/src/components/CorpusExplorer.tsx
Normal file
@@ -0,0 +1,247 @@
|
|||||||
|
import { useEffect, useState } from "react";
|
||||||
|
import { Dialog, DialogPanel, DialogTitle } from "@headlessui/react";
|
||||||
|
|
||||||
|
import StatsStyling from "../styles/stats_styling";
|
||||||
|
import type { DatasetRecord } from "../utils/corpusExplorer";
|
||||||
|
|
||||||
|
const styles = StatsStyling;
|
||||||
|
const INITIAL_RECORD_COUNT = 60;
|
||||||
|
const RECORD_BATCH_SIZE = 60;
|
||||||
|
const EXCERPT_LENGTH = 320;
|
||||||
|
|
||||||
|
const cleanText = (value: unknown) => {
|
||||||
|
if (typeof value !== "string") {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
const trimmed = value.trim();
|
||||||
|
if (!trimmed) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
const lowered = trimmed.toLowerCase();
|
||||||
|
if (lowered === "nan" || lowered === "null" || lowered === "undefined") {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
return trimmed;
|
||||||
|
};
|
||||||
|
|
||||||
|
const displayText = (value: unknown, fallback: string) => {
|
||||||
|
const cleaned = cleanText(value);
|
||||||
|
return cleaned || fallback;
|
||||||
|
};
|
||||||
|
|
||||||
|
type CorpusExplorerProps = {
|
||||||
|
open: boolean;
|
||||||
|
onClose: () => void;
|
||||||
|
title: string;
|
||||||
|
description: string;
|
||||||
|
records: DatasetRecord[];
|
||||||
|
loading: boolean;
|
||||||
|
error: string;
|
||||||
|
emptyMessage: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
const formatRecordDate = (record: DatasetRecord) => {
|
||||||
|
if (typeof record.dt === "string" && record.dt) {
|
||||||
|
const date = new Date(record.dt);
|
||||||
|
if (!Number.isNaN(date.getTime())) {
|
||||||
|
return date.toLocaleString();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof record.date === "string" && record.date) {
|
||||||
|
return record.date;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof record.timestamp === "number") {
|
||||||
|
return new Date(record.timestamp * 1000).toLocaleString();
|
||||||
|
}
|
||||||
|
|
||||||
|
return "Unknown time";
|
||||||
|
};
|
||||||
|
|
||||||
|
const getRecordKey = (record: DatasetRecord, index: number) =>
|
||||||
|
String(record.id ?? record.post_id ?? `${record.author ?? "record"}-${index}`);
|
||||||
|
|
||||||
|
const getRecordTitle = (record: DatasetRecord) => {
|
||||||
|
if (record.type === "comment") {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
const title = cleanText(record.title);
|
||||||
|
if (title) {
|
||||||
|
return title;
|
||||||
|
}
|
||||||
|
|
||||||
|
const content = cleanText(record.content);
|
||||||
|
if (!content) {
|
||||||
|
return "Untitled record";
|
||||||
|
}
|
||||||
|
|
||||||
|
return content.length > 120 ? `${content.slice(0, 117)}...` : content;
|
||||||
|
};
|
||||||
|
|
||||||
|
const CorpusExplorer = ({
|
||||||
|
open,
|
||||||
|
onClose,
|
||||||
|
title,
|
||||||
|
description,
|
||||||
|
records,
|
||||||
|
loading,
|
||||||
|
error,
|
||||||
|
emptyMessage,
|
||||||
|
}: CorpusExplorerProps) => {
|
||||||
|
const [visibleCount, setVisibleCount] = useState(INITIAL_RECORD_COUNT);
|
||||||
|
const [expandedKeys, setExpandedKeys] = useState<Record<string, boolean>>({});
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
if (open) {
|
||||||
|
setVisibleCount(INITIAL_RECORD_COUNT);
|
||||||
|
setExpandedKeys({});
|
||||||
|
}
|
||||||
|
}, [open, title, records.length]);
|
||||||
|
|
||||||
|
const hasMoreRecords = visibleCount < records.length;
|
||||||
|
|
||||||
|
return (
|
||||||
|
<Dialog open={open} onClose={onClose} style={styles.modalRoot}>
|
||||||
|
<div style={styles.modalBackdrop} />
|
||||||
|
|
||||||
|
<div style={styles.modalContainer}>
|
||||||
|
<DialogPanel
|
||||||
|
style={{
|
||||||
|
...styles.card,
|
||||||
|
...styles.modalPanel,
|
||||||
|
width: "min(960px, 96vw)",
|
||||||
|
maxHeight: "88vh",
|
||||||
|
display: "flex",
|
||||||
|
flexDirection: "column",
|
||||||
|
gap: 12,
|
||||||
|
overflow: "hidden",
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<div style={styles.headerBar}>
|
||||||
|
<div style={{ minWidth: 0 }}>
|
||||||
|
<DialogTitle style={styles.sectionTitle}>{title}</DialogTitle>
|
||||||
|
<p style={styles.sectionSubtitle}>
|
||||||
|
{description} {loading ? "Loading records..." : `${records.length.toLocaleString()} records.`}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<button onClick={onClose} style={styles.buttonSecondary}>
|
||||||
|
Close
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{error ? <p style={styles.sectionSubtitle}>{error}</p> : null}
|
||||||
|
|
||||||
|
{!loading && !error && !records.length ? (
|
||||||
|
<p style={styles.sectionSubtitle}>{emptyMessage}</p>
|
||||||
|
) : null}
|
||||||
|
|
||||||
|
{loading ? <div style={styles.topUserMeta}>Preparing corpus slice...</div> : null}
|
||||||
|
|
||||||
|
{!loading && !error && records.length ? (
|
||||||
|
<>
|
||||||
|
<div
|
||||||
|
style={{
|
||||||
|
...styles.topUsersList,
|
||||||
|
overflowY: "auto",
|
||||||
|
overflowX: "hidden",
|
||||||
|
paddingRight: 4,
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{records.slice(0, visibleCount).map((record, index) => {
|
||||||
|
const recordKey = getRecordKey(record, index);
|
||||||
|
const titleText = getRecordTitle(record);
|
||||||
|
const content = cleanText(record.content);
|
||||||
|
const isExpanded = !!expandedKeys[recordKey];
|
||||||
|
const canExpand = content.length > EXCERPT_LENGTH;
|
||||||
|
const excerpt =
|
||||||
|
canExpand && !isExpanded
|
||||||
|
? `${content.slice(0, EXCERPT_LENGTH - 3)}...`
|
||||||
|
: content || "No content available.";
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div key={recordKey} style={styles.topUserItem}>
|
||||||
|
<div style={{ ...styles.headerBar, alignItems: "flex-start" }}>
|
||||||
|
<div style={{ minWidth: 0, flex: 1 }}>
|
||||||
|
{titleText ? <div style={styles.topUserName}>{titleText}</div> : null}
|
||||||
|
<div
|
||||||
|
style={{
|
||||||
|
...styles.topUserMeta,
|
||||||
|
overflowWrap: "anywhere",
|
||||||
|
wordBreak: "break-word",
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{displayText(record.author, "Unknown author")} • {displayText(record.source, "Unknown source")} • {displayText(record.type, "record")} • {formatRecordDate(record)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div
|
||||||
|
style={{
|
||||||
|
...styles.topUserMeta,
|
||||||
|
marginLeft: 12,
|
||||||
|
textAlign: "right",
|
||||||
|
overflowWrap: "anywhere",
|
||||||
|
wordBreak: "break-word",
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{cleanText(record.topic) ? `Topic: ${cleanText(record.topic)}` : ""}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div
|
||||||
|
style={{
|
||||||
|
...styles.topUserMeta,
|
||||||
|
marginTop: 8,
|
||||||
|
whiteSpace: "pre-wrap",
|
||||||
|
overflowWrap: "anywhere",
|
||||||
|
wordBreak: "break-word",
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{excerpt}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{canExpand ? (
|
||||||
|
<div style={{ marginTop: 10 }}>
|
||||||
|
<button
|
||||||
|
onClick={() =>
|
||||||
|
setExpandedKeys((current) => ({
|
||||||
|
...current,
|
||||||
|
[recordKey]: !current[recordKey],
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
style={styles.buttonSecondary}
|
||||||
|
>
|
||||||
|
{isExpanded ? "Show Less" : "Show More"}
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
) : null}
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{hasMoreRecords ? (
|
||||||
|
<div style={{ display: "flex", justifyContent: "center" }}>
|
||||||
|
<button
|
||||||
|
onClick={() =>
|
||||||
|
setVisibleCount((current) => current + RECORD_BATCH_SIZE)
|
||||||
|
}
|
||||||
|
style={styles.buttonSecondary}
|
||||||
|
>
|
||||||
|
Show More Records
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
) : null}
|
||||||
|
</>
|
||||||
|
) : null}
|
||||||
|
</DialogPanel>
|
||||||
|
</div>
|
||||||
|
</Dialog>
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
||||||
|
export default CorpusExplorer;
|
||||||
@@ -1,14 +1,34 @@
|
|||||||
import Card from "./Card";
|
import Card from "./Card";
|
||||||
import StatsStyling from "../styles/stats_styling";
|
import StatsStyling from "../styles/stats_styling";
|
||||||
import type { CulturalAnalysisResponse } from "../types/ApiTypes";
|
import type { CulturalAnalysisResponse } from "../types/ApiTypes";
|
||||||
|
import {
|
||||||
|
buildCertaintySpec,
|
||||||
|
buildDeonticSpec,
|
||||||
|
buildEntitySpec,
|
||||||
|
buildHedgeSpec,
|
||||||
|
buildIdentityBucketSpec,
|
||||||
|
buildPermissionSpec,
|
||||||
|
type CorpusExplorerSpec,
|
||||||
|
} from "../utils/corpusExplorer";
|
||||||
|
|
||||||
const styles = StatsStyling;
|
const styles = StatsStyling;
|
||||||
|
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
|
||||||
|
|
||||||
type CulturalStatsProps = {
|
type CulturalStatsProps = {
|
||||||
data: CulturalAnalysisResponse;
|
data: CulturalAnalysisResponse;
|
||||||
|
onExplore: (spec: CorpusExplorerSpec) => void;
|
||||||
};
|
};
|
||||||
|
|
||||||
const CulturalStats = ({ data }: CulturalStatsProps) => {
|
const renderExploreButton = (onClick: () => void) => (
|
||||||
|
<button
|
||||||
|
onClick={onClick}
|
||||||
|
style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
|
||||||
|
>
|
||||||
|
Explore
|
||||||
|
</button>
|
||||||
|
);
|
||||||
|
|
||||||
|
const CulturalStats = ({ data, onExplore }: CulturalStatsProps) => {
|
||||||
const identity = data.identity_markers;
|
const identity = data.identity_markers;
|
||||||
const stance = data.stance_markers;
|
const stance = data.stance_markers;
|
||||||
const inGroupWords = identity?.in_group_usage ?? 0;
|
const inGroupWords = identity?.in_group_usage ?? 0;
|
||||||
@@ -30,7 +50,7 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
const topEmotion = (emotionAvg: Record<string, number> | undefined) => {
|
const topEmotion = (emotionAvg: Record<string, number> | undefined) => {
|
||||||
const entries = Object.entries(emotionAvg ?? {});
|
const entries = Object.entries(emotionAvg ?? {});
|
||||||
if (!entries.length) {
|
if (!entries.length) {
|
||||||
return "—";
|
return "-";
|
||||||
}
|
}
|
||||||
|
|
||||||
entries.sort((a, b) => b[1] - a[1]);
|
entries.sort((a, b) => b[1] - a[1]);
|
||||||
@@ -39,21 +59,6 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`;
|
return `${dominantLabel} (${(dominant[1] * 100).toFixed(1)}%)`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const stanceSublabel = (
|
|
||||||
per1kTokens: number | undefined,
|
|
||||||
emotionAvg: Record<string, number> | undefined,
|
|
||||||
) => {
|
|
||||||
const rateLabel =
|
|
||||||
typeof per1kTokens === "number"
|
|
||||||
? `${per1kTokens.toFixed(1)} per 1k words`
|
|
||||||
: "Word frequency";
|
|
||||||
const emotionLabel = topEmotion(emotionAvg);
|
|
||||||
|
|
||||||
return emotionLabel === "—"
|
|
||||||
? rateLabel
|
|
||||||
: `${rateLabel} • Avg mood: ${emotionLabel}`;
|
|
||||||
};
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div style={styles.page}>
|
<div style={styles.page}>
|
||||||
<div style={{ ...styles.container, ...styles.grid }}>
|
<div style={{ ...styles.container, ...styles.grid }}>
|
||||||
@@ -79,21 +84,30 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="In-Group Posts"
|
label="In-Group Posts"
|
||||||
value={identity?.in_group_posts?.toLocaleString() ?? "—"}
|
value={identity?.in_group_posts?.toLocaleString() ?? "-"}
|
||||||
sublabel='Posts leaning toward "us" language'
|
sublabel='Posts leaning toward "us" language'
|
||||||
|
rightSlot={renderExploreButton(() =>
|
||||||
|
onExplore(buildIdentityBucketSpec("in")),
|
||||||
|
)}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Out-Group Posts"
|
label="Out-Group Posts"
|
||||||
value={identity?.out_group_posts?.toLocaleString() ?? "—"}
|
value={identity?.out_group_posts?.toLocaleString() ?? "-"}
|
||||||
sublabel='Posts leaning toward "them" language'
|
sublabel='Posts leaning toward "them" language'
|
||||||
|
rightSlot={renderExploreButton(() =>
|
||||||
|
onExplore(buildIdentityBucketSpec("out")),
|
||||||
|
)}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<Card
|
<Card
|
||||||
label="Balanced Posts"
|
label="Balanced Posts"
|
||||||
value={identity?.tie_posts?.toLocaleString() ?? "—"}
|
value={identity?.tie_posts?.toLocaleString() ?? "-"}
|
||||||
sublabel="Posts with equal us/them signals"
|
sublabel="Posts with equal us/them signals"
|
||||||
|
rightSlot={renderExploreButton(() =>
|
||||||
|
onExplore(buildIdentityBucketSpec("tie")),
|
||||||
|
)}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
@@ -105,7 +119,7 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
<Card
|
<Card
|
||||||
label="In-Group Share"
|
label="In-Group Share"
|
||||||
value={
|
value={
|
||||||
inGroupWordRate === null ? "—" : `${inGroupWordRate.toFixed(2)}%`
|
inGroupWordRate === null ? "-" : `${inGroupWordRate.toFixed(2)}%`
|
||||||
}
|
}
|
||||||
sublabel="Share of all words"
|
sublabel="Share of all words"
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
@@ -113,7 +127,7 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
<Card
|
<Card
|
||||||
label="Out-Group Share"
|
label="Out-Group Share"
|
||||||
value={
|
value={
|
||||||
outGroupWordRate === null ? "—" : `${outGroupWordRate.toFixed(2)}%`
|
outGroupWordRate === null ? "-" : `${outGroupWordRate.toFixed(2)}%`
|
||||||
}
|
}
|
||||||
sublabel="Share of all words"
|
sublabel="Share of all words"
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
@@ -121,38 +135,46 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
|
|
||||||
<Card
|
<Card
|
||||||
label="Hedging Words"
|
label="Hedging Words"
|
||||||
value={stance?.hedge_total?.toLocaleString() ?? "—"}
|
value={stance?.hedge_total?.toLocaleString() ?? "-"}
|
||||||
sublabel={stanceSublabel(
|
sublabel={
|
||||||
stance?.hedge_per_1k_tokens,
|
typeof stance?.hedge_per_1k_tokens === "number"
|
||||||
stance?.hedge_emotion_avg,
|
? `${stance.hedge_per_1k_tokens.toFixed(1)} per 1k words`
|
||||||
)}
|
: "Word frequency"
|
||||||
|
}
|
||||||
|
rightSlot={renderExploreButton(() => onExplore(buildHedgeSpec()))}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Certainty Words"
|
label="Certainty Words"
|
||||||
value={stance?.certainty_total?.toLocaleString() ?? "—"}
|
value={stance?.certainty_total?.toLocaleString() ?? "-"}
|
||||||
sublabel={stanceSublabel(
|
sublabel={
|
||||||
stance?.certainty_per_1k_tokens,
|
typeof stance?.certainty_per_1k_tokens === "number"
|
||||||
stance?.certainty_emotion_avg,
|
? `${stance.certainty_per_1k_tokens.toFixed(1)} per 1k words`
|
||||||
)}
|
: "Word frequency"
|
||||||
|
}
|
||||||
|
rightSlot={renderExploreButton(() => onExplore(buildCertaintySpec()))}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Need/Should Words"
|
label="Need/Should Words"
|
||||||
value={stance?.deontic_total?.toLocaleString() ?? "—"}
|
value={stance?.deontic_total?.toLocaleString() ?? "-"}
|
||||||
sublabel={stanceSublabel(
|
sublabel={
|
||||||
stance?.deontic_per_1k_tokens,
|
typeof stance?.deontic_per_1k_tokens === "number"
|
||||||
stance?.deontic_emotion_avg,
|
? `${stance.deontic_per_1k_tokens.toFixed(1)} per 1k words`
|
||||||
)}
|
: "Word frequency"
|
||||||
|
}
|
||||||
|
rightSlot={renderExploreButton(() => onExplore(buildDeonticSpec()))}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Permission Words"
|
label="Permission Words"
|
||||||
value={stance?.permission_total?.toLocaleString() ?? "—"}
|
value={stance?.permission_total?.toLocaleString() ?? "-"}
|
||||||
sublabel={stanceSublabel(
|
sublabel={
|
||||||
stance?.permission_per_1k_tokens,
|
typeof stance?.permission_per_1k_tokens === "number"
|
||||||
stance?.permission_emotion_avg,
|
? `${stance.permission_per_1k_tokens.toFixed(1)} per 1k words`
|
||||||
)}
|
: "Word frequency"
|
||||||
|
}
|
||||||
|
rightSlot={renderExploreButton(() => onExplore(buildPermissionSpec()))}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
@@ -161,8 +183,14 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
<p style={styles.sectionSubtitle}>
|
<p style={styles.sectionSubtitle}>
|
||||||
Most likely emotion when in-group wording is stronger.
|
Most likely emotion when in-group wording is stronger.
|
||||||
</p>
|
</p>
|
||||||
<div style={styles.topUserName}>
|
<div style={styles.topUserName}>{topEmotion(identity?.in_group_emotion_avg)}</div>
|
||||||
{topEmotion(identity?.in_group_emotion_avg)}
|
<div style={{ marginTop: 12 }}>
|
||||||
|
<button
|
||||||
|
onClick={() => onExplore(buildIdentityBucketSpec("in"))}
|
||||||
|
style={styles.buttonSecondary}
|
||||||
|
>
|
||||||
|
Explore records
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -171,8 +199,14 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
<p style={styles.sectionSubtitle}>
|
<p style={styles.sectionSubtitle}>
|
||||||
Most likely emotion when out-group wording is stronger.
|
Most likely emotion when out-group wording is stronger.
|
||||||
</p>
|
</p>
|
||||||
<div style={styles.topUserName}>
|
<div style={styles.topUserName}>{topEmotion(identity?.out_group_emotion_avg)}</div>
|
||||||
{topEmotion(identity?.out_group_emotion_avg)}
|
<div style={{ marginTop: 12 }}>
|
||||||
|
<button
|
||||||
|
onClick={() => onExplore(buildIdentityBucketSpec("out"))}
|
||||||
|
style={styles.buttonSecondary}
|
||||||
|
>
|
||||||
|
Explore records
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -182,9 +216,7 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
Most mentioned entities and the mood that appears most with each.
|
Most mentioned entities and the mood that appears most with each.
|
||||||
</p>
|
</p>
|
||||||
{!entities.length ? (
|
{!entities.length ? (
|
||||||
<div style={styles.topUserMeta}>
|
<div style={styles.topUserMeta}>No entity-level cultural data available.</div>
|
||||||
No entity-level cultural data available.
|
|
||||||
</div>
|
|
||||||
) : (
|
) : (
|
||||||
<div
|
<div
|
||||||
style={{
|
style={{
|
||||||
@@ -194,7 +226,11 @@ const CulturalStats = ({ data }: CulturalStatsProps) => {
|
|||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
{entities.map(([entity, aggregate]) => (
|
{entities.map(([entity, aggregate]) => (
|
||||||
<div key={entity} style={styles.topUserItem}>
|
<div
|
||||||
|
key={entity}
|
||||||
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildEntitySpec(entity))}
|
||||||
|
>
|
||||||
<div style={styles.topUserName}>{entity}</div>
|
<div style={styles.topUserName}>{entity}</div>
|
||||||
<div style={styles.topUserMeta}>
|
<div style={styles.topUserMeta}>
|
||||||
{aggregate.post_count.toLocaleString()} posts • Likely mood:{" "}
|
{aggregate.post_count.toLocaleString()} posts • Likely mood:{" "}
|
||||||
|
|||||||
@@ -1,13 +1,20 @@
|
|||||||
import type { EmotionalAnalysisResponse } from "../types/ApiTypes";
|
import type { EmotionalAnalysisResponse } from "../types/ApiTypes";
|
||||||
import StatsStyling from "../styles/stats_styling";
|
import StatsStyling from "../styles/stats_styling";
|
||||||
|
import {
|
||||||
|
buildDominantEmotionSpec,
|
||||||
|
buildSourceSpec,
|
||||||
|
buildTopicSpec,
|
||||||
|
type CorpusExplorerSpec,
|
||||||
|
} from "../utils/corpusExplorer";
|
||||||
|
|
||||||
const styles = StatsStyling;
|
const styles = StatsStyling;
|
||||||
|
|
||||||
type EmotionalStatsProps = {
|
type EmotionalStatsProps = {
|
||||||
emotionalData: EmotionalAnalysisResponse;
|
emotionalData: EmotionalAnalysisResponse;
|
||||||
|
onExplore: (spec: CorpusExplorerSpec) => void;
|
||||||
};
|
};
|
||||||
|
|
||||||
const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => {
|
const EmotionalStats = ({ emotionalData, onExplore }: EmotionalStatsProps) => {
|
||||||
const rows = emotionalData.average_emotion_by_topic ?? [];
|
const rows = emotionalData.average_emotion_by_topic ?? [];
|
||||||
const overallEmotionAverage = emotionalData.overall_emotion_average ?? [];
|
const overallEmotionAverage = emotionalData.overall_emotion_average ?? [];
|
||||||
const dominantEmotionDistribution =
|
const dominantEmotionDistribution =
|
||||||
@@ -126,7 +133,11 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => {
|
|||||||
{[...overallEmotionAverage]
|
{[...overallEmotionAverage]
|
||||||
.sort((a, b) => b.score - a.score)
|
.sort((a, b) => b.score - a.score)
|
||||||
.map((row) => (
|
.map((row) => (
|
||||||
<div key={row.emotion} style={styles.topUserItem}>
|
<div
|
||||||
|
key={row.emotion}
|
||||||
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildDominantEmotionSpec(row.emotion))}
|
||||||
|
>
|
||||||
<div style={styles.topUserName}>
|
<div style={styles.topUserName}>
|
||||||
{formatEmotion(row.emotion)}
|
{formatEmotion(row.emotion)}
|
||||||
</div>
|
</div>
|
||||||
@@ -157,7 +168,11 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => {
|
|||||||
{[...dominantEmotionDistribution]
|
{[...dominantEmotionDistribution]
|
||||||
.sort((a, b) => b.ratio - a.ratio)
|
.sort((a, b) => b.ratio - a.ratio)
|
||||||
.map((row) => (
|
.map((row) => (
|
||||||
<div key={row.emotion} style={styles.topUserItem}>
|
<div
|
||||||
|
key={row.emotion}
|
||||||
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildDominantEmotionSpec(row.emotion))}
|
||||||
|
>
|
||||||
<div style={styles.topUserName}>
|
<div style={styles.topUserName}>
|
||||||
{formatEmotion(row.emotion)}
|
{formatEmotion(row.emotion)}
|
||||||
</div>
|
</div>
|
||||||
@@ -189,7 +204,11 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => {
|
|||||||
{[...emotionBySource]
|
{[...emotionBySource]
|
||||||
.sort((a, b) => b.event_count - a.event_count)
|
.sort((a, b) => b.event_count - a.event_count)
|
||||||
.map((row) => (
|
.map((row) => (
|
||||||
<div key={row.source} style={styles.topUserItem}>
|
<div
|
||||||
|
key={row.source}
|
||||||
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildSourceSpec(row.source))}
|
||||||
|
>
|
||||||
<div style={styles.topUserName}>{row.source}</div>
|
<div style={styles.topUserName}>{row.source}</div>
|
||||||
<div style={styles.topUserMeta}>
|
<div style={styles.topUserMeta}>
|
||||||
{formatEmotion(row.dominant_emotion)} •{" "}
|
{formatEmotion(row.dominant_emotion)} •{" "}
|
||||||
@@ -211,7 +230,8 @@ const EmotionalStats = ({ emotionalData }: EmotionalStatsProps) => {
|
|||||||
{strongestPerTopic.map((topic) => (
|
{strongestPerTopic.map((topic) => (
|
||||||
<div
|
<div
|
||||||
key={topic.topic}
|
key={topic.topic}
|
||||||
style={{ ...styles.cardBase, gridColumn: "span 4" }}
|
style={{ ...styles.cardBase, gridColumn: "span 4", cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildTopicSpec(topic.topic))}
|
||||||
>
|
>
|
||||||
<h3 style={{ ...styles.sectionTitle, marginBottom: 6 }}>
|
<h3 style={{ ...styles.sectionTitle, marginBottom: 6 }}>
|
||||||
{topic.topic}
|
{topic.topic}
|
||||||
|
|||||||
@@ -1,14 +1,20 @@
|
|||||||
import Card from "./Card";
|
import Card from "./Card";
|
||||||
import StatsStyling from "../styles/stats_styling";
|
import StatsStyling from "../styles/stats_styling";
|
||||||
import type { LinguisticAnalysisResponse } from "../types/ApiTypes";
|
import type { LinguisticAnalysisResponse } from "../types/ApiTypes";
|
||||||
|
import {
|
||||||
|
buildNgramSpec,
|
||||||
|
buildWordSpec,
|
||||||
|
type CorpusExplorerSpec,
|
||||||
|
} from "../utils/corpusExplorer";
|
||||||
|
|
||||||
const styles = StatsStyling;
|
const styles = StatsStyling;
|
||||||
|
|
||||||
type LinguisticStatsProps = {
|
type LinguisticStatsProps = {
|
||||||
data: LinguisticAnalysisResponse;
|
data: LinguisticAnalysisResponse;
|
||||||
|
onExplore: (spec: CorpusExplorerSpec) => void;
|
||||||
};
|
};
|
||||||
|
|
||||||
const LinguisticStats = ({ data }: LinguisticStatsProps) => {
|
const LinguisticStats = ({ data, onExplore }: LinguisticStatsProps) => {
|
||||||
const lexical = data.lexical_diversity;
|
const lexical = data.lexical_diversity;
|
||||||
const words = data.word_frequencies ?? [];
|
const words = data.word_frequencies ?? [];
|
||||||
const bigrams = data.common_two_phrases ?? [];
|
const bigrams = data.common_two_phrases ?? [];
|
||||||
@@ -60,7 +66,11 @@ const LinguisticStats = ({ data }: LinguisticStatsProps) => {
|
|||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
{topWords.map((item) => (
|
{topWords.map((item) => (
|
||||||
<div key={item.word} style={styles.topUserItem}>
|
<div
|
||||||
|
key={item.word}
|
||||||
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildWordSpec(item.word))}
|
||||||
|
>
|
||||||
<div style={styles.topUserName}>{item.word}</div>
|
<div style={styles.topUserName}>{item.word}</div>
|
||||||
<div style={styles.topUserMeta}>
|
<div style={styles.topUserMeta}>
|
||||||
{item.count.toLocaleString()} uses
|
{item.count.toLocaleString()} uses
|
||||||
@@ -81,7 +91,11 @@ const LinguisticStats = ({ data }: LinguisticStatsProps) => {
|
|||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
{topBigrams.map((item) => (
|
{topBigrams.map((item) => (
|
||||||
<div key={item.ngram} style={styles.topUserItem}>
|
<div
|
||||||
|
key={item.ngram}
|
||||||
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildNgramSpec(item.ngram))}
|
||||||
|
>
|
||||||
<div style={styles.topUserName}>{item.ngram}</div>
|
<div style={styles.topUserName}>{item.ngram}</div>
|
||||||
<div style={styles.topUserMeta}>
|
<div style={styles.topUserMeta}>
|
||||||
{item.count.toLocaleString()} uses
|
{item.count.toLocaleString()} uses
|
||||||
@@ -102,7 +116,11 @@ const LinguisticStats = ({ data }: LinguisticStatsProps) => {
|
|||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
{topTrigrams.map((item) => (
|
{topTrigrams.map((item) => (
|
||||||
<div key={item.ngram} style={styles.topUserItem}>
|
<div
|
||||||
|
key={item.ngram}
|
||||||
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
|
onClick={() => onExplore(buildNgramSpec(item.ngram))}
|
||||||
|
>
|
||||||
<div style={styles.topUserName}>{item.ngram}</div>
|
<div style={styles.topUserName}>{item.ngram}</div>
|
||||||
<div style={styles.topUserMeta}>
|
<div style={styles.topUserMeta}>
|
||||||
{item.count.toLocaleString()} uses
|
{item.count.toLocaleString()} uses
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { memo, useMemo, useState } from "react";
|
import { memo, useMemo } from "react";
|
||||||
import {
|
import {
|
||||||
LineChart,
|
LineChart,
|
||||||
Line,
|
Line,
|
||||||
@@ -13,7 +13,6 @@ import ActivityHeatmap from "../stats/ActivityHeatmap";
|
|||||||
import { ReactWordcloud } from "@cp949/react-wordcloud";
|
import { ReactWordcloud } from "@cp949/react-wordcloud";
|
||||||
import StatsStyling from "../styles/stats_styling";
|
import StatsStyling from "../styles/stats_styling";
|
||||||
import Card from "../components/Card";
|
import Card from "../components/Card";
|
||||||
import UserModal from "../components/UserModal";
|
|
||||||
|
|
||||||
import {
|
import {
|
||||||
type SummaryResponse,
|
type SummaryResponse,
|
||||||
@@ -21,11 +20,18 @@ import {
|
|||||||
type UserEndpointResponse,
|
type UserEndpointResponse,
|
||||||
type TimeAnalysisResponse,
|
type TimeAnalysisResponse,
|
||||||
type LinguisticAnalysisResponse,
|
type LinguisticAnalysisResponse,
|
||||||
type User,
|
|
||||||
} from "../types/ApiTypes";
|
} from "../types/ApiTypes";
|
||||||
|
import {
|
||||||
|
buildAllRecordsSpec,
|
||||||
|
buildDateBucketSpec,
|
||||||
|
buildOneTimeUsersSpec,
|
||||||
|
buildUserSpec,
|
||||||
|
type CorpusExplorerSpec,
|
||||||
|
} from "../utils/corpusExplorer";
|
||||||
|
|
||||||
const styles = StatsStyling;
|
const styles = StatsStyling;
|
||||||
const MAX_WORDCLOUD_WORDS = 250;
|
const MAX_WORDCLOUD_WORDS = 250;
|
||||||
|
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
|
||||||
|
|
||||||
const WORDCLOUD_OPTIONS = {
|
const WORDCLOUD_OPTIONS = {
|
||||||
rotations: 2,
|
rotations: 2,
|
||||||
@@ -39,6 +45,7 @@ type SummaryStatsProps = {
|
|||||||
timeData: TimeAnalysisResponse | null;
|
timeData: TimeAnalysisResponse | null;
|
||||||
linguisticData: LinguisticAnalysisResponse | null;
|
linguisticData: LinguisticAnalysisResponse | null;
|
||||||
summary: SummaryResponse | null;
|
summary: SummaryResponse | null;
|
||||||
|
onExplore: (spec: CorpusExplorerSpec) => void;
|
||||||
};
|
};
|
||||||
|
|
||||||
type WordCloudPanelProps = {
|
type WordCloudPanelProps = {
|
||||||
@@ -60,7 +67,7 @@ function formatDateRange(startUnix: number, endUnix: number) {
|
|||||||
day: "2-digit",
|
day: "2-digit",
|
||||||
});
|
});
|
||||||
|
|
||||||
return `${fmt(start)} → ${fmt(end)}`;
|
return `${fmt(start)} -> ${fmt(end)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
function convertFrequencyData(data: FrequencyWord[]) {
|
function convertFrequencyData(data: FrequencyWord[]) {
|
||||||
@@ -70,25 +77,22 @@ function convertFrequencyData(data: FrequencyWord[]) {
|
|||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const renderExploreButton = (onClick: () => void) => (
|
||||||
|
<button
|
||||||
|
onClick={onClick}
|
||||||
|
style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
|
||||||
|
>
|
||||||
|
Explore
|
||||||
|
</button>
|
||||||
|
);
|
||||||
|
|
||||||
const SummaryStats = ({
|
const SummaryStats = ({
|
||||||
userData,
|
userData,
|
||||||
timeData,
|
timeData,
|
||||||
linguisticData,
|
linguisticData,
|
||||||
summary,
|
summary,
|
||||||
|
onExplore,
|
||||||
}: SummaryStatsProps) => {
|
}: SummaryStatsProps) => {
|
||||||
const [selectedUser, setSelectedUser] = useState<string | null>(null);
|
|
||||||
const usersByAuthor = useMemo(() => {
|
|
||||||
const nextMap = new Map<string, User>();
|
|
||||||
for (const user of userData?.users ?? []) {
|
|
||||||
nextMap.set(user.author, user);
|
|
||||||
}
|
|
||||||
return nextMap;
|
|
||||||
}, [userData?.users]);
|
|
||||||
|
|
||||||
const selectedUserData: User | null = selectedUser
|
|
||||||
? usersByAuthor.get(selectedUser) ?? null
|
|
||||||
: null;
|
|
||||||
|
|
||||||
const wordCloudWords = useMemo(
|
const wordCloudWords = useMemo(
|
||||||
() =>
|
() =>
|
||||||
convertFrequencyData(
|
convertFrequencyData(
|
||||||
@@ -104,49 +108,41 @@ const SummaryStats = ({
|
|||||||
|
|
||||||
return (
|
return (
|
||||||
<div style={styles.page}>
|
<div style={styles.page}>
|
||||||
{/* main grid*/}
|
|
||||||
<div style={{ ...styles.container, ...styles.grid }}>
|
<div style={{ ...styles.container, ...styles.grid }}>
|
||||||
<Card
|
<Card
|
||||||
label="Total Activity"
|
label="Total Activity"
|
||||||
value={summary?.total_events ?? "—"}
|
value={summary?.total_events ?? "-"}
|
||||||
sublabel="Posts + comments"
|
sublabel="Posts + comments"
|
||||||
style={{
|
rightSlot={renderExploreButton(() => onExplore(buildAllRecordsSpec()))}
|
||||||
gridColumn: "span 4",
|
style={{ gridColumn: "span 4" }}
|
||||||
}}
|
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Active People"
|
label="Active People"
|
||||||
value={summary?.unique_users ?? "—"}
|
value={summary?.unique_users ?? "-"}
|
||||||
sublabel="Distinct users"
|
sublabel="Distinct users"
|
||||||
style={{
|
rightSlot={renderExploreButton(() => onExplore(buildAllRecordsSpec()))}
|
||||||
gridColumn: "span 4",
|
style={{ gridColumn: "span 4" }}
|
||||||
}}
|
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Posts vs Comments"
|
label="Posts vs Comments"
|
||||||
value={
|
value={
|
||||||
summary ? `${summary.total_posts} / ${summary.total_comments}` : "—"
|
summary ? `${summary.total_posts} / ${summary.total_comments}` : "-"
|
||||||
}
|
}
|
||||||
sublabel={`Comments per post: ${summary?.comments_per_post ?? "—"}`}
|
sublabel={`Comments per post: ${summary?.comments_per_post ?? "-"}`}
|
||||||
style={{
|
rightSlot={renderExploreButton(() => onExplore(buildAllRecordsSpec()))}
|
||||||
gridColumn: "span 4",
|
style={{ gridColumn: "span 4" }}
|
||||||
}}
|
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<Card
|
<Card
|
||||||
label="Time Range"
|
label="Time Range"
|
||||||
value={
|
value={
|
||||||
summary?.time_range
|
summary?.time_range
|
||||||
? formatDateRange(
|
? formatDateRange(summary.time_range.start, summary.time_range.end)
|
||||||
summary.time_range.start,
|
: "-"
|
||||||
summary.time_range.end,
|
|
||||||
)
|
|
||||||
: "—"
|
|
||||||
}
|
}
|
||||||
sublabel="Based on dataset timestamps"
|
sublabel="Based on dataset timestamps"
|
||||||
style={{
|
rightSlot={renderExploreButton(() => onExplore(buildAllRecordsSpec()))}
|
||||||
gridColumn: "span 4",
|
style={{ gridColumn: "span 4" }}
|
||||||
}}
|
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<Card
|
<Card
|
||||||
@@ -154,38 +150,44 @@ const SummaryStats = ({
|
|||||||
value={
|
value={
|
||||||
typeof summary?.lurker_ratio === "number"
|
typeof summary?.lurker_ratio === "number"
|
||||||
? `${Math.round(summary.lurker_ratio * 100)}%`
|
? `${Math.round(summary.lurker_ratio * 100)}%`
|
||||||
: "—"
|
: "-"
|
||||||
}
|
}
|
||||||
sublabel="Users with only one event"
|
sublabel="Users with only one event"
|
||||||
style={{
|
rightSlot={renderExploreButton(() => onExplore(buildOneTimeUsersSpec()))}
|
||||||
gridColumn: "span 4",
|
style={{ gridColumn: "span 4" }}
|
||||||
}}
|
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<Card
|
<Card
|
||||||
label="Sources"
|
label="Sources"
|
||||||
value={summary?.sources?.length ?? "—"}
|
value={summary?.sources?.length ?? "-"}
|
||||||
sublabel={
|
sublabel={
|
||||||
summary?.sources?.length
|
summary?.sources?.length
|
||||||
? summary.sources.slice(0, 3).join(", ") +
|
? summary.sources.slice(0, 3).join(", ") +
|
||||||
(summary.sources.length > 3 ? "…" : "")
|
(summary.sources.length > 3 ? "..." : "")
|
||||||
: "—"
|
: "-"
|
||||||
}
|
}
|
||||||
style={{
|
rightSlot={renderExploreButton(() => onExplore(buildAllRecordsSpec()))}
|
||||||
gridColumn: "span 4",
|
style={{ gridColumn: "span 4" }}
|
||||||
}}
|
|
||||||
/>
|
/>
|
||||||
|
|
||||||
{/* events per day */}
|
|
||||||
<div style={{ ...styles.card, gridColumn: "span 5" }}>
|
<div style={{ ...styles.card, gridColumn: "span 5" }}>
|
||||||
<h2 style={styles.sectionTitle}>Activity Over Time</h2>
|
<h2 style={styles.sectionTitle}>Activity Over Time</h2>
|
||||||
<p style={styles.sectionSubtitle}>
|
<p style={styles.sectionSubtitle}>How much posting happened each day.</p>
|
||||||
How much posting happened each day.
|
|
||||||
</p>
|
|
||||||
|
|
||||||
<div style={styles.chartWrapper}>
|
<div style={styles.chartWrapper}>
|
||||||
<ResponsiveContainer width="100%" height="100%">
|
<ResponsiveContainer width="100%" height="100%">
|
||||||
<LineChart data={timeData?.events_per_day ?? []}>
|
<LineChart
|
||||||
|
data={timeData?.events_per_day ?? []}
|
||||||
|
onClick={(state: unknown) => {
|
||||||
|
const payload = (state as { activePayload?: Array<{ payload?: { date?: string } }> })
|
||||||
|
?.activePayload?.[0]?.payload as
|
||||||
|
| { date?: string }
|
||||||
|
| undefined;
|
||||||
|
if (payload?.date) {
|
||||||
|
onExplore(buildDateBucketSpec(String(payload.date)));
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
<CartesianGrid strokeDasharray="3 3" />
|
<CartesianGrid strokeDasharray="3 3" />
|
||||||
<XAxis dataKey="date" />
|
<XAxis dataKey="date" />
|
||||||
<YAxis />
|
<YAxis />
|
||||||
@@ -201,7 +203,6 @@ const SummaryStats = ({
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Word Cloud */}
|
|
||||||
<div style={{ ...styles.card, gridColumn: "span 4" }}>
|
<div style={{ ...styles.card, gridColumn: "span 4" }}>
|
||||||
<h2 style={styles.sectionTitle}>Common Words</h2>
|
<h2 style={styles.sectionTitle}>Common Words</h2>
|
||||||
<p style={styles.sectionSubtitle}>
|
<p style={styles.sectionSubtitle}>
|
||||||
@@ -213,7 +214,6 @@ const SummaryStats = ({
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Top Users */}
|
|
||||||
<div
|
<div
|
||||||
style={{ ...styles.card, ...styles.scrollArea, gridColumn: "span 3" }}
|
style={{ ...styles.card, ...styles.scrollArea, gridColumn: "span 3" }}
|
||||||
>
|
>
|
||||||
@@ -225,7 +225,7 @@ const SummaryStats = ({
|
|||||||
<div
|
<div
|
||||||
key={`${item.author}-${item.source}`}
|
key={`${item.author}-${item.source}`}
|
||||||
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
style={{ ...styles.topUserItem, cursor: "pointer" }}
|
||||||
onClick={() => setSelectedUser(item.author)}
|
onClick={() => onExplore(buildUserSpec(item.author))}
|
||||||
>
|
>
|
||||||
<div style={styles.topUserName}>{item.author}</div>
|
<div style={styles.topUserName}>{item.author}</div>
|
||||||
<div style={styles.topUserMeta}>
|
<div style={styles.topUserMeta}>
|
||||||
@@ -236,7 +236,6 @@ const SummaryStats = ({
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* Heatmap */}
|
|
||||||
<div style={{ ...styles.card, gridColumn: "span 12" }}>
|
<div style={{ ...styles.card, gridColumn: "span 12" }}>
|
||||||
<h2 style={styles.sectionTitle}>Weekly Activity Pattern</h2>
|
<h2 style={styles.sectionTitle}>Weekly Activity Pattern</h2>
|
||||||
<p style={styles.sectionSubtitle}>
|
<p style={styles.sectionSubtitle}>
|
||||||
@@ -248,13 +247,6 @@ const SummaryStats = ({
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<UserModal
|
|
||||||
open={!!selectedUser}
|
|
||||||
onClose={() => setSelectedUser(null)}
|
|
||||||
username={selectedUser ?? ""}
|
|
||||||
userData={selectedUserData}
|
|
||||||
/>
|
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -5,6 +5,12 @@ import { type TopUser, type InteractionGraph } from "../types/ApiTypes";
|
|||||||
|
|
||||||
import StatsStyling from "../styles/stats_styling";
|
import StatsStyling from "../styles/stats_styling";
|
||||||
import Card from "./Card";
|
import Card from "./Card";
|
||||||
|
import {
|
||||||
|
buildReplyPairSpec,
|
||||||
|
toText,
|
||||||
|
buildUserSpec,
|
||||||
|
type CorpusExplorerSpec,
|
||||||
|
} from "../utils/corpusExplorer";
|
||||||
|
|
||||||
const styles = StatsStyling;
|
const styles = StatsStyling;
|
||||||
|
|
||||||
@@ -14,7 +20,7 @@ type GraphLink = {
|
|||||||
value: number;
|
value: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
function ApiToGraphData(apiData: InteractionGraph) {
|
function toGraphData(apiData: InteractionGraph) {
|
||||||
const links: GraphLink[] = [];
|
const links: GraphLink[] = [];
|
||||||
const connectedNodeIds = new Set<string>();
|
const connectedNodeIds = new Set<string>();
|
||||||
|
|
||||||
@@ -39,6 +45,7 @@ type UserStatsProps = {
|
|||||||
interactionGraph: InteractionGraph;
|
interactionGraph: InteractionGraph;
|
||||||
totalUsers: number;
|
totalUsers: number;
|
||||||
mostCommentHeavyUser: { author: string; commentShare: number } | null;
|
mostCommentHeavyUser: { author: string; commentShare: number } | null;
|
||||||
|
onExplore: (spec: CorpusExplorerSpec) => void;
|
||||||
};
|
};
|
||||||
|
|
||||||
const UserStats = ({
|
const UserStats = ({
|
||||||
@@ -46,9 +53,10 @@ const UserStats = ({
|
|||||||
interactionGraph,
|
interactionGraph,
|
||||||
totalUsers,
|
totalUsers,
|
||||||
mostCommentHeavyUser,
|
mostCommentHeavyUser,
|
||||||
|
onExplore,
|
||||||
}: UserStatsProps) => {
|
}: UserStatsProps) => {
|
||||||
const graphData = useMemo(
|
const graphData = useMemo(
|
||||||
() => ApiToGraphData(interactionGraph),
|
() => toGraphData(interactionGraph),
|
||||||
[interactionGraph],
|
[interactionGraph],
|
||||||
);
|
);
|
||||||
const graphContainerRef = useRef<HTMLDivElement | null>(null);
|
const graphContainerRef = useRef<HTMLDivElement | null>(null);
|
||||||
@@ -87,9 +95,9 @@ const UserStats = ({
|
|||||||
null,
|
null,
|
||||||
);
|
);
|
||||||
|
|
||||||
const mostActiveUser = topUsers.find(
|
const mostActiveUser = topUsers.find((u) => u.author !== "[deleted]");
|
||||||
(u) => u.author !== "[deleted]",
|
const strongestLinkSource = strongestLink ? toText(strongestLink.source) : "";
|
||||||
);
|
const strongestLinkTarget = strongestLink ? toText(strongestLink.target) : "";
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div style={styles.page}>
|
<div style={styles.page}>
|
||||||
@@ -114,37 +122,69 @@ const UserStats = ({
|
|||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Most Active User"
|
label="Most Active User"
|
||||||
value={mostActiveUser?.author ?? "—"}
|
value={mostActiveUser?.author ?? "-"}
|
||||||
sublabel={
|
sublabel={
|
||||||
mostActiveUser
|
mostActiveUser
|
||||||
? `${mostActiveUser.count.toLocaleString()} events`
|
? `${mostActiveUser.count.toLocaleString()} events`
|
||||||
: "No user activity found"
|
: "No user activity found"
|
||||||
}
|
}
|
||||||
|
rightSlot={
|
||||||
|
mostActiveUser ? (
|
||||||
|
<button
|
||||||
|
onClick={() => onExplore(buildUserSpec(mostActiveUser.author))}
|
||||||
|
style={styles.buttonSecondary}
|
||||||
|
>
|
||||||
|
Explore
|
||||||
|
</button>
|
||||||
|
) : null
|
||||||
|
}
|
||||||
style={{ gridColumn: "span 3" }}
|
style={{ gridColumn: "span 3" }}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<Card
|
<Card
|
||||||
label="Strongest User Link"
|
label="Strongest User Link"
|
||||||
value={
|
value={
|
||||||
strongestLink
|
strongestLinkSource && strongestLinkTarget
|
||||||
? `${strongestLink.source} -> ${strongestLink.target}`
|
? `${strongestLinkSource} -> ${strongestLinkTarget}`
|
||||||
: "—"
|
: "-"
|
||||||
}
|
}
|
||||||
sublabel={
|
sublabel={
|
||||||
strongestLink
|
strongestLink
|
||||||
? `${strongestLink.value.toLocaleString()} replies`
|
? `${strongestLink.value.toLocaleString()} replies`
|
||||||
: "No graph links after filtering"
|
: "No graph links after filtering"
|
||||||
}
|
}
|
||||||
|
rightSlot={
|
||||||
|
strongestLinkSource && strongestLinkTarget ? (
|
||||||
|
<button
|
||||||
|
onClick={() =>
|
||||||
|
onExplore(buildReplyPairSpec(strongestLinkSource, strongestLinkTarget))
|
||||||
|
}
|
||||||
|
style={styles.buttonSecondary}
|
||||||
|
>
|
||||||
|
Explore
|
||||||
|
</button>
|
||||||
|
) : null
|
||||||
|
}
|
||||||
style={{ gridColumn: "span 6" }}
|
style={{ gridColumn: "span 6" }}
|
||||||
/>
|
/>
|
||||||
<Card
|
<Card
|
||||||
label="Most Comment-Heavy User"
|
label="Most Comment-Heavy User"
|
||||||
value={mostCommentHeavyUser?.author ?? "—"}
|
value={mostCommentHeavyUser?.author ?? "-"}
|
||||||
sublabel={
|
sublabel={
|
||||||
mostCommentHeavyUser
|
mostCommentHeavyUser
|
||||||
? `${Math.round(mostCommentHeavyUser.commentShare * 100)}% comments`
|
? `${Math.round(mostCommentHeavyUser.commentShare * 100)}% comments`
|
||||||
: "No user distribution available"
|
: "No user distribution available"
|
||||||
}
|
}
|
||||||
|
rightSlot={
|
||||||
|
mostCommentHeavyUser ? (
|
||||||
|
<button
|
||||||
|
onClick={() => onExplore(buildUserSpec(mostCommentHeavyUser.author))}
|
||||||
|
style={styles.buttonSecondary}
|
||||||
|
>
|
||||||
|
Explore
|
||||||
|
</button>
|
||||||
|
) : null
|
||||||
|
}
|
||||||
style={{ gridColumn: "span 6" }}
|
style={{ gridColumn: "span 6" }}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
@@ -166,6 +206,19 @@ const UserStats = ({
|
|||||||
linkDirectionalParticleSpeed={0.004}
|
linkDirectionalParticleSpeed={0.004}
|
||||||
linkWidth={(link) => Math.sqrt(Number(link.value))}
|
linkWidth={(link) => Math.sqrt(Number(link.value))}
|
||||||
nodeLabel={(node) => `${node.id}`}
|
nodeLabel={(node) => `${node.id}`}
|
||||||
|
onNodeClick={(node) => {
|
||||||
|
const userId = toText(node.id);
|
||||||
|
if (userId) {
|
||||||
|
onExplore(buildUserSpec(userId));
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
onLinkClick={(link) => {
|
||||||
|
const source = toText(link.source);
|
||||||
|
const target = toText(link.target);
|
||||||
|
if (source && target) {
|
||||||
|
onExplore(buildReplyPairSpec(source, target));
|
||||||
|
}
|
||||||
|
}}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -37,7 +37,7 @@ const supportsSearch = (source?: SourceOption): boolean =>
|
|||||||
const supportsCategories = (source?: SourceOption): boolean =>
|
const supportsCategories = (source?: SourceOption): boolean =>
|
||||||
Boolean(source?.categories_enabled ?? source?.categoriesEnabled);
|
Boolean(source?.categories_enabled ?? source?.categoriesEnabled);
|
||||||
|
|
||||||
const AutoScrapePage = () => {
|
const AutoFetchPage = () => {
|
||||||
const navigate = useNavigate();
|
const navigate = useNavigate();
|
||||||
const [datasetName, setDatasetName] = useState("");
|
const [datasetName, setDatasetName] = useState("");
|
||||||
const [sourceOptions, setSourceOptions] = useState<SourceOption[]>([]);
|
const [sourceOptions, setSourceOptions] = useState<SourceOption[]>([]);
|
||||||
@@ -106,11 +106,11 @@ const AutoScrapePage = () => {
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
const autoScrape = async () => {
|
const autoFetch = async () => {
|
||||||
const token = localStorage.getItem("access_token");
|
const token = localStorage.getItem("access_token");
|
||||||
if (!token) {
|
if (!token) {
|
||||||
setHasError(true);
|
setHasError(true);
|
||||||
setReturnMessage("You must be signed in to auto scrape a dataset.");
|
setReturnMessage("You must be signed in to auto fetch a dataset.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -243,7 +243,7 @@ const AutoScrapePage = () => {
|
|||||||
setReturnMessage("");
|
setReturnMessage("");
|
||||||
|
|
||||||
const response = await axios.post(
|
const response = await axios.post(
|
||||||
`${API_BASE_URL}/datasets/scrape`,
|
`${API_BASE_URL}/datasets/fetch`,
|
||||||
requestBody,
|
requestBody,
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
@@ -255,7 +255,7 @@ const AutoScrapePage = () => {
|
|||||||
const datasetId = Number(response.data.dataset_id);
|
const datasetId = Number(response.data.dataset_id);
|
||||||
|
|
||||||
setReturnMessage(
|
setReturnMessage(
|
||||||
`Auto scrape queued successfully (dataset #${datasetId}). Redirecting to processing status...`,
|
`Auto fetch queued successfully (dataset #${datasetId}). Redirecting to processing status...`,
|
||||||
);
|
);
|
||||||
|
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
@@ -267,11 +267,11 @@ const AutoScrapePage = () => {
|
|||||||
const message = String(
|
const message = String(
|
||||||
requestError.response?.data?.error ||
|
requestError.response?.data?.error ||
|
||||||
requestError.message ||
|
requestError.message ||
|
||||||
"Auto scrape failed.",
|
"Auto fetch failed.",
|
||||||
);
|
);
|
||||||
setReturnMessage(`Auto scrape failed: ${message}`);
|
setReturnMessage(`Auto fetch failed: ${message}`);
|
||||||
} else {
|
} else {
|
||||||
setReturnMessage("Auto scrape failed due to an unexpected error.");
|
setReturnMessage("Auto fetch failed due to an unexpected error.");
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
setIsSubmitting(false);
|
setIsSubmitting(false);
|
||||||
@@ -283,9 +283,9 @@ const AutoScrapePage = () => {
|
|||||||
<div style={styles.containerWide}>
|
<div style={styles.containerWide}>
|
||||||
<div style={{ ...styles.card, ...styles.headerBar }}>
|
<div style={{ ...styles.card, ...styles.headerBar }}>
|
||||||
<div>
|
<div>
|
||||||
<h1 style={styles.sectionHeaderTitle}>Auto Scrape Dataset</h1>
|
<h1 style={styles.sectionHeaderTitle}>Auto Fetch Dataset</h1>
|
||||||
<p style={styles.sectionHeaderSubtitle}>
|
<p style={styles.sectionHeaderSubtitle}>
|
||||||
Select sources and scrape settings, then queue processing
|
Select sources and fetch settings, then queue processing
|
||||||
automatically.
|
automatically.
|
||||||
</p>
|
</p>
|
||||||
<p
|
<p
|
||||||
@@ -295,7 +295,7 @@ const AutoScrapePage = () => {
|
|||||||
color: "#9a6700",
|
color: "#9a6700",
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
Warning: Scraping more than 250 posts from any single site can
|
Warning: Fetching more than 250 posts from any single site can
|
||||||
take hours due to rate limits.
|
take hours due to rate limits.
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
@@ -305,10 +305,10 @@ const AutoScrapePage = () => {
|
|||||||
...styles.buttonPrimary,
|
...styles.buttonPrimary,
|
||||||
opacity: isSubmitting || isLoadingSources ? 0.75 : 1,
|
opacity: isSubmitting || isLoadingSources ? 0.75 : 1,
|
||||||
}}
|
}}
|
||||||
onClick={autoScrape}
|
onClick={autoFetch}
|
||||||
disabled={isSubmitting || isLoadingSources}
|
disabled={isSubmitting || isLoadingSources}
|
||||||
>
|
>
|
||||||
{isSubmitting ? "Queueing..." : "Auto Scrape and Analyze"}
|
{isSubmitting ? "Queueing..." : "Auto Fetch and Analyze"}
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -527,4 +527,4 @@ const AutoScrapePage = () => {
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
export default AutoScrapePage;
|
export default AutoFetchPage;
|
||||||
@@ -22,12 +22,10 @@ const DatasetEditPage = () => {
|
|||||||
const [isSaving, setIsSaving] = useState(false);
|
const [isSaving, setIsSaving] = useState(false);
|
||||||
const [isDeleting, setIsDeleting] = useState(false);
|
const [isDeleting, setIsDeleting] = useState(false);
|
||||||
const [isDeleteModalOpen, setIsDeleteModalOpen] = useState(false);
|
const [isDeleteModalOpen, setIsDeleteModalOpen] = useState(false);
|
||||||
const [hasError, setHasError] = useState(false);
|
|
||||||
|
|
||||||
const [datasetName, setDatasetName] = useState("");
|
const [datasetName, setDatasetName] = useState("");
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
if (!Number.isInteger(parsedDatasetId) || parsedDatasetId <= 0) {
|
if (!Number.isInteger(parsedDatasetId) || parsedDatasetId <= 0) {
|
||||||
setHasError(true);
|
|
||||||
setStatusMessage("Invalid dataset id.");
|
setStatusMessage("Invalid dataset id.");
|
||||||
setLoading(false);
|
setLoading(false);
|
||||||
return;
|
return;
|
||||||
@@ -35,7 +33,6 @@ const DatasetEditPage = () => {
|
|||||||
|
|
||||||
const token = localStorage.getItem("access_token");
|
const token = localStorage.getItem("access_token");
|
||||||
if (!token) {
|
if (!token) {
|
||||||
setHasError(true);
|
|
||||||
setStatusMessage("You must be signed in to edit datasets.");
|
setStatusMessage("You must be signed in to edit datasets.");
|
||||||
setLoading(false);
|
setLoading(false);
|
||||||
return;
|
return;
|
||||||
@@ -49,7 +46,6 @@ const DatasetEditPage = () => {
|
|||||||
setDatasetName(response.data.name || "");
|
setDatasetName(response.data.name || "");
|
||||||
})
|
})
|
||||||
.catch((error: unknown) => {
|
.catch((error: unknown) => {
|
||||||
setHasError(true);
|
|
||||||
if (axios.isAxiosError(error)) {
|
if (axios.isAxiosError(error)) {
|
||||||
setStatusMessage(
|
setStatusMessage(
|
||||||
String(error.response?.data?.error || error.message),
|
String(error.response?.data?.error || error.message),
|
||||||
@@ -68,21 +64,18 @@ const DatasetEditPage = () => {
|
|||||||
|
|
||||||
const trimmedName = datasetName.trim();
|
const trimmedName = datasetName.trim();
|
||||||
if (!trimmedName) {
|
if (!trimmedName) {
|
||||||
setHasError(true);
|
|
||||||
setStatusMessage("Please enter a valid dataset name.");
|
setStatusMessage("Please enter a valid dataset name.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const token = localStorage.getItem("access_token");
|
const token = localStorage.getItem("access_token");
|
||||||
if (!token) {
|
if (!token) {
|
||||||
setHasError(true);
|
|
||||||
setStatusMessage("You must be signed in to save changes.");
|
setStatusMessage("You must be signed in to save changes.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
setIsSaving(true);
|
setIsSaving(true);
|
||||||
setHasError(false);
|
|
||||||
setStatusMessage("");
|
setStatusMessage("");
|
||||||
|
|
||||||
await axios.patch(
|
await axios.patch(
|
||||||
@@ -93,7 +86,6 @@ const DatasetEditPage = () => {
|
|||||||
|
|
||||||
navigate("/datasets", { replace: true });
|
navigate("/datasets", { replace: true });
|
||||||
} catch (error: unknown) {
|
} catch (error: unknown) {
|
||||||
setHasError(true);
|
|
||||||
if (axios.isAxiosError(error)) {
|
if (axios.isAxiosError(error)) {
|
||||||
setStatusMessage(
|
setStatusMessage(
|
||||||
String(
|
String(
|
||||||
@@ -111,7 +103,6 @@ const DatasetEditPage = () => {
|
|||||||
const deleteDataset = async () => {
|
const deleteDataset = async () => {
|
||||||
const deleteToken = localStorage.getItem("access_token");
|
const deleteToken = localStorage.getItem("access_token");
|
||||||
if (!deleteToken) {
|
if (!deleteToken) {
|
||||||
setHasError(true);
|
|
||||||
setStatusMessage("You must be signed in to delete datasets.");
|
setStatusMessage("You must be signed in to delete datasets.");
|
||||||
setIsDeleteModalOpen(false);
|
setIsDeleteModalOpen(false);
|
||||||
return;
|
return;
|
||||||
@@ -119,7 +110,6 @@ const DatasetEditPage = () => {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
setIsDeleting(true);
|
setIsDeleting(true);
|
||||||
setHasError(false);
|
|
||||||
setStatusMessage("");
|
setStatusMessage("");
|
||||||
|
|
||||||
await axios.delete(`${API_BASE_URL}/dataset/${parsedDatasetId}`, {
|
await axios.delete(`${API_BASE_URL}/dataset/${parsedDatasetId}`, {
|
||||||
@@ -129,7 +119,6 @@ const DatasetEditPage = () => {
|
|||||||
setIsDeleteModalOpen(false);
|
setIsDeleteModalOpen(false);
|
||||||
navigate("/datasets", { replace: true });
|
navigate("/datasets", { replace: true });
|
||||||
} catch (error: unknown) {
|
} catch (error: unknown) {
|
||||||
setHasError(true);
|
|
||||||
if (axios.isAxiosError(error)) {
|
if (axios.isAxiosError(error)) {
|
||||||
setStatusMessage(
|
setStatusMessage(
|
||||||
String(
|
String(
|
||||||
|
|||||||
@@ -108,9 +108,9 @@ const DatasetsPage = () => {
|
|||||||
<button
|
<button
|
||||||
type="button"
|
type="button"
|
||||||
style={styles.buttonSecondary}
|
style={styles.buttonSecondary}
|
||||||
onClick={() => navigate("/auto-scrape")}
|
onClick={() => navigate("/auto-fetch")}
|
||||||
>
|
>
|
||||||
Auto Scrape Dataset
|
Auto Fetch Dataset
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
import { useEffect, useState, useRef } from "react";
|
import { useEffect, useRef, useState } from "react";
|
||||||
import axios from "axios";
|
import axios from "axios";
|
||||||
import { useParams } from "react-router-dom";
|
import { useParams } from "react-router-dom";
|
||||||
import StatsStyling from "../styles/stats_styling";
|
import StatsStyling from "../styles/stats_styling";
|
||||||
@@ -8,6 +8,7 @@ import UserStats from "../components/UserStats";
|
|||||||
import LinguisticStats from "../components/LinguisticStats";
|
import LinguisticStats from "../components/LinguisticStats";
|
||||||
import InteractionalStats from "../components/InteractionalStats";
|
import InteractionalStats from "../components/InteractionalStats";
|
||||||
import CulturalStats from "../components/CulturalStats";
|
import CulturalStats from "../components/CulturalStats";
|
||||||
|
import CorpusExplorer from "../components/CorpusExplorer";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
type SummaryResponse,
|
type SummaryResponse,
|
||||||
@@ -19,10 +20,15 @@ import {
|
|||||||
type InteractionAnalysisResponse,
|
type InteractionAnalysisResponse,
|
||||||
type CulturalAnalysisResponse,
|
type CulturalAnalysisResponse,
|
||||||
} from "../types/ApiTypes";
|
} from "../types/ApiTypes";
|
||||||
|
import {
|
||||||
|
buildExplorerContext,
|
||||||
|
type CorpusExplorerSpec,
|
||||||
|
type DatasetRecord,
|
||||||
|
} from "../utils/corpusExplorer";
|
||||||
|
|
||||||
const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;
|
const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;
|
||||||
const styles = StatsStyling;
|
const styles = StatsStyling;
|
||||||
const DELETED_USERS = ["[deleted]"];
|
const DELETED_USERS = ["[deleted]", "automoderator"];
|
||||||
|
|
||||||
const isDeletedUser = (value: string | null | undefined) =>
|
const isDeletedUser = (value: string | null | undefined) =>
|
||||||
DELETED_USERS.includes((value ?? "").trim().toLowerCase());
|
DELETED_USERS.includes((value ?? "").trim().toLowerCase());
|
||||||
@@ -40,6 +46,194 @@ type UserStatsMeta = {
|
|||||||
mostCommentHeavyUser: { author: string; commentShare: number } | null;
|
mostCommentHeavyUser: { author: string; commentShare: number } | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
type ExplorerState = {
|
||||||
|
open: boolean;
|
||||||
|
title: string;
|
||||||
|
description: string;
|
||||||
|
emptyMessage: string;
|
||||||
|
records: DatasetRecord[];
|
||||||
|
loading: boolean;
|
||||||
|
error: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
const EMPTY_EXPLORER_STATE: ExplorerState = {
|
||||||
|
open: false,
|
||||||
|
title: "Corpus Explorer",
|
||||||
|
description: "",
|
||||||
|
emptyMessage: "No records found.",
|
||||||
|
records: [],
|
||||||
|
loading: false,
|
||||||
|
error: "",
|
||||||
|
};
|
||||||
|
|
||||||
|
const createExplorerState = (
|
||||||
|
spec: CorpusExplorerSpec,
|
||||||
|
patch: Partial<ExplorerState> = {},
|
||||||
|
): ExplorerState => ({
|
||||||
|
open: true,
|
||||||
|
title: spec.title,
|
||||||
|
description: spec.description,
|
||||||
|
emptyMessage: spec.emptyMessage ?? "No matching records found.",
|
||||||
|
records: [],
|
||||||
|
loading: false,
|
||||||
|
error: "",
|
||||||
|
...patch,
|
||||||
|
});
|
||||||
|
|
||||||
|
const compareRecordsByNewest = (a: DatasetRecord, b: DatasetRecord) => {
|
||||||
|
const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
|
||||||
|
const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
|
||||||
|
return bValue.localeCompare(aValue);
|
||||||
|
};
|
||||||
|
|
||||||
|
const parseJsonLikePayload = (value: string): unknown => {
|
||||||
|
const normalized = value
|
||||||
|
.replace(/\uFEFF/g, "")
|
||||||
|
.replace(/,\s*([}\]])/g, "$1")
|
||||||
|
.replace(/(:\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
|
||||||
|
.replace(/(\[\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
|
||||||
|
.replace(/(,\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
|
||||||
|
.replace(/(:\s*)None\b/g, "$1null")
|
||||||
|
.replace(/(:\s*)True\b/g, "$1true")
|
||||||
|
.replace(/(:\s*)False\b/g, "$1false")
|
||||||
|
.replace(/(\[\s*)None\b/g, "$1null")
|
||||||
|
.replace(/(\[\s*)True\b/g, "$1true")
|
||||||
|
.replace(/(\[\s*)False\b/g, "$1false")
|
||||||
|
.replace(/(,\s*)None\b/g, "$1null")
|
||||||
|
.replace(/(,\s*)True\b/g, "$1true")
|
||||||
|
.replace(/(,\s*)False\b/g, "$1false");
|
||||||
|
|
||||||
|
return JSON.parse(normalized);
|
||||||
|
};
|
||||||
|
|
||||||
|
const tryParseRecords = (value: string) => {
|
||||||
|
try {
|
||||||
|
return normalizeRecordPayload(parseJsonLikePayload(value));
|
||||||
|
} catch {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
|
||||||
|
const trimmed = payload.trim();
|
||||||
|
if (!trimmed) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
const direct = tryParseRecords(trimmed);
|
||||||
|
if (direct) {
|
||||||
|
return direct;
|
||||||
|
}
|
||||||
|
|
||||||
|
const ndjsonLines = trimmed
|
||||||
|
.split(/\r?\n/)
|
||||||
|
.map((line) => line.trim())
|
||||||
|
.filter(Boolean);
|
||||||
|
if (ndjsonLines.length > 0) {
|
||||||
|
try {
|
||||||
|
return ndjsonLines.map((line) => parseJsonLikePayload(line)) as DatasetRecord[];
|
||||||
|
} catch {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const bracketStart = trimmed.indexOf("[");
|
||||||
|
const bracketEnd = trimmed.lastIndexOf("]");
|
||||||
|
if (bracketStart !== -1 && bracketEnd > bracketStart) {
|
||||||
|
const parsed = tryParseRecords(trimmed.slice(bracketStart, bracketEnd + 1));
|
||||||
|
if (parsed) {
|
||||||
|
return parsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const braceStart = trimmed.indexOf("{");
|
||||||
|
const braceEnd = trimmed.lastIndexOf("}");
|
||||||
|
if (braceStart !== -1 && braceEnd > braceStart) {
|
||||||
|
const parsed = tryParseRecords(trimmed.slice(braceStart, braceEnd + 1));
|
||||||
|
if (parsed) {
|
||||||
|
return parsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
};
|
||||||
|
|
||||||
|
const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => {
|
||||||
|
if (typeof payload === "string") {
|
||||||
|
const parsed = parseRecordStringPayload(payload);
|
||||||
|
if (parsed) {
|
||||||
|
return parsed;
|
||||||
|
}
|
||||||
|
|
||||||
|
const preview = payload.trim().slice(0, 120).replace(/\s+/g, " ");
|
||||||
|
throw new Error(
|
||||||
|
`Corpus endpoint returned a non-JSON string payload.${
|
||||||
|
preview ? ` Response preview: ${preview}` : ""
|
||||||
|
}`,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
payload &&
|
||||||
|
typeof payload === "object" &&
|
||||||
|
"error" in payload &&
|
||||||
|
typeof (payload as { error?: unknown }).error === "string"
|
||||||
|
) {
|
||||||
|
throw new Error((payload as { error: string }).error);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Array.isArray(payload)) {
|
||||||
|
return payload as DatasetRecord[];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
payload &&
|
||||||
|
typeof payload === "object" &&
|
||||||
|
"data" in payload &&
|
||||||
|
Array.isArray((payload as { data?: unknown }).data)
|
||||||
|
) {
|
||||||
|
return (payload as { data: DatasetRecord[] }).data;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
payload &&
|
||||||
|
typeof payload === "object" &&
|
||||||
|
"records" in payload &&
|
||||||
|
Array.isArray((payload as { records?: unknown }).records)
|
||||||
|
) {
|
||||||
|
return (payload as { records: DatasetRecord[] }).records;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
payload &&
|
||||||
|
typeof payload === "object" &&
|
||||||
|
"rows" in payload &&
|
||||||
|
Array.isArray((payload as { rows?: unknown }).rows)
|
||||||
|
) {
|
||||||
|
return (payload as { rows: DatasetRecord[] }).rows;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
payload &&
|
||||||
|
typeof payload === "object" &&
|
||||||
|
"result" in payload &&
|
||||||
|
Array.isArray((payload as { result?: unknown }).result)
|
||||||
|
) {
|
||||||
|
return (payload as { result: DatasetRecord[] }).result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (payload && typeof payload === "object") {
|
||||||
|
const values = Object.values(payload);
|
||||||
|
if (values.length === 1 && Array.isArray(values[0])) {
|
||||||
|
return values[0] as DatasetRecord[];
|
||||||
|
}
|
||||||
|
if (values.every((value) => value && typeof value === "object")) {
|
||||||
|
return values as DatasetRecord[];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error("Corpus endpoint returned an unexpected payload.");
|
||||||
|
};
|
||||||
|
|
||||||
const StatPage = () => {
|
const StatPage = () => {
|
||||||
const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>();
|
const { datasetId: routeDatasetId } = useParams<{ datasetId: string }>();
|
||||||
const [error, setError] = useState("");
|
const [error, setError] = useState("");
|
||||||
@@ -61,6 +255,12 @@ const StatPage = () => {
|
|||||||
totalUsers: 0,
|
totalUsers: 0,
|
||||||
mostCommentHeavyUser: null,
|
mostCommentHeavyUser: null,
|
||||||
});
|
});
|
||||||
|
const [appliedFilters, setAppliedFilters] = useState<Record<string, string>>({});
|
||||||
|
const [allRecords, setAllRecords] = useState<DatasetRecord[] | null>(null);
|
||||||
|
const [allRecordsKey, setAllRecordsKey] = useState("");
|
||||||
|
const [explorerState, setExplorerState] = useState<ExplorerState>(
|
||||||
|
EMPTY_EXPLORER_STATE,
|
||||||
|
);
|
||||||
|
|
||||||
const searchInputRef = useRef<HTMLInputElement>(null);
|
const searchInputRef = useRef<HTMLInputElement>(null);
|
||||||
const beforeDateRef = useRef<HTMLInputElement>(null);
|
const beforeDateRef = useRef<HTMLInputElement>(null);
|
||||||
@@ -104,6 +304,59 @@ const StatPage = () => {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const getFilterKey = (params: Record<string, string>) =>
|
||||||
|
JSON.stringify(Object.entries(params).sort(([a], [b]) => a.localeCompare(b)));
|
||||||
|
|
||||||
|
const ensureFilteredRecords = async () => {
|
||||||
|
if (!datasetId) {
|
||||||
|
throw new Error("Missing dataset id.");
|
||||||
|
}
|
||||||
|
|
||||||
|
const authHeaders = getAuthHeaders();
|
||||||
|
if (!authHeaders) {
|
||||||
|
throw new Error("You must be signed in to load corpus records.");
|
||||||
|
}
|
||||||
|
|
||||||
|
const filterKey = getFilterKey(appliedFilters);
|
||||||
|
if (allRecords && allRecordsKey === filterKey) {
|
||||||
|
return allRecords;
|
||||||
|
}
|
||||||
|
|
||||||
|
const response = await axios.get<unknown>(
|
||||||
|
`${API_BASE_URL}/dataset/${datasetId}/all`,
|
||||||
|
{
|
||||||
|
params: appliedFilters,
|
||||||
|
headers: authHeaders,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
const normalizedRecords = normalizeRecordPayload(response.data);
|
||||||
|
|
||||||
|
setAllRecords(normalizedRecords);
|
||||||
|
setAllRecordsKey(filterKey);
|
||||||
|
return normalizedRecords;
|
||||||
|
};
|
||||||
|
|
||||||
|
const openExplorer = async (spec: CorpusExplorerSpec) => {
|
||||||
|
setExplorerState(createExplorerState(spec, { loading: true }));
|
||||||
|
|
||||||
|
try {
|
||||||
|
const records = await ensureFilteredRecords();
|
||||||
|
const context = buildExplorerContext(records);
|
||||||
|
const matched = records
|
||||||
|
.filter((record) => spec.matcher(record, context))
|
||||||
|
.sort(compareRecordsByNewest);
|
||||||
|
|
||||||
|
setExplorerState(createExplorerState(spec, { records: matched }));
|
||||||
|
} catch (e) {
|
||||||
|
setExplorerState(
|
||||||
|
createExplorerState(spec, {
|
||||||
|
error: `Failed to load corpus records: ${String(e)}`,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
const getStats = (params: Record<string, string> = {}) => {
|
const getStats = (params: Record<string, string> = {}) => {
|
||||||
if (!datasetId) {
|
if (!datasetId) {
|
||||||
setError("Missing dataset id. Open /dataset/<id>/stats.");
|
setError("Missing dataset id. Open /dataset/<id>/stats.");
|
||||||
@@ -118,22 +371,20 @@ const StatPage = () => {
|
|||||||
|
|
||||||
setError("");
|
setError("");
|
||||||
setLoading(true);
|
setLoading(true);
|
||||||
|
setAppliedFilters(params);
|
||||||
|
setAllRecords(null);
|
||||||
|
setAllRecordsKey("");
|
||||||
|
setExplorerState((current) => ({ ...current, open: false }));
|
||||||
|
|
||||||
Promise.all([
|
Promise.all([
|
||||||
axios.get<TimeAnalysisResponse>(
|
axios.get<TimeAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/temporal`, {
|
||||||
`${API_BASE_URL}/dataset/${datasetId}/temporal`,
|
|
||||||
{
|
|
||||||
params,
|
params,
|
||||||
headers: authHeaders,
|
headers: authHeaders,
|
||||||
},
|
}),
|
||||||
),
|
axios.get<UserEndpointResponse>(`${API_BASE_URL}/dataset/${datasetId}/user`, {
|
||||||
axios.get<UserEndpointResponse>(
|
|
||||||
`${API_BASE_URL}/dataset/${datasetId}/user`,
|
|
||||||
{
|
|
||||||
params,
|
params,
|
||||||
headers: authHeaders,
|
headers: authHeaders,
|
||||||
},
|
}),
|
||||||
),
|
|
||||||
axios.get<LinguisticAnalysisResponse>(
|
axios.get<LinguisticAnalysisResponse>(
|
||||||
`${API_BASE_URL}/dataset/${datasetId}/linguistic`,
|
`${API_BASE_URL}/dataset/${datasetId}/linguistic`,
|
||||||
{
|
{
|
||||||
@@ -141,13 +392,10 @@ const StatPage = () => {
|
|||||||
headers: authHeaders,
|
headers: authHeaders,
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
axios.get<EmotionalAnalysisResponse>(
|
axios.get<EmotionalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/emotional`, {
|
||||||
`${API_BASE_URL}/dataset/${datasetId}/emotional`,
|
|
||||||
{
|
|
||||||
params,
|
params,
|
||||||
headers: authHeaders,
|
headers: authHeaders,
|
||||||
},
|
}),
|
||||||
),
|
|
||||||
axios.get<InteractionAnalysisResponse>(
|
axios.get<InteractionAnalysisResponse>(
|
||||||
`${API_BASE_URL}/dataset/${datasetId}/interactional`,
|
`${API_BASE_URL}/dataset/${datasetId}/interactional`,
|
||||||
{
|
{
|
||||||
@@ -155,20 +403,14 @@ const StatPage = () => {
|
|||||||
headers: authHeaders,
|
headers: authHeaders,
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
axios.get<SummaryResponse>(
|
axios.get<SummaryResponse>(`${API_BASE_URL}/dataset/${datasetId}/summary`, {
|
||||||
`${API_BASE_URL}/dataset/${datasetId}/summary`,
|
|
||||||
{
|
|
||||||
params,
|
params,
|
||||||
headers: authHeaders,
|
headers: authHeaders,
|
||||||
},
|
}),
|
||||||
),
|
axios.get<CulturalAnalysisResponse>(`${API_BASE_URL}/dataset/${datasetId}/cultural`, {
|
||||||
axios.get<CulturalAnalysisResponse>(
|
|
||||||
`${API_BASE_URL}/dataset/${datasetId}/cultural`,
|
|
||||||
{
|
|
||||||
params,
|
params,
|
||||||
headers: authHeaders,
|
headers: authHeaders,
|
||||||
},
|
}),
|
||||||
),
|
|
||||||
])
|
])
|
||||||
.then(
|
.then(
|
||||||
([
|
([
|
||||||
@@ -182,8 +424,7 @@ const StatPage = () => {
|
|||||||
]) => {
|
]) => {
|
||||||
const usersList = userRes.data.users ?? [];
|
const usersList = userRes.data.users ?? [];
|
||||||
const topUsersList = userRes.data.top_users ?? [];
|
const topUsersList = userRes.data.top_users ?? [];
|
||||||
const interactionGraphRaw =
|
const interactionGraphRaw = interactionRes.data?.interaction_graph ?? {};
|
||||||
interactionRes.data?.interaction_graph ?? {};
|
|
||||||
const topPairsRaw = interactionRes.data?.top_interaction_pairs ?? [];
|
const topPairsRaw = interactionRes.data?.top_interaction_pairs ?? [];
|
||||||
|
|
||||||
const filteredUsers: typeof usersList = [];
|
const filteredUsers: typeof usersList = [];
|
||||||
@@ -198,14 +439,10 @@ const StatPage = () => {
|
|||||||
filteredTopUsers.push(user);
|
filteredTopUsers.push(user);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mostCommentHeavyUser: UserStatsMeta["mostCommentHeavyUser"] =
|
let mostCommentHeavyUser: UserStatsMeta["mostCommentHeavyUser"] = null;
|
||||||
null;
|
|
||||||
for (const user of filteredUsers) {
|
for (const user of filteredUsers) {
|
||||||
const currentShare = user.comment_share ?? 0;
|
const currentShare = user.comment_share ?? 0;
|
||||||
if (
|
if (!mostCommentHeavyUser || currentShare > mostCommentHeavyUser.commentShare) {
|
||||||
!mostCommentHeavyUser ||
|
|
||||||
currentShare > mostCommentHeavyUser.commentShare
|
|
||||||
) {
|
|
||||||
mostCommentHeavyUser = {
|
mostCommentHeavyUser = {
|
||||||
author: user.author,
|
author: user.author,
|
||||||
commentShare: currentShare,
|
commentShare: currentShare,
|
||||||
@@ -221,8 +458,7 @@ const StatPage = () => {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const filteredInteractionGraph: Record<string, Record<string, number>> =
|
const filteredInteractionGraph: Record<string, Record<string, number>> = {};
|
||||||
{};
|
|
||||||
for (const [source, targets] of Object.entries(interactionGraphRaw)) {
|
for (const [source, targets] of Object.entries(interactionGraphRaw)) {
|
||||||
if (isDeletedUser(source)) {
|
if (isDeletedUser(source)) {
|
||||||
continue;
|
continue;
|
||||||
@@ -279,7 +515,7 @@ const StatPage = () => {
|
|||||||
setSummary(filteredSummary || null);
|
setSummary(filteredSummary || null);
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
.catch((e) => setError("Failed to load statistics: " + String(e)))
|
.catch((e) => setError(`Failed to load statistics: ${String(e)}`))
|
||||||
.finally(() => setLoading(false));
|
.finally(() => setLoading(false));
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -302,6 +538,9 @@ const StatPage = () => {
|
|||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
setError("");
|
setError("");
|
||||||
|
setAllRecords(null);
|
||||||
|
setAllRecordsKey("");
|
||||||
|
setExplorerState(EMPTY_EXPLORER_STATE);
|
||||||
if (!datasetId) {
|
if (!datasetId) {
|
||||||
setError("Missing dataset id. Open /dataset/<id>/stats.");
|
setError("Missing dataset id. Open /dataset/<id>/stats.");
|
||||||
return;
|
return;
|
||||||
@@ -398,9 +637,7 @@ const StatPage = () => {
|
|||||||
<button
|
<button
|
||||||
onClick={() => setActiveView("summary")}
|
onClick={() => setActiveView("summary")}
|
||||||
style={
|
style={
|
||||||
activeView === "summary"
|
activeView === "summary" ? styles.buttonPrimary : styles.buttonSecondary
|
||||||
? styles.buttonPrimary
|
|
||||||
: styles.buttonSecondary
|
|
||||||
}
|
}
|
||||||
>
|
>
|
||||||
Summary
|
Summary
|
||||||
@@ -418,11 +655,7 @@ const StatPage = () => {
|
|||||||
|
|
||||||
<button
|
<button
|
||||||
onClick={() => setActiveView("user")}
|
onClick={() => setActiveView("user")}
|
||||||
style={
|
style={activeView === "user" ? styles.buttonPrimary : styles.buttonSecondary}
|
||||||
activeView === "user"
|
|
||||||
? styles.buttonPrimary
|
|
||||||
: styles.buttonSecondary
|
|
||||||
}
|
|
||||||
>
|
>
|
||||||
Users
|
Users
|
||||||
</button>
|
</button>
|
||||||
@@ -449,9 +682,7 @@ const StatPage = () => {
|
|||||||
<button
|
<button
|
||||||
onClick={() => setActiveView("cultural")}
|
onClick={() => setActiveView("cultural")}
|
||||||
style={
|
style={
|
||||||
activeView === "cultural"
|
activeView === "cultural" ? styles.buttonPrimary : styles.buttonSecondary
|
||||||
? styles.buttonPrimary
|
|
||||||
: styles.buttonSecondary
|
|
||||||
}
|
}
|
||||||
>
|
>
|
||||||
Cultural
|
Cultural
|
||||||
@@ -464,11 +695,12 @@ const StatPage = () => {
|
|||||||
timeData={timeData}
|
timeData={timeData}
|
||||||
linguisticData={linguisticData}
|
linguisticData={linguisticData}
|
||||||
summary={summary}
|
summary={summary}
|
||||||
|
onExplore={openExplorer}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{activeView === "emotional" && emotionalData && (
|
{activeView === "emotional" && emotionalData && (
|
||||||
<EmotionalStats emotionalData={emotionalData} />
|
<EmotionalStats emotionalData={emotionalData} onExplore={openExplorer} />
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{activeView === "emotional" && !emotionalData && (
|
{activeView === "emotional" && !emotionalData && (
|
||||||
@@ -483,6 +715,7 @@ const StatPage = () => {
|
|||||||
interactionGraph={interactionData.interaction_graph}
|
interactionGraph={interactionData.interaction_graph}
|
||||||
totalUsers={userStatsMeta.totalUsers}
|
totalUsers={userStatsMeta.totalUsers}
|
||||||
mostCommentHeavyUser={userStatsMeta.mostCommentHeavyUser}
|
mostCommentHeavyUser={userStatsMeta.mostCommentHeavyUser}
|
||||||
|
onExplore={openExplorer}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
@@ -493,7 +726,7 @@ const StatPage = () => {
|
|||||||
)}
|
)}
|
||||||
|
|
||||||
{activeView === "linguistic" && linguisticData && (
|
{activeView === "linguistic" && linguisticData && (
|
||||||
<LinguisticStats data={linguisticData} />
|
<LinguisticStats data={linguisticData} onExplore={openExplorer} />
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{activeView === "linguistic" && !linguisticData && (
|
{activeView === "linguistic" && !linguisticData && (
|
||||||
@@ -513,7 +746,7 @@ const StatPage = () => {
|
|||||||
)}
|
)}
|
||||||
|
|
||||||
{activeView === "cultural" && culturalData && (
|
{activeView === "cultural" && culturalData && (
|
||||||
<CulturalStats data={culturalData} />
|
<CulturalStats data={culturalData} onExplore={openExplorer} />
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{activeView === "cultural" && !culturalData && (
|
{activeView === "cultural" && !culturalData && (
|
||||||
@@ -521,6 +754,17 @@ const StatPage = () => {
|
|||||||
No cultural data available.
|
No cultural data available.
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
<CorpusExplorer
|
||||||
|
open={explorerState.open}
|
||||||
|
onClose={() => setExplorerState((current) => ({ ...current, open: false }))}
|
||||||
|
title={explorerState.title}
|
||||||
|
description={explorerState.description}
|
||||||
|
records={explorerState.records}
|
||||||
|
loading={explorerState.loading}
|
||||||
|
error={explorerState.error}
|
||||||
|
emptyMessage={explorerState.emptyMessage}
|
||||||
|
/>
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|||||||
371
frontend/src/utils/corpusExplorer.ts
Normal file
@@ -0,0 +1,371 @@
|
|||||||
|
// A single named-entity hit attached to a record by the NER pipeline.
// Only `text` is inspected by the explorer; extra fields pass through.
type EntityRecord = {
  text?: string;
  [key: string]: unknown;
};

// One row of the corpus as returned by the dataset endpoints. Every field
// is optional/nullable because upstream sources differ; unknown extra
// fields are preserved via the index signature.
type DatasetRecord = {
  id?: string | number;
  post_id?: string | number | null;
  parent_id?: string | number | null;
  author?: string | null;
  title?: string | null;
  content?: string | null;
  // Temporal fields — getDateBucket prefers date, then dt, then timestamp.
  timestamp?: string | number | null;
  date?: string | null;
  dt?: string | null;
  hour?: number | null;
  weekday?: string | null;
  // Id of the record this one replies to (resolved via authorByPostId).
  reply_to?: string | number | null;
  source?: string | null;
  // Topic-model assignment.
  topic?: string | null;
  topic_confidence?: number | null;
  type?: string | null;
  ner_entities?: EntityRecord[] | null;
  // Per-emotion scores; higher means stronger (see getDominantEmotion).
  emotion_anger?: number | null;
  emotion_disgust?: number | null;
  emotion_fear?: number | null;
  emotion_joy?: number | null;
  emotion_sadness?: number | null;
  [key: string]: unknown;
};
|
||||||
|
|
||||||
|
// Lookup tables precomputed once per corpus pass (buildExplorerContext)
// and shared by every matcher.
type CorpusExplorerContext = {
  // post_id -> author of the record carrying that post_id (last one wins).
  authorByPostId: Map<string, string>;
  // author -> total number of records they authored.
  authorEventCounts: Map<string, number>;
  // author -> number of records of type "comment" they authored.
  authorCommentCounts: Map<string, number>;
};

// Describes one drill-down view: display strings plus a predicate that
// decides which records belong to it.
type CorpusExplorerSpec = {
  title: string;
  description: string;
  emptyMessage?: string;
  matcher: (record: DatasetRecord, context: CorpusExplorerContext) => boolean;
};
|
||||||
|
|
||||||
|
// Pronoun families used for in-group/out-group classification.
// NOTE(review): these carry the `g` flag and are only safe when used via
// countMatches (which recompiles them) — calling .test() on a global
// RegExp directly is stateful through lastIndex.
const IN_GROUP_PATTERN = /\b(we|us|our|ourselves)\b/gi;
const OUT_GROUP_PATTERN = /\b(they|them|their|themselves)\b/gi;
// Stance-marker lexicons (case-insensitive, non-global).
const HEDGE_PATTERN = /\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b/i;
const CERTAINTY_PATTERN = /\b(definitely|certainly|clearly|obviously|undeniably|always|never)\b/i;
const DEONTIC_PATTERN = /\b(must|should|need|needs|have to|has to|ought|required|require)\b/i;
const PERMISSION_PATTERN = /\b(can|allowed|okay|ok|permitted)\b/i;
// Emotion score fields inspected by getDominantEmotion, named exactly as
// they appear on DatasetRecord.
const EMOTION_KEYS = [
  "emotion_anger",
  "emotion_disgust",
  "emotion_fear",
  "emotion_joy",
  "emotion_sadness",
] as const;
|
||||||
|
|
||||||
|
const toText = (value: unknown) => {
|
||||||
|
if (typeof value === "string") {
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof value === "number" || typeof value === "boolean") {
|
||||||
|
return String(value);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (value && typeof value === "object" && "id" in value) {
|
||||||
|
const id = (value as { id?: unknown }).id;
|
||||||
|
if (typeof id === "string" || typeof id === "number") {
|
||||||
|
return String(id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return "";
|
||||||
|
};
|
||||||
|
|
||||||
|
const normalize = (value: unknown) => toText(value).trim().toLowerCase();
|
||||||
|
const getAuthor = (record: DatasetRecord) => toText(record.author).trim();
|
||||||
|
|
||||||
|
const getRecordText = (record: DatasetRecord) =>
|
||||||
|
`${record.title ?? ""} ${record.content ?? ""}`.trim();
|
||||||
|
|
||||||
|
const escapeRegExp = (value: string) =>
|
||||||
|
value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||||
|
|
||||||
|
const buildPhrasePattern = (phrase: string) => {
|
||||||
|
const tokens = phrase
|
||||||
|
.toLowerCase()
|
||||||
|
.trim()
|
||||||
|
.split(/\s+/)
|
||||||
|
.filter(Boolean)
|
||||||
|
.map(escapeRegExp);
|
||||||
|
|
||||||
|
if (!tokens.length) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new RegExp(`\\b${tokens.join("\\s+")}\\b`, "i");
|
||||||
|
};
|
||||||
|
|
||||||
|
const countMatches = (pattern: RegExp, text: string) =>
|
||||||
|
Array.from(text.matchAll(new RegExp(pattern.source, "gi"))).length;
|
||||||
|
|
||||||
|
const getDateBucket = (record: DatasetRecord) => {
|
||||||
|
if (typeof record.date === "string" && record.date) {
|
||||||
|
return record.date.slice(0, 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof record.dt === "string" && record.dt) {
|
||||||
|
return record.dt.slice(0, 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof record.timestamp === "number") {
|
||||||
|
return new Date(record.timestamp * 1000).toISOString().slice(0, 10);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (typeof record.timestamp === "string" && record.timestamp) {
|
||||||
|
const numeric = Number(record.timestamp);
|
||||||
|
if (Number.isFinite(numeric)) {
|
||||||
|
return new Date(numeric * 1000).toISOString().slice(0, 10);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return "";
|
||||||
|
};
|
||||||
|
|
||||||
|
const getDominantEmotion = (record: DatasetRecord) => {
|
||||||
|
let bestKey = "";
|
||||||
|
let bestValue = Number.NEGATIVE_INFINITY;
|
||||||
|
|
||||||
|
for (const key of EMOTION_KEYS) {
|
||||||
|
const value = Number(record[key] ?? Number.NEGATIVE_INFINITY);
|
||||||
|
if (value > bestValue) {
|
||||||
|
bestValue = value;
|
||||||
|
bestKey = key;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bestKey.replace("emotion_", "");
|
||||||
|
};
|
||||||
|
|
||||||
|
const matchesPhrase = (record: DatasetRecord, phrase: string) => {
|
||||||
|
const pattern = buildPhrasePattern(phrase);
|
||||||
|
if (!pattern) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return pattern.test(getRecordText(record));
|
||||||
|
};
|
||||||
|
|
||||||
|
const recordIdentityBucket = (record: DatasetRecord) => {
|
||||||
|
const text = getRecordText(record);
|
||||||
|
const inHits = countMatches(IN_GROUP_PATTERN, text);
|
||||||
|
const outHits = countMatches(OUT_GROUP_PATTERN, text);
|
||||||
|
|
||||||
|
if (inHits > outHits) {
|
||||||
|
return "in";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outHits > inHits) {
|
||||||
|
return "out";
|
||||||
|
}
|
||||||
|
|
||||||
|
return "tie";
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => {
|
||||||
|
const authorByPostId = new Map<string, string>();
|
||||||
|
const authorEventCounts = new Map<string, number>();
|
||||||
|
const authorCommentCounts = new Map<string, number>();
|
||||||
|
|
||||||
|
for (const record of records) {
|
||||||
|
const author = getAuthor(record);
|
||||||
|
if (!author) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
authorEventCounts.set(author, (authorEventCounts.get(author) ?? 0) + 1);
|
||||||
|
|
||||||
|
if (record.type === "comment") {
|
||||||
|
authorCommentCounts.set(author, (authorCommentCounts.get(author) ?? 0) + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (record.post_id !== null && record.post_id !== undefined) {
|
||||||
|
authorByPostId.set(String(record.post_id), author);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return { authorByPostId, authorEventCounts, authorCommentCounts };
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
|
||||||
|
title: "Corpus Explorer",
|
||||||
|
description: "All records in the current filtered dataset.",
|
||||||
|
emptyMessage: "No records match the current filters.",
|
||||||
|
matcher: () => true,
|
||||||
|
});
|
||||||
|
|
||||||
|
const buildUserSpec = (author: string): CorpusExplorerSpec => {
|
||||||
|
const target = normalize(author);
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: `User: ${author}`,
|
||||||
|
description: `All records authored by ${author}.`,
|
||||||
|
emptyMessage: `No records found for ${author}.`,
|
||||||
|
matcher: (record) => normalize(record.author) === target,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildTopicSpec = (topic: string): CorpusExplorerSpec => {
|
||||||
|
const target = normalize(topic);
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: `Topic: ${topic}`,
|
||||||
|
description: `Records assigned to the ${topic} topic bucket.`,
|
||||||
|
emptyMessage: `No records found in the ${topic} topic bucket.`,
|
||||||
|
matcher: (record) => normalize(record.topic) === target,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildDateBucketSpec = (date: string): CorpusExplorerSpec => ({
|
||||||
|
title: `Date Bucket: ${date}`,
|
||||||
|
description: `Records from the ${date} activity bucket.`,
|
||||||
|
emptyMessage: `No records found on ${date}.`,
|
||||||
|
matcher: (record) => getDateBucket(record) === date,
|
||||||
|
});
|
||||||
|
|
||||||
|
const buildWordSpec = (word: string): CorpusExplorerSpec => ({
|
||||||
|
title: `Word: ${word}`,
|
||||||
|
description: `Records containing the word ${word}.`,
|
||||||
|
emptyMessage: `No records mention ${word}.`,
|
||||||
|
matcher: (record) => matchesPhrase(record, word),
|
||||||
|
});
|
||||||
|
|
||||||
|
const buildNgramSpec = (ngram: string): CorpusExplorerSpec => ({
|
||||||
|
title: `N-gram: ${ngram}`,
|
||||||
|
description: `Records containing the phrase ${ngram}.`,
|
||||||
|
emptyMessage: `No records contain the phrase ${ngram}.`,
|
||||||
|
matcher: (record) => matchesPhrase(record, ngram),
|
||||||
|
});
|
||||||
|
|
||||||
|
const buildEntitySpec = (entity: string): CorpusExplorerSpec => {
|
||||||
|
const target = normalize(entity);
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: `Entity: ${entity}`,
|
||||||
|
description: `Records mentioning the ${entity} entity.`,
|
||||||
|
emptyMessage: `No records found for the ${entity} entity.`,
|
||||||
|
matcher: (record) => {
|
||||||
|
const entities = Array.isArray(record.ner_entities) ? record.ner_entities : [];
|
||||||
|
return entities.some((item) => normalize(item?.text) === target) || matchesPhrase(record, entity);
|
||||||
|
},
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildSourceSpec = (source: string): CorpusExplorerSpec => {
|
||||||
|
const target = normalize(source);
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: `Source: ${source}`,
|
||||||
|
description: `Records from the ${source} source.`,
|
||||||
|
emptyMessage: `No records found for ${source}.`,
|
||||||
|
matcher: (record) => normalize(record.source) === target,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => {
|
||||||
|
const target = normalize(emotion);
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: `Dominant Emotion: ${emotion}`,
|
||||||
|
description: `Records where ${emotion} is the strongest emotion score.`,
|
||||||
|
emptyMessage: `No records found with dominant emotion ${emotion}.`,
|
||||||
|
matcher: (record) => getDominantEmotion(record) === target,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => {
|
||||||
|
const sourceName = normalize(source);
|
||||||
|
const targetName = normalize(target);
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: `Reply Path: ${source} -> ${target}`,
|
||||||
|
description: `Reply records authored by ${source} in response to ${target}.`,
|
||||||
|
emptyMessage: `No reply records found for ${source} -> ${target}.`,
|
||||||
|
matcher: (record, context) => {
|
||||||
|
if (normalize(record.author) !== sourceName) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const replyTo = record.reply_to;
|
||||||
|
if (replyTo === null || replyTo === undefined || replyTo === "") {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalize(context.authorByPostId.get(String(replyTo))) === targetName;
|
||||||
|
},
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildOneTimeUsersSpec = (): CorpusExplorerSpec => ({
|
||||||
|
title: "One-Time Users",
|
||||||
|
description: "Records written by authors who appear exactly once in the filtered corpus.",
|
||||||
|
emptyMessage: "No one-time-user records found.",
|
||||||
|
matcher: (record, context) => {
|
||||||
|
const author = getAuthor(record);
|
||||||
|
return !!author && context.authorEventCounts.get(author) === 1;
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
const buildIdentityBucketSpec = (bucket: "in" | "out" | "tie"): CorpusExplorerSpec => {
|
||||||
|
const labels = {
|
||||||
|
in: "In-Group Posts",
|
||||||
|
out: "Out-Group Posts",
|
||||||
|
tie: "Balanced Posts",
|
||||||
|
} as const;
|
||||||
|
|
||||||
|
return {
|
||||||
|
title: labels[bucket],
|
||||||
|
description: `Records in the ${labels[bucket].toLowerCase()} cultural bucket.`,
|
||||||
|
emptyMessage: `No records found for ${labels[bucket].toLowerCase()}.`,
|
||||||
|
matcher: (record) => recordIdentityBucket(record) === bucket,
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
const buildPatternSpec = (
|
||||||
|
title: string,
|
||||||
|
description: string,
|
||||||
|
pattern: RegExp,
|
||||||
|
): CorpusExplorerSpec => ({
|
||||||
|
title,
|
||||||
|
description,
|
||||||
|
emptyMessage: `No records found for ${title.toLowerCase()}.`,
|
||||||
|
matcher: (record) => pattern.test(getRecordText(record)),
|
||||||
|
});
|
||||||
|
|
||||||
|
const buildHedgeSpec = () =>
|
||||||
|
buildPatternSpec("Hedging Words", "Records containing hedging language.", HEDGE_PATTERN);
|
||||||
|
|
||||||
|
const buildCertaintySpec = () =>
|
||||||
|
buildPatternSpec("Certainty Words", "Records containing certainty language.", CERTAINTY_PATTERN);
|
||||||
|
|
||||||
|
const buildDeonticSpec = () =>
|
||||||
|
buildPatternSpec("Need/Should Words", "Records containing deontic language.", DEONTIC_PATTERN);
|
||||||
|
|
||||||
|
const buildPermissionSpec = () =>
|
||||||
|
buildPatternSpec("Permission Words", "Records containing permission language.", PERMISSION_PATTERN);
|
||||||
|
|
||||||
|
// Public surface of the corpus-explorer helpers, consumed by the Stats
// page drill-down dialogs.
export type { DatasetRecord, CorpusExplorerSpec };
export {
  buildAllRecordsSpec,
  buildCertaintySpec,
  buildDateBucketSpec,
  buildDeonticSpec,
  buildDominantEmotionSpec,
  buildEntitySpec,
  buildExplorerContext,
  buildHedgeSpec,
  buildIdentityBucketSpec,
  buildNgramSpec,
  buildOneTimeUsersSpec,
  buildPermissionSpec,
  buildReplyPairSpec,
  buildSourceSpec,
  buildTopicSpec,
  buildUserSpec,
  buildWordSpec,
  getDateBucket,
  toText,
};
|
||||||
@@ -3,7 +3,7 @@ const DEFAULT_TITLE = "Ethnograph View";
|
|||||||
const STATIC_TITLES: Record<string, string> = {
|
const STATIC_TITLES: Record<string, string> = {
|
||||||
"/login": "Sign In",
|
"/login": "Sign In",
|
||||||
"/upload": "Upload Dataset",
|
"/upload": "Upload Dataset",
|
||||||
"/auto-scrape": "Auto Scrape Dataset",
|
"/auto-fetch": "Auto Fetch Dataset",
|
||||||
"/datasets": "My Datasets",
|
"/datasets": "My Datasets",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
BIN
report/img/analysis_bar.png
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
report/img/frontend.png
Normal file
|
After Width: | Height: | Size: 302 KiB |
BIN
report/img/heatmap.png
Normal file
|
After Width: | Height: | Size: 86 KiB |
BIN
report/img/interaction_graph.png
Normal file
|
After Width: | Height: | Size: 114 KiB |
BIN
report/img/kpi_card.png
Normal file
|
After Width: | Height: | Size: 8.7 KiB |
BIN
report/img/navbar.png
Normal file
|
After Width: | Height: | Size: 14 KiB |
BIN
report/img/nlp_backoff.png
Normal file
|
After Width: | Height: | Size: 143 KiB |
BIN
report/img/pipeline.png
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
report/img/stance_markers.png
Normal file
|
After Width: | Height: | Size: 111 KiB |
362
report/main.tex
@@ -115,6 +115,7 @@ This section describes common keywords and metrics use to measure and quantify o
|
|||||||
Sentiment Analysis involves capturing the emotions associated with a specific post, topic or entity. This type of analysis can be as simple as classifying a post as "positive" or "negative", or classifying a post into a set of pre-existing emotions such as anger, joy or sadness.
|
Sentiment Analysis involves capturing the emotions associated with a specific post, topic or entity. This type of analysis can be as simple as classifying a post as "positive" or "negative", or classifying a post into a set of pre-existing emotions such as anger, joy or sadness.
|
||||||
|
|
||||||
\subsubsection{Active vs Passive Participation}
|
\subsubsection{Active vs Passive Participation}
|
||||||
|
\label{sec:passive_participation}
|
||||||
Not everyone in an online community participates in the same way. Some users post regularly and leave comments while others might simply read content without ever contributing anything themselves. Some might only contribute occasionally.
|
Not everyone in an online community participates in the same way. Some users post regularly and leave comments while others might simply read content without ever contributing anything themselves. Some might only contribute occasionally.
|
||||||
|
|
||||||
This distinction between active and passive participation (passive users are often referred to as "lurkers") is important in digital ethnography, because looking only at posts and comments can give a misleading picture of how large or engaged a community actually is.
|
This distinction between active and passive participation (passive users are often referred to as "lurkers") is important in digital ethnography, because looking only at posts and comments can give a misleading picture of how large or engaged a community actually is.
|
||||||
@@ -123,11 +124,13 @@ This distinction between active and passive participation (passive users are oft
|
|||||||
Looking at when a community is active can reveal quite a lot about its nature and membership. A subreddit that peaks at 2am UTC might have a mostly American userbase, while one that is consistently active across all hours could suggest a more globally distributed community. Beyond timezones, temporal patterns can also capture things like how a community responds to external events, like a sudden spike in posting activity often corresponds to something newsworthy happening that is relevant to the community.
|
Looking at when a community is active can reveal quite a lot about its nature and membership. A subreddit that peaks at 2am UTC might have a mostly American userbase, while one that is consistently active across all hours could suggest a more globally distributed community. Beyond timezones, temporal patterns can also capture things like how a community responds to external events, like a sudden spike in posting activity often corresponds to something newsworthy happening that is relevant to the community.
|
||||||
|
|
||||||
\subsubsection{Cultural Markers}
|
\subsubsection{Cultural Markers}
|
||||||
|
\label{sec:cultural_markers}
|
||||||
Cultural markers are the words, phrases, memes, and behaviours that are specific to a particular community and signal that someone is a member of it. These might include in-jokes, niche slang, recurring references, or even particular ways of formatting posts. In the context of digital ethnography, identifying these markers is useful because they reveal how communities build a shared identity and distinguish themselves from outsiders.
|
Cultural markers are the words, phrases, memes, and behaviours that are specific to a particular community and signal that someone is a member of it. These might include in-jokes, niche slang, recurring references, or even particular ways of formatting posts. In the context of digital ethnography, identifying these markers is useful because they reveal how communities build a shared identity and distinguish themselves from outsiders.
|
||||||
|
|
||||||
Some patterns, such as usage of words like "we, us, our, ourselves", where posts are referring to themselves as a community might have different sentiment to posts where words like "they, them, their, themselves" are used. These are known as "identity markers" and they can be used to identify how welcoming a community might be to outsiders.
|
Some patterns, such as usage of words like "we, us, our, ourselves", where posts are referring to themselves as a community might have different sentiment to posts where words like "they, them, their, themselves" are used. These are known as "identity markers" and they can be used to identify how welcoming a community might be to outsiders.
|
||||||
|
|
||||||
\subsubsection{Stance Markers}
|
\subsubsection{Stance Markers}
|
||||||
|
\label{sec:stance_markers}
|
||||||
Stance Markers refer to the usage of different phrasing patterns which can reveal the speaker's attitude towards topics. There are different kinds of these phrasings, such as hedge, certainty, deontic and permission patterns.
|
Stance Markers refer to the usage of different phrasing patterns which can reveal the speaker's attitude towards topics. There are different kinds of these phrasings, such as hedge, certainty, deontic and permission patterns.
|
||||||
|
|
||||||
\textbf{Hedge Patterns} are usually phrases that contain words like "maybe, possibly, probably, i think, i feel" and generally mean that someone is unsure or suspicious about something.
|
\textbf{Hedge Patterns} are usually phrases that contain words like "maybe, possibly, probably, i think, i feel" and generally mean that someone is unsure or suspicious about something.
|
||||||
@@ -461,6 +464,13 @@ As this project is focused on the collection and analysis of online community da
|
|||||||
|
|
||||||
A unified data model is used to represent all incoming data, regardless of its original source or structure. This ensures that the same pipeline works across YouTube, Reddit and boards.ie data, and can be easily extended to new sources in the future.
|
A unified data model is used to represent all incoming data, regardless of its original source or structure. This ensures that the same pipeline works across YouTube, Reddit and boards.ie data, and can be easily extended to new sources in the future.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/pipeline.png}
|
||||||
|
\caption{Data Pipeline Diagram}
|
||||||
|
\label{fig:pipeline}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
\subsubsection{Data Ingestion}
|
\subsubsection{Data Ingestion}
|
||||||
The system will support two methods of data ingestion:
|
The system will support two methods of data ingestion:
|
||||||
\begin{itemize}
|
\begin{itemize}
|
||||||
@@ -517,25 +527,6 @@ NLP processing lets us perform much richer analysis of the dataset, as it provid
|
|||||||
\subsubsection{Data Storage}
|
\subsubsection{Data Storage}
|
||||||
The enriched dataset is stored in a PostgreSQL database, with a schema similar to the unified data model defined in the normalisation section, with additional fields for the derived data, NLP outputs, and user ownership. Each dataset is associated with a specific user account, and the system supports multiple datasets per user.
|
The enriched dataset is stored in a PostgreSQL database, with a schema similar to the unified data model defined in the normalisation section, with additional fields for the derived data, NLP outputs, and user ownership. Each dataset is associated with a specific user account, and the system supports multiple datasets per user.
|
||||||
|
|
||||||
The \texttt{events} table in PostgreSQL contains the following fields:
|
|
||||||
\begin{itemize}
|
|
||||||
\item \texttt{id}: a unique identifier for the event.
|
|
||||||
\item \texttt{dataset\_id}: a foreign key referencing the dataset this event belongs to. If the dataset is deleted, its events are deleted as well.
|
|
||||||
\item \texttt{post\_id}: the original identifier of the post or comment as it appeared on the source platform.
|
|
||||||
\item \texttt{type}: whether the event is a post or a comment.
|
|
||||||
\item \texttt{author}: the username of the content creator.
|
|
||||||
\item \texttt{content}: the text content of the event.
|
|
||||||
\item \texttt{timestamp}: the Unix epoch time at which the content was created.
|
|
||||||
\item \texttt{date}, \texttt{dt}, \texttt{hour}, \texttt{weekday}: datetime fields derived from the timestamp at ingestion time.
|
|
||||||
\item \texttt{title}: the title of the post, if the event is a post. Null for comments.
|
|
||||||
\item \texttt{parent\_id}: for comments, the identifier of the post it belongs to. Null for posts.
|
|
||||||
\item \texttt{reply\_to}: for comments, the identifier of the comment it directly replies to. Null if the comment is a direct reply to a post.
|
|
||||||
\item \texttt{source}: the platform from which the content was retrieved.
|
|
||||||
\item \texttt{topic}, \texttt{topic\_confidence}: the topic assigned to the event by the NLP model, along with a confidence score.
|
|
||||||
\item \texttt{ner\_entities}: a list of named entities identified in the content.
|
|
||||||
\item \texttt{emotion\_anger}, \texttt{emotion\_disgust}, \texttt{emotion\_fear}, \texttt{emotion\_joy}, \texttt{emotion\_sadness}: emotion scores assigned to the event by the NLP model.
|
|
||||||
\end{itemize}
|
|
||||||
|
|
||||||
\subsubsection{Data Retrieval}
|
\subsubsection{Data Retrieval}
|
||||||
The stored dataset can then be retrieved through the Flask API endpoints for analysis. The API supports filtering by keywords and date ranges, as well as grouping and aggregation for various analytical outputs.
|
The stored dataset can then be retrieved through the Flask API endpoints for analysis. The API supports filtering by keywords and date ranges, as well as grouping and aggregation for various analytical outputs.
|
||||||
|
|
||||||
@@ -611,14 +602,7 @@ User analysis allows researchers to understand the behaviour and activity of ind
|
|||||||
In this system, user analysis will include:
|
In this system, user analysis will include:
|
||||||
\begin{itemize}
|
\begin{itemize}
|
||||||
\item Identification of top users based on activity.
|
\item Identification of top users based on activity.
|
||||||
\item Per-user activity such as:
|
\item Per-user activity.
|
||||||
\begin{itemize}
|
|
||||||
\item Total number of events (posts and comments).
|
|
||||||
\item Average emotion distribution across their events.
|
|
||||||
\item Average topic distribution across their events.
|
|
||||||
\item Comment-to-post ratio.
|
|
||||||
\item Vocabulary information such as top words used and lexical diversity.
|
|
||||||
\end{itemize}
|
|
||||||
\end{itemize}
|
\end{itemize}
|
||||||
|
|
||||||
Initially the user endpoint contained the interactional statistics as well, as a case could be made for the user analysis and interaction analysis being combined, however a distinction can be made between individual user analysis and user analysis on a larger, community-level scale focused on interactions. This allows the user endpoint to stay focused on singular user analysis while still using NLP outputs like emotions and topics.
|
Initially the user endpoint contained the interactional statistics as well, as a case could be made for the user analysis and interaction analysis being combined, however a distinction can be made between individual user analysis and user analysis on a larger, community-level scale focused on interactions. This allows the user endpoint to stay focused on singular user analysis while still using NLP outputs like emotions and topics.
|
||||||
@@ -675,11 +659,20 @@ In this system, cultural analysis will include:
|
|||||||
\item Average emotions per entity
|
\item Average emotions per entity
|
||||||
\end{itemize}
|
\end{itemize}
|
||||||
|
|
||||||
|
These metrics were chosen because they can provide insights into the cultural markers and identity signals that are present in an online community, further described in Section \ref{sec:cultural_markers} and \ref{sec:stance_markers}.
|
||||||
|
|
||||||
\subsection{Frontend Design}
|
\subsection{Frontend Design}
|
||||||
The frontend is built with React and TypeScript, and the analysis sections are structured around a tabbed dashboard interface where each tab corresponds to a distinct analytical perspective: temporal, linguistic, emotional, user, and interaction analysis. This organisation mirrors the shape of the backend API and makes it straightforward for a researcher to navigate between different lenses on the same dataset without losing context.
|
The primary audience for this tool is researchers and social scientists, not software developers. Therefore the frontend needs to feel approachable and easy to use for non-technical users. At the same time it must support multi-dataset workflows and handle long-running background processes.
|
||||||
|
|
||||||
React was chosen for its efficient rendering model and the breadth of its visualisation ecosystem
|
React was chosen as the UI framework primarily for its large amount of pre-built visualisation components. There are many different types of data being visualised in this system, such as word clouds, bar charts, line charts, heatmaps and network graphs, and React has a large library of pre-built components for all of these types of visualisations.
|
||||||
|
|
||||||
|
\subsubsection{Structure}
|
||||||
|
A persistent layout shell will wrap every page of the frontend, providing a consistent header for navigation and account management. This will also store login state and user information in a global way, such that no component has to manage authentication state on its own. The main content area will be reserved for the dataset management and analysis interface.
|
||||||
|
|
||||||
|
The frontend will be structured around a tabbed interface, with each tab corresponding to a different analytical endpoint (e.g., temporal analysis, linguistic analysis, emotional analysis). Each tab will fetch data from the backend API and render it using appropriate visualisation libraries. The frontend will also include controls for filtering the dataset based on keywords, date ranges, and data sources.
|
||||||
|
|
||||||
|
\subsubsection{Visual Design}
|
||||||
|
The visual design of the frontend will be clean and minimalistic, with a focus on usability and clarity. The styling files will be centralised to allow for developers to easily change or modify the colouring and palettes in the future.
|
||||||
|
|
||||||
\subsection{Automatic Data Collection}
|
\subsection{Automatic Data Collection}
|
||||||
Originally, the system was designed to only support manual dataset uploads, where users would collect their own data from social media platforms and format it into the required \texttt{.jsonl} format.
|
Originally, the system was designed to only support manual dataset uploads, where users would collect their own data from social media platforms and format it into the required \texttt{.jsonl} format.
|
||||||
@@ -878,6 +871,7 @@ NER output is stored as JSONB rather than in relational columns, as the number o
|
|||||||
This module is a simple interface to deal with datasets in the database, and abstracts away the details of SQL queries and database interactions from the rest of the application. It is used by the API endpoints to manage datasets and their content.
|
This module is a simple interface to deal with datasets in the database, and abstracts away the details of SQL queries and database interactions from the rest of the application. It is used by the API endpoints to manage datasets and their content.
|
||||||
|
|
||||||
\subsubsection{Authentication Manager}
|
\subsubsection{Authentication Manager}
|
||||||
|
\label{sec:auth-manager}
|
||||||
The authentication manager is another higher-level module that provides an interface for managing user authentication in the database. It also uses the low-level \texttt{PostgreConnector} to execute SQL queries, but provides more specific methods for authentication management, such as creating a new user, fetching a user by id, and authenticating a user. It handles password hashing using the \texttt{bcrypt} library, which provides a secure way to hash and verify passwords. Similar to the dataset manager, dependency injection is used to pass an instance of the \texttt{PostgreConnector}.
|
The authentication manager is another higher-level module that provides an interface for managing user authentication in the database. It also uses the low-level \texttt{PostgreConnector} to execute SQL queries, but provides more specific methods for authentication management, such as creating a new user, fetching a user by id, and authenticating a user. It handles password hashing using the \texttt{bcrypt} library, which provides a secure way to hash and verify passwords. Similar to the dataset manager, dependency injection is used to pass an instance of the \texttt{PostgreConnector}.
|
||||||
|
|
||||||
The most important authentication methods implemented are as follows:
|
The most important authentication methods implemented are as follows:
|
||||||
@@ -952,22 +946,332 @@ A middle ground was found with the "Emotion English DistilRoBERTa-base" model fr
|
|||||||
|
|
||||||
As the project progressed and more posts were classified, the "surprise" and "neutral" emotions were found to be dominating the dataset, which made it difficult to analyse the other emotions. This could possibly be because the model is not fine-tuned for internet slang, and usage of exclamation marks and emojis, which are common in social media posts, may be classified as "surprise" or "neutral" rather than the intended emotion. Therefore, the "surprise" and "neutral" emotion classes were removed from the dataset, and the confidence numbers were re-normalised to the remaining 5 emotions.
|
As the project progressed and more posts were classified, the "surprise" and "neutral" emotions were found to be dominating the dataset, which made it difficult to analyse the other emotions. This could possibly be because the model is not fine-tuned for internet slang, and usage of exclamation marks and emojis, which are common in social media posts, may be classified as "surprise" or "neutral" rather than the intended emotion. Therefore, the "surprise" and "neutral" emotion classes were removed from the dataset, and the confidence numbers were re-normalised to the remaining 5 emotions.
|
||||||
|
|
||||||
|
\subsubsection{Topic Classification}
|
||||||
|
For topic classification, a zero-shot classification approach was used, which allows for classification of text into arbitrary topic classes without needing to fine-tune a model for each specific set of topics. Initially, attempts were made to automatically generate topic classes based on the most common words in the dataset using TF-IDF, but this led to generic and strange classes that weren't useful for analysis. Therefore, it was decided that a topic list would be provided manually, either by the user or using a generic list of broad common topics.
|
||||||
|
|
||||||
|
Initially, the "all-mpnet-base-v2" model \cite{all_mpnet_base_v2} was used as the base model for the zero-shot classification, which is a general-purpose sentence embedding model. While this worked well and produced good results, it was slow to run inference on large datasets, and would often take hours to classify a dataset of over 60,000 posts and comments.
|
||||||
|
|
||||||
|
Eventually, the "MiniLM-L6-v2" \cite{minilm_l6_v2} was chosen as the base model for zero-shot classification, which is a smaller and faster sentence embedding model. While it may not produce quite as good results as the larger model, it still produces good results and is much faster to run inference on, which makes it more practical for use in this project.
|
||||||
|
|
||||||
|
\subsubsection{Entity Recognition}
|
||||||
|
At this point, the NLP pipeline was taking a long time to run on large datasets (such as the Cork dataset), therefore any NER (Named Entity Recognition) model that was added needed to be small and fast to run inference on large datasets. The "dslim/bert-base-NER" model from HuggingFace \cite{dslim_bert_base_ner} was chosen as it is a fine-tuned BERT model that can perform named entity recognition, and is relatively small and fast compared to other NER models.
|
||||||
|
|
||||||
|
This model outputs a list of entities for each post, and each entity has a type, which are:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{PER}: Person
|
||||||
|
\item \textbf{ORG}: Organisation
|
||||||
|
\item \textbf{LOC}: Location
|
||||||
|
\item \textbf{MISC}: Miscellaneous
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Since the model outputs have a variable length, they are stored in the database as a \texttt{JSONB} field, which allows for flexible storage of the variable number of entities per post.
|
||||||
|
|
||||||
|
\subsubsection{Optimization}
|
||||||
|
Many issues arose with the performance of the NLP module, as running inference on large datasets can take a long time, especially when using transformer-based models. To optimize the performance of the NLP module, several techniques were used:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Batch Processing}: Instead of running inference on each post individually, posts are processed in batches.
|
||||||
|
\item \textbf{Model Caching}: Models are loaded once and cached in memory, rather than being loaded from disk for each inference.
|
||||||
|
\item \textbf{Batch Size Backoff}: If the model runs out of memory during inference, the batch size is automatically reduced and the inference is retried until it succeeds.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
An example of the batch size backoff implementation is shown in figure \ref{fig:nlp_backoff}.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/nlp_backoff.png}
|
||||||
|
\caption{Batch Size Backoff Implementation}
|
||||||
|
\label{fig:nlp_backoff}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
\subsection{Ethnographic Statistics}
|
\subsection{Ethnographic Statistics}
|
||||||
This section will discuss the implementation of the various ethnographic statistics that are available through the API endpoints, such as temporal analysis, linguistic analysis, emotional analysis, user analysis, interactional analysis, and cultural analysis. Each of these are available through the API and visualised in the frontend.
|
This section will discuss the implementation of the various ethnographic statistics that are available through the API endpoints, such as temporal analysis, linguistic analysis, emotional analysis, user analysis, interactional analysis, and cultural analysis. Each of these are available through the API and visualised in the frontend.
|
||||||
|
|
||||||
|
\subsubsection{Temporal Analysis}
|
||||||
|
Two statistics are implemented for temporal analysis:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Posts Per Day}: A simple count of the number of posts and comments per day, which can be visualised as a line chart or bar chart to show trends over time.
|
||||||
|
\item \textbf{Time Heatmap}: A heatmap of posts and comments by hour of the day and day of the week, which can show patterns in when users are most active.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Both of these statistics are implemented using Pandas queries to aggregate the data by the relevant time periods, and lists of dictionaries are returned to the API for visualisation in the frontend.
|
||||||
|
|
||||||
|
\subsubsection{Linguistic Analysis}
|
||||||
|
Linguistic analysis includes three statistics:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Word Frequency}: A count of the most common words used in the dataset, which can be visualised as a word cloud or bar chart.
|
||||||
|
\item \textbf{N-grams}: A count of the most common n-grams (sequences of n words) used in the dataset, which can also be visualised as a word cloud or bar chart.
|
||||||
|
\item \textbf{Lexical Diversity}: A measure of the diversity of the language used in the dataset, calculated as the ratio of unique words to total words.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Both word frequency and n-grams are calculated using the \texttt{collections.Counter} class, which provides a convenient way to count the occurrences of words and n-grams in the dataset. N-Grams take in a number \texttt{n} as a parameter, which specifies the length of the n-grams to calculate. For example, if \texttt{n} = 2, the most common two word phrases will be returned.
|
||||||
|
|
||||||
|
Lexical diversity is calculated using a simple formula that divides the number of unique words by the total number of words in the dataset.
|
||||||
|
|
||||||
|
This class requires a word exclusion list to be provided, which is a list of common words that should be excluded from the analysis, such as stop words and common words that are not relevant for analysis. These are passed in from the higher level StatGen class.
|
||||||
|
|
||||||
|
\subsubsection{User Analysis}
|
||||||
|
User analysis returns two major statistics:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Top Users}: A count of the most active users in the dataset, which can be visualised as a bar chart or table.
|
||||||
|
\item \textbf{Per User Analysis}: A breakdown of statistics for each user, such as the number of posts and comments, average sentiment, and most common words used by that user. Each user will be analysed as follows:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Total number of events (posts and comments).
|
||||||
|
\item Average emotion distribution across their events.
|
||||||
|
\item Average topic distribution across their events.
|
||||||
|
\item Comment-to-post ratio.
|
||||||
|
\item Vocabulary information such as top words used and lexical diversity.
|
||||||
|
\end{itemize}
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsubsection{Interactional Analysis}
|
||||||
|
Interactional analysis includes three statistics:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Interaction Graph}: A graph of interactions between users, where nodes represent users and edges represent interactions.
|
||||||
|
\item \textbf{Top Interaction Pairs}: A count of the most common pairs of users that interact with each other, which can be visualised as a bar chart or table.
|
||||||
|
\item \textbf{Conversation Concentration}: A measure of how concentrated conversations are around certain users.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The conversation concentration statistic shows the inequality of contributions in conversations, described in Section-\ref{sec:passive_participation}. It identifies the total number of unique commenters, calculates what share of all comments are produced by the most active top 10\% of authors, and measures how many authors only ever commented once. Put together, these metrics reveal the degree to which a community's conversation is driven by a small core of prolific contributors versus being broadly distributed. The metrics returned are:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Total Commenting Users}: The total number of unique users who commented in the dataset.
|
||||||
|
\item \textbf{Top 10\% Comment Share}: The percentage of all comments that were produced by the top 10\% most active commenters.
|
||||||
|
\item \textbf{Top 10\% Author Count}: The number of unique users that make up the top 10\% most active commenters.
|
||||||
|
\item \textbf{One-Time Commenters}: The percentage of users that only commented once in the dataset.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
The interaction graph creates an index of post ids to authors to ensure fast and proper linking when iterating over the dataset. In addition, issues arose with the distinction between someone replying to a post as a comment, and someone replying to a comment. The fix involved checking both \texttt{parent\_id} and \texttt{reply\_to} fields instead of just \texttt{reply\_to}.
|
||||||
|
|
||||||
|
\subsubsection{Emotional Analysis}
|
||||||
|
Emotional analysis includes four statistics:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Average Emotion By Topic}: A breakdown of the average emotion scores for each topic.
|
||||||
|
\item \textbf{Overall Emotional Average}: A breakdown of the average emotion scores for the entire dataset.
|
||||||
|
\item \textbf{Dominant Emotion Distribution}: The distribution of dominant emotions per event in the dataset.
|
||||||
|
\item \textbf{Average Emotion By Source}: A breakdown of the average emotion scores for each source platform.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Throughout development, the "surprise" and "neutral" emotion classes were retained in the data pipeline; however, they were removed from the emotional analysis as they were dominating the dataset and skewing the results.
|
||||||
|
|
||||||
|
\subsubsection{Cultural Analysis}
|
||||||
|
Cultural analysis includes three statistics:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Identity Markers}: Statistics regarding in-group vs out-group markers, how common each are and average emotions with each, visualised as KPIs.
|
||||||
|
\item \textbf{Stance Markers}: Returns hedge, certainty, deontic and permissive markers, how common each are and average emotions with each, visualised as KPIs.
|
||||||
|
\item \textbf{Average Emotions Per Entity}: A breakdown of the average emotion scores for each named entity type (person, organisation, location, miscellaneous).
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
For stance and identity markers, the Python module \texttt{re} was used to find certain words in a post along with the counts of each. \texttt{re} was used instead of a more complex NLP approach as the goal is simply to find certain words quickly, whereas a more complex approach would be far slower.
|
||||||
|
|
||||||
|
With the identity markers, in-group markers such as "we", "us", "our" were counted, as well as out-group markers such as "they", "them", "their". For stance markers, hedge markers such as "maybe", "possibly", "might" were counted, as well as certainty markers such as "definitely", "certainly", "undoubtedly", deontic markers such as "should", "must", "ought to", and permissive markers such as "can", "could", "may". An example of the implementation for stance markers can be seen in figure \ref{fig:stance_markers}.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/stance_markers.png}
|
||||||
|
\caption{Finding Stance Markers with Regular Expressions}
|
||||||
|
\label{fig:stance_markers}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\subsubsection{Summary}
|
||||||
|
During development, it was helpful to see a high-level summary of the entire dataset and it would also be helpful for end-users on the frontend to have a quick overview of the dataset. Therefore, a "summary" statistic was implemented that returns a high-level overview of the dataset, including:
|
||||||
|
\begin{itemize}
|
||||||
|
\item Total number of posts and comments in the dataset.
|
||||||
|
\item Total number of unique users in the dataset.
|
||||||
|
\item Comments per post.
|
||||||
|
\item Lurker Ratio, which is the percentage of users that only have one event in the dataset.
|
||||||
|
\item The time range of the dataset, from the earliest event to the latest event.
|
||||||
|
\item Sources included in the dataset.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
This is implemented in the same way as the other statistics, using Pandas queries and in its own class.
|
||||||
|
|
||||||
|
\subsubsection{StatGen Class}
|
||||||
|
The \texttt{StatGen} (Statistics Generator) class is a higher level module that aggregates all of the different statistics into a single class that is called by the API endpoints to generate the statistics.
|
||||||
|
|
||||||
|
Initially, all statistics were implemented within this class, however as the class grew larger and larger, it was refactored to delegate the different categories of statistics to separate classes, listed in the sections above. The class directly instantiates these analysis classes. Dependency injection of the analysis classes was considered for looser coupling, but since they were split purely for organisational and neatness purposes, extra decoupling complexity wasn't needed.
|
||||||
|
|
||||||
|
Beyond improving the quality of the code, the other main function of this class is to provide a single centralised area to manage statistical filtering. Each statistical method of the class will take in a dictionary of filters as a parameter, then the private method \texttt{\_prepare\_filtered\_df} will apply the filters to the dataset and return the filtered dataset. Four types of filters are supported:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \texttt{start\_date}: A date string that filters the dataset to only include events after the specified date.
|
||||||
|
\item \texttt{end\_date}: A date string that filters the dataset to only include events before the specified date.
|
||||||
|
\item \texttt{source}: A string that filters the dataset to only include events from the specified source.
|
||||||
|
\item \texttt{search\_query}: A string that filters the dataset to only include events that contain the search query in their content.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
Initially, stateful filtering was implemented where the filters would be stored within the \texttt{StatGen} object and are applied to all subsequent methods. The filters were applied once and could then be reset. This worked during initial stages when only one dataset was being tested, however with multiple datasets, this stateful filtering applied to other datasets (even with other users) and caused confusion, therefore a stateless approach was implemented where the filters are passed in as a parameter to each method, and the filtered dataset is returned for that method only, without affecting any other methods or datasets.
|
||||||
|
|
||||||
\subsection{Flask API}
|
\subsection{Flask API}
|
||||||
|
The Flask API is responsible for providing the backend data to the frontend. It provides endpoints for user management, dataset management, and analysis endpoints. It also handles authentication and access control for the API. In addition, it handles extra data through some POST endpoints, such as filtering parameters and auto-fetching parameters for the connectors.
|
||||||
|
|
||||||
|
\subsubsection{User Management}
|
||||||
|
Three endpoints handle user lifecycle management.
|
||||||
|
|
||||||
|
\texttt{POST /register} accepts a JSON body containing a username, email, and password, delegates validation and persistence to \texttt{AuthManager}, described in Section \ref{sec:auth-manager}, and returns a structured error if the username or email is already taken.
|
||||||
|
|
||||||
|
\texttt{POST /login} verifies credentials through \texttt{AuthManager.authenticate\_user()} and, on success, returns a signed JWT access token created with Flask-JWT-Extended's \texttt{create\_access\_token()}. The user's integer ID is embedded as the token identity, which is retrieved on subsequent requests using \texttt{get\_jwt\_identity()}. The token expiry is configurable through the \texttt{JWT\_ACCESS\_TOKEN\_EXPIRES} environment variable.
|
||||||
|
|
||||||
|
\texttt{GET /profile} is a protected endpoint that verifies the token and returns the user's profile information, and serves as a method for the frontend to display user information.
|
||||||
|
|
||||||
|
\subsubsection{Dataset Management}
|
||||||
|
Dataset management is split across several endpoints that cover the full lifecycle of a dataset from creation through deletion.
|
||||||
|
|
||||||
|
\texttt{GET /user/datasets} returns the list of all datasets owned by the authenticated user, used to populate the datasets page in the frontend.
|
||||||
|
|
||||||
|
\texttt{GET /dataset/<id>} returns the metadata for a single dataset.
|
||||||
|
|
||||||
|
\texttt{PATCH /dataset/<id>} allows the user to rename the dataset.
|
||||||
|
|
||||||
|
\texttt{DELETE /dataset/<id>} removes the dataset and all associated events from the database.
|
||||||
|
|
||||||
|
All of these routes begin with an ownership check via \texttt{dataset\_manager.authorize\_user\_dataset()}, and return a \texttt{403} if the requesting user does not own the dataset in question.
|
||||||
|
|
||||||
|
\texttt{POST /datasets/upload} handles manual file upload. It expects a multipart form submission containing a \texttt{.jsonl} posts file, a \texttt{.json} topics file, and a dataset name string. The \texttt{.jsonl} file is read directly into a Pandas DataFrame using \texttt{pd.read\_json(lines=True)}, and the topics file is loaded with the standard \texttt{json} library. Once the dataset metadata is saved to the database, the serialised DataFrame and topics dictionary are passed to the \texttt{process\_dataset} Celery task via \texttt{.delay()}, and the endpoint returns immediately with a \texttt{202 Accepted} response containing the new dataset ID. This non-blocking behaviour is essential given that NLP enrichment can take several minutes for large datasets.
|
||||||
|
|
||||||
|
\texttt{POST /datasets/scrape} handles automated data fetching. The request body contains a list of source configurations, each specifying a connector and optional search query, category, and limit. Careful validation is performed on the source configurations, as any failure within the Celery task would cause a silent failure. The dataset metadata is saved to the database, and the \texttt{fetch\_and\_process\_dataset} task is dispatched asynchronously via Celery. This task fetches each source's data using the appropriate connector, combines the result into a single DataFrame, then passes it through the same enrichment and storage process.
|
||||||
|
|
||||||
|
\texttt{GET /datasets/sources} is an unauthenticated endpoint that returns the connector registry metadata so the frontend can dynamically render the available sources and what they can do.
|
||||||
|
|
||||||
|
\texttt{GET /dataset/<id>/status} allows the frontend to poll the state of a dataset. It returns the current status string and message stored in the \texttt{datasets} table, which the Celery worker updates at each stage of the pipeline, from \texttt{"fetching"} through \texttt{"processing"} to \texttt{"complete"} or \texttt{"error"}.
|
||||||
|
|
||||||
|
\texttt{GET /dataset/<id>/all} returns the full raw event table for a dataset as a list of records, which powers the raw data viewer in the frontend.
|
||||||
|
|
||||||
|
\subsubsection{Analysis Endpoints}
|
||||||
|
Several endpoints are implemented that return each ethnographic statistic generated by the \texttt{StatGen} class. Each endpoint takes a URL parameter for the dataset ID, and an optional JSON body containing filter parameters.
|
||||||
|
|
||||||
|
For each type of analysis, there is a corresponding endpoint, the base configuration being: \texttt{/dataset/<id>/<analysis\_type>}
|
||||||
|
|
||||||
|
Each endpoint needs a JWT authorization header that corresponds to the user that owns that dataset, and the dataset ID is validated against the user's datasets to ensure they have access to it. The endpoint then fetches the entire dataset, and passes it through the global \texttt{StatGen} instance to generate statistics. The resulting statistics are returned as JSON to the frontend for visualisation.
|
||||||
|
|
||||||
|
\subsubsection{Access Control}
|
||||||
|
Endpoints are protected with Flask-JWT-Extended's \texttt{@jwt\_required()} decorator. This ensures that only authenticated users can access the protected endpoints. For dataset-specific endpoints, an additional ownership check is performed using \texttt{dataset\_manager.authorize\_user\_dataset()} to ensure that users can only access their own datasets. If a user attempts to access a dataset they do not own, a \texttt{403 Forbidden} response is returned.
|
||||||
|
|
||||||
|
\subsubsection{Error Handling}
|
||||||
|
Each route handler wraps its logic in a \texttt{try/except} block that catches three categories of exception. \texttt{NotAuthorisedException} maps to a \texttt{403} response. \texttt{NonExistentDatasetException} maps to \texttt{404}. \texttt{ValueError}, which is raised by input validation in the manager layers, maps to \texttt{400}.
|
||||||
|
|
||||||
|
A bare \texttt{Exception} try-catch handles anything unexpected and returns a generic \texttt{500}, while printing a full traceback to the server log via \texttt{traceback.format\_exc()} for debugging. Error messages returned to the client are deliberately vague for unexpected errors, to avoid leaking implementation details.
|
||||||
|
|
||||||
\subsection{React Frontend}
|
\subsection{React Frontend}
|
||||||
|
The frontend is a single-page application built with React and Typescript. It communicates with the Flask backend over a REST API using Axios, and JWT tokens are attached to every authenticated request using an Axios default header.
|
||||||
|
|
||||||
|
React Router is used for client-side routing, and the main pages of the application are:
|
||||||
|
\begin{itemize}
|
||||||
|
\item \textbf{Login Page}: A simple login form that allows users to login and register for an account.
|
||||||
|
\item \textbf{Datasets Page}: A page that lists all of the user's datasets, and allows them to create new datasets through file upload or automated fetching.
|
||||||
|
\item \textbf{Dataset Status Page}: A page that shows the status of a dataset, including the progress of the data pipeline and any errors that may have occurred.
|
||||||
|
\item \textbf{Dataset Edit Page}: A page that allows users to rename or delete a dataset.
|
||||||
|
\item \textbf{Dataset Upload Page}: A page that allows users to upload a dataset through a file upload form.
|
||||||
|
\item \textbf{Dataset Auto-Fetch Page}: A page that allows users to create a dataset through automated data fetching, by selecting sources and providing search queries and limits.
|
||||||
|
\item \textbf{Dataset Analysis Page}: A page that shows all of the ethnographic statistics for a dataset, with various visualisations such as line charts, bar charts and heatmaps.
|
||||||
|
\end{itemize}
|
||||||
|
|
||||||
|
\subsubsection{Layouts}
|
||||||
|
\paragraph{Main Layout}
|
||||||
|
The main layout of the application is a React component that includes a header with the application name, a navigation bar with links to the Datasets page, and a sign out button that clears the JWT token from local storage. The main layout also includes a container for the main content of each page, which is rendered using React Router's \texttt{Outlet} component.
|
||||||
|
|
||||||
|
When logged out, instead of showing the datasets and logout button, it simply shows a sign in button. The navigation bar can be seen in Figure \ref{fig:nav_bar}. The main layout is used for all pages.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/navbar.png}
|
||||||
|
\caption{Navigation Bar in the Main Layout}
|
||||||
|
\label{fig:nav_bar}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\paragraph{Analysis Layout}
|
||||||
|
The analysis layout is a React component that is used for all of the analysis pages. It still uses the Main Layout, however it adds an extra navigation bar that lets users switch between the different types of analysis, such as temporal analysis or linguistic analysis. This allows the code for navigation to be written once and used across all analysis pages, instead of in each analysis page separately. It also simplifies the URL structure, as all analysis pages have the same base URL of \texttt{/dataset/<id>/analysis}, and the type of analysis is determined by an injected React component.
|
||||||
|
|
||||||
|
In addition to an extra navigation bar, it also contains a filter component that allows users to apply filters such as search queries and date filters to the dataset, which are passed in as parameters to the API endpoints to filter the statistics. This allows users to easily filter the dataset and see how the statistics change based on the filters. The analysis layout can be seen in Figure \ref{fig:analysis_layout}.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/analysis_bar.png}
|
||||||
|
\caption{Analysis Layout with Navigation and Filter Components}
|
||||||
|
\label{fig:analysis_layout}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\subsubsection{Analysis Page}
|
||||||
|
The Analysis page fires six API requests in parallel to fetch the six categories of statistics (temporal, linguistic, user, interactional, emotional and cultural), and each category is rendered in a separate section on the page with its own visualisation. The API requests are fired when the page loads, and also whenever the filters are updated. This allows the API calls to be centralised into a single component, such that any change in the filters will automatically update all of the statistics on the page. Applying filters re-fetches all six endpoints with new query parameters.
|
||||||
|
|
||||||
|
The majority of statistics are displayed using a custom KPI component that shows the name of the statistic, the value, and a secondary label for other information. An example of this can be seen in Figure \ref{fig:kpi_card}. The statistics that are not displayed as KPIs, such as the temporal analysis line chart and heatmap, will be discussed in the next sections.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=0.5\textwidth]{img/kpi_card.png}
|
||||||
|
\caption{Custom KPI Component for Displaying Statistics}
|
||||||
|
\label{fig:kpi_card}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\paragraph{Summary}
|
||||||
|
The summary section contains basic KPI cards such as total posts, total users, comments per post, lurker ratio, time range and sources. Beyond KPIs it also contains a word cloud of the most common words in the dataset, which is generated using the \texttt{react-wordcloud} library. The word cloud provides a visual representation of the most common words in the dataset, with the size of each word corresponding to its frequency.
|
||||||
|
|
||||||
|
A heatmap is included in the Summary section (taken from the temporal analysis endpoint) that shows the distribution of posts and comments by hour of the day and day of the week. This allows users to quickly see when users are most active in the dataset. The heatmap is generated using the \texttt{nivo} library, which provides a convenient way to create a heatmap visualisation in React.
|
||||||
|
|
||||||
|
An example of the heatmap can be seen in Figure \ref{fig:heatmap}.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/heatmap.png}
|
||||||
|
\caption{Heatmap of Posts and Comments by Hour and Day in the Cork Dataset}
|
||||||
|
\label{fig:heatmap}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\paragraph{Emotional}
|
||||||
|
The emotional analysis section contains KPI cards for the average emotion scores for each emotion class, as well as slightly adjusted KPI cards for showing the emotions per topic, as they include secondary and tertiary labels that show the model confidence for that emotion and the number of posts that were classified with that emotion.
|
||||||
|
|
||||||
|
\paragraph{Users}
|
||||||
|
The users analysis section contains an interactive interaction graph that shows the replies and chains between users. This was done with the \texttt{react-force-graph} library, which provides a convenient way to create an interactive graph visualisation in React. The nodes of the graph represent users, and the edges represent interactions between users (such as replies). The graph filters out edges with less than two interactions and removes deleted-user nodes (like "[Deleted User]") to remove noise.
|
||||||
|
|
||||||
|
An example of the interaction graph can be seen in Figure \ref{fig:interaction_graph}.
|
||||||
|
|
||||||
|
The rest of the statistics in the users section are displayed as KPI cards.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/interaction_graph.png}
|
||||||
|
\caption{Interaction Graph Showing User Interactions in the Cork Dataset}
|
||||||
|
\label{fig:interaction_graph}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
|
\paragraph{Interactional}
|
||||||
|
The interactional analysis section contains KPI cards for the conversation concentration metrics, as well as a bar chart showing the top interaction pairs, which is generated using the \texttt{nivo} library. A pie chart is used to show the inequality of contributions in conversations, with the share of comments from the top 10\% most active commenters shown in one color, and the share of comments from the rest of the commenters shown in another color.
|
||||||
|
|
||||||
|
\subsubsection{Corpus Explorer}
|
||||||
|
The corpus explorer is a feature that allows users to explore the raw data of the dataset. It is implemented as a table that shows all of the posts and comments in the dataset, along with their metadata such as author, timestamp, and topic. It uses the \texttt{/dataset/<id>/all} API endpoint to fetch the raw data from the backend. It allows a user to click on most statistics and see the underlying posts that make up that statistic. For example, if a user clicks on the "City Center" topic, then the corpus explorer will filter to only show posts that were classified with the "City Center" topic.
|
||||||
|
|
||||||
|
This is purely a frontend feature, and did not require any additional backend implementation beyond the existing API endpoint that returns the raw dataset. Initially, it was thought that performance would be an issue with loading the entire dataset into the frontend, however with some optimisations such as pagination and lazy loading, it was possible to load even large datasets without performance issues.
|
||||||
|
|
||||||
|
The full dataset is fetched once per filter state and then cached in component state. Subsequent explore actions within the same filter state reuse this cached payload rather than making further API requests. The component itself only renders 60 posts at a time, and implements pagination to navigate the dataset and keep performance smooth. This allows users to explore the raw data without overwhelming the frontend with too much data at once.
|
||||||
|
|
||||||
|
The Corpus Explorer addresses a limitation of some ethnographic analysis programs, which is that statistical outputs are summaries, and a summary can be misleading. By making the source texts viewable from any figure in the dashboard, a researcher can verify the accuracy of the statistics.
|
||||||
|
|
||||||
|
\subsubsection{Styling}
|
||||||
|
Where possible, styling is kept with a centralised styling file in the frontend, which contains all of the common styles such as colors, fonts, and spacing.
|
||||||
|
|
||||||
|
\texttt{palette.ts} contains the color palette for the application, which is used across all components to ensure a consistent look and feel. \texttt{appLayout.ts} contains the layout style for the structure and margins of the main layout. For each individual component / page, a separate TS file is used for styling.
|
||||||
|
|
||||||
|
All analysis pages use a grid layout to structure the different cards and visualisations, which allows for a clean and organised presentation of the statistics.
|
||||||
|
|
||||||
|
\begin{figure}
|
||||||
|
\centering
|
||||||
|
\includegraphics[width=1.0\textwidth]{img/frontend.png}
|
||||||
|
\caption{Summary Page of the Application}
|
||||||
|
\label{fig:summary_page}
|
||||||
|
\end{figure}
|
||||||
|
|
||||||
\newpage
|
\newpage
|
||||||
\section{Evaluation}
|
\section{Evaluation}
|
||||||
|
\subsection{User Feedback}
|
||||||
|
A meeting was held with a group of digital ethnographers to demo the application and gather feedback on the design, functionality and usefulness of the application.
|
||||||
|
|
||||||
|
\subsection{NLP Accuracy}
|
||||||
|
\subsection{Performance Benchmarks}
|
||||||
|
\subsection{Limitations}
|
||||||
|
|
||||||
\newpage
|
\newpage
|
||||||
\section{Conclusions}
|
\section{Conclusions}
|
||||||
|
\subsection{Reflection}
|
||||||
|
\subsection{Future Work}
|
||||||
|
|
||||||
|
\newpage
|
||||||
\bibliography{references}
|
\bibliography{references}
|
||||||
|
|
||||||
\end{document}
|
\end{document}
|
||||||
|
|||||||
@@ -13,6 +13,27 @@
|
|||||||
howpublished = {\url{https://huggingface.co/j-hartmann/emotion-english-distilroberta-base/}},
|
howpublished = {\url{https://huggingface.co/j-hartmann/emotion-english-distilroberta-base/}},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@misc{all_mpnet_base_v2,
|
||||||
|
author={Microsoft Research},
|
||||||
|
title={All-MPNet-Base-V2},
|
||||||
|
year={2021},
|
||||||
|
howpublished = {\url{https://huggingface.co/sentence-transformers/all-mpnet-base-v2}},
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{minilm_l6_v2,
|
||||||
|
author={Microsoft Research},
|
||||||
|
title={MiniLM-L6-V2},
|
||||||
|
year={2021},
|
||||||
|
howpublished = {\url{https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2}},
|
||||||
|
}
|
||||||
|
|
||||||
|
@misc{dslim_bert_base_ner,
|
||||||
|
author={deepset},
|
||||||
|
title={dslim/bert-base-NER},
|
||||||
|
year={2018},
|
||||||
|
howpublished = {\url{https://huggingface.co/dslim/bert-base-NER}},
|
||||||
|
}
|
||||||
|
|
||||||
@inproceedings{demszky2020goemotions,
|
@inproceedings{demszky2020goemotions,
|
||||||
author = {Demszky, Dorottya and Movshovitz-Attias, Dana and Ko, Jeongwoo and Cowen, Alan and Nemade, Gaurav and Ravi, Sujith},
|
author = {Demszky, Dorottya and Movshovitz-Attias, Dana and Ko, Jeongwoo and Cowen, Alan and Nemade, Gaurav and Ravi, Sujith},
|
||||||
booktitle = {58th Annual Meeting of the Association for Computational Linguistics (ACL)},
|
booktitle = {58th Annual Meeting of the Association for Computational Linguistics (ACL)},
|
||||||
|
|||||||
@@ -1,17 +1,30 @@
|
|||||||
import pandas as pd
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from itertools import islice
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class NGramConfig:
|
||||||
|
min_token_length: int = 3
|
||||||
|
min_count: int = 2
|
||||||
|
max_results: int = 100
|
||||||
|
|
||||||
|
|
||||||
class LinguisticAnalysis:
|
class LinguisticAnalysis:
|
||||||
def __init__(self, word_exclusions: set[str]):
|
def __init__(self, word_exclusions: set[str]):
|
||||||
self.word_exclusions = word_exclusions
|
self.word_exclusions = word_exclusions
|
||||||
|
self.ngram_config = NGramConfig()
|
||||||
|
|
||||||
def _tokenize(self, text: str):
|
def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
|
||||||
return [t for t in tokens if t not in self.word_exclusions]
|
tokens = re.findall(pattern, text)
|
||||||
|
|
||||||
|
if include_exclusions:
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
return [token for token in tokens if token not in self.word_exclusions]
|
||||||
|
|
||||||
def _clean_text(self, text: str) -> str:
|
def _clean_text(self, text: str) -> str:
|
||||||
text = re.sub(r"http\S+", "", text) # remove URLs
|
text = re.sub(r"http\S+", "", text) # remove URLs
|
||||||
@@ -21,13 +34,24 @@ class LinguisticAnalysis:
|
|||||||
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
|
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def _content_texts(self, df: pd.DataFrame) -> pd.Series:
|
||||||
|
return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
|
||||||
|
|
||||||
|
def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
|
||||||
|
if any(token in self.word_exclusions for token in tokens):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if len(set(tokens)) == 1:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
|
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
|
||||||
texts = df["content"].dropna().astype(str).str.lower()
|
texts = self._content_texts(df)
|
||||||
|
|
||||||
words = []
|
words = []
|
||||||
for text in texts:
|
for text in texts:
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
words.extend(self._tokenize(text))
|
||||||
words.extend(w for w in tokens if w not in self.word_exclusions)
|
|
||||||
|
|
||||||
counts = Counter(words)
|
counts = Counter(words)
|
||||||
|
|
||||||
@@ -40,25 +64,39 @@ class LinguisticAnalysis:
|
|||||||
|
|
||||||
return word_frequencies.to_dict(orient="records")
|
return word_frequencies.to_dict(orient="records")
|
||||||
|
|
||||||
def ngrams(self, df: pd.DataFrame, n=2, limit=100):
|
def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
|
||||||
texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
|
if n < 2:
|
||||||
|
raise ValueError("n must be at least 2")
|
||||||
|
|
||||||
|
texts = self._content_texts(df)
|
||||||
all_ngrams = []
|
all_ngrams = []
|
||||||
|
result_limit = limit or self.ngram_config.max_results
|
||||||
|
|
||||||
for text in texts:
|
for text in texts:
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
tokens = self._tokenize(text, include_exclusions=True)
|
||||||
|
|
||||||
# stop word removal causes strange behaviors in ngrams
|
if len(tokens) < n:
|
||||||
# tokens = [w for w in tokens if w not in self.word_exclusions]
|
continue
|
||||||
|
|
||||||
ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
|
for index in range(len(tokens) - n + 1):
|
||||||
all_ngrams.extend([" ".join(ng) for ng in ngrams])
|
ngram_tokens = tuple(tokens[index : index + n])
|
||||||
|
if self._valid_ngram(ngram_tokens):
|
||||||
|
all_ngrams.append(" ".join(ngram_tokens))
|
||||||
|
|
||||||
counts = Counter(all_ngrams)
|
counts = Counter(all_ngrams)
|
||||||
|
filtered_counts = [
|
||||||
|
(ngram, count)
|
||||||
|
for ngram, count in counts.items()
|
||||||
|
if count >= self.ngram_config.min_count
|
||||||
|
]
|
||||||
|
|
||||||
|
if not filtered_counts:
|
||||||
|
return []
|
||||||
|
|
||||||
return (
|
return (
|
||||||
pd.DataFrame(counts.items(), columns=["ngram", "count"])
|
pd.DataFrame(filtered_counts, columns=["ngram", "count"])
|
||||||
.sort_values("count", ascending=False)
|
.sort_values(["count", "ngram"], ascending=[False, True])
|
||||||
.head(limit)
|
.head(result_limit)
|
||||||
.to_dict(orient="records")
|
.to_dict(orient="records")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import nltk
|
import nltk
|
||||||
|
import json
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from nltk.corpus import stopwords
|
from nltk.corpus import stopwords
|
||||||
|
|
||||||
@@ -27,6 +28,8 @@ DOMAIN_STOPWORDS = {
|
|||||||
"one",
|
"one",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
EXCLUDED_AUTHORS = {"[deleted]", "automoderator"}
|
||||||
|
|
||||||
nltk.download("stopwords")
|
nltk.download("stopwords")
|
||||||
EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
|
EXCLUDE_WORDS = set(stopwords.words("english")) | DOMAIN_STOPWORDS
|
||||||
|
|
||||||
@@ -46,6 +49,12 @@ class StatGen:
|
|||||||
filters = filters or {}
|
filters = filters or {}
|
||||||
filtered_df = df.copy()
|
filtered_df = df.copy()
|
||||||
|
|
||||||
|
if "author" in filtered_df.columns:
|
||||||
|
normalized_authors = (
|
||||||
|
filtered_df["author"].fillna("").astype(str).str.strip().str.lower()
|
||||||
|
)
|
||||||
|
filtered_df = filtered_df[~normalized_authors.isin(EXCLUDED_AUTHORS)]
|
||||||
|
|
||||||
search_query = filters.get("search_query", None)
|
search_query = filters.get("search_query", None)
|
||||||
start_date_filter = filters.get("start_date", None)
|
start_date_filter = filters.get("start_date", None)
|
||||||
end_date_filter = filters.get("end_date", None)
|
end_date_filter = filters.get("end_date", None)
|
||||||
@@ -75,11 +84,22 @@ class StatGen:
|
|||||||
|
|
||||||
return filtered_df
|
return filtered_df
|
||||||
|
|
||||||
|
def _json_ready_records(self, df: pd.DataFrame) -> list[dict]:
|
||||||
|
return json.loads(
|
||||||
|
df.to_json(orient="records", date_format="iso", date_unit="s")
|
||||||
|
)
|
||||||
|
|
||||||
## Public Methods
|
## Public Methods
|
||||||
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
|
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
|
||||||
return self._prepare_filtered_df(df, filters).to_dict(orient="records")
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
return self._json_ready_records(filtered_df)
|
||||||
|
|
||||||
def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
def temporal(
|
||||||
|
self,
|
||||||
|
df: pd.DataFrame,
|
||||||
|
filters: dict | None = None,
|
||||||
|
dataset_id: int | None = None,
|
||||||
|
) -> dict:
|
||||||
filtered_df = self._prepare_filtered_df(df, filters)
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -87,7 +107,12 @@ class StatGen:
|
|||||||
"weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
|
"weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
|
||||||
}
|
}
|
||||||
|
|
||||||
def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
def linguistic(
|
||||||
|
self,
|
||||||
|
df: pd.DataFrame,
|
||||||
|
filters: dict | None = None,
|
||||||
|
dataset_id: int | None = None,
|
||||||
|
) -> dict:
|
||||||
filtered_df = self._prepare_filtered_df(df, filters)
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -97,7 +122,12 @@ class StatGen:
|
|||||||
"lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
|
"lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
|
||||||
}
|
}
|
||||||
|
|
||||||
def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
def emotional(
|
||||||
|
self,
|
||||||
|
df: pd.DataFrame,
|
||||||
|
filters: dict | None = None,
|
||||||
|
dataset_id: int | None = None,
|
||||||
|
) -> dict:
|
||||||
filtered_df = self._prepare_filtered_df(df, filters)
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -107,7 +137,12 @@ class StatGen:
|
|||||||
"emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
|
"emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
|
||||||
}
|
}
|
||||||
|
|
||||||
def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
def user(
|
||||||
|
self,
|
||||||
|
df: pd.DataFrame,
|
||||||
|
filters: dict | None = None,
|
||||||
|
dataset_id: int | None = None,
|
||||||
|
) -> dict:
|
||||||
filtered_df = self._prepare_filtered_df(df, filters)
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -115,7 +150,12 @@ class StatGen:
|
|||||||
"users": self.user_analysis.per_user_analysis(filtered_df)
|
"users": self.user_analysis.per_user_analysis(filtered_df)
|
||||||
}
|
}
|
||||||
|
|
||||||
def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
def interactional(
|
||||||
|
self,
|
||||||
|
df: pd.DataFrame,
|
||||||
|
filters: dict | None = None,
|
||||||
|
dataset_id: int | None = None,
|
||||||
|
) -> dict:
|
||||||
filtered_df = self._prepare_filtered_df(df, filters)
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -124,7 +164,12 @@ class StatGen:
|
|||||||
"conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
|
"conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
|
||||||
}
|
}
|
||||||
|
|
||||||
def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
def cultural(
|
||||||
|
self,
|
||||||
|
df: pd.DataFrame,
|
||||||
|
filters: dict | None = None,
|
||||||
|
dataset_id: int | None = None,
|
||||||
|
) -> dict:
|
||||||
filtered_df = self._prepare_filtered_df(df, filters)
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@@ -133,7 +178,12 @@ class StatGen:
|
|||||||
"avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
|
"avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
|
||||||
}
|
}
|
||||||
|
|
||||||
def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict:
|
def summary(
|
||||||
|
self,
|
||||||
|
df: pd.DataFrame,
|
||||||
|
filters: dict | None = None,
|
||||||
|
dataset_id: int | None = None,
|
||||||
|
) -> dict:
|
||||||
filtered_df = self._prepare_filtered_df(df, filters)
|
filtered_df = self._prepare_filtered_df(df, filters)
|
||||||
|
|
||||||
return self.summary_analysis.summary(filtered_df)
|
return self.summary_analysis.summary(filtered_df)
|
||||||
|
|||||||
@@ -152,9 +152,9 @@ def get_dataset_sources():
|
|||||||
return jsonify(list_metadata)
|
return jsonify(list_metadata)
|
||||||
|
|
||||||
|
|
||||||
@app.route("/datasets/scrape", methods=["POST"])
|
@app.route("/datasets/fetch", methods=["POST"])
|
||||||
@jwt_required()
|
@jwt_required()
|
||||||
def scrape_data():
|
def fetch_data():
|
||||||
data = request.get_json()
|
data = request.get_json()
|
||||||
connector_metadata = get_connector_metadata()
|
connector_metadata = get_connector_metadata()
|
||||||
|
|
||||||
@@ -424,7 +424,7 @@ def get_linguistic_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
|
return jsonify(stat_gen.linguistic(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -448,7 +448,7 @@ def get_emotional_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.emotional(dataset_content, filters)), 200
|
return jsonify(stat_gen.emotional(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -472,7 +472,7 @@ def get_summary(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.summary(dataset_content, filters)), 200
|
return jsonify(stat_gen.summary(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -496,7 +496,7 @@ def get_temporal_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.temporal(dataset_content, filters)), 200
|
return jsonify(stat_gen.temporal(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -520,7 +520,7 @@ def get_user_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.user(dataset_content, filters)), 200
|
return jsonify(stat_gen.user(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -544,7 +544,7 @@ def get_cultural_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.cultural(dataset_content, filters)), 200
|
return jsonify(stat_gen.cultural(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -568,7 +568,7 @@ def get_interaction_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.interactional(dataset_content, filters)), 200
|
return jsonify(stat_gen.interactional(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -591,7 +591,8 @@ def get_full_dataset(dataset_id: int):
|
|||||||
)
|
)
|
||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
return jsonify(dataset_content.to_dict(orient="records")), 200
|
filters = get_request_filters()
|
||||||
|
return jsonify(stat_gen.filter_dataset(dataset_content, filters)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
|
|||||||
@@ -26,7 +26,34 @@ class DatasetManager:
|
|||||||
def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
|
def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
|
||||||
query = "SELECT * FROM events WHERE dataset_id = %s"
|
query = "SELECT * FROM events WHERE dataset_id = %s"
|
||||||
result = self.db.execute(query, (dataset_id,), fetch=True)
|
result = self.db.execute(query, (dataset_id,), fetch=True)
|
||||||
return pd.DataFrame(result)
|
df = pd.DataFrame(result)
|
||||||
|
if df.empty:
|
||||||
|
return df
|
||||||
|
|
||||||
|
dedupe_columns = [
|
||||||
|
column
|
||||||
|
for column in [
|
||||||
|
"post_id",
|
||||||
|
"parent_id",
|
||||||
|
"reply_to",
|
||||||
|
"author",
|
||||||
|
"type",
|
||||||
|
"timestamp",
|
||||||
|
"dt",
|
||||||
|
"title",
|
||||||
|
"content",
|
||||||
|
"source",
|
||||||
|
"topic",
|
||||||
|
]
|
||||||
|
if column in df.columns
|
||||||
|
]
|
||||||
|
|
||||||
|
if dedupe_columns:
|
||||||
|
df = df.drop_duplicates(subset=dedupe_columns, keep="first")
|
||||||
|
else:
|
||||||
|
df = df.drop_duplicates(keep="first")
|
||||||
|
|
||||||
|
return df.reset_index(drop=True)
|
||||||
|
|
||||||
def get_dataset_info(self, dataset_id: int) -> dict:
|
def get_dataset_info(self, dataset_id: int) -> dict:
|
||||||
query = "SELECT * FROM datasets WHERE id = %s"
|
query = "SELECT * FROM datasets WHERE id = %s"
|
||||||
@@ -52,6 +79,16 @@ class DatasetManager:
|
|||||||
if event_data.empty:
|
if event_data.empty:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
dedupe_columns = [
|
||||||
|
column for column in ["id", "type", "source"] if column in event_data.columns
|
||||||
|
]
|
||||||
|
if dedupe_columns:
|
||||||
|
event_data = event_data.drop_duplicates(subset=dedupe_columns, keep="first")
|
||||||
|
else:
|
||||||
|
event_data = event_data.drop_duplicates(keep="first")
|
||||||
|
|
||||||
|
self.delete_dataset_content(dataset_id)
|
||||||
|
|
||||||
query = """
|
query = """
|
||||||
INSERT INTO events (
|
INSERT INTO events (
|
||||||
dataset_id,
|
dataset_id,
|
||||||
|
|||||||