Compare commits

..

63 Commits

Author SHA1 Message Date
5970f555fa docs(readme): update readme 2026-04-19 13:54:09 +01:00
9b7a51ff33 docs(report): add Declaration of Originality and Acknowledgements sections 2026-04-18 22:10:16 +01:00
2d39ea6e66 refactor(connector): clean up comments 2026-04-18 22:10:03 +01:00
c1e5482f55 docs(report): fix typos 2026-04-18 16:09:22 +01:00
b2d7f6edaf docs(report): add visualizations and emotional analysis for Cork dataset 2026-04-18 15:44:04 +01:00
10efa664df docs(report): fix typos and add more eval 2026-04-17 20:31:39 +01:00
3db7c1d3ae docs(report): add future work section 2026-04-16 16:54:18 +01:00
72e17e900e fix(report): correct typos 2026-04-16 16:41:27 +01:00
7b9a17f395 fix(connector): reduce ThreadPoolExecutor max_workers 2026-04-16 16:37:27 +01:00
0a396dd504 docs(report): add more citations 2026-04-16 16:23:36 +01:00
c6e8144116 docs(report): add traditional vs digital ethnography reference 2026-04-16 16:08:59 +01:00
760d2daf7f docs(report): remove redundant phrasing 2026-04-16 15:59:24 +01:00
ca38b992eb build(docker): switch backend flask deployment to Gunicorn 2026-04-15 17:57:22 +01:00
ee9c7b4ab2 docs(report): finish evaluation & reflection 2026-04-15 17:52:54 +01:00
703a7c435c fix(youtube_api): video search capped at 50 2026-04-14 17:54:43 +01:00
02ba727d05 chore(connector): add buffer to ratelimit reset 2026-04-14 17:41:09 +01:00
76591bc89e feat(tasks): add fetch and NLP processing time logging to dataset status 2026-04-14 17:35:43 +01:00
e35e51d295 fix(reddit_api): handle rate limit wait time conversion error 2026-04-14 17:35:21 +01:00
d2fe637743 docs: update references for digital ethnography and further work on evaluation 2026-04-14 15:16:56 +01:00
e1831aab7d docs(report): add researcher feedback 2026-04-13 22:00:41 +01:00
a3ef5a5655 chore: add more defaults to example env 2026-04-13 22:00:19 +01:00
5f943ce733 Merge pull request 'Corpus Explorer Feature' (#11) from feat/corpus-explorer into main
Reviewed-on: #11
2026-04-13 19:02:45 +01:00
9964a919c3 docs(report): enhance frontend design section 2026-04-13 19:01:51 +01:00
c11434344a refactor: streamline CorpusExplorer components 2026-04-13 17:06:46 +01:00
bc356848ef docs(report): start frontend section 2026-04-13 16:43:20 +01:00
047427432f docs(report): add summary section for dataset overview and update authentication manager details 2026-04-13 12:24:43 +01:00
d0d02e9ebf docs(report): add stance markers image and update related sections 2026-04-12 16:15:18 +01:00
68342606e3 docs(report): add NLP backoff diagram and update references for NER model 2026-04-11 15:24:57 +01:00
afae7f42a1 docs(report): add data pipeline diagram and update references for embedding models 2026-04-11 15:03:24 +01:00
4dd2721e98 Merge remote-tracking branch 'origin/main' into feat/corpus-explorer 2026-04-10 13:19:17 +01:00
99afe82464 docs(report): refine emotional classification model details 2026-04-10 13:17:11 +01:00
8c44df94c0 docs(report): update references for emotion classification models and NLP techniques 2026-04-09 19:01:21 +01:00
42905cc547 docs(report): add connector implementation & design NLP docs 2026-04-08 20:39:51 +01:00
ec64551881 fix(connectors): update User-Agent header for BoardsAPI 2026-04-08 19:34:30 +01:00
e274b8295a docs(report): add citations and start implementation section 2026-04-08 17:28:41 +01:00
3df6776111 docs(report): add decision tradeoff decisions 2026-04-07 18:04:25 +01:00
a347869353 docs(report): add more justification for ethnographic endpoints 2026-04-07 15:22:47 +01:00
8b4e13702e docs(report): add ucc crest to title page 2026-04-07 12:55:01 +01:00
8fa4f3fbdf refactor(report): move data pipeline above ethnographic analysis 2026-04-07 12:52:48 +01:00
c6cae040f0 feat(analysis): add emotional averages to stance markers 2026-04-07 12:49:18 +01:00
addc1d4087 docs(report): add justification at each stage 2026-04-07 12:17:02 +01:00
225133a074 docs(report): add ethnographic analysis section 2026-04-07 11:54:57 +01:00
e903e1b738 feat(user): add dominant topic information to user data 2026-04-07 11:34:03 +01:00
0c4dc02852 docs(report): add ethnographic analysis section 2026-04-06 19:39:09 +01:00
33e4291def docs(report): add table of contents 2026-04-06 19:34:38 +01:00
cedbce128e docs(report): add auto-fetch section 2026-04-06 19:32:49 +01:00
107dae0e95 docs(report): add data storage section 2026-04-06 19:26:10 +01:00
23833e2c5b docs(report): add custom topic section 2026-04-06 18:47:29 +01:00
f2b6917f1f docs(report): add data ingestion section 2026-04-06 12:44:17 +01:00
b57a8d3c65 docs(report): add data pipeline and connector sections
Also moved requirements to the end of design, where it is more appropriately placed. Requirements can be specified after discussing potential pitfalls.
2026-04-04 14:36:52 +01:00
ac65e26eab docs(report): add ethics section 2026-04-04 13:52:56 +01:00
6efa75dfe6 chore(connectors): reduce aggressive parallel connections to boards.ie 2026-04-04 12:33:06 +01:00
de61e7653f perf(connector): add reddit API authentication to speed up fetching
This aligns better with ethics and massively increases rate limits.
2026-04-04 12:26:54 +01:00
98aa04256b fix(reddit_api): fix reddit ratelimit check 2026-04-04 10:20:48 +01:00
5f81c51979 docs(report): add scalability constraints 2026-04-03 20:06:19 +01:00
361b532766 docs(analysis): add feasibility analysis 2026-04-03 20:02:22 +01:00
9ef96661fc report(analysis): update structure & add justifications 2026-04-03 18:35:08 +01:00
9375abded5 docs(design): add docker & async processing sections 2026-04-03 17:59:01 +01:00
74ecdf238a docs: add database schema diagram 2026-04-02 19:30:20 +01:00
b85987e179 docs: add system architecture diagram 2026-04-02 18:59:32 +01:00
37d08c63b8 chore: rename auto-scraper to auto-fetcher
Improves the perception of ethics
2026-04-01 09:50:53 +01:00
1482e96051 feat(datasets): implement deduplication of dataset records in get_dataset_content 2026-04-01 09:06:07 +01:00
cd6030a760 fix(ngrams): remove stop words from ngrams 2026-04-01 08:44:47 +01:00
51 changed files with 1996 additions and 484 deletions

1
.gitignore vendored
View File

@@ -13,3 +13,4 @@ dist/
helper helper
db db
report/build report/build
.DS_Store

View File

@@ -1,29 +1,49 @@
# crosspost # crosspost
**crosspost** is a browser-based tool designed to support *digital ethnography*, the study of how people interact, communicate, and form culture in online spaces such as forums, social media platforms, and comment-driven communities. A web-based analytics platform for exploring online communities. Built as a final year CS project at UCC, crosspost ingests data from Reddit, YouTube, and Boards.ie, runs NLP analysis on it (emotion detection, topic classification, named entity recognition, stance markers), and surfaces the results through an interactive dashboard.
The motivating use case is digital ethnography — studying how people talk, what they talk about, and how culture forms in online spaces. The included dataset is centred on Cork, Ireland.
The project aims to make it easier for students, researchers, and journalists to collect, organise, and explore online discourse in a structured and ethical way, without requiring deep technical expertise. ## What it does
- Fetch posts and comments from Reddit, YouTube, and Boards.ie (or upload your own .jsonl file)
- Normalise everything into a unified schema regardless of source
- Run NLP analysis asynchronously in the background via Celery workers
- Explore results through a tabbed dashboard: temporal patterns, word clouds, emotion breakdowns, user activity, interaction graphs, topic clusters, and more
- Multi-user support — each user has their own datasets, isolated from everyone else
By combining data ingestion, analysis, and visualisation in a single system, crosspost turns raw online interactions into meaningful insights about how conversations emerge, evolve, and spread across platforms. # Prerequisites
- Docker & Docker Compose
- A Reddit App (client id & secret)
- YouTube Data v3 API Key
## Goals for this project # Setup
- Collect data ethically: enable users to link/upload text, images, and interaction data (messages etc) from specified online communities. Potentially and automated method for importing (using APIs or scraping techniques) could be included as well. 1) **Clone the Repo**
- Organise content: Store gathered material in a structured database with tagging for themes, dates, and sources. ```
Analyse patterns: Use natural language processing (NLP) to detect frequent keywords, sentiment, and interaction networks. git clone https://github.com/your-username/crosspost.git
- Visualise insights: Present findings as charts, timelines, and network diagrams to reveal how conversations and topics evolve. cd crosspost
- Have clearly stated and explained ethical and privacy guidelines for users. The student will design the architecture, implement data pipelines, integrate basic NLP models, and create an interactive dashboard. ```
Beyond programming, the project involves applying ethical research principles, handling data responsibly, and designing for non-technical users. By the end, the project will demonstrate how computer science can bridge technology and social research — turning raw online interactions into meaningful cultural insights. 2) **Configure Environment Vars**
```
cp example.env .env
```
Fill in each required empty env. Some are already filled in; these are sensible defaults that usually don't need to be changed.
## Scope 3) **Start everything**
```
docker compose up -d
```
This project focuses on: This starts:
- Designing a modular data ingestion pipeline - `crosspost_db` — PostgreSQL on port 5432
- Implementing backend data processing and storage - `crosspost_redis` — Redis on port 6379
- Integrating lightweight NLP-based analysis - `crosspost_flask` — Flask API on port 5000
- Building a simple, accessible frontend for exploration and visualisation - `crosspost_worker` — Celery worker for background NLP/fetching tasks
- `crosspost_frontend` — Vite dev server on port 5173
# Requirements # Data Format for Manual Uploads
If you want to upload your own data rather than fetch it via the connectors, the expected format is newline-delimited JSON (.jsonl) where each line is a post object:
```json
{"id": "abc123", "author": "username", "title": "Post title", "content": "Post body", "url": "https://...", "timestamp": 1700000000.0, "source": "reddit", "comments": []}
```
- **Python** ≥ 3.9 # Notes
- **Python packages** listed in `requirements.txt` - **GPU support**: The Celery worker is configured with `--pool=solo` to avoid memory conflicts when multiple NLP models are loaded. If you have an NVIDIA GPU, uncomment the deploy.resources block in docker-compose.yml and make sure the NVIDIA Container Toolkit is installed.
- npm ≥ version 11

View File

@@ -28,7 +28,7 @@ services:
- .env - .env
ports: ports:
- "5000:5000" - "5000:5000"
command: flask --app server.app run --host=0.0.0.0 --debug command: gunicorn server.app:app --bind 0.0.0.0:5000 --workers 2 --threads 4
depends_on: depends_on:
- postgres - postgres
- redis - redis
@@ -48,13 +48,13 @@ services:
depends_on: depends_on:
- postgres - postgres
- redis - redis
# deploy: deploy:
# resources: resources:
# reservations: reservations:
# devices: devices:
# - driver: nvidia - driver: nvidia
# count: 1 count: 1
# capabilities: [gpu] capabilities: [gpu]
frontend: frontend:
build: build:

View File

@@ -1,8 +0,0 @@
# Generic User Data Transfer Object for social media platforms
class User:
def __init__(self, username: str, created_utc: int, ):
self.username = username
self.created_utc = created_utc
# Optionals
self.karma = None

View File

@@ -1,13 +1,16 @@
# API Keys # API Keys
YOUTUBE_API_KEY= YOUTUBE_API_KEY=
REDDIT_CLIENT_ID=
REDDIT_CLIENT_SECRET=
# Database # Database
POSTGRES_USER= # Database
POSTGRES_PASSWORD= POSTGRES_USER=postgres
POSTGRES_DB= POSTGRES_PASSWORD=postgres
POSTGRES_HOST= POSTGRES_DB=mydatabase
POSTGRES_HOST=postgres
POSTGRES_PORT=5432 POSTGRES_PORT=5432
POSTGRES_DIR= POSTGRES_DIR=./db
# JWT # JWT
JWT_SECRET_KEY= JWT_SECRET_KEY=

View File

@@ -5,7 +5,7 @@ import DatasetsPage from "./pages/Datasets";
import DatasetStatusPage from "./pages/DatasetStatus"; import DatasetStatusPage from "./pages/DatasetStatus";
import LoginPage from "./pages/Login"; import LoginPage from "./pages/Login";
import UploadPage from "./pages/Upload"; import UploadPage from "./pages/Upload";
import AutoScrapePage from "./pages/AutoScrape"; import AutoFetchPage from "./pages/AutoFetch";
import StatPage from "./pages/Stats"; import StatPage from "./pages/Stats";
import { getDocumentTitle } from "./utils/documentTitle"; import { getDocumentTitle } from "./utils/documentTitle";
import DatasetEditPage from "./pages/DatasetEdit"; import DatasetEditPage from "./pages/DatasetEdit";
@@ -23,7 +23,7 @@ function App() {
<Route path="/" element={<Navigate to="/login" replace />} /> <Route path="/" element={<Navigate to="/login" replace />} />
<Route path="/login" element={<LoginPage />} /> <Route path="/login" element={<LoginPage />} />
<Route path="/upload" element={<UploadPage />} /> <Route path="/upload" element={<UploadPage />} />
<Route path="/auto-scrape" element={<AutoScrapePage />} /> <Route path="/auto-fetch" element={<AutoFetchPage />} />
<Route path="/datasets" element={<DatasetsPage />} /> <Route path="/datasets" element={<DatasetsPage />} />
<Route path="/dataset/:datasetId/status" element={<DatasetStatusPage />} /> <Route path="/dataset/:datasetId/status" element={<DatasetStatusPage />} />
<Route path="/dataset/:datasetId/stats" element={<StatPage />} /> <Route path="/dataset/:datasetId/stats" element={<StatPage />} />

View File

@@ -1,4 +1,4 @@
import { useEffect, useMemo, useState } from "react"; import { useEffect, useState } from "react";
import { Dialog, DialogPanel, DialogTitle } from "@headlessui/react"; import { Dialog, DialogPanel, DialogTitle } from "@headlessui/react";
import StatsStyling from "../styles/stats_styling"; import StatsStyling from "../styles/stats_styling";
@@ -103,11 +103,6 @@ const CorpusExplorer = ({
} }
}, [open, title, records.length]); }, [open, title, records.length]);
const visibleRecords = useMemo(
() => records.slice(0, visibleCount),
[records, visibleCount],
);
const hasMoreRecords = visibleCount < records.length; const hasMoreRecords = visibleCount < records.length;
return ( return (
@@ -158,7 +153,7 @@ const CorpusExplorer = ({
paddingRight: 4, paddingRight: 4,
}} }}
> >
{visibleRecords.map((record, index) => { {records.slice(0, visibleCount).map((record, index) => {
const recordKey = getRecordKey(record, index); const recordKey = getRecordKey(record, index);
const titleText = getRecordTitle(record); const titleText = getRecordTitle(record);
const content = cleanText(record.content); const content = cleanText(record.content);

View File

@@ -8,11 +8,11 @@ import {
buildHedgeSpec, buildHedgeSpec,
buildIdentityBucketSpec, buildIdentityBucketSpec,
buildPermissionSpec, buildPermissionSpec,
getExplorerButtonStyle,
type CorpusExplorerSpec, type CorpusExplorerSpec,
} from "../utils/corpusExplorer"; } from "../utils/corpusExplorer";
const styles = StatsStyling; const styles = StatsStyling;
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
type CulturalStatsProps = { type CulturalStatsProps = {
data: CulturalAnalysisResponse; data: CulturalAnalysisResponse;
@@ -22,7 +22,7 @@ type CulturalStatsProps = {
const renderExploreButton = (onClick: () => void) => ( const renderExploreButton = (onClick: () => void) => (
<button <button
onClick={onClick} onClick={onClick}
style={{ ...styles.buttonSecondary, ...getExplorerButtonStyle() }} style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
> >
Explore Explore
</button> </button>

View File

@@ -26,12 +26,12 @@ import {
buildDateBucketSpec, buildDateBucketSpec,
buildOneTimeUsersSpec, buildOneTimeUsersSpec,
buildUserSpec, buildUserSpec,
getExplorerButtonStyle,
type CorpusExplorerSpec, type CorpusExplorerSpec,
} from "../utils/corpusExplorer"; } from "../utils/corpusExplorer";
const styles = StatsStyling; const styles = StatsStyling;
const MAX_WORDCLOUD_WORDS = 250; const MAX_WORDCLOUD_WORDS = 250;
const exploreButtonStyle = { padding: "4px 8px", fontSize: 12 };
const WORDCLOUD_OPTIONS = { const WORDCLOUD_OPTIONS = {
rotations: 2, rotations: 2,
@@ -80,7 +80,7 @@ function convertFrequencyData(data: FrequencyWord[]) {
const renderExploreButton = (onClick: () => void) => ( const renderExploreButton = (onClick: () => void) => (
<button <button
onClick={onClick} onClick={onClick}
style={{ ...styles.buttonSecondary, ...getExplorerButtonStyle() }} style={{ ...styles.buttonSecondary, ...exploreButtonStyle }}
> >
Explore Explore
</button> </button>

View File

@@ -88,6 +88,15 @@ export default function UserModal({
</div> </div>
</div> </div>
) : null} ) : null}
{userData.dominant_topic ? (
<div style={styles.topUserItem}>
<div style={styles.topUserName}>Most Common Topic</div>
<div style={styles.topUserMeta}>
{userData.dominant_topic.topic} ({userData.dominant_topic.count} events)
</div>
</div>
) : null}
</div> </div>
)} )}
</DialogPanel> </DialogPanel>

View File

@@ -20,7 +20,7 @@ type GraphLink = {
value: number; value: number;
}; };
function ApiToGraphData(apiData: InteractionGraph) { function toGraphData(apiData: InteractionGraph) {
const links: GraphLink[] = []; const links: GraphLink[] = [];
const connectedNodeIds = new Set<string>(); const connectedNodeIds = new Set<string>();
@@ -56,7 +56,7 @@ const UserStats = ({
onExplore, onExplore,
}: UserStatsProps) => { }: UserStatsProps) => {
const graphData = useMemo( const graphData = useMemo(
() => ApiToGraphData(interactionGraph), () => toGraphData(interactionGraph),
[interactionGraph], [interactionGraph],
); );
const graphContainerRef = useRef<HTMLDivElement | null>(null); const graphContainerRef = useRef<HTMLDivElement | null>(null);

View File

@@ -37,7 +37,7 @@ const supportsSearch = (source?: SourceOption): boolean =>
const supportsCategories = (source?: SourceOption): boolean => const supportsCategories = (source?: SourceOption): boolean =>
Boolean(source?.categories_enabled ?? source?.categoriesEnabled); Boolean(source?.categories_enabled ?? source?.categoriesEnabled);
const AutoScrapePage = () => { const AutoFetchPage = () => {
const navigate = useNavigate(); const navigate = useNavigate();
const [datasetName, setDatasetName] = useState(""); const [datasetName, setDatasetName] = useState("");
const [sourceOptions, setSourceOptions] = useState<SourceOption[]>([]); const [sourceOptions, setSourceOptions] = useState<SourceOption[]>([]);
@@ -106,11 +106,11 @@ const AutoScrapePage = () => {
); );
}; };
const autoScrape = async () => { const autoFetch = async () => {
const token = localStorage.getItem("access_token"); const token = localStorage.getItem("access_token");
if (!token) { if (!token) {
setHasError(true); setHasError(true);
setReturnMessage("You must be signed in to auto scrape a dataset."); setReturnMessage("You must be signed in to auto fetch a dataset.");
return; return;
} }
@@ -243,7 +243,7 @@ const AutoScrapePage = () => {
setReturnMessage(""); setReturnMessage("");
const response = await axios.post( const response = await axios.post(
`${API_BASE_URL}/datasets/scrape`, `${API_BASE_URL}/datasets/fetch`,
requestBody, requestBody,
{ {
headers: { headers: {
@@ -255,7 +255,7 @@ const AutoScrapePage = () => {
const datasetId = Number(response.data.dataset_id); const datasetId = Number(response.data.dataset_id);
setReturnMessage( setReturnMessage(
`Auto scrape queued successfully (dataset #${datasetId}). Redirecting to processing status...`, `Auto fetch queued successfully (dataset #${datasetId}). Redirecting to processing status...`,
); );
setTimeout(() => { setTimeout(() => {
@@ -267,11 +267,11 @@ const AutoScrapePage = () => {
const message = String( const message = String(
requestError.response?.data?.error || requestError.response?.data?.error ||
requestError.message || requestError.message ||
"Auto scrape failed.", "Auto fetch failed.",
); );
setReturnMessage(`Auto scrape failed: ${message}`); setReturnMessage(`Auto fetch failed: ${message}`);
} else { } else {
setReturnMessage("Auto scrape failed due to an unexpected error."); setReturnMessage("Auto fetch failed due to an unexpected error.");
} }
} finally { } finally {
setIsSubmitting(false); setIsSubmitting(false);
@@ -283,9 +283,9 @@ const AutoScrapePage = () => {
<div style={styles.containerWide}> <div style={styles.containerWide}>
<div style={{ ...styles.card, ...styles.headerBar }}> <div style={{ ...styles.card, ...styles.headerBar }}>
<div> <div>
<h1 style={styles.sectionHeaderTitle}>Auto Scrape Dataset</h1> <h1 style={styles.sectionHeaderTitle}>Auto Fetch Dataset</h1>
<p style={styles.sectionHeaderSubtitle}> <p style={styles.sectionHeaderSubtitle}>
Select sources and scrape settings, then queue processing Select sources and fetch settings, then queue processing
automatically. automatically.
</p> </p>
<p <p
@@ -295,7 +295,7 @@ const AutoScrapePage = () => {
color: "#9a6700", color: "#9a6700",
}} }}
> >
Warning: Scraping more than 250 posts from any single site can Warning: Fetching more than 250 posts from any single site can
take hours due to rate limits. take hours due to rate limits.
</p> </p>
</div> </div>
@@ -305,10 +305,10 @@ const AutoScrapePage = () => {
...styles.buttonPrimary, ...styles.buttonPrimary,
opacity: isSubmitting || isLoadingSources ? 0.75 : 1, opacity: isSubmitting || isLoadingSources ? 0.75 : 1,
}} }}
onClick={autoScrape} onClick={autoFetch}
disabled={isSubmitting || isLoadingSources} disabled={isSubmitting || isLoadingSources}
> >
{isSubmitting ? "Queueing..." : "Auto Scrape and Analyze"} {isSubmitting ? "Queueing..." : "Auto Fetch and Analyze"}
</button> </button>
</div> </div>
@@ -527,4 +527,4 @@ const AutoScrapePage = () => {
); );
}; };
export default AutoScrapePage; export default AutoFetchPage;

View File

@@ -108,9 +108,9 @@ const DatasetsPage = () => {
<button <button
type="button" type="button"
style={styles.buttonSecondary} style={styles.buttonSecondary}
onClick={() => navigate("/auto-scrape")} onClick={() => navigate("/auto-fetch")}
> >
Auto Scrape Dataset Auto Fetch Dataset
</button> </button>
</div> </div>
</div> </div>

View File

@@ -66,45 +66,110 @@ const EMPTY_EXPLORER_STATE: ExplorerState = {
error: "", error: "",
}; };
const getExplorerRecordIdentity = (record: DatasetRecord) => const createExplorerState = (
JSON.stringify({ spec: CorpusExplorerSpec,
post_id: record.post_id ?? null, patch: Partial<ExplorerState> = {},
parent_id: record.parent_id ?? null, ): ExplorerState => ({
reply_to: record.reply_to ?? null, open: true,
author: record.author ?? null, title: spec.title,
type: record.type ?? null, description: spec.description,
timestamp: record.timestamp ?? null, emptyMessage: spec.emptyMessage ?? "No matching records found.",
dt: record.dt ?? null, records: [],
title: record.title ?? null, loading: false,
content: record.content ?? null, error: "",
source: record.source ?? null, ...patch,
topic: record.topic ?? null, });
});
const dedupeExplorerRecords = (records: DatasetRecord[]) => { const compareRecordsByNewest = (a: DatasetRecord, b: DatasetRecord) => {
const uniqueRecords: DatasetRecord[] = []; const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
const seen = new Set<string>(); const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
return bValue.localeCompare(aValue);
};
for (const record of records) { const parseJsonLikePayload = (value: string): unknown => {
const identity = getExplorerRecordIdentity(record); const normalized = value
if (seen.has(identity)) { .replace(/\uFEFF/g, "")
continue; .replace(/,\s*([}\]])/g, "$1")
.replace(/(:\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
.replace(/(\[\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
.replace(/(,\s*)(NaN|Infinity|-Infinity)\b/g, "$1null")
.replace(/(:\s*)None\b/g, "$1null")
.replace(/(:\s*)True\b/g, "$1true")
.replace(/(:\s*)False\b/g, "$1false")
.replace(/(\[\s*)None\b/g, "$1null")
.replace(/(\[\s*)True\b/g, "$1true")
.replace(/(\[\s*)False\b/g, "$1false")
.replace(/(,\s*)None\b/g, "$1null")
.replace(/(,\s*)True\b/g, "$1true")
.replace(/(,\s*)False\b/g, "$1false");
return JSON.parse(normalized);
};
const tryParseRecords = (value: string) => {
try {
return normalizeRecordPayload(parseJsonLikePayload(value));
} catch {
return null;
}
};
const parseRecordStringPayload = (payload: string): DatasetRecord[] | null => {
const trimmed = payload.trim();
if (!trimmed) {
return [];
} }
seen.add(identity); const direct = tryParseRecords(trimmed);
uniqueRecords.push(record); if (direct) {
return direct;
} }
return uniqueRecords; const ndjsonLines = trimmed
.split(/\r?\n/)
.map((line) => line.trim())
.filter(Boolean);
if (ndjsonLines.length > 0) {
try {
return ndjsonLines.map((line) => parseJsonLikePayload(line)) as DatasetRecord[];
} catch {
}
}
const bracketStart = trimmed.indexOf("[");
const bracketEnd = trimmed.lastIndexOf("]");
if (bracketStart !== -1 && bracketEnd > bracketStart) {
const parsed = tryParseRecords(trimmed.slice(bracketStart, bracketEnd + 1));
if (parsed) {
return parsed;
}
}
const braceStart = trimmed.indexOf("{");
const braceEnd = trimmed.lastIndexOf("}");
if (braceStart !== -1 && braceEnd > braceStart) {
const parsed = tryParseRecords(trimmed.slice(braceStart, braceEnd + 1));
if (parsed) {
return parsed;
}
}
return null;
}; };
const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => { const normalizeRecordPayload = (payload: unknown): DatasetRecord[] => {
if (typeof payload === "string") { if (typeof payload === "string") {
try { const parsed = parseRecordStringPayload(payload);
return normalizeRecordPayload(JSON.parse(payload)); if (parsed) {
} catch { return parsed;
throw new Error("Corpus endpoint returned a non-JSON string payload.");
} }
const preview = payload.trim().slice(0, 120).replace(/\s+/g, " ");
throw new Error(
`Corpus endpoint returned a non-JSON string payload.${
preview ? ` Response preview: ${preview}` : ""
}`,
);
} }
if ( if (
@@ -265,9 +330,7 @@ const StatPage = () => {
}, },
); );
const normalizedRecords = dedupeExplorerRecords( const normalizedRecords = normalizeRecordPayload(response.data);
normalizeRecordPayload(response.data),
);
setAllRecords(normalizedRecords); setAllRecords(normalizedRecords);
setAllRecordsKey(filterKey); setAllRecordsKey(filterKey);
@@ -275,47 +338,22 @@ const StatPage = () => {
}; };
const openExplorer = async (spec: CorpusExplorerSpec) => { const openExplorer = async (spec: CorpusExplorerSpec) => {
setExplorerState({ setExplorerState(createExplorerState(spec, { loading: true }));
open: true,
title: spec.title,
description: spec.description,
emptyMessage: spec.emptyMessage ?? "No matching records found.",
records: [],
loading: true,
error: "",
});
try { try {
const records = await ensureFilteredRecords(); const records = await ensureFilteredRecords();
const context = buildExplorerContext(records); const context = buildExplorerContext(records);
const matched = dedupeExplorerRecords( const matched = records
records.filter((record) => spec.matcher(record, context)), .filter((record) => spec.matcher(record, context))
); .sort(compareRecordsByNewest);
matched.sort((a, b) => {
const aValue = String(a.dt ?? a.date ?? a.timestamp ?? "");
const bValue = String(b.dt ?? b.date ?? b.timestamp ?? "");
return bValue.localeCompare(aValue);
});
setExplorerState({ setExplorerState(createExplorerState(spec, { records: matched }));
open: true,
title: spec.title,
description: spec.description,
emptyMessage: spec.emptyMessage ?? "No matching records found.",
records: matched,
loading: false,
error: "",
});
} catch (e) { } catch (e) {
setExplorerState({ setExplorerState(
open: true, createExplorerState(spec, {
title: spec.title,
description: spec.description,
emptyMessage: spec.emptyMessage ?? "No matching records found.",
records: [],
loading: false,
error: `Failed to load corpus records: ${String(e)}`, error: `Failed to load corpus records: ${String(e)}`,
}); }),
);
} }
}; };

View File

@@ -34,6 +34,11 @@ type Vocab = {
top_words: FrequencyWord[]; top_words: FrequencyWord[];
}; };
type DominantTopic = {
topic: string;
count: number;
};
type User = { type User = {
author: string; author: string;
post: number; post: number;
@@ -41,6 +46,7 @@ type User = {
comment_post_ratio: number; comment_post_ratio: number;
comment_share: number; comment_share: number;
avg_emotions?: Record<string, number>; avg_emotions?: Record<string, number>;
dominant_topic?: DominantTopic | null;
vocab?: Vocab | null; vocab?: Vocab | null;
}; };
@@ -162,6 +168,10 @@ type StanceMarkers = {
certainty_per_1k_tokens: number; certainty_per_1k_tokens: number;
deontic_per_1k_tokens: number; deontic_per_1k_tokens: number;
permission_per_1k_tokens: number; permission_per_1k_tokens: number;
hedge_emotion_avg?: Record<string, number>;
certainty_emotion_avg?: Record<string, number>;
deontic_emotion_avg?: Record<string, number>;
permission_emotion_avg?: Record<string, number>;
}; };
type EntityEmotionAggregate = { type EntityEmotionAggregate = {
@@ -202,6 +212,7 @@ type FilterResponse = {
export type { export type {
TopUser, TopUser,
DominantTopic,
Vocab, Vocab,
User, User,
InteractionGraph, InteractionGraph,

View File

@@ -1,5 +1,3 @@
import type { CSSProperties } from "react";
type EntityRecord = { type EntityRecord = {
text?: string; text?: string;
[key: string]: unknown; [key: string]: unknown;
@@ -58,11 +56,6 @@ const EMOTION_KEYS = [
"emotion_sadness", "emotion_sadness",
] as const; ] as const;
const shrinkButtonStyle: CSSProperties = {
padding: "4px 8px",
fontSize: 12,
};
const toText = (value: unknown) => { const toText = (value: unknown) => {
if (typeof value === "string") { if (typeof value === "string") {
return value; return value;
@@ -83,6 +76,7 @@ const toText = (value: unknown) => {
}; };
const normalize = (value: unknown) => toText(value).trim().toLowerCase(); const normalize = (value: unknown) => toText(value).trim().toLowerCase();
const getAuthor = (record: DatasetRecord) => toText(record.author).trim();
const getRecordText = (record: DatasetRecord) => const getRecordText = (record: DatasetRecord) =>
`${record.title ?? ""} ${record.content ?? ""}`.trim(); `${record.title ?? ""} ${record.content ?? ""}`.trim();
@@ -152,11 +146,11 @@ const matchesPhrase = (record: DatasetRecord, phrase: string) => {
return false; return false;
} }
return pattern.test(getRecordText(record).toLowerCase()); return pattern.test(getRecordText(record));
}; };
const recordIdentityBucket = (record: DatasetRecord) => { const recordIdentityBucket = (record: DatasetRecord) => {
const text = getRecordText(record).toLowerCase(); const text = getRecordText(record);
const inHits = countMatches(IN_GROUP_PATTERN, text); const inHits = countMatches(IN_GROUP_PATTERN, text);
const outHits = countMatches(OUT_GROUP_PATTERN, text); const outHits = countMatches(OUT_GROUP_PATTERN, text);
@@ -171,48 +165,30 @@ const recordIdentityBucket = (record: DatasetRecord) => {
return "tie"; return "tie";
}; };
const createAuthorEventCounts = (records: DatasetRecord[]) => { const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => {
const counts = new Map<string, number>(); const authorByPostId = new Map<string, string>();
const authorEventCounts = new Map<string, number>();
const authorCommentCounts = new Map<string, number>();
for (const record of records) { for (const record of records) {
const author = toText(record.author).trim(); const author = getAuthor(record);
if (!author) { if (!author) {
continue; continue;
} }
counts.set(author, (counts.get(author) ?? 0) + 1);
}
return counts;
};
const createAuthorCommentCounts = (records: DatasetRecord[]) => { authorEventCounts.set(author, (authorEventCounts.get(author) ?? 0) + 1);
const counts = new Map<string, number>();
for (const record of records) {
const author = toText(record.author).trim();
if (!author || record.type !== "comment") {
continue;
}
counts.set(author, (counts.get(author) ?? 0) + 1);
}
return counts;
};
const createAuthorByPostId = (records: DatasetRecord[]) => { if (record.type === "comment") {
const map = new Map<string, string>(); authorCommentCounts.set(author, (authorCommentCounts.get(author) ?? 0) + 1);
for (const record of records) {
const postId = record.post_id;
const author = toText(record.author).trim();
if (postId === null || postId === undefined || !author) {
continue;
} }
map.set(String(postId), author);
}
return map;
};
const buildExplorerContext = (records: DatasetRecord[]): CorpusExplorerContext => ({ if (record.post_id !== null && record.post_id !== undefined) {
authorByPostId: createAuthorByPostId(records), authorByPostId.set(String(record.post_id), author);
authorEventCounts: createAuthorEventCounts(records), }
authorCommentCounts: createAuthorCommentCounts(records), }
});
return { authorByPostId, authorEventCounts, authorCommentCounts };
};
const buildAllRecordsSpec = (): CorpusExplorerSpec => ({ const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
title: "Corpus Explorer", title: "Corpus Explorer",
@@ -221,19 +197,27 @@ const buildAllRecordsSpec = (): CorpusExplorerSpec => ({
matcher: () => true, matcher: () => true,
}); });
const buildUserSpec = (author: string): CorpusExplorerSpec => ({ const buildUserSpec = (author: string): CorpusExplorerSpec => {
const target = normalize(author);
return {
title: `User: ${author}`, title: `User: ${author}`,
description: `All records authored by ${author}.`, description: `All records authored by ${author}.`,
emptyMessage: `No records found for ${author}.`, emptyMessage: `No records found for ${author}.`,
matcher: (record) => normalize(record.author) === normalize(author), matcher: (record) => normalize(record.author) === target,
}); };
};
const buildTopicSpec = (topic: string): CorpusExplorerSpec => ({ const buildTopicSpec = (topic: string): CorpusExplorerSpec => {
const target = normalize(topic);
return {
title: `Topic: ${topic}`, title: `Topic: ${topic}`,
description: `Records assigned to the ${topic} topic bucket.`, description: `Records assigned to the ${topic} topic bucket.`,
emptyMessage: `No records found in the ${topic} topic bucket.`, emptyMessage: `No records found in the ${topic} topic bucket.`,
matcher: (record) => normalize(record.topic) === normalize(topic), matcher: (record) => normalize(record.topic) === target,
}); };
};
const buildDateBucketSpec = (date: string): CorpusExplorerSpec => ({ const buildDateBucketSpec = (date: string): CorpusExplorerSpec => ({
title: `Date Bucket: ${date}`, title: `Date Bucket: ${date}`,
@@ -256,37 +240,52 @@ const buildNgramSpec = (ngram: string): CorpusExplorerSpec => ({
matcher: (record) => matchesPhrase(record, ngram), matcher: (record) => matchesPhrase(record, ngram),
}); });
const buildEntitySpec = (entity: string): CorpusExplorerSpec => ({ const buildEntitySpec = (entity: string): CorpusExplorerSpec => {
const target = normalize(entity);
return {
title: `Entity: ${entity}`, title: `Entity: ${entity}`,
description: `Records mentioning the ${entity} entity.`, description: `Records mentioning the ${entity} entity.`,
emptyMessage: `No records found for the ${entity} entity.`, emptyMessage: `No records found for the ${entity} entity.`,
matcher: (record) => { matcher: (record) => {
const target = normalize(entity);
const entities = Array.isArray(record.ner_entities) ? record.ner_entities : []; const entities = Array.isArray(record.ner_entities) ? record.ner_entities : [];
return entities.some((item) => normalize(item?.text) === target) || matchesPhrase(record, entity); return entities.some((item) => normalize(item?.text) === target) || matchesPhrase(record, entity);
}, },
}); };
};
const buildSourceSpec = (source: string): CorpusExplorerSpec => ({ const buildSourceSpec = (source: string): CorpusExplorerSpec => {
const target = normalize(source);
return {
title: `Source: ${source}`, title: `Source: ${source}`,
description: `Records from the ${source} source.`, description: `Records from the ${source} source.`,
emptyMessage: `No records found for ${source}.`, emptyMessage: `No records found for ${source}.`,
matcher: (record) => normalize(record.source) === normalize(source), matcher: (record) => normalize(record.source) === target,
}); };
};
const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => ({ const buildDominantEmotionSpec = (emotion: string): CorpusExplorerSpec => {
const target = normalize(emotion);
return {
title: `Dominant Emotion: ${emotion}`, title: `Dominant Emotion: ${emotion}`,
description: `Records where ${emotion} is the strongest emotion score.`, description: `Records where ${emotion} is the strongest emotion score.`,
emptyMessage: `No records found with dominant emotion ${emotion}.`, emptyMessage: `No records found with dominant emotion ${emotion}.`,
matcher: (record) => getDominantEmotion(record) === normalize(emotion), matcher: (record) => getDominantEmotion(record) === target,
}); };
};
const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => ({ const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec => {
const sourceName = normalize(source);
const targetName = normalize(target);
return {
title: `Reply Path: ${source} -> ${target}`, title: `Reply Path: ${source} -> ${target}`,
description: `Reply records authored by ${source} in response to ${target}.`, description: `Reply records authored by ${source} in response to ${target}.`,
emptyMessage: `No reply records found for ${source} -> ${target}.`, emptyMessage: `No reply records found for ${source} -> ${target}.`,
matcher: (record, context) => { matcher: (record, context) => {
if (normalize(record.author) !== normalize(source)) { if (normalize(record.author) !== sourceName) {
return false; return false;
} }
@@ -295,49 +294,21 @@ const buildReplyPairSpec = (source: string, target: string): CorpusExplorerSpec
return false; return false;
} }
const replyTarget = context.authorByPostId.get(String(replyTo)); return normalize(context.authorByPostId.get(String(replyTo))) === targetName;
return normalize(replyTarget) === normalize(target);
}, },
}); };
};
const buildOneTimeUsersSpec = (): CorpusExplorerSpec => ({ const buildOneTimeUsersSpec = (): CorpusExplorerSpec => ({
title: "One-Time Users", title: "One-Time Users",
description: "Records written by authors who appear exactly once in the filtered corpus.", description: "Records written by authors who appear exactly once in the filtered corpus.",
emptyMessage: "No one-time-user records found.", emptyMessage: "No one-time-user records found.",
matcher: (record, context) => { matcher: (record, context) => {
const author = toText(record.author).trim(); const author = getAuthor(record);
return !!author && context.authorEventCounts.get(author) === 1; return !!author && context.authorEventCounts.get(author) === 1;
}, },
}); });
const buildTopCommentersSpec = (topAuthorCount: number): CorpusExplorerSpec => ({
title: "Top Commenters",
description: `Comment records from the top ${topAuthorCount} commenters in the filtered corpus.`,
emptyMessage: "No top-commenter records found.",
matcher: (record, context) => {
if (record.type !== "comment") {
return false;
}
const rankedAuthors = Array.from(context.authorCommentCounts.entries())
.sort((a, b) => b[1] - a[1])
.slice(0, topAuthorCount)
.map(([author]) => author);
return rankedAuthors.includes(toText(record.author).trim());
},
});
const buildSingleCommentAuthorsSpec = (): CorpusExplorerSpec => ({
title: "Single-Comment Authors",
description: "Comment records from authors who commented exactly once.",
emptyMessage: "No single-comment-author records found.",
matcher: (record, context) => {
const author = toText(record.author).trim();
return record.type === "comment" && !!author && context.authorCommentCounts.get(author) === 1;
},
});
const buildIdentityBucketSpec = (bucket: "in" | "out" | "tie"): CorpusExplorerSpec => { const buildIdentityBucketSpec = (bucket: "in" | "out" | "tie"): CorpusExplorerSpec => {
const labels = { const labels = {
in: "In-Group Posts", in: "In-Group Posts",
@@ -376,9 +347,7 @@ const buildDeonticSpec = () =>
const buildPermissionSpec = () => const buildPermissionSpec = () =>
buildPatternSpec("Permission Words", "Records containing permission language.", PERMISSION_PATTERN); buildPatternSpec("Permission Words", "Records containing permission language.", PERMISSION_PATTERN);
const getExplorerButtonStyle = () => shrinkButtonStyle; export type { DatasetRecord, CorpusExplorerSpec };
export type { DatasetRecord, CorpusExplorerContext, CorpusExplorerSpec };
export { export {
buildAllRecordsSpec, buildAllRecordsSpec,
buildCertaintySpec, buildCertaintySpec,
@@ -393,13 +362,10 @@ export {
buildOneTimeUsersSpec, buildOneTimeUsersSpec,
buildPermissionSpec, buildPermissionSpec,
buildReplyPairSpec, buildReplyPairSpec,
buildSingleCommentAuthorsSpec,
buildSourceSpec, buildSourceSpec,
buildTopicSpec, buildTopicSpec,
buildTopCommentersSpec,
buildUserSpec, buildUserSpec,
buildWordSpec, buildWordSpec,
getDateBucket, getDateBucket,
getExplorerButtonStyle,
toText, toText,
}; };

View File

@@ -3,7 +3,7 @@ const DEFAULT_TITLE = "Ethnograph View";
const STATIC_TITLES: Record<string, string> = { const STATIC_TITLES: Record<string, string> = {
"/login": "Sign In", "/login": "Sign In",
"/upload": "Upload Dataset", "/upload": "Upload Dataset",
"/auto-scrape": "Auto Scrape Dataset", "/auto-fetch": "Auto Fetch Dataset",
"/datasets": "My Datasets", "/datasets": "My Datasets",
}; };

BIN
report/img/analysis_bar.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

BIN
report/img/architecture.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 70 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 274 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 90 KiB

BIN
report/img/frontend.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 302 KiB

BIN
report/img/gantt.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

BIN
report/img/heatmap.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 86 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 114 KiB

BIN
report/img/kpi_card.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.7 KiB

BIN
report/img/moods.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

BIN
report/img/navbar.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
report/img/ngrams.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 38 KiB

BIN
report/img/nlp_backoff.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 143 KiB

BIN
report/img/pipeline.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

BIN
report/img/reddit_bot.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 232 KiB

BIN
report/img/schema.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 64 KiB

BIN
report/img/signature.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 152 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 111 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

BIN
report/img/ucc_crest.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

File diff suppressed because it is too large Load Diff

149
report/references.bib Normal file
View File

@@ -0,0 +1,149 @@
@online{reddit_api,
author = {{Reddit Inc.}},
title = {Reddit API Documentation},
year = {2025},
url = {https://www.reddit.com/dev/api/},
urldate = {2026-04-08}
}
@misc{hartmann2022emotionenglish,
author={Hartmann, Jochen},
title={Emotion English DistilRoBERTa-base},
year={2022},
howpublished = {\url{https://huggingface.co/j-hartmann/emotion-english-distilroberta-base/}},
}
@misc{all_mpnet_base_v2,
author={Microsoft Research},
title={All-MPNet-Base-V2},
year={2021},
howpublished = {\url{https://huggingface.co/sentence-transformers/all-mpnet-base-v2}},
}
@misc{minilm_l6_v2,
author={Microsoft Research},
title={MiniLM-L6-V2},
year={2021},
howpublished = {\url{https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2}},
}
@misc{dslim_bert_base_ner,
author={deepset},
title={dslim/bert-base-NER},
year={2018},
howpublished = {\url{https://huggingface.co/dslim/bert-base-NER}},
}
@inproceedings{demszky2020goemotions,
author = {Demszky, Dorottya and Movshovitz-Attias, Dana and Ko, Jeongwoo and Cowen, Alan and Nemade, Gaurav and Ravi, Sujith},
booktitle = {58th Annual Meeting of the Association for Computational Linguistics (ACL)},
title = {{GoEmotions: A Dataset of Fine-Grained Emotions}},
year = {2020}
}
@article{dominguez2007virtual,
author = {Domínguez, Daniel and Beaulieu, Anne and Estalella, Adolfo and Gómez, Edgar and Schnettler, Bernt and Read, Rosie},
title = {Virtual Ethnography},
journal = {Forum Qualitative Sozialforschung / Forum: Qualitative Social Research},
year = {2007},
volume = {8},
number = {3},
url = {http://nbn-resolving.de/urn:nbn:de:0114-fqs0703E19}
}
@article{sun2014lurkers,
author = {Sun, Na and Rau, Pei-Luen Patrick and Ma, Liang},
title = {Understanding Lurkers in Online Communities: A Literature Review},
journal = {Computers in Human Behavior},
year = {2014},
volume = {38},
pages = {110--117},
doi = {10.1016/j.chb.2014.05.022}
}
@article{ahmad2024sentiment,
author = {Ahmad, Waqar and others},
title = {Recent Advancements and Challenges of NLP-based Sentiment Analysis: A State-of-the-art Review},
journal = {Natural Language Processing Journal},
year = {2024},
doi = {10.1016/j.nlp.2024.100059}
}
@article{coleman2010ethnographic,
ISSN = {00846570},
URL = {http://www.jstor.org/stable/25735124},
abstract = {This review surveys and divides the ethnographic corpus on digital media into three broad but overlapping categories: the cultural politics of digital media, the vernacular cultures of digital media, and the prosaics of digital media. Engaging these three categories of scholarship on digital media, I consider how ethnographers are exploring the complex relationships between the local practices and global implications of digital media, their materiality and politics, and thier banal, as well as profound, presence in cultural life and modes of communication. I consider the way these media have become central to the articulation of cherished beliefs, ritual practices, and modes of being in the world; the fact that digital media culturally matters is undeniable but showing how, where, and why it matters is necessary to push against peculiarly narrow presumptions about the universality of digital experience.},
author = {E. Gabriella Coleman},
journal = {Annual Review of Anthropology},
pages = {487--505},
publisher = {Annual Reviews},
title = {Ethnographic Approaches to Digital Media},
urldate = {2026-04-15},
volume = {39},
year = {2010}
}
@article{shen2021stance,
author = {Shen, Qian and Tao, Yating},
title = {Stance Markers in {English} Medical Research Articles and Newspaper Opinion Columns: A Comparative Corpus-Based Study},
journal = {PLOS ONE},
volume = {16},
number = {3},
pages = {e0247981},
year = {2021},
doi = {10.1371/journal.pone.0247981}
}
@incollection{medvedev2019anatomy,
author = {Medvedev, Alexey N. and Lambiotte, Renaud and Delvenne, Jean-Charles},
title = {The Anatomy of Reddit: An Overview of Academic Research},
booktitle = {Dynamics On and Of Complex Networks III},
series = {Springer Proceedings in Complexity},
publisher = {Springer},
year = {2019},
pages = {183--204}
}
@misc{cook2023ethnography,
author = {Cook, Chloe},
title = {What is the Difference Between Ethnography and Digital Ethnography?},
year = {2023},
month = jan,
day = {19},
howpublished = {\url{https://ethosapp.com/blog/what-is-the-difference-between-ethnography-and-digital-ethnography/}},
note = {Accessed: 2026-04-16},
organization = {EthOS}
}
@misc{giuffre2026sentiment,
author = {Giuffre, Steven},
title = {What is Sentiment Analysis?},
year = {2026},
month = mar,
howpublished = {\url{https://www.vonage.com/resources/articles/sentiment-analysis/}},
note = {Accessed: 2026-04-16},
organization = {Vonage}
}
@misc{mungalpara2022stemming,
author = {Mungalpara, Jaimin},
title = {Stemming Lemmatization Stopwords and {N}-Grams in {NLP}},
year = {2022},
month = jul,
day = {26},
howpublished = {\url{https://jaimin-ml2001.medium.com/stemming-lemmatization-stopwords-and-n-grams-in-nlp-96f8e8b6aa6f}},
note = {Accessed: 2026-04-16},
organization = {Medium}
}
@misc{chugani2025ethicalscraping,
author = {Chugani, Vinod},
title = {Ethical Web Scraping: Principles and Practices},
year = {2025},
month = apr,
day = {21},
howpublished = {\url{https://www.datacamp.com/blog/ethical-web-scraping}},
note = {Accessed: 2026-04-16},
organization = {DataCamp}
}

View File

@@ -16,3 +16,4 @@ Requests==2.32.5
sentence_transformers==5.2.2 sentence_transformers==5.2.2
torch==2.10.0 torch==2.10.0
transformers==5.1.0 transformers==5.1.0
gunicorn==25.3.0

View File

@@ -67,6 +67,12 @@ class CulturalAnalysis:
def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]: def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
s = df[self.content_col].fillna("").astype(str) s = df[self.content_col].fillna("").astype(str)
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [
c
for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions
]
hedge_pattern = re.compile( hedge_pattern = re.compile(
r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b" r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b"
@@ -88,7 +94,7 @@ class CulturalAnalysis:
0, 1 0, 1
) )
return { result = {
"hedge_total": int(hedge_counts.sum()), "hedge_total": int(hedge_counts.sum()),
"certainty_total": int(certainty_counts.sum()), "certainty_total": int(certainty_counts.sum()),
"deontic_total": int(deontic_counts.sum()), "deontic_total": int(deontic_counts.sum()),
@@ -107,6 +113,32 @@ class CulturalAnalysis:
), ),
} }
if emotion_cols:
emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
result["hedge_emotion_avg"] = (
emo.loc[hedge_counts > 0].mean()
if (hedge_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["certainty_emotion_avg"] = (
emo.loc[certainty_counts > 0].mean()
if (certainty_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["deontic_emotion_avg"] = (
emo.loc[deontic_counts > 0].mean()
if (deontic_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["permission_emotion_avg"] = (
emo.loc[perm_counts > 0].mean()
if (perm_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
return result
def get_avg_emotions_per_entity( def get_avg_emotions_per_entity(
self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10 self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10
) -> dict[str, Any]: ) -> dict[str, Any]:

View File

@@ -1,17 +1,30 @@
import pandas as pd
import re import re
from collections import Counter from collections import Counter
from itertools import islice from dataclasses import dataclass
import pandas as pd
@dataclass(frozen=True)
class NGramConfig:
min_token_length: int = 3
min_count: int = 2
max_results: int = 100
class LinguisticAnalysis: class LinguisticAnalysis:
def __init__(self, word_exclusions: set[str]): def __init__(self, word_exclusions: set[str]):
self.word_exclusions = word_exclusions self.word_exclusions = word_exclusions
self.ngram_config = NGramConfig()
def _tokenize(self, text: str): def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
tokens = re.findall(r"\b[a-z]{3,}\b", text) pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
return [t for t in tokens if t not in self.word_exclusions] tokens = re.findall(pattern, text)
if include_exclusions:
return tokens
return [token for token in tokens if token not in self.word_exclusions]
def _clean_text(self, text: str) -> str: def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"http\S+", "", text) # remove URLs
@@ -21,13 +34,24 @@ class LinguisticAnalysis:
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text return text
def _content_texts(self, df: pd.DataFrame) -> pd.Series:
return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
if any(token in self.word_exclusions for token in tokens):
return False
if len(set(tokens)) == 1:
return False
return True
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]: def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = df["content"].dropna().astype(str).str.lower() texts = self._content_texts(df)
words = [] words = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) words.extend(self._tokenize(text))
words.extend(w for w in tokens if w not in self.word_exclusions)
counts = Counter(words) counts = Counter(words)
@@ -40,25 +64,39 @@ class LinguisticAnalysis:
return word_frequencies.to_dict(orient="records") return word_frequencies.to_dict(orient="records")
def ngrams(self, df: pd.DataFrame, n=2, limit=100): def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower() if n < 2:
raise ValueError("n must be at least 2")
texts = self._content_texts(df)
all_ngrams = [] all_ngrams = []
result_limit = limit or self.ngram_config.max_results
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = self._tokenize(text, include_exclusions=True)
# stop word removal causes strange behaviors in ngrams if len(tokens) < n:
# tokens = [w for w in tokens if w not in self.word_exclusions] continue
ngrams = zip(*(islice(tokens, i, None) for i in range(n))) for index in range(len(tokens) - n + 1):
all_ngrams.extend([" ".join(ng) for ng in ngrams]) ngram_tokens = tuple(tokens[index : index + n])
if self._valid_ngram(ngram_tokens):
all_ngrams.append(" ".join(ngram_tokens))
counts = Counter(all_ngrams) counts = Counter(all_ngrams)
filtered_counts = [
(ngram, count)
for ngram, count in counts.items()
if count >= self.ngram_config.min_count
]
if not filtered_counts:
return []
return ( return (
pd.DataFrame(counts.items(), columns=["ngram", "count"]) pd.DataFrame(filtered_counts, columns=["ngram", "count"])
.sort_values("count", ascending=False) .sort_values(["count", "ngram"], ascending=[False, True])
.head(limit) .head(result_limit)
.to_dict(orient="records") .to_dict(orient="records")
) )

View File

@@ -89,39 +89,17 @@ class StatGen:
df.to_json(orient="records", date_format="iso", date_unit="s") df.to_json(orient="records", date_format="iso", date_unit="s")
) )
def _dedupe_records(self, records: list[dict]) -> list[dict]:
unique_records = []
seen = set()
for record in records:
key_data = {
"post_id": record.get("post_id"),
"parent_id": record.get("parent_id"),
"reply_to": record.get("reply_to"),
"author": record.get("author"),
"type": record.get("type"),
"timestamp": record.get("timestamp"),
"dt": record.get("dt"),
"title": record.get("title"),
"content": record.get("content"),
"source": record.get("source"),
"topic": record.get("topic"),
}
key = json.dumps(key_data, sort_keys=True, separators=(",", ":"))
if key in seen:
continue
seen.add(key)
unique_records.append(record)
return unique_records
## Public Methods ## Public Methods
def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]: def filter_dataset(self, df: pd.DataFrame, filters: dict | None = None) -> list[dict]:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return self._dedupe_records(self._json_ready_records(filtered_df)) return self._json_ready_records(filtered_df)
def temporal(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def temporal(
self,
df: pd.DataFrame,
filters: dict | None = None,
dataset_id: int | None = None,
) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
@@ -129,7 +107,12 @@ class StatGen:
"weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df), "weekday_hour_heatmap": self.temporal_analysis.heatmap(filtered_df),
} }
def linguistic(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def linguistic(
self,
df: pd.DataFrame,
filters: dict | None = None,
dataset_id: int | None = None,
) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
@@ -139,7 +122,12 @@ class StatGen:
"lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df) "lexical_diversity": self.linguistic_analysis.lexical_diversity(filtered_df)
} }
def emotional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def emotional(
self,
df: pd.DataFrame,
filters: dict | None = None,
dataset_id: int | None = None,
) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
@@ -149,7 +137,12 @@ class StatGen:
"emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df) "emotion_by_source": self.emotional_analysis.emotion_by_source(filtered_df)
} }
def user(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def user(
self,
df: pd.DataFrame,
filters: dict | None = None,
dataset_id: int | None = None,
) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
@@ -157,7 +150,12 @@ class StatGen:
"users": self.user_analysis.per_user_analysis(filtered_df) "users": self.user_analysis.per_user_analysis(filtered_df)
} }
def interactional(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def interactional(
self,
df: pd.DataFrame,
filters: dict | None = None,
dataset_id: int | None = None,
) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
@@ -166,7 +164,12 @@ class StatGen:
"conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df) "conversation_concentration": self.interaction_analysis.conversation_concentration(filtered_df)
} }
def cultural(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def cultural(
self,
df: pd.DataFrame,
filters: dict | None = None,
dataset_id: int | None = None,
) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return { return {
@@ -175,7 +178,12 @@ class StatGen:
"avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df) "avg_emotion_per_entity": self.cultural_analysis.get_avg_emotions_per_entity(filtered_df)
} }
def summary(self, df: pd.DataFrame, filters: dict | None = None) -> dict: def summary(
self,
df: pd.DataFrame,
filters: dict | None = None,
dataset_id: int | None = None,
) -> dict:
filtered_df = self._prepare_filtered_df(df, filters) filtered_df = self._prepare_filtered_df(df, filters)
return self.summary_analysis.summary(filtered_df) return self.summary_analysis.summary(filtered_df)

View File

@@ -71,6 +71,7 @@ class UserAnalysis:
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0) per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in df.columns if col.startswith("emotion_")] emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
dominant_topic_by_author = {}
avg_emotions_by_author = {} avg_emotions_by_author = {}
if emotion_cols: if emotion_cols:
@@ -80,6 +81,31 @@ class UserAnalysis:
for author, row in avg_emotions.iterrows() for author, row in avg_emotions.iterrows()
} }
if "topic" in df.columns:
topic_df = df[
df["topic"].notna()
& (df["topic"] != "")
& (df["topic"] != "Misc")
]
if not topic_df.empty:
topic_counts = (
topic_df.groupby(["author", "topic"])
.size()
.reset_index(name="count")
.sort_values(
["author", "count", "topic"],
ascending=[True, False, True],
)
.drop_duplicates(subset=["author"])
)
dominant_topic_by_author = {
row["author"]: {
"topic": row["topic"],
"count": int(row["count"]),
}
for _, row in topic_counts.iterrows()
}
# ensure columns always exist # ensure columns always exist
for col in ("post", "comment"): for col in ("post", "comment"):
if col not in per_user.columns: if col not in per_user.columns:
@@ -109,6 +135,7 @@ class UserAnalysis:
"comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)), "comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}), "avg_emotions": avg_emotions_by_author.get(author, {}),
"dominant_topic": dominant_topic_by_author.get(author),
"vocab": vocab_by_author.get( "vocab": vocab_by_author.get(
author, author,
{ {

View File

@@ -152,9 +152,9 @@ def get_dataset_sources():
return jsonify(list_metadata) return jsonify(list_metadata)
@app.route("/datasets/scrape", methods=["POST"]) @app.route("/datasets/fetch", methods=["POST"])
@jwt_required() @jwt_required()
def scrape_data(): def fetch_data():
data = request.get_json() data = request.get_json()
connector_metadata = get_connector_metadata() connector_metadata = get_connector_metadata()
@@ -424,7 +424,7 @@ def get_linguistic_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.linguistic(dataset_content, filters)), 200 return jsonify(stat_gen.linguistic(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -448,7 +448,7 @@ def get_emotional_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.emotional(dataset_content, filters)), 200 return jsonify(stat_gen.emotional(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -472,7 +472,7 @@ def get_summary(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.summary(dataset_content, filters)), 200 return jsonify(stat_gen.summary(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -496,7 +496,7 @@ def get_temporal_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.temporal(dataset_content, filters)), 200 return jsonify(stat_gen.temporal(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -520,7 +520,7 @@ def get_user_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.user(dataset_content, filters)), 200 return jsonify(stat_gen.user(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -544,7 +544,7 @@ def get_cultural_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.cultural(dataset_content, filters)), 200 return jsonify(stat_gen.cultural(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -568,7 +568,7 @@ def get_interaction_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.interactional(dataset_content, filters)), 200 return jsonify(stat_gen.interactional(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:

View File

@@ -1,21 +1,18 @@
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dto.post import Post from dto.post import Post
import os
class BaseConnector(ABC): class BaseConnector(ABC):
# Each subclass declares these at the class level source_name: str # machine readable
source_name: str # machine-readable: "reddit", "youtube" display_name: str # human readablee
display_name: str # human-readable: "Reddit", "YouTube" required_env: list[str] = []
required_env: list[str] = [] # env vars needed to activate
search_enabled: bool search_enabled: bool
categories_enabled: bool categories_enabled: bool
@classmethod @classmethod
def is_available(cls) -> bool: def is_available(cls) -> bool:
"""Returns True if all required env vars are set."""
import os
return all(os.getenv(var) for var in cls.required_env) return all(os.getenv(var) for var in cls.required_env)
@abstractmethod @abstractmethod

View File

@@ -11,8 +11,7 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumScraper/1.0)"} HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Digital-Ethnography-Aid/1.0)"}
class BoardsAPI(BaseConnector): class BoardsAPI(BaseConnector):
source_name: str = "boards.ie" source_name: str = "boards.ie"
@@ -88,7 +87,7 @@ class BoardsAPI(BaseConnector):
post = self._parse_thread(html, post_url) post = self._parse_thread(html, post_url)
return post return post
with ThreadPoolExecutor(max_workers=30) as executor: with ThreadPoolExecutor(max_workers=5) as executor:
futures = {executor.submit(fetch_and_parse, url): url for url in urls} futures = {executor.submit(fetch_and_parse, url): url for url in urls}
for i, future in enumerate(as_completed(futures)): for i, future in enumerate(as_completed(futures)):

View File

@@ -1,6 +1,10 @@
import requests import requests
import logging import logging
import time import time
import os
from dotenv import load_dotenv
from requests.auth import HTTPBasicAuth
from dto.post import Post from dto.post import Post
from dto.user import User from dto.user import User
@@ -9,6 +13,8 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
class RedditAPI(BaseConnector): class RedditAPI(BaseConnector):
source_name: str = "reddit" source_name: str = "reddit"
@@ -18,6 +24,8 @@ class RedditAPI(BaseConnector):
def __init__(self): def __init__(self):
self.url = "https://www.reddit.com/" self.url = "https://www.reddit.com/"
self.token = None
self.token_expiry = 0
# Public Methods # # Public Methods #
def get_new_posts_by_search( def get_new_posts_by_search(
@@ -172,8 +180,43 @@ class RedditAPI(BaseConnector):
user.karma = user_data["total_karma"] user.karma = user_data["total_karma"]
return user return user
def _get_token(self):
if self.token and time.time() < self.token_expiry:
return self.token
logger.info("Fetching new Reddit access token...")
auth = HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET)
data = {
"grant_type": "client_credentials"
}
headers = {
"User-Agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
}
response = requests.post(
"https://www.reddit.com/api/v1/access_token",
auth=auth,
data=data,
headers=headers,
)
response.raise_for_status()
token_json = response.json()
self.token = token_json["access_token"]
self.token_expiry = time.time() + token_json["expires_in"] - 60
logger.info(
f"Obtained new Reddit access token (expires in {token_json['expires_in']}s)"
)
return self.token
def _fetch_post_overviews(self, endpoint: str, params: dict) -> dict: def _fetch_post_overviews(self, endpoint: str, params: dict) -> dict:
url = f"{self.url}{endpoint}" url = f"https://oauth.reddit.com/{endpoint.lstrip('/')}"
max_retries = 15 max_retries = 15
backoff = 1 # seconds backoff = 1 # seconds
@@ -182,13 +225,18 @@ class RedditAPI(BaseConnector):
response = requests.get( response = requests.get(
url, url,
headers={ headers={
"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)" "User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)",
"Authorization": f"Bearer {self._get_token()}",
}, },
params=params, params=params,
) )
if response.status_code == 429: if response.status_code == 429:
wait_time = response.headers.get("Retry-After", backoff) try:
wait_time = int(response.headers.get("X-Ratelimit-Reset", backoff))
wait_time += 1 # Add a small buffer to ensure the rate limit has reset
except ValueError:
wait_time = backoff
logger.warning( logger.warning(
f"Rate limited by Reddit API. Retrying in {wait_time} seconds..." f"Rate limited by Reddit API. Retrying in {wait_time} seconds..."

View File

@@ -1,5 +1,6 @@
import os import os
import datetime import datetime
import logging
from dotenv import load_dotenv from dotenv import load_dotenv
from googleapiclient.discovery import build from googleapiclient.discovery import build
@@ -9,9 +10,11 @@ from dto.comment import Comment
from server.connectors.base import BaseConnector from server.connectors.base import BaseConnector
load_dotenv() load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY") API_KEY = os.getenv("YOUTUBE_API_KEY")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class YouTubeAPI(BaseConnector): class YouTubeAPI(BaseConnector):
source_name: str = "youtube" source_name: str = "youtube"
@@ -77,11 +80,30 @@ class YouTubeAPI(BaseConnector):
return True return True
def _search_videos(self, query, limit): def _search_videos(self, query, limit):
results = []
next_page_token = None
while len(results) < limit:
batch_size = min(50, limit - len(results))
request = self.youtube.search().list( request = self.youtube.search().list(
q=query, part="snippet", type="video", maxResults=limit q=query,
part="snippet",
type="video",
maxResults=batch_size,
pageToken=next_page_token
) )
response = request.execute() response = request.execute()
return response.get("items", []) results.extend(response.get("items", []))
logging.info(f"Fetched {len(results)} out of {limit} videos for query '{query}'")
next_page_token = response.get("nextPageToken")
if not next_page_token:
logging.warning(f"No more pages of results available for query '{query}'")
break
return results[:limit]
def _get_video_comments(self, video_id): def _get_video_comments(self, video_id):
request = self.youtube.commentThreads().list( request = self.youtube.commentThreads().list(

View File

@@ -26,7 +26,34 @@ class DatasetManager:
def get_dataset_content(self, dataset_id: int) -> pd.DataFrame: def get_dataset_content(self, dataset_id: int) -> pd.DataFrame:
query = "SELECT * FROM events WHERE dataset_id = %s" query = "SELECT * FROM events WHERE dataset_id = %s"
result = self.db.execute(query, (dataset_id,), fetch=True) result = self.db.execute(query, (dataset_id,), fetch=True)
return pd.DataFrame(result) df = pd.DataFrame(result)
if df.empty:
return df
dedupe_columns = [
column
for column in [
"post_id",
"parent_id",
"reply_to",
"author",
"type",
"timestamp",
"dt",
"title",
"content",
"source",
"topic",
]
if column in df.columns
]
if dedupe_columns:
df = df.drop_duplicates(subset=dedupe_columns, keep="first")
else:
df = df.drop_duplicates(keep="first")
return df.reset_index(drop=True)
def get_dataset_info(self, dataset_id: int) -> dict: def get_dataset_info(self, dataset_id: int) -> dict:
query = "SELECT * FROM datasets WHERE id = %s" query = "SELECT * FROM datasets WHERE id = %s"
@@ -52,6 +79,16 @@ class DatasetManager:
if event_data.empty: if event_data.empty:
return return
dedupe_columns = [
column for column in ["id", "type", "source"] if column in event_data.columns
]
if dedupe_columns:
event_data = event_data.drop_duplicates(subset=dedupe_columns, keep="first")
else:
event_data = event_data.drop_duplicates(keep="first")
self.delete_dataset_content(dataset_id)
query = """ query = """
INSERT INTO events ( INSERT INTO events (
dataset_id, dataset_id,

View File

@@ -1,3 +1,5 @@
from time import time
import pandas as pd import pandas as pd
import logging import logging
@@ -46,6 +48,7 @@ def fetch_and_process_dataset(
try: try:
for metadata in source_info: for metadata in source_info:
fetch_start = time()
name = metadata["name"] name = metadata["name"]
search = metadata.get("search") search = metadata.get("search")
category = metadata.get("category") category = metadata.get("category")
@@ -57,8 +60,11 @@ def fetch_and_process_dataset(
) )
posts.extend(post.to_dict() for post in raw_posts) posts.extend(post.to_dict() for post in raw_posts)
fetch_time = time() - fetch_start
df = pd.DataFrame(posts) df = pd.DataFrame(posts)
nlp_start = time()
dataset_manager.set_dataset_status( dataset_manager.set_dataset_status(
dataset_id, "processing", "NLP Processing Started" dataset_id, "processing", "NLP Processing Started"
) )
@@ -66,9 +72,11 @@ def fetch_and_process_dataset(
processor = DatasetEnrichment(df, topics) processor = DatasetEnrichment(df, topics)
enriched_df = processor.enrich() enriched_df = processor.enrich()
nlp_time = time() - nlp_start
dataset_manager.save_dataset_content(dataset_id, enriched_df) dataset_manager.save_dataset_content(dataset_id, enriched_df)
dataset_manager.set_dataset_status( dataset_manager.set_dataset_status(
dataset_id, "complete", "NLP Processing Completed Successfully" dataset_id, "complete", f"Completed Successfully. Fetch time: {fetch_time:.2f}s, NLP time: {nlp_time:.2f}s"
) )
except Exception as e: except Exception as e:
dataset_manager.set_dataset_status( dataset_manager.set_dataset_status(