Merge remote-tracking branch 'origin/main' into feat/corpus-explorer

This commit is contained in:
2026-04-10 13:19:17 +01:00
14 changed files with 881 additions and 38 deletions

View File

@@ -67,6 +67,12 @@ class CulturalAnalysis:
def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
s = df[self.content_col].fillna("").astype(str)
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [
c
for c in df.columns
if c.startswith("emotion_") and c not in emotion_exclusions
]
hedge_pattern = re.compile(
r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b"
@@ -88,7 +94,7 @@ class CulturalAnalysis:
0, 1
)
return {
result = {
"hedge_total": int(hedge_counts.sum()),
"certainty_total": int(certainty_counts.sum()),
"deontic_total": int(deontic_counts.sum()),
@@ -107,6 +113,32 @@ class CulturalAnalysis:
),
}
if emotion_cols:
emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
result["hedge_emotion_avg"] = (
emo.loc[hedge_counts > 0].mean()
if (hedge_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["certainty_emotion_avg"] = (
emo.loc[certainty_counts > 0].mean()
if (certainty_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["deontic_emotion_avg"] = (
emo.loc[deontic_counts > 0].mean()
if (deontic_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
result["permission_emotion_avg"] = (
emo.loc[perm_counts > 0].mean()
if (perm_counts > 0).any()
else pd.Series(0.0, index=emotion_cols)
).to_dict()
return result
def get_avg_emotions_per_entity(
self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10
) -> dict[str, Any]:

View File

@@ -71,6 +71,7 @@ class UserAnalysis:
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
dominant_topic_by_author = {}
avg_emotions_by_author = {}
if emotion_cols:
@@ -80,6 +81,31 @@ class UserAnalysis:
for author, row in avg_emotions.iterrows()
}
if "topic" in df.columns:
topic_df = df[
df["topic"].notna()
& (df["topic"] != "")
& (df["topic"] != "Misc")
]
if not topic_df.empty:
topic_counts = (
topic_df.groupby(["author", "topic"])
.size()
.reset_index(name="count")
.sort_values(
["author", "count", "topic"],
ascending=[True, False, True],
)
.drop_duplicates(subset=["author"])
)
dominant_topic_by_author = {
row["author"]: {
"topic": row["topic"],
"count": int(row["count"]),
}
for _, row in topic_counts.iterrows()
}
# ensure columns always exist
for col in ("post", "comment"):
if col not in per_user.columns:
@@ -109,6 +135,7 @@ class UserAnalysis:
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"comment_share": float(row.get("comment_share", 0)),
"avg_emotions": avg_emotions_by_author.get(author, {}),
"dominant_topic": dominant_topic_by_author.get(author),
"vocab": vocab_by_author.get(
author,
{

View File

@@ -11,8 +11,7 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumFetcher/1.0)"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Digital-Ethnography-Aid/1.0)"}
class BoardsAPI(BaseConnector):
source_name: str = "boards.ie"
@@ -88,7 +87,7 @@ class BoardsAPI(BaseConnector):
post = self._parse_thread(html, post_url)
return post
with ThreadPoolExecutor(max_workers=30) as executor:
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {executor.submit(fetch_and_parse, url): url for url in urls}
for i, future in enumerate(as_completed(futures)):

View File

@@ -1,6 +1,10 @@
import requests
import logging
import time
import os
from dotenv import load_dotenv
from requests.auth import HTTPBasicAuth
from dto.post import Post
from dto.user import User
@@ -9,6 +13,8 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
class RedditAPI(BaseConnector):
source_name: str = "reddit"
@@ -18,6 +24,8 @@ class RedditAPI(BaseConnector):
def __init__(self):
self.url = "https://www.reddit.com/"
self.token = None
self.token_expiry = 0
# Public Methods #
def get_new_posts_by_search(
@@ -171,9 +179,44 @@ class RedditAPI(BaseConnector):
user = User(username=user_data["name"], created_utc=user_data["created_utc"])
user.karma = user_data["total_karma"]
return user
def _get_token(self):
    """Return a valid OAuth2 bearer token for the Reddit API.

    Reuses the token cached on the instance while it is still valid;
    otherwise requests a fresh one via the client-credentials grant and
    caches it together with its expiry time.

    Returns:
        str: an access token usable in an ``Authorization: Bearer`` header.

    Raises:
        RuntimeError: if the Reddit API credentials are not configured.
        requests.HTTPError: if the token endpoint returns an error status.
    """
    # Fast path: cached token that has not yet expired.
    if self.token and time.time() < self.token_expiry:
        return self.token

    # Fail early with a clear message; without this, missing env vars
    # surface only as an opaque 401 from the token endpoint.
    if not CLIENT_ID or not CLIENT_SECRET:
        raise RuntimeError(
            "REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET must be set"
        )

    logger.info("Fetching new Reddit access token...")
    response = requests.post(
        "https://www.reddit.com/api/v1/access_token",
        auth=HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET),
        data={"grant_type": "client_credentials"},
        headers={
            "User-Agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
        },
    )
    response.raise_for_status()
    token_json = response.json()
    self.token = token_json["access_token"]
    # Renew 60 s before the server-side expiry so a request started just
    # before the deadline never carries a stale token.
    self.token_expiry = time.time() + token_json["expires_in"] - 60
    # Lazy %-formatting: the message is only built if INFO is enabled.
    logger.info(
        "Obtained new Reddit access token (expires in %ss)",
        token_json["expires_in"],
    )
    return self.token
def _fetch_post_overviews(self, endpoint: str, params: dict) -> dict:
url = f"{self.url}{endpoint}"
url = f"https://oauth.reddit.com/{endpoint.lstrip('/')}"
max_retries = 15
backoff = 1 # seconds
@@ -182,13 +225,14 @@ class RedditAPI(BaseConnector):
response = requests.get(
url,
headers={
"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)",
"Authorization": f"Bearer {self._get_token()}",
},
params=params,
)
if response.status_code == 429:
wait_time = response.headers.get("Retry-After", backoff)
wait_time = response.headers.get("X-Ratelimit-Reset", backoff)
logger.warning(
f"Rate limited by Reddit API. Retrying in {wait_time} seconds..."