Merge remote-tracking branch 'origin/main' into feat/corpus-explorer
This commit is contained in:
@@ -67,6 +67,12 @@ class CulturalAnalysis:
|
||||
|
||||
def get_stance_markers(self, df: pd.DataFrame) -> dict[str, Any]:
|
||||
s = df[self.content_col].fillna("").astype(str)
|
||||
emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
|
||||
emotion_cols = [
|
||||
c
|
||||
for c in df.columns
|
||||
if c.startswith("emotion_") and c not in emotion_exclusions
|
||||
]
|
||||
|
||||
hedge_pattern = re.compile(
|
||||
r"\b(maybe|perhaps|possibly|probably|likely|seems|seem|i think|i feel|i guess|kind of|sort of|somewhat)\b"
|
||||
@@ -88,7 +94,7 @@ class CulturalAnalysis:
|
||||
0, 1
|
||||
)
|
||||
|
||||
return {
|
||||
result = {
|
||||
"hedge_total": int(hedge_counts.sum()),
|
||||
"certainty_total": int(certainty_counts.sum()),
|
||||
"deontic_total": int(deontic_counts.sum()),
|
||||
@@ -107,6 +113,32 @@ class CulturalAnalysis:
|
||||
),
|
||||
}
|
||||
|
||||
if emotion_cols:
|
||||
emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)
|
||||
|
||||
result["hedge_emotion_avg"] = (
|
||||
emo.loc[hedge_counts > 0].mean()
|
||||
if (hedge_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
result["certainty_emotion_avg"] = (
|
||||
emo.loc[certainty_counts > 0].mean()
|
||||
if (certainty_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
result["deontic_emotion_avg"] = (
|
||||
emo.loc[deontic_counts > 0].mean()
|
||||
if (deontic_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
result["permission_emotion_avg"] = (
|
||||
emo.loc[perm_counts > 0].mean()
|
||||
if (perm_counts > 0).any()
|
||||
else pd.Series(0.0, index=emotion_cols)
|
||||
).to_dict()
|
||||
|
||||
return result
|
||||
|
||||
def get_avg_emotions_per_entity(
|
||||
self, df: pd.DataFrame, top_n: int = 25, min_posts: int = 10
|
||||
) -> dict[str, Any]:
|
||||
|
||||
@@ -71,6 +71,7 @@ class UserAnalysis:
|
||||
per_user = df.groupby(["author", "type"]).size().unstack(fill_value=0)
|
||||
|
||||
emotion_cols = [col for col in df.columns if col.startswith("emotion_")]
|
||||
dominant_topic_by_author = {}
|
||||
|
||||
avg_emotions_by_author = {}
|
||||
if emotion_cols:
|
||||
@@ -80,6 +81,31 @@ class UserAnalysis:
|
||||
for author, row in avg_emotions.iterrows()
|
||||
}
|
||||
|
||||
if "topic" in df.columns:
|
||||
topic_df = df[
|
||||
df["topic"].notna()
|
||||
& (df["topic"] != "")
|
||||
& (df["topic"] != "Misc")
|
||||
]
|
||||
if not topic_df.empty:
|
||||
topic_counts = (
|
||||
topic_df.groupby(["author", "topic"])
|
||||
.size()
|
||||
.reset_index(name="count")
|
||||
.sort_values(
|
||||
["author", "count", "topic"],
|
||||
ascending=[True, False, True],
|
||||
)
|
||||
.drop_duplicates(subset=["author"])
|
||||
)
|
||||
dominant_topic_by_author = {
|
||||
row["author"]: {
|
||||
"topic": row["topic"],
|
||||
"count": int(row["count"]),
|
||||
}
|
||||
for _, row in topic_counts.iterrows()
|
||||
}
|
||||
|
||||
# ensure columns always exist
|
||||
for col in ("post", "comment"):
|
||||
if col not in per_user.columns:
|
||||
@@ -109,6 +135,7 @@ class UserAnalysis:
|
||||
"comment_post_ratio": float(row.get("comment_post_ratio", 0)),
|
||||
"comment_share": float(row.get("comment_share", 0)),
|
||||
"avg_emotions": avg_emotions_by_author.get(author, {}),
|
||||
"dominant_topic": dominant_topic_by_author.get(author),
|
||||
"vocab": vocab_by_author.get(
|
||||
author,
|
||||
{
|
||||
|
||||
@@ -11,8 +11,7 @@ from server.connectors.base import BaseConnector
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; ForumFetcher/1.0)"}
|
||||
|
||||
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; Digital-Ethnography-Aid/1.0)"}
|
||||
|
||||
class BoardsAPI(BaseConnector):
|
||||
source_name: str = "boards.ie"
|
||||
@@ -88,7 +87,7 @@ class BoardsAPI(BaseConnector):
|
||||
post = self._parse_thread(html, post_url)
|
||||
return post
|
||||
|
||||
with ThreadPoolExecutor(max_workers=30) as executor:
|
||||
with ThreadPoolExecutor(max_workers=10) as executor:
|
||||
futures = {executor.submit(fetch_and_parse, url): url for url in urls}
|
||||
|
||||
for i, future in enumerate(as_completed(futures)):
|
||||
|
||||
@@ -1,6 +1,10 @@
|
||||
import requests
|
||||
import logging
|
||||
import time
|
||||
import os
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
from dto.post import Post
|
||||
from dto.user import User
|
||||
@@ -9,6 +13,8 @@ from server.connectors.base import BaseConnector
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
|
||||
CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
|
||||
|
||||
class RedditAPI(BaseConnector):
|
||||
source_name: str = "reddit"
|
||||
@@ -18,6 +24,8 @@ class RedditAPI(BaseConnector):
|
||||
|
||||
def __init__(self):
    """Initialise the connector with Reddit's base URL and an empty token cache."""
    # No token has been fetched yet; an expiry of 0 is always in the past,
    # so the first `_get_token` call will request a fresh one.
    self.token_expiry = 0
    self.token = None
    self.url = "https://www.reddit.com/"
|
||||
|
||||
# Public Methods #
|
||||
def get_new_posts_by_search(
|
||||
@@ -171,9 +179,44 @@ class RedditAPI(BaseConnector):
|
||||
user = User(username=user_data["name"], created_utc=user_data["created_utc"])
|
||||
user.karma = user_data["total_karma"]
|
||||
return user
|
||||
|
||||
def _get_token(self):
    """Return a Reddit OAuth bearer token, fetching a new one when needed.

    The cached token is reused while the current time is still before
    ``self.token_expiry``. Otherwise a new token is requested from Reddit's
    access-token endpoint using the ``client_credentials`` grant, with HTTP
    Basic auth built from the module-level ``CLIENT_ID`` / ``CLIENT_SECRET``.

    Returns:
        The ``access_token`` value from the endpoint's JSON response.

    Raises:
        requests.HTTPError: the endpoint answered with an error status.
        requests.Timeout: the endpoint did not respond within the timeout.
        KeyError: the response lacks ``access_token`` or ``expires_in``.
    """
    if self.token and time.time() < self.token_expiry:
        return self.token

    logger.info("Fetching new Reddit access token...")

    response = requests.post(
        "https://www.reddit.com/api/v1/access_token",
        auth=HTTPBasicAuth(CLIENT_ID, CLIENT_SECRET),
        data={"grant_type": "client_credentials"},
        headers={
            "User-Agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
        },
        # requests waits forever by default; bound the wait so a stalled
        # token endpoint cannot hang every caller that needs a token.
        timeout=30,
    )
    response.raise_for_status()
    token_json = response.json()

    # Read both fields before touching instance state so a malformed
    # response never leaves a new token paired with a stale expiry.
    access_token = token_json["access_token"]
    expires_in = token_json["expires_in"]

    self.token = access_token
    # Treat the token as expired 60 seconds early so it is refreshed
    # before Reddit actually rejects it.
    self.token_expiry = time.time() + expires_in - 60

    logger.info(
        f"Obtained new Reddit access token (expires in {expires_in}s)"
    )

    return self.token
|
||||
|
||||
def _fetch_post_overviews(self, endpoint: str, params: dict) -> dict:
|
||||
url = f"{self.url}{endpoint}"
|
||||
url = f"https://oauth.reddit.com/{endpoint.lstrip('/')}"
|
||||
max_retries = 15
|
||||
backoff = 1 # seconds
|
||||
|
||||
@@ -182,13 +225,14 @@ class RedditAPI(BaseConnector):
|
||||
response = requests.get(
|
||||
url,
|
||||
headers={
|
||||
"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)"
|
||||
"User-agent": "python:ethnography-college-project:0.1 (by /u/ThisBirchWood)",
|
||||
"Authorization": f"Bearer {self._get_token()}",
|
||||
},
|
||||
params=params,
|
||||
)
|
||||
|
||||
if response.status_code == 429:
|
||||
wait_time = response.headers.get("Retry-After", backoff)
|
||||
wait_time = response.headers.get("X-Ratelimit-Reset", backoff)
|
||||
|
||||
logger.warning(
|
||||
f"Rate limited by Reddit API. Retrying in {wait_time} seconds..."
|
||||
|
||||
Reference in New Issue
Block a user