Storage of user data and datasets in PostgreSQL #2

Merged
dylan merged 19 commits from feat/database-integration into main 2026-03-01 16:47:25 +00:00
7 changed files with 265 additions and 105 deletions
Showing only changes of commit d3c4d883be - Show all commits

154
server/analysis/cultural.py Normal file
View File

@@ -0,0 +1,154 @@
import pandas as pd
import re
from collections import Counter
from typing import Any
class CulturalAnalysis:
    """Compute identity, stance and entity/emotion statistics over posts.

    The analysis reads text from ``content_col`` of the wrapped DataFrame and,
    where present, numeric columns whose names start with ``emotion_`` and an
    ``entities`` column holding lists of ``{"text": ...}`` dicts.
    The DataFrame is never mutated by this class.
    """

    # Lower-case alphabetic tokens of at least two characters.
    _TOKEN_RE = re.compile(r"\b[a-z]{2,}\b")
    _EMOTION_PREFIX = "emotion_"
    # Emotion columns ignored when comparing in-group vs out-group posts.
    _IDENTITY_EMOTION_EXCLUSIONS = frozenset({"emotion_neutral", "emotion_surprise"})

    def __init__(self, df: pd.DataFrame, content_col: str = "content", topic_col: str = "topic"):
        """Store the DataFrame and the names of its content/topic columns."""
        self.df = df
        self.content_col = content_col
        self.topic_col = topic_col

    def _emotion_columns(self, excluded: frozenset = frozenset()) -> list[str]:
        """Return the ``emotion_*`` column names, minus any in ``excluded``."""
        return [
            col for col in self.df.columns
            if isinstance(col, str)
            and col.startswith(self._EMOTION_PREFIX)
            and col not in excluded
        ]

    def get_identity_markers(self) -> dict[str, Any]:
        """Count in-group ("we") and out-group ("they") pronoun usage.

        Returns:
            A dict with total usage counts, usage ratios relative to all
            tokens, and the number of posts dominated by each group.
            ``tie_posts`` counts every post where neither group dominates,
            which includes posts containing no markers at all.  When emotion
            columns exist (neutral/surprise excluded), the mean of each
            emotion over in-group-dominant and out-group-dominant posts is
            added as ``in_group_emotion_avg`` / ``out_group_emotion_avg``.
        """
        in_group_words = {"we", "us", "our", "ourselves"}
        out_group_words = {"they", "them", "their", "themselves"}

        text = self.df[self.content_col].fillna("").astype(str).str.lower()
        tokens_per_row = text.map(self._TOKEN_RE.findall)
        total_tokens = int(tokens_per_row.map(len).sum())

        in_hits = tokens_per_row.map(
            lambda toks: sum(t in in_group_words for t in toks)
        ).astype(int)
        out_hits = tokens_per_row.map(
            lambda toks: sum(t in out_group_words for t in toks)
        ).astype(int)

        in_count = int(in_hits.sum())
        out_count = int(out_hits.sum())
        in_mask = in_hits > out_hits
        out_mask = out_hits > in_hits
        tie_mask = ~(in_mask | out_mask)

        denominator = max(total_tokens, 1)  # avoid division by zero on empty input
        result: dict[str, Any] = {
            "in_group_usage": in_count,
            "out_group_usage": out_count,
            "in_group_ratio": round(in_count / denominator, 5),
            "out_group_ratio": round(out_count / denominator, 5),
            "in_group_posts": int(in_mask.sum()),
            "out_group_posts": int(out_mask.sum()),
            "tie_posts": int(tie_mask.sum()),
        }

        emotion_cols = self._emotion_columns(self._IDENTITY_EMOTION_EXCLUSIONS)
        if emotion_cols:
            emo = self.df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)

            def mean_for(mask: pd.Series) -> dict[str, float]:
                # Positional boolean selection, so a non-unique index is harmless.
                if not mask.any():
                    return {col: 0.0 for col in emotion_cols}
                means = emo[mask.to_numpy()].mean()
                return {col: float(val) for col, val in means.items()}

            result["in_group_emotion_avg"] = mean_for(in_mask)
            result["out_group_emotion_avg"] = mean_for(out_mask)
        return result

    def get_stance_markers(self) -> dict[str, Any]:
        """Count hedging, certainty, deontic and permission markers.

        Text is lower-cased before matching so that capitalised occurrences
        ("I think", "Maybe", "Should") are counted; both the phrase lists and
        the tokenizer only contain lower-case letters.

        Returns:
            A dict with ``<category>_total`` integer counts and
            ``<category>_per_1k_tokens`` rates.  Rates are relative to the
            total number of tokens across all posts (at least 1, so an empty
            dataset yields 0.0 rather than a division error).
        """
        categories: dict[str, set[str]] = {
            "hedge": {
                "maybe", "perhaps", "possibly", "probably", "likely", "seems", "seem",
                "i think", "i feel", "i guess", "kind of", "sort of", "somewhat",
            },
            "certainty": {
                "definitely", "certainly", "clearly", "obviously", "undeniably",
                "always", "never",
            },
            "deontic": {
                "must", "should", "need", "needs", "have to", "has to", "ought",
                "required", "require",
            },
            "permission": {"can", "allowed", "okay", "ok", "permitted"},
        }

        text = self.df[self.content_col].fillna("").astype(str).str.lower()

        # Single- and multi-word phrases are matched the same way: the whole
        # phrase delimited by word boundaries.
        totals: dict[str, int] = {}
        for name, phrases in categories.items():
            patterns = [re.compile(r"\b" + re.escape(p) + r"\b") for p in phrases]
            totals[name] = int(
                text.map(lambda t, pats=patterns: sum(len(p.findall(t)) for p in pats)).sum()
            )

        token_total = int(text.map(lambda t: len(self._TOKEN_RE.findall(t))).sum())
        denominator = max(token_total, 1)

        result: dict[str, Any] = {f"{name}_total": count for name, count in totals.items()}
        for name, count in totals.items():
            result[f"{name}_per_1k_tokens"] = round(1000 * count / denominator, 3)
        return result

    def get_avg_emotions_per_entity(self, top_n: int = 25, min_posts: int = 10) -> dict[str, Any]:
        """Average emotion scores of posts mentioning the most frequent entities.

        Entities are read from the ``entities`` column (lists of dicts with a
        ``"text"`` key).  Entity text is whitespace-stripped, and texts shorter
        than three characters are ignored.  The same normalised text is used
        both for ranking and for finding the posts that mention the entity.

        Args:
            top_n: Number of most-mentioned entities to consider.
            min_posts: Minimum number of distinct posts mentioning an entity
                for it to be included in the result.

        Returns:
            ``{"entity_emotion_avg": {entity: {"post_count": int,
            "emotion_avg": {emotion_col: float}}}}``; the inner mapping is
            empty when the DataFrame has no ``entities`` column.
        """
        if "entities" not in self.df.columns:
            return {"entity_emotion_avg": {}}

        df = self.df
        emotion_cols = self._emotion_columns()

        # One pass over the column: mention counts for ranking, plus the set
        # of row positions (not labels) in which each entity occurs.
        mention_counts: Counter = Counter()
        rows_by_entity: dict[str, set[int]] = {}
        for pos, ents in enumerate(df["entities"]):
            if not isinstance(ents, list):
                continue
            for ent in ents:
                if not isinstance(ent, dict):
                    continue
                text = ent.get("text")
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if len(text) < 3:  # filter short junk
                    continue
                mention_counts[text] += 1
                rows_by_entity.setdefault(text, set()).add(pos)

        emo = None
        if emotion_cols:
            emo = df[emotion_cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)

        entity_emotion_avg: dict[str, Any] = {}
        for entity_text, _ in mention_counts.most_common(top_n):
            positions = sorted(rows_by_entity[entity_text])
            post_count = len(positions)
            if post_count < min_posts:
                continue
            if emo is None:
                emotion_avg: dict[str, float] = {}
            else:
                means = emo.iloc[positions].mean()
                emotion_avg = {col: float(val) for col, val in means.items()}
            entity_emotion_avg[entity_text] = {
                "post_count": post_count,
                "emotion_avg": emotion_avg,
            }

        return {"entity_emotion_avg": entity_emotion_avg}

View File

@@ -5,14 +5,9 @@ class EmotionalAnalysis:
self.df = df self.df = df
def avg_emotion_by_topic(self) -> dict: def avg_emotion_by_topic(self) -> dict:
emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [ emotion_cols = [
col for col in self.df.columns col for col in self.df.columns
if col.startswith("emotion_") and col not in emotion_exclusions if col.startswith("emotion_")
] ]
counts = ( counts = (

View File

@@ -3,6 +3,7 @@ import re
from collections import Counter from collections import Counter
class InteractionAnalysis: class InteractionAnalysis:
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]): def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
self.df = df self.df = df
@@ -12,7 +13,9 @@ class InteractionAnalysis:
tokens = re.findall(r"\b[a-z]{3,}\b", text) tokens = re.findall(r"\b[a-z]{3,}\b", text)
return [t for t in tokens if t not in self.word_exclusions] return [t for t in tokens if t not in self.word_exclusions]
def _vocab_richness_per_user(self, min_words: int = 20, top_most_used_words: int = 100) -> list: def _vocab_richness_per_user(
self, min_words: int = 20, top_most_used_words: int = 100
) -> list:
df = self.df.copy() df = self.df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower() df["content"] = df["content"].fillna("").astype(str).str.lower()
df["tokens"] = df["content"].apply(self._tokenize) df["tokens"] = df["content"].apply(self._tokenize)
@@ -39,15 +42,17 @@ class InteractionAnalysis:
for w, c in counts.most_common(top_most_used_words) for w, c in counts.most_common(top_most_used_words)
] ]
rows.append({ rows.append(
"author": author, {
"events": int(events), "author": author,
"total_words": int(total_words), "events": int(events),
"unique_words": int(unique_words), "total_words": int(total_words),
"vocab_richness": round(vocab_richness, 3), "unique_words": int(unique_words),
"avg_words_per_event": round(avg_words, 2), "vocab_richness": round(vocab_richness, 3),
"top_words": top_words "avg_words_per_event": round(avg_words, 2),
}) "top_words": top_words,
}
)
rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True) rows = sorted(rows, key=lambda x: x["vocab_richness"], reverse=True)
@@ -55,9 +60,7 @@ class InteractionAnalysis:
def top_users(self) -> list: def top_users(self) -> list:
counts = ( counts = (
self.df.groupby(["author", "source"]) self.df.groupby(["author", "source"]).size().sort_values(ascending=False)
.size()
.sort_values(ascending=False)
) )
top_users = [ top_users = [
@@ -66,21 +69,31 @@ class InteractionAnalysis:
] ]
return top_users return top_users
def per_user_analysis(self) -> dict: def per_user_analysis(self) -> dict:
per_user = ( per_user = self.df.groupby(["author", "type"]).size().unstack(fill_value=0)
self.df.groupby(["author", "type"])
.size() emotion_cols = [col for col in self.df.columns if col.startswith("emotion_")]
.unstack(fill_value=0)
) avg_emotions_by_author = {}
if emotion_cols:
avg_emotions = self.df.groupby("author")[emotion_cols].mean().fillna(0.0)
avg_emotions_by_author = {
author: {emotion: float(score) for emotion, score in row.items()}
for author, row in avg_emotions.iterrows()
}
# ensure columns always exist # ensure columns always exist
for col in ("post", "comment"): for col in ("post", "comment"):
if col not in per_user.columns: if col not in per_user.columns:
per_user[col] = 0 per_user[col] = 0
per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(0, 1) per_user["comment_post_ratio"] = per_user["comment"] / per_user["post"].replace(
per_user["comment_share"] = per_user["comment"] / (per_user["post"] + per_user["comment"]).replace(0, 1) 0, 1
)
per_user["comment_share"] = per_user["comment"] / (
per_user["post"] + per_user["comment"]
).replace(0, 1)
per_user = per_user.sort_values("comment_post_ratio", ascending=True) per_user = per_user.sort_values("comment_post_ratio", ascending=True)
per_user_records = per_user.reset_index().to_dict(orient="records") per_user_records = per_user.reset_index().to_dict(orient="records")
@@ -91,19 +104,22 @@ class InteractionAnalysis:
merged_users = [] merged_users = []
for row in per_user_records: for row in per_user_records:
author = row["author"] author = row["author"]
merged_users.append({ merged_users.append(
"author": author, {
"post": int(row.get("post", 0)), "author": author,
"comment": int(row.get("comment", 0)), "post": int(row.get("post", 0)),
"comment_post_ratio": float(row.get("comment_post_ratio", 0)), "comment": int(row.get("comment", 0)),
"comment_share": float(row.get("comment_share", 0)), "comment_post_ratio": float(row.get("comment_post_ratio", 0)),
"vocab": vocab_by_author.get(author) "comment_share": float(row.get("comment_share", 0)),
}) "avg_emotions": avg_emotions_by_author.get(author, {}),
"vocab": vocab_by_author.get(author, {"vocab_richness": 0, "avg_words_per_event": 0, "top_words": []}),
}
)
merged_users.sort(key=lambda u: u["comment_post_ratio"]) merged_users.sort(key=lambda u: u["comment_post_ratio"])
return merged_users return merged_users
def interaction_graph(self): def interaction_graph(self):
interactions = {a: {} for a in self.df["author"].dropna().unique()} interactions = {a: {} for a in self.df["author"].dropna().unique()}
@@ -124,7 +140,7 @@ class InteractionAnalysis:
interactions[a][b] = interactions[a].get(b, 0) + 1 interactions[a][b] = interactions[a].get(b, 0) + 1
return interactions return interactions
def average_thread_depth(self): def average_thread_depth(self):
depths = [] depths = []
id_to_reply = self.df.set_index("id")["reply_to"].to_dict() id_to_reply = self.df.set_index("id")["reply_to"].to_dict()
@@ -144,14 +160,15 @@ class InteractionAnalysis:
if not depths: if not depths:
return 0 return 0
return round(sum(depths) / len(depths), 2) return round(sum(depths) / len(depths), 2)
def average_thread_length_by_emotion(self): def average_thread_length_by_emotion(self):
emotion_exclusions = {"emotion_neutral", "emotion_surprise"} emotion_exclusions = {"emotion_neutral", "emotion_surprise"}
emotion_cols = [ emotion_cols = [
c for c in self.df.columns c
for c in self.df.columns
if c.startswith("emotion_") and c not in emotion_exclusions if c.startswith("emotion_") and c not in emotion_exclusions
] ]
@@ -174,14 +191,18 @@ class InteractionAnalysis:
reply_to = id_to_reply.get(current) reply_to = id_to_reply.get(current)
if reply_to is None or (isinstance(reply_to, float) and pd.isna(reply_to)) or reply_to == "": if (
reply_to is None
or (isinstance(reply_to, float) and pd.isna(reply_to))
or reply_to == ""
):
break break
length += 1 length += 1
current = reply_to current = reply_to
if current in length_cache: if current in length_cache:
length += (length_cache[current] - 1) length += length_cache[current] - 1
break break
length_cache[start_id] = length length_cache[start_id] = length
@@ -205,4 +226,4 @@ class InteractionAnalysis:
return { return {
emotion: round(sum(lengths) / len(lengths), 2) emotion: round(sum(lengths) / len(lengths), 2)
for emotion, lengths in emotion_to_lengths.items() for emotion, lengths in emotion_to_lengths.items()
} }

View File

@@ -69,45 +69,4 @@ class LinguisticAnalysis:
.sort_values("count", ascending=False) .sort_values("count", ascending=False)
.head(limit) .head(limit)
.to_dict(orient="records") .to_dict(orient="records")
) )
def identity_markers(self):
df = self.df.copy()
df["content"] = df["content"].fillna("").astype(str).str.lower()
in_group_words = {"we", "us", "our", "ourselves"}
out_group_words = {"they", "them", "their", "themselves"}
emotion_exclusions = [
"emotion_neutral",
"emotion_surprise"
]
emotion_cols = [
col for col in self.df.columns
if col.startswith("emotion_") and col not in emotion_exclusions
]
in_count = 0
out_count = 0
in_emotions = {e: 0 for e in emotion_cols}
out_emotions = {e: 0 for e in emotion_cols}
total = 0
for post in df:
text = post["content"]
tokens = re.findall(r"\b[a-z]{2,}\b", text)
total += len(tokens)
in_count += sum(t in in_group_words for t in tokens)
out_count += sum(t in out_group_words for t in tokens)
emotions = post[emotion_cols]
print(emotions)
return {
"in_group_usage": in_count,
"out_group_usage": out_count,
"in_group_ratio": round(in_count / max(total, 1), 5),
"out_group_ratio": round(out_count / max(total, 1), 5),
}

View File

@@ -200,6 +200,35 @@ class NLP:
if column.startswith("emotion_") and column not in emotion_df.columns: if column.startswith("emotion_") and column not in emotion_df.columns:
self.df[column] = 0.0 self.df[column] = 0.0
# drop neutral and surprise columns from df and normalize others to sum to 1
drop_cols = ["emotion_neutral", "emotion_surprise"]
existing_drop = [c for c in drop_cols if c in self.df.columns]
self.df.drop(columns=existing_drop, inplace=True)
remaining_emotion_cols = [
c for c in self.df.columns
if c.startswith("emotion_")
]
if remaining_emotion_cols:
emotion_matrix = (
self.df[remaining_emotion_cols]
.apply(pd.to_numeric, errors="coerce")
.fillna(0.0)
)
row_sums = emotion_matrix.sum(axis=1)
# Avoid division by zero
row_sums = row_sums.replace(0, 1.0)
normalized = emotion_matrix.div(row_sums, axis=0)
self.df[remaining_emotion_cols] = normalized.values
def add_topic_col(self, confidence_threshold: float = 0.3) -> None: def add_topic_col(self, confidence_threshold: float = 0.3) -> None:
titles = self.df[self.title_col].fillna("").astype(str) titles = self.df[self.title_col].fillna("").astype(str)
contents = self.df[self.content_col].fillna("").astype(str) contents = self.df[self.content_col].fillna("").astype(str)
@@ -276,3 +305,5 @@ class NLP:
self.df[col_name] = [ self.df[col_name] = [
d.get(label, 0) for d in entity_count_dicts d.get(label, 0) for d in entity_count_dicts
] ]

View File

@@ -215,8 +215,8 @@ def get_interaction_analysis():
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500 return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
@app.route('/filter/search', methods=["POST"]) @app.route('/filter/query', methods=["POST"])
def search_dataset(): def filter_query():
if stat_obj is None: if stat_obj is None:
return jsonify({"error": "No data uploaded"}), 400 return jsonify({"error": "No data uploaded"}), 400
@@ -226,7 +226,7 @@ def search_dataset():
return jsonify(stat_obj.df.to_dict(orient="records")), 200 return jsonify(stat_obj.df.to_dict(orient="records")), 200
query = data["query"] query = data["query"]
filtered_df = stat_obj.search(query) filtered_df = stat_obj.filter_by_query(query)
return jsonify(filtered_df), 200 return jsonify(filtered_df), 200

View File

@@ -8,6 +8,7 @@ from server.analysis.temporal import TemporalAnalysis
from server.analysis.emotional import EmotionalAnalysis from server.analysis.emotional import EmotionalAnalysis
from server.analysis.interactional import InteractionAnalysis from server.analysis.interactional import InteractionAnalysis
from server.analysis.linguistic import LinguisticAnalysis from server.analysis.linguistic import LinguisticAnalysis
from server.analysis.cultural import CulturalAnalysis
DOMAIN_STOPWORDS = { DOMAIN_STOPWORDS = {
"www", "https", "http", "www", "https", "http",
@@ -15,8 +16,7 @@ DOMAIN_STOPWORDS = {
"comment", "comments", "comment", "comments",
"discussion", "thread", "discussion", "thread",
"post", "posts", "post", "posts",
"would", "could", "should", "would", "get", "one"
"like", "get", "one"
} }
nltk.download('stopwords') nltk.download('stopwords')
@@ -40,33 +40,32 @@ class StatGen:
self.df.drop(columns=["post_id"], inplace=True, errors="ignore") self.df.drop(columns=["post_id"], inplace=True, errors="ignore")
self.nlp = NLP(self.df, "title", "content", domain_topics) self.nlp = NLP(self.df, "title", "content", domain_topics)
self._add_extra_cols(self.df) self.nlp.add_emotion_cols()
self.nlp.add_topic_col()
self.nlp.add_ner_cols()
self._add_time_cols(self.df)
self.temporal_analysis = TemporalAnalysis(self.df) self.temporal_analysis = TemporalAnalysis(self.df)
self.emotional_analysis = EmotionalAnalysis(self.df) self.emotional_analysis = EmotionalAnalysis(self.df)
self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS) self.interaction_analysis = InteractionAnalysis(self.df, EXCLUDE_WORDS)
self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS) self.linguistic_analysis = LinguisticAnalysis(self.df, EXCLUDE_WORDS)
self.cultural_analysis = CulturalAnalysis(self.df)
self.original_df = self.df.copy(deep=True) self.original_df = self.df.copy(deep=True)
## Private Methods ## Private Methods
def _add_extra_cols(self, df: pd.DataFrame) -> None: def _add_time_cols(self, df: pd.DataFrame) -> None:
df['timestamp'] = pd.to_numeric(self.df['timestamp'], errors='coerce') df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date df['date'] = pd.to_datetime(df['timestamp'], unit='s').dt.date
df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True) df["dt"] = pd.to_datetime(df["timestamp"], unit="s", utc=True)
df["hour"] = df["dt"].dt.hour df["hour"] = df["dt"].dt.hour
df["weekday"] = df["dt"].dt.day_name() df["weekday"] = df["dt"].dt.day_name()
self.nlp.add_emotion_cols()
self.nlp.add_topic_col()
self.nlp.add_ner_cols()
## Public ## Public
# topics over time # topics over time
# emotions over time # emotions over time
def get_time_analysis(self) -> pd.DataFrame: def get_time_analysis(self) -> dict:
return { return {
"events_per_day": self.temporal_analysis.posts_per_day(), "events_per_day": self.temporal_analysis.posts_per_day(),
"weekday_hour_heatmap": self.temporal_analysis.heatmap() "weekday_hour_heatmap": self.temporal_analysis.heatmap()
@@ -87,24 +86,25 @@ class StatGen:
def get_user_analysis(self) -> dict: def get_user_analysis(self) -> dict:
return { return {
"top_users": self.interaction_analysis.top_users(), "top_users": self.interaction_analysis.top_users(),
"users": self.interaction_analysis.per_user_analysis(), "users": self.interaction_analysis.per_user_analysis()
"interaction_graph": self.interaction_analysis.interaction_graph()
} }
# average / max thread depth # average / max thread depth
# high engagement threads based on volume # high engagement threads based on volume
def get_interactional_analysis(self) -> dict: def get_interactional_analysis(self) -> dict:
return { return {
"average_thread_depth": self.interaction_analysis.average_thread_depth(), "average_thread_depth": self.interaction_analysis.average_thread_depth(),
"average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion() "average_thread_length_by_emotion": self.interaction_analysis.average_thread_length_by_emotion(),
"interaction_graph": self.interaction_analysis.interaction_graph()
} }
# detect community jargon # detect community jargon
# in-group and out-group linguistic markers # in-group and out-group linguistic markers
def get_cultural_analysis(self) -> dict: def get_cultural_analysis(self) -> dict:
return { return {
"identity_markers": self.linguistic_analysis.identity_markers() "identity_markers": self.cultural_analysis.get_identity_markers(),
"stance_markers": self.cultural_analysis.get_stance_markers(),
"entity_salience": self.cultural_analysis.get_avg_emotions_per_entity()
} }
def summary(self) -> dict: def summary(self) -> dict:
@@ -127,7 +127,7 @@ class StatGen:
"sources": self.df["source"].dropna().unique().tolist() "sources": self.df["source"].dropna().unique().tolist()
} }
def search(self, search_query: str) -> dict: def filter_by_query(self, search_query: str) -> dict:
self.df = self.df[ self.df = self.df[
self.df["content"].str.contains(search_query) self.df["content"].str.contains(search_query)
] ]