diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py
index 3c1fdbc..c292328 100644
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -2,12 +2,22 @@
 import pandas as pd
 import re
 from collections import Counter
+from itertools import islice
 
 class LinguisticAnalysis:
     def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
         self.df = df
         self.word_exclusions = word_exclusions
 
+    def _clean_text(self, text: str) -> str:
+        """Strip URLs, HTML entities, and image filenames from *text*."""
+        text = re.sub(r"http\S+", "", text)  # remove URLs
+        text = re.sub(r"www\S+", "", text)
+        text = re.sub(r"&\w+;", "", text)  # remove HTML entities
+        text = re.sub(r"\bamp\b", "", text)  # remove stray "amp" left by a mangled &amp;
+        # \b after the group: without it "photo.jpeg" matched only "photo.jpg" and left "eg" behind
+        text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)\b", "", text)
+        return text
+
     def word_frequencies(self, limit: int = 100) -> dict:
         texts = (
             self.df["content"]
@@ -34,4 +44,27 @@ class LinguisticAnalysis:
             .reset_index(drop=True)
         )
 
-        return word_frequencies.to_dict(orient="records")
\ No newline at end of file
+        return word_frequencies.to_dict(orient="records")
+
+    def ngrams(self, n: int = 2, limit: int = 100) -> list:
+        """Return the *limit* most common n-grams as [{"ngram", "count"}, ...] records."""
+        texts = self.df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
+        all_ngrams = []
+
+        for text in texts:
+            tokens = re.findall(r"\b[a-z]{3,}\b", text)
+
+            # stop word removal causes strange behaviors in ngrams
+            # tokens = [w for w in tokens if w not in self.word_exclusions]
+
+            ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
+            all_ngrams.extend(" ".join(ng) for ng in ngrams)
+
+        counts = Counter(all_ngrams)
+
+        return (
+            pd.DataFrame(counts.items(), columns=["ngram", "count"])
+            .sort_values("count", ascending=False)
+            .head(limit)
+            .to_dict(orient="records")
+        )
diff --git a/server/stat_gen.py b/server/stat_gen.py
index 3e36010..4200741 100644
--- a/server/stat_gen.py
+++ b/server/stat_gen.py
@@ -65,6 +65,24 @@ class StatGen:
             "events_per_day": self.temporal_analysis.posts_per_day(),
             "weekday_hour_heatmap": self.temporal_analysis.heatmap()
         }
+
+    def content_analysis(self) -> dict:
+        """Aggregate linguistic, emotional, and temporal content statistics."""
+        return {
+            "word_frequencies": self.linguistic_analysis.word_frequencies(),
+            "common_two_phrases": self.linguistic_analysis.ngrams(),
+            "common_three_phrases": self.linguistic_analysis.ngrams(n=3),
+            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(),
+            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
+        }
+
+    def user_analysis(self) -> dict:
+        """Aggregate per-user activity and interaction statistics."""
+        return {
+            "top_users": self.interaction_analysis.top_users(),
+            "users": self.interaction_analysis.per_user_analysis(),
+            "interaction_graph": self.interaction_analysis.interaction_graph()
+        }
 
     def summary(self) -> dict:
         total_posts = (self.df["type"] == "post").sum()
@@ -85,20 +103,6 @@ class StatGen:
             },
             "sources": self.df["source"].dropna().unique().tolist()
         }
-
-    def content_analysis(self) -> dict:
-        return {
-            "word_frequencies": self.linguistic_analysis.word_frequencies(),
-            "average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(),
-            "reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
-        }
-
-    def user_analysis(self) -> dict:
-        return {
-            "top_users": self.interaction_analysis.top_users(),
-            "users": self.interaction_analysis.per_user_analysis(),
-            "interaction_graph": self.interaction_analysis.interaction_graph()
-        }
 
     def search(self, search_query: str) -> dict:
         self.df = self.df[