feat(linguistic): add most common 2- and 3-word n-grams
This commit is contained in:
@@ -2,12 +2,21 @@ import pandas as pd
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
|
from itertools import islice
|
||||||
|
|
||||||
class LinguisticAnalysis:
|
class LinguisticAnalysis:
|
||||||
def __init__(self, df: pd.DataFrame, word_exclusions: set[str]):
    """Store the posts frame and the exclusion set for later analyses.

    Args:
        df: DataFrame of posts; text is read from its "content" column.
        word_exclusions: words to ignore in word-frequency analysis.
    """
    self.df = df
    self.word_exclusions = word_exclusions
|
||||||
|
|
||||||
|
def _clean_text(self, text: str) -> str:
    """Strip scrape artifacts (URLs, HTML entities, stray 'amp' tokens,
    image filenames) from *text* and return the cleaned string."""
    junk_patterns = (
        r"http\S+",                        # full URLs
        r"www\S+",                         # bare www links
        r"&\w+;",                          # HTML entities (&amp; etc.)
        r"\bamp\b",                        # stray "amp" left by mangled entities
        r"\S+\.(jpg|jpeg|png|webp|gif)",   # image filenames
    )
    # Apply the removals in the same order as before: entity removal must
    # precede the stray-"amp" pass.
    for pattern in junk_patterns:
        text = re.sub(pattern, "", text)
    return text
|
||||||
|
|
||||||
def word_frequencies(self, limit: int = 100) -> dict:
|
def word_frequencies(self, limit: int = 100) -> dict:
|
||||||
texts = (
|
texts = (
|
||||||
self.df["content"]
|
self.df["content"]
|
||||||
@@ -34,4 +43,26 @@ class LinguisticAnalysis:
|
|||||||
.reset_index(drop=True)
|
.reset_index(drop=True)
|
||||||
)
|
)
|
||||||
|
|
||||||
return word_frequencies.to_dict(orient="records")
|
return word_frequencies.to_dict(orient="records")
|
||||||
|
|
||||||
|
def ngrams(self, n=2, limit=100):
    """Return the most common word n-grams across all post contents.

    Args:
        n: n-gram length in words (default 2, i.e. bigrams).
        limit: maximum number of n-grams to return (default 100).

    Returns:
        list[dict]: records of the form ``{"ngram": "<space-joined words>",
        "count": <int>}``, ordered by descending count.
    """
    texts = (
        self.df["content"]
        .dropna()
        .astype(str)
        .apply(self._clean_text)
        .str.lower()
    )

    counts = Counter()
    for text in texts:
        # Only words of 3+ lowercase letters; shorter tokens are mostly noise.
        tokens = re.findall(r"\b[a-z]{3,}\b", text)
        # NOTE: stop word removal causes strange behaviors in ngrams,
        # so word_exclusions is deliberately NOT applied here:
        # tokens = [w for w in tokens if w not in self.word_exclusions]

        # Sliding window of n consecutive tokens; feeding the generator
        # straight into the Counter avoids materializing every n-gram
        # of the whole corpus in one list.
        counts.update(
            " ".join(gram)
            for gram in zip(*(islice(tokens, i, None) for i in range(n)))
        )

    # most_common sorts only the distinct n-grams — cheaper than building a
    # DataFrame purely to sort and truncate it. Output shape is unchanged.
    return [{"ngram": gram, "count": count} for gram, count in counts.most_common(limit)]
|
||||||
@@ -65,6 +65,22 @@ class StatGen:
|
|||||||
"events_per_day": self.temporal_analysis.posts_per_day(),
|
"events_per_day": self.temporal_analysis.posts_per_day(),
|
||||||
"weekday_hour_heatmap": self.temporal_analysis.heatmap()
|
"weekday_hour_heatmap": self.temporal_analysis.heatmap()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def content_analysis(self) -> dict:
    """Aggregate the text-content statistics into a single dict."""
    linguistic = self.linguistic_analysis
    analysis = {}
    analysis["word_frequencies"] = linguistic.word_frequencies()
    analysis["common_two_phrases"] = linguistic.ngrams()
    analysis["common_three_phrases"] = linguistic.ngrams(n=3)
    analysis["average_emotion_by_topic"] = self.emotional_analysis.avg_emotion_by_topic()
    analysis["reply_time_by_emotion"] = self.temporal_analysis.avg_reply_time_per_emotion()
    return analysis
|
||||||
|
|
||||||
|
def user_analysis(self) -> dict:
    """Aggregate the per-user interaction statistics into a single dict."""
    interactions = self.interaction_analysis
    return dict(
        top_users=interactions.top_users(),
        users=interactions.per_user_analysis(),
        interaction_graph=interactions.interaction_graph(),
    )
|
||||||
|
|
||||||
def summary(self) -> dict:
|
def summary(self) -> dict:
|
||||||
total_posts = (self.df["type"] == "post").sum()
|
total_posts = (self.df["type"] == "post").sum()
|
||||||
@@ -85,20 +101,6 @@ class StatGen:
|
|||||||
},
|
},
|
||||||
"sources": self.df["source"].dropna().unique().tolist()
|
"sources": self.df["source"].dropna().unique().tolist()
|
||||||
}
|
}
|
||||||
|
|
||||||
def content_analysis(self) -> dict:
|
|
||||||
return {
|
|
||||||
"word_frequencies": self.linguistic_analysis.word_frequencies(),
|
|
||||||
"average_emotion_by_topic": self.emotional_analysis.avg_emotion_by_topic(),
|
|
||||||
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
|
|
||||||
}
|
|
||||||
|
|
||||||
def user_analysis(self) -> dict:
|
|
||||||
return {
|
|
||||||
"top_users": self.interaction_analysis.top_users(),
|
|
||||||
"users": self.interaction_analysis.per_user_analysis(),
|
|
||||||
"interaction_graph": self.interaction_analysis.interaction_graph()
|
|
||||||
}
|
|
||||||
|
|
||||||
def search(self, search_query: str) -> dict:
|
def search(self, search_query: str) -> dict:
|
||||||
self.df = self.df[
|
self.df = self.df[
|
||||||
|
|||||||
Reference in New Issue
Block a user