feat(api): add cultural endpoint
This commit is contained in:
@@ -9,6 +9,10 @@ class LinguisticAnalysis:
|
|||||||
self.df = df
|
self.df = df
|
||||||
self.word_exclusions = word_exclusions
|
self.word_exclusions = word_exclusions
|
||||||
|
|
||||||
|
def _tokenize(self, text: str):
|
||||||
|
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
||||||
|
return [t for t in tokens if t not in self.word_exclusions]
|
||||||
|
|
||||||
def _clean_text(self, text: str) -> str:
|
def _clean_text(self, text: str) -> str:
|
||||||
text = re.sub(r"http\S+", "", text) # remove URLs
|
text = re.sub(r"http\S+", "", text) # remove URLs
|
||||||
text = re.sub(r"www\S+", "", text)
|
text = re.sub(r"www\S+", "", text)
|
||||||
@@ -65,4 +69,45 @@ class LinguisticAnalysis:
|
|||||||
.sort_values("count", ascending=False)
|
.sort_values("count", ascending=False)
|
||||||
.head(limit)
|
.head(limit)
|
||||||
.to_dict(orient="records")
|
.to_dict(orient="records")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def identity_markers(self):
    """Measure in-group vs. out-group pronoun usage across all posts.

    Returns:
        dict with raw counts ("in_group_usage", "out_group_usage") and
        their ratios over the total token count ("in_group_ratio",
        "out_group_ratio"), each ratio rounded to 5 decimal places.
    """
    df = self.df.copy()
    # Normalize content: missing values become "", everything lowercased
    # so the lowercase-only token regex below can match.
    df["content"] = df["content"].fillna("").astype(str).str.lower()

    in_group_words = {"we", "us", "our", "ourselves"}
    out_group_words = {"they", "them", "their", "themselves"}

    in_count = 0
    out_count = 0
    total = 0

    # BUG FIX: the original iterated `for post in df:`, which yields the
    # DataFrame's COLUMN LABELS (strings), so `post["content"]` indexed into
    # a column name instead of reading a row. Iterate the content column
    # directly instead. (Also removed a leftover debug `print(emotions)` and
    # the dead in_emotions/out_emotions accumulators that were never used.)
    for text in df["content"]:
        # 2+ letter tokens so short pronouns like "we" and "us" are counted.
        tokens = re.findall(r"\b[a-z]{2,}\b", text)
        total += len(tokens)
        in_count += sum(t in in_group_words for t in tokens)
        out_count += sum(t in out_group_words for t in tokens)

    # max(total, 1) guards against division by zero on an empty dataset.
    return {
        "in_group_usage": in_count,
        "out_group_usage": out_count,
        "in_group_ratio": round(in_count / max(total, 1), 5),
        "out_group_ratio": round(out_count / max(total, 1), 5),
    }
@@ -100,6 +100,19 @@ def get_user_analysis():
|
|||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
return jsonify({"error": f"An unexpected error occurred: {str(e)}"}), 500
|
||||||
|
|
||||||
|
@app.route("/stats/cultural", methods=["GET"])
def get_cultural_analysis():
    """Serve the cultural analysis for the currently uploaded dataset.

    Responses:
        200 — JSON payload from ``stat_obj.cultural_analysis()``.
        400 — no dataset uploaded yet, or the data is malformed/missing.
        500 — any other unexpected failure (traceback printed server-side).
    """
    # Guard clause: nothing to analyse until a dataset has been uploaded.
    if stat_obj is None:
        return jsonify({"error": "No data uploaded"}), 400

    try:
        # jsonify stays inside the try so serialization failures are
        # handled the same way as analysis failures.
        return jsonify(stat_obj.cultural_analysis()), 200
    except ValueError as exc:
        return jsonify({"error": f"Malformed or missing data: {str(exc)}"}), 400
    except Exception as exc:
        print(traceback.format_exc())
        return jsonify({"error": f"An unexpected error occurred: {str(exc)}"}), 500
@app.route('/filter/search', methods=["POST"])
|
@app.route('/filter/search', methods=["POST"])
|
||||||
def search_dataset():
|
def search_dataset():
|
||||||
if stat_obj is None:
|
if stat_obj is None:
|
||||||
|
|||||||
@@ -62,12 +62,17 @@ class StatGen:
|
|||||||
self.nlp.add_ner_cols()
|
self.nlp.add_ner_cols()
|
||||||
|
|
||||||
## Public
|
## Public
|
||||||
|
|
||||||
|
|
||||||
|
# topics over time
# emotions over time
def time_analysis(self) -> dict:
    """Bundle the temporal statistics into a single payload.

    Returns:
        dict with "events_per_day" (from ``temporal_analysis.posts_per_day``)
        and "weekday_hour_heatmap" (from ``temporal_analysis.heatmap``).
    """
    # FIX: the annotation previously claimed ``-> pd.DataFrame`` but the
    # method returns a plain dict — corrected to match the sibling
    # ``*_analysis`` methods, which are all annotated ``-> dict``.
    return {
        "events_per_day": self.temporal_analysis.posts_per_day(),
        "weekday_hour_heatmap": self.temporal_analysis.heatmap(),
    }
|
|
||||||
|
# average topic duration
|
||||||
def content_analysis(self) -> dict:
|
def content_analysis(self) -> dict:
|
||||||
return {
|
return {
|
||||||
"word_frequencies": self.linguistic_analysis.word_frequencies(),
|
"word_frequencies": self.linguistic_analysis.word_frequencies(),
|
||||||
@@ -77,6 +82,8 @@ class StatGen:
|
|||||||
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
|
"reply_time_by_emotion": self.temporal_analysis.avg_reply_time_per_emotion()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# average emotion per user
|
||||||
|
# average chain length
|
||||||
def user_analysis(self) -> dict:
|
def user_analysis(self) -> dict:
|
||||||
return {
|
return {
|
||||||
"top_users": self.interaction_analysis.top_users(),
|
"top_users": self.interaction_analysis.top_users(),
|
||||||
@@ -84,6 +91,21 @@ class StatGen:
|
|||||||
"interaction_graph": self.interaction_analysis.interaction_graph()
|
"interaction_graph": self.interaction_analysis.interaction_graph()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# average / max thread depth
# high engagement threads based on volume
def conversational_analysis(self) -> dict:
    """Conversation-structure statistics.

    Currently a placeholder: always returns an empty dict. The planned
    metrics (thread depth, high-engagement threads) are noted above.
    """
    return {}
|
|
||||||
|
# detect community jargon
# in-group and out-group linguistic markers
def cultural_analysis(self) -> dict:
    """Bundle cultural/linguistic statistics into a single payload.

    Returns:
        dict with "identity_markers" from
        ``linguistic_analysis.identity_markers()``.
    """
    markers = self.linguistic_analysis.identity_markers()
    return {"identity_markers": markers}
|
|
||||||
def summary(self) -> dict:
|
def summary(self) -> dict:
|
||||||
total_posts = (self.df["type"] == "post").sum()
|
total_posts = (self.df["type"] == "post").sum()
|
||||||
total_comments = (self.df["type"] == "comment").sum()
|
total_comments = (self.df["type"] == "comment").sum()
|
||||||
|
|||||||
Reference in New Issue
Block a user