diff --git a/server/analysis/linguistic.py b/server/analysis/linguistic.py
index a242739..1e91728 100644
--- a/server/analysis/linguistic.py
+++ b/server/analysis/linguistic.py
@@ -1,17 +1,30 @@
-import pandas as pd
 import re
-
 from collections import Counter
-from itertools import islice
+from dataclasses import dataclass
+
+import pandas as pd
+
+
+@dataclass(frozen=True)
+class NGramConfig:
+    min_token_length: int = 3
+    min_count: int = 2
+    max_results: int = 100
 
 
 class LinguisticAnalysis:
     def __init__(self, word_exclusions: set[str]):
         self.word_exclusions = word_exclusions
+        self.ngram_config = NGramConfig()
 
-    def _tokenize(self, text: str):
-        tokens = re.findall(r"\b[a-z]{3,}\b", text)
-        return [t for t in tokens if t not in self.word_exclusions]
+    def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
+        pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
+        tokens = re.findall(pattern, text)
+
+        if include_exclusions:
+            return tokens
+
+        return [token for token in tokens if token not in self.word_exclusions]
 
     def _clean_text(self, text: str) -> str:
         text = re.sub(r"http\S+", "", text)  # remove URLs
@@ -21,13 +34,24 @@ class LinguisticAnalysis:
         text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
         return text
 
+    def _content_texts(self, df: pd.DataFrame) -> pd.Series:
+        return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
+
+    def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
+        if any(token in self.word_exclusions for token in tokens):
+            return False
+
+        if len(set(tokens)) == 1:
+            return False
+
+        return True
+
     def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
-        texts = df["content"].dropna().astype(str).str.lower()
+        texts = self._content_texts(df)
 
         words = []
         for text in texts:
-            tokens = re.findall(r"\b[a-z]{3,}\b", text)
-            words.extend(w for w in tokens if w not in self.word_exclusions)
+            words.extend(self._tokenize(text))
 
         counts = Counter(words)
 
@@ -40,25 +64,39 @@ class LinguisticAnalysis:
 
         return word_frequencies.to_dict(orient="records")
 
-    def ngrams(self, df: pd.DataFrame, n=2, limit=100):
-        texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
+    def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
+        if n < 2:
+            raise ValueError("n must be at least 2")
+
+        texts = self._content_texts(df)
 
         all_ngrams = []
+        result_limit = limit or self.ngram_config.max_results
 
         for text in texts:
-            tokens = re.findall(r"\b[a-z]{3,}\b", text)
+            tokens = self._tokenize(text, include_exclusions=True)
 
-            # stop word removal causes strange behaviors in ngrams
-            # tokens = [w for w in tokens if w not in self.word_exclusions]
+            if len(tokens) < n:
+                continue
 
-            ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
-            all_ngrams.extend([" ".join(ng) for ng in ngrams])
+            for index in range(len(tokens) - n + 1):
+                ngram_tokens = tuple(tokens[index : index + n])
+                if self._valid_ngram(ngram_tokens):
+                    all_ngrams.append(" ".join(ngram_tokens))
 
         counts = Counter(all_ngrams)
+        filtered_counts = [
+            (ngram, count)
+            for ngram, count in counts.items()
+            if count >= self.ngram_config.min_count
+        ]
+
+        if not filtered_counts:
+            return []
 
         return (
-            pd.DataFrame(counts.items(), columns=["ngram", "count"])
-            .sort_values("count", ascending=False)
-            .head(limit)
+            pd.DataFrame(filtered_counts, columns=["ngram", "count"])
+            .sort_values(["count", "ngram"], ascending=[False, True])
+            .head(result_limit)
             .to_dict(orient="records")
        )
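
A minimal usage sketch of the reworked n-gram path; the sample frame, the exclusion set, and the import path are illustrative assumptions, not part of the patch:

    import pandas as pd

    # Import path assumed from the file location shown in this diff.
    from server.analysis.linguistic import LinguisticAnalysis

    analysis = LinguisticAnalysis(word_exclusions={"the", "and"})
    df = pd.DataFrame(
        {
            "content": [
                "Quick brown fox jumps over the lazy dog",
                "Quick brown fox naps under the tall tree",
            ]
        }
    )

    # Bigrams containing an excluded word ("over the", "the lazy", ...) are
    # rejected by _valid_ngram, and anything seen fewer than
    # NGramConfig.min_count (2) times is dropped, so only the two repeated
    # bigrams survive; equal counts are then tie-broken alphabetically.
    print(analysis.ngrams(df, n=2))
    # [{'ngram': 'brown fox', 'count': 2}, {'ngram': 'quick brown', 'count': 2}]
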
diff --git a/server/app.py b/server/app.py
index aa22a25..7f7c8fe 100644
--- a/server/app.py
+++ b/server/app.py
@@ -424,7 +424,7 @@ def get_linguistic_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
 
-        return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
+        return jsonify(stat_gen.linguistic(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -448,7 +448,7 @@ def get_emotional_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
 
-        return jsonify(stat_gen.emotional(dataset_content, filters)), 200
+        return jsonify(stat_gen.emotional(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -472,7 +472,7 @@ def get_summary(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
 
-        return jsonify(stat_gen.summary(dataset_content, filters)), 200
+        return jsonify(stat_gen.summary(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -496,7 +496,7 @@ def get_temporal_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
 
-        return jsonify(stat_gen.temporal(dataset_content, filters)), 200
+        return jsonify(stat_gen.temporal(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -520,7 +520,7 @@ def get_user_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
 
-        return jsonify(stat_gen.user(dataset_content, filters)), 200
+        return jsonify(stat_gen.user(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -544,7 +544,7 @@ def get_cultural_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
 
-        return jsonify(stat_gen.cultural(dataset_content, filters)), 200
+        return jsonify(stat_gen.cultural(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
@@ -568,7 +568,7 @@ def get_interaction_analysis(dataset_id):
         dataset_content = dataset_manager.get_dataset_content(dataset_id)
         filters = get_request_filters()
 
-        return jsonify(stat_gen.interactional(dataset_content, filters)), 200
+        return jsonify(stat_gen.interactional(dataset_content, filters, dataset_id=dataset_id)), 200
     except NotAuthorisedException:
         return jsonify({"error": "User is not authorised to access this content"}), 403
     except NonExistentDatasetException:
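
Every route now passes dataset_id as a keyword argument, so each of the seven stat_gen methods must accept it. The generator class is not part of this diff; the sketch below is an inferred shape, with the class name and the None default chosen purely for illustration:

    import pandas as pd

    class StatGenerator:
        # Hypothetical stand-in for the object behind stat_gen; only the new
        # dataset_id keyword is established by the call sites above. The same
        # parameter is needed on emotional, summary, temporal, user, cultural,
        # and interactional.
        def linguistic(self, dataset_content: pd.DataFrame, filters: dict, *, dataset_id=None) -> dict:
            ...
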