fix(ngrams): remove stop words from ngrams

This commit is contained in:
2026-04-01 08:44:47 +01:00
parent 6378015726
commit cd6030a760
2 changed files with 64 additions and 26 deletions

View File

@@ -1,17 +1,30 @@
import pandas as pd
import re import re
from collections import Counter from collections import Counter
from itertools import islice from dataclasses import dataclass
import pandas as pd
@dataclass(frozen=True)
class NGramConfig:
    """Immutable tuning parameters for n-gram extraction."""

    # Minimum length (in characters) for a token to be matched by the
    # tokenizer's regex.
    min_token_length: int = 3
    # N-grams occurring fewer than this many times are dropped from results.
    min_count: int = 2
    # Default cap on the number of n-grams returned when no explicit limit
    # is supplied by the caller.
    max_results: int = 100
class LinguisticAnalysis: class LinguisticAnalysis:
def __init__(self, word_exclusions: set[str]):
    """Store the stop-word set and a default n-gram configuration.

    Args:
        word_exclusions: stop words to filter out of analyses; compared
            against lower-cased tokens, so they should be lower-case too.
    """
    self.word_exclusions = word_exclusions
    self.ngram_config = NGramConfig()
def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
    """Split *text* into lower-case word tokens.

    Tokens shorter than the configured minimum length are discarded by
    the regex itself.  Unless ``include_exclusions`` is true, stop words
    from ``self.word_exclusions`` are filtered out as well.
    """
    min_len = self.ngram_config.min_token_length
    tokens = re.findall(rf"\b[a-z]{{{min_len},}}\b", text)
    if not include_exclusions:
        tokens = [t for t in tokens if t not in self.word_exclusions]
    return tokens
def _clean_text(self, text: str) -> str: def _clean_text(self, text: str) -> str:
text = re.sub(r"http\S+", "", text) # remove URLs text = re.sub(r"http\S+", "", text) # remove URLs
@@ -21,13 +34,24 @@ class LinguisticAnalysis:
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text) text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
return text return text
def _content_texts(self, df: pd.DataFrame) -> pd.Series:
    """Return the ``content`` column of *df*, cleaned and lower-cased.

    Rows with missing content are dropped; every remaining value is
    coerced to ``str``, normalised via ``_clean_text`` and lower-cased.
    """
    contents = df["content"].dropna().astype(str)
    return contents.apply(self._clean_text).str.lower()
def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
    """Return True when *tokens* form an n-gram worth counting.

    An n-gram is rejected when it contains any excluded (stop) word, or
    when every position holds the same token.
    """
    contains_stop_word = not self.word_exclusions.isdisjoint(tokens)
    all_same_token = len(set(tokens)) == 1
    return not (contains_stop_word or all_same_token)
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]: def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
texts = df["content"].dropna().astype(str).str.lower() texts = self._content_texts(df)
words = [] words = []
for text in texts: for text in texts:
tokens = re.findall(r"\b[a-z]{3,}\b", text) words.extend(self._tokenize(text))
words.extend(w for w in tokens if w not in self.word_exclusions)
counts = Counter(words) counts = Counter(words)
@@ -40,25 +64,39 @@ class LinguisticAnalysis:
return word_frequencies.to_dict(orient="records") return word_frequencies.to_dict(orient="records")
def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
    """Count the most frequent word n-grams in the dataset's content.

    Args:
        df: dataset with a ``content`` column of raw post text.
        n: size of the n-grams (bigrams by default); must be >= 2.
        limit: maximum number of rows returned.  ``None`` falls back to
            ``NGramConfig.max_results``.

    Returns:
        A list of ``{"ngram": str, "count": int}`` records, most frequent
        first (ties broken alphabetically).  N-grams containing a stop
        word, repeating a single token, or seen fewer than
        ``NGramConfig.min_count`` times are filtered out.

    Raises:
        ValueError: if ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("n must be at least 2")
    texts = self._content_texts(df)
    # BUG FIX: ``limit or max_results`` silently replaced an explicit
    # ``limit=0`` with the default; only ``None`` should trigger the fallback.
    result_limit = self.ngram_config.max_results if limit is None else limit
    all_ngrams = []
    for text in texts:
        # Tokenize WITH stop words so windows are built over the real word
        # sequence; whole n-grams containing a stop word are rejected by
        # ``_valid_ngram`` instead.  (Removing stop words first would glue
        # together words that were never adjacent in the text.)
        tokens = self._tokenize(text, include_exclusions=True)
        # When len(tokens) < n the range below is empty, so no extra guard
        # is needed.
        for index in range(len(tokens) - n + 1):
            ngram_tokens = tuple(tokens[index : index + n])
            if self._valid_ngram(ngram_tokens):
                all_ngrams.append(" ".join(ngram_tokens))
    counts = Counter(all_ngrams)
    # Drop rare n-grams below the configured support threshold.
    filtered_counts = [
        (ngram, count)
        for ngram, count in counts.items()
        if count >= self.ngram_config.min_count
    ]
    if not filtered_counts:
        return []
    return (
        pd.DataFrame(filtered_counts, columns=["ngram", "count"])
        # Secondary alphabetical key keeps the ordering deterministic
        # across runs when counts tie.
        .sort_values(["count", "ngram"], ascending=[False, True])
        .head(result_limit)
        .to_dict(orient="records")
    )

View File

@@ -424,7 +424,7 @@ def get_linguistic_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.linguistic(dataset_content, filters)), 200 return jsonify(stat_gen.linguistic(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -448,7 +448,7 @@ def get_emotional_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.emotional(dataset_content, filters)), 200 return jsonify(stat_gen.emotional(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -472,7 +472,7 @@ def get_summary(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.summary(dataset_content, filters)), 200 return jsonify(stat_gen.summary(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -496,7 +496,7 @@ def get_temporal_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.temporal(dataset_content, filters)), 200 return jsonify(stat_gen.temporal(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -520,7 +520,7 @@ def get_user_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.user(dataset_content, filters)), 200 return jsonify(stat_gen.user(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -544,7 +544,7 @@ def get_cultural_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.cultural(dataset_content, filters)), 200 return jsonify(stat_gen.cultural(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException:
@@ -568,7 +568,7 @@ def get_interaction_analysis(dataset_id):
dataset_content = dataset_manager.get_dataset_content(dataset_id) dataset_content = dataset_manager.get_dataset_content(dataset_id)
filters = get_request_filters() filters = get_request_filters()
return jsonify(stat_gen.interactional(dataset_content, filters)), 200 return jsonify(stat_gen.interactional(dataset_content, filters, dataset_id=dataset_id)), 200
except NotAuthorisedException: except NotAuthorisedException:
return jsonify({"error": "User is not authorised to access this content"}), 403 return jsonify({"error": "User is not authorised to access this content"}), 403
except NonExistentDatasetException: except NonExistentDatasetException: