Corpus Explorer Feature #11
@@ -1,17 +1,30 @@
|
|||||||
import pandas as pd
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from itertools import islice
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class NGramConfig:
    """Immutable tuning knobs shared by the n-gram / token-frequency analyses."""

    min_token_length: int = 3  # tokens shorter than this never match the token regex
    min_count: int = 2  # n-grams seen fewer than this many times are filtered out
    max_results: int = 100  # default cap on returned rows when no explicit limit is given
||||||
class LinguisticAnalysis:
|
class LinguisticAnalysis:
|
||||||
    def __init__(self, word_exclusions: set[str]):
        """Initialise the analysis with a stop-word set and default n-gram config.

        Args:
            word_exclusions: lower-case words to exclude from token-level results.
        """
        self.word_exclusions = word_exclusions
        # Default configuration; attributes are read by _tokenize and ngrams.
        self.ngram_config = NGramConfig()
||||||
def _tokenize(self, text: str):
|
def _tokenize(self, text: str, *, include_exclusions: bool = False) -> list[str]:
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
pattern = rf"\b[a-z]{{{self.ngram_config.min_token_length},}}\b"
|
||||||
return [t for t in tokens if t not in self.word_exclusions]
|
tokens = re.findall(pattern, text)
|
||||||
|
|
||||||
|
if include_exclusions:
|
||||||
|
return tokens
|
||||||
|
|
||||||
|
return [token for token in tokens if token not in self.word_exclusions]
|
||||||
|
|
||||||
def _clean_text(self, text: str) -> str:
|
def _clean_text(self, text: str) -> str:
|
||||||
text = re.sub(r"http\S+", "", text) # remove URLs
|
text = re.sub(r"http\S+", "", text) # remove URLs
|
||||||
@@ -21,13 +34,24 @@ class LinguisticAnalysis:
|
|||||||
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
|
text = re.sub(r"\S+\.(jpg|jpeg|png|webp|gif)", "", text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def _content_texts(self, df: pd.DataFrame) -> pd.Series:
|
||||||
|
return df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
|
||||||
|
|
||||||
|
def _valid_ngram(self, tokens: tuple[str, ...]) -> bool:
|
||||||
|
if any(token in self.word_exclusions for token in tokens):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if len(set(tokens)) == 1:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
|
def word_frequencies(self, df: pd.DataFrame, limit: int = 100) -> list[dict]:
|
||||||
texts = df["content"].dropna().astype(str).str.lower()
|
texts = self._content_texts(df)
|
||||||
|
|
||||||
words = []
|
words = []
|
||||||
for text in texts:
|
for text in texts:
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
words.extend(self._tokenize(text))
|
||||||
words.extend(w for w in tokens if w not in self.word_exclusions)
|
|
||||||
|
|
||||||
counts = Counter(words)
|
counts = Counter(words)
|
||||||
|
|
||||||
@@ -40,25 +64,39 @@ class LinguisticAnalysis:
|
|||||||
|
|
||||||
return word_frequencies.to_dict(orient="records")
|
return word_frequencies.to_dict(orient="records")
|
||||||
|
|
||||||
def ngrams(self, df: pd.DataFrame, n=2, limit=100):
|
def ngrams(self, df: pd.DataFrame, n: int = 2, limit: int | None = None) -> list[dict]:
|
||||||
texts = df["content"].dropna().astype(str).apply(self._clean_text).str.lower()
|
if n < 2:
|
||||||
|
raise ValueError("n must be at least 2")
|
||||||
|
|
||||||
|
texts = self._content_texts(df)
|
||||||
all_ngrams = []
|
all_ngrams = []
|
||||||
|
result_limit = limit or self.ngram_config.max_results
|
||||||
|
|
||||||
for text in texts:
|
for text in texts:
|
||||||
tokens = re.findall(r"\b[a-z]{3,}\b", text)
|
tokens = self._tokenize(text, include_exclusions=True)
|
||||||
|
|
||||||
# stop word removal causes strange behaviors in ngrams
|
if len(tokens) < n:
|
||||||
# tokens = [w for w in tokens if w not in self.word_exclusions]
|
continue
|
||||||
|
|
||||||
ngrams = zip(*(islice(tokens, i, None) for i in range(n)))
|
for index in range(len(tokens) - n + 1):
|
||||||
all_ngrams.extend([" ".join(ng) for ng in ngrams])
|
ngram_tokens = tuple(tokens[index : index + n])
|
||||||
|
if self._valid_ngram(ngram_tokens):
|
||||||
|
all_ngrams.append(" ".join(ngram_tokens))
|
||||||
|
|
||||||
counts = Counter(all_ngrams)
|
counts = Counter(all_ngrams)
|
||||||
|
filtered_counts = [
|
||||||
|
(ngram, count)
|
||||||
|
for ngram, count in counts.items()
|
||||||
|
if count >= self.ngram_config.min_count
|
||||||
|
]
|
||||||
|
|
||||||
|
if not filtered_counts:
|
||||||
|
return []
|
||||||
|
|
||||||
return (
|
return (
|
||||||
pd.DataFrame(counts.items(), columns=["ngram", "count"])
|
pd.DataFrame(filtered_counts, columns=["ngram", "count"])
|
||||||
.sort_values("count", ascending=False)
|
.sort_values(["count", "ngram"], ascending=[False, True])
|
||||||
.head(limit)
|
.head(result_limit)
|
||||||
.to_dict(orient="records")
|
.to_dict(orient="records")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -424,7 +424,7 @@ def get_linguistic_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.linguistic(dataset_content, filters)), 200
|
return jsonify(stat_gen.linguistic(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -448,7 +448,7 @@ def get_emotional_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.emotional(dataset_content, filters)), 200
|
return jsonify(stat_gen.emotional(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -472,7 +472,7 @@ def get_summary(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.summary(dataset_content, filters)), 200
|
return jsonify(stat_gen.summary(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -496,7 +496,7 @@ def get_temporal_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.temporal(dataset_content, filters)), 200
|
return jsonify(stat_gen.temporal(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -520,7 +520,7 @@ def get_user_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.user(dataset_content, filters)), 200
|
return jsonify(stat_gen.user(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -544,7 +544,7 @@ def get_cultural_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.cultural(dataset_content, filters)), 200
|
return jsonify(stat_gen.cultural(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
@@ -568,7 +568,7 @@ def get_interaction_analysis(dataset_id):
|
|||||||
|
|
||||||
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
dataset_content = dataset_manager.get_dataset_content(dataset_id)
|
||||||
filters = get_request_filters()
|
filters = get_request_filters()
|
||||||
return jsonify(stat_gen.interactional(dataset_content, filters)), 200
|
return jsonify(stat_gen.interactional(dataset_content, filters, dataset_id=dataset_id)), 200
|
||||||
except NotAuthorisedException:
|
except NotAuthorisedException:
|
||||||
return jsonify({"error": "User is not authorised to access this content"}), 403
|
return jsonify({"error": "User is not authorised to access this content"}), 403
|
||||||
except NonExistentDatasetException:
|
except NonExistentDatasetException:
|
||||||
|
|||||||
Reference in New Issue
Block a user