From ed0dd8cdbc4a058734a91e1359f050be86d5e532 Mon Sep 17 00:00:00 2001 From: Dylan De Faoite Date: Tue, 17 Feb 2026 18:48:45 +0000 Subject: [PATCH] feat(nlp): add Named Entity Recognition to dataset --- server/nlp.py | 93 ++++++++++++++++++++++++++++++++++++++++++++++ server/stat_gen.py | 1 + 2 files changed, 94 insertions(+) diff --git a/server/nlp.py b/server/nlp.py index 379d918..c3fcf89 100644 --- a/server/nlp.py +++ b/server/nlp.py @@ -9,6 +9,7 @@ from sentence_transformers import SentenceTransformer class NLP: _topic_models: dict[str, SentenceTransformer] = {} _emotion_classifiers: dict[str, Any] = {} + _entity_recognizers: dict[str, Any] = {} _topic_embedding_cache: dict[tuple[str, ...], np.ndarray] = {} def __init__( @@ -29,6 +30,9 @@ class NLP: self.emotion_classifier = self._get_emotion_classifier( self.device_str, self.pipeline_device ) + self.entity_recognizer = self._get_entity_recognizer( + self.device_str, self.pipeline_device + ) except RuntimeError as exc: if self.use_cuda and "out of memory" in str(exc).lower(): torch.cuda.empty_cache() @@ -86,6 +90,27 @@ class NLP: ) cls._emotion_classifiers[device_str] = classifier return classifier + + @classmethod + def _get_entity_recognizer(cls, device_str: str, pipeline_device: int) -> Any: + recognizer = cls._entity_recognizers.get(device_str) + if recognizer is None: + pipeline_kwargs = { + "aggregation_strategy": "simple", # merges subwords + "device": pipeline_device, + } + + if device_str == "cuda": + pipeline_kwargs["dtype"] = torch.float16 + + recognizer = pipeline( + "token-classification", + model="dslim/bert-base-NER", + **pipeline_kwargs, + ) + cls._entity_recognizers[device_str] = recognizer + + return recognizer def _encode_with_backoff( self, texts: list[str], initial_batch_size: int @@ -129,6 +154,26 @@ class NLP: continue raise + def _infer_entities_with_backoff( + self, texts: list[str], initial_batch_size: int + ) -> list[list[dict[str, Any]]]: + + batch_size = initial_batch_size + + while True: + try: + return self.entity_recognizer(texts, batch_size=batch_size) + except RuntimeError as exc: + if ( + self.use_cuda + and "out of memory" in str(exc).lower() + and batch_size > 4 + ): + batch_size = max(4, batch_size // 2) + torch.cuda.empty_cache() + continue + raise + def add_emotion_cols(self) -> None: texts = self.df[self.content_col].astype(str).str.slice(0, 512).tolist() @@ -183,3 +228,51 @@ class NLP: self.df.loc[self.df["topic_confidence"] < confidence_threshold, "topic"] = ( "Misc" ) + + def add_ner_cols(self, max_chars: int = 512) -> None: + texts = ( + self.df[self.content_col] + .fillna("") + .astype(str) + .str.slice(0, max_chars) + .tolist() + ) + + if not texts: + self.df["entities"] = [] + self.df["entity_counts"] = [] + return + + results = self._infer_entities_with_backoff(texts, 32 if self.use_cuda else 8) + + entity_lists = [] + entity_count_dicts = [] + + for row in results: + entities = [] + counts = {} + + for ent in row: + word = ent.get("word") + label = ent.get("entity_group") + + if isinstance(word, str) and isinstance(label, str): + entities.append({"text": word, "label": label}) + counts[label] = counts.get(label, 0) + 1 + + entity_lists.append(entities) + entity_count_dicts.append(counts) + + self.df["entities"] = entity_lists + self.df["entity_counts"] = entity_count_dicts + + # Expand label counts into columns + all_labels = set() + for d in entity_count_dicts: + all_labels.update(d.keys()) + + for label in all_labels: + col_name = f"entity_{label}" + self.df[col_name] = [ + d.get(label, 0) for d in entity_count_dicts + ] diff --git a/server/stat_gen.py b/server/stat_gen.py index 4200741..202e804 100644 --- a/server/stat_gen.py +++ b/server/stat_gen.py @@ -58,6 +58,7 @@ class StatGen: self.nlp.add_emotion_cols() self.nlp.add_topic_col() + self.nlp.add_ner_cols() ## Public def time_analysis(self) -> pd.DataFrame: