Compare commits

...

4 Commits

6 changed files with 100 additions and 10 deletions

Dockerfile (new file)

@@ -0,0 +1,19 @@
# Use slim to reduce size
FROM python:3.13-slim

# Prevent Python from buffering stdout
ENV PYTHONUNBUFFERED=1

# System deps required for psycopg2 + torch
RUN apt-get update && apt-get install -y \
    build-essential \
    libpq-dev \
    gcc \
    curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["python", "main.py"]

docker-compose.yml

@@ -11,5 +11,34 @@ services:
       - ./server/db/postgres_vol:/var/lib/postgresql/data
       - ./server/db/schema.sql:/docker-entrypoint-initdb.d/schema.sql
+
+  redis:
+    image: redis:7
+    container_name: redis
+    restart: unless-stopped
+    ports:
+      - "6379:6379"
+
+  backend:
+    build: .
+    container_name: flask_backend
+    env_file:
+      - .env
+    ports:
+      - "5000:5000"
+    command: flask --app server.app run --host=0.0.0.0
+    depends_on:
+      - postgres
+      - redis
+
+  worker:
+    build: .
+    container_name: celery_worker
+    env_file:
+      - .env
+    command: >
+      celery -A server.queue.celery_app.celery worker
+      --loglevel=info
+      --pool=solo
+    depends_on:
+      - postgres
+      - redis
 volumes:
   postgres_data:

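Aside, not part of the diff: once the stack is up, broker reachability can be sanity-checked from the host through the published 6379 port, using the redis client already pinned in requirements.txt. A minimal sketch:

import redis

# Talks to the port published by the redis service above.
r = redis.Redis(host="localhost", port=6379, db=0)
r.ping()  # raises redis.exceptions.ConnectionError if the broker is down
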
requirements.txt

@@ -1,13 +1,17 @@
 beautifulsoup4==4.14.3
+celery==5.6.2
+redis==7.2.1
 Flask==3.1.3
 Flask_Bcrypt==1.0.1
 flask_cors==6.0.2
 Flask_JWT_Extended==4.7.1
 google_api_python_client==2.188.0
 nltk==3.9.2
 numpy==2.4.2
 pandas==3.0.1
 psycopg2==2.9.11
 psycopg2_binary==2.9.11
-python-dotenv==1.2.1
+python-dotenv==1.2.2
 Requests==2.32.5
 sentence_transformers==5.2.2
 torch==2.10.0

server/app.py

@@ -21,6 +21,7 @@ from server.db.database import PostgresConnector
 from server.core.auth import AuthManager
 from server.core.datasets import DatasetManager
 from server.utils import get_request_filters
+from server.queue.tasks import process_dataset
 
 app = Flask(__name__)
@@ -129,19 +130,21 @@ def upload_data():
         posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
         topics = json.load(topic_file)
-        processor = DatasetEnrichment(posts_df, topics)
-        enriched_df = processor.enrich()
         dataset_id = dataset_manager.save_dataset_info(current_user, f"dataset_{current_user}", topics)
-        dataset_manager.save_dataset_content(dataset_id, enriched_df)
+        process_dataset.delay(
+            dataset_id,
+            posts_df.to_dict(orient="records"),
+            topics
+        )
         return jsonify(
             {
-                "message": "File uploaded successfully",
-                "event_count": len(enriched_df),
+                "message": "Dataset queued for processing",
+                "dataset_id": dataset_id,
+                "status": "processing"
             }
-        ), 200
+        ), 202
     except ValueError as e:
         return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
     except Exception as e:

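The 202 response now hands clients a dataset_id and a "processing" status, but this branch adds no endpoint for checking when the worker has finished. A hypothetical polling route could look like the sketch below; dataset_is_ready() is an assumed DatasetManager helper, not something in this diff:

@app.route("/datasets/<int:dataset_id>/status", methods=["GET"])
def dataset_status(dataset_id):
    # dataset_is_ready() is hypothetical: e.g. check whether
    # save_dataset_content() has written rows for this id yet.
    if dataset_manager.dataset_is_ready(dataset_id):
        return jsonify({"dataset_id": dataset_id, "status": "ready"}), 200
    return jsonify({"dataset_id": dataset_id, "status": "processing"}), 200
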
server/queue/celery_app.py (new file)

@@ -0,0 +1,16 @@
from celery import Celery

def create_celery():
    celery = Celery(
        "ethnograph",
        broker="redis://redis:6379/0",
        backend="redis://redis:6379/0",
    )
    celery.conf.task_serializer = "json"
    celery.conf.result_serializer = "json"
    celery.conf.accept_content = ["json"]
    return celery

celery = create_celery()

# Imported last so task modules can do `from server.queue.celery_app import celery`
# without a circular import; importing here registers the tasks with the app.
from server.queue import tasks

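Side note: for unit tests, Celery can execute tasks inline without Redis at all via its eager mode. A sketch, assuming tests import the app object above:

from server.queue.celery_app import celery

# Run .delay() calls synchronously in the calling process (no broker needed).
celery.conf.task_always_eager = True
# Re-raise task exceptions in the test instead of storing them in the backend.
celery.conf.task_eager_propagates = True
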
server/queue/tasks.py (new file)

@@ -0,0 +1,19 @@
import pandas as pd

from server.queue.celery_app import celery
from server.analysis.enrichment import DatasetEnrichment

@celery.task(bind=True, max_retries=3)
def process_dataset(self, dataset_id: int, posts: list, topics: dict):
    # Local imports: the DB layer is only needed when the task actually runs.
    from server.db.database import PostgresConnector
    from server.core.datasets import DatasetManager

    db = PostgresConnector()
    dataset_manager = DatasetManager(db)

    # Rebuild the DataFrame from the JSON-serializable payload, enrich it,
    # and persist the result for the dataset row created by the upload route.
    df = pd.DataFrame(posts)
    processor = DatasetEnrichment(df, topics)
    enriched_df = processor.enrich()
    dataset_manager.save_dataset_content(dataset_id, enriched_df)
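
One observation: the task declares bind=True and max_retries=3 but never calls self.retry, so a failed run simply fails. If retries are intended, the usual Celery pattern is the sketch below (same body, with the retry path wired up; the backoff choice is illustrative):

@celery.task(bind=True, max_retries=3)
def process_dataset(self, dataset_id: int, posts: list, topics: dict):
    from server.db.database import PostgresConnector
    from server.core.datasets import DatasetManager
    try:
        dataset_manager = DatasetManager(PostgresConnector())
        enriched_df = DatasetEnrichment(pd.DataFrame(posts), topics).enrich()
        dataset_manager.save_dataset_content(dataset_id, enriched_df)
    except Exception as exc:
        # Exponential backoff: 1s, 2s, 4s; gives transient DB errors a chance.
        raise self.retry(exc=exc, countdown=2 ** self.request.retries)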