Compare commits
4 Commits
cc71c80df7
...
3a58705635
| Author | SHA1 | Date | |
|---|---|---|---|
| 3a58705635 | |||
| 2e0e842525 | |||
| 14b472ea60 | |||
| c767f59b26 |
19
Dockerfile
Normal file
19
Dockerfile
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# Use slim to reduce size
FROM python:3.13-slim

# Prevent Python from buffering stdout
ENV PYTHONUNBUFFERED=1

# System deps required for psycopg2 + torch.
# --no-install-recommends keeps optional "recommended" packages out of the
# image, in line with the slim base chosen above.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libpq-dev \
    gcc \
    curl \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy only the dependency manifest first so the pip layer below is reused
# from cache when application code changes but requirements.txt does not.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Default command for a bare `docker run`; a service-level `command` in
# docker-compose takes precedence over this.
CMD ["python", "main.py"]
|
||||||
@@ -11,5 +11,34 @@ services:
|
|||||||
- ./server/db/postgres_vol:/var/lib/postgresql/data
|
- ./server/db/postgres_vol:/var/lib/postgresql/data
|
||||||
- ./server/db/schema.sql:/docker-entrypoint-initdb.d/schema.sql
|
- ./server/db/schema.sql:/docker-entrypoint-initdb.d/schema.sql
|
||||||
|
|
||||||
volumes:
|
redis:
|
||||||
postgres_data:
|
image: redis:7
|
||||||
|
container_name: redis
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "6379:6379"
|
||||||
|
|
||||||
|
backend:
|
||||||
|
build: .
|
||||||
|
container_name: flask_backend
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
ports:
|
||||||
|
- "5000:5000"
|
||||||
|
command: flask --app server.app run --host=0.0.0.0
|
||||||
|
depends_on:
|
||||||
|
- postgres
|
||||||
|
- redis
|
||||||
|
|
||||||
|
worker:
|
||||||
|
build: .
|
||||||
|
container_name: celery_worker
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
command: >
|
||||||
|
celery -A server.queue.celery_app.celery worker
|
||||||
|
--loglevel=info
|
||||||
|
--pool=solo
|
||||||
|
depends_on:
|
||||||
|
- postgres
|
||||||
|
- redis
|
||||||
@@ -1,13 +1,17 @@
|
|||||||
beautifulsoup4==4.14.3
|
beautifulsoup4==4.14.3
|
||||||
|
celery==5.6.2
|
||||||
|
redis==7.2.1
|
||||||
Flask==3.1.3
|
Flask==3.1.3
|
||||||
|
Flask_Bcrypt==1.0.1
|
||||||
flask_cors==6.0.2
|
flask_cors==6.0.2
|
||||||
|
Flask_JWT_Extended==4.7.1
|
||||||
google_api_python_client==2.188.0
|
google_api_python_client==2.188.0
|
||||||
nltk==3.9.2
|
nltk==3.9.2
|
||||||
numpy==2.4.2
|
numpy==2.4.2
|
||||||
pandas==3.0.1
|
pandas==3.0.1
|
||||||
psycopg2==2.9.11
|
psycopg2==2.9.11
|
||||||
psycopg2_binary==2.9.11
|
psycopg2_binary==2.9.11
|
||||||
python-dotenv==1.2.1
|
python-dotenv==1.2.2
|
||||||
Requests==2.32.5
|
Requests==2.32.5
|
||||||
sentence_transformers==5.2.2
|
sentence_transformers==5.2.2
|
||||||
torch==2.10.0
|
torch==2.10.0
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from server.db.database import PostgresConnector
|
|||||||
from server.core.auth import AuthManager
|
from server.core.auth import AuthManager
|
||||||
from server.core.datasets import DatasetManager
|
from server.core.datasets import DatasetManager
|
||||||
from server.utils import get_request_filters
|
from server.utils import get_request_filters
|
||||||
|
from server.queue.tasks import process_dataset
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
@@ -129,19 +130,21 @@ def upload_data():
|
|||||||
|
|
||||||
posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
|
posts_df = pd.read_json(post_file, lines=True, convert_dates=False)
|
||||||
topics = json.load(topic_file)
|
topics = json.load(topic_file)
|
||||||
|
|
||||||
processor = DatasetEnrichment(posts_df, topics)
|
|
||||||
enriched_df = processor.enrich()
|
|
||||||
dataset_id = dataset_manager.save_dataset_info(current_user, f"dataset_{current_user}", topics)
|
dataset_id = dataset_manager.save_dataset_info(current_user, f"dataset_{current_user}", topics)
|
||||||
dataset_manager.save_dataset_content(dataset_id, enriched_df)
|
|
||||||
|
process_dataset.delay(
|
||||||
|
dataset_id,
|
||||||
|
posts_df.to_dict(orient="records"),
|
||||||
|
topics
|
||||||
|
)
|
||||||
|
|
||||||
return jsonify(
|
return jsonify(
|
||||||
{
|
{
|
||||||
"message": "File uploaded successfully",
|
"message": "Dataset queued for processing",
|
||||||
"event_count": len(enriched_df),
|
|
||||||
"dataset_id": dataset_id,
|
"dataset_id": dataset_id,
|
||||||
|
"status": "processing"
|
||||||
}
|
}
|
||||||
), 200
|
), 202
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
|
return jsonify({"error": f"Failed to read JSONL file: {str(e)}"}), 400
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
16
server/queue/celery_app.py
Normal file
16
server/queue/celery_app.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
import os

from celery import Celery

# Default Redis URL; the host name "redis" matches the service name used in
# the compose file, so this only resolves inside that network.
_DEFAULT_REDIS_URL = "redis://redis:6379/0"


def create_celery(
    broker_url: str | None = None,
    backend_url: str | None = None,
) -> Celery:
    """Build and configure the Celery application named "ethnograph".

    The broker and result backend are resolved in this order:

    1. the explicit ``broker_url`` / ``backend_url`` argument,
    2. the ``CELERY_BROKER_URL`` / ``CELERY_RESULT_BACKEND`` environment
       variable,
    3. the built-in default ``redis://redis:6379/0``.

    Calling ``create_celery()`` with no arguments and no environment
    overrides yields the same configuration as before.

    Args:
        broker_url: Optional message-broker URL.
        backend_url: Optional result-backend URL.

    Returns:
        A Celery app restricted to JSON for task and result serialization.
    """
    broker = (
        broker_url
        or os.environ.get("CELERY_BROKER_URL")
        or _DEFAULT_REDIS_URL
    )
    backend = (
        backend_url
        or os.environ.get("CELERY_RESULT_BACKEND")
        or _DEFAULT_REDIS_URL
    )

    app = Celery("ethnograph", broker=broker, backend=backend)

    # JSON only: tasks and results are serialized as JSON, and any other
    # content type is refused on receipt.
    app.conf.update(
        task_serializer="json",
        result_serializer="json",
        accept_content=["json"],
    )
    return app


# Module-level application instance; this is the object addressed as
# `server.queue.celery_app.celery` when starting a worker.
celery = create_celery()

# Imported last on purpose: server.queue.tasks does
# `from server.queue.celery_app import celery`, so `celery` must already be
# defined here. The import is for its side effect of registering the tasks.
from server.queue import tasks  # noqa: E402,F401
|
||||||
19
server/queue/tasks.py
Normal file
19
server/queue/tasks.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from server.queue.celery_app import celery
|
||||||
|
from server.analysis.enrichment import DatasetEnrichment
|
||||||
|
|
||||||
|
@celery.task(bind=True, max_retries=3)
def process_dataset(self, dataset_id: int, posts: list, topics: dict):
    """Enrich an uploaded dataset in the background and persist the result.

    Args:
        dataset_id: Identifier of the dataset record the content belongs to.
        posts: Post records as a list of dicts (the caller sends
            ``DataFrame.to_dict(orient="records")``); rebuilt into a
            DataFrame here because task arguments travel as JSON.
        topics: Topic definitions passed through to ``DatasetEnrichment``.

    Raises:
        celery.exceptions.Retry: When an attempt fails and retries remain;
            Celery reschedules the task after its default retry delay.
        Exception: The original error, once ``max_retries`` is exhausted.
    """
    # Imported lazily so they load when the task runs, not at module import.
    from server.db.database import PostgresConnector
    from server.core.datasets import DatasetManager

    try:
        db = PostgresConnector()
        dataset_manager = DatasetManager(db)

        df = pd.DataFrame(posts)

        processor = DatasetEnrichment(df, topics)
        enriched_df = processor.enrich()

        dataset_manager.save_dataset_content(dataset_id, enriched_df)
    except Exception as exc:
        # `bind=True` and `max_retries=3` only take effect when the task
        # asks for a retry; without this call a failure was final.
        # NOTE(review): assumes save_dataset_content is safe to run again
        # after a failed attempt — confirm it cannot leave partial rows.
        raise self.retry(exc=exc)
|
||||||
Reference in New Issue
Block a user