# NOTE(review): this file was a whitespace-mangled `git format-patch` email
# (commit 6684780, "fix(connectors): add stronger validation to scrape endpoint",
# Dylan De Faoite, Thu 12 Mar 2026). The diff's line breaks and +/- markers were
# destroyed, so the patch can no longer be applied as a patch. Below is the
# reconstructed post-patch state of the two affected routes in server/app.py,
# with two review fixes applied to the newly added validation code:
#   1. A non-string dataset name passed validation (via str(...).strip()) but
#      then crashed on data["name"].strip() with AttributeError -> 500.
#      The name is now required to be a real, non-blank string (clean 400).
#   2. Connector capability flags are read with .get(...), so a connector
#      whose metadata omits "search_enabled"/"categories_enabled" produces a
#      400 instead of an unhandled KeyError -> 500.
# The hunk's trailing context (the /datasets/upload route header) is not part
# of this change and is omitted here because its body is not visible.


@app.route("/datasets/sources", methods=["GET"])
def get_dataset_sources():
    """Return the available connector metadata as a JSON list.

    The connector registry maps connector name -> metadata dict; clients
    want a flat list, so only the values are returned.
    """
    return jsonify(list(get_connector_metadata().values()))


@app.route("/datasets/scrape", methods=["POST"])
@jwt_required()
def scrape_data():
    """Validate a scrape request, persist the dataset record, and queue processing.

    Strong validation is needed up front: malformed data that reaches the
    Celery task crashes silently. Each source is checked against the
    connector registry, including whether it supports search/categories.

    Returns:
        202 with the new dataset id once queued, 400 for any invalid input,
        500 if persisting or queueing fails.
    """
    data = request.get_json()
    connector_metadata = get_connector_metadata()

    if not data or "sources" not in data:
        return jsonify({"error": "Sources must be provided"}), 400

    # Require a real, non-blank string. Coercing with str() at validation time
    # but calling .strip() on the raw value later allowed non-string names
    # (e.g. JSON numbers) through to an AttributeError -> 500.
    name_field = data.get("name")
    if not isinstance(name_field, str) or not name_field.strip():
        return jsonify({"error": "Dataset name is required"}), 400
    dataset_name = name_field.strip()

    user_id = int(get_jwt_identity())

    source_configs = data["sources"]
    if not isinstance(source_configs, list) or len(source_configs) == 0:
        return jsonify({"error": "Sources must be a non-empty list"}), 400

    for source in source_configs:
        if not isinstance(source, dict):
            return jsonify({"error": "Each source must be an object"}), 400

        if "name" not in source:
            return jsonify({"error": "Each source must contain a name"}), 400

        if "limit" in source:
            try:
                source["limit"] = int(source["limit"])
            except (ValueError, TypeError):
                return jsonify({"error": "Limit must be an integer"}), 400

        name = source["name"]

        if name not in connector_metadata:
            return jsonify({"error": "Source not supported"}), 400

        # .get(...) keeps a connector with incomplete metadata from raising
        # KeyError; a missing capability flag is treated as "not supported".
        if "search" in source and not connector_metadata[name].get("search_enabled"):
            return jsonify({"error": f"Source {name} does not support search"}), 400

        if "category" in source and not connector_metadata[name].get("categories_enabled"):
            return jsonify({"error": f"Source {name} does not support categories"}), 400

    try:
        dataset_id = dataset_manager.save_dataset_info(
            user_id,
            dataset_name,
            default_topic_list,
        )

        dataset_manager.set_dataset_status(
            dataset_id,
            "fetching",
            f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
        )

        fetch_and_process_dataset.delay(
            dataset_id,
            source_configs,
            default_topic_list,
        )
    except Exception:
        # Boundary handler: log the full traceback server-side, hide
        # internals from the client.
        print(traceback.format_exc())
        return jsonify({"error": "Failed to queue dataset processing"}), 500

    return jsonify({
        "message": "Dataset queued for processing",
        "dataset_id": dataset_id,
        "status": "processing",
    }), 202