fix(connectors): add stronger validation to scrape endpoint
Strong validation is needed here; otherwise malformed payloads are handed to Celery and crash silently. The endpoint now also checks whether each requested source supports search and category filtering.
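For illustration, a request that should pass the new checks might look like the sketch below (the source name, dataset name, host, and token are placeholders, not values from this commit):

import requests

# Hypothetical request against the hardened endpoint; "example_source",
# the host, and the JWT are placeholders, not values from this commit.
payload = {
    "name": "my-dataset",              # required, must be non-empty after strip()
    "sources": [                       # required, non-empty list of objects
        {
            "name": "example_source",  # must be a key in get_connector_metadata()
            "search": "solar power",   # allowed only if the source has search_enabled
            "limit": 100,              # coerced with int(); otherwise a 400
        }
    ],
}

resp = requests.post(
    "http://localhost:5000/datasets/scrape",
    json=payload,
    headers={"Authorization": "Bearer <JWT>"},  # endpoint is @jwt_required()
)
print(resp.status_code, resp.json())  # 202 with dataset_id on success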
@@ -119,50 +119,82 @@ def get_user_datasets():
 @app.route("/datasets/sources", methods=["GET"])
 def get_dataset_sources():
-    return jsonify(get_connector_metadata())
+    list_metadata = list(get_connector_metadata().values())
+    return jsonify(list_metadata)
 
 
 @app.route("/datasets/scrape", methods=["POST"])
 @jwt_required()
 def scrape_data():
     data = request.get_json()
+    connector_metadata = get_connector_metadata()
+
+    # Strong validation needed, otherwise data goes to Celery and crashes silently
     if not data or "sources" not in data:
         return jsonify({"error": "Sources must be provided"}), 400
 
+    if "name" not in data or not str(data["name"]).strip():
+        return jsonify({"error": "Dataset name is required"}), 400
+
+    dataset_name = data["name"].strip()
     user_id = int(get_jwt_identity())
 
     source_configs = data["sources"]
 
-    # Light Validation
+    if not isinstance(source_configs, list) or len(source_configs) == 0:
+        return jsonify({"error": "Sources must be a non-empty list"}), 400
+
     for source in source_configs:
         if not isinstance(source, dict):
             return jsonify({"error": "Each source must be an object"}), 400
 
         if "name" not in source:
             return jsonify({"error": "Each source must contain a name"}), 400
+
-        if "limit" in source:
-            source["limit"] = int(source["limit"])
+        if "limit" in source:
+            try:
+                source["limit"] = int(source["limit"])
+            except (ValueError, TypeError):
+                return jsonify({"error": "Limit must be an integer"}), 400
+
+        name = source["name"]
+
+        if name not in connector_metadata:
+            return jsonify({"error": "Source not supported"}), 400
+
+        if "search" in source and not connector_metadata[name]["search_enabled"]:
+            return jsonify({"error": f"Source {name} does not support search"}), 400
+
+        if "category" in source and not connector_metadata[name]["categories_enabled"]:
+            return jsonify({"error": f"Source {name} does not support categories"}), 400
 
-    dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list)
+    dataset_id = dataset_manager.save_dataset_info(
+        user_id,
+        dataset_name,
+        default_topic_list
+    )
+
     dataset_manager.set_dataset_status(
         dataset_id,
         "fetching",
         f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
     )
 
     try:
-        fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list)
+        fetch_and_process_dataset.delay(
+            dataset_id,
+            source_configs,
+            default_topic_list
+        )
     except Exception:
         print(traceback.format_exc())
-        return jsonify({"error": "An unexpected error occurred"}), 500
+        return jsonify({"error": "Failed to queue dataset processing"}), 500
 
-    return jsonify(
-        {
-            "message": "Dataset queued for processing",
-            "dataset_id": dataset_id,
-            "status": "processing",
-        }
-    ), 202
+    return jsonify({
+        "message": "Dataset queued for processing",
+        "dataset_id": dataset_id,
+        "status": "processing"
+    }), 202
 
 
 @app.route("/datasets/upload", methods=["POST"])
 @jwt_required()
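The per-source checks above rely on get_connector_metadata() returning a dict keyed by source name whose values carry search_enabled and categories_enabled flags. A sketch of that assumed shape (the source name and flag values are illustrative):

# Assumed return shape of get_connector_metadata(); only the two
# *_enabled flags are read by the validation in scrape_data().
connector_metadata = {
    "example_source": {
        "search_enabled": True,        # "search" accepted in a source config
        "categories_enabled": False,   # "category" rejected with a 400
    },
}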
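The new error paths also lend themselves to request-level tests; a minimal sketch, assuming a standard Flask test client fixture named client and a hypothetical auth_headers JWT helper:

# Sketch of tests for the new validation; `client` is a standard Flask
# test client fixture and `auth_headers` a hypothetical JWT helper.
def test_scrape_requires_sources(client, auth_headers):
    resp = client.post("/datasets/scrape", json={"name": "ds"}, headers=auth_headers)
    assert resp.status_code == 400
    assert resp.get_json()["error"] == "Sources must be provided"

def test_scrape_rejects_empty_source_list(client, auth_headers):
    resp = client.post(
        "/datasets/scrape",
        json={"name": "ds", "sources": []},
        headers=auth_headers,
    )
    assert resp.status_code == 400
    assert resp.get_json()["error"] == "Sources must be a non-empty list"

def test_scrape_rejects_non_integer_limit(client, auth_headers):
    # The limit check runs before the metadata lookup, so this fails on
    # the limit even though "example_source" may not be a real connector.
    resp = client.post(
        "/datasets/scrape",
        json={"name": "ds", "sources": [{"name": "example_source", "limit": "many"}]},
        headers=auth_headers,
    )
    assert resp.status_code == 400
    assert resp.get_json()["error"] == "Limit must be an integer"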