# NOTE(review): this file was a whitespace-mangled `git format-patch` email
# (commit 6684780, "fix(connectors): add stronger validation to scrape endpoint",
# Dylan De Faoite, Thu 12 Mar 2026). The diff's line breaks and +/- markers were
# destroyed, so the patch can no longer be applied as a patch. Below is the
# reconstructed post-patch state of the two affected routes in server/app.py,
# with two review fixes applied to the newly added validation code:
#   1. A non-string dataset name passed validation (via str(...).strip()) but
#      then crashed on data["name"].strip() with AttributeError -> 500.
#      The name is now required to be a real, non-blank string (clean 400).
#   2. Connector capability flags are read with .get(...), so a connector
#      whose metadata omits "search_enabled"/"categories_enabled" produces a
#      400 instead of an unhandled KeyError -> 500.
# The hunk's trailing context (the /datasets/upload route header) is not part
# of this change and is omitted here because its body is not visible.


@app.route("/datasets/sources", methods=["GET"])
def get_dataset_sources():
    """Return the available connector metadata as a JSON list.

    The connector registry maps connector name -> metadata dict; clients
    want a flat list, so only the values are returned.
    """
    return jsonify(list(get_connector_metadata().values()))


@app.route("/datasets/scrape", methods=["POST"])
@jwt_required()
def scrape_data():
    """Validate a scrape request, persist the dataset record, and queue processing.

    Strong validation is needed up front: malformed data that reaches the
    Celery task crashes silently. Each source is checked against the
    connector registry, including whether it supports search/categories.

    Returns:
        202 with the new dataset id once queued, 400 for any invalid input,
        500 if persisting or queueing fails.
    """
    data = request.get_json()
    connector_metadata = get_connector_metadata()

    if not data or "sources" not in data:
        return jsonify({"error": "Sources must be provided"}), 400

    # Require a real, non-blank string. Coercing with str() at validation time
    # but calling .strip() on the raw value later allowed non-string names
    # (e.g. JSON numbers) through to an AttributeError -> 500.
    name_field = data.get("name")
    if not isinstance(name_field, str) or not name_field.strip():
        return jsonify({"error": "Dataset name is required"}), 400
    dataset_name = name_field.strip()

    user_id = int(get_jwt_identity())

    source_configs = data["sources"]
    if not isinstance(source_configs, list) or len(source_configs) == 0:
        return jsonify({"error": "Sources must be a non-empty list"}), 400

    for source in source_configs:
        if not isinstance(source, dict):
            return jsonify({"error": "Each source must be an object"}), 400

        if "name" not in source:
            return jsonify({"error": "Each source must contain a name"}), 400

        if "limit" in source:
            try:
                source["limit"] = int(source["limit"])
            except (ValueError, TypeError):
                return jsonify({"error": "Limit must be an integer"}), 400

        name = source["name"]

        if name not in connector_metadata:
            return jsonify({"error": "Source not supported"}), 400

        # .get(...) keeps a connector with incomplete metadata from raising
        # KeyError; a missing capability flag is treated as "not supported".
        if "search" in source and not connector_metadata[name].get("search_enabled"):
            return jsonify({"error": f"Source {name} does not support search"}), 400

        if "category" in source and not connector_metadata[name].get("categories_enabled"):
            return jsonify({"error": f"Source {name} does not support categories"}), 400

    try:
        dataset_id = dataset_manager.save_dataset_info(
            user_id,
            dataset_name,
            default_topic_list,
        )

        dataset_manager.set_dataset_status(
            dataset_id,
            "fetching",
            f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
        )

        fetch_and_process_dataset.delay(
            dataset_id,
            source_configs,
            default_topic_list,
        )
    except Exception:
        # Boundary handler: log the full traceback server-side, hide
        # internals from the client.
        print(traceback.format_exc())
        return jsonify({"error": "Failed to queue dataset processing"}), 500

    return jsonify({
        "message": "Dataset queued for processing",
        "dataset_id": dataset_id,
        "status": "processing",
    }), 202