Compare commits

..

4 Commits

Author SHA1 Message Date
d96f459104 fix(connectors): update URL references to use base_url in BoardsAPI 2026-03-13 21:59:17 +00:00
162a4de64e fix(frontend): detects which sources support category or search 2026-03-12 10:07:28 +00:00
6684780d23 fix(connectors): add stronger validation to scrape endpoint
Strong validation is needed, otherwise invalid data goes to Celery and crashes silently. In addition, it checks whether the specific source supports search or categories.
2026-03-12 09:59:07 +00:00
c12f1b4371 chore(connectors): add category and search validation fields 2026-03-12 09:56:34 +00:00
7 changed files with 136 additions and 50 deletions

View File

@@ -9,6 +9,10 @@ const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;
type SourceOption = {
id: string;
label: string;
search_enabled?: boolean;
categories_enabled?: boolean;
searchEnabled?: boolean;
categoriesEnabled?: boolean;
};
type SourceConfig = {
@@ -25,6 +29,12 @@ const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
category: "",
});
// Capability checks for a source option. The backend emits snake_case
// flags (`search_enabled`, `categories_enabled`); the camelCase spellings
// are also honoured, with snake_case taking precedence when present.
const supportsSearch = (source?: SourceOption): boolean => {
  const enabled = source?.search_enabled ?? source?.searchEnabled;
  return Boolean(enabled);
};
const supportsCategories = (source?: SourceOption): boolean => {
  const enabled = source?.categories_enabled ?? source?.categoriesEnabled;
  return Boolean(enabled);
};
const AutoScrapePage = () => {
const navigate = useNavigate();
const [datasetName, setDatasetName] = useState("");
@@ -63,11 +73,18 @@ const AutoScrapePage = () => {
const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => {
setSourceConfigs((previous) =>
previous.map((config, configIndex) =>
configIndex === index ? { ...config, [field]: value } : config
configIndex === index
? field === "sourceName"
? { ...config, sourceName: value, search: "", category: "" }
: { ...config, [field]: value }
: config
)
);
};
const getSourceOption = (sourceName: string) =>
sourceOptions.find((option) => option.id === sourceName);
const addSourceConfig = () => {
setSourceConfigs((previous) => [
...previous,
@@ -100,12 +117,18 @@ const AutoScrapePage = () => {
return;
}
const normalizedSources = sourceConfigs.map((source) => ({
name: source.sourceName,
limit: Number(source.limit || 100),
search: source.search.trim() || undefined,
category: source.category.trim() || undefined,
}));
const normalizedSources = sourceConfigs.map((source) => {
const sourceOption = getSourceOption(source.sourceName);
return {
name: source.sourceName,
limit: Number(source.limit || 100),
search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined,
category: supportsCategories(sourceOption)
? source.category.trim() || undefined
: undefined,
};
});
const invalidSource = normalizedSources.find(
(source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0
@@ -212,7 +235,12 @@ const AutoScrapePage = () => {
{!isLoadingSources && sourceOptions.length > 0 && (
<div style={{ display: "flex", flexDirection: "column", gap: 10 }}>
{sourceConfigs.map((source, index) => (
{sourceConfigs.map((source, index) => {
const sourceOption = getSourceOption(source.sourceName);
const searchEnabled = supportsSearch(sourceOption);
const categoriesEnabled = supportsCategories(sourceOption);
return (
<div
key={`source-${index}`}
style={{
@@ -248,16 +276,26 @@ const AutoScrapePage = () => {
<input
type="text"
value={source.search}
placeholder="Search term (optional)"
placeholder={
searchEnabled
? "Search term (optional)"
: "Search not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!searchEnabled}
onChange={(event) => updateSourceConfig(index, "search", event.target.value)}
/>
<input
type="text"
value={source.category}
placeholder="Category (optional)"
placeholder={
categoriesEnabled
? "Category (optional)"
: "Categories not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!categoriesEnabled}
onChange={(event) => updateSourceConfig(index, "category", event.target.value)}
/>
@@ -271,7 +309,8 @@ const AutoScrapePage = () => {
</button>
)}
</div>
))}
);
})}
<button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}>
Add another source

View File

@@ -119,50 +119,82 @@ def get_user_datasets():
@app.route("/datasets/sources", methods=["GET"])
def get_dataset_sources():
return jsonify(get_connector_metadata())
list_metadata = list(get_connector_metadata().values())
return jsonify(list_metadata)
@app.route("/datasets/scrape", methods=["POST"])
@jwt_required()
def scrape_data():
data = request.get_json()
connector_metadata = get_connector_metadata()
# Strong validation needed, otherwise data goes to Celery and crashes silently
if not data or "sources" not in data:
return jsonify({"error": "Sources must be provided"}), 400
user_id = int(get_jwt_identity())
return jsonify({"error": "Sources must be provided"}), 400
if "name" not in data or not str(data["name"]).strip():
return jsonify({"error": "Dataset name is required"}), 400
dataset_name = data["name"].strip()
user_id = int(get_jwt_identity())
source_configs = data["sources"]
if not isinstance(source_configs, list) or len(source_configs) == 0:
return jsonify({"error": "Sources must be a non-empty list"}), 400
# Light Validation
for source in source_configs:
if not isinstance(source, dict):
return jsonify({"error": "Each source must be an object"}), 400
if "name" not in source:
return jsonify({"error": "Each source must contain a name"}), 400
if "limit" in source:
source["limit"] = int(source["limit"])
dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list)
dataset_manager.set_dataset_status(
dataset_id,
"fetching",
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
)
try:
fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list)
return jsonify(
{
"message": "Dataset queued for processing",
"dataset_id": dataset_id,
"status": "processing",
}
), 202
if "limit" in source:
try:
source["limit"] = int(source["limit"])
except (ValueError, TypeError):
return jsonify({"error": "Limit must be an integer"}), 400
name = source["name"]
if name not in connector_metadata:
return jsonify({"error": "Source not supported"}), 400
if "search" in source and not connector_metadata[name]["search_enabled"]:
return jsonify({"error": f"Source {name} does not support search"}), 400
if "category" in source and not connector_metadata[name]["categories_enabled"]:
return jsonify({"error": f"Source {name} does not support categories"}), 400
try:
dataset_id = dataset_manager.save_dataset_info(
user_id,
dataset_name,
default_topic_list
)
dataset_manager.set_dataset_status(
dataset_id,
"fetching",
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
)
fetch_and_process_dataset.delay(
dataset_id,
source_configs,
default_topic_list
)
except Exception:
print(traceback.format_exc())
return jsonify({"error": "An unexpected error occurred"}), 500
return jsonify({"error": "Failed to queue dataset processing"}), 500
return jsonify({
"message": "Dataset queued for processing",
"dataset_id": dataset_id,
"status": "processing"
}), 202
@app.route("/datasets/upload", methods=["POST"])
@jwt_required()

View File

@@ -7,6 +7,9 @@ class BaseConnector(ABC):
display_name: str # human-readable: "Reddit", "YouTube"
required_env: list[str] = [] # env vars needed to activate
search_enabled: bool
categories_enabled: bool
@classmethod
def is_available(cls) -> bool:
"""Returns True if all required env vars are set."""

View File

@@ -19,8 +19,11 @@ class BoardsAPI(BaseConnector):
source_name: str = "boards.ie"
display_name: str = "Boards.ie"
categories_enabled: bool = True
search_enabled: bool = False
def __init__(self):
self.url = "https://www.boards.ie"
self.base_url = "https://www.boards.ie"
def get_new_posts_by_search(self,
search: str,
@@ -31,9 +34,9 @@ class BoardsAPI(BaseConnector):
raise NotImplementedError("Search not compatible with boards.ie")
if category:
return self._get_posts(f"{self.url}/categories/{category}", post_limit)
return self._get_posts(f"{self.base_url}/categories/{category}", post_limit)
else:
return self._get_posts(f"{self.url}/discussions", post_limit)
return self._get_posts(f"{self.base_url}/discussions", post_limit)
## Private
def _get_posts(self, url, limit) -> list[Post]:
@@ -41,7 +44,7 @@ class BoardsAPI(BaseConnector):
current_page = 1
while len(urls) < limit:
url = f"{self.url}/p{current_page}"
url = f"{url}/p{current_page}"
html = self._fetch_page(url)
soup = BeautifulSoup(html, "html.parser")
@@ -145,7 +148,7 @@ class BoardsAPI(BaseConnector):
if next_link and next_link.get('href'):
href = next_link.get('href')
current_url = href if href.startswith('http') else self.url + href
current_url = href if href.startswith('http') else url + href
else:
current_url = None

View File

@@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__)
class RedditAPI(BaseConnector):
source_name = "reddit"
display_name = "Reddit"
source_name: str = "reddit"
display_name: str = "Reddit"
search_enabled: bool = True
categories_enabled: bool = True
def __init__(self):
self.url = "https://www.reddit.com/"

View File

@@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]:
def get_available_connectors() -> dict[str, type[BaseConnector]]:
    """Map each active connector's source name to its connector class.

    A connector counts as active when its ``is_available()`` check passes
    (per BaseConnector, all required environment variables are set).
    """
    available: dict[str, type[BaseConnector]] = {}
    for connector in _discover_connectors():
        if connector.is_available():
            available[connector.source_name] = connector
    return available
def get_connector_metadata() -> list[dict]:
return [
{"id": id, "label": obj.display_name}
for id, obj in get_available_connectors().items()
]
def get_connector_metadata() -> dict[str, dict]:
    """Return metadata for every available connector, keyed by source id.

    Each entry carries the fields the frontend needs to render a source
    option: the ``id`` itself, a human-readable ``label``, and the
    ``search_enabled`` / ``categories_enabled`` capability flags read
    from the connector class attributes.
    """
    # Dict comprehension instead of an incremental build; the loop
    # variable is renamed so it no longer shadows the builtin `id`.
    return {
        source_id: {
            "id": source_id,
            "label": connector.display_name,
            "search_enabled": connector.search_enabled,
            "categories_enabled": connector.categories_enabled,
        }
        for source_id, connector in get_available_connectors().items()
    }

View File

@@ -13,8 +13,10 @@ load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
class YouTubeAPI(BaseConnector):
source_name = "youtube"
display_name = "YouTube"
source_name: str = "youtube"
display_name: str = "YouTube"
search_enabled: bool = True
categories_enabled: bool = False
def __init__(self):
self.youtube = build('youtube', 'v3', developerKey=API_KEY)