Compare commits

..

4 Commits

Author SHA1 Message Date
d96f459104 fix(connectors): update URL references to use base_url in BoardsAPI 2026-03-13 21:59:17 +00:00
162a4de64e fix(frontend): detect which sources support category or search 2026-03-12 10:07:28 +00:00
6684780d23 fix(connectors): add stronger validation to scrape endpoint
Strong validation is needed; otherwise data goes to Celery and crashes silently. In addition, it checks whether the specific source supports search or categories.
2026-03-12 09:59:07 +00:00
c12f1b4371 chore(connectors): add category and search validation fields 2026-03-12 09:56:34 +00:00
7 changed files with 136 additions and 50 deletions

View File

@@ -9,6 +9,10 @@ const API_BASE_URL = import.meta.env.VITE_BACKEND_URL;
type SourceOption = { type SourceOption = {
id: string; id: string;
label: string; label: string;
search_enabled?: boolean;
categories_enabled?: boolean;
searchEnabled?: boolean;
categoriesEnabled?: boolean;
}; };
type SourceConfig = { type SourceConfig = {
@@ -25,6 +29,12 @@ const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
category: "", category: "",
}); });
const supportsSearch = (source?: SourceOption): boolean =>
Boolean(source?.search_enabled ?? source?.searchEnabled);
const supportsCategories = (source?: SourceOption): boolean =>
Boolean(source?.categories_enabled ?? source?.categoriesEnabled);
const AutoScrapePage = () => { const AutoScrapePage = () => {
const navigate = useNavigate(); const navigate = useNavigate();
const [datasetName, setDatasetName] = useState(""); const [datasetName, setDatasetName] = useState("");
@@ -63,11 +73,18 @@ const AutoScrapePage = () => {
const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => { const updateSourceConfig = (index: number, field: keyof SourceConfig, value: string) => {
setSourceConfigs((previous) => setSourceConfigs((previous) =>
previous.map((config, configIndex) => previous.map((config, configIndex) =>
configIndex === index ? { ...config, [field]: value } : config configIndex === index
? field === "sourceName"
? { ...config, sourceName: value, search: "", category: "" }
: { ...config, [field]: value }
: config
) )
); );
}; };
const getSourceOption = (sourceName: string) =>
sourceOptions.find((option) => option.id === sourceName);
const addSourceConfig = () => { const addSourceConfig = () => {
setSourceConfigs((previous) => [ setSourceConfigs((previous) => [
...previous, ...previous,
@@ -100,12 +117,18 @@ const AutoScrapePage = () => {
return; return;
} }
const normalizedSources = sourceConfigs.map((source) => ({ const normalizedSources = sourceConfigs.map((source) => {
name: source.sourceName, const sourceOption = getSourceOption(source.sourceName);
limit: Number(source.limit || 100),
search: source.search.trim() || undefined, return {
category: source.category.trim() || undefined, name: source.sourceName,
})); limit: Number(source.limit || 100),
search: supportsSearch(sourceOption) ? source.search.trim() || undefined : undefined,
category: supportsCategories(sourceOption)
? source.category.trim() || undefined
: undefined,
};
});
const invalidSource = normalizedSources.find( const invalidSource = normalizedSources.find(
(source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0 (source) => !source.name || !Number.isFinite(source.limit) || source.limit <= 0
@@ -212,7 +235,12 @@ const AutoScrapePage = () => {
{!isLoadingSources && sourceOptions.length > 0 && ( {!isLoadingSources && sourceOptions.length > 0 && (
<div style={{ display: "flex", flexDirection: "column", gap: 10 }}> <div style={{ display: "flex", flexDirection: "column", gap: 10 }}>
{sourceConfigs.map((source, index) => ( {sourceConfigs.map((source, index) => {
const sourceOption = getSourceOption(source.sourceName);
const searchEnabled = supportsSearch(sourceOption);
const categoriesEnabled = supportsCategories(sourceOption);
return (
<div <div
key={`source-${index}`} key={`source-${index}`}
style={{ style={{
@@ -248,16 +276,26 @@ const AutoScrapePage = () => {
<input <input
type="text" type="text"
value={source.search} value={source.search}
placeholder="Search term (optional)" placeholder={
searchEnabled
? "Search term (optional)"
: "Search not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }} style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!searchEnabled}
onChange={(event) => updateSourceConfig(index, "search", event.target.value)} onChange={(event) => updateSourceConfig(index, "search", event.target.value)}
/> />
<input <input
type="text" type="text"
value={source.category} value={source.category}
placeholder="Category (optional)" placeholder={
categoriesEnabled
? "Category (optional)"
: "Categories not supported for this source"
}
style={{ ...styles.input, ...styles.inputFullWidth }} style={{ ...styles.input, ...styles.inputFullWidth }}
disabled={!categoriesEnabled}
onChange={(event) => updateSourceConfig(index, "category", event.target.value)} onChange={(event) => updateSourceConfig(index, "category", event.target.value)}
/> />
@@ -271,7 +309,8 @@ const AutoScrapePage = () => {
</button> </button>
)} )}
</div> </div>
))} );
})}
<button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}> <button type="button" style={styles.buttonSecondary} onClick={addSourceConfig}>
Add another source Add another source

View File

@@ -119,50 +119,82 @@ def get_user_datasets():
@app.route("/datasets/sources", methods=["GET"]) @app.route("/datasets/sources", methods=["GET"])
def get_dataset_sources(): def get_dataset_sources():
return jsonify(get_connector_metadata()) list_metadata = list(get_connector_metadata().values())
return jsonify(list_metadata)
@app.route("/datasets/scrape", methods=["POST"]) @app.route("/datasets/scrape", methods=["POST"])
@jwt_required() @jwt_required()
def scrape_data(): def scrape_data():
data = request.get_json() data = request.get_json()
connector_metadata = get_connector_metadata()
# Strong validation needed, otherwise data goes to Celery and crashes silently
if not data or "sources" not in data: if not data or "sources" not in data:
return jsonify({"error": "Sources must be provided"}), 400 return jsonify({"error": "Sources must be provided"}), 400
if "name" not in data or not str(data["name"]).strip():
return jsonify({"error": "Dataset name is required"}), 400
user_id = int(get_jwt_identity())
dataset_name = data["name"].strip() dataset_name = data["name"].strip()
user_id = int(get_jwt_identity())
source_configs = data["sources"] source_configs = data["sources"]
if not isinstance(source_configs, list) or len(source_configs) == 0: if not isinstance(source_configs, list) or len(source_configs) == 0:
return jsonify({"error": "Sources must be a non-empty list"}), 400 return jsonify({"error": "Sources must be a non-empty list"}), 400
# Light Validation
for source in source_configs: for source in source_configs:
if not isinstance(source, dict):
return jsonify({"error": "Each source must be an object"}), 400
if "name" not in source: if "name" not in source:
return jsonify({"error": "Each source must contain a name"}), 400 return jsonify({"error": "Each source must contain a name"}), 400
if "limit" in source:
source["limit"] = int(source["limit"])
dataset_id = dataset_manager.save_dataset_info(user_id, dataset_name, default_topic_list) if "limit" in source:
dataset_manager.set_dataset_status( try:
dataset_id, source["limit"] = int(source["limit"])
"fetching", except (ValueError, TypeError):
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}" return jsonify({"error": "Limit must be an integer"}), 400
)
name = source["name"]
if name not in connector_metadata:
return jsonify({"error": "Source not supported"}), 400
if "search" in source and not connector_metadata[name]["search_enabled"]:
return jsonify({"error": f"Source {name} does not support search"}), 400
if "category" in source and not connector_metadata[name]["categories_enabled"]:
return jsonify({"error": f"Source {name} does not support categories"}), 400
try: try:
fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list) dataset_id = dataset_manager.save_dataset_info(
user_id,
dataset_name,
default_topic_list
)
return jsonify( dataset_manager.set_dataset_status(
{ dataset_id,
"message": "Dataset queued for processing", "fetching",
"dataset_id": dataset_id, f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}"
"status": "processing", )
}
), 202 fetch_and_process_dataset.delay(
dataset_id,
source_configs,
default_topic_list
)
except Exception: except Exception:
print(traceback.format_exc()) print(traceback.format_exc())
return jsonify({"error": "An unexpected error occurred"}), 500 return jsonify({"error": "Failed to queue dataset processing"}), 500
return jsonify({
"message": "Dataset queued for processing",
"dataset_id": dataset_id,
"status": "processing"
}), 202
@app.route("/datasets/upload", methods=["POST"]) @app.route("/datasets/upload", methods=["POST"])
@jwt_required() @jwt_required()

View File

@@ -7,6 +7,9 @@ class BaseConnector(ABC):
display_name: str # human-readable: "Reddit", "YouTube" display_name: str # human-readable: "Reddit", "YouTube"
required_env: list[str] = [] # env vars needed to activate required_env: list[str] = [] # env vars needed to activate
search_enabled: bool
categories_enabled: bool
@classmethod @classmethod
def is_available(cls) -> bool: def is_available(cls) -> bool:
"""Returns True if all required env vars are set.""" """Returns True if all required env vars are set."""

View File

@@ -19,8 +19,11 @@ class BoardsAPI(BaseConnector):
source_name: str = "boards.ie" source_name: str = "boards.ie"
display_name: str = "Boards.ie" display_name: str = "Boards.ie"
categories_enabled: bool = True
search_enabled: bool = False
def __init__(self): def __init__(self):
self.url = "https://www.boards.ie" self.base_url = "https://www.boards.ie"
def get_new_posts_by_search(self, def get_new_posts_by_search(self,
search: str, search: str,
@@ -31,9 +34,9 @@ class BoardsAPI(BaseConnector):
raise NotImplementedError("Search not compatible with boards.ie") raise NotImplementedError("Search not compatible with boards.ie")
if category: if category:
return self._get_posts(f"{self.url}/categories/{category}", post_limit) return self._get_posts(f"{self.base_url}/categories/{category}", post_limit)
else: else:
return self._get_posts(f"{self.url}/discussions", post_limit) return self._get_posts(f"{self.base_url}/discussions", post_limit)
## Private ## Private
def _get_posts(self, url, limit) -> list[Post]: def _get_posts(self, url, limit) -> list[Post]:
@@ -41,7 +44,7 @@ class BoardsAPI(BaseConnector):
current_page = 1 current_page = 1
while len(urls) < limit: while len(urls) < limit:
url = f"{self.url}/p{current_page}" url = f"{url}/p{current_page}"
html = self._fetch_page(url) html = self._fetch_page(url)
soup = BeautifulSoup(html, "html.parser") soup = BeautifulSoup(html, "html.parser")
@@ -145,7 +148,7 @@ class BoardsAPI(BaseConnector):
if next_link and next_link.get('href'): if next_link and next_link.get('href'):
href = next_link.get('href') href = next_link.get('href')
current_url = href if href.startswith('http') else self.url + href current_url = href if href.startswith('http') else url + href
else: else:
current_url = None current_url = None

View File

@@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class RedditAPI(BaseConnector): class RedditAPI(BaseConnector):
source_name = "reddit" source_name: str = "reddit"
display_name = "Reddit" display_name: str = "Reddit"
search_enabled: bool = True
categories_enabled: bool = True
def __init__(self): def __init__(self):
self.url = "https://www.reddit.com/" self.url = "https://www.reddit.com/"

View File

@@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]:
def get_available_connectors() -> dict[str, type[BaseConnector]]: def get_available_connectors() -> dict[str, type[BaseConnector]]:
return {c.source_name: c for c in _discover_connectors() if c.is_available()} return {c.source_name: c for c in _discover_connectors() if c.is_available()}
def get_connector_metadata() -> list[dict]: def get_connector_metadata() -> dict[str, dict]:
return [ res = {}
{"id": id, "label": obj.display_name} for id, obj in get_available_connectors().items():
for id, obj in get_available_connectors().items() res[id] = {"id": id,
] "label": obj.display_name,
"search_enabled": obj.search_enabled,
"categories_enabled": obj.categories_enabled
}
return res

View File

@@ -13,8 +13,10 @@ load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY") API_KEY = os.getenv("YOUTUBE_API_KEY")
class YouTubeAPI(BaseConnector): class YouTubeAPI(BaseConnector):
source_name = "youtube" source_name: str = "youtube"
display_name = "YouTube" display_name: str = "YouTube"
search_enabled: bool = True
categories_enabled: bool = False
def __init__(self): def __init__(self):
self.youtube = build('youtube', 'v3', developerKey=API_KEY) self.youtube = build('youtube', 'v3', developerKey=API_KEY)