feat(api): add support for custom topic lists when autoscraping

This commit is contained in:
2026-03-31 13:36:37 +01:00
parent e776ef53ac
commit 75fd042d74
2 changed files with 174 additions and 8 deletions

View File

@@ -22,6 +22,8 @@ type SourceConfig = {
category: string;
};
type TopicMap = Record<string, string>;
const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
sourceName,
limit: "100",
@@ -44,6 +46,8 @@ const AutoScrapePage = () => {
const [isLoadingSources, setIsLoadingSources] = useState(true);
const [isSubmitting, setIsSubmitting] = useState(false);
const [hasError, setHasError] = useState(false);
const [useCustomTopics, setUseCustomTopics] = useState(false);
const [customTopicsText, setCustomTopicsText] = useState("");
useEffect(() => {
axios
@@ -151,6 +155,88 @@ const AutoScrapePage = () => {
return;
}
let normalizedTopics: TopicMap | undefined;
if (useCustomTopics) {
const customTopicsJson = customTopicsText.trim();
if (!customTopicsJson) {
setHasError(true);
setReturnMessage(
"Custom topics are enabled, so please provide a JSON topic map.",
);
return;
}
let parsedTopics: unknown;
try {
parsedTopics = JSON.parse(customTopicsJson);
} catch {
setHasError(true);
setReturnMessage("Custom topic list must be valid JSON.");
return;
}
if (
!parsedTopics ||
Array.isArray(parsedTopics) ||
typeof parsedTopics !== "object"
) {
setHasError(true);
setReturnMessage(
"Custom topic list must be a JSON object: {\"Topic\": \"keywords\"}.",
);
return;
}
const entries = Object.entries(parsedTopics);
if (entries.length === 0) {
setHasError(true);
setReturnMessage("Custom topic list cannot be empty.");
return;
}
const hasInvalidTopic = entries.some(
([topicName, keywords]) =>
!topicName.trim() ||
typeof keywords !== "string" ||
!keywords.trim(),
);
if (hasInvalidTopic) {
setHasError(true);
setReturnMessage(
"Every custom topic must have a non-empty name and keyword string.",
);
return;
}
normalizedTopics = Object.fromEntries(
entries.map(([topicName, keywords]) => [
topicName.trim(),
String(keywords).trim(),
]),
);
}
const requestBody: {
name: string;
sources: Array<{
name: string;
limit: number;
search?: string;
category?: string;
}>;
topics?: TopicMap;
} = {
name: normalizedDatasetName,
sources: normalizedSources,
};
if (normalizedTopics) {
requestBody.topics = normalizedTopics;
}
try {
setIsSubmitting(true);
setHasError(false);
@@ -158,10 +244,7 @@ const AutoScrapePage = () => {
const response = await axios.post(
`${API_BASE_URL}/datasets/scrape`,
{
name: normalizedDatasetName,
sources: normalizedSources,
},
requestBody,
{
headers: {
Authorization: `Bearer ${token}`,
@@ -381,6 +464,52 @@ const AutoScrapePage = () => {
</div>
)}
</div>
<div style={{ ...styles.card, gridColumn: "auto" }}>
<h2 style={{ ...styles.sectionTitle, color: "#24292f" }}>
Topic List
</h2>
<p style={styles.sectionSubtitle}>
Use the default topic list, or provide your own JSON topic map.
</p>
<label
style={{
display: "flex",
alignItems: "center",
gap: 8,
fontSize: 14,
color: "#24292f",
marginBottom: 10,
}}
>
<input
type="checkbox"
checked={useCustomTopics}
onChange={(event) => setUseCustomTopics(event.target.checked)}
/>
Use custom topic list
</label>
<textarea
value={customTopicsText}
onChange={(event) => setCustomTopicsText(event.target.value)}
disabled={!useCustomTopics}
placeholder='{"Politics": "election, policy, government", "Housing": "rent, landlords, tenancy"}'
style={{
...styles.input,
...styles.inputFullWidth,
minHeight: 170,
resize: "vertical",
fontFamily:
'"IBM Plex Mono", "Fira Code", "JetBrains Mono", monospace',
}}
/>
<p style={styles.subtleBodyText}>
Format: JSON object where each key is a topic and each value is a
keyword string.
</p>
</div>
</div>
<div

View File

@@ -55,6 +55,27 @@ with open("server/topics.json") as f:
default_topic_list = json.load(f)
def normalize_topics(topics):
    """Validate and clean a user-supplied topic map.

    Accepts the raw ``topics`` payload and returns a new dict with
    whitespace-stripped topic names mapped to whitespace-stripped keyword
    strings. Returns ``None`` (caller treats as a 400) when the payload is
    not a non-empty dict, or when any entry has a non-string or
    effectively-empty name/keyword value.
    """
    if not isinstance(topics, dict) or not topics:
        return None
    pairs = list(topics.items())
    # Reject the whole map if any single entry is malformed.
    if any(
        not isinstance(name, str)
        or not isinstance(keywords, str)
        or not name.strip()
        or not keywords.strip()
        for name, keywords in pairs
    ):
        return None
    return {name.strip(): keywords.strip() for name, keywords in pairs}
@app.route("/register", methods=["POST"])
def register_user():
data = request.get_json()
@@ -146,6 +167,8 @@ def scrape_data():
dataset_name = data["name"].strip()
user_id = int(get_jwt_identity())
custom_topics = data.get("topics")
topics_for_processing = default_topic_list
source_configs = data["sources"]
@@ -182,12 +205,26 @@ def scrape_data():
if category and not connector_metadata[name]["categories_enabled"]:
return jsonify({"error": f"Source {name} does not support categories"}), 400
if category and not connectors[name]().category_exists(category):
return jsonify({"error": f"Category does not exist for {name}"}), 400
# if category and not connectors[name]().category_exists(category):
# return jsonify({"error": f"Category does not exist for {name}"}), 400
if custom_topics is not None:
normalized_topics = normalize_topics(custom_topics)
if not normalized_topics:
return (
jsonify(
{
"error": "Topics must be a non-empty JSON object with non-empty string keys and values"
}
),
400,
)
topics_for_processing = normalized_topics
try:
dataset_id = dataset_manager.save_dataset_info(
user_id, dataset_name, default_topic_list
user_id, dataset_name, topics_for_processing
)
dataset_manager.set_dataset_status(
@@ -196,7 +233,7 @@ def scrape_data():
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
)
fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list)
fetch_and_process_dataset.delay(dataset_id, source_configs, topics_for_processing)
except Exception:
print(traceback.format_exc())
return jsonify({"error": "Failed to queue dataset processing"}), 500