feat(api): add support for custom topic lists when autoscraping

2026-03-31 13:36:37 +01:00
parent e776ef53ac
commit 75fd042d74
2 changed files with 174 additions and 8 deletions
--- a/frontend/src/pages/AutoScrape.tsx
+++ b/frontend/src/pages/AutoScrape.tsx
@@ -22,6 +22,8 @@ type SourceConfig = {
  category: string;
 };
 type TopicMap = Record<string, string>;
 const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
  sourceName,
  limit: "100",
@@ -44,6 +46,8 @@ const AutoScrapePage = () => {
  const [isLoadingSources, setIsLoadingSources] = useState(true);
  const [isSubmitting, setIsSubmitting] = useState(false);
  const [hasError, setHasError] = useState(false);
  const [useCustomTopics, setUseCustomTopics] = useState(false);
  const [customTopicsText, setCustomTopicsText] = useState("");
  useEffect(() => {
    axios
@@ -151,6 +155,88 @@ const AutoScrapePage = () => {
      return;
    }
    let normalizedTopics: TopicMap | undefined;
    if (useCustomTopics) {
      const customTopicsJson = customTopicsText.trim();
      if (!customTopicsJson) {
        setHasError(true);
        setReturnMessage(
          "Custom topics are enabled, so please provide a JSON topic map.",
        );
        return;
      }
      let parsedTopics: unknown;
      try {
        parsedTopics = JSON.parse(customTopicsJson);
      } catch {
        setHasError(true);
        setReturnMessage("Custom topic list must be valid JSON.");
        return;
      }
      if (
        !parsedTopics ||
        Array.isArray(parsedTopics) ||
        typeof parsedTopics !== "object"
      ) {
        setHasError(true);
        setReturnMessage(
          "Custom topic list must be a JSON object: {\"Topic\": \"keywords\"}.",
        );
        return;
      }
      const entries = Object.entries(parsedTopics);
      if (entries.length === 0) {
        setHasError(true);
        setReturnMessage("Custom topic list cannot be empty.");
        return;
      }
      const hasInvalidTopic = entries.some(
        ([topicName, keywords]) =>
          !topicName.trim() ||
          typeof keywords !== "string" ||
          !keywords.trim(),
      );
      if (hasInvalidTopic) {
        setHasError(true);
        setReturnMessage(
          "Every custom topic must have a non-empty name and keyword string.",
        );
        return;
      }
      normalizedTopics = Object.fromEntries(
        entries.map(([topicName, keywords]) => [
          topicName.trim(),
          String(keywords).trim(),
        ]),
      );
    }
    const requestBody: {
      name: string;
      sources: Array<{
        name: string;
        limit: number;
        search?: string;
        category?: string;
      }>;
      topics?: TopicMap;
    } = {
      name: normalizedDatasetName,
      sources: normalizedSources,
    };
    if (normalizedTopics) {
      requestBody.topics = normalizedTopics;
    }
    try {
      setIsSubmitting(true);
      setHasError(false);
@@ -158,10 +244,7 @@ const AutoScrapePage = () => {
      const response = await axios.post(
        `${API_BASE_URL}/datasets/scrape`,
-        {
+        requestBody,
          name: normalizedDatasetName,
          sources: normalizedSources,
        },
        {
          headers: {
            Authorization: `Bearer ${token}`,
@@ -381,6 +464,52 @@ const AutoScrapePage = () => {
              </div>
            )}
          </div>
          <div style={{ ...styles.card, gridColumn: "auto" }}>
            <h2 style={{ ...styles.sectionTitle, color: "#24292f" }}>
              Topic List
            </h2>
            <p style={styles.sectionSubtitle}>
              Use the default topic list, or provide your own JSON topic map.
            </p>
            <label
              style={{
                display: "flex",
                alignItems: "center",
                gap: 8,
                fontSize: 14,
                color: "#24292f",
                marginBottom: 10,
              }}
            >
              <input
                type="checkbox"
                checked={useCustomTopics}
                onChange={(event) => setUseCustomTopics(event.target.checked)}
              />
              Use custom topic list
            </label>
            <textarea
              value={customTopicsText}
              onChange={(event) => setCustomTopicsText(event.target.value)}
              disabled={!useCustomTopics}
              placeholder='{"Politics": "election, policy, government", "Housing": "rent, landlords, tenancy"}'
              style={{
                ...styles.input,
                ...styles.inputFullWidth,
                minHeight: 170,
                resize: "vertical",
                fontFamily:
                  '"IBM Plex Mono", "Fira Code", "JetBrains Mono", monospace',
              }}
            />
            <p style={styles.subtleBodyText}>
              Format: JSON object where each key is a topic and each value is a
              keyword string.
            </p>
          </div>
        </div>
        <div
--- a/server/app.py
+++ b/server/app.py
@@ -55,6 +55,27 @@ with open("server/topics.json") as f:
    default_topic_list = json.load(f)
 def normalize_topics(topics):
    if not isinstance(topics, dict) or len(topics) == 0:
        return None
    normalized = {}
    for topic_name, topic_keywords in topics.items():
        if not isinstance(topic_name, str) or not isinstance(topic_keywords, str):
            return None
        clean_name = topic_name.strip()
        clean_keywords = topic_keywords.strip()
        if not clean_name or not clean_keywords:
            return None
        normalized[clean_name] = clean_keywords
    return normalized
@app.route("/register", methods=["POST"])
 def register_user():
    data = request.get_json()
@@ -146,6 +167,8 @@ def scrape_data():
    dataset_name = data["name"].strip()
    user_id = int(get_jwt_identity())
    custom_topics = data.get("topics")
    topics_for_processing = default_topic_list
    source_configs = data["sources"]
@@ -182,12 +205,26 @@ def scrape_data():
        if category and not connector_metadata[name]["categories_enabled"]:
            return jsonify({"error": f"Source {name} does not support categories"}), 400
-        if category and not connectors[name]().category_exists(category):
+        # if category and not connectors[name]().category_exists(category):
-            return jsonify({"error": f"Category does not exist for {name}"}), 400
+        #     return jsonify({"error": f"Category does not exist for {name}"}), 400
    if custom_topics is not None:
        normalized_topics = normalize_topics(custom_topics)
        if not normalized_topics:
            return (
                jsonify(
                    {
                        "error": "Topics must be a non-empty JSON object with non-empty string keys and values"
                    }
                ),
                400,
            )
        topics_for_processing = normalized_topics
    try:
        dataset_id = dataset_manager.save_dataset_info(
-            user_id, dataset_name, default_topic_list
+            user_id, dataset_name, topics_for_processing
        )
        dataset_manager.set_dataset_status(
@@ -196,7 +233,7 @@ def scrape_data():
            f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
        )
-        fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list)
+        fetch_and_process_dataset.delay(dataset_id, source_configs, topics_for_processing)
    except Exception:
        print(traceback.format_exc())
        return jsonify({"error": "Failed to queue dataset processing"}), 500