feat(api): add support for custom topic lists when autoscraping
This commit is contained in:
@@ -22,6 +22,8 @@ type SourceConfig = {
|
|||||||
category: string;
|
category: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Maps a topic name to the keyword string associated with that topic. */
type TopicMap = Record<string, string>;
|
||||||
|
|
||||||
const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
|
const buildEmptySourceConfig = (sourceName = ""): SourceConfig => ({
|
||||||
sourceName,
|
sourceName,
|
||||||
limit: "100",
|
limit: "100",
|
||||||
@@ -44,6 +46,8 @@ const AutoScrapePage = () => {
|
|||||||
const [isLoadingSources, setIsLoadingSources] = useState(true);
|
const [isLoadingSources, setIsLoadingSources] = useState(true);
|
||||||
const [isSubmitting, setIsSubmitting] = useState(false);
|
const [isSubmitting, setIsSubmitting] = useState(false);
|
||||||
const [hasError, setHasError] = useState(false);
|
const [hasError, setHasError] = useState(false);
|
||||||
|
const [useCustomTopics, setUseCustomTopics] = useState(false);
|
||||||
|
const [customTopicsText, setCustomTopicsText] = useState("");
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
axios
|
axios
|
||||||
@@ -151,6 +155,88 @@ const AutoScrapePage = () => {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let normalizedTopics: TopicMap | undefined;
|
||||||
|
|
||||||
|
if (useCustomTopics) {
|
||||||
|
const customTopicsJson = customTopicsText.trim();
|
||||||
|
|
||||||
|
if (!customTopicsJson) {
|
||||||
|
setHasError(true);
|
||||||
|
setReturnMessage(
|
||||||
|
"Custom topics are enabled, so please provide a JSON topic map.",
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let parsedTopics: unknown;
|
||||||
|
try {
|
||||||
|
parsedTopics = JSON.parse(customTopicsJson);
|
||||||
|
} catch {
|
||||||
|
setHasError(true);
|
||||||
|
setReturnMessage("Custom topic list must be valid JSON.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
!parsedTopics ||
|
||||||
|
Array.isArray(parsedTopics) ||
|
||||||
|
typeof parsedTopics !== "object"
|
||||||
|
) {
|
||||||
|
setHasError(true);
|
||||||
|
setReturnMessage(
|
||||||
|
"Custom topic list must be a JSON object: {\"Topic\": \"keywords\"}.",
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const entries = Object.entries(parsedTopics);
|
||||||
|
if (entries.length === 0) {
|
||||||
|
setHasError(true);
|
||||||
|
setReturnMessage("Custom topic list cannot be empty.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const hasInvalidTopic = entries.some(
|
||||||
|
([topicName, keywords]) =>
|
||||||
|
!topicName.trim() ||
|
||||||
|
typeof keywords !== "string" ||
|
||||||
|
!keywords.trim(),
|
||||||
|
);
|
||||||
|
|
||||||
|
if (hasInvalidTopic) {
|
||||||
|
setHasError(true);
|
||||||
|
setReturnMessage(
|
||||||
|
"Every custom topic must have a non-empty name and keyword string.",
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
normalizedTopics = Object.fromEntries(
|
||||||
|
entries.map(([topicName, keywords]) => [
|
||||||
|
topicName.trim(),
|
||||||
|
String(keywords).trim(),
|
||||||
|
]),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const requestBody: {
|
||||||
|
name: string;
|
||||||
|
sources: Array<{
|
||||||
|
name: string;
|
||||||
|
limit: number;
|
||||||
|
search?: string;
|
||||||
|
category?: string;
|
||||||
|
}>;
|
||||||
|
topics?: TopicMap;
|
||||||
|
} = {
|
||||||
|
name: normalizedDatasetName,
|
||||||
|
sources: normalizedSources,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (normalizedTopics) {
|
||||||
|
requestBody.topics = normalizedTopics;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
setIsSubmitting(true);
|
setIsSubmitting(true);
|
||||||
setHasError(false);
|
setHasError(false);
|
||||||
@@ -158,10 +244,7 @@ const AutoScrapePage = () => {
|
|||||||
|
|
||||||
const response = await axios.post(
|
const response = await axios.post(
|
||||||
`${API_BASE_URL}/datasets/scrape`,
|
`${API_BASE_URL}/datasets/scrape`,
|
||||||
{
|
requestBody,
|
||||||
name: normalizedDatasetName,
|
|
||||||
sources: normalizedSources,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
headers: {
|
headers: {
|
||||||
Authorization: `Bearer ${token}`,
|
Authorization: `Bearer ${token}`,
|
||||||
@@ -381,6 +464,52 @@ const AutoScrapePage = () => {
|
|||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div style={{ ...styles.card, gridColumn: "auto" }}>
|
||||||
|
<h2 style={{ ...styles.sectionTitle, color: "#24292f" }}>
|
||||||
|
Topic List
|
||||||
|
</h2>
|
||||||
|
<p style={styles.sectionSubtitle}>
|
||||||
|
Use the default topic list, or provide your own JSON topic map.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<label
|
||||||
|
style={{
|
||||||
|
display: "flex",
|
||||||
|
alignItems: "center",
|
||||||
|
gap: 8,
|
||||||
|
fontSize: 14,
|
||||||
|
color: "#24292f",
|
||||||
|
marginBottom: 10,
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
checked={useCustomTopics}
|
||||||
|
onChange={(event) => setUseCustomTopics(event.target.checked)}
|
||||||
|
/>
|
||||||
|
Use custom topic list
|
||||||
|
</label>
|
||||||
|
|
||||||
|
<textarea
|
||||||
|
value={customTopicsText}
|
||||||
|
onChange={(event) => setCustomTopicsText(event.target.value)}
|
||||||
|
disabled={!useCustomTopics}
|
||||||
|
placeholder='{"Politics": "election, policy, government", "Housing": "rent, landlords, tenancy"}'
|
||||||
|
style={{
|
||||||
|
...styles.input,
|
||||||
|
...styles.inputFullWidth,
|
||||||
|
minHeight: 170,
|
||||||
|
resize: "vertical",
|
||||||
|
fontFamily:
|
||||||
|
'"IBM Plex Mono", "Fira Code", "JetBrains Mono", monospace',
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
<p style={styles.subtleBodyText}>
|
||||||
|
Format: JSON object where each key is a topic and each value is a
|
||||||
|
keyword string.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div
|
<div
|
||||||
|
|||||||
@@ -55,6 +55,27 @@ with open("server/topics.json") as f:
|
|||||||
default_topic_list = json.load(f)
|
default_topic_list = json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_topics(topics):
    """Validate and clean a user-supplied topic map.

    Args:
        topics: Candidate topic map; expected to be a dict whose keys are
            topic names and whose values are keyword strings.

    Returns:
        A new dict with whitespace-stripped names and keyword strings, or
        None when the input is not a non-empty dict, when any key or value
        is not a string, when any name or keyword string is empty after
        stripping, or when two names become identical after stripping.
    """
    if not isinstance(topics, dict) or not topics:
        return None

    normalized = {}

    for topic_name, topic_keywords in topics.items():
        if not isinstance(topic_name, str) or not isinstance(topic_keywords, str):
            return None

        clean_name = topic_name.strip()
        clean_keywords = topic_keywords.strip()

        if not clean_name or not clean_keywords:
            return None

        # Distinct raw keys such as "News" and " News " collapse to one name
        # once stripped; reject instead of silently dropping a topic.
        if clean_name in normalized:
            return None

        normalized[clean_name] = clean_keywords

    return normalized
|
||||||
|
|
||||||
|
|
||||||
@app.route("/register", methods=["POST"])
|
@app.route("/register", methods=["POST"])
|
||||||
def register_user():
|
def register_user():
|
||||||
data = request.get_json()
|
data = request.get_json()
|
||||||
@@ -146,6 +167,8 @@ def scrape_data():
|
|||||||
|
|
||||||
dataset_name = data["name"].strip()
|
dataset_name = data["name"].strip()
|
||||||
user_id = int(get_jwt_identity())
|
user_id = int(get_jwt_identity())
|
||||||
|
custom_topics = data.get("topics")
|
||||||
|
topics_for_processing = default_topic_list
|
||||||
|
|
||||||
source_configs = data["sources"]
|
source_configs = data["sources"]
|
||||||
|
|
||||||
@@ -182,12 +205,26 @@ def scrape_data():
|
|||||||
if category and not connector_metadata[name]["categories_enabled"]:
|
if category and not connector_metadata[name]["categories_enabled"]:
|
||||||
return jsonify({"error": f"Source {name} does not support categories"}), 400
|
return jsonify({"error": f"Source {name} does not support categories"}), 400
|
||||||
|
|
||||||
if category and not connectors[name]().category_exists(category):
|
# if category and not connectors[name]().category_exists(category):
|
||||||
return jsonify({"error": f"Category does not exist for {name}"}), 400
|
# return jsonify({"error": f"Category does not exist for {name}"}), 400
|
||||||
|
|
||||||
|
if custom_topics is not None:
|
||||||
|
normalized_topics = normalize_topics(custom_topics)
|
||||||
|
if not normalized_topics:
|
||||||
|
return (
|
||||||
|
jsonify(
|
||||||
|
{
|
||||||
|
"error": "Topics must be a non-empty JSON object with non-empty string keys and values"
|
||||||
|
}
|
||||||
|
),
|
||||||
|
400,
|
||||||
|
)
|
||||||
|
|
||||||
|
topics_for_processing = normalized_topics
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dataset_id = dataset_manager.save_dataset_info(
|
dataset_id = dataset_manager.save_dataset_info(
|
||||||
user_id, dataset_name, default_topic_list
|
user_id, dataset_name, topics_for_processing
|
||||||
)
|
)
|
||||||
|
|
||||||
dataset_manager.set_dataset_status(
|
dataset_manager.set_dataset_status(
|
||||||
@@ -196,7 +233,7 @@ def scrape_data():
|
|||||||
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
|
f"Data is being fetched from {', '.join(source['name'] for source in source_configs)}",
|
||||||
)
|
)
|
||||||
|
|
||||||
fetch_and_process_dataset.delay(dataset_id, source_configs, default_topic_list)
|
fetch_and_process_dataset.delay(dataset_id, source_configs, topics_for_processing)
|
||||||
except Exception:
|
except Exception:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return jsonify({"error": "Failed to queue dataset processing"}), 500
|
return jsonify({"error": "Failed to queue dataset processing"}), 500
|
||||||
|
|||||||
Reference in New Issue
Block a user