Automatic Scraping of dataset options #9

Merged
dylan merged 36 commits from feat/automatic-scraping-datasets into main 2026-03-14 21:58:49 +00:00
5 changed files with 24 additions and 9 deletions
Showing only changes of commit c12f1b4371 - Show all commits

View File

@@ -7,6 +7,9 @@ class BaseConnector(ABC):
     display_name: str # human-readable: "Reddit", "YouTube"
     required_env: list[str] = [] # env vars needed to activate
+    search_enabled: bool
+    categories_enabled: bool
     @classmethod
     def is_available(cls) -> bool:
         """Returns True if all required env vars are set."""

View File

@@ -19,6 +19,9 @@ class BoardsAPI(BaseConnector):
     source_name: str = "boards.ie"
     display_name: str = "Boards.ie"
+    categories_enabled: bool = True
+    search_enabled: bool = False
     def __init__(self):
         self.url = "https://www.boards.ie"

View File

@@ -10,8 +10,10 @@ from server.connectors.base import BaseConnector
 logger = logging.getLogger(__name__)
 class RedditAPI(BaseConnector):
-    source_name = "reddit"
-    display_name = "Reddit"
+    source_name: str = "reddit"
+    display_name: str = "Reddit"
+    search_enabled: bool = True
+    categories_enabled: bool = True
     def __init__(self):
         self.url = "https://www.reddit.com/"

View File

@@ -18,8 +18,13 @@ def _discover_connectors() -> list[type[BaseConnector]]:
 def get_available_connectors() -> dict[str, type[BaseConnector]]:
     return {c.source_name: c for c in _discover_connectors() if c.is_available()}
-def get_connector_metadata() -> list[dict]:
-    return [
-        {"id": id, "label": obj.display_name}
-        for id, obj in get_available_connectors().items()
-    ]
+def get_connector_metadata() -> dict[str, dict]:
+    res = {}
+    for id, obj in get_available_connectors().items():
+        res[id] = {"id": id,
+                   "label": obj.display_name,
+                   "search_enabled": obj.search_enabled,
+                   "categories_enabled": obj.categories_enabled
+                   }
+    return res

View File

@@ -13,8 +13,10 @@ load_dotenv()
 API_KEY = os.getenv("YOUTUBE_API_KEY")
 class YouTubeAPI(BaseConnector):
-    source_name = "youtube"
-    display_name = "YouTube"
+    source_name: str = "youtube"
+    display_name: str = "YouTube"
+    search_enabled: bool = True
+    categories_enabled: bool = False
     def __init__(self):
         self.youtube = build('youtube', 'v3', developerKey=API_KEY)