Automatic Scraping of dataset options #9
@@ -21,6 +21,7 @@ from server.core.auth import AuthManager
|
||||
from server.core.datasets import DatasetManager
|
||||
from server.utils import get_request_filters
|
||||
from server.queue.tasks import process_dataset
|
||||
from server.connectors.registry import get_connector_metadata
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@@ -112,9 +113,8 @@ def get_user_datasets():
|
||||
return jsonify(dataset_manager.get_user_datasets(current_user)), 200
|
||||
|
||||
@app.route("/datasets/sources", methods=["GET"])
|
||||
@jwt_required()
|
||||
def get_dataset_sources():
|
||||
return jsonify({""})
|
||||
return jsonify(get_connector_metadata())
|
||||
|
||||
@app.route("/datasets/scrape", methods=["POST"])
|
||||
@jwt_required()
|
||||
|
||||
@@ -16,10 +16,11 @@ HEADERS = {
|
||||
}
|
||||
|
||||
class BoardsAPI(BaseConnector):
|
||||
source_name: str = "boards.ie"
|
||||
display_name: str = "Boards.ie"
|
||||
|
||||
def __init__(self):
|
||||
self.url = "https://www.boards.ie"
|
||||
self.source_name = "boards.ie"
|
||||
self.display_name = "Boards.ie"
|
||||
|
||||
def get_new_posts_by_search(self,
|
||||
search: str,
|
||||
|
||||
@@ -10,9 +10,11 @@ from server.connectors.base import BaseConnector
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class RedditAPI(BaseConnector):
|
||||
source_name = "reddit"
|
||||
display_name = "Reddit"
|
||||
|
||||
def __init__(self):
|
||||
self.url = "https://www.reddit.com/"
|
||||
self.source_name = "Reddit"
|
||||
|
||||
# Public Methods #
|
||||
def get_new_posts_by_search(self,
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
import pkgutil
|
||||
import importlib
|
||||
import connectors
|
||||
from connectors.base import BaseConnector
|
||||
import server.connectors
|
||||
from server.connectors.base import BaseConnector
|
||||
|
||||
def _discover_connectors() -> list[type[BaseConnector]]:
|
||||
"""Walk the connectors package and collect all BaseConnector subclasses."""
|
||||
for _, module_name, _ in pkgutil.iter_modules(connectors.__path__):
|
||||
for _, module_name, _ in pkgutil.iter_modules(server.connectors.__path__):
|
||||
if module_name in ("base", "registry"):
|
||||
continue
|
||||
importlib.import_module(f"connectors.{module_name}")
|
||||
importlib.import_module(f"server.connectors.{module_name}")
|
||||
|
||||
return [
|
||||
cls for cls in BaseConnector.__subclasses__()
|
||||
|
||||
@@ -13,6 +13,9 @@ load_dotenv()
|
||||
API_KEY = os.getenv("YOUTUBE_API_KEY")
|
||||
|
||||
class YouTubeAPI(BaseConnector):
|
||||
source_name = "youtube"
|
||||
display_name = "YouTube"
|
||||
|
||||
def __init__(self):
|
||||
self.youtube = build('youtube', 'v3', developerKey=API_KEY)
|
||||
|
||||
@@ -44,7 +47,7 @@ class YouTubeAPI(BaseConnector):
|
||||
author=comment_snippet['authorDisplayName'],
|
||||
timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(),
|
||||
reply_to=None,
|
||||
source="YouTube"
|
||||
source=self.source_name
|
||||
)
|
||||
|
||||
comments.append(comment)
|
||||
|
||||
Reference in New Issue
Block a user