Automatic Scraping of dataset options #9
@@ -21,6 +21,7 @@ from server.core.auth import AuthManager
|
|||||||
from server.core.datasets import DatasetManager
|
from server.core.datasets import DatasetManager
|
||||||
from server.utils import get_request_filters
|
from server.utils import get_request_filters
|
||||||
from server.queue.tasks import process_dataset
|
from server.queue.tasks import process_dataset
|
||||||
|
from server.connectors.registry import get_connector_metadata
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
@@ -112,9 +113,8 @@ def get_user_datasets():
|
|||||||
return jsonify(dataset_manager.get_user_datasets(current_user)), 200
|
return jsonify(dataset_manager.get_user_datasets(current_user)), 200
|
||||||
|
|
||||||
@app.route("/datasets/sources", methods=["GET"])
|
@app.route("/datasets/sources", methods=["GET"])
|
||||||
@jwt_required()
|
|
||||||
def get_dataset_sources():
|
def get_dataset_sources():
|
||||||
return jsonify({""})
|
return jsonify(get_connector_metadata())
|
||||||
|
|
||||||
@app.route("/datasets/scrape", methods=["POST"])
|
@app.route("/datasets/scrape", methods=["POST"])
|
||||||
@jwt_required()
|
@jwt_required()
|
||||||
|
|||||||
@@ -16,10 +16,11 @@ HEADERS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
class BoardsAPI(BaseConnector):
|
class BoardsAPI(BaseConnector):
|
||||||
|
source_name: str = "boards.ie"
|
||||||
|
display_name: str = "Boards.ie"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.url = "https://www.boards.ie"
|
self.url = "https://www.boards.ie"
|
||||||
self.source_name = "boards.ie"
|
|
||||||
self.display_name = "Boards.ie"
|
|
||||||
|
|
||||||
def get_new_posts_by_search(self,
|
def get_new_posts_by_search(self,
|
||||||
search: str,
|
search: str,
|
||||||
|
|||||||
@@ -10,9 +10,11 @@ from server.connectors.base import BaseConnector
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class RedditAPI(BaseConnector):
|
class RedditAPI(BaseConnector):
|
||||||
|
source_name = "reddit"
|
||||||
|
display_name = "Reddit"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.url = "https://www.reddit.com/"
|
self.url = "https://www.reddit.com/"
|
||||||
self.source_name = "Reddit"
|
|
||||||
|
|
||||||
# Public Methods #
|
# Public Methods #
|
||||||
def get_new_posts_by_search(self,
|
def get_new_posts_by_search(self,
|
||||||
|
|||||||
@@ -1,14 +1,14 @@
|
|||||||
import pkgutil
|
import pkgutil
|
||||||
import importlib
|
import importlib
|
||||||
import connectors
|
import server.connectors
|
||||||
from connectors.base import BaseConnector
|
from server.connectors.base import BaseConnector
|
||||||
|
|
||||||
def _discover_connectors() -> list[type[BaseConnector]]:
|
def _discover_connectors() -> list[type[BaseConnector]]:
|
||||||
"""Walk the connectors package and collect all BaseConnector subclasses."""
|
"""Walk the connectors package and collect all BaseConnector subclasses."""
|
||||||
for _, module_name, _ in pkgutil.iter_modules(connectors.__path__):
|
for _, module_name, _ in pkgutil.iter_modules(server.connectors.__path__):
|
||||||
if module_name in ("base", "registry"):
|
if module_name in ("base", "registry"):
|
||||||
continue
|
continue
|
||||||
importlib.import_module(f"connectors.{module_name}")
|
importlib.import_module(f"server.connectors.{module_name}")
|
||||||
|
|
||||||
return [
|
return [
|
||||||
cls for cls in BaseConnector.__subclasses__()
|
cls for cls in BaseConnector.__subclasses__()
|
||||||
|
|||||||
@@ -13,6 +13,9 @@ load_dotenv()
|
|||||||
API_KEY = os.getenv("YOUTUBE_API_KEY")
|
API_KEY = os.getenv("YOUTUBE_API_KEY")
|
||||||
|
|
||||||
class YouTubeAPI(BaseConnector):
|
class YouTubeAPI(BaseConnector):
|
||||||
|
source_name = "youtube"
|
||||||
|
display_name = "YouTube"
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.youtube = build('youtube', 'v3', developerKey=API_KEY)
|
self.youtube = build('youtube', 'v3', developerKey=API_KEY)
|
||||||
|
|
||||||
@@ -44,7 +47,7 @@ class YouTubeAPI(BaseConnector):
|
|||||||
author=comment_snippet['authorDisplayName'],
|
author=comment_snippet['authorDisplayName'],
|
||||||
timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(),
|
timestamp=datetime.datetime.strptime(comment_snippet['publishedAt'], "%Y-%m-%dT%H:%M:%SZ").timestamp(),
|
||||||
reply_to=None,
|
reply_to=None,
|
||||||
source="YouTube"
|
source=self.source_name
|
||||||
)
|
)
|
||||||
|
|
||||||
comments.append(comment)
|
comments.append(comment)
|
||||||
|
|||||||
Reference in New Issue
Block a user