From 187401c5ebf9e7e368f816ba1f9a0f9b0d5ab4df Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Mon, 19 Jan 2026 20:50:17 +0000
Subject: [PATCH] Implement YouTube API integration for video and comment fetching

---
Reviewer notes (free text in this area is not part of the commit message):
 * Removed the module-level print(API_KEY): it wrote the API secret to stdout
   on every import of connectors.youtube_api.
 * fetch_and_parse_videos now catches HttpError per video when fetching
   comments, so a single failing video (for example one with comments
   disabled) no longer discards all posts collected so far.
 * The blob id on the connectors/youtube_api.py index line predates these
   edits and no longer matches the new file content.
 * NOTE(review): create_dataset.py now calls save_to_jsonl twice with the same
   data_file; confirm save_to_jsonl appends rather than truncates.
 * NOTE(review): context-line indentation in the create_dataset.py hunks was
   reconstructed as four spaces; verify against the target file.

 connectors/youtube_api.py | 108 ++++++++++++++++++++++++++++++++++++++
 create_dataset.py         |   7 ++-
 requirements.txt          |  14 +++++
 3 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 connectors/youtube_api.py

diff --git a/connectors/youtube_api.py b/connectors/youtube_api.py
new file mode 100644
index 0000000..a1df8db
--- /dev/null
+++ b/connectors/youtube_api.py
@@ -0,0 +1,108 @@
+import logging
+import os
+
+from dotenv import load_dotenv
+from googleapiclient.discovery import build
+from googleapiclient.errors import HttpError
+from dto.post import Post
+from dto.comment import Comment
+
+load_dotenv()
+
+# Read once at import time; never print or log this value.
+API_KEY = os.getenv("YOUTUBE_API_KEY")
+
+logger = logging.getLogger(__name__)
+
+
+class YouTubeAPI:
+    """Client for YouTube Data API v3 that maps videos to Post/Comment DTOs."""
+
+    def __init__(self):
+        self.youtube = build('youtube', 'v3', developerKey=API_KEY)
+
+    def search_videos(self, query, limit):
+        """Return the raw search result items for videos matching ``query``.
+
+        Issues a single ``search.list`` request with ``maxResults=limit``;
+        no pagination is performed, so at most one page is returned.
+        """
+        request = self.youtube.search().list(
+            q=query,
+            part='snippet',
+            type='video',
+            maxResults=limit
+        )
+        response = request.execute()
+        return response.get('items', [])
+
+    def get_video_comments(self, video_id, limit):
+        """Return the raw comment-thread items for ``video_id``.
+
+        Issues a single ``commentThreads.list`` request as plain text;
+        errors raised by ``execute()`` propagate to the caller.
+        """
+        request = self.youtube.commentThreads().list(
+            part='snippet',
+            videoId=video_id,
+            maxResults=limit,
+            textFormat='plainText'
+        )
+        response = request.execute()
+        return response.get('items', [])
+
+    def fetch_and_parse_videos(self, query, video_limit, comment_limit):
+        """Search for videos and return them as ``Post`` objects with comments.
+
+        Each post carries its top-level comments in ``post.comments``. A video
+        whose comments cannot be fetched (for example because comments are
+        disabled) is kept with an empty comment list instead of aborting
+        the whole run.
+        """
+        videos = self.search_videos(query, video_limit)
+        posts = []
+
+        for video in videos:
+            video_id = video['id']['videoId']
+            snippet = video['snippet']
+            title = snippet['title']
+            description = snippet['description']
+            published_at = snippet['publishedAt']
+            channel_title = snippet['channelTitle']
+
+            post = Post(
+                id=video_id,
+                content=f"{title}\n\n{description}",
+                author=channel_title,
+                timestamp=published_at,
+                url=f"https://www.youtube.com/watch?v={video_id}",
+                title=title,
+                source="YouTube"
+            )
+
+            post.comments = []
+            try:
+                comments_data = self.get_video_comments(video_id, comment_limit)
+            except HttpError as err:
+                # One video's failure must not discard every other result.
+                logger.warning(
+                    "Skipping comments for video %s: %s", video_id, err
+                )
+                comments_data = []
+
+            for comment_thread in comments_data:
+                comment_snippet = comment_thread['snippet']['topLevelComment']['snippet']
+                comment = Comment(
+                    id=comment_thread['id'],
+                    post_id=video_id,
+                    content=comment_snippet['textDisplay'],
+                    author=comment_snippet['authorDisplayName'],
+                    timestamp=comment_snippet['publishedAt'],
+                    reply_to=None,
+                    source="YouTube"
+                )
+                post.comments.append(comment)
+
+            posts.append(post)
+
+        return posts
diff --git a/create_dataset.py b/create_dataset.py
index 59a3753..2750ccb 100644
--- a/create_dataset.py
+++ b/create_dataset.py
@@ -2,10 +2,12 @@ import json
 import logging
 from connectors.reddit_api import RedditAPI
 from connectors.boards_api import BoardsAPI
+from connectors.youtube_api import YouTubeAPI
 
-data_file = 'data/posts.json'
+data_file = 'data/posts.jsonl'
 reddit_connector = RedditAPI()
 boards_connector = BoardsAPI()
+youtube_connector = YouTubeAPI()
 
 logging.basicConfig(level=logging.DEBUG)
 logging.getLogger("urllib3").setLevel(logging.WARNING)
@@ -40,5 +42,8 @@ def main():
     ireland_posts = remove_empty_posts(ireland_posts)
     save_to_jsonl(data_file, ireland_posts)
 
+    youtube_videos = youtube_connector.fetch_and_parse_videos('cork city', 50, 50)
+    save_to_jsonl(data_file, youtube_videos)
+
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 17bf580..7c8454d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,15 +4,29 @@ certifi==2026.1.4
 charset-normalizer==3.4.4
 click==8.3.1
 Flask==3.1.2
+google-api-core==2.29.0
+google-api-python-client==2.188.0
+google-auth==2.47.0
+google-auth-httplib2==0.3.0
+googleapis-common-protos==1.72.0
+httplib2==0.31.1
 idna==3.11
 itsdangerous==2.2.0
 Jinja2==3.1.6
 MarkupSafe==3.0.3
+proto-plus==1.27.0
+protobuf==6.33.4
 psycopg2==2.9.11
 psycopg2-binary==2.9.11
+pyasn1==0.6.2
+pyasn1_modules==0.4.2
+pyparsing==3.3.1
+python-dotenv==1.2.1
 requests==2.32.5
+rsa==4.9.1
 soupsieve==2.8.1
 typing_extensions==4.15.0
+uritemplate==4.2.0
 urllib3==2.6.3
 vaderSentiment==3.3.2
 Werkzeug==3.1.5