From 05874d233fcdf28dfc94ee855e9452ca36a7052e Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 13 Jan 2026 18:39:55 +0000
Subject: [PATCH 1/6] Implement subreddit search method for new posts

---
 connectors/reddit_connector.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/connectors/reddit_connector.py b/connectors/reddit_connector.py
index 053576e..f255ef0 100644
--- a/connectors/reddit_connector.py
+++ b/connectors/reddit_connector.py
@@ -29,6 +29,14 @@ class RedditConnector:
         data = self._fetch_data(url, params)
         return self._parse_posts(data)
 
+    def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
+        params = {
+            'limit': limit
+        }
+        url = f"r/{subreddit}/new.json"
+        data = self._fetch_data(url, params)
+        return self._parse_posts(data)
+
     def get_user(self, username: str) -> User:
         data = self._fetch_data(f"user/{username}/about.json", {})
         return self._parse_user(data)

From d4fb78aac4622127618dc04af1b3b32d5009a2e7 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 13 Jan 2026 18:46:43 +0000
Subject: [PATCH 2/6] Add pagination to new_subreddit method to bypass 100
 post limit

---
 connectors/reddit_connector.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/connectors/reddit_connector.py b/connectors/reddit_connector.py
index f255ef0..a05ccc5 100644
--- a/connectors/reddit_connector.py
+++ b/connectors/reddit_connector.py
@@ -30,12 +30,30 @@ class RedditConnector:
         return self._parse_posts(data)
 
     def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]:
-        params = {
-            'limit': limit
-        }
+
+        posts = []
+        after = None
         url = f"r/{subreddit}/new.json"
-        data = self._fetch_data(url, params)
-        return self._parse_posts(data)
+
+        while len(posts) < limit:
+            batch_limit = min(100, limit - len(posts))
+            params = {
+                'limit': batch_limit,
+                'after': after
+            }
+
+            data = self._fetch_data(url, params)
+            batch = self._parse_posts(data)
+
+            if not batch:
+                break
+
+            posts.extend(batch)
+            after = data['data'].get('after')
+            if not after:
+                break
+
+        return posts
 
     def get_user(self, username: str) -> User:
         data = self._fetch_data(f"user/{username}/about.json", {})

From e58c18bf99a9b114b845dd1cdc8454c7a516478a Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 13 Jan 2026 18:57:29 +0000
Subject: [PATCH 3/6] add json files and vscode workspaces to gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 42f7d5d..9a75247 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 .vscode
 __pycache__/
-*.pyc
\ No newline at end of file
+*.pyc
+*.json
+*.code-workspace
\ No newline at end of file

From 73a19f3ce3b55c1b1130592b8f74aa6edf41ec84 Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 13 Jan 2026 18:59:42 +0000
Subject: [PATCH 4/6] Add script to orchestrate dataset creation

---
 fetch_data.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 fetch_data.py

diff --git a/fetch_data.py b/fetch_data.py
new file mode 100644
index 0000000..46b8f5e
--- /dev/null
+++ b/fetch_data.py
@@ -0,0 +1,15 @@
+import json
+from connectors.reddit_connector import RedditConnector
+
+data_file = 'reddit_posts.json'
+reddit_connector = RedditConnector()
+
+def main():
+    posts = reddit_connector.get_new_subreddit_posts('cork', limit=1000)
+    print(f"Fetched {len(posts)} posts from r/cork")
+
+    with open(data_file, 'w') as f:
+        json.dump([post.__dict__ for post in posts], f, indent=4)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 538ea9fe1269b8f8dadacc40fdb42f1688bb123f Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 13 Jan 2026 19:01:18 +0000
Subject: [PATCH 5/6] Remove database connection and schema setup from the
 project

---
 db/database.py       | 34 ----------------------------------
 db/sql/01_schema.sql | 16 ----------------
 server/app.py        | 39 +--------------------------------------
 3 files changed, 1 insertion(+), 88 deletions(-)
 delete mode 100644 db/database.py
 delete mode 100644 db/sql/01_schema.sql

diff --git a/db/database.py b/db/database.py
deleted file mode 100644
index 4593ec4..0000000
--- a/db/database.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# To connect to PostgreSQL database
-import psycopg2
-
-from psycopg2.extras import RealDictCursor
-from typing import Optional
-
-class Database:
-    def __init__(self, db_name: str, user: str, password: str, host: str = 'localhost', port: int = 5432):
-        self.connection = psycopg2.connect(
-            dbname=db_name,
-            user=user,
-            password=password,
-            host=host,
-            port=port
-        )
-        self.connection.autocommit = True
-
-    def execute_query(self, query: str, params: Optional[tuple] = None):
-        with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
-            cursor.execute(query, params)
-            if cursor.description:
-                return cursor.fetchall()
-            return []
-
-    def execute_many(self, query: str, params_list: list[tuple]):
-        with self.connection.cursor(cursor_factory=RealDictCursor) as cursor:
-            cursor.executemany(query, params_list)
-
-    def close(self):
-        self.connection.close()
-        print("Database connection closed.")
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.close()
\ No newline at end of file

diff --git a/db/sql/01_schema.sql b/db/sql/01_schema.sql
deleted file mode 100644
index 92f6ecb..0000000
--- a/db/sql/01_schema.sql
+++ /dev/null
@@ -1,16 +0,0 @@
-CREATE SCHEMA IF NOT EXISTS ethnograph;
-
-CREATE TABLE IF NOT EXISTS ethnograph.users (
-    id SERIAL PRIMARY KEY,
-    username VARCHAR(255) UNIQUE NOT NULL,
-    created_utc TIMESTAMP NOT NULL,
-    karma INTEGER
-);
-
-CREATE TABLE IF NOT EXISTS ethnograph.posts (
-    id SERIAL PRIMARY KEY,
-    title TEXT NOT NULL,
-    content TEXT NOT NULL,
-    author_username VARCHAR(255),
-    created_utc TIMESTAMP NOT NULL
-);
\ No newline at end of file

diff --git a/server/app.py b/server/app.py
index 917df87..5452fcd 100644
--- a/server/app.py
+++ b/server/app.py
@@ -1,52 +1,15 @@
 from flask import Flask
-from db.database import Database
 from connectors.reddit_connector import RedditConnector
 from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
 app = Flask(__name__)
-db = Database(db_name='ethnograph', user='ethnograph_user', password='ethnograph_pass')
 reddit_connector = RedditConnector()
 
 
 @app.route('/fetch_subreddit/<subreddit>/<int:limit>', methods=['GET'])
 def fetch_subreddit(subreddit, limit = 10):
     posts = reddit_connector.get_top_subreddit_posts(subreddit, limit=limit, timeframe='all')
-
-    db.execute_many(
-        """INSERT INTO ethnograph.posts (title, content, author_username, created_utc)
-        VALUES (%s, %s, %s, to_timestamp(%s));""",
-        [(post.title, post.content, post.author, post.timestamp) for post in posts]
-    )
-
-    return {"status": "success", "inserted_posts": len(posts)}
-
-@app.route('/sentiment', methods=['GET'])
-def sentiment_analysis():
-    posts = db.execute_query(
-        "SELECT id, title, content FROM ethnograph.posts;"
-    )
-
-    analyzer = SentimentIntensityAnalyzer()
-
-    total_sentiment = 0.0
-    count = 0
-
-    for post in posts:
-        content = post.get("title")
-        if not content:
-            continue
-
-        score = analyzer.polarity_scores(content)["compound"]
-        total_sentiment += score
-        count += 1
-
-    average_sentiment = total_sentiment / count if count else 0.0
-
-    return {
-        "status": "success",
-        "average_sentiment": average_sentiment,
-        "posts_analyzed": count
-    }
+    return {"status": "success", "posts": [post.__dict__ for post in posts]}
 
 if __name__ == "__main__":
     app.run(debug=True)
\ No newline at end of file

From b0e079599a8887a977bf6d3392e5f1b7b2923d9b Mon Sep 17 00:00:00 2001
From: Dylan De Faoite
Date: Tue, 13 Jan 2026 19:06:00 +0000
Subject: [PATCH 6/6] Rename fetch data script & add check for empty posts

---
 fetch_data.py => create_dataset.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
 rename fetch_data.py => create_dataset.py (68%)

diff --git a/fetch_data.py b/create_dataset.py
similarity index 68%
rename from fetch_data.py
rename to create_dataset.py
index 46b8f5e..6e74b31 100644
--- a/fetch_data.py
+++ b/create_dataset.py
@@ -1,11 +1,16 @@
 import json
 from connectors.reddit_connector import RedditConnector
 
-data_file = 'reddit_posts.json'
+data_file = 'data/reddit_posts.json'
 reddit_connector = RedditConnector()
 
+def remove_empty_posts(posts):
+    return [post for post in posts if post.content.strip() != ""]
+
 def main():
     posts = reddit_connector.get_new_subreddit_posts('cork', limit=1000)
+    posts = remove_empty_posts(posts)
+
     print(f"Fetched {len(posts)} posts from r/cork")
 
     with open(data_file, 'w') as f: