diff --git a/.gitignore b/.gitignore index 42f7d5d..9a75247 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .vscode __pycache__/ -*.pyc \ No newline at end of file +*.pyc +*.json +*.code-workspace \ No newline at end of file diff --git a/connectors/reddit_api.py b/connectors/reddit_api.py index 0488ab1..16041ff 100644 --- a/connectors/reddit_api.py +++ b/connectors/reddit_api.py @@ -30,12 +30,30 @@ class RedditAPI: return self._parse_posts(data) def get_new_subreddit_posts(self, subreddit: str, limit: int = 10) -> list[Post]: - params = { - 'limit': limit - } + + posts = [] + after = None url = f"r/{subreddit}/new.json" - data = self._fetch_data(url, params) - return self._parse_posts(data) + + while len(posts) < limit: + batch_limit = min(100, limit - len(posts)) + params = { + 'limit': batch_limit, + 'after': after + } + + data = self._fetch_data(url, params) + batch = self._parse_posts(data) + + if not batch: + break + + posts.extend(batch) + after = data['data'].get('after') + if not after: + break + + return posts def get_user(self, username: str) -> User: data = self._fetch_data(f"user/{username}/about.json", {}) diff --git a/create_dataset.py b/create_dataset.py new file mode 100644 index 0000000..2ec54a7 --- /dev/null +++ b/create_dataset.py @@ -0,0 +1,19 @@ +import json +from connectors.reddit_api import RedditAPI + +data_file = 'data/reddit_posts.json' +reddit_connector = RedditAPI() +def remove_empty_posts(posts): + return [post for post in posts if post.content.strip() != ""] + +def main(): + posts = reddit_connector.get_new_subreddit_posts('cork', limit=1000) + posts = remove_empty_posts(posts) + + print(f"Fetched {len(posts)} posts from r/cork") + + with open(data_file, 'w') as f: + json.dump([post.__dict__ for post in posts], f, indent=4) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/db/database.py b/db/database.py deleted file mode 100644 index 4593ec4..0000000 --- a/db/database.py +++ /dev/null @@ -1,34 
+0,0 @@ -# To connect to PostgreSQL database -import psycopg2 - -from psycopg2.extras import RealDictCursor -from typing import Optional - -class Database: - def __init__(self, db_name: str, user: str, password: str, host: str = 'localhost', port: int = 5432): - self.connection = psycopg2.connect( - dbname=db_name, - user=user, - password=password, - host=host, - port=port - ) - self.connection.autocommit = True - - def execute_query(self, query: str, params: Optional[tuple] = None): - with self.connection.cursor(cursor_factory=RealDictCursor) as cursor: - cursor.execute(query, params) - if cursor.description: - return cursor.fetchall() - return [] - - def execute_many(self, query: str, params_list: list[tuple]): - with self.connection.cursor(cursor_factory=RealDictCursor) as cursor: - cursor.executemany(query, params_list) - - def close(self): - self.connection.close() - print("Database connection closed.") - - def __exit__(self, exc_type, exc_val, exc_tb): - self.close() \ No newline at end of file diff --git a/db/sql/01_schema.sql b/db/sql/01_schema.sql deleted file mode 100644 index 92f6ecb..0000000 --- a/db/sql/01_schema.sql +++ /dev/null @@ -1,16 +0,0 @@ -CREATE SCHEMA IF NOT EXISTS ethnograph; - -CREATE TABLE IF NOT EXISTS ethnograph.users ( - id SERIAL PRIMARY KEY, - username VARCHAR(255) UNIQUE NOT NULL, - created_utc TIMESTAMP NOT NULL, - karma INTEGER -); - -CREATE TABLE IF NOT EXISTS ethnograph.posts ( - id SERIAL PRIMARY KEY, - title TEXT NOT NULL, - content TEXT NOT NULL, - author_username VARCHAR(255), - created_utc TIMESTAMP NOT NULL -); \ No newline at end of file diff --git a/server/app.py b/server/app.py index a1c1d79..7d89cee 100644 --- a/server/app.py +++ b/server/app.py @@ -1,52 +1,14 @@ from flask import Flask -from db.database import Database from connectors.reddit_api import RedditAPI -from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer app = Flask(__name__) -db = Database(db_name='ethnograph', user='ethnograph_user', 
password='ethnograph_pass') reddit_connector = RedditAPI() @app.route('/fetch_subreddit/<subreddit>/<int:limit>', methods=['GET']) def fetch_subreddit(subreddit, limit = 10): posts = reddit_connector.get_top_subreddit_posts(subreddit, limit=limit, timeframe='all') - - db.execute_many( - """INSERT INTO ethnograph.posts (title, content, author_username, created_utc) - VALUES (%s, %s, %s, to_timestamp(%s));""", - [(post.title, post.content, post.author, post.timestamp) for post in posts] - ) - - return {"status": "success", "inserted_posts": len(posts)} - -@app.route('/sentiment', methods=['GET']) -def sentiment_analysis(): - posts = db.execute_query( - "SELECT id, title, content FROM ethnograph.posts;" - ) - - analyzer = SentimentIntensityAnalyzer() - - total_sentiment = 0.0 - count = 0 - - for post in posts: - content = post.get("title") - if not content: - continue - - score = analyzer.polarity_scores(content)["compound"] - total_sentiment += score - count += 1 - - average_sentiment = total_sentiment / count if count else 0.0 - - return { - "status": "success", - "average_sentiment": average_sentiment, - "posts_analyzed": count - } + return {"status": "success", "posts": [post.__dict__ for post in posts]} if __name__ == "__main__": app.run(debug=True) \ No newline at end of file