# crosspost/server/analysis/summary.py

import pandas as pd


class SummaryAnalysis:
    """Compute headline statistics over an events DataFrame.

    The methods read the columns ``"type"``, ``"author"``, ``"dt"`` and
    ``"source"``. Rows whose ``"type"`` equals ``"post"`` or ``"comment"``
    are counted as posts and comments respectively.
    """

    def total_events(self, df: pd.DataFrame) -> int:
        """Return the number of rows in *df*."""
        return len(df)

    def total_posts(self, df: pd.DataFrame) -> int:
        """Return the number of rows whose ``"type"`` is ``"post"``."""
        # Summing the boolean mask avoids materialising a filtered frame.
        return int((df["type"] == "post").sum())

    def total_comments(self, df: pd.DataFrame) -> int:
        """Return the number of rows whose ``"type"`` is ``"comment"``."""
        return int((df["type"] == "comment").sum())

    def unique_users(self, df: pd.DataFrame) -> int:
        """Return the number of distinct non-null ``"author"`` values."""
        # nunique() ignores missing values by default.
        return int(df["author"].nunique())

    def comments_per_post(self, total_comments: int, total_posts: int) -> float:
        """Return comments divided by posts, rounded to two decimals.

        The divisor is clamped to at least 1, so with zero posts the result
        is simply ``total_comments`` instead of a ``ZeroDivisionError``.
        """
        return round(total_comments / max(total_posts, 1), 2)

    def lurker_ratio(self, df: pd.DataFrame) -> float:
        """Return the share of authors with exactly one event.

        Rows with a missing author are ignored (groupby drops them). When no
        row has an author the ratio is reported as ``0.0`` rather than NaN,
        which would not be valid JSON and would disagree with
        ``empty_summary``.
        """
        events_per_user = df.groupby("author").size()
        if events_per_user.empty:
            return 0.0
        # Convert to a built-in float so callers never receive a numpy scalar.
        return round(float((events_per_user == 1).mean()), 2)

    def time_range(self, df: pd.DataFrame) -> dict:
        """Return the earliest and latest ``"dt"`` as integer epoch seconds.

        A bound is ``None`` when it cannot be determined, i.e. when every
        ``"dt"`` value is missing and min/max yield ``NaT``.
        """
        start = df["dt"].min()
        end = df["dt"].max()
        return {
            # NaT.timestamp() raises ValueError, so guard before converting.
            "start": None if pd.isna(start) else int(start.timestamp()),
            "end": None if pd.isna(end) else int(end.timestamp()),
        }

    def sources(self, df: pd.DataFrame) -> list:
        """Return the distinct non-null ``"source"`` values in first-seen order."""
        return df["source"].dropna().unique().tolist()

    def empty_summary(self) -> dict:
        """Return the summary used when there are no events to analyse."""
        return {
            "total_events": 0,
            "total_posts": 0,
            "total_comments": 0,
            "unique_users": 0,
            "comments_per_post": 0,
            "lurker_ratio": 0,
            "time_range": {
                "start": None,
                "end": None,
            },
            "sources": [],
        }

    def summary(self, df: pd.DataFrame) -> dict:
        """Return all summary statistics for *df* as a single dict.

        An empty frame short-circuits to ``empty_summary`` so that none of
        the column lookups run against missing columns.
        """
        if df.empty:
            return self.empty_summary()

        # Computed once because comments_per_post needs both counts.
        total_posts = self.total_posts(df)
        total_comments = self.total_comments(df)

        return {
            "total_events": self.total_events(df),
            "total_posts": total_posts,
            "total_comments": total_comments,
            "unique_users": self.unique_users(df),
            "comments_per_post": self.comments_per_post(total_comments, total_posts),
            "lurker_ratio": self.lurker_ratio(df),
            "time_range": self.time_range(df),
            "sources": self.sources(df),
        }